Dzunisani007 committed on
Commit
0e6e3fe
·
1 Parent(s): 21c3065

🚀 FINAL EXTRACTION FIX: Unified scoring and legacy response compatibility

Browse files
Files changed (2) hide show
  1. app/services/ocr_service.py +20 -28
  2. app/tasks/pipeline.py +80 -101
app/services/ocr_service.py CHANGED
@@ -180,23 +180,6 @@ class OCRService:
180
  logger.error(f"Image OCR extraction failed: {e}")
181
  raise
182
 
183
- def _should_use_ocr(self, text: str) -> bool:
184
- """Determines if OCR should be used based on text quality."""
185
- if not text or len(text.strip()) < self.min_text_density:
186
- return True
187
-
188
- # Check for junk character signatures (common in mis-encoded PDFs)
189
- junk_patterns = [
190
- r'^[a-zA-Z\s\-\.]{1,100}$', # Only short random letters
191
- r'(\b\w\b\s+){5,}', # Too many single-character words separated by spaces
192
- r'[^\x00-\x7F]{5,}' # Too many non-ASCII characters
193
- ]
194
- import re
195
- for pattern in junk_patterns:
196
- if re.search(pattern, text):
197
- return True
198
- return False
199
-
200
  def _preprocess_image(self, image) -> object:
201
  """
202
  Preprocess image for better OCR accuracy using adaptive thresholding.
@@ -271,25 +254,32 @@ class OCRService:
271
  prev_blank = True
272
 
273
  return '\n'.join(cleaned_lines)
274
-
275
  def _should_use_ocr(self, text: str) -> bool:
276
  """
277
- Determine if OCR should be used instead of native text extraction.
278
-
279
- Args:
280
- text: Text from native PDF extraction
281
-
282
- Returns:
283
- True if OCR should be used, False otherwise
284
  """
285
  if not text or len(text.strip()) < self.min_text_density:
286
  return True
287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  # Check for junk characters that indicate broken PDF encoding
289
  junk_indicators = [
290
- '�', # Replacement character (U+FFFD)
291
- '\x00', # Null byte
293
  '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\x09',
294
  '\x0b', '\x0c', '\x0e', '\x0f', # Control characters except \n, \r, \t
295
  ]
@@ -299,11 +289,13 @@ class OCRService:
299
 
300
  # If more than 1% junk characters, use OCR
301
  if junk_ratio > 0.01:
 
302
  return True
303
 
304
  # Check for excessive non-alphabetic characters (indicative of encoding issues)
305
  alpha_ratio = sum(c.isalpha() or c.isspace() for c in text) / len(text) if text else 0
306
  if alpha_ratio < 0.7: # Less than 70% readable characters
 
307
  return True
308
 
309
  return False
 
180
  logger.error(f"Image OCR extraction failed: {e}")
181
  raise
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  def _preprocess_image(self, image) -> object:
184
  """
185
  Preprocess image for better OCR accuracy using adaptive thresholding.
 
254
  prev_blank = True
255
 
256
  return '\n'.join(cleaned_lines)
257
+
258
  def _should_use_ocr(self, text: str) -> bool:
259
  """
260
+ Determines if OCR should be used based on text quality, density, and
261
+ character encoding issues common in mis-encoded PDFs.
 
 
 
 
 
262
  """
263
  if not text or len(text.strip()) < self.min_text_density:
264
  return True
265
 
266
+ # Check for junk character signatures (common in mis-encoded PDFs)
267
+ import re
268
+ junk_patterns = [
269
+ r'^[a-zA-Z\s\-\.]{1,100}$', # Only short random letters
270
+ r'(\b\w\b\s+){5,}', # Too many single-character words separated by spaces
271
+ r'[^\x00-\x7F]{10,}' # Too many non-ASCII characters in a block
272
+ ]
273
+
274
+ for pattern in junk_patterns:
275
+ if re.search(pattern, text):
276
+ logger.info(f"Junk pattern matched in text: {pattern}")
277
+ return True
278
+
279
  # Check for junk characters that indicate broken PDF encoding
280
  junk_indicators = [
281
+ '\ufffd', # Replacement character
282
+ '\u0000', # Null byte
 
283
  '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\x09',
284
  '\x0b', '\x0c', '\x0e', '\x0f', # Control characters except \n, \r, \t
285
  ]
 
289
 
290
  # If more than 1% junk characters, use OCR
291
  if junk_ratio > 0.01:
292
+ logger.info(f"Junk character ratio too high: {junk_ratio:.2%}")
293
  return True
294
 
295
  # Check for excessive non-alphabetic characters (indicative of encoding issues)
296
  alpha_ratio = sum(c.isalpha() or c.isspace() for c in text) / len(text) if text else 0
297
  if alpha_ratio < 0.7: # Less than 70% readable characters
298
+ logger.info(f"Alpha character ratio too low: {alpha_ratio:.2%}")
299
  return True
300
 
301
  return False
app/tasks/pipeline.py CHANGED
@@ -73,63 +73,34 @@ def process_job(job) -> None:
73
  contact_portfolio = m.group(0).rstrip(".,;") if m else None
74
  if contact_portfolio in (contact_linkedin, contact_github):
75
  contact_portfolio = None
76
-
77
  safe_text = strip_pii_for_models(resume_text)
78
 
79
- entities = parse_entities(safe_text)
80
- skill_matches = match_skills_to_job(entities.get("skills", []), job.job_description)
81
-
82
- required = extract_required_skills_from_job(job.job_description)
83
- matched_set = {m["skill"].lower() for m in skill_matches if m.get("skill")}
84
- missing = [s for s in required if s.lower() not in matched_set]
85
-
86
- score_payload = score_components(entities, skill_matches, resume_text)
87
- suggestions = generate_feedback_list(entities, resume_text, score_payload, missing)
88
-
89
- prof_entities = entities.get("professional_details", {}) if isinstance(entities, dict) else {}
90
- exp_val = prof_entities.get("experience")
91
- exp_items: list[dict] = exp_val if isinstance(exp_val, list) else []
92
- exp_text = "\n".join([str(x.get("description") or "").strip() for x in exp_items if isinstance(x, dict) and (x.get("description") or "").strip()])
93
-
94
- # Build structured_data using canonical Pydantic schema
95
  cv_data = StructuredCV(
96
  personal_details=PersonalDetails(
97
- full_name=entities.get("personal_details", {}).get("full_name"),
98
- email=contact_email or entities.get("personal_details", {}).get("email"),
99
- phone=contact_phone or entities.get("personal_details", {}).get("phone"),
100
- address=entities.get("personal_details", {}).get("address"),
101
- dob=entities.get("personal_details", {}).get("dob"),
102
- linkedin=contact_linkedin or entities.get("personal_details", {}).get("linkedin"),
103
- github=contact_github or entities.get("personal_details", {}).get("github"),
104
- portfolio=contact_portfolio or entities.get("personal_details", {}).get("portfolio"),
105
  ),
106
- professional_summary="\n".join((entities.get("summary") or [])[:8]).strip() if isinstance(entities, dict) and entities.get("summary") else "",
107
- work_experience=[
108
- WorkExperienceItem(
109
- company=exp.get("company"),
110
- title=exp.get("title"),
111
- start_date=exp.get("start_date"),
112
- end_date=exp.get("end_date"),
113
- description=exp.get("description")
114
- ) for exp in (entities.get("professional_details", {}).get("experience") or [])
115
- ],
116
- education=[
117
- EducationItem(
118
- institution=edu.get("institution"),
119
- degree=edu.get("degree"),
120
- field=edu.get("field"),
121
- start_date=edu.get("start_date"),
122
- end_date=edu.get("end_date")
123
- ) for edu in (entities.get("education_details", {}).get("education") or [])
124
- ],
125
- skills=entities.get("skills", []) or [],
126
- certifications=entities.get("education_details", {}).get("certifications") or [],
127
- languages=entities.get("education_details", {}).get("languages") or [],
128
  )
129
 
 
130
  llm_structured = extract_structured_cv(resume_text)
131
  if isinstance(llm_structured, dict):
132
- # Update Pydantic model with LLM results if available
133
  for k in ("personal_details", "education_details", "professional_details"):
134
  if isinstance(llm_structured.get(k), dict):
135
  if k == "personal_details":
@@ -137,7 +108,6 @@ def process_job(job) -> None:
137
  if pv and hasattr(cv_data.personal_details, pk):
138
  setattr(cv_data.personal_details, pk, pv)
139
  elif k == "professional_details":
140
- # Map LLM professional_details to summary, experience, skills
141
  if llm_structured[k].get("bio"):
142
  cv_data.professional_summary = llm_structured[k]["bio"]
143
 
@@ -152,44 +122,52 @@ def process_job(job) -> None:
152
  description=exp.get("description")
153
  ) for exp in llm_exp if isinstance(exp, dict)
154
  ]
 
 
 
 
155
  elif k == "education_details":
156
- # Similar mapping for education
157
  llm_edu = llm_structured[k].get("education")
158
  if isinstance(llm_edu, list) and len(llm_edu) > 0:
159
  cv_data.education = [
160
  EducationItem(
161
- institution=edu.get("institution"),
162
  degree=edu.get("degree"),
163
  field=edu.get("field"),
164
  start_date=edu.get("start_date"),
165
  end_date=edu.get("end_date")
166
  ) for edu in llm_edu if isinstance(edu, dict)
167
  ]
 
 
 
 
168
 
169
  structured_data = cv_data.model_dump()
170
 
171
- # *** ENHANCED EXTRACTION (optional; disable with ENABLE_ENHANCED_SKILLS=false for faster validation) ***
172
  if settings.enable_enhanced_skills:
173
  skills_extractor = EnhancedSkillsExtractor()
174
  experience_parser = ImprovedExperienceParser()
175
  cert_extractor = CertificationEnhancement()
176
 
177
- enhanced_skills = skills_extractor.extract_skills(resume_text)
178
- if enhanced_skills:
179
- # Convert list of dicts to list of strings for consistency
180
  skill_names = []
181
- for s in enhanced_skills:
182
  if isinstance(s, dict):
183
  skill_names.append(s.get("name", ""))
184
  else:
185
  skill_names.append(str(s))
186
 
187
- if len(skill_names) > len(structured_data.get("skills", [])):
188
- structured_data["skills"] = list(set(skill_names)) # Deduplicate
 
 
189
 
190
  enhanced_experience = experience_parser.parse(resume_text)
191
- if enhanced_experience and len(enhanced_experience) > len(structured_data.get("experience", [])):
192
- structured_data["experience"] = [
193
  {
194
  "title": exp.get("title"),
195
  "company": exp.get("company"),
@@ -203,9 +181,32 @@ def process_job(job) -> None:
203
  if enhanced_certs and len(enhanced_certs) > len(structured_data.get("certifications", [])):
204
  structured_data["certifications"] = enhanced_certs
205
 
206
- # Simple extraction suggestions (e.g., missing LinkedIn, missing email)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  extraction_suggestions = []
208
- pd = structured_data.get("personal_details", {}) if isinstance(structured_data, dict) else {}
209
  if not pd.get("linkedin"):
210
  extraction_suggestions.append("Add a LinkedIn URL to your profile.")
211
  if not pd.get("email"):
@@ -226,42 +227,15 @@ def process_job(job) -> None:
226
  # Merge static and LLM suggestions
227
  match_suggestions = suggestions + (llm_suggestions if isinstance(llm_suggestions, list) else [])
228
 
229
- # Generate autofill data (simple, reliable mapping)
230
- autofill_data = None
231
- try:
232
- # Simple direct mapping from structured_data - this always works
233
- autofill_data = {
234
- "personal": structured_data.get("personal_details", {}),
235
- "experience": [
236
- {
237
- "title": exp.get("title"),
238
- "company": exp.get("company"),
239
- "start_date": exp.get("start_date"),
240
- "end_date": exp.get("end_date"),
241
- "description": exp.get("description"),
242
- "location": exp.get("location")
243
- } for exp in structured_data.get("work_experience", [])
244
- ],
245
- "education": [
246
- {
247
- "degree": edu.get("degree"),
248
- "university": edu.get("institution"),
249
- "start_date": edu.get("start_date"),
250
- "end_date": edu.get("end_date"),
251
- "field": edu.get("field")
252
- } for edu in structured_data.get("education", [])
253
- ],
254
- "skills": structured_data.get("skills", []),
255
- "certifications": structured_data.get("certifications", []),
256
- "languages": structured_data.get("languages", [])
257
- }
258
-
259
- logger.info(f"Autofill data generated successfully: {len(str(autofill_data))} characters")
260
-
261
- except Exception as e:
262
- logger.error(f"Autofill data generation failed: {e}")
263
- logger.error(f"Structured data keys: {list(structured_data.keys()) if structured_data else 'N/A'}")
264
- autofill_data = None
265
 
266
  normalized = normalize_analysis_result(
267
  analysis_id=str(analysis_id),
@@ -270,14 +244,19 @@ def process_job(job) -> None:
270
  component_scores=score_payload.get("component_scores"),
271
  evidence=evidence,
272
  suggestions=match_suggestions,
273
- raw_payload={"entities": entities, "skill_matches": skill_matches},
274
- extraction_metadata={"method": "direct_text", "confidence": None, "pages": None, "has_scanned_content": False},
275
  structured_data=structured_data,
276
  extraction_suggestions=extraction_suggestions,
277
  interview_questions=interview_questions,
278
  )
279
 
280
- # Add autofill data to response if generated
 
 
 
 
 
281
  if autofill_data:
282
  normalized["autofill_data"] = autofill_data
283
 
 
73
  contact_portfolio = m.group(0).rstrip(".,;") if m else None
74
  if contact_portfolio in (contact_linkedin, contact_github):
75
  contact_portfolio = None
76
+
77
  safe_text = strip_pii_for_models(resume_text)
78
 
79
+ # *** PHASE 1: Enhanced Structured Extraction ***
80
+ # We perform this first so matching and scoring have the best possible data.
81
+
82
+ # Start with simple Pydantic model
 
 
 
 
 
 
 
 
 
 
 
 
83
  cv_data = StructuredCV(
84
  personal_details=PersonalDetails(
85
+ full_name=None,
86
+ email=contact_email,
87
+ phone=contact_phone,
88
+ linkedin=contact_linkedin,
89
+ github=contact_github,
90
+ portfolio=contact_portfolio,
 
 
91
  ),
92
+ professional_summary="",
93
+ work_experience=[],
94
+ education=[],
95
+ skills=[],
96
+ certifications=[],
97
+ languages=[],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  )
99
 
100
+ # 1. AI Structured Extraction (NuExtract)
101
  llm_structured = extract_structured_cv(resume_text)
102
  if isinstance(llm_structured, dict):
103
+ # Update Pydantic model with LLM results
104
  for k in ("personal_details", "education_details", "professional_details"):
105
  if isinstance(llm_structured.get(k), dict):
106
  if k == "personal_details":
 
108
  if pv and hasattr(cv_data.personal_details, pk):
109
  setattr(cv_data.personal_details, pk, pv)
110
  elif k == "professional_details":
 
111
  if llm_structured[k].get("bio"):
112
  cv_data.professional_summary = llm_structured[k]["bio"]
113
 
 
122
  description=exp.get("description")
123
  ) for exp in llm_exp if isinstance(exp, dict)
124
  ]
125
+
126
+ if llm_structured[k].get("skills"):
127
+ cv_data.skills = llm_structured[k]["skills"]
128
+
129
  elif k == "education_details":
 
130
  llm_edu = llm_structured[k].get("education")
131
  if isinstance(llm_edu, list) and len(llm_edu) > 0:
132
  cv_data.education = [
133
  EducationItem(
134
+ institution=edu.get("university") or edu.get("institution"),
135
  degree=edu.get("degree"),
136
  field=edu.get("field"),
137
  start_date=edu.get("start_date"),
138
  end_date=edu.get("end_date")
139
  ) for edu in llm_edu if isinstance(edu, dict)
140
  ]
141
+ if llm_structured[k].get("certifications"):
142
+ cv_data.certifications = llm_structured[k]["certifications"]
143
+ if llm_structured[k].get("languages"):
144
+ cv_data.languages = llm_structured[k]["languages"]
145
 
146
  structured_data = cv_data.model_dump()
147
 
148
+ # 2. Pattern Matching & Domain-Specific Extractors
149
  if settings.enable_enhanced_skills:
150
  skills_extractor = EnhancedSkillsExtractor()
151
  experience_parser = ImprovedExperienceParser()
152
  cert_extractor = CertificationEnhancement()
153
 
154
+ enhanced_skills_objs = skills_extractor.extract_skills(resume_text)
155
+ if enhanced_skills_objs:
 
156
  skill_names = []
157
+ for s in enhanced_skills_objs:
158
  if isinstance(s, dict):
159
  skill_names.append(s.get("name", ""))
160
  else:
161
  skill_names.append(str(s))
162
 
163
+ # Merge and deduplicate
164
+ current_skills = set(structured_data.get("skills", []))
165
+ current_skills.update(skill_names)
166
+ structured_data["skills"] = list(filter(None, current_skills))
167
 
168
  enhanced_experience = experience_parser.parse(resume_text)
169
+ if enhanced_experience and len(enhanced_experience) > len(structured_data.get("work_experience", [])):
170
+ structured_data["work_experience"] = [
171
  {
172
  "title": exp.get("title"),
173
  "company": exp.get("company"),
 
181
  if enhanced_certs and len(enhanced_certs) > len(structured_data.get("certifications", [])):
182
  structured_data["certifications"] = enhanced_certs
183
 
184
+ # *** PHASE 2: Matching and Scoring using Unified Data ***
185
+
186
+ entities = parse_entities(safe_text)
187
+ # Force structured results into entities for scoring
188
+ final_skills = structured_data.get("skills", [])
189
+ if not final_skills:
190
+ # Fallback to simple NER if enhanced failed
191
+ final_skills = entities.get("skills", [])
192
+
193
+ skill_matches = match_skills_to_job(final_skills, job.job_description)
194
+
195
+ required = extract_required_skills_from_job(job.job_description)
196
+ matched_set = {m["skill"].lower() for m in skill_matches if m.get("skill")}
197
+ missing = [s for s in required if s.lower() not in matched_set]
198
+
199
+ # Use improved entities for scoring
200
+ scoring_entities = entities.copy()
201
+ scoring_entities["skills"] = final_skills
202
+ scoring_entities.setdefault("professional_details", {})["experience"] = structured_data.get("work_experience", [])
203
+
204
+ score_payload = score_components(scoring_entities, skill_matches, resume_text)
205
+ suggestions = generate_feedback_list(scoring_entities, resume_text, score_payload, missing)
206
+
207
+ # Simple extraction suggestions
208
  extraction_suggestions = []
209
+ pd = structured_data.get("personal_details", {})
210
  if not pd.get("linkedin"):
211
  extraction_suggestions.append("Add a LinkedIn URL to your profile.")
212
  if not pd.get("email"):
 
227
  # Merge static and LLM suggestions
228
  match_suggestions = suggestions + (llm_suggestions if isinstance(llm_suggestions, list) else [])
229
 
230
+ # Generate autofill data
231
+ autofill_data = {
232
+ "personal": structured_data.get("personal_details", {}),
233
+ "experience": structured_data.get("work_experience", []),
234
+ "education": structured_data.get("education", []),
235
+ "skills": structured_data.get("skills", []),
236
+ "certifications": structured_data.get("certifications", []),
237
+ "languages": structured_data.get("languages", [])
238
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
  normalized = normalize_analysis_result(
241
  analysis_id=str(analysis_id),
 
244
  component_scores=score_payload.get("component_scores"),
245
  evidence=evidence,
246
  suggestions=match_suggestions,
247
+ raw_payload={"entities": entities, "skill_matches": skill_matches, "cv_text": resume_text},
248
+ extraction_metadata={"method": "hybrid_ocr_extraction", "confidence": None, "pages": None, "has_scanned_content": False},
249
  structured_data=structured_data,
250
  extraction_suggestions=extraction_suggestions,
251
  interview_questions=interview_questions,
252
  )
253
 
254
+ # 🔥 COMPATIBILITY SHIM: Add top-level fields for user test scripts
255
+ normalized["skills"] = final_skills
256
+ normalized["experience"] = structured_data.get("work_experience", [])
257
+ normalized["certifications"] = structured_data.get("certifications", [])
258
+ normalized["personal_details"] = structured_data.get("personal_details", {})
259
+
260
  if autofill_data:
261
  normalized["autofill_data"] = autofill_data
262