Spaces:
Running
Running
Commit ·
0e6e3fe
1
Parent(s): 21c3065
FINAL EXTRACTION FIX: Unified scoring and legacy response compatibility
Browse files
- app/services/ocr_service.py +20 -28
- app/tasks/pipeline.py +80 -101
app/services/ocr_service.py
CHANGED
|
@@ -180,23 +180,6 @@ class OCRService:
|
|
| 180 |
logger.error(f"Image OCR extraction failed: {e}")
|
| 181 |
raise
|
| 182 |
|
| 183 |
-
def _should_use_ocr(self, text: str) -> bool:
|
| 184 |
-
"""Determines if OCR should be used based on text quality."""
|
| 185 |
-
if not text or len(text.strip()) < self.min_text_density:
|
| 186 |
-
return True
|
| 187 |
-
|
| 188 |
-
# Check for junk character signatures (common in mis-encoded PDFs)
|
| 189 |
-
junk_patterns = [
|
| 190 |
-
r'^[a-zA-Z\s\-\.]{1,100}$', # Only short random letters
|
| 191 |
-
r'(\b\w\b\s+){5,}', # Too many single-character words separated by spaces
|
| 192 |
-
r'[^\x00-\x7F]{5,}' # Too many non-ASCII characters
|
| 193 |
-
]
|
| 194 |
-
import re
|
| 195 |
-
for pattern in junk_patterns:
|
| 196 |
-
if re.search(pattern, text):
|
| 197 |
-
return True
|
| 198 |
-
return False
|
| 199 |
-
|
| 200 |
def _preprocess_image(self, image) -> object:
|
| 201 |
"""
|
| 202 |
Preprocess image for better OCR accuracy using adaptive thresholding.
|
|
@@ -271,25 +254,32 @@ class OCRService:
|
|
| 271 |
prev_blank = True
|
| 272 |
|
| 273 |
return '\n'.join(cleaned_lines)
|
| 274 |
-
|
| 275 |
def _should_use_ocr(self, text: str) -> bool:
|
| 276 |
"""
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
Args:
|
| 280 |
-
text: Text from native PDF extraction
|
| 281 |
-
|
| 282 |
-
Returns:
|
| 283 |
-
True if OCR should be used, False otherwise
|
| 284 |
"""
|
| 285 |
if not text or len(text.strip()) < self.min_text_density:
|
| 286 |
return True
|
| 287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
# Check for junk characters that indicate broken PDF encoding
|
| 289 |
junk_indicators = [
|
| 290 |
-
'', # Replacement character
|
| 291 |
-
'', #
|
| 292 |
-
'\x00', # Null byte
|
| 293 |
'\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\x09',
|
| 294 |
'\x0b', '\x0c', '\x0e', '\x0f', # Control characters except \n, \r, \t
|
| 295 |
]
|
|
@@ -299,11 +289,13 @@ class OCRService:
|
|
| 299 |
|
| 300 |
# If more than 1% junk characters, use OCR
|
| 301 |
if junk_ratio > 0.01:
|
|
|
|
| 302 |
return True
|
| 303 |
|
| 304 |
# Check for excessive non-alphabetic characters (indicative of encoding issues)
|
| 305 |
alpha_ratio = sum(c.isalpha() or c.isspace() for c in text) / len(text) if text else 0
|
| 306 |
if alpha_ratio < 0.7: # Less than 70% readable characters
|
|
|
|
| 307 |
return True
|
| 308 |
|
| 309 |
return False
|
|
|
|
| 180 |
logger.error(f"Image OCR extraction failed: {e}")
|
| 181 |
raise
|
| 182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
def _preprocess_image(self, image) -> object:
|
| 184 |
"""
|
| 185 |
Preprocess image for better OCR accuracy using adaptive thresholding.
|
|
|
|
| 254 |
prev_blank = True
|
| 255 |
|
| 256 |
return '\n'.join(cleaned_lines)
|
| 257 |
+
|
| 258 |
def _should_use_ocr(self, text: str) -> bool:
|
| 259 |
"""
|
| 260 |
+
Determines if OCR should be used based on text quality, density, and
|
| 261 |
+
character encoding issues common in mis-encoded PDFs.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
"""
|
| 263 |
if not text or len(text.strip()) < self.min_text_density:
|
| 264 |
return True
|
| 265 |
|
| 266 |
+
# Check for junk character signatures (common in mis-encoded PDFs)
|
| 267 |
+
import re
|
| 268 |
+
junk_patterns = [
|
| 269 |
+
r'^[a-zA-Z\s\-\.]{1,100}$', # Only short random letters
|
| 270 |
+
r'(\b\w\b\s+){5,}', # Too many single-character words separated by spaces
|
| 271 |
+
r'[^\x00-\x7F]{10,}' # Too many non-ASCII characters in a block
|
| 272 |
+
]
|
| 273 |
+
|
| 274 |
+
for pattern in junk_patterns:
|
| 275 |
+
if re.search(pattern, text):
|
| 276 |
+
logger.info(f"Junk pattern matched in text: {pattern}")
|
| 277 |
+
return True
|
| 278 |
+
|
| 279 |
# Check for junk characters that indicate broken PDF encoding
|
| 280 |
junk_indicators = [
|
| 281 |
+
'\ufffd', # Replacement character
|
| 282 |
+
'\u0000', # Null byte
|
|
|
|
| 283 |
'\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\x09',
|
| 284 |
'\x0b', '\x0c', '\x0e', '\x0f', # Control characters except \n, \r, \t
|
| 285 |
]
|
|
|
|
| 289 |
|
| 290 |
# If more than 1% junk characters, use OCR
|
| 291 |
if junk_ratio > 0.01:
|
| 292 |
+
logger.info(f"Junk character ratio too high: {junk_ratio:.2%}")
|
| 293 |
return True
|
| 294 |
|
| 295 |
# Check for excessive non-alphabetic characters (indicative of encoding issues)
|
| 296 |
alpha_ratio = sum(c.isalpha() or c.isspace() for c in text) / len(text) if text else 0
|
| 297 |
if alpha_ratio < 0.7: # Less than 70% readable characters
|
| 298 |
+
logger.info(f"Alpha character ratio too low: {alpha_ratio:.2%}")
|
| 299 |
return True
|
| 300 |
|
| 301 |
return False
|
app/tasks/pipeline.py
CHANGED
|
@@ -73,63 +73,34 @@ def process_job(job) -> None:
|
|
| 73 |
contact_portfolio = m.group(0).rstrip(".,;") if m else None
|
| 74 |
if contact_portfolio in (contact_linkedin, contact_github):
|
| 75 |
contact_portfolio = None
|
| 76 |
-
|
| 77 |
safe_text = strip_pii_for_models(resume_text)
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
matched_set = {m["skill"].lower() for m in skill_matches if m.get("skill")}
|
| 84 |
-
missing = [s for s in required if s.lower() not in matched_set]
|
| 85 |
-
|
| 86 |
-
score_payload = score_components(entities, skill_matches, resume_text)
|
| 87 |
-
suggestions = generate_feedback_list(entities, resume_text, score_payload, missing)
|
| 88 |
-
|
| 89 |
-
prof_entities = entities.get("professional_details", {}) if isinstance(entities, dict) else {}
|
| 90 |
-
exp_val = prof_entities.get("experience")
|
| 91 |
-
exp_items: list[dict] = exp_val if isinstance(exp_val, list) else []
|
| 92 |
-
exp_text = "\n".join([str(x.get("description") or "").strip() for x in exp_items if isinstance(x, dict) and (x.get("description") or "").strip()])
|
| 93 |
-
|
| 94 |
-
# Build structured_data using canonical Pydantic schema
|
| 95 |
cv_data = StructuredCV(
|
| 96 |
personal_details=PersonalDetails(
|
| 97 |
-
full_name=
|
| 98 |
-
email=contact_email
|
| 99 |
-
phone=contact_phone
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
github=contact_github or entities.get("personal_details", {}).get("github"),
|
| 104 |
-
portfolio=contact_portfolio or entities.get("personal_details", {}).get("portfolio"),
|
| 105 |
),
|
| 106 |
-
professional_summary="
|
| 107 |
-
work_experience=[
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
end_date=exp.get("end_date"),
|
| 113 |
-
description=exp.get("description")
|
| 114 |
-
) for exp in (entities.get("professional_details", {}).get("experience") or [])
|
| 115 |
-
],
|
| 116 |
-
education=[
|
| 117 |
-
EducationItem(
|
| 118 |
-
institution=edu.get("institution"),
|
| 119 |
-
degree=edu.get("degree"),
|
| 120 |
-
field=edu.get("field"),
|
| 121 |
-
start_date=edu.get("start_date"),
|
| 122 |
-
end_date=edu.get("end_date")
|
| 123 |
-
) for edu in (entities.get("education_details", {}).get("education") or [])
|
| 124 |
-
],
|
| 125 |
-
skills=entities.get("skills", []) or [],
|
| 126 |
-
certifications=entities.get("education_details", {}).get("certifications") or [],
|
| 127 |
-
languages=entities.get("education_details", {}).get("languages") or [],
|
| 128 |
)
|
| 129 |
|
|
|
|
| 130 |
llm_structured = extract_structured_cv(resume_text)
|
| 131 |
if isinstance(llm_structured, dict):
|
| 132 |
-
# Update Pydantic model with LLM results
|
| 133 |
for k in ("personal_details", "education_details", "professional_details"):
|
| 134 |
if isinstance(llm_structured.get(k), dict):
|
| 135 |
if k == "personal_details":
|
|
@@ -137,7 +108,6 @@ def process_job(job) -> None:
|
|
| 137 |
if pv and hasattr(cv_data.personal_details, pk):
|
| 138 |
setattr(cv_data.personal_details, pk, pv)
|
| 139 |
elif k == "professional_details":
|
| 140 |
-
# Map LLM professional_details to summary, experience, skills
|
| 141 |
if llm_structured[k].get("bio"):
|
| 142 |
cv_data.professional_summary = llm_structured[k]["bio"]
|
| 143 |
|
|
@@ -152,44 +122,52 @@ def process_job(job) -> None:
|
|
| 152 |
description=exp.get("description")
|
| 153 |
) for exp in llm_exp if isinstance(exp, dict)
|
| 154 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
elif k == "education_details":
|
| 156 |
-
# Similar mapping for education
|
| 157 |
llm_edu = llm_structured[k].get("education")
|
| 158 |
if isinstance(llm_edu, list) and len(llm_edu) > 0:
|
| 159 |
cv_data.education = [
|
| 160 |
EducationItem(
|
| 161 |
-
institution=edu.get("institution"),
|
| 162 |
degree=edu.get("degree"),
|
| 163 |
field=edu.get("field"),
|
| 164 |
start_date=edu.get("start_date"),
|
| 165 |
end_date=edu.get("end_date")
|
| 166 |
) for edu in llm_edu if isinstance(edu, dict)
|
| 167 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
structured_data = cv_data.model_dump()
|
| 170 |
|
| 171 |
-
#
|
| 172 |
if settings.enable_enhanced_skills:
|
| 173 |
skills_extractor = EnhancedSkillsExtractor()
|
| 174 |
experience_parser = ImprovedExperienceParser()
|
| 175 |
cert_extractor = CertificationEnhancement()
|
| 176 |
|
| 177 |
-
|
| 178 |
-
if
|
| 179 |
-
# Convert list of dicts to list of strings for consistency
|
| 180 |
skill_names = []
|
| 181 |
-
for s in
|
| 182 |
if isinstance(s, dict):
|
| 183 |
skill_names.append(s.get("name", ""))
|
| 184 |
else:
|
| 185 |
skill_names.append(str(s))
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
|
|
|
|
|
|
| 189 |
|
| 190 |
enhanced_experience = experience_parser.parse(resume_text)
|
| 191 |
-
if enhanced_experience and len(enhanced_experience) > len(structured_data.get("
|
| 192 |
-
structured_data["
|
| 193 |
{
|
| 194 |
"title": exp.get("title"),
|
| 195 |
"company": exp.get("company"),
|
|
@@ -203,9 +181,32 @@ def process_job(job) -> None:
|
|
| 203 |
if enhanced_certs and len(enhanced_certs) > len(structured_data.get("certifications", [])):
|
| 204 |
structured_data["certifications"] = enhanced_certs
|
| 205 |
|
| 206 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
extraction_suggestions = []
|
| 208 |
-
pd = structured_data.get("personal_details", {})
|
| 209 |
if not pd.get("linkedin"):
|
| 210 |
extraction_suggestions.append("Add a LinkedIn URL to your profile.")
|
| 211 |
if not pd.get("email"):
|
|
@@ -226,42 +227,15 @@ def process_job(job) -> None:
|
|
| 226 |
# Merge static and LLM suggestions
|
| 227 |
match_suggestions = suggestions + (llm_suggestions if isinstance(llm_suggestions, list) else [])
|
| 228 |
|
| 229 |
-
# Generate autofill data
|
| 230 |
-
autofill_data =
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
"company": exp.get("company"),
|
| 239 |
-
"start_date": exp.get("start_date"),
|
| 240 |
-
"end_date": exp.get("end_date"),
|
| 241 |
-
"description": exp.get("description"),
|
| 242 |
-
"location": exp.get("location")
|
| 243 |
-
} for exp in structured_data.get("work_experience", [])
|
| 244 |
-
],
|
| 245 |
-
"education": [
|
| 246 |
-
{
|
| 247 |
-
"degree": edu.get("degree"),
|
| 248 |
-
"university": edu.get("institution"),
|
| 249 |
-
"start_date": edu.get("start_date"),
|
| 250 |
-
"end_date": edu.get("end_date"),
|
| 251 |
-
"field": edu.get("field")
|
| 252 |
-
} for edu in structured_data.get("education", [])
|
| 253 |
-
],
|
| 254 |
-
"skills": structured_data.get("skills", []),
|
| 255 |
-
"certifications": structured_data.get("certifications", []),
|
| 256 |
-
"languages": structured_data.get("languages", [])
|
| 257 |
-
}
|
| 258 |
-
|
| 259 |
-
logger.info(f"Autofill data generated successfully: {len(str(autofill_data))} characters")
|
| 260 |
-
|
| 261 |
-
except Exception as e:
|
| 262 |
-
logger.error(f"Autofill data generation failed: {e}")
|
| 263 |
-
logger.error(f"Structured data keys: {list(structured_data.keys()) if structured_data else 'N/A'}")
|
| 264 |
-
autofill_data = None
|
| 265 |
|
| 266 |
normalized = normalize_analysis_result(
|
| 267 |
analysis_id=str(analysis_id),
|
|
@@ -270,14 +244,19 @@ def process_job(job) -> None:
|
|
| 270 |
component_scores=score_payload.get("component_scores"),
|
| 271 |
evidence=evidence,
|
| 272 |
suggestions=match_suggestions,
|
| 273 |
-
raw_payload={"entities": entities, "skill_matches": skill_matches},
|
| 274 |
-
extraction_metadata={"method": "
|
| 275 |
structured_data=structured_data,
|
| 276 |
extraction_suggestions=extraction_suggestions,
|
| 277 |
interview_questions=interview_questions,
|
| 278 |
)
|
| 279 |
|
| 280 |
-
# Add
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
if autofill_data:
|
| 282 |
normalized["autofill_data"] = autofill_data
|
| 283 |
|
|
|
|
| 73 |
contact_portfolio = m.group(0).rstrip(".,;") if m else None
|
| 74 |
if contact_portfolio in (contact_linkedin, contact_github):
|
| 75 |
contact_portfolio = None
|
| 76 |
+
|
| 77 |
safe_text = strip_pii_for_models(resume_text)
|
| 78 |
|
| 79 |
+
# *** PHASE 1: Enhanced Structured Extraction ***
|
| 80 |
+
# We perform this first so matching and scoring have the best possible data.
|
| 81 |
+
|
| 82 |
+
# Start with simple Pydantic model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
cv_data = StructuredCV(
|
| 84 |
personal_details=PersonalDetails(
|
| 85 |
+
full_name=None,
|
| 86 |
+
email=contact_email,
|
| 87 |
+
phone=contact_phone,
|
| 88 |
+
linkedin=contact_linkedin,
|
| 89 |
+
github=contact_github,
|
| 90 |
+
portfolio=contact_portfolio,
|
|
|
|
|
|
|
| 91 |
),
|
| 92 |
+
professional_summary="",
|
| 93 |
+
work_experience=[],
|
| 94 |
+
education=[],
|
| 95 |
+
skills=[],
|
| 96 |
+
certifications=[],
|
| 97 |
+
languages=[],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
)
|
| 99 |
|
| 100 |
+
# 1. AI Structured Extraction (NuExtract)
|
| 101 |
llm_structured = extract_structured_cv(resume_text)
|
| 102 |
if isinstance(llm_structured, dict):
|
| 103 |
+
# Update Pydantic model with LLM results
|
| 104 |
for k in ("personal_details", "education_details", "professional_details"):
|
| 105 |
if isinstance(llm_structured.get(k), dict):
|
| 106 |
if k == "personal_details":
|
|
|
|
| 108 |
if pv and hasattr(cv_data.personal_details, pk):
|
| 109 |
setattr(cv_data.personal_details, pk, pv)
|
| 110 |
elif k == "professional_details":
|
|
|
|
| 111 |
if llm_structured[k].get("bio"):
|
| 112 |
cv_data.professional_summary = llm_structured[k]["bio"]
|
| 113 |
|
|
|
|
| 122 |
description=exp.get("description")
|
| 123 |
) for exp in llm_exp if isinstance(exp, dict)
|
| 124 |
]
|
| 125 |
+
|
| 126 |
+
if llm_structured[k].get("skills"):
|
| 127 |
+
cv_data.skills = llm_structured[k]["skills"]
|
| 128 |
+
|
| 129 |
elif k == "education_details":
|
|
|
|
| 130 |
llm_edu = llm_structured[k].get("education")
|
| 131 |
if isinstance(llm_edu, list) and len(llm_edu) > 0:
|
| 132 |
cv_data.education = [
|
| 133 |
EducationItem(
|
| 134 |
+
institution=edu.get("university") or edu.get("institution"),
|
| 135 |
degree=edu.get("degree"),
|
| 136 |
field=edu.get("field"),
|
| 137 |
start_date=edu.get("start_date"),
|
| 138 |
end_date=edu.get("end_date")
|
| 139 |
) for edu in llm_edu if isinstance(edu, dict)
|
| 140 |
]
|
| 141 |
+
if llm_structured[k].get("certifications"):
|
| 142 |
+
cv_data.certifications = llm_structured[k]["certifications"]
|
| 143 |
+
if llm_structured[k].get("languages"):
|
| 144 |
+
cv_data.languages = llm_structured[k]["languages"]
|
| 145 |
|
| 146 |
structured_data = cv_data.model_dump()
|
| 147 |
|
| 148 |
+
# 2. Pattern Matching & Domain-Specific Extractors
|
| 149 |
if settings.enable_enhanced_skills:
|
| 150 |
skills_extractor = EnhancedSkillsExtractor()
|
| 151 |
experience_parser = ImprovedExperienceParser()
|
| 152 |
cert_extractor = CertificationEnhancement()
|
| 153 |
|
| 154 |
+
enhanced_skills_objs = skills_extractor.extract_skills(resume_text)
|
| 155 |
+
if enhanced_skills_objs:
|
|
|
|
| 156 |
skill_names = []
|
| 157 |
+
for s in enhanced_skills_objs:
|
| 158 |
if isinstance(s, dict):
|
| 159 |
skill_names.append(s.get("name", ""))
|
| 160 |
else:
|
| 161 |
skill_names.append(str(s))
|
| 162 |
|
| 163 |
+
# Merge and deduplicate
|
| 164 |
+
current_skills = set(structured_data.get("skills", []))
|
| 165 |
+
current_skills.update(skill_names)
|
| 166 |
+
structured_data["skills"] = list(filter(None, current_skills))
|
| 167 |
|
| 168 |
enhanced_experience = experience_parser.parse(resume_text)
|
| 169 |
+
if enhanced_experience and len(enhanced_experience) > len(structured_data.get("work_experience", [])):
|
| 170 |
+
structured_data["work_experience"] = [
|
| 171 |
{
|
| 172 |
"title": exp.get("title"),
|
| 173 |
"company": exp.get("company"),
|
|
|
|
| 181 |
if enhanced_certs and len(enhanced_certs) > len(structured_data.get("certifications", [])):
|
| 182 |
structured_data["certifications"] = enhanced_certs
|
| 183 |
|
| 184 |
+
# *** PHASE 2: Matching and Scoring using Unified Data ***
|
| 185 |
+
|
| 186 |
+
entities = parse_entities(safe_text)
|
| 187 |
+
# Force structured results into entities for scoring
|
| 188 |
+
final_skills = structured_data.get("skills", [])
|
| 189 |
+
if not final_skills:
|
| 190 |
+
# Fallback to simple NER if enhanced failed
|
| 191 |
+
final_skills = entities.get("skills", [])
|
| 192 |
+
|
| 193 |
+
skill_matches = match_skills_to_job(final_skills, job.job_description)
|
| 194 |
+
|
| 195 |
+
required = extract_required_skills_from_job(job.job_description)
|
| 196 |
+
matched_set = {m["skill"].lower() for m in skill_matches if m.get("skill")}
|
| 197 |
+
missing = [s for s in required if s.lower() not in matched_set]
|
| 198 |
+
|
| 199 |
+
# Use improved entities for scoring
|
| 200 |
+
scoring_entities = entities.copy()
|
| 201 |
+
scoring_entities["skills"] = final_skills
|
| 202 |
+
scoring_entities.setdefault("professional_details", {})["experience"] = structured_data.get("work_experience", [])
|
| 203 |
+
|
| 204 |
+
score_payload = score_components(scoring_entities, skill_matches, resume_text)
|
| 205 |
+
suggestions = generate_feedback_list(scoring_entities, resume_text, score_payload, missing)
|
| 206 |
+
|
| 207 |
+
# Simple extraction suggestions
|
| 208 |
extraction_suggestions = []
|
| 209 |
+
pd = structured_data.get("personal_details", {})
|
| 210 |
if not pd.get("linkedin"):
|
| 211 |
extraction_suggestions.append("Add a LinkedIn URL to your profile.")
|
| 212 |
if not pd.get("email"):
|
|
|
|
| 227 |
# Merge static and LLM suggestions
|
| 228 |
match_suggestions = suggestions + (llm_suggestions if isinstance(llm_suggestions, list) else [])
|
| 229 |
|
| 230 |
+
# Generate autofill data
|
| 231 |
+
autofill_data = {
|
| 232 |
+
"personal": structured_data.get("personal_details", {}),
|
| 233 |
+
"experience": structured_data.get("work_experience", []),
|
| 234 |
+
"education": structured_data.get("education", []),
|
| 235 |
+
"skills": structured_data.get("skills", []),
|
| 236 |
+
"certifications": structured_data.get("certifications", []),
|
| 237 |
+
"languages": structured_data.get("languages", [])
|
| 238 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
normalized = normalize_analysis_result(
|
| 241 |
analysis_id=str(analysis_id),
|
|
|
|
| 244 |
component_scores=score_payload.get("component_scores"),
|
| 245 |
evidence=evidence,
|
| 246 |
suggestions=match_suggestions,
|
| 247 |
+
raw_payload={"entities": entities, "skill_matches": skill_matches, "cv_text": resume_text},
|
| 248 |
+
extraction_metadata={"method": "hybrid_ocr_extraction", "confidence": None, "pages": None, "has_scanned_content": False},
|
| 249 |
structured_data=structured_data,
|
| 250 |
extraction_suggestions=extraction_suggestions,
|
| 251 |
interview_questions=interview_questions,
|
| 252 |
)
|
| 253 |
|
| 254 |
+
# 🔥 COMPATIBILITY SHIM: Add top-level fields for user test scripts
|
| 255 |
+
normalized["skills"] = final_skills
|
| 256 |
+
normalized["experience"] = structured_data.get("work_experience", [])
|
| 257 |
+
normalized["certifications"] = structured_data.get("certifications", [])
|
| 258 |
+
normalized["personal_details"] = structured_data.get("personal_details", {})
|
| 259 |
+
|
| 260 |
if autofill_data:
|
| 261 |
normalized["autofill_data"] = autofill_data
|
| 262 |
|