Commit 83c9f71 by Dzunisani007
Parent(s): 0e6e3fe

Fix Hugging Face deployment issues


🔧 Critical Fixes:
- Update Dockerfile and Dockerfile.hf-ocr to install the South African Tesseract language packs
- Fix main.py startup so SQLite tables are auto-created during local dev
- Update models.py so the cv_analyser schema is only applied on non-SQLite databases
- Fix OCR service (multi-language OCR) and structured extraction (flat vs. nested LLM output)
- Fix pipeline status completion

📋 Testing Tools:
- Add OCR dependency verification (check_ocr_deps.py)
- Add extraction diagnosis tools (diagnose_extraction.py, verify_fix.py)
- Add PDF peek utility (peek_pdfs.py)
- Add pipeline testing script (test_full_pipeline_v2.py)

Dockerfile CHANGED
@@ -6,6 +6,16 @@ RUN apt-get update && apt-get install -y \
     curl \
     tesseract-ocr \
     tesseract-ocr-eng \
+    tesseract-ocr-afr \
+    tesseract-ocr-zul \
+    tesseract-ocr-xho \
+    tesseract-ocr-nso \
+    tesseract-ocr-sot \
+    tesseract-ocr-tsn \
+    tesseract-ocr-ssw \
+    tesseract-ocr-ven \
+    tesseract-ocr-tso \
+    tesseract-ocr-nbl \
     tesseract-ocr-osd \
     poppler-utils \
     libtesseract-dev \
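
The ten added packs cover the official South African languages alongside the existing eng pack. A quick sanity check once the image is built (a minimal sketch, not part of the commit; assumes pytesseract >= 0.3.8 is installed in the container):

# sketch: confirm the new language packs are visible to Tesseract inside the built image
import pytesseract

langs = set(pytesseract.get_languages(config=""))
expected = {"eng", "afr", "zul", "xho", "nso", "sot", "tsn", "ssw", "ven", "tso", "nbl"}
print("available:", sorted(langs))
print("missing:", sorted(expected - langs) or "none")
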
Dockerfile.hf-ocr CHANGED
@@ -5,6 +5,16 @@ RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     tesseract-ocr \
+    tesseract-ocr-afr \
+    tesseract-ocr-zul \
+    tesseract-ocr-xho \
+    tesseract-ocr-nso \
+    tesseract-ocr-sot \
+    tesseract-ocr-tsn \
+    tesseract-ocr-ssw \
+    tesseract-ocr-ven \
+    tesseract-ocr-tso \
+    tesseract-ocr-nbl \
     libtesseract-dev \
     poppler-utils \
     libgl1-mesa-glx \
app/main.py CHANGED
@@ -53,6 +53,19 @@ def root():
 @app.on_event("startup")
 def _startup() -> None:
     init_session_factory()
+
+    # Auto-create tables for SQLite during local dev
+    if settings.database_url.startswith("sqlite"):
+        try:
+            from app.db import get_engine, Base
+            from app.models import CVRecord, CVAnalysis, ResumeSkill, ResumeScore, AuditLog, WorkflowAuditLog  # Ensure models are registered
+            engine = get_engine()
+            Base.metadata.create_all(bind=engine)
+            import logging
+            logging.getLogger(__name__).info("SQLite tables created/verified successfully")
+        except Exception as e:
+            import logging
+            logging.getLogger(__name__).error(f"Failed to create SQLite tables: {e}")
 
     # Fail fast if the worker pipeline cannot be imported (prevents silent worker failures).
     try:
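
Base.metadata.create_all only creates tables that do not already exist, so the hook is safe to re-run on every startup. A quick way to confirm the tables landed in the local SQLite file (a sketch, not part of the commit; the cv_analyser.db filename is an assumption, use whatever the sqlite:/// URL points at):

# sketch: list the tables the startup hook should have created in the local SQLite file
import sqlite3

conn = sqlite3.connect("cv_analyser.db")  # hypothetical filename; match your sqlite:/// URL
rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name").fetchall()
print([name for (name,) in rows])  # expect cv_records, cv_analyses, cv_resume_skills, ...
conn.close()
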
app/models.py CHANGED
@@ -11,7 +11,9 @@ from app.db import Base
 class CVRecord(Base):
     """Stores raw CV text for analysis (no file storage)."""
     __tablename__ = "cv_records"
-    __table_args__ = {"schema": "cv_analyser"}
+    __table_args__ = (
+        {"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
+    )
 
     id: Mapped[uuid.UUID] = mapped_column(
         sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
@@ -30,7 +32,9 @@ class CVRecord(Base):
 class CVAnalysis(Base):
     """Analysis result for a CV record."""
     __tablename__ = "cv_analyses"
-    __table_args__ = {"schema": "cv_analyser"}
+    __table_args__ = (
+        {"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
+    )
 
     id: Mapped[uuid.UUID] = mapped_column(
         sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
@@ -63,7 +67,9 @@ class CVAnalysis(Base):
 
 class ResumeSkill(Base):
     __tablename__ = "cv_resume_skills"
-    __table_args__ = {"schema": "cv_analyser"}
+    __table_args__ = (
+        {"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
+    )
 
     id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
     resume_id: Mapped[uuid.UUID] = mapped_column(
@@ -77,7 +83,9 @@ class ResumeSkill(Base):
 
 class ResumeScore(Base):
     __tablename__ = "cv_resume_scores"
-    __table_args__ = {"schema": "cv_analyser"}
+    __table_args__ = (
+        {"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
+    )
 
     id: Mapped[uuid.UUID] = mapped_column(
         sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
@@ -94,7 +102,9 @@ class ResumeScore(Base):
 
 class AuditLog(Base):
     __tablename__ = "cv_audit_logs"
-    __table_args__ = {"schema": "cv_analyser"}
+    __table_args__ = (
+        {"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
+    )
 
     id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
     entity_type: Mapped[str | None] = mapped_column(Text, nullable=True)
@@ -108,7 +118,9 @@ class AuditLog(Base):
 class WorkflowAuditLog(Base):
     """Audit log for Risk Gate workflow progression."""
     __tablename__ = "cv_workflow_audit_logs"
-    __table_args__ = {"schema": "cv_analyser"}
+    __table_args__ = (
+        {"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
+    )
 
     id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
     analysis_id: Mapped[uuid.UUID] = mapped_column(
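
The same conditional is now repeated in every model. One way to keep the schema switch in a single place (a sketch, not part of this commit; assumes settings is importable in app/models.py) would be a small module-level helper:

# sketch (not part of this commit): keep the schema switch in one helper
def _schema_args() -> dict:
    """cv_analyser schema on Postgres, no schema on SQLite."""
    return {} if settings.database_url.startswith("sqlite") else {"schema": "cv_analyser"}

# each model would then declare:
#     __table_args__ = _schema_args()
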
app/services/ocr_service.py CHANGED
@@ -108,10 +108,12 @@ class OCRService:
             # Preprocess image for better OCR
             processed_image = self._preprocess_image(image)
 
-            # Extract text using Tesseract
+            # Extract text using Tesseract with South African language support
+            langs = 'eng+afr+zul+xho+nso+sot+tsn+ssw+ven+tso+nbl'
             page_text = pytesseract.image_to_string(
                 processed_image,
-                config=self.tesseract_config
+                config=self.tesseract_config,
+                lang=langs
             )
 
             if page_text.strip():
@@ -169,9 +171,11 @@ class OCRService:
             image = Image.open(file_path)
             processed_image = self._preprocess_image(image)
 
+            langs = 'eng+afr+zul+xho+nso+sot+tsn+ssw+ven+tso+nbl'
             raw_text = pytesseract.image_to_string(
                 processed_image,
-                config=self.tesseract_config
+                config=self.tesseract_config,
+                lang=langs
            )
 
             return self._clean_ocr_text(raw_text)
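
pytesseract passes lang straight through to the tesseract CLI, where 'eng+afr+zul' means all listed traineddata files are loaded for the same recognition pass. A standalone sketch of the call as used above (assumes the language packs added in the Dockerfile are installed; the input filename is hypothetical):

# sketch: multi-language OCR on a single image, mirroring the change above
from PIL import Image
import pytesseract

image = Image.open("sample_cv_page.png")  # hypothetical input file
text = pytesseract.image_to_string(
    image,
    lang="eng+afr+zul+xho+nso+sot+tsn+ssw+ven+tso+nbl",
)
print(text[:200])

Loading all eleven packs on every page does add per-call overhead, so trimming the list may be worthwhile if the documents are known to use only a subset of these languages.
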
app/services/structured_extraction.py CHANGED
@@ -150,10 +150,13 @@ def extract_structured_cv_external(resume_text: str) -> dict[str, Any] | None:
             return None
 
         if not _looks_like_structured_data(parsed):
+            logger.warning("External HF API returned JSON but it doesn't look like a CV: %s", str(parsed)[:500])
             return None
 
-        logger.info("External HF API extraction successful")
-        return parsed
+        # Robust normalization for AI variations (flat vs nested)
+        normalized_data = _normalize_extracted_json(parsed)
+        logger.info("External HF API extraction successful (normalized)")
+        return normalized_data
 
     except Exception as e:
         logger.error("External HF API extraction failed: %r", e)
@@ -200,12 +203,52 @@ def _parse_first_json_object(text: str) -> Any:
 
 
 def _looks_like_structured_data(data: Any) -> bool:
-    """Check if data looks like our structured CV schema."""
+    """Check if data has properties of a CV (nested or flat)."""
     if not isinstance(data, dict):
         return False
 
-    required_keys = {"personal_details", "education_details", "professional_details"}
-    return required_keys.issubset(data.keys())
+    # Check for presence of key CV indicators anywhere in the keys or nested keys
+    cv_indicators = {"skills", "experience", "education", "full_name", "work_experience"}
+
+    # Check top level
+    if any(k in data for k in cv_indicators):
+        return True
+
+    # Check nested level 1
+    for v in data.values():
+        if isinstance(v, dict):
+            if any(k in v for k in cv_indicators):
+                return True
+
+    return False
+
+
+def _normalize_extracted_json(data: dict) -> dict:
+    """Ensure the JSON matches our expected nested schema even if the LLM flattens it."""
+    out = {
+        "personal_details": data.get("personal_details", {}),
+        "education_details": data.get("education_details", {"education": [], "certifications": [], "languages": []}),
+        "professional_details": data.get("professional_details", {"skills": [], "experience": [], "bio": ""}),
+    }
+
+    # If LLM flattened personal details
+    for k in ["full_name", "email", "phone", "linkedin", "github", "portfolio", "address", "dob"]:
+        if k in data and k not in out["personal_details"]:
+            out["personal_details"][k] = data[k]
+
+    # If LLM flattened professional details
+    for k in ["skills", "experience", "work_experience", "bio", "position"]:
+        if k in data and k not in out["professional_details"]:
+            target_key = "experience" if k == "work_experience" else k
+            out["professional_details"][target_key] = data[k]
+
+    # If LLM flattened education
+    for k in ["education", "certifications", "languages", "university", "degrees"]:
+        if k in data and k not in out["education_details"]:
+            target_key = "education" if k in ["university", "degrees"] else k
+            out["education_details"][target_key] = data[k]
+
+    return out
 
 
 def get_extraction_status() -> dict[str, Any]:
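
For example, a flat LLM response is folded back into the nested schema as follows (an illustrative sketch based on the code above; the input values are made up):

# sketch: how a flat LLM response is folded back into the nested schema
from app.services.structured_extraction import _normalize_extracted_json

flat = {
    "full_name": "Jane Doe",
    "email": "jane@example.com",
    "work_experience": [{"company": "Acme", "title": "Analyst"}],
}
normalized = _normalize_extracted_json(flat)
# normalized["personal_details"] -> {"full_name": "Jane Doe", "email": "jane@example.com"}
# normalized["professional_details"]["experience"] -> [{"company": "Acme", "title": "Analyst"}]
# note: a flat "skills" or "education" key is not copied, because those keys already exist
# in the default sub-dicts and the copy only runs when the target key is absent
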
app/tasks/pipeline.py CHANGED
@@ -97,51 +97,88 @@ def process_job(job) -> None:
         languages=[],
     )
 
-    # 1. AI Structured Extraction (NuExtract)
+    # 1. AI Structured Extraction (NuExtract/LLM)
     llm_structured = extract_structured_cv(resume_text)
     if isinstance(llm_structured, dict):
-        # Update Pydantic model with LLM results
-        for k in ("personal_details", "education_details", "professional_details"):
-            if isinstance(llm_structured.get(k), dict):
-                if k == "personal_details":
-                    for pk, pv in llm_structured[k].items():
-                        if pv and hasattr(cv_data.personal_details, pk):
-                            setattr(cv_data.personal_details, pk, pv)
-                elif k == "professional_details":
-                    if llm_structured[k].get("bio"):
-                        cv_data.professional_summary = llm_structured[k]["bio"]
-
-                    llm_exp = llm_structured[k].get("experience")
-                    if isinstance(llm_exp, list) and len(llm_exp) > 0:
-                        cv_data.work_experience = [
-                            WorkExperienceItem(
-                                company=exp.get("company"),
-                                title=exp.get("title"),
-                                start_date=exp.get("start_date"),
-                                end_date=exp.get("end_date"),
-                                description=exp.get("description")
-                            ) for exp in llm_exp if isinstance(exp, dict)
-                        ]
-
-                    if llm_structured[k].get("skills"):
-                        cv_data.skills = llm_structured[k]["skills"]
-
-                elif k == "education_details":
-                    llm_edu = llm_structured[k].get("education")
-                    if isinstance(llm_edu, list) and len(llm_edu) > 0:
-                        cv_data.education = [
-                            EducationItem(
-                                institution=edu.get("university") or edu.get("institution"),
-                                degree=edu.get("degree"),
-                                field=edu.get("field"),
-                                start_date=edu.get("start_date"),
-                                end_date=edu.get("end_date")
-                            ) for edu in llm_edu if isinstance(edu, dict)
-                        ]
-                    if llm_structured[k].get("certifications"):
-                        cv_data.certifications = llm_structured[k]["certifications"]
-                    if llm_structured[k].get("languages"):
-                        cv_data.languages = llm_structured[k]["languages"]
+        # Function to safely get data from nested or flat structure
+        def get_data(keys, default=None):
+            # Try absolute nested path first
+            curr = llm_structured
+            found_nested = True
+            for k in keys:
+                if isinstance(curr, dict) and k in curr:
+                    curr = curr[k]
+                else:
+                    found_nested = False
+                    break
+            if found_nested:
+                return curr
+
+            # Try flat key (last item in the key path) if it exists at top level
+            flat_key = keys[-1]
+            if flat_key in llm_structured:
+                return llm_structured[flat_key]
+
+            # Special cases for common alternate names
+            if flat_key == "education" and "education_details" in llm_structured:
+                return llm_structured["education_details"].get("education")
+            if flat_key == "skills" and "professional_details" in llm_structured:
+                return llm_structured["professional_details"].get("skills")
+            if flat_key in ["experience", "work_experience"] and "professional_details" in llm_structured:
+                return llm_structured["professional_details"].get("experience") or llm_structured["professional_details"].get("work_experience")
+
+            return default
+
+        # Update Personal Details
+        pd_src = get_data(["personal_details"])
+        if isinstance(pd_src, dict):
+            for pk, pv in pd_src.items():
+                if pv and hasattr(cv_data.personal_details, pk):
+                    setattr(cv_data.personal_details, pk, pv)
+        else:
+            # Handle flat personal details
+            for pk in ["full_name", "email", "phone", "linkedin", "github", "portfolio"]:
+                val = llm_structured.get(pk)
+                if val and hasattr(cv_data.personal_details, pk):
+                    setattr(cv_data.personal_details, pk, val)
+
+        # Update Summary/Bio
+        cv_data.professional_summary = get_data(["professional_details", "bio"]) or llm_structured.get("professional_summary") or llm_structured.get("bio") or ""
+
+        # Update Experience
+        llm_exp = get_data(["professional_details", "experience"]) or llm_structured.get("experience") or llm_structured.get("work_experience")
+        if isinstance(llm_exp, list):
+            cv_data.work_experience = [
+                WorkExperienceItem(
+                    company=exp.get("company"),
+                    title=exp.get("title"),
+                    start_date=exp.get("start_date"),
+                    end_date=exp.get("end_date"),
+                    description=exp.get("description") or exp.get("responsibilities")
+                ) for exp in llm_exp if isinstance(exp, dict)
+            ]
+
+        # Update Skills
+        llm_skills = get_data(["professional_details", "skills"]) or llm_structured.get("skills")
+        if isinstance(llm_skills, list):
+            cv_data.skills = [str(s) for s in llm_skills if s]
+
+        # Update Education
+        llm_edu = get_data(["education_details", "education"]) or llm_structured.get("education")
+        if isinstance(llm_edu, list):
+            cv_data.education = [
+                EducationItem(
+                    institution=edu.get("university") or edu.get("institution") or edu.get("school"),
+                    degree=edu.get("degree"),
+                    field=edu.get("field") or edu.get("major"),
+                    start_date=edu.get("start_date"),
+                    end_date=edu.get("end_date")
+                ) for edu in llm_edu if isinstance(edu, dict)
+            ]
+
+        # Update Others
+        cv_data.certifications = get_data(["education_details", "certifications"]) or llm_structured.get("certifications") or []
+        cv_data.languages = get_data(["education_details", "languages"]) or llm_structured.get("languages") or []
 
     structured_data = cv_data.model_dump()
 
check_ocr_deps.py ADDED
@@ -0,0 +1,30 @@
+import subprocess
+import shutil
+
+def check_env():
+    print("Checking for Tesseract...")
+    tess_path = shutil.which("tesseract")
+    if tess_path:
+        print(f"Tesseract found: {tess_path}")
+        try:
+            res = subprocess.run(["tesseract", "--version"], capture_output=True, text=True)
+            print(res.stdout)
+        except Exception as e:
+            print(f"Error running tesseract: {e}")
+    else:
+        print("Tesseract NOT found in PATH")
+
+    print("\nChecking for pdftoppm (poppler)...")
+    poppler_path = shutil.which("pdftoppm")
+    if poppler_path:
+        print(f"pdftoppm found: {poppler_path}")
+        try:
+            res = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
+            print(res.stderr)  # pdftoppm prints version to stderr
+        except Exception as e:
+            print(f"Error running pdftoppm: {e}")
+    else:
+        print("pdftoppm NOT found in PATH")
+
+if __name__ == "__main__":
+    check_env()
diagnose_extraction.py ADDED
@@ -0,0 +1,114 @@
+import os
+import sys
+import json
+from pathlib import Path
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Add the app directory to the path to allow imports
+sys.path.append(str(Path(__file__).parent))
+
+from app.services.ocr_service import OCRService
+from app.services.local_nuextract_service import extract_cv_structured_local
+from app.services.enhanced_skills_extractor import EnhancedSkillsExtractor
+
+def diagnose(file_path):
+    print(f"\n{'='*80}")
+    print(f"DIAGNOSING: {os.path.basename(file_path)}")
+    print(f"{'='*80}")
+
+    ocr = OCRService()
+
+    # 1. Text Extraction Stage
+    print("\n1. [TEXT EXTRACTION]")
+    try:
+        import pdfplumber
+        print(f"pdfplumber version: {pdfplumber.__version__}")
+
+        with pdfplumber.open(file_path) as pdf:
+            print(f"PDF pages: {len(pdf.pages)}")
+            print(f"PDF metadata: {pdf.metadata}")
+
+        native_text = ocr._native_pdf_extraction(file_path)
+        print(f"Native extraction: {len(native_text)} characters.")
+
+        is_scanned = ocr._is_scanned_document(native_text)
+        print(f"Is scanned (density < {ocr.min_text_density}): {is_scanned}")
+
+        final_text = ""
+        if is_scanned:
+            print("Triggering OCR...")
+            final_text = ocr._ocr_pdf_extraction(file_path)
+            print(f"OCR result: {len(final_text)} characters.")
+        else:
+            final_text = native_text
+
+        if len(final_text) < 100:
+            print(f"WARNING: Very low text quality detected: {repr(final_text[:100])}")
+        else:
+            print(f"Sample text (first 500 characters):\n{final_text[:500]}")
+
+    except Exception as e:
+        print(f"Extraction failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return
+
+    # 2. Structured Extraction Stage (AI)
+    print("\n2. [STRUCTURED EXTRACTION - AI]")
+    try:
+        if len(final_text) < 50:
+            print("Skipping AI extraction due to low text quality.")
+            structured = None
+        else:
+            print("Running NuExtract (local)...")
+            structured = extract_cv_structured_local(final_text)
+            if structured:
+                print("AI extraction SUCCESS.")
+                print(f"Skills found: {len(structured.get('skills', []))}")
+                print(f"Experience entries: {len(structured.get('experience', []))}")
+                print(f"Education entries: {len(structured.get('education', []))}")
+                print(f"Personal Info: {structured.get('personal_details', {})}")
+            else:
+                print("AI extraction returned NONE.")
+    except Exception as e:
+        print(f"AI extraction failed: {e}")
+        structured = None
+
+    # 3. Enhanced Matching Stage (Regex)
+    print("\n3. [ENHANCED MATCHING - REGEX]")
+    try:
+        skills_extractor = EnhancedSkillsExtractor()
+        enhanced_skills = skills_extractor.extract_skills(final_text)
+        print(f"Regex skills found: {len(enhanced_skills)}")
+        if enhanced_skills:
+            # Group by category for better overview
+            categories = {}
+            for s in enhanced_skills:
+                cat = s.get('category', 'other')
+                if cat not in categories: categories[cat] = []
+                categories[cat].append(s['name'])
+
+            for cat, names in categories.items():
+                print(f"  [{cat.upper()}]: {', '.join(names[:10])}")
+    except Exception as e:
+        print(f"Regex matching failed: {e}")
+
+    print(f"\n{'='*80}")
+    print("DIAGNOSIS COMPLETE")
+    print(f"{'='*80}")
+
+if __name__ == "__main__":
+    # Test on multiple files to see different behaviors
+    test_pdfs = [
+        "C:\\Users\\User\\CascadeProjects\\cv-analyser-backend\\cv-analyser\\Untitled document (3).pdf",
+        "C:\\Users\\User\\CascadeProjects\\cv-analyser-backend\\cv-analyser\\Dzunisani Data Analyst cv.pdf"
+    ]
+    for pdf in test_pdfs:
+        if os.path.exists(pdf):
+            diagnose(pdf)
+        else:
+            print(f"File not found: {pdf}")
peek_pdfs.py ADDED
@@ -0,0 +1,42 @@
+import os
+import sys
+from pathlib import Path
+
+# Add the app directory to the path to allow imports
+sys.path.append(str(Path(__file__).parent))
+
+from app.services.ocr_service import OCRService
+
+def test_file_extraction(file_path):
+    print(f"\n[{'='*20}] Testing extraction for: {os.path.basename(file_path)}")
+    ocr = OCRService()
+    try:
+        # To see what happened, let's call the internal methods
+        native_text = ocr._native_pdf_extraction(file_path)
+        print(f"Native extraction: {len(native_text)} chars.")
+        if len(native_text) < 100:
+            print(f"Native result snippet: {native_text[:100]!r}")
+            print("Triggering OCR fallback...")
+            ocr_text = ocr._ocr_pdf_extraction(file_path)
+            print(f"OCR extraction: {len(ocr_text)} chars.")
+            print(f"OCR result snippet: {ocr_text[:500]!r}")
+        else:
+            print(f"Native extraction sufficient. First 500 chars:\n{native_text[:500]}")
+
+    except Exception as e:
+        print(f"Extraction failed: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    files = [
+        "Untitled document (3).pdf",
+        "Dzunisani Data Analyst cv.pdf",
+        "KRM N6 (1).pdf"
+    ]
+    for f in files:
+        path = os.path.join("C:\\Users\\User\\CascadeProjects\\cv-analyser-backend\\cv-analyser", f)
+        if os.path.exists(path):
+            test_file_extraction(path)
+        else:
+            print(f"File not found: {path}")
test_full_pipeline_v2.py ADDED
@@ -0,0 +1,92 @@
+import sys
+import os
+import json
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+
+# Add the cv-analyser app to the path
+sys.path.append("/mnt/c/Users/User/CascadeProjects/cv-analyser-backend/cv-analyser")
+
+try:
+    from app.tasks.pipeline import process_job
+    from app.models import CVAnalysis, CVRecord
+    from app.db import session_scope
+    import uuid
+
+    # Mock Job object
+    class MockJob:
+        def __init__(self, analysis_id, resume_id, job_description):
+            self.analysis_id = str(analysis_id)
+            self.resume_id = str(resume_id)
+            self.job_description = job_description
+
+    # Test file
+    test_pdf = "/mnt/c/Users/User/CascadeProjects/cv-analyser-backend/cv-analyser/Untitled document (3).pdf"
+
+    with session_scope() as db:
+        # Create a test record
+        record_id = uuid.uuid4()
+        analysis_id = uuid.uuid4()
+
+        # Read the file
+        from app.services.ocr_service import OCRService
+        ocr = OCRService()
+        raw_text = ocr.extract_text(test_pdf, "pdf")
+
+        record = CVRecord(
+            id=record_id,
+            cv_text=raw_text
+        )
+        db.add(record)
+
+        analysis = CVAnalysis(
+            id=analysis_id,
+            record_id=record_id,
+            status="pending",
+            job_description="Python Developer with Flask and AWS experience"
+        )
+        db.add(analysis)
+        db.commit()
+
+        print(f"Created Test Analysis: {analysis_id}")
+
+        # Run the pipeline
+        job = MockJob(analysis_id, record_id, analysis.job_description)
+        process_job(job)
+
+        # Fetch the result
+        db.expire_all()
+        result_analysis = db.get(CVAnalysis, analysis_id)
+        res = result_analysis.result
+
+        print("\n--- FINAL RESULT INSPECTION ---")
+        print(f"Status: {result_analysis.status}")
+        print(f"Overall Score: {result_analysis.overall_score}")
+
+        # Check for top-level keys
+        print(f"\nTop-level keys in response: {list(res.keys())}")
+
+        skills = res.get("skills", [])
+        print(f"Top-level skills count: {len(skills)}")
+        if skills:
+            print(f"Sample skills: {skills[:5]}")
+
+        experience = res.get("experience", [])
+        print(f"Top-level experience count: {len(experience)}")
+
+        # Check structured_data
+        struct = res.get("structured_data", {})
+        struct_skills = struct.get("skills", [])
+        print(f"Structured skills count: {len(struct_skills)}")
+
+        if len(skills) == len(struct_skills) and len(skills) > 0:
+            print("\n✅ SUCCESS: Top-level and structured data are in sync!")
+        else:
+            print("\n❌ WARNING: Data desync or no data found.")
+
+except Exception as e:
+    print(f"Error: {e}")
+    import traceback
+    traceback.print_exc()
verify_fix.py ADDED
@@ -0,0 +1,42 @@
+import sys
+import os
+import logging
+
+# Configure logging to see OCR details
+logging.basicConfig(level=logging.INFO)
+
+# Add the cv-analyser app to the path
+sys.path.append("/mnt/c/Users/User/CascadeProjects/cv-analyser-backend/cv-analyser")
+
+try:
+    from app.services.ocr_service import OCRService
+    from app.services.enhanced_skills_extractor import EnhancedSkillsExtractor
+
+    ocr = OCRService()
+    test_file = "/mnt/c/Users/User/CascadeProjects/cv-analyser-backend/cv-analyser/Untitled document (3).pdf"
+
+    if not os.path.exists(test_file):
+        print(f"Error: Test file not found at {test_file}")
+        sys.exit(1)
+
+    print(f"Testing extraction on: {test_file}")
+    text = ocr._extract_from_pdf(test_file)
+
+    print(f"\n--- EXTRACTION RESULT ---")
+    print(f"Characters: {len(text)}")
+    if len(text) > 0:
+        print(f"First 500 chars: {repr(text[:500])}")
+
+        # Skill test
+        extractor = EnhancedSkillsExtractor()
+        skills = extractor.extract_skills(text)
+        print(f"Skills found: {len(skills)}")
+        if skills:
+            print(f"Sample skills: {[s.get('name') for s in skills[:10]]}")
+    else:
+        print("FAIL: No text extracted")
+
+except Exception as e:
+    print(f"Error during verification: {e}")
+    import traceback
+    traceback.print_exc()