Spaces:
Running
Running
Commit ·
83c9f71
1
Parent(s): 0e6e3fe
Fix Hugging Face deployment issues
Browse files🔧 Critical Fixes:
- Update Dockerfile and Dockerfile.hf-ocr
- Fix main.py startup and model loading
- Update models.py for better database handling
- Fix OCR service and structured extraction
- Fix pipeline status completion
📋 Testing Tools:
- Add OCR dependency verification
- Add extraction diagnosis tools
- Add PDF peek utility
- Add pipeline testing script
- Dockerfile +10 -0
- Dockerfile.hf-ocr +10 -0
- app/main.py +13 -0
- app/models.py +18 -6
- app/services/ocr_service.py +7 -3
- app/services/structured_extraction.py +48 -5
- app/tasks/pipeline.py +80 -43
- check_ocr_deps.py +30 -0
- diagnose_extraction.py +114 -0
- peek_pdfs.py +42 -0
- test_full_pipeline_v2.py +92 -0
- verify_fix.py +42 -0
Dockerfile
CHANGED
|
@@ -6,6 +6,16 @@ RUN apt-get update && apt-get install -y \
|
|
| 6 |
curl \
|
| 7 |
tesseract-ocr \
|
| 8 |
tesseract-ocr-eng \
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
tesseract-ocr-osd \
|
| 10 |
poppler-utils \
|
| 11 |
libtesseract-dev \
|
|
|
|
| 6 |
curl \
|
| 7 |
tesseract-ocr \
|
| 8 |
tesseract-ocr-eng \
|
| 9 |
+
tesseract-ocr-afr \
|
| 10 |
+
tesseract-ocr-zul \
|
| 11 |
+
tesseract-ocr-xho \
|
| 12 |
+
tesseract-ocr-nso \
|
| 13 |
+
tesseract-ocr-sot \
|
| 14 |
+
tesseract-ocr-tsn \
|
| 15 |
+
tesseract-ocr-ssw \
|
| 16 |
+
tesseract-ocr-ven \
|
| 17 |
+
tesseract-ocr-tso \
|
| 18 |
+
tesseract-ocr-nbl \
|
| 19 |
tesseract-ocr-osd \
|
| 20 |
poppler-utils \
|
| 21 |
libtesseract-dev \
|
Dockerfile.hf-ocr
CHANGED
|
@@ -5,6 +5,16 @@ RUN apt-get update && apt-get install -y \
|
|
| 5 |
build-essential \
|
| 6 |
curl \
|
| 7 |
tesseract-ocr \
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
libtesseract-dev \
|
| 9 |
poppler-utils \
|
| 10 |
libgl1-mesa-glx \
|
|
|
|
| 5 |
build-essential \
|
| 6 |
curl \
|
| 7 |
tesseract-ocr \
|
| 8 |
+
tesseract-ocr-afr \
|
| 9 |
+
tesseract-ocr-zul \
|
| 10 |
+
tesseract-ocr-xho \
|
| 11 |
+
tesseract-ocr-nso \
|
| 12 |
+
tesseract-ocr-sot \
|
| 13 |
+
tesseract-ocr-tsn \
|
| 14 |
+
tesseract-ocr-ssw \
|
| 15 |
+
tesseract-ocr-ven \
|
| 16 |
+
tesseract-ocr-tso \
|
| 17 |
+
tesseract-ocr-nbl \
|
| 18 |
libtesseract-dev \
|
| 19 |
poppler-utils \
|
| 20 |
libgl1-mesa-glx \
|
app/main.py
CHANGED
|
@@ -53,6 +53,19 @@ def root():
|
|
| 53 |
@app.on_event("startup")
|
| 54 |
def _startup() -> None:
|
| 55 |
init_session_factory()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
# Fail fast if the worker pipeline cannot be imported (prevents silent worker failures).
|
| 58 |
try:
|
|
|
|
| 53 |
@app.on_event("startup")
|
| 54 |
def _startup() -> None:
|
| 55 |
init_session_factory()
|
| 56 |
+
|
| 57 |
+
# Auto-create tables for SQLite during local dev
|
| 58 |
+
if settings.database_url.startswith("sqlite"):
|
| 59 |
+
try:
|
| 60 |
+
from app.db import get_engine, Base
|
| 61 |
+
from app.models import CVRecord, CVAnalysis, ResumeSkill, ResumeScore, AuditLog, WorkflowAuditLog # Ensure models are registered
|
| 62 |
+
engine = get_engine()
|
| 63 |
+
Base.metadata.create_all(bind=engine)
|
| 64 |
+
import logging
|
| 65 |
+
logging.getLogger(__name__).info("SQLite tables created/verified successfully")
|
| 66 |
+
except Exception as e:
|
| 67 |
+
import logging
|
| 68 |
+
logging.getLogger(__name__).error(f"Failed to create SQLite tables: {e}")
|
| 69 |
|
| 70 |
# Fail fast if the worker pipeline cannot be imported (prevents silent worker failures).
|
| 71 |
try:
|
app/models.py
CHANGED
|
@@ -11,7 +11,9 @@ from app.db import Base
|
|
| 11 |
class CVRecord(Base):
|
| 12 |
"""Stores raw CV text for analysis (no file storage)."""
|
| 13 |
__tablename__ = "cv_records"
|
| 14 |
-
__table_args__ =
|
|
|
|
|
|
|
| 15 |
|
| 16 |
id: Mapped[uuid.UUID] = mapped_column(
|
| 17 |
sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
|
|
@@ -30,7 +32,9 @@ class CVRecord(Base):
|
|
| 30 |
class CVAnalysis(Base):
|
| 31 |
"""Analysis result for a CV record."""
|
| 32 |
__tablename__ = "cv_analyses"
|
| 33 |
-
__table_args__ =
|
|
|
|
|
|
|
| 34 |
|
| 35 |
id: Mapped[uuid.UUID] = mapped_column(
|
| 36 |
sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
|
|
@@ -63,7 +67,9 @@ class CVAnalysis(Base):
|
|
| 63 |
|
| 64 |
class ResumeSkill(Base):
|
| 65 |
__tablename__ = "cv_resume_skills"
|
| 66 |
-
__table_args__ =
|
|
|
|
|
|
|
| 67 |
|
| 68 |
id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
|
| 69 |
resume_id: Mapped[uuid.UUID] = mapped_column(
|
|
@@ -77,7 +83,9 @@ class ResumeSkill(Base):
|
|
| 77 |
|
| 78 |
class ResumeScore(Base):
|
| 79 |
__tablename__ = "cv_resume_scores"
|
| 80 |
-
__table_args__ =
|
|
|
|
|
|
|
| 81 |
|
| 82 |
id: Mapped[uuid.UUID] = mapped_column(
|
| 83 |
sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
|
|
@@ -94,7 +102,9 @@ class ResumeScore(Base):
|
|
| 94 |
|
| 95 |
class AuditLog(Base):
|
| 96 |
__tablename__ = "cv_audit_logs"
|
| 97 |
-
__table_args__ =
|
|
|
|
|
|
|
| 98 |
|
| 99 |
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
| 100 |
entity_type: Mapped[str | None] = mapped_column(Text, nullable=True)
|
|
@@ -108,7 +118,9 @@ class AuditLog(Base):
|
|
| 108 |
class WorkflowAuditLog(Base):
|
| 109 |
"""Audit log for Risk Gate workflow progression."""
|
| 110 |
__tablename__ = "cv_workflow_audit_logs"
|
| 111 |
-
__table_args__ =
|
|
|
|
|
|
|
| 112 |
|
| 113 |
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
| 114 |
analysis_id: Mapped[uuid.UUID] = mapped_column(
|
|
|
|
| 11 |
class CVRecord(Base):
|
| 12 |
"""Stores raw CV text for analysis (no file storage)."""
|
| 13 |
__tablename__ = "cv_records"
|
| 14 |
+
__table_args__ = (
|
| 15 |
+
{"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
|
| 16 |
+
)
|
| 17 |
|
| 18 |
id: Mapped[uuid.UUID] = mapped_column(
|
| 19 |
sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
|
|
|
|
| 32 |
class CVAnalysis(Base):
|
| 33 |
"""Analysis result for a CV record."""
|
| 34 |
__tablename__ = "cv_analyses"
|
| 35 |
+
__table_args__ = (
|
| 36 |
+
{"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
|
| 37 |
+
)
|
| 38 |
|
| 39 |
id: Mapped[uuid.UUID] = mapped_column(
|
| 40 |
sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
|
|
|
|
| 67 |
|
| 68 |
class ResumeSkill(Base):
|
| 69 |
__tablename__ = "cv_resume_skills"
|
| 70 |
+
__table_args__ = (
|
| 71 |
+
{"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
|
| 72 |
+
)
|
| 73 |
|
| 74 |
id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
|
| 75 |
resume_id: Mapped[uuid.UUID] = mapped_column(
|
|
|
|
| 83 |
|
| 84 |
class ResumeScore(Base):
|
| 85 |
__tablename__ = "cv_resume_scores"
|
| 86 |
+
__table_args__ = (
|
| 87 |
+
{"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
|
| 88 |
+
)
|
| 89 |
|
| 90 |
id: Mapped[uuid.UUID] = mapped_column(
|
| 91 |
sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
|
|
|
|
| 102 |
|
| 103 |
class AuditLog(Base):
|
| 104 |
__tablename__ = "cv_audit_logs"
|
| 105 |
+
__table_args__ = (
|
| 106 |
+
{"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
|
| 107 |
+
)
|
| 108 |
|
| 109 |
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
| 110 |
entity_type: Mapped[str | None] = mapped_column(Text, nullable=True)
|
|
|
|
| 118 |
class WorkflowAuditLog(Base):
|
| 119 |
"""Audit log for Risk Gate workflow progression."""
|
| 120 |
__tablename__ = "cv_workflow_audit_logs"
|
| 121 |
+
__table_args__ = (
|
| 122 |
+
{"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
|
| 123 |
+
)
|
| 124 |
|
| 125 |
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
|
| 126 |
analysis_id: Mapped[uuid.UUID] = mapped_column(
|
app/services/ocr_service.py
CHANGED
|
@@ -108,10 +108,12 @@ class OCRService:
|
|
| 108 |
# Preprocess image for better OCR
|
| 109 |
processed_image = self._preprocess_image(image)
|
| 110 |
|
| 111 |
-
# Extract text using Tesseract
|
|
|
|
| 112 |
page_text = pytesseract.image_to_string(
|
| 113 |
processed_image,
|
| 114 |
-
config=self.tesseract_config
|
|
|
|
| 115 |
)
|
| 116 |
|
| 117 |
if page_text.strip():
|
|
@@ -169,9 +171,11 @@ class OCRService:
|
|
| 169 |
image = Image.open(file_path)
|
| 170 |
processed_image = self._preprocess_image(image)
|
| 171 |
|
|
|
|
| 172 |
raw_text = pytesseract.image_to_string(
|
| 173 |
processed_image,
|
| 174 |
-
config=self.tesseract_config
|
|
|
|
| 175 |
)
|
| 176 |
|
| 177 |
return self._clean_ocr_text(raw_text)
|
|
|
|
| 108 |
# Preprocess image for better OCR
|
| 109 |
processed_image = self._preprocess_image(image)
|
| 110 |
|
| 111 |
+
# Extract text using Tesseract with South African language support
|
| 112 |
+
langs = 'eng+afr+zul+xho+nso+sot+tsn+ssw+ven+tso+nbl'
|
| 113 |
page_text = pytesseract.image_to_string(
|
| 114 |
processed_image,
|
| 115 |
+
config=self.tesseract_config,
|
| 116 |
+
lang=langs
|
| 117 |
)
|
| 118 |
|
| 119 |
if page_text.strip():
|
|
|
|
| 171 |
image = Image.open(file_path)
|
| 172 |
processed_image = self._preprocess_image(image)
|
| 173 |
|
| 174 |
+
langs = 'eng+afr+zul+xho+nso+sot+tsn+ssw+ven+tso+nbl'
|
| 175 |
raw_text = pytesseract.image_to_string(
|
| 176 |
processed_image,
|
| 177 |
+
config=self.tesseract_config,
|
| 178 |
+
lang=langs
|
| 179 |
)
|
| 180 |
|
| 181 |
return self._clean_ocr_text(raw_text)
|
app/services/structured_extraction.py
CHANGED
|
@@ -150,10 +150,13 @@ def extract_structured_cv_external(resume_text: str) -> dict[str, Any] | None:
|
|
| 150 |
return None
|
| 151 |
|
| 152 |
if not _looks_like_structured_data(parsed):
|
|
|
|
| 153 |
return None
|
| 154 |
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
| 157 |
|
| 158 |
except Exception as e:
|
| 159 |
logger.error("External HF API extraction failed: %r", e)
|
|
@@ -200,12 +203,52 @@ def _parse_first_json_object(text: str) -> Any:
|
|
| 200 |
|
| 201 |
|
| 202 |
def _looks_like_structured_data(data: Any) -> bool:
|
| 203 |
-
"""Check if data
|
| 204 |
if not isinstance(data, dict):
|
| 205 |
return False
|
| 206 |
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
|
| 211 |
def get_extraction_status() -> dict[str, Any]:
|
|
|
|
| 150 |
return None
|
| 151 |
|
| 152 |
if not _looks_like_structured_data(parsed):
|
| 153 |
+
logger.warning("External HF API returned JSON but it doesn't look like a CV: %s", str(parsed)[:500])
|
| 154 |
return None
|
| 155 |
|
| 156 |
+
# Robust normalization for AI variations (flat vs nested)
|
| 157 |
+
normalized_data = _normalize_extracted_json(parsed)
|
| 158 |
+
logger.info("External HF API extraction successful (normalized)")
|
| 159 |
+
return normalized_data
|
| 160 |
|
| 161 |
except Exception as e:
|
| 162 |
logger.error("External HF API extraction failed: %r", e)
|
|
|
|
| 203 |
|
| 204 |
|
| 205 |
def _looks_like_structured_data(data: Any) -> bool:
|
| 206 |
+
"""Check if data has properties of a CV (nested or flat)."""
|
| 207 |
if not isinstance(data, dict):
|
| 208 |
return False
|
| 209 |
|
| 210 |
+
# Check for presence of key CV indicators anywhere in the keys or nested keys
|
| 211 |
+
cv_indicators = {"skills", "experience", "education", "full_name", "work_experience"}
|
| 212 |
+
|
| 213 |
+
# Check top level
|
| 214 |
+
if any(k in data for k in cv_indicators):
|
| 215 |
+
return True
|
| 216 |
+
|
| 217 |
+
# Check nested level 1
|
| 218 |
+
for v in data.values():
|
| 219 |
+
if isinstance(v, dict):
|
| 220 |
+
if any(k in v for k in cv_indicators):
|
| 221 |
+
return True
|
| 222 |
+
|
| 223 |
+
return False
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def _normalize_extracted_json(data: dict) -> dict:
|
| 227 |
+
"""Ensure the JSON matches our expected nested schema even if the LLM flattens it."""
|
| 228 |
+
out = {
|
| 229 |
+
"personal_details": data.get("personal_details", {}),
|
| 230 |
+
"education_details": data.get("education_details", {"education": [], "certifications": [], "languages": []}),
|
| 231 |
+
"professional_details": data.get("professional_details", {"skills": [], "experience": [], "bio": ""}),
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
# If LLM flattened personal details
|
| 235 |
+
for k in ["full_name", "email", "phone", "linkedin", "github", "portfolio", "address", "dob"]:
|
| 236 |
+
if k in data and k not in out["personal_details"]:
|
| 237 |
+
out["personal_details"][k] = data[k]
|
| 238 |
+
|
| 239 |
+
# If LLM flattened professional details
|
| 240 |
+
for k in ["skills", "experience", "work_experience", "bio", "position"]:
|
| 241 |
+
if k in data and k not in out["professional_details"]:
|
| 242 |
+
target_key = "experience" if k == "work_experience" else k
|
| 243 |
+
out["professional_details"][target_key] = data[k]
|
| 244 |
+
|
| 245 |
+
# If LLM flattened education
|
| 246 |
+
for k in ["education", "certifications", "languages", "university", "degrees"]:
|
| 247 |
+
if k in data and k not in out["education_details"]:
|
| 248 |
+
target_key = "education" if k in ["university", "degrees"] else k
|
| 249 |
+
out["education_details"][target_key] = data[k]
|
| 250 |
+
|
| 251 |
+
return out
|
| 252 |
|
| 253 |
|
| 254 |
def get_extraction_status() -> dict[str, Any]:
|
app/tasks/pipeline.py
CHANGED
|
@@ -97,51 +97,88 @@ def process_job(job) -> None:
|
|
| 97 |
languages=[],
|
| 98 |
)
|
| 99 |
|
| 100 |
-
# 1. AI Structured Extraction (NuExtract)
|
| 101 |
llm_structured = extract_structured_cv(resume_text)
|
| 102 |
if isinstance(llm_structured, dict):
|
| 103 |
-
#
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
structured_data = cv_data.model_dump()
|
| 147 |
|
|
|
|
| 97 |
languages=[],
|
| 98 |
)
|
| 99 |
|
| 100 |
+
# 1. AI Structured Extraction (NuExtract/LLM)
|
| 101 |
llm_structured = extract_structured_cv(resume_text)
|
| 102 |
if isinstance(llm_structured, dict):
|
| 103 |
+
# Function to safely get data from nested or flat structure
|
| 104 |
+
def get_data(keys, default=None):
    """Look up a value in the enclosing ``llm_structured`` dict.

    Resolution order:
      1. the full nested path given by *keys*;
      2. the last element of *keys* as a flat top-level key;
      3. for education / skills / experience, the matching entry inside the
         ``education_details`` / ``professional_details`` section.

    Args:
        keys: Sequence of dict keys describing the nested path.
        default: Value returned when nothing matches.

    Returns:
        The value found, or *default*. Step 3 returns whatever the section's
        ``.get`` yields, which may be ``None`` when the section exists but
        lacks the entry.
    """
    # Try absolute nested path first
    curr = llm_structured
    for k in keys:
        if isinstance(curr, dict) and k in curr:
            curr = curr[k]
        else:
            break
    else:
        # Every key along the path resolved.
        return curr

    # Try flat key (last item in the key path) if it exists at top level
    flat_key = keys[-1]
    if flat_key in llm_structured:
        return llm_structured[flat_key]

    def _section(name):
        # The model may emit a section as null or as a list; only a dict can
        # be queried with .get, anything else is treated as absent.
        section = llm_structured.get(name)
        return section if isinstance(section, dict) else None

    # Special cases for common alternate names
    if flat_key == "education":
        section = _section("education_details")
        if section is not None:
            return section.get("education")
    elif flat_key == "skills":
        section = _section("professional_details")
        if section is not None:
            return section.get("skills")
    elif flat_key in ("experience", "work_experience"):
        section = _section("professional_details")
        if section is not None:
            return section.get("experience") or section.get("work_experience")

    return default
|
| 131 |
+
|
| 132 |
+
# Update Personal Details
|
| 133 |
+
pd_src = get_data(["personal_details"])
|
| 134 |
+
if isinstance(pd_src, dict):
|
| 135 |
+
for pk, pv in pd_src.items():
|
| 136 |
+
if pv and hasattr(cv_data.personal_details, pk):
|
| 137 |
+
setattr(cv_data.personal_details, pk, pv)
|
| 138 |
+
else:
|
| 139 |
+
# Handle flat personal details
|
| 140 |
+
for pk in ["full_name", "email", "phone", "linkedin", "github", "portfolio"]:
|
| 141 |
+
val = llm_structured.get(pk)
|
| 142 |
+
if val and hasattr(cv_data.personal_details, pk):
|
| 143 |
+
setattr(cv_data.personal_details, pk, val)
|
| 144 |
+
|
| 145 |
+
# Update Summary/Bio
|
| 146 |
+
cv_data.professional_summary = get_data(["professional_details", "bio"]) or llm_structured.get("professional_summary") or llm_structured.get("bio") or ""
|
| 147 |
+
|
| 148 |
+
# Update Experience
|
| 149 |
+
llm_exp = get_data(["professional_details", "experience"]) or llm_structured.get("experience") or llm_structured.get("work_experience")
|
| 150 |
+
if isinstance(llm_exp, list):
|
| 151 |
+
cv_data.work_experience = [
|
| 152 |
+
WorkExperienceItem(
|
| 153 |
+
company=exp.get("company"),
|
| 154 |
+
title=exp.get("title"),
|
| 155 |
+
start_date=exp.get("start_date"),
|
| 156 |
+
end_date=exp.get("end_date"),
|
| 157 |
+
description=exp.get("description") or exp.get("responsibilities")
|
| 158 |
+
) for exp in llm_exp if isinstance(exp, dict)
|
| 159 |
+
]
|
| 160 |
+
|
| 161 |
+
# Update Skills
|
| 162 |
+
llm_skills = get_data(["professional_details", "skills"]) or llm_structured.get("skills")
|
| 163 |
+
if isinstance(llm_skills, list):
|
| 164 |
+
cv_data.skills = [str(s) for s in llm_skills if s]
|
| 165 |
+
|
| 166 |
+
# Update Education
|
| 167 |
+
llm_edu = get_data(["education_details", "education"]) or llm_structured.get("education")
|
| 168 |
+
if isinstance(llm_edu, list):
|
| 169 |
+
cv_data.education = [
|
| 170 |
+
EducationItem(
|
| 171 |
+
institution=edu.get("university") or edu.get("institution") or edu.get("school"),
|
| 172 |
+
degree=edu.get("degree"),
|
| 173 |
+
field=edu.get("field") or edu.get("major"),
|
| 174 |
+
start_date=edu.get("start_date"),
|
| 175 |
+
end_date=edu.get("end_date")
|
| 176 |
+
) for edu in llm_edu if isinstance(edu, dict)
|
| 177 |
+
]
|
| 178 |
+
|
| 179 |
+
# Update Others
|
| 180 |
+
cv_data.certifications = get_data(["education_details", "certifications"]) or llm_structured.get("certifications") or []
|
| 181 |
+
cv_data.languages = get_data(["education_details", "languages"]) or llm_structured.get("languages") or []
|
| 182 |
|
| 183 |
structured_data = cv_data.model_dump()
|
| 184 |
|
check_ocr_deps.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
import shutil
|
| 3 |
+
|
| 4 |
+
def _report_tool(label, executable, version_args):
    """Print where *executable* is on PATH and the version banner it reports.

    Args:
        label: Name used in the printed messages.
        executable: Program name to look up on PATH.
        version_args: Command-line arguments that make the program print its version.
    """
    location = shutil.which(executable)
    if not location:
        print(f"{label} NOT found in PATH")
        return

    print(f"{label} found: {location}")
    try:
        # Run the resolved path so the binary reported above is the one probed.
        res = subprocess.run([location, *version_args], capture_output=True, text=True)
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as e:
        print(f"Error running {executable}: {e}")
        return

    # Which stream carries the version banner differs between tools and
    # releases (pdftoppm uses stderr), so show both rather than guessing.
    print((res.stdout + res.stderr).strip())


def check_env():
    """Report whether the OCR command-line dependencies are installed.

    Checks for ``tesseract`` and for ``pdftoppm`` (part of poppler) and prints
    the location and version banner of each, or a "NOT found" message.
    """
    print("Checking for Tesseract...")
    _report_tool("Tesseract", "tesseract", ["--version"])

    print("\nChecking for pdftoppm (poppler)...")
    _report_tool("pdftoppm", "pdftoppm", ["-v"])
|
| 28 |
+
|
| 29 |
+
if __name__ == "__main__":
|
| 30 |
+
check_env()
|
diagnose_extraction.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import logging
|
| 6 |
+
|
| 7 |
+
# Configure logging
|
| 8 |
+
logging.basicConfig(level=logging.INFO)
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
# Add the app directory to the path to allow imports
|
| 12 |
+
sys.path.append(str(Path(__file__).parent))
|
| 13 |
+
|
| 14 |
+
from app.services.ocr_service import OCRService
|
| 15 |
+
from app.services.local_nuextract_service import extract_cv_structured_local
|
| 16 |
+
from app.services.enhanced_skills_extractor import EnhancedSkillsExtractor
|
| 17 |
+
|
| 18 |
+
def _count_entries(structured, section, key):
    """Count the entries stored under *key*, flat or inside *section*.

    Looks at ``structured[key]`` first and falls back to
    ``structured[section][key]``. Missing, null or unsized values count as 0.
    """
    value = structured.get(key)
    if value is None:
        nested = structured.get(section)
        if isinstance(nested, dict):
            value = nested.get(key)
    return len(value) if hasattr(value, "__len__") else 0


def diagnose(file_path):
    """Run the extraction stages on one PDF and print a diagnostic report.

    Stages:
      1. text extraction (native, with OCR when the document looks scanned);
      2. AI structured extraction with the local NuExtract service;
      3. regex-based skills matching.

    A failure in stage 1 prints a traceback and aborts the report. Stages 2
    and 3 catch and print their own errors so the report can continue.

    Args:
        file_path: Path of the PDF to inspect.
    """
    print(f"\n{'='*80}")
    print(f"DIAGNOSING: {os.path.basename(file_path)}")
    print(f"{'='*80}")

    ocr = OCRService()

    # 1. Text Extraction Stage
    print("\n1. [TEXT EXTRACTION]")
    try:
        import pdfplumber
        print(f"pdfplumber version: {pdfplumber.__version__}")

        with pdfplumber.open(file_path) as pdf:
            print(f"PDF pages: {len(pdf.pages)}")
            print(f"PDF metadata: {pdf.metadata}")

        native_text = ocr._native_pdf_extraction(file_path)
        print(f"Native extraction: {len(native_text)} characters.")

        is_scanned = ocr._is_scanned_document(native_text)
        print(f"Is scanned (density < {ocr.min_text_density}): {is_scanned}")

        if is_scanned:
            print("Triggering OCR...")
            final_text = ocr._ocr_pdf_extraction(file_path)
            print(f"OCR result: {len(final_text)} characters.")
        else:
            final_text = native_text

        if len(final_text) < 100:
            print(f"WARNING: Very low text quality detected: {repr(final_text[:100])}")
        else:
            print(f"Sample text (first 500 characters):\n{final_text[:500]}")

    except Exception as e:
        # Diagnostic tool: report anything that goes wrong and stop, since the
        # later stages have no text to work on.
        print(f"Extraction failed: {e}")
        import traceback
        traceback.print_exc()
        return

    # 2. Structured Extraction Stage (AI)
    print("\n2. [STRUCTURED EXTRACTION - AI]")
    try:
        if len(final_text) < 50:
            print("Skipping AI extraction due to low text quality.")
        else:
            print("Running NuExtract (local)...")
            # NOTE(review): assumes the service returns a dict or a falsy
            # value - confirm against local_nuextract_service.
            structured = extract_cv_structured_local(final_text)
            if structured:
                print("AI extraction SUCCESS.")
                # The model may answer flat or nested; count in either layout.
                print(f"Skills found: {_count_entries(structured, 'professional_details', 'skills')}")
                print(f"Experience entries: {_count_entries(structured, 'professional_details', 'experience')}")
                print(f"Education entries: {_count_entries(structured, 'education_details', 'education')}")
                print(f"Personal Info: {structured.get('personal_details', {})}")
            else:
                print("AI extraction returned NONE.")
    except Exception as e:
        print(f"AI extraction failed: {e}")

    # 3. Enhanced Matching Stage (Regex)
    print("\n3. [ENHANCED MATCHING - REGEX]")
    try:
        skills_extractor = EnhancedSkillsExtractor()
        enhanced_skills = skills_extractor.extract_skills(final_text)
        print(f"Regex skills found: {len(enhanced_skills)}")
        if enhanced_skills:
            # Group by category for better overview
            categories = {}
            for s in enhanced_skills:
                categories.setdefault(s.get('category', 'other'), []).append(str(s.get('name', '')))

            for cat, names in categories.items():
                print(f"  [{cat.upper()}]: {', '.join(names[:10])}")
    except Exception as e:
        print(f"Regex matching failed: {e}")

    print(f"\n{'='*80}")
    print("DIAGNOSIS COMPLETE")
    print(f"{'='*80}")
|
| 103 |
+
|
| 104 |
+
if __name__ == "__main__":
    # Run the diagnosis over several sample CVs to compare their behaviour;
    # samples that are not on disk are reported and skipped.
    for sample_pdf in (
        "C:\\Users\\User\\CascadeProjects\\cv-analyser-backend\\cv-analyser\\Untitled document (3).pdf",
        "C:\\Users\\User\\CascadeProjects\\cv-analyser-backend\\cv-analyser\\Dzunisani Data Analyst cv.pdf",
    ):
        if not os.path.exists(sample_pdf):
            print(f"File not found: {sample_pdf}")
            continue
        diagnose(sample_pdf)
|
peek_pdfs.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
# Add the app directory to the path to allow imports
|
| 6 |
+
sys.path.append(str(Path(__file__).parent))
|
| 7 |
+
|
| 8 |
+
from app.services.ocr_service import OCRService
|
| 9 |
+
|
| 10 |
+
def test_file_extraction(file_path):
    """Print native-versus-OCR text extraction details for one PDF.

    The native extraction is always attempted. When it yields fewer than 100
    characters the OCR extraction is run as well and a snippet of each result
    is printed. Any exception is reported with a traceback instead of being
    propagated.

    Args:
        file_path: Path of the PDF to inspect.
    """
    banner = "=" * 20
    print(f"\n[{banner}] Testing extraction for: {os.path.basename(file_path)}")
    service = OCRService()
    try:
        # Call the internal methods directly to see what each one produced.
        embedded = service._native_pdf_extraction(file_path)
        print(f"Native extraction: {len(embedded)} chars.")

        if len(embedded) >= 100:
            print(f"Native extraction sufficient. First 500 chars:\n{embedded[:500]}")
            return

        print(f"Native result snippet: {embedded[:100]!r}")
        print("Triggering OCR fallback...")
        recognised = service._ocr_pdf_extraction(file_path)
        print(f"OCR extraction: {len(recognised)} chars.")
        print(f"OCR result snippet: {recognised[:500]!r}")
    except Exception as e:
        print(f"Extraction failed: {e}")
        import traceback
        traceback.print_exc()
|
| 30 |
+
|
| 31 |
+
if __name__ == "__main__":
    # Sample CVs are looked up in the developer's local checkout; files that
    # are not present are reported and skipped.
    sample_dir = "C:\\Users\\User\\CascadeProjects\\cv-analyser-backend\\cv-analyser"
    for file_name in (
        "Untitled document (3).pdf",
        "Dzunisani Data Analyst cv.pdf",
        "KRM N6 (1).pdf",
    ):
        candidate = os.path.join(sample_dir, file_name)
        if not os.path.exists(candidate):
            print(f"File not found: {candidate}")
            continue
        test_file_extraction(candidate)
|
test_full_pipeline_v2.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
|
| 6 |
+
# Configure logging
|
| 7 |
+
logging.basicConfig(level=logging.INFO)
|
| 8 |
+
|
| 9 |
+
# Add the cv-analyser app to the path
|
| 10 |
+
sys.path.append("/mnt/c/Users/User/CascadeProjects/cv-analyser-backend/cv-analyser")
|
| 11 |
+
|
| 12 |
+
try:
    from app.tasks.pipeline import process_job
    from app.models import CVAnalysis, CVRecord
    from app.db import session_scope
    from app.services.ocr_service import OCRService
    import uuid

    # Mock Job object: carries only the attributes this script sets on it.
    class MockJob:
        def __init__(self, analysis_id, resume_id, job_description):
            self.analysis_id = str(analysis_id)
            self.resume_id = str(resume_id)
            self.job_description = job_description

    # Test file
    test_pdf = "/mnt/c/Users/User/CascadeProjects/cv-analyser-backend/cv-analyser/Untitled document (3).pdf"

    # Stop early with a clear message instead of failing somewhere inside OCR.
    if not os.path.exists(test_pdf):
        print(f"Error: Test file not found at {test_pdf}")
        sys.exit(1)

    with session_scope() as db:
        # Create a test record
        record_id = uuid.uuid4()
        analysis_id = uuid.uuid4()

        # Read the file
        ocr = OCRService()
        raw_text = ocr.extract_text(test_pdf, "pdf")

        record = CVRecord(
            id=record_id,
            cv_text=raw_text,
        )
        db.add(record)

        analysis = CVAnalysis(
            id=analysis_id,
            record_id=record_id,
            status="pending",
            job_description="Python Developer with Flask and AWS experience",
        )
        db.add(analysis)
        db.commit()

        print(f"Created Test Analysis: {analysis_id}")

        # Run the pipeline
        job = MockJob(analysis_id, record_id, analysis.job_description)
        process_job(job)

        # Fetch the result; expire first so the row is re-read from the database.
        db.expire_all()
        result_analysis = db.get(CVAnalysis, analysis_id)
        if result_analysis is None:
            raise RuntimeError(f"Analysis {analysis_id} not found after pipeline run")

        # NOTE(review): assumes `result` is a dict (or None when the pipeline
        # stored nothing) - confirm against the CVAnalysis model.
        res = result_analysis.result or {}

        print("\n--- FINAL RESULT INSPECTION ---")
        print(f"Status: {result_analysis.status}")
        print(f"Overall Score: {result_analysis.overall_score}")

        # Check for top-level keys
        print(f"\nTop-level keys in response: {list(res.keys())}")

        # `or []` / `or {}` also covers keys that are present but null.
        skills = res.get("skills") or []
        print(f"Top-level skills count: {len(skills)}")
        if skills:
            print(f"Sample skills: {skills[:5]}")

        experience = res.get("experience") or []
        print(f"Top-level experience count: {len(experience)}")

        # Check structured_data
        struct = res.get("structured_data") or {}
        struct_skills = struct.get("skills") or []
        print(f"Structured skills count: {len(struct_skills)}")

        if len(skills) == len(struct_skills) and len(skills) > 0:
            print("\n✅ SUCCESS: Top-level and structured data are in sync!")
        else:
            print("\n❌ WARNING: Data desync or no data found.")

except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()
|
verify_fix.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
+
# Configure logging to see OCR details
|
| 6 |
+
logging.basicConfig(level=logging.INFO)
|
| 7 |
+
|
| 8 |
+
# Add the cv-analyser app to the path
|
| 9 |
+
sys.path.append("/mnt/c/Users/User/CascadeProjects/cv-analyser-backend/cv-analyser")
|
| 10 |
+
|
| 11 |
+
try:
    from app.services.ocr_service import OCRService
    from app.services.enhanced_skills_extractor import EnhancedSkillsExtractor

    ocr = OCRService()
    test_file = "/mnt/c/Users/User/CascadeProjects/cv-analyser-backend/cv-analyser/Untitled document (3).pdf"

    # Nothing to verify without the sample document.
    if not os.path.exists(test_file):
        print(f"Error: Test file not found at {test_file}")
        sys.exit(1)

    print(f"Testing extraction on: {test_file}")
    text = ocr._extract_from_pdf(test_file)

    print("\n--- EXTRACTION RESULT ---")
    print(f"Characters: {len(text)}")
    if len(text) == 0:
        print("FAIL: No text extracted")
    else:
        print(f"First 500 chars: {repr(text[:500])}")

        # Skill test: run the skills extractor over the extracted text.
        extractor = EnhancedSkillsExtractor()
        skills = extractor.extract_skills(text)
        print(f"Skills found: {len(skills)}")
        if skills:
            sample_names = [s.get('name') for s in skills[:10]]
            print(f"Sample skills: {sample_names}")

except Exception as e:
    print(f"Error during verification: {e}")
    import traceback
    traceback.print_exc()
|