Spaces:

Dzunisani007
/

cv-analyser

Running

Dzunisani007 committed 29 days ago

Commit

83c9f71

1 Parent(s): 0e6e3fe

Fix Hugging Face deployment issues

🔧 Critical Fixes:
- Update Dockerfile and Dockerfile.hf-ocr
- Fix main.py startup and model loading
- Update models.py for better database handling
- Fix OCR service and structured extraction
- Fix pipeline status completion

📋 Testing Tools:
- Add OCR dependency verification
- Add extraction diagnosis tools
- Add PDF peek utility
- Add pipeline testing script

Files changed (12) hide show

Dockerfile +10 -0
Dockerfile.hf-ocr +10 -0
app/main.py +13 -0
app/models.py +18 -6
app/services/ocr_service.py +7 -3
app/services/structured_extraction.py +48 -5
app/tasks/pipeline.py +80 -43
check_ocr_deps.py +30 -0
diagnose_extraction.py +114 -0
peek_pdfs.py +42 -0
test_full_pipeline_v2.py +92 -0
verify_fix.py +42 -0

Dockerfile CHANGED Viewed

@@ -6,6 +6,16 @@ RUN apt-get update && apt-get install -y \
     curl \
     tesseract-ocr \
     tesseract-ocr-eng \
     tesseract-ocr-osd \
     poppler-utils \
     libtesseract-dev \

     curl \
     tesseract-ocr \
     tesseract-ocr-eng \
+    tesseract-ocr-afr \
+    tesseract-ocr-zul \
+    tesseract-ocr-xho \
+    tesseract-ocr-nso \
+    tesseract-ocr-sot \
+    tesseract-ocr-tsn \
+    tesseract-ocr-ssw \
+    tesseract-ocr-ven \
+    tesseract-ocr-tso \
+    tesseract-ocr-nbl \
     tesseract-ocr-osd \
     poppler-utils \
     libtesseract-dev \

Dockerfile.hf-ocr CHANGED Viewed

@@ -5,6 +5,16 @@ RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     tesseract-ocr \
     libtesseract-dev \
     poppler-utils \
     libgl1-mesa-glx \

     build-essential \
     curl \
     tesseract-ocr \
+    tesseract-ocr-afr \
+    tesseract-ocr-zul \
+    tesseract-ocr-xho \
+    tesseract-ocr-nso \
+    tesseract-ocr-sot \
+    tesseract-ocr-tsn \
+    tesseract-ocr-ssw \
+    tesseract-ocr-ven \
+    tesseract-ocr-tso \
+    tesseract-ocr-nbl \
     libtesseract-dev \
     poppler-utils \
     libgl1-mesa-glx \

app/main.py CHANGED Viewed

@@ -53,6 +53,19 @@ def root():
 @app.on_event("startup")
 def _startup() -> None:
     init_session_factory()
     # Fail fast if the worker pipeline cannot be imported (prevents silent worker failures).
     try:

 @app.on_event("startup")
 def _startup() -> None:
     init_session_factory()
+    # Auto-create tables for SQLite during local dev
+    if settings.database_url.startswith("sqlite"):
+        try:
+            from app.db import get_engine, Base
+            from app.models import CVRecord, CVAnalysis, ResumeSkill, ResumeScore, AuditLog, WorkflowAuditLog # Ensure models are registered
+            engine = get_engine()
+            Base.metadata.create_all(bind=engine)
+            import logging
+            logging.getLogger(__name__).info("SQLite tables created/verified successfully")
+        except Exception as e:
+            import logging
+            logging.getLogger(__name__).error(f"Failed to create SQLite tables: {e}")
     # Fail fast if the worker pipeline cannot be imported (prevents silent worker failures).
     try:

app/models.py CHANGED Viewed

@@ -11,7 +11,9 @@ from app.db import Base
 class CVRecord(Base):
     """Stores raw CV text for analysis (no file storage)."""
     __tablename__ = "cv_records"
-    __table_args__ = {"schema": "cv_analyser"}
     id: Mapped[uuid.UUID] = mapped_column(
         sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
@@ -30,7 +32,9 @@ class CVRecord(Base):
 class CVAnalysis(Base):
     """Analysis result for a CV record."""
     __tablename__ = "cv_analyses"
-    __table_args__ = {"schema": "cv_analyser"}
     id: Mapped[uuid.UUID] = mapped_column(
         sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
@@ -63,7 +67,9 @@ class CVAnalysis(Base):
 class ResumeSkill(Base):
     __tablename__ = "cv_resume_skills"
-    __table_args__ = {"schema": "cv_analyser"}
     id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
     resume_id: Mapped[uuid.UUID] = mapped_column(
@@ -77,7 +83,9 @@ class ResumeSkill(Base):
 class ResumeScore(Base):
     __tablename__ = "cv_resume_scores"
-    __table_args__ = {"schema": "cv_analyser"}
     id: Mapped[uuid.UUID] = mapped_column(
         sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
@@ -94,7 +102,9 @@ class ResumeScore(Base):
 class AuditLog(Base):
     __tablename__ = "cv_audit_logs"
-    __table_args__ = {"schema": "cv_analyser"}
     id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
     entity_type: Mapped[str | None] = mapped_column(Text, nullable=True)
@@ -108,7 +118,9 @@ class AuditLog(Base):
 class WorkflowAuditLog(Base):
     """Audit log for Risk Gate workflow progression."""
     __tablename__ = "cv_workflow_audit_logs"
-    __table_args__ = {"schema": "cv_analyser"}
     id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
     analysis_id: Mapped[uuid.UUID] = mapped_column(

 class CVRecord(Base):
     """Stores raw CV text for analysis (no file storage)."""
     __tablename__ = "cv_records"
+    __table_args__ = (
+        {"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
+    )
     id: Mapped[uuid.UUID] = mapped_column(
         sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
 class CVAnalysis(Base):
     """Analysis result for a CV record."""
     __tablename__ = "cv_analyses"
+    __table_args__ = (
+        {"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
+    )
     id: Mapped[uuid.UUID] = mapped_column(
         sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
 class ResumeSkill(Base):
     __tablename__ = "cv_resume_skills"
+    __table_args__ = (
+        {"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
+    )
     id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
     resume_id: Mapped[uuid.UUID] = mapped_column(
 class ResumeScore(Base):
     __tablename__ = "cv_resume_scores"
+    __table_args__ = (
+        {"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
+    )
     id: Mapped[uuid.UUID] = mapped_column(
         sa.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4
 class AuditLog(Base):
     __tablename__ = "cv_audit_logs"
+    __table_args__ = (
+        {"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
+    )
     id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
     entity_type: Mapped[str | None] = mapped_column(Text, nullable=True)
 class WorkflowAuditLog(Base):
     """Audit log for Risk Gate workflow progression."""
     __tablename__ = "cv_workflow_audit_logs"
+    __table_args__ = (
+        {"schema": "cv_analyser"} if not settings.database_url.startswith("sqlite") else {}
+    )
     id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
     analysis_id: Mapped[uuid.UUID] = mapped_column(

app/services/ocr_service.py CHANGED Viewed

@@ -108,10 +108,12 @@ class OCRService:
                     # Preprocess image for better OCR
                     processed_image = self._preprocess_image(image)
-                    # Extract text using Tesseract
                     page_text = pytesseract.image_to_string(
                         processed_image,
-                        config=self.tesseract_config
                     )
                     if page_text.strip():
@@ -169,9 +171,11 @@ class OCRService:
             image = Image.open(file_path)
             processed_image = self._preprocess_image(image)
             raw_text = pytesseract.image_to_string(
                 processed_image,
-                config=self.tesseract_config
             )
             return self._clean_ocr_text(raw_text)

                     # Preprocess image for better OCR
                     processed_image = self._preprocess_image(image)
+                    # Extract text using Tesseract with South African language support
+                    langs = 'eng+afr+zul+xho+nso+sot+tsn+ssw+ven+tso+nbl'
                     page_text = pytesseract.image_to_string(
                         processed_image,
+                        config=self.tesseract_config,
+                        lang=langs
                     )
                     if page_text.strip():
             image = Image.open(file_path)
             processed_image = self._preprocess_image(image)
+            langs = 'eng+afr+zul+xho+nso+sot+tsn+ssw+ven+tso+nbl'
             raw_text = pytesseract.image_to_string(
                 processed_image,
+                config=self.tesseract_config,
+                lang=langs
             )
             return self._clean_ocr_text(raw_text)

app/services/structured_extraction.py CHANGED Viewed

@@ -150,10 +150,13 @@ def extract_structured_cv_external(resume_text: str) -> dict[str, Any] | None:
             return None
         if not _looks_like_structured_data(parsed):
             return None
-        logger.info("External HF API extraction successful")
-        return parsed
     except Exception as e:
         logger.error("External HF API extraction failed: %r", e)
@@ -200,12 +203,52 @@ def _parse_first_json_object(text: str) -> Any:
 def _looks_like_structured_data(data: Any) -> bool:
-    """Check if data looks like our structured CV schema."""
     if not isinstance(data, dict):
         return False
-    required_keys = {"personal_details", "education_details", "professional_details"}
-    return required_keys.issubset(data.keys())
 def get_extraction_status() -> dict[str, Any]:

             return None
         if not _looks_like_structured_data(parsed):
+            logger.warning("External HF API returned JSON but it doesn't look like a CV: %s", str(parsed)[:500])
             return None
+        # Robust normalization for AI variations (flat vs nested)
+        normalized_data = _normalize_extracted_json(parsed)
+        logger.info("External HF API extraction successful (normalized)")
+        return normalized_data
     except Exception as e:
         logger.error("External HF API extraction failed: %r", e)
 def _looks_like_structured_data(data: Any) -> bool:
+    """Check if data has properties of a CV (nested or flat)."""
     if not isinstance(data, dict):
         return False
+    # Check for presence of key CV indicators anywhere in the keys or nested keys
+    cv_indicators = {"skills", "experience", "education", "full_name", "work_experience"}
+    # Check top level
+    if any(k in data for k in cv_indicators):
+        return True
+    # Check nested level 1
+    for v in data.values():
+        if isinstance(v, dict):
+            if any(k in v for k in cv_indicators):
+                return True
+    return False
+def _normalize_extracted_json(data: dict) -> dict:
+    """Ensure the JSON matches our expected nested schema even if the LLM flattens it."""
+    out = {
+        "personal_details": data.get("personal_details", {}),
+        "education_details": data.get("education_details", {"education": [], "certifications": [], "languages": []}),
+        "professional_details": data.get("professional_details", {"skills": [], "experience": [], "bio": ""}),
+    }
+    # If LLM flattened personal details
+    for k in ["full_name", "email", "phone", "linkedin", "github", "portfolio", "address", "dob"]:
+        if k in data and k not in out["personal_details"]:
+             out["personal_details"][k] = data[k]
+    # If LLM flattened professional details
+    for k in ["skills", "experience", "work_experience", "bio", "position"]:
+        if k in data and k not in out["professional_details"]:
+            target_key = "experience" if k == "work_experience" else k
+            out["professional_details"][target_key] = data[k]
+    # If LLM flattened education
+    for k in ["education", "certifications", "languages", "university", "degrees"]:
+        if k in data and k not in out["education_details"]:
+            target_key = "education" if k in ["university", "degrees"] else k
+            out["education_details"][target_key] = data[k]
+    return out
 def get_extraction_status() -> dict[str, Any]:

app/tasks/pipeline.py CHANGED Viewed

@@ -97,51 +97,88 @@ def process_job(job) -> None:
             languages=[],
         )
-        # 1. AI Structured Extraction (NuExtract)
         llm_structured = extract_structured_cv(resume_text)
         if isinstance(llm_structured, dict):
-            # Update Pydantic model with LLM results
-            for k in ("personal_details", "education_details", "professional_details"):
-                if isinstance(llm_structured.get(k), dict):
-                    if k == "personal_details":
-                        for pk, pv in llm_structured[k].items():
-                            if pv and hasattr(cv_data.personal_details, pk):
-                                setattr(cv_data.personal_details, pk, pv)
-                    elif k == "professional_details":
-                        if llm_structured[k].get("bio"):
-                            cv_data.professional_summary = llm_structured[k]["bio"]
-                        llm_exp = llm_structured[k].get("experience")
-                        if isinstance(llm_exp, list) and len(llm_exp) > 0:
-                            cv_data.work_experience = [
-                                WorkExperienceItem(
-                                    company=exp.get("company"),
-                                    title=exp.get("title"),
-                                    start_date=exp.get("start_date"),
-                                    end_date=exp.get("end_date"),
-                                    description=exp.get("description")
-                                ) for exp in llm_exp if isinstance(exp, dict)
-                            ]
-                        if llm_structured[k].get("skills"):
-                            cv_data.skills = llm_structured[k]["skills"]
-                    elif k == "education_details":
-                        llm_edu = llm_structured[k].get("education")
-                        if isinstance(llm_edu, list) and len(llm_edu) > 0:
-                            cv_data.education = [
-                                EducationItem(
-                                    institution=edu.get("university") or edu.get("institution"),
-                                    degree=edu.get("degree"),
-                                    field=edu.get("field"),
-                                    start_date=edu.get("start_date"),
-                                    end_date=edu.get("end_date")
-                                ) for edu in llm_edu if isinstance(edu, dict)
-                            ]
-                        if llm_structured[k].get("certifications"):
-                            cv_data.certifications = llm_structured[k]["certifications"]
-                        if llm_structured[k].get("languages"):
-                            cv_data.languages = llm_structured[k]["languages"]
         structured_data = cv_data.model_dump()

             languages=[],
         )
+        # 1. AI Structured Extraction (NuExtract/LLM)
         llm_structured = extract_structured_cv(resume_text)
         if isinstance(llm_structured, dict):
+            # Function to safely get data from nested or flat structure
+            def get_data(keys, default=None):
+                # Try absolute nested path first
+                curr = llm_structured
+                found_nested = True
+                for k in keys:
+                    if isinstance(curr, dict) and k in curr:
+                        curr = curr[k]
+                    else:
+                        found_nested = False
+                        break
+                if found_nested:
+                    return curr
+                # Try flat key (last item in the key path) if it exists at top level
+                flat_key = keys[-1]
+                if flat_key in llm_structured:
+                    return llm_structured[flat_key]
+                # Special cases for common alternate names
+                if flat_key == "education" and "education_details" in llm_structured:
+                    return llm_structured["education_details"].get("education")
+                if flat_key == "skills" and "professional_details" in llm_structured:
+                    return llm_structured["professional_details"].get("skills")
+                if flat_key in ["experience", "work_experience"] and "professional_details" in llm_structured:
+                    return llm_structured["professional_details"].get("experience") or llm_structured["professional_details"].get("work_experience")
+                return default
+            # Update Personal Details
+            pd_src = get_data(["personal_details"])
+            if isinstance(pd_src, dict):
+                for pk, pv in pd_src.items():
+                    if pv and hasattr(cv_data.personal_details, pk):
+                        setattr(cv_data.personal_details, pk, pv)
+            else:
+                # Handle flat personal details
+                for pk in ["full_name", "email", "phone", "linkedin", "github", "portfolio"]:
+                    val = llm_structured.get(pk)
+                    if val and hasattr(cv_data.personal_details, pk):
+                        setattr(cv_data.personal_details, pk, val)
+            # Update Summary/Bio
+            cv_data.professional_summary = get_data(["professional_details", "bio"]) or llm_structured.get("professional_summary") or llm_structured.get("bio") or ""
+            # Update Experience
+            llm_exp = get_data(["professional_details", "experience"]) or llm_structured.get("experience") or llm_structured.get("work_experience")
+            if isinstance(llm_exp, list):
+                cv_data.work_experience = [
+                    WorkExperienceItem(
+                        company=exp.get("company"),
+                        title=exp.get("title"),
+                        start_date=exp.get("start_date"),
+                        end_date=exp.get("end_date"),
+                        description=exp.get("description") or exp.get("responsibilities")
+                    ) for exp in llm_exp if isinstance(exp, dict)
+                ]
+            # Update Skills
+            llm_skills = get_data(["professional_details", "skills"]) or llm_structured.get("skills")
+            if isinstance(llm_skills, list):
+                cv_data.skills = [str(s) for s in llm_skills if s]
+            # Update Education
+            llm_edu = get_data(["education_details", "education"]) or llm_structured.get("education")
+            if isinstance(llm_edu, list):
+                cv_data.education = [
+                    EducationItem(
+                        institution=edu.get("university") or edu.get("institution") or edu.get("school"),
+                        degree=edu.get("degree"),
+                        field=edu.get("field") or edu.get("major"),
+                        start_date=edu.get("start_date"),
+                        end_date=edu.get("end_date")
+                    ) for edu in llm_edu if isinstance(edu, dict)
+                ]
+            # Update Others
+            cv_data.certifications = get_data(["education_details", "certifications"]) or llm_structured.get("certifications") or []
+            cv_data.languages = get_data(["education_details", "languages"]) or llm_structured.get("languages") or []
         structured_data = cv_data.model_dump()

check_ocr_deps.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import subprocess
+import shutil
def _report_tool(label, binary, version_flag, banner_stream, timeout=10):
    """Print where *binary* is installed and its version banner, or that it is missing.

    Args:
        label: Human-readable tool name used in the found / not-found messages.
        binary: Executable name looked up on PATH and invoked.
        version_flag: Command-line flag that makes the tool print its version.
        banner_stream: ``"stdout"`` or ``"stderr"`` - the stream the tool
            writes its version banner to.
        timeout: Seconds to wait for the version command before giving up, so
            a wedged binary cannot hang the check.
    """
    location = shutil.which(binary)
    if not location:
        print(f"{label} NOT found in PATH")
        return
    print(f"{label} found: {location}")
    try:
        result = subprocess.run(
            [binary, version_flag],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError: the binary could not be executed; SubprocessError covers
        # the timeout. Either way the check reports and carries on.
        print(f"Error running {binary}: {exc}")
    else:
        print(getattr(result, banner_stream))


def check_env():
    """Report whether the OCR command-line dependencies are available.

    Looks up ``tesseract`` and ``pdftoppm`` on PATH and prints each tool's
    location and version banner. Nothing is returned; all output goes to
    stdout.
    """
    print("Checking for Tesseract...")
    _report_tool("Tesseract", "tesseract", "--version", "stdout")

    print("\nChecking for pdftoppm (poppler)...")
    # pdftoppm prints its version to stderr rather than stdout.
    _report_tool("pdftoppm", "pdftoppm", "-v", "stderr")


if __name__ == "__main__":
    check_env()

diagnose_extraction.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import os
+import sys
+import json
+from pathlib import Path
+import logging
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Add the app directory to the path to allow imports
+sys.path.append(str(Path(__file__).parent))
+from app.services.ocr_service import OCRService
+from app.services.local_nuextract_service import extract_cv_structured_local
+from app.services.enhanced_skills_extractor import EnhancedSkillsExtractor
def _entry_count(structured, section, key):
    """Count the items stored under *key*, flat or nested under *section*.

    The extractor output may keep a field at the top level or inside its
    section dict; a missing or null field counts as zero.
    """
    items = structured.get(key)
    if not items:
        nested = structured.get(section)
        items = nested.get(key) if isinstance(nested, dict) else None
    return len(items) if items else 0


def diagnose(file_path):
    """Run the three extraction stages on one PDF and print what each produced.

    Stage 1 extracts text (native first, OCR when the OCR service flags the
    native text as scanned), stage 2 runs the local structured extractor on
    that text, and stage 3 runs the regex skills extractor. A failure in
    stage 1 aborts the diagnosis; failures in stages 2 and 3 are reported and
    the run continues.

    Args:
        file_path: Path of the PDF to inspect.
    """
    banner = f"{'='*80}"
    print(f"\n{'='*80}")
    print(f"DIAGNOSING: {os.path.basename(file_path)}")
    print(banner)

    ocr = OCRService()

    # 1. Text Extraction Stage
    print("\n1. [TEXT EXTRACTION]")
    try:
        import pdfplumber

        print(f"pdfplumber version: {pdfplumber.__version__}")
        with pdfplumber.open(file_path) as pdf:
            print(f"PDF pages: {len(pdf.pages)}")
            print(f"PDF metadata: {pdf.metadata}")

        native_text = ocr._native_pdf_extraction(file_path)
        print(f"Native extraction: {len(native_text)} characters.")

        is_scanned = ocr._is_scanned_document(native_text)
        print(f"Is scanned (density < {ocr.min_text_density}): {is_scanned}")

        if is_scanned:
            print("Triggering OCR...")
            final_text = ocr._ocr_pdf_extraction(file_path)
            print(f"OCR result: {len(final_text)} characters.")
        else:
            final_text = native_text

        if len(final_text) < 100:
            print(f"WARNING: Very low text quality detected: {repr(final_text[:100])}")
        else:
            print(f"Sample text (first 500 characters):\n{final_text[:500]}")
    except Exception as e:
        print(f"Extraction failed: {e}")
        import traceback

        traceback.print_exc()
        # Without text the later stages have nothing to work on.
        return

    # 2. Structured Extraction Stage (AI)
    print("\n2. [STRUCTURED EXTRACTION - AI]")
    try:
        if len(final_text) < 50:
            print("Skipping AI extraction due to low text quality.")
        else:
            print("Running NuExtract (local)...")
            structured = extract_cv_structured_local(final_text)
            if structured:
                print("AI extraction SUCCESS.")
                # Fields are counted whether the model nested them in their
                # section or flattened them to the top level.
                skills_n = _entry_count(structured, "professional_details", "skills")
                experience_n = _entry_count(structured, "professional_details", "experience")
                education_n = _entry_count(structured, "education_details", "education")
                print(f"Skills found: {skills_n}")
                print(f"Experience entries: {experience_n}")
                print(f"Education entries: {education_n}")
                print(f"Personal Info: {structured.get('personal_details', {})}")
            else:
                print("AI extraction returned NONE.")
    except Exception as e:
        print(f"AI extraction failed: {e}")

    # 3. Enhanced Matching Stage (Regex)
    print("\n3. [ENHANCED MATCHING - REGEX]")
    try:
        skills_extractor = EnhancedSkillsExtractor()
        enhanced_skills = skills_extractor.extract_skills(final_text)
        print(f"Regex skills found: {len(enhanced_skills)}")

        if enhanced_skills:
            # Group by category for a compact overview (first 10 per category).
            categories = {}
            for s in enhanced_skills:
                categories.setdefault(s.get('category', 'other'), []).append(s['name'])
            for cat, names in categories.items():
                print(f"  [{cat.upper()}]: {', '.join(names[:10])}")
    except Exception as e:
        print(f"Regex matching failed: {e}")

    print(f"\n{'='*80}")
    print("DIAGNOSIS COMPLETE")
    print(banner)
if __name__ == "__main__":
    # Paths given on the command line take precedence. The author's local
    # sample files stay as the fallback so a bare invocation behaves as
    # before; those absolute Windows paths exist only on that machine.
    test_pdfs = sys.argv[1:] or [
        "C:\\Users\\User\\CascadeProjects\\cv-analyser-backend\\cv-analyser\\Untitled document (3).pdf",
        "C:\\Users\\User\\CascadeProjects\\cv-analyser-backend\\cv-analyser\\Dzunisani Data Analyst cv.pdf",
    ]

    for pdf in test_pdfs:
        if os.path.exists(pdf):
            diagnose(pdf)
        else:
            print(f"File not found: {pdf}")

peek_pdfs.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import os
+import sys
+from pathlib import Path
+# Add the app directory to the path to allow imports
+sys.path.append(str(Path(__file__).parent))
+from app.services.ocr_service import OCRService
def test_file_extraction(file_path):
    """Print what native PDF extraction yields for *file_path*, with an OCR retry.

    When native extraction returns fewer than 100 characters the OCR path is
    run as well and a snippet of both results is printed. Any exception is
    reported with its traceback instead of being raised.
    """
    import traceback

    print(f"\n[{'='*20}] Testing extraction for: {os.path.basename(file_path)}")
    engine = OCRService()

    try:
        # Call the service's internal steps directly so each one is visible.
        native_text = engine._native_pdf_extraction(file_path)
        print(f"Native extraction: {len(native_text)} chars.")

        if len(native_text) >= 100:
            print(f"Native extraction sufficient. First 500 chars:\n{native_text[:500]}")
            return

        print(f"Native result snippet: {native_text[:100]!r}")
        print("Triggering OCR fallback...")
        ocr_text = engine._ocr_pdf_extraction(file_path)
        print(f"OCR extraction: {len(ocr_text)} chars.")
        print(f"OCR result snippet: {ocr_text[:500]!r}")
    except Exception as e:
        print(f"Extraction failed: {e}")
        traceback.print_exc()
if __name__ == "__main__":
    # Sample files on the author's machine; used only when no paths are given
    # on the command line, so a bare invocation behaves as before.
    files = [
        "Untitled document (3).pdf",
        "Dzunisani Data Analyst cv.pdf",
        "KRM N6 (1).pdf"
    ]
    default_dir = "C:\\Users\\User\\CascadeProjects\\cv-analyser-backend\\cv-analyser"
    targets = sys.argv[1:] or [os.path.join(default_dir, f) for f in files]

    for path in targets:
        if os.path.exists(path):
            test_file_extraction(path)
        else:
            print(f"File not found: {path}")

test_full_pipeline_v2.py ADDED Viewed

	@@ -0,0 +1,92 @@

import sys
import os
import json
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)

# Add the cv-analyser app to the path (author's WSL checkout location).
sys.path.append("/mnt/c/Users/User/CascadeProjects/cv-analyser-backend/cv-analyser")

# End-to-end smoke run: store one CV, push it through process_job, then check
# that the top-level skills in the stored result agree with structured_data.
# Everything runs at module level; any failure is printed with its traceback.
try:
    from app.tasks.pipeline import process_job
    from app.models import CVAnalysis, CVRecord
    from app.db import session_scope
    import uuid

    # Mock Job object exposing the attributes set below for process_job.
    class MockJob:
        def __init__(self, analysis_id, resume_id, job_description):
            self.analysis_id = str(analysis_id)
            self.resume_id = str(resume_id)
            self.job_description = job_description

    # Test file
    test_pdf = "/mnt/c/Users/User/CascadeProjects/cv-analyser-backend/cv-analyser/Untitled document (3).pdf"

    with session_scope() as db:
        # Create a test record
        record_id = uuid.uuid4()
        analysis_id = uuid.uuid4()

        # Read the file
        from app.services.ocr_service import OCRService
        ocr = OCRService()
        raw_text = ocr.extract_text(test_pdf, "pdf")

        record = CVRecord(
            id=record_id,
            cv_text=raw_text
        )
        db.add(record)

        analysis = CVAnalysis(
            id=analysis_id,
            record_id=record_id,
            status="pending",
            job_description="Python Developer with Flask and AWS experience"
        )
        db.add(analysis)
        db.commit()

        print(f"Created Test Analysis: {analysis_id}")

        # Run the pipeline
        job = MockJob(analysis_id, record_id, analysis.job_description)
        process_job(job)

        # Fetch the result; expire first so the row is re-read from the DB.
        db.expire_all()
        result_analysis = db.get(CVAnalysis, analysis_id)
        if result_analysis is None:
            raise RuntimeError(f"Analysis {analysis_id} not found after processing")

        # A failed or unfinished run may leave `result` unset; inspect an
        # empty dict then, so the status lines below are still reported.
        res = result_analysis.result if isinstance(result_analysis.result, dict) else {}

        print("\n--- FINAL RESULT INSPECTION ---")
        print(f"Status: {result_analysis.status}")
        print(f"Overall Score: {result_analysis.overall_score}")

        # Check for top-level keys
        print(f"\nTop-level keys in response: {list(res.keys())}")

        skills = res.get("skills") or []
        print(f"Top-level skills count: {len(skills)}")
        if skills:
            print(f"Sample skills: {skills[:5]}")

        experience = res.get("experience") or []
        print(f"Top-level experience count: {len(experience)}")

        # Check structured_data
        struct = res.get("structured_data") or {}
        struct_skills = struct.get("skills") or []
        print(f"Structured skills count: {len(struct_skills)}")

        if len(skills) == len(struct_skills) and len(skills) > 0:
            print("\n✅ SUCCESS: Top-level and structured data are in sync!")
        else:
            print("\n❌ WARNING: Data desync or no data found.")

except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()

verify_fix.py ADDED Viewed

	@@ -0,0 +1,42 @@

import sys
import os
import logging

# INFO level so the OCR service's own log lines show up during the check.
logging.basicConfig(level=logging.INFO)

# Make the `app` package importable from the author's WSL checkout.
sys.path.append("/mnt/c/Users/User/CascadeProjects/cv-analyser-backend/cv-analyser")

# Extract text from one sample PDF and, when any text comes back, run the
# regex skills extractor over it. Errors are printed with their traceback.
try:
    from app.services.ocr_service import OCRService
    from app.services.enhanced_skills_extractor import EnhancedSkillsExtractor

    ocr = OCRService()
    test_file = "/mnt/c/Users/User/CascadeProjects/cv-analyser-backend/cv-analyser/Untitled document (3).pdf"

    sample_missing = not os.path.exists(test_file)
    if sample_missing:
        print(f"Error: Test file not found at {test_file}")
        sys.exit(1)

    print(f"Testing extraction on: {test_file}")
    text = ocr._extract_from_pdf(test_file)

    print("\n--- EXTRACTION RESULT ---")
    print(f"Characters: {len(text)}")

    if len(text) == 0:
        print("FAIL: No text extracted")
    else:
        print(f"First 500 chars: {repr(text[:500])}")

        # Skill test
        skills = EnhancedSkillsExtractor().extract_skills(text)
        print(f"Skills found: {len(skills)}")
        if skills:
            print(f"Sample skills: {[s.get('name') for s in skills[:10]]}")

except Exception as e:
    print(f"Error during verification: {e}")
    import traceback
    traceback.print_exc()