# math_sanitizer.py - V1.1 ProductionMathSanitizer
import logging
import re
from typing import Optional

logger = logging.getLogger(__name__)

# The LaTeX fraction command that normalize_latex rewrites into "(a)/(b)".
_FRAC_COMMAND = r'\frac'

# Horizontal rule used to frame the symbolic bridge text.
_BRIDGE_SEPARATOR = "════════════════════════════════════════\n"

# frac with two flat (non-nested) parenthesised arguments. The optional prefix
# consumes the command's own backslash (and a d/t style letter) so that no
# stray "\" is left in front of the rewritten fraction.
_HOTFIX_FRAC_RE = re.compile(r"(?:\\[dt]?)?frac\(([^()]+)\)\(([^()]+)\)")


def _read_frac_argument(expr: str, pos: int) -> Optional[tuple[str, int]]:
    """Read one fraction argument from *expr* starting at index *pos*.

    Leading whitespace is skipped. An argument is either a balanced
    parenthesised group (returned without its outer parentheses) or a single
    alphanumeric character.

    Returns:
        ``(content, next_index)`` on success, or ``None`` when no argument
        can be read (end of string, unbalanced group, or unsupported token).
    """
    end = len(expr)
    while pos < end and expr[pos].isspace():
        pos += 1
    if pos >= end:
        return None

    first = expr[pos]
    if first == '(':
        depth = 0
        for idx in range(pos, end):
            char = expr[idx]
            if char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
                if depth == 0:
                    return expr[pos + 1:idx], idx + 1
        return None  # opening parenthesis is never closed
    if first.isalnum():
        return first, pos + 1
    return None


def _convert_fractions(expr: str) -> str:
    r"""Rewrite every parseable ``\frac A B`` in *expr* as ``(A)/(B)``.

    Arguments are matched with balanced parentheses, so nested fractions and
    arguments that themselves contain ``)(`` are split at the right place.
    A ``\frac`` whose arguments cannot be read is left untouched (and logged)
    so the scan always advances and therefore always terminates.
    """
    pieces = []
    cursor = 0
    while True:
        start = expr.find(_FRAC_COMMAND, cursor)
        if start == -1:
            pieces.append(expr[cursor:])
            break
        pieces.append(expr[cursor:start])
        after_command = start + len(_FRAC_COMMAND)

        numerator = _read_frac_argument(expr, after_command)
        denominator = (
            _read_frac_argument(expr, numerator[1]) if numerator else None
        )
        if numerator is None or denominator is None:
            logger.warning("Could not parse fraction arguments in %r", expr)
            pieces.append(_FRAC_COMMAND)
            cursor = after_command
            continue

        pieces.append(
            f"({_convert_fractions(numerator[0])})"
            f"/({_convert_fractions(denominator[0])})"
        )
        cursor = denominator[1]
    return "".join(pieces)


class ProductionMathSanitizer:
    """Stateless helpers for normalizing math text and building LLM context."""

    @staticmethod
    def normalize_latex(latex_str: str) -> str:
        r"""V1.1: Standardize a LaTeX snippet for SymPy and LLM comparison.

        Steps, in order:
          1. Strip the string, drop ``\ `` spacing commands and turn
             ``\times`` / ``\cdot`` into ``*``.
          2. Replace ``\left(`` / ``\right)`` / ``\left[`` / ``\right]`` with
             the bare bracket and turn every ``{``/``}`` into ``(``/``)``.
          3. Rewrite ``\frac`` (also ``\dfrac`` / ``\tfrac``) as ``(a)/(b)``.
          4. Insert an explicit ``*`` between a digit and a following letter
             or ``(``, and between ``)`` and a following letter, digit or
             ``(``.

        Args:
            latex_str: Raw LaTeX text; may be empty or ``None``.

        Returns:
            The normalized string, or ``""`` for falsy input. A ``\frac``
            whose arguments cannot be parsed is left in the output as-is.
        """
        if not latex_str:
            return ""

        # 1. Basic Cleaning
        clean = latex_str.strip()
        clean = clean.replace(r'\ ', '')
        clean = clean.replace(r'\times', '*')
        clean = clean.replace(r'\cdot', '*')
        # \dfrac and \tfrac only change display size; treat them as \frac.
        clean = clean.replace(r'\dfrac', _FRAC_COMMAND)
        clean = clean.replace(r'\tfrac', _FRAC_COMMAND)

        # 2. Bracket Normalization
        clean = clean.replace(r'\left(', '(').replace(r'\right)', ')')
        clean = clean.replace(r'\left[', '[').replace(r'\right]', ']')
        clean = clean.replace('{', '(').replace('}', ')')

        # 3. Fractions
        # A single balanced-parenthesis pass replaces the former
        # "while '\frac' in clean" regex loop, which never terminated when
        # the pattern could not match (e.g. r"\frac12 + (x)").
        clean = _convert_fractions(clean)

        # 4. Implicit Multiplication Guard (V1.1)
        clean = re.sub(r'(\d)([a-zA-Z(])', r'\1*\2', clean)
        clean = re.sub(r'\)([a-zA-Z0-9(])', r')*\1', clean)

        return clean

    @staticmethod
    def validate_semantic_completeness(
        anchor_data: dict, formula_tokens: list[str]
    ) -> bool:
        """V1.1: Partial Semantic Recovery Check.

        Currently a simple presence check: returns True when *anchor_data*
        holds a non-empty value under ``'function_equations'`` or
        ``'equations'``.

        Args:
            anchor_data: Mapping inspected for the critical keys.
            formula_tokens: Accepted for interface compatibility; not used by
                the current check.

        Returns:
            True if at least one critical key is present with a truthy value,
            otherwise False.
        """
        critical_keys = ('function_equations', 'equations')
        return any(bool(anchor_data.get(key)) for key in critical_keys)

    @staticmethod
    def get_symbolic_bridge(proof_graph) -> str:
        """V1.1: Zero Hallucination Bridge.

        Render *proof_graph* as a plain-text context block for the LLM.

        Args:
            proof_graph: Object exposing ``steps``; each step must provide
                ``step_id``, ``math_content`` and ``logic_description``, and
                may provide ``allowed_concepts`` (iterable of str) and
                ``pedagogical_tag``.

        Returns:
            A framed header, one ``Step ...`` line per step (followed by a
            concept-restriction instruction when the step has a non-empty
            ``allowed_concepts``), a closing separator and the final rule.
        """
        parts = [
            _BRIDGE_SEPARATOR,
            "📜 VERIFIED SYMBOLIC BRIDGE (V1.1):\n",
            _BRIDGE_SEPARATOR,
        ]

        for step in proof_graph.steps:
            parts.append(
                f"Step {step.step_id}: {step.math_content} "
                f"({step.logic_description or ''})\n"
            )

            # V6 Ontology Injection
            allowed = getattr(step, 'allowed_concepts', None)
            if allowed:
                concepts_str = ", ".join(allowed)
                tag = getattr(step, 'pedagogical_tag', 'כללי')
                parts.append(
                    f"For step {step.step_id}, your pedagogical_tag is '{tag}'. "
                    f"You MUST build your explanation using ONLY the concepts "
                    f"from this list: [{concepts_str}]. Do NOT introduce any "
                    f"other mathematical concepts. Keep it under 2 sentences.\n"
                )

        parts.append(_BRIDGE_SEPARATOR)
        parts.append(
            "RULE: USE ONLY THE DATA ABOVE. DO NOT HALLUCINATE OR CHANGE MATH.\n"
        )
        return "".join(parts)


def sanitize_math_ocr_hotfix(text: str) -> str:
    r"""V1.1.1 Aggressive Sanitizer: remove all spaces and fix frac handling.

    Fixes failures caused by leading spaces or visual artifacts.

    Args:
        text: OCR-derived math text; may be empty or ``None``.

    Returns:
        ``""`` for falsy input; otherwise the text with every space removed,
        ``\left`` / ``\right`` dropped, and each ``frac(a)(b)`` (with or
        without a leading ``\``, ``\d`` or ``\t``) whose arguments contain no
        parentheses rewritten as ``((a)/(b))``.
    """
    if not text:
        return ""

    # Critical fix: strip every space so the regex below cannot be defeated
    # by stray whitespace.
    text = text.replace(" ", "")

    # Drop visual sizing leftovers.
    text = text.replace("\\left", "").replace("\\right", "")

    # Normalize fractions (operates on the now space-free string). Group text
    # is inserted literally by re.sub, so backslashes inside the arguments
    # are preserved.
    text = _HOTFIX_FRAC_RE.sub(r"((\1)/(\2))", text)

    return text.strip()