BuddyMath / math_sanitizer.py
dotandru's picture
Fix: Clean production deployment with sse-starlette
9d29c62
# math_sanitizer.py - V1.1 ProductionMathSanitizer
import re
import logging
logger = logging.getLogger(__name__)
class ProductionMathSanitizer:
    """Stateless helpers for normalising LaTeX and building LLM math context (V1.1)."""

    @staticmethod
    def normalize_latex(latex_str: str) -> str:
        """
        V1.1: Standardizes LaTeX for SymPy and LLM comparison.

        Steps, in order: trim the input and rewrite ``\\times`` / ``\\cdot`` as
        ``*``; drop ``\\left`` / ``\\right`` in front of round and square
        brackets and turn braces into parentheses; rewrite fractions as
        ``(num)/(den)``; insert explicit ``*`` for implicit multiplication.

        Args:
            latex_str: Raw LaTeX expression. Falsy input yields ``""``.

        Returns:
            The normalised expression. A ``\\frac`` whose two arguments cannot
            be parsed is left in place instead of being rewritten.
        """
        if not latex_str:
            return ""

        # 1. Basic cleaning
        clean = latex_str.strip()
        clean = clean.replace(r'\ ', '')
        clean = clean.replace(r'\times', '*')
        clean = clean.replace(r'\cdot', '*')

        # 2. Bracket normalization (braces become parentheses, so every
        #    braced fraction argument is a parenthesised group from here on)
        clean = clean.replace(r'\left(', '(').replace(r'\right)', ')')
        clean = clean.replace(r'\left[', '[').replace(r'\right]', ']')
        clean = clean.replace('{', '(').replace('}', ')')

        # 3. Fractions. A single bounded pass replaces the former
        #    `while r'\frac' in clean` loop, which could spin forever when a
        #    fraction did not match the rewrite pattern.
        clean = ProductionMathSanitizer._rewrite_fractions(clean)

        # 4. Implicit multiplication guard (V1.1): "2x" -> "2*x", ")(" -> ")*("
        clean = re.sub(r'(\d)([a-zA-Z(])', r'\1*\2', clean)
        clean = re.sub(r'\)([a-zA-Z0-9(])', r')*\1', clean)
        return clean

    @staticmethod
    def _rewrite_fractions(text: str) -> str:
        """Rewrite every parseable ``\\frac`` / ``\\dfrac`` / ``\\tfrac`` as ``(num)/(den)``."""
        heads = [(m.start(), m.end()) for m in re.finditer(r'\\[dt]?frac', text)]
        # Walk right to left: the right-most fraction cannot contain another
        # one, so nested fractions are resolved innermost first, and offsets of
        # the heads still to be visited are never shifted by a replacement.
        for start, after_cmd in reversed(heads):
            numerator = ProductionMathSanitizer._read_frac_arg(text, after_cmd)
            if numerator is None:
                continue
            denominator = ProductionMathSanitizer._read_frac_arg(text, numerator[1])
            if denominator is None:
                continue
            # NOTE(review): the result is not wrapped in outer parentheses, so
            # an expression such as x^\frac{1}{2} yields x^(1)/(2). The
            # historical output shape is kept for backward compatibility.
            text = (
                f"{text[:start]}({numerator[0]})/({denominator[0]})"
                f"{text[denominator[1]:]}"
            )
        return text

    @staticmethod
    def _read_frac_arg(text: str, pos: int):
        """Read one fraction argument at ``pos``; return ``(content, next_pos)`` or ``None``."""
        while pos < len(text) and text[pos].isspace():
            pos += 1
        if pos >= len(text):
            return None

        first = text[pos]
        if first == '(':
            # Balanced scan so that arguments may themselves contain parentheses.
            depth = 0
            for idx in range(pos, len(text)):
                char = text[idx]
                if char == '(':
                    depth += 1
                elif char == ')':
                    depth -= 1
                    if depth == 0:
                        return text[pos + 1:idx], idx + 1
            return None  # unbalanced group
        if first == '\\':
            # Control word used as a bare argument, e.g. \frac\pi2
            command = re.compile(r'\\[A-Za-z]+').match(text, pos)
            if command:
                return command.group(0), command.end()
            return None
        if first.isalnum():
            # Unbraced single-token argument, e.g. \frac12
            return first, pos + 1
        return None

    @staticmethod
    def validate_semantic_completeness(anchor_data: dict, formula_tokens: list[str]) -> bool:
        """
        V1.1: Partial Semantic Recovery Check.

        Args:
            anchor_data: Mapping inspected for the critical keys.
            formula_tokens: Accepted for interface compatibility; not
                consulted by the current check.

        Returns:
            True when ``anchor_data`` holds a truthy value under
            ``'function_equations'`` or ``'equations'``, otherwise False
            (including when ``anchor_data`` is empty or None).
        """
        if not anchor_data:
            return False
        # For now the check is only "is a main equation key present and non-empty".
        critical_keys = ('function_equations', 'equations')
        return any(key in anchor_data and bool(anchor_data[key]) for key in critical_keys)

    @staticmethod
    def get_symbolic_bridge(proof_graph) -> str:
        """
        V1.1: Zero Hallucination Bridge.

        Renders ``proof_graph.steps`` as a text block for the LLM: a banner,
        one ``Step <id>: <math> (<description>)`` line per step, an extra
        instruction line for each step that carries a non-empty
        ``allowed_concepts`` attribute, and a closing rule line.

        Args:
            proof_graph: Object exposing an iterable ``steps``; each step must
                provide ``step_id``, ``math_content`` and ``logic_description``
                and may provide ``allowed_concepts`` / ``pedagogical_tag``.

        Returns:
            The assembled multi-line string (every line ends with a newline).
        """
        # NOTE(review): the divider, the banner prefix and the default
        # pedagogical tag below look like UTF-8 text that was decoded with the
        # wrong codec. They are reproduced verbatim here; confirm against the
        # repository copy before changing them.
        divider = "鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲鈺愨晲\n"
        parts = [divider, "馃摐 VERIFIED SYMBOLIC BRIDGE (V1.1):\n", divider]

        for step in proof_graph.steps:
            parts.append(
                f"Step {step.step_id}: {step.math_content} ({step.logic_description or ''})\n"
            )
            # V6 Ontology Injection: only for steps with a non-empty concept list
            concepts = getattr(step, 'allowed_concepts', None)
            if concepts:
                concepts_str = ", ".join(concepts)
                tag = getattr(step, 'pedagogical_tag', '讻诇诇讬')
                parts.append(
                    f"For step {step.step_id}, your pedagogical_tag is '{tag}'. You MUST build your explanation using ONLY the concepts from this list: [{concepts_str}]. Do NOT introduce any other mathematical concepts. Keep it under 2 sentences.\n"
                )

        parts.append(divider)
        parts.append("RULE: USE ONLY THE DATA ABOVE. DO NOT HALLUCINATE OR CHANGE MATH.\n")
        # Joined once at the end instead of growing a string with repeated +=.
        return "".join(parts)
def sanitize_math_ocr_hotfix(text: str) -> str:
    """
    V1.1.1 Aggressive Sanitizer: Removes all spaces and fixes frac regex.
    Fixes failures caused by leading spaces or visual artifacts.

    Processing order: delete every ASCII space, drop ``\\left`` / ``\\right``
    sizing commands, then rewrite each ``frac(num)(den)`` as ``((num)/(den))``.
    The fraction head may be bare ``frac`` or ``\\frac`` / ``\\dfrac`` /
    ``\\tfrac``; arguments may contain balanced parentheses, so nested
    fractions are converted too. Fractions with an empty or unbalanced
    argument are left unchanged.

    Args:
        text: OCR-produced math text. Falsy input yields ``""``.

    Returns:
        The sanitized string with surrounding whitespace stripped.
    """
    if not text:
        return ""

    import re

    # Critical fix: remove every space up front so the matching below never
    # fails on stray gaps.
    text = text.replace(" ", "")

    # Clean up visual leftovers. The lookahead keeps commands that merely
    # start with the same letters (\leftarrow, \rightarrow, \leftrightarrow).
    text = re.sub(r"\\(?:left|right)(?![A-Za-z])", "", text)

    def read_group(source: str, open_at: int):
        """Return (inner, end) for the balanced (...) group at open_at, else None."""
        if open_at >= len(source) or source[open_at] != "(":
            return None
        depth = 0
        for idx in range(open_at, len(source)):
            char = source[idx]
            if char == "(":
                depth += 1
            elif char == ")":
                depth -= 1
                if depth == 0:
                    return source[open_at + 1:idx], idx + 1
        return None

    # Normalize fractions (the string is already free of spaces). Heads are
    # visited right to left so inner fractions are rewritten before the
    # fractions that contain them, and earlier offsets stay valid.
    heads = [
        (m.start(), m.end())
        for m in re.finditer(r"(?:\\[dt]?)?frac(?=\()", text)
    ]
    for start, after_cmd in reversed(heads):
        numerator = read_group(text, after_cmd)
        if not numerator or not numerator[0]:
            continue
        denominator = read_group(text, numerator[1])
        if not denominator or not denominator[0]:
            continue
        text = (
            f"{text[:start]}(({numerator[0]})/({denominator[0]}))"
            f"{text[denominator[1]:]}"
        )

    return text.strip()