Spaces:

dotandru
/

BuddyMath

Sleeping

App Files Files Community

BuddyMath / memory_service.py

dotandru

Fix: Clean production deployment with sse-starlette

9d29c62 3 months ago

raw

history blame

5.78 kB

	# memory_service.py
	"""
	Buddy Math - Memory Service (Pinecone Edition)
	==============================================
	גרסה v2.1: שיפור מנגנון זיהוי הדמיון.
	במקום לבדוק סדר מילים (שנשבר ב-OCR), בודקים חפיפת מילים (Jaccard).
	"""

	import os
	import json
	import logging
	import uuid
	import re
	from typing import Optional, Dict

	from sentence_transformers import SentenceTransformer
	from pinecone import Pinecone

	# Module-level logger shared by everything in this file; records are emitted
	# under the fixed logger name "MemoryService".
	logger = logging.getLogger("MemoryService")

	class MemoryService:
	    """Pinecone-backed memory of previously solved problems.

	    Solutions are stored as JSON inside the metadata of a vector whose values
	    are the sentence embedding of the problem text.  A lookup is accepted only
	    when it passes two checks: the vector similarity score and a word-overlap
	    (Jaccard) comparison against the stored problem text.

	    When ``PINECONE_API_KEY`` is missing or initialisation fails, ``index`` is
	    ``None`` and every public method becomes a no-op.
	    """

	    # Number of leading characters of the problem text kept in the metadata.
	    # Lookups truncate the query to the same length before the overlap check.
	    _STORED_TEXT_CHARS = 1000
	    # Problems shorter than this (after stripping) are never stored.
	    _MIN_PROBLEM_CHARS = 10
	    # Upper bound for the serialised solution alone (UTF-8 bytes).
	    _MAX_SOLUTION_JSON_BYTES = 38000
	    # Upper bound for the whole metadata payload (40KB, UTF-8 bytes).
	    _MAX_METADATA_BYTES = 40 * 1024

	    def __init__(self):
	        """Connect to the ``buddy-math`` index and load the embedding model.

	        Never raises: any failure is logged and leaves the service disabled.
	        """
	        self.api_key = os.environ.get("PINECONE_API_KEY")
	        # Defined up front so both attributes exist on every code path.
	        self.index = None
	        self.embedder = None

	        if not self.api_key:
	            logger.warning("⚠️ PINECONE_API_KEY not found! Memory will be disabled.")
	            return

	        try:
	            self.pc = Pinecone(api_key=self.api_key)
	            self.index_name = "buddy-math"
	            index = self.pc.Index(self.index_name)

	            logger.info("⏳ Loading embedding model...")
	            self.embedder = SentenceTransformer('all-MiniLM-L6-v2')

	            # Enable the service only once both dependencies are ready.
	            self.index = index
	            logger.info("✅ Brain initialized (Pinecone + MiniLM)")

	        except Exception as e:
	            logger.error(f"❌ Failed to init Pinecone: {e}")
	            self.index = None
	            self.embedder = None

	    def find_similar_solution(self, problem_text: str, vector_threshold: float = 0.85,
	                              text_threshold: float = 0.3) -> Optional[Dict]:
	        """Look up a stored solution for ``problem_text``.

	        Args:
	            problem_text: The problem statement to search for.
	            vector_threshold: Minimum score of the best vector match.
	            text_threshold: Minimum Jaccard word overlap between the query and
	                the stored problem text.  The default of 0.3 is deliberately
	                low so the same question is still recognised after noisy OCR.

	        Returns:
	            The decoded solution dict when both checks pass, otherwise ``None``
	            (also when the service is disabled, the query is empty, or any
	            error occurs; errors are logged, never raised).
	        """
	        if self.index is None:
	            return None

	        try:
	            # Normalise exactly like learn_solution() does before storing.
	            query_text = (problem_text or "").strip()
	            if not query_text:
	                # An empty query has no tokens, so it could never pass the
	                # overlap check; skip the embedding and the network call.
	                return None

	            # 1. Vector search (fast).
	            vector = self.embedder.encode(query_text).tolist()
	            results = self.index.query(
	                vector=vector,
	                top_k=1,
	                include_metadata=True
	            )

	            if not results['matches']:
	                return None

	            match = results['matches'][0]
	            vector_score = match['score']
	            if vector_score < vector_threshold:
	                return None

	            metadata = match['metadata'] or {}
	            cached_text = metadata.get('text', '')

	            # 2. Word-overlap check, independent of word order.  The stored
	            # text was truncated when saved, so the query is truncated to the
	            # same length; otherwise a long, identical problem would show an
	            # artificially low overlap and be rejected.
	            text_similarity = self._calculate_jaccard_similarity(
	                query_text[:self._STORED_TEXT_CHARS], cached_text
	            )

	            logger.info(f"🧠 Brain Check: Vector={vector_score:.3f}, Jaccard={text_similarity:.3f}")

	            if text_similarity < text_threshold:
	                logger.warning("⚠️ High vector score but low text overlap. Ignoring.")
	                return None

	            solution_json = metadata.get('solution_json')
	            if solution_json:
	                logger.info("🧠 Brain HIT! Verified match found.")
	                return json.loads(solution_json)

	            return None

	        except Exception as e:
	            logger.error(f"Memory search failed: {e}")
	            return None

	    def learn_solution(self, problem_text: str, solution_data: dict):
	        """Store a new solution under the embedding of ``problem_text``.

	        Nothing is stored when the service is disabled, when the stripped
	        problem text is shorter than 10 characters, or when the payload is too
	        large for the metadata.  Errors are logged, never raised.

	        Args:
	            problem_text: The problem statement the solution belongs to.
	            solution_data: JSON-serialisable solution; ``meta.topic`` is copied
	                into the metadata when present.
	        """
	        if self.index is None:
	            return

	        try:
	            clean_text = problem_text.strip()
	            if len(clean_text) < self._MIN_PROBLEM_CHARS:
	                return

	            json_str = json.dumps(solution_data, ensure_ascii=False)
	            if len(json_str.encode('utf-8')) > self._MAX_SOLUTION_JSON_BYTES:
	                logger.warning("⚠️ Solution too big for memory. Skipping save.")
	                return

	            # A missing or null "meta"/"topic" falls back to "unknown" rather
	            # than raising or sending a null metadata value.
	            meta = solution_data.get("meta") or {}
	            topic = meta.get("topic", "unknown")
	            if topic is None:
	                topic = "unknown"

	            metadata = {
	                "text": clean_text[:self._STORED_TEXT_CHARS],
	                "topic": topic,
	                "solution_json": json_str
	            }

	            # The limit applies to the whole metadata payload, so the stored
	            # text and topic are counted too, not just the solution JSON.
	            if self._metadata_bytes(metadata) > self._MAX_METADATA_BYTES:
	                logger.warning("⚠️ Solution too big for memory. Skipping save.")
	                return

	            # Embed only after the size checks so no work is wasted on
	            # payloads that will not be saved.
	            vector = self.embedder.encode(clean_text).tolist()

	            self.index.upsert(vectors=[{
	                "id": str(uuid.uuid4()),
	                "values": vector,
	                "metadata": metadata
	            }])

	            logger.info("🧠 Brain LEARNED and saved to Cloud!")

	        except Exception as e:
	            logger.error(f"Memory learn failed: {e}")

	    @staticmethod
	    def _metadata_bytes(metadata: dict) -> int:
	        """Return the summed UTF-8 byte length of all keys and values."""
	        return sum(
	            len(str(key).encode('utf-8')) + len(str(value).encode('utf-8'))
	            for key, value in metadata.items()
	        )

	    def _calculate_jaccard_similarity(self, text1: str, text2: str) -> float:
	        """Return the Jaccard similarity of the two texts' token sets.

	        The result is ``|A & B| / |A | B|`` over the sets produced by
	        ``_tokenize``, so word order is irrelevant.  Returns ``0.0`` when
	        either text yields no tokens.
	        """
	        tokens1 = self._tokenize(text1)
	        tokens2 = self._tokenize(text2)

	        if not tokens1 or not tokens2:
	            return 0.0

	        shared = tokens1 & tokens2
	        combined = tokens1 | tokens2
	        return len(shared) / len(combined)

	    def _tokenize(self, text: str) -> set:
	        """Split ``text`` into a set of lower-cased tokens.

	        Every character that is neither a word character nor whitespace is
	        deleted (not replaced by a space, so ``"2+3"`` becomes ``"23"``), the
	        rest is lower-cased and split on whitespace, and single-character
	        tokens are dropped.  ``None`` or empty input yields an empty set.
	        """
	        if not text:
	            return set()

	        stripped = re.sub(r'[^\w\s]', '', text)
	        return {word for word in stripped.lower().split() if len(word) > 1}