Upload model.py with huggingface_hub
Browse files
model.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
from typing import List, Dict, Optional
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class SimpleSymbolicParser:
    """Parse simple logical queries like 'color:red AND type:image'"""

    def parse(self, query: str) -> Dict:
        """Extract filters from a query string.

        Args:
            query: Free-text query, optionally containing ``key:value``
                filter tokens and the operators ``AND`` / ``OR``.

        Returns:
            Dict with:
              - 'filters': mapping of key -> list of lowercased values
              - 'operator': 'AND' or 'OR' ('AND' is the default)
        """
        filters: Dict[str, List[str]] = {}

        # Extract key:value patterns, accumulating repeated keys.
        for key, value in re.findall(r'(\w+):(\w+)', query):
            filters.setdefault(key, []).append(value.lower())

        # Detect operators. Bug fix: use word boundaries so words that merely
        # contain the operator letters (e.g. "sport" contains "OR", "brand"
        # contains "AND") are not mistaken for operators. re.IGNORECASE keeps
        # the original case-insensitive matching ("and" / "AND").
        has_and = re.search(r'\bAND\b', query, re.IGNORECASE) is not None
        has_or = re.search(r'\bOR\b', query, re.IGNORECASE) is not None

        return {
            'filters': filters,
            'operator': 'AND' if has_and else 'OR' if has_or else 'AND'
        }
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class SimpleNeuroSymbolicSearch:
    """
    Simple Neuro-Symbolic Search combining:
    1. Neural: Sentence embeddings for semantic search
    2. Symbolic: Rule-based filtering
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Initialize with a lightweight sentence transformer.

        Args:
            model_name: sentence-transformers model id; the default is a
                small, fast general-purpose encoder.
        """
        print(f"Loading model: {model_name}")
        self.encoder = SentenceTransformer(model_name)
        self.parser = SimpleSymbolicParser()

        # Indexed corpus state, populated by index_documents().
        self.documents: List[str] = []
        self.embeddings = None  # tensor of doc embeddings once indexed
        self.metadata: List[Dict] = []

    def index_documents(self, documents: List[str], metadata: Optional[List[Dict]] = None):
        """
        Index documents for search.

        Args:
            documents: List of text documents
            metadata: Optional metadata for each document (parallel to
                ``documents``); when omitted, empty dicts are used.
        """
        print(f"Indexing {len(documents)} documents...")

        self.documents = documents
        self.metadata = metadata if metadata else [{} for _ in documents]

        # Generate embeddings for the whole corpus in one batch.
        self.embeddings = self.encoder.encode(
            documents,
            convert_to_tensor=True,
            show_progress_bar=True
        )

        print("✅ Indexing complete!")

    def apply_symbolic_filters(self, indices: List[int], filters: Dict) -> List[int]:
        """Apply rule-based filters to results.

        A document passes when, for every filter key, the key exists in its
        metadata and at least one requested value is a substring of the
        lowercased metadata value. Order of ``indices`` is preserved.
        """
        if not filters:
            return indices

        filtered = []
        for idx in indices:
            doc_meta = self.metadata[idx]
            matches = all(
                key in doc_meta
                and any(v in str(doc_meta[key]).lower() for v in values)
                for key, values in filters.items()
            )
            if matches:
                filtered.append(idx)

        return filtered

    def search(
        self,
        query: str,
        top_k: int = 5,
        use_symbolic: bool = True
    ) -> List[Dict]:
        """
        Search documents using neuro-symbolic approach.

        Args:
            query: Search query (can include filters like 'color:red')
            top_k: Number of results to return
            use_symbolic: Whether to apply symbolic filtering

        Returns:
            List of results with scores and metadata

        Raises:
            ValueError: If index_documents() has not been called yet.
        """
        if self.embeddings is None:
            raise ValueError("No documents indexed. Call index_documents() first.")

        # Parse query for symbolic filters
        parsed = self.parser.parse(query) if use_symbolic else {'filters': {}}

        # Remove filter syntax from the query for neural search. Bug fix:
        # strip AND/OR only as whole words — the old blanket str.replace()
        # mangled words that merely contain them (e.g. "BRAND" -> "BR").
        clean_query = re.sub(r'\w+:\w+', '', query)
        clean_query = re.sub(r'\b(?:AND|OR)\b', ' ', clean_query)
        clean_query = re.sub(r'\s+', ' ', clean_query).strip()

        # Fall back to the raw query if stripping removed everything.
        if not clean_query:
            clean_query = query

        # Neural search: cosine similarity between query and all documents.
        query_embedding = self.encoder.encode(clean_query, convert_to_tensor=True)
        similarities = torch.cosine_similarity(
            query_embedding.unsqueeze(0),
            self.embeddings
        )

        # Over-fetch (3x) so symbolic filtering still leaves enough hits.
        top_results = torch.topk(similarities, k=min(top_k * 3, len(self.documents)))
        top_indices = top_results.indices.tolist()
        top_scores = top_results.values.tolist()

        # Bug fix: key scores by document index *before* filtering. The old
        # code looked scores up by position in the filtered index list, which
        # attached wrong scores whenever a filter removed any candidate.
        score_by_index = dict(zip(top_indices, top_scores))

        # Apply symbolic filters
        if use_symbolic and parsed['filters']:
            print(f"🔧 Applying filters: {parsed['filters']}")
            top_indices = self.apply_symbolic_filters(top_indices, parsed['filters'])

        # Format results
        results = []
        for rank, idx in enumerate(top_indices[:top_k], start=1):
            results.append({
                'rank': rank,
                'score': score_by_index.get(idx, 0.0),
                'document': self.documents[idx],
                'metadata': self.metadata[idx]
            })

        return results
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def demo():
    """Demo of the neuro-symbolic search system"""

    # Sample documents with metadata
    documents = [
        "A red sports car racing on the track",
        "A blue sedan parked in the garage",
        "A yellow taxi driving in the city",
        "A green bicycle leaning against a wall",
        "A red fire truck responding to emergency",
        "A black motorcycle parked on the street",
        "Children playing soccer in the park",
        "A cat sleeping on a red couch",
    ]

    # One metadata dict per document above (parallel lists).
    metadata = [
        {'type': 'vehicle', 'color': 'red', 'category': 'car'},
        {'type': 'vehicle', 'color': 'blue', 'category': 'car'},
        {'type': 'vehicle', 'color': 'yellow', 'category': 'car'},
        {'type': 'vehicle', 'color': 'green', 'category': 'bicycle'},
        {'type': 'vehicle', 'color': 'red', 'category': 'truck'},
        {'type': 'vehicle', 'color': 'black', 'category': 'motorcycle'},
        {'type': 'activity', 'color': 'none', 'category': 'sports'},
        {'type': 'animal', 'color': 'red', 'category': 'pet'},
    ]

    # Initialize search system (loads the sentence-transformer model)
    search = SimpleNeuroSymbolicSearch()

    # Index documents
    search.index_documents(documents, metadata)

    print("\n" + "="*70)
    print("🔍 DEMO: Neuro-Symbolic Search")
    print("="*70)

    # Test 1: Pure neural search (use_symbolic=False disables filtering)
    print("\n1. Neural Search: 'red vehicle'")
    print("-" * 70)
    results = search.search("red vehicle", top_k=3, use_symbolic=False)
    for r in results:
        print(f"  [{r['rank']}] Score: {r['score']:.3f} | {r['document']}")

    # Test 2: Neuro-symbolic search with filters (color:red is parsed out)
    print("\n2. Neuro-Symbolic: 'vehicle color:red'")
    print("-" * 70)
    results = search.search("vehicle color:red", top_k=3)
    for r in results:
        print(f"  [{r['rank']}] Score: {r['score']:.3f} | {r['document']}")
        print(f"      Metadata: {r['metadata']}")

    # Test 3: Complex query — semantic text plus two symbolic filters
    print("\n3. Neuro-Symbolic: 'fast vehicle color:red category:car'")
    print("-" * 70)
    results = search.search("fast vehicle color:red category:car", top_k=3)
    for r in results:
        print(f"  [{r['rank']}] Score: {r['score']:.3f} | {r['document']}")

    print("\n" + "="*70)
    print("✅ Demo complete!")
| 220 |
+
|
| 221 |
+
|
| 222 |
+
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    demo()
|