kofdai committed on
Commit
b2be821
verified
1 Parent(s): 6f561fe

Upload quick_eval_hle.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. quick_eval_hle.py +80 -0
quick_eval_hle.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Quick HLE 2500 evaluation.

Loads the question set from a JSONL file, runs every question through the
VerantyxV6Enhanced pipeline, scores each answer with flexible_match, prints
overall and per-category accuracy, and writes a JSON summary to disk.
"""
import json
import sys
import time

# NOTE(review): machine-specific absolute path kept from the original script so
# existing runs keep working; it must point at the directory that contains
# `pipeline_enhanced` and `core` -- adjust when running on another machine.
PROJECT_ROOT = '/Users/motonishikoudai/.openclaw/workspace/verantyx_v6'
DATASET_PATH = "hle_2500_eval.jsonl"
PIECE_DB_PATH = "pieces/piece_db.jsonl"
OUTPUT_PATH = 'hle_2500_phase5h_final.json'
MATCH_TOLERANCE = 1e-4
PROGRESS_EVERY = 100
MAX_REPORTED_ERRORS = 5


def load_questions(path):
    """Return the list of JSON objects stored one per line in *path*.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of being handed to ``json.loads``, which would raise.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def has_value(value):
    """Return True when *value* should be treated as a real answer.

    Numbers always count, so a legitimate answer of ``0`` is not discarded
    by a plain truthiness test; every other value keeps its ordinary
    truthiness (``None``, ``""`` and empty containers are rejected).
    """
    return isinstance(value, (int, float)) or bool(value)


def accuracy_percent(correct, total):
    """Return ``correct / total`` as a percentage, or 0.0 when *total* is 0."""
    return correct / total * 100 if total else 0.0


def evaluate(pipeline, questions, matcher,
             tolerance=MATCH_TOLERANCE, progress_every=PROGRESS_EVERY):
    """Run *pipeline* over *questions* and tally the results.

    Args:
        pipeline: object whose ``solve(question_text)`` returns a mapping
            that is read with ``.get('answer')``.
        questions: sequence of dicts with ``'question'`` and ``'answer'``
            keys and an optional ``'category'`` key.
        matcher: callable invoked as
            ``matcher(answer, expected, tolerance=tolerance)``; a truthy
            return value marks the question as correct.
        tolerance: forwarded to *matcher*.
        progress_every: print a progress line every this many questions;
            a falsy value disables progress output.

    Returns:
        dict with ``'total'``, ``'correct'``, ``'errors'`` and
        ``'category_stats'`` (category -> ``{'total', 'correct'}``).
    """
    correct = 0
    total = 0
    errors = 0
    category_stats = {}
    count = len(questions)

    for position, q in enumerate(questions, start=1):
        if progress_every and position % progress_every == 0:
            print(f"Progress: {position}/{count} ({position/count*100:.1f}%)")

        category = q.get('category', 'Unknown')
        stats = category_stats.setdefault(category, {'total': 0, 'correct': 0})
        stats['total'] += 1
        total += 1

        try:
            result = pipeline.solve(q['question'])
            answer = result.get('answer')
            expected = q['answer']
            matched = (
                has_value(answer)
                and has_value(expected)
                and matcher(answer, expected, tolerance=tolerance)
            )
        except Exception as exc:
            # Best effort: a failing question is scored as incorrect and the
            # run continues, but the failure is counted and the first few are
            # reported so crashes are not mistaken for wrong answers.
            errors += 1
            if errors <= MAX_REPORTED_ERRORS:
                print(
                    f"Question {position} failed: {type(exc).__name__}: {exc}",
                    file=sys.stderr,
                )
            continue

        if matched:
            correct += 1
            stats['correct'] += 1

    return {
        'total': total,
        'correct': correct,
        'errors': errors,
        'category_stats': category_stats,
    }


def print_report(summary, elapsed):
    """Print overall numbers and the per-category breakdown to stdout.

    Categories are listed from highest to lowest accuracy.
    """
    total = summary['total']
    correct = summary['correct']

    print(f"\n{'='*80}")
    print("RESULTS")
    print(f"{'='*80}")
    print(f"Total: {total}")
    print(f"Correct: {correct}")
    if summary['errors']:
        print(f"Errors: {summary['errors']}")
    print(f"Accuracy: {accuracy_percent(correct, total):.2f}%")
    print(f"Time: {elapsed:.1f}s")
    print()
    print("Category breakdown:")

    def by_accuracy_desc(item):
        stats = item[1]
        return -stats['correct'] / stats['total'] if stats['total'] > 0 else 0

    for cat, stats in sorted(summary['category_stats'].items(), key=by_accuracy_desc):
        pct = accuracy_percent(stats['correct'], stats['total'])
        print(f" {cat}: {stats['correct']}/{stats['total']} ({pct:.1f}%)")


def save_results(summary, elapsed, path=OUTPUT_PATH):
    """Write the evaluation summary as indented JSON to *path*."""
    payload = {
        'total': summary['total'],
        'correct': summary['correct'],
        'accuracy': accuracy_percent(summary['correct'], summary['total']),
        'time': elapsed,
        'category_stats': summary['category_stats'],
        'errors': summary['errors'],
    }
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2)


def main():
    """Load the dataset, evaluate the pipeline, report and save the results."""
    # The project modules live outside the default import path, so the path
    # must be extended before they are imported.
    sys.path.insert(0, PROJECT_ROOT)
    from pipeline_enhanced import VerantyxV6Enhanced
    from core.answer_matcher import flexible_match

    # Load dataset
    print("Loading HLE 2500...")
    questions = load_questions(DATASET_PATH)
    print(f"Loaded {len(questions)} questions")

    # Initialize pipeline
    print("Initializing pipeline...")
    pipeline = VerantyxV6Enhanced(piece_db_path=PIECE_DB_PATH)
    print("Ready")

    # Evaluate
    print("\nEvaluating...")
    start_time = time.perf_counter()
    summary = evaluate(pipeline, questions, flexible_match)
    elapsed = time.perf_counter() - start_time

    # Results
    print_report(summary, elapsed)

    # Save
    save_results(summary, elapsed)
    print(f"\nSaved to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()