GabeA commited on
Commit
a2f2223
·
verified ·
1 Parent(s): 36cb153

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. results/voyage-3-large.json +546 -0
results/voyage-3-large.json ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task_id": "conversation_similarity",
4
+ "task_name": "Conversation Similarity",
5
+ "model_name": "voyage-3-large",
6
+ "benchmark_version": "0.1.0",
7
+ "dataset_revision": "",
8
+ "mrr_at_10": 0.1140340689121177,
9
+ "map_at_10": 0.05175022583559168,
10
+ "recall_at_1": 0.0,
11
+ "recall_at_5": 0.23170731707317074,
12
+ "recall_at_10": 0.45121951219512196,
13
+ "ndcg_at_10": 0.10921881438171957,
14
+ "num_queries": 82,
15
+ "num_corpus_docs": 1595,
16
+ "confidence_intervals": {
17
+ "MRR@10": [
18
+ 0.07998923248161052,
19
+ 0.14816056910569103
20
+ ],
21
+ "MAP@10": [
22
+ 0.033713221060782034,
23
+ 0.07145680087753259
24
+ ],
25
+ "R@1": [
26
+ 0.0,
27
+ 0.0
28
+ ],
29
+ "R@5": [
30
+ 0.14634146341463414,
31
+ 0.31737804878048753
32
+ ],
33
+ "R@10": [
34
+ 0.34146341463414637,
35
+ 0.5490853658536583
36
+ ],
37
+ "NDCG@10": [
38
+ 0.07815159631767281,
39
+ 0.13875558398642276
40
+ ]
41
+ },
42
+ "per_difficulty": {
43
+ "unknown": {
44
+ "MRR@10": 0.1140340689121177,
45
+ "MAP@10": 0.05175022583559168,
46
+ "R@1": 0.0,
47
+ "R@5": 0.23170731707317074,
48
+ "R@10": 0.45121951219512196,
49
+ "NDCG@10": 0.10921881438171957,
50
+ "count": 82
51
+ }
52
+ },
53
+ "hard_negative_metrics": {},
54
+ "robustness_score": null,
55
+ "metric_std_devs": {},
56
+ "mean_query_stability": null,
57
+ "n_paraphrases": 0
58
+ },
59
+ {
60
+ "task_id": "cross_channel",
61
+ "task_name": "Cross Channel",
62
+ "model_name": "voyage-3-large",
63
+ "benchmark_version": "0.1.0",
64
+ "dataset_revision": "",
65
+ "mrr_at_10": 0.3200418314136726,
66
+ "map_at_10": 0.15571194582927436,
67
+ "recall_at_1": 0.17509025270758122,
68
+ "recall_at_5": 0.5126353790613718,
69
+ "recall_at_10": 0.7220216606498195,
70
+ "ndcg_at_10": 0.2551008842387347,
71
+ "num_queries": 554,
72
+ "num_corpus_docs": 1595,
73
+ "confidence_intervals": {
74
+ "MRR@10": [
75
+ 0.2915454164517792,
76
+ 0.3524498237923328
77
+ ],
78
+ "MAP@10": [
79
+ 0.1409206671537448,
80
+ 0.1720265567995148
81
+ ],
82
+ "R@1": [
83
+ 0.1444043321299639,
84
+ 0.2075812274368231
85
+ ],
86
+ "R@5": [
87
+ 0.4729241877256318,
88
+ 0.5577617328519856
89
+ ],
90
+ "R@10": [
91
+ 0.6841155234657039,
92
+ 0.759927797833935
93
+ ],
94
+ "NDCG@10": [
95
+ 0.2363556281226957,
96
+ 0.27561158230539357
97
+ ]
98
+ },
99
+ "per_difficulty": {
100
+ "easy": {
101
+ "MRR@10": 0.2559862976529643,
102
+ "MAP@10": 0.11582214082214082,
103
+ "R@1": 0.10256410256410256,
104
+ "R@5": 0.46153846153846156,
105
+ "R@10": 0.7264957264957265,
106
+ "NDCG@10": 0.21392533137141628,
107
+ "count": 117
108
+ },
109
+ "hard": {
110
+ "MRR@10": 0.3544979873927242,
111
+ "MAP@10": 0.1653557821320979,
112
+ "R@1": 0.22009569377990432,
113
+ "R@5": 0.5215311004784688,
114
+ "R@10": 0.6985645933014354,
115
+ "NDCG@10": 0.2625001923284304,
116
+ "count": 209
117
+ },
118
+ "medium": {
119
+ "MRR@10": 0.3213276246170983,
120
+ "MAP@10": 0.16734153091060983,
121
+ "R@1": 0.17105263157894737,
122
+ "R@5": 0.5307017543859649,
123
+ "R@10": 0.7412280701754386,
124
+ "NDCG@10": 0.2694477451805324,
125
+ "count": 228
126
+ }
127
+ },
128
+ "hard_negative_metrics": {
129
+ "hn_mean_rank": 9.653429602888087,
130
+ "hn_above_relevant_rate": 0.12815884476534295,
131
+ "hn_query_count": 554
132
+ },
133
+ "robustness_score": null,
134
+ "metric_std_devs": {},
135
+ "mean_query_stability": null,
136
+ "n_paraphrases": 0
137
+ },
138
+ {
139
+ "task_id": "cross_platform_transfer",
140
+ "task_name": "Cross-Platform Transfer (slack)",
141
+ "model_name": "voyage-3-large",
142
+ "benchmark_version": "0.1.0",
143
+ "dataset_revision": "",
144
+ "mrr_at_10": 0.8568057311420142,
145
+ "map_at_10": 0.8568057311420142,
146
+ "recall_at_1": 0.7964601769911505,
147
+ "recall_at_5": 0.9292035398230089,
148
+ "recall_at_10": 0.9734513274336283,
149
+ "ndcg_at_10": 0.8852294883180861,
150
+ "num_queries": 113,
151
+ "num_corpus_docs": 1232,
152
+ "confidence_intervals": {
153
+ "MRR@10": [
154
+ 0.8005193847450484,
155
+ 0.9074083438685208
156
+ ],
157
+ "MAP@10": [
158
+ 0.8005193847450484,
159
+ 0.9074083438685208
160
+ ],
161
+ "R@1": [
162
+ 0.7168141592920354,
163
+ 0.8672566371681416
164
+ ],
165
+ "R@5": [
166
+ 0.8849557522123894,
167
+ 0.9734513274336283
168
+ ],
169
+ "R@10": [
170
+ 0.9380530973451328,
171
+ 1.0
172
+ ],
173
+ "NDCG@10": [
174
+ 0.8377052536121854,
175
+ 0.9281102743762883
176
+ ]
177
+ },
178
+ "per_difficulty": {
179
+ "unknown": {
180
+ "MRR@10": 0.8568057311420142,
181
+ "MAP@10": 0.8568057311420142,
182
+ "R@1": 0.7964601769911505,
183
+ "R@5": 0.9292035398230089,
184
+ "R@10": 0.9734513274336283,
185
+ "NDCG@10": 0.8852294883180861,
186
+ "count": 113
187
+ }
188
+ },
189
+ "hard_negative_metrics": {},
190
+ "robustness_score": null,
191
+ "metric_std_devs": {},
192
+ "mean_query_stability": null,
193
+ "n_paraphrases": 0
194
+ },
195
+ {
196
+ "task_id": "response_retrieval",
197
+ "task_name": "Response Retrieval",
198
+ "model_name": "voyage-3-large",
199
+ "benchmark_version": "0.1.0",
200
+ "dataset_revision": "",
201
+ "mrr_at_10": 0.48689126984126985,
202
+ "map_at_10": 0.48689126984126985,
203
+ "recall_at_1": 0.416,
204
+ "recall_at_5": 0.574,
205
+ "recall_at_10": 0.64,
206
+ "ndcg_at_10": 0.5235468640787492,
207
+ "num_queries": 500,
208
+ "num_corpus_docs": 1173,
209
+ "confidence_intervals": {
210
+ "MRR@10": [
211
+ 0.4463512301587302,
212
+ 0.5278171428571428
213
+ ],
214
+ "MAP@10": [
215
+ 0.4463512301587302,
216
+ 0.5278171428571428
217
+ ],
218
+ "R@1": [
219
+ 0.37395000000000006,
220
+ 0.462
221
+ ],
222
+ "R@5": [
223
+ 0.53,
224
+ 0.618
225
+ ],
226
+ "R@10": [
227
+ 0.596,
228
+ 0.68005
229
+ ],
230
+ "NDCG@10": [
231
+ 0.48266746398429305,
232
+ 0.5630537660518903
233
+ ]
234
+ },
235
+ "per_difficulty": {
236
+ "unknown": {
237
+ "MRR@10": 0.48689126984126985,
238
+ "MAP@10": 0.48689126984126985,
239
+ "R@1": 0.416,
240
+ "R@5": 0.574,
241
+ "R@10": 0.64,
242
+ "NDCG@10": 0.5235468640787492,
243
+ "count": 500
244
+ }
245
+ },
246
+ "hard_negative_metrics": {},
247
+ "robustness_score": null,
248
+ "metric_std_devs": {},
249
+ "mean_query_stability": null,
250
+ "n_paraphrases": 0
251
+ },
252
+ {
253
+ "task_id": "specific_detail",
254
+ "task_name": "Specific Detail",
255
+ "model_name": "voyage-3-large",
256
+ "benchmark_version": "0.1.0",
257
+ "dataset_revision": "",
258
+ "mrr_at_10": 0.3294289196482981,
259
+ "map_at_10": 0.3294289196482981,
260
+ "recall_at_1": 0.1773308957952468,
261
+ "recall_at_5": 0.5301645338208409,
262
+ "recall_at_10": 0.7294332723948812,
263
+ "ndcg_at_10": 0.42377023479275683,
264
+ "num_queries": 547,
265
+ "num_corpus_docs": 1595,
266
+ "confidence_intervals": {
267
+ "MRR@10": [
268
+ 0.29899881387655614,
269
+ 0.3577598045616784
270
+ ],
271
+ "MAP@10": [
272
+ 0.29899881387655614,
273
+ 0.3577598045616784
274
+ ],
275
+ "R@1": [
276
+ 0.14620658135283368,
277
+ 0.21023765996343693
278
+ ],
279
+ "R@5": [
280
+ 0.4880712979890311,
281
+ 0.5685557586837294
282
+ ],
283
+ "R@10": [
284
+ 0.6892138939670932,
285
+ 0.7659963436928702
286
+ ],
287
+ "NDCG@10": [
288
+ 0.3942246959871027,
289
+ 0.4513162163232951
290
+ ]
291
+ },
292
+ "per_difficulty": {
293
+ "easy": {
294
+ "MRR@10": 0.3287136867325547,
295
+ "MAP@10": 0.3287136867325547,
296
+ "R@1": 0.16981132075471697,
297
+ "R@5": 0.5566037735849056,
298
+ "R@10": 0.8018867924528302,
299
+ "NDCG@10": 0.4395466947913532,
300
+ "count": 106
301
+ },
302
+ "hard": {
303
+ "MRR@10": 0.318499373433584,
304
+ "MAP@10": 0.318499373433584,
305
+ "R@1": 0.17543859649122806,
306
+ "R@5": 0.49122807017543857,
307
+ "R@10": 0.6842105263157895,
308
+ "NDCG@10": 0.4045863919363865,
309
+ "count": 228
310
+ },
311
+ "medium": {
312
+ "MRR@10": 0.34148408972352634,
313
+ "MAP@10": 0.34148408972352634,
314
+ "R@1": 0.18309859154929578,
315
+ "R@5": 0.5586854460093896,
316
+ "R@10": 0.7417840375586855,
317
+ "NDCG@10": 0.43645385644252777,
318
+ "count": 213
319
+ }
320
+ },
321
+ "hard_negative_metrics": {
322
+ "hn_mean_rank": 7.595978062157221,
323
+ "hn_above_relevant_rate": 0.2979890310786106,
324
+ "hn_query_count": 547
325
+ },
326
+ "robustness_score": null,
327
+ "metric_std_devs": {},
328
+ "mean_query_stability": null,
329
+ "n_paraphrases": 0
330
+ },
331
+ {
332
+ "task_id": "thread_discrimination",
333
+ "task_name": "Thread Discrimination",
334
+ "model_name": "voyage-3-large",
335
+ "benchmark_version": "0.1.0",
336
+ "dataset_revision": "",
337
+ "mrr_at_10": 0.5443265755146942,
338
+ "map_at_10": 0.5443265755146942,
339
+ "recall_at_1": 0.3782178217821782,
340
+ "recall_at_5": 0.7683168316831683,
341
+ "recall_at_10": 0.9188118811881189,
342
+ "ndcg_at_10": 0.6338301198377989,
343
+ "num_queries": 505,
344
+ "num_corpus_docs": 1595,
345
+ "confidence_intervals": {
346
+ "MRR@10": [
347
+ 0.5099766619519094,
348
+ 0.5764179828697155
349
+ ],
350
+ "MAP@10": [
351
+ 0.5099766619519094,
352
+ 0.5764179828697155
353
+ ],
354
+ "R@1": [
355
+ 0.33663366336633666,
356
+ 0.4178217821782178
357
+ ],
358
+ "R@5": [
359
+ 0.7287128712871287,
360
+ 0.805940594059406
361
+ ],
362
+ "R@10": [
363
+ 0.8950495049504951,
364
+ 0.9405940594059405
365
+ ],
366
+ "NDCG@10": [
367
+ 0.6042357834066427,
368
+ 0.6608133396278401
369
+ ]
370
+ },
371
+ "per_difficulty": {
372
+ "easy": {
373
+ "MRR@10": 0.5475919913419914,
374
+ "MAP@10": 0.5475919913419914,
375
+ "R@1": 0.3409090909090909,
376
+ "R@5": 0.8181818181818182,
377
+ "R@10": 0.9772727272727273,
378
+ "NDCG@10": 0.6510224048778747,
379
+ "count": 44
380
+ },
381
+ "hard": {
382
+ "MRR@10": 0.5082574895714826,
383
+ "MAP@10": 0.5082574895714826,
384
+ "R@1": 0.3378839590443686,
385
+ "R@5": 0.7406143344709898,
386
+ "R@10": 0.9112627986348123,
387
+ "NDCG@10": 0.6042707869145505,
388
+ "count": 293
389
+ },
390
+ "medium": {
391
+ "MRR@10": 0.6063775510204081,
392
+ "MAP@10": 0.6063775510204081,
393
+ "R@1": 0.4583333333333333,
394
+ "R@5": 0.8035714285714286,
395
+ "R@10": 0.9166666666666666,
396
+ "NDCG@10": 0.6808802627232063,
397
+ "count": 168
398
+ }
399
+ },
400
+ "hard_negative_metrics": {
401
+ "hn_mean_rank": 4.9485148514851485,
402
+ "hn_above_relevant_rate": 0.3603960396039604,
403
+ "hn_query_count": 505
404
+ },
405
+ "robustness_score": null,
406
+ "metric_std_devs": {},
407
+ "mean_query_stability": null,
408
+ "n_paraphrases": 0
409
+ },
410
+ {
411
+ "task_id": "thread_retrieval",
412
+ "task_name": "Thread Retrieval",
413
+ "model_name": "voyage-3-large",
414
+ "benchmark_version": "0.1.0",
415
+ "dataset_revision": "",
416
+ "mrr_at_10": 0.516886507936508,
417
+ "map_at_10": 0.516886507936508,
418
+ "recall_at_1": 0.45,
419
+ "recall_at_5": 0.614,
420
+ "recall_at_10": 0.654,
421
+ "ndcg_at_10": 0.5500967749445125,
422
+ "num_queries": 500,
423
+ "num_corpus_docs": 1232,
424
+ "confidence_intervals": {
425
+ "MRR@10": [
426
+ 0.47520793650793647,
427
+ 0.5578661706349206
428
+ ],
429
+ "MAP@10": [
430
+ 0.47520793650793647,
431
+ 0.5578661706349206
432
+ ],
433
+ "R@1": [
434
+ 0.406,
435
+ 0.494
436
+ ],
437
+ "R@5": [
438
+ 0.57,
439
+ 0.66
440
+ ],
441
+ "R@10": [
442
+ 0.612,
443
+ 0.698
444
+ ],
445
+ "NDCG@10": [
446
+ 0.5086154307321877,
447
+ 0.5916009947937753
448
+ ]
449
+ },
450
+ "per_difficulty": {
451
+ "unknown": {
452
+ "MRR@10": 0.516886507936508,
453
+ "MAP@10": 0.516886507936508,
454
+ "R@1": 0.45,
455
+ "R@5": 0.614,
456
+ "R@10": 0.654,
457
+ "NDCG@10": 0.5500967749445125,
458
+ "count": 500
459
+ }
460
+ },
461
+ "hard_negative_metrics": {},
462
+ "robustness_score": null,
463
+ "metric_std_devs": {},
464
+ "mean_query_stability": null,
465
+ "n_paraphrases": 0
466
+ },
467
+ {
468
+ "task_id": "topic_retrieval",
469
+ "task_name": "Topic Retrieval",
470
+ "model_name": "voyage-3-large",
471
+ "benchmark_version": "0.1.0",
472
+ "dataset_revision": "",
473
+ "mrr_at_10": 0.47772130944645563,
474
+ "map_at_10": 0.475276926885114,
475
+ "recall_at_1": 0.3079922027290448,
476
+ "recall_at_5": 0.6998050682261209,
477
+ "recall_at_10": 0.9103313840155945,
478
+ "ndcg_at_10": 0.5786813479948444,
479
+ "num_queries": 513,
480
+ "num_corpus_docs": 1595,
481
+ "confidence_intervals": {
482
+ "MRR@10": [
483
+ 0.44517704368947053,
484
+ 0.5107191590086326
485
+ ],
486
+ "MAP@10": [
487
+ 0.44275772378477063,
488
+ 0.5082153299916458
489
+ ],
490
+ "R@1": [
491
+ 0.26900584795321636,
492
+ 0.3489278752436647
493
+ ],
494
+ "R@5": [
495
+ 0.6607699805068227,
496
+ 0.7387914230019493
497
+ ],
498
+ "R@10": [
499
+ 0.8849415204678363,
500
+ 0.9318226120857699
501
+ ],
502
+ "NDCG@10": [
503
+ 0.5509874540825015,
504
+ 0.6076346782730433
505
+ ]
506
+ },
507
+ "per_difficulty": {
508
+ "easy": {
509
+ "MRR@10": 0.4537283130306386,
510
+ "MAP@10": 0.4537283130306386,
511
+ "R@1": 0.2713178294573643,
512
+ "R@5": 0.7286821705426356,
513
+ "R@10": 0.9147286821705426,
514
+ "NDCG@10": 0.5633487416393298,
515
+ "count": 129
516
+ },
517
+ "hard": {
518
+ "MRR@10": 0.5193487394957983,
519
+ "MAP@10": 0.5120424836601307,
520
+ "R@1": 0.35294117647058826,
521
+ "R@5": 0.7176470588235294,
522
+ "R@10": 0.9117647058823529,
523
+ "NDCG@10": 0.6090460583949658,
524
+ "count": 170
525
+ },
526
+ "medium": {
527
+ "MRR@10": 0.4591158581812787,
528
+ "MAP@10": 0.4590602284527518,
529
+ "R@1": 0.29439252336448596,
530
+ "R@5": 0.6682242990654206,
531
+ "R@10": 0.9065420560747663,
532
+ "NDCG@10": 0.5638024015081188,
533
+ "count": 214
534
+ }
535
+ },
536
+ "hard_negative_metrics": {
537
+ "hn_mean_rank": 7.105263157894737,
538
+ "hn_above_relevant_rate": 0.27485380116959063,
539
+ "hn_query_count": 513
540
+ },
541
+ "robustness_score": null,
542
+ "metric_std_devs": {},
543
+ "mean_query_stability": null,
544
+ "n_paraphrases": 0
545
+ }
546
+ ]