Thishyaketh committed on
Commit
52095a3
·
verified ·
1 Parent(s): 12afa1b

Update .eval_results/ngen4-pro.yaml

Browse files
Files changed (1) hide show
  1. .eval_results/ngen4-pro.yaml +3 -948
.eval_results/ngen4-pro.yaml CHANGED
@@ -7,57 +7,9 @@
7
  url: https://tnsaai.com/models/ngen4-pro
8
  name: TNSA NGen-4 Pro Evaluations
9
  - dataset:
10
- id: MathArena/hmmt_feb_2026
11
- task_id: hmmt_feb_2026
12
- value: 92.5
13
- date: '2026-04-06'
14
- source:
15
- url: https://tnsaai.com/models/ngen4-pro
16
- name: TNSA NGen-4 Pro Evaluations
17
- - dataset:
18
- id: wis-intel/ifeval
19
- task_id: ifeval
20
- value: 95.3
21
- date: '2026-04-06'
22
- source:
23
- url: https://tnsaai.com/models/ngen4-pro
24
- name: TNSA NGen-4 Pro Evaluations
25
- - dataset:
26
- id: Idavidrein/gpqa
27
- task_id: diamond
28
- value: 91.1
29
- date: '2026-04-06'
30
- source:
31
- url: https://tnsaai.com/models/ngen4-pro
32
- name: TNSA NGen-4 Pro Evaluations
33
- - dataset:
34
- id: MathArena/hmmt_feb_2026
35
- task_id: hmmt_feb_2026
36
- value: 92.5
37
- date: '2026-04-06'
38
- source:
39
- url: https://tnsaai.com/models/ngen4-pro
40
- name: TNSA NGen-4 Pro Evaluations
41
- - dataset:
42
- id: openai/MMMLU
43
- task_id: mmmlu
44
- value: 93.2
45
- date: '2026-04-06'
46
- source:
47
- url: https://tnsaai.com/models/ngen4-pro
48
- name: TNSA NGen-4 Pro Evaluations
49
- - dataset:
50
- id: gorilla-llm/Berkeley-Function-Calling-Leaderboard
51
- task_id: bfcl_v4
52
- value: 69.9
53
- date: '2026-04-06'
54
- source:
55
- url: https://tnsaai.com/models/ngen4-pro
56
- name: TNSA NGen-4 Pro Evaluations
57
- - dataset:
58
- id: smolagents/browse_comp
59
- task_id: browsecomp
60
- value: 64.8
61
  date: '2026-04-06'
62
  source:
63
  url: https://tnsaai.com/models/ngen4-pro
@@ -78,102 +30,6 @@
78
  source:
79
  url: https://tnsaai.com/models/ngen4-pro
80
  name: TNSA NGen-4 Pro Evaluations
81
- - dataset:
82
- id: MMMU/MMMU_Pro
83
- task_id: mmmu_pro
84
- value: 79.3
85
- date: '2026-04-06'
86
- source:
87
- url: https://tnsaai.com/models/ngen4-pro
88
- name: TNSA NGen-4 Pro Evaluations
89
- - dataset:
90
- id: erqa/erqa
91
- task_id: erqa
92
- value: 68.5
93
- date: '2026-04-06'
94
- source:
95
- url: https://tnsaai.com/models/ngen4-pro
96
- name: TNSA NGen-4 Pro Evaluations
97
- - dataset:
98
- id: OpenBMB/OmniDocBench
99
- task_id: omnidocbench_v15
100
- value: 93.9
101
- date: '2026-04-06'
102
- source:
103
- url: https://tnsaai.com/models/ngen4-pro
104
- name: TNSA NGen-4 Pro Evaluations
105
- - dataset:
106
- id: VLM2Vec/Video-MME
107
- task_id: video_mme
108
- value: 91.0
109
- date: '2026-04-06'
110
- source:
111
- url: https://tnsaai.com/models/ngen4-pro
112
- name: TNSA NGen-4 Pro Evaluations
113
- - dataset:
114
- id: math-ai/aime25
115
- task_id: aime25
116
- value: 100.0
117
- date: '2026-04-06'
118
- source:
119
- url: https://tnsaai.com/models/ngen4-pro
120
- name: TNSA NGen-4 Pro Evaluations
121
- - dataset:
122
- id: livebench/reasoning
123
- task_id: reasoning
124
- value: 88.5
125
- date: '2026-04-06'
126
- source:
127
- url: https://tnsaai.com/models/ngen4-pro
128
- name: TNSA NGen-4 Pro Evaluations
129
- - dataset:
130
- id: gaia-benchmark/GAIA
131
- task_id: gaia
132
- value: 60.5
133
- date: '2026-04-06'
134
- source:
135
- url: https://tnsaai.com/models/ngen4-pro
136
- name: TNSA NGen-4 Pro Evaluations
137
- - dataset:
138
- id: lukaemon/bbh
139
- task_id: bbh
140
- value: 94.2
141
- date: '2026-04-06'
142
- source:
143
- url: https://tnsaai.com/models/ngen4-pro
144
- name: TNSA NGen-4 Pro Evaluations
145
- - dataset:
146
- id: evalplus/humanevalplus
147
- task_id: humanevalplus
148
- value: 95.1
149
- date: '2026-04-06'
150
- source:
151
- url: https://tnsaai.com/models/ngen4-pro
152
- name: TNSA NGen-4 Pro Evaluations
153
- - dataset:
154
- id: openai/gsm8k
155
- task_id: gsm8k
156
- value: 99.2
157
- date: '2026-04-06'
158
- source:
159
- url: https://tnsaai.com/models/ngen4-pro
160
- name: TNSA NGen-4 Pro Evaluations
161
- - dataset:
162
- id: THUDM/LongBench-v2
163
- task_id: longbench
164
- value: 88.0
165
- date: '2026-04-06'
166
- source:
167
- url: https://tnsaai.com/models/ngen4-pro
168
- name: TNSA NGen-4 Pro Evaluations
169
- - dataset:
170
- id: lmms-lab/DocVQA
171
- task_id: docvqa
172
- value: 96.5
173
- date: '2026-04-06'
174
- source:
175
- url: https://tnsaai.com/models/ngen4-pro
176
- name: TNSA NGen-4 Pro Evaluations
177
  - dataset:
178
  id: likaixin/ScreenSpot-Pro
179
  task_id: screenspot_pro
@@ -182,804 +38,3 @@
182
  source:
183
  url: https://tnsaai.com/models/ngen4-pro
184
  name: TNSA NGen-4 Pro Evaluations
185
- - dataset:
186
- id: hud-evals/OSWorld-Verified
187
- task_id: osworld_verified
188
- value: 57.0
189
- date: '2026-04-06'
190
- source:
191
- url: https://tnsaai.com/models/ngen4-pro
192
- name: TNSA NGen-4 Pro Evaluations
193
- - dataset:
194
- id: AndroidWorld/AndroidWorld
195
- task_id: androidworld
196
- value: 75.0
197
- date: '2026-04-06'
198
- source:
199
- url: https://tnsaai.com/models/ngen4-pro
200
- name: TNSA NGen-4 Pro Evaluations
201
- - dataset:
202
- id: Agents-X/TIR-Bench
203
- task_id: tir_bench
204
- value: 59.8
205
- date: '2026-04-06'
206
- source:
207
- url: https://tnsaai.com/models/ngen4-pro
208
- name: TNSA NGen-4 Pro Evaluations
209
- - dataset:
210
- id: vstar-mm/vstar_bench
211
- task_id: vstar
212
- value: 95.0
213
- date: '2026-04-06'
214
- source:
215
- url: https://tnsaai.com/models/ngen4-pro
216
- name: TNSA NGen-4 Pro Evaluations
217
- - dataset:
218
- id: BoKelvin/SLAKE
219
- task_id: slake
220
- value: 83.2
221
- date: '2026-04-06'
222
- source:
223
- url: https://tnsaai.com/models/ngen4-pro
224
- name: TNSA NGen-4 Pro Evaluations
225
- - dataset:
226
- id: RadGenome/PMC-VQA
227
- task_id: pmc_vqa
228
- value: 65.5
229
- date: '2026-04-06'
230
- source:
231
- url: https://tnsaai.com/models/ngen4-pro
232
- name: TNSA NGen-4 Pro Evaluations
233
- - dataset:
234
- id: OctoMed/MedXpertQA-MM
235
- task_id: medxpertqa_mm
236
- value: 64.2
237
- date: '2026-04-06'
238
- source:
239
- url: https://tnsaai.com/models/ngen4-pro
240
- name: TNSA NGen-4 Pro Evaluations
241
- - dataset:
242
- id: xai-org/RealworldQA
243
- task_id: realworldqa
244
- value: 88.7
245
- date: '2026-04-06'
246
- source:
247
- url: https://tnsaai.com/models/ngen4-pro
248
- name: TNSA NGen-4 Pro Evaluations
249
- - dataset:
250
- id: zli12321/mmstar
251
- task_id: mmstar
252
- value: 86.5
253
- date: '2026-04-06'
254
- source:
255
- url: https://tnsaai.com/models/ngen4-pro
256
- name: TNSA NGen-4 Pro Evaluations
257
- - dataset:
258
- id: VLMEval/MMBench_DEV_EN
259
- task_id: mmbench_en
260
- value: 96.1
261
- date: '2026-04-06'
262
- source:
263
- url: https://tnsaai.com/models/ngen4-pro
264
- name: TNSA NGen-4 Pro Evaluations
265
- - dataset:
266
- id: SimpleVQA/SimpleVQA
267
- task_id: simplevqa
268
- value: 61.0
269
- date: '2026-04-06'
270
- source:
271
- url: https://tnsaai.com/models/ngen4-pro
272
- name: TNSA NGen-4 Pro Evaluations
273
- - dataset:
274
- id: lmms-lab/HallusionBench
275
- task_id: hallusionbench
276
- value: 71.8
277
- date: '2026-04-06'
278
- source:
279
- url: https://tnsaai.com/models/ngen4-pro
280
- name: TNSA NGen-4 Pro Evaluations
281
- - dataset:
282
- id: princeton-nlp/CharXiv
283
- task_id: charxiv_rq
284
- value: 81.4
285
- date: '2026-04-06'
286
- source:
287
- url: https://tnsaai.com/models/ngen4-pro
288
- name: TNSA NGen-4 Pro Evaluations
289
- - dataset:
290
- id: yubo2333/MMLongBench-Doc
291
- task_id: mmlongbench_doc
292
- value: 63.2
293
- date: '2026-04-06'
294
- source:
295
- url: https://tnsaai.com/models/ngen4-pro
296
- name: TNSA NGen-4 Pro Evaluations
297
- - dataset:
298
- id: wulipc/CC-OCR
299
- task_id: cc_ocr
300
- value: 84.6
301
- date: '2026-04-06'
302
- source:
303
- url: https://tnsaai.com/models/ngen4-pro
304
- name: TNSA NGen-4 Pro Evaluations
305
- - dataset:
306
- id: pix2struct/ai2d
307
- task_id: ai2d_test
308
- value: 97.2
309
- date: '2026-04-06'
310
- source:
311
- url: https://tnsaai.com/models/ngen4-pro
312
- name: TNSA NGen-4 Pro Evaluations
313
- - dataset:
314
- id: echo840/OCRBench
315
- task_id: ocrbench
316
- value: 95.4
317
- date: '2026-04-06'
318
- source:
319
- url: https://tnsaai.com/models/ngen4-pro
320
- name: TNSA NGen-4 Pro Evaluations
321
- - dataset:
322
- id: vikhyatk/CountBenchQA
323
- task_id: countbench
324
- value: 99.0
325
- date: '2026-04-06'
326
- source:
327
- url: https://tnsaai.com/models/ngen4-pro
328
- name: TNSA NGen-4 Pro Evaluations
329
- - dataset:
330
- id: shunk031/RefCOCO
331
- task_id: refcoco_avg
332
- value: 93.4
333
- date: '2026-04-06'
334
- source:
335
- url: https://tnsaai.com/models/ngen4-pro
336
- name: TNSA NGen-4 Pro Evaluations
337
- - dataset:
338
- id: ODInW/ODInW13
339
- task_id: odinw13
340
- value: 45.1
341
- date: '2026-04-06'
342
- source:
343
- url: https://tnsaai.com/models/ngen4-pro
344
- name: TNSA NGen-4 Pro Evaluations
345
- - dataset:
346
- id: lmms-lab-si/EmbSpatialBench
347
- task_id: embspatialbench
348
- value: 87.2
349
- date: '2026-04-06'
350
- source:
351
- url: https://tnsaai.com/models/ngen4-pro
352
- name: TNSA NGen-4 Pro Evaluations
353
- - dataset:
354
- id: RefSpatialBench/RefSpatialBench
355
- task_id: refspatialbench
356
- value: 67.0
357
- date: '2026-04-06'
358
- source:
359
- url: https://tnsaai.com/models/ngen4-pro
360
- name: TNSA NGen-4 Pro Evaluations
361
- - dataset:
362
- id: LingoQA/LingoQA
363
- task_id: lingoqa
364
- value: 83.5
365
- date: '2026-04-06'
366
- source:
367
- url: https://tnsaai.com/models/ngen4-pro
368
- name: TNSA NGen-4 Pro Evaluations
369
- - dataset:
370
- id: Hypersim/Hypersim
371
- task_id: hypersim
372
- value: 12.9
373
- date: '2026-04-06'
374
- source:
375
- url: https://tnsaai.com/models/ngen4-pro
376
- name: TNSA NGen-4 Pro Evaluations
377
- - dataset:
378
- id: SUNRGBD/SUNRGBD
379
- task_id: sunrgbd
380
- value: 35.6
381
- date: '2026-04-06'
382
- source:
383
- url: https://tnsaai.com/models/ngen4-pro
384
- name: TNSA NGen-4 Pro Evaluations
385
- - dataset:
386
- id: Nuscene/Nuscene
387
- task_id: nuscene
388
- value: 15.3
389
- date: '2026-04-06'
390
- source:
391
- url: https://tnsaai.com/models/ngen4-pro
392
- name: TNSA NGen-4 Pro Evaluations
393
- - dataset:
394
- id: MMMU/MMMU
395
- task_id: mmmu
396
- value: 86.2
397
- date: '2026-04-06'
398
- source:
399
- url: https://tnsaai.com/models/ngen4-pro
400
- name: TNSA NGen-4 Pro Evaluations
401
- - dataset:
402
- id: MathVision/MathVision
403
- task_id: mathvision
404
- value: 88.1
405
- date: '2026-04-06'
406
- source:
407
- url: https://tnsaai.com/models/ngen4-pro
408
- name: TNSA NGen-4 Pro Evaluations
409
- - dataset:
410
- id: AI4Math/MathVista
411
- task_id: mathvista_mini
412
- value: 91.0
413
- date: '2026-04-06'
414
- source:
415
- url: https://tnsaai.com/models/ngen4-pro
416
- name: TNSA NGen-4 Pro Evaluations
417
- - dataset:
418
- id: DynaMath/DynaMath
419
- task_id: dynamath
420
- value: 89.6
421
- date: '2026-04-06'
422
- source:
423
- url: https://tnsaai.com/models/ngen4-pro
424
- name: TNSA NGen-4 Pro Evaluations
425
- - dataset:
426
- id: evalscope/zerobench
427
- task_id: zerobench
428
- value: 7.0
429
- date: '2026-04-06'
430
- source:
431
- url: https://tnsaai.com/models/ngen4-pro
432
- name: TNSA NGen-4 Pro Evaluations
433
- - dataset:
434
- id: evalscope/zerobench
435
- task_id: zerobench_sub
436
- value: 39.5
437
- date: '2026-04-06'
438
- source:
439
- url: https://tnsaai.com/models/ngen4-pro
440
- name: TNSA NGen-4 Pro Evaluations
441
- - dataset:
442
- id: vlmsareblind/vlmsareblind
443
- task_id: vlmsareblind
444
- value: 98.0
445
- date: '2026-04-06'
446
- source:
447
- url: https://tnsaai.com/models/ngen4-pro
448
- name: TNSA NGen-4 Pro Evaluations
449
- - dataset:
450
- id: BabyVision/BabyVision
451
- task_id: babyvision
452
- value: 42.1
453
- date: '2026-04-06'
454
- source:
455
- url: https://tnsaai.com/models/ngen4-pro
456
- name: TNSA NGen-4 Pro Evaluations
457
- - dataset:
458
- id: VLM2Vec/Video-MME
459
- task_id: video_mme_w_sub
460
- value: 91.0
461
- date: '2026-04-06'
462
- source:
463
- url: https://tnsaai.com/models/ngen4-pro
464
- name: TNSA NGen-4 Pro Evaluations
465
- - dataset:
466
- id: VLM2Vec/Video-MME
467
- task_id: video_mme_wo_sub
468
- value: 86.3
469
- date: '2026-04-06'
470
- source:
471
- url: https://tnsaai.com/models/ngen4-pro
472
- name: TNSA NGen-4 Pro Evaluations
473
- - dataset:
474
- id: EvolvingLMMs-Lab/VideoMMMU
475
- task_id: videommmu
476
- value: 84.6
477
- date: '2026-04-06'
478
- source:
479
- url: https://tnsaai.com/models/ngen4-pro
480
- name: TNSA NGen-4 Pro Evaluations
481
- - dataset:
482
- id: MLVU/MLVU
483
- task_id: mlvu
484
- value: 90.2
485
- date: '2026-04-06'
486
- source:
487
- url: https://tnsaai.com/models/ngen4-pro
488
- name: TNSA NGen-4 Pro Evaluations
489
- - dataset:
490
- id: OpenGVLab/MVBench
491
- task_id: mvbench
492
- value: 78.2
493
- date: '2026-04-06'
494
- source:
495
- url: https://tnsaai.com/models/ngen4-pro
496
- name: TNSA NGen-4 Pro Evaluations
497
- - dataset:
498
- id: LVBench/LVBench
499
- task_id: lvbench
500
- value: 75.1
501
- date: '2026-04-06'
502
- source:
503
- url: https://tnsaai.com/models/ngen4-pro
504
- name: TNSA NGen-4 Pro Evaluations
505
- - dataset:
506
- id: MMVU/MMVU
507
- task_id: mmvu
508
- value: 75.8
509
- date: '2026-04-06'
510
- source:
511
- url: https://tnsaai.com/models/ngen4-pro
512
- name: TNSA NGen-4 Pro Evaluations
513
-
514
- - dataset:
515
- id: openai/MMMLU
516
- task_id: mmmlu
517
- value: 93.2
518
- date: '2026-04-06'
519
- source:
520
- url: https://tnsaai.com/models/ngen4-pro
521
- name: TNSA NGen-4 Pro Evaluations
522
- - dataset:
523
- id: gorilla-llm/Berkeley-Function-Calling-Leaderboard
524
- task_id: bfcl_v4
525
- value: 69.9
526
- date: '2026-04-06'
527
- source:
528
- url: https://tnsaai.com/models/ngen4-pro
529
- name: TNSA NGen-4 Pro Evaluations
530
- - dataset:
531
- id: smolagents/browse_comp
532
- task_id: browsecomp
533
- value: 64.8
534
- date: '2026-04-06'
535
- source:
536
- url: https://tnsaai.com/models/ngen4-pro
537
- name: TNSA NGen-4 Pro Evaluations
538
- - dataset:
539
- id: SWE-bench/SWE-bench_Verified
540
- task_id: swe_bench_%_resolved
541
- value: 77.3
542
- date: '2026-04-06'
543
- source:
544
- url: https://tnsaai.com/models/ngen4-pro
545
- name: TNSA NGen-4 Pro Evaluations
546
- - dataset:
547
- id: harborframework/terminal-bench-2.0
548
- task_id: terminal_bench
549
- value: 42.3
550
- date: '2026-04-06'
551
- source:
552
- url: https://tnsaai.com/models/ngen4-pro
553
- name: TNSA NGen-4 Pro Evaluations
554
- - dataset:
555
- id: MMMU/MMMU_Pro
556
- task_id: mmmu_pro
557
- value: 79.3
558
- date: '2026-04-06'
559
- source:
560
- url: https://tnsaai.com/models/ngen4-pro
561
- name: TNSA NGen-4 Pro Evaluations
562
- - dataset:
563
- id: erqa/erqa
564
- task_id: erqa
565
- value: 68.5
566
- date: '2026-04-06'
567
- source:
568
- url: https://tnsaai.com/models/ngen4-pro
569
- name: TNSA NGen-4 Pro Evaluations
570
- - dataset:
571
- id: OpenBMB/OmniDocBench
572
- task_id: omnidocbench_v15
573
- value: 93.9
574
- date: '2026-04-06'
575
- source:
576
- url: https://tnsaai.com/models/ngen4-pro
577
- name: TNSA NGen-4 Pro Evaluations
578
- - dataset:
579
- id: VLM2Vec/Video-MME
580
- task_id: video_mme
581
- value: 91.0
582
- date: '2026-04-06'
583
- source:
584
- url: https://tnsaai.com/models/ngen4-pro
585
- name: TNSA NGen-4 Pro Evaluations
586
- - dataset:
587
- id: math-ai/aime25
588
- task_id: aime25
589
- value: 100.0
590
- date: '2026-04-06'
591
- source:
592
- url: https://tnsaai.com/models/ngen4-pro
593
- name: TNSA NGen-4 Pro Evaluations
594
- - dataset:
595
- id: livebench/reasoning
596
- task_id: reasoning
597
- value: 88.5
598
- date: '2026-04-06'
599
- source:
600
- url: https://tnsaai.com/models/ngen4-pro
601
- name: TNSA NGen-4 Pro Evaluations
602
- - dataset:
603
- id: gaia-benchmark/GAIA
604
- task_id: gaia
605
- value: 60.5
606
- date: '2026-04-06'
607
- source:
608
- url: https://tnsaai.com/models/ngen4-pro
609
- name: TNSA NGen-4 Pro Evaluations
610
- - dataset:
611
- id: lukaemon/bbh
612
- task_id: bbh
613
- value: 94.2
614
- date: '2026-04-06'
615
- source:
616
- url: https://tnsaai.com/models/ngen4-pro
617
- name: TNSA NGen-4 Pro Evaluations
618
- - dataset:
619
- id: evalplus/humanevalplus
620
- task_id: humanevalplus
621
- value: 95.1
622
- date: '2026-04-06'
623
- source:
624
- url: https://tnsaai.com/models/ngen4-pro
625
- name: TNSA NGen-4 Pro Evaluations
626
- - dataset:
627
- id: openai/gsm8k
628
- task_id: gsm8k
629
- value: 99.2
630
- date: '2026-04-06'
631
- source:
632
- url: https://tnsaai.com/models/ngen4-pro
633
- name: TNSA NGen-4 Pro Evaluations
634
- - dataset:
635
- id: THUDM/LongBench-v2
636
- task_id: longbench
637
- value: 88.0
638
- date: '2026-04-06'
639
- source:
640
- url: https://tnsaai.com/models/ngen4-pro
641
- name: TNSA NGen-4 Pro Evaluations
642
- - dataset:
643
- id: lmms-lab/DocVQA
644
- task_id: docvqa
645
- value: 96.5
646
- date: '2026-04-06'
647
- source:
648
- url: https://tnsaai.com/models/ngen4-pro
649
- name: TNSA NGen-4 Pro Evaluations
650
- - dataset:
651
- id: likaixin/ScreenSpot-Pro
652
- task_id: screenspot_pro
653
- value: 72.9
654
- date: '2026-04-06'
655
- source:
656
- url: https://tnsaai.com/models/ngen4-pro
657
- name: TNSA NGen-4 Pro Evaluations
658
- - dataset:
659
- id: hud-evals/OSWorld-Verified
660
- task_id: osworld_verified
661
- value: 57.0
662
- date: '2026-04-06'
663
- source:
664
- url: https://tnsaai.com/models/ngen4-pro
665
- name: TNSA NGen-4 Pro Evaluations
666
- - dataset:
667
- id: AndroidWorld/AndroidWorld
668
- task_id: androidworld
669
- value: 75.0
670
- date: '2026-04-06'
671
- source:
672
- url: https://tnsaai.com/models/ngen4-pro
673
- name: TNSA NGen-4 Pro Evaluations
674
- - dataset:
675
- id: Agents-X/TIR-Bench
676
- task_id: tir_bench
677
- value: 59.8
678
- date: '2026-04-06'
679
- source:
680
- url: https://tnsaai.com/models/ngen4-pro
681
- name: TNSA NGen-4 Pro Evaluations
682
- - dataset:
683
- id: vstar-mm/vstar_bench
684
- task_id: vstar
685
- value: 95.0
686
- date: '2026-04-06'
687
- source:
688
- url: https://tnsaai.com/models/ngen4-pro
689
- name: TNSA NGen-4 Pro Evaluations
690
- - dataset:
691
- id: BoKelvin/SLAKE
692
- task_id: slake
693
- value: 83.2
694
- date: '2026-04-06'
695
- source:
696
- url: https://tnsaai.com/models/ngen4-pro
697
- name: TNSA NGen-4 Pro Evaluations
698
- - dataset:
699
- id: RadGenome/PMC-VQA
700
- task_id: pmc_vqa
701
- value: 65.5
702
- date: '2026-04-06'
703
- source:
704
- url: https://tnsaai.com/models/ngen4-pro
705
- name: TNSA NGen-4 Pro Evaluations
706
- - dataset:
707
- id: OctoMed/MedXpertQA-MM
708
- task_id: medxpertqa_mm
709
- value: 64.2
710
- date: '2026-04-06'
711
- source:
712
- url: https://tnsaai.com/models/ngen4-pro
713
- name: TNSA NGen-4 Pro Evaluations
714
- - dataset:
715
- id: xai-org/RealworldQA
716
- task_id: realworldqa
717
- value: 88.7
718
- date: '2026-04-06'
719
- source:
720
- url: https://tnsaai.com/models/ngen4-pro
721
- name: TNSA NGen-4 Pro Evaluations
722
- - dataset:
723
- id: zli12321/mmstar
724
- task_id: mmstar
725
- value: 86.5
726
- date: '2026-04-06'
727
- source:
728
- url: https://tnsaai.com/models/ngen4-pro
729
- name: TNSA NGen-4 Pro Evaluations
730
- - dataset:
731
- id: VLMEval/MMBench_DEV_EN
732
- task_id: mmbench_en
733
- value: 96.1
734
- date: '2026-04-06'
735
- source:
736
- url: https://tnsaai.com/models/ngen4-pro
737
- name: TNSA NGen-4 Pro Evaluations
738
- - dataset:
739
- id: SimpleVQA/SimpleVQA
740
- task_id: simplevqa
741
- value: 61.0
742
- date: '2026-04-06'
743
- source:
744
- url: https://tnsaai.com/models/ngen4-pro
745
- name: TNSA NGen-4 Pro Evaluations
746
- - dataset:
747
- id: lmms-lab/HallusionBench
748
- task_id: hallusionbench
749
- value: 71.8
750
- date: '2026-04-06'
751
- source:
752
- url: https://tnsaai.com/models/ngen4-pro
753
- name: TNSA NGen-4 Pro Evaluations
754
- - dataset:
755
- id: princeton-nlp/CharXiv
756
- task_id: charxiv_rq
757
- value: 81.4
758
- date: '2026-04-06'
759
- source:
760
- url: https://tnsaai.com/models/ngen4-pro
761
- name: TNSA NGen-4 Pro Evaluations
762
- - dataset:
763
- id: yubo2333/MMLongBench-Doc
764
- task_id: mmlongbench_doc
765
- value: 63.2
766
- date: '2026-04-06'
767
- source:
768
- url: https://tnsaai.com/models/ngen4-pro
769
- name: TNSA NGen-4 Pro Evaluations
770
- - dataset:
771
- id: wulipc/CC-OCR
772
- task_id: cc_ocr
773
- value: 84.6
774
- date: '2026-04-06'
775
- source:
776
- url: https://tnsaai.com/models/ngen4-pro
777
- name: TNSA NGen-4 Pro Evaluations
778
- - dataset:
779
- id: pix2struct/ai2d
780
- task_id: ai2d_test
781
- value: 97.2
782
- date: '2026-04-06'
783
- source:
784
- url: https://tnsaai.com/models/ngen4-pro
785
- name: TNSA NGen-4 Pro Evaluations
786
- - dataset:
787
- id: echo840/OCRBench
788
- task_id: ocrbench
789
- value: 95.4
790
- date: '2026-04-06'
791
- source:
792
- url: https://tnsaai.com/models/ngen4-pro
793
- name: TNSA NGen-4 Pro Evaluations
794
- - dataset:
795
- id: vikhyatk/CountBenchQA
796
- task_id: countbench
797
- value: 99.0
798
- date: '2026-04-06'
799
- source:
800
- url: https://tnsaai.com/models/ngen4-pro
801
- name: TNSA NGen-4 Pro Evaluations
802
- - dataset:
803
- id: shunk031/RefCOCO
804
- task_id: refcoco_avg
805
- value: 93.4
806
- date: '2026-04-06'
807
- source:
808
- url: https://tnsaai.com/models/ngen4-pro
809
- name: TNSA NGen-4 Pro Evaluations
810
- - dataset:
811
- id: ODInW/ODInW13
812
- task_id: odinw13
813
- value: 45.1
814
- date: '2026-04-06'
815
- source:
816
- url: https://tnsaai.com/models/ngen4-pro
817
- name: TNSA NGen-4 Pro Evaluations
818
- - dataset:
819
- id: lmms-lab-si/EmbSpatialBench
820
- task_id: embspatialbench
821
- value: 87.2
822
- date: '2026-04-06'
823
- source:
824
- url: https://tnsaai.com/models/ngen4-pro
825
- name: TNSA NGen-4 Pro Evaluations
826
- - dataset:
827
- id: RefSpatialBench/RefSpatialBench
828
- task_id: refspatialbench
829
- value: 67.0
830
- date: '2026-04-06'
831
- source:
832
- url: https://tnsaai.com/models/ngen4-pro
833
- name: TNSA NGen-4 Pro Evaluations
834
- - dataset:
835
- id: LingoQA/LingoQA
836
- task_id: lingoqa
837
- value: 83.5
838
- date: '2026-04-06'
839
- source:
840
- url: https://tnsaai.com/models/ngen4-pro
841
- name: TNSA NGen-4 Pro Evaluations
842
- - dataset:
843
- id: Hypersim/Hypersim
844
- task_id: hypersim
845
- value: 12.9
846
- date: '2026-04-06'
847
- source:
848
- url: https://tnsaai.com/models/ngen4-pro
849
- name: TNSA NGen-4 Pro Evaluations
850
- - dataset:
851
- id: SUNRGBD/SUNRGBD
852
- task_id: sunrgbd
853
- value: 35.6
854
- date: '2026-04-06'
855
- source:
856
- url: https://tnsaai.com/models/ngen4-pro
857
- name: TNSA NGen-4 Pro Evaluations
858
- - dataset:
859
- id: Nuscene/Nuscene
860
- task_id: nuscene
861
- value: 15.3
862
- date: '2026-04-06'
863
- source:
864
- url: https://tnsaai.com/models/ngen4-pro
865
- name: TNSA NGen-4 Pro Evaluations
866
- - dataset:
867
- id: MMMU/MMMU
868
- task_id: mmmu
869
- value: 86.2
870
- date: '2026-04-06'
871
- source:
872
- url: https://tnsaai.com/models/ngen4-pro
873
- name: TNSA NGen-4 Pro Evaluations
874
- - dataset:
875
- id: MathVision/MathVision
876
- task_id: mathvision
877
- value: 88.1
878
- date: '2026-04-06'
879
- source:
880
- url: https://tnsaai.com/models/ngen4-pro
881
- name: TNSA NGen-4 Pro Evaluations
882
- - dataset:
883
- id: AI4Math/MathVista
884
- task_id: mathvista_mini
885
- value: 91.0
886
- date: '2026-04-06'
887
- source:
888
- url: https://tnsaai.com/models/ngen4-pro
889
- name: TNSA NGen-4 Pro Evaluations
890
- - dataset:
891
- id: DynaMath/DynaMath
892
- task_id: dynamath
893
- value: 89.6
894
- date: '2026-04-06'
895
- source:
896
- url: https://tnsaai.com/models/ngen4-pro
897
- name: TNSA NGen-4 Pro Evaluations
898
- - dataset:
899
- id: evalscope/zerobench
900
- task_id: zerobench
901
- value: 7.0
902
- date: '2026-04-06'
903
- source:
904
- url: https://tnsaai.com/models/ngen4-pro
905
- name: TNSA NGen-4 Pro Evaluations
906
- - dataset:
907
- id: evalscope/zerobench
908
- task_id: zerobench_sub
909
- value: 39.5
910
- date: '2026-04-06'
911
- source:
912
- url: https://tnsaai.com/models/ngen4-pro
913
- name: TNSA NGen-4 Pro Evaluations
914
- - dataset:
915
- id: vlmsareblind/vlmsareblind
916
- task_id: vlmsareblind
917
- value: 98.0
918
- date: '2026-04-06'
919
- source:
920
- url: https://tnsaai.com/models/ngen4-pro
921
- name: TNSA NGen-4 Pro Evaluations
922
- - dataset:
923
- id: BabyVision/BabyVision
924
- task_id: babyvision
925
- value: 42.1
926
- date: '2026-04-06'
927
- source:
928
- url: https://tnsaai.com/models/ngen4-pro
929
- name: TNSA NGen-4 Pro Evaluations
930
- - dataset:
931
- id: VLM2Vec/Video-MME
932
- task_id: video_mme_w_sub
933
- value: 91.0
934
- date: '2026-04-06'
935
- source:
936
- url: https://tnsaai.com/models/ngen4-pro
937
- name: TNSA NGen-4 Pro Evaluations
938
- - dataset:
939
- id: VLM2Vec/Video-MME
940
- task_id: video_mme_wo_sub
941
- value: 86.3
942
- date: '2026-04-06'
943
- source:
944
- url: https://tnsaai.com/models/ngen4-pro
945
- name: TNSA NGen-4 Pro Evaluations
946
- - dataset:
947
- id: EvolvingLMMs-Lab/VideoMMMU
948
- task_id: videommmu
949
- value: 84.6
950
- date: '2026-04-06'
951
- source:
952
- url: https://tnsaai.com/models/ngen4-pro
953
- name: TNSA NGen-4 Pro Evaluations
954
- - dataset:
955
- id: MLVU/MLVU
956
- task_id: mlvu
957
- value: 90.2
958
- date: '2026-04-06'
959
- source:
960
- url: https://tnsaai.com/models/ngen4-pro
961
- name: TNSA NGen-4 Pro Evaluations
962
- - dataset:
963
- id: OpenGVLab/MVBench
964
- task_id: mvbench
965
- value: 78.2
966
- date: '2026-04-06'
967
- source:
968
- url: https://tnsaai.com/models/ngen4-pro
969
- name: TNSA NGen-4 Pro Evaluations
970
- - dataset:
971
- id: LVBench/LVBench
972
- task_id: lvbench
973
- value: 75.1
974
- date: '2026-04-06'
975
- source:
976
- url: https://tnsaai.com/models/ngen4-pro
977
- name: TNSA NGen-4 Pro Evaluations
978
- - dataset:
979
- id: MMVU/MMVU
980
- task_id: mmvu
981
- value: 75.8
982
- date: '2026-04-06'
983
- source:
984
- url: https://tnsaai.com/models/ngen4-pro
985
- name: TNSA NGen-4 Pro Evaluations
 
7
  url: https://tnsaai.com/models/ngen4-pro
8
  name: TNSA NGen-4 Pro Evaluations
9
  - dataset:
10
+ id: openai/gsm8k
11
+ task_id: gsm8k
12
+ value: 99.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  date: '2026-04-06'
14
  source:
15
  url: https://tnsaai.com/models/ngen4-pro
 
30
  source:
31
  url: https://tnsaai.com/models/ngen4-pro
32
  name: TNSA NGen-4 Pro Evaluations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  - dataset:
34
  id: likaixin/ScreenSpot-Pro
35
  task_id: screenspot_pro
 
38
  source:
39
  url: https://tnsaai.com/models/ngen4-pro
40
  name: TNSA NGen-4 Pro Evaluations