Spaces:

ServiceNow
/

browsergym-leaderboard

Running

xhluca committed 6 days ago

Commit

d7a5d72

verified ·

1 Parent(s): 91c446a

Remove redundant training data notes from comments

Files changed (3) hide show

results/A3-Qwen3.5-9B/miniwob.json CHANGED Viewed

@@ -10,7 +10,7 @@
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
         "reproducible": "Yes",
-        "comments": "625 tasks. Fine-tuned on A3-Synth trajectories via the Agent-as-Annotators framework. Not trained on MiniWoB data.",
         "original_or_reproduced": "Original"
     }
 ]

         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
         "reproducible": "Yes",
+        "comments": "625 tasks. Fine-tuned on A3-Synth trajectories via the Agent-as-Annotators framework.",
         "original_or_reproduced": "Original"
     }
 ]

results/A3-Qwen3.5-9B/workarena-l1.json CHANGED Viewed

@@ -10,7 +10,7 @@
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
         "reproducible": "Yes",
-        "comments": "330 tasks. Fine-tuned on A3-Synth trajectories via the Agent-as-Annotators framework. Not trained on ServiceNow data.",
         "original_or_reproduced": "Original"
     }
 ]

         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
         "reproducible": "Yes",
+        "comments": "330 tasks. Fine-tuned on A3-Synth trajectories via the Agent-as-Annotators framework.",
         "original_or_reproduced": "Original"
     }
 ]

results/A3-Qwen3.5-9B/workarena-l2.json CHANGED Viewed

@@ -10,7 +10,7 @@
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
         "reproducible": "Yes",
-        "comments": "341 tasks (full benchmark). Fine-tuned on A3-Synth trajectories via the Agent-as-Annotators framework. Not trained on ServiceNow data.",
         "original_or_reproduced": "Original"
     }
 ]

         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
         "reproducible": "Yes",
+        "comments": "341 tasks (full benchmark). Fine-tuned on A3-Synth trajectories via the Agent-as-Annotators framework.",
         "original_or_reproduced": "Original"
     }
 ]