autophagycode_M_meta-llama__Meta-Llama-3.1-8B-Instruct_gen1_TEST / checkpoint-150 /trainer_state.json
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.8330781010719757, | |
| "eval_steps": 50, | |
| "global_step": 150, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 0.6279787886887789, | |
| "epoch": 0.01225114854517611, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.29868432879447937, | |
| "mean_token_accuracy": 0.9193268120288849, | |
| "num_tokens": 6880.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 0.5897643249481916, | |
| "epoch": 0.02450229709035222, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 0.00019878048780487805, | |
| "loss": 0.2462826669216156, | |
| "mean_token_accuracy": 0.9280684292316437, | |
| "num_tokens": 13136.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 0.5422123614698648, | |
| "epoch": 0.036753445635528334, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 0.0001975609756097561, | |
| "loss": 0.20162586867809296, | |
| "mean_token_accuracy": 0.9337548539042473, | |
| "num_tokens": 19103.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 0.4943904746323824, | |
| "epoch": 0.04900459418070444, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 0.00019634146341463416, | |
| "loss": 0.14251382648944855, | |
| "mean_token_accuracy": 0.95210937038064, | |
| "num_tokens": 24423.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 0.45216177217662334, | |
| "epoch": 0.06125574272588055, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 0.0001951219512195122, | |
| "loss": 0.1607554852962494, | |
| "mean_token_accuracy": 0.9485567137598991, | |
| "num_tokens": 30608.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 0.40192629024386406, | |
| "epoch": 0.07350689127105667, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 0.00019390243902439025, | |
| "loss": 0.16246657073497772, | |
| "mean_token_accuracy": 0.9427705891430378, | |
| "num_tokens": 36811.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 0.3904233919456601, | |
| "epoch": 0.08575803981623277, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 0.0001926829268292683, | |
| "loss": 0.17223770916461945, | |
| "mean_token_accuracy": 0.941681407392025, | |
| "num_tokens": 43080.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 0.3865286596119404, | |
| "epoch": 0.09800918836140889, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 0.00019146341463414633, | |
| "loss": 0.14094355702400208, | |
| "mean_token_accuracy": 0.9572678282856941, | |
| "num_tokens": 48449.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 0.370680570602417, | |
| "epoch": 0.11026033690658499, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 0.0001902439024390244, | |
| "loss": 0.14527979493141174, | |
| "mean_token_accuracy": 0.9480055123567581, | |
| "num_tokens": 54626.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.41665984131395817, | |
| "epoch": 0.1225114854517611, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 0.00018902439024390244, | |
| "loss": 0.13078074157238007, | |
| "mean_token_accuracy": 0.9527099393308163, | |
| "num_tokens": 60057.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.41189629677683115, | |
| "epoch": 0.13476263399693722, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 0.0001878048780487805, | |
| "loss": 0.12864071130752563, | |
| "mean_token_accuracy": 0.9570614397525787, | |
| "num_tokens": 66167.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.454614844173193, | |
| "epoch": 0.14701378254211334, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 0.00018658536585365856, | |
| "loss": 0.11721982061862946, | |
| "mean_token_accuracy": 0.9633984379470348, | |
| "num_tokens": 72797.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.42440748680382967, | |
| "epoch": 0.15926493108728942, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 0.0001853658536585366, | |
| "loss": 0.12409229576587677, | |
| "mean_token_accuracy": 0.9605162478983402, | |
| "num_tokens": 78607.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.4054669588804245, | |
| "epoch": 0.17151607963246554, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 0.00018414634146341464, | |
| "loss": 0.12331730872392654, | |
| "mean_token_accuracy": 0.9529691338539124, | |
| "num_tokens": 84391.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.4127539284527302, | |
| "epoch": 0.18376722817764166, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 0.0001829268292682927, | |
| "loss": 0.1280103325843811, | |
| "mean_token_accuracy": 0.9521399922668934, | |
| "num_tokens": 89112.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.40787679236382246, | |
| "epoch": 0.19601837672281777, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 0.00018170731707317075, | |
| "loss": 0.1468421369791031, | |
| "mean_token_accuracy": 0.9514360092580318, | |
| "num_tokens": 94028.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.3514019288122654, | |
| "epoch": 0.2082695252679939, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 0.0001804878048780488, | |
| "loss": 0.10981348156929016, | |
| "mean_token_accuracy": 0.9557906277477741, | |
| "num_tokens": 99737.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.348665127530694, | |
| "epoch": 0.22052067381316998, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.00017926829268292684, | |
| "loss": 0.12969893217086792, | |
| "mean_token_accuracy": 0.951546210795641, | |
| "num_tokens": 105657.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.3983056088909507, | |
| "epoch": 0.2327718223583461, | |
| "grad_norm": 0.24609375, | |
| "learning_rate": 0.00017804878048780488, | |
| "loss": 0.14060811698436737, | |
| "mean_token_accuracy": 0.9409481771290302, | |
| "num_tokens": 111514.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.3584689646959305, | |
| "epoch": 0.2450229709035222, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 0.00017682926829268295, | |
| "loss": 0.12189538776874542, | |
| "mean_token_accuracy": 0.9588135071098804, | |
| "num_tokens": 116957.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.3528327913954854, | |
| "epoch": 0.2572741194486983, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 0.000175609756097561, | |
| "loss": 0.10923069715499878, | |
| "mean_token_accuracy": 0.9603886790573597, | |
| "num_tokens": 123277.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.349135366268456, | |
| "epoch": 0.26952526799387444, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 0.00017439024390243903, | |
| "loss": 0.13121618330478668, | |
| "mean_token_accuracy": 0.9497882351279259, | |
| "num_tokens": 130406.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.3748163701966405, | |
| "epoch": 0.28177641653905056, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 0.00017317073170731708, | |
| "loss": 0.13371798396110535, | |
| "mean_token_accuracy": 0.9507532455027103, | |
| "num_tokens": 136085.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.395503643900156, | |
| "epoch": 0.29402756508422667, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 0.00017195121951219512, | |
| "loss": 0.15047620236873627, | |
| "mean_token_accuracy": 0.9478774890303612, | |
| "num_tokens": 142531.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.35646383836865425, | |
| "epoch": 0.30627871362940273, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 0.0001707317073170732, | |
| "loss": 0.12330407649278641, | |
| "mean_token_accuracy": 0.9542177952826023, | |
| "num_tokens": 148432.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.32762761414051056, | |
| "epoch": 0.31852986217457885, | |
| "grad_norm": 0.193359375, | |
| "learning_rate": 0.00016951219512195123, | |
| "loss": 0.12972016632556915, | |
| "mean_token_accuracy": 0.9510112181305885, | |
| "num_tokens": 154616.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.3109707301482558, | |
| "epoch": 0.33078101071975496, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 0.00016829268292682927, | |
| "loss": 0.09245749562978745, | |
| "mean_token_accuracy": 0.9632142670452595, | |
| "num_tokens": 160355.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.343580374494195, | |
| "epoch": 0.3430321592649311, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 0.00016707317073170731, | |
| "loss": 0.1068074107170105, | |
| "mean_token_accuracy": 0.955970574170351, | |
| "num_tokens": 166182.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.38475809153169394, | |
| "epoch": 0.3552833078101072, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 0.00016585365853658536, | |
| "loss": 0.11399275064468384, | |
| "mean_token_accuracy": 0.9570133797824383, | |
| "num_tokens": 171286.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.3666055165231228, | |
| "epoch": 0.3675344563552833, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 0.00016463414634146343, | |
| "loss": 0.1474432796239853, | |
| "mean_token_accuracy": 0.9397772029042244, | |
| "num_tokens": 177662.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.31479439605027437, | |
| "epoch": 0.37978560490045943, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 0.00016341463414634147, | |
| "loss": 0.11059485375881195, | |
| "mean_token_accuracy": 0.9584382586181164, | |
| "num_tokens": 182284.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.3475527623668313, | |
| "epoch": 0.39203675344563554, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 0.00016219512195121954, | |
| "loss": 0.13328994810581207, | |
| "mean_token_accuracy": 0.9535585716366768, | |
| "num_tokens": 189322.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.3634985350072384, | |
| "epoch": 0.40428790199081166, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 0.00016097560975609758, | |
| "loss": 0.15225957334041595, | |
| "mean_token_accuracy": 0.9502801597118378, | |
| "num_tokens": 195113.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.3299488425254822, | |
| "epoch": 0.4165390505359878, | |
| "grad_norm": 0.25, | |
| "learning_rate": 0.00015975609756097562, | |
| "loss": 0.12952829897403717, | |
| "mean_token_accuracy": 0.9594988077878952, | |
| "num_tokens": 200835.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.3228675974532962, | |
| "epoch": 0.42879019908116384, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 0.00015853658536585366, | |
| "loss": 0.10655219852924347, | |
| "mean_token_accuracy": 0.9599267169833183, | |
| "num_tokens": 206172.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.3190277460962534, | |
| "epoch": 0.44104134762633995, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 0.00015731707317073173, | |
| "loss": 0.11382263898849487, | |
| "mean_token_accuracy": 0.9600558690726757, | |
| "num_tokens": 211901.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.3038849513977766, | |
| "epoch": 0.45329249617151607, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 0.00015609756097560978, | |
| "loss": 0.10026074945926666, | |
| "mean_token_accuracy": 0.9635081477463245, | |
| "num_tokens": 217629.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.31116786785423756, | |
| "epoch": 0.4655436447166922, | |
| "grad_norm": 0.1962890625, | |
| "learning_rate": 0.00015487804878048782, | |
| "loss": 0.13140451908111572, | |
| "mean_token_accuracy": 0.9521206878125668, | |
| "num_tokens": 223352.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.3148405561223626, | |
| "epoch": 0.4777947932618683, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 0.00015365853658536586, | |
| "loss": 0.11006736010313034, | |
| "mean_token_accuracy": 0.9543089419603348, | |
| "num_tokens": 229089.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.3327226936817169, | |
| "epoch": 0.4900459418070444, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 0.0001524390243902439, | |
| "loss": 0.11151966452598572, | |
| "mean_token_accuracy": 0.9555698931217194, | |
| "num_tokens": 234629.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.29484597500413656, | |
| "epoch": 0.5022970903522205, | |
| "grad_norm": 0.171875, | |
| "learning_rate": 0.00015121951219512197, | |
| "loss": 0.11951664835214615, | |
| "mean_token_accuracy": 0.9610061757266521, | |
| "num_tokens": 239950.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.33220329508185387, | |
| "epoch": 0.5145482388973966, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 0.10372602194547653, | |
| "mean_token_accuracy": 0.9613832570612431, | |
| "num_tokens": 244905.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.3460714379325509, | |
| "epoch": 0.5267993874425727, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 0.00014878048780487806, | |
| "loss": 0.13158759474754333, | |
| "mean_token_accuracy": 0.9516530968248844, | |
| "num_tokens": 251374.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.350130352191627, | |
| "epoch": 0.5390505359877489, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 0.0001475609756097561, | |
| "loss": 0.13656970858573914, | |
| "mean_token_accuracy": 0.9539419040083885, | |
| "num_tokens": 257465.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.3217963883653283, | |
| "epoch": 0.5513016845329249, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 0.00014634146341463414, | |
| "loss": 0.12704257667064667, | |
| "mean_token_accuracy": 0.954754151403904, | |
| "num_tokens": 262964.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.3284407975152135, | |
| "epoch": 0.5635528330781011, | |
| "grad_norm": 0.1796875, | |
| "learning_rate": 0.0001451219512195122, | |
| "loss": 0.09583695977926254, | |
| "mean_token_accuracy": 0.9638715162873268, | |
| "num_tokens": 268846.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.34618470072746277, | |
| "epoch": 0.5758039816232772, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 0.00014390243902439025, | |
| "loss": 0.14501920342445374, | |
| "mean_token_accuracy": 0.9476289339363575, | |
| "num_tokens": 274384.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.31532789394259453, | |
| "epoch": 0.5880551301684533, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 0.0001426829268292683, | |
| "loss": 0.12755413353443146, | |
| "mean_token_accuracy": 0.954731572419405, | |
| "num_tokens": 279484.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.363907678052783, | |
| "epoch": 0.6003062787136294, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 0.00014146341463414634, | |
| "loss": 0.10707266628742218, | |
| "mean_token_accuracy": 0.954724483191967, | |
| "num_tokens": 284817.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.31933102291077375, | |
| "epoch": 0.6125574272588055, | |
| "grad_norm": 0.1904296875, | |
| "learning_rate": 0.00014024390243902438, | |
| "loss": 0.09467475861310959, | |
| "mean_token_accuracy": 0.9665305241942406, | |
| "num_tokens": 290524.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.6125574272588055, | |
| "eval_entropy": 0.3426319423361101, | |
| "eval_loss": 0.10967054218053818, | |
| "eval_mean_token_accuracy": 0.9585090970647507, | |
| "eval_num_tokens": 290524.0, | |
| "eval_runtime": 56.6704, | |
| "eval_samples_per_second": 1.218, | |
| "eval_steps_per_second": 1.218, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.34031340666115284, | |
| "epoch": 0.6248085758039816, | |
| "grad_norm": 0.173828125, | |
| "learning_rate": 0.00013902439024390245, | |
| "loss": 0.09569472074508667, | |
| "mean_token_accuracy": 0.9653150551021099, | |
| "num_tokens": 296069.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.3571790661662817, | |
| "epoch": 0.6370597243491577, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 0.0001378048780487805, | |
| "loss": 0.10000051558017731, | |
| "mean_token_accuracy": 0.9638084918260574, | |
| "num_tokens": 301514.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.36696077417582273, | |
| "epoch": 0.6493108728943339, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 0.00013658536585365856, | |
| "loss": 0.10484882444143295, | |
| "mean_token_accuracy": 0.9642562530934811, | |
| "num_tokens": 307445.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.31568020675331354, | |
| "epoch": 0.6615620214395099, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 0.0001353658536585366, | |
| "loss": 0.11220870912075043, | |
| "mean_token_accuracy": 0.9582751281559467, | |
| "num_tokens": 312864.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.35635274462401867, | |
| "epoch": 0.6738131699846861, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 0.00013414634146341464, | |
| "loss": 0.13630808889865875, | |
| "mean_token_accuracy": 0.9525270387530327, | |
| "num_tokens": 319021.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.335802904330194, | |
| "epoch": 0.6860643185298622, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 0.0001329268292682927, | |
| "loss": 0.11126557737588882, | |
| "mean_token_accuracy": 0.9602809473872185, | |
| "num_tokens": 324169.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.37789826188236475, | |
| "epoch": 0.6983154670750383, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 0.00013170731707317076, | |
| "loss": 0.11750061064958572, | |
| "mean_token_accuracy": 0.9616654589772224, | |
| "num_tokens": 330105.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.3473435193300247, | |
| "epoch": 0.7105666156202144, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 0.0001304878048780488, | |
| "loss": 0.13293127715587616, | |
| "mean_token_accuracy": 0.960812620818615, | |
| "num_tokens": 336000.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.37645846977829933, | |
| "epoch": 0.7228177641653905, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 0.00012926829268292684, | |
| "loss": 0.14020267128944397, | |
| "mean_token_accuracy": 0.9494834020733833, | |
| "num_tokens": 341300.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.3391531016677618, | |
| "epoch": 0.7350689127105666, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 0.00012804878048780488, | |
| "loss": 0.11145544052124023, | |
| "mean_token_accuracy": 0.9526276290416718, | |
| "num_tokens": 347644.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.3284269040450454, | |
| "epoch": 0.7473200612557427, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 0.00012682926829268293, | |
| "loss": 0.13039623200893402, | |
| "mean_token_accuracy": 0.9597245752811432, | |
| "num_tokens": 354096.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.37672379054129124, | |
| "epoch": 0.7595712098009189, | |
| "grad_norm": 0.18359375, | |
| "learning_rate": 0.000125609756097561, | |
| "loss": 0.12314963340759277, | |
| "mean_token_accuracy": 0.9572922959923744, | |
| "num_tokens": 360519.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.3247836837545037, | |
| "epoch": 0.7718223583460949, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 0.00012439024390243904, | |
| "loss": 0.10076416283845901, | |
| "mean_token_accuracy": 0.9550469256937504, | |
| "num_tokens": 365666.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.3481680192053318, | |
| "epoch": 0.7840735068912711, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 0.00012317073170731708, | |
| "loss": 0.1191372275352478, | |
| "mean_token_accuracy": 0.9540783166885376, | |
| "num_tokens": 370703.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.3864388270303607, | |
| "epoch": 0.7963246554364471, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 0.00012195121951219512, | |
| "loss": 0.11211425065994263, | |
| "mean_token_accuracy": 0.9600110091269016, | |
| "num_tokens": 375933.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.3722238801419735, | |
| "epoch": 0.8085758039816233, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 0.00012073170731707318, | |
| "loss": 0.1060609444975853, | |
| "mean_token_accuracy": 0.9619267173111439, | |
| "num_tokens": 381934.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.31903401017189026, | |
| "epoch": 0.8208269525267994, | |
| "grad_norm": 0.1689453125, | |
| "learning_rate": 0.00011951219512195122, | |
| "loss": 0.10838343948125839, | |
| "mean_token_accuracy": 0.9615302868187428, | |
| "num_tokens": 388141.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.35833382699638605, | |
| "epoch": 0.8330781010719756, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 0.00011829268292682926, | |
| "loss": 0.11101208627223969, | |
| "mean_token_accuracy": 0.9624687656760216, | |
| "num_tokens": 393485.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.3384561138227582, | |
| "epoch": 0.8453292496171516, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 0.00011707317073170732, | |
| "loss": 0.11302048712968826, | |
| "mean_token_accuracy": 0.959359273314476, | |
| "num_tokens": 398110.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.3644536528736353, | |
| "epoch": 0.8575803981623277, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 0.00011585365853658536, | |
| "loss": 0.11044176667928696, | |
| "mean_token_accuracy": 0.9590198397636414, | |
| "num_tokens": 403100.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.38477543368935585, | |
| "epoch": 0.8698315467075038, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.00011463414634146342, | |
| "loss": 0.1475592851638794, | |
| "mean_token_accuracy": 0.9498578049242496, | |
| "num_tokens": 408866.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.3160585919395089, | |
| "epoch": 0.8820826952526799, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 0.00011341463414634146, | |
| "loss": 0.12280108034610748, | |
| "mean_token_accuracy": 0.9519775547087193, | |
| "num_tokens": 414474.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.35043725837022066, | |
| "epoch": 0.8943338437978561, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 0.00011219512195121953, | |
| "loss": 0.11857884377241135, | |
| "mean_token_accuracy": 0.957142923027277, | |
| "num_tokens": 420975.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.3428537016734481, | |
| "epoch": 0.9065849923430321, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 0.00011097560975609757, | |
| "loss": 0.09723620116710663, | |
| "mean_token_accuracy": 0.9623003490269184, | |
| "num_tokens": 427160.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.3439481556415558, | |
| "epoch": 0.9188361408882083, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 0.00010975609756097563, | |
| "loss": 0.12700851261615753, | |
| "mean_token_accuracy": 0.9501284696161747, | |
| "num_tokens": 432740.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.396132281050086, | |
| "epoch": 0.9310872894333844, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 0.00010853658536585367, | |
| "loss": 0.11111584305763245, | |
| "mean_token_accuracy": 0.954656295478344, | |
| "num_tokens": 437553.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.31287234649062157, | |
| "epoch": 0.9433384379785605, | |
| "grad_norm": 0.1572265625, | |
| "learning_rate": 0.00010731707317073172, | |
| "loss": 0.09217967838048935, | |
| "mean_token_accuracy": 0.9652052000164986, | |
| "num_tokens": 443939.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.3873864635825157, | |
| "epoch": 0.9555895865237366, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 0.00010609756097560977, | |
| "loss": 0.11941950023174286, | |
| "mean_token_accuracy": 0.9584939330816269, | |
| "num_tokens": 449426.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.3076135413721204, | |
| "epoch": 0.9678407350689127, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 0.00010487804878048781, | |
| "loss": 0.10192415118217468, | |
| "mean_token_accuracy": 0.9579608179628849, | |
| "num_tokens": 454962.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.33057918306440115, | |
| "epoch": 0.9800918836140888, | |
| "grad_norm": 0.1748046875, | |
| "learning_rate": 0.00010365853658536586, | |
| "loss": 0.09825513511896133, | |
| "mean_token_accuracy": 0.9628020562231541, | |
| "num_tokens": 461420.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.344515610486269, | |
| "epoch": 0.9923430321592649, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 0.0001024390243902439, | |
| "loss": 0.1298864483833313, | |
| "mean_token_accuracy": 0.9483123049139977, | |
| "num_tokens": 467214.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.35632768720388414, | |
| "epoch": 1.0, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 0.00010121951219512196, | |
| "loss": 0.0866069495677948, | |
| "mean_token_accuracy": 0.9732943534851074, | |
| "num_tokens": 470456.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.32443390041589737, | |
| "epoch": 1.0122511485451762, | |
| "grad_norm": 0.158203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.08388859778642654, | |
| "mean_token_accuracy": 0.9714479111135006, | |
| "num_tokens": 476660.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.27756834030151367, | |
| "epoch": 1.0245022970903521, | |
| "grad_norm": 0.134765625, | |
| "learning_rate": 9.878048780487805e-05, | |
| "loss": 0.07270920276641846, | |
| "mean_token_accuracy": 0.9756675288081169, | |
| "num_tokens": 482852.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.37221179995685816, | |
| "epoch": 1.0367534456355283, | |
| "grad_norm": 0.173828125, | |
| "learning_rate": 9.75609756097561e-05, | |
| "loss": 0.09018392115831375, | |
| "mean_token_accuracy": 0.9692036546766758, | |
| "num_tokens": 489720.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.2815575134009123, | |
| "epoch": 1.0490045941807045, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 9.634146341463415e-05, | |
| "loss": 0.0751088559627533, | |
| "mean_token_accuracy": 0.9700377807021141, | |
| "num_tokens": 494962.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.3032172666862607, | |
| "epoch": 1.0612557427258806, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 9.51219512195122e-05, | |
| "loss": 0.08939642459154129, | |
| "mean_token_accuracy": 0.9708281457424164, | |
| "num_tokens": 501167.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.3396748472005129, | |
| "epoch": 1.0735068912710566, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 9.390243902439024e-05, | |
| "loss": 0.07543614506721497, | |
| "mean_token_accuracy": 0.9790169149637222, | |
| "num_tokens": 506009.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.3040660824626684, | |
| "epoch": 1.0857580398162328, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 9.26829268292683e-05, | |
| "loss": 0.0811336562037468, | |
| "mean_token_accuracy": 0.9736967124044895, | |
| "num_tokens": 511394.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.31143750343471766, | |
| "epoch": 1.098009188361409, | |
| "grad_norm": 0.173828125, | |
| "learning_rate": 9.146341463414635e-05, | |
| "loss": 0.08774841576814651, | |
| "mean_token_accuracy": 0.9692316688597202, | |
| "num_tokens": 516837.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.34162401407957077, | |
| "epoch": 1.110260336906585, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 9.02439024390244e-05, | |
| "loss": 0.09738526493310928, | |
| "mean_token_accuracy": 0.9668772779405117, | |
| "num_tokens": 521646.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.2694440744817257, | |
| "epoch": 1.122511485451761, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 8.902439024390244e-05, | |
| "loss": 0.0640217661857605, | |
| "mean_token_accuracy": 0.9778573513031006, | |
| "num_tokens": 527366.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.2505084676668048, | |
| "epoch": 1.1347626339969372, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 8.78048780487805e-05, | |
| "loss": 0.07979685813188553, | |
| "mean_token_accuracy": 0.973389033228159, | |
| "num_tokens": 533210.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.2535351850092411, | |
| "epoch": 1.1470137825421134, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 8.658536585365854e-05, | |
| "loss": 0.06256209313869476, | |
| "mean_token_accuracy": 0.9790448397397995, | |
| "num_tokens": 538589.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.2352813482284546, | |
| "epoch": 1.1592649310872893, | |
| "grad_norm": 0.1962890625, | |
| "learning_rate": 8.53658536585366e-05, | |
| "loss": 0.07825516164302826, | |
| "mean_token_accuracy": 0.9773083217442036, | |
| "num_tokens": 544486.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.23897481244057417, | |
| "epoch": 1.1715160796324655, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 8.414634146341464e-05, | |
| "loss": 0.0690295547246933, | |
| "mean_token_accuracy": 0.9745013862848282, | |
| "num_tokens": 549647.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.2606777008622885, | |
| "epoch": 1.1837672281776417, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 8.292682926829268e-05, | |
| "loss": 0.07471512258052826, | |
| "mean_token_accuracy": 0.9725949242711067, | |
| "num_tokens": 554687.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.27557028736919165, | |
| "epoch": 1.1960183767228179, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 8.170731707317073e-05, | |
| "loss": 0.06931524723768234, | |
| "mean_token_accuracy": 0.978624414652586, | |
| "num_tokens": 560533.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.2943765129894018, | |
| "epoch": 1.2082695252679938, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 8.048780487804879e-05, | |
| "loss": 0.0812952071428299, | |
| "mean_token_accuracy": 0.9725493676960468, | |
| "num_tokens": 566604.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.2441987576894462, | |
| "epoch": 1.22052067381317, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 7.926829268292683e-05, | |
| "loss": 0.059226248413324356, | |
| "mean_token_accuracy": 0.9759947806596756, | |
| "num_tokens": 572527.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.22052067381317, | |
| "eval_entropy": 0.27095302215952805, | |
| "eval_loss": 0.11123082786798477, | |
| "eval_mean_token_accuracy": 0.9592912940011509, | |
| "eval_num_tokens": 572527.0, | |
| "eval_runtime": 56.8972, | |
| "eval_samples_per_second": 1.213, | |
| "eval_steps_per_second": 1.213, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.2656272081658244, | |
| "epoch": 1.2327718223583461, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 7.804878048780489e-05, | |
| "loss": 0.08751720190048218, | |
| "mean_token_accuracy": 0.9679245948791504, | |
| "num_tokens": 578760.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.2706137653440237, | |
| "epoch": 1.245022970903522, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 7.682926829268293e-05, | |
| "loss": 0.08201148360967636, | |
| "mean_token_accuracy": 0.9710356555879116, | |
| "num_tokens": 584503.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.2744300989434123, | |
| "epoch": 1.2572741194486983, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 7.560975609756099e-05, | |
| "loss": 0.05856996402144432, | |
| "mean_token_accuracy": 0.9778167866170406, | |
| "num_tokens": 590506.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.26921656634658575, | |
| "epoch": 1.2695252679938744, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 7.439024390243903e-05, | |
| "loss": 0.0782560482621193, | |
| "mean_token_accuracy": 0.9701778888702393, | |
| "num_tokens": 596390.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.2885159160941839, | |
| "epoch": 1.2817764165390506, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 7.317073170731707e-05, | |
| "loss": 0.06652253121137619, | |
| "mean_token_accuracy": 0.982722382992506, | |
| "num_tokens": 601411.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.2918844725936651, | |
| "epoch": 1.2940275650842268, | |
| "grad_norm": 0.1875, | |
| "learning_rate": 7.195121951219513e-05, | |
| "loss": 0.05815374106168747, | |
| "mean_token_accuracy": 0.9814562760293484, | |
| "num_tokens": 606925.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.3184075551107526, | |
| "epoch": 1.3062787136294027, | |
| "grad_norm": 0.181640625, | |
| "learning_rate": 7.073170731707317e-05, | |
| "loss": 0.07176389545202255, | |
| "mean_token_accuracy": 0.9732142426073551, | |
| "num_tokens": 611981.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.28589071705937386, | |
| "epoch": 1.318529862174579, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 6.951219512195122e-05, | |
| "loss": 0.06908947974443436, | |
| "mean_token_accuracy": 0.9742955937981606, | |
| "num_tokens": 617502.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.30427863635122776, | |
| "epoch": 1.3307810107197549, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 6.829268292682928e-05, | |
| "loss": 0.08661782741546631, | |
| "mean_token_accuracy": 0.9723380617797375, | |
| "num_tokens": 623345.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.2784799374639988, | |
| "epoch": 1.343032159264931, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 6.707317073170732e-05, | |
| "loss": 0.08172982931137085, | |
| "mean_token_accuracy": 0.9692776277661324, | |
| "num_tokens": 629819.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.2621075566858053, | |
| "epoch": 1.3552833078101072, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 6.585365853658538e-05, | |
| "loss": 0.06896749883890152, | |
| "mean_token_accuracy": 0.9779512844979763, | |
| "num_tokens": 635274.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.2867768844589591, | |
| "epoch": 1.3675344563552834, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 6.463414634146342e-05, | |
| "loss": 0.06578939408063889, | |
| "mean_token_accuracy": 0.9736802577972412, | |
| "num_tokens": 640792.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.2833663960918784, | |
| "epoch": 1.3797856049004595, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 6.341463414634146e-05, | |
| "loss": 0.0802934393286705, | |
| "mean_token_accuracy": 0.975794829428196, | |
| "num_tokens": 647553.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.2789901904761791, | |
| "epoch": 1.3920367534456355, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 6.219512195121952e-05, | |
| "loss": 0.07617770880460739, | |
| "mean_token_accuracy": 0.9711511395871639, | |
| "num_tokens": 653249.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.27549734245985746, | |
| "epoch": 1.4042879019908117, | |
| "grad_norm": 0.162109375, | |
| "learning_rate": 6.097560975609756e-05, | |
| "loss": 0.050225261598825455, | |
| "mean_token_accuracy": 0.981207113713026, | |
| "num_tokens": 659773.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.2797435289248824, | |
| "epoch": 1.4165390505359878, | |
| "grad_norm": 0.1796875, | |
| "learning_rate": 5.975609756097561e-05, | |
| "loss": 0.05845767632126808, | |
| "mean_token_accuracy": 0.97660356387496, | |
| "num_tokens": 665061.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.26317309867590666, | |
| "epoch": 1.4287901990811638, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 5.853658536585366e-05, | |
| "loss": 0.07422497868537903, | |
| "mean_token_accuracy": 0.970589954406023, | |
| "num_tokens": 670610.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.2940791519358754, | |
| "epoch": 1.44104134762634, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 5.731707317073171e-05, | |
| "loss": 0.07870218902826309, | |
| "mean_token_accuracy": 0.9769303686916828, | |
| "num_tokens": 676597.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.3041226239874959, | |
| "epoch": 1.4532924961715161, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 5.6097560975609764e-05, | |
| "loss": 0.07072751969099045, | |
| "mean_token_accuracy": 0.9737785942852497, | |
| "num_tokens": 682445.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.2781915618106723, | |
| "epoch": 1.4655436447166923, | |
| "grad_norm": 0.1884765625, | |
| "learning_rate": 5.487804878048781e-05, | |
| "loss": 0.07238440960645676, | |
| "mean_token_accuracy": 0.9758684188127518, | |
| "num_tokens": 688019.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.2433428610675037, | |
| "epoch": 1.4777947932618682, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 5.365853658536586e-05, | |
| "loss": 0.07184246182441711, | |
| "mean_token_accuracy": 0.9751845635473728, | |
| "num_tokens": 694026.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.2679327353835106, | |
| "epoch": 1.4900459418070444, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 5.2439024390243904e-05, | |
| "loss": 0.07859291881322861, | |
| "mean_token_accuracy": 0.9759643562138081, | |
| "num_tokens": 699917.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.25062850676476955, | |
| "epoch": 1.5022970903522204, | |
| "grad_norm": 0.16796875, | |
| "learning_rate": 5.121951219512195e-05, | |
| "loss": 0.052012164145708084, | |
| "mean_token_accuracy": 0.9831658490002155, | |
| "num_tokens": 705423.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.2631832705810666, | |
| "epoch": 1.5145482388973965, | |
| "grad_norm": 0.169921875, | |
| "learning_rate": 5e-05, | |
| "loss": 0.0810304507613182, | |
| "mean_token_accuracy": 0.9739143140614033, | |
| "num_tokens": 712049.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.28171470761299133, | |
| "epoch": 1.5267993874425727, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 4.878048780487805e-05, | |
| "loss": 0.07931914180517197, | |
| "mean_token_accuracy": 0.9697326384484768, | |
| "num_tokens": 718759.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.2730549927800894, | |
| "epoch": 1.5390505359877489, | |
| "grad_norm": 0.19140625, | |
| "learning_rate": 4.75609756097561e-05, | |
| "loss": 0.06596571952104568, | |
| "mean_token_accuracy": 0.9794163964688778, | |
| "num_tokens": 724151.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.3451000778004527, | |
| "epoch": 1.551301684532925, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 4.634146341463415e-05, | |
| "loss": 0.07738037407398224, | |
| "mean_token_accuracy": 0.9751269891858101, | |
| "num_tokens": 729663.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.3187516676262021, | |
| "epoch": 1.5635528330781012, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 4.51219512195122e-05, | |
| "loss": 0.07062625885009766, | |
| "mean_token_accuracy": 0.9731609113514423, | |
| "num_tokens": 734823.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.25056853611022234, | |
| "epoch": 1.5758039816232772, | |
| "grad_norm": 0.1669921875, | |
| "learning_rate": 4.390243902439025e-05, | |
| "loss": 0.050548747181892395, | |
| "mean_token_accuracy": 0.9795470051467419, | |
| "num_tokens": 739981.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.24678445514291525, | |
| "epoch": 1.5880551301684533, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 4.26829268292683e-05, | |
| "loss": 0.08084772527217865, | |
| "mean_token_accuracy": 0.9744056761264801, | |
| "num_tokens": 745848.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.2766247531399131, | |
| "epoch": 1.6003062787136293, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 4.146341463414634e-05, | |
| "loss": 0.06591574102640152, | |
| "mean_token_accuracy": 0.9837718568742275, | |
| "num_tokens": 751338.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.2634483175352216, | |
| "epoch": 1.6125574272588055, | |
| "grad_norm": 0.1845703125, | |
| "learning_rate": 4.0243902439024395e-05, | |
| "loss": 0.08018708974123001, | |
| "mean_token_accuracy": 0.9686751514673233, | |
| "num_tokens": 757944.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.2640473246574402, | |
| "epoch": 1.6248085758039816, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 3.9024390243902444e-05, | |
| "loss": 0.058747079223394394, | |
| "mean_token_accuracy": 0.9809320084750652, | |
| "num_tokens": 762909.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.2685444802045822, | |
| "epoch": 1.6370597243491578, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 3.780487804878049e-05, | |
| "loss": 0.06983562558889389, | |
| "mean_token_accuracy": 0.9774856679141521, | |
| "num_tokens": 768121.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.28071946976706386, | |
| "epoch": 1.649310872894334, | |
| "grad_norm": 0.1806640625, | |
| "learning_rate": 3.6585365853658535e-05, | |
| "loss": 0.06817604601383209, | |
| "mean_token_accuracy": 0.9757047519087791, | |
| "num_tokens": 773835.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.3044859105721116, | |
| "epoch": 1.66156202143951, | |
| "grad_norm": 0.181640625, | |
| "learning_rate": 3.5365853658536584e-05, | |
| "loss": 0.07083944231271744, | |
| "mean_token_accuracy": 0.9756592996418476, | |
| "num_tokens": 779008.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.28345807548612356, | |
| "epoch": 1.673813169984686, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 3.414634146341464e-05, | |
| "loss": 0.08097834140062332, | |
| "mean_token_accuracy": 0.97182647138834, | |
| "num_tokens": 784955.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.2775820689275861, | |
| "epoch": 1.686064318529862, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 3.292682926829269e-05, | |
| "loss": 0.0830724835395813, | |
| "mean_token_accuracy": 0.97340302541852, | |
| "num_tokens": 792300.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.28273867163807154, | |
| "epoch": 1.6983154670750382, | |
| "grad_norm": 0.1728515625, | |
| "learning_rate": 3.170731707317073e-05, | |
| "loss": 0.06122542545199394, | |
| "mean_token_accuracy": 0.97363705560565, | |
| "num_tokens": 797635.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.33463940117508173, | |
| "epoch": 1.7105666156202144, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 3.048780487804878e-05, | |
| "loss": 0.09268856793642044, | |
| "mean_token_accuracy": 0.968308299779892, | |
| "num_tokens": 803321.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.26861980091780424, | |
| "epoch": 1.7228177641653906, | |
| "grad_norm": 0.1923828125, | |
| "learning_rate": 2.926829268292683e-05, | |
| "loss": 0.06270366907119751, | |
| "mean_token_accuracy": 0.9758359678089619, | |
| "num_tokens": 808696.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.24330784939229488, | |
| "epoch": 1.7350689127105667, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 2.8048780487804882e-05, | |
| "loss": 0.05480020493268967, | |
| "mean_token_accuracy": 0.9805137030780315, | |
| "num_tokens": 813988.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.2642217818647623, | |
| "epoch": 1.7473200612557427, | |
| "grad_norm": 0.1669921875, | |
| "learning_rate": 2.682926829268293e-05, | |
| "loss": 0.06715261191129684, | |
| "mean_token_accuracy": 0.9766382575035095, | |
| "num_tokens": 820087.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.3016277579590678, | |
| "epoch": 1.7595712098009189, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 2.5609756097560977e-05, | |
| "loss": 0.057253021746873856, | |
| "mean_token_accuracy": 0.9784747660160065, | |
| "num_tokens": 825766.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.27436165884137154, | |
| "epoch": 1.7718223583460948, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 2.4390243902439026e-05, | |
| "loss": 0.05420134961605072, | |
| "mean_token_accuracy": 0.980743058025837, | |
| "num_tokens": 831298.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.28155668918043375, | |
| "epoch": 1.784073506891271, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 2.3170731707317075e-05, | |
| "loss": 0.07169967144727707, | |
| "mean_token_accuracy": 0.9731598235666752, | |
| "num_tokens": 837254.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.2712876806035638, | |
| "epoch": 1.7963246554364471, | |
| "grad_norm": 0.1904296875, | |
| "learning_rate": 2.1951219512195124e-05, | |
| "loss": 0.06996440887451172, | |
| "mean_token_accuracy": 0.9760563708841801, | |
| "num_tokens": 843796.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.29682911094278097, | |
| "epoch": 1.8085758039816233, | |
| "grad_norm": 0.1767578125, | |
| "learning_rate": 2.073170731707317e-05, | |
| "loss": 0.06667114794254303, | |
| "mean_token_accuracy": 0.9743672311306, | |
| "num_tokens": 849621.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.29692134354263544, | |
| "epoch": 1.8208269525267995, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 1.9512195121951222e-05, | |
| "loss": 0.07836263626813889, | |
| "mean_token_accuracy": 0.9702205285429955, | |
| "num_tokens": 854578.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.2837211322039366, | |
| "epoch": 1.8330781010719757, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.8292682926829268e-05, | |
| "loss": 0.07421581447124481, | |
| "mean_token_accuracy": 0.9740220792591572, | |
| "num_tokens": 861001.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.8330781010719757, | |
| "eval_entropy": 0.28257156163454056, | |
| "eval_loss": 0.11007164418697357, | |
| "eval_mean_token_accuracy": 0.9602361796558767, | |
| "eval_num_tokens": 861001.0, | |
| "eval_runtime": 56.9589, | |
| "eval_samples_per_second": 1.211, | |
| "eval_steps_per_second": 1.211, | |
| "step": 150 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 164, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.898716388139827e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |