{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8330781010719757, "eval_steps": 50, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.6279787886887789, "epoch": 0.01225114854517611, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 0.29868432879447937, "mean_token_accuracy": 0.9193268120288849, "num_tokens": 6880.0, "step": 1 }, { "entropy": 0.5897643249481916, "epoch": 0.02450229709035222, "grad_norm": 0.30078125, "learning_rate": 0.00019878048780487805, "loss": 0.2462826669216156, "mean_token_accuracy": 0.9280684292316437, "num_tokens": 13136.0, "step": 2 }, { "entropy": 0.5422123614698648, "epoch": 0.036753445635528334, "grad_norm": 0.2314453125, "learning_rate": 0.0001975609756097561, "loss": 0.20162586867809296, "mean_token_accuracy": 0.9337548539042473, "num_tokens": 19103.0, "step": 3 }, { "entropy": 0.4943904746323824, "epoch": 0.04900459418070444, "grad_norm": 0.25390625, "learning_rate": 0.00019634146341463416, "loss": 0.14251382648944855, "mean_token_accuracy": 0.95210937038064, "num_tokens": 24423.0, "step": 4 }, { "entropy": 0.45216177217662334, "epoch": 0.06125574272588055, "grad_norm": 0.234375, "learning_rate": 0.0001951219512195122, "loss": 0.1607554852962494, "mean_token_accuracy": 0.9485567137598991, "num_tokens": 30608.0, "step": 5 }, { "entropy": 0.40192629024386406, "epoch": 0.07350689127105667, "grad_norm": 0.2578125, "learning_rate": 0.00019390243902439025, "loss": 0.16246657073497772, "mean_token_accuracy": 0.9427705891430378, "num_tokens": 36811.0, "step": 6 }, { "entropy": 0.3904233919456601, "epoch": 0.08575803981623277, "grad_norm": 0.2890625, "learning_rate": 0.0001926829268292683, "loss": 0.17223770916461945, "mean_token_accuracy": 0.941681407392025, "num_tokens": 43080.0, "step": 7 }, { "entropy": 0.3865286596119404, "epoch": 0.09800918836140889, "grad_norm": 0.263671875, "learning_rate": 0.00019146341463414633, "loss": 0.14094355702400208, "mean_token_accuracy": 0.9572678282856941, "num_tokens": 48449.0, "step": 8 }, { "entropy": 0.370680570602417, "epoch": 0.11026033690658499, "grad_norm": 0.1943359375, "learning_rate": 0.0001902439024390244, "loss": 0.14527979493141174, "mean_token_accuracy": 0.9480055123567581, "num_tokens": 54626.0, "step": 9 }, { "entropy": 0.41665984131395817, "epoch": 0.1225114854517611, "grad_norm": 0.2080078125, "learning_rate": 0.00018902439024390244, "loss": 0.13078074157238007, "mean_token_accuracy": 0.9527099393308163, "num_tokens": 60057.0, "step": 10 }, { "entropy": 0.41189629677683115, "epoch": 0.13476263399693722, "grad_norm": 0.224609375, "learning_rate": 0.0001878048780487805, "loss": 0.12864071130752563, "mean_token_accuracy": 0.9570614397525787, "num_tokens": 66167.0, "step": 11 }, { "entropy": 0.454614844173193, "epoch": 0.14701378254211334, "grad_norm": 0.2109375, "learning_rate": 0.00018658536585365856, "loss": 0.11721982061862946, "mean_token_accuracy": 0.9633984379470348, "num_tokens": 72797.0, "step": 12 }, { "entropy": 0.42440748680382967, "epoch": 0.15926493108728942, "grad_norm": 0.2060546875, "learning_rate": 0.0001853658536585366, "loss": 0.12409229576587677, "mean_token_accuracy": 0.9605162478983402, "num_tokens": 78607.0, "step": 13 }, { "entropy": 0.4054669588804245, "epoch": 0.17151607963246554, "grad_norm": 0.2060546875, "learning_rate": 0.00018414634146341464, "loss": 0.12331730872392654, "mean_token_accuracy": 0.9529691338539124, "num_tokens": 84391.0, "step": 14 }, { "entropy": 0.4127539284527302, "epoch": 0.18376722817764166, "grad_norm": 0.2734375, "learning_rate": 0.0001829268292682927, "loss": 0.1280103325843811, "mean_token_accuracy": 0.9521399922668934, "num_tokens": 89112.0, "step": 15 }, { "entropy": 0.40787679236382246, "epoch": 0.19601837672281777, "grad_norm": 0.2890625, "learning_rate": 0.00018170731707317075, "loss": 0.1468421369791031, "mean_token_accuracy": 0.9514360092580318, "num_tokens": 94028.0, "step": 16 }, { "entropy": 0.3514019288122654, "epoch": 0.2082695252679939, "grad_norm": 0.2109375, "learning_rate": 0.0001804878048780488, "loss": 0.10981348156929016, "mean_token_accuracy": 0.9557906277477741, "num_tokens": 99737.0, "step": 17 }, { "entropy": 0.348665127530694, "epoch": 0.22052067381316998, "grad_norm": 0.314453125, "learning_rate": 0.00017926829268292684, "loss": 0.12969893217086792, "mean_token_accuracy": 0.951546210795641, "num_tokens": 105657.0, "step": 18 }, { "entropy": 0.3983056088909507, "epoch": 0.2327718223583461, "grad_norm": 0.24609375, "learning_rate": 0.00017804878048780488, "loss": 0.14060811698436737, "mean_token_accuracy": 0.9409481771290302, "num_tokens": 111514.0, "step": 19 }, { "entropy": 0.3584689646959305, "epoch": 0.2450229709035222, "grad_norm": 0.298828125, "learning_rate": 0.00017682926829268295, "loss": 0.12189538776874542, "mean_token_accuracy": 0.9588135071098804, "num_tokens": 116957.0, "step": 20 }, { "entropy": 0.3528327913954854, "epoch": 0.2572741194486983, "grad_norm": 0.2265625, "learning_rate": 0.000175609756097561, "loss": 0.10923069715499878, "mean_token_accuracy": 0.9603886790573597, "num_tokens": 123277.0, "step": 21 }, { "entropy": 0.349135366268456, "epoch": 0.26952526799387444, "grad_norm": 0.2060546875, "learning_rate": 0.00017439024390243903, "loss": 0.13121618330478668, "mean_token_accuracy": 0.9497882351279259, "num_tokens": 130406.0, "step": 22 }, { "entropy": 0.3748163701966405, "epoch": 0.28177641653905056, "grad_norm": 0.25390625, "learning_rate": 0.00017317073170731708, "loss": 0.13371798396110535, "mean_token_accuracy": 0.9507532455027103, "num_tokens": 136085.0, "step": 23 }, { "entropy": 0.395503643900156, "epoch": 0.29402756508422667, "grad_norm": 0.22265625, "learning_rate": 0.00017195121951219512, "loss": 0.15047620236873627, "mean_token_accuracy": 0.9478774890303612, "num_tokens": 142531.0, "step": 24 }, { "entropy": 0.35646383836865425, "epoch": 0.30627871362940273, "grad_norm": 0.232421875, "learning_rate": 0.0001707317073170732, "loss": 0.12330407649278641, "mean_token_accuracy": 0.9542177952826023, "num_tokens": 148432.0, "step": 25 }, { "entropy": 0.32762761414051056, "epoch": 0.31852986217457885, "grad_norm": 0.193359375, "learning_rate": 0.00016951219512195123, "loss": 0.12972016632556915, "mean_token_accuracy": 0.9510112181305885, "num_tokens": 154616.0, "step": 26 }, { "entropy": 0.3109707301482558, "epoch": 0.33078101071975496, "grad_norm": 0.27734375, "learning_rate": 0.00016829268292682927, "loss": 0.09245749562978745, "mean_token_accuracy": 0.9632142670452595, "num_tokens": 160355.0, "step": 27 }, { "entropy": 0.343580374494195, "epoch": 0.3430321592649311, "grad_norm": 0.267578125, "learning_rate": 0.00016707317073170731, "loss": 0.1068074107170105, "mean_token_accuracy": 0.955970574170351, "num_tokens": 166182.0, "step": 28 }, { "entropy": 0.38475809153169394, "epoch": 0.3552833078101072, "grad_norm": 0.2578125, "learning_rate": 0.00016585365853658536, "loss": 0.11399275064468384, "mean_token_accuracy": 0.9570133797824383, "num_tokens": 171286.0, "step": 29 }, { "entropy": 0.3666055165231228, "epoch": 0.3675344563552833, "grad_norm": 0.267578125, "learning_rate": 0.00016463414634146343, "loss": 0.1474432796239853, "mean_token_accuracy": 0.9397772029042244, "num_tokens": 177662.0, "step": 30 }, { "entropy": 0.31479439605027437, "epoch": 0.37978560490045943, "grad_norm": 0.27734375, "learning_rate": 0.00016341463414634147, "loss": 0.11059485375881195, "mean_token_accuracy": 0.9584382586181164, "num_tokens": 182284.0, "step": 31 }, { "entropy": 0.3475527623668313, "epoch": 0.39203675344563554, "grad_norm": 0.2373046875, "learning_rate": 0.00016219512195121954, "loss": 0.13328994810581207, "mean_token_accuracy": 0.9535585716366768, "num_tokens": 189322.0, "step": 32 }, { "entropy": 0.3634985350072384, "epoch": 0.40428790199081166, "grad_norm": 0.21875, "learning_rate": 0.00016097560975609758, "loss": 0.15225957334041595, "mean_token_accuracy": 0.9502801597118378, "num_tokens": 195113.0, "step": 33 }, { "entropy": 0.3299488425254822, "epoch": 0.4165390505359878, "grad_norm": 0.25, "learning_rate": 0.00015975609756097562, "loss": 0.12952829897403717, "mean_token_accuracy": 0.9594988077878952, "num_tokens": 200835.0, "step": 34 }, { "entropy": 0.3228675974532962, "epoch": 0.42879019908116384, "grad_norm": 0.2197265625, "learning_rate": 0.00015853658536585366, "loss": 0.10655219852924347, "mean_token_accuracy": 0.9599267169833183, "num_tokens": 206172.0, "step": 35 }, { "entropy": 0.3190277460962534, "epoch": 0.44104134762633995, "grad_norm": 0.2080078125, "learning_rate": 0.00015731707317073173, "loss": 0.11382263898849487, "mean_token_accuracy": 0.9600558690726757, "num_tokens": 211901.0, "step": 36 }, { "entropy": 0.3038849513977766, "epoch": 0.45329249617151607, "grad_norm": 0.2021484375, "learning_rate": 0.00015609756097560978, "loss": 0.10026074945926666, "mean_token_accuracy": 0.9635081477463245, "num_tokens": 217629.0, "step": 37 }, { "entropy": 0.31116786785423756, "epoch": 0.4655436447166922, "grad_norm": 0.1962890625, "learning_rate": 0.00015487804878048782, "loss": 0.13140451908111572, "mean_token_accuracy": 0.9521206878125668, "num_tokens": 223352.0, "step": 38 }, { "entropy": 0.3148405561223626, "epoch": 0.4777947932618683, "grad_norm": 0.201171875, "learning_rate": 0.00015365853658536586, "loss": 0.11006736010313034, "mean_token_accuracy": 0.9543089419603348, "num_tokens": 229089.0, "step": 39 }, { "entropy": 0.3327226936817169, "epoch": 0.4900459418070444, "grad_norm": 0.21875, "learning_rate": 0.0001524390243902439, "loss": 0.11151966452598572, "mean_token_accuracy": 0.9555698931217194, "num_tokens": 234629.0, "step": 40 }, { "entropy": 0.29484597500413656, "epoch": 0.5022970903522205, "grad_norm": 0.171875, "learning_rate": 0.00015121951219512197, "loss": 0.11951664835214615, "mean_token_accuracy": 0.9610061757266521, "num_tokens": 239950.0, "step": 41 }, { "entropy": 0.33220329508185387, "epoch": 0.5145482388973966, "grad_norm": 0.25390625, "learning_rate": 0.00015000000000000001, "loss": 0.10372602194547653, "mean_token_accuracy": 0.9613832570612431, "num_tokens": 244905.0, "step": 42 }, { "entropy": 0.3460714379325509, "epoch": 0.5267993874425727, "grad_norm": 0.2373046875, "learning_rate": 0.00014878048780487806, "loss": 0.13158759474754333, "mean_token_accuracy": 0.9516530968248844, "num_tokens": 251374.0, "step": 43 }, { "entropy": 0.350130352191627, "epoch": 0.5390505359877489, "grad_norm": 0.21484375, "learning_rate": 0.0001475609756097561, "loss": 0.13656970858573914, "mean_token_accuracy": 0.9539419040083885, "num_tokens": 257465.0, "step": 44 }, { "entropy": 0.3217963883653283, "epoch": 0.5513016845329249, "grad_norm": 0.2421875, "learning_rate": 0.00014634146341463414, "loss": 0.12704257667064667, "mean_token_accuracy": 0.954754151403904, "num_tokens": 262964.0, "step": 45 }, { "entropy": 0.3284407975152135, "epoch": 0.5635528330781011, "grad_norm": 0.1796875, "learning_rate": 0.0001451219512195122, "loss": 0.09583695977926254, "mean_token_accuracy": 0.9638715162873268, "num_tokens": 268846.0, "step": 46 }, { "entropy": 0.34618470072746277, "epoch": 0.5758039816232772, "grad_norm": 0.267578125, "learning_rate": 0.00014390243902439025, "loss": 0.14501920342445374, "mean_token_accuracy": 0.9476289339363575, "num_tokens": 274384.0, "step": 47 }, { "entropy": 0.31532789394259453, "epoch": 0.5880551301684533, "grad_norm": 0.2451171875, "learning_rate": 0.0001426829268292683, "loss": 0.12755413353443146, "mean_token_accuracy": 0.954731572419405, "num_tokens": 279484.0, "step": 48 }, { "entropy": 0.363907678052783, "epoch": 0.6003062787136294, "grad_norm": 0.19921875, "learning_rate": 0.00014146341463414634, "loss": 0.10707266628742218, "mean_token_accuracy": 0.954724483191967, "num_tokens": 284817.0, "step": 49 }, { "entropy": 0.31933102291077375, "epoch": 0.6125574272588055, "grad_norm": 0.1904296875, "learning_rate": 0.00014024390243902438, "loss": 0.09467475861310959, "mean_token_accuracy": 0.9665305241942406, "num_tokens": 290524.0, "step": 50 }, { "epoch": 0.6125574272588055, "eval_entropy": 0.3426319423361101, "eval_loss": 0.10967054218053818, "eval_mean_token_accuracy": 0.9585090970647507, "eval_num_tokens": 290524.0, "eval_runtime": 56.6704, "eval_samples_per_second": 1.218, "eval_steps_per_second": 1.218, "step": 50 }, { "entropy": 0.34031340666115284, "epoch": 0.6248085758039816, "grad_norm": 0.173828125, "learning_rate": 0.00013902439024390245, "loss": 0.09569472074508667, "mean_token_accuracy": 0.9653150551021099, "num_tokens": 296069.0, "step": 51 }, { "entropy": 0.3571790661662817, "epoch": 0.6370597243491577, "grad_norm": 0.2421875, "learning_rate": 0.0001378048780487805, "loss": 0.10000051558017731, "mean_token_accuracy": 0.9638084918260574, "num_tokens": 301514.0, "step": 52 }, { "entropy": 0.36696077417582273, "epoch": 0.6493108728943339, "grad_norm": 0.244140625, "learning_rate": 0.00013658536585365856, "loss": 0.10484882444143295, "mean_token_accuracy": 0.9642562530934811, "num_tokens": 307445.0, "step": 53 }, { "entropy": 0.31568020675331354, "epoch": 0.6615620214395099, "grad_norm": 0.283203125, "learning_rate": 0.0001353658536585366, "loss": 0.11220870912075043, "mean_token_accuracy": 0.9582751281559467, "num_tokens": 312864.0, "step": 54 }, { "entropy": 0.35635274462401867, "epoch": 0.6738131699846861, "grad_norm": 0.255859375, "learning_rate": 0.00013414634146341464, "loss": 0.13630808889865875, "mean_token_accuracy": 0.9525270387530327, "num_tokens": 319021.0, "step": 55 }, { "entropy": 0.335802904330194, "epoch": 0.6860643185298622, "grad_norm": 0.2197265625, "learning_rate": 0.0001329268292682927, "loss": 0.11126557737588882, "mean_token_accuracy": 0.9602809473872185, "num_tokens": 324169.0, "step": 56 }, { "entropy": 0.37789826188236475, "epoch": 0.6983154670750383, "grad_norm": 0.25390625, "learning_rate": 0.00013170731707317076, "loss": 0.11750061064958572, "mean_token_accuracy": 0.9616654589772224, "num_tokens": 330105.0, "step": 57 }, { "entropy": 0.3473435193300247, "epoch": 0.7105666156202144, "grad_norm": 0.244140625, "learning_rate": 0.0001304878048780488, "loss": 0.13293127715587616, "mean_token_accuracy": 0.960812620818615, "num_tokens": 336000.0, "step": 58 }, { "entropy": 0.37645846977829933, "epoch": 0.7228177641653905, "grad_norm": 0.25390625, "learning_rate": 0.00012926829268292684, "loss": 0.14020267128944397, "mean_token_accuracy": 0.9494834020733833, "num_tokens": 341300.0, "step": 59 }, { "entropy": 0.3391531016677618, "epoch": 0.7350689127105666, "grad_norm": 0.2041015625, "learning_rate": 0.00012804878048780488, "loss": 0.11145544052124023, "mean_token_accuracy": 0.9526276290416718, "num_tokens": 347644.0, "step": 60 }, { "entropy": 0.3284269040450454, "epoch": 0.7473200612557427, "grad_norm": 0.201171875, "learning_rate": 0.00012682926829268293, "loss": 0.13039623200893402, "mean_token_accuracy": 0.9597245752811432, "num_tokens": 354096.0, "step": 61 }, { "entropy": 0.37672379054129124, "epoch": 0.7595712098009189, "grad_norm": 0.18359375, "learning_rate": 0.000125609756097561, "loss": 0.12314963340759277, "mean_token_accuracy": 0.9572922959923744, "num_tokens": 360519.0, "step": 62 }, { "entropy": 0.3247836837545037, "epoch": 0.7718223583460949, "grad_norm": 0.251953125, "learning_rate": 0.00012439024390243904, "loss": 0.10076416283845901, "mean_token_accuracy": 0.9550469256937504, "num_tokens": 365666.0, "step": 63 }, { "entropy": 0.3481680192053318, "epoch": 0.7840735068912711, "grad_norm": 0.2099609375, "learning_rate": 0.00012317073170731708, "loss": 0.1191372275352478, "mean_token_accuracy": 0.9540783166885376, "num_tokens": 370703.0, "step": 64 }, { "entropy": 0.3864388270303607, "epoch": 0.7963246554364471, "grad_norm": 0.2177734375, "learning_rate": 0.00012195121951219512, "loss": 0.11211425065994263, "mean_token_accuracy": 0.9600110091269016, "num_tokens": 375933.0, "step": 65 }, { "entropy": 0.3722238801419735, "epoch": 0.8085758039816233, "grad_norm": 0.271484375, "learning_rate": 0.00012073170731707318, "loss": 0.1060609444975853, "mean_token_accuracy": 0.9619267173111439, "num_tokens": 381934.0, "step": 66 }, { "entropy": 0.31903401017189026, "epoch": 0.8208269525267994, "grad_norm": 0.1689453125, "learning_rate": 0.00011951219512195122, "loss": 0.10838343948125839, "mean_token_accuracy": 0.9615302868187428, "num_tokens": 388141.0, "step": 67 }, { "entropy": 0.35833382699638605, "epoch": 0.8330781010719756, "grad_norm": 0.2333984375, "learning_rate": 0.00011829268292682926, "loss": 0.11101208627223969, "mean_token_accuracy": 0.9624687656760216, "num_tokens": 393485.0, "step": 68 }, { "entropy": 0.3384561138227582, "epoch": 0.8453292496171516, "grad_norm": 0.2392578125, "learning_rate": 0.00011707317073170732, "loss": 0.11302048712968826, "mean_token_accuracy": 0.959359273314476, "num_tokens": 398110.0, "step": 69 }, { "entropy": 0.3644536528736353, "epoch": 0.8575803981623277, "grad_norm": 0.2119140625, "learning_rate": 0.00011585365853658536, "loss": 0.11044176667928696, "mean_token_accuracy": 0.9590198397636414, "num_tokens": 403100.0, "step": 70 }, { "entropy": 0.38477543368935585, "epoch": 0.8698315467075038, "grad_norm": 0.326171875, "learning_rate": 0.00011463414634146342, "loss": 0.1475592851638794, "mean_token_accuracy": 0.9498578049242496, "num_tokens": 408866.0, "step": 71 }, { "entropy": 0.3160585919395089, "epoch": 0.8820826952526799, "grad_norm": 0.2138671875, "learning_rate": 0.00011341463414634146, "loss": 0.12280108034610748, "mean_token_accuracy": 0.9519775547087193, "num_tokens": 414474.0, "step": 72 }, { "entropy": 0.35043725837022066, "epoch": 0.8943338437978561, "grad_norm": 0.2021484375, "learning_rate": 0.00011219512195121953, "loss": 0.11857884377241135, "mean_token_accuracy": 0.957142923027277, "num_tokens": 420975.0, "step": 73 }, { "entropy": 0.3428537016734481, "epoch": 0.9065849923430321, "grad_norm": 0.1953125, "learning_rate": 0.00011097560975609757, "loss": 0.09723620116710663, "mean_token_accuracy": 0.9623003490269184, "num_tokens": 427160.0, "step": 74 }, { "entropy": 0.3439481556415558, "epoch": 0.9188361408882083, "grad_norm": 0.2333984375, "learning_rate": 0.00010975609756097563, "loss": 0.12700851261615753, "mean_token_accuracy": 0.9501284696161747, "num_tokens": 432740.0, "step": 75 }, { "entropy": 0.396132281050086, "epoch": 0.9310872894333844, "grad_norm": 0.2412109375, "learning_rate": 0.00010853658536585367, "loss": 0.11111584305763245, "mean_token_accuracy": 0.954656295478344, "num_tokens": 437553.0, "step": 76 }, { "entropy": 0.31287234649062157, "epoch": 0.9433384379785605, "grad_norm": 0.1572265625, "learning_rate": 0.00010731707317073172, "loss": 0.09217967838048935, "mean_token_accuracy": 0.9652052000164986, "num_tokens": 443939.0, "step": 77 }, { "entropy": 0.3873864635825157, "epoch": 0.9555895865237366, "grad_norm": 0.2890625, "learning_rate": 0.00010609756097560977, "loss": 0.11941950023174286, "mean_token_accuracy": 0.9584939330816269, "num_tokens": 449426.0, "step": 78 }, { "entropy": 0.3076135413721204, "epoch": 0.9678407350689127, "grad_norm": 0.236328125, "learning_rate": 0.00010487804878048781, "loss": 0.10192415118217468, "mean_token_accuracy": 0.9579608179628849, "num_tokens": 454962.0, "step": 79 }, { "entropy": 0.33057918306440115, "epoch": 0.9800918836140888, "grad_norm": 0.1748046875, "learning_rate": 0.00010365853658536586, "loss": 0.09825513511896133, "mean_token_accuracy": 0.9628020562231541, "num_tokens": 461420.0, "step": 80 }, { "entropy": 0.344515610486269, "epoch": 0.9923430321592649, "grad_norm": 0.25390625, "learning_rate": 0.0001024390243902439, "loss": 0.1298864483833313, "mean_token_accuracy": 0.9483123049139977, "num_tokens": 467214.0, "step": 81 }, { "entropy": 0.35632768720388414, "epoch": 1.0, "grad_norm": 0.267578125, "learning_rate": 0.00010121951219512196, "loss": 0.0866069495677948, "mean_token_accuracy": 0.9732943534851074, "num_tokens": 470456.0, "step": 82 }, { "entropy": 0.32443390041589737, "epoch": 1.0122511485451762, "grad_norm": 0.158203125, "learning_rate": 0.0001, "loss": 0.08388859778642654, "mean_token_accuracy": 0.9714479111135006, "num_tokens": 476660.0, "step": 83 }, { "entropy": 0.27756834030151367, "epoch": 1.0245022970903521, "grad_norm": 0.134765625, "learning_rate": 9.878048780487805e-05, "loss": 0.07270920276641846, "mean_token_accuracy": 0.9756675288081169, "num_tokens": 482852.0, "step": 84 }, { "entropy": 0.37221179995685816, "epoch": 1.0367534456355283, "grad_norm": 0.173828125, "learning_rate": 9.75609756097561e-05, "loss": 0.09018392115831375, "mean_token_accuracy": 0.9692036546766758, "num_tokens": 489720.0, "step": 85 }, { "entropy": 0.2815575134009123, "epoch": 1.0490045941807045, "grad_norm": 0.1328125, "learning_rate": 9.634146341463415e-05, "loss": 0.0751088559627533, "mean_token_accuracy": 0.9700377807021141, "num_tokens": 494962.0, "step": 86 }, { "entropy": 0.3032172666862607, "epoch": 1.0612557427258806, "grad_norm": 0.1630859375, "learning_rate": 9.51219512195122e-05, "loss": 0.08939642459154129, "mean_token_accuracy": 0.9708281457424164, "num_tokens": 501167.0, "step": 87 }, { "entropy": 0.3396748472005129, "epoch": 1.0735068912710566, "grad_norm": 0.146484375, "learning_rate": 9.390243902439024e-05, "loss": 0.07543614506721497, "mean_token_accuracy": 0.9790169149637222, "num_tokens": 506009.0, "step": 88 }, { "entropy": 0.3040660824626684, "epoch": 1.0857580398162328, "grad_norm": 0.15234375, "learning_rate": 9.26829268292683e-05, "loss": 0.0811336562037468, "mean_token_accuracy": 0.9736967124044895, "num_tokens": 511394.0, "step": 89 }, { "entropy": 0.31143750343471766, "epoch": 1.098009188361409, "grad_norm": 0.173828125, "learning_rate": 9.146341463414635e-05, "loss": 0.08774841576814651, "mean_token_accuracy": 0.9692316688597202, "num_tokens": 516837.0, "step": 90 }, { "entropy": 0.34162401407957077, "epoch": 1.110260336906585, "grad_norm": 0.294921875, "learning_rate": 9.02439024390244e-05, "loss": 0.09738526493310928, "mean_token_accuracy": 0.9668772779405117, "num_tokens": 521646.0, "step": 91 }, { "entropy": 0.2694440744817257, "epoch": 1.122511485451761, "grad_norm": 0.140625, "learning_rate": 8.902439024390244e-05, "loss": 0.0640217661857605, "mean_token_accuracy": 0.9778573513031006, "num_tokens": 527366.0, "step": 92 }, { "entropy": 0.2505084676668048, "epoch": 1.1347626339969372, "grad_norm": 0.2265625, "learning_rate": 8.78048780487805e-05, "loss": 0.07979685813188553, "mean_token_accuracy": 0.973389033228159, "num_tokens": 533210.0, "step": 93 }, { "entropy": 0.2535351850092411, "epoch": 1.1470137825421134, "grad_norm": 0.205078125, "learning_rate": 8.658536585365854e-05, "loss": 0.06256209313869476, "mean_token_accuracy": 0.9790448397397995, "num_tokens": 538589.0, "step": 94 }, { "entropy": 0.2352813482284546, "epoch": 1.1592649310872893, "grad_norm": 0.1962890625, "learning_rate": 8.53658536585366e-05, "loss": 0.07825516164302826, "mean_token_accuracy": 0.9773083217442036, "num_tokens": 544486.0, "step": 95 }, { "entropy": 0.23897481244057417, "epoch": 1.1715160796324655, "grad_norm": 0.2734375, "learning_rate": 8.414634146341464e-05, "loss": 0.0690295547246933, "mean_token_accuracy": 0.9745013862848282, "num_tokens": 549647.0, "step": 96 }, { "entropy": 0.2606777008622885, "epoch": 1.1837672281776417, "grad_norm": 0.259765625, "learning_rate": 8.292682926829268e-05, "loss": 0.07471512258052826, "mean_token_accuracy": 0.9725949242711067, "num_tokens": 554687.0, "step": 97 }, { "entropy": 0.27557028736919165, "epoch": 1.1960183767228179, "grad_norm": 0.2314453125, "learning_rate": 8.170731707317073e-05, "loss": 0.06931524723768234, "mean_token_accuracy": 0.978624414652586, "num_tokens": 560533.0, "step": 98 }, { "entropy": 0.2943765129894018, "epoch": 1.2082695252679938, "grad_norm": 0.216796875, "learning_rate": 8.048780487804879e-05, "loss": 0.0812952071428299, "mean_token_accuracy": 0.9725493676960468, "num_tokens": 566604.0, "step": 99 }, { "entropy": 0.2441987576894462, "epoch": 1.22052067381317, "grad_norm": 0.255859375, "learning_rate": 7.926829268292683e-05, "loss": 0.059226248413324356, "mean_token_accuracy": 0.9759947806596756, "num_tokens": 572527.0, "step": 100 }, { "epoch": 1.22052067381317, "eval_entropy": 0.27095302215952805, "eval_loss": 0.11123082786798477, "eval_mean_token_accuracy": 0.9592912940011509, "eval_num_tokens": 572527.0, "eval_runtime": 56.8972, "eval_samples_per_second": 1.213, "eval_steps_per_second": 1.213, "step": 100 }, { "entropy": 0.2656272081658244, "epoch": 1.2327718223583461, "grad_norm": 0.21875, "learning_rate": 7.804878048780489e-05, "loss": 0.08751720190048218, "mean_token_accuracy": 0.9679245948791504, "num_tokens": 578760.0, "step": 101 }, { "entropy": 0.2706137653440237, "epoch": 1.245022970903522, "grad_norm": 0.2275390625, "learning_rate": 7.682926829268293e-05, "loss": 0.08201148360967636, "mean_token_accuracy": 0.9710356555879116, "num_tokens": 584503.0, "step": 102 }, { "entropy": 0.2744300989434123, "epoch": 1.2572741194486983, "grad_norm": 0.2578125, "learning_rate": 7.560975609756099e-05, "loss": 0.05856996402144432, "mean_token_accuracy": 0.9778167866170406, "num_tokens": 590506.0, "step": 103 }, { "entropy": 0.26921656634658575, "epoch": 1.2695252679938744, "grad_norm": 0.23828125, "learning_rate": 7.439024390243903e-05, "loss": 0.0782560482621193, "mean_token_accuracy": 0.9701778888702393, "num_tokens": 596390.0, "step": 104 }, { "entropy": 0.2885159160941839, "epoch": 1.2817764165390506, "grad_norm": 0.166015625, "learning_rate": 7.317073170731707e-05, "loss": 0.06652253121137619, "mean_token_accuracy": 0.982722382992506, "num_tokens": 601411.0, "step": 105 }, { "entropy": 0.2918844725936651, "epoch": 1.2940275650842268, "grad_norm": 0.1875, "learning_rate": 7.195121951219513e-05, "loss": 0.05815374106168747, "mean_token_accuracy": 0.9814562760293484, "num_tokens": 606925.0, "step": 106 }, { "entropy": 0.3184075551107526, "epoch": 1.3062787136294027, "grad_norm": 0.181640625, "learning_rate": 7.073170731707317e-05, "loss": 0.07176389545202255, "mean_token_accuracy": 0.9732142426073551, "num_tokens": 611981.0, "step": 107 }, { "entropy": 0.28589071705937386, "epoch": 1.318529862174579, "grad_norm": 0.216796875, "learning_rate": 6.951219512195122e-05, "loss": 0.06908947974443436, "mean_token_accuracy": 0.9742955937981606, "num_tokens": 617502.0, "step": 108 }, { "entropy": 0.30427863635122776, "epoch": 1.3307810107197549, "grad_norm": 0.2275390625, "learning_rate": 6.829268292682928e-05, "loss": 0.08661782741546631, "mean_token_accuracy": 0.9723380617797375, "num_tokens": 623345.0, "step": 109 }, { "entropy": 0.2784799374639988, "epoch": 1.343032159264931, "grad_norm": 0.2080078125, "learning_rate": 6.707317073170732e-05, "loss": 0.08172982931137085, "mean_token_accuracy": 0.9692776277661324, "num_tokens": 629819.0, "step": 110 }, { "entropy": 0.2621075566858053, "epoch": 1.3552833078101072, "grad_norm": 0.2197265625, "learning_rate": 6.585365853658538e-05, "loss": 0.06896749883890152, "mean_token_accuracy": 0.9779512844979763, "num_tokens": 635274.0, "step": 111 }, { "entropy": 0.2867768844589591, "epoch": 1.3675344563552834, "grad_norm": 0.205078125, "learning_rate": 6.463414634146342e-05, "loss": 0.06578939408063889, "mean_token_accuracy": 0.9736802577972412, "num_tokens": 640792.0, "step": 112 }, { "entropy": 0.2833663960918784, "epoch": 1.3797856049004595, "grad_norm": 0.2138671875, "learning_rate": 6.341463414634146e-05, "loss": 0.0802934393286705, "mean_token_accuracy": 0.975794829428196, "num_tokens": 647553.0, "step": 113 }, { "entropy": 0.2789901904761791, "epoch": 1.3920367534456355, "grad_norm": 0.271484375, "learning_rate": 6.219512195121952e-05, "loss": 0.07617770880460739, "mean_token_accuracy": 0.9711511395871639, "num_tokens": 653249.0, "step": 114 }, { "entropy": 0.27549734245985746, "epoch": 1.4042879019908117, "grad_norm": 0.162109375, "learning_rate": 6.097560975609756e-05, "loss": 0.050225261598825455, "mean_token_accuracy": 0.981207113713026, "num_tokens": 659773.0, "step": 115 }, { "entropy": 0.2797435289248824, "epoch": 1.4165390505359878, "grad_norm": 0.1796875, "learning_rate": 5.975609756097561e-05, "loss": 0.05845767632126808, "mean_token_accuracy": 0.97660356387496, "num_tokens": 665061.0, "step": 116 }, { "entropy": 0.26317309867590666, "epoch": 1.4287901990811638, "grad_norm": 0.2294921875, "learning_rate": 5.853658536585366e-05, "loss": 0.07422497868537903, "mean_token_accuracy": 0.970589954406023, "num_tokens": 670610.0, "step": 117 }, { "entropy": 0.2940791519358754, "epoch": 1.44104134762634, "grad_norm": 0.2265625, "learning_rate": 5.731707317073171e-05, "loss": 0.07870218902826309, "mean_token_accuracy": 0.9769303686916828, "num_tokens": 676597.0, "step": 118 }, { "entropy": 0.3041226239874959, "epoch": 1.4532924961715161, "grad_norm": 0.2236328125, "learning_rate": 5.6097560975609764e-05, "loss": 0.07072751969099045, "mean_token_accuracy": 0.9737785942852497, "num_tokens": 682445.0, "step": 119 }, { "entropy": 0.2781915618106723, "epoch": 1.4655436447166923, "grad_norm": 0.1884765625, "learning_rate": 5.487804878048781e-05, "loss": 0.07238440960645676, "mean_token_accuracy": 0.9758684188127518, "num_tokens": 688019.0, "step": 120 }, { "entropy": 0.2433428610675037, "epoch": 1.4777947932618682, "grad_norm": 0.263671875, "learning_rate": 5.365853658536586e-05, "loss": 0.07184246182441711, "mean_token_accuracy": 0.9751845635473728, "num_tokens": 694026.0, "step": 121 }, { "entropy": 0.2679327353835106, "epoch": 1.4900459418070444, "grad_norm": 0.2236328125, "learning_rate": 5.2439024390243904e-05, "loss": 0.07859291881322861, "mean_token_accuracy": 0.9759643562138081, "num_tokens": 699917.0, "step": 122 }, { "entropy": 0.25062850676476955, "epoch": 1.5022970903522204, "grad_norm": 0.16796875, "learning_rate": 5.121951219512195e-05, "loss": 0.052012164145708084, "mean_token_accuracy": 0.9831658490002155, "num_tokens": 705423.0, "step": 123 }, { "entropy": 0.2631832705810666, "epoch": 1.5145482388973965, "grad_norm": 0.169921875, "learning_rate": 5e-05, "loss": 0.0810304507613182, "mean_token_accuracy": 0.9739143140614033, "num_tokens": 712049.0, "step": 124 }, { "entropy": 0.28171470761299133, "epoch": 1.5267993874425727, "grad_norm": 0.2021484375, "learning_rate": 4.878048780487805e-05, "loss": 0.07931914180517197, "mean_token_accuracy": 0.9697326384484768, "num_tokens": 718759.0, "step": 125 }, { "entropy": 0.2730549927800894, "epoch": 1.5390505359877489, "grad_norm": 0.19140625, "learning_rate": 4.75609756097561e-05, "loss": 0.06596571952104568, "mean_token_accuracy": 0.9794163964688778, "num_tokens": 724151.0, "step": 126 }, { "entropy": 0.3451000778004527, "epoch": 1.551301684532925, "grad_norm": 0.298828125, "learning_rate": 4.634146341463415e-05, "loss": 0.07738037407398224, "mean_token_accuracy": 0.9751269891858101, "num_tokens": 729663.0, "step": 127 }, { "entropy": 0.3187516676262021, "epoch": 1.5635528330781012, "grad_norm": 0.23046875, "learning_rate": 4.51219512195122e-05, "loss": 0.07062625885009766, "mean_token_accuracy": 0.9731609113514423, "num_tokens": 734823.0, "step": 128 }, { "entropy": 0.25056853611022234, "epoch": 1.5758039816232772, "grad_norm": 0.1669921875, "learning_rate": 4.390243902439025e-05, "loss": 0.050548747181892395, "mean_token_accuracy": 0.9795470051467419, "num_tokens": 739981.0, "step": 129 }, { "entropy": 0.24678445514291525, "epoch": 1.5880551301684533, "grad_norm": 0.28515625, "learning_rate": 4.26829268292683e-05, "loss": 0.08084772527217865, "mean_token_accuracy": 0.9744056761264801, "num_tokens": 745848.0, "step": 130 }, { "entropy": 0.2766247531399131, "epoch": 1.6003062787136293, "grad_norm": 0.2119140625, "learning_rate": 4.146341463414634e-05, "loss": 0.06591574102640152, "mean_token_accuracy": 0.9837718568742275, "num_tokens": 751338.0, "step": 131 }, { "entropy": 0.2634483175352216, "epoch": 1.6125574272588055, "grad_norm": 0.1845703125, "learning_rate": 4.0243902439024395e-05, "loss": 0.08018708974123001, "mean_token_accuracy": 0.9686751514673233, "num_tokens": 757944.0, "step": 132 }, { "entropy": 0.2640473246574402, "epoch": 1.6248085758039816, "grad_norm": 0.1953125, "learning_rate": 3.9024390243902444e-05, "loss": 0.058747079223394394, "mean_token_accuracy": 0.9809320084750652, "num_tokens": 762909.0, "step": 133 }, { "entropy": 0.2685444802045822, "epoch": 1.6370597243491578, "grad_norm": 0.1982421875, "learning_rate": 3.780487804878049e-05, "loss": 0.06983562558889389, "mean_token_accuracy": 0.9774856679141521, "num_tokens": 768121.0, "step": 134 }, { "entropy": 0.28071946976706386, "epoch": 1.649310872894334, "grad_norm": 0.1806640625, "learning_rate": 3.6585365853658535e-05, "loss": 0.06817604601383209, "mean_token_accuracy": 0.9757047519087791, "num_tokens": 773835.0, "step": 135 }, { "entropy": 0.3044859105721116, "epoch": 1.66156202143951, "grad_norm": 0.181640625, "learning_rate": 3.5365853658536584e-05, "loss": 0.07083944231271744, "mean_token_accuracy": 0.9756592996418476, "num_tokens": 779008.0, "step": 136 }, { "entropy": 0.28345807548612356, "epoch": 1.673813169984686, "grad_norm": 0.2412109375, "learning_rate": 3.414634146341464e-05, "loss": 0.08097834140062332, "mean_token_accuracy": 0.97182647138834, "num_tokens": 784955.0, "step": 137 }, { "entropy": 0.2775820689275861, "epoch": 1.686064318529862, "grad_norm": 0.1630859375, "learning_rate": 3.292682926829269e-05, "loss": 0.0830724835395813, "mean_token_accuracy": 0.97340302541852, "num_tokens": 792300.0, "step": 138 }, { "entropy": 0.28273867163807154, "epoch": 1.6983154670750382, "grad_norm": 0.1728515625, "learning_rate": 3.170731707317073e-05, "loss": 0.06122542545199394, "mean_token_accuracy": 0.97363705560565, "num_tokens": 797635.0, "step": 139 }, { "entropy": 0.33463940117508173, "epoch": 1.7105666156202144, "grad_norm": 0.2109375, "learning_rate": 3.048780487804878e-05, "loss": 0.09268856793642044, "mean_token_accuracy": 0.968308299779892, "num_tokens": 803321.0, "step": 140 }, { "entropy": 0.26861980091780424, "epoch": 1.7228177641653906, "grad_norm": 0.1923828125, "learning_rate": 2.926829268292683e-05, "loss": 0.06270366907119751, "mean_token_accuracy": 0.9758359678089619, "num_tokens": 808696.0, "step": 141 }, { "entropy": 0.24330784939229488, "epoch": 1.7350689127105667, "grad_norm": 0.2412109375, "learning_rate": 2.8048780487804882e-05, "loss": 0.05480020493268967, "mean_token_accuracy": 0.9805137030780315, "num_tokens": 813988.0, "step": 142 }, { "entropy": 0.2642217818647623, "epoch": 1.7473200612557427, "grad_norm": 0.1669921875, "learning_rate": 2.682926829268293e-05, "loss": 0.06715261191129684, "mean_token_accuracy": 0.9766382575035095, "num_tokens": 820087.0, "step": 143 }, { "entropy": 0.3016277579590678, "epoch": 1.7595712098009189, "grad_norm": 0.2265625, "learning_rate": 2.5609756097560977e-05, "loss": 0.057253021746873856, "mean_token_accuracy": 0.9784747660160065, "num_tokens": 825766.0, "step": 144 }, { "entropy": 0.27436165884137154, "epoch": 1.7718223583460948, "grad_norm": 0.2275390625, "learning_rate": 2.4390243902439026e-05, "loss": 0.05420134961605072, "mean_token_accuracy": 0.980743058025837, "num_tokens": 831298.0, "step": 145 }, { "entropy": 0.28155668918043375, "epoch": 1.784073506891271, "grad_norm": 0.1630859375, "learning_rate": 2.3170731707317075e-05, "loss": 0.07169967144727707, "mean_token_accuracy": 0.9731598235666752, "num_tokens": 837254.0, "step": 146 }, { "entropy": 0.2712876806035638, "epoch": 1.7963246554364471, "grad_norm": 0.1904296875, "learning_rate": 2.1951219512195124e-05, "loss": 0.06996440887451172, "mean_token_accuracy": 0.9760563708841801, "num_tokens": 843796.0, "step": 147 }, { "entropy": 0.29682911094278097, "epoch": 1.8085758039816233, "grad_norm": 0.1767578125, "learning_rate": 2.073170731707317e-05, "loss": 0.06667114794254303, "mean_token_accuracy": 0.9743672311306, "num_tokens": 849621.0, "step": 148 }, { "entropy": 0.29692134354263544, "epoch": 1.8208269525267995, "grad_norm": 0.26953125, "learning_rate": 1.9512195121951222e-05, "loss": 0.07836263626813889, "mean_token_accuracy": 0.9702205285429955, "num_tokens": 854578.0, "step": 149 }, { "entropy": 0.2837211322039366, "epoch": 1.8330781010719757, "grad_norm": 0.2177734375, "learning_rate": 1.8292682926829268e-05, "loss": 0.07421581447124481, "mean_token_accuracy": 0.9740220792591572, "num_tokens": 861001.0, "step": 150 }, { "epoch": 1.8330781010719757, "eval_entropy": 0.28257156163454056, "eval_loss": 0.11007164418697357, "eval_mean_token_accuracy": 0.9602361796558767, "eval_num_tokens": 861001.0, "eval_runtime": 56.9589, "eval_samples_per_second": 1.211, "eval_steps_per_second": 1.211, "step": 150 } ], "logging_steps": 1, "max_steps": 164, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.898716388139827e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }