stefanocarrera's picture
Upload folder using huggingface_hub
54f2a48 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.8330781010719757,
"eval_steps": 50,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.6279787886887789,
"epoch": 0.01225114854517611,
"grad_norm": 0.279296875,
"learning_rate": 0.0002,
"loss": 0.29868432879447937,
"mean_token_accuracy": 0.9193268120288849,
"num_tokens": 6880.0,
"step": 1
},
{
"entropy": 0.5897643249481916,
"epoch": 0.02450229709035222,
"grad_norm": 0.30078125,
"learning_rate": 0.00019878048780487805,
"loss": 0.2462826669216156,
"mean_token_accuracy": 0.9280684292316437,
"num_tokens": 13136.0,
"step": 2
},
{
"entropy": 0.5422123614698648,
"epoch": 0.036753445635528334,
"grad_norm": 0.2314453125,
"learning_rate": 0.0001975609756097561,
"loss": 0.20162586867809296,
"mean_token_accuracy": 0.9337548539042473,
"num_tokens": 19103.0,
"step": 3
},
{
"entropy": 0.4943904746323824,
"epoch": 0.04900459418070444,
"grad_norm": 0.25390625,
"learning_rate": 0.00019634146341463416,
"loss": 0.14251382648944855,
"mean_token_accuracy": 0.95210937038064,
"num_tokens": 24423.0,
"step": 4
},
{
"entropy": 0.45216177217662334,
"epoch": 0.06125574272588055,
"grad_norm": 0.234375,
"learning_rate": 0.0001951219512195122,
"loss": 0.1607554852962494,
"mean_token_accuracy": 0.9485567137598991,
"num_tokens": 30608.0,
"step": 5
},
{
"entropy": 0.40192629024386406,
"epoch": 0.07350689127105667,
"grad_norm": 0.2578125,
"learning_rate": 0.00019390243902439025,
"loss": 0.16246657073497772,
"mean_token_accuracy": 0.9427705891430378,
"num_tokens": 36811.0,
"step": 6
},
{
"entropy": 0.3904233919456601,
"epoch": 0.08575803981623277,
"grad_norm": 0.2890625,
"learning_rate": 0.0001926829268292683,
"loss": 0.17223770916461945,
"mean_token_accuracy": 0.941681407392025,
"num_tokens": 43080.0,
"step": 7
},
{
"entropy": 0.3865286596119404,
"epoch": 0.09800918836140889,
"grad_norm": 0.263671875,
"learning_rate": 0.00019146341463414633,
"loss": 0.14094355702400208,
"mean_token_accuracy": 0.9572678282856941,
"num_tokens": 48449.0,
"step": 8
},
{
"entropy": 0.370680570602417,
"epoch": 0.11026033690658499,
"grad_norm": 0.1943359375,
"learning_rate": 0.0001902439024390244,
"loss": 0.14527979493141174,
"mean_token_accuracy": 0.9480055123567581,
"num_tokens": 54626.0,
"step": 9
},
{
"entropy": 0.41665984131395817,
"epoch": 0.1225114854517611,
"grad_norm": 0.2080078125,
"learning_rate": 0.00018902439024390244,
"loss": 0.13078074157238007,
"mean_token_accuracy": 0.9527099393308163,
"num_tokens": 60057.0,
"step": 10
},
{
"entropy": 0.41189629677683115,
"epoch": 0.13476263399693722,
"grad_norm": 0.224609375,
"learning_rate": 0.0001878048780487805,
"loss": 0.12864071130752563,
"mean_token_accuracy": 0.9570614397525787,
"num_tokens": 66167.0,
"step": 11
},
{
"entropy": 0.454614844173193,
"epoch": 0.14701378254211334,
"grad_norm": 0.2109375,
"learning_rate": 0.00018658536585365856,
"loss": 0.11721982061862946,
"mean_token_accuracy": 0.9633984379470348,
"num_tokens": 72797.0,
"step": 12
},
{
"entropy": 0.42440748680382967,
"epoch": 0.15926493108728942,
"grad_norm": 0.2060546875,
"learning_rate": 0.0001853658536585366,
"loss": 0.12409229576587677,
"mean_token_accuracy": 0.9605162478983402,
"num_tokens": 78607.0,
"step": 13
},
{
"entropy": 0.4054669588804245,
"epoch": 0.17151607963246554,
"grad_norm": 0.2060546875,
"learning_rate": 0.00018414634146341464,
"loss": 0.12331730872392654,
"mean_token_accuracy": 0.9529691338539124,
"num_tokens": 84391.0,
"step": 14
},
{
"entropy": 0.4127539284527302,
"epoch": 0.18376722817764166,
"grad_norm": 0.2734375,
"learning_rate": 0.0001829268292682927,
"loss": 0.1280103325843811,
"mean_token_accuracy": 0.9521399922668934,
"num_tokens": 89112.0,
"step": 15
},
{
"entropy": 0.40787679236382246,
"epoch": 0.19601837672281777,
"grad_norm": 0.2890625,
"learning_rate": 0.00018170731707317075,
"loss": 0.1468421369791031,
"mean_token_accuracy": 0.9514360092580318,
"num_tokens": 94028.0,
"step": 16
},
{
"entropy": 0.3514019288122654,
"epoch": 0.2082695252679939,
"grad_norm": 0.2109375,
"learning_rate": 0.0001804878048780488,
"loss": 0.10981348156929016,
"mean_token_accuracy": 0.9557906277477741,
"num_tokens": 99737.0,
"step": 17
},
{
"entropy": 0.348665127530694,
"epoch": 0.22052067381316998,
"grad_norm": 0.314453125,
"learning_rate": 0.00017926829268292684,
"loss": 0.12969893217086792,
"mean_token_accuracy": 0.951546210795641,
"num_tokens": 105657.0,
"step": 18
},
{
"entropy": 0.3983056088909507,
"epoch": 0.2327718223583461,
"grad_norm": 0.24609375,
"learning_rate": 0.00017804878048780488,
"loss": 0.14060811698436737,
"mean_token_accuracy": 0.9409481771290302,
"num_tokens": 111514.0,
"step": 19
},
{
"entropy": 0.3584689646959305,
"epoch": 0.2450229709035222,
"grad_norm": 0.298828125,
"learning_rate": 0.00017682926829268295,
"loss": 0.12189538776874542,
"mean_token_accuracy": 0.9588135071098804,
"num_tokens": 116957.0,
"step": 20
},
{
"entropy": 0.3528327913954854,
"epoch": 0.2572741194486983,
"grad_norm": 0.2265625,
"learning_rate": 0.000175609756097561,
"loss": 0.10923069715499878,
"mean_token_accuracy": 0.9603886790573597,
"num_tokens": 123277.0,
"step": 21
},
{
"entropy": 0.349135366268456,
"epoch": 0.26952526799387444,
"grad_norm": 0.2060546875,
"learning_rate": 0.00017439024390243903,
"loss": 0.13121618330478668,
"mean_token_accuracy": 0.9497882351279259,
"num_tokens": 130406.0,
"step": 22
},
{
"entropy": 0.3748163701966405,
"epoch": 0.28177641653905056,
"grad_norm": 0.25390625,
"learning_rate": 0.00017317073170731708,
"loss": 0.13371798396110535,
"mean_token_accuracy": 0.9507532455027103,
"num_tokens": 136085.0,
"step": 23
},
{
"entropy": 0.395503643900156,
"epoch": 0.29402756508422667,
"grad_norm": 0.22265625,
"learning_rate": 0.00017195121951219512,
"loss": 0.15047620236873627,
"mean_token_accuracy": 0.9478774890303612,
"num_tokens": 142531.0,
"step": 24
},
{
"entropy": 0.35646383836865425,
"epoch": 0.30627871362940273,
"grad_norm": 0.232421875,
"learning_rate": 0.0001707317073170732,
"loss": 0.12330407649278641,
"mean_token_accuracy": 0.9542177952826023,
"num_tokens": 148432.0,
"step": 25
},
{
"entropy": 0.32762761414051056,
"epoch": 0.31852986217457885,
"grad_norm": 0.193359375,
"learning_rate": 0.00016951219512195123,
"loss": 0.12972016632556915,
"mean_token_accuracy": 0.9510112181305885,
"num_tokens": 154616.0,
"step": 26
},
{
"entropy": 0.3109707301482558,
"epoch": 0.33078101071975496,
"grad_norm": 0.27734375,
"learning_rate": 0.00016829268292682927,
"loss": 0.09245749562978745,
"mean_token_accuracy": 0.9632142670452595,
"num_tokens": 160355.0,
"step": 27
},
{
"entropy": 0.343580374494195,
"epoch": 0.3430321592649311,
"grad_norm": 0.267578125,
"learning_rate": 0.00016707317073170731,
"loss": 0.1068074107170105,
"mean_token_accuracy": 0.955970574170351,
"num_tokens": 166182.0,
"step": 28
},
{
"entropy": 0.38475809153169394,
"epoch": 0.3552833078101072,
"grad_norm": 0.2578125,
"learning_rate": 0.00016585365853658536,
"loss": 0.11399275064468384,
"mean_token_accuracy": 0.9570133797824383,
"num_tokens": 171286.0,
"step": 29
},
{
"entropy": 0.3666055165231228,
"epoch": 0.3675344563552833,
"grad_norm": 0.267578125,
"learning_rate": 0.00016463414634146343,
"loss": 0.1474432796239853,
"mean_token_accuracy": 0.9397772029042244,
"num_tokens": 177662.0,
"step": 30
},
{
"entropy": 0.31479439605027437,
"epoch": 0.37978560490045943,
"grad_norm": 0.27734375,
"learning_rate": 0.00016341463414634147,
"loss": 0.11059485375881195,
"mean_token_accuracy": 0.9584382586181164,
"num_tokens": 182284.0,
"step": 31
},
{
"entropy": 0.3475527623668313,
"epoch": 0.39203675344563554,
"grad_norm": 0.2373046875,
"learning_rate": 0.00016219512195121954,
"loss": 0.13328994810581207,
"mean_token_accuracy": 0.9535585716366768,
"num_tokens": 189322.0,
"step": 32
},
{
"entropy": 0.3634985350072384,
"epoch": 0.40428790199081166,
"grad_norm": 0.21875,
"learning_rate": 0.00016097560975609758,
"loss": 0.15225957334041595,
"mean_token_accuracy": 0.9502801597118378,
"num_tokens": 195113.0,
"step": 33
},
{
"entropy": 0.3299488425254822,
"epoch": 0.4165390505359878,
"grad_norm": 0.25,
"learning_rate": 0.00015975609756097562,
"loss": 0.12952829897403717,
"mean_token_accuracy": 0.9594988077878952,
"num_tokens": 200835.0,
"step": 34
},
{
"entropy": 0.3228675974532962,
"epoch": 0.42879019908116384,
"grad_norm": 0.2197265625,
"learning_rate": 0.00015853658536585366,
"loss": 0.10655219852924347,
"mean_token_accuracy": 0.9599267169833183,
"num_tokens": 206172.0,
"step": 35
},
{
"entropy": 0.3190277460962534,
"epoch": 0.44104134762633995,
"grad_norm": 0.2080078125,
"learning_rate": 0.00015731707317073173,
"loss": 0.11382263898849487,
"mean_token_accuracy": 0.9600558690726757,
"num_tokens": 211901.0,
"step": 36
},
{
"entropy": 0.3038849513977766,
"epoch": 0.45329249617151607,
"grad_norm": 0.2021484375,
"learning_rate": 0.00015609756097560978,
"loss": 0.10026074945926666,
"mean_token_accuracy": 0.9635081477463245,
"num_tokens": 217629.0,
"step": 37
},
{
"entropy": 0.31116786785423756,
"epoch": 0.4655436447166922,
"grad_norm": 0.1962890625,
"learning_rate": 0.00015487804878048782,
"loss": 0.13140451908111572,
"mean_token_accuracy": 0.9521206878125668,
"num_tokens": 223352.0,
"step": 38
},
{
"entropy": 0.3148405561223626,
"epoch": 0.4777947932618683,
"grad_norm": 0.201171875,
"learning_rate": 0.00015365853658536586,
"loss": 0.11006736010313034,
"mean_token_accuracy": 0.9543089419603348,
"num_tokens": 229089.0,
"step": 39
},
{
"entropy": 0.3327226936817169,
"epoch": 0.4900459418070444,
"grad_norm": 0.21875,
"learning_rate": 0.0001524390243902439,
"loss": 0.11151966452598572,
"mean_token_accuracy": 0.9555698931217194,
"num_tokens": 234629.0,
"step": 40
},
{
"entropy": 0.29484597500413656,
"epoch": 0.5022970903522205,
"grad_norm": 0.171875,
"learning_rate": 0.00015121951219512197,
"loss": 0.11951664835214615,
"mean_token_accuracy": 0.9610061757266521,
"num_tokens": 239950.0,
"step": 41
},
{
"entropy": 0.33220329508185387,
"epoch": 0.5145482388973966,
"grad_norm": 0.25390625,
"learning_rate": 0.00015000000000000001,
"loss": 0.10372602194547653,
"mean_token_accuracy": 0.9613832570612431,
"num_tokens": 244905.0,
"step": 42
},
{
"entropy": 0.3460714379325509,
"epoch": 0.5267993874425727,
"grad_norm": 0.2373046875,
"learning_rate": 0.00014878048780487806,
"loss": 0.13158759474754333,
"mean_token_accuracy": 0.9516530968248844,
"num_tokens": 251374.0,
"step": 43
},
{
"entropy": 0.350130352191627,
"epoch": 0.5390505359877489,
"grad_norm": 0.21484375,
"learning_rate": 0.0001475609756097561,
"loss": 0.13656970858573914,
"mean_token_accuracy": 0.9539419040083885,
"num_tokens": 257465.0,
"step": 44
},
{
"entropy": 0.3217963883653283,
"epoch": 0.5513016845329249,
"grad_norm": 0.2421875,
"learning_rate": 0.00014634146341463414,
"loss": 0.12704257667064667,
"mean_token_accuracy": 0.954754151403904,
"num_tokens": 262964.0,
"step": 45
},
{
"entropy": 0.3284407975152135,
"epoch": 0.5635528330781011,
"grad_norm": 0.1796875,
"learning_rate": 0.0001451219512195122,
"loss": 0.09583695977926254,
"mean_token_accuracy": 0.9638715162873268,
"num_tokens": 268846.0,
"step": 46
},
{
"entropy": 0.34618470072746277,
"epoch": 0.5758039816232772,
"grad_norm": 0.267578125,
"learning_rate": 0.00014390243902439025,
"loss": 0.14501920342445374,
"mean_token_accuracy": 0.9476289339363575,
"num_tokens": 274384.0,
"step": 47
},
{
"entropy": 0.31532789394259453,
"epoch": 0.5880551301684533,
"grad_norm": 0.2451171875,
"learning_rate": 0.0001426829268292683,
"loss": 0.12755413353443146,
"mean_token_accuracy": 0.954731572419405,
"num_tokens": 279484.0,
"step": 48
},
{
"entropy": 0.363907678052783,
"epoch": 0.6003062787136294,
"grad_norm": 0.19921875,
"learning_rate": 0.00014146341463414634,
"loss": 0.10707266628742218,
"mean_token_accuracy": 0.954724483191967,
"num_tokens": 284817.0,
"step": 49
},
{
"entropy": 0.31933102291077375,
"epoch": 0.6125574272588055,
"grad_norm": 0.1904296875,
"learning_rate": 0.00014024390243902438,
"loss": 0.09467475861310959,
"mean_token_accuracy": 0.9665305241942406,
"num_tokens": 290524.0,
"step": 50
},
{
"epoch": 0.6125574272588055,
"eval_entropy": 0.3426319423361101,
"eval_loss": 0.10967054218053818,
"eval_mean_token_accuracy": 0.9585090970647507,
"eval_num_tokens": 290524.0,
"eval_runtime": 56.6704,
"eval_samples_per_second": 1.218,
"eval_steps_per_second": 1.218,
"step": 50
},
{
"entropy": 0.34031340666115284,
"epoch": 0.6248085758039816,
"grad_norm": 0.173828125,
"learning_rate": 0.00013902439024390245,
"loss": 0.09569472074508667,
"mean_token_accuracy": 0.9653150551021099,
"num_tokens": 296069.0,
"step": 51
},
{
"entropy": 0.3571790661662817,
"epoch": 0.6370597243491577,
"grad_norm": 0.2421875,
"learning_rate": 0.0001378048780487805,
"loss": 0.10000051558017731,
"mean_token_accuracy": 0.9638084918260574,
"num_tokens": 301514.0,
"step": 52
},
{
"entropy": 0.36696077417582273,
"epoch": 0.6493108728943339,
"grad_norm": 0.244140625,
"learning_rate": 0.00013658536585365856,
"loss": 0.10484882444143295,
"mean_token_accuracy": 0.9642562530934811,
"num_tokens": 307445.0,
"step": 53
},
{
"entropy": 0.31568020675331354,
"epoch": 0.6615620214395099,
"grad_norm": 0.283203125,
"learning_rate": 0.0001353658536585366,
"loss": 0.11220870912075043,
"mean_token_accuracy": 0.9582751281559467,
"num_tokens": 312864.0,
"step": 54
},
{
"entropy": 0.35635274462401867,
"epoch": 0.6738131699846861,
"grad_norm": 0.255859375,
"learning_rate": 0.00013414634146341464,
"loss": 0.13630808889865875,
"mean_token_accuracy": 0.9525270387530327,
"num_tokens": 319021.0,
"step": 55
},
{
"entropy": 0.335802904330194,
"epoch": 0.6860643185298622,
"grad_norm": 0.2197265625,
"learning_rate": 0.0001329268292682927,
"loss": 0.11126557737588882,
"mean_token_accuracy": 0.9602809473872185,
"num_tokens": 324169.0,
"step": 56
},
{
"entropy": 0.37789826188236475,
"epoch": 0.6983154670750383,
"grad_norm": 0.25390625,
"learning_rate": 0.00013170731707317076,
"loss": 0.11750061064958572,
"mean_token_accuracy": 0.9616654589772224,
"num_tokens": 330105.0,
"step": 57
},
{
"entropy": 0.3473435193300247,
"epoch": 0.7105666156202144,
"grad_norm": 0.244140625,
"learning_rate": 0.0001304878048780488,
"loss": 0.13293127715587616,
"mean_token_accuracy": 0.960812620818615,
"num_tokens": 336000.0,
"step": 58
},
{
"entropy": 0.37645846977829933,
"epoch": 0.7228177641653905,
"grad_norm": 0.25390625,
"learning_rate": 0.00012926829268292684,
"loss": 0.14020267128944397,
"mean_token_accuracy": 0.9494834020733833,
"num_tokens": 341300.0,
"step": 59
},
{
"entropy": 0.3391531016677618,
"epoch": 0.7350689127105666,
"grad_norm": 0.2041015625,
"learning_rate": 0.00012804878048780488,
"loss": 0.11145544052124023,
"mean_token_accuracy": 0.9526276290416718,
"num_tokens": 347644.0,
"step": 60
},
{
"entropy": 0.3284269040450454,
"epoch": 0.7473200612557427,
"grad_norm": 0.201171875,
"learning_rate": 0.00012682926829268293,
"loss": 0.13039623200893402,
"mean_token_accuracy": 0.9597245752811432,
"num_tokens": 354096.0,
"step": 61
},
{
"entropy": 0.37672379054129124,
"epoch": 0.7595712098009189,
"grad_norm": 0.18359375,
"learning_rate": 0.000125609756097561,
"loss": 0.12314963340759277,
"mean_token_accuracy": 0.9572922959923744,
"num_tokens": 360519.0,
"step": 62
},
{
"entropy": 0.3247836837545037,
"epoch": 0.7718223583460949,
"grad_norm": 0.251953125,
"learning_rate": 0.00012439024390243904,
"loss": 0.10076416283845901,
"mean_token_accuracy": 0.9550469256937504,
"num_tokens": 365666.0,
"step": 63
},
{
"entropy": 0.3481680192053318,
"epoch": 0.7840735068912711,
"grad_norm": 0.2099609375,
"learning_rate": 0.00012317073170731708,
"loss": 0.1191372275352478,
"mean_token_accuracy": 0.9540783166885376,
"num_tokens": 370703.0,
"step": 64
},
{
"entropy": 0.3864388270303607,
"epoch": 0.7963246554364471,
"grad_norm": 0.2177734375,
"learning_rate": 0.00012195121951219512,
"loss": 0.11211425065994263,
"mean_token_accuracy": 0.9600110091269016,
"num_tokens": 375933.0,
"step": 65
},
{
"entropy": 0.3722238801419735,
"epoch": 0.8085758039816233,
"grad_norm": 0.271484375,
"learning_rate": 0.00012073170731707318,
"loss": 0.1060609444975853,
"mean_token_accuracy": 0.9619267173111439,
"num_tokens": 381934.0,
"step": 66
},
{
"entropy": 0.31903401017189026,
"epoch": 0.8208269525267994,
"grad_norm": 0.1689453125,
"learning_rate": 0.00011951219512195122,
"loss": 0.10838343948125839,
"mean_token_accuracy": 0.9615302868187428,
"num_tokens": 388141.0,
"step": 67
},
{
"entropy": 0.35833382699638605,
"epoch": 0.8330781010719756,
"grad_norm": 0.2333984375,
"learning_rate": 0.00011829268292682926,
"loss": 0.11101208627223969,
"mean_token_accuracy": 0.9624687656760216,
"num_tokens": 393485.0,
"step": 68
},
{
"entropy": 0.3384561138227582,
"epoch": 0.8453292496171516,
"grad_norm": 0.2392578125,
"learning_rate": 0.00011707317073170732,
"loss": 0.11302048712968826,
"mean_token_accuracy": 0.959359273314476,
"num_tokens": 398110.0,
"step": 69
},
{
"entropy": 0.3644536528736353,
"epoch": 0.8575803981623277,
"grad_norm": 0.2119140625,
"learning_rate": 0.00011585365853658536,
"loss": 0.11044176667928696,
"mean_token_accuracy": 0.9590198397636414,
"num_tokens": 403100.0,
"step": 70
},
{
"entropy": 0.38477543368935585,
"epoch": 0.8698315467075038,
"grad_norm": 0.326171875,
"learning_rate": 0.00011463414634146342,
"loss": 0.1475592851638794,
"mean_token_accuracy": 0.9498578049242496,
"num_tokens": 408866.0,
"step": 71
},
{
"entropy": 0.3160585919395089,
"epoch": 0.8820826952526799,
"grad_norm": 0.2138671875,
"learning_rate": 0.00011341463414634146,
"loss": 0.12280108034610748,
"mean_token_accuracy": 0.9519775547087193,
"num_tokens": 414474.0,
"step": 72
},
{
"entropy": 0.35043725837022066,
"epoch": 0.8943338437978561,
"grad_norm": 0.2021484375,
"learning_rate": 0.00011219512195121953,
"loss": 0.11857884377241135,
"mean_token_accuracy": 0.957142923027277,
"num_tokens": 420975.0,
"step": 73
},
{
"entropy": 0.3428537016734481,
"epoch": 0.9065849923430321,
"grad_norm": 0.1953125,
"learning_rate": 0.00011097560975609757,
"loss": 0.09723620116710663,
"mean_token_accuracy": 0.9623003490269184,
"num_tokens": 427160.0,
"step": 74
},
{
"entropy": 0.3439481556415558,
"epoch": 0.9188361408882083,
"grad_norm": 0.2333984375,
"learning_rate": 0.00010975609756097563,
"loss": 0.12700851261615753,
"mean_token_accuracy": 0.9501284696161747,
"num_tokens": 432740.0,
"step": 75
},
{
"entropy": 0.396132281050086,
"epoch": 0.9310872894333844,
"grad_norm": 0.2412109375,
"learning_rate": 0.00010853658536585367,
"loss": 0.11111584305763245,
"mean_token_accuracy": 0.954656295478344,
"num_tokens": 437553.0,
"step": 76
},
{
"entropy": 0.31287234649062157,
"epoch": 0.9433384379785605,
"grad_norm": 0.1572265625,
"learning_rate": 0.00010731707317073172,
"loss": 0.09217967838048935,
"mean_token_accuracy": 0.9652052000164986,
"num_tokens": 443939.0,
"step": 77
},
{
"entropy": 0.3873864635825157,
"epoch": 0.9555895865237366,
"grad_norm": 0.2890625,
"learning_rate": 0.00010609756097560977,
"loss": 0.11941950023174286,
"mean_token_accuracy": 0.9584939330816269,
"num_tokens": 449426.0,
"step": 78
},
{
"entropy": 0.3076135413721204,
"epoch": 0.9678407350689127,
"grad_norm": 0.236328125,
"learning_rate": 0.00010487804878048781,
"loss": 0.10192415118217468,
"mean_token_accuracy": 0.9579608179628849,
"num_tokens": 454962.0,
"step": 79
},
{
"entropy": 0.33057918306440115,
"epoch": 0.9800918836140888,
"grad_norm": 0.1748046875,
"learning_rate": 0.00010365853658536586,
"loss": 0.09825513511896133,
"mean_token_accuracy": 0.9628020562231541,
"num_tokens": 461420.0,
"step": 80
},
{
"entropy": 0.344515610486269,
"epoch": 0.9923430321592649,
"grad_norm": 0.25390625,
"learning_rate": 0.0001024390243902439,
"loss": 0.1298864483833313,
"mean_token_accuracy": 0.9483123049139977,
"num_tokens": 467214.0,
"step": 81
},
{
"entropy": 0.35632768720388414,
"epoch": 1.0,
"grad_norm": 0.267578125,
"learning_rate": 0.00010121951219512196,
"loss": 0.0866069495677948,
"mean_token_accuracy": 0.9732943534851074,
"num_tokens": 470456.0,
"step": 82
},
{
"entropy": 0.32443390041589737,
"epoch": 1.0122511485451762,
"grad_norm": 0.158203125,
"learning_rate": 0.0001,
"loss": 0.08388859778642654,
"mean_token_accuracy": 0.9714479111135006,
"num_tokens": 476660.0,
"step": 83
},
{
"entropy": 0.27756834030151367,
"epoch": 1.0245022970903521,
"grad_norm": 0.134765625,
"learning_rate": 9.878048780487805e-05,
"loss": 0.07270920276641846,
"mean_token_accuracy": 0.9756675288081169,
"num_tokens": 482852.0,
"step": 84
},
{
"entropy": 0.37221179995685816,
"epoch": 1.0367534456355283,
"grad_norm": 0.173828125,
"learning_rate": 9.75609756097561e-05,
"loss": 0.09018392115831375,
"mean_token_accuracy": 0.9692036546766758,
"num_tokens": 489720.0,
"step": 85
},
{
"entropy": 0.2815575134009123,
"epoch": 1.0490045941807045,
"grad_norm": 0.1328125,
"learning_rate": 9.634146341463415e-05,
"loss": 0.0751088559627533,
"mean_token_accuracy": 0.9700377807021141,
"num_tokens": 494962.0,
"step": 86
},
{
"entropy": 0.3032172666862607,
"epoch": 1.0612557427258806,
"grad_norm": 0.1630859375,
"learning_rate": 9.51219512195122e-05,
"loss": 0.08939642459154129,
"mean_token_accuracy": 0.9708281457424164,
"num_tokens": 501167.0,
"step": 87
},
{
"entropy": 0.3396748472005129,
"epoch": 1.0735068912710566,
"grad_norm": 0.146484375,
"learning_rate": 9.390243902439024e-05,
"loss": 0.07543614506721497,
"mean_token_accuracy": 0.9790169149637222,
"num_tokens": 506009.0,
"step": 88
},
{
"entropy": 0.3040660824626684,
"epoch": 1.0857580398162328,
"grad_norm": 0.15234375,
"learning_rate": 9.26829268292683e-05,
"loss": 0.0811336562037468,
"mean_token_accuracy": 0.9736967124044895,
"num_tokens": 511394.0,
"step": 89
},
{
"entropy": 0.31143750343471766,
"epoch": 1.098009188361409,
"grad_norm": 0.173828125,
"learning_rate": 9.146341463414635e-05,
"loss": 0.08774841576814651,
"mean_token_accuracy": 0.9692316688597202,
"num_tokens": 516837.0,
"step": 90
},
{
"entropy": 0.34162401407957077,
"epoch": 1.110260336906585,
"grad_norm": 0.294921875,
"learning_rate": 9.02439024390244e-05,
"loss": 0.09738526493310928,
"mean_token_accuracy": 0.9668772779405117,
"num_tokens": 521646.0,
"step": 91
},
{
"entropy": 0.2694440744817257,
"epoch": 1.122511485451761,
"grad_norm": 0.140625,
"learning_rate": 8.902439024390244e-05,
"loss": 0.0640217661857605,
"mean_token_accuracy": 0.9778573513031006,
"num_tokens": 527366.0,
"step": 92
},
{
"entropy": 0.2505084676668048,
"epoch": 1.1347626339969372,
"grad_norm": 0.2265625,
"learning_rate": 8.78048780487805e-05,
"loss": 0.07979685813188553,
"mean_token_accuracy": 0.973389033228159,
"num_tokens": 533210.0,
"step": 93
},
{
"entropy": 0.2535351850092411,
"epoch": 1.1470137825421134,
"grad_norm": 0.205078125,
"learning_rate": 8.658536585365854e-05,
"loss": 0.06256209313869476,
"mean_token_accuracy": 0.9790448397397995,
"num_tokens": 538589.0,
"step": 94
},
{
"entropy": 0.2352813482284546,
"epoch": 1.1592649310872893,
"grad_norm": 0.1962890625,
"learning_rate": 8.53658536585366e-05,
"loss": 0.07825516164302826,
"mean_token_accuracy": 0.9773083217442036,
"num_tokens": 544486.0,
"step": 95
},
{
"entropy": 0.23897481244057417,
"epoch": 1.1715160796324655,
"grad_norm": 0.2734375,
"learning_rate": 8.414634146341464e-05,
"loss": 0.0690295547246933,
"mean_token_accuracy": 0.9745013862848282,
"num_tokens": 549647.0,
"step": 96
},
{
"entropy": 0.2606777008622885,
"epoch": 1.1837672281776417,
"grad_norm": 0.259765625,
"learning_rate": 8.292682926829268e-05,
"loss": 0.07471512258052826,
"mean_token_accuracy": 0.9725949242711067,
"num_tokens": 554687.0,
"step": 97
},
{
"entropy": 0.27557028736919165,
"epoch": 1.1960183767228179,
"grad_norm": 0.2314453125,
"learning_rate": 8.170731707317073e-05,
"loss": 0.06931524723768234,
"mean_token_accuracy": 0.978624414652586,
"num_tokens": 560533.0,
"step": 98
},
{
"entropy": 0.2943765129894018,
"epoch": 1.2082695252679938,
"grad_norm": 0.216796875,
"learning_rate": 8.048780487804879e-05,
"loss": 0.0812952071428299,
"mean_token_accuracy": 0.9725493676960468,
"num_tokens": 566604.0,
"step": 99
},
{
"entropy": 0.2441987576894462,
"epoch": 1.22052067381317,
"grad_norm": 0.255859375,
"learning_rate": 7.926829268292683e-05,
"loss": 0.059226248413324356,
"mean_token_accuracy": 0.9759947806596756,
"num_tokens": 572527.0,
"step": 100
},
{
"epoch": 1.22052067381317,
"eval_entropy": 0.27095302215952805,
"eval_loss": 0.11123082786798477,
"eval_mean_token_accuracy": 0.9592912940011509,
"eval_num_tokens": 572527.0,
"eval_runtime": 56.8972,
"eval_samples_per_second": 1.213,
"eval_steps_per_second": 1.213,
"step": 100
},
{
"entropy": 0.2656272081658244,
"epoch": 1.2327718223583461,
"grad_norm": 0.21875,
"learning_rate": 7.804878048780489e-05,
"loss": 0.08751720190048218,
"mean_token_accuracy": 0.9679245948791504,
"num_tokens": 578760.0,
"step": 101
},
{
"entropy": 0.2706137653440237,
"epoch": 1.245022970903522,
"grad_norm": 0.2275390625,
"learning_rate": 7.682926829268293e-05,
"loss": 0.08201148360967636,
"mean_token_accuracy": 0.9710356555879116,
"num_tokens": 584503.0,
"step": 102
},
{
"entropy": 0.2744300989434123,
"epoch": 1.2572741194486983,
"grad_norm": 0.2578125,
"learning_rate": 7.560975609756099e-05,
"loss": 0.05856996402144432,
"mean_token_accuracy": 0.9778167866170406,
"num_tokens": 590506.0,
"step": 103
},
{
"entropy": 0.26921656634658575,
"epoch": 1.2695252679938744,
"grad_norm": 0.23828125,
"learning_rate": 7.439024390243903e-05,
"loss": 0.0782560482621193,
"mean_token_accuracy": 0.9701778888702393,
"num_tokens": 596390.0,
"step": 104
},
{
"entropy": 0.2885159160941839,
"epoch": 1.2817764165390506,
"grad_norm": 0.166015625,
"learning_rate": 7.317073170731707e-05,
"loss": 0.06652253121137619,
"mean_token_accuracy": 0.982722382992506,
"num_tokens": 601411.0,
"step": 105
},
{
"entropy": 0.2918844725936651,
"epoch": 1.2940275650842268,
"grad_norm": 0.1875,
"learning_rate": 7.195121951219513e-05,
"loss": 0.05815374106168747,
"mean_token_accuracy": 0.9814562760293484,
"num_tokens": 606925.0,
"step": 106
},
{
"entropy": 0.3184075551107526,
"epoch": 1.3062787136294027,
"grad_norm": 0.181640625,
"learning_rate": 7.073170731707317e-05,
"loss": 0.07176389545202255,
"mean_token_accuracy": 0.9732142426073551,
"num_tokens": 611981.0,
"step": 107
},
{
"entropy": 0.28589071705937386,
"epoch": 1.318529862174579,
"grad_norm": 0.216796875,
"learning_rate": 6.951219512195122e-05,
"loss": 0.06908947974443436,
"mean_token_accuracy": 0.9742955937981606,
"num_tokens": 617502.0,
"step": 108
},
{
"entropy": 0.30427863635122776,
"epoch": 1.3307810107197549,
"grad_norm": 0.2275390625,
"learning_rate": 6.829268292682928e-05,
"loss": 0.08661782741546631,
"mean_token_accuracy": 0.9723380617797375,
"num_tokens": 623345.0,
"step": 109
},
{
"entropy": 0.2784799374639988,
"epoch": 1.343032159264931,
"grad_norm": 0.2080078125,
"learning_rate": 6.707317073170732e-05,
"loss": 0.08172982931137085,
"mean_token_accuracy": 0.9692776277661324,
"num_tokens": 629819.0,
"step": 110
},
{
"entropy": 0.2621075566858053,
"epoch": 1.3552833078101072,
"grad_norm": 0.2197265625,
"learning_rate": 6.585365853658538e-05,
"loss": 0.06896749883890152,
"mean_token_accuracy": 0.9779512844979763,
"num_tokens": 635274.0,
"step": 111
},
{
"entropy": 0.2867768844589591,
"epoch": 1.3675344563552834,
"grad_norm": 0.205078125,
"learning_rate": 6.463414634146342e-05,
"loss": 0.06578939408063889,
"mean_token_accuracy": 0.9736802577972412,
"num_tokens": 640792.0,
"step": 112
},
{
"entropy": 0.2833663960918784,
"epoch": 1.3797856049004595,
"grad_norm": 0.2138671875,
"learning_rate": 6.341463414634146e-05,
"loss": 0.0802934393286705,
"mean_token_accuracy": 0.975794829428196,
"num_tokens": 647553.0,
"step": 113
},
{
"entropy": 0.2789901904761791,
"epoch": 1.3920367534456355,
"grad_norm": 0.271484375,
"learning_rate": 6.219512195121952e-05,
"loss": 0.07617770880460739,
"mean_token_accuracy": 0.9711511395871639,
"num_tokens": 653249.0,
"step": 114
},
{
"entropy": 0.27549734245985746,
"epoch": 1.4042879019908117,
"grad_norm": 0.162109375,
"learning_rate": 6.097560975609756e-05,
"loss": 0.050225261598825455,
"mean_token_accuracy": 0.981207113713026,
"num_tokens": 659773.0,
"step": 115
},
{
"entropy": 0.2797435289248824,
"epoch": 1.4165390505359878,
"grad_norm": 0.1796875,
"learning_rate": 5.975609756097561e-05,
"loss": 0.05845767632126808,
"mean_token_accuracy": 0.97660356387496,
"num_tokens": 665061.0,
"step": 116
},
{
"entropy": 0.26317309867590666,
"epoch": 1.4287901990811638,
"grad_norm": 0.2294921875,
"learning_rate": 5.853658536585366e-05,
"loss": 0.07422497868537903,
"mean_token_accuracy": 0.970589954406023,
"num_tokens": 670610.0,
"step": 117
},
{
"entropy": 0.2940791519358754,
"epoch": 1.44104134762634,
"grad_norm": 0.2265625,
"learning_rate": 5.731707317073171e-05,
"loss": 0.07870218902826309,
"mean_token_accuracy": 0.9769303686916828,
"num_tokens": 676597.0,
"step": 118
},
{
"entropy": 0.3041226239874959,
"epoch": 1.4532924961715161,
"grad_norm": 0.2236328125,
"learning_rate": 5.6097560975609764e-05,
"loss": 0.07072751969099045,
"mean_token_accuracy": 0.9737785942852497,
"num_tokens": 682445.0,
"step": 119
},
{
"entropy": 0.2781915618106723,
"epoch": 1.4655436447166923,
"grad_norm": 0.1884765625,
"learning_rate": 5.487804878048781e-05,
"loss": 0.07238440960645676,
"mean_token_accuracy": 0.9758684188127518,
"num_tokens": 688019.0,
"step": 120
},
{
"entropy": 0.2433428610675037,
"epoch": 1.4777947932618682,
"grad_norm": 0.263671875,
"learning_rate": 5.365853658536586e-05,
"loss": 0.07184246182441711,
"mean_token_accuracy": 0.9751845635473728,
"num_tokens": 694026.0,
"step": 121
},
{
"entropy": 0.2679327353835106,
"epoch": 1.4900459418070444,
"grad_norm": 0.2236328125,
"learning_rate": 5.2439024390243904e-05,
"loss": 0.07859291881322861,
"mean_token_accuracy": 0.9759643562138081,
"num_tokens": 699917.0,
"step": 122
},
{
"entropy": 0.25062850676476955,
"epoch": 1.5022970903522204,
"grad_norm": 0.16796875,
"learning_rate": 5.121951219512195e-05,
"loss": 0.052012164145708084,
"mean_token_accuracy": 0.9831658490002155,
"num_tokens": 705423.0,
"step": 123
},
{
"entropy": 0.2631832705810666,
"epoch": 1.5145482388973965,
"grad_norm": 0.169921875,
"learning_rate": 5e-05,
"loss": 0.0810304507613182,
"mean_token_accuracy": 0.9739143140614033,
"num_tokens": 712049.0,
"step": 124
},
{
"entropy": 0.28171470761299133,
"epoch": 1.5267993874425727,
"grad_norm": 0.2021484375,
"learning_rate": 4.878048780487805e-05,
"loss": 0.07931914180517197,
"mean_token_accuracy": 0.9697326384484768,
"num_tokens": 718759.0,
"step": 125
},
{
"entropy": 0.2730549927800894,
"epoch": 1.5390505359877489,
"grad_norm": 0.19140625,
"learning_rate": 4.75609756097561e-05,
"loss": 0.06596571952104568,
"mean_token_accuracy": 0.9794163964688778,
"num_tokens": 724151.0,
"step": 126
},
{
"entropy": 0.3451000778004527,
"epoch": 1.551301684532925,
"grad_norm": 0.298828125,
"learning_rate": 4.634146341463415e-05,
"loss": 0.07738037407398224,
"mean_token_accuracy": 0.9751269891858101,
"num_tokens": 729663.0,
"step": 127
},
{
"entropy": 0.3187516676262021,
"epoch": 1.5635528330781012,
"grad_norm": 0.23046875,
"learning_rate": 4.51219512195122e-05,
"loss": 0.07062625885009766,
"mean_token_accuracy": 0.9731609113514423,
"num_tokens": 734823.0,
"step": 128
},
{
"entropy": 0.25056853611022234,
"epoch": 1.5758039816232772,
"grad_norm": 0.1669921875,
"learning_rate": 4.390243902439025e-05,
"loss": 0.050548747181892395,
"mean_token_accuracy": 0.9795470051467419,
"num_tokens": 739981.0,
"step": 129
},
{
"entropy": 0.24678445514291525,
"epoch": 1.5880551301684533,
"grad_norm": 0.28515625,
"learning_rate": 4.26829268292683e-05,
"loss": 0.08084772527217865,
"mean_token_accuracy": 0.9744056761264801,
"num_tokens": 745848.0,
"step": 130
},
{
"entropy": 0.2766247531399131,
"epoch": 1.6003062787136293,
"grad_norm": 0.2119140625,
"learning_rate": 4.146341463414634e-05,
"loss": 0.06591574102640152,
"mean_token_accuracy": 0.9837718568742275,
"num_tokens": 751338.0,
"step": 131
},
{
"entropy": 0.2634483175352216,
"epoch": 1.6125574272588055,
"grad_norm": 0.1845703125,
"learning_rate": 4.0243902439024395e-05,
"loss": 0.08018708974123001,
"mean_token_accuracy": 0.9686751514673233,
"num_tokens": 757944.0,
"step": 132
},
{
"entropy": 0.2640473246574402,
"epoch": 1.6248085758039816,
"grad_norm": 0.1953125,
"learning_rate": 3.9024390243902444e-05,
"loss": 0.058747079223394394,
"mean_token_accuracy": 0.9809320084750652,
"num_tokens": 762909.0,
"step": 133
},
{
"entropy": 0.2685444802045822,
"epoch": 1.6370597243491578,
"grad_norm": 0.1982421875,
"learning_rate": 3.780487804878049e-05,
"loss": 0.06983562558889389,
"mean_token_accuracy": 0.9774856679141521,
"num_tokens": 768121.0,
"step": 134
},
{
"entropy": 0.28071946976706386,
"epoch": 1.649310872894334,
"grad_norm": 0.1806640625,
"learning_rate": 3.6585365853658535e-05,
"loss": 0.06817604601383209,
"mean_token_accuracy": 0.9757047519087791,
"num_tokens": 773835.0,
"step": 135
},
{
"entropy": 0.3044859105721116,
"epoch": 1.66156202143951,
"grad_norm": 0.181640625,
"learning_rate": 3.5365853658536584e-05,
"loss": 0.07083944231271744,
"mean_token_accuracy": 0.9756592996418476,
"num_tokens": 779008.0,
"step": 136
},
{
"entropy": 0.28345807548612356,
"epoch": 1.673813169984686,
"grad_norm": 0.2412109375,
"learning_rate": 3.414634146341464e-05,
"loss": 0.08097834140062332,
"mean_token_accuracy": 0.97182647138834,
"num_tokens": 784955.0,
"step": 137
},
{
"entropy": 0.2775820689275861,
"epoch": 1.686064318529862,
"grad_norm": 0.1630859375,
"learning_rate": 3.292682926829269e-05,
"loss": 0.0830724835395813,
"mean_token_accuracy": 0.97340302541852,
"num_tokens": 792300.0,
"step": 138
},
{
"entropy": 0.28273867163807154,
"epoch": 1.6983154670750382,
"grad_norm": 0.1728515625,
"learning_rate": 3.170731707317073e-05,
"loss": 0.06122542545199394,
"mean_token_accuracy": 0.97363705560565,
"num_tokens": 797635.0,
"step": 139
},
{
"entropy": 0.33463940117508173,
"epoch": 1.7105666156202144,
"grad_norm": 0.2109375,
"learning_rate": 3.048780487804878e-05,
"loss": 0.09268856793642044,
"mean_token_accuracy": 0.968308299779892,
"num_tokens": 803321.0,
"step": 140
},
{
"entropy": 0.26861980091780424,
"epoch": 1.7228177641653906,
"grad_norm": 0.1923828125,
"learning_rate": 2.926829268292683e-05,
"loss": 0.06270366907119751,
"mean_token_accuracy": 0.9758359678089619,
"num_tokens": 808696.0,
"step": 141
},
{
"entropy": 0.24330784939229488,
"epoch": 1.7350689127105667,
"grad_norm": 0.2412109375,
"learning_rate": 2.8048780487804882e-05,
"loss": 0.05480020493268967,
"mean_token_accuracy": 0.9805137030780315,
"num_tokens": 813988.0,
"step": 142
},
{
"entropy": 0.2642217818647623,
"epoch": 1.7473200612557427,
"grad_norm": 0.1669921875,
"learning_rate": 2.682926829268293e-05,
"loss": 0.06715261191129684,
"mean_token_accuracy": 0.9766382575035095,
"num_tokens": 820087.0,
"step": 143
},
{
"entropy": 0.3016277579590678,
"epoch": 1.7595712098009189,
"grad_norm": 0.2265625,
"learning_rate": 2.5609756097560977e-05,
"loss": 0.057253021746873856,
"mean_token_accuracy": 0.9784747660160065,
"num_tokens": 825766.0,
"step": 144
},
{
"entropy": 0.27436165884137154,
"epoch": 1.7718223583460948,
"grad_norm": 0.2275390625,
"learning_rate": 2.4390243902439026e-05,
"loss": 0.05420134961605072,
"mean_token_accuracy": 0.980743058025837,
"num_tokens": 831298.0,
"step": 145
},
{
"entropy": 0.28155668918043375,
"epoch": 1.784073506891271,
"grad_norm": 0.1630859375,
"learning_rate": 2.3170731707317075e-05,
"loss": 0.07169967144727707,
"mean_token_accuracy": 0.9731598235666752,
"num_tokens": 837254.0,
"step": 146
},
{
"entropy": 0.2712876806035638,
"epoch": 1.7963246554364471,
"grad_norm": 0.1904296875,
"learning_rate": 2.1951219512195124e-05,
"loss": 0.06996440887451172,
"mean_token_accuracy": 0.9760563708841801,
"num_tokens": 843796.0,
"step": 147
},
{
"entropy": 0.29682911094278097,
"epoch": 1.8085758039816233,
"grad_norm": 0.1767578125,
"learning_rate": 2.073170731707317e-05,
"loss": 0.06667114794254303,
"mean_token_accuracy": 0.9743672311306,
"num_tokens": 849621.0,
"step": 148
},
{
"entropy": 0.29692134354263544,
"epoch": 1.8208269525267995,
"grad_norm": 0.26953125,
"learning_rate": 1.9512195121951222e-05,
"loss": 0.07836263626813889,
"mean_token_accuracy": 0.9702205285429955,
"num_tokens": 854578.0,
"step": 149
},
{
"entropy": 0.2837211322039366,
"epoch": 1.8330781010719757,
"grad_norm": 0.2177734375,
"learning_rate": 1.8292682926829268e-05,
"loss": 0.07421581447124481,
"mean_token_accuracy": 0.9740220792591572,
"num_tokens": 861001.0,
"step": 150
},
{
"epoch": 1.8330781010719757,
"eval_entropy": 0.28257156163454056,
"eval_loss": 0.11007164418697357,
"eval_mean_token_accuracy": 0.9602361796558767,
"eval_num_tokens": 861001.0,
"eval_runtime": 56.9589,
"eval_samples_per_second": 1.211,
"eval_steps_per_second": 1.211,
"step": 150
}
],
"logging_steps": 1,
"max_steps": 164,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.898716388139827e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}