{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9899409228804088, "eval_steps": 500, "global_step": 12400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.703125, "learning_rate": 1.9999968642467102e-05, "loss": 4.2386, "step": 20 }, { "epoch": 0.0, "grad_norm": 1.8515625, "learning_rate": 1.999987438156715e-05, "loss": 3.1965, "step": 40 }, { "epoch": 0.0, "grad_norm": 1.4921875, "learning_rate": 1.9999717217822316e-05, "loss": 2.7844, "step": 60 }, { "epoch": 0.01, "grad_norm": 1.25, "learning_rate": 1.999949715222121e-05, "loss": 2.6013, "step": 80 }, { "epoch": 0.01, "grad_norm": 2.28125, "learning_rate": 1.9999214186148133e-05, "loss": 2.5417, "step": 100 }, { "epoch": 0.01, "grad_norm": 1.1953125, "learning_rate": 1.9998868321383038e-05, "loss": 2.4376, "step": 120 }, { "epoch": 0.01, "grad_norm": 1.25, "learning_rate": 1.9998459560101546e-05, "loss": 2.3875, "step": 140 }, { "epoch": 0.01, "grad_norm": 1.3359375, "learning_rate": 1.9997987904874905e-05, "loss": 2.3568, "step": 160 }, { "epoch": 0.01, "grad_norm": 1.359375, "learning_rate": 1.9997453358670004e-05, "loss": 2.3034, "step": 180 }, { "epoch": 0.02, "grad_norm": 1.9296875, "learning_rate": 1.9996855924849337e-05, "loss": 2.2779, "step": 200 }, { "epoch": 0.02, "grad_norm": 1.6015625, "learning_rate": 1.999619560717097e-05, "loss": 2.2728, "step": 220 }, { "epoch": 0.02, "grad_norm": 1.4921875, "learning_rate": 1.9995472409788548e-05, "loss": 2.2436, "step": 240 }, { "epoch": 0.02, "grad_norm": 1.625, "learning_rate": 1.999468633725125e-05, "loss": 2.2062, "step": 260 }, { "epoch": 0.02, "grad_norm": 1.6875, "learning_rate": 1.9993837394503745e-05, "loss": 2.1873, "step": 280 }, { "epoch": 0.02, "grad_norm": 2.328125, "learning_rate": 1.99929255868862e-05, "loss": 2.1973, "step": 300 }, { "epoch": 0.03, "grad_norm": 1.5546875, "learning_rate": 1.999195092013422e-05, "loss": 2.1891, "step": 320 }, { "epoch": 0.03, "grad_norm": 2.8125, "learning_rate": 1.99909134003788e-05, "loss": 2.1813, "step": 340 }, { "epoch": 0.03, "grad_norm": 2.1875, "learning_rate": 1.998981303414633e-05, "loss": 2.1609, "step": 360 }, { "epoch": 0.03, "grad_norm": 2.234375, "learning_rate": 1.9988649828358504e-05, "loss": 2.1693, "step": 380 }, { "epoch": 0.03, "grad_norm": 1.8359375, "learning_rate": 1.9987423790332315e-05, "loss": 2.1465, "step": 400 }, { "epoch": 0.03, "grad_norm": 1.71875, "learning_rate": 1.9986134927779986e-05, "loss": 2.1387, "step": 420 }, { "epoch": 0.04, "grad_norm": 1.734375, "learning_rate": 1.998478324880893e-05, "loss": 2.1236, "step": 440 }, { "epoch": 0.04, "grad_norm": 1.7265625, "learning_rate": 1.9983368761921703e-05, "loss": 2.1144, "step": 460 }, { "epoch": 0.04, "grad_norm": 1.640625, "learning_rate": 1.9981891476015936e-05, "loss": 2.1164, "step": 480 }, { "epoch": 0.04, "grad_norm": 1.65625, "learning_rate": 1.99803514003843e-05, "loss": 2.1083, "step": 500 }, { "epoch": 0.04, "grad_norm": 1.8515625, "learning_rate": 1.9978748544714427e-05, "loss": 2.0906, "step": 520 }, { "epoch": 0.04, "grad_norm": 1.890625, "learning_rate": 1.997708291908886e-05, "loss": 2.1043, "step": 540 }, { "epoch": 0.04, "grad_norm": 2.0, "learning_rate": 1.9975354533984995e-05, "loss": 2.1028, "step": 560 }, { "epoch": 0.05, "grad_norm": 1.8984375, "learning_rate": 1.9973563400274994e-05, "loss": 2.082, "step": 580 }, { "epoch": 0.05, "grad_norm": 2.1875, "learning_rate": 1.9971709529225754e-05, "loss": 2.0806, "step": 600 }, { "epoch": 0.05, "grad_norm": 2.375, "learning_rate": 1.9969792932498783e-05, "loss": 2.0803, "step": 620 }, { "epoch": 0.05, "grad_norm": 1.84375, "learning_rate": 1.9967813622150177e-05, "loss": 2.0731, "step": 640 }, { "epoch": 0.05, "grad_norm": 2.015625, "learning_rate": 1.996577161063052e-05, "loss": 2.0662, "step": 660 }, { "epoch": 0.05, "grad_norm": 1.859375, "learning_rate": 1.99636669107848e-05, "loss": 2.0472, "step": 680 }, { "epoch": 0.06, "grad_norm": 1.9609375, "learning_rate": 1.996149953585235e-05, "loss": 2.0562, "step": 700 }, { "epoch": 0.06, "grad_norm": 1.84375, "learning_rate": 1.9959269499466746e-05, "loss": 2.0587, "step": 720 }, { "epoch": 0.06, "grad_norm": 1.890625, "learning_rate": 1.9956976815655723e-05, "loss": 2.0576, "step": 740 }, { "epoch": 0.06, "grad_norm": 1.9609375, "learning_rate": 1.99546214988411e-05, "loss": 2.0508, "step": 760 }, { "epoch": 0.06, "grad_norm": 1.953125, "learning_rate": 1.9952203563838676e-05, "loss": 2.034, "step": 780 }, { "epoch": 0.06, "grad_norm": 1.8046875, "learning_rate": 1.9949723025858136e-05, "loss": 2.0259, "step": 800 }, { "epoch": 0.07, "grad_norm": 2.078125, "learning_rate": 1.994717990050297e-05, "loss": 2.0439, "step": 820 }, { "epoch": 0.07, "grad_norm": 2.875, "learning_rate": 1.9944574203770365e-05, "loss": 2.0371, "step": 840 }, { "epoch": 0.07, "grad_norm": 2.515625, "learning_rate": 1.994190595205109e-05, "loss": 2.0375, "step": 860 }, { "epoch": 0.07, "grad_norm": 2.0625, "learning_rate": 1.9939175162129427e-05, "loss": 2.0227, "step": 880 }, { "epoch": 0.07, "grad_norm": 2.078125, "learning_rate": 1.9936381851183032e-05, "loss": 2.0182, "step": 900 }, { "epoch": 0.07, "grad_norm": 1.8203125, "learning_rate": 1.9933526036782852e-05, "loss": 2.0208, "step": 920 }, { "epoch": 0.08, "grad_norm": 1.9921875, "learning_rate": 1.993060773689299e-05, "loss": 2.0177, "step": 940 }, { "epoch": 0.08, "grad_norm": 1.8125, "learning_rate": 1.992762696987062e-05, "loss": 2.0208, "step": 960 }, { "epoch": 0.08, "grad_norm": 2.0, "learning_rate": 1.9924583754465842e-05, "loss": 1.9938, "step": 980 }, { "epoch": 0.08, "grad_norm": 2.109375, "learning_rate": 1.9921478109821598e-05, "loss": 2.0132, "step": 1000 }, { "epoch": 0.08, "grad_norm": 1.8046875, "learning_rate": 1.9918310055473515e-05, "loss": 2.0062, "step": 1020 }, { "epoch": 0.08, "grad_norm": 1.8828125, "learning_rate": 1.991507961134981e-05, "loss": 2.0074, "step": 1040 }, { "epoch": 0.08, "grad_norm": 2.046875, "learning_rate": 1.9911786797771144e-05, "loss": 2.0153, "step": 1060 }, { "epoch": 0.09, "grad_norm": 2.078125, "learning_rate": 1.990843163545052e-05, "loss": 1.996, "step": 1080 }, { "epoch": 0.09, "grad_norm": 2.21875, "learning_rate": 1.990501414549312e-05, "loss": 2.0067, "step": 1100 }, { "epoch": 0.09, "grad_norm": 1.8359375, "learning_rate": 1.9901534349396204e-05, "loss": 1.9922, "step": 1120 }, { "epoch": 0.09, "grad_norm": 2.25, "learning_rate": 1.9897992269048953e-05, "loss": 1.9953, "step": 1140 }, { "epoch": 0.09, "grad_norm": 2.109375, "learning_rate": 1.9894387926732342e-05, "loss": 1.9968, "step": 1160 }, { "epoch": 0.09, "grad_norm": 2.390625, "learning_rate": 1.9890721345118987e-05, "loss": 1.9851, "step": 1180 }, { "epoch": 0.1, "grad_norm": 2.171875, "learning_rate": 1.988699254727303e-05, "loss": 1.9749, "step": 1200 }, { "epoch": 0.1, "grad_norm": 2.25, "learning_rate": 1.988320155664996e-05, "loss": 2.003, "step": 1220 }, { "epoch": 0.1, "grad_norm": 2.140625, "learning_rate": 1.9879348397096482e-05, "loss": 1.9779, "step": 1240 }, { "epoch": 0.1, "grad_norm": 2.09375, "learning_rate": 1.9875433092850376e-05, "loss": 1.9633, "step": 1260 }, { "epoch": 0.1, "grad_norm": 1.9921875, "learning_rate": 1.9871455668540325e-05, "loss": 1.9824, "step": 1280 }, { "epoch": 0.1, "grad_norm": 2.3125, "learning_rate": 1.9867416149185774e-05, "loss": 1.9785, "step": 1300 }, { "epoch": 0.11, "grad_norm": 2.265625, "learning_rate": 1.9863314560196775e-05, "loss": 1.9923, "step": 1320 }, { "epoch": 0.11, "grad_norm": 2.140625, "learning_rate": 1.9859150927373803e-05, "loss": 1.9839, "step": 1340 }, { "epoch": 0.11, "grad_norm": 2.4375, "learning_rate": 1.9854925276907627e-05, "loss": 1.985, "step": 1360 }, { "epoch": 0.11, "grad_norm": 1.953125, "learning_rate": 1.985063763537913e-05, "loss": 1.974, "step": 1380 }, { "epoch": 0.11, "grad_norm": 1.9453125, "learning_rate": 1.9846288029759124e-05, "loss": 1.9801, "step": 1400 }, { "epoch": 0.11, "grad_norm": 2.03125, "learning_rate": 1.984187648740822e-05, "loss": 1.9733, "step": 1420 }, { "epoch": 0.11, "grad_norm": 2.046875, "learning_rate": 1.983740303607662e-05, "loss": 1.9653, "step": 1440 }, { "epoch": 0.12, "grad_norm": 2.4375, "learning_rate": 1.9832867703903953e-05, "loss": 1.9672, "step": 1460 }, { "epoch": 0.12, "grad_norm": 2.15625, "learning_rate": 1.9828270519419115e-05, "loss": 1.9625, "step": 1480 }, { "epoch": 0.12, "grad_norm": 2.0625, "learning_rate": 1.9823611511540064e-05, "loss": 1.9542, "step": 1500 }, { "epoch": 0.12, "grad_norm": 2.328125, "learning_rate": 1.9818890709573652e-05, "loss": 1.9475, "step": 1520 }, { "epoch": 0.12, "grad_norm": 2.234375, "learning_rate": 1.9814108143215446e-05, "loss": 1.9642, "step": 1540 }, { "epoch": 0.12, "grad_norm": 2.015625, "learning_rate": 1.9809263842549516e-05, "loss": 1.9541, "step": 1560 }, { "epoch": 0.13, "grad_norm": 1.9921875, "learning_rate": 1.980435783804828e-05, "loss": 1.956, "step": 1580 }, { "epoch": 0.13, "grad_norm": 1.96875, "learning_rate": 1.9799390160572295e-05, "loss": 1.9812, "step": 1600 }, { "epoch": 0.13, "grad_norm": 1.953125, "learning_rate": 1.979436084137005e-05, "loss": 1.9617, "step": 1620 }, { "epoch": 0.13, "grad_norm": 2.125, "learning_rate": 1.9789269912077792e-05, "loss": 1.9534, "step": 1640 }, { "epoch": 0.13, "grad_norm": 2.140625, "learning_rate": 1.9784117404719324e-05, "loss": 1.9519, "step": 1660 }, { "epoch": 0.13, "grad_norm": 2.046875, "learning_rate": 1.977890335170578e-05, "loss": 1.9448, "step": 1680 }, { "epoch": 0.14, "grad_norm": 2.21875, "learning_rate": 1.9773627785835454e-05, "loss": 1.9361, "step": 1700 }, { "epoch": 0.14, "grad_norm": 2.265625, "learning_rate": 1.9768290740293573e-05, "loss": 1.9485, "step": 1720 }, { "epoch": 0.14, "grad_norm": 2.046875, "learning_rate": 1.9762892248652093e-05, "loss": 1.9356, "step": 1740 }, { "epoch": 0.14, "grad_norm": 2.453125, "learning_rate": 1.975743234486949e-05, "loss": 1.9484, "step": 1760 }, { "epoch": 0.14, "grad_norm": 2.28125, "learning_rate": 1.9751911063290542e-05, "loss": 1.9358, "step": 1780 }, { "epoch": 0.14, "grad_norm": 2.203125, "learning_rate": 1.974632843864612e-05, "loss": 1.9453, "step": 1800 }, { "epoch": 0.15, "grad_norm": 2.359375, "learning_rate": 1.9740684506052958e-05, "loss": 1.9217, "step": 1820 }, { "epoch": 0.15, "grad_norm": 2.171875, "learning_rate": 1.9734979301013445e-05, "loss": 1.9243, "step": 1840 }, { "epoch": 0.15, "grad_norm": 2.09375, "learning_rate": 1.9729212859415397e-05, "loss": 1.9421, "step": 1860 }, { "epoch": 0.15, "grad_norm": 2.625, "learning_rate": 1.9723385217531824e-05, "loss": 1.9311, "step": 1880 }, { "epoch": 0.15, "grad_norm": 1.9609375, "learning_rate": 1.9717496412020717e-05, "loss": 1.9402, "step": 1900 }, { "epoch": 0.15, "grad_norm": 2.140625, "learning_rate": 1.9711546479924797e-05, "loss": 1.9433, "step": 1920 }, { "epoch": 0.15, "grad_norm": 2.203125, "learning_rate": 1.9705535458671304e-05, "loss": 1.9181, "step": 1940 }, { "epoch": 0.16, "grad_norm": 2.375, "learning_rate": 1.9699463386071748e-05, "loss": 1.929, "step": 1960 }, { "epoch": 0.16, "grad_norm": 2.25, "learning_rate": 1.9693330300321666e-05, "loss": 1.941, "step": 1980 }, { "epoch": 0.16, "grad_norm": 2.3125, "learning_rate": 1.96871362400004e-05, "loss": 1.9172, "step": 2000 }, { "epoch": 0.16, "grad_norm": 2.375, "learning_rate": 1.9680881244070848e-05, "loss": 1.9103, "step": 2020 }, { "epoch": 0.16, "grad_norm": 1.9921875, "learning_rate": 1.96745653518792e-05, "loss": 1.9323, "step": 2040 }, { "epoch": 0.16, "grad_norm": 2.015625, "learning_rate": 1.9668188603154716e-05, "loss": 1.9333, "step": 2060 }, { "epoch": 0.17, "grad_norm": 2.640625, "learning_rate": 1.9661751038009463e-05, "loss": 1.9243, "step": 2080 }, { "epoch": 0.17, "grad_norm": 2.03125, "learning_rate": 1.965525269693807e-05, "loss": 1.9386, "step": 2100 }, { "epoch": 0.17, "grad_norm": 2.8125, "learning_rate": 1.9648693620817455e-05, "loss": 1.9293, "step": 2120 }, { "epoch": 0.17, "grad_norm": 2.125, "learning_rate": 1.96420738509066e-05, "loss": 1.9175, "step": 2140 }, { "epoch": 0.17, "grad_norm": 1.90625, "learning_rate": 1.963539342884626e-05, "loss": 1.9176, "step": 2160 }, { "epoch": 0.17, "grad_norm": 2.15625, "learning_rate": 1.9628652396658725e-05, "loss": 1.9182, "step": 2180 }, { "epoch": 0.18, "grad_norm": 2.015625, "learning_rate": 1.9621850796747528e-05, "loss": 1.9048, "step": 2200 }, { "epoch": 0.18, "grad_norm": 2.4375, "learning_rate": 1.9614988671897208e-05, "loss": 1.9209, "step": 2220 }, { "epoch": 0.18, "grad_norm": 2.1875, "learning_rate": 1.960806606527303e-05, "loss": 1.9064, "step": 2240 }, { "epoch": 0.18, "grad_norm": 2.515625, "learning_rate": 1.96010830204207e-05, "loss": 1.9192, "step": 2260 }, { "epoch": 0.18, "grad_norm": 2.203125, "learning_rate": 1.9594039581266107e-05, "loss": 1.9326, "step": 2280 }, { "epoch": 0.18, "grad_norm": 2.0, "learning_rate": 1.958693579211505e-05, "loss": 1.9194, "step": 2300 }, { "epoch": 0.19, "grad_norm": 2.0625, "learning_rate": 1.957977169765294e-05, "loss": 1.8903, "step": 2320 }, { "epoch": 0.19, "grad_norm": 2.703125, "learning_rate": 1.957254734294454e-05, "loss": 1.9135, "step": 2340 }, { "epoch": 0.19, "grad_norm": 2.125, "learning_rate": 1.956526277343366e-05, "loss": 1.9228, "step": 2360 }, { "epoch": 0.19, "grad_norm": 2.28125, "learning_rate": 1.95579180349429e-05, "loss": 1.9094, "step": 2380 }, { "epoch": 0.19, "grad_norm": 2.015625, "learning_rate": 1.955051317367333e-05, "loss": 1.9102, "step": 2400 }, { "epoch": 0.19, "grad_norm": 2.109375, "learning_rate": 1.9543048236204215e-05, "loss": 1.8987, "step": 2420 }, { "epoch": 0.19, "grad_norm": 2.328125, "learning_rate": 1.9535523269492733e-05, "loss": 1.9124, "step": 2440 }, { "epoch": 0.2, "grad_norm": 2.171875, "learning_rate": 1.9527938320873652e-05, "loss": 1.9137, "step": 2460 }, { "epoch": 0.2, "grad_norm": 2.078125, "learning_rate": 1.9520293438059065e-05, "loss": 1.9078, "step": 2480 }, { "epoch": 0.2, "grad_norm": 1.9765625, "learning_rate": 1.9512588669138055e-05, "loss": 1.9092, "step": 2500 }, { "epoch": 0.2, "grad_norm": 2.09375, "learning_rate": 1.9504824062576425e-05, "loss": 1.9114, "step": 2520 }, { "epoch": 0.2, "grad_norm": 2.046875, "learning_rate": 1.949699966721637e-05, "loss": 1.9121, "step": 2540 }, { "epoch": 0.2, "grad_norm": 2.171875, "learning_rate": 1.9489115532276182e-05, "loss": 1.9139, "step": 2560 }, { "epoch": 0.21, "grad_norm": 2.0625, "learning_rate": 1.9481171707349936e-05, "loss": 1.8889, "step": 2580 }, { "epoch": 0.21, "grad_norm": 2.03125, "learning_rate": 1.9473168242407183e-05, "loss": 1.9233, "step": 2600 }, { "epoch": 0.21, "grad_norm": 2.203125, "learning_rate": 1.9465105187792617e-05, "loss": 1.8928, "step": 2620 }, { "epoch": 0.21, "grad_norm": 2.15625, "learning_rate": 1.9456982594225787e-05, "loss": 1.9101, "step": 2640 }, { "epoch": 0.21, "grad_norm": 2.234375, "learning_rate": 1.9448800512800762e-05, "loss": 1.8862, "step": 2660 }, { "epoch": 0.21, "grad_norm": 2.203125, "learning_rate": 1.9440558994985805e-05, "loss": 1.8912, "step": 2680 }, { "epoch": 0.22, "grad_norm": 2.3125, "learning_rate": 1.943225809262306e-05, "loss": 1.8983, "step": 2700 }, { "epoch": 0.22, "grad_norm": 2.28125, "learning_rate": 1.942389785792822e-05, "loss": 1.9031, "step": 2720 }, { "epoch": 0.22, "grad_norm": 2.0625, "learning_rate": 1.94154783434902e-05, "loss": 1.9023, "step": 2740 }, { "epoch": 0.22, "grad_norm": 2.46875, "learning_rate": 1.940699960227081e-05, "loss": 1.8974, "step": 2760 }, { "epoch": 0.22, "grad_norm": 2.0625, "learning_rate": 1.939846168760441e-05, "loss": 1.9007, "step": 2780 }, { "epoch": 0.22, "grad_norm": 2.15625, "learning_rate": 1.938986465319759e-05, "loss": 1.8949, "step": 2800 }, { "epoch": 0.23, "grad_norm": 2.375, "learning_rate": 1.9381208553128813e-05, "loss": 1.8864, "step": 2820 }, { "epoch": 0.23, "grad_norm": 2.359375, "learning_rate": 1.9372493441848105e-05, "loss": 1.9024, "step": 2840 }, { "epoch": 0.23, "grad_norm": 2.109375, "learning_rate": 1.9363719374176683e-05, "loss": 1.8891, "step": 2860 }, { "epoch": 0.23, "grad_norm": 2.28125, "learning_rate": 1.935488640530662e-05, "loss": 1.8849, "step": 2880 }, { "epoch": 0.23, "grad_norm": 2.234375, "learning_rate": 1.9345994590800498e-05, "loss": 1.8939, "step": 2900 }, { "epoch": 0.23, "grad_norm": 2.453125, "learning_rate": 1.9337043986591064e-05, "loss": 1.8903, "step": 2920 }, { "epoch": 0.23, "grad_norm": 2.265625, "learning_rate": 1.9328034648980874e-05, "loss": 1.8731, "step": 2940 }, { "epoch": 0.24, "grad_norm": 2.203125, "learning_rate": 1.9318966634641936e-05, "loss": 1.8781, "step": 2960 }, { "epoch": 0.24, "grad_norm": 2.109375, "learning_rate": 1.9309840000615358e-05, "loss": 1.8855, "step": 2980 }, { "epoch": 0.24, "grad_norm": 2.03125, "learning_rate": 1.930065480431098e-05, "loss": 1.89, "step": 3000 }, { "epoch": 0.24, "grad_norm": 2.53125, "learning_rate": 1.9291411103507033e-05, "loss": 1.878, "step": 3020 }, { "epoch": 0.24, "grad_norm": 2.453125, "learning_rate": 1.9282108956349754e-05, "loss": 1.8896, "step": 3040 }, { "epoch": 0.24, "grad_norm": 2.140625, "learning_rate": 1.9272748421353023e-05, "loss": 1.8763, "step": 3060 }, { "epoch": 0.25, "grad_norm": 2.015625, "learning_rate": 1.9263329557398012e-05, "loss": 1.8741, "step": 3080 }, { "epoch": 0.25, "grad_norm": 2.28125, "learning_rate": 1.9253852423732803e-05, "loss": 1.8664, "step": 3100 }, { "epoch": 0.25, "grad_norm": 2.25, "learning_rate": 1.9244317079972007e-05, "loss": 1.8706, "step": 3120 }, { "epoch": 0.25, "grad_norm": 2.3125, "learning_rate": 1.92347235860964e-05, "loss": 1.8791, "step": 3140 }, { "epoch": 0.25, "grad_norm": 2.34375, "learning_rate": 1.9225072002452557e-05, "loss": 1.8834, "step": 3160 }, { "epoch": 0.25, "grad_norm": 2.359375, "learning_rate": 1.9215362389752434e-05, "loss": 1.8849, "step": 3180 }, { "epoch": 0.26, "grad_norm": 2.3125, "learning_rate": 1.9205594809073035e-05, "loss": 1.8804, "step": 3200 }, { "epoch": 0.26, "grad_norm": 2.3125, "learning_rate": 1.9195769321855984e-05, "loss": 1.8717, "step": 3220 }, { "epoch": 0.26, "grad_norm": 2.1875, "learning_rate": 1.9185885989907173e-05, "loss": 1.8701, "step": 3240 }, { "epoch": 0.26, "grad_norm": 2.0625, "learning_rate": 1.917594487539635e-05, "loss": 1.8764, "step": 3260 }, { "epoch": 0.26, "grad_norm": 2.234375, "learning_rate": 1.9165946040856747e-05, "loss": 1.8695, "step": 3280 }, { "epoch": 0.26, "grad_norm": 2.59375, "learning_rate": 1.9155889549184657e-05, "loss": 1.8747, "step": 3300 }, { "epoch": 0.27, "grad_norm": 3.3125, "learning_rate": 1.9145775463639073e-05, "loss": 1.858, "step": 3320 }, { "epoch": 0.27, "grad_norm": 2.4375, "learning_rate": 1.9135603847841266e-05, "loss": 1.8668, "step": 3340 }, { "epoch": 0.27, "grad_norm": 2.28125, "learning_rate": 1.9125374765774404e-05, "loss": 1.8479, "step": 3360 }, { "epoch": 0.27, "grad_norm": 2.1875, "learning_rate": 1.911508828178312e-05, "loss": 1.8627, "step": 3380 }, { "epoch": 0.27, "grad_norm": 2.40625, "learning_rate": 1.9104744460573156e-05, "loss": 1.8924, "step": 3400 }, { "epoch": 0.27, "grad_norm": 2.15625, "learning_rate": 1.909434336721089e-05, "loss": 1.8739, "step": 3420 }, { "epoch": 0.27, "grad_norm": 2.1875, "learning_rate": 1.9083885067122985e-05, "loss": 1.8762, "step": 3440 }, { "epoch": 0.28, "grad_norm": 2.5, "learning_rate": 1.9073369626095958e-05, "loss": 1.8711, "step": 3460 }, { "epoch": 0.28, "grad_norm": 2.046875, "learning_rate": 1.9062797110275743e-05, "loss": 1.8768, "step": 3480 }, { "epoch": 0.28, "grad_norm": 2.65625, "learning_rate": 1.9052167586167315e-05, "loss": 1.8683, "step": 3500 }, { "epoch": 0.28, "grad_norm": 2.234375, "learning_rate": 1.9041481120634248e-05, "loss": 1.8697, "step": 3520 }, { "epoch": 0.28, "grad_norm": 2.375, "learning_rate": 1.9030737780898284e-05, "loss": 1.863, "step": 3540 }, { "epoch": 0.28, "grad_norm": 2.25, "learning_rate": 1.9019937634538946e-05, "loss": 1.8664, "step": 3560 }, { "epoch": 0.29, "grad_norm": 2.203125, "learning_rate": 1.900908074949307e-05, "loss": 1.8684, "step": 3580 }, { "epoch": 0.29, "grad_norm": 2.1875, "learning_rate": 1.8998167194054425e-05, "loss": 1.8525, "step": 3600 }, { "epoch": 0.29, "grad_norm": 2.5625, "learning_rate": 1.8987197036873227e-05, "loss": 1.8582, "step": 3620 }, { "epoch": 0.29, "grad_norm": 2.796875, "learning_rate": 1.897617034695576e-05, "loss": 1.8664, "step": 3640 }, { "epoch": 0.29, "grad_norm": 2.1875, "learning_rate": 1.8965087193663906e-05, "loss": 1.8692, "step": 3660 }, { "epoch": 0.29, "grad_norm": 2.21875, "learning_rate": 1.895394764671473e-05, "loss": 1.8534, "step": 3680 }, { "epoch": 0.3, "grad_norm": 2.359375, "learning_rate": 1.894275177618004e-05, "loss": 1.852, "step": 3700 }, { "epoch": 0.3, "grad_norm": 2.4375, "learning_rate": 1.893149965248592e-05, "loss": 1.8699, "step": 3720 }, { "epoch": 0.3, "grad_norm": 2.203125, "learning_rate": 1.8920191346412326e-05, "loss": 1.8649, "step": 3740 }, { "epoch": 0.3, "grad_norm": 2.15625, "learning_rate": 1.8908826929092607e-05, "loss": 1.857, "step": 3760 }, { "epoch": 0.3, "grad_norm": 2.28125, "learning_rate": 1.8897406472013084e-05, "loss": 1.8404, "step": 3780 }, { "epoch": 0.3, "grad_norm": 2.65625, "learning_rate": 1.8885930047012585e-05, "loss": 1.864, "step": 3800 }, { "epoch": 0.3, "grad_norm": 2.640625, "learning_rate": 1.887439772628199e-05, "loss": 1.8578, "step": 3820 }, { "epoch": 0.31, "grad_norm": 1.9765625, "learning_rate": 1.886280958236379e-05, "loss": 1.8603, "step": 3840 }, { "epoch": 0.31, "grad_norm": 2.484375, "learning_rate": 1.8851165688151627e-05, "loss": 1.8603, "step": 3860 }, { "epoch": 0.31, "grad_norm": 2.65625, "learning_rate": 1.8839466116889823e-05, "loss": 1.8752, "step": 3880 }, { "epoch": 0.31, "grad_norm": 2.140625, "learning_rate": 1.882771094217293e-05, "loss": 1.8628, "step": 3900 }, { "epoch": 0.31, "grad_norm": 2.28125, "learning_rate": 1.8815900237945284e-05, "loss": 1.8575, "step": 3920 }, { "epoch": 0.31, "grad_norm": 2.28125, "learning_rate": 1.8804034078500497e-05, "loss": 1.85, "step": 3940 }, { "epoch": 0.32, "grad_norm": 2.140625, "learning_rate": 1.8792112538481025e-05, "loss": 1.8687, "step": 3960 }, { "epoch": 0.32, "grad_norm": 2.21875, "learning_rate": 1.8780135692877693e-05, "loss": 1.8465, "step": 3980 }, { "epoch": 0.32, "grad_norm": 2.3125, "learning_rate": 1.8768103617029213e-05, "loss": 1.8569, "step": 4000 }, { "epoch": 0.32, "grad_norm": 2.328125, "learning_rate": 1.8756016386621712e-05, "loss": 1.8401, "step": 4020 }, { "epoch": 0.32, "grad_norm": 2.21875, "learning_rate": 1.874387407768827e-05, "loss": 1.8356, "step": 4040 }, { "epoch": 0.32, "grad_norm": 2.21875, "learning_rate": 1.873167676660842e-05, "loss": 1.8605, "step": 4060 }, { "epoch": 0.33, "grad_norm": 2.125, "learning_rate": 1.8719424530107674e-05, "loss": 1.8598, "step": 4080 }, { "epoch": 0.33, "grad_norm": 2.890625, "learning_rate": 1.8707117445257067e-05, "loss": 1.8512, "step": 4100 }, { "epoch": 0.33, "grad_norm": 2.59375, "learning_rate": 1.8694755589472633e-05, "loss": 1.8482, "step": 4120 }, { "epoch": 0.33, "grad_norm": 2.359375, "learning_rate": 1.8682339040514933e-05, "loss": 1.8479, "step": 4140 }, { "epoch": 0.33, "grad_norm": 2.140625, "learning_rate": 1.8669867876488578e-05, "loss": 1.8397, "step": 4160 }, { "epoch": 0.33, "grad_norm": 2.0625, "learning_rate": 1.8657342175841722e-05, "loss": 1.8579, "step": 4180 }, { "epoch": 0.34, "grad_norm": 2.78125, "learning_rate": 1.8644762017365576e-05, "loss": 1.8508, "step": 4200 }, { "epoch": 0.34, "grad_norm": 2.34375, "learning_rate": 1.863212748019391e-05, "loss": 1.8335, "step": 4220 }, { "epoch": 0.34, "grad_norm": 3.03125, "learning_rate": 1.861943864380255e-05, "loss": 1.8415, "step": 4240 }, { "epoch": 0.34, "grad_norm": 2.375, "learning_rate": 1.86066955880089e-05, "loss": 1.8543, "step": 4260 }, { "epoch": 0.34, "grad_norm": 2.625, "learning_rate": 1.85938983929714e-05, "loss": 1.861, "step": 4280 }, { "epoch": 0.34, "grad_norm": 2.4375, "learning_rate": 1.858104713918907e-05, "loss": 1.8387, "step": 4300 }, { "epoch": 0.34, "grad_norm": 2.21875, "learning_rate": 1.8568141907500964e-05, "loss": 1.8561, "step": 4320 }, { "epoch": 0.35, "grad_norm": 2.484375, "learning_rate": 1.8555182779085678e-05, "loss": 1.8442, "step": 4340 }, { "epoch": 0.35, "grad_norm": 2.34375, "learning_rate": 1.8542169835460846e-05, "loss": 1.8582, "step": 4360 }, { "epoch": 0.35, "grad_norm": 2.265625, "learning_rate": 1.8529103158482605e-05, "loss": 1.8319, "step": 4380 }, { "epoch": 0.35, "grad_norm": 2.875, "learning_rate": 1.8515982830345115e-05, "loss": 1.8388, "step": 4400 }, { "epoch": 0.35, "grad_norm": 2.28125, "learning_rate": 1.850280893358e-05, "loss": 1.8552, "step": 4420 }, { "epoch": 0.35, "grad_norm": 2.296875, "learning_rate": 1.848958155105586e-05, "loss": 1.8317, "step": 4440 }, { "epoch": 0.36, "grad_norm": 2.203125, "learning_rate": 1.847630076597774e-05, "loss": 1.8413, "step": 4460 }, { "epoch": 0.36, "grad_norm": 2.5, "learning_rate": 1.846296666188661e-05, "loss": 1.8251, "step": 4480 }, { "epoch": 0.36, "grad_norm": 2.15625, "learning_rate": 1.8449579322658827e-05, "loss": 1.8445, "step": 4500 }, { "epoch": 0.36, "grad_norm": 2.40625, "learning_rate": 1.8436138832505623e-05, "loss": 1.8672, "step": 4520 }, { "epoch": 0.36, "grad_norm": 2.109375, "learning_rate": 1.842264527597257e-05, "loss": 1.8343, "step": 4540 }, { "epoch": 0.36, "grad_norm": 2.390625, "learning_rate": 1.8409098737939038e-05, "loss": 1.8272, "step": 4560 }, { "epoch": 0.37, "grad_norm": 2.296875, "learning_rate": 1.8395499303617677e-05, "loss": 1.8448, "step": 4580 }, { "epoch": 0.37, "grad_norm": 2.421875, "learning_rate": 1.8381847058553872e-05, "loss": 1.835, "step": 4600 }, { "epoch": 0.37, "grad_norm": 2.171875, "learning_rate": 1.8368142088625213e-05, "loss": 1.8356, "step": 4620 }, { "epoch": 0.37, "grad_norm": 2.40625, "learning_rate": 1.8354384480040935e-05, "loss": 1.8175, "step": 4640 }, { "epoch": 0.37, "grad_norm": 2.703125, "learning_rate": 1.83405743193414e-05, "loss": 1.8218, "step": 4660 }, { "epoch": 0.37, "grad_norm": 2.265625, "learning_rate": 1.8326711693397537e-05, "loss": 1.8409, "step": 4680 }, { "epoch": 0.38, "grad_norm": 2.25, "learning_rate": 1.831279668941031e-05, "loss": 1.8471, "step": 4700 }, { "epoch": 0.38, "grad_norm": 2.109375, "learning_rate": 1.8298829394910146e-05, "loss": 1.8708, "step": 4720 }, { "epoch": 0.38, "grad_norm": 2.28125, "learning_rate": 1.82848098977564e-05, "loss": 1.8397, "step": 4740 }, { "epoch": 0.38, "grad_norm": 2.25, "learning_rate": 1.8270738286136815e-05, "loss": 1.8166, "step": 4760 }, { "epoch": 0.38, "grad_norm": 2.1875, "learning_rate": 1.8256614648566937e-05, "loss": 1.8257, "step": 4780 }, { "epoch": 0.38, "grad_norm": 2.40625, "learning_rate": 1.824243907388958e-05, "loss": 1.8483, "step": 4800 }, { "epoch": 0.38, "grad_norm": 2.59375, "learning_rate": 1.8228211651274264e-05, "loss": 1.8235, "step": 4820 }, { "epoch": 0.39, "grad_norm": 2.640625, "learning_rate": 1.8213932470216652e-05, "loss": 1.8561, "step": 4840 }, { "epoch": 0.39, "grad_norm": 2.828125, "learning_rate": 1.8199601620537977e-05, "loss": 1.8324, "step": 4860 }, { "epoch": 0.39, "grad_norm": 2.171875, "learning_rate": 1.81852191923845e-05, "loss": 1.8389, "step": 4880 }, { "epoch": 0.39, "grad_norm": 2.171875, "learning_rate": 1.8170785276226915e-05, "loss": 1.8372, "step": 4900 }, { "epoch": 0.39, "grad_norm": 2.484375, "learning_rate": 1.8156299962859805e-05, "loss": 1.8367, "step": 4920 }, { "epoch": 0.39, "grad_norm": 2.21875, "learning_rate": 1.8141763343401057e-05, "loss": 1.8078, "step": 4940 }, { "epoch": 0.4, "grad_norm": 2.21875, "learning_rate": 1.8127175509291292e-05, "loss": 1.8181, "step": 4960 }, { "epoch": 0.4, "grad_norm": 2.359375, "learning_rate": 1.8112536552293286e-05, "loss": 1.8273, "step": 4980 }, { "epoch": 0.4, "grad_norm": 3.140625, "learning_rate": 1.80978465644914e-05, "loss": 1.8302, "step": 5000 }, { "epoch": 0.4, "grad_norm": 2.125, "learning_rate": 1.8083105638291e-05, "loss": 1.8469, "step": 5020 }, { "epoch": 0.4, "grad_norm": 2.46875, "learning_rate": 1.8068313866417876e-05, "loss": 1.8235, "step": 5040 }, { "epoch": 0.4, "grad_norm": 2.328125, "learning_rate": 1.8053471341917636e-05, "loss": 1.8302, "step": 5060 }, { "epoch": 0.41, "grad_norm": 2.625, "learning_rate": 1.8038578158155163e-05, "loss": 1.8218, "step": 5080 }, { "epoch": 0.41, "grad_norm": 2.234375, "learning_rate": 1.8023634408814e-05, "loss": 1.8322, "step": 5100 }, { "epoch": 0.41, "grad_norm": 2.4375, "learning_rate": 1.8008640187895755e-05, "loss": 1.8091, "step": 5120 }, { "epoch": 0.41, "grad_norm": 2.53125, "learning_rate": 1.7993595589719533e-05, "loss": 1.828, "step": 5140 }, { "epoch": 0.41, "grad_norm": 2.40625, "learning_rate": 1.797850070892132e-05, "loss": 1.8188, "step": 5160 }, { "epoch": 0.41, "grad_norm": 2.84375, "learning_rate": 1.7963355640453407e-05, "loss": 1.8106, "step": 5180 }, { "epoch": 0.42, "grad_norm": 2.15625, "learning_rate": 1.7948160479583783e-05, "loss": 1.8172, "step": 5200 }, { "epoch": 0.42, "grad_norm": 2.375, "learning_rate": 1.793291532189553e-05, "loss": 1.8324, "step": 5220 }, { "epoch": 0.42, "grad_norm": 2.359375, "learning_rate": 1.791762026328623e-05, "loss": 1.8202, "step": 5240 }, { "epoch": 0.42, "grad_norm": 2.171875, "learning_rate": 1.7902275399967363e-05, "loss": 1.8183, "step": 5260 }, { "epoch": 0.42, "grad_norm": 2.5625, "learning_rate": 1.78868808284637e-05, "loss": 1.8347, "step": 5280 }, { "epoch": 0.42, "grad_norm": 2.5625, "learning_rate": 1.7871436645612685e-05, "loss": 1.831, "step": 5300 }, { "epoch": 0.42, "grad_norm": 2.796875, "learning_rate": 1.785594294856385e-05, "loss": 1.8263, "step": 5320 }, { "epoch": 0.43, "grad_norm": 2.265625, "learning_rate": 1.7840399834778176e-05, "loss": 1.847, "step": 5340 }, { "epoch": 0.43, "grad_norm": 2.375, "learning_rate": 1.7824807402027504e-05, "loss": 1.8249, "step": 5360 }, { "epoch": 0.43, "grad_norm": 2.34375, "learning_rate": 1.78091657483939e-05, "loss": 1.8206, "step": 5380 }, { "epoch": 0.43, "grad_norm": 2.328125, "learning_rate": 1.779347497226905e-05, "loss": 1.8251, "step": 5400 }, { "epoch": 0.43, "grad_norm": 2.5, "learning_rate": 1.777773517235364e-05, "loss": 1.8226, "step": 5420 }, { "epoch": 0.43, "grad_norm": 2.1875, "learning_rate": 1.7761946447656736e-05, "loss": 1.8309, "step": 5440 }, { "epoch": 0.44, "grad_norm": 2.234375, "learning_rate": 1.7746108897495157e-05, "loss": 1.8283, "step": 5460 }, { "epoch": 0.44, "grad_norm": 2.796875, "learning_rate": 1.7730222621492846e-05, "loss": 1.8275, "step": 5480 }, { "epoch": 0.44, "grad_norm": 2.390625, "learning_rate": 1.7714287719580254e-05, "loss": 1.8059, "step": 5500 }, { "epoch": 0.44, "grad_norm": 2.28125, "learning_rate": 1.769830429199371e-05, "loss": 1.8235, "step": 5520 }, { "epoch": 0.44, "grad_norm": 2.40625, "learning_rate": 1.7682272439274778e-05, "loss": 1.8104, "step": 5540 }, { "epoch": 0.44, "grad_norm": 2.609375, "learning_rate": 1.766619226226965e-05, "loss": 1.8212, "step": 5560 }, { "epoch": 0.45, "grad_norm": 2.140625, "learning_rate": 1.765006386212847e-05, "loss": 1.8269, "step": 5580 }, { "epoch": 0.45, "grad_norm": 2.21875, "learning_rate": 1.763388734030475e-05, "loss": 1.8212, "step": 5600 }, { "epoch": 0.45, "grad_norm": 2.390625, "learning_rate": 1.7617662798554685e-05, "loss": 1.8447, "step": 5620 }, { "epoch": 0.45, "grad_norm": 2.53125, "learning_rate": 1.7601390338936547e-05, "loss": 1.8244, "step": 5640 }, { "epoch": 0.45, "grad_norm": 2.375, "learning_rate": 1.7585070063810014e-05, "loss": 1.8125, "step": 5660 }, { "epoch": 0.45, "grad_norm": 2.53125, "learning_rate": 1.7568702075835557e-05, "loss": 1.8114, "step": 5680 }, { "epoch": 0.46, "grad_norm": 2.34375, "learning_rate": 1.7552286477973766e-05, "loss": 1.8136, "step": 5700 }, { "epoch": 0.46, "grad_norm": 2.359375, "learning_rate": 1.7535823373484716e-05, "loss": 1.8261, "step": 5720 }, { "epoch": 0.46, "grad_norm": 2.3125, "learning_rate": 1.751931286592732e-05, "loss": 1.8085, "step": 5740 }, { "epoch": 0.46, "grad_norm": 2.34375, "learning_rate": 1.7502755059158683e-05, "loss": 1.8297, "step": 5760 }, { "epoch": 0.46, "grad_norm": 2.296875, "learning_rate": 1.7486150057333416e-05, "loss": 1.7937, "step": 5780 }, { "epoch": 0.46, "grad_norm": 2.15625, "learning_rate": 1.7469497964903018e-05, "loss": 1.8052, "step": 5800 }, { "epoch": 0.46, "grad_norm": 2.328125, "learning_rate": 1.7452798886615205e-05, "loss": 1.8216, "step": 5820 }, { "epoch": 0.47, "grad_norm": 2.171875, "learning_rate": 1.7436052927513254e-05, "loss": 1.8322, "step": 5840 }, { "epoch": 0.47, "grad_norm": 2.484375, "learning_rate": 1.741926019293533e-05, "loss": 1.8182, "step": 5860 }, { "epoch": 0.47, "grad_norm": 2.578125, "learning_rate": 1.740242078851384e-05, "loss": 1.8262, "step": 5880 }, { "epoch": 0.47, "grad_norm": 2.734375, "learning_rate": 1.7385534820174757e-05, "loss": 1.7948, "step": 5900 }, { "epoch": 0.47, "grad_norm": 3.0, "learning_rate": 1.7368602394136964e-05, "loss": 1.8332, "step": 5920 }, { "epoch": 0.47, "grad_norm": 2.234375, "learning_rate": 1.735162361691157e-05, "loss": 1.8016, "step": 5940 }, { "epoch": 0.48, "grad_norm": 2.25, "learning_rate": 1.7334598595301257e-05, "loss": 1.8103, "step": 5960 }, { "epoch": 0.48, "grad_norm": 2.953125, "learning_rate": 1.7317527436399603e-05, "loss": 1.8014, "step": 5980 }, { "epoch": 0.48, "grad_norm": 2.25, "learning_rate": 1.7300410247590402e-05, "loss": 1.8071, "step": 6000 }, { "epoch": 0.48, "grad_norm": 2.375, "learning_rate": 1.7283247136546996e-05, "loss": 1.809, "step": 6020 }, { "epoch": 0.48, "grad_norm": 2.46875, "learning_rate": 1.7266038211231583e-05, "loss": 1.8236, "step": 6040 }, { "epoch": 0.48, "grad_norm": 2.234375, "learning_rate": 1.724878357989457e-05, "loss": 1.8306, "step": 6060 }, { "epoch": 0.49, "grad_norm": 2.265625, "learning_rate": 1.7231483351073858e-05, "loss": 1.8165, "step": 6080 }, { "epoch": 0.49, "grad_norm": 2.1875, "learning_rate": 1.721413763359417e-05, "loss": 1.8162, "step": 6100 }, { "epoch": 0.49, "grad_norm": 2.25, "learning_rate": 1.7196746536566376e-05, "loss": 1.8346, "step": 6120 }, { "epoch": 0.49, "grad_norm": 2.265625, "learning_rate": 1.71793101693868e-05, "loss": 1.8082, "step": 6140 }, { "epoch": 0.49, "grad_norm": 2.15625, "learning_rate": 1.7161828641736527e-05, "loss": 1.8105, "step": 6160 }, { "epoch": 0.49, "grad_norm": 2.4375, "learning_rate": 1.7144302063580726e-05, "loss": 1.8105, "step": 6180 }, { "epoch": 0.49, "grad_norm": 2.46875, "learning_rate": 1.712673054516794e-05, "loss": 1.8232, "step": 6200 }, { "epoch": 0.5, "grad_norm": 2.75, "learning_rate": 1.7109114197029408e-05, "loss": 1.8227, "step": 6220 }, { "epoch": 0.5, "grad_norm": 2.265625, "learning_rate": 1.7091453129978363e-05, "loss": 1.8181, "step": 6240 }, { "epoch": 0.5, "grad_norm": 2.59375, "learning_rate": 1.7073747455109336e-05, "loss": 1.8006, "step": 6260 }, { "epoch": 0.5, "grad_norm": 2.71875, "learning_rate": 1.7055997283797463e-05, "loss": 1.7975, "step": 6280 }, { "epoch": 0.5, "grad_norm": 2.28125, "learning_rate": 1.7038202727697766e-05, "loss": 1.8105, "step": 6300 }, { "epoch": 0.5, "grad_norm": 2.59375, "learning_rate": 1.7020363898744477e-05, "loss": 1.7994, "step": 6320 }, { "epoch": 0.51, "grad_norm": 2.59375, "learning_rate": 1.7002480909150316e-05, "loss": 1.8193, "step": 6340 }, { "epoch": 0.51, "grad_norm": 2.40625, "learning_rate": 1.6984553871405783e-05, "loss": 1.8347, "step": 6360 }, { "epoch": 0.51, "grad_norm": 2.234375, "learning_rate": 1.6966582898278466e-05, "loss": 1.8159, "step": 6380 }, { "epoch": 0.51, "grad_norm": 2.3125, "learning_rate": 1.694856810281232e-05, "loss": 1.8053, "step": 6400 }, { "epoch": 0.51, "grad_norm": 2.34375, "learning_rate": 1.6930509598326948e-05, "loss": 1.828, "step": 6420 }, { "epoch": 0.51, "grad_norm": 2.1875, "learning_rate": 1.6912407498416914e-05, "loss": 1.8186, "step": 6440 }, { "epoch": 0.52, "grad_norm": 2.296875, "learning_rate": 1.689426191695101e-05, "loss": 1.8027, "step": 6460 }, { "epoch": 0.52, "grad_norm": 2.375, "learning_rate": 1.6876072968071532e-05, "loss": 1.8098, "step": 6480 }, { "epoch": 0.52, "grad_norm": 2.46875, "learning_rate": 1.6857840766193586e-05, "loss": 1.8129, "step": 6500 }, { "epoch": 0.52, "grad_norm": 2.328125, "learning_rate": 1.6839565426004346e-05, "loss": 1.8054, "step": 6520 }, { "epoch": 0.52, "grad_norm": 2.390625, "learning_rate": 1.6821247062462347e-05, "loss": 1.8123, "step": 6540 }, { "epoch": 0.52, "grad_norm": 2.328125, "learning_rate": 1.6802885790796753e-05, "loss": 1.8074, "step": 6560 }, { "epoch": 0.53, "grad_norm": 2.65625, "learning_rate": 1.678448172650664e-05, "loss": 1.7996, "step": 6580 }, { "epoch": 0.53, "grad_norm": 2.609375, "learning_rate": 1.676603498536026e-05, "loss": 1.8098, "step": 6600 }, { "epoch": 0.53, "grad_norm": 2.265625, "learning_rate": 1.6747545683394322e-05, "loss": 1.8016, "step": 6620 }, { "epoch": 0.53, "grad_norm": 2.6875, "learning_rate": 1.672901393691325e-05, "loss": 1.8093, "step": 6640 }, { "epoch": 0.53, "grad_norm": 2.484375, "learning_rate": 1.6710439862488478e-05, "loss": 1.8023, "step": 6660 }, { "epoch": 0.53, "grad_norm": 2.234375, "learning_rate": 1.6691823576957676e-05, "loss": 1.8075, "step": 6680 }, { "epoch": 0.53, "grad_norm": 2.6875, "learning_rate": 1.667316519742405e-05, "loss": 1.8052, "step": 6700 }, { "epoch": 0.54, "grad_norm": 2.34375, "learning_rate": 1.6654464841255586e-05, "loss": 1.8011, "step": 6720 }, { "epoch": 0.54, "grad_norm": 2.25, "learning_rate": 1.663572262608433e-05, "loss": 1.8075, "step": 6740 }, { "epoch": 0.54, "grad_norm": 2.296875, "learning_rate": 1.6616938669805622e-05, "loss": 1.7911, "step": 6760 }, { "epoch": 0.54, "grad_norm": 2.203125, "learning_rate": 1.659811309057738e-05, "loss": 1.8026, "step": 6780 }, { "epoch": 0.54, "grad_norm": 2.1875, "learning_rate": 1.6579246006819335e-05, "loss": 1.8088, "step": 6800 }, { "epoch": 0.54, "grad_norm": 2.328125, "learning_rate": 1.6560337537212306e-05, "loss": 1.8155, "step": 6820 }, { "epoch": 0.55, "grad_norm": 2.21875, "learning_rate": 1.6541387800697438e-05, "loss": 1.7997, "step": 6840 }, { "epoch": 0.55, "grad_norm": 2.25, "learning_rate": 1.6522396916475468e-05, "loss": 1.8253, "step": 6860 }, { "epoch": 0.55, "grad_norm": 2.265625, "learning_rate": 1.650336500400595e-05, "loss": 1.8037, "step": 6880 }, { "epoch": 0.55, "grad_norm": 2.421875, "learning_rate": 1.6484292183006542e-05, "loss": 1.8154, "step": 6900 }, { "epoch": 0.55, "grad_norm": 2.515625, "learning_rate": 1.6465178573452214e-05, "loss": 1.8169, "step": 6920 }, { "epoch": 0.55, "grad_norm": 2.359375, "learning_rate": 1.6446024295574522e-05, "loss": 1.8002, "step": 6940 }, { "epoch": 0.56, "grad_norm": 2.28125, "learning_rate": 1.6426829469860837e-05, "loss": 1.7999, "step": 6960 }, { "epoch": 0.56, "grad_norm": 2.375, "learning_rate": 1.6407594217053587e-05, "loss": 1.7973, "step": 6980 }, { "epoch": 0.56, "grad_norm": 2.296875, "learning_rate": 1.638831865814951e-05, "loss": 1.8073, "step": 7000 }, { "epoch": 0.56, "grad_norm": 2.265625, "learning_rate": 1.6369002914398874e-05, "loss": 1.795, "step": 7020 }, { "epoch": 0.56, "grad_norm": 2.34375, "learning_rate": 1.6349647107304724e-05, "loss": 1.7985, "step": 7040 }, { "epoch": 0.56, "grad_norm": 2.125, "learning_rate": 1.633025135862213e-05, "loss": 1.7936, "step": 7060 }, { "epoch": 0.57, "grad_norm": 2.890625, "learning_rate": 1.6310815790357404e-05, "loss": 1.8036, "step": 7080 }, { "epoch": 0.57, "grad_norm": 2.4375, "learning_rate": 1.6291340524767327e-05, "loss": 1.8046, "step": 7100 }, { "epoch": 0.57, "grad_norm": 2.6875, "learning_rate": 1.6271825684358404e-05, "loss": 1.8052, "step": 7120 }, { "epoch": 0.57, "grad_norm": 2.484375, "learning_rate": 1.625227139188607e-05, "loss": 1.8105, "step": 7140 }, { "epoch": 0.57, "grad_norm": 2.375, "learning_rate": 1.6232677770353936e-05, "loss": 1.7952, "step": 7160 }, { "epoch": 0.57, "grad_norm": 2.625, "learning_rate": 1.621304494301301e-05, "loss": 1.8102, "step": 7180 }, { "epoch": 0.57, "grad_norm": 2.609375, "learning_rate": 1.6193373033360904e-05, "loss": 1.7962, "step": 7200 }, { "epoch": 0.58, "grad_norm": 2.328125, "learning_rate": 1.6173662165141084e-05, "loss": 1.8078, "step": 7220 }, { "epoch": 0.58, "grad_norm": 2.5, "learning_rate": 1.6153912462342073e-05, "loss": 1.8051, "step": 7240 }, { "epoch": 0.58, "grad_norm": 2.15625, "learning_rate": 1.6134124049196688e-05, "loss": 1.8057, "step": 7260 }, { "epoch": 0.58, "grad_norm": 2.78125, "learning_rate": 1.6114297050181235e-05, "loss": 1.8153, "step": 7280 }, { "epoch": 0.58, "grad_norm": 2.171875, "learning_rate": 1.6094431590014746e-05, "loss": 1.8047, "step": 7300 }, { "epoch": 0.58, "grad_norm": 2.359375, "learning_rate": 1.6074527793658186e-05, "loss": 1.8069, "step": 7320 }, { "epoch": 0.59, "grad_norm": 2.4375, "learning_rate": 1.605458578631367e-05, "loss": 1.7919, "step": 7340 }, { "epoch": 0.59, "grad_norm": 2.390625, "learning_rate": 1.6034605693423676e-05, "loss": 1.8104, "step": 7360 }, { "epoch": 0.59, "grad_norm": 2.703125, "learning_rate": 1.6014587640670244e-05, "loss": 1.7971, "step": 7380 }, { "epoch": 0.59, "grad_norm": 2.703125, "learning_rate": 1.599453175397421e-05, "loss": 1.7987, "step": 7400 }, { "epoch": 0.59, "grad_norm": 2.46875, "learning_rate": 1.597443815949439e-05, "loss": 1.8057, "step": 7420 }, { "epoch": 0.59, "grad_norm": 2.359375, "learning_rate": 1.59543069836268e-05, "loss": 1.7817, "step": 7440 }, { "epoch": 0.6, "grad_norm": 2.515625, "learning_rate": 1.5934138353003845e-05, "loss": 1.8009, "step": 7460 }, { "epoch": 0.6, "grad_norm": 2.5, "learning_rate": 1.5913932394493548e-05, "loss": 1.7939, "step": 7480 }, { "epoch": 0.6, "grad_norm": 2.40625, "learning_rate": 1.589368923519874e-05, "loss": 1.8014, "step": 7500 }, { "epoch": 0.6, "grad_norm": 2.328125, "learning_rate": 1.587340900245624e-05, "loss": 1.7879, "step": 7520 }, { "epoch": 0.6, "grad_norm": 2.25, "learning_rate": 1.5853091823836087e-05, "loss": 1.8, "step": 7540 }, { "epoch": 0.6, "grad_norm": 2.140625, "learning_rate": 1.5832737827140727e-05, "loss": 1.7894, "step": 7560 }, { "epoch": 0.61, "grad_norm": 2.6875, "learning_rate": 1.581234714040419e-05, "loss": 1.7845, "step": 7580 }, { "epoch": 0.61, "grad_norm": 2.359375, "learning_rate": 1.5791919891891313e-05, "loss": 1.7841, "step": 7600 }, { "epoch": 0.61, "grad_norm": 2.3125, "learning_rate": 1.5771456210096913e-05, "loss": 1.8057, "step": 7620 }, { "epoch": 0.61, "grad_norm": 2.84375, "learning_rate": 1.5750956223744985e-05, "loss": 1.7961, "step": 7640 }, { "epoch": 0.61, "grad_norm": 2.28125, "learning_rate": 1.5730420061787898e-05, "loss": 1.7908, "step": 7660 }, { "epoch": 0.61, "grad_norm": 2.578125, "learning_rate": 1.5709847853405574e-05, "loss": 1.7888, "step": 7680 }, { "epoch": 0.61, "grad_norm": 2.328125, "learning_rate": 1.568923972800468e-05, "loss": 1.7742, "step": 7700 }, { "epoch": 0.62, "grad_norm": 2.484375, "learning_rate": 1.566859581521782e-05, "loss": 1.7902, "step": 7720 }, { "epoch": 0.62, "grad_norm": 2.46875, "learning_rate": 1.5647916244902707e-05, "loss": 1.7918, "step": 7740 }, { "epoch": 0.62, "grad_norm": 2.390625, "learning_rate": 1.5627201147141357e-05, "loss": 1.806, "step": 7760 }, { "epoch": 0.62, "grad_norm": 2.25, "learning_rate": 1.5606450652239263e-05, "loss": 1.7925, "step": 7780 }, { "epoch": 0.62, "grad_norm": 2.578125, "learning_rate": 1.5585664890724584e-05, "loss": 1.7921, "step": 7800 }, { "epoch": 0.62, "grad_norm": 2.390625, "learning_rate": 1.5564843993347313e-05, "loss": 1.7901, "step": 7820 }, { "epoch": 0.63, "grad_norm": 2.390625, "learning_rate": 1.5543988091078467e-05, "loss": 1.7881, "step": 7840 }, { "epoch": 0.63, "grad_norm": 2.6875, "learning_rate": 1.5523097315109245e-05, "loss": 1.7948, "step": 7860 }, { "epoch": 0.63, "grad_norm": 2.484375, "learning_rate": 1.5502171796850226e-05, "loss": 1.7958, "step": 7880 }, { "epoch": 0.63, "grad_norm": 2.40625, "learning_rate": 1.5481211667930528e-05, "loss": 1.7911, "step": 7900 }, { "epoch": 0.63, "grad_norm": 2.453125, "learning_rate": 1.5460217060196986e-05, "loss": 1.7709, "step": 7920 }, { "epoch": 0.63, "grad_norm": 2.40625, "learning_rate": 1.54391881057133e-05, "loss": 1.7914, "step": 7940 }, { "epoch": 0.64, "grad_norm": 2.328125, "learning_rate": 1.541812493675925e-05, "loss": 1.8062, "step": 7960 }, { "epoch": 0.64, "grad_norm": 2.34375, "learning_rate": 1.539702768582982e-05, "loss": 1.8074, "step": 7980 }, { "epoch": 0.64, "grad_norm": 2.75, "learning_rate": 1.5375896485634386e-05, "loss": 1.7788, "step": 8000 }, { "epoch": 0.64, "grad_norm": 2.453125, "learning_rate": 1.5354731469095884e-05, "loss": 1.7814, "step": 8020 }, { "epoch": 0.64, "grad_norm": 2.421875, "learning_rate": 1.5333532769349955e-05, "loss": 1.7854, "step": 8040 }, { "epoch": 0.64, "grad_norm": 2.65625, "learning_rate": 1.5312300519744135e-05, "loss": 1.7869, "step": 8060 }, { "epoch": 0.65, "grad_norm": 2.625, "learning_rate": 1.529103485383699e-05, "loss": 1.7736, "step": 8080 }, { "epoch": 0.65, "grad_norm": 2.28125, "learning_rate": 1.5269735905397278e-05, "loss": 1.7966, "step": 8100 }, { "epoch": 0.65, "grad_norm": 2.59375, "learning_rate": 1.524840380840314e-05, "loss": 1.7907, "step": 8120 }, { "epoch": 0.65, "grad_norm": 2.671875, "learning_rate": 1.5227038697041216e-05, "loss": 1.7767, "step": 8140 }, { "epoch": 0.65, "grad_norm": 2.234375, "learning_rate": 1.520564070570582e-05, "loss": 1.7963, "step": 8160 }, { "epoch": 0.65, "grad_norm": 2.421875, "learning_rate": 1.5184209968998098e-05, "loss": 1.7822, "step": 8180 }, { "epoch": 0.65, "grad_norm": 2.390625, "learning_rate": 1.5162746621725176e-05, "loss": 1.7806, "step": 8200 }, { "epoch": 0.66, "grad_norm": 2.359375, "learning_rate": 1.5141250798899307e-05, "loss": 1.7836, "step": 8220 }, { "epoch": 0.66, "grad_norm": 2.640625, "learning_rate": 1.5119722635737035e-05, "loss": 1.7825, "step": 8240 }, { "epoch": 0.66, "grad_norm": 2.953125, "learning_rate": 1.5098162267658323e-05, "loss": 1.7877, "step": 8260 }, { "epoch": 0.66, "grad_norm": 2.3125, "learning_rate": 1.5076569830285736e-05, "loss": 1.791, "step": 8280 }, { "epoch": 0.66, "grad_norm": 2.4375, "learning_rate": 1.5054945459443544e-05, "loss": 1.781, "step": 8300 }, { "epoch": 0.66, "grad_norm": 2.609375, "learning_rate": 1.5033289291156905e-05, "loss": 1.7873, "step": 8320 }, { "epoch": 0.67, "grad_norm": 2.296875, "learning_rate": 1.501160146165099e-05, "loss": 1.7963, "step": 8340 }, { "epoch": 0.67, "grad_norm": 2.515625, "learning_rate": 1.498988210735013e-05, "loss": 1.794, "step": 8360 }, { "epoch": 0.67, "grad_norm": 2.265625, "learning_rate": 1.4968131364876952e-05, "loss": 1.8001, "step": 8380 }, { "epoch": 0.67, "grad_norm": 2.46875, "learning_rate": 1.4946349371051541e-05, "loss": 1.7728, "step": 8400 }, { "epoch": 0.67, "grad_norm": 2.59375, "learning_rate": 1.4924536262890557e-05, "loss": 1.7732, "step": 8420 }, { "epoch": 0.67, "grad_norm": 2.671875, "learning_rate": 1.4902692177606368e-05, "loss": 1.7822, "step": 8440 }, { "epoch": 0.68, "grad_norm": 2.609375, "learning_rate": 1.4880817252606226e-05, "loss": 1.7862, "step": 8460 }, { "epoch": 0.68, "grad_norm": 2.421875, "learning_rate": 1.4858911625491352e-05, "loss": 1.801, "step": 8480 }, { "epoch": 0.68, "grad_norm": 2.40625, "learning_rate": 1.4836975434056102e-05, "loss": 1.8229, "step": 8500 }, { "epoch": 0.68, "grad_norm": 2.578125, "learning_rate": 1.48150088162871e-05, "loss": 1.7954, "step": 8520 }, { "epoch": 0.68, "grad_norm": 2.46875, "learning_rate": 1.4793011910362352e-05, "loss": 1.7996, "step": 8540 }, { "epoch": 0.68, "grad_norm": 2.40625, "learning_rate": 1.4770984854650397e-05, "loss": 1.8033, "step": 8560 }, { "epoch": 0.68, "grad_norm": 2.140625, "learning_rate": 1.4748927787709417e-05, "loss": 1.7883, "step": 8580 }, { "epoch": 0.69, "grad_norm": 2.265625, "learning_rate": 1.4726840848286385e-05, "loss": 1.7939, "step": 8600 }, { "epoch": 0.69, "grad_norm": 2.421875, "learning_rate": 1.4704724175316181e-05, "loss": 1.7975, "step": 8620 }, { "epoch": 0.69, "grad_norm": 2.46875, "learning_rate": 1.4682577907920707e-05, "loss": 1.8029, "step": 8640 }, { "epoch": 0.69, "grad_norm": 2.359375, "learning_rate": 1.4660402185408046e-05, "loss": 1.7807, "step": 8660 }, { "epoch": 0.69, "grad_norm": 2.5, "learning_rate": 1.4638197147271548e-05, "loss": 1.7953, "step": 8680 }, { "epoch": 0.69, "grad_norm": 2.4375, "learning_rate": 1.4615962933188981e-05, "loss": 1.7902, "step": 8700 }, { "epoch": 0.7, "grad_norm": 2.5625, "learning_rate": 1.4593699683021625e-05, "loss": 1.7849, "step": 8720 }, { "epoch": 0.7, "grad_norm": 2.765625, "learning_rate": 1.4571407536813422e-05, "loss": 1.7814, "step": 8740 }, { "epoch": 0.7, "grad_norm": 2.375, "learning_rate": 1.4549086634790075e-05, "loss": 1.7932, "step": 8760 }, { "epoch": 0.7, "grad_norm": 2.4375, "learning_rate": 1.4526737117358167e-05, "loss": 1.789, "step": 8780 }, { "epoch": 0.7, "grad_norm": 2.546875, "learning_rate": 1.4504359125104292e-05, "loss": 1.7828, "step": 8800 }, { "epoch": 0.7, "grad_norm": 2.34375, "learning_rate": 1.4481952798794152e-05, "loss": 1.7876, "step": 8820 }, { "epoch": 0.71, "grad_norm": 2.328125, "learning_rate": 1.4459518279371692e-05, "loss": 1.794, "step": 8840 }, { "epoch": 0.71, "grad_norm": 2.734375, "learning_rate": 1.4437055707958184e-05, "loss": 1.7919, "step": 8860 }, { "epoch": 0.71, "grad_norm": 2.421875, "learning_rate": 1.4414565225851371e-05, "loss": 1.7846, "step": 8880 }, { "epoch": 0.71, "grad_norm": 2.453125, "learning_rate": 1.4392046974524565e-05, "loss": 1.7843, "step": 8900 }, { "epoch": 0.71, "grad_norm": 2.40625, "learning_rate": 1.4369501095625747e-05, "loss": 1.7726, "step": 8920 }, { "epoch": 0.71, "grad_norm": 2.390625, "learning_rate": 1.4346927730976691e-05, "loss": 1.7836, "step": 8940 }, { "epoch": 0.72, "grad_norm": 2.8125, "learning_rate": 1.4324327022572073e-05, "loss": 1.776, "step": 8960 }, { "epoch": 0.72, "grad_norm": 2.296875, "learning_rate": 1.4301699112578557e-05, "loss": 1.7903, "step": 8980 }, { "epoch": 0.72, "grad_norm": 2.5, "learning_rate": 1.4279044143333926e-05, "loss": 1.7757, "step": 9000 }, { "epoch": 0.72, "grad_norm": 2.3125, "learning_rate": 1.425636225734617e-05, "loss": 1.7705, "step": 9020 }, { "epoch": 0.72, "grad_norm": 2.5, "learning_rate": 1.42336535972926e-05, "loss": 1.8011, "step": 9040 }, { "epoch": 0.72, "grad_norm": 2.375, "learning_rate": 1.4210918306018937e-05, "loss": 1.7795, "step": 9060 }, { "epoch": 0.72, "grad_norm": 2.78125, "learning_rate": 1.4188156526538435e-05, "loss": 1.7965, "step": 9080 }, { "epoch": 0.73, "grad_norm": 2.796875, "learning_rate": 1.4165368402030952e-05, "loss": 1.7631, "step": 9100 }, { "epoch": 0.73, "grad_norm": 2.453125, "learning_rate": 1.4142554075842083e-05, "loss": 1.7949, "step": 9120 }, { "epoch": 0.73, "grad_norm": 2.625, "learning_rate": 1.4119713691482228e-05, "loss": 1.785, "step": 9140 }, { "epoch": 0.73, "grad_norm": 2.46875, "learning_rate": 1.4096847392625708e-05, "loss": 1.777, "step": 9160 }, { "epoch": 0.73, "grad_norm": 2.546875, "learning_rate": 1.4073955323109859e-05, "loss": 1.779, "step": 9180 }, { "epoch": 0.73, "grad_norm": 2.5, "learning_rate": 1.4051037626934112e-05, "loss": 1.7815, "step": 9200 }, { "epoch": 0.74, "grad_norm": 2.828125, "learning_rate": 1.4028094448259113e-05, "loss": 1.7852, "step": 9220 }, { "epoch": 0.74, "grad_norm": 2.375, "learning_rate": 1.4005125931405792e-05, "loss": 1.7999, "step": 9240 }, { "epoch": 0.74, "grad_norm": 2.390625, "learning_rate": 1.3982132220854472e-05, "loss": 1.791, "step": 9260 }, { "epoch": 0.74, "grad_norm": 2.484375, "learning_rate": 1.3959113461243952e-05, "loss": 1.7836, "step": 9280 }, { "epoch": 0.74, "grad_norm": 2.6875, "learning_rate": 1.3936069797370591e-05, "loss": 1.778, "step": 9300 }, { "epoch": 0.74, "grad_norm": 2.234375, "learning_rate": 1.3913001374187421e-05, "loss": 1.8065, "step": 9320 }, { "epoch": 0.75, "grad_norm": 2.671875, "learning_rate": 1.3889908336803198e-05, "loss": 1.8035, "step": 9340 }, { "epoch": 0.75, "grad_norm": 2.4375, "learning_rate": 1.3866790830481529e-05, "loss": 1.7789, "step": 9360 }, { "epoch": 0.75, "grad_norm": 2.65625, "learning_rate": 1.3843649000639933e-05, "loss": 1.7706, "step": 9380 }, { "epoch": 0.75, "grad_norm": 2.546875, "learning_rate": 1.3820482992848929e-05, "loss": 1.7685, "step": 9400 }, { "epoch": 0.75, "grad_norm": 2.46875, "learning_rate": 1.3797292952831127e-05, "loss": 1.7687, "step": 9420 }, { "epoch": 0.75, "grad_norm": 2.375, "learning_rate": 1.3774079026460308e-05, "loss": 1.7768, "step": 9440 }, { "epoch": 0.76, "grad_norm": 2.5, "learning_rate": 1.3750841359760511e-05, "loss": 1.7878, "step": 9460 }, { "epoch": 0.76, "grad_norm": 2.40625, "learning_rate": 1.37275800989051e-05, "loss": 1.792, "step": 9480 }, { "epoch": 0.76, "grad_norm": 2.75, "learning_rate": 1.3704295390215868e-05, "loss": 1.7822, "step": 9500 }, { "epoch": 0.76, "grad_norm": 2.4375, "learning_rate": 1.3680987380162095e-05, "loss": 1.7831, "step": 9520 }, { "epoch": 0.76, "grad_norm": 2.453125, "learning_rate": 1.3657656215359634e-05, "loss": 1.7819, "step": 9540 }, { "epoch": 0.76, "grad_norm": 2.546875, "learning_rate": 1.3634302042569995e-05, "loss": 1.7839, "step": 9560 }, { "epoch": 0.76, "grad_norm": 2.3125, "learning_rate": 1.3610925008699413e-05, "loss": 1.7905, "step": 9580 }, { "epoch": 0.77, "grad_norm": 2.484375, "learning_rate": 1.3587525260797934e-05, "loss": 1.7785, "step": 9600 }, { "epoch": 0.77, "grad_norm": 2.328125, "learning_rate": 1.3564102946058468e-05, "loss": 1.7846, "step": 9620 }, { "epoch": 0.77, "grad_norm": 2.53125, "learning_rate": 1.3540658211815898e-05, "loss": 1.7841, "step": 9640 }, { "epoch": 0.77, "grad_norm": 2.453125, "learning_rate": 1.3517191205546121e-05, "loss": 1.774, "step": 9660 }, { "epoch": 0.77, "grad_norm": 2.578125, "learning_rate": 1.3493702074865139e-05, "loss": 1.7947, "step": 9680 }, { "epoch": 0.77, "grad_norm": 2.390625, "learning_rate": 1.3470190967528118e-05, "loss": 1.7843, "step": 9700 }, { "epoch": 0.78, "grad_norm": 2.296875, "learning_rate": 1.3446658031428474e-05, "loss": 1.7796, "step": 9720 }, { "epoch": 0.78, "grad_norm": 2.5, "learning_rate": 1.3423103414596929e-05, "loss": 1.7713, "step": 9740 }, { "epoch": 0.78, "grad_norm": 2.734375, "learning_rate": 1.3399527265200581e-05, "loss": 1.7769, "step": 9760 }, { "epoch": 0.78, "grad_norm": 2.546875, "learning_rate": 1.3375929731541986e-05, "loss": 1.7823, "step": 9780 }, { "epoch": 0.78, "grad_norm": 2.59375, "learning_rate": 1.3352310962058202e-05, "loss": 1.7642, "step": 9800 }, { "epoch": 0.78, "grad_norm": 2.296875, "learning_rate": 1.332867110531988e-05, "loss": 1.7841, "step": 9820 }, { "epoch": 0.79, "grad_norm": 2.609375, "learning_rate": 1.3305010310030311e-05, "loss": 1.7897, "step": 9840 }, { "epoch": 0.79, "grad_norm": 2.421875, "learning_rate": 1.3281328725024496e-05, "loss": 1.7813, "step": 9860 }, { "epoch": 0.79, "grad_norm": 2.359375, "learning_rate": 1.3257626499268217e-05, "loss": 1.7828, "step": 9880 }, { "epoch": 0.79, "grad_norm": 2.390625, "learning_rate": 1.3233903781857084e-05, "loss": 1.7809, "step": 9900 }, { "epoch": 0.79, "grad_norm": 2.453125, "learning_rate": 1.3210160722015619e-05, "loss": 1.7768, "step": 9920 }, { "epoch": 0.79, "grad_norm": 2.46875, "learning_rate": 1.3186397469096295e-05, "loss": 1.7816, "step": 9940 }, { "epoch": 0.8, "grad_norm": 2.703125, "learning_rate": 1.3162614172578614e-05, "loss": 1.7741, "step": 9960 }, { "epoch": 0.8, "grad_norm": 2.484375, "learning_rate": 1.3138810982068154e-05, "loss": 1.7801, "step": 9980 }, { "epoch": 0.8, "grad_norm": 2.46875, "learning_rate": 1.3114988047295638e-05, "loss": 1.7711, "step": 10000 }, { "epoch": 0.8, "grad_norm": 2.3125, "learning_rate": 1.3091145518115982e-05, "loss": 1.7807, "step": 10020 }, { "epoch": 0.8, "grad_norm": 2.578125, "learning_rate": 1.3067283544507366e-05, "loss": 1.7835, "step": 10040 }, { "epoch": 0.8, "grad_norm": 2.609375, "learning_rate": 1.3043402276570276e-05, "loss": 1.7746, "step": 10060 }, { "epoch": 0.8, "grad_norm": 2.53125, "learning_rate": 1.3019501864526565e-05, "loss": 1.7742, "step": 10080 }, { "epoch": 0.81, "grad_norm": 2.25, "learning_rate": 1.2995582458718518e-05, "loss": 1.7811, "step": 10100 }, { "epoch": 0.81, "grad_norm": 2.3125, "learning_rate": 1.2971644209607893e-05, "loss": 1.7684, "step": 10120 }, { "epoch": 0.81, "grad_norm": 2.703125, "learning_rate": 1.2947687267774973e-05, "loss": 1.7778, "step": 10140 }, { "epoch": 0.81, "grad_norm": 2.46875, "learning_rate": 1.2923711783917637e-05, "loss": 1.7587, "step": 10160 }, { "epoch": 0.81, "grad_norm": 2.296875, "learning_rate": 1.2899717908850385e-05, "loss": 1.784, "step": 10180 }, { "epoch": 0.81, "grad_norm": 2.515625, "learning_rate": 1.2875705793503424e-05, "loss": 1.773, "step": 10200 }, { "epoch": 0.82, "grad_norm": 2.6875, "learning_rate": 1.2851675588921677e-05, "loss": 1.7721, "step": 10220 }, { "epoch": 0.82, "grad_norm": 2.75, "learning_rate": 1.2827627446263877e-05, "loss": 1.7781, "step": 10240 }, { "epoch": 0.82, "grad_norm": 2.734375, "learning_rate": 1.2803561516801575e-05, "loss": 1.7935, "step": 10260 }, { "epoch": 0.82, "grad_norm": 2.34375, "learning_rate": 1.2779477951918217e-05, "loss": 1.7746, "step": 10280 }, { "epoch": 0.82, "grad_norm": 2.453125, "learning_rate": 1.2755376903108183e-05, "loss": 1.7783, "step": 10300 }, { "epoch": 0.82, "grad_norm": 2.421875, "learning_rate": 1.2731258521975829e-05, "loss": 1.7812, "step": 10320 }, { "epoch": 0.83, "grad_norm": 2.46875, "learning_rate": 1.2707122960234544e-05, "loss": 1.7742, "step": 10340 }, { "epoch": 0.83, "grad_norm": 2.671875, "learning_rate": 1.2682970369705773e-05, "loss": 1.7585, "step": 10360 }, { "epoch": 0.83, "grad_norm": 2.671875, "learning_rate": 1.2658800902318103e-05, "loss": 1.7848, "step": 10380 }, { "epoch": 0.83, "grad_norm": 2.609375, "learning_rate": 1.2634614710106266e-05, "loss": 1.7784, "step": 10400 }, { "epoch": 0.83, "grad_norm": 2.421875, "learning_rate": 1.2610411945210199e-05, "loss": 1.7762, "step": 10420 }, { "epoch": 0.83, "grad_norm": 2.359375, "learning_rate": 1.2586192759874094e-05, "loss": 1.7686, "step": 10440 }, { "epoch": 0.84, "grad_norm": 2.734375, "learning_rate": 1.2561957306445428e-05, "loss": 1.7861, "step": 10460 }, { "epoch": 0.84, "grad_norm": 2.53125, "learning_rate": 1.253770573737402e-05, "loss": 1.7744, "step": 10480 }, { "epoch": 0.84, "grad_norm": 2.40625, "learning_rate": 1.2513438205211048e-05, "loss": 1.7703, "step": 10500 }, { "epoch": 0.84, "grad_norm": 2.65625, "learning_rate": 1.2489154862608111e-05, "loss": 1.7785, "step": 10520 }, { "epoch": 0.84, "grad_norm": 2.453125, "learning_rate": 1.2464855862316263e-05, "loss": 1.7789, "step": 10540 }, { "epoch": 0.84, "grad_norm": 2.484375, "learning_rate": 1.244054135718505e-05, "loss": 1.7766, "step": 10560 }, { "epoch": 0.84, "grad_norm": 2.484375, "learning_rate": 1.2416211500161546e-05, "loss": 1.7805, "step": 10580 }, { "epoch": 0.85, "grad_norm": 2.5, "learning_rate": 1.2391866444289394e-05, "loss": 1.7769, "step": 10600 }, { "epoch": 0.85, "grad_norm": 2.5, "learning_rate": 1.2367506342707851e-05, "loss": 1.7727, "step": 10620 }, { "epoch": 0.85, "grad_norm": 2.640625, "learning_rate": 1.2343131348650806e-05, "loss": 1.7603, "step": 10640 }, { "epoch": 0.85, "grad_norm": 2.375, "learning_rate": 1.231874161544583e-05, "loss": 1.7681, "step": 10660 }, { "epoch": 0.85, "grad_norm": 2.515625, "learning_rate": 1.2294337296513219e-05, "loss": 1.7705, "step": 10680 }, { "epoch": 0.85, "grad_norm": 2.375, "learning_rate": 1.2269918545365e-05, "loss": 1.7692, "step": 10700 }, { "epoch": 0.86, "grad_norm": 2.53125, "learning_rate": 1.2245485515604004e-05, "loss": 1.7685, "step": 10720 }, { "epoch": 0.86, "grad_norm": 2.84375, "learning_rate": 1.2221038360922863e-05, "loss": 1.7873, "step": 10740 }, { "epoch": 0.86, "grad_norm": 2.5, "learning_rate": 1.219657723510307e-05, "loss": 1.779, "step": 10760 }, { "epoch": 0.86, "grad_norm": 2.65625, "learning_rate": 1.2172102292013994e-05, "loss": 1.7963, "step": 10780 }, { "epoch": 0.86, "grad_norm": 2.4375, "learning_rate": 1.2147613685611928e-05, "loss": 1.7737, "step": 10800 }, { "epoch": 0.86, "grad_norm": 2.671875, "learning_rate": 1.212311156993911e-05, "loss": 1.7578, "step": 10820 }, { "epoch": 0.87, "grad_norm": 2.875, "learning_rate": 1.2098596099122745e-05, "loss": 1.7649, "step": 10840 }, { "epoch": 0.87, "grad_norm": 2.734375, "learning_rate": 1.2074067427374068e-05, "loss": 1.782, "step": 10860 }, { "epoch": 0.87, "grad_norm": 2.640625, "learning_rate": 1.2049525708987331e-05, "loss": 1.7729, "step": 10880 }, { "epoch": 0.87, "grad_norm": 2.546875, "learning_rate": 1.2024971098338868e-05, "loss": 1.7769, "step": 10900 }, { "epoch": 0.87, "grad_norm": 2.40625, "learning_rate": 1.2000403749886108e-05, "loss": 1.7761, "step": 10920 }, { "epoch": 0.87, "grad_norm": 2.640625, "learning_rate": 1.1975823818166596e-05, "loss": 1.7476, "step": 10940 }, { "epoch": 0.87, "grad_norm": 2.609375, "learning_rate": 1.1951231457797047e-05, "loss": 1.7814, "step": 10960 }, { "epoch": 0.88, "grad_norm": 2.703125, "learning_rate": 1.1926626823472338e-05, "loss": 1.7691, "step": 10980 }, { "epoch": 0.88, "grad_norm": 2.65625, "learning_rate": 1.1902010069964569e-05, "loss": 1.7756, "step": 11000 }, { "epoch": 0.88, "grad_norm": 2.5625, "learning_rate": 1.1877381352122064e-05, "loss": 1.7833, "step": 11020 }, { "epoch": 0.88, "grad_norm": 2.4375, "learning_rate": 1.1852740824868416e-05, "loss": 1.7659, "step": 11040 }, { "epoch": 0.88, "grad_norm": 2.46875, "learning_rate": 1.1828088643201492e-05, "loss": 1.772, "step": 11060 }, { "epoch": 0.88, "grad_norm": 2.546875, "learning_rate": 1.180342496219248e-05, "loss": 1.7516, "step": 11080 }, { "epoch": 0.89, "grad_norm": 2.609375, "learning_rate": 1.17787499369849e-05, "loss": 1.7647, "step": 11100 }, { "epoch": 0.89, "grad_norm": 2.515625, "learning_rate": 1.1754063722793624e-05, "loss": 1.769, "step": 11120 }, { "epoch": 0.89, "grad_norm": 2.671875, "learning_rate": 1.1729366474903923e-05, "loss": 1.7813, "step": 11140 }, { "epoch": 0.89, "grad_norm": 2.734375, "learning_rate": 1.1704658348670455e-05, "loss": 1.7669, "step": 11160 }, { "epoch": 0.89, "grad_norm": 2.328125, "learning_rate": 1.1679939499516317e-05, "loss": 1.7846, "step": 11180 }, { "epoch": 0.89, "grad_norm": 2.375, "learning_rate": 1.165521008293206e-05, "loss": 1.7719, "step": 11200 }, { "epoch": 0.9, "grad_norm": 2.65625, "learning_rate": 1.1630470254474697e-05, "loss": 1.7625, "step": 11220 }, { "epoch": 0.9, "grad_norm": 2.578125, "learning_rate": 1.1605720169766752e-05, "loss": 1.7721, "step": 11240 }, { "epoch": 0.9, "grad_norm": 2.53125, "learning_rate": 1.1580959984495243e-05, "loss": 1.7558, "step": 11260 }, { "epoch": 0.9, "grad_norm": 2.609375, "learning_rate": 1.1556189854410744e-05, "loss": 1.7633, "step": 11280 }, { "epoch": 0.9, "grad_norm": 2.4375, "learning_rate": 1.1531409935326377e-05, "loss": 1.7632, "step": 11300 }, { "epoch": 0.9, "grad_norm": 2.5, "learning_rate": 1.1506620383116835e-05, "loss": 1.7925, "step": 11320 }, { "epoch": 0.91, "grad_norm": 2.453125, "learning_rate": 1.1481821353717418e-05, "loss": 1.7667, "step": 11340 }, { "epoch": 0.91, "grad_norm": 2.484375, "learning_rate": 1.145701300312303e-05, "loss": 1.7733, "step": 11360 }, { "epoch": 0.91, "grad_norm": 2.703125, "learning_rate": 1.1432195487387223e-05, "loss": 1.7772, "step": 11380 }, { "epoch": 0.91, "grad_norm": 2.640625, "learning_rate": 1.1407368962621184e-05, "loss": 1.7459, "step": 11400 }, { "epoch": 0.91, "grad_norm": 2.59375, "learning_rate": 1.1382533584992783e-05, "loss": 1.7608, "step": 11420 }, { "epoch": 0.91, "grad_norm": 2.4375, "learning_rate": 1.1357689510725571e-05, "loss": 1.749, "step": 11440 }, { "epoch": 0.91, "grad_norm": 2.5625, "learning_rate": 1.1332836896097808e-05, "loss": 1.77, "step": 11460 }, { "epoch": 0.92, "grad_norm": 2.5, "learning_rate": 1.1307975897441473e-05, "loss": 1.7676, "step": 11480 }, { "epoch": 0.92, "grad_norm": 2.546875, "learning_rate": 1.1283106671141282e-05, "loss": 1.7755, "step": 11500 }, { "epoch": 0.92, "grad_norm": 2.5625, "learning_rate": 1.1258229373633713e-05, "loss": 1.7742, "step": 11520 }, { "epoch": 0.92, "grad_norm": 2.828125, "learning_rate": 1.1233344161406008e-05, "loss": 1.7606, "step": 11540 }, { "epoch": 0.92, "grad_norm": 2.640625, "learning_rate": 1.12084511909952e-05, "loss": 1.7749, "step": 11560 }, { "epoch": 0.92, "grad_norm": 2.640625, "learning_rate": 1.1183550618987118e-05, "loss": 1.7868, "step": 11580 }, { "epoch": 0.93, "grad_norm": 2.640625, "learning_rate": 1.1158642602015415e-05, "loss": 1.7712, "step": 11600 }, { "epoch": 0.93, "grad_norm": 2.46875, "learning_rate": 1.1133727296760572e-05, "loss": 1.7732, "step": 11620 }, { "epoch": 0.93, "grad_norm": 2.546875, "learning_rate": 1.110880485994891e-05, "loss": 1.7672, "step": 11640 }, { "epoch": 0.93, "grad_norm": 2.375, "learning_rate": 1.1083875448351626e-05, "loss": 1.7858, "step": 11660 }, { "epoch": 0.93, "grad_norm": 2.40625, "learning_rate": 1.1058939218783772e-05, "loss": 1.7683, "step": 11680 }, { "epoch": 0.93, "grad_norm": 2.59375, "learning_rate": 1.10339963281033e-05, "loss": 1.7813, "step": 11700 }, { "epoch": 0.94, "grad_norm": 2.578125, "learning_rate": 1.100904693321006e-05, "loss": 1.7745, "step": 11720 }, { "epoch": 0.94, "grad_norm": 2.484375, "learning_rate": 1.0984091191044816e-05, "loss": 1.7848, "step": 11740 }, { "epoch": 0.94, "grad_norm": 2.5, "learning_rate": 1.0959129258588257e-05, "loss": 1.7518, "step": 11760 }, { "epoch": 0.94, "grad_norm": 2.53125, "learning_rate": 1.0934161292860008e-05, "loss": 1.7768, "step": 11780 }, { "epoch": 0.94, "grad_norm": 2.5625, "learning_rate": 1.0909187450917656e-05, "loss": 1.7602, "step": 11800 }, { "epoch": 0.94, "grad_norm": 2.515625, "learning_rate": 1.0884207889855735e-05, "loss": 1.758, "step": 11820 }, { "epoch": 0.95, "grad_norm": 2.5625, "learning_rate": 1.0859222766804778e-05, "loss": 1.7761, "step": 11840 }, { "epoch": 0.95, "grad_norm": 2.609375, "learning_rate": 1.0834232238930283e-05, "loss": 1.7606, "step": 11860 }, { "epoch": 0.95, "grad_norm": 2.59375, "learning_rate": 1.0809236463431754e-05, "loss": 1.779, "step": 11880 }, { "epoch": 0.95, "grad_norm": 2.765625, "learning_rate": 1.0784235597541708e-05, "loss": 1.771, "step": 11900 }, { "epoch": 0.95, "grad_norm": 2.5625, "learning_rate": 1.075922979852468e-05, "loss": 1.7654, "step": 11920 }, { "epoch": 0.95, "grad_norm": 2.703125, "learning_rate": 1.073421922367623e-05, "loss": 1.7758, "step": 11940 }, { "epoch": 0.95, "grad_norm": 2.453125, "learning_rate": 1.0709204030321972e-05, "loss": 1.7592, "step": 11960 }, { "epoch": 0.96, "grad_norm": 2.546875, "learning_rate": 1.068418437581656e-05, "loss": 1.7741, "step": 11980 }, { "epoch": 0.96, "grad_norm": 2.46875, "learning_rate": 1.0659160417542721e-05, "loss": 1.759, "step": 12000 }, { "epoch": 0.96, "grad_norm": 2.6875, "learning_rate": 1.0634132312910245e-05, "loss": 1.7809, "step": 12020 }, { "epoch": 0.96, "grad_norm": 2.65625, "learning_rate": 1.060910021935501e-05, "loss": 1.7811, "step": 12040 }, { "epoch": 0.96, "grad_norm": 2.59375, "learning_rate": 1.0584064294337983e-05, "loss": 1.761, "step": 12060 }, { "epoch": 0.96, "grad_norm": 2.40625, "learning_rate": 1.0559024695344233e-05, "loss": 1.7515, "step": 12080 }, { "epoch": 0.97, "grad_norm": 2.359375, "learning_rate": 1.0533981579881938e-05, "loss": 1.7861, "step": 12100 }, { "epoch": 0.97, "grad_norm": 2.546875, "learning_rate": 1.0508935105481402e-05, "loss": 1.7643, "step": 12120 }, { "epoch": 0.97, "grad_norm": 2.546875, "learning_rate": 1.0483885429694051e-05, "loss": 1.7745, "step": 12140 }, { "epoch": 0.97, "grad_norm": 2.46875, "learning_rate": 1.0458832710091448e-05, "loss": 1.7539, "step": 12160 }, { "epoch": 0.97, "grad_norm": 2.421875, "learning_rate": 1.0433777104264313e-05, "loss": 1.7546, "step": 12180 }, { "epoch": 0.97, "grad_norm": 2.53125, "learning_rate": 1.0408718769821512e-05, "loss": 1.7606, "step": 12200 }, { "epoch": 0.98, "grad_norm": 2.765625, "learning_rate": 1.0383657864389077e-05, "loss": 1.7583, "step": 12220 }, { "epoch": 0.98, "grad_norm": 2.828125, "learning_rate": 1.0358594545609207e-05, "loss": 1.7659, "step": 12240 }, { "epoch": 0.98, "grad_norm": 2.53125, "learning_rate": 1.0333528971139297e-05, "loss": 1.7601, "step": 12260 }, { "epoch": 0.98, "grad_norm": 2.421875, "learning_rate": 1.0308461298650923e-05, "loss": 1.7612, "step": 12280 }, { "epoch": 0.98, "grad_norm": 2.484375, "learning_rate": 1.0283391685828844e-05, "loss": 1.7646, "step": 12300 }, { "epoch": 0.98, "grad_norm": 2.765625, "learning_rate": 1.0258320290370051e-05, "loss": 1.7741, "step": 12320 }, { "epoch": 0.99, "grad_norm": 2.359375, "learning_rate": 1.0233247269982732e-05, "loss": 1.7616, "step": 12340 }, { "epoch": 0.99, "grad_norm": 2.625, "learning_rate": 1.0208172782385295e-05, "loss": 1.7502, "step": 12360 }, { "epoch": 0.99, "grad_norm": 2.65625, "learning_rate": 1.0183096985305385e-05, "loss": 1.7806, "step": 12380 }, { "epoch": 0.99, "grad_norm": 2.578125, "learning_rate": 1.0158020036478881e-05, "loss": 1.7728, "step": 12400 } ], "logging_steps": 20, "max_steps": 25052, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "total_flos": 1.9252630386848563e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }