{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9899409228804088,
  "eval_steps": 500,
  "global_step": 12400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 3.703125,
      "learning_rate": 1.9999968642467102e-05,
      "loss": 4.2386,
      "step": 20
    },
    {
      "epoch": 0.0,
      "grad_norm": 1.8515625,
      "learning_rate": 1.999987438156715e-05,
      "loss": 3.1965,
      "step": 40
    },
    {
      "epoch": 0.0,
      "grad_norm": 1.4921875,
      "learning_rate": 1.9999717217822316e-05,
      "loss": 2.7844,
      "step": 60
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.25,
      "learning_rate": 1.999949715222121e-05,
      "loss": 2.6013,
      "step": 80
    },
    {
      "epoch": 0.01,
      "grad_norm": 2.28125,
      "learning_rate": 1.9999214186148133e-05,
      "loss": 2.5417,
      "step": 100
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.1953125,
      "learning_rate": 1.9998868321383038e-05,
      "loss": 2.4376,
      "step": 120
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.25,
      "learning_rate": 1.9998459560101546e-05,
      "loss": 2.3875,
      "step": 140
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.3359375,
      "learning_rate": 1.9997987904874905e-05,
      "loss": 2.3568,
      "step": 160
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.359375,
      "learning_rate": 1.9997453358670004e-05,
      "loss": 2.3034,
      "step": 180
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.9296875,
      "learning_rate": 1.9996855924849337e-05,
      "loss": 2.2779,
      "step": 200
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.6015625,
      "learning_rate": 1.999619560717097e-05,
      "loss": 2.2728,
      "step": 220
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.4921875,
      "learning_rate": 1.9995472409788548e-05,
      "loss": 2.2436,
      "step": 240
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.625,
      "learning_rate": 1.999468633725125e-05,
      "loss": 2.2062,
      "step": 260
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.6875,
      "learning_rate": 1.9993837394503745e-05,
      "loss": 2.1873,
      "step": 280
    },
    {
      "epoch": 0.02,
      "grad_norm": 2.328125,
      "learning_rate": 1.99929255868862e-05,
      "loss": 2.1973,
      "step": 300
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.5546875,
      "learning_rate": 1.999195092013422e-05,
      "loss": 2.1891,
      "step": 320
    },
    {
      "epoch": 0.03,
      "grad_norm": 2.8125,
      "learning_rate": 1.99909134003788e-05,
      "loss": 2.1813,
      "step": 340
    },
    {
      "epoch": 0.03,
      "grad_norm": 2.1875,
      "learning_rate": 1.998981303414633e-05,
      "loss": 2.1609,
      "step": 360
    },
    {
      "epoch": 0.03,
      "grad_norm": 2.234375,
      "learning_rate": 1.9988649828358504e-05,
      "loss": 2.1693,
      "step": 380
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.8359375,
      "learning_rate": 1.9987423790332315e-05,
      "loss": 2.1465,
      "step": 400
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.71875,
      "learning_rate": 1.9986134927779986e-05,
      "loss": 2.1387,
      "step": 420
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.734375,
      "learning_rate": 1.998478324880893e-05,
      "loss": 2.1236,
      "step": 440
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.7265625,
      "learning_rate": 1.9983368761921703e-05,
      "loss": 2.1144,
      "step": 460
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.640625,
      "learning_rate": 1.9981891476015936e-05,
      "loss": 2.1164,
      "step": 480
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.65625,
      "learning_rate": 1.99803514003843e-05,
      "loss": 2.1083,
      "step": 500
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.8515625,
      "learning_rate": 1.9978748544714427e-05,
      "loss": 2.0906,
      "step": 520
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.890625,
      "learning_rate": 1.997708291908886e-05,
      "loss": 2.1043,
      "step": 540
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.0,
      "learning_rate": 1.9975354533984995e-05,
      "loss": 2.1028,
      "step": 560
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.8984375,
      "learning_rate": 1.9973563400274994e-05,
      "loss": 2.082,
      "step": 580
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.1875,
      "learning_rate": 1.9971709529225754e-05,
      "loss": 2.0806,
      "step": 600
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.375,
      "learning_rate": 1.9969792932498783e-05,
      "loss": 2.0803,
      "step": 620
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.84375,
      "learning_rate": 1.9967813622150177e-05,
      "loss": 2.0731,
      "step": 640
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.015625,
      "learning_rate": 1.996577161063052e-05,
      "loss": 2.0662,
      "step": 660
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.859375,
      "learning_rate": 1.99636669107848e-05,
      "loss": 2.0472,
      "step": 680
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.9609375,
      "learning_rate": 1.996149953585235e-05,
      "loss": 2.0562,
      "step": 700
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.84375,
      "learning_rate": 1.9959269499466746e-05,
      "loss": 2.0587,
      "step": 720
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.890625,
      "learning_rate": 1.9956976815655723e-05,
      "loss": 2.0576,
      "step": 740
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.9609375,
      "learning_rate": 1.99546214988411e-05,
      "loss": 2.0508,
      "step": 760
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.953125,
      "learning_rate": 1.9952203563838676e-05,
      "loss": 2.034,
      "step": 780
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.8046875,
      "learning_rate": 1.9949723025858136e-05,
      "loss": 2.0259,
      "step": 800
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.078125,
      "learning_rate": 1.994717990050297e-05,
      "loss": 2.0439,
      "step": 820
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.875,
      "learning_rate": 1.9944574203770365e-05,
      "loss": 2.0371,
      "step": 840
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.515625,
      "learning_rate": 1.994190595205109e-05,
      "loss": 2.0375,
      "step": 860
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.0625,
      "learning_rate": 1.9939175162129427e-05,
      "loss": 2.0227,
      "step": 880
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.078125,
      "learning_rate": 1.9936381851183032e-05,
      "loss": 2.0182,
      "step": 900
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.8203125,
      "learning_rate": 1.9933526036782852e-05,
      "loss": 2.0208,
      "step": 920
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.9921875,
      "learning_rate": 1.993060773689299e-05,
      "loss": 2.0177,
      "step": 940
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.8125,
      "learning_rate": 1.992762696987062e-05,
      "loss": 2.0208,
      "step": 960
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.0,
      "learning_rate": 1.9924583754465842e-05,
      "loss": 1.9938,
      "step": 980
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.109375,
      "learning_rate": 1.9921478109821598e-05,
      "loss": 2.0132,
      "step": 1000
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.8046875,
      "learning_rate": 1.9918310055473515e-05,
      "loss": 2.0062,
      "step": 1020
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.8828125,
      "learning_rate": 1.991507961134981e-05,
      "loss": 2.0074,
      "step": 1040
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.046875,
      "learning_rate": 1.9911786797771144e-05,
      "loss": 2.0153,
      "step": 1060
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.078125,
      "learning_rate": 1.990843163545052e-05,
      "loss": 1.996,
      "step": 1080
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.21875,
      "learning_rate": 1.990501414549312e-05,
      "loss": 2.0067,
      "step": 1100
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.8359375,
      "learning_rate": 1.9901534349396204e-05,
      "loss": 1.9922,
      "step": 1120
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.25,
      "learning_rate": 1.9897992269048953e-05,
      "loss": 1.9953,
      "step": 1140
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.109375,
      "learning_rate": 1.9894387926732342e-05,
      "loss": 1.9968,
      "step": 1160
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.390625,
      "learning_rate": 1.9890721345118987e-05,
      "loss": 1.9851,
      "step": 1180
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.171875,
      "learning_rate": 1.988699254727303e-05,
      "loss": 1.9749,
      "step": 1200
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.25,
      "learning_rate": 1.988320155664996e-05,
      "loss": 2.003,
      "step": 1220
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.140625,
      "learning_rate": 1.9879348397096482e-05,
      "loss": 1.9779,
      "step": 1240
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.09375,
      "learning_rate": 1.9875433092850376e-05,
      "loss": 1.9633,
      "step": 1260
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.9921875,
      "learning_rate": 1.9871455668540325e-05,
      "loss": 1.9824,
      "step": 1280
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.3125,
      "learning_rate": 1.9867416149185774e-05,
      "loss": 1.9785,
      "step": 1300
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.265625,
      "learning_rate": 1.9863314560196775e-05,
      "loss": 1.9923,
      "step": 1320
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.140625,
      "learning_rate": 1.9859150927373803e-05,
      "loss": 1.9839,
      "step": 1340
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.4375,
      "learning_rate": 1.9854925276907627e-05,
      "loss": 1.985,
      "step": 1360
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.953125,
      "learning_rate": 1.985063763537913e-05,
      "loss": 1.974,
      "step": 1380
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.9453125,
      "learning_rate": 1.9846288029759124e-05,
      "loss": 1.9801,
      "step": 1400
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.03125,
      "learning_rate": 1.984187648740822e-05,
      "loss": 1.9733,
      "step": 1420
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.046875,
      "learning_rate": 1.983740303607662e-05,
      "loss": 1.9653,
      "step": 1440
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.4375,
      "learning_rate": 1.9832867703903953e-05,
      "loss": 1.9672,
      "step": 1460
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.15625,
      "learning_rate": 1.9828270519419115e-05,
      "loss": 1.9625,
      "step": 1480
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.0625,
      "learning_rate": 1.9823611511540064e-05,
      "loss": 1.9542,
      "step": 1500
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.328125,
      "learning_rate": 1.9818890709573652e-05,
      "loss": 1.9475,
      "step": 1520
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.234375,
      "learning_rate": 1.9814108143215446e-05,
      "loss": 1.9642,
      "step": 1540
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.015625,
      "learning_rate": 1.9809263842549516e-05,
      "loss": 1.9541,
      "step": 1560
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.9921875,
      "learning_rate": 1.980435783804828e-05,
      "loss": 1.956,
      "step": 1580
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.96875,
      "learning_rate": 1.9799390160572295e-05,
      "loss": 1.9812,
      "step": 1600
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.953125,
      "learning_rate": 1.979436084137005e-05,
      "loss": 1.9617,
      "step": 1620
    },
    {
      "epoch": 0.13,
      "grad_norm": 2.125,
      "learning_rate": 1.9789269912077792e-05,
      "loss": 1.9534,
      "step": 1640
    },
    {
      "epoch": 0.13,
      "grad_norm": 2.140625,
      "learning_rate": 1.9784117404719324e-05,
      "loss": 1.9519,
      "step": 1660
    },
    {
      "epoch": 0.13,
      "grad_norm": 2.046875,
      "learning_rate": 1.977890335170578e-05,
      "loss": 1.9448,
      "step": 1680
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.21875,
      "learning_rate": 1.9773627785835454e-05,
      "loss": 1.9361,
      "step": 1700
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.265625,
      "learning_rate": 1.9768290740293573e-05,
      "loss": 1.9485,
      "step": 1720
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.046875,
      "learning_rate": 1.9762892248652093e-05,
      "loss": 1.9356,
      "step": 1740
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.453125,
      "learning_rate": 1.975743234486949e-05,
      "loss": 1.9484,
      "step": 1760
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.28125,
      "learning_rate": 1.9751911063290542e-05,
      "loss": 1.9358,
      "step": 1780
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.203125,
      "learning_rate": 1.974632843864612e-05,
      "loss": 1.9453,
      "step": 1800
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.359375,
      "learning_rate": 1.9740684506052958e-05,
      "loss": 1.9217,
      "step": 1820
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.171875,
      "learning_rate": 1.9734979301013445e-05,
      "loss": 1.9243,
      "step": 1840
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.09375,
      "learning_rate": 1.9729212859415397e-05,
      "loss": 1.9421,
      "step": 1860
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.625,
      "learning_rate": 1.9723385217531824e-05,
      "loss": 1.9311,
      "step": 1880
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.9609375,
      "learning_rate": 1.9717496412020717e-05,
      "loss": 1.9402,
      "step": 1900
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.140625,
      "learning_rate": 1.9711546479924797e-05,
      "loss": 1.9433,
      "step": 1920
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.203125,
      "learning_rate": 1.9705535458671304e-05,
      "loss": 1.9181,
      "step": 1940
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.375,
      "learning_rate": 1.9699463386071748e-05,
      "loss": 1.929,
      "step": 1960
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.25,
      "learning_rate": 1.9693330300321666e-05,
      "loss": 1.941,
      "step": 1980
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.3125,
      "learning_rate": 1.96871362400004e-05,
      "loss": 1.9172,
      "step": 2000
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.375,
      "learning_rate": 1.9680881244070848e-05,
      "loss": 1.9103,
      "step": 2020
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.9921875,
      "learning_rate": 1.96745653518792e-05,
      "loss": 1.9323,
      "step": 2040
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.015625,
      "learning_rate": 1.9668188603154716e-05,
      "loss": 1.9333,
      "step": 2060
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.640625,
      "learning_rate": 1.9661751038009463e-05,
      "loss": 1.9243,
      "step": 2080
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.03125,
      "learning_rate": 1.965525269693807e-05,
      "loss": 1.9386,
      "step": 2100
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.8125,
      "learning_rate": 1.9648693620817455e-05,
      "loss": 1.9293,
      "step": 2120
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.125,
      "learning_rate": 1.96420738509066e-05,
      "loss": 1.9175,
      "step": 2140
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.90625,
      "learning_rate": 1.963539342884626e-05,
      "loss": 1.9176,
      "step": 2160
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.15625,
      "learning_rate": 1.9628652396658725e-05,
      "loss": 1.9182,
      "step": 2180
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.015625,
      "learning_rate": 1.9621850796747528e-05,
      "loss": 1.9048,
      "step": 2200
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.4375,
      "learning_rate": 1.9614988671897208e-05,
      "loss": 1.9209,
      "step": 2220
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.1875,
      "learning_rate": 1.960806606527303e-05,
      "loss": 1.9064,
      "step": 2240
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.515625,
      "learning_rate": 1.96010830204207e-05,
      "loss": 1.9192,
      "step": 2260
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.203125,
      "learning_rate": 1.9594039581266107e-05,
      "loss": 1.9326,
      "step": 2280
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.0,
      "learning_rate": 1.958693579211505e-05,
      "loss": 1.9194,
      "step": 2300
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.0625,
      "learning_rate": 1.957977169765294e-05,
      "loss": 1.8903,
      "step": 2320
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.703125,
      "learning_rate": 1.957254734294454e-05,
      "loss": 1.9135,
      "step": 2340
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.125,
      "learning_rate": 1.956526277343366e-05,
      "loss": 1.9228,
      "step": 2360
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.28125,
      "learning_rate": 1.95579180349429e-05,
      "loss": 1.9094,
      "step": 2380
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.015625,
      "learning_rate": 1.955051317367333e-05,
      "loss": 1.9102,
      "step": 2400
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.109375,
      "learning_rate": 1.9543048236204215e-05,
      "loss": 1.8987,
      "step": 2420
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.328125,
      "learning_rate": 1.9535523269492733e-05,
      "loss": 1.9124,
      "step": 2440
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.171875,
      "learning_rate": 1.9527938320873652e-05,
      "loss": 1.9137,
      "step": 2460
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.078125,
      "learning_rate": 1.9520293438059065e-05,
      "loss": 1.9078,
      "step": 2480
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.9765625,
      "learning_rate": 1.9512588669138055e-05,
      "loss": 1.9092,
      "step": 2500
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.09375,
      "learning_rate": 1.9504824062576425e-05,
      "loss": 1.9114,
      "step": 2520
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.046875,
      "learning_rate": 1.949699966721637e-05,
      "loss": 1.9121,
      "step": 2540
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.171875,
      "learning_rate": 1.9489115532276182e-05,
      "loss": 1.9139,
      "step": 2560
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.0625,
      "learning_rate": 1.9481171707349936e-05,
      "loss": 1.8889,
      "step": 2580
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.03125,
      "learning_rate": 1.9473168242407183e-05,
      "loss": 1.9233,
      "step": 2600
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.203125,
      "learning_rate": 1.9465105187792617e-05,
      "loss": 1.8928,
      "step": 2620
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.15625,
      "learning_rate": 1.9456982594225787e-05,
      "loss": 1.9101,
      "step": 2640
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.234375,
      "learning_rate": 1.9448800512800762e-05,
      "loss": 1.8862,
      "step": 2660
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.203125,
      "learning_rate": 1.9440558994985805e-05,
      "loss": 1.8912,
      "step": 2680
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.3125,
      "learning_rate": 1.943225809262306e-05,
      "loss": 1.8983,
      "step": 2700
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.28125,
      "learning_rate": 1.942389785792822e-05,
      "loss": 1.9031,
      "step": 2720
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.0625,
      "learning_rate": 1.94154783434902e-05,
      "loss": 1.9023,
      "step": 2740
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.46875,
      "learning_rate": 1.940699960227081e-05,
      "loss": 1.8974,
      "step": 2760
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.0625,
      "learning_rate": 1.939846168760441e-05,
      "loss": 1.9007,
      "step": 2780
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.15625,
      "learning_rate": 1.938986465319759e-05,
      "loss": 1.8949,
      "step": 2800
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.375,
      "learning_rate": 1.9381208553128813e-05,
      "loss": 1.8864,
      "step": 2820
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.359375,
      "learning_rate": 1.9372493441848105e-05,
      "loss": 1.9024,
      "step": 2840
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.109375,
      "learning_rate": 1.9363719374176683e-05,
      "loss": 1.8891,
      "step": 2860
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.28125,
      "learning_rate": 1.935488640530662e-05,
      "loss": 1.8849,
      "step": 2880
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.234375,
      "learning_rate": 1.9345994590800498e-05,
      "loss": 1.8939,
      "step": 2900
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.453125,
      "learning_rate": 1.9337043986591064e-05,
      "loss": 1.8903,
      "step": 2920
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.265625,
      "learning_rate": 1.9328034648980874e-05,
      "loss": 1.8731,
      "step": 2940
    },
    {
      "epoch": 0.24,
      "grad_norm": 2.203125,
      "learning_rate": 1.9318966634641936e-05,
      "loss": 1.8781,
      "step": 2960
    },
    {
      "epoch": 0.24,
      "grad_norm": 2.109375,
      "learning_rate": 1.9309840000615358e-05,
      "loss": 1.8855,
      "step": 2980
    },
    {
      "epoch": 0.24,
      "grad_norm": 2.03125,
      "learning_rate": 1.930065480431098e-05,
      "loss": 1.89,
      "step": 3000
    },
    {
      "epoch": 0.24,
      "grad_norm": 2.53125,
      "learning_rate": 1.9291411103507033e-05,
      "loss": 1.878,
      "step": 3020
    },
    {
      "epoch": 0.24,
      "grad_norm": 2.453125,
      "learning_rate": 1.9282108956349754e-05,
      "loss": 1.8896,
      "step": 3040
    },
    {
      "epoch": 0.24,
      "grad_norm": 2.140625,
      "learning_rate": 1.9272748421353023e-05,
      "loss": 1.8763,
      "step": 3060
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.015625,
      "learning_rate": 1.9263329557398012e-05,
      "loss": 1.8741,
      "step": 3080
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.28125,
      "learning_rate": 1.9253852423732803e-05,
      "loss": 1.8664,
      "step": 3100
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.25,
      "learning_rate": 1.9244317079972007e-05,
      "loss": 1.8706,
      "step": 3120
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.3125,
      "learning_rate": 1.92347235860964e-05,
      "loss": 1.8791,
      "step": 3140
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.34375,
      "learning_rate": 1.9225072002452557e-05,
      "loss": 1.8834,
      "step": 3160
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.359375,
      "learning_rate": 1.9215362389752434e-05,
      "loss": 1.8849,
      "step": 3180
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.3125,
      "learning_rate": 1.9205594809073035e-05,
      "loss": 1.8804,
      "step": 3200
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.3125,
      "learning_rate": 1.9195769321855984e-05,
      "loss": 1.8717,
      "step": 3220
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.1875,
      "learning_rate": 1.9185885989907173e-05,
      "loss": 1.8701,
      "step": 3240
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.0625,
      "learning_rate": 1.917594487539635e-05,
      "loss": 1.8764,
      "step": 3260
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.234375,
      "learning_rate": 1.9165946040856747e-05,
      "loss": 1.8695,
      "step": 3280
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.59375,
      "learning_rate": 1.9155889549184657e-05,
      "loss": 1.8747,
      "step": 3300
    },
    {
      "epoch": 0.27,
      "grad_norm": 3.3125,
      "learning_rate": 1.9145775463639073e-05,
      "loss": 1.858,
      "step": 3320
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.4375,
      "learning_rate": 1.9135603847841266e-05,
      "loss": 1.8668,
      "step": 3340
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.28125,
      "learning_rate": 1.9125374765774404e-05,
      "loss": 1.8479,
      "step": 3360
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.1875,
      "learning_rate": 1.911508828178312e-05,
      "loss": 1.8627,
      "step": 3380
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.40625,
      "learning_rate": 1.9104744460573156e-05,
      "loss": 1.8924,
      "step": 3400
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.15625,
      "learning_rate": 1.909434336721089e-05,
      "loss": 1.8739,
      "step": 3420
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.1875,
      "learning_rate": 1.9083885067122985e-05,
      "loss": 1.8762,
      "step": 3440
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.5,
      "learning_rate": 1.9073369626095958e-05,
      "loss": 1.8711,
      "step": 3460
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.046875,
      "learning_rate": 1.9062797110275743e-05,
      "loss": 1.8768,
      "step": 3480
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.65625,
      "learning_rate": 1.9052167586167315e-05,
      "loss": 1.8683,
      "step": 3500
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.234375,
      "learning_rate": 1.9041481120634248e-05,
      "loss": 1.8697,
      "step": 3520
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.375,
      "learning_rate": 1.9030737780898284e-05,
      "loss": 1.863,
      "step": 3540
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.25,
      "learning_rate": 1.9019937634538946e-05,
      "loss": 1.8664,
      "step": 3560
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.203125,
      "learning_rate": 1.900908074949307e-05,
      "loss": 1.8684,
      "step": 3580
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.1875,
      "learning_rate": 1.8998167194054425e-05,
      "loss": 1.8525,
      "step": 3600
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.5625,
      "learning_rate": 1.8987197036873227e-05,
      "loss": 1.8582,
      "step": 3620
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.796875,
      "learning_rate": 1.897617034695576e-05,
      "loss": 1.8664,
      "step": 3640
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.1875,
      "learning_rate": 1.8965087193663906e-05,
      "loss": 1.8692,
      "step": 3660
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.21875,
      "learning_rate": 1.895394764671473e-05,
      "loss": 1.8534,
      "step": 3680
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.359375,
      "learning_rate": 1.894275177618004e-05,
      "loss": 1.852,
      "step": 3700
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.4375,
      "learning_rate": 1.893149965248592e-05,
      "loss": 1.8699,
      "step": 3720
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.203125,
      "learning_rate": 1.8920191346412326e-05,
      "loss": 1.8649,
      "step": 3740
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.15625,
      "learning_rate": 1.8908826929092607e-05,
      "loss": 1.857,
      "step": 3760
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.28125,
      "learning_rate": 1.8897406472013084e-05,
      "loss": 1.8404,
      "step": 3780
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.65625,
      "learning_rate": 1.8885930047012585e-05,
      "loss": 1.864,
      "step": 3800
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.640625,
      "learning_rate": 1.887439772628199e-05,
      "loss": 1.8578,
      "step": 3820
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.9765625,
      "learning_rate": 1.886280958236379e-05,
      "loss": 1.8603,
      "step": 3840
    },
    {
      "epoch": 0.31,
      "grad_norm": 2.484375,
      "learning_rate": 1.8851165688151627e-05,
      "loss": 1.8603,
      "step": 3860
    },
    {
      "epoch": 0.31,
      "grad_norm": 2.65625,
      "learning_rate": 1.8839466116889823e-05,
      "loss": 1.8752,
      "step": 3880
    },
    {
      "epoch": 0.31,
      "grad_norm": 2.140625,
      "learning_rate": 1.882771094217293e-05,
      "loss": 1.8628,
      "step": 3900
    },
    {
      "epoch": 0.31,
      "grad_norm": 2.28125,
      "learning_rate": 1.8815900237945284e-05,
      "loss": 1.8575,
      "step": 3920
    },
    {
      "epoch": 0.31,
      "grad_norm": 2.28125,
      "learning_rate": 1.8804034078500497e-05,
      "loss": 1.85,
      "step": 3940
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.140625,
      "learning_rate": 1.8792112538481025e-05,
      "loss": 1.8687,
      "step": 3960
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.21875,
      "learning_rate": 1.8780135692877693e-05,
      "loss": 1.8465,
      "step": 3980
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.3125,
      "learning_rate": 1.8768103617029213e-05,
      "loss": 1.8569,
      "step": 4000
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.328125,
      "learning_rate": 1.8756016386621712e-05,
      "loss": 1.8401,
      "step": 4020
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.21875,
      "learning_rate": 1.874387407768827e-05,
      "loss": 1.8356,
      "step": 4040
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.21875,
      "learning_rate": 1.873167676660842e-05,
      "loss": 1.8605,
      "step": 4060
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.125,
      "learning_rate": 1.8719424530107674e-05,
      "loss": 1.8598,
      "step": 4080
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.890625,
      "learning_rate": 1.8707117445257067e-05,
      "loss": 1.8512,
      "step": 4100
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.59375,
      "learning_rate": 1.8694755589472633e-05,
      "loss": 1.8482,
      "step": 4120
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.359375,
      "learning_rate": 1.8682339040514933e-05,
      "loss": 1.8479,
      "step": 4140
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.140625,
      "learning_rate": 1.8669867876488578e-05,
      "loss": 1.8397,
      "step": 4160
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.0625,
      "learning_rate": 1.8657342175841722e-05,
      "loss": 1.8579,
      "step": 4180
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.78125,
      "learning_rate": 1.8644762017365576e-05,
      "loss": 1.8508,
      "step": 4200
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.34375,
      "learning_rate": 1.863212748019391e-05,
      "loss": 1.8335,
      "step": 4220
    },
    {
      "epoch": 0.34,
      "grad_norm": 3.03125,
      "learning_rate": 1.861943864380255e-05,
      "loss": 1.8415,
      "step": 4240
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.375,
      "learning_rate": 1.86066955880089e-05,
      "loss": 1.8543,
      "step": 4260
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.625,
      "learning_rate": 1.85938983929714e-05,
      "loss": 1.861,
      "step": 4280
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.4375,
      "learning_rate": 1.858104713918907e-05,
      "loss": 1.8387,
      "step": 4300
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.21875,
      "learning_rate": 1.8568141907500964e-05,
      "loss": 1.8561,
      "step": 4320
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.484375,
      "learning_rate": 1.8555182779085678e-05,
      "loss": 1.8442,
      "step": 4340
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.34375,
      "learning_rate": 1.8542169835460846e-05,
      "loss": 1.8582,
      "step": 4360
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.265625,
      "learning_rate": 1.8529103158482605e-05,
      "loss": 1.8319,
      "step": 4380
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.875,
      "learning_rate": 1.8515982830345115e-05,
      "loss": 1.8388,
      "step": 4400
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.28125,
      "learning_rate": 1.850280893358e-05,
      "loss": 1.8552,
      "step": 4420
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.296875,
      "learning_rate": 1.848958155105586e-05,
      "loss": 1.8317,
      "step": 4440
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.203125,
      "learning_rate": 1.847630076597774e-05,
      "loss": 1.8413,
      "step": 4460
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.5,
      "learning_rate": 1.846296666188661e-05,
      "loss": 1.8251,
      "step": 4480
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.15625,
      "learning_rate": 1.8449579322658827e-05,
      "loss": 1.8445,
      "step": 4500
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.40625,
      "learning_rate": 1.8436138832505623e-05,
      "loss": 1.8672,
      "step": 4520
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.109375,
      "learning_rate": 1.842264527597257e-05,
      "loss": 1.8343,
      "step": 4540
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.390625,
      "learning_rate": 1.8409098737939038e-05,
      "loss": 1.8272,
      "step": 4560
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.296875,
      "learning_rate": 1.8395499303617677e-05,
      "loss": 1.8448,
      "step": 4580
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.421875,
      "learning_rate": 1.8381847058553872e-05,
      "loss": 1.835,
      "step": 4600
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.171875,
      "learning_rate": 1.8368142088625213e-05,
      "loss": 1.8356,
      "step": 4620
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.40625,
      "learning_rate": 1.8354384480040935e-05,
      "loss": 1.8175,
      "step": 4640
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.703125,
      "learning_rate": 1.83405743193414e-05,
      "loss": 1.8218,
      "step": 4660
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.265625,
      "learning_rate": 1.8326711693397537e-05,
      "loss": 1.8409,
      "step": 4680
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.25,
      "learning_rate": 1.831279668941031e-05,
      "loss": 1.8471,
      "step": 4700
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.109375,
      "learning_rate": 1.8298829394910146e-05,
      "loss": 1.8708,
      "step": 4720
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.28125,
      "learning_rate": 1.82848098977564e-05,
      "loss": 1.8397,
      "step": 4740
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.25,
      "learning_rate": 1.8270738286136815e-05,
      "loss": 1.8166,
      "step": 4760
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.1875,
      "learning_rate": 1.8256614648566937e-05,
      "loss": 1.8257,
      "step": 4780
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.40625,
      "learning_rate": 1.824243907388958e-05,
      "loss": 1.8483,
      "step": 4800
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.59375,
      "learning_rate": 1.8228211651274264e-05,
      "loss": 1.8235,
      "step": 4820
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.640625,
      "learning_rate": 1.8213932470216652e-05,
      "loss": 1.8561,
      "step": 4840
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.828125,
      "learning_rate": 1.8199601620537977e-05,
      "loss": 1.8324,
      "step": 4860
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.171875,
      "learning_rate": 1.81852191923845e-05,
      "loss": 1.8389,
      "step": 4880
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.171875,
      "learning_rate": 1.8170785276226915e-05,
      "loss": 1.8372,
      "step": 4900
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.484375,
      "learning_rate": 1.8156299962859805e-05,
      "loss": 1.8367,
      "step": 4920
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.21875,
      "learning_rate": 1.8141763343401057e-05,
      "loss": 1.8078,
      "step": 4940
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.21875,
      "learning_rate": 1.8127175509291292e-05,
      "loss": 1.8181,
      "step": 4960
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.359375,
      "learning_rate": 1.8112536552293286e-05,
      "loss": 1.8273,
      "step": 4980
    },
    {
      "epoch": 0.4,
      "grad_norm": 3.140625,
      "learning_rate": 1.80978465644914e-05,
      "loss": 1.8302,
      "step": 5000
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.125,
      "learning_rate": 1.8083105638291e-05,
      "loss": 1.8469,
      "step": 5020
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.46875,
      "learning_rate": 1.8068313866417876e-05,
      "loss": 1.8235,
      "step": 5040
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.328125,
      "learning_rate": 1.8053471341917636e-05,
      "loss": 1.8302,
      "step": 5060
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.625,
      "learning_rate": 1.8038578158155163e-05,
      "loss": 1.8218,
      "step": 5080
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.234375,
      "learning_rate": 1.8023634408814e-05,
      "loss": 1.8322,
      "step": 5100
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.4375,
      "learning_rate": 1.8008640187895755e-05,
      "loss": 1.8091,
      "step": 5120
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.53125,
      "learning_rate": 1.7993595589719533e-05,
      "loss": 1.828,
      "step": 5140
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.40625,
      "learning_rate": 1.797850070892132e-05,
      "loss": 1.8188,
      "step": 5160
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.84375,
      "learning_rate": 1.7963355640453407e-05,
      "loss": 1.8106,
      "step": 5180
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.15625,
      "learning_rate": 1.7948160479583783e-05,
      "loss": 1.8172,
      "step": 5200
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.375,
      "learning_rate": 1.793291532189553e-05,
      "loss": 1.8324,
      "step": 5220
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.359375,
      "learning_rate": 1.791762026328623e-05,
      "loss": 1.8202,
      "step": 5240
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.171875,
      "learning_rate": 1.7902275399967363e-05,
      "loss": 1.8183,
      "step": 5260
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.5625,
      "learning_rate": 1.78868808284637e-05,
      "loss": 1.8347,
      "step": 5280
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.5625,
      "learning_rate": 1.7871436645612685e-05,
      "loss": 1.831,
      "step": 5300
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.796875,
      "learning_rate": 1.785594294856385e-05,
      "loss": 1.8263,
      "step": 5320
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.265625,
      "learning_rate": 1.7840399834778176e-05,
      "loss": 1.847,
      "step": 5340
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.375,
      "learning_rate": 1.7824807402027504e-05,
      "loss": 1.8249,
      "step": 5360
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.34375,
      "learning_rate": 1.78091657483939e-05,
      "loss": 1.8206,
      "step": 5380
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.328125,
      "learning_rate": 1.779347497226905e-05,
      "loss": 1.8251,
      "step": 5400
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.5,
      "learning_rate": 1.777773517235364e-05,
      "loss": 1.8226,
      "step": 5420
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.1875,
      "learning_rate": 1.7761946447656736e-05,
      "loss": 1.8309,
      "step": 5440
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.234375,
      "learning_rate": 1.7746108897495157e-05,
      "loss": 1.8283,
      "step": 5460
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.796875,
      "learning_rate": 1.7730222621492846e-05,
      "loss": 1.8275,
      "step": 5480
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.390625,
      "learning_rate": 1.7714287719580254e-05,
      "loss": 1.8059,
      "step": 5500
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.28125,
      "learning_rate": 1.769830429199371e-05,
      "loss": 1.8235,
      "step": 5520
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.40625,
      "learning_rate": 1.7682272439274778e-05,
      "loss": 1.8104,
      "step": 5540
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.609375,
      "learning_rate": 1.766619226226965e-05,
      "loss": 1.8212,
      "step": 5560
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.140625,
      "learning_rate": 1.765006386212847e-05,
      "loss": 1.8269,
      "step": 5580
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.21875,
      "learning_rate": 1.763388734030475e-05,
      "loss": 1.8212,
      "step": 5600
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.390625,
      "learning_rate": 1.7617662798554685e-05,
      "loss": 1.8447,
      "step": 5620
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.53125,
      "learning_rate": 1.7601390338936547e-05,
      "loss": 1.8244,
      "step": 5640
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.375,
      "learning_rate": 1.7585070063810014e-05,
      "loss": 1.8125,
      "step": 5660
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.53125,
      "learning_rate": 1.7568702075835557e-05,
      "loss": 1.8114,
      "step": 5680
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.34375,
      "learning_rate": 1.7552286477973766e-05,
      "loss": 1.8136,
      "step": 5700
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.359375,
      "learning_rate": 1.7535823373484716e-05,
      "loss": 1.8261,
      "step": 5720
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.3125,
      "learning_rate": 1.751931286592732e-05,
      "loss": 1.8085,
      "step": 5740
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.34375,
      "learning_rate": 1.7502755059158683e-05,
      "loss": 1.8297,
      "step": 5760
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.296875,
      "learning_rate": 1.7486150057333416e-05,
      "loss": 1.7937,
      "step": 5780
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.15625,
      "learning_rate": 1.7469497964903018e-05,
      "loss": 1.8052,
      "step": 5800
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.328125,
      "learning_rate": 1.7452798886615205e-05,
      "loss": 1.8216,
      "step": 5820
    },
    {
      "epoch": 0.47,
      "grad_norm": 2.171875,
      "learning_rate": 1.7436052927513254e-05,
      "loss": 1.8322,
      "step": 5840
    },
    {
      "epoch": 0.47,
      "grad_norm": 2.484375,
      "learning_rate": 1.741926019293533e-05,
      "loss": 1.8182,
      "step": 5860
    },
    {
      "epoch": 0.47,
      "grad_norm": 2.578125,
      "learning_rate": 1.740242078851384e-05,
      "loss": 1.8262,
      "step": 5880
    },
    {
      "epoch": 0.47,
      "grad_norm": 2.734375,
      "learning_rate": 1.7385534820174757e-05,
      "loss": 1.7948,
      "step": 5900
    },
    {
      "epoch": 0.47,
      "grad_norm": 3.0,
      "learning_rate": 1.7368602394136964e-05,
      "loss": 1.8332,
      "step": 5920
    },
    {
      "epoch": 0.47,
      "grad_norm": 2.234375,
      "learning_rate": 1.735162361691157e-05,
      "loss": 1.8016,
      "step": 5940
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.25,
      "learning_rate": 1.7334598595301257e-05,
      "loss": 1.8103,
      "step": 5960
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.953125,
      "learning_rate": 1.7317527436399603e-05,
      "loss": 1.8014,
      "step": 5980
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.25,
      "learning_rate": 1.7300410247590402e-05,
      "loss": 1.8071,
      "step": 6000
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.375,
      "learning_rate": 1.7283247136546996e-05,
      "loss": 1.809,
      "step": 6020
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.46875,
      "learning_rate": 1.7266038211231583e-05,
      "loss": 1.8236,
      "step": 6040
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.234375,
      "learning_rate": 1.724878357989457e-05,
      "loss": 1.8306,
      "step": 6060
    },
    {
      "epoch": 0.49,
      "grad_norm": 2.265625,
      "learning_rate": 1.7231483351073858e-05,
      "loss": 1.8165,
      "step": 6080
    },
    {
      "epoch": 0.49,
      "grad_norm": 2.1875,
      "learning_rate": 1.721413763359417e-05,
      "loss": 1.8162,
      "step": 6100
    },
    {
      "epoch": 0.49,
      "grad_norm": 2.25,
      "learning_rate": 1.7196746536566376e-05,
      "loss": 1.8346,
      "step": 6120
    },
    {
      "epoch": 0.49,
      "grad_norm": 2.265625,
      "learning_rate": 1.71793101693868e-05,
      "loss": 1.8082,
      "step": 6140
    },
    {
      "epoch": 0.49,
      "grad_norm": 2.15625,
      "learning_rate": 1.7161828641736527e-05,
      "loss": 1.8105,
      "step": 6160
    },
    {
      "epoch": 0.49,
      "grad_norm": 2.4375,
      "learning_rate": 1.7144302063580726e-05,
      "loss": 1.8105,
      "step": 6180
    },
    {
      "epoch": 0.49,
      "grad_norm": 2.46875,
      "learning_rate": 1.712673054516794e-05,
      "loss": 1.8232,
      "step": 6200
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.75,
      "learning_rate": 1.7109114197029408e-05,
      "loss": 1.8227,
      "step": 6220
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.265625,
      "learning_rate": 1.7091453129978363e-05,
      "loss": 1.8181,
      "step": 6240
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.59375,
      "learning_rate": 1.7073747455109336e-05,
      "loss": 1.8006,
      "step": 6260
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.71875,
      "learning_rate": 1.7055997283797463e-05,
      "loss": 1.7975,
      "step": 6280
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.28125,
      "learning_rate": 1.7038202727697766e-05,
      "loss": 1.8105,
      "step": 6300
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.59375,
      "learning_rate": 1.7020363898744477e-05,
      "loss": 1.7994,
      "step": 6320
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.59375,
      "learning_rate": 1.7002480909150316e-05,
      "loss": 1.8193,
      "step": 6340
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.40625,
      "learning_rate": 1.6984553871405783e-05,
      "loss": 1.8347,
      "step": 6360
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.234375,
      "learning_rate": 1.6966582898278466e-05,
      "loss": 1.8159,
      "step": 6380
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.3125,
      "learning_rate": 1.694856810281232e-05,
      "loss": 1.8053,
      "step": 6400
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.34375,
      "learning_rate": 1.6930509598326948e-05,
      "loss": 1.828,
      "step": 6420
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.1875,
      "learning_rate": 1.6912407498416914e-05,
      "loss": 1.8186,
      "step": 6440
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.296875,
      "learning_rate": 1.689426191695101e-05,
      "loss": 1.8027,
      "step": 6460
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.375,
      "learning_rate": 1.6876072968071532e-05,
      "loss": 1.8098,
      "step": 6480
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.46875,
      "learning_rate": 1.6857840766193586e-05,
      "loss": 1.8129,
      "step": 6500
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.328125,
      "learning_rate": 1.6839565426004346e-05,
      "loss": 1.8054,
      "step": 6520
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.390625,
      "learning_rate": 1.6821247062462347e-05,
      "loss": 1.8123,
      "step": 6540
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.328125,
      "learning_rate": 1.6802885790796753e-05,
      "loss": 1.8074,
      "step": 6560
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.65625,
      "learning_rate": 1.678448172650664e-05,
      "loss": 1.7996,
      "step": 6580
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.609375,
      "learning_rate": 1.676603498536026e-05,
      "loss": 1.8098,
      "step": 6600
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.265625,
      "learning_rate": 1.6747545683394322e-05,
      "loss": 1.8016,
      "step": 6620
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.6875,
      "learning_rate": 1.672901393691325e-05,
      "loss": 1.8093,
      "step": 6640
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.484375,
      "learning_rate": 1.6710439862488478e-05,
      "loss": 1.8023,
      "step": 6660
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.234375,
      "learning_rate": 1.6691823576957676e-05,
      "loss": 1.8075,
      "step": 6680
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.6875,
      "learning_rate": 1.667316519742405e-05,
      "loss": 1.8052,
      "step": 6700
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.34375,
      "learning_rate": 1.6654464841255586e-05,
      "loss": 1.8011,
      "step": 6720
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.25,
      "learning_rate": 1.663572262608433e-05,
      "loss": 1.8075,
      "step": 6740
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.296875,
      "learning_rate": 1.6616938669805622e-05,
      "loss": 1.7911,
      "step": 6760
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.203125,
      "learning_rate": 1.659811309057738e-05,
      "loss": 1.8026,
      "step": 6780
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.1875,
      "learning_rate": 1.6579246006819335e-05,
      "loss": 1.8088,
      "step": 6800
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.328125,
      "learning_rate": 1.6560337537212306e-05,
      "loss": 1.8155,
      "step": 6820
    },
    {
      "epoch": 0.55,
      "grad_norm": 2.21875,
      "learning_rate": 1.6541387800697438e-05,
      "loss": 1.7997,
      "step": 6840
    },
    {
      "epoch": 0.55,
      "grad_norm": 2.25,
      "learning_rate": 1.6522396916475468e-05,
      "loss": 1.8253,
      "step": 6860
    },
    {
      "epoch": 0.55,
      "grad_norm": 2.265625,
      "learning_rate": 1.650336500400595e-05,
      "loss": 1.8037,
      "step": 6880
    },
    {
      "epoch": 0.55,
      "grad_norm": 2.421875,
      "learning_rate": 1.6484292183006542e-05,
      "loss": 1.8154,
      "step": 6900
    },
    {
      "epoch": 0.55,
      "grad_norm": 2.515625,
      "learning_rate": 1.6465178573452214e-05,
      "loss": 1.8169,
      "step": 6920
    },
    {
      "epoch": 0.55,
      "grad_norm": 2.359375,
      "learning_rate": 1.6446024295574522e-05,
      "loss": 1.8002,
      "step": 6940
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.28125,
      "learning_rate": 1.6426829469860837e-05,
      "loss": 1.7999,
      "step": 6960
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.375,
      "learning_rate": 1.6407594217053587e-05,
      "loss": 1.7973,
      "step": 6980
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.296875,
      "learning_rate": 1.638831865814951e-05,
      "loss": 1.8073,
      "step": 7000
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.265625,
      "learning_rate": 1.6369002914398874e-05,
      "loss": 1.795,
      "step": 7020
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.34375,
      "learning_rate": 1.6349647107304724e-05,
      "loss": 1.7985,
      "step": 7040
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.125,
      "learning_rate": 1.633025135862213e-05,
      "loss": 1.7936,
      "step": 7060
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.890625,
      "learning_rate": 1.6310815790357404e-05,
      "loss": 1.8036,
      "step": 7080
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.4375,
      "learning_rate": 1.6291340524767327e-05,
      "loss": 1.8046,
      "step": 7100
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.6875,
      "learning_rate": 1.6271825684358404e-05,
      "loss": 1.8052,
      "step": 7120
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.484375,
      "learning_rate": 1.625227139188607e-05,
      "loss": 1.8105,
      "step": 7140
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.375,
      "learning_rate": 1.6232677770353936e-05,
      "loss": 1.7952,
      "step": 7160
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.625,
      "learning_rate": 1.621304494301301e-05,
      "loss": 1.8102,
      "step": 7180
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.609375,
      "learning_rate": 1.6193373033360904e-05,
      "loss": 1.7962,
      "step": 7200
    },
    {
      "epoch": 0.58,
      "grad_norm": 2.328125,
      "learning_rate": 1.6173662165141084e-05,
      "loss": 1.8078,
      "step": 7220
    },
    {
      "epoch": 0.58,
      "grad_norm": 2.5,
      "learning_rate": 1.6153912462342073e-05,
      "loss": 1.8051,
      "step": 7240
    },
    {
      "epoch": 0.58,
      "grad_norm": 2.15625,
      "learning_rate": 1.6134124049196688e-05,
      "loss": 1.8057,
      "step": 7260
    },
    {
      "epoch": 0.58,
      "grad_norm": 2.78125,
      "learning_rate": 1.6114297050181235e-05,
      "loss": 1.8153,
      "step": 7280
    },
    {
      "epoch": 0.58,
      "grad_norm": 2.171875,
      "learning_rate": 1.6094431590014746e-05,
      "loss": 1.8047,
      "step": 7300
    },
    {
      "epoch": 0.58,
      "grad_norm": 2.359375,
      "learning_rate": 1.6074527793658186e-05,
      "loss": 1.8069,
      "step": 7320
    },
    {
      "epoch": 0.59,
      "grad_norm": 2.4375,
      "learning_rate": 1.605458578631367e-05,
      "loss": 1.7919,
      "step": 7340
    },
    {
      "epoch": 0.59,
      "grad_norm": 2.390625,
      "learning_rate": 1.6034605693423676e-05,
      "loss": 1.8104,
      "step": 7360
    },
    {
      "epoch": 0.59,
      "grad_norm": 2.703125,
      "learning_rate": 1.6014587640670244e-05,
      "loss": 1.7971,
      "step": 7380
    },
    {
      "epoch": 0.59,
      "grad_norm": 2.703125,
      "learning_rate": 1.599453175397421e-05,
      "loss": 1.7987,
      "step": 7400
    },
    {
      "epoch": 0.59,
      "grad_norm": 2.46875,
      "learning_rate": 1.597443815949439e-05,
      "loss": 1.8057,
      "step": 7420
    },
    {
      "epoch": 0.59,
      "grad_norm": 2.359375,
      "learning_rate": 1.59543069836268e-05,
      "loss": 1.7817,
      "step": 7440
    },
    {
      "epoch": 0.6,
      "grad_norm": 2.515625,
      "learning_rate": 1.5934138353003845e-05,
      "loss": 1.8009,
      "step": 7460
    },
    {
      "epoch": 0.6,
      "grad_norm": 2.5,
      "learning_rate": 1.5913932394493548e-05,
      "loss": 1.7939,
      "step": 7480
    },
    {
      "epoch": 0.6,
      "grad_norm": 2.40625,
      "learning_rate": 1.589368923519874e-05,
      "loss": 1.8014,
      "step": 7500
    },
    {
      "epoch": 0.6,
      "grad_norm": 2.328125,
      "learning_rate": 1.587340900245624e-05,
      "loss": 1.7879,
      "step": 7520
    },
    {
      "epoch": 0.6,
      "grad_norm": 2.25,
      "learning_rate": 1.5853091823836087e-05,
      "loss": 1.8,
      "step": 7540
    },
    {
      "epoch": 0.6,
      "grad_norm": 2.140625,
      "learning_rate": 1.5832737827140727e-05,
      "loss": 1.7894,
      "step": 7560
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.6875,
      "learning_rate": 1.581234714040419e-05,
      "loss": 1.7845,
      "step": 7580
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.359375,
      "learning_rate": 1.5791919891891313e-05,
      "loss": 1.7841,
      "step": 7600
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.3125,
      "learning_rate": 1.5771456210096913e-05,
      "loss": 1.8057,
      "step": 7620
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.84375,
      "learning_rate": 1.5750956223744985e-05,
      "loss": 1.7961,
      "step": 7640
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.28125,
      "learning_rate": 1.5730420061787898e-05,
      "loss": 1.7908,
      "step": 7660
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.578125,
      "learning_rate": 1.5709847853405574e-05,
      "loss": 1.7888,
      "step": 7680
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.328125,
      "learning_rate": 1.568923972800468e-05,
      "loss": 1.7742,
      "step": 7700
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.484375,
      "learning_rate": 1.566859581521782e-05,
      "loss": 1.7902,
      "step": 7720
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.46875,
      "learning_rate": 1.5647916244902707e-05,
      "loss": 1.7918,
      "step": 7740
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.390625,
      "learning_rate": 1.5627201147141357e-05,
      "loss": 1.806,
      "step": 7760
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.25,
      "learning_rate": 1.5606450652239263e-05,
      "loss": 1.7925,
      "step": 7780
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.578125,
      "learning_rate": 1.5585664890724584e-05,
      "loss": 1.7921,
      "step": 7800
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.390625,
      "learning_rate": 1.5564843993347313e-05,
      "loss": 1.7901,
      "step": 7820
    },
    {
      "epoch": 0.63,
      "grad_norm": 2.390625,
      "learning_rate": 1.5543988091078467e-05,
      "loss": 1.7881,
      "step": 7840
    },
    {
      "epoch": 0.63,
      "grad_norm": 2.6875,
      "learning_rate": 1.5523097315109245e-05,
      "loss": 1.7948,
      "step": 7860
    },
    {
      "epoch": 0.63,
      "grad_norm": 2.484375,
      "learning_rate": 1.5502171796850226e-05,
      "loss": 1.7958,
      "step": 7880
    },
    {
      "epoch": 0.63,
      "grad_norm": 2.40625,
      "learning_rate": 1.5481211667930528e-05,
      "loss": 1.7911,
      "step": 7900
    },
    {
      "epoch": 0.63,
      "grad_norm": 2.453125,
      "learning_rate": 1.5460217060196986e-05,
      "loss": 1.7709,
      "step": 7920
    },
    {
      "epoch": 0.63,
      "grad_norm": 2.40625,
      "learning_rate": 1.54391881057133e-05,
      "loss": 1.7914,
      "step": 7940
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.328125,
      "learning_rate": 1.541812493675925e-05,
      "loss": 1.8062,
      "step": 7960
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.34375,
      "learning_rate": 1.539702768582982e-05,
      "loss": 1.8074,
      "step": 7980
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.75,
      "learning_rate": 1.5375896485634386e-05,
      "loss": 1.7788,
      "step": 8000
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.453125,
      "learning_rate": 1.5354731469095884e-05,
      "loss": 1.7814,
      "step": 8020
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.421875,
      "learning_rate": 1.5333532769349955e-05,
      "loss": 1.7854,
      "step": 8040
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.65625,
      "learning_rate": 1.5312300519744135e-05,
      "loss": 1.7869,
      "step": 8060
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.625,
      "learning_rate": 1.529103485383699e-05,
      "loss": 1.7736,
      "step": 8080
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.28125,
      "learning_rate": 1.5269735905397278e-05,
      "loss": 1.7966,
      "step": 8100
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.59375,
      "learning_rate": 1.524840380840314e-05,
      "loss": 1.7907,
      "step": 8120
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.671875,
      "learning_rate": 1.5227038697041216e-05,
      "loss": 1.7767,
      "step": 8140
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.234375,
      "learning_rate": 1.520564070570582e-05,
      "loss": 1.7963,
      "step": 8160
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.421875,
      "learning_rate": 1.5184209968998098e-05,
      "loss": 1.7822,
      "step": 8180
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.390625,
      "learning_rate": 1.5162746621725176e-05,
      "loss": 1.7806,
      "step": 8200
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.359375,
      "learning_rate": 1.5141250798899307e-05,
      "loss": 1.7836,
      "step": 8220
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.640625,
      "learning_rate": 1.5119722635737035e-05,
      "loss": 1.7825,
      "step": 8240
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.953125,
      "learning_rate": 1.5098162267658323e-05,
      "loss": 1.7877,
      "step": 8260
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.3125,
      "learning_rate": 1.5076569830285736e-05,
      "loss": 1.791,
      "step": 8280
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.4375,
      "learning_rate": 1.5054945459443544e-05,
      "loss": 1.781,
      "step": 8300
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.609375,
      "learning_rate": 1.5033289291156905e-05,
      "loss": 1.7873,
      "step": 8320
    },
    {
      "epoch": 0.67,
      "grad_norm": 2.296875,
      "learning_rate": 1.501160146165099e-05,
      "loss": 1.7963,
      "step": 8340
    },
    {
      "epoch": 0.67,
      "grad_norm": 2.515625,
      "learning_rate": 1.498988210735013e-05,
      "loss": 1.794,
      "step": 8360
    },
    {
      "epoch": 0.67,
      "grad_norm": 2.265625,
      "learning_rate": 1.4968131364876952e-05,
      "loss": 1.8001,
      "step": 8380
    },
    {
      "epoch": 0.67,
      "grad_norm": 2.46875,
      "learning_rate": 1.4946349371051541e-05,
      "loss": 1.7728,
      "step": 8400
    },
    {
      "epoch": 0.67,
      "grad_norm": 2.59375,
      "learning_rate": 1.4924536262890557e-05,
      "loss": 1.7732,
      "step": 8420
    },
    {
      "epoch": 0.67,
      "grad_norm": 2.671875,
      "learning_rate": 1.4902692177606368e-05,
      "loss": 1.7822,
      "step": 8440
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.609375,
      "learning_rate": 1.4880817252606226e-05,
      "loss": 1.7862,
      "step": 8460
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.421875,
      "learning_rate": 1.4858911625491352e-05,
      "loss": 1.801,
      "step": 8480
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.40625,
      "learning_rate": 1.4836975434056102e-05,
      "loss": 1.8229,
      "step": 8500
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.578125,
      "learning_rate": 1.48150088162871e-05,
      "loss": 1.7954,
      "step": 8520
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.46875,
      "learning_rate": 1.4793011910362352e-05,
      "loss": 1.7996,
      "step": 8540
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.40625,
      "learning_rate": 1.4770984854650397e-05,
      "loss": 1.8033,
      "step": 8560
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.140625,
      "learning_rate": 1.4748927787709417e-05,
      "loss": 1.7883,
      "step": 8580
    },
    {
      "epoch": 0.69,
      "grad_norm": 2.265625,
      "learning_rate": 1.4726840848286385e-05,
      "loss": 1.7939,
      "step": 8600
    },
    {
      "epoch": 0.69,
      "grad_norm": 2.421875,
      "learning_rate": 1.4704724175316181e-05,
      "loss": 1.7975,
      "step": 8620
    },
    {
      "epoch": 0.69,
      "grad_norm": 2.46875,
      "learning_rate": 1.4682577907920707e-05,
      "loss": 1.8029,
| "step": 8640 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 2.359375, |
| "learning_rate": 1.4660402185408046e-05, |
| "loss": 1.7807, |
| "step": 8660 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 2.5, |
| "learning_rate": 1.4638197147271548e-05, |
| "loss": 1.7953, |
| "step": 8680 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.4615962933188981e-05, |
| "loss": 1.7902, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 2.5625, |
| "learning_rate": 1.4593699683021625e-05, |
| "loss": 1.7849, |
| "step": 8720 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 2.765625, |
| "learning_rate": 1.4571407536813422e-05, |
| "loss": 1.7814, |
| "step": 8740 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 2.375, |
| "learning_rate": 1.4549086634790075e-05, |
| "loss": 1.7932, |
| "step": 8760 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.4526737117358167e-05, |
| "loss": 1.789, |
| "step": 8780 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.4504359125104292e-05, |
| "loss": 1.7828, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 2.34375, |
| "learning_rate": 1.4481952798794152e-05, |
| "loss": 1.7876, |
| "step": 8820 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 2.328125, |
| "learning_rate": 1.4459518279371692e-05, |
| "loss": 1.794, |
| "step": 8840 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 2.734375, |
| "learning_rate": 1.4437055707958184e-05, |
| "loss": 1.7919, |
| "step": 8860 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.4414565225851371e-05, |
| "loss": 1.7846, |
| "step": 8880 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.4392046974524565e-05, |
| "loss": 1.7843, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 2.40625, |
| "learning_rate": 1.4369501095625747e-05, |
| "loss": 1.7726, |
| "step": 8920 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.4346927730976691e-05, |
| "loss": 1.7836, |
| "step": 8940 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 2.8125, |
| "learning_rate": 1.4324327022572073e-05, |
| "loss": 1.776, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 2.296875, |
| "learning_rate": 1.4301699112578557e-05, |
| "loss": 1.7903, |
| "step": 8980 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 2.5, |
| "learning_rate": 1.4279044143333926e-05, |
| "loss": 1.7757, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 2.3125, |
| "learning_rate": 1.425636225734617e-05, |
| "loss": 1.7705, |
| "step": 9020 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 2.5, |
| "learning_rate": 1.42336535972926e-05, |
| "loss": 1.8011, |
| "step": 9040 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 2.375, |
| "learning_rate": 1.4210918306018937e-05, |
| "loss": 1.7795, |
| "step": 9060 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 2.78125, |
| "learning_rate": 1.4188156526538435e-05, |
| "loss": 1.7965, |
| "step": 9080 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 2.796875, |
| "learning_rate": 1.4165368402030952e-05, |
| "loss": 1.7631, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.4142554075842083e-05, |
| "loss": 1.7949, |
| "step": 9120 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 2.625, |
| "learning_rate": 1.4119713691482228e-05, |
| "loss": 1.785, |
| "step": 9140 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.4096847392625708e-05, |
| "loss": 1.777, |
| "step": 9160 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.4073955323109859e-05, |
| "loss": 1.779, |
| "step": 9180 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 2.5, |
| "learning_rate": 1.4051037626934112e-05, |
| "loss": 1.7815, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 2.828125, |
| "learning_rate": 1.4028094448259113e-05, |
| "loss": 1.7852, |
| "step": 9220 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 2.375, |
| "learning_rate": 1.4005125931405792e-05, |
| "loss": 1.7999, |
| "step": 9240 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.3982132220854472e-05, |
| "loss": 1.791, |
| "step": 9260 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.3959113461243952e-05, |
| "loss": 1.7836, |
| "step": 9280 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 2.6875, |
| "learning_rate": 1.3936069797370591e-05, |
| "loss": 1.778, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 2.234375, |
| "learning_rate": 1.3913001374187421e-05, |
| "loss": 1.8065, |
| "step": 9320 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 2.671875, |
| "learning_rate": 1.3889908336803198e-05, |
| "loss": 1.8035, |
| "step": 9340 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.3866790830481529e-05, |
| "loss": 1.7789, |
| "step": 9360 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 2.65625, |
| "learning_rate": 1.3843649000639933e-05, |
| "loss": 1.7706, |
| "step": 9380 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.3820482992848929e-05, |
| "loss": 1.7685, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.3797292952831127e-05, |
| "loss": 1.7687, |
| "step": 9420 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 2.375, |
| "learning_rate": 1.3774079026460308e-05, |
| "loss": 1.7768, |
| "step": 9440 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 2.5, |
| "learning_rate": 1.3750841359760511e-05, |
| "loss": 1.7878, |
| "step": 9460 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 2.40625, |
| "learning_rate": 1.37275800989051e-05, |
| "loss": 1.792, |
| "step": 9480 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 2.75, |
| "learning_rate": 1.3704295390215868e-05, |
| "loss": 1.7822, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.3680987380162095e-05, |
| "loss": 1.7831, |
| "step": 9520 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.3657656215359634e-05, |
| "loss": 1.7819, |
| "step": 9540 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.3634302042569995e-05, |
| "loss": 1.7839, |
| "step": 9560 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 2.3125, |
| "learning_rate": 1.3610925008699413e-05, |
| "loss": 1.7905, |
| "step": 9580 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.3587525260797934e-05, |
| "loss": 1.7785, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 2.328125, |
| "learning_rate": 1.3564102946058468e-05, |
| "loss": 1.7846, |
| "step": 9620 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.3540658211815898e-05, |
| "loss": 1.7841, |
| "step": 9640 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.3517191205546121e-05, |
| "loss": 1.774, |
| "step": 9660 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 2.578125, |
| "learning_rate": 1.3493702074865139e-05, |
| "loss": 1.7947, |
| "step": 9680 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.3470190967528118e-05, |
| "loss": 1.7843, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 2.296875, |
| "learning_rate": 1.3446658031428474e-05, |
| "loss": 1.7796, |
| "step": 9720 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 2.5, |
| "learning_rate": 1.3423103414596929e-05, |
| "loss": 1.7713, |
| "step": 9740 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 2.734375, |
| "learning_rate": 1.3399527265200581e-05, |
| "loss": 1.7769, |
| "step": 9760 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.3375929731541986e-05, |
| "loss": 1.7823, |
| "step": 9780 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 2.59375, |
| "learning_rate": 1.3352310962058202e-05, |
| "loss": 1.7642, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 2.296875, |
| "learning_rate": 1.332867110531988e-05, |
| "loss": 1.7841, |
| "step": 9820 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 2.609375, |
| "learning_rate": 1.3305010310030311e-05, |
| "loss": 1.7897, |
| "step": 9840 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.3281328725024496e-05, |
| "loss": 1.7813, |
| "step": 9860 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 2.359375, |
| "learning_rate": 1.3257626499268217e-05, |
| "loss": 1.7828, |
| "step": 9880 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.3233903781857084e-05, |
| "loss": 1.7809, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.3210160722015619e-05, |
| "loss": 1.7768, |
| "step": 9920 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.3186397469096295e-05, |
| "loss": 1.7816, |
| "step": 9940 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 2.703125, |
| "learning_rate": 1.3162614172578614e-05, |
| "loss": 1.7741, |
| "step": 9960 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.3138810982068154e-05, |
| "loss": 1.7801, |
| "step": 9980 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.3114988047295638e-05, |
| "loss": 1.7711, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 2.3125, |
| "learning_rate": 1.3091145518115982e-05, |
| "loss": 1.7807, |
| "step": 10020 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 2.578125, |
| "learning_rate": 1.3067283544507366e-05, |
| "loss": 1.7835, |
| "step": 10040 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 2.609375, |
| "learning_rate": 1.3043402276570276e-05, |
| "loss": 1.7746, |
| "step": 10060 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.3019501864526565e-05, |
| "loss": 1.7742, |
| "step": 10080 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 2.25, |
| "learning_rate": 1.2995582458718518e-05, |
| "loss": 1.7811, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 2.3125, |
| "learning_rate": 1.2971644209607893e-05, |
| "loss": 1.7684, |
| "step": 10120 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 2.703125, |
| "learning_rate": 1.2947687267774973e-05, |
| "loss": 1.7778, |
| "step": 10140 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.2923711783917637e-05, |
| "loss": 1.7587, |
| "step": 10160 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 2.296875, |
| "learning_rate": 1.2899717908850385e-05, |
| "loss": 1.784, |
| "step": 10180 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 2.515625, |
| "learning_rate": 1.2875705793503424e-05, |
| "loss": 1.773, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 2.6875, |
| "learning_rate": 1.2851675588921677e-05, |
| "loss": 1.7721, |
| "step": 10220 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 2.75, |
| "learning_rate": 1.2827627446263877e-05, |
| "loss": 1.7781, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 2.734375, |
| "learning_rate": 1.2803561516801575e-05, |
| "loss": 1.7935, |
| "step": 10260 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 2.34375, |
| "learning_rate": 1.2779477951918217e-05, |
| "loss": 1.7746, |
| "step": 10280 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.2755376903108183e-05, |
| "loss": 1.7783, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.2731258521975829e-05, |
| "loss": 1.7812, |
| "step": 10320 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.2707122960234544e-05, |
| "loss": 1.7742, |
| "step": 10340 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 2.671875, |
| "learning_rate": 1.2682970369705773e-05, |
| "loss": 1.7585, |
| "step": 10360 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 2.671875, |
| "learning_rate": 1.2658800902318103e-05, |
| "loss": 1.7848, |
| "step": 10380 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 2.609375, |
| "learning_rate": 1.2634614710106266e-05, |
| "loss": 1.7784, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.2610411945210199e-05, |
| "loss": 1.7762, |
| "step": 10420 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 2.359375, |
| "learning_rate": 1.2586192759874094e-05, |
| "loss": 1.7686, |
| "step": 10440 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 2.734375, |
| "learning_rate": 1.2561957306445428e-05, |
| "loss": 1.7861, |
| "step": 10460 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.253770573737402e-05, |
| "loss": 1.7744, |
| "step": 10480 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 2.40625, |
| "learning_rate": 1.2513438205211048e-05, |
| "loss": 1.7703, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 2.65625, |
| "learning_rate": 1.2489154862608111e-05, |
| "loss": 1.7785, |
| "step": 10520 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.2464855862316263e-05, |
| "loss": 1.7789, |
| "step": 10540 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.244054135718505e-05, |
| "loss": 1.7766, |
| "step": 10560 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.2416211500161546e-05, |
| "loss": 1.7805, |
| "step": 10580 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 2.5, |
| "learning_rate": 1.2391866444289394e-05, |
| "loss": 1.7769, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 2.5, |
| "learning_rate": 1.2367506342707851e-05, |
| "loss": 1.7727, |
| "step": 10620 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 2.640625, |
| "learning_rate": 1.2343131348650806e-05, |
| "loss": 1.7603, |
| "step": 10640 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 2.375, |
| "learning_rate": 1.231874161544583e-05, |
| "loss": 1.7681, |
| "step": 10660 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 2.515625, |
| "learning_rate": 1.2294337296513219e-05, |
| "loss": 1.7705, |
| "step": 10680 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 2.375, |
| "learning_rate": 1.2269918545365e-05, |
| "loss": 1.7692, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.2245485515604004e-05, |
| "loss": 1.7685, |
| "step": 10720 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 2.84375, |
| "learning_rate": 1.2221038360922863e-05, |
| "loss": 1.7873, |
| "step": 10740 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 2.5, |
| "learning_rate": 1.219657723510307e-05, |
| "loss": 1.779, |
| "step": 10760 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 2.65625, |
| "learning_rate": 1.2172102292013994e-05, |
| "loss": 1.7963, |
| "step": 10780 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.2147613685611928e-05, |
| "loss": 1.7737, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 2.671875, |
| "learning_rate": 1.212311156993911e-05, |
| "loss": 1.7578, |
| "step": 10820 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 2.875, |
| "learning_rate": 1.2098596099122745e-05, |
| "loss": 1.7649, |
| "step": 10840 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 2.734375, |
| "learning_rate": 1.2074067427374068e-05, |
| "loss": 1.782, |
| "step": 10860 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 2.640625, |
| "learning_rate": 1.2049525708987331e-05, |
| "loss": 1.7729, |
| "step": 10880 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.2024971098338868e-05, |
| "loss": 1.7769, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 2.40625, |
| "learning_rate": 1.2000403749886108e-05, |
| "loss": 1.7761, |
| "step": 10920 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 2.640625, |
| "learning_rate": 1.1975823818166596e-05, |
| "loss": 1.7476, |
| "step": 10940 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 2.609375, |
| "learning_rate": 1.1951231457797047e-05, |
| "loss": 1.7814, |
| "step": 10960 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 2.703125, |
| "learning_rate": 1.1926626823472338e-05, |
| "loss": 1.7691, |
| "step": 10980 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 2.65625, |
| "learning_rate": 1.1902010069964569e-05, |
| "loss": 1.7756, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 2.5625, |
| "learning_rate": 1.1877381352122064e-05, |
| "loss": 1.7833, |
| "step": 11020 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.1852740824868416e-05, |
| "loss": 1.7659, |
| "step": 11040 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.1828088643201492e-05, |
| "loss": 1.772, |
| "step": 11060 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.180342496219248e-05, |
| "loss": 1.7516, |
| "step": 11080 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 2.609375, |
| "learning_rate": 1.17787499369849e-05, |
| "loss": 1.7647, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 2.515625, |
| "learning_rate": 1.1754063722793624e-05, |
| "loss": 1.769, |
| "step": 11120 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 2.671875, |
| "learning_rate": 1.1729366474903923e-05, |
| "loss": 1.7813, |
| "step": 11140 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 2.734375, |
| "learning_rate": 1.1704658348670455e-05, |
| "loss": 1.7669, |
| "step": 11160 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 2.328125, |
| "learning_rate": 1.1679939499516317e-05, |
| "loss": 1.7846, |
| "step": 11180 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 2.375, |
| "learning_rate": 1.165521008293206e-05, |
| "loss": 1.7719, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 2.65625, |
| "learning_rate": 1.1630470254474697e-05, |
| "loss": 1.7625, |
| "step": 11220 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 2.578125, |
| "learning_rate": 1.1605720169766752e-05, |
| "loss": 1.7721, |
| "step": 11240 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.1580959984495243e-05, |
| "loss": 1.7558, |
| "step": 11260 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 2.609375, |
| "learning_rate": 1.1556189854410744e-05, |
| "loss": 1.7633, |
| "step": 11280 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.1531409935326377e-05, |
| "loss": 1.7632, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 2.5, |
| "learning_rate": 1.1506620383116835e-05, |
| "loss": 1.7925, |
| "step": 11320 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.1481821353717418e-05, |
| "loss": 1.7667, |
| "step": 11340 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.145701300312303e-05, |
| "loss": 1.7733, |
| "step": 11360 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 2.703125, |
| "learning_rate": 1.1432195487387223e-05, |
| "loss": 1.7772, |
| "step": 11380 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 2.640625, |
| "learning_rate": 1.1407368962621184e-05, |
| "loss": 1.7459, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 2.59375, |
| "learning_rate": 1.1382533584992783e-05, |
| "loss": 1.7608, |
| "step": 11420 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.1357689510725571e-05, |
| "loss": 1.749, |
| "step": 11440 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 2.5625, |
| "learning_rate": 1.1332836896097808e-05, |
| "loss": 1.77, |
| "step": 11460 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 2.5, |
| "learning_rate": 1.1307975897441473e-05, |
| "loss": 1.7676, |
| "step": 11480 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.1283106671141282e-05, |
| "loss": 1.7755, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 2.5625, |
| "learning_rate": 1.1258229373633713e-05, |
| "loss": 1.7742, |
| "step": 11520 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 2.828125, |
| "learning_rate": 1.1233344161406008e-05, |
| "loss": 1.7606, |
| "step": 11540 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 2.640625, |
| "learning_rate": 1.12084511909952e-05, |
| "loss": 1.7749, |
| "step": 11560 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 2.640625, |
| "learning_rate": 1.1183550618987118e-05, |
| "loss": 1.7868, |
| "step": 11580 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 2.640625, |
| "learning_rate": 1.1158642602015415e-05, |
| "loss": 1.7712, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.1133727296760572e-05, |
| "loss": 1.7732, |
| "step": 11620 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.110880485994891e-05, |
| "loss": 1.7672, |
| "step": 11640 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 2.375, |
| "learning_rate": 1.1083875448351626e-05, |
| "loss": 1.7858, |
| "step": 11660 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 2.40625, |
| "learning_rate": 1.1058939218783772e-05, |
| "loss": 1.7683, |
| "step": 11680 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 2.59375, |
| "learning_rate": 1.10339963281033e-05, |
| "loss": 1.7813, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 2.578125, |
| "learning_rate": 1.100904693321006e-05, |
| "loss": 1.7745, |
| "step": 11720 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.0984091191044816e-05, |
| "loss": 1.7848, |
| "step": 11740 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 2.5, |
| "learning_rate": 1.0959129258588257e-05, |
| "loss": 1.7518, |
| "step": 11760 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.0934161292860008e-05, |
| "loss": 1.7768, |
| "step": 11780 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 2.5625, |
| "learning_rate": 1.0909187450917656e-05, |
| "loss": 1.7602, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 2.515625, |
| "learning_rate": 1.0884207889855735e-05, |
| "loss": 1.758, |
| "step": 11820 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 2.5625, |
| "learning_rate": 1.0859222766804778e-05, |
| "loss": 1.7761, |
| "step": 11840 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 2.609375, |
| "learning_rate": 1.0834232238930283e-05, |
| "loss": 1.7606, |
| "step": 11860 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 2.59375, |
| "learning_rate": 1.0809236463431754e-05, |
| "loss": 1.779, |
| "step": 11880 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 2.765625, |
| "learning_rate": 1.0784235597541708e-05, |
| "loss": 1.771, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 2.5625, |
| "learning_rate": 1.075922979852468e-05, |
| "loss": 1.7654, |
| "step": 11920 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 2.703125, |
| "learning_rate": 1.073421922367623e-05, |
| "loss": 1.7758, |
| "step": 11940 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.0709204030321972e-05, |
| "loss": 1.7592, |
| "step": 11960 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.068418437581656e-05, |
| "loss": 1.7741, |
| "step": 11980 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.0659160417542721e-05, |
| "loss": 1.759, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 2.6875, |
| "learning_rate": 1.0634132312910245e-05, |
| "loss": 1.7809, |
| "step": 12020 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 2.65625, |
| "learning_rate": 1.060910021935501e-05, |
| "loss": 1.7811, |
| "step": 12040 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 2.59375, |
| "learning_rate": 1.0584064294337983e-05, |
| "loss": 1.761, |
| "step": 12060 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 2.40625, |
| "learning_rate": 1.0559024695344233e-05, |
| "loss": 1.7515, |
| "step": 12080 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 2.359375, |
| "learning_rate": 1.0533981579881938e-05, |
| "loss": 1.7861, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.0508935105481402e-05, |
| "loss": 1.7643, |
| "step": 12120 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.0483885429694051e-05, |
| "loss": 1.7745, |
| "step": 12140 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.0458832710091448e-05, |
| "loss": 1.7539, |
| "step": 12160 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.0433777104264313e-05, |
| "loss": 1.7546, |
| "step": 12180 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.0408718769821512e-05, |
| "loss": 1.7606, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 2.765625, |
| "learning_rate": 1.0383657864389077e-05, |
| "loss": 1.7583, |
| "step": 12220 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 2.828125, |
| "learning_rate": 1.0358594545609207e-05, |
| "loss": 1.7659, |
| "step": 12240 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.0333528971139297e-05, |
| "loss": 1.7601, |
| "step": 12260 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.0308461298650923e-05, |
| "loss": 1.7612, |
| "step": 12280 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.0283391685828844e-05, |
| "loss": 1.7646, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 2.765625, |
| "learning_rate": 1.0258320290370051e-05, |
| "loss": 1.7741, |
| "step": 12320 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 2.359375, |
| "learning_rate": 1.0233247269982732e-05, |
| "loss": 1.7616, |
| "step": 12340 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 2.625, |
| "learning_rate": 1.0208172782385295e-05, |
| "loss": 1.7502, |
| "step": 12360 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 2.65625, |
| "learning_rate": 1.0183096985305385e-05, |
| "loss": 1.7806, |
| "step": 12380 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 2.578125, |
| "learning_rate": 1.0158020036478881e-05, |
| "loss": 1.7728, |
| "step": 12400 |
| } |
| ], |
| "logging_steps": 20, |
| "max_steps": 25052, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 200, |
| "total_flos": 1.9252630386848563e+19, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |