{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2239902080783354, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008159934720522236, "grad_norm": 21.462657928466797, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -2.7689709663391113, "logits/rejected": -2.0927348136901855, "logps/chosen": -55.28605270385742, "logps/rejected": -55.09723663330078, "loss": 2.7726, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0016319869441044472, "grad_norm": 34.621238708496094, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -2.360377311706543, "logits/rejected": -1.9050047397613525, "logps/chosen": -57.62479019165039, "logps/rejected": -48.28424072265625, "loss": 2.7726, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0024479804161566705, "grad_norm": 52.73011016845703, "learning_rate": 3e-06, "logits/chosen": -3.015127182006836, "logits/rejected": -2.0059075355529785, "logps/chosen": -51.95608901977539, "logps/rejected": -66.48678588867188, "loss": 2.7676, "rewards/accuracies": 0.5, "rewards/chosen": -0.005445980932563543, "rewards/margins": 0.002731609856709838, "rewards/rejected": -0.008177591487765312, "step": 3 }, { "epoch": 0.0032639738882088943, "grad_norm": 20.072193145751953, "learning_rate": 4.000000000000001e-06, "logits/chosen": -2.6263206005096436, "logits/rejected": -1.6859228610992432, "logps/chosen": -49.40510559082031, "logps/rejected": -44.30913543701172, "loss": 2.7083, "rewards/accuracies": 0.75, "rewards/chosen": 0.02305634319782257, "rewards/margins": 0.0330297015607357, "rewards/rejected": -0.009973359294235706, "step": 4 }, { "epoch": 0.004079967360261118, "grad_norm": 21.640443801879883, "learning_rate": 5e-06, "logits/chosen": -2.3080637454986572, "logits/rejected": -1.1792609691619873, "logps/chosen": -43.11738204956055, "logps/rejected": -46.14204788208008, "loss": 2.7418, "rewards/accuracies": 0.5, "rewards/chosen": 0.023705292493104935, "rewards/margins": 0.018291976302862167, "rewards/rejected": 0.005413317121565342, "step": 5 }, { "epoch": 0.004895960832313341, "grad_norm": 17.380878448486328, "learning_rate": 6e-06, "logits/chosen": -3.1565282344818115, "logits/rejected": -1.7998199462890625, "logps/chosen": -52.294700622558594, "logps/rejected": -53.27042007446289, "loss": 2.1441, "rewards/accuracies": 1.0, "rewards/chosen": 0.4977771043777466, "rewards/margins": 0.36135292053222656, "rewards/rejected": 0.13642415404319763, "step": 6 }, { "epoch": 0.005711954304365565, "grad_norm": 19.728750228881836, "learning_rate": 7.000000000000001e-06, "logits/chosen": -2.418670654296875, "logits/rejected": -1.8612295389175415, "logps/chosen": -51.502952575683594, "logps/rejected": -62.385284423828125, "loss": 2.4682, "rewards/accuracies": 1.0, "rewards/chosen": 0.15619251132011414, "rewards/margins": 0.16380366683006287, "rewards/rejected": -0.007611132226884365, "step": 7 }, { "epoch": 0.006527947776417789, "grad_norm": 74.89817810058594, "learning_rate": 8.000000000000001e-06, "logits/chosen": -3.066373825073242, "logits/rejected": -1.8911807537078857, "logps/chosen": -70.49410247802734, "logps/rejected": -56.10986328125, "loss": 2.5324, "rewards/accuracies": 0.625, "rewards/chosen": 0.23252645134925842, "rewards/margins": 0.2700779438018799, "rewards/rejected": -0.03755149245262146, "step": 8 }, { "epoch": 0.0073439412484700125, "grad_norm": 15.937973022460938, "learning_rate": 9e-06, "logits/chosen": -4.6419596672058105, "logits/rejected": -2.7592968940734863, "logps/chosen": -36.66855239868164, "logps/rejected": -53.96320343017578, "loss": 1.7536, "rewards/accuracies": 0.875, "rewards/chosen": 0.5054474472999573, "rewards/margins": 0.7306665778160095, "rewards/rejected": -0.2252192199230194, "step": 9 }, { "epoch": 0.008159934720522236, "grad_norm": 14.358619689941406, "learning_rate": 1e-05, "logits/chosen": -3.2576029300689697, "logits/rejected": -2.996980667114258, "logps/chosen": -53.363067626953125, "logps/rejected": -55.76760482788086, "loss": 1.4903, "rewards/accuracies": 1.0, "rewards/chosen": 0.3997817039489746, "rewards/margins": 0.8900778293609619, "rewards/rejected": -0.4902961254119873, "step": 10 }, { "epoch": 0.00897592819257446, "grad_norm": 39.451316833496094, "learning_rate": 1.1000000000000001e-05, "logits/chosen": -3.959218978881836, "logits/rejected": -3.180154800415039, "logps/chosen": -40.28937911987305, "logps/rejected": -57.90419006347656, "loss": 1.3797, "rewards/accuracies": 0.875, "rewards/chosen": 0.43097198009490967, "rewards/margins": 1.4005110263824463, "rewards/rejected": -0.9695389270782471, "step": 11 }, { "epoch": 0.009791921664626682, "grad_norm": 14.709641456604004, "learning_rate": 1.2e-05, "logits/chosen": -5.814691066741943, "logits/rejected": -5.604647159576416, "logps/chosen": -57.56425094604492, "logps/rejected": -75.95782470703125, "loss": 0.6467, "rewards/accuracies": 1.0, "rewards/chosen": 0.1183604821562767, "rewards/margins": 2.366847038269043, "rewards/rejected": -2.2484865188598633, "step": 12 }, { "epoch": 0.010607915136678907, "grad_norm": 7.058086395263672, "learning_rate": 1.3000000000000001e-05, "logits/chosen": -5.5055365562438965, "logits/rejected": -5.962566375732422, "logps/chosen": -65.0806884765625, "logps/rejected": -95.85297393798828, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.23569892346858978, "rewards/margins": 3.323744535446167, "rewards/rejected": -3.088045597076416, "step": 13 }, { "epoch": 0.01142390860873113, "grad_norm": 9.770357131958008, "learning_rate": 1.4000000000000001e-05, "logits/chosen": -7.015048503875732, "logits/rejected": -6.932949066162109, "logps/chosen": -47.70991897583008, "logps/rejected": -75.99567413330078, "loss": 0.391, "rewards/accuracies": 1.0, "rewards/chosen": 0.26448553800582886, "rewards/margins": 2.8304567337036133, "rewards/rejected": -2.5659713745117188, "step": 14 }, { "epoch": 0.012239902080783354, "grad_norm": 21.54039764404297, "learning_rate": 1.5e-05, "logits/chosen": -9.141258239746094, "logits/rejected": -9.275306701660156, "logps/chosen": -50.19709777832031, "logps/rejected": -105.00299072265625, "loss": 0.2362, "rewards/accuracies": 1.0, "rewards/chosen": 0.027625679969787598, "rewards/margins": 5.023341655731201, "rewards/rejected": -4.995716094970703, "step": 15 }, { "epoch": 0.013055895552835577, "grad_norm": 8.572073936462402, "learning_rate": 1.6000000000000003e-05, "logits/chosen": -11.952176094055176, "logits/rejected": -11.612899780273438, "logps/chosen": -30.554523468017578, "logps/rejected": -70.74420928955078, "loss": 0.2354, "rewards/accuracies": 1.0, "rewards/chosen": 0.9949737191200256, "rewards/margins": 3.6470232009887695, "rewards/rejected": -2.6520495414733887, "step": 16 }, { "epoch": 0.0138718890248878, "grad_norm": 6.92222261428833, "learning_rate": 1.7000000000000003e-05, "logits/chosen": -10.995304107666016, "logits/rejected": -11.316845893859863, "logps/chosen": -55.76210403442383, "logps/rejected": -110.64358520507812, "loss": 0.1492, "rewards/accuracies": 1.0, "rewards/chosen": -0.33038485050201416, "rewards/margins": 5.116985321044922, "rewards/rejected": -5.447370529174805, "step": 17 }, { "epoch": 0.014687882496940025, "grad_norm": 6.291208744049072, "learning_rate": 1.8e-05, "logits/chosen": -13.242725372314453, "logits/rejected": -12.680657386779785, "logps/chosen": -58.80963897705078, "logps/rejected": -131.2765350341797, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": -1.202932596206665, "rewards/margins": 7.289811611175537, "rewards/rejected": -8.492744445800781, "step": 18 }, { "epoch": 0.015503875968992248, "grad_norm": 116.36194610595703, "learning_rate": 1.9e-05, "logits/chosen": -13.954850196838379, "logits/rejected": -13.743473052978516, "logps/chosen": -84.54165649414062, "logps/rejected": -188.95101928710938, "loss": 2.6982, "rewards/accuracies": 0.875, "rewards/chosen": -2.9840242862701416, "rewards/margins": 9.399154663085938, "rewards/rejected": -12.383180618286133, "step": 19 }, { "epoch": 0.016319869441044473, "grad_norm": 220.90142822265625, "learning_rate": 2e-05, "logits/chosen": -13.938445091247559, "logits/rejected": -13.273670196533203, "logps/chosen": -116.75308227539062, "logps/rejected": -182.36404418945312, "loss": 2.6181, "rewards/accuracies": 0.75, "rewards/chosen": -4.653705596923828, "rewards/margins": 7.279078006744385, "rewards/rejected": -11.932783126831055, "step": 20 }, { "epoch": 0.017135862913096694, "grad_norm": 29.513032913208008, "learning_rate": 2.1e-05, "logits/chosen": -13.414140701293945, "logits/rejected": -14.644088745117188, "logps/chosen": -83.08662414550781, "logps/rejected": -141.91819763183594, "loss": 0.2906, "rewards/accuracies": 1.0, "rewards/chosen": -3.196918487548828, "rewards/margins": 5.376188278198242, "rewards/rejected": -8.57310676574707, "step": 21 }, { "epoch": 0.01795185638514892, "grad_norm": 10.239615440368652, "learning_rate": 2.2000000000000003e-05, "logits/chosen": -14.049056053161621, "logits/rejected": -13.65890884399414, "logps/chosen": -116.70942687988281, "logps/rejected": -209.09388732910156, "loss": 0.2311, "rewards/accuracies": 1.0, "rewards/chosen": -5.642006874084473, "rewards/margins": 7.908568859100342, "rewards/rejected": -13.550575256347656, "step": 22 }, { "epoch": 0.018767849857201143, "grad_norm": 57.80908203125, "learning_rate": 2.3000000000000003e-05, "logits/chosen": -14.986572265625, "logits/rejected": -15.497381210327148, "logps/chosen": -129.2240447998047, "logps/rejected": -159.7672882080078, "loss": 1.8919, "rewards/accuracies": 0.875, "rewards/chosen": -7.9429216384887695, "rewards/margins": 3.274618625640869, "rewards/rejected": -11.217540740966797, "step": 23 }, { "epoch": 0.019583843329253364, "grad_norm": 49.76498794555664, "learning_rate": 2.4e-05, "logits/chosen": -13.990806579589844, "logits/rejected": -15.025201797485352, "logps/chosen": -113.8927001953125, "logps/rejected": -170.2935333251953, "loss": 1.0851, "rewards/accuracies": 0.75, "rewards/chosen": -5.509402275085449, "rewards/margins": 6.7729902267456055, "rewards/rejected": -12.282392501831055, "step": 24 }, { "epoch": 0.02039983680130559, "grad_norm": 31.881771087646484, "learning_rate": 2.5e-05, "logits/chosen": -13.707062721252441, "logits/rejected": -13.531257629394531, "logps/chosen": -127.38655090332031, "logps/rejected": -200.4104766845703, "loss": 0.5159, "rewards/accuracies": 1.0, "rewards/chosen": -7.633309364318848, "rewards/margins": 6.765519142150879, "rewards/rejected": -14.398828506469727, "step": 25 }, { "epoch": 0.021215830273357814, "grad_norm": 9.395254135131836, "learning_rate": 2.6000000000000002e-05, "logits/chosen": -12.398665428161621, "logits/rejected": -13.19998836517334, "logps/chosen": -117.2318115234375, "logps/rejected": -201.49085998535156, "loss": 0.1018, "rewards/accuracies": 1.0, "rewards/chosen": -6.3769001960754395, "rewards/margins": 8.47896957397461, "rewards/rejected": -14.855870246887207, "step": 26 }, { "epoch": 0.022031823745410038, "grad_norm": 17.17490005493164, "learning_rate": 2.7000000000000002e-05, "logits/chosen": -12.804766654968262, "logits/rejected": -12.747337341308594, "logps/chosen": -118.0882568359375, "logps/rejected": -176.5743408203125, "loss": 0.2446, "rewards/accuracies": 1.0, "rewards/chosen": -7.691473484039307, "rewards/margins": 5.3087592124938965, "rewards/rejected": -13.000232696533203, "step": 27 }, { "epoch": 0.02284781721746226, "grad_norm": 10.926363945007324, "learning_rate": 2.8000000000000003e-05, "logits/chosen": -11.687684059143066, "logits/rejected": -10.691304206848145, "logps/chosen": -104.44070434570312, "logps/rejected": -191.8475341796875, "loss": 0.1228, "rewards/accuracies": 1.0, "rewards/chosen": -5.38068151473999, "rewards/margins": 7.3574910163879395, "rewards/rejected": -12.73817253112793, "step": 28 }, { "epoch": 0.023663810689514484, "grad_norm": 3.3868727684020996, "learning_rate": 2.9e-05, "logits/chosen": -11.336647033691406, "logits/rejected": -11.679638862609863, "logps/chosen": -131.53826904296875, "logps/rejected": -215.94332885742188, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": -5.51068115234375, "rewards/margins": 10.960424423217773, "rewards/rejected": -16.471105575561523, "step": 29 }, { "epoch": 0.02447980416156671, "grad_norm": 0.26708680391311646, "learning_rate": 3e-05, "logits/chosen": -11.344396591186523, "logits/rejected": -12.2286376953125, "logps/chosen": -115.52413177490234, "logps/rejected": -212.9380340576172, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -7.0695576667785645, "rewards/margins": 10.059380531311035, "rewards/rejected": -17.128936767578125, "step": 30 }, { "epoch": 0.02529579763361893, "grad_norm": 0.03327327221632004, "learning_rate": 3.1e-05, "logits/chosen": -10.277887344360352, "logits/rejected": -11.581415176391602, "logps/chosen": -91.64991760253906, "logps/rejected": -203.35400390625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.347740650177002, "rewards/margins": 11.606093406677246, "rewards/rejected": -15.953832626342773, "step": 31 }, { "epoch": 0.026111791105671155, "grad_norm": 12.014016151428223, "learning_rate": 3.2000000000000005e-05, "logits/chosen": -10.310077667236328, "logits/rejected": -11.453861236572266, "logps/chosen": -111.13121795654297, "logps/rejected": -203.17271423339844, "loss": 0.5005, "rewards/accuracies": 0.875, "rewards/chosen": -5.597247123718262, "rewards/margins": 9.249977111816406, "rewards/rejected": -14.847223281860352, "step": 32 }, { "epoch": 0.02692778457772338, "grad_norm": 43.55177688598633, "learning_rate": 3.3e-05, "logits/chosen": -10.565553665161133, "logits/rejected": -11.34553337097168, "logps/chosen": -101.64309692382812, "logps/rejected": -180.67050170898438, "loss": 0.6738, "rewards/accuracies": 0.875, "rewards/chosen": -4.554203033447266, "rewards/margins": 8.794570922851562, "rewards/rejected": -13.348773956298828, "step": 33 }, { "epoch": 0.0277437780497756, "grad_norm": 8.4603271484375, "learning_rate": 3.4000000000000007e-05, "logits/chosen": -10.049832344055176, "logits/rejected": -11.216059684753418, "logps/chosen": -81.22886657714844, "logps/rejected": -187.88560485839844, "loss": 0.2336, "rewards/accuracies": 1.0, "rewards/chosen": -4.490256309509277, "rewards/margins": 9.600495338439941, "rewards/rejected": -14.090751647949219, "step": 34 }, { "epoch": 0.028559771521827825, "grad_norm": 1.8451354503631592, "learning_rate": 3.5e-05, "logits/chosen": -10.480775833129883, "logits/rejected": -10.309918403625488, "logps/chosen": -80.2640151977539, "logps/rejected": -141.5569610595703, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": -2.2878379821777344, "rewards/margins": 6.7369160652160645, "rewards/rejected": -9.02475357055664, "step": 35 }, { "epoch": 0.02937576499388005, "grad_norm": 0.06596250087022781, "learning_rate": 3.6e-05, "logits/chosen": -10.7702054977417, "logits/rejected": -10.448037147521973, "logps/chosen": -95.64859771728516, "logps/rejected": -202.57215881347656, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -3.0160765647888184, "rewards/margins": 10.345304489135742, "rewards/rejected": -13.361380577087402, "step": 36 }, { "epoch": 0.03019175846593227, "grad_norm": 3.2526538372039795, "learning_rate": 3.7e-05, "logits/chosen": -9.422935485839844, "logits/rejected": -10.116438865661621, "logps/chosen": -89.68400573730469, "logps/rejected": -188.861572265625, "loss": 0.0449, "rewards/accuracies": 1.0, "rewards/chosen": -2.67020320892334, "rewards/margins": 11.34065055847168, "rewards/rejected": -14.010854721069336, "step": 37 }, { "epoch": 0.031007751937984496, "grad_norm": 10.304131507873535, "learning_rate": 3.8e-05, "logits/chosen": -10.341386795043945, "logits/rejected": -9.998468399047852, "logps/chosen": -86.72317504882812, "logps/rejected": -153.13282775878906, "loss": 0.319, "rewards/accuracies": 1.0, "rewards/chosen": -4.598939895629883, "rewards/margins": 5.5302019119262695, "rewards/rejected": -10.129141807556152, "step": 38 }, { "epoch": 0.03182374541003672, "grad_norm": 0.18143615126609802, "learning_rate": 3.9000000000000006e-05, "logits/chosen": -10.753297805786133, "logits/rejected": -11.136191368103027, "logps/chosen": -91.9932861328125, "logps/rejected": -207.7485809326172, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -2.82356595993042, "rewards/margins": 11.506268501281738, "rewards/rejected": -14.329835891723633, "step": 39 }, { "epoch": 0.032639738882088945, "grad_norm": 0.0939520075917244, "learning_rate": 4e-05, "logits/chosen": -11.006842613220215, "logits/rejected": -11.542791366577148, "logps/chosen": -90.84228515625, "logps/rejected": -202.94210815429688, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -2.8707189559936523, "rewards/margins": 12.638145446777344, "rewards/rejected": -15.50886344909668, "step": 40 }, { "epoch": 0.03345573235414117, "grad_norm": 4.159176349639893, "learning_rate": 4.1e-05, "logits/chosen": -11.37222671508789, "logits/rejected": -11.739785194396973, "logps/chosen": -123.54754638671875, "logps/rejected": -203.88571166992188, "loss": 0.1035, "rewards/accuracies": 1.0, "rewards/chosen": -6.678891181945801, "rewards/margins": 7.975704193115234, "rewards/rejected": -14.654595375061035, "step": 41 }, { "epoch": 0.03427172582619339, "grad_norm": 0.24069182574748993, "learning_rate": 4.2e-05, "logits/chosen": -11.784760475158691, "logits/rejected": -11.90956974029541, "logps/chosen": -110.05374145507812, "logps/rejected": -204.52383422851562, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.914079666137695, "rewards/margins": 9.516246795654297, "rewards/rejected": -14.430326461791992, "step": 42 }, { "epoch": 0.03508771929824561, "grad_norm": 3.8147671222686768, "learning_rate": 4.3e-05, "logits/chosen": -13.683969497680664, "logits/rejected": -13.234493255615234, "logps/chosen": -112.96327209472656, "logps/rejected": -264.97998046875, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": -6.979959487915039, "rewards/margins": 13.236745834350586, "rewards/rejected": -20.216705322265625, "step": 43 }, { "epoch": 0.03590371277029784, "grad_norm": 0.06900728493928909, "learning_rate": 4.4000000000000006e-05, "logits/chosen": -14.96174430847168, "logits/rejected": -14.943832397460938, "logps/chosen": -154.12229919433594, "logps/rejected": -262.4346923828125, "loss": 0.3479, "rewards/accuracies": 0.875, "rewards/chosen": -10.08390998840332, "rewards/margins": 10.437463760375977, "rewards/rejected": -20.521373748779297, "step": 44 }, { "epoch": 0.03671970624235006, "grad_norm": 0.002201593015342951, "learning_rate": 4.5e-05, "logits/chosen": -15.305776596069336, "logits/rejected": -15.806853294372559, "logps/chosen": -157.091796875, "logps/rejected": -307.3651428222656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.818948745727539, "rewards/margins": 13.897600173950195, "rewards/rejected": -23.716548919677734, "step": 45 }, { "epoch": 0.037535699714402286, "grad_norm": 15.401101112365723, "learning_rate": 4.600000000000001e-05, "logits/chosen": -18.102149963378906, "logits/rejected": -18.22096824645996, "logps/chosen": -208.415283203125, "logps/rejected": -293.334228515625, "loss": 0.7039, "rewards/accuracies": 0.875, "rewards/chosen": -15.247286796569824, "rewards/margins": 8.748310089111328, "rewards/rejected": -23.99559783935547, "step": 46 }, { "epoch": 0.03835169318645451, "grad_norm": 43.82505416870117, "learning_rate": 4.7e-05, "logits/chosen": -19.0135498046875, "logits/rejected": -19.536161422729492, "logps/chosen": -207.71014404296875, "logps/rejected": -299.86199951171875, "loss": 1.1542, "rewards/accuracies": 0.875, "rewards/chosen": -15.108871459960938, "rewards/margins": 9.15868854522705, "rewards/rejected": -24.267559051513672, "step": 47 }, { "epoch": 0.03916768665850673, "grad_norm": 12.39047908782959, "learning_rate": 4.8e-05, "logits/chosen": -19.262493133544922, "logits/rejected": -19.780364990234375, "logps/chosen": -161.326416015625, "logps/rejected": -242.02845764160156, "loss": 0.2724, "rewards/accuracies": 1.0, "rewards/chosen": -10.75531005859375, "rewards/margins": 9.173399925231934, "rewards/rejected": -19.9287109375, "step": 48 }, { "epoch": 0.03998368013055895, "grad_norm": 2.1911697387695312, "learning_rate": 4.9e-05, "logits/chosen": -18.363576889038086, "logits/rejected": -18.51629638671875, "logps/chosen": -150.31112670898438, "logps/rejected": -275.11639404296875, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": -10.645605087280273, "rewards/margins": 11.189362525939941, "rewards/rejected": -21.83496856689453, "step": 49 }, { "epoch": 0.04079967360261118, "grad_norm": 35.07072067260742, "learning_rate": 5e-05, "logits/chosen": -16.629329681396484, "logits/rejected": -16.230377197265625, "logps/chosen": -137.39036560058594, "logps/rejected": -223.52264404296875, "loss": 0.4648, "rewards/accuracies": 0.875, "rewards/chosen": -7.266803741455078, "rewards/margins": 9.153909683227539, "rewards/rejected": -16.420713424682617, "step": 50 }, { "epoch": 0.0416156670746634, "grad_norm": 0.9441677331924438, "learning_rate": 5.1000000000000006e-05, "logits/chosen": -14.022198677062988, "logits/rejected": -13.892732620239258, "logps/chosen": -86.72583770751953, "logps/rejected": -222.68865966796875, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -2.613485097885132, "rewards/margins": 13.346776008605957, "rewards/rejected": -15.960260391235352, "step": 51 }, { "epoch": 0.04243166054671563, "grad_norm": 1.8051077127456665, "learning_rate": 5.2000000000000004e-05, "logits/chosen": -13.61818790435791, "logits/rejected": -13.113778114318848, "logps/chosen": -69.76924133300781, "logps/rejected": -145.86016845703125, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": -1.9107009172439575, "rewards/margins": 6.816506385803223, "rewards/rejected": -8.727208137512207, "step": 52 }, { "epoch": 0.04324765401876785, "grad_norm": 0.003828912042081356, "learning_rate": 5.300000000000001e-05, "logits/chosen": -13.905014991760254, "logits/rejected": -14.161726951599121, "logps/chosen": -62.63215637207031, "logps/rejected": -226.73385620117188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.5790208578109741, "rewards/margins": 15.10482406616211, "rewards/rejected": -16.68384552001953, "step": 53 }, { "epoch": 0.044063647490820076, "grad_norm": 7.056990146636963, "learning_rate": 5.4000000000000005e-05, "logits/chosen": -14.201363563537598, "logits/rejected": -14.156310081481934, "logps/chosen": -91.08805847167969, "logps/rejected": -195.93544006347656, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": -4.004441738128662, "rewards/margins": 9.678146362304688, "rewards/rejected": -13.682588577270508, "step": 54 }, { "epoch": 0.044879640962872294, "grad_norm": 0.00023165266611613333, "learning_rate": 5.500000000000001e-05, "logits/chosen": -13.920963287353516, "logits/rejected": -14.665756225585938, "logps/chosen": -97.17684936523438, "logps/rejected": -211.41957092285156, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -2.7422127723693848, "rewards/margins": 13.699962615966797, "rewards/rejected": -16.442174911499023, "step": 55 }, { "epoch": 0.04569563443492452, "grad_norm": 0.5873515009880066, "learning_rate": 5.6000000000000006e-05, "logits/chosen": -15.820951461791992, "logits/rejected": -15.888005256652832, "logps/chosen": -152.45887756347656, "logps/rejected": -296.1074523925781, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -8.931063652038574, "rewards/margins": 12.730661392211914, "rewards/rejected": -21.661724090576172, "step": 56 }, { "epoch": 0.046511627906976744, "grad_norm": 0.8842801451683044, "learning_rate": 5.6999999999999996e-05, "logits/chosen": -14.706609725952148, "logits/rejected": -15.38596248626709, "logps/chosen": -145.334716796875, "logps/rejected": -243.2307586669922, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -8.153348922729492, "rewards/margins": 10.173572540283203, "rewards/rejected": -18.326921463012695, "step": 57 }, { "epoch": 0.04732762137902897, "grad_norm": 0.5146946310997009, "learning_rate": 5.8e-05, "logits/chosen": -17.250165939331055, "logits/rejected": -18.05356216430664, "logps/chosen": -120.22584533691406, "logps/rejected": -205.58740234375, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": -7.734566688537598, "rewards/margins": 8.845052719116211, "rewards/rejected": -16.579618453979492, "step": 58 }, { "epoch": 0.04814361485108119, "grad_norm": 3.981776237487793, "learning_rate": 5.9e-05, "logits/chosen": -18.58802032470703, "logits/rejected": -19.030080795288086, "logps/chosen": -131.78843688964844, "logps/rejected": -252.42808532714844, "loss": 0.0695, "rewards/accuracies": 1.0, "rewards/chosen": -8.43131160736084, "rewards/margins": 11.851092338562012, "rewards/rejected": -20.28240394592285, "step": 59 }, { "epoch": 0.04895960832313342, "grad_norm": 0.09226488322019577, "learning_rate": 6e-05, "logits/chosen": -17.94907569885254, "logits/rejected": -18.652515411376953, "logps/chosen": -125.25306701660156, "logps/rejected": -265.3421325683594, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -8.225669860839844, "rewards/margins": 13.431042671203613, "rewards/rejected": -21.656713485717773, "step": 60 }, { "epoch": 0.049775601795185635, "grad_norm": 0.013151478953659534, "learning_rate": 6.1e-05, "logits/chosen": -17.630491256713867, "logits/rejected": -18.552654266357422, "logps/chosen": -86.04399871826172, "logps/rejected": -329.23046875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.384803295135498, "rewards/margins": 23.05382537841797, "rewards/rejected": -27.438629150390625, "step": 61 }, { "epoch": 0.05059159526723786, "grad_norm": 15.674346923828125, "learning_rate": 6.2e-05, "logits/chosen": -18.14073371887207, "logits/rejected": -18.707223892211914, "logps/chosen": -190.47344970703125, "logps/rejected": -268.0466613769531, "loss": 0.6325, "rewards/accuracies": 0.875, "rewards/chosen": -12.520374298095703, "rewards/margins": 9.70265007019043, "rewards/rejected": -22.223026275634766, "step": 62 }, { "epoch": 0.051407588739290085, "grad_norm": 0.20326249301433563, "learning_rate": 6.3e-05, "logits/chosen": -18.666969299316406, "logits/rejected": -18.980743408203125, "logps/chosen": -165.6114501953125, "logps/rejected": -329.83148193359375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -10.465351104736328, "rewards/margins": 17.443470001220703, "rewards/rejected": -27.90882110595703, "step": 63 }, { "epoch": 0.05222358221134231, "grad_norm": 0.01380116306245327, "learning_rate": 6.400000000000001e-05, "logits/chosen": -18.09732437133789, "logits/rejected": -18.72121238708496, "logps/chosen": -142.4990234375, "logps/rejected": -351.5462341308594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.254779815673828, "rewards/margins": 20.03445816040039, "rewards/rejected": -29.28923797607422, "step": 64 }, { "epoch": 0.053039575683394534, "grad_norm": 1.4752107858657837, "learning_rate": 6.500000000000001e-05, "logits/chosen": -17.80196762084961, "logits/rejected": -18.21116065979004, "logps/chosen": -173.88632202148438, "logps/rejected": -391.4139404296875, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -11.253101348876953, "rewards/margins": 20.102920532226562, "rewards/rejected": -31.356021881103516, "step": 65 }, { "epoch": 0.05385556915544676, "grad_norm": 0.007827825844287872, "learning_rate": 6.6e-05, "logits/chosen": -18.946306228637695, "logits/rejected": -19.263242721557617, "logps/chosen": -219.56671142578125, "logps/rejected": -397.2974548339844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -14.89900016784668, "rewards/margins": 18.02783203125, "rewards/rejected": -32.92683410644531, "step": 66 }, { "epoch": 0.05467156262749898, "grad_norm": 14.491320610046387, "learning_rate": 6.7e-05, "logits/chosen": -18.1463565826416, "logits/rejected": -18.792158126831055, "logps/chosen": -197.15493774414062, "logps/rejected": -295.4598693847656, "loss": 0.3057, "rewards/accuracies": 1.0, "rewards/chosen": -15.303884506225586, "rewards/margins": 10.378215789794922, "rewards/rejected": -25.68210220336914, "step": 67 }, { "epoch": 0.0554875560995512, "grad_norm": 49.27912521362305, "learning_rate": 6.800000000000001e-05, "logits/chosen": -17.420108795166016, "logits/rejected": -17.716339111328125, "logps/chosen": -205.34857177734375, "logps/rejected": -316.29315185546875, "loss": 1.4839, "rewards/accuracies": 0.875, "rewards/chosen": -16.214710235595703, "rewards/margins": 11.022737503051758, "rewards/rejected": -27.237449645996094, "step": 68 }, { "epoch": 0.056303549571603426, "grad_norm": 0.08895689249038696, "learning_rate": 6.9e-05, "logits/chosen": -17.380586624145508, "logits/rejected": -17.88616943359375, "logps/chosen": -199.71690368652344, "logps/rejected": -349.3351135253906, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -15.195581436157227, "rewards/margins": 14.23803997039795, "rewards/rejected": -29.433622360229492, "step": 69 }, { "epoch": 0.05711954304365565, "grad_norm": 1.3052929639816284, "learning_rate": 7e-05, "logits/chosen": -16.54950714111328, "logits/rejected": -17.362905502319336, "logps/chosen": -193.903076171875, "logps/rejected": -376.810546875, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": -13.714447021484375, "rewards/margins": 18.415538787841797, "rewards/rejected": -32.12998580932617, "step": 70 }, { "epoch": 0.057935536515707875, "grad_norm": 0.1257426142692566, "learning_rate": 7.1e-05, "logits/chosen": -16.73952293395996, "logits/rejected": -17.64552879333496, "logps/chosen": -222.7097625732422, "logps/rejected": -423.40069580078125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -15.788162231445312, "rewards/margins": 19.77151870727539, "rewards/rejected": -35.5596809387207, "step": 71 }, { "epoch": 0.0587515299877601, "grad_norm": 0.00047821071348153055, "learning_rate": 7.2e-05, "logits/chosen": -16.672523498535156, "logits/rejected": -17.659717559814453, "logps/chosen": -219.80979919433594, "logps/rejected": -406.72467041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.919352531433105, "rewards/margins": 20.131080627441406, "rewards/rejected": -35.05043411254883, "step": 72 }, { "epoch": 0.059567523459812324, "grad_norm": 0.4082631766796112, "learning_rate": 7.3e-05, "logits/chosen": -16.708377838134766, "logits/rejected": -17.612152099609375, "logps/chosen": -195.7980499267578, "logps/rejected": -345.5812683105469, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -14.880353927612305, "rewards/margins": 13.664918899536133, "rewards/rejected": -28.545270919799805, "step": 73 }, { "epoch": 0.06038351693186454, "grad_norm": 0.07692300528287888, "learning_rate": 7.4e-05, "logits/chosen": -16.438520431518555, "logits/rejected": -17.72043228149414, "logps/chosen": -195.89724731445312, "logps/rejected": -358.75933837890625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -15.166898727416992, "rewards/margins": 15.396188735961914, "rewards/rejected": -30.563091278076172, "step": 74 }, { "epoch": 0.06119951040391677, "grad_norm": 0.014075415208935738, "learning_rate": 7.500000000000001e-05, "logits/chosen": -16.659435272216797, "logits/rejected": -18.157087326049805, "logps/chosen": -196.61114501953125, "logps/rejected": -378.9595947265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.051222801208496, "rewards/margins": 17.932422637939453, "rewards/rejected": -32.983646392822266, "step": 75 }, { "epoch": 0.06201550387596899, "grad_norm": 3.203604865120724e-05, "learning_rate": 7.6e-05, "logits/chosen": -18.067096710205078, "logits/rejected": -19.08338165283203, "logps/chosen": -231.9807586669922, "logps/rejected": -492.6588134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.40658187866211, "rewards/margins": 24.783287048339844, "rewards/rejected": -41.18987274169922, "step": 76 }, { "epoch": 0.06283149734802121, "grad_norm": 9.396432687935885e-06, "learning_rate": 7.7e-05, "logits/chosen": -15.288285255432129, "logits/rejected": -18.411211013793945, "logps/chosen": -144.79306030273438, "logps/rejected": -370.4601745605469, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.595946311950684, "rewards/margins": 23.046783447265625, "rewards/rejected": -31.642728805541992, "step": 77 }, { "epoch": 0.06364749082007344, "grad_norm": 13.065900802612305, "learning_rate": 7.800000000000001e-05, "logits/chosen": -16.71186637878418, "logits/rejected": -18.02735137939453, "logps/chosen": -167.5548858642578, "logps/rejected": -340.9808349609375, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/chosen": -12.439533233642578, "rewards/margins": 16.875547409057617, "rewards/rejected": -29.315080642700195, "step": 78 }, { "epoch": 0.06446348429212566, "grad_norm": 0.02999279648065567, "learning_rate": 7.900000000000001e-05, "logits/chosen": -15.071319580078125, "logits/rejected": -17.911449432373047, "logps/chosen": -184.70310974121094, "logps/rejected": -375.94769287109375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -12.30439567565918, "rewards/margins": 20.031049728393555, "rewards/rejected": -32.33544158935547, "step": 79 }, { "epoch": 0.06527947776417789, "grad_norm": 0.002327166497707367, "learning_rate": 8e-05, "logits/chosen": -14.753462791442871, "logits/rejected": -16.586809158325195, "logps/chosen": -161.132080078125, "logps/rejected": -305.30126953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.008363723754883, "rewards/margins": 14.816070556640625, "rewards/rejected": -24.824432373046875, "step": 80 }, { "epoch": 0.06609547123623011, "grad_norm": 0.017179209738969803, "learning_rate": 8.1e-05, "logits/chosen": -14.335855484008789, "logits/rejected": -16.148284912109375, "logps/chosen": -115.20692443847656, "logps/rejected": -327.6485900878906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.040459632873535, "rewards/margins": 19.943601608276367, "rewards/rejected": -25.98406219482422, "step": 81 }, { "epoch": 0.06691146470828234, "grad_norm": 0.013045057654380798, "learning_rate": 8.2e-05, "logits/chosen": -14.145484924316406, "logits/rejected": -16.331363677978516, "logps/chosen": -138.76222229003906, "logps/rejected": -333.7987976074219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.8088202476501465, "rewards/margins": 20.57308578491211, "rewards/rejected": -27.381906509399414, "step": 82 }, { "epoch": 0.06772745818033456, "grad_norm": 2.1503865718841553, "learning_rate": 8.3e-05, "logits/chosen": -13.617944717407227, "logits/rejected": -15.571752548217773, "logps/chosen": -96.72132110595703, "logps/rejected": -244.99179077148438, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": -4.689030647277832, "rewards/margins": 14.99293041229248, "rewards/rejected": -19.681961059570312, "step": 83 }, { "epoch": 0.06854345165238677, "grad_norm": 10.67632007598877, "learning_rate": 8.4e-05, "logits/chosen": -15.039068222045898, "logits/rejected": -16.45990753173828, "logps/chosen": -118.89762115478516, "logps/rejected": -300.55706787109375, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": -6.650244235992432, "rewards/margins": 18.16445541381836, "rewards/rejected": -24.814701080322266, "step": 84 }, { "epoch": 0.069359445124439, "grad_norm": 0.00027416000375524163, "learning_rate": 8.5e-05, "logits/chosen": -13.035578727722168, "logits/rejected": -17.179458618164062, "logps/chosen": -98.36331176757812, "logps/rejected": -344.48614501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.606076717376709, "rewards/margins": 24.968786239624023, "rewards/rejected": -28.574861526489258, "step": 85 }, { "epoch": 0.07017543859649122, "grad_norm": 0.033160075545310974, "learning_rate": 8.6e-05, "logits/chosen": -16.409320831298828, "logits/rejected": -18.307891845703125, "logps/chosen": -127.08856964111328, "logps/rejected": -327.3031311035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.56443977355957, "rewards/margins": 18.717037200927734, "rewards/rejected": -27.281478881835938, "step": 86 }, { "epoch": 0.07099143206854346, "grad_norm": 0.03559010103344917, "learning_rate": 8.7e-05, "logits/chosen": -14.642630577087402, "logits/rejected": -17.001995086669922, "logps/chosen": -140.6201171875, "logps/rejected": -334.2969055175781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.407254219055176, "rewards/margins": 19.611309051513672, "rewards/rejected": -28.01856231689453, "step": 87 }, { "epoch": 0.07180742554059567, "grad_norm": 54.867652893066406, "learning_rate": 8.800000000000001e-05, "logits/chosen": -13.813724517822266, "logits/rejected": -17.63209342956543, "logps/chosen": -171.11184692382812, "logps/rejected": -375.3641357421875, "loss": 0.3691, "rewards/accuracies": 0.875, "rewards/chosen": -10.77604866027832, "rewards/margins": 21.147382736206055, "rewards/rejected": -31.923431396484375, "step": 88 }, { "epoch": 0.0726234190126479, "grad_norm": 0.0006915251724421978, "learning_rate": 8.900000000000001e-05, "logits/chosen": -11.482247352600098, "logits/rejected": -14.921418190002441, "logps/chosen": -112.84828186035156, "logps/rejected": -353.3184509277344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.958806991577148, "rewards/margins": 24.461135864257812, "rewards/rejected": -30.419940948486328, "step": 89 }, { "epoch": 0.07343941248470012, "grad_norm": 0.013733156956732273, "learning_rate": 9e-05, "logits/chosen": -10.106366157531738, "logits/rejected": -13.667549133300781, "logps/chosen": -128.18017578125, "logps/rejected": -331.735595703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.4582929611206055, "rewards/margins": 19.43147850036621, "rewards/rejected": -26.889772415161133, "step": 90 }, { "epoch": 0.07425540595675234, "grad_norm": 0.24836812913417816, "learning_rate": 9.1e-05, "logits/chosen": -8.172449111938477, "logits/rejected": -10.906209945678711, "logps/chosen": -104.70474243164062, "logps/rejected": -286.102783203125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -5.947463512420654, "rewards/margins": 16.653474807739258, "rewards/rejected": -22.600936889648438, "step": 91 }, { "epoch": 0.07507139942880457, "grad_norm": 1.3489576578140259, "learning_rate": 9.200000000000001e-05, "logits/chosen": -8.610447883605957, "logits/rejected": -10.612234115600586, "logps/chosen": -104.92532348632812, "logps/rejected": -240.49728393554688, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": -5.719016075134277, "rewards/margins": 14.276619911193848, "rewards/rejected": -19.995635986328125, "step": 92 }, { "epoch": 0.07588739290085679, "grad_norm": 62.06695556640625, "learning_rate": 9.300000000000001e-05, "logits/chosen": -7.438627243041992, "logits/rejected": -9.446512222290039, "logps/chosen": -102.69845581054688, "logps/rejected": -206.41064453125, "loss": 2.9063, "rewards/accuracies": 0.875, "rewards/chosen": -3.6037256717681885, "rewards/margins": 10.236214637756348, "rewards/rejected": -13.839940071105957, "step": 93 }, { "epoch": 0.07670338637290902, "grad_norm": 0.8230538964271545, "learning_rate": 9.4e-05, "logits/chosen": -5.176508903503418, "logits/rejected": -6.047417640686035, "logps/chosen": -48.08903503417969, "logps/rejected": -101.95512390136719, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": 0.960364043712616, "rewards/margins": 5.971961975097656, "rewards/rejected": -5.011598110198975, "step": 94 }, { "epoch": 0.07751937984496124, "grad_norm": 1.5156131982803345, "learning_rate": 9.5e-05, "logits/chosen": -5.784613609313965, "logits/rejected": -6.794781684875488, "logps/chosen": -48.456748962402344, "logps/rejected": -114.17437744140625, "loss": 0.0807, "rewards/accuracies": 1.0, "rewards/chosen": 0.47982096672058105, "rewards/margins": 6.253100872039795, "rewards/rejected": -5.773279666900635, "step": 95 }, { "epoch": 0.07833537331701346, "grad_norm": 0.008328431285917759, "learning_rate": 9.6e-05, "logits/chosen": -11.646584510803223, "logits/rejected": -13.800525665283203, "logps/chosen": -97.26179504394531, "logps/rejected": -245.62057495117188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.622432231903076, "rewards/margins": 15.759281158447266, "rewards/rejected": -19.3817138671875, "step": 96 }, { "epoch": 0.07915136678906569, "grad_norm": 1.458761215209961, "learning_rate": 9.7e-05, "logits/chosen": -15.91093635559082, "logits/rejected": -16.875141143798828, "logps/chosen": -120.96035766601562, "logps/rejected": -298.62646484375, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": -6.976350784301758, "rewards/margins": 15.71484375, "rewards/rejected": -22.691192626953125, "step": 97 }, { "epoch": 0.0799673602611179, "grad_norm": 0.149215966463089, "learning_rate": 9.8e-05, "logits/chosen": -16.8973388671875, "logits/rejected": -18.833566665649414, "logps/chosen": -126.22310638427734, "logps/rejected": -285.57501220703125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -7.603396892547607, "rewards/margins": 16.279884338378906, "rewards/rejected": -23.88327980041504, "step": 98 }, { "epoch": 0.08078335373317014, "grad_norm": 0.0002216091816080734, "learning_rate": 9.900000000000001e-05, "logits/chosen": -17.101900100708008, "logits/rejected": -19.42289924621582, "logps/chosen": -106.19839477539062, "logps/rejected": -350.1490478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.912981033325195, "rewards/margins": 23.654104232788086, "rewards/rejected": -29.56708526611328, "step": 99 }, { "epoch": 0.08159934720522236, "grad_norm": 0.7281332612037659, "learning_rate": 0.0001, "logits/chosen": -18.330326080322266, "logits/rejected": -19.705284118652344, "logps/chosen": -254.52920532226562, "logps/rejected": -482.06707763671875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -20.432693481445312, "rewards/margins": 21.394956588745117, "rewards/rejected": -41.82765197753906, "step": 100 }, { "epoch": 0.08241534067727459, "grad_norm": 40.32618713378906, "learning_rate": 9.999987411224159e-05, "logits/chosen": -18.564014434814453, "logits/rejected": -19.810617446899414, "logps/chosen": -218.45751953125, "logps/rejected": -373.02996826171875, "loss": 0.8574, "rewards/accuracies": 0.875, "rewards/chosen": -17.71500587463379, "rewards/margins": 15.137849807739258, "rewards/rejected": -32.85285568237305, "step": 101 }, { "epoch": 0.0832313341493268, "grad_norm": 0.014034694992005825, "learning_rate": 9.999949644960028e-05, "logits/chosen": -17.31386375427246, "logits/rejected": -18.619089126586914, "logps/chosen": -199.452880859375, "logps/rejected": -497.09991455078125, "loss": 0.3467, "rewards/accuracies": 0.875, "rewards/chosen": -15.244401931762695, "rewards/margins": 28.73537826538086, "rewards/rejected": -43.97977828979492, "step": 102 }, { "epoch": 0.08404732762137902, "grad_norm": 0.03441351652145386, "learning_rate": 9.999886701397774e-05, "logits/chosen": -15.77090835571289, "logits/rejected": -16.552553176879883, "logps/chosen": -275.6441345214844, "logps/rejected": -467.2615051269531, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -21.916170120239258, "rewards/margins": 19.386369705200195, "rewards/rejected": -41.30253982543945, "step": 103 }, { "epoch": 0.08486332109343125, "grad_norm": 1.6955844330368564e-05, "learning_rate": 9.999798580854356e-05, "logits/chosen": -15.417091369628906, "logits/rejected": -16.316852569580078, "logps/chosen": -259.7345275878906, "logps/rejected": -609.6787719726562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.86127471923828, "rewards/margins": 35.95375061035156, "rewards/rejected": -54.815025329589844, "step": 104 }, { "epoch": 0.08567931456548347, "grad_norm": 0.11926229298114777, "learning_rate": 9.999685283773504e-05, "logits/chosen": -14.520143508911133, "logits/rejected": -15.06566047668457, "logps/chosen": -299.6357116699219, "logps/rejected": -530.3211669921875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -24.64225196838379, "rewards/margins": 22.366947174072266, "rewards/rejected": -47.00920104980469, "step": 105 }, { "epoch": 0.0864953080375357, "grad_norm": 0.013610818423330784, "learning_rate": 9.999546810725726e-05, "logits/chosen": -13.444767951965332, "logits/rejected": -13.745817184448242, "logps/chosen": -302.77587890625, "logps/rejected": -526.2947998046875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -25.608264923095703, "rewards/margins": 20.62717056274414, "rewards/rejected": -46.235435485839844, "step": 106 }, { "epoch": 0.08731130150958792, "grad_norm": 0.8795053958892822, "learning_rate": 9.999383162408304e-05, "logits/chosen": -13.638946533203125, "logits/rejected": -13.692296981811523, "logps/chosen": -367.74700927734375, "logps/rejected": -554.509033203125, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -32.09387969970703, "rewards/margins": 17.593647003173828, "rewards/rejected": -49.687530517578125, "step": 107 }, { "epoch": 0.08812729498164015, "grad_norm": 20.566598892211914, "learning_rate": 9.999194339645292e-05, "logits/chosen": -12.71544361114502, "logits/rejected": -12.343500137329102, "logps/chosen": -310.85955810546875, "logps/rejected": -459.6898193359375, "loss": 0.0862, "rewards/accuracies": 1.0, "rewards/chosen": -25.918561935424805, "rewards/margins": 16.130218505859375, "rewards/rejected": -42.04878234863281, "step": 108 }, { "epoch": 0.08894328845369237, "grad_norm": 8.07617173137487e-09, "learning_rate": 9.998980343387507e-05, "logits/chosen": -11.582489967346191, "logits/rejected": -10.471150398254395, "logps/chosen": -158.15115356445312, "logps/rejected": -509.56500244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.800251960754395, "rewards/margins": 35.28743362426758, "rewards/rejected": -45.087684631347656, "step": 109 }, { "epoch": 0.08975928192574459, "grad_norm": 0.0015460011782124639, "learning_rate": 9.998741174712533e-05, "logits/chosen": -10.296249389648438, "logits/rejected": -9.619437217712402, "logps/chosen": -168.74484252929688, "logps/rejected": -417.0572509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.69832992553711, "rewards/margins": 24.744081497192383, "rewards/rejected": -35.442413330078125, "step": 110 }, { "epoch": 0.09057527539779682, "grad_norm": 0.005372427869588137, "learning_rate": 9.998476834824704e-05, "logits/chosen": -10.647531509399414, "logits/rejected": -8.757180213928223, "logps/chosen": -133.31288146972656, "logps/rejected": -449.2030334472656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.490942001342773, "rewards/margins": 29.94868278503418, "rewards/rejected": -38.43962097167969, "step": 111 }, { "epoch": 0.09139126886984904, "grad_norm": 14.663928031921387, "learning_rate": 9.998187325055106e-05, "logits/chosen": -10.352989196777344, "logits/rejected": -9.076300621032715, "logps/chosen": -138.71669006347656, "logps/rejected": -326.07568359375, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": -8.126306533813477, "rewards/margins": 17.709117889404297, "rewards/rejected": -25.835424423217773, "step": 112 }, { "epoch": 0.09220726234190127, "grad_norm": 86.91736602783203, "learning_rate": 9.997872646861569e-05, "logits/chosen": -8.376124382019043, "logits/rejected": -8.201831817626953, "logps/chosen": -126.6820068359375, "logps/rejected": -205.21798706054688, "loss": 2.1599, "rewards/accuracies": 0.75, "rewards/chosen": -6.447875022888184, "rewards/margins": 7.875028610229492, "rewards/rejected": -14.322903633117676, "step": 113 }, { "epoch": 0.09302325581395349, "grad_norm": 37.286415100097656, "learning_rate": 9.997532801828658e-05, "logits/chosen": -9.0169677734375, "logits/rejected": -9.55757999420166, "logps/chosen": -194.37417602539062, "logps/rejected": -343.51007080078125, "loss": 0.3917, "rewards/accuracies": 0.875, "rewards/chosen": -14.820259094238281, "rewards/margins": 13.483481407165527, "rewards/rejected": -28.303739547729492, "step": 114 }, { "epoch": 0.09383924928600572, "grad_norm": 0.01762077398598194, "learning_rate": 9.997167791667667e-05, "logits/chosen": -8.619664192199707, "logits/rejected": -7.3895392417907715, "logps/chosen": -229.090576171875, "logps/rejected": -358.30487060546875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -18.156267166137695, "rewards/margins": 13.704174995422363, "rewards/rejected": -31.860441207885742, "step": 115 }, { "epoch": 0.09465524275805794, "grad_norm": 0.8990716934204102, "learning_rate": 9.996777618216607e-05, "logits/chosen": -8.047431945800781, "logits/rejected": -8.85681438446045, "logps/chosen": -247.82540893554688, "logps/rejected": -391.39306640625, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -18.89797019958496, "rewards/margins": 14.381750106811523, "rewards/rejected": -33.279720306396484, "step": 116 }, { "epoch": 0.09547123623011015, "grad_norm": 0.6635581254959106, "learning_rate": 9.996362283440202e-05, "logits/chosen": -8.270270347595215, "logits/rejected": -8.052099227905273, "logps/chosen": -268.5914306640625, "logps/rejected": -435.14691162109375, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -21.25659942626953, "rewards/margins": 16.09331512451172, "rewards/rejected": -37.34991455078125, "step": 117 }, { "epoch": 0.09628722970216239, "grad_norm": 48.9112434387207, "learning_rate": 9.995921789429874e-05, "logits/chosen": -8.27122974395752, "logits/rejected": -6.610650062561035, "logps/chosen": -312.5487976074219, "logps/rejected": -453.9895935058594, "loss": 1.0934, "rewards/accuracies": 0.875, "rewards/chosen": -24.368267059326172, "rewards/margins": 14.311613082885742, "rewards/rejected": -38.67987823486328, "step": 118 }, { "epoch": 0.0971032231742146, "grad_norm": 10.856385231018066, "learning_rate": 9.995456138403733e-05, "logits/chosen": -7.8151445388793945, "logits/rejected": -6.775144577026367, "logps/chosen": -290.8141784667969, "logps/rejected": -381.2417907714844, "loss": 0.1484, "rewards/accuracies": 1.0, "rewards/chosen": -22.78008270263672, "rewards/margins": 10.513654708862305, "rewards/rejected": -33.29373550415039, "step": 119 }, { "epoch": 0.09791921664626684, "grad_norm": 0.005845786537975073, "learning_rate": 9.994965332706573e-05, "logits/chosen": -8.525732040405273, "logits/rejected": -8.166800498962402, "logps/chosen": -281.24945068359375, "logps/rejected": -512.0765380859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -20.642168045043945, "rewards/margins": 23.59695816040039, "rewards/rejected": -44.23912811279297, "step": 120 }, { "epoch": 0.09873521011831905, "grad_norm": 104.05577850341797, "learning_rate": 9.994449374809851e-05, "logits/chosen": -9.384843826293945, "logits/rejected": -8.990543365478516, "logps/chosen": -326.7237548828125, "logps/rejected": -614.994873046875, "loss": 3.594, "rewards/accuracies": 0.875, "rewards/chosen": -24.50212287902832, "rewards/margins": 28.986713409423828, "rewards/rejected": -53.48883819580078, "step": 121 }, { "epoch": 0.09955120359037127, "grad_norm": 84.7860107421875, "learning_rate": 9.993908267311676e-05, "logits/chosen": -8.020954132080078, "logits/rejected": -2.6261682510375977, "logps/chosen": -319.34625244140625, "logps/rejected": -442.01861572265625, "loss": 3.8237, "rewards/accuracies": 0.875, "rewards/chosen": -27.016874313354492, "rewards/margins": 12.696240425109863, "rewards/rejected": -39.71311569213867, "step": 122 }, { "epoch": 0.1003671970624235, "grad_norm": 97.67596435546875, "learning_rate": 9.993342012936799e-05, "logits/chosen": -5.580348014831543, "logits/rejected": -1.784616470336914, "logps/chosen": -281.4082946777344, "logps/rejected": -451.0873718261719, "loss": 2.3548, "rewards/accuracies": 0.875, "rewards/chosen": -22.232406616210938, "rewards/margins": 17.07642936706543, "rewards/rejected": -39.308837890625, "step": 123 }, { "epoch": 0.10118319053447572, "grad_norm": 12.320988655090332, "learning_rate": 9.992750614536605e-05, "logits/chosen": -4.895771026611328, "logits/rejected": -3.48138165473938, "logps/chosen": -198.10154724121094, "logps/rejected": -335.56597900390625, "loss": 0.3589, "rewards/accuracies": 0.875, "rewards/chosen": -15.060635566711426, "rewards/margins": 13.6637544631958, "rewards/rejected": -28.72439193725586, "step": 124 }, { "epoch": 0.10199918400652795, "grad_norm": 1.2777900695800781, "learning_rate": 9.992134075089084e-05, "logits/chosen": -5.481958389282227, "logits/rejected": -4.407360076904297, "logps/chosen": -128.6956329345703, "logps/rejected": -250.60147094726562, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": -8.704690933227539, "rewards/margins": 10.010443687438965, "rewards/rejected": -18.715133666992188, "step": 125 }, { "epoch": 0.10281517747858017, "grad_norm": 0.47402527928352356, "learning_rate": 9.991492397698826e-05, "logits/chosen": -5.8929123878479, "logits/rejected": -4.749438285827637, "logps/chosen": -138.2623291015625, "logps/rejected": -314.9207763671875, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -8.541731834411621, "rewards/margins": 16.4551944732666, "rewards/rejected": -24.996925354003906, "step": 126 }, { "epoch": 0.1036311709506324, "grad_norm": 0.3539305627346039, "learning_rate": 9.990825585597003e-05, "logits/chosen": -6.004885673522949, "logits/rejected": -4.687943935394287, "logps/chosen": -148.11526489257812, "logps/rejected": -282.5763244628906, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -8.867768287658691, "rewards/margins": 12.994601249694824, "rewards/rejected": -21.862369537353516, "step": 127 }, { "epoch": 0.10444716442268462, "grad_norm": 7.2684855461120605, "learning_rate": 9.990133642141359e-05, "logits/chosen": -7.2527265548706055, "logits/rejected": -6.704451560974121, "logps/chosen": -152.5477294921875, "logps/rejected": -297.7027893066406, "loss": 0.1406, "rewards/accuracies": 1.0, "rewards/chosen": -9.373881340026855, "rewards/margins": 13.275394439697266, "rewards/rejected": -22.649276733398438, "step": 128 }, { "epoch": 0.10526315789473684, "grad_norm": 0.09790434688329697, "learning_rate": 9.989416570816177e-05, "logits/chosen": -7.8855695724487305, "logits/rejected": -6.360276222229004, "logps/chosen": -165.10397338867188, "logps/rejected": -342.9494934082031, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -11.482208251953125, "rewards/margins": 17.60296058654785, "rewards/rejected": -29.085168838500977, "step": 129 }, { "epoch": 0.10607915136678907, "grad_norm": 1.1120563745498657, "learning_rate": 9.98867437523228e-05, "logits/chosen": -8.783295631408691, "logits/rejected": -8.305458068847656, "logps/chosen": -130.01914978027344, "logps/rejected": -312.567626953125, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -8.467400550842285, "rewards/margins": 17.613445281982422, "rewards/rejected": -26.080848693847656, "step": 130 }, { "epoch": 0.10689514483884129, "grad_norm": 3.0355939865112305, "learning_rate": 9.987907059127004e-05, "logits/chosen": -9.47612190246582, "logits/rejected": -9.53102970123291, "logps/chosen": -163.37864685058594, "logps/rejected": -396.9827575683594, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": -10.2012300491333, "rewards/margins": 21.9224853515625, "rewards/rejected": -32.12371826171875, "step": 131 }, { "epoch": 0.10771113831089352, "grad_norm": 0.38698264956474304, "learning_rate": 9.987114626364171e-05, "logits/chosen": -9.094422340393066, "logits/rejected": -9.220941543579102, "logps/chosen": -152.22108459472656, "logps/rejected": -396.6138916015625, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -10.3824462890625, "rewards/margins": 22.49732208251953, "rewards/rejected": -32.87976837158203, "step": 132 }, { "epoch": 0.10852713178294573, "grad_norm": 1.375956628635322e-07, "learning_rate": 9.986297080934089e-05, "logits/chosen": -9.908473014831543, "logits/rejected": -9.665618896484375, "logps/chosen": -124.3120346069336, "logps/rejected": -400.19293212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.511601448059082, "rewards/margins": 25.42327117919922, "rewards/rejected": -31.934871673583984, "step": 133 }, { "epoch": 0.10934312525499797, "grad_norm": 0.02756287157535553, "learning_rate": 9.985454426953513e-05, "logits/chosen": -9.009744644165039, "logits/rejected": -9.358818054199219, "logps/chosen": -74.45677185058594, "logps/rejected": -274.05731201171875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.4509336948394775, "rewards/margins": 21.209636688232422, "rewards/rejected": -22.660568237304688, "step": 134 }, { "epoch": 0.11015911872705018, "grad_norm": 1.1666558748402167e-05, "learning_rate": 9.98458666866564e-05, "logits/chosen": -9.296965599060059, "logits/rejected": -10.130204200744629, "logps/chosen": -62.45051574707031, "logps/rejected": -441.5521240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.2746009826660156, "rewards/margins": 37.65492248535156, "rewards/rejected": -38.929527282714844, "step": 135 }, { "epoch": 0.1109751121991024, "grad_norm": 84.0453109741211, "learning_rate": 9.983693810440073e-05, "logits/chosen": -9.42636775970459, "logits/rejected": -9.780965805053711, "logps/chosen": -149.9615478515625, "logps/rejected": -314.5615234375, "loss": 1.3758, "rewards/accuracies": 0.75, "rewards/chosen": -9.158196449279785, "rewards/margins": 16.882373809814453, "rewards/rejected": -26.040569305419922, "step": 136 }, { "epoch": 0.11179110567115463, "grad_norm": 0.0004350121598690748, "learning_rate": 9.98277585677281e-05, "logits/chosen": -8.756780624389648, "logits/rejected": -9.285183906555176, "logps/chosen": -118.01063537597656, "logps/rejected": -363.28985595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.944210529327393, "rewards/margins": 24.102855682373047, "rewards/rejected": -29.047069549560547, "step": 137 }, { "epoch": 0.11260709914320685, "grad_norm": 52.02396011352539, "learning_rate": 9.981832812286216e-05, "logits/chosen": -8.947444915771484, "logits/rejected": -10.248913764953613, "logps/chosen": -73.7902603149414, "logps/rejected": -310.4482116699219, "loss": 1.3838, "rewards/accuracies": 0.875, "rewards/chosen": -2.25221586227417, "rewards/margins": 23.844741821289062, "rewards/rejected": -26.09695816040039, "step": 138 }, { "epoch": 0.11342309261525908, "grad_norm": 0.008107366971671581, "learning_rate": 9.980864681729001e-05, "logits/chosen": -9.492016792297363, "logits/rejected": -10.002896308898926, "logps/chosen": -86.90738677978516, "logps/rejected": -314.3465881347656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.9979944229125977, "rewards/margins": 22.525819778442383, "rewards/rejected": -26.523815155029297, "step": 139 }, { "epoch": 0.1142390860873113, "grad_norm": 30.134023666381836, "learning_rate": 9.979871469976196e-05, "logits/chosen": -9.088010787963867, "logits/rejected": -9.040060997009277, "logps/chosen": -87.63094329833984, "logps/rejected": -287.69677734375, "loss": 0.7789, "rewards/accuracies": 0.875, "rewards/chosen": -1.9104138612747192, "rewards/margins": 21.06722640991211, "rewards/rejected": -22.977638244628906, "step": 140 }, { "epoch": 0.11505507955936352, "grad_norm": 0.0006887811468914151, "learning_rate": 9.978853182029128e-05, "logits/chosen": -7.765155792236328, "logits/rejected": -7.557384967803955, "logps/chosen": -82.7491455078125, "logps/rejected": -251.93270874023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.488473892211914, "rewards/margins": 17.14349365234375, "rewards/rejected": -19.631967544555664, "step": 141 }, { "epoch": 0.11587107303141575, "grad_norm": 0.0482780784368515, "learning_rate": 9.977809823015401e-05, "logits/chosen": -7.890049934387207, "logits/rejected": -7.528028964996338, "logps/chosen": -63.4837646484375, "logps/rejected": -240.84165954589844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.8118658065795898, "rewards/margins": 16.857742309570312, "rewards/rejected": -18.669607162475586, "step": 142 }, { "epoch": 0.11668706650346797, "grad_norm": 0.17297087609767914, "learning_rate": 9.976741398188856e-05, "logits/chosen": -6.889120101928711, "logits/rejected": -7.00593376159668, "logps/chosen": -62.47257995605469, "logps/rejected": -203.7180633544922, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.6020545959472656, "rewards/margins": 14.494718551635742, "rewards/rejected": -15.096773147583008, "step": 143 }, { "epoch": 0.1175030599755202, "grad_norm": 57.439048767089844, "learning_rate": 9.975647912929556e-05, "logits/chosen": -6.650762557983398, "logits/rejected": -5.913149833679199, "logps/chosen": -67.39019775390625, "logps/rejected": -222.046630859375, "loss": 0.8276, "rewards/accuracies": 0.875, "rewards/chosen": -1.4074301719665527, "rewards/margins": 15.629777908325195, "rewards/rejected": -17.037208557128906, "step": 144 }, { "epoch": 0.11831905344757242, "grad_norm": 1.037514090538025, "learning_rate": 9.974529372743761e-05, "logits/chosen": -7.607600688934326, "logits/rejected": -7.400801658630371, "logps/chosen": -78.57658386230469, "logps/rejected": -206.0238800048828, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -2.1764583587646484, "rewards/margins": 13.39217758178711, "rewards/rejected": -15.568634986877441, "step": 145 }, { "epoch": 0.11913504691962465, "grad_norm": 0.002576767699792981, "learning_rate": 9.973385783263892e-05, "logits/chosen": -10.032018661499023, "logits/rejected": -10.136394500732422, "logps/chosen": -205.93606567382812, "logps/rejected": -402.2510681152344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.823860168457031, "rewards/margins": 19.889867782592773, "rewards/rejected": -34.71372985839844, "step": 146 }, { "epoch": 0.11995104039167687, "grad_norm": 0.3208094537258148, "learning_rate": 9.972217150248503e-05, "logits/chosen": -11.894468307495117, "logits/rejected": -11.055181503295898, "logps/chosen": -222.7667236328125, "logps/rejected": -423.7601318359375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -17.038131713867188, "rewards/margins": 20.566143035888672, "rewards/rejected": -37.604278564453125, "step": 147 }, { "epoch": 0.12076703386372908, "grad_norm": 0.15382026135921478, "learning_rate": 9.971023479582257e-05, "logits/chosen": -12.589498519897461, "logits/rejected": -11.601400375366211, "logps/chosen": -313.74969482421875, "logps/rejected": -407.0709228515625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -24.210725784301758, "rewards/margins": 10.80363655090332, "rewards/rejected": -35.01435852050781, "step": 148 }, { "epoch": 0.12158302733578132, "grad_norm": 3.3618967533111572, "learning_rate": 9.9698047772759e-05, "logits/chosen": -13.211864471435547, "logits/rejected": -12.420893669128418, "logps/chosen": -344.2156982421875, "logps/rejected": -460.77911376953125, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": -29.655385971069336, "rewards/margins": 11.683499336242676, "rewards/rejected": -41.33888626098633, "step": 149 }, { "epoch": 0.12239902080783353, "grad_norm": 0.38514944911003113, "learning_rate": 9.968561049466214e-05, "logits/chosen": -13.403837203979492, "logits/rejected": -12.948923110961914, "logps/chosen": -357.63360595703125, "logps/rejected": -486.73443603515625, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -31.02216339111328, "rewards/margins": 12.52367115020752, "rewards/rejected": -43.545833587646484, "step": 150 }, { "epoch": 0.12321501427988577, "grad_norm": 0.1039704829454422, "learning_rate": 9.967292302416007e-05, "logits/chosen": -11.818498611450195, "logits/rejected": -10.816457748413086, "logps/chosen": -330.75567626953125, "logps/rejected": -512.255126953125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -27.58340835571289, "rewards/margins": 16.965147018432617, "rewards/rejected": -44.54855728149414, "step": 151 }, { "epoch": 0.12403100775193798, "grad_norm": 2.3062896728515625, "learning_rate": 9.965998542514066e-05, "logits/chosen": -11.485230445861816, "logits/rejected": -11.824407577514648, "logps/chosen": -340.9769287109375, "logps/rejected": -471.4417724609375, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -27.39825439453125, "rewards/margins": 14.74493408203125, "rewards/rejected": -42.1431884765625, "step": 152 }, { "epoch": 0.12484700122399021, "grad_norm": 0.0009012340451590717, "learning_rate": 9.964679776275133e-05, "logits/chosen": -9.114503860473633, "logits/rejected": -9.173911094665527, "logps/chosen": -218.23272705078125, "logps/rejected": -492.91357421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.664508819580078, "rewards/margins": 27.416259765625, "rewards/rejected": -43.08076858520508, "step": 153 }, { "epoch": 0.12566299469604242, "grad_norm": 0.1858462542295456, "learning_rate": 9.963336010339868e-05, "logits/chosen": -7.53230619430542, "logits/rejected": -7.826943397521973, "logps/chosen": -187.81214904785156, "logps/rejected": -413.69708251953125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -10.53892993927002, "rewards/margins": 23.601192474365234, "rewards/rejected": -34.14012145996094, "step": 154 }, { "epoch": 0.12647898816809466, "grad_norm": 97.2794189453125, "learning_rate": 9.961967251474822e-05, "logits/chosen": -7.345565319061279, "logits/rejected": -7.196117877960205, "logps/chosen": -191.12484741210938, "logps/rejected": -379.2767333984375, "loss": 1.257, "rewards/accuracies": 0.875, "rewards/chosen": -13.547900199890137, "rewards/margins": 18.920434951782227, "rewards/rejected": -32.46833419799805, "step": 155 }, { "epoch": 0.12729498164014688, "grad_norm": 107.82056427001953, "learning_rate": 9.96057350657239e-05, "logits/chosen": -4.0500898361206055, "logits/rejected": -4.006730079650879, "logps/chosen": -131.43234252929688, "logps/rejected": -276.36395263671875, "loss": 2.9132, "rewards/accuracies": 0.875, "rewards/chosen": -7.694950103759766, "rewards/margins": 15.142526626586914, "rewards/rejected": -22.83747673034668, "step": 156 }, { "epoch": 0.1281109751121991, "grad_norm": 0.1368093192577362, "learning_rate": 9.95915478265079e-05, "logits/chosen": -4.98304557800293, "logits/rejected": -4.572909355163574, "logps/chosen": -161.88348388671875, "logps/rejected": -322.52203369140625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -10.883417129516602, "rewards/margins": 15.607605934143066, "rewards/rejected": -26.491024017333984, "step": 157 }, { "epoch": 0.12892696858425132, "grad_norm": 0.435255765914917, "learning_rate": 9.957711086854023e-05, "logits/chosen": -4.711574554443359, "logits/rejected": -3.597723960876465, "logps/chosen": -201.3252716064453, "logps/rejected": -357.24737548828125, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -14.155069351196289, "rewards/margins": 15.235760688781738, "rewards/rejected": -29.390830993652344, "step": 158 }, { "epoch": 0.12974296205630356, "grad_norm": 0.018706390634179115, "learning_rate": 9.956242426451834e-05, "logits/chosen": -4.354576587677002, "logits/rejected": -4.007681846618652, "logps/chosen": -230.79354858398438, "logps/rejected": -431.6690368652344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -17.69304084777832, "rewards/margins": 19.284696578979492, "rewards/rejected": -36.97773742675781, "step": 159 }, { "epoch": 0.13055895552835578, "grad_norm": 0.006217387970536947, "learning_rate": 9.954748808839674e-05, "logits/chosen": -3.887822151184082, "logits/rejected": -3.772988796234131, "logps/chosen": -220.97848510742188, "logps/rejected": -382.2613525390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -16.949357986450195, "rewards/margins": 15.757946968078613, "rewards/rejected": -32.707305908203125, "step": 160 }, { "epoch": 0.131374949000408, "grad_norm": 74.06698608398438, "learning_rate": 9.953230241538674e-05, "logits/chosen": -4.062504768371582, "logits/rejected": -3.9091544151306152, "logps/chosen": -274.2057800292969, "logps/rejected": -423.5182189941406, "loss": 1.5204, "rewards/accuracies": 0.875, "rewards/chosen": -20.99988555908203, "rewards/margins": 15.32850170135498, "rewards/rejected": -36.32838821411133, "step": 161 }, { "epoch": 0.13219094247246022, "grad_norm": 0.10316301882266998, "learning_rate": 9.951686732195593e-05, "logits/chosen": -3.9859068393707275, "logits/rejected": -3.9647064208984375, "logps/chosen": -208.64788818359375, "logps/rejected": -360.498046875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -17.1180419921875, "rewards/margins": 14.567641258239746, "rewards/rejected": -31.685684204101562, "step": 162 }, { "epoch": 0.13300693594451243, "grad_norm": 63.05160140991211, "learning_rate": 9.950118288582788e-05, "logits/chosen": -4.306698799133301, "logits/rejected": -4.083983421325684, "logps/chosen": -247.62535095214844, "logps/rejected": -400.89630126953125, "loss": 1.1021, "rewards/accuracies": 0.875, "rewards/chosen": -19.445648193359375, "rewards/margins": 15.765042304992676, "rewards/rejected": -35.210693359375, "step": 163 }, { "epoch": 0.13382292941656468, "grad_norm": 0.9241384863853455, "learning_rate": 9.948524918598175e-05, "logits/chosen": -2.5552124977111816, "logits/rejected": -2.809809684753418, "logps/chosen": -159.1575927734375, "logps/rejected": -285.222412109375, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -10.921754837036133, "rewards/margins": 12.139368057250977, "rewards/rejected": -23.06112289428711, "step": 164 }, { "epoch": 0.1346389228886169, "grad_norm": 0.06657973676919937, "learning_rate": 9.946906630265184e-05, "logits/chosen": -2.1704678535461426, "logits/rejected": -1.7570085525512695, "logps/chosen": -142.751220703125, "logps/rejected": -275.7013244628906, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -8.409687042236328, "rewards/margins": 13.81444263458252, "rewards/rejected": -22.22412872314453, "step": 165 }, { "epoch": 0.13545491636066911, "grad_norm": 7.387560844421387, "learning_rate": 9.945263431732722e-05, "logits/chosen": -1.1812591552734375, "logits/rejected": -0.46002528071403503, "logps/chosen": -137.74603271484375, "logps/rejected": -246.60321044921875, "loss": 0.1553, "rewards/accuracies": 1.0, "rewards/chosen": -8.031245231628418, "rewards/margins": 11.55129623413086, "rewards/rejected": -19.58254051208496, "step": 166 }, { "epoch": 0.13627090983272133, "grad_norm": 6.118429183959961, "learning_rate": 9.943595331275133e-05, "logits/chosen": -1.1371512413024902, "logits/rejected": -1.084957242012024, "logps/chosen": -82.92341613769531, "logps/rejected": -205.3703155517578, "loss": 0.0782, "rewards/accuracies": 1.0, "rewards/chosen": -2.995541572570801, "rewards/margins": 12.497947692871094, "rewards/rejected": -15.493489265441895, "step": 167 }, { "epoch": 0.13708690330477355, "grad_norm": 0.0035286417696624994, "learning_rate": 9.941902337292155e-05, "logits/chosen": -3.2425994873046875, "logits/rejected": -2.592329502105713, "logps/chosen": -90.39131164550781, "logps/rejected": -251.87332153320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.608015537261963, "rewards/margins": 16.655677795410156, "rewards/rejected": -21.263690948486328, "step": 168 }, { "epoch": 0.1379028967768258, "grad_norm": 56.1549186706543, "learning_rate": 9.940184458308875e-05, "logits/chosen": -3.0948984622955322, "logits/rejected": -3.4676384925842285, "logps/chosen": -83.08346557617188, "logps/rejected": -231.6548309326172, "loss": 3.078, "rewards/accuracies": 0.875, "rewards/chosen": -3.0428802967071533, "rewards/margins": 14.51797103881836, "rewards/rejected": -17.560850143432617, "step": 169 }, { "epoch": 0.138718890248878, "grad_norm": 0.3888104557991028, "learning_rate": 9.938441702975689e-05, "logits/chosen": -3.242690324783325, "logits/rejected": -4.247036933898926, "logps/chosen": -82.7392578125, "logps/rejected": -225.09054565429688, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -3.9107720851898193, "rewards/margins": 12.57863998413086, "rewards/rejected": -16.489412307739258, "step": 170 }, { "epoch": 0.13953488372093023, "grad_norm": 0.04658369719982147, "learning_rate": 9.936674080068262e-05, "logits/chosen": -3.0336592197418213, "logits/rejected": -3.9387965202331543, "logps/chosen": -87.65532684326172, "logps/rejected": -195.1087646484375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.3527097702026367, "rewards/margins": 10.067031860351562, "rewards/rejected": -13.419742584228516, "step": 171 }, { "epoch": 0.14035087719298245, "grad_norm": 0.0017384886741638184, "learning_rate": 9.934881598487479e-05, "logits/chosen": -3.1025617122650146, "logits/rejected": -3.6343607902526855, "logps/chosen": -66.33136749267578, "logps/rejected": -233.0080108642578, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.8291230797767639, "rewards/margins": 16.767189025878906, "rewards/rejected": -17.5963134765625, "step": 172 }, { "epoch": 0.14116687066503467, "grad_norm": 19.84010887145996, "learning_rate": 9.933064267259395e-05, "logits/chosen": -2.566371440887451, "logits/rejected": -4.434239387512207, "logps/chosen": -92.90447998046875, "logps/rejected": -207.34085083007812, "loss": 1.124, "rewards/accuracies": 0.75, "rewards/chosen": -3.555971384048462, "rewards/margins": 10.492154121398926, "rewards/rejected": -14.048124313354492, "step": 173 }, { "epoch": 0.1419828641370869, "grad_norm": 0.13556401431560516, "learning_rate": 9.931222095535204e-05, "logits/chosen": -3.098706007003784, "logits/rejected": -3.452782392501831, "logps/chosen": -79.74060821533203, "logps/rejected": -199.20233154296875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.2739012241363525, "rewards/margins": 11.9495210647583, "rewards/rejected": -13.223422050476074, "step": 174 }, { "epoch": 0.14279885760913913, "grad_norm": 0.14788305759429932, "learning_rate": 9.92935509259118e-05, "logits/chosen": -1.8057136535644531, "logits/rejected": -3.507549285888672, "logps/chosen": -57.953697204589844, "logps/rejected": -197.6510772705078, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.7268328666687012, "rewards/margins": 13.700089454650879, "rewards/rejected": -14.426921844482422, "step": 175 }, { "epoch": 0.14361485108119135, "grad_norm": 4.578485488891602, "learning_rate": 9.927463267828633e-05, "logits/chosen": -3.678302764892578, "logits/rejected": -4.323716640472412, "logps/chosen": -80.77462768554688, "logps/rejected": -197.78419494628906, "loss": 0.0574, "rewards/accuracies": 1.0, "rewards/chosen": -2.1418023109436035, "rewards/margins": 11.948043823242188, "rewards/rejected": -14.08984661102295, "step": 176 }, { "epoch": 0.14443084455324356, "grad_norm": 0.007463231682777405, "learning_rate": 9.92554663077387e-05, "logits/chosen": -4.491919040679932, "logits/rejected": -6.127155303955078, "logps/chosen": -55.86888122558594, "logps/rejected": -266.1819763183594, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.5789920091629028, "rewards/margins": 19.268728256225586, "rewards/rejected": -20.847721099853516, "step": 177 }, { "epoch": 0.1452468380252958, "grad_norm": 37.037357330322266, "learning_rate": 9.923605191078133e-05, "logits/chosen": -6.254344463348389, "logits/rejected": -7.3804144859313965, "logps/chosen": -112.32540893554688, "logps/rejected": -233.82693481445312, "loss": 0.8202, "rewards/accuracies": 0.875, "rewards/chosen": -6.419517517089844, "rewards/margins": 11.812712669372559, "rewards/rejected": -18.232229232788086, "step": 178 }, { "epoch": 0.14606283149734803, "grad_norm": 1.3555214405059814, "learning_rate": 9.921638958517565e-05, "logits/chosen": -9.611239433288574, "logits/rejected": -8.396800994873047, "logps/chosen": -177.31585693359375, "logps/rejected": -314.0685119628906, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -12.30691146850586, "rewards/margins": 13.93209171295166, "rewards/rejected": -26.239002227783203, "step": 179 }, { "epoch": 0.14687882496940025, "grad_norm": 0.9312224388122559, "learning_rate": 9.919647942993148e-05, "logits/chosen": -9.060738563537598, "logits/rejected": -8.969897270202637, "logps/chosen": -178.79501342773438, "logps/rejected": -301.1840515136719, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -12.016740798950195, "rewards/margins": 11.947447776794434, "rewards/rejected": -23.964187622070312, "step": 180 }, { "epoch": 0.14769481844145246, "grad_norm": 0.09290151298046112, "learning_rate": 9.917632154530663e-05, "logits/chosen": -9.876375198364258, "logits/rejected": -8.972967147827148, "logps/chosen": -188.810302734375, "logps/rejected": -294.578125, "loss": 0.3476, "rewards/accuracies": 0.875, "rewards/chosen": -14.343807220458984, "rewards/margins": 10.847795486450195, "rewards/rejected": -25.19160270690918, "step": 181 }, { "epoch": 0.14851081191350468, "grad_norm": 0.015469293110072613, "learning_rate": 9.915591603280631e-05, "logits/chosen": -9.170942306518555, "logits/rejected": -9.489679336547852, "logps/chosen": -229.85922241210938, "logps/rejected": -406.08074951171875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -17.587005615234375, "rewards/margins": 15.726442337036133, "rewards/rejected": -33.313446044921875, "step": 182 }, { "epoch": 0.14932680538555693, "grad_norm": 0.007531976327300072, "learning_rate": 9.913526299518273e-05, "logits/chosen": -11.154109954833984, "logits/rejected": -10.014095306396484, "logps/chosen": -168.28741455078125, "logps/rejected": -338.1419677734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.953131675720215, "rewards/margins": 16.920652389526367, "rewards/rejected": -27.8737850189209, "step": 183 }, { "epoch": 0.15014279885760914, "grad_norm": 0.006566903553903103, "learning_rate": 9.911436253643445e-05, "logits/chosen": -10.542390823364258, "logits/rejected": -10.253548622131348, "logps/chosen": -225.070556640625, "logps/rejected": -394.44598388671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.347590446472168, "rewards/margins": 15.979637145996094, "rewards/rejected": -31.327228546142578, "step": 184 }, { "epoch": 0.15095879232966136, "grad_norm": 28.506227493286133, "learning_rate": 9.909321476180594e-05, "logits/chosen": -10.579569816589355, "logits/rejected": -10.497944831848145, "logps/chosen": -219.5005340576172, "logps/rejected": -411.8670654296875, "loss": 0.3089, "rewards/accuracies": 1.0, "rewards/chosen": -15.673074722290039, "rewards/margins": 20.544403076171875, "rewards/rejected": -36.21747589111328, "step": 185 }, { "epoch": 0.15177478580171358, "grad_norm": 0.00063995091477409, "learning_rate": 9.907181977778702e-05, "logits/chosen": -10.314340591430664, "logits/rejected": -10.481261253356934, "logps/chosen": -209.0740966796875, "logps/rejected": -407.42169189453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.517580032348633, "rewards/margins": 19.40691375732422, "rewards/rejected": -33.92449188232422, "step": 186 }, { "epoch": 0.1525907792737658, "grad_norm": 19.983701705932617, "learning_rate": 9.90501776921124e-05, "logits/chosen": -10.12258529663086, "logits/rejected": -10.26667308807373, "logps/chosen": -193.82847595214844, "logps/rejected": -382.774658203125, "loss": 0.1414, "rewards/accuracies": 1.0, "rewards/chosen": -14.295387268066406, "rewards/margins": 18.395654678344727, "rewards/rejected": -32.6910400390625, "step": 187 }, { "epoch": 0.15340677274581804, "grad_norm": 0.05211590602993965, "learning_rate": 9.902828861376101e-05, "logits/chosen": -10.663837432861328, "logits/rejected": -10.75368881225586, "logps/chosen": -231.18240356445312, "logps/rejected": -429.90911865234375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -15.423650741577148, "rewards/margins": 19.717973709106445, "rewards/rejected": -35.141624450683594, "step": 188 }, { "epoch": 0.15422276621787026, "grad_norm": 0.31658321619033813, "learning_rate": 9.900615265295552e-05, "logits/chosen": -9.933612823486328, "logits/rejected": -9.776636123657227, "logps/chosen": -259.4471435546875, "logps/rejected": -387.23516845703125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -21.01820182800293, "rewards/margins": 13.501227378845215, "rewards/rejected": -34.519432067871094, "step": 189 }, { "epoch": 0.15503875968992248, "grad_norm": 0.0005060741677880287, "learning_rate": 9.898376992116179e-05, "logits/chosen": -9.611753463745117, "logits/rejected": -10.465991020202637, "logps/chosen": -244.47506713867188, "logps/rejected": -461.1335144042969, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.473222732543945, "rewards/margins": 21.643070220947266, "rewards/rejected": -40.116294860839844, "step": 190 }, { "epoch": 0.1558547531619747, "grad_norm": 0.32138895988464355, "learning_rate": 9.896114053108829e-05, "logits/chosen": -11.094322204589844, "logits/rejected": -10.983280181884766, "logps/chosen": -239.91351318359375, "logps/rejected": -455.6475524902344, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -19.029102325439453, "rewards/margins": 20.797306060791016, "rewards/rejected": -39.826412200927734, "step": 191 }, { "epoch": 0.15667074663402691, "grad_norm": 0.19485993683338165, "learning_rate": 9.893826459668558e-05, "logits/chosen": -11.328532218933105, "logits/rejected": -10.550125122070312, "logps/chosen": -250.12982177734375, "logps/rejected": -489.8529052734375, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -18.225866317749023, "rewards/margins": 23.314971923828125, "rewards/rejected": -41.54084014892578, "step": 192 }, { "epoch": 0.15748674010607916, "grad_norm": 0.004426443483680487, "learning_rate": 9.891514223314561e-05, "logits/chosen": -10.991119384765625, "logits/rejected": -10.795465469360352, "logps/chosen": -240.25888061523438, "logps/rejected": -405.3650817871094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.66986846923828, "rewards/margins": 16.816207885742188, "rewards/rejected": -35.48607635498047, "step": 193 }, { "epoch": 0.15830273357813138, "grad_norm": 0.003467374946922064, "learning_rate": 9.889177355690135e-05, "logits/chosen": -10.94709587097168, "logits/rejected": -10.99362850189209, "logps/chosen": -180.08712768554688, "logps/rejected": -453.0244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.484884262084961, "rewards/margins": 24.614334106445312, "rewards/rejected": -38.099220275878906, "step": 194 }, { "epoch": 0.1591187270501836, "grad_norm": 1.7857033014297485, "learning_rate": 9.886815868562596e-05, "logits/chosen": -11.250267028808594, "logits/rejected": -11.083313941955566, "logps/chosen": -152.93431091308594, "logps/rejected": -407.85662841796875, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -9.944047927856445, "rewards/margins": 25.77132225036621, "rewards/rejected": -35.715370178222656, "step": 195 }, { "epoch": 0.1599347205222358, "grad_norm": 137.55816650390625, "learning_rate": 9.884429773823239e-05, "logits/chosen": -10.868474006652832, "logits/rejected": -10.098901748657227, "logps/chosen": -250.75048828125, "logps/rejected": -392.07183837890625, "loss": 1.9797, "rewards/accuracies": 0.875, "rewards/chosen": -19.819580078125, "rewards/margins": 13.533679008483887, "rewards/rejected": -33.3532600402832, "step": 196 }, { "epoch": 0.16075071399428806, "grad_norm": 27.17814064025879, "learning_rate": 9.882019083487268e-05, "logits/chosen": -11.706673622131348, "logits/rejected": -11.008928298950195, "logps/chosen": -281.3752136230469, "logps/rejected": -401.46185302734375, "loss": 0.9082, "rewards/accuracies": 0.875, "rewards/chosen": -21.738346099853516, "rewards/margins": 13.734701156616211, "rewards/rejected": -35.473045349121094, "step": 197 }, { "epoch": 0.16156670746634028, "grad_norm": 71.07138061523438, "learning_rate": 9.879583809693738e-05, "logits/chosen": -12.598490715026855, "logits/rejected": -12.2940673828125, "logps/chosen": -244.0551300048828, "logps/rejected": -395.144287109375, "loss": 2.1079, "rewards/accuracies": 0.875, "rewards/chosen": -19.864599227905273, "rewards/margins": 15.19216537475586, "rewards/rejected": -35.0567626953125, "step": 198 }, { "epoch": 0.1623827009383925, "grad_norm": 9.147815580945462e-05, "learning_rate": 9.877123964705497e-05, "logits/chosen": -13.311019897460938, "logits/rejected": -12.45639419555664, "logps/chosen": -223.63330078125, "logps/rejected": -429.79693603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.19282341003418, "rewards/margins": 21.147323608398438, "rewards/rejected": -37.34014892578125, "step": 199 }, { "epoch": 0.1631986944104447, "grad_norm": 0.06349876523017883, "learning_rate": 9.874639560909117e-05, "logits/chosen": -12.23132610321045, "logits/rejected": -13.6112060546875, "logps/chosen": -221.1026153564453, "logps/rejected": -440.9586181640625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -16.7999210357666, "rewards/margins": 20.94506072998047, "rewards/rejected": -37.74497985839844, "step": 200 }, { "epoch": 0.16401468788249693, "grad_norm": 30.766164779663086, "learning_rate": 9.872130610814845e-05, "logits/chosen": -12.056867599487305, "logits/rejected": -12.70809555053711, "logps/chosen": -231.79592895507812, "logps/rejected": -453.8127136230469, "loss": 0.2557, "rewards/accuracies": 1.0, "rewards/chosen": -17.781831741333008, "rewards/margins": 18.279821395874023, "rewards/rejected": -36.06165313720703, "step": 201 }, { "epoch": 0.16483068135454917, "grad_norm": 0.00015747039287816733, "learning_rate": 9.86959712705652e-05, "logits/chosen": -11.412585258483887, "logits/rejected": -11.403087615966797, "logps/chosen": -143.67539978027344, "logps/rejected": -357.93072509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.240835189819336, "rewards/margins": 21.329654693603516, "rewards/rejected": -29.57048797607422, "step": 202 }, { "epoch": 0.1656466748266014, "grad_norm": 34.08121871948242, "learning_rate": 9.867039122391527e-05, "logits/chosen": -10.59276008605957, "logits/rejected": -11.749135971069336, "logps/chosen": -141.79110717773438, "logps/rejected": -352.31640625, "loss": 0.1625, "rewards/accuracies": 1.0, "rewards/chosen": -7.446841716766357, "rewards/margins": 21.827266693115234, "rewards/rejected": -29.27410888671875, "step": 203 }, { "epoch": 0.1664626682986536, "grad_norm": 0.002161883981898427, "learning_rate": 9.864456609700726e-05, "logits/chosen": -9.452085494995117, "logits/rejected": -11.074464797973633, "logps/chosen": -136.88784790039062, "logps/rejected": -385.814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.308890342712402, "rewards/margins": 24.90286636352539, "rewards/rejected": -31.21175765991211, "step": 204 }, { "epoch": 0.16727866177070583, "grad_norm": 0.0034374261740595102, "learning_rate": 9.861849601988383e-05, "logits/chosen": -8.34603214263916, "logits/rejected": -8.789122581481934, "logps/chosen": -93.8017349243164, "logps/rejected": -333.60943603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.369506359100342, "rewards/margins": 22.493385314941406, "rewards/rejected": -25.862890243530273, "step": 205 }, { "epoch": 0.16809465524275805, "grad_norm": 62.52790832519531, "learning_rate": 9.859218112382116e-05, "logits/chosen": -6.61884880065918, "logits/rejected": -7.18696403503418, "logps/chosen": -128.78118896484375, "logps/rejected": -279.55706787109375, "loss": 1.3214, "rewards/accuracies": 0.875, "rewards/chosen": -7.445745944976807, "rewards/margins": 14.89245319366455, "rewards/rejected": -22.338197708129883, "step": 206 }, { "epoch": 0.1689106487148103, "grad_norm": 9.981008020076843e-08, "learning_rate": 9.856562154132817e-05, "logits/chosen": -5.699916839599609, "logits/rejected": -6.699404716491699, "logps/chosen": -77.47503662109375, "logps/rejected": -346.01092529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.1075050830841064, "rewards/margins": 26.18037223815918, "rewards/rejected": -28.287878036499023, "step": 207 }, { "epoch": 0.1697266421868625, "grad_norm": 11.483179092407227, "learning_rate": 9.853881740614591e-05, "logits/chosen": -3.9437899589538574, "logits/rejected": -5.025261878967285, "logps/chosen": -84.85011291503906, "logps/rejected": -185.99810791015625, "loss": 0.1412, "rewards/accuracies": 1.0, "rewards/chosen": -4.559015274047852, "rewards/margins": 8.942718505859375, "rewards/rejected": -13.501733779907227, "step": 208 }, { "epoch": 0.17054263565891473, "grad_norm": 1.369499683380127, "learning_rate": 9.851176885324689e-05, "logits/chosen": -4.790278911590576, "logits/rejected": -6.176909446716309, "logps/chosen": -119.6777114868164, "logps/rejected": -286.101806640625, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -6.216161251068115, "rewards/margins": 16.598087310791016, "rewards/rejected": -22.814247131347656, "step": 209 }, { "epoch": 0.17135862913096694, "grad_norm": 0.9726454019546509, "learning_rate": 9.848447601883435e-05, "logits/chosen": -5.915358543395996, "logits/rejected": -6.8826727867126465, "logps/chosen": -159.8888397216797, "logps/rejected": -260.7906188964844, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -11.577180862426758, "rewards/margins": 10.401376724243164, "rewards/rejected": -21.978557586669922, "step": 210 }, { "epoch": 0.1721746226030192, "grad_norm": 20.896596908569336, "learning_rate": 9.845693904034165e-05, "logits/chosen": -8.330397605895996, "logits/rejected": -9.974111557006836, "logps/chosen": -193.13076782226562, "logps/rejected": -385.9781494140625, "loss": 0.3421, "rewards/accuracies": 1.0, "rewards/chosen": -14.404189109802246, "rewards/margins": 18.76852035522461, "rewards/rejected": -33.17271041870117, "step": 211 }, { "epoch": 0.1729906160750714, "grad_norm": 1.0134173862752505e-05, "learning_rate": 9.842915805643155e-05, "logits/chosen": -8.903475761413574, "logits/rejected": -10.378299713134766, "logps/chosen": -172.91143798828125, "logps/rejected": -388.0994873046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.372087478637695, "rewards/margins": 21.385066986083984, "rewards/rejected": -32.75715255737305, "step": 212 }, { "epoch": 0.17380660954712362, "grad_norm": 5.606016657111468e-06, "learning_rate": 9.840113320699548e-05, "logits/chosen": -10.14904499053955, "logits/rejected": -11.777897834777832, "logps/chosen": -227.46286010742188, "logps/rejected": -491.423583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.373880386352539, "rewards/margins": 25.71185302734375, "rewards/rejected": -41.085731506347656, "step": 213 }, { "epoch": 0.17462260301917584, "grad_norm": 1.0177737474441528, "learning_rate": 9.837286463315283e-05, "logits/chosen": -11.999674797058105, "logits/rejected": -12.751399993896484, "logps/chosen": -246.75518798828125, "logps/rejected": -396.91864013671875, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -19.625469207763672, "rewards/margins": 15.08359432220459, "rewards/rejected": -34.70906448364258, "step": 214 }, { "epoch": 0.17543859649122806, "grad_norm": 72.41325378417969, "learning_rate": 9.834435247725033e-05, "logits/chosen": -11.747089385986328, "logits/rejected": -12.685077667236328, "logps/chosen": -213.81553649902344, "logps/rejected": -350.5102844238281, "loss": 0.3413, "rewards/accuracies": 1.0, "rewards/chosen": -16.352115631103516, "rewards/margins": 14.491851806640625, "rewards/rejected": -30.843969345092773, "step": 215 }, { "epoch": 0.1762545899632803, "grad_norm": 5.367645280784927e-05, "learning_rate": 9.831559688286121e-05, "logits/chosen": -10.731492042541504, "logits/rejected": -12.082486152648926, "logps/chosen": -132.66925048828125, "logps/rejected": -350.9669494628906, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -8.670299530029297, "rewards/margins": 20.80813980102539, "rewards/rejected": -29.478439331054688, "step": 216 }, { "epoch": 0.17707058343533252, "grad_norm": 0.0014386995462700725, "learning_rate": 9.828659799478456e-05, "logits/chosen": -8.936058044433594, "logits/rejected": -10.244815826416016, "logps/chosen": -122.88407897949219, "logps/rejected": -374.7409362792969, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.322514057159424, "rewards/margins": 23.583980560302734, "rewards/rejected": -29.906494140625, "step": 217 }, { "epoch": 0.17788657690738474, "grad_norm": 0.0010246096644550562, "learning_rate": 9.825735595904462e-05, "logits/chosen": -9.219013214111328, "logits/rejected": -9.83883285522461, "logps/chosen": -148.1288604736328, "logps/rejected": -397.3128662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.474661827087402, "rewards/margins": 23.757164001464844, "rewards/rejected": -33.23182678222656, "step": 218 }, { "epoch": 0.17870257037943696, "grad_norm": 0.043091803789138794, "learning_rate": 9.822787092288991e-05, "logits/chosen": -9.598678588867188, "logits/rejected": -9.748772621154785, "logps/chosen": -133.420654296875, "logps/rejected": -335.8758850097656, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.933663845062256, "rewards/margins": 22.15019989013672, "rewards/rejected": -28.083864212036133, "step": 219 }, { "epoch": 0.17951856385148918, "grad_norm": 0.00025197453214786947, "learning_rate": 9.819814303479267e-05, "logits/chosen": -7.425317287445068, "logits/rejected": -8.081472396850586, "logps/chosen": -126.22563171386719, "logps/rejected": -351.8447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.03368616104126, "rewards/margins": 21.100322723388672, "rewards/rejected": -28.13400650024414, "step": 220 }, { "epoch": 0.18033455732354142, "grad_norm": 0.04015165939927101, "learning_rate": 9.816817244444798e-05, "logits/chosen": -7.429627895355225, "logits/rejected": -7.255378723144531, "logps/chosen": -129.43746948242188, "logps/rejected": -290.09771728515625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.125915050506592, "rewards/margins": 15.683549880981445, "rewards/rejected": -21.809467315673828, "step": 221 }, { "epoch": 0.18115055079559364, "grad_norm": 52.7648811340332, "learning_rate": 9.813795930277305e-05, "logits/chosen": -7.407536029815674, "logits/rejected": -7.617238521575928, "logps/chosen": -121.99497985839844, "logps/rejected": -254.43443298339844, "loss": 0.7373, "rewards/accuracies": 0.875, "rewards/chosen": -7.972005367279053, "rewards/margins": 12.29456901550293, "rewards/rejected": -20.26657485961914, "step": 222 }, { "epoch": 0.18196654426764586, "grad_norm": 0.344727486371994, "learning_rate": 9.810750376190647e-05, "logits/chosen": -6.791281223297119, "logits/rejected": -7.754394054412842, "logps/chosen": -117.88693237304688, "logps/rejected": -320.5682373046875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -5.675289154052734, "rewards/margins": 19.939022064208984, "rewards/rejected": -25.61431312561035, "step": 223 }, { "epoch": 0.18278253773969808, "grad_norm": 2.436330214550253e-05, "learning_rate": 9.807680597520746e-05, "logits/chosen": -8.29433536529541, "logits/rejected": -8.760702133178711, "logps/chosen": -134.0946807861328, "logps/rejected": -345.6012268066406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.584972381591797, "rewards/margins": 20.336795806884766, "rewards/rejected": -28.921768188476562, "step": 224 }, { "epoch": 0.1835985312117503, "grad_norm": 0.08174566179513931, "learning_rate": 9.804586609725499e-05, "logits/chosen": -7.85927677154541, "logits/rejected": -8.500328063964844, "logps/chosen": -103.41401672363281, "logps/rejected": -285.2572021484375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.3946614265441895, "rewards/margins": 18.0141544342041, "rewards/rejected": -22.408815383911133, "step": 225 }, { "epoch": 0.18441452468380254, "grad_norm": 0.028664806857705116, "learning_rate": 9.801468428384716e-05, "logits/chosen": -9.046140670776367, "logits/rejected": -9.204185485839844, "logps/chosen": -135.2132568359375, "logps/rejected": -319.17340087890625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.3632121086120605, "rewards/margins": 18.02349090576172, "rewards/rejected": -25.386703491210938, "step": 226 }, { "epoch": 0.18523051815585476, "grad_norm": 0.003431818913668394, "learning_rate": 9.798326069200032e-05, "logits/chosen": -8.390056610107422, "logits/rejected": -8.99643611907959, "logps/chosen": -129.34617614746094, "logps/rejected": -300.840087890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.543408393859863, "rewards/margins": 16.922605514526367, "rewards/rejected": -23.466014862060547, "step": 227 }, { "epoch": 0.18604651162790697, "grad_norm": 5.227649688720703, "learning_rate": 9.79515954799483e-05, "logits/chosen": -9.183683395385742, "logits/rejected": -8.548301696777344, "logps/chosen": -151.10508728027344, "logps/rejected": -321.4646301269531, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": -10.125373840332031, "rewards/margins": 16.229141235351562, "rewards/rejected": -26.354516983032227, "step": 228 }, { "epoch": 0.1868625050999592, "grad_norm": 3.94410926674027e-05, "learning_rate": 9.791968880714157e-05, "logits/chosen": -10.589200973510742, "logits/rejected": -10.035511016845703, "logps/chosen": -173.73291015625, "logps/rejected": -366.38165283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.699869155883789, "rewards/margins": 18.741313934326172, "rewards/rejected": -30.44118309020996, "step": 229 }, { "epoch": 0.18767849857201144, "grad_norm": 0.011429272592067719, "learning_rate": 9.788754083424652e-05, "logits/chosen": -10.649983406066895, "logits/rejected": -9.973566055297852, "logps/chosen": -155.43072509765625, "logps/rejected": -347.6503601074219, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.801738739013672, "rewards/margins": 18.86285400390625, "rewards/rejected": -29.664592742919922, "step": 230 }, { "epoch": 0.18849449204406366, "grad_norm": 0.007120995782315731, "learning_rate": 9.785515172314463e-05, "logits/chosen": -10.095817565917969, "logits/rejected": -9.692100524902344, "logps/chosen": -202.2255096435547, "logps/rejected": -395.7437744140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.064268112182617, "rewards/margins": 18.338651657104492, "rewards/rejected": -33.402923583984375, "step": 231 }, { "epoch": 0.18931048551611587, "grad_norm": 0.01877114363014698, "learning_rate": 9.782252163693158e-05, "logits/chosen": -11.89059066772461, "logits/rejected": -10.557610511779785, "logps/chosen": -223.27420043945312, "logps/rejected": -385.75341796875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -16.122882843017578, "rewards/margins": 15.509461402893066, "rewards/rejected": -31.632347106933594, "step": 232 }, { "epoch": 0.1901264789881681, "grad_norm": 0.6729365587234497, "learning_rate": 9.778965073991651e-05, "logits/chosen": -11.315085411071777, "logits/rejected": -9.847110748291016, "logps/chosen": -188.60519409179688, "logps/rejected": -379.0281982421875, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -13.17982292175293, "rewards/margins": 18.362977981567383, "rewards/rejected": -31.542800903320312, "step": 233 }, { "epoch": 0.1909424724602203, "grad_norm": 0.22915491461753845, "learning_rate": 9.775653919762115e-05, "logits/chosen": -11.66118049621582, "logits/rejected": -10.966322898864746, "logps/chosen": -224.986328125, "logps/rejected": -332.61456298828125, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -17.371706008911133, "rewards/margins": 11.128622055053711, "rewards/rejected": -28.500328063964844, "step": 234 }, { "epoch": 0.19175846593227255, "grad_norm": 4.238717079162598, "learning_rate": 9.772318717677904e-05, "logits/chosen": -11.475011825561523, "logits/rejected": -10.001847267150879, "logps/chosen": -232.87615966796875, "logps/rejected": -338.768798828125, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": -17.62227439880371, "rewards/margins": 10.624771118164062, "rewards/rejected": -28.247045516967773, "step": 235 }, { "epoch": 0.19257445940432477, "grad_norm": 0.006709840148687363, "learning_rate": 9.76895948453346e-05, "logits/chosen": -11.52347183227539, "logits/rejected": -10.157676696777344, "logps/chosen": -179.35302734375, "logps/rejected": -414.2537841796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.221406936645508, "rewards/margins": 21.20288848876953, "rewards/rejected": -33.42429733276367, "step": 236 }, { "epoch": 0.193390452876377, "grad_norm": 0.27653154730796814, "learning_rate": 9.765576237244238e-05, "logits/chosen": -11.719663619995117, "logits/rejected": -9.528708457946777, "logps/chosen": -155.10829162597656, "logps/rejected": -333.2040100097656, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -8.069525718688965, "rewards/margins": 18.643569946289062, "rewards/rejected": -26.713098526000977, "step": 237 }, { "epoch": 0.1942064463484292, "grad_norm": 0.00037473050178959966, "learning_rate": 9.762168992846614e-05, "logits/chosen": -10.337793350219727, "logits/rejected": -9.201471328735352, "logps/chosen": -144.75930786132812, "logps/rejected": -361.66949462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.9350056648254395, "rewards/margins": 21.719388961791992, "rewards/rejected": -29.654394149780273, "step": 238 }, { "epoch": 0.19502243982048142, "grad_norm": 5.7392473220825195, "learning_rate": 9.758737768497802e-05, "logits/chosen": -10.430442810058594, "logits/rejected": -9.666704177856445, "logps/chosen": -111.52546691894531, "logps/rejected": -291.26806640625, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": -5.858402729034424, "rewards/margins": 16.841293334960938, "rewards/rejected": -22.69969367980957, "step": 239 }, { "epoch": 0.19583843329253367, "grad_norm": 5.331135253072716e-05, "learning_rate": 9.755282581475769e-05, "logits/chosen": -11.34234619140625, "logits/rejected": -9.818406105041504, "logps/chosen": -144.6824493408203, "logps/rejected": -366.60101318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.208514213562012, "rewards/margins": 20.698909759521484, "rewards/rejected": -28.90742301940918, "step": 240 }, { "epoch": 0.1966544267645859, "grad_norm": 20.284547805786133, "learning_rate": 9.751803449179142e-05, "logits/chosen": -11.653255462646484, "logits/rejected": -10.242216110229492, "logps/chosen": -141.26528930664062, "logps/rejected": -278.6805419921875, "loss": 0.7687, "rewards/accuracies": 0.875, "rewards/chosen": -8.970970153808594, "rewards/margins": 13.616973876953125, "rewards/rejected": -22.58794403076172, "step": 241 }, { "epoch": 0.1974704202366381, "grad_norm": 0.8589663505554199, "learning_rate": 9.748300389127131e-05, "logits/chosen": -12.241464614868164, "logits/rejected": -10.482295036315918, "logps/chosen": -191.11062622070312, "logps/rejected": -320.9187927246094, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -13.194540023803711, "rewards/margins": 13.354866981506348, "rewards/rejected": -26.549406051635742, "step": 242 }, { "epoch": 0.19828641370869032, "grad_norm": 0.38982346653938293, "learning_rate": 9.74477341895943e-05, "logits/chosen": -12.696541786193848, "logits/rejected": -11.760897636413574, "logps/chosen": -178.57931518554688, "logps/rejected": -351.7535705566406, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -12.733301162719727, "rewards/margins": 16.532794952392578, "rewards/rejected": -29.266096115112305, "step": 243 }, { "epoch": 0.19910240718074254, "grad_norm": 1.9323961734771729, "learning_rate": 9.741222556436132e-05, "logits/chosen": -13.456323623657227, "logits/rejected": -11.570959091186523, "logps/chosen": -175.71612548828125, "logps/rejected": -285.2767333984375, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": -10.404020309448242, "rewards/margins": 11.891300201416016, "rewards/rejected": -22.295318603515625, "step": 244 }, { "epoch": 0.1999184006527948, "grad_norm": 51.70561218261719, "learning_rate": 9.737647819437645e-05, "logits/chosen": -13.106903076171875, "logits/rejected": -12.066636085510254, "logps/chosen": -165.18734741210938, "logps/rejected": -295.9970703125, "loss": 0.713, "rewards/accuracies": 0.875, "rewards/chosen": -11.67812442779541, "rewards/margins": 12.304929733276367, "rewards/rejected": -23.98305320739746, "step": 245 }, { "epoch": 0.200734394124847, "grad_norm": 0.00019479783077258617, "learning_rate": 9.734049225964591e-05, "logits/chosen": -13.965633392333984, "logits/rejected": -13.05101203918457, "logps/chosen": -162.20584106445312, "logps/rejected": -373.83367919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.291982650756836, "rewards/margins": 21.230201721191406, "rewards/rejected": -30.522184371948242, "step": 246 }, { "epoch": 0.20155038759689922, "grad_norm": 38.63285446166992, "learning_rate": 9.730426794137727e-05, "logits/chosen": -14.432849884033203, "logits/rejected": -13.94564437866211, "logps/chosen": -151.0867919921875, "logps/rejected": -296.13720703125, "loss": 0.5662, "rewards/accuracies": 0.875, "rewards/chosen": -8.330916404724121, "rewards/margins": 15.570674896240234, "rewards/rejected": -23.901592254638672, "step": 247 }, { "epoch": 0.20236638106895144, "grad_norm": 5.4017486572265625, "learning_rate": 9.726780542197844e-05, "logits/chosen": -14.084209442138672, "logits/rejected": -13.625798225402832, "logps/chosen": -134.9090576171875, "logps/rejected": -312.2884216308594, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": -7.538906097412109, "rewards/margins": 16.462499618530273, "rewards/rejected": -24.001407623291016, "step": 248 }, { "epoch": 0.20318237454100369, "grad_norm": 0.0003306112776044756, "learning_rate": 9.723110488505685e-05, "logits/chosen": -14.432716369628906, "logits/rejected": -14.22595500946045, "logps/chosen": -126.11137390136719, "logps/rejected": -377.93914794921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.3637542724609375, "rewards/margins": 23.923786163330078, "rewards/rejected": -30.287540435791016, "step": 249 }, { "epoch": 0.2039983680130559, "grad_norm": 0.004264854826033115, "learning_rate": 9.719416651541839e-05, "logits/chosen": -15.44569206237793, "logits/rejected": -15.273046493530273, "logps/chosen": -128.05401611328125, "logps/rejected": -332.40106201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.688143730163574, "rewards/margins": 19.061494827270508, "rewards/rejected": -26.749637603759766, "step": 250 }, { "epoch": 0.20481436148510812, "grad_norm": 25.818355560302734, "learning_rate": 9.715699049906661e-05, "logits/chosen": -14.806204795837402, "logits/rejected": -15.692740440368652, "logps/chosen": -141.16058349609375, "logps/rejected": -357.89056396484375, "loss": 0.3231, "rewards/accuracies": 1.0, "rewards/chosen": -7.950809478759766, "rewards/margins": 21.72277069091797, "rewards/rejected": -29.673580169677734, "step": 251 }, { "epoch": 0.20563035495716034, "grad_norm": 0.00021741412638220936, "learning_rate": 9.711957702320175e-05, "logits/chosen": -14.638630867004395, "logits/rejected": -15.377279281616211, "logps/chosen": -77.75355529785156, "logps/rejected": -278.17669677734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.933814764022827, "rewards/margins": 19.258255004882812, "rewards/rejected": -22.19207000732422, "step": 252 }, { "epoch": 0.20644634842921256, "grad_norm": 0.0004011043347418308, "learning_rate": 9.708192627621972e-05, "logits/chosen": -13.33406925201416, "logits/rejected": -15.511001586914062, "logps/chosen": -84.72941589355469, "logps/rejected": -361.37896728515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4796228408813477, "rewards/margins": 27.50973892211914, "rewards/rejected": -29.989360809326172, "step": 253 }, { "epoch": 0.2072623419012648, "grad_norm": 11.444947242736816, "learning_rate": 9.704403844771128e-05, "logits/chosen": -12.788993835449219, "logits/rejected": -14.043696403503418, "logps/chosen": -67.53684997558594, "logps/rejected": -289.6108703613281, "loss": 0.0642, "rewards/accuracies": 1.0, "rewards/chosen": -1.677436351776123, "rewards/margins": 21.26360511779785, "rewards/rejected": -22.941041946411133, "step": 254 }, { "epoch": 0.20807833537331702, "grad_norm": 6.386953373294091e-06, "learning_rate": 9.700591372846095e-05, "logits/chosen": -11.497272491455078, "logits/rejected": -13.800570487976074, "logps/chosen": -81.11112213134766, "logps/rejected": -334.911865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4783458709716797, "rewards/margins": 24.664453506469727, "rewards/rejected": -27.142799377441406, "step": 255 }, { "epoch": 0.20889432884536924, "grad_norm": 0.3597415089607239, "learning_rate": 9.696755231044618e-05, "logits/chosen": -11.049829483032227, "logits/rejected": -12.393906593322754, "logps/chosen": -83.88363647460938, "logps/rejected": -260.3031311035156, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.5049729347229004, "rewards/margins": 17.24718475341797, "rewards/rejected": -20.752159118652344, "step": 256 }, { "epoch": 0.20971032231742145, "grad_norm": 0.011453061364591122, "learning_rate": 9.692895438683627e-05, "logits/chosen": -10.997700691223145, "logits/rejected": -12.036637306213379, "logps/chosen": -85.96501159667969, "logps/rejected": -279.2877197265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.3556814193725586, "rewards/margins": 18.5041561126709, "rewards/rejected": -21.85983657836914, "step": 257 }, { "epoch": 0.21052631578947367, "grad_norm": 0.8397673964500427, "learning_rate": 9.689012015199145e-05, "logits/chosen": -11.02825927734375, "logits/rejected": -11.197043418884277, "logps/chosen": -92.59553527832031, "logps/rejected": -223.1221923828125, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -4.067196369171143, "rewards/margins": 13.232458114624023, "rewards/rejected": -17.29965591430664, "step": 258 }, { "epoch": 0.21134230926152592, "grad_norm": 0.0012098310980945826, "learning_rate": 9.685104980146193e-05, "logits/chosen": -9.231039047241211, "logits/rejected": -11.65709114074707, "logps/chosen": -143.7205810546875, "logps/rejected": -334.2421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.9045000076293945, "rewards/margins": 19.19518280029297, "rewards/rejected": -26.09968376159668, "step": 259 }, { "epoch": 0.21215830273357814, "grad_norm": 0.013031992129981518, "learning_rate": 9.681174353198687e-05, "logits/chosen": -10.867335319519043, "logits/rejected": -11.649496078491211, "logps/chosen": -99.10530853271484, "logps/rejected": -324.1353759765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.782567024230957, "rewards/margins": 21.823368072509766, "rewards/rejected": -26.605937957763672, "step": 260 }, { "epoch": 0.21297429620563035, "grad_norm": 74.20552825927734, "learning_rate": 9.677220154149337e-05, "logits/chosen": -11.26164722442627, "logits/rejected": -11.673406600952148, "logps/chosen": -91.27073669433594, "logps/rejected": -251.53573608398438, "loss": 1.0208, "rewards/accuracies": 0.875, "rewards/chosen": -4.743115425109863, "rewards/margins": 15.789651870727539, "rewards/rejected": -20.53276824951172, "step": 261 }, { "epoch": 0.21379028967768257, "grad_norm": 2.3926048015709966e-05, "learning_rate": 9.673242402909555e-05, "logits/chosen": -8.806029319763184, "logits/rejected": -11.166755676269531, "logps/chosen": -96.30796813964844, "logps/rejected": -360.9769287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9405808448791504, "rewards/margins": 24.858417510986328, "rewards/rejected": -28.798995971679688, "step": 262 }, { "epoch": 0.2146062831497348, "grad_norm": 0.0003016665286850184, "learning_rate": 9.66924111950935e-05, "logits/chosen": -9.688959121704102, "logits/rejected": -12.59365463256836, "logps/chosen": -73.85462951660156, "logps/rejected": -322.65093994140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.1804075241088867, "rewards/margins": 24.086734771728516, "rewards/rejected": -27.26714324951172, "step": 263 }, { "epoch": 0.21542227662178703, "grad_norm": 3.214951992034912, "learning_rate": 9.665216324097222e-05, "logits/chosen": -9.278764724731445, "logits/rejected": -11.973953247070312, "logps/chosen": -125.27214050292969, "logps/rejected": -381.9024353027344, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": -6.596549034118652, "rewards/margins": 25.674951553344727, "rewards/rejected": -32.27150344848633, "step": 264 }, { "epoch": 0.21623827009383925, "grad_norm": 2.589518771856092e-05, "learning_rate": 9.661168036940071e-05, "logits/chosen": -10.128534317016602, "logits/rejected": -10.7782621383667, "logps/chosen": -110.13398742675781, "logps/rejected": -369.9613037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.938895225524902, "rewards/margins": 24.552947998046875, "rewards/rejected": -29.491844177246094, "step": 265 }, { "epoch": 0.21705426356589147, "grad_norm": 0.0010133684845641255, "learning_rate": 9.657096278423093e-05, "logits/chosen": -10.444766998291016, "logits/rejected": -11.980485916137695, "logps/chosen": -168.49758911132812, "logps/rejected": -375.51898193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.14266586303711, "rewards/margins": 20.275033950805664, "rewards/rejected": -31.417699813842773, "step": 266 }, { "epoch": 0.2178702570379437, "grad_norm": 41.21939468383789, "learning_rate": 9.653001069049664e-05, "logits/chosen": -10.166536331176758, "logits/rejected": -12.473788261413574, "logps/chosen": -173.22340393066406, "logps/rejected": -460.94573974609375, "loss": 0.3467, "rewards/accuracies": 0.875, "rewards/chosen": -11.171211242675781, "rewards/margins": 27.674640655517578, "rewards/rejected": -38.84585189819336, "step": 267 }, { "epoch": 0.21868625050999593, "grad_norm": 0.0001726004556985572, "learning_rate": 9.648882429441257e-05, "logits/chosen": -9.916497230529785, "logits/rejected": -12.719560623168945, "logps/chosen": -198.09535217285156, "logps/rejected": -438.9021301269531, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.261007308959961, "rewards/margins": 24.087074279785156, "rewards/rejected": -37.34808349609375, "step": 268 }, { "epoch": 0.21950224398204815, "grad_norm": 0.006375892553478479, "learning_rate": 9.644740380337325e-05, "logits/chosen": -8.966487884521484, "logits/rejected": -12.468289375305176, "logps/chosen": -160.25244140625, "logps/rejected": -396.6556091308594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.194402694702148, "rewards/margins": 23.289348602294922, "rewards/rejected": -34.4837532043457, "step": 269 }, { "epoch": 0.22031823745410037, "grad_norm": 0.006416396703571081, "learning_rate": 9.640574942595196e-05, "logits/chosen": -9.270419120788574, "logits/rejected": -11.869553565979004, "logps/chosen": -194.03829956054688, "logps/rejected": -382.5901184082031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.770822525024414, "rewards/margins": 18.972993850708008, "rewards/rejected": -31.743816375732422, "step": 270 }, { "epoch": 0.2211342309261526, "grad_norm": 4.4845099449157715, "learning_rate": 9.636386137189975e-05, "logits/chosen": -8.918877601623535, "logits/rejected": -12.158781051635742, "logps/chosen": -202.99667358398438, "logps/rejected": -373.7106018066406, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -15.038204193115234, "rewards/margins": 16.501924514770508, "rewards/rejected": -31.540128707885742, "step": 271 }, { "epoch": 0.2219502243982048, "grad_norm": 112.37238311767578, "learning_rate": 9.632173985214438e-05, "logits/chosen": -8.699335098266602, "logits/rejected": -10.916994094848633, "logps/chosen": -195.94102478027344, "logps/rejected": -352.72601318359375, "loss": 1.1297, "rewards/accuracies": 0.875, "rewards/chosen": -13.683219909667969, "rewards/margins": 15.147764205932617, "rewards/rejected": -28.83098602294922, "step": 272 }, { "epoch": 0.22276621787025705, "grad_norm": 0.08233103156089783, "learning_rate": 9.627938507878917e-05, "logits/chosen": -9.567941665649414, "logits/rejected": -12.628256797790527, "logps/chosen": -232.67938232421875, "logps/rejected": -456.5940246582031, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -17.11125946044922, "rewards/margins": 22.15615463256836, "rewards/rejected": -39.26741409301758, "step": 273 }, { "epoch": 0.22358221134230927, "grad_norm": 0.004013880621641874, "learning_rate": 9.623679726511203e-05, "logits/chosen": -8.763690948486328, "logits/rejected": -11.703088760375977, "logps/chosen": -205.50926208496094, "logps/rejected": -422.7945861816406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.506056785583496, "rewards/margins": 20.677791595458984, "rewards/rejected": -35.1838493347168, "step": 274 }, { "epoch": 0.22439820481436148, "grad_norm": 18.592931747436523, "learning_rate": 9.619397662556435e-05, "logits/chosen": -9.727400779724121, "logits/rejected": -11.86372184753418, "logps/chosen": -224.7122802734375, "logps/rejected": -396.5758972167969, "loss": 0.0976, "rewards/accuracies": 1.0, "rewards/chosen": -17.492294311523438, "rewards/margins": 14.986063957214355, "rewards/rejected": -32.478355407714844, "step": 275 }, { "epoch": 0.2252141982864137, "grad_norm": 0.002710699802264571, "learning_rate": 9.615092337576988e-05, "logits/chosen": -9.137358665466309, "logits/rejected": -14.060672760009766, "logps/chosen": -184.71690368652344, "logps/rejected": -440.1384582519531, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.663496017456055, "rewards/margins": 26.404312133789062, "rewards/rejected": -39.06781005859375, "step": 276 }, { "epoch": 0.22603019175846592, "grad_norm": 0.6816709637641907, "learning_rate": 9.61076377325237e-05, "logits/chosen": -8.518875122070312, "logits/rejected": -14.777054786682129, "logps/chosen": -181.19232177734375, "logps/rejected": -469.70684814453125, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -13.994298934936523, "rewards/margins": 27.094783782958984, "rewards/rejected": -41.089080810546875, "step": 277 }, { "epoch": 0.22684618523051817, "grad_norm": 0.00708477757871151, "learning_rate": 9.606411991379113e-05, "logits/chosen": -11.593401908874512, "logits/rejected": -16.87900161743164, "logps/chosen": -205.0511474609375, "logps/rejected": -453.56268310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.33000946044922, "rewards/margins": 24.95808982849121, "rewards/rejected": -41.28810119628906, "step": 278 }, { "epoch": 0.22766217870257038, "grad_norm": 61.75625228881836, "learning_rate": 9.60203701387066e-05, "logits/chosen": -13.449602127075195, "logits/rejected": -16.416706085205078, "logps/chosen": -237.55517578125, "logps/rejected": -421.36138916015625, "loss": 1.3865, "rewards/accuracies": 0.875, "rewards/chosen": -19.302162170410156, "rewards/margins": 18.45186996459961, "rewards/rejected": -37.754032135009766, "step": 279 }, { "epoch": 0.2284781721746226, "grad_norm": 0.004971665795892477, "learning_rate": 9.597638862757255e-05, "logits/chosen": -13.651420593261719, "logits/rejected": -17.855146408081055, "logps/chosen": -220.893798828125, "logps/rejected": -508.57635498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.427242279052734, "rewards/margins": 27.370925903320312, "rewards/rejected": -44.79816818237305, "step": 280 }, { "epoch": 0.22929416564667482, "grad_norm": 0.005900177638977766, "learning_rate": 9.59321756018583e-05, "logits/chosen": -13.64161491394043, "logits/rejected": -18.822555541992188, "logps/chosen": -259.52191162109375, "logps/rejected": -582.2808837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.577970504760742, "rewards/margins": 32.62310791015625, "rewards/rejected": -52.201080322265625, "step": 281 }, { "epoch": 0.23011015911872704, "grad_norm": 0.12680520117282867, "learning_rate": 9.588773128419906e-05, "logits/chosen": -16.898468017578125, "logits/rejected": -19.14842414855957, "logps/chosen": -326.052734375, "logps/rejected": -506.814208984375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -26.912670135498047, "rewards/margins": 18.92453956604004, "rewards/rejected": -45.83721160888672, "step": 282 }, { "epoch": 0.23092615259077928, "grad_norm": 11.035979270935059, "learning_rate": 9.584305589839462e-05, "logits/chosen": -16.63330841064453, "logits/rejected": -19.544584274291992, "logps/chosen": -321.236572265625, "logps/rejected": -673.548095703125, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": -25.799196243286133, "rewards/margins": 34.145301818847656, "rewards/rejected": -59.944496154785156, "step": 283 }, { "epoch": 0.2317421460628315, "grad_norm": 1.1647770404815674, "learning_rate": 9.579814966940833e-05, "logits/chosen": -15.102285385131836, "logits/rejected": -16.898733139038086, "logps/chosen": -278.5546875, "logps/rejected": -489.2818298339844, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -23.328365325927734, "rewards/margins": 20.699390411376953, "rewards/rejected": -44.02776336669922, "step": 284 }, { "epoch": 0.23255813953488372, "grad_norm": 0.004144749138504267, "learning_rate": 9.575301282336599e-05, "logits/chosen": -12.067740440368652, "logits/rejected": -15.403039932250977, "logps/chosen": -195.6218719482422, "logps/rejected": -411.275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.01167106628418, "rewards/margins": 22.132976531982422, "rewards/rejected": -35.144649505615234, "step": 285 }, { "epoch": 0.23337413300693594, "grad_norm": 1.0316583939129487e-05, "learning_rate": 9.570764558755466e-05, "logits/chosen": -11.243715286254883, "logits/rejected": -13.690643310546875, "logps/chosen": -105.70458221435547, "logps/rejected": -437.6370544433594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.261760711669922, "rewards/margins": 32.69416046142578, "rewards/rejected": -37.95592498779297, "step": 286 }, { "epoch": 0.23419012647898818, "grad_norm": 2.3228534701047465e-05, "learning_rate": 9.566204819042152e-05, "logits/chosen": -9.66921329498291, "logits/rejected": -12.400483131408691, "logps/chosen": -118.60467529296875, "logps/rejected": -382.8337097167969, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.02426815032959, "rewards/margins": 25.982173919677734, "rewards/rejected": -31.00644302368164, "step": 287 }, { "epoch": 0.2350061199510404, "grad_norm": 0.00022300882847048342, "learning_rate": 9.561622086157272e-05, "logits/chosen": -9.489175796508789, "logits/rejected": -10.951833724975586, "logps/chosen": -105.94268798828125, "logps/rejected": -353.39495849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.817045211791992, "rewards/margins": 23.50859260559082, "rewards/rejected": -28.32563591003418, "step": 288 }, { "epoch": 0.23582211342309262, "grad_norm": 0.0005385390250012279, "learning_rate": 9.557016383177227e-05, "logits/chosen": -8.012263298034668, "logits/rejected": -10.501367568969727, "logps/chosen": -97.65235137939453, "logps/rejected": -332.4874267578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.0450046062469482, "rewards/margins": 23.431724548339844, "rewards/rejected": -26.476728439331055, "step": 289 }, { "epoch": 0.23663810689514483, "grad_norm": 0.028803100809454918, "learning_rate": 9.552387733294081e-05, "logits/chosen": -7.68990421295166, "logits/rejected": -9.194329261779785, "logps/chosen": -109.41976928710938, "logps/rejected": -238.83673095703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.3987932205200195, "rewards/margins": 14.029091835021973, "rewards/rejected": -18.42788314819336, "step": 290 }, { "epoch": 0.23745410036719705, "grad_norm": 1.762647271156311, "learning_rate": 9.547736159815446e-05, "logits/chosen": -7.999564170837402, "logits/rejected": -9.19517707824707, "logps/chosen": -119.50699615478516, "logps/rejected": -306.7482604980469, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -5.213405609130859, "rewards/margins": 17.332218170166016, "rewards/rejected": -22.545623779296875, "step": 291 }, { "epoch": 0.2382700938392493, "grad_norm": 0.3635430634021759, "learning_rate": 9.543061686164373e-05, "logits/chosen": -10.734640121459961, "logits/rejected": -11.860909461975098, "logps/chosen": -56.15593719482422, "logps/rejected": -251.75912475585938, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -1.696345329284668, "rewards/margins": 19.10782241821289, "rewards/rejected": -20.804166793823242, "step": 292 }, { "epoch": 0.23908608731130152, "grad_norm": 1.2420682907104492, "learning_rate": 9.53836433587922e-05, "logits/chosen": -12.816301345825195, "logits/rejected": -13.469122886657715, "logps/chosen": -111.08757019042969, "logps/rejected": -228.88519287109375, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -4.740365028381348, "rewards/margins": 14.106433868408203, "rewards/rejected": -18.846797943115234, "step": 293 }, { "epoch": 0.23990208078335373, "grad_norm": 0.0006637500482611358, "learning_rate": 9.533644132613541e-05, "logits/chosen": -14.893056869506836, "logits/rejected": -16.027860641479492, "logps/chosen": -131.9689483642578, "logps/rejected": -397.3836364746094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.1445512771606445, "rewards/margins": 26.371074676513672, "rewards/rejected": -33.515625, "step": 294 }, { "epoch": 0.24071807425540595, "grad_norm": 9.669391147326678e-06, "learning_rate": 9.528901100135971e-05, "logits/chosen": -13.805521965026855, "logits/rejected": -16.047161102294922, "logps/chosen": -102.98158264160156, "logps/rejected": -424.4520263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.915434837341309, "rewards/margins": 29.851396560668945, "rewards/rejected": -35.76683044433594, "step": 295 }, { "epoch": 0.24153406772745817, "grad_norm": 13.593500137329102, "learning_rate": 9.524135262330098e-05, "logits/chosen": -15.909582138061523, "logits/rejected": -17.80043601989746, "logps/chosen": -164.89254760742188, "logps/rejected": -416.3616638183594, "loss": 0.2537, "rewards/accuracies": 1.0, "rewards/chosen": -10.910738945007324, "rewards/margins": 25.20503807067871, "rewards/rejected": -36.11577606201172, "step": 296 }, { "epoch": 0.2423500611995104, "grad_norm": 3.55899210262578e-05, "learning_rate": 9.519346643194349e-05, "logits/chosen": -16.322526931762695, "logits/rejected": -17.99270248413086, "logps/chosen": -201.72634887695312, "logps/rejected": -469.08319091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.93994140625, "rewards/margins": 28.11479949951172, "rewards/rejected": -40.05474090576172, "step": 297 }, { "epoch": 0.24316605467156263, "grad_norm": 4.634157448890619e-06, "learning_rate": 9.514535266841862e-05, "logits/chosen": -17.590543746948242, "logits/rejected": -18.33767318725586, "logps/chosen": -235.98593139648438, "logps/rejected": -518.6690063476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.715530395507812, "rewards/margins": 26.96985626220703, "rewards/rejected": -44.685386657714844, "step": 298 }, { "epoch": 0.24398204814361485, "grad_norm": 4.84290075302124, "learning_rate": 9.509701157500376e-05, "logits/chosen": -17.375850677490234, "logits/rejected": -18.327800750732422, "logps/chosen": -191.04986572265625, "logps/rejected": -449.1412048339844, "loss": 0.0376, "rewards/accuracies": 1.0, "rewards/chosen": -14.847517013549805, "rewards/margins": 24.439416885375977, "rewards/rejected": -39.28693389892578, "step": 299 }, { "epoch": 0.24479804161566707, "grad_norm": 0.00014727511734236032, "learning_rate": 9.504844339512095e-05, "logits/chosen": -17.409826278686523, "logits/rejected": -18.643966674804688, "logps/chosen": -194.89422607421875, "logps/rejected": -556.2900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.674834251403809, "rewards/margins": 34.213626861572266, "rewards/rejected": -48.888458251953125, "step": 300 }, { "epoch": 0.24561403508771928, "grad_norm": 0.0037735416553914547, "learning_rate": 9.49996483733358e-05, "logits/chosen": -17.465438842773438, "logits/rejected": -18.596603393554688, "logps/chosen": -250.05517578125, "logps/rejected": -574.5320434570312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.963924407958984, "rewards/margins": 29.947113037109375, "rewards/rejected": -48.911033630371094, "step": 301 }, { "epoch": 0.24643002855977153, "grad_norm": 0.04821038618683815, "learning_rate": 9.495062675535613e-05, "logits/chosen": -17.616226196289062, "logits/rejected": -18.53111457824707, "logps/chosen": -259.45989990234375, "logps/rejected": -434.0611267089844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -20.50058364868164, "rewards/margins": 18.040441513061523, "rewards/rejected": -38.54102325439453, "step": 302 }, { "epoch": 0.24724602203182375, "grad_norm": 1.8714048862457275, "learning_rate": 9.490137878803079e-05, "logits/chosen": -16.885345458984375, "logits/rejected": -18.01694107055664, "logps/chosen": -246.2701873779297, "logps/rejected": -497.90374755859375, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -17.579376220703125, "rewards/margins": 25.142433166503906, "rewards/rejected": -42.72180938720703, "step": 303 }, { "epoch": 0.24806201550387597, "grad_norm": 0.004759036935865879, "learning_rate": 9.485190471934843e-05, "logits/chosen": -16.91033935546875, "logits/rejected": -18.10635757446289, "logps/chosen": -241.02296447753906, "logps/rejected": -555.1207275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.88488006591797, "rewards/margins": 30.154438018798828, "rewards/rejected": -49.03931427001953, "step": 304 }, { "epoch": 0.24887800897592818, "grad_norm": 16.475387573242188, "learning_rate": 9.480220479843627e-05, "logits/chosen": -16.333524703979492, "logits/rejected": -17.400737762451172, "logps/chosen": -307.61883544921875, "logps/rejected": -520.6001586914062, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": -25.180419921875, "rewards/margins": 20.530651092529297, "rewards/rejected": -45.71106719970703, "step": 305 }, { "epoch": 0.24969400244798043, "grad_norm": 1.1196490049769636e-05, "learning_rate": 9.475227927555872e-05, "logits/chosen": -16.294763565063477, "logits/rejected": -16.13132095336914, "logps/chosen": -266.0546875, "logps/rejected": -523.6103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.53067970275879, "rewards/margins": 24.24150848388672, "rewards/rejected": -45.772186279296875, "step": 306 }, { "epoch": 0.25050999592003265, "grad_norm": 0.37041911482810974, "learning_rate": 9.470212840211632e-05, "logits/chosen": -15.523881912231445, "logits/rejected": -16.76061248779297, "logps/chosen": -268.8870544433594, "logps/rejected": -476.2798156738281, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -21.66097640991211, "rewards/margins": 20.96092414855957, "rewards/rejected": -42.62190246582031, "step": 307 }, { "epoch": 0.25132598939208484, "grad_norm": 68.40034484863281, "learning_rate": 9.465175243064428e-05, "logits/chosen": -14.98227310180664, "logits/rejected": -16.57996940612793, "logps/chosen": -280.7308349609375, "logps/rejected": -456.3181457519531, "loss": 8.1667, "rewards/accuracies": 0.875, "rewards/chosen": -23.031734466552734, "rewards/margins": 18.248315811157227, "rewards/rejected": -41.280052185058594, "step": 308 }, { "epoch": 0.2521419828641371, "grad_norm": 0.02184598706662655, "learning_rate": 9.460115161481132e-05, "logits/chosen": -15.476863861083984, "logits/rejected": -15.941044807434082, "logps/chosen": -247.20034790039062, "logps/rejected": -491.08367919921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -19.599496841430664, "rewards/margins": 23.256038665771484, "rewards/rejected": -42.855533599853516, "step": 309 }, { "epoch": 0.2529579763361893, "grad_norm": 0.8346619009971619, "learning_rate": 9.45503262094184e-05, "logits/chosen": -14.856800079345703, "logits/rejected": -16.43402862548828, "logps/chosen": -217.92416381835938, "logps/rejected": -483.34771728515625, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -16.042951583862305, "rewards/margins": 26.06529426574707, "rewards/rejected": -42.108245849609375, "step": 310 }, { "epoch": 0.2537739698082415, "grad_norm": 0.002111955778673291, "learning_rate": 9.449927647039736e-05, "logits/chosen": -13.997231483459473, "logits/rejected": -15.2578125, "logps/chosen": -170.9444580078125, "logps/rejected": -395.749755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.919166564941406, "rewards/margins": 21.485597610473633, "rewards/rejected": -32.40476608276367, "step": 311 }, { "epoch": 0.25458996328029376, "grad_norm": 0.008486362174153328, "learning_rate": 9.444800265480967e-05, "logits/chosen": -14.362428665161133, "logits/rejected": -16.026975631713867, "logps/chosen": -202.03189086914062, "logps/rejected": -482.66033935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.820335388183594, "rewards/margins": 25.21038055419922, "rewards/rejected": -40.03071594238281, "step": 312 }, { "epoch": 0.255405956752346, "grad_norm": 0.1023339331150055, "learning_rate": 9.439650502084518e-05, "logits/chosen": -13.698419570922852, "logits/rejected": -15.592988967895508, "logps/chosen": -210.83197021484375, "logps/rejected": -519.4010620117188, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -15.191287994384766, "rewards/margins": 29.19268798828125, "rewards/rejected": -44.383975982666016, "step": 313 }, { "epoch": 0.2562219502243982, "grad_norm": 0.001921481336466968, "learning_rate": 9.434478382782075e-05, "logits/chosen": -13.992198944091797, "logits/rejected": -15.472991943359375, "logps/chosen": -199.48011779785156, "logps/rejected": -488.013916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.257129669189453, "rewards/margins": 28.266815185546875, "rewards/rejected": -41.52394485473633, "step": 314 }, { "epoch": 0.25703794369645044, "grad_norm": 0.00026581474230624735, "learning_rate": 9.4292839336179e-05, "logits/chosen": -14.632766723632812, "logits/rejected": -16.343807220458984, "logps/chosen": -181.8853759765625, "logps/rejected": -408.5599060058594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.430631637573242, "rewards/margins": 21.519502639770508, "rewards/rejected": -35.95013427734375, "step": 315 }, { "epoch": 0.25785393716850263, "grad_norm": 62.53071594238281, "learning_rate": 9.424067180748692e-05, "logits/chosen": -14.636754035949707, "logits/rejected": -15.93353271484375, "logps/chosen": -202.10061645507812, "logps/rejected": -459.2847900390625, "loss": 2.3219, "rewards/accuracies": 0.875, "rewards/chosen": -15.618247032165527, "rewards/margins": 24.006549835205078, "rewards/rejected": -39.624794006347656, "step": 316 }, { "epoch": 0.2586699306405549, "grad_norm": 0.30295220017433167, "learning_rate": 9.418828150443469e-05, "logits/chosen": -13.074886322021484, "logits/rejected": -14.525035858154297, "logps/chosen": -204.2904815673828, "logps/rejected": -391.6851806640625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -13.37636947631836, "rewards/margins": 19.45201873779297, "rewards/rejected": -32.82838821411133, "step": 317 }, { "epoch": 0.2594859241126071, "grad_norm": 32.64633560180664, "learning_rate": 9.413566869083416e-05, "logits/chosen": -13.608403205871582, "logits/rejected": -15.285137176513672, "logps/chosen": -199.98977661132812, "logps/rejected": -383.95361328125, "loss": 2.3135, "rewards/accuracies": 0.875, "rewards/chosen": -15.212648391723633, "rewards/margins": 18.959125518798828, "rewards/rejected": -34.17177200317383, "step": 318 }, { "epoch": 0.2603019175846593, "grad_norm": 0.047908175736665726, "learning_rate": 9.408283363161774e-05, "logits/chosen": -13.857292175292969, "logits/rejected": -14.904293060302734, "logps/chosen": -238.568359375, "logps/rejected": -487.5601806640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -16.902257919311523, "rewards/margins": 25.258113861083984, "rewards/rejected": -42.160369873046875, "step": 319 }, { "epoch": 0.26111791105671156, "grad_norm": 0.003684533294290304, "learning_rate": 9.40297765928369e-05, "logits/chosen": -14.061006546020508, "logits/rejected": -15.612723350524902, "logps/chosen": -192.35302734375, "logps/rejected": -412.67303466796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.432043075561523, "rewards/margins": 22.022167205810547, "rewards/rejected": -36.45420837402344, "step": 320 }, { "epoch": 0.26193390452876375, "grad_norm": 66.28643035888672, "learning_rate": 9.397649784166092e-05, "logits/chosen": -13.45442008972168, "logits/rejected": -14.812433242797852, "logps/chosen": -261.83123779296875, "logps/rejected": -394.64703369140625, "loss": 1.5748, "rewards/accuracies": 0.875, "rewards/chosen": -19.741979598999023, "rewards/margins": 14.081380844116211, "rewards/rejected": -33.8233642578125, "step": 321 }, { "epoch": 0.262749898000816, "grad_norm": 76.19438171386719, "learning_rate": 9.39229976463755e-05, "logits/chosen": -13.217508316040039, "logits/rejected": -13.494247436523438, "logps/chosen": -316.42181396484375, "logps/rejected": -446.54888916015625, "loss": 4.1813, "rewards/accuracies": 0.875, "rewards/chosen": -27.31641387939453, "rewards/margins": 12.031435012817383, "rewards/rejected": -39.34784698486328, "step": 322 }, { "epoch": 0.26356589147286824, "grad_norm": 52.25479507446289, "learning_rate": 9.386927627638142e-05, "logits/chosen": -10.287138938903809, "logits/rejected": -10.96617317199707, "logps/chosen": -235.65670776367188, "logps/rejected": -426.1023864746094, "loss": 1.0586, "rewards/accuracies": 0.875, "rewards/chosen": -16.468318939208984, "rewards/margins": 19.113845825195312, "rewards/rejected": -35.5821647644043, "step": 323 }, { "epoch": 0.26438188494492043, "grad_norm": 14.650277137756348, "learning_rate": 9.381533400219318e-05, "logits/chosen": -8.269796371459961, "logits/rejected": -8.923911094665527, "logps/chosen": -201.1951446533203, "logps/rejected": -318.8875732421875, "loss": 2.5361, "rewards/accuracies": 0.875, "rewards/chosen": -14.699061393737793, "rewards/margins": 11.92845630645752, "rewards/rejected": -26.627517700195312, "step": 324 }, { "epoch": 0.2651978784169727, "grad_norm": 0.018482405692338943, "learning_rate": 9.376117109543769e-05, "logits/chosen": -4.699315547943115, "logits/rejected": -5.341955184936523, "logps/chosen": -136.36013793945312, "logps/rejected": -287.37017822265625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.553003787994385, "rewards/margins": 16.62084197998047, "rewards/rejected": -23.173843383789062, "step": 325 }, { "epoch": 0.26601387188902487, "grad_norm": 0.01990222930908203, "learning_rate": 9.37067878288528e-05, "logits/chosen": -4.491049289703369, "logits/rejected": -4.341488838195801, "logps/chosen": -123.81049346923828, "logps/rejected": -284.9947509765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.082080364227295, "rewards/margins": 15.382107734680176, "rewards/rejected": -21.464187622070312, "step": 326 }, { "epoch": 0.2668298653610771, "grad_norm": 0.19517892599105835, "learning_rate": 9.365218447628603e-05, "logits/chosen": -1.2369911670684814, "logits/rejected": -1.460111141204834, "logps/chosen": -105.55363464355469, "logps/rejected": -210.57440185546875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -4.59702730178833, "rewards/margins": 11.786713600158691, "rewards/rejected": -16.383739471435547, "step": 327 }, { "epoch": 0.26764585883312936, "grad_norm": 0.06865404546260834, "learning_rate": 9.359736131269312e-05, "logits/chosen": -0.9978897571563721, "logits/rejected": -0.7153415679931641, "logps/chosen": -102.38935852050781, "logps/rejected": -249.46487426757812, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.2326974868774414, "rewards/margins": 15.163217544555664, "rewards/rejected": -18.39591407775879, "step": 328 }, { "epoch": 0.26846185230518155, "grad_norm": 3.034238576889038, "learning_rate": 9.354231861413668e-05, "logits/chosen": -1.1397753953933716, "logits/rejected": 0.27396970987319946, "logps/chosen": -111.52031707763672, "logps/rejected": -263.5360412597656, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": -3.799574613571167, "rewards/margins": 15.522680282592773, "rewards/rejected": -19.322254180908203, "step": 329 }, { "epoch": 0.2692778457772338, "grad_norm": 0.09006127715110779, "learning_rate": 9.348705665778478e-05, "logits/chosen": 0.01636667549610138, "logits/rejected": -0.312343567609787, "logps/chosen": -106.26002502441406, "logps/rejected": -228.46665954589844, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -4.576852798461914, "rewards/margins": 12.851371765136719, "rewards/rejected": -17.428224563598633, "step": 330 }, { "epoch": 0.270093839249286, "grad_norm": 0.10469751805067062, "learning_rate": 9.343157572190957e-05, "logits/chosen": -0.7751108407974243, "logits/rejected": 0.11586302518844604, "logps/chosen": -70.46332550048828, "logps/rejected": -216.43356323242188, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.940313458442688, "rewards/margins": 13.771827697753906, "rewards/rejected": -14.712140083312988, "step": 331 }, { "epoch": 0.27090983272133823, "grad_norm": 0.008728167973458767, "learning_rate": 9.337587608588588e-05, "logits/chosen": -1.3020216226577759, "logits/rejected": 0.29650938510894775, "logps/chosen": -92.79142761230469, "logps/rejected": -261.17523193359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.3466310501098633, "rewards/margins": 16.64208984375, "rewards/rejected": -19.988719940185547, "step": 332 }, { "epoch": 0.2717258261933905, "grad_norm": 0.9698936343193054, "learning_rate": 9.331995803018981e-05, "logits/chosen": -0.8883445262908936, "logits/rejected": 0.12885552644729614, "logps/chosen": -57.33027267456055, "logps/rejected": -217.082275390625, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -1.1083827018737793, "rewards/margins": 14.171688079833984, "rewards/rejected": -15.280070304870605, "step": 333 }, { "epoch": 0.27254181966544266, "grad_norm": 0.0007219838444143534, "learning_rate": 9.326382183639731e-05, "logits/chosen": -0.9328141212463379, "logits/rejected": 0.1704556941986084, "logps/chosen": -101.66610717773438, "logps/rejected": -289.36285400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.17764949798584, "rewards/margins": 18.452714920043945, "rewards/rejected": -22.63036346435547, "step": 334 }, { "epoch": 0.2733578131374949, "grad_norm": 0.00043095965520478785, "learning_rate": 9.320746778718275e-05, "logits/chosen": -0.8222547769546509, "logits/rejected": 0.6380710601806641, "logps/chosen": -108.79914855957031, "logps/rejected": -303.60430908203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.4873762130737305, "rewards/margins": 20.098907470703125, "rewards/rejected": -24.586284637451172, "step": 335 }, { "epoch": 0.2741738066095471, "grad_norm": 0.0005042212433181703, "learning_rate": 9.315089616631752e-05, "logits/chosen": -2.1082537174224854, "logits/rejected": -0.8749812245368958, "logps/chosen": -84.0229721069336, "logps/rejected": -287.2393493652344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.760906219482422, "rewards/margins": 19.502105712890625, "rewards/rejected": -23.263010025024414, "step": 336 }, { "epoch": 0.27498980008159934, "grad_norm": 0.03879374638199806, "learning_rate": 9.309410725866862e-05, "logits/chosen": -1.9367183446884155, "logits/rejected": -0.7970031499862671, "logps/chosen": -92.76180267333984, "logps/rejected": -293.6820373535156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.5940134525299072, "rewards/margins": 19.663738250732422, "rewards/rejected": -23.25775146484375, "step": 337 }, { "epoch": 0.2758057935536516, "grad_norm": 0.018965065479278564, "learning_rate": 9.30371013501972e-05, "logits/chosen": -1.3051910400390625, "logits/rejected": 0.03368878364562988, "logps/chosen": -156.66961669921875, "logps/rejected": -324.44964599609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -10.33583927154541, "rewards/margins": 17.56170654296875, "rewards/rejected": -27.897544860839844, "step": 338 }, { "epoch": 0.2766217870257038, "grad_norm": 0.00809240248054266, "learning_rate": 9.297987872795705e-05, "logits/chosen": -1.8416078090667725, "logits/rejected": 0.27854490280151367, "logps/chosen": -142.57113647460938, "logps/rejected": -357.99334716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.7122955322265625, "rewards/margins": 20.939483642578125, "rewards/rejected": -27.651779174804688, "step": 339 }, { "epoch": 0.277437780497756, "grad_norm": 0.000985692604444921, "learning_rate": 9.292243968009331e-05, "logits/chosen": -1.6931838989257812, "logits/rejected": 0.4305517375469208, "logps/chosen": -177.5070037841797, "logps/rejected": -358.6864929199219, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.425640106201172, "rewards/margins": 17.95069694519043, "rewards/rejected": -30.37633514404297, "step": 340 }, { "epoch": 0.2782537739698082, "grad_norm": 0.03076515533030033, "learning_rate": 9.28647844958409e-05, "logits/chosen": -2.124490261077881, "logits/rejected": 0.2151016891002655, "logps/chosen": -184.319580078125, "logps/rejected": -333.41986083984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.887386322021484, "rewards/margins": 15.4686279296875, "rewards/rejected": -29.356014251708984, "step": 341 }, { "epoch": 0.27906976744186046, "grad_norm": 0.03788198530673981, "learning_rate": 9.280691346552308e-05, "logits/chosen": -1.499356985092163, "logits/rejected": 0.17660725116729736, "logps/chosen": -193.41488647460938, "logps/rejected": -397.3574523925781, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -13.014991760253906, "rewards/margins": 19.674057006835938, "rewards/rejected": -32.68905258178711, "step": 342 }, { "epoch": 0.2798857609139127, "grad_norm": 12.133699417114258, "learning_rate": 9.274882688055005e-05, "logits/chosen": -0.5906044840812683, "logits/rejected": 1.0954763889312744, "logps/chosen": -237.655029296875, "logps/rejected": -408.3963623046875, "loss": 0.3589, "rewards/accuracies": 1.0, "rewards/chosen": -18.649202346801758, "rewards/margins": 14.809571266174316, "rewards/rejected": -33.45877456665039, "step": 343 }, { "epoch": 0.2807017543859649, "grad_norm": 5.9587495343293995e-05, "learning_rate": 9.269052503341736e-05, "logits/chosen": -2.795231342315674, "logits/rejected": -0.3811749815940857, "logps/chosen": -133.2888641357422, "logps/rejected": -355.7118225097656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.817846298217773, "rewards/margins": 21.44972801208496, "rewards/rejected": -30.267576217651367, "step": 344 }, { "epoch": 0.28151774785801714, "grad_norm": 0.8091966509819031, "learning_rate": 9.263200821770461e-05, "logits/chosen": -1.5151029825210571, "logits/rejected": 0.15025201439857483, "logps/chosen": -240.8880157470703, "logps/rejected": -391.9287414550781, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -18.042678833007812, "rewards/margins": 15.586021423339844, "rewards/rejected": -33.628700256347656, "step": 345 }, { "epoch": 0.28233374133006933, "grad_norm": 108.66437530517578, "learning_rate": 9.257327672807382e-05, "logits/chosen": -3.7523913383483887, "logits/rejected": -1.9780633449554443, "logps/chosen": -161.63131713867188, "logps/rejected": -357.8226318359375, "loss": 5.4769, "rewards/accuracies": 0.875, "rewards/chosen": -11.534266471862793, "rewards/margins": 18.318321228027344, "rewards/rejected": -29.852588653564453, "step": 346 }, { "epoch": 0.2831497348021216, "grad_norm": 0.00404748972505331, "learning_rate": 9.251433086026799e-05, "logits/chosen": -3.2383816242218018, "logits/rejected": -1.6977741718292236, "logps/chosen": -145.80490112304688, "logps/rejected": -374.75042724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.425862312316895, "rewards/margins": 21.946489334106445, "rewards/rejected": -31.372352600097656, "step": 347 }, { "epoch": 0.2839657282741738, "grad_norm": 2.6204563141618564e-07, "learning_rate": 9.24551709111097e-05, "logits/chosen": -3.002244234085083, "logits/rejected": -2.5995137691497803, "logps/chosen": -173.5109100341797, "logps/rejected": -411.783935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.301502227783203, "rewards/margins": 22.66751480102539, "rewards/rejected": -33.96902084350586, "step": 348 }, { "epoch": 0.284781721746226, "grad_norm": 0.0034193319734185934, "learning_rate": 9.239579717849941e-05, "logits/chosen": -5.328883647918701, "logits/rejected": -3.011111259460449, "logps/chosen": -123.85810089111328, "logps/rejected": -360.529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.582424163818359, "rewards/margins": 22.653446197509766, "rewards/rejected": -30.235872268676758, "step": 349 }, { "epoch": 0.28559771521827826, "grad_norm": 0.0001371344696963206, "learning_rate": 9.233620996141421e-05, "logits/chosen": -3.5035183429718018, "logits/rejected": -2.666187286376953, "logps/chosen": -165.5836639404297, "logps/rejected": -320.6746826171875, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -11.525524139404297, "rewards/margins": 16.065261840820312, "rewards/rejected": -27.590787887573242, "step": 350 }, { "epoch": 0.2864137086903305, "grad_norm": 3.7021639347076416, "learning_rate": 9.227640955990615e-05, "logits/chosen": -3.112853527069092, "logits/rejected": -1.392868161201477, "logps/chosen": -121.599609375, "logps/rejected": -320.98870849609375, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -7.247061729431152, "rewards/margins": 19.423612594604492, "rewards/rejected": -26.670673370361328, "step": 351 }, { "epoch": 0.2872297021623827, "grad_norm": 0.0023244700860232115, "learning_rate": 9.221639627510076e-05, "logits/chosen": -1.9835022687911987, "logits/rejected": -1.8746528625488281, "logps/chosen": -190.60140991210938, "logps/rejected": -362.8575439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.247076034545898, "rewards/margins": 17.52633285522461, "rewards/rejected": -31.773408889770508, "step": 352 }, { "epoch": 0.28804569563443494, "grad_norm": 2.901745080947876, "learning_rate": 9.215617040919555e-05, "logits/chosen": -4.088661193847656, "logits/rejected": -3.0415070056915283, "logps/chosen": -125.10768127441406, "logps/rejected": -321.67279052734375, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -7.406469345092773, "rewards/margins": 18.845613479614258, "rewards/rejected": -26.25208282470703, "step": 353 }, { "epoch": 0.28886168910648713, "grad_norm": 0.048526570200920105, "learning_rate": 9.209573226545851e-05, "logits/chosen": -4.404238700866699, "logits/rejected": -2.0350520610809326, "logps/chosen": -141.83575439453125, "logps/rejected": -427.3538818359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.049225807189941, "rewards/margins": 25.89535903930664, "rewards/rejected": -33.94458770751953, "step": 354 }, { "epoch": 0.2896776825785394, "grad_norm": 3.42357566296414e-06, "learning_rate": 9.203508214822652e-05, "logits/chosen": -6.802318096160889, "logits/rejected": -4.725444316864014, "logps/chosen": -98.21269226074219, "logps/rejected": -391.6097412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.6672139167785645, "rewards/margins": 27.427013397216797, "rewards/rejected": -32.0942268371582, "step": 355 }, { "epoch": 0.2904936760505916, "grad_norm": 8.799574851989746, "learning_rate": 9.197422036290387e-05, "logits/chosen": -5.641745567321777, "logits/rejected": -5.742550373077393, "logps/chosen": -122.88751220703125, "logps/rejected": -338.53369140625, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -7.03666877746582, "rewards/margins": 21.008047103881836, "rewards/rejected": -28.044715881347656, "step": 356 }, { "epoch": 0.2913096695226438, "grad_norm": 74.34954071044922, "learning_rate": 9.191314721596072e-05, "logits/chosen": -8.330649375915527, "logits/rejected": -7.803876876831055, "logps/chosen": -118.15165710449219, "logps/rejected": -239.80862426757812, "loss": 1.202, "rewards/accuracies": 0.875, "rewards/chosen": -6.044073104858398, "rewards/margins": 12.057257652282715, "rewards/rejected": -18.101329803466797, "step": 357 }, { "epoch": 0.29212566299469606, "grad_norm": 105.7542724609375, "learning_rate": 9.185186301493152e-05, "logits/chosen": -4.875405311584473, "logits/rejected": -4.629509925842285, "logps/chosen": -148.9691162109375, "logps/rejected": -248.81399536132812, "loss": 1.218, "rewards/accuracies": 0.875, "rewards/chosen": -9.644676208496094, "rewards/margins": 9.942374229431152, "rewards/rejected": -19.587051391601562, "step": 358 }, { "epoch": 0.29294165646674825, "grad_norm": 85.23902893066406, "learning_rate": 9.179036806841353e-05, "logits/chosen": -3.7346081733703613, "logits/rejected": -3.7710280418395996, "logps/chosen": -125.45524597167969, "logps/rejected": -215.84674072265625, "loss": 3.862, "rewards/accuracies": 0.875, "rewards/chosen": -6.264307022094727, "rewards/margins": 9.598668098449707, "rewards/rejected": -15.862975120544434, "step": 359 }, { "epoch": 0.2937576499388005, "grad_norm": 0.21265903115272522, "learning_rate": 9.172866268606513e-05, "logits/chosen": -4.320167064666748, "logits/rejected": -4.998408317565918, "logps/chosen": -125.39999389648438, "logps/rejected": -320.9742126464844, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -6.25499153137207, "rewards/margins": 21.179424285888672, "rewards/rejected": -27.43441390991211, "step": 360 }, { "epoch": 0.29457364341085274, "grad_norm": 0.0026147153694182634, "learning_rate": 9.166674717860447e-05, "logits/chosen": -5.461986541748047, "logits/rejected": -5.333798408508301, "logps/chosen": -105.26905059814453, "logps/rejected": -334.90802001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.023922920227051, "rewards/margins": 22.01861572265625, "rewards/rejected": -28.042537689208984, "step": 361 }, { "epoch": 0.2953896368829049, "grad_norm": 0.0002686309744603932, "learning_rate": 9.16046218578077e-05, "logits/chosen": -6.053030490875244, "logits/rejected": -7.271350383758545, "logps/chosen": -115.27403259277344, "logps/rejected": -376.9645690917969, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.855991363525391, "rewards/margins": 26.73113441467285, "rewards/rejected": -32.58712387084961, "step": 362 }, { "epoch": 0.2962056303549572, "grad_norm": 3.1935705919750035e-06, "learning_rate": 9.154228703650752e-05, "logits/chosen": -6.300269603729248, "logits/rejected": -6.181588172912598, "logps/chosen": -123.89974975585938, "logps/rejected": -386.9864501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.795979976654053, "rewards/margins": 25.805152893066406, "rewards/rejected": -32.60113525390625, "step": 363 }, { "epoch": 0.29702162382700936, "grad_norm": 0.0003742240951396525, "learning_rate": 9.147974302859157e-05, "logits/chosen": -6.0235090255737305, "logits/rejected": -6.900145530700684, "logps/chosen": -135.19837951660156, "logps/rejected": -393.5557861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.047418594360352, "rewards/margins": 24.826753616333008, "rewards/rejected": -32.87417221069336, "step": 364 }, { "epoch": 0.2978376172990616, "grad_norm": 0.00030626216903328896, "learning_rate": 9.141699014900083e-05, "logits/chosen": -6.023988246917725, "logits/rejected": -6.718372821807861, "logps/chosen": -127.13862609863281, "logps/rejected": -384.21343994140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.446455955505371, "rewards/margins": 24.659299850463867, "rewards/rejected": -31.105754852294922, "step": 365 }, { "epoch": 0.29865361077111385, "grad_norm": 0.0030684475786983967, "learning_rate": 9.135402871372808e-05, "logits/chosen": -5.935622692108154, "logits/rejected": -6.211177825927734, "logps/chosen": -128.56326293945312, "logps/rejected": -410.6916809082031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.044066429138184, "rewards/margins": 27.79281234741211, "rewards/rejected": -33.83687973022461, "step": 366 }, { "epoch": 0.29946960424316604, "grad_norm": 11.674453735351562, "learning_rate": 9.12908590398163e-05, "logits/chosen": -6.548252105712891, "logits/rejected": -6.55700159072876, "logps/chosen": -176.50494384765625, "logps/rejected": -422.53302001953125, "loss": 0.0759, "rewards/accuracies": 1.0, "rewards/chosen": -13.35818099975586, "rewards/margins": 23.076107025146484, "rewards/rejected": -36.434288024902344, "step": 367 }, { "epoch": 0.3002855977152183, "grad_norm": 0.262414813041687, "learning_rate": 9.122748144535705e-05, "logits/chosen": -8.7116060256958, "logits/rejected": -7.72294282913208, "logps/chosen": -140.0068817138672, "logps/rejected": -468.04119873046875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -10.507989883422852, "rewards/margins": 30.16128921508789, "rewards/rejected": -40.669281005859375, "step": 368 }, { "epoch": 0.3011015911872705, "grad_norm": 1.0055865473646008e-08, "learning_rate": 9.11638962494888e-05, "logits/chosen": -8.266602516174316, "logits/rejected": -7.879839897155762, "logps/chosen": -141.32513427734375, "logps/rejected": -420.65716552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.573930740356445, "rewards/margins": 28.207426071166992, "rewards/rejected": -36.78135681152344, "step": 369 }, { "epoch": 0.3019175846593227, "grad_norm": 0.17706608772277832, "learning_rate": 9.110010377239551e-05, "logits/chosen": -7.264012336730957, "logits/rejected": -6.726618766784668, "logps/chosen": -221.22109985351562, "logps/rejected": -418.7328796386719, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -18.01608657836914, "rewards/margins": 18.883026123046875, "rewards/rejected": -36.89911651611328, "step": 370 }, { "epoch": 0.30273357813137497, "grad_norm": 2.4014722654897014e-08, "learning_rate": 9.103610433530483e-05, "logits/chosen": -8.006454467773438, "logits/rejected": -8.459311485290527, "logps/chosen": -179.5457000732422, "logps/rejected": -502.20977783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.522847175598145, "rewards/margins": 32.71052932739258, "rewards/rejected": -43.233375549316406, "step": 371 }, { "epoch": 0.30354957160342716, "grad_norm": 0.14560459554195404, "learning_rate": 9.09718982604866e-05, "logits/chosen": -6.9581427574157715, "logits/rejected": -7.828060150146484, "logps/chosen": -133.0667724609375, "logps/rejected": -469.5762939453125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -6.176506042480469, "rewards/margins": 32.89826965332031, "rewards/rejected": -39.07477569580078, "step": 372 }, { "epoch": 0.3043655650754794, "grad_norm": 0.00036669516703113914, "learning_rate": 9.090748587125118e-05, "logits/chosen": -8.726449012756348, "logits/rejected": -8.874643325805664, "logps/chosen": -265.4364013671875, "logps/rejected": -472.29022216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.514177322387695, "rewards/margins": 20.47307586669922, "rewards/rejected": -40.98725128173828, "step": 373 }, { "epoch": 0.3051815585475316, "grad_norm": 0.0026654324028640985, "learning_rate": 9.084286749194782e-05, "logits/chosen": -9.380644798278809, "logits/rejected": -9.918187141418457, "logps/chosen": -240.1265411376953, "logps/rejected": -495.5820007324219, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.326528549194336, "rewards/margins": 24.81061363220215, "rewards/rejected": -44.13714599609375, "step": 374 }, { "epoch": 0.30599755201958384, "grad_norm": 0.12199927121400833, "learning_rate": 9.077804344796302e-05, "logits/chosen": -9.163942337036133, "logits/rejected": -10.156996726989746, "logps/chosen": -231.6831512451172, "logps/rejected": -449.58001708984375, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -17.001680374145508, "rewards/margins": 22.011795043945312, "rewards/rejected": -39.01347351074219, "step": 375 }, { "epoch": 0.3068135454916361, "grad_norm": 0.024529704824090004, "learning_rate": 9.071301406571892e-05, "logits/chosen": -9.280887603759766, "logits/rejected": -10.469989776611328, "logps/chosen": -227.19290161132812, "logps/rejected": -505.447509765625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -16.64015769958496, "rewards/margins": 28.543880462646484, "rewards/rejected": -45.18403625488281, "step": 376 }, { "epoch": 0.3076295389636883, "grad_norm": 0.43731245398521423, "learning_rate": 9.06477796726717e-05, "logits/chosen": -10.133909225463867, "logits/rejected": -11.603899002075195, "logps/chosen": -243.4939422607422, "logps/rejected": -514.3060302734375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -16.453149795532227, "rewards/margins": 30.392702102661133, "rewards/rejected": -46.84585189819336, "step": 377 }, { "epoch": 0.3084455324357405, "grad_norm": 2.250394391012378e-05, "learning_rate": 9.058234059730976e-05, "logits/chosen": -9.37553596496582, "logits/rejected": -11.051447868347168, "logps/chosen": -224.58099365234375, "logps/rejected": -473.7567443847656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.768797874450684, "rewards/margins": 26.2579345703125, "rewards/rejected": -42.0267333984375, "step": 378 }, { "epoch": 0.3092615259077927, "grad_norm": 0.0007410438847728074, "learning_rate": 9.051669716915227e-05, "logits/chosen": -7.350563049316406, "logits/rejected": -8.915231704711914, "logps/chosen": -203.40069580078125, "logps/rejected": -496.9312438964844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.184200286865234, "rewards/margins": 28.96760368347168, "rewards/rejected": -43.15180206298828, "step": 379 }, { "epoch": 0.31007751937984496, "grad_norm": 8.542142104772665e-09, "learning_rate": 9.045084971874738e-05, "logits/chosen": -6.969725608825684, "logits/rejected": -7.049520492553711, "logps/chosen": -232.30982971191406, "logps/rejected": -570.05322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.14422035217285, "rewards/margins": 32.68315887451172, "rewards/rejected": -50.8273811340332, "step": 380 }, { "epoch": 0.3108935128518972, "grad_norm": 3.993727570872352e-09, "learning_rate": 9.038479857767062e-05, "logits/chosen": -6.7536211013793945, "logits/rejected": -7.175205230712891, "logps/chosen": -179.47695922851562, "logps/rejected": -509.67926025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.30460262298584, "rewards/margins": 32.45375061035156, "rewards/rejected": -44.75835418701172, "step": 381 }, { "epoch": 0.3117095063239494, "grad_norm": 1.5916862139420118e-07, "learning_rate": 9.031854407852316e-05, "logits/chosen": -6.225503921508789, "logits/rejected": -6.168867111206055, "logps/chosen": -177.0838623046875, "logps/rejected": -557.537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.56078052520752, "rewards/margins": 38.0240478515625, "rewards/rejected": -49.58483123779297, "step": 382 }, { "epoch": 0.31252549979600164, "grad_norm": 1.6356008245566045e-06, "learning_rate": 9.025208655493026e-05, "logits/chosen": -5.902649879455566, "logits/rejected": -6.113770484924316, "logps/chosen": -169.453369140625, "logps/rejected": -465.3369445800781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.255166053771973, "rewards/margins": 31.180221557617188, "rewards/rejected": -41.43539047241211, "step": 383 }, { "epoch": 0.31334149326805383, "grad_norm": 0.0003264894476160407, "learning_rate": 9.018542634153944e-05, "logits/chosen": -5.727290153503418, "logits/rejected": -5.074371814727783, "logps/chosen": -199.1065673828125, "logps/rejected": -521.2606201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.013130187988281, "rewards/margins": 31.338886260986328, "rewards/rejected": -46.352012634277344, "step": 384 }, { "epoch": 0.3141574867401061, "grad_norm": 2.2005915525369346e-06, "learning_rate": 9.01185637740189e-05, "logits/chosen": -5.7641987800598145, "logits/rejected": -3.7964560985565186, "logps/chosen": -161.62652587890625, "logps/rejected": -443.57958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.597410202026367, "rewards/margins": 27.546138763427734, "rewards/rejected": -38.14354705810547, "step": 385 }, { "epoch": 0.3149734802121583, "grad_norm": 0.4496854245662689, "learning_rate": 9.00514991890558e-05, "logits/chosen": -6.622735500335693, "logits/rejected": -5.093892574310303, "logps/chosen": -190.18252563476562, "logps/rejected": -430.4713439941406, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -15.005180358886719, "rewards/margins": 23.474016189575195, "rewards/rejected": -38.47919845581055, "step": 386 }, { "epoch": 0.3157894736842105, "grad_norm": 0.023410921916365623, "learning_rate": 8.998423292435454e-05, "logits/chosen": -5.292146682739258, "logits/rejected": -4.734771728515625, "logps/chosen": -208.3104248046875, "logps/rejected": -415.52789306640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -15.75658130645752, "rewards/margins": 20.02714729309082, "rewards/rejected": -35.783729553222656, "step": 387 }, { "epoch": 0.31660546715626275, "grad_norm": 0.2466963231563568, "learning_rate": 8.991676531863508e-05, "logits/chosen": -4.427967548370361, "logits/rejected": -3.927788496017456, "logps/chosen": -204.1000213623047, "logps/rejected": -447.7653503417969, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -15.298025131225586, "rewards/margins": 25.08603858947754, "rewards/rejected": -40.384063720703125, "step": 388 }, { "epoch": 0.317421460628315, "grad_norm": 0.001948477467522025, "learning_rate": 8.984909671163127e-05, "logits/chosen": -6.299063682556152, "logits/rejected": -5.2794575691223145, "logps/chosen": -104.57907104492188, "logps/rejected": -355.5840148925781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.847350120544434, "rewards/margins": 23.837615966796875, "rewards/rejected": -29.684967041015625, "step": 389 }, { "epoch": 0.3182374541003672, "grad_norm": 7.140112757042516e-06, "learning_rate": 8.978122744408906e-05, "logits/chosen": -4.356049060821533, "logits/rejected": -3.5765812397003174, "logps/chosen": -124.45198059082031, "logps/rejected": -428.32159423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.827098369598389, "rewards/margins": 30.744626998901367, "rewards/rejected": -36.57172775268555, "step": 390 }, { "epoch": 0.31905344757241944, "grad_norm": 0.1483558565378189, "learning_rate": 8.971315785776486e-05, "logits/chosen": -2.4060168266296387, "logits/rejected": -0.6167707443237305, "logps/chosen": -143.5925750732422, "logps/rejected": -437.9368591308594, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -7.616633892059326, "rewards/margins": 29.45090675354004, "rewards/rejected": -37.06753921508789, "step": 391 }, { "epoch": 0.3198694410444716, "grad_norm": 0.00039594079134985805, "learning_rate": 8.964488829542377e-05, "logits/chosen": -3.0352795124053955, "logits/rejected": -1.4286903142929077, "logps/chosen": -204.2565155029297, "logps/rejected": -550.7020263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.805036544799805, "rewards/margins": 32.57807922363281, "rewards/rejected": -47.38311004638672, "step": 392 }, { "epoch": 0.32068543451652387, "grad_norm": 0.00032628359622322023, "learning_rate": 8.957641910083787e-05, "logits/chosen": -3.3089816570281982, "logits/rejected": -1.4907761812210083, "logps/chosen": -148.31939697265625, "logps/rejected": -481.1114501953125, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -8.40054988861084, "rewards/margins": 33.697906494140625, "rewards/rejected": -42.09846115112305, "step": 393 }, { "epoch": 0.3215014279885761, "grad_norm": 3.4395256420793885e-08, "learning_rate": 8.950775061878453e-05, "logits/chosen": -3.792238473892212, "logits/rejected": -2.4382996559143066, "logps/chosen": -117.34744262695312, "logps/rejected": -416.9165954589844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.82038688659668, "rewards/margins": 30.859310150146484, "rewards/rejected": -36.6796989440918, "step": 394 }, { "epoch": 0.3223174214606283, "grad_norm": 2.5555420506861992e-05, "learning_rate": 8.943888319504457e-05, "logits/chosen": -3.6695539951324463, "logits/rejected": -2.0732078552246094, "logps/chosen": -105.44559478759766, "logps/rejected": -395.8901672363281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.600123405456543, "rewards/margins": 28.84372329711914, "rewards/rejected": -34.44384765625, "step": 395 }, { "epoch": 0.32313341493268055, "grad_norm": 0.3028666079044342, "learning_rate": 8.936981717640061e-05, "logits/chosen": -3.774432897567749, "logits/rejected": -1.526755452156067, "logps/chosen": -152.1060028076172, "logps/rejected": -513.3495483398438, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -8.423258781433105, "rewards/margins": 36.945919036865234, "rewards/rejected": -45.369178771972656, "step": 396 }, { "epoch": 0.32394940840473274, "grad_norm": 0.010057342238724232, "learning_rate": 8.93005529106353e-05, "logits/chosen": -1.5307648181915283, "logits/rejected": -0.10370321571826935, "logps/chosen": -156.42733764648438, "logps/rejected": -489.9898376464844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.381754875183105, "rewards/margins": 30.613506317138672, "rewards/rejected": -40.995262145996094, "step": 397 }, { "epoch": 0.324765401876785, "grad_norm": 8.55942428046319e-09, "learning_rate": 8.92310907465296e-05, "logits/chosen": -2.4315831661224365, "logits/rejected": 0.6498696804046631, "logps/chosen": -159.9340057373047, "logps/rejected": -487.3100891113281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.208471298217773, "rewards/margins": 33.79970169067383, "rewards/rejected": -42.00817108154297, "step": 398 }, { "epoch": 0.32558139534883723, "grad_norm": 1.2192066911609345e-08, "learning_rate": 8.916143103386093e-05, "logits/chosen": -3.4125771522521973, "logits/rejected": -0.45130282640457153, "logps/chosen": -158.19384765625, "logps/rejected": -563.846435546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.581827163696289, "rewards/margins": 40.11309051513672, "rewards/rejected": -48.694915771484375, "step": 399 }, { "epoch": 0.3263973888208894, "grad_norm": 0.00017364691302645952, "learning_rate": 8.90915741234015e-05, "logits/chosen": -1.4645180702209473, "logits/rejected": 0.2688311040401459, "logps/chosen": -184.26028442382812, "logps/rejected": -491.21392822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.285184860229492, "rewards/margins": 29.921802520751953, "rewards/rejected": -43.20698928833008, "step": 400 }, { "epoch": 0.32721338229294167, "grad_norm": 0.23414458334445953, "learning_rate": 8.90215203669165e-05, "logits/chosen": -2.256833076477051, "logits/rejected": 0.6768537759780884, "logps/chosen": -231.82113647460938, "logps/rejected": -495.4825134277344, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -17.99984359741211, "rewards/margins": 26.801786422729492, "rewards/rejected": -44.80162811279297, "step": 401 }, { "epoch": 0.32802937576499386, "grad_norm": 6.96125113108792e-08, "learning_rate": 8.895127011716233e-05, "logits/chosen": -1.9077379703521729, "logits/rejected": 1.1587449312210083, "logps/chosen": -228.20101928710938, "logps/rejected": -558.0187377929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.183063507080078, "rewards/margins": 33.76262283325195, "rewards/rejected": -49.94568634033203, "step": 402 }, { "epoch": 0.3288453692370461, "grad_norm": 2.343566166018718e-06, "learning_rate": 8.888082372788488e-05, "logits/chosen": -1.0045440196990967, "logits/rejected": 2.131012201309204, "logps/chosen": -219.7013397216797, "logps/rejected": -533.7833251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.149486541748047, "rewards/margins": 31.43106460571289, "rewards/rejected": -47.58055114746094, "step": 403 }, { "epoch": 0.32966136270909835, "grad_norm": 3.242465027142316e-06, "learning_rate": 8.881018155381766e-05, "logits/chosen": -2.7876107692718506, "logits/rejected": 0.8396314978599548, "logps/chosen": -198.9061279296875, "logps/rejected": -502.0777587890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.465373992919922, "rewards/margins": 30.582300186157227, "rewards/rejected": -45.04767608642578, "step": 404 }, { "epoch": 0.33047735618115054, "grad_norm": 0.0012586540542542934, "learning_rate": 8.873934395068005e-05, "logits/chosen": -1.002247929573059, "logits/rejected": 2.588984251022339, "logps/chosen": -235.761962890625, "logps/rejected": -576.3990478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.93100357055664, "rewards/margins": 33.30910873413086, "rewards/rejected": -51.2401123046875, "step": 405 }, { "epoch": 0.3312933496532028, "grad_norm": 0.17587506771087646, "learning_rate": 8.866831127517557e-05, "logits/chosen": -0.7011470794677734, "logits/rejected": 2.4688684940338135, "logps/chosen": -277.5315246582031, "logps/rejected": -512.5650024414062, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -23.63814926147461, "rewards/margins": 23.026456832885742, "rewards/rejected": -46.664608001708984, "step": 406 }, { "epoch": 0.332109343125255, "grad_norm": 3.553513765335083, "learning_rate": 8.859708388498996e-05, "logits/chosen": -1.4394261837005615, "logits/rejected": 2.1301112174987793, "logps/chosen": -279.28167724609375, "logps/rejected": -549.3244018554688, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -21.04325294494629, "rewards/margins": 28.0008544921875, "rewards/rejected": -49.044105529785156, "step": 407 }, { "epoch": 0.3329253365973072, "grad_norm": 4.7867568355286494e-05, "learning_rate": 8.852566213878947e-05, "logits/chosen": -3.677197217941284, "logits/rejected": -1.6481961011886597, "logps/chosen": -177.2451934814453, "logps/rejected": -480.8868408203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.127684593200684, "rewards/margins": 30.1270751953125, "rewards/rejected": -43.2547607421875, "step": 408 }, { "epoch": 0.33374133006935947, "grad_norm": 3.8373548250092426e-07, "learning_rate": 8.845404639621906e-05, "logits/chosen": -4.313418388366699, "logits/rejected": -1.9113926887512207, "logps/chosen": -198.59043884277344, "logps/rejected": -474.71295166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.976470947265625, "rewards/margins": 28.40229034423828, "rewards/rejected": -43.378761291503906, "step": 409 }, { "epoch": 0.33455732354141166, "grad_norm": 1.128104727687873e-11, "learning_rate": 8.838223701790055e-05, "logits/chosen": -5.507012367248535, "logits/rejected": -3.870380401611328, "logps/chosen": -129.57029724121094, "logps/rejected": -513.2935180664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.775950908660889, "rewards/margins": 36.93947219848633, "rewards/rejected": -44.715423583984375, "step": 410 }, { "epoch": 0.3353733170134639, "grad_norm": 3.2324922358384356e-05, "learning_rate": 8.831023436543082e-05, "logits/chosen": -5.624139785766602, "logits/rejected": -5.209603786468506, "logps/chosen": -129.42218017578125, "logps/rejected": -430.4466857910156, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.529536247253418, "rewards/margins": 31.370929718017578, "rewards/rejected": -37.90047073364258, "step": 411 }, { "epoch": 0.3361893104855161, "grad_norm": 0.0007527855923399329, "learning_rate": 8.823803880137993e-05, "logits/chosen": -6.12045955657959, "logits/rejected": -4.789494037628174, "logps/chosen": -119.45098876953125, "logps/rejected": -381.3493957519531, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.1785173416137695, "rewards/margins": 26.746854782104492, "rewards/rejected": -32.92537307739258, "step": 412 }, { "epoch": 0.33700530395756834, "grad_norm": 0.0786202996969223, "learning_rate": 8.81656506892894e-05, "logits/chosen": -5.691105365753174, "logits/rejected": -4.934353828430176, "logps/chosen": -116.87104034423828, "logps/rejected": -364.56292724609375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -6.3512468338012695, "rewards/margins": 24.310720443725586, "rewards/rejected": -30.66196632385254, "step": 413 }, { "epoch": 0.3378212974296206, "grad_norm": 0.04771481081843376, "learning_rate": 8.809307039367034e-05, "logits/chosen": -6.657368183135986, "logits/rejected": -6.2018232345581055, "logps/chosen": -93.44149780273438, "logps/rejected": -315.6748352050781, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.821873903274536, "rewards/margins": 22.55467414855957, "rewards/rejected": -26.37654685974121, "step": 414 }, { "epoch": 0.33863729090167277, "grad_norm": 0.1300797015428543, "learning_rate": 8.802029828000156e-05, "logits/chosen": -6.584091663360596, "logits/rejected": -5.718299865722656, "logps/chosen": -113.99746704101562, "logps/rejected": -328.93853759765625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.587121963500977, "rewards/margins": 21.15274429321289, "rewards/rejected": -27.739866256713867, "step": 415 }, { "epoch": 0.339453284373725, "grad_norm": 5.072165004094131e-05, "learning_rate": 8.794733471472778e-05, "logits/chosen": -5.546013355255127, "logits/rejected": -5.881369113922119, "logps/chosen": -118.76322937011719, "logps/rejected": -407.38018798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.420012474060059, "rewards/margins": 29.598777770996094, "rewards/rejected": -35.0187873840332, "step": 416 }, { "epoch": 0.3402692778457772, "grad_norm": 0.7536813616752625, "learning_rate": 8.787418006525782e-05, "logits/chosen": -6.453298568725586, "logits/rejected": -6.850407600402832, "logps/chosen": -96.24649810791016, "logps/rejected": -331.982666015625, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -3.135004997253418, "rewards/margins": 23.842464447021484, "rewards/rejected": -26.97747039794922, "step": 417 }, { "epoch": 0.34108527131782945, "grad_norm": 2.1663766336388335e-09, "learning_rate": 8.780083469996264e-05, "logits/chosen": -5.913601875305176, "logits/rejected": -6.319863796234131, "logps/chosen": -118.03551483154297, "logps/rejected": -518.2093505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.169255256652832, "rewards/margins": 40.13055419921875, "rewards/rejected": -45.2998046875, "step": 418 }, { "epoch": 0.3419012647898817, "grad_norm": 5.6560284065199085e-06, "learning_rate": 8.77272989881736e-05, "logits/chosen": -6.44437313079834, "logits/rejected": -6.2107319831848145, "logps/chosen": -157.67506408691406, "logps/rejected": -385.94964599609375, "loss": 0.3466, "rewards/accuracies": 1.0, "rewards/chosen": -9.692152976989746, "rewards/margins": 23.59151840209961, "rewards/rejected": -33.28367233276367, "step": 419 }, { "epoch": 0.3427172582619339, "grad_norm": 1.49592054299319e-07, "learning_rate": 8.765357330018056e-05, "logits/chosen": -6.897873878479004, "logits/rejected": -6.889559745788574, "logps/chosen": -117.32821655273438, "logps/rejected": -407.2445983886719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.900213241577148, "rewards/margins": 29.046871185302734, "rewards/rejected": -35.94708251953125, "step": 420 }, { "epoch": 0.34353325173398613, "grad_norm": 1.4075667422730476e-06, "learning_rate": 8.757965800722993e-05, "logits/chosen": -6.962460517883301, "logits/rejected": -6.303035736083984, "logps/chosen": -166.49044799804688, "logps/rejected": -421.1368408203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.465068817138672, "rewards/margins": 26.086666107177734, "rewards/rejected": -36.551734924316406, "step": 421 }, { "epoch": 0.3443492452060384, "grad_norm": 1.3796503708363161e-06, "learning_rate": 8.750555348152298e-05, "logits/chosen": -6.0186238288879395, "logits/rejected": -5.476321220397949, "logps/chosen": -164.51467895507812, "logps/rejected": -462.2780456542969, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.46378231048584, "rewards/margins": 30.772844314575195, "rewards/rejected": -42.23662567138672, "step": 422 }, { "epoch": 0.34516523867809057, "grad_norm": 0.378627747297287, "learning_rate": 8.74312600962138e-05, "logits/chosen": -6.132939338684082, "logits/rejected": -6.118511199951172, "logps/chosen": -200.45907592773438, "logps/rejected": -463.2393798828125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -14.023656845092773, "rewards/margins": 26.863441467285156, "rewards/rejected": -40.88710021972656, "step": 423 }, { "epoch": 0.3459812321501428, "grad_norm": 6.097407094785012e-07, "learning_rate": 8.735677822540749e-05, "logits/chosen": -6.1333513259887695, "logits/rejected": -5.877615451812744, "logps/chosen": -199.98858642578125, "logps/rejected": -458.0414733886719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.158284187316895, "rewards/margins": 25.907623291015625, "rewards/rejected": -41.06591033935547, "step": 424 }, { "epoch": 0.346797225622195, "grad_norm": 0.0009660385549068451, "learning_rate": 8.728210824415827e-05, "logits/chosen": -6.066734313964844, "logits/rejected": -5.445124626159668, "logps/chosen": -226.0995330810547, "logps/rejected": -439.9416809082031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.345285415649414, "rewards/margins": 23.440263748168945, "rewards/rejected": -38.785545349121094, "step": 425 }, { "epoch": 0.34761321909424725, "grad_norm": 0.07538966834545135, "learning_rate": 8.720725052846765e-05, "logits/chosen": -5.704412937164307, "logits/rejected": -5.645760536193848, "logps/chosen": -237.27947998046875, "logps/rejected": -515.111328125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -17.182680130004883, "rewards/margins": 28.909385681152344, "rewards/rejected": -46.092063903808594, "step": 426 }, { "epoch": 0.3484292125662995, "grad_norm": 1.8916219914899557e-06, "learning_rate": 8.71322054552824e-05, "logits/chosen": -5.387274742126465, "logits/rejected": -5.706953048706055, "logps/chosen": -201.4109344482422, "logps/rejected": -608.3343505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.876215934753418, "rewards/margins": 39.416507720947266, "rewards/rejected": -52.292724609375, "step": 427 }, { "epoch": 0.3492452060383517, "grad_norm": 0.00042236922308802605, "learning_rate": 8.705697340249275e-05, "logits/chosen": -5.590426445007324, "logits/rejected": -5.448949337005615, "logps/chosen": -231.6854705810547, "logps/rejected": -510.4942626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.967876434326172, "rewards/margins": 28.335872650146484, "rewards/rejected": -46.303749084472656, "step": 428 }, { "epoch": 0.35006119951040393, "grad_norm": 0.03994862362742424, "learning_rate": 8.69815547489305e-05, "logits/chosen": -7.295804500579834, "logits/rejected": -6.288346290588379, "logps/chosen": -209.8961181640625, "logps/rejected": -449.7335510253906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -15.620079040527344, "rewards/margins": 24.79230499267578, "rewards/rejected": -40.412384033203125, "step": 429 }, { "epoch": 0.3508771929824561, "grad_norm": 0.014769727364182472, "learning_rate": 8.690594987436704e-05, "logits/chosen": -5.836956024169922, "logits/rejected": -5.618312835693359, "logps/chosen": -194.8831787109375, "logps/rejected": -470.92047119140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -13.75951862335205, "rewards/margins": 28.595230102539062, "rewards/rejected": -42.35475158691406, "step": 430 }, { "epoch": 0.35169318645450837, "grad_norm": 1.3981745723867789e-05, "learning_rate": 8.683015915951152e-05, "logits/chosen": -7.178122043609619, "logits/rejected": -6.053740978240967, "logps/chosen": -192.1156005859375, "logps/rejected": -464.4403076171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.658855438232422, "rewards/margins": 26.735563278198242, "rewards/rejected": -41.3944206237793, "step": 431 }, { "epoch": 0.3525091799265606, "grad_norm": 20.47211265563965, "learning_rate": 8.675418298600884e-05, "logits/chosen": -5.769148349761963, "logits/rejected": -5.904295444488525, "logps/chosen": -245.74191284179688, "logps/rejected": -475.79443359375, "loss": 0.091, "rewards/accuracies": 1.0, "rewards/chosen": -19.377368927001953, "rewards/margins": 23.52926254272461, "rewards/rejected": -42.90663146972656, "step": 432 }, { "epoch": 0.3533251733986128, "grad_norm": 4.52850781584857e-06, "learning_rate": 8.667802173643784e-05, "logits/chosen": -6.080057144165039, "logits/rejected": -4.537544250488281, "logps/chosen": -220.46835327148438, "logps/rejected": -561.27587890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.383403778076172, "rewards/margins": 33.047447204589844, "rewards/rejected": -49.43085479736328, "step": 433 }, { "epoch": 0.35414116687066505, "grad_norm": 5.024001121520996, "learning_rate": 8.660167579430927e-05, "logits/chosen": -6.4044108390808105, "logits/rejected": -5.052708148956299, "logps/chosen": -219.8153839111328, "logps/rejected": -479.6542663574219, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": -16.161766052246094, "rewards/margins": 25.409048080444336, "rewards/rejected": -41.57081604003906, "step": 434 }, { "epoch": 0.35495716034271724, "grad_norm": 3.4765591863106238e-06, "learning_rate": 8.652514554406388e-05, "logits/chosen": -8.287181854248047, "logits/rejected": -6.575533390045166, "logps/chosen": -264.6041259765625, "logps/rejected": -526.1235961914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.346389770507812, "rewards/margins": 25.409957885742188, "rewards/rejected": -46.756343841552734, "step": 435 }, { "epoch": 0.3557731538147695, "grad_norm": 9.342408624490872e-09, "learning_rate": 8.644843137107059e-05, "logits/chosen": -8.900733947753906, "logits/rejected": -7.888272762298584, "logps/chosen": -216.3601531982422, "logps/rejected": -547.2452392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.338869094848633, "rewards/margins": 31.912303924560547, "rewards/rejected": -48.25117111206055, "step": 436 }, { "epoch": 0.35658914728682173, "grad_norm": 15.535378456115723, "learning_rate": 8.637153366162436e-05, "logits/chosen": -10.910832405090332, "logits/rejected": -10.301733016967773, "logps/chosen": -209.82815551757812, "logps/rejected": -525.09521484375, "loss": 0.2133, "rewards/accuracies": 1.0, "rewards/chosen": -16.167236328125, "rewards/margins": 31.50454330444336, "rewards/rejected": -47.671775817871094, "step": 437 }, { "epoch": 0.3574051407588739, "grad_norm": 4.455471755426821e-11, "learning_rate": 8.629445280294444e-05, "logits/chosen": -11.281587600708008, "logits/rejected": -10.022695541381836, "logps/chosen": -180.2557830810547, "logps/rejected": -517.155517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.47867202758789, "rewards/margins": 33.66119384765625, "rewards/rejected": -47.13986587524414, "step": 438 }, { "epoch": 0.35822113423092616, "grad_norm": 3.549811708580819e-06, "learning_rate": 8.621718918317225e-05, "logits/chosen": -11.128429412841797, "logits/rejected": -11.450457572937012, "logps/chosen": -193.41372680664062, "logps/rejected": -436.32080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.918584823608398, "rewards/margins": 25.547832489013672, "rewards/rejected": -38.46641540527344, "step": 439 }, { "epoch": 0.35903712770297835, "grad_norm": 0.005092740058898926, "learning_rate": 8.613974319136958e-05, "logits/chosen": -11.970477104187012, "logits/rejected": -11.08476448059082, "logps/chosen": -211.6596221923828, "logps/rejected": -460.267333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.01909637451172, "rewards/margins": 23.978439331054688, "rewards/rejected": -40.997535705566406, "step": 440 }, { "epoch": 0.3598531211750306, "grad_norm": 4.083885192871094, "learning_rate": 8.606211521751652e-05, "logits/chosen": -12.796659469604492, "logits/rejected": -11.695082664489746, "logps/chosen": -173.8153839111328, "logps/rejected": -454.7597961425781, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": -11.463430404663086, "rewards/margins": 27.832473754882812, "rewards/rejected": -39.29590606689453, "step": 441 }, { "epoch": 0.36066911464708284, "grad_norm": 2.3170990971266292e-05, "learning_rate": 8.598430565250952e-05, "logits/chosen": -11.791269302368164, "logits/rejected": -11.764620780944824, "logps/chosen": -164.89700317382812, "logps/rejected": -458.64013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.817549705505371, "rewards/margins": 28.925426483154297, "rewards/rejected": -39.74297332763672, "step": 442 }, { "epoch": 0.36148510811913503, "grad_norm": 8.294775398098864e-06, "learning_rate": 8.590631488815944e-05, "logits/chosen": -12.234158515930176, "logits/rejected": -11.637259483337402, "logps/chosen": -171.80091857910156, "logps/rejected": -538.036376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.242918968200684, "rewards/margins": 35.36836624145508, "rewards/rejected": -46.61128234863281, "step": 443 }, { "epoch": 0.3623011015911873, "grad_norm": 8.397245437663514e-06, "learning_rate": 8.582814331718961e-05, "logits/chosen": -11.491058349609375, "logits/rejected": -10.738380432128906, "logps/chosen": -224.05548095703125, "logps/rejected": -487.7114562988281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.865446090698242, "rewards/margins": 28.360492706298828, "rewards/rejected": -43.22593688964844, "step": 444 }, { "epoch": 0.36311709506323947, "grad_norm": 0.030085844919085503, "learning_rate": 8.574979133323377e-05, "logits/chosen": -12.602558135986328, "logits/rejected": -11.461106300354004, "logps/chosen": -208.13818359375, "logps/rejected": -407.01678466796875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -14.56165885925293, "rewards/margins": 20.460859298706055, "rewards/rejected": -35.02252197265625, "step": 445 }, { "epoch": 0.3639330885352917, "grad_norm": 65.36039733886719, "learning_rate": 8.567125933083415e-05, "logits/chosen": -12.831865310668945, "logits/rejected": -11.856672286987305, "logps/chosen": -185.97325134277344, "logps/rejected": -358.39752197265625, "loss": 0.6608, "rewards/accuracies": 0.875, "rewards/chosen": -13.11384391784668, "rewards/margins": 17.728450775146484, "rewards/rejected": -30.84229278564453, "step": 446 }, { "epoch": 0.36474908200734396, "grad_norm": 0.007965218275785446, "learning_rate": 8.559254770543944e-05, "logits/chosen": -11.066886901855469, "logits/rejected": -10.121254920959473, "logps/chosen": -210.07891845703125, "logps/rejected": -424.2562561035156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.899775505065918, "rewards/margins": 22.711261749267578, "rewards/rejected": -36.61103820800781, "step": 447 }, { "epoch": 0.36556507547939615, "grad_norm": 0.08724936097860336, "learning_rate": 8.551365685340285e-05, "logits/chosen": -10.588062286376953, "logits/rejected": -9.554072380065918, "logps/chosen": -161.3327178955078, "logps/rejected": -363.77679443359375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -9.908801078796387, "rewards/margins": 20.555063247680664, "rewards/rejected": -30.463865280151367, "step": 448 }, { "epoch": 0.3663810689514484, "grad_norm": 0.9764013290405273, "learning_rate": 8.54345871719801e-05, "logits/chosen": -10.012062072753906, "logits/rejected": -8.689175605773926, "logps/chosen": -131.47659301757812, "logps/rejected": -338.0377197265625, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -6.4237751960754395, "rewards/margins": 21.843788146972656, "rewards/rejected": -28.267562866210938, "step": 449 }, { "epoch": 0.3671970624235006, "grad_norm": 1.568230800330639e-05, "learning_rate": 8.535533905932738e-05, "logits/chosen": -9.780055046081543, "logits/rejected": -7.824827194213867, "logps/chosen": -148.10382080078125, "logps/rejected": -424.451171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.966879844665527, "rewards/margins": 26.04444122314453, "rewards/rejected": -36.011322021484375, "step": 450 }, { "epoch": 0.36801305589555283, "grad_norm": 8.781707763671875, "learning_rate": 8.527591291449937e-05, "logits/chosen": -9.222942352294922, "logits/rejected": -8.333105087280273, "logps/chosen": -160.0270233154297, "logps/rejected": -437.1316833496094, "loss": 0.1988, "rewards/accuracies": 1.0, "rewards/chosen": -11.472217559814453, "rewards/margins": 26.59707260131836, "rewards/rejected": -38.06929016113281, "step": 451 }, { "epoch": 0.3688290493676051, "grad_norm": 0.03322325646877289, "learning_rate": 8.519630913744725e-05, "logits/chosen": -8.561456680297852, "logits/rejected": -8.2776460647583, "logps/chosen": -121.53541564941406, "logps/rejected": -371.692626953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.540974140167236, "rewards/margins": 25.98070526123047, "rewards/rejected": -32.52168273925781, "step": 452 }, { "epoch": 0.36964504283965727, "grad_norm": 0.08821164816617966, "learning_rate": 8.511652812901666e-05, "logits/chosen": -7.15286922454834, "logits/rejected": -7.224461555480957, "logps/chosen": -133.3076629638672, "logps/rejected": -407.8914489746094, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.98518180847168, "rewards/margins": 26.55181121826172, "rewards/rejected": -34.53699493408203, "step": 453 }, { "epoch": 0.3704610363117095, "grad_norm": 9.43649354212539e-07, "learning_rate": 8.50365702909457e-05, "logits/chosen": -8.883678436279297, "logits/rejected": -6.182427406311035, "logps/chosen": -150.90377807617188, "logps/rejected": -484.31060791015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.603713989257812, "rewards/margins": 33.110958099365234, "rewards/rejected": -42.71466827392578, "step": 454 }, { "epoch": 0.3712770297837617, "grad_norm": 0.00300230854190886, "learning_rate": 8.495643602586287e-05, "logits/chosen": -6.6004638671875, "logits/rejected": -5.507245063781738, "logps/chosen": -234.38180541992188, "logps/rejected": -474.21484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.41421890258789, "rewards/margins": 25.671646118164062, "rewards/rejected": -42.08586120605469, "step": 455 }, { "epoch": 0.37209302325581395, "grad_norm": 0.001489706919528544, "learning_rate": 8.487612573728513e-05, "logits/chosen": -5.89524507522583, "logits/rejected": -5.341463088989258, "logps/chosen": -150.78668212890625, "logps/rejected": -440.22052001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.057676315307617, "rewards/margins": 30.041276931762695, "rewards/rejected": -39.09895324707031, "step": 456 }, { "epoch": 0.3729090167278662, "grad_norm": 13.220328330993652, "learning_rate": 8.479563982961571e-05, "logits/chosen": -6.194330215454102, "logits/rejected": -5.90523624420166, "logps/chosen": -228.33729553222656, "logps/rejected": -478.9873046875, "loss": 0.3589, "rewards/accuracies": 0.875, "rewards/chosen": -17.477895736694336, "rewards/margins": 25.298315048217773, "rewards/rejected": -42.77621078491211, "step": 457 }, { "epoch": 0.3737250101999184, "grad_norm": 1.7373338323523058e-06, "learning_rate": 8.47149787081423e-05, "logits/chosen": -5.945152282714844, "logits/rejected": -4.4888715744018555, "logps/chosen": -190.51809692382812, "logps/rejected": -486.1563415527344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.731708526611328, "rewards/margins": 29.789901733398438, "rewards/rejected": -42.521610260009766, "step": 458 }, { "epoch": 0.37454100367197063, "grad_norm": 0.8338418006896973, "learning_rate": 8.463414277903475e-05, "logits/chosen": -6.444195747375488, "logits/rejected": -4.319398880004883, "logps/chosen": -284.3909606933594, "logps/rejected": -496.9165954589844, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -23.18535041809082, "rewards/margins": 21.26992416381836, "rewards/rejected": -44.45527648925781, "step": 459 }, { "epoch": 0.3753569971440229, "grad_norm": 3.8474350731121376e-05, "learning_rate": 8.455313244934324e-05, "logits/chosen": -4.936422348022461, "logits/rejected": -2.9731175899505615, "logps/chosen": -273.8941345214844, "logps/rejected": -591.8756713867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.56604766845703, "rewards/margins": 31.721940994262695, "rewards/rejected": -53.287986755371094, "step": 460 }, { "epoch": 0.37617299061607506, "grad_norm": 7.513102173106745e-05, "learning_rate": 8.447194812699613e-05, "logits/chosen": -6.544737815856934, "logits/rejected": -5.0629425048828125, "logps/chosen": -197.92214965820312, "logps/rejected": -577.421630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.257893562316895, "rewards/margins": 37.63240051269531, "rewards/rejected": -50.890289306640625, "step": 461 }, { "epoch": 0.3769889840881273, "grad_norm": 0.0010595141211524606, "learning_rate": 8.439059022079789e-05, "logits/chosen": -7.237377166748047, "logits/rejected": -5.319348335266113, "logps/chosen": -294.1373596191406, "logps/rejected": -591.9949951171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.970195770263672, "rewards/margins": 29.40284538269043, "rewards/rejected": -53.37303924560547, "step": 462 }, { "epoch": 0.3778049775601795, "grad_norm": 39.324951171875, "learning_rate": 8.430905914042714e-05, "logits/chosen": -5.572193145751953, "logits/rejected": -3.227384567260742, "logps/chosen": -274.10687255859375, "logps/rejected": -557.810546875, "loss": 1.4246, "rewards/accuracies": 0.875, "rewards/chosen": -23.182334899902344, "rewards/margins": 28.197853088378906, "rewards/rejected": -51.38018798828125, "step": 463 }, { "epoch": 0.37862097103223175, "grad_norm": 0.0005098275141790509, "learning_rate": 8.422735529643444e-05, "logits/chosen": -8.129159927368164, "logits/rejected": -4.9163007736206055, "logps/chosen": -271.1097412109375, "logps/rejected": -593.4912109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.714292526245117, "rewards/margins": 30.23236083984375, "rewards/rejected": -52.9466552734375, "step": 464 }, { "epoch": 0.379436964504284, "grad_norm": 0.0012483281316235662, "learning_rate": 8.414547910024036e-05, "logits/chosen": -7.489397048950195, "logits/rejected": -5.820690155029297, "logps/chosen": -184.85928344726562, "logps/rejected": -531.0404052734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.202693939208984, "rewards/margins": 33.164283752441406, "rewards/rejected": -47.366981506347656, "step": 465 }, { "epoch": 0.3802529579763362, "grad_norm": 0.07257784157991409, "learning_rate": 8.406343096413332e-05, "logits/chosen": -6.88952112197876, "logits/rejected": -4.496829986572266, "logps/chosen": -314.2440185546875, "logps/rejected": -580.50830078125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -24.840238571166992, "rewards/margins": 27.053754806518555, "rewards/rejected": -51.89399337768555, "step": 466 }, { "epoch": 0.3810689514483884, "grad_norm": 7.492184295188054e-07, "learning_rate": 8.398121130126756e-05, "logits/chosen": -8.126199722290039, "logits/rejected": -4.752597808837891, "logps/chosen": -245.57235717773438, "logps/rejected": -536.4049682617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.506092071533203, "rewards/margins": 29.3541259765625, "rewards/rejected": -47.86022186279297, "step": 467 }, { "epoch": 0.3818849449204406, "grad_norm": 4.552337646484375, "learning_rate": 8.389882052566105e-05, "logits/chosen": -7.977252960205078, "logits/rejected": -5.414823532104492, "logps/chosen": -294.5708312988281, "logps/rejected": -548.0565185546875, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": -24.170204162597656, "rewards/margins": 25.04886245727539, "rewards/rejected": -49.21906280517578, "step": 468 }, { "epoch": 0.38270093839249286, "grad_norm": 0.0002390938316239044, "learning_rate": 8.381625905219339e-05, "logits/chosen": -8.850868225097656, "logits/rejected": -6.5321245193481445, "logps/chosen": -268.44287109375, "logps/rejected": -568.1260986328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.050888061523438, "rewards/margins": 30.364009857177734, "rewards/rejected": -51.41489791870117, "step": 469 }, { "epoch": 0.3835169318645451, "grad_norm": 2.1009442718877835e-07, "learning_rate": 8.373352729660373e-05, "logits/chosen": -9.430639266967773, "logits/rejected": -8.279118537902832, "logps/chosen": -252.4949951171875, "logps/rejected": -580.11572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.765947341918945, "rewards/margins": 31.4400691986084, "rewards/rejected": -51.206016540527344, "step": 470 }, { "epoch": 0.3843329253365973, "grad_norm": 5.319410774973221e-05, "learning_rate": 8.365062567548867e-05, "logits/chosen": -9.386661529541016, "logits/rejected": -8.403724670410156, "logps/chosen": -259.6888122558594, "logps/rejected": -507.2263488769531, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.235618591308594, "rewards/margins": 26.400375366210938, "rewards/rejected": -45.63599395751953, "step": 471 }, { "epoch": 0.38514891880864954, "grad_norm": 0.27799028158187866, "learning_rate": 8.35675546063002e-05, "logits/chosen": -10.579034805297852, "logits/rejected": -10.369976043701172, "logps/chosen": -287.62939453125, "logps/rejected": -552.4834594726562, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -21.72197151184082, "rewards/margins": 27.616992950439453, "rewards/rejected": -49.338966369628906, "step": 472 }, { "epoch": 0.38596491228070173, "grad_norm": 0.0014463033294305205, "learning_rate": 8.348431450734355e-05, "logits/chosen": -10.611384391784668, "logits/rejected": -10.61125659942627, "logps/chosen": -209.849609375, "logps/rejected": -541.3489990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.640226364135742, "rewards/margins": 31.29888916015625, "rewards/rejected": -47.939117431640625, "step": 473 }, { "epoch": 0.386780905752754, "grad_norm": 7.470121659025608e-07, "learning_rate": 8.340090579777506e-05, "logits/chosen": -11.531368255615234, "logits/rejected": -11.569398880004883, "logps/chosen": -149.64231872558594, "logps/rejected": -530.9060668945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.375689506530762, "rewards/margins": 37.468894958496094, "rewards/rejected": -45.84458541870117, "step": 474 }, { "epoch": 0.3875968992248062, "grad_norm": 0.00012541349860839546, "learning_rate": 8.33173288976002e-05, "logits/chosen": -10.467924118041992, "logits/rejected": -12.469841003417969, "logps/chosen": -171.28692626953125, "logps/rejected": -538.8927001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.5216064453125, "rewards/margins": 34.768951416015625, "rewards/rejected": -46.290557861328125, "step": 475 }, { "epoch": 0.3884128926968584, "grad_norm": 2.1235748590697767e-06, "learning_rate": 8.32335842276713e-05, "logits/chosen": -12.27385139465332, "logits/rejected": -13.175346374511719, "logps/chosen": -215.7777557373047, "logps/rejected": -567.3623657226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.513385772705078, "rewards/margins": 33.81013488769531, "rewards/rejected": -50.323524475097656, "step": 476 }, { "epoch": 0.38922888616891066, "grad_norm": 3.1314158377426793e-07, "learning_rate": 8.314967220968548e-05, "logits/chosen": -12.382613182067871, "logits/rejected": -14.192113876342773, "logps/chosen": -171.99659729003906, "logps/rejected": -510.28411865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.404550552368164, "rewards/margins": 33.02766036987305, "rewards/rejected": -45.432212829589844, "step": 477 }, { "epoch": 0.39004487964096285, "grad_norm": 0.5368106961250305, "learning_rate": 8.306559326618259e-05, "logits/chosen": -11.748361587524414, "logits/rejected": -14.087860107421875, "logps/chosen": -189.35882568359375, "logps/rejected": -539.2950439453125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -12.556926727294922, "rewards/margins": 34.50001525878906, "rewards/rejected": -47.056941986083984, "step": 478 }, { "epoch": 0.3908608731130151, "grad_norm": 0.001143061788752675, "learning_rate": 8.298134782054305e-05, "logits/chosen": -12.371622085571289, "logits/rejected": -14.027061462402344, "logps/chosen": -222.72982788085938, "logps/rejected": -516.3983154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.782367706298828, "rewards/margins": 29.061805725097656, "rewards/rejected": -44.84416961669922, "step": 479 }, { "epoch": 0.39167686658506734, "grad_norm": 2.518359132341885e-10, "learning_rate": 8.289693629698564e-05, "logits/chosen": -11.00863265991211, "logits/rejected": -14.149203300476074, "logps/chosen": -166.61219787597656, "logps/rejected": -519.4423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.250097274780273, "rewards/margins": 36.06797409057617, "rewards/rejected": -45.31807327270508, "step": 480 }, { "epoch": 0.39249286005711953, "grad_norm": 0.0331815704703331, "learning_rate": 8.281235912056546e-05, "logits/chosen": -11.139287948608398, "logits/rejected": -13.734397888183594, "logps/chosen": -187.91392517089844, "logps/rejected": -433.2566223144531, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -12.717411041259766, "rewards/margins": 25.591508865356445, "rewards/rejected": -38.308921813964844, "step": 481 }, { "epoch": 0.3933088535291718, "grad_norm": 4.524374890024774e-05, "learning_rate": 8.272761671717178e-05, "logits/chosen": -12.467604637145996, "logits/rejected": -14.166021347045898, "logps/chosen": -199.8466033935547, "logps/rejected": -479.434326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.10938835144043, "rewards/margins": 27.906612396240234, "rewards/rejected": -42.01599884033203, "step": 482 }, { "epoch": 0.39412484700122397, "grad_norm": 9.94588845060207e-07, "learning_rate": 8.264270951352581e-05, "logits/chosen": -12.595074653625488, "logits/rejected": -15.403417587280273, "logps/chosen": -228.16738891601562, "logps/rejected": -488.15728759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.01485252380371, "rewards/margins": 26.885372161865234, "rewards/rejected": -43.90022659301758, "step": 483 }, { "epoch": 0.3949408404732762, "grad_norm": 5.281162157189101e-05, "learning_rate": 8.255763793717868e-05, "logits/chosen": -11.872706413269043, "logits/rejected": -16.112064361572266, "logps/chosen": -212.52593994140625, "logps/rejected": -508.7774963378906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.13166618347168, "rewards/margins": 30.058448791503906, "rewards/rejected": -45.19011688232422, "step": 484 }, { "epoch": 0.39575683394532846, "grad_norm": 5.641837219627632e-09, "learning_rate": 8.247240241650918e-05, "logits/chosen": -13.895126342773438, "logits/rejected": -15.915205001831055, "logps/chosen": -212.92807006835938, "logps/rejected": -557.1370239257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.26259422302246, "rewards/margins": 32.23137664794922, "rewards/rejected": -48.49397277832031, "step": 485 }, { "epoch": 0.39657282741738065, "grad_norm": 1.0427000522613525, "learning_rate": 8.238700338072167e-05, "logits/chosen": -14.492374420166016, "logits/rejected": -15.958181381225586, "logps/chosen": -229.51577758789062, "logps/rejected": -469.69097900390625, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -18.267642974853516, "rewards/margins": 23.371593475341797, "rewards/rejected": -41.63923645019531, "step": 486 }, { "epoch": 0.3973888208894329, "grad_norm": 3.8266196611402847e-07, "learning_rate": 8.230144125984387e-05, "logits/chosen": -12.075204849243164, "logits/rejected": -14.37142562866211, "logps/chosen": -190.43545532226562, "logps/rejected": -479.1802673339844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.900508880615234, "rewards/margins": 28.71734619140625, "rewards/rejected": -42.61785888671875, "step": 487 }, { "epoch": 0.3982048143614851, "grad_norm": 2.3598360712639987e-05, "learning_rate": 8.221571648472472e-05, "logits/chosen": -9.181947708129883, "logits/rejected": -12.384515762329102, "logps/chosen": -123.74555969238281, "logps/rejected": -502.7847595214844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.426608085632324, "rewards/margins": 35.68864440917969, "rewards/rejected": -42.11524963378906, "step": 488 }, { "epoch": 0.3990208078335373, "grad_norm": 5.533936473511858e-06, "learning_rate": 8.21298294870322e-05, "logits/chosen": -8.50558090209961, "logits/rejected": -11.148627281188965, "logps/chosen": -113.29942321777344, "logps/rejected": -431.6334228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.588619232177734, "rewards/margins": 30.274293899536133, "rewards/rejected": -36.8629150390625, "step": 489 }, { "epoch": 0.3998368013055896, "grad_norm": 0.001474875956773758, "learning_rate": 8.20437806992512e-05, "logits/chosen": -5.098386287689209, "logits/rejected": -7.472157001495361, "logps/chosen": -86.97142028808594, "logps/rejected": -388.658447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6831469535827637, "rewards/margins": 28.44316864013672, "rewards/rejected": -32.12631607055664, "step": 490 }, { "epoch": 0.40065279477764176, "grad_norm": 0.2270638793706894, "learning_rate": 8.195757055468127e-05, "logits/chosen": -6.815646171569824, "logits/rejected": -10.010425567626953, "logps/chosen": -72.40776062011719, "logps/rejected": -332.515380859375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.3666666746139526, "rewards/margins": 26.358078002929688, "rewards/rejected": -27.724746704101562, "step": 491 }, { "epoch": 0.401468788249694, "grad_norm": 0.11513742804527283, "learning_rate": 8.18711994874345e-05, "logits/chosen": -6.011063575744629, "logits/rejected": -7.998358249664307, "logps/chosen": -99.67781066894531, "logps/rejected": -370.0491027832031, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.502941131591797, "rewards/margins": 27.790035247802734, "rewards/rejected": -31.29297637939453, "step": 492 }, { "epoch": 0.4022847817217462, "grad_norm": 6.476535781985149e-05, "learning_rate": 8.178466793243324e-05, "logits/chosen": -6.51555061340332, "logits/rejected": -8.135910034179688, "logps/chosen": -89.09518432617188, "logps/rejected": -317.5426330566406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.3362386226654053, "rewards/margins": 23.252689361572266, "rewards/rejected": -26.588926315307617, "step": 493 }, { "epoch": 0.40310077519379844, "grad_norm": 0.05544556304812431, "learning_rate": 8.16979763254081e-05, "logits/chosen": -5.23173713684082, "logits/rejected": -7.579280853271484, "logps/chosen": -69.81936645507812, "logps/rejected": -283.3907470703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.8526333570480347, "rewards/margins": 21.416521072387695, "rewards/rejected": -23.269155502319336, "step": 494 }, { "epoch": 0.4039167686658507, "grad_norm": 0.2902042269706726, "learning_rate": 8.161112510289549e-05, "logits/chosen": -6.343537330627441, "logits/rejected": -7.4283928871154785, "logps/chosen": -110.88422393798828, "logps/rejected": -283.411865234375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -6.305590629577637, "rewards/margins": 17.178726196289062, "rewards/rejected": -23.48431968688965, "step": 495 }, { "epoch": 0.4047327621379029, "grad_norm": 0.02235700748860836, "learning_rate": 8.152411470223569e-05, "logits/chosen": -5.530867576599121, "logits/rejected": -7.3470940589904785, "logps/chosen": -77.74740600585938, "logps/rejected": -382.9835510253906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.8507827520370483, "rewards/margins": 29.51656723022461, "rewards/rejected": -31.367351531982422, "step": 496 }, { "epoch": 0.4055487556099551, "grad_norm": 0.05165637284517288, "learning_rate": 8.143694556157046e-05, "logits/chosen": -4.345794677734375, "logits/rejected": -6.110574722290039, "logps/chosen": -87.35365295410156, "logps/rejected": -267.375732421875, "loss": 0.3468, "rewards/accuracies": 0.875, "rewards/chosen": -4.1508002281188965, "rewards/margins": 17.604629516601562, "rewards/rejected": -21.755428314208984, "step": 497 }, { "epoch": 0.40636474908200737, "grad_norm": 0.002026922535151243, "learning_rate": 8.13496181198409e-05, "logits/chosen": -5.537290573120117, "logits/rejected": -6.5208048820495605, "logps/chosen": -61.7830924987793, "logps/rejected": -383.3427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.4912253618240356, "rewards/margins": 30.10000991821289, "rewards/rejected": -31.59123420715332, "step": 498 }, { "epoch": 0.40718074255405956, "grad_norm": 0.0017718462040647864, "learning_rate": 8.126213281678528e-05, "logits/chosen": -6.067045211791992, "logits/rejected": -6.511468887329102, "logps/chosen": -78.53439331054688, "logps/rejected": -347.812255859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.7181000709533691, "rewards/margins": 25.326248168945312, "rewards/rejected": -27.04434585571289, "step": 499 }, { "epoch": 0.4079967360261118, "grad_norm": 24.027694702148438, "learning_rate": 8.117449009293668e-05, "logits/chosen": -6.513627052307129, "logits/rejected": -6.6752753257751465, "logps/chosen": -108.98941802978516, "logps/rejected": -351.4279479980469, "loss": 0.2491, "rewards/accuracies": 1.0, "rewards/chosen": -6.436740875244141, "rewards/margins": 23.88714027404785, "rewards/rejected": -30.323881149291992, "step": 500 }, { "epoch": 0.408812729498164, "grad_norm": 0.00030457047978416085, "learning_rate": 8.1086690389621e-05, "logits/chosen": -6.736510276794434, "logits/rejected": -6.224907875061035, "logps/chosen": -125.12226104736328, "logps/rejected": -351.68414306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.541443824768066, "rewards/margins": 22.849390029907227, "rewards/rejected": -30.390832901000977, "step": 501 }, { "epoch": 0.40962872297021624, "grad_norm": 6.014238920215575e-07, "learning_rate": 8.099873414895453e-05, "logits/chosen": -6.4541778564453125, "logits/rejected": -7.012676239013672, "logps/chosen": -84.2264404296875, "logps/rejected": -344.7731018066406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.464620351791382, "rewards/margins": 26.34708023071289, "rewards/rejected": -29.811702728271484, "step": 502 }, { "epoch": 0.4104447164422685, "grad_norm": 3.660852598841302e-06, "learning_rate": 8.091062181384184e-05, "logits/chosen": -6.718441963195801, "logits/rejected": -6.150249004364014, "logps/chosen": -140.58901977539062, "logps/rejected": -380.10693359375, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -10.275491714477539, "rewards/margins": 22.278642654418945, "rewards/rejected": -32.554134368896484, "step": 503 }, { "epoch": 0.4112607099143207, "grad_norm": 0.01852903701364994, "learning_rate": 8.082235382797349e-05, "logits/chosen": -5.185626029968262, "logits/rejected": -5.862762451171875, "logps/chosen": -119.38169860839844, "logps/rejected": -418.0333557128906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.5457048416137695, "rewards/margins": 30.40680694580078, "rewards/rejected": -36.9525146484375, "step": 504 }, { "epoch": 0.4120767033863729, "grad_norm": 7.113725587259978e-06, "learning_rate": 8.073393063582386e-05, "logits/chosen": -5.73909330368042, "logits/rejected": -5.695547103881836, "logps/chosen": -84.0013427734375, "logps/rejected": -366.26214599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.9689248204231262, "rewards/margins": 29.828603744506836, "rewards/rejected": -30.797527313232422, "step": 505 }, { "epoch": 0.4128926968584251, "grad_norm": 0.0004193938511889428, "learning_rate": 8.064535268264883e-05, "logits/chosen": -5.205691337585449, "logits/rejected": -5.419007301330566, "logps/chosen": -98.61830139160156, "logps/rejected": -357.93975830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.830706596374512, "rewards/margins": 24.931743621826172, "rewards/rejected": -29.762451171875, "step": 506 }, { "epoch": 0.41370869033047736, "grad_norm": 0.005399726331233978, "learning_rate": 8.05566204144836e-05, "logits/chosen": -6.4770612716674805, "logits/rejected": -6.20255184173584, "logps/chosen": -156.97653198242188, "logps/rejected": -387.75006103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.097769737243652, "rewards/margins": 23.918466567993164, "rewards/rejected": -33.0162353515625, "step": 507 }, { "epoch": 0.4145246838025296, "grad_norm": 0.0066842325031757355, "learning_rate": 8.046773427814042e-05, "logits/chosen": -5.804233551025391, "logits/rejected": -4.6192240715026855, "logps/chosen": -174.2189483642578, "logps/rejected": -428.90643310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.04319953918457, "rewards/margins": 24.294567108154297, "rewards/rejected": -37.3377685546875, "step": 508 }, { "epoch": 0.4153406772745818, "grad_norm": 1.2342589798208792e-05, "learning_rate": 8.037869472120634e-05, "logits/chosen": -5.874184608459473, "logits/rejected": -5.058535575866699, "logps/chosen": -155.4041748046875, "logps/rejected": -448.8852233886719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.7583646774292, "rewards/margins": 28.13190460205078, "rewards/rejected": -37.8902702331543, "step": 509 }, { "epoch": 0.41615667074663404, "grad_norm": 4.400712441565702e-06, "learning_rate": 8.0289502192041e-05, "logits/chosen": -6.109137535095215, "logits/rejected": -5.588523864746094, "logps/chosen": -173.42495727539062, "logps/rejected": -499.03424072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.045900344848633, "rewards/margins": 30.909530639648438, "rewards/rejected": -41.95542907714844, "step": 510 }, { "epoch": 0.41697266421868623, "grad_norm": 0.03828820958733559, "learning_rate": 8.020015713977427e-05, "logits/chosen": -5.8864359855651855, "logits/rejected": -4.572179794311523, "logps/chosen": -190.39566040039062, "logps/rejected": -419.84246826171875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -13.201654434204102, "rewards/margins": 22.99075698852539, "rewards/rejected": -36.192413330078125, "step": 511 }, { "epoch": 0.4177886576907385, "grad_norm": 0.535259485244751, "learning_rate": 8.011066001430412e-05, "logits/chosen": -5.076742649078369, "logits/rejected": -5.26524019241333, "logps/chosen": -188.21795654296875, "logps/rejected": -408.0771484375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -13.850798606872559, "rewards/margins": 22.02074432373047, "rewards/rejected": -35.871543884277344, "step": 512 }, { "epoch": 0.4186046511627907, "grad_norm": 0.05568370968103409, "learning_rate": 8.002101126629421e-05, "logits/chosen": -5.789296627044678, "logits/rejected": -5.924657821655273, "logps/chosen": -167.93582153320312, "logps/rejected": -378.2938232421875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.753562927246094, "rewards/margins": 22.68785285949707, "rewards/rejected": -32.44141387939453, "step": 513 }, { "epoch": 0.4194206446348429, "grad_norm": 1.4964315653287485e-07, "learning_rate": 7.993121134717177e-05, "logits/chosen": -5.520864009857178, "logits/rejected": -4.583808422088623, "logps/chosen": -192.40847778320312, "logps/rejected": -448.3153381347656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.035701751708984, "rewards/margins": 25.618770599365234, "rewards/rejected": -39.65447235107422, "step": 514 }, { "epoch": 0.42023663810689516, "grad_norm": 88.95675659179688, "learning_rate": 7.984126070912518e-05, "logits/chosen": -5.3716864585876465, "logits/rejected": -4.770180702209473, "logps/chosen": -126.78449249267578, "logps/rejected": -386.2102355957031, "loss": 7.4495, "rewards/accuracies": 0.875, "rewards/chosen": -9.01545524597168, "rewards/margins": 24.64366912841797, "rewards/rejected": -33.65912628173828, "step": 515 }, { "epoch": 0.42105263157894735, "grad_norm": 0.0007481240900233388, "learning_rate": 7.975115980510187e-05, "logits/chosen": -6.454975128173828, "logits/rejected": -6.158426284790039, "logps/chosen": -151.57496643066406, "logps/rejected": -346.513427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.448592185974121, "rewards/margins": 19.038166046142578, "rewards/rejected": -29.48675537109375, "step": 516 }, { "epoch": 0.4218686250509996, "grad_norm": 0.05091157928109169, "learning_rate": 7.966090908880581e-05, "logits/chosen": -5.477484703063965, "logits/rejected": -6.158967971801758, "logps/chosen": -151.4366912841797, "logps/rejected": -319.61175537109375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.903636932373047, "rewards/margins": 16.754417419433594, "rewards/rejected": -26.65805435180664, "step": 517 }, { "epoch": 0.42268461852305184, "grad_norm": 78.16626739501953, "learning_rate": 7.957050901469545e-05, "logits/chosen": -5.498078346252441, "logits/rejected": -6.02641487121582, "logps/chosen": -194.40426635742188, "logps/rejected": -321.21832275390625, "loss": 0.7078, "rewards/accuracies": 0.875, "rewards/chosen": -12.517132759094238, "rewards/margins": 13.173750877380371, "rewards/rejected": -25.69088363647461, "step": 518 }, { "epoch": 0.423500611995104, "grad_norm": 0.0082944855093956, "learning_rate": 7.94799600379813e-05, "logits/chosen": -4.673421859741211, "logits/rejected": -5.647769927978516, "logps/chosen": -138.14083862304688, "logps/rejected": -288.9040222167969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.51358413696289, "rewards/margins": 14.569899559020996, "rewards/rejected": -23.083484649658203, "step": 519 }, { "epoch": 0.42431660546715627, "grad_norm": 1.2419451475143433, "learning_rate": 7.938926261462366e-05, "logits/chosen": -3.5749142169952393, "logits/rejected": -4.473803997039795, "logps/chosen": -142.6949920654297, "logps/rejected": -311.536865234375, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -9.70411491394043, "rewards/margins": 16.062074661254883, "rewards/rejected": -25.766189575195312, "step": 520 }, { "epoch": 0.42513259893920846, "grad_norm": 0.0005771263386122882, "learning_rate": 7.929841720133034e-05, "logits/chosen": -4.587742328643799, "logits/rejected": -6.369336128234863, "logps/chosen": -150.55577087402344, "logps/rejected": -448.220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.237763404846191, "rewards/margins": 29.289278030395508, "rewards/rejected": -38.52703857421875, "step": 521 }, { "epoch": 0.4259485924112607, "grad_norm": 0.187058687210083, "learning_rate": 7.920742425555436e-05, "logits/chosen": -4.76069450378418, "logits/rejected": -5.9432373046875, "logps/chosen": -176.87649536132812, "logps/rejected": -401.52032470703125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -10.581954956054688, "rewards/margins": 21.873836517333984, "rewards/rejected": -32.45579147338867, "step": 522 }, { "epoch": 0.42676458588331295, "grad_norm": 0.00848436076194048, "learning_rate": 7.911628423549162e-05, "logits/chosen": -5.018893718719482, "logits/rejected": -6.79255485534668, "logps/chosen": -148.241943359375, "logps/rejected": -388.741455078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.846860885620117, "rewards/margins": 25.151819229125977, "rewards/rejected": -32.998680114746094, "step": 523 }, { "epoch": 0.42758057935536514, "grad_norm": 7.849709800211713e-05, "learning_rate": 7.902499760007867e-05, "logits/chosen": -3.961524486541748, "logits/rejected": -6.2347636222839355, "logps/chosen": -139.35997009277344, "logps/rejected": -403.8305969238281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.330637454986572, "rewards/margins": 26.290910720825195, "rewards/rejected": -33.621551513671875, "step": 524 }, { "epoch": 0.4283965728274174, "grad_norm": 0.0015875069657340646, "learning_rate": 7.89335648089903e-05, "logits/chosen": -4.889189720153809, "logits/rejected": -6.224565505981445, "logps/chosen": -142.58413696289062, "logps/rejected": -417.06268310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.921649932861328, "rewards/margins": 25.488807678222656, "rewards/rejected": -34.41046142578125, "step": 525 }, { "epoch": 0.4292125662994696, "grad_norm": 0.000530370743945241, "learning_rate": 7.884198632263724e-05, "logits/chosen": -4.733016014099121, "logits/rejected": -5.655503273010254, "logps/chosen": -162.33319091796875, "logps/rejected": -443.6896057128906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.674219131469727, "rewards/margins": 26.836772918701172, "rewards/rejected": -37.51099395751953, "step": 526 }, { "epoch": 0.4300285597715218, "grad_norm": 9.254453470930457e-05, "learning_rate": 7.875026260216393e-05, "logits/chosen": -4.719394683837891, "logits/rejected": -6.4371337890625, "logps/chosen": -188.969970703125, "logps/rejected": -447.5559387207031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.943809509277344, "rewards/margins": 24.57550048828125, "rewards/rejected": -38.51930618286133, "step": 527 }, { "epoch": 0.43084455324357407, "grad_norm": 0.21037057042121887, "learning_rate": 7.865839410944612e-05, "logits/chosen": -4.1153459548950195, "logits/rejected": -6.536139965057373, "logps/chosen": -166.78273010253906, "logps/rejected": -428.62042236328125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -9.819108009338379, "rewards/margins": 26.516582489013672, "rewards/rejected": -36.335693359375, "step": 528 }, { "epoch": 0.43166054671562626, "grad_norm": 0.0855441763997078, "learning_rate": 7.856638130708853e-05, "logits/chosen": -4.528077602386475, "logits/rejected": -5.835092067718506, "logps/chosen": -191.1522979736328, "logps/rejected": -428.292236328125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -12.938443183898926, "rewards/margins": 22.999004364013672, "rewards/rejected": -35.93744659423828, "step": 529 }, { "epoch": 0.4324765401876785, "grad_norm": 7.83840732765384e-05, "learning_rate": 7.84742246584226e-05, "logits/chosen": -4.119903087615967, "logits/rejected": -6.903938293457031, "logps/chosen": -183.53851318359375, "logps/rejected": -510.0367126464844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.790241241455078, "rewards/margins": 32.626731872558594, "rewards/rejected": -45.416969299316406, "step": 530 }, { "epoch": 0.43329253365973075, "grad_norm": 1.8254302740097046, "learning_rate": 7.838192462750409e-05, "logits/chosen": -4.252890586853027, "logits/rejected": -6.463965892791748, "logps/chosen": -191.1399688720703, "logps/rejected": -410.83917236328125, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -12.67291259765625, "rewards/margins": 23.503013610839844, "rewards/rejected": -36.175926208496094, "step": 531 }, { "epoch": 0.43410852713178294, "grad_norm": 0.06161493435502052, "learning_rate": 7.828948167911074e-05, "logits/chosen": -5.355913162231445, "logits/rejected": -6.780840873718262, "logps/chosen": -249.3521270751953, "logps/rejected": -471.9092712402344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -18.43470001220703, "rewards/margins": 22.309844970703125, "rewards/rejected": -40.744544982910156, "step": 532 }, { "epoch": 0.4349245206038352, "grad_norm": 0.0676368698477745, "learning_rate": 7.819689627873997e-05, "logits/chosen": -6.746939182281494, "logits/rejected": -7.392888069152832, "logps/chosen": -224.40805053710938, "logps/rejected": -516.489501953125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -17.933456420898438, "rewards/margins": 28.4155330657959, "rewards/rejected": -46.34899139404297, "step": 533 }, { "epoch": 0.4357405140758874, "grad_norm": 2.8745896543114213e-06, "learning_rate": 7.810416889260653e-05, "logits/chosen": -5.369772911071777, "logits/rejected": -6.600979804992676, "logps/chosen": -187.41238403320312, "logps/rejected": -473.0243835449219, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.779483795166016, "rewards/margins": 28.015384674072266, "rewards/rejected": -40.79486846923828, "step": 534 }, { "epoch": 0.4365565075479396, "grad_norm": 6.508518708869815e-06, "learning_rate": 7.801129998764014e-05, "logits/chosen": -7.367636680603027, "logits/rejected": -8.59785270690918, "logps/chosen": -210.47811889648438, "logps/rejected": -553.1729125976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.639144897460938, "rewards/margins": 33.13409423828125, "rewards/rejected": -49.77323532104492, "step": 535 }, { "epoch": 0.43737250101999187, "grad_norm": 0.0007444047369062901, "learning_rate": 7.791829003148312e-05, "logits/chosen": -6.183640480041504, "logits/rejected": -8.349274635314941, "logps/chosen": -201.6265869140625, "logps/rejected": -507.39642333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.385353088378906, "rewards/margins": 28.351736068725586, "rewards/rejected": -41.73708724975586, "step": 536 }, { "epoch": 0.43818849449204406, "grad_norm": 0.013057343661785126, "learning_rate": 7.782513949248808e-05, "logits/chosen": -7.361266136169434, "logits/rejected": -9.345489501953125, "logps/chosen": -249.9781951904297, "logps/rejected": -539.9937133789062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -18.154245376586914, "rewards/margins": 30.116661071777344, "rewards/rejected": -48.270904541015625, "step": 537 }, { "epoch": 0.4390044879640963, "grad_norm": 1.2944035530090332, "learning_rate": 7.773184883971551e-05, "logits/chosen": -7.149392127990723, "logits/rejected": -9.2420654296875, "logps/chosen": -270.50225830078125, "logps/rejected": -508.5354919433594, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -22.892650604248047, "rewards/margins": 22.747045516967773, "rewards/rejected": -45.63969802856445, "step": 538 }, { "epoch": 0.4398204814361485, "grad_norm": 0.0025299186818301678, "learning_rate": 7.763841854293145e-05, "logits/chosen": -8.482688903808594, "logits/rejected": -10.897703170776367, "logps/chosen": -287.95318603515625, "logps/rejected": -650.37646484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.149147033691406, "rewards/margins": 35.793190002441406, "rewards/rejected": -57.94233703613281, "step": 539 }, { "epoch": 0.44063647490820074, "grad_norm": 8.649404525756836, "learning_rate": 7.754484907260513e-05, "logits/chosen": -9.912261962890625, "logits/rejected": -11.457635879516602, "logps/chosen": -306.2189025878906, "logps/rejected": -565.4864501953125, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": -26.453678131103516, "rewards/margins": 25.26333999633789, "rewards/rejected": -51.717018127441406, "step": 540 }, { "epoch": 0.441452468380253, "grad_norm": 4.061058280058205e-05, "learning_rate": 7.74511408999066e-05, "logits/chosen": -9.242270469665527, "logits/rejected": -10.960188865661621, "logps/chosen": -284.27685546875, "logps/rejected": -626.4168701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.17308807373047, "rewards/margins": 33.243690490722656, "rewards/rejected": -56.416778564453125, "step": 541 }, { "epoch": 0.4422684618523052, "grad_norm": 2.2837855340185342e-07, "learning_rate": 7.73572944967043e-05, "logits/chosen": -7.931734561920166, "logits/rejected": -10.126764297485352, "logps/chosen": -340.6962890625, "logps/rejected": -703.852294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.590557098388672, "rewards/margins": 36.99875259399414, "rewards/rejected": -64.58930969238281, "step": 542 }, { "epoch": 0.4430844553243574, "grad_norm": 0.001880842843092978, "learning_rate": 7.72633103355628e-05, "logits/chosen": -9.1897611618042, "logits/rejected": -10.335771560668945, "logps/chosen": -307.1841125488281, "logps/rejected": -568.2523193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.971590042114258, "rewards/margins": 26.275794982910156, "rewards/rejected": -52.24738311767578, "step": 543 }, { "epoch": 0.4439004487964096, "grad_norm": 0.00024275877512991428, "learning_rate": 7.71691888897403e-05, "logits/chosen": -9.622304916381836, "logits/rejected": -9.372457504272461, "logps/chosen": -341.7306823730469, "logps/rejected": -643.3847045898438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -28.290374755859375, "rewards/margins": 29.653179168701172, "rewards/rejected": -57.94355392456055, "step": 544 }, { "epoch": 0.44471644226846185, "grad_norm": 0.006660128943622112, "learning_rate": 7.707493063318629e-05, "logits/chosen": -9.281148910522461, "logits/rejected": -10.029559135437012, "logps/chosen": -399.2254638671875, "logps/rejected": -648.66259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.91218185424805, "rewards/margins": 25.618972778320312, "rewards/rejected": -59.531158447265625, "step": 545 }, { "epoch": 0.4455324357405141, "grad_norm": 0.008231583051383495, "learning_rate": 7.698053604053922e-05, "logits/chosen": -8.600878715515137, "logits/rejected": -9.786214828491211, "logps/chosen": -300.126953125, "logps/rejected": -656.9234619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.209386825561523, "rewards/margins": 35.40959167480469, "rewards/rejected": -59.61897659301758, "step": 546 }, { "epoch": 0.4463484292125663, "grad_norm": 0.42686811089515686, "learning_rate": 7.688600558712406e-05, "logits/chosen": -8.071819305419922, "logits/rejected": -8.898977279663086, "logps/chosen": -299.1033630371094, "logps/rejected": -578.3878173828125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -23.299053192138672, "rewards/margins": 29.299598693847656, "rewards/rejected": -52.59865188598633, "step": 547 }, { "epoch": 0.44716442268461853, "grad_norm": 0.02431810274720192, "learning_rate": 7.679133974894983e-05, "logits/chosen": -8.399133682250977, "logits/rejected": -8.364358901977539, "logps/chosen": -178.95675659179688, "logps/rejected": -490.95672607421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.031347274780273, "rewards/margins": 30.85942840576172, "rewards/rejected": -43.890777587890625, "step": 548 }, { "epoch": 0.4479804161566707, "grad_norm": 0.7898699045181274, "learning_rate": 7.669653900270737e-05, "logits/chosen": -5.690967082977295, "logits/rejected": -7.488203048706055, "logps/chosen": -217.01625061035156, "logps/rejected": -455.9725341796875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -16.43178367614746, "rewards/margins": 22.465208053588867, "rewards/rejected": -38.896995544433594, "step": 549 }, { "epoch": 0.44879640962872297, "grad_norm": 58.294403076171875, "learning_rate": 7.660160382576683e-05, "logits/chosen": -7.658189296722412, "logits/rejected": -8.698125839233398, "logps/chosen": -166.57264709472656, "logps/rejected": -399.9736328125, "loss": 1.4311, "rewards/accuracies": 0.875, "rewards/chosen": -11.364160537719727, "rewards/margins": 24.37084197998047, "rewards/rejected": -35.73500061035156, "step": 550 }, { "epoch": 0.4496124031007752, "grad_norm": 0.006897193379700184, "learning_rate": 7.650653469617526e-05, "logits/chosen": -7.193971633911133, "logits/rejected": -9.461420059204102, "logps/chosen": -215.70697021484375, "logps/rejected": -548.2341918945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.138734817504883, "rewards/margins": 33.28773880004883, "rewards/rejected": -49.42647933959961, "step": 551 }, { "epoch": 0.4504283965728274, "grad_norm": 0.009316652081906796, "learning_rate": 7.641133209265424e-05, "logits/chosen": -9.273200035095215, "logits/rejected": -10.182255744934082, "logps/chosen": -293.30450439453125, "logps/rejected": -527.5364990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.761611938476562, "rewards/margins": 22.666540145874023, "rewards/rejected": -47.42815017700195, "step": 552 }, { "epoch": 0.45124439004487965, "grad_norm": 0.1745024472475052, "learning_rate": 7.631599649459744e-05, "logits/chosen": -9.570891380310059, "logits/rejected": -10.232598304748535, "logps/chosen": -224.1129608154297, "logps/rejected": -471.9299621582031, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -19.321678161621094, "rewards/margins": 23.11342430114746, "rewards/rejected": -42.43510055541992, "step": 553 }, { "epoch": 0.45206038351693184, "grad_norm": 1.0484611266292632e-05, "learning_rate": 7.62205283820683e-05, "logits/chosen": -9.435012817382812, "logits/rejected": -10.714786529541016, "logps/chosen": -269.7120666503906, "logps/rejected": -604.209716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.63249969482422, "rewards/margins": 32.151641845703125, "rewards/rejected": -53.784141540527344, "step": 554 }, { "epoch": 0.4528763769889841, "grad_norm": 0.039956916123628616, "learning_rate": 7.612492823579745e-05, "logits/chosen": -9.385261535644531, "logits/rejected": -10.994989395141602, "logps/chosen": -185.382080078125, "logps/rejected": -452.054443359375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -14.00438117980957, "rewards/margins": 26.561504364013672, "rewards/rejected": -40.565887451171875, "step": 555 }, { "epoch": 0.45369237046103633, "grad_norm": 0.0005593692767433822, "learning_rate": 7.602919653718044e-05, "logits/chosen": -9.196725845336914, "logits/rejected": -11.531560897827148, "logps/chosen": -261.31878662109375, "logps/rejected": -573.443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.592004776000977, "rewards/margins": 33.2576789855957, "rewards/rejected": -51.84968566894531, "step": 556 }, { "epoch": 0.4545083639330885, "grad_norm": 0.0035134439822286367, "learning_rate": 7.59333337682752e-05, "logits/chosen": -10.545424461364746, "logits/rejected": -11.474995613098145, "logps/chosen": -194.5181884765625, "logps/rejected": -459.35955810546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.38761043548584, "rewards/margins": 27.15220832824707, "rewards/rejected": -41.539817810058594, "step": 557 }, { "epoch": 0.45532435740514077, "grad_norm": 0.0007244928856380284, "learning_rate": 7.583734041179973e-05, "logits/chosen": -9.778675079345703, "logits/rejected": -11.248695373535156, "logps/chosen": -243.0888671875, "logps/rejected": -671.978271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.395479202270508, "rewards/margins": 41.955467224121094, "rewards/rejected": -59.35094451904297, "step": 558 }, { "epoch": 0.45614035087719296, "grad_norm": 0.0446365661919117, "learning_rate": 7.574121695112954e-05, "logits/chosen": -9.385865211486816, "logits/rejected": -11.142232894897461, "logps/chosen": -252.35447692871094, "logps/rejected": -626.4423828125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -18.556346893310547, "rewards/margins": 36.988502502441406, "rewards/rejected": -55.54485321044922, "step": 559 }, { "epoch": 0.4569563443492452, "grad_norm": 0.00017952373309526592, "learning_rate": 7.564496387029532e-05, "logits/chosen": -9.648923873901367, "logits/rejected": -10.719868659973145, "logps/chosen": -208.3495635986328, "logps/rejected": -547.93896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.646714210510254, "rewards/margins": 32.38082504272461, "rewards/rejected": -48.02754211425781, "step": 560 }, { "epoch": 0.45777233782129745, "grad_norm": 23.599063873291016, "learning_rate": 7.554858165398045e-05, "logits/chosen": -9.98363971710205, "logits/rejected": -11.317000389099121, "logps/chosen": -212.11834716796875, "logps/rejected": -437.26324462890625, "loss": 3.7026, "rewards/accuracies": 0.875, "rewards/chosen": -16.79193878173828, "rewards/margins": 22.634326934814453, "rewards/rejected": -39.426265716552734, "step": 561 }, { "epoch": 0.45858833129334964, "grad_norm": 3.1301897251978517e-05, "learning_rate": 7.545207078751857e-05, "logits/chosen": -8.149337768554688, "logits/rejected": -10.495651245117188, "logps/chosen": -146.93336486816406, "logps/rejected": -440.01910400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.61351490020752, "rewards/margins": 28.71072006225586, "rewards/rejected": -38.32423400878906, "step": 562 }, { "epoch": 0.4594043247654019, "grad_norm": 1.4893172419760958e-06, "learning_rate": 7.535543175689116e-05, "logits/chosen": -8.753131866455078, "logits/rejected": -10.660839080810547, "logps/chosen": -196.60682678222656, "logps/rejected": -494.4366760253906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.078885078430176, "rewards/margins": 29.79541778564453, "rewards/rejected": -43.874305725097656, "step": 563 }, { "epoch": 0.4602203182374541, "grad_norm": 109.68608093261719, "learning_rate": 7.525866504872506e-05, "logits/chosen": -8.058271408081055, "logits/rejected": -9.369834899902344, "logps/chosen": -229.91232299804688, "logps/rejected": -487.8659362792969, "loss": 4.22, "rewards/accuracies": 0.875, "rewards/chosen": -16.832626342773438, "rewards/margins": 25.689023971557617, "rewards/rejected": -42.52164840698242, "step": 564 }, { "epoch": 0.4610363117095063, "grad_norm": 0.007908941246569157, "learning_rate": 7.516177115029002e-05, "logits/chosen": -7.105183124542236, "logits/rejected": -9.090309143066406, "logps/chosen": -175.10592651367188, "logps/rejected": -518.162841796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.785223960876465, "rewards/margins": 31.87879180908203, "rewards/rejected": -43.66401672363281, "step": 565 }, { "epoch": 0.46185230518155856, "grad_norm": 0.20377209782600403, "learning_rate": 7.506475054949625e-05, "logits/chosen": -6.270544052124023, "logits/rejected": -9.02583122253418, "logps/chosen": -145.87286376953125, "logps/rejected": -446.7011413574219, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -9.552237510681152, "rewards/margins": 29.082719802856445, "rewards/rejected": -38.63495635986328, "step": 566 }, { "epoch": 0.46266829865361075, "grad_norm": 0.00014286680379882455, "learning_rate": 7.496760373489202e-05, "logits/chosen": -6.300063610076904, "logits/rejected": -9.064165115356445, "logps/chosen": -146.9872589111328, "logps/rejected": -421.869384765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.155951499938965, "rewards/margins": 27.28836441040039, "rewards/rejected": -36.44431686401367, "step": 567 }, { "epoch": 0.463484292125663, "grad_norm": 0.0845804437994957, "learning_rate": 7.48703311956611e-05, "logits/chosen": -5.626402854919434, "logits/rejected": -8.386307716369629, "logps/chosen": -113.70246887207031, "logps/rejected": -376.4010925292969, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -6.372666835784912, "rewards/margins": 25.726581573486328, "rewards/rejected": -32.09925079345703, "step": 568 }, { "epoch": 0.46430028559771525, "grad_norm": 0.00041846363455988467, "learning_rate": 7.477293342162039e-05, "logits/chosen": -5.201422691345215, "logits/rejected": -7.594449043273926, "logps/chosen": -146.6895751953125, "logps/rejected": -355.38690185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.154599189758301, "rewards/margins": 23.292346954345703, "rewards/rejected": -30.446949005126953, "step": 569 }, { "epoch": 0.46511627906976744, "grad_norm": 0.042673259973526, "learning_rate": 7.467541090321735e-05, "logits/chosen": -5.239297866821289, "logits/rejected": -7.397007465362549, "logps/chosen": -105.28317260742188, "logps/rejected": -279.0161437988281, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.935916900634766, "rewards/margins": 16.72314453125, "rewards/rejected": -22.659061431884766, "step": 570 }, { "epoch": 0.4659322725418197, "grad_norm": 2.574140787124634, "learning_rate": 7.457776413152767e-05, "logits/chosen": -6.140218734741211, "logits/rejected": -8.09719181060791, "logps/chosen": -122.76172637939453, "logps/rejected": -301.58905029296875, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": -7.205014705657959, "rewards/margins": 18.112699508666992, "rewards/rejected": -25.31771469116211, "step": 571 }, { "epoch": 0.46674826601387187, "grad_norm": 8.711807822692208e-06, "learning_rate": 7.447999359825263e-05, "logits/chosen": -5.565192222595215, "logits/rejected": -8.797279357910156, "logps/chosen": -116.69784545898438, "logps/rejected": -386.88958740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.209121227264404, "rewards/margins": 25.735340118408203, "rewards/rejected": -31.944461822509766, "step": 572 }, { "epoch": 0.4675642594859241, "grad_norm": 63.60091781616211, "learning_rate": 7.43820997957168e-05, "logits/chosen": -5.529299736022949, "logits/rejected": -8.13792610168457, "logps/chosen": -131.70652770996094, "logps/rejected": -328.62469482421875, "loss": 1.1551, "rewards/accuracies": 0.875, "rewards/chosen": -5.446751117706299, "rewards/margins": 21.56125259399414, "rewards/rejected": -27.00800132751465, "step": 573 }, { "epoch": 0.46838025295797636, "grad_norm": 2.859369033103576e-06, "learning_rate": 7.428408321686541e-05, "logits/chosen": -5.923610210418701, "logits/rejected": -8.287321090698242, "logps/chosen": -109.8123550415039, "logps/rejected": -389.9000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.324112415313721, "rewards/margins": 28.522125244140625, "rewards/rejected": -32.84623718261719, "step": 574 }, { "epoch": 0.46919624643002855, "grad_norm": 9.908392530633137e-05, "learning_rate": 7.4185944355262e-05, "logits/chosen": -5.813706398010254, "logits/rejected": -7.628688812255859, "logps/chosen": -95.56623840332031, "logps/rejected": -311.1016845703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.3476507663726807, "rewards/margins": 21.888132095336914, "rewards/rejected": -24.235782623291016, "step": 575 }, { "epoch": 0.4700122399020808, "grad_norm": 0.00010639648826327175, "learning_rate": 7.408768370508576e-05, "logits/chosen": -6.654320240020752, "logits/rejected": -8.035578727722168, "logps/chosen": -95.1711654663086, "logps/rejected": -363.49798583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9631402492523193, "rewards/margins": 25.528213500976562, "rewards/rejected": -29.491352081298828, "step": 576 }, { "epoch": 0.470828233374133, "grad_norm": 1.8251647304623475e-07, "learning_rate": 7.398930176112927e-05, "logits/chosen": -7.104450702667236, "logits/rejected": -7.74190616607666, "logps/chosen": -113.87870788574219, "logps/rejected": -416.22882080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.194535732269287, "rewards/margins": 28.85607147216797, "rewards/rejected": -35.05060577392578, "step": 577 }, { "epoch": 0.47164422684618523, "grad_norm": 0.018204592168331146, "learning_rate": 7.389079901879579e-05, "logits/chosen": -5.9228515625, "logits/rejected": -7.314057350158691, "logps/chosen": -130.19349670410156, "logps/rejected": -392.01318359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.735382080078125, "rewards/margins": 25.225736618041992, "rewards/rejected": -32.96112060546875, "step": 578 }, { "epoch": 0.4724602203182375, "grad_norm": 2.8426875360310078e-06, "learning_rate": 7.379217597409688e-05, "logits/chosen": -6.567907333374023, "logits/rejected": -7.914875030517578, "logps/chosen": -93.01517486572266, "logps/rejected": -388.70550537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.3918185234069824, "rewards/margins": 29.491443634033203, "rewards/rejected": -32.883262634277344, "step": 579 }, { "epoch": 0.47327621379028967, "grad_norm": 0.0010487388353794813, "learning_rate": 7.369343312364993e-05, "logits/chosen": -7.1592559814453125, "logits/rejected": -8.169957160949707, "logps/chosen": -103.09225463867188, "logps/rejected": -319.879150390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.2819600105285645, "rewards/margins": 21.59095001220703, "rewards/rejected": -27.872907638549805, "step": 580 }, { "epoch": 0.4740922072623419, "grad_norm": 6.499951268779114e-05, "learning_rate": 7.35945709646756e-05, "logits/chosen": -6.2074666023254395, "logits/rejected": -7.2951579093933105, "logps/chosen": -96.55442810058594, "logps/rejected": -334.722412109375, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -4.950173854827881, "rewards/margins": 23.48088264465332, "rewards/rejected": -28.43105697631836, "step": 581 }, { "epoch": 0.4749082007343941, "grad_norm": 6.657076028204756e-06, "learning_rate": 7.349558999499527e-05, "logits/chosen": -5.602343559265137, "logits/rejected": -6.61866569519043, "logps/chosen": -98.41743469238281, "logps/rejected": -433.7593078613281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.680654525756836, "rewards/margins": 32.23064422607422, "rewards/rejected": -35.91130065917969, "step": 582 }, { "epoch": 0.47572419420644635, "grad_norm": 0.0005692525301128626, "learning_rate": 7.339649071302867e-05, "logits/chosen": -5.772888660430908, "logits/rejected": -8.102022171020508, "logps/chosen": -90.28707885742188, "logps/rejected": -355.6865539550781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.394608974456787, "rewards/margins": 25.991819381713867, "rewards/rejected": -30.386428833007812, "step": 583 }, { "epoch": 0.4765401876784986, "grad_norm": 3.4318647976760985e-06, "learning_rate": 7.329727361779124e-05, "logits/chosen": -6.10479736328125, "logits/rejected": -8.245504379272461, "logps/chosen": -109.7325439453125, "logps/rejected": -426.79022216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.182377815246582, "rewards/margins": 30.74917984008789, "rewards/rejected": -36.931556701660156, "step": 584 }, { "epoch": 0.4773561811505508, "grad_norm": 1.2704441360256169e-05, "learning_rate": 7.319793920889171e-05, "logits/chosen": -6.039206504821777, "logits/rejected": -7.4903950691223145, "logps/chosen": -80.34228515625, "logps/rejected": -378.27581787109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.257093906402588, "rewards/margins": 30.187358856201172, "rewards/rejected": -33.44445037841797, "step": 585 }, { "epoch": 0.47817217462260303, "grad_norm": 0.0032676432747393847, "learning_rate": 7.309848798652949e-05, "logits/chosen": -6.243011474609375, "logits/rejected": -6.939949035644531, "logps/chosen": -129.28219604492188, "logps/rejected": -344.04083251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.004813194274902, "rewards/margins": 20.761005401611328, "rewards/rejected": -27.76581573486328, "step": 586 }, { "epoch": 0.4789881680946552, "grad_norm": 0.00026835568132810295, "learning_rate": 7.299892045149226e-05, "logits/chosen": -5.415736198425293, "logits/rejected": -7.102700233459473, "logps/chosen": -124.19764709472656, "logps/rejected": -361.8546142578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.708080768585205, "rewards/margins": 23.924419403076172, "rewards/rejected": -30.632503509521484, "step": 587 }, { "epoch": 0.47980416156670747, "grad_norm": 4.772583906742511e-06, "learning_rate": 7.289923710515339e-05, "logits/chosen": -6.854109764099121, "logits/rejected": -8.263835906982422, "logps/chosen": -132.98214721679688, "logps/rejected": -416.856689453125, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -9.220958709716797, "rewards/margins": 27.714008331298828, "rewards/rejected": -36.934967041015625, "step": 588 }, { "epoch": 0.4806201550387597, "grad_norm": 0.007731916848570108, "learning_rate": 7.279943844946935e-05, "logits/chosen": -5.136406898498535, "logits/rejected": -7.025312423706055, "logps/chosen": -119.03328704833984, "logps/rejected": -368.68621826171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.674581527709961, "rewards/margins": 25.747745513916016, "rewards/rejected": -31.422327041625977, "step": 589 }, { "epoch": 0.4814361485108119, "grad_norm": 0.8601283431053162, "learning_rate": 7.269952498697734e-05, "logits/chosen": -5.52672815322876, "logits/rejected": -6.730932712554932, "logps/chosen": -120.97191619873047, "logps/rejected": -309.80999755859375, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -4.225407600402832, "rewards/margins": 19.834726333618164, "rewards/rejected": -24.060134887695312, "step": 590 }, { "epoch": 0.48225214198286415, "grad_norm": 8.18910894651026e-09, "learning_rate": 7.259949722079263e-05, "logits/chosen": -5.647915363311768, "logits/rejected": -7.318437576293945, "logps/chosen": -125.46420288085938, "logps/rejected": -406.3254699707031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.308499336242676, "rewards/margins": 28.686649322509766, "rewards/rejected": -33.995147705078125, "step": 591 }, { "epoch": 0.48306813545491634, "grad_norm": 1.3693871778741595e-06, "learning_rate": 7.249935565460607e-05, "logits/chosen": -6.699575901031494, "logits/rejected": -8.43157958984375, "logps/chosen": -133.1099090576172, "logps/rejected": -369.018798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.765563011169434, "rewards/margins": 25.61511993408203, "rewards/rejected": -31.38068389892578, "step": 592 }, { "epoch": 0.4838841289269686, "grad_norm": 2.1948940753936768, "learning_rate": 7.239910079268153e-05, "logits/chosen": -6.3711161613464355, "logits/rejected": -7.560609817504883, "logps/chosen": -150.9547119140625, "logps/rejected": -379.1116943359375, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -8.422013282775879, "rewards/margins": 22.188617706298828, "rewards/rejected": -30.61063003540039, "step": 593 }, { "epoch": 0.4847001223990208, "grad_norm": 4.947535489918664e-05, "learning_rate": 7.229873313985342e-05, "logits/chosen": -6.842009544372559, "logits/rejected": -8.77597713470459, "logps/chosen": -139.81451416015625, "logps/rejected": -341.21673583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.40705680847168, "rewards/margins": 20.657651901245117, "rewards/rejected": -29.064708709716797, "step": 594 }, { "epoch": 0.485516115871073, "grad_norm": 0.0004751038213726133, "learning_rate": 7.219825320152411e-05, "logits/chosen": -7.869336128234863, "logits/rejected": -8.951192855834961, "logps/chosen": -151.9659423828125, "logps/rejected": -402.841552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.862804412841797, "rewards/margins": 25.230323791503906, "rewards/rejected": -35.09312438964844, "step": 595 }, { "epoch": 0.48633210934312526, "grad_norm": 2.4284745450131595e-06, "learning_rate": 7.209766148366136e-05, "logits/chosen": -8.728713035583496, "logits/rejected": -9.325000762939453, "logps/chosen": -160.85177612304688, "logps/rejected": -433.53131103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.650712966918945, "rewards/margins": 26.210891723632812, "rewards/rejected": -37.861602783203125, "step": 596 }, { "epoch": 0.48714810281517745, "grad_norm": 0.0004085556138306856, "learning_rate": 7.199695849279576e-05, "logits/chosen": -8.956633567810059, "logits/rejected": -9.842212677001953, "logps/chosen": -170.4790802001953, "logps/rejected": -412.1809387207031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.582844734191895, "rewards/margins": 23.276565551757812, "rewards/rejected": -35.85940933227539, "step": 597 }, { "epoch": 0.4879640962872297, "grad_norm": 47.79097366333008, "learning_rate": 7.189614473601833e-05, "logits/chosen": -8.716723442077637, "logits/rejected": -10.474002838134766, "logps/chosen": -168.7913818359375, "logps/rejected": -394.55279541015625, "loss": 0.5957, "rewards/accuracies": 0.875, "rewards/chosen": -12.386531829833984, "rewards/margins": 22.790287017822266, "rewards/rejected": -35.17681884765625, "step": 598 }, { "epoch": 0.48878008975928194, "grad_norm": 0.0012269731378182769, "learning_rate": 7.179522072097774e-05, "logits/chosen": -7.52215576171875, "logits/rejected": -10.64630126953125, "logps/chosen": -200.81478881835938, "logps/rejected": -457.82049560546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.363096237182617, "rewards/margins": 25.428953170776367, "rewards/rejected": -39.792049407958984, "step": 599 }, { "epoch": 0.48959608323133413, "grad_norm": 2.5410246962564997e-05, "learning_rate": 7.169418695587791e-05, "logits/chosen": -9.784414291381836, "logits/rejected": -10.088153839111328, "logps/chosen": -183.90403747558594, "logps/rejected": -417.66845703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.376978874206543, "rewards/margins": 23.265846252441406, "rewards/rejected": -36.642826080322266, "step": 600 }, { "epoch": 0.4904120767033864, "grad_norm": 0.00038932624738663435, "learning_rate": 7.159304394947544e-05, "logits/chosen": -7.2167816162109375, "logits/rejected": -8.97376823425293, "logps/chosen": -167.1228485107422, "logps/rejected": -411.35003662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.98442554473877, "rewards/margins": 24.008676528930664, "rewards/rejected": -33.99310302734375, "step": 601 }, { "epoch": 0.49122807017543857, "grad_norm": 0.7698314785957336, "learning_rate": 7.149179221107694e-05, "logits/chosen": -8.475640296936035, "logits/rejected": -10.607635498046875, "logps/chosen": -147.94723510742188, "logps/rejected": -360.07366943359375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -8.574627876281738, "rewards/margins": 21.058635711669922, "rewards/rejected": -29.633264541625977, "step": 602 }, { "epoch": 0.4920440636474908, "grad_norm": 8.661454200744629, "learning_rate": 7.139043225053665e-05, "logits/chosen": -8.459619522094727, "logits/rejected": -11.502706527709961, "logps/chosen": -149.743896484375, "logps/rejected": -368.5150451660156, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": -9.621954917907715, "rewards/margins": 20.81463623046875, "rewards/rejected": -30.43659210205078, "step": 603 }, { "epoch": 0.49286005711954306, "grad_norm": 0.0010924426605924964, "learning_rate": 7.128896457825364e-05, "logits/chosen": -9.539922714233398, "logits/rejected": -11.904350280761719, "logps/chosen": -145.6488800048828, "logps/rejected": -358.3431396484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.096936225891113, "rewards/margins": 21.75282096862793, "rewards/rejected": -29.84975814819336, "step": 604 }, { "epoch": 0.49367605059159525, "grad_norm": 0.02530396357178688, "learning_rate": 7.118738970516944e-05, "logits/chosen": -8.491678237915039, "logits/rejected": -11.324287414550781, "logps/chosen": -120.07656860351562, "logps/rejected": -379.254150390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.376715660095215, "rewards/margins": 25.281370162963867, "rewards/rejected": -31.658084869384766, "step": 605 }, { "epoch": 0.4944920440636475, "grad_norm": 0.008351127617061138, "learning_rate": 7.108570814276539e-05, "logits/chosen": -8.621748924255371, "logits/rejected": -10.992010116577148, "logps/chosen": -121.52318572998047, "logps/rejected": -343.42669677734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.335441589355469, "rewards/margins": 22.85108184814453, "rewards/rejected": -29.1865234375, "step": 606 }, { "epoch": 0.49530803753569974, "grad_norm": 0.3963375985622406, "learning_rate": 7.098392040306001e-05, "logits/chosen": -10.324615478515625, "logits/rejected": -13.194280624389648, "logps/chosen": -132.12937927246094, "logps/rejected": -306.8707580566406, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -8.363182067871094, "rewards/margins": 17.62933349609375, "rewards/rejected": -25.992515563964844, "step": 607 }, { "epoch": 0.49612403100775193, "grad_norm": 2.7606935501098633, "learning_rate": 7.088202699860656e-05, "logits/chosen": -10.307616233825684, "logits/rejected": -12.713693618774414, "logps/chosen": -149.9664306640625, "logps/rejected": -335.6012268066406, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -9.179362297058105, "rewards/margins": 18.38833236694336, "rewards/rejected": -27.567691802978516, "step": 608 }, { "epoch": 0.4969400244798042, "grad_norm": 0.08082493394613266, "learning_rate": 7.078002844249032e-05, "logits/chosen": -9.9180908203125, "logits/rejected": -13.316540718078613, "logps/chosen": -118.09566497802734, "logps/rejected": -364.57989501953125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.28568696975708, "rewards/margins": 24.914840698242188, "rewards/rejected": -30.20052719116211, "step": 609 }, { "epoch": 0.49775601795185637, "grad_norm": 0.06311525404453278, "learning_rate": 7.067792524832604e-05, "logits/chosen": -10.13919448852539, "logits/rejected": -13.21667194366455, "logps/chosen": -107.24586486816406, "logps/rejected": -377.2168884277344, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.737684726715088, "rewards/margins": 25.881637573242188, "rewards/rejected": -31.61932373046875, "step": 610 }, { "epoch": 0.4985720114239086, "grad_norm": 0.03008950687944889, "learning_rate": 7.057571793025544e-05, "logits/chosen": -10.251066207885742, "logits/rejected": -12.699479103088379, "logps/chosen": -123.96683502197266, "logps/rejected": -363.3095397949219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.234539985656738, "rewards/margins": 22.783206939697266, "rewards/rejected": -30.017745971679688, "step": 611 }, { "epoch": 0.49938800489596086, "grad_norm": 7.634774479292616e-10, "learning_rate": 7.047340700294453e-05, "logits/chosen": -10.4700927734375, "logits/rejected": -15.345464706420898, "logps/chosen": -128.02684020996094, "logps/rejected": -443.4031677246094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.0386505126953125, "rewards/margins": 30.525291442871094, "rewards/rejected": -37.563941955566406, "step": 612 }, { "epoch": 0.5002039983680131, "grad_norm": 0.08059186488389969, "learning_rate": 7.037099298158103e-05, "logits/chosen": -9.197421073913574, "logits/rejected": -13.529272079467773, "logps/chosen": -103.67757415771484, "logps/rejected": -373.07366943359375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.083804130554199, "rewards/margins": 27.674694061279297, "rewards/rejected": -31.758499145507812, "step": 613 }, { "epoch": 0.5010199918400653, "grad_norm": 1.4168600159791822e-07, "learning_rate": 7.02684763818718e-05, "logits/chosen": -9.728713989257812, "logits/rejected": -14.518499374389648, "logps/chosen": -109.76725769042969, "logps/rejected": -384.80682373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.855955600738525, "rewards/margins": 27.81747817993164, "rewards/rejected": -32.67343521118164, "step": 614 }, { "epoch": 0.5018359853121175, "grad_norm": 0.2905903458595276, "learning_rate": 7.016585772004026e-05, "logits/chosen": -10.924501419067383, "logits/rejected": -15.272537231445312, "logps/chosen": -113.51898956298828, "logps/rejected": -403.49371337890625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -5.684210777282715, "rewards/margins": 28.696348190307617, "rewards/rejected": -34.38056182861328, "step": 615 }, { "epoch": 0.5026519787841697, "grad_norm": 4.718088803201681e-06, "learning_rate": 7.006313751282372e-05, "logits/chosen": -10.720643997192383, "logits/rejected": -14.637575149536133, "logps/chosen": -120.71696472167969, "logps/rejected": -361.4962158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.2003397941589355, "rewards/margins": 23.867834091186523, "rewards/rejected": -30.06817626953125, "step": 616 }, { "epoch": 0.503467972256222, "grad_norm": 185.02288818359375, "learning_rate": 6.996031627747085e-05, "logits/chosen": -9.813997268676758, "logits/rejected": -15.42113971710205, "logps/chosen": -160.25743103027344, "logps/rejected": -502.6247253417969, "loss": 8.6406, "rewards/accuracies": 0.875, "rewards/chosen": -9.836695671081543, "rewards/margins": 33.51651382446289, "rewards/rejected": -43.35321044921875, "step": 617 }, { "epoch": 0.5042839657282742, "grad_norm": 4.4423759115730377e-10, "learning_rate": 6.985739453173903e-05, "logits/chosen": -11.404342651367188, "logits/rejected": -15.540081977844238, "logps/chosen": -110.156005859375, "logps/rejected": -463.49700927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.887096881866455, "rewards/margins": 33.80358123779297, "rewards/rejected": -38.69068145751953, "step": 618 }, { "epoch": 0.5050999592003264, "grad_norm": 4.5986809730529785, "learning_rate": 6.975437279389181e-05, "logits/chosen": -9.246354103088379, "logits/rejected": -12.794147491455078, "logps/chosen": -86.3014907836914, "logps/rejected": -407.75115966796875, "loss": 0.0376, "rewards/accuracies": 1.0, "rewards/chosen": -3.987774610519409, "rewards/margins": 28.295886993408203, "rewards/rejected": -32.283660888671875, "step": 619 }, { "epoch": 0.5059159526723787, "grad_norm": 2.3762001991271973, "learning_rate": 6.965125158269619e-05, "logits/chosen": -5.5720014572143555, "logits/rejected": -10.904889106750488, "logps/chosen": -64.48480987548828, "logps/rejected": -291.9510192871094, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -1.6649460792541504, "rewards/margins": 22.413589477539062, "rewards/rejected": -24.078535079956055, "step": 620 }, { "epoch": 0.5067319461444308, "grad_norm": 4.421212196350098, "learning_rate": 6.954803141742009e-05, "logits/chosen": -6.679623126983643, "logits/rejected": -12.448347091674805, "logps/chosen": -103.70945739746094, "logps/rejected": -348.6783447265625, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": -3.4926199913024902, "rewards/margins": 25.803674697875977, "rewards/rejected": -29.296297073364258, "step": 621 }, { "epoch": 0.507547939616483, "grad_norm": 0.00014185853069648147, "learning_rate": 6.944471281782975e-05, "logits/chosen": -9.056452751159668, "logits/rejected": -15.06635856628418, "logps/chosen": -113.93928527832031, "logps/rejected": -427.1973571777344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.374663352966309, "rewards/margins": 31.26557159423828, "rewards/rejected": -37.640235900878906, "step": 622 }, { "epoch": 0.5083639330885353, "grad_norm": 2.381420199526474e-05, "learning_rate": 6.934129630418701e-05, "logits/chosen": -10.172235488891602, "logits/rejected": -14.913277626037598, "logps/chosen": -118.11848449707031, "logps/rejected": -410.7361145019531, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.9842047691345215, "rewards/margins": 30.892200469970703, "rewards/rejected": -35.87640380859375, "step": 623 }, { "epoch": 0.5091799265605875, "grad_norm": 2.2431757315644063e-05, "learning_rate": 6.92377823972468e-05, "logits/chosen": -11.805821418762207, "logits/rejected": -15.529184341430664, "logps/chosen": -137.88449096679688, "logps/rejected": -446.09613037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.259939193725586, "rewards/margins": 30.926603317260742, "rewards/rejected": -39.186546325683594, "step": 624 }, { "epoch": 0.5099959200326397, "grad_norm": 0.00046490025124512613, "learning_rate": 6.91341716182545e-05, "logits/chosen": -12.130410194396973, "logits/rejected": -15.064712524414062, "logps/chosen": -196.2427978515625, "logps/rejected": -575.9134521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.651397705078125, "rewards/margins": 36.46014404296875, "rewards/rejected": -50.111541748046875, "step": 625 }, { "epoch": 0.510811913504692, "grad_norm": 0.00038839338230900466, "learning_rate": 6.903046448894322e-05, "logits/chosen": -14.961433410644531, "logits/rejected": -16.054306030273438, "logps/chosen": -207.11932373046875, "logps/rejected": -455.13739013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.99285888671875, "rewards/margins": 22.427383422851562, "rewards/rejected": -39.42024230957031, "step": 626 }, { "epoch": 0.5116279069767442, "grad_norm": 0.21753999590873718, "learning_rate": 6.892666153153129e-05, "logits/chosen": -13.41496753692627, "logits/rejected": -15.091075897216797, "logps/chosen": -266.24530029296875, "logps/rejected": -491.8201904296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -20.82952117919922, "rewards/margins": 21.391887664794922, "rewards/rejected": -42.22140884399414, "step": 627 }, { "epoch": 0.5124439004487964, "grad_norm": 1.1152191162109375, "learning_rate": 6.88227632687196e-05, "logits/chosen": -13.901987075805664, "logits/rejected": -15.668777465820312, "logps/chosen": -289.2164306640625, "logps/rejected": -512.5906982421875, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -21.977561950683594, "rewards/margins": 23.24602508544922, "rewards/rejected": -45.22358703613281, "step": 628 }, { "epoch": 0.5132598939208486, "grad_norm": 0.0019230297766625881, "learning_rate": 6.871877022368891e-05, "logits/chosen": -13.85743522644043, "logits/rejected": -14.84681510925293, "logps/chosen": -212.78677368164062, "logps/rejected": -506.1865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.600668907165527, "rewards/margins": 27.836788177490234, "rewards/rejected": -43.43745803833008, "step": 629 }, { "epoch": 0.5140758873929009, "grad_norm": 2.728959771047812e-06, "learning_rate": 6.861468292009727e-05, "logits/chosen": -12.467726707458496, "logits/rejected": -14.629349708557129, "logps/chosen": -194.70799255371094, "logps/rejected": -478.5106201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.954286575317383, "rewards/margins": 28.751527786254883, "rewards/rejected": -41.70581817626953, "step": 630 }, { "epoch": 0.5148918808649531, "grad_norm": 7.764065779181095e-11, "learning_rate": 6.851050188207737e-05, "logits/chosen": -11.817378997802734, "logits/rejected": -14.181836128234863, "logps/chosen": -153.39466857910156, "logps/rejected": -531.8952026367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.260488510131836, "rewards/margins": 37.72230529785156, "rewards/rejected": -45.982791900634766, "step": 631 }, { "epoch": 0.5157078743370053, "grad_norm": 0.0009053711546584964, "learning_rate": 6.840622763423391e-05, "logits/chosen": -13.111361503601074, "logits/rejected": -15.150406837463379, "logps/chosen": -197.35340881347656, "logps/rejected": -454.9023742675781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.804797172546387, "rewards/margins": 26.07158851623535, "rewards/rejected": -40.87638473510742, "step": 632 }, { "epoch": 0.5165238678090576, "grad_norm": 9.852879156824201e-05, "learning_rate": 6.830186070164094e-05, "logits/chosen": -13.522668838500977, "logits/rejected": -14.539219856262207, "logps/chosen": -259.3019714355469, "logps/rejected": -516.60888671875, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -20.710525512695312, "rewards/margins": 23.67288589477539, "rewards/rejected": -44.38341522216797, "step": 633 }, { "epoch": 0.5173398612811098, "grad_norm": 0.0007006657542660832, "learning_rate": 6.819740160983923e-05, "logits/chosen": -12.36196517944336, "logits/rejected": -15.310734748840332, "logps/chosen": -198.74996948242188, "logps/rejected": -539.371337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.523571014404297, "rewards/margins": 33.77357482910156, "rewards/rejected": -48.297142028808594, "step": 634 }, { "epoch": 0.518155854753162, "grad_norm": 4.968931079929462e-07, "learning_rate": 6.809285088483362e-05, "logits/chosen": -12.419576644897461, "logits/rejected": -15.535836219787598, "logps/chosen": -200.1110382080078, "logps/rejected": -477.5141906738281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.620108604431152, "rewards/margins": 31.482988357543945, "rewards/rejected": -43.10309600830078, "step": 635 }, { "epoch": 0.5189718482252142, "grad_norm": 0.009185935370624065, "learning_rate": 6.798820905309036e-05, "logits/chosen": -11.448163986206055, "logits/rejected": -14.248616218566895, "logps/chosen": -167.94715881347656, "logps/rejected": -426.27728271484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.12446403503418, "rewards/margins": 27.36431884765625, "rewards/rejected": -37.48878479003906, "step": 636 }, { "epoch": 0.5197878416972664, "grad_norm": 0.10505910962820053, "learning_rate": 6.788347664153447e-05, "logits/chosen": -11.589506149291992, "logits/rejected": -13.171876907348633, "logps/chosen": -242.18247985839844, "logps/rejected": -478.3992614746094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -17.866626739501953, "rewards/margins": 23.862308502197266, "rewards/rejected": -41.72893524169922, "step": 637 }, { "epoch": 0.5206038351693186, "grad_norm": 2.783336867651087e-07, "learning_rate": 6.77786541775471e-05, "logits/chosen": -12.054291725158691, "logits/rejected": -13.134427070617676, "logps/chosen": -153.81753540039062, "logps/rejected": -487.12860107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.186903953552246, "rewards/margins": 33.45975875854492, "rewards/rejected": -42.646663665771484, "step": 638 }, { "epoch": 0.5214198286413708, "grad_norm": 0.008774095214903355, "learning_rate": 6.767374218896286e-05, "logits/chosen": -11.854328155517578, "logits/rejected": -14.064948081970215, "logps/chosen": -227.65194702148438, "logps/rejected": -531.4483642578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.554947853088379, "rewards/margins": 30.89764404296875, "rewards/rejected": -46.45259094238281, "step": 639 }, { "epoch": 0.5222358221134231, "grad_norm": 1.5577953371703757e-09, "learning_rate": 6.756874120406714e-05, "logits/chosen": -11.573383331298828, "logits/rejected": -13.813871383666992, "logps/chosen": -209.78424072265625, "logps/rejected": -529.3743896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.308801651000977, "rewards/margins": 33.09150695800781, "rewards/rejected": -47.40031051635742, "step": 640 }, { "epoch": 0.5230518155854753, "grad_norm": 0.015064050443470478, "learning_rate": 6.746365175159348e-05, "logits/chosen": -12.636956214904785, "logits/rejected": -13.992265701293945, "logps/chosen": -183.97369384765625, "logps/rejected": -522.031005859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -13.088376998901367, "rewards/margins": 32.954586029052734, "rewards/rejected": -46.042964935302734, "step": 641 }, { "epoch": 0.5238678090575275, "grad_norm": 2.2924355747022673e-09, "learning_rate": 6.735847436072094e-05, "logits/chosen": -11.895618438720703, "logits/rejected": -13.803711891174316, "logps/chosen": -163.71255493164062, "logps/rejected": -500.1565246582031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.814990997314453, "rewards/margins": 34.64254379272461, "rewards/rejected": -45.45753479003906, "step": 642 }, { "epoch": 0.5246838025295798, "grad_norm": 0.00024848588509485126, "learning_rate": 6.725320956107131e-05, "logits/chosen": -12.154273986816406, "logits/rejected": -13.262405395507812, "logps/chosen": -237.25936889648438, "logps/rejected": -493.40936279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.07831382751465, "rewards/margins": 25.625938415527344, "rewards/rejected": -43.704254150390625, "step": 643 }, { "epoch": 0.525499796001632, "grad_norm": 1.5581237633455203e-08, "learning_rate": 6.714785788270658e-05, "logits/chosen": -11.731143951416016, "logits/rejected": -13.519087791442871, "logps/chosen": -150.7329864501953, "logps/rejected": -501.54974365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.29079818725586, "rewards/margins": 34.08597183227539, "rewards/rejected": -44.37677001953125, "step": 644 }, { "epoch": 0.5263157894736842, "grad_norm": 9.82433795928955, "learning_rate": 6.704241985612625e-05, "logits/chosen": -11.957530975341797, "logits/rejected": -12.659502029418945, "logps/chosen": -197.7490234375, "logps/rejected": -448.332275390625, "loss": 0.1064, "rewards/accuracies": 1.0, "rewards/chosen": -15.845063209533691, "rewards/margins": 24.446928024291992, "rewards/rejected": -40.2919921875, "step": 645 }, { "epoch": 0.5271317829457365, "grad_norm": 3.874019057548139e-06, "learning_rate": 6.693689601226458e-05, "logits/chosen": -11.17598819732666, "logits/rejected": -12.866371154785156, "logps/chosen": -153.3990936279297, "logps/rejected": -455.2225341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.819023132324219, "rewards/margins": 29.921030044555664, "rewards/rejected": -40.74005126953125, "step": 646 }, { "epoch": 0.5279477764177887, "grad_norm": 1.0998886864399537e-05, "learning_rate": 6.683128688248795e-05, "logits/chosen": -11.219449043273926, "logits/rejected": -13.265613555908203, "logps/chosen": -153.4649200439453, "logps/rejected": -497.50274658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.243898391723633, "rewards/margins": 34.238197326660156, "rewards/rejected": -43.482093811035156, "step": 647 }, { "epoch": 0.5287637698898409, "grad_norm": 6.1370510593405925e-06, "learning_rate": 6.672559299859228e-05, "logits/chosen": -10.264692306518555, "logits/rejected": -12.134771347045898, "logps/chosen": -198.69863891601562, "logps/rejected": -457.66571044921875, "loss": 0.3466, "rewards/accuracies": 1.0, "rewards/chosen": -14.591754913330078, "rewards/margins": 25.781692504882812, "rewards/rejected": -40.37344741821289, "step": 648 }, { "epoch": 0.529579763361893, "grad_norm": 4.353623711539356e-11, "learning_rate": 6.661981489280016e-05, "logits/chosen": -9.718801498413086, "logits/rejected": -13.200424194335938, "logps/chosen": -155.76321411132812, "logps/rejected": -503.068603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.559229850769043, "rewards/margins": 35.37504577636719, "rewards/rejected": -44.93427276611328, "step": 649 }, { "epoch": 0.5303957568339454, "grad_norm": 0.00014406382979359478, "learning_rate": 6.651395309775837e-05, "logits/chosen": -10.669112205505371, "logits/rejected": -12.6686372756958, "logps/chosen": -163.03163146972656, "logps/rejected": -442.668701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.084558486938477, "rewards/margins": 29.46227264404297, "rewards/rejected": -39.54683303833008, "step": 650 }, { "epoch": 0.5312117503059975, "grad_norm": 1.983715947062592e-06, "learning_rate": 6.640800814653503e-05, "logits/chosen": -8.798725128173828, "logits/rejected": -11.132282257080078, "logps/chosen": -112.66226959228516, "logps/rejected": -478.228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.579124450683594, "rewards/margins": 37.4599494934082, "rewards/rejected": -42.0390739440918, "step": 651 }, { "epoch": 0.5320277437780497, "grad_norm": 0.20768195390701294, "learning_rate": 6.63019805726171e-05, "logits/chosen": -12.35297966003418, "logits/rejected": -14.067665100097656, "logps/chosen": -164.1319580078125, "logps/rejected": -458.08203125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -11.895299911499023, "rewards/margins": 29.712661743164062, "rewards/rejected": -41.60796356201172, "step": 652 }, { "epoch": 0.532843737250102, "grad_norm": 0.0008311850251629949, "learning_rate": 6.619587090990748e-05, "logits/chosen": -11.836203575134277, "logits/rejected": -11.024471282958984, "logps/chosen": -117.68061065673828, "logps/rejected": -462.6477355957031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.951277256011963, "rewards/margins": 31.826913833618164, "rewards/rejected": -39.77819061279297, "step": 653 }, { "epoch": 0.5336597307221542, "grad_norm": 0.0003438894054852426, "learning_rate": 6.608967969272248e-05, "logits/chosen": -11.231639862060547, "logits/rejected": -11.947975158691406, "logps/chosen": -146.9644775390625, "logps/rejected": -429.10205078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.01382064819336, "rewards/margins": 27.66033363342285, "rewards/rejected": -37.674156188964844, "step": 654 }, { "epoch": 0.5344757241942064, "grad_norm": 0.006351151969283819, "learning_rate": 6.598340745578909e-05, "logits/chosen": -11.486856460571289, "logits/rejected": -11.724233627319336, "logps/chosen": -147.02413940429688, "logps/rejected": -478.24053955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.25082778930664, "rewards/margins": 31.921340942382812, "rewards/rejected": -40.17217254638672, "step": 655 }, { "epoch": 0.5352917176662587, "grad_norm": 7.196657634267467e-07, "learning_rate": 6.587705473424222e-05, "logits/chosen": -11.328856468200684, "logits/rejected": -11.26856803894043, "logps/chosen": -115.53678894042969, "logps/rejected": -474.85498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.9446258544921875, "rewards/margins": 34.614288330078125, "rewards/rejected": -40.55891418457031, "step": 656 }, { "epoch": 0.5361077111383109, "grad_norm": 0.11345303058624268, "learning_rate": 6.577062206362215e-05, "logits/chosen": -11.10736083984375, "logits/rejected": -12.649262428283691, "logps/chosen": -127.39744567871094, "logps/rejected": -436.7554931640625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.665700912475586, "rewards/margins": 31.3900146484375, "rewards/rejected": -39.05571746826172, "step": 657 }, { "epoch": 0.5369237046103631, "grad_norm": 0.7017265558242798, "learning_rate": 6.566410997987163e-05, "logits/chosen": -10.93345832824707, "logits/rejected": -11.578706741333008, "logps/chosen": -113.32836151123047, "logps/rejected": -445.9029541015625, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -6.261209964752197, "rewards/margins": 32.87615203857422, "rewards/rejected": -39.13736343383789, "step": 658 }, { "epoch": 0.5377396980824154, "grad_norm": 7.528967398684472e-05, "learning_rate": 6.555751901933342e-05, "logits/chosen": -12.426508903503418, "logits/rejected": -12.3074951171875, "logps/chosen": -100.32259368896484, "logps/rejected": -450.89459228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.04624080657959, "rewards/margins": 34.18479919433594, "rewards/rejected": -39.23103713989258, "step": 659 }, { "epoch": 0.5385556915544676, "grad_norm": 5.217513532329576e-09, "learning_rate": 6.545084971874738e-05, "logits/chosen": -11.418357849121094, "logits/rejected": -12.374424934387207, "logps/chosen": -112.9708251953125, "logps/rejected": -464.2666320800781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.518621444702148, "rewards/margins": 33.565406799316406, "rewards/rejected": -41.08403015136719, "step": 660 }, { "epoch": 0.5393716850265198, "grad_norm": 4.032056949654361e-06, "learning_rate": 6.534410261524786e-05, "logits/chosen": -10.857662200927734, "logits/rejected": -11.1044282913208, "logps/chosen": -108.22480010986328, "logps/rejected": -451.22357177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.99286413192749, "rewards/margins": 33.15734100341797, "rewards/rejected": -38.15020751953125, "step": 661 }, { "epoch": 0.540187678498572, "grad_norm": 1.67350108881692e-07, "learning_rate": 6.523727824636104e-05, "logits/chosen": -10.421466827392578, "logits/rejected": -11.290205001831055, "logps/chosen": -85.93537902832031, "logps/rejected": -403.68792724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5362389087677, "rewards/margins": 32.31285858154297, "rewards/rejected": -35.849098205566406, "step": 662 }, { "epoch": 0.5410036719706243, "grad_norm": 1.774798363829433e-10, "learning_rate": 6.513037715000209e-05, "logits/chosen": -11.041996002197266, "logits/rejected": -12.19951343536377, "logps/chosen": -88.71807861328125, "logps/rejected": -510.3374328613281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6204535961151123, "rewards/margins": 41.890785217285156, "rewards/rejected": -45.51123809814453, "step": 663 }, { "epoch": 0.5418196654426765, "grad_norm": 1.3618863192732533e-07, "learning_rate": 6.50233998644726e-05, "logits/chosen": -11.564818382263184, "logits/rejected": -12.731621742248535, "logps/chosen": -130.14422607421875, "logps/rejected": -536.9296264648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.7076897621154785, "rewards/margins": 40.74969482421875, "rewards/rejected": -48.4573860168457, "step": 664 }, { "epoch": 0.5426356589147286, "grad_norm": 0.0033253494184464216, "learning_rate": 6.49163469284578e-05, "logits/chosen": -11.82891845703125, "logits/rejected": -12.669678688049316, "logps/chosen": -102.91777801513672, "logps/rejected": -426.1036376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.122319221496582, "rewards/margins": 31.35201644897461, "rewards/rejected": -37.47433853149414, "step": 665 }, { "epoch": 0.543451652386781, "grad_norm": 4.267010353942169e-06, "learning_rate": 6.48092188810239e-05, "logits/chosen": -11.883503913879395, "logits/rejected": -13.011443138122559, "logps/chosen": -94.73567199707031, "logps/rejected": -437.1458740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.581737518310547, "rewards/margins": 34.58277130126953, "rewards/rejected": -39.164512634277344, "step": 666 }, { "epoch": 0.5442676458588331, "grad_norm": 2.14593200098534e-07, "learning_rate": 6.47020162616152e-05, "logits/chosen": -12.200760841369629, "logits/rejected": -12.942585945129395, "logps/chosen": -99.98588562011719, "logps/rejected": -493.34375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.633606433868408, "rewards/margins": 39.52996826171875, "rewards/rejected": -44.16357421875, "step": 667 }, { "epoch": 0.5450836393308853, "grad_norm": 4.7984517692611917e-08, "learning_rate": 6.459473961005168e-05, "logits/chosen": -11.396636962890625, "logits/rejected": -12.765892028808594, "logps/chosen": -103.5709228515625, "logps/rejected": -409.94091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.807588577270508, "rewards/margins": 30.13881492614746, "rewards/rejected": -35.94640350341797, "step": 668 }, { "epoch": 0.5458996328029376, "grad_norm": 0.03522012010216713, "learning_rate": 6.448738946652597e-05, "logits/chosen": -12.022494316101074, "logits/rejected": -12.402336120605469, "logps/chosen": -85.44802856445312, "logps/rejected": -410.5926818847656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.561415672302246, "rewards/margins": 32.307437896728516, "rewards/rejected": -35.86885070800781, "step": 669 }, { "epoch": 0.5467156262749898, "grad_norm": 0.000667052692733705, "learning_rate": 6.437996637160087e-05, "logits/chosen": -10.592529296875, "logits/rejected": -11.267663955688477, "logps/chosen": -85.44790649414062, "logps/rejected": -402.5139465332031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.053218126296997, "rewards/margins": 32.1084098815918, "rewards/rejected": -35.16162872314453, "step": 670 }, { "epoch": 0.547531619747042, "grad_norm": 1.882626605720361e-07, "learning_rate": 6.427247086620647e-05, "logits/chosen": -12.043863296508789, "logits/rejected": -12.500633239746094, "logps/chosen": -88.21500396728516, "logps/rejected": -458.11663818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.753880977630615, "rewards/margins": 36.5698356628418, "rewards/rejected": -41.32371520996094, "step": 671 }, { "epoch": 0.5483476132190942, "grad_norm": 0.0001566860155435279, "learning_rate": 6.416490349163748e-05, "logits/chosen": -10.979511260986328, "logits/rejected": -12.336616516113281, "logps/chosen": -127.75448608398438, "logps/rejected": -428.675048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.447306156158447, "rewards/margins": 27.7928409576416, "rewards/rejected": -35.24014663696289, "step": 672 }, { "epoch": 0.5491636066911465, "grad_norm": 2.0624618173314957e-06, "learning_rate": 6.405726478955054e-05, "logits/chosen": -11.750930786132812, "logits/rejected": -13.473167419433594, "logps/chosen": -129.36863708496094, "logps/rejected": -439.19232177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.569822788238525, "rewards/margins": 32.641639709472656, "rewards/rejected": -39.211463928222656, "step": 673 }, { "epoch": 0.5499796001631987, "grad_norm": 3.495085465488046e-08, "learning_rate": 6.394955530196147e-05, "logits/chosen": -10.951770782470703, "logits/rejected": -11.966653823852539, "logps/chosen": -76.65948486328125, "logps/rejected": -472.3620300292969, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.8437690734863281, "rewards/margins": 38.85121154785156, "rewards/rejected": -40.69498062133789, "step": 674 }, { "epoch": 0.5507955936352509, "grad_norm": 5.916060672461754e-06, "learning_rate": 6.384177557124247e-05, "logits/chosen": -12.058682441711426, "logits/rejected": -11.682610511779785, "logps/chosen": -104.3941879272461, "logps/rejected": -485.4424133300781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.6833600997924805, "rewards/margins": 35.570308685302734, "rewards/rejected": -40.25366973876953, "step": 675 }, { "epoch": 0.5516115871073032, "grad_norm": 6.51620624125826e-09, "learning_rate": 6.373392614011952e-05, "logits/chosen": -11.403972625732422, "logits/rejected": -12.491090774536133, "logps/chosen": -83.69111633300781, "logps/rejected": -467.27587890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2870283126831055, "rewards/margins": 38.79072189331055, "rewards/rejected": -42.0777473449707, "step": 676 }, { "epoch": 0.5524275805793554, "grad_norm": 0.25904056429862976, "learning_rate": 6.362600755166953e-05, "logits/chosen": -11.359214782714844, "logits/rejected": -12.16591739654541, "logps/chosen": -95.8244857788086, "logps/rejected": -350.9337158203125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -5.4290771484375, "rewards/margins": 24.371551513671875, "rewards/rejected": -29.800628662109375, "step": 677 }, { "epoch": 0.5532435740514076, "grad_norm": 2.0735144504180347e-11, "learning_rate": 6.351802034931765e-05, "logits/chosen": -11.496862411499023, "logits/rejected": -12.55332088470459, "logps/chosen": -100.97088623046875, "logps/rejected": -462.46051025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.9798457622528076, "rewards/margins": 37.78905487060547, "rewards/rejected": -40.768898010253906, "step": 678 }, { "epoch": 0.5540595675234599, "grad_norm": 0.0003695300256367773, "learning_rate": 6.340996507683458e-05, "logits/chosen": -11.075685501098633, "logits/rejected": -11.552314758300781, "logps/chosen": -112.61575317382812, "logps/rejected": -430.391845703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.496854782104492, "rewards/margins": 32.48323059082031, "rewards/rejected": -36.98008728027344, "step": 679 }, { "epoch": 0.554875560995512, "grad_norm": 5.532898583737733e-08, "learning_rate": 6.330184227833376e-05, "logits/chosen": -11.999592781066895, "logits/rejected": -12.707901954650879, "logps/chosen": -96.80068969726562, "logps/rejected": -450.478759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.6346585750579834, "rewards/margins": 36.01194763183594, "rewards/rejected": -38.646610260009766, "step": 680 }, { "epoch": 0.5556915544675642, "grad_norm": 1.8303733213542728e-06, "learning_rate": 6.319365249826865e-05, "logits/chosen": -10.718677520751953, "logits/rejected": -12.50559139251709, "logps/chosen": -90.61520385742188, "logps/rejected": -421.4306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6440653800964355, "rewards/margins": 32.48713302612305, "rewards/rejected": -36.13119888305664, "step": 681 }, { "epoch": 0.5565075479396164, "grad_norm": 6.506265322059335e-07, "learning_rate": 6.308539628143e-05, "logits/chosen": -12.387819290161133, "logits/rejected": -13.785852432250977, "logps/chosen": -116.88565063476562, "logps/rejected": -488.5749816894531, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.950189590454102, "rewards/margins": 36.45252227783203, "rewards/rejected": -42.402713775634766, "step": 682 }, { "epoch": 0.5573235414116687, "grad_norm": 0.054917316883802414, "learning_rate": 6.297707417294313e-05, "logits/chosen": -12.27040958404541, "logits/rejected": -12.129751205444336, "logps/chosen": -147.89163208007812, "logps/rejected": -470.3831787109375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.043220520019531, "rewards/margins": 32.21288299560547, "rewards/rejected": -41.256103515625, "step": 683 }, { "epoch": 0.5581395348837209, "grad_norm": 1.770465440920077e-09, "learning_rate": 6.286868671826512e-05, "logits/chosen": -11.323878288269043, "logits/rejected": -12.887749671936035, "logps/chosen": -82.95848083496094, "logps/rejected": -534.1513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.5002799034118652, "rewards/margins": 43.530006408691406, "rewards/rejected": -46.0302848815918, "step": 684 }, { "epoch": 0.5589555283557731, "grad_norm": 2.963185075088859e-08, "learning_rate": 6.276023446318213e-05, "logits/chosen": -13.522725105285645, "logits/rejected": -13.13780403137207, "logps/chosen": -119.88855743408203, "logps/rejected": -520.277587890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.943012237548828, "rewards/margins": 38.85814666748047, "rewards/rejected": -44.8011589050293, "step": 685 }, { "epoch": 0.5597715218278254, "grad_norm": 0.006697890814393759, "learning_rate": 6.265171795380659e-05, "logits/chosen": -11.681096076965332, "logits/rejected": -12.643136024475098, "logps/chosen": -100.18924713134766, "logps/rejected": -425.362060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.501147747039795, "rewards/margins": 31.859203338623047, "rewards/rejected": -37.3603515625, "step": 686 }, { "epoch": 0.5605875152998776, "grad_norm": 0.04377106949687004, "learning_rate": 6.254313773657455e-05, "logits/chosen": -13.499939918518066, "logits/rejected": -13.203235626220703, "logps/chosen": -125.46604919433594, "logps/rejected": -547.0223999023438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.364560127258301, "rewards/margins": 41.37193298339844, "rewards/rejected": -47.73649597167969, "step": 687 }, { "epoch": 0.5614035087719298, "grad_norm": 4.189074365168466e-10, "learning_rate": 6.243449435824276e-05, "logits/chosen": -12.058494567871094, "logits/rejected": -12.839855194091797, "logps/chosen": -108.78173065185547, "logps/rejected": -538.2767944335938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.2668585777282715, "rewards/margins": 41.936798095703125, "rewards/rejected": -47.20365524291992, "step": 688 }, { "epoch": 0.5622195022439821, "grad_norm": 0.0007223335560411215, "learning_rate": 6.232578836588608e-05, "logits/chosen": -12.06602954864502, "logits/rejected": -12.817649841308594, "logps/chosen": -109.71942901611328, "logps/rejected": -404.36395263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.545543670654297, "rewards/margins": 29.84206771850586, "rewards/rejected": -35.387611389160156, "step": 689 }, { "epoch": 0.5630354957160343, "grad_norm": 1.918447196658235e-07, "learning_rate": 6.22170203068947e-05, "logits/chosen": -12.282931327819824, "logits/rejected": -12.37342643737793, "logps/chosen": -124.6962890625, "logps/rejected": -489.770751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.4587788581848145, "rewards/margins": 36.937313079833984, "rewards/rejected": -43.396095275878906, "step": 690 }, { "epoch": 0.5638514891880865, "grad_norm": 0.005286333616822958, "learning_rate": 6.21081907289713e-05, "logits/chosen": -13.228278160095215, "logits/rejected": -13.89018440246582, "logps/chosen": -137.3478240966797, "logps/rejected": -563.2137451171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.267800331115723, "rewards/margins": 42.23296356201172, "rewards/rejected": -50.500770568847656, "step": 691 }, { "epoch": 0.5646674826601387, "grad_norm": 2.093256340832128e-12, "learning_rate": 6.19993001801283e-05, "logits/chosen": -12.304590225219727, "logits/rejected": -13.371092796325684, "logps/chosen": -95.98432922363281, "logps/rejected": -491.11181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.736153602600098, "rewards/margins": 38.64958953857422, "rewards/rejected": -43.3857421875, "step": 692 }, { "epoch": 0.565483476132191, "grad_norm": 0.0015452594961971045, "learning_rate": 6.189034920868522e-05, "logits/chosen": -13.135002136230469, "logits/rejected": -13.090768814086914, "logps/chosen": -115.72269439697266, "logps/rejected": -444.1919250488281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.478119373321533, "rewards/margins": 31.933002471923828, "rewards/rejected": -38.4111213684082, "step": 693 }, { "epoch": 0.5662994696042432, "grad_norm": 11.346657752990723, "learning_rate": 6.17813383632658e-05, "logits/chosen": -12.699785232543945, "logits/rejected": -13.554603576660156, "logps/chosen": -135.29934692382812, "logps/rejected": -484.8019104003906, "loss": 0.2106, "rewards/accuracies": 1.0, "rewards/chosen": -8.596197128295898, "rewards/margins": 34.244239807128906, "rewards/rejected": -42.84043884277344, "step": 694 }, { "epoch": 0.5671154630762953, "grad_norm": 3.061559618799947e-05, "learning_rate": 6.167226819279528e-05, "logits/chosen": -11.952872276306152, "logits/rejected": -13.048439025878906, "logps/chosen": -129.4569549560547, "logps/rejected": -485.13690185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.47683334350586, "rewards/margins": 35.177764892578125, "rewards/rejected": -43.65460205078125, "step": 695 }, { "epoch": 0.5679314565483476, "grad_norm": 1.1094236640474264e-08, "learning_rate": 6.156313924649761e-05, "logits/chosen": -13.367219924926758, "logits/rejected": -13.487167358398438, "logps/chosen": -143.29029846191406, "logps/rejected": -540.2257080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.635380744934082, "rewards/margins": 38.29148864746094, "rewards/rejected": -47.92686462402344, "step": 696 }, { "epoch": 0.5687474500203998, "grad_norm": 0.05993055924773216, "learning_rate": 6.145395207389276e-05, "logits/chosen": -12.50224494934082, "logits/rejected": -13.150858879089355, "logps/chosen": -173.81101989746094, "logps/rejected": -461.2322998046875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -12.04633903503418, "rewards/margins": 29.884201049804688, "rewards/rejected": -41.9305419921875, "step": 697 }, { "epoch": 0.569563443492452, "grad_norm": 0.045683603733778, "learning_rate": 6.134470722479382e-05, "logits/chosen": -11.418628692626953, "logits/rejected": -12.071040153503418, "logps/chosen": -186.0621795654297, "logps/rejected": -510.523193359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.427482604980469, "rewards/margins": 32.314659118652344, "rewards/rejected": -44.74214172363281, "step": 698 }, { "epoch": 0.5703794369645043, "grad_norm": 2.9991699228293367e-12, "learning_rate": 6.123540524930442e-05, "logits/chosen": -12.924901962280273, "logits/rejected": -13.433744430541992, "logps/chosen": -138.94384765625, "logps/rejected": -520.4769287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.293036460876465, "rewards/margins": 37.939971923828125, "rewards/rejected": -46.233009338378906, "step": 699 }, { "epoch": 0.5711954304365565, "grad_norm": 5.923261880980135e-12, "learning_rate": 6.112604669781572e-05, "logits/chosen": -13.971463203430176, "logits/rejected": -14.273597717285156, "logps/chosen": -158.77371215820312, "logps/rejected": -574.9678955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.377717971801758, "rewards/margins": 40.532020568847656, "rewards/rejected": -50.90974426269531, "step": 700 }, { "epoch": 0.5720114239086087, "grad_norm": 0.006923122331500053, "learning_rate": 6.101663212100389e-05, "logits/chosen": -13.329087257385254, "logits/rejected": -14.338876724243164, "logps/chosen": -187.70396423339844, "logps/rejected": -507.087646484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.654670715332031, "rewards/margins": 31.987850189208984, "rewards/rejected": -45.642520904541016, "step": 701 }, { "epoch": 0.572827417380661, "grad_norm": 1.6058346874459062e-09, "learning_rate": 6.090716206982714e-05, "logits/chosen": -13.354561805725098, "logits/rejected": -13.311338424682617, "logps/chosen": -245.83192443847656, "logps/rejected": -632.1427001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.33411979675293, "rewards/margins": 37.329917907714844, "rewards/rejected": -55.664031982421875, "step": 702 }, { "epoch": 0.5736434108527132, "grad_norm": 1.944104951689951e-06, "learning_rate": 6.079763709552303e-05, "logits/chosen": -13.63715934753418, "logits/rejected": -13.740058898925781, "logps/chosen": -194.14132690429688, "logps/rejected": -574.678955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.497998237609863, "rewards/margins": 37.59982681274414, "rewards/rejected": -52.09782409667969, "step": 703 }, { "epoch": 0.5744594043247654, "grad_norm": 2.5488927235528536e-07, "learning_rate": 6.068805774960573e-05, "logits/chosen": -13.73020076751709, "logits/rejected": -14.167044639587402, "logps/chosen": -265.6759338378906, "logps/rejected": -575.1224365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.968868255615234, "rewards/margins": 31.20016860961914, "rewards/rejected": -52.169036865234375, "step": 704 }, { "epoch": 0.5752753977968176, "grad_norm": 1.4267727443950662e-08, "learning_rate": 6.0578424583863146e-05, "logits/chosen": -12.793146133422852, "logits/rejected": -13.851167678833008, "logps/chosen": -280.6208190917969, "logps/rejected": -582.1693115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.738468170166016, "rewards/margins": 31.339834213256836, "rewards/rejected": -52.07830047607422, "step": 705 }, { "epoch": 0.5760913912688699, "grad_norm": 1.2650352800847031e-05, "learning_rate": 6.046873815035422e-05, "logits/chosen": -13.236797332763672, "logits/rejected": -13.617668151855469, "logps/chosen": -271.18157958984375, "logps/rejected": -540.24560546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.04175567626953, "rewards/margins": 28.425966262817383, "rewards/rejected": -48.46772384643555, "step": 706 }, { "epoch": 0.5769073847409221, "grad_norm": 4.526654961978238e-08, "learning_rate": 6.0358999001406156e-05, "logits/chosen": -13.661354064941406, "logits/rejected": -13.157580375671387, "logps/chosen": -231.07351684570312, "logps/rejected": -626.4072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.042081832885742, "rewards/margins": 39.02461624145508, "rewards/rejected": -56.06669616699219, "step": 707 }, { "epoch": 0.5777233782129743, "grad_norm": 0.02994419075548649, "learning_rate": 6.0249207689611533e-05, "logits/chosen": -13.830133438110352, "logits/rejected": -15.210990905761719, "logps/chosen": -233.32534790039062, "logps/rejected": -575.5543212890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -17.32893180847168, "rewards/margins": 34.431602478027344, "rewards/rejected": -51.760528564453125, "step": 708 }, { "epoch": 0.5785393716850266, "grad_norm": 2.29803413276386e-06, "learning_rate": 6.0139364767825626e-05, "logits/chosen": -14.20703411102295, "logits/rejected": -14.48139762878418, "logps/chosen": -265.6534118652344, "logps/rejected": -570.0811157226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.475374221801758, "rewards/margins": 28.958799362182617, "rewards/rejected": -51.434173583984375, "step": 709 }, { "epoch": 0.5793553651570787, "grad_norm": 0.04453292861580849, "learning_rate": 6.0029470789163646e-05, "logits/chosen": -12.910744667053223, "logits/rejected": -13.985135078430176, "logps/chosen": -235.23703002929688, "logps/rejected": -595.7083129882812, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -18.100357055664062, "rewards/margins": 35.407859802246094, "rewards/rejected": -53.508216857910156, "step": 710 }, { "epoch": 0.5801713586291309, "grad_norm": 0.9307540655136108, "learning_rate": 5.991952630699783e-05, "logits/chosen": -12.501060485839844, "logits/rejected": -14.026400566101074, "logps/chosen": -281.09527587890625, "logps/rejected": -550.01025390625, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -22.3350887298584, "rewards/margins": 27.13920021057129, "rewards/rejected": -49.47428894042969, "step": 711 }, { "epoch": 0.5809873521011832, "grad_norm": 6.110519024105088e-08, "learning_rate": 5.980953187495476e-05, "logits/chosen": -13.652664184570312, "logits/rejected": -14.509042739868164, "logps/chosen": -259.00738525390625, "logps/rejected": -598.30322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.65616226196289, "rewards/margins": 33.07389450073242, "rewards/rejected": -53.73005676269531, "step": 712 }, { "epoch": 0.5818033455732354, "grad_norm": 5.708290611028399e-12, "learning_rate": 5.9699488046912554e-05, "logits/chosen": -14.375019073486328, "logits/rejected": -15.838645935058594, "logps/chosen": -208.18751525878906, "logps/rejected": -627.0629272460938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.79100227355957, "rewards/margins": 41.42053985595703, "rewards/rejected": -56.2115478515625, "step": 713 }, { "epoch": 0.5826193390452876, "grad_norm": 4.793867560692888e-07, "learning_rate": 5.9589395376998e-05, "logits/chosen": -15.571927070617676, "logits/rejected": -16.25042724609375, "logps/chosen": -270.72418212890625, "logps/rejected": -558.5277099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.82109832763672, "rewards/margins": 29.747390747070312, "rewards/rejected": -50.56848907470703, "step": 714 }, { "epoch": 0.5834353325173398, "grad_norm": 5.688056717190193e-06, "learning_rate": 5.947925441958393e-05, "logits/chosen": -15.82855224609375, "logits/rejected": -16.733518600463867, "logps/chosen": -196.72091674804688, "logps/rejected": -529.4848022460938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.779842376708984, "rewards/margins": 32.48434066772461, "rewards/rejected": -47.264183044433594, "step": 715 }, { "epoch": 0.5842513259893921, "grad_norm": 1.5398098184959963e-05, "learning_rate": 5.9369065729286245e-05, "logits/chosen": -16.021160125732422, "logits/rejected": -16.242780685424805, "logps/chosen": -222.2786407470703, "logps/rejected": -563.7661743164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.136587142944336, "rewards/margins": 32.11185073852539, "rewards/rejected": -49.248435974121094, "step": 716 }, { "epoch": 0.5850673194614443, "grad_norm": 4.034934279056879e-12, "learning_rate": 5.925882986096122e-05, "logits/chosen": -16.044654846191406, "logits/rejected": -17.522153854370117, "logps/chosen": -188.13455200195312, "logps/rejected": -596.58935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.135616302490234, "rewards/margins": 40.848121643066406, "rewards/rejected": -53.983734130859375, "step": 717 }, { "epoch": 0.5858833129334965, "grad_norm": 2.5759111510481603e-10, "learning_rate": 5.9148547369702736e-05, "logits/chosen": -15.714804649353027, "logits/rejected": -17.615812301635742, "logps/chosen": -178.51345825195312, "logps/rejected": -576.0773315429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.335519790649414, "rewards/margins": 38.22216033935547, "rewards/rejected": -51.55767822265625, "step": 718 }, { "epoch": 0.5866993064055488, "grad_norm": 0.0005048731109127402, "learning_rate": 5.903821881083942e-05, "logits/chosen": -17.21536636352539, "logits/rejected": -18.377880096435547, "logps/chosen": -230.1999053955078, "logps/rejected": -569.93798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.805694580078125, "rewards/margins": 34.719940185546875, "rewards/rejected": -52.525634765625, "step": 719 }, { "epoch": 0.587515299877601, "grad_norm": 2.4064420358627103e-06, "learning_rate": 5.8927844739931834e-05, "logits/chosen": -16.880556106567383, "logits/rejected": -17.69455337524414, "logps/chosen": -241.1000518798828, "logps/rejected": -581.6572875976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.55473518371582, "rewards/margins": 33.579349517822266, "rewards/rejected": -52.13408660888672, "step": 720 }, { "epoch": 0.5883312933496532, "grad_norm": 5.072400881545036e-07, "learning_rate": 5.8817425712769794e-05, "logits/chosen": -16.845029830932617, "logits/rejected": -17.68631935119629, "logps/chosen": -218.44114685058594, "logps/rejected": -578.9362182617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.060916900634766, "rewards/margins": 35.02678680419922, "rewards/rejected": -51.08770751953125, "step": 721 }, { "epoch": 0.5891472868217055, "grad_norm": 6.921948170202086e-07, "learning_rate": 5.870696228536944e-05, "logits/chosen": -16.602941513061523, "logits/rejected": -17.481124877929688, "logps/chosen": -195.2366485595703, "logps/rejected": -491.96771240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.712194442749023, "rewards/margins": 28.68512725830078, "rewards/rejected": -42.39732360839844, "step": 722 }, { "epoch": 0.5899632802937577, "grad_norm": 2.1553807350027654e-11, "learning_rate": 5.859645501397048e-05, "logits/chosen": -16.71381950378418, "logits/rejected": -17.982112884521484, "logps/chosen": -251.5819091796875, "logps/rejected": -675.6869506835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.77943992614746, "rewards/margins": 41.19769287109375, "rewards/rejected": -58.97713088989258, "step": 723 }, { "epoch": 0.5907792737658099, "grad_norm": 0.00022386538330465555, "learning_rate": 5.8485904455033444e-05, "logits/chosen": -16.841651916503906, "logits/rejected": -18.1470947265625, "logps/chosen": -173.1919708251953, "logps/rejected": -481.3482666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.685517311096191, "rewards/margins": 30.963356018066406, "rewards/rejected": -42.64887237548828, "step": 724 }, { "epoch": 0.591595267237862, "grad_norm": 0.03896578028798103, "learning_rate": 5.837531116523682e-05, "logits/chosen": -17.020687103271484, "logits/rejected": -19.091960906982422, "logps/chosen": -223.84927368164062, "logps/rejected": -543.2957763671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.74271011352539, "rewards/margins": 33.670623779296875, "rewards/rejected": -49.413330078125, "step": 725 }, { "epoch": 0.5924112607099143, "grad_norm": 4.4912017074238975e-06, "learning_rate": 5.826467570147426e-05, "logits/chosen": -17.78936767578125, "logits/rejected": -18.747703552246094, "logps/chosen": -234.1423797607422, "logps/rejected": -534.635498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.075416564941406, "rewards/margins": 30.01675796508789, "rewards/rejected": -49.09217071533203, "step": 726 }, { "epoch": 0.5932272541819665, "grad_norm": 4.477864742279053, "learning_rate": 5.8153998620851766e-05, "logits/chosen": -16.705333709716797, "logits/rejected": -18.14346694946289, "logps/chosen": -210.80093383789062, "logps/rejected": -522.7885131835938, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -14.698155403137207, "rewards/margins": 31.29252052307129, "rewards/rejected": -45.99067687988281, "step": 727 }, { "epoch": 0.5940432476540187, "grad_norm": 0.0003855411196127534, "learning_rate": 5.804328048068492e-05, "logits/chosen": -17.75423240661621, "logits/rejected": -18.617536544799805, "logps/chosen": -239.41403198242188, "logps/rejected": -540.6597900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.719669342041016, "rewards/margins": 30.76836395263672, "rewards/rejected": -48.48802947998047, "step": 728 }, { "epoch": 0.594859241126071, "grad_norm": 0.0025723103899508715, "learning_rate": 5.793252183849609e-05, "logits/chosen": -18.634109497070312, "logits/rejected": -19.073379516601562, "logps/chosen": -218.9388885498047, "logps/rejected": -515.8388671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.016756057739258, "rewards/margins": 28.671884536743164, "rewards/rejected": -45.68864059448242, "step": 729 }, { "epoch": 0.5956752345981232, "grad_norm": 0.03525311127305031, "learning_rate": 5.782172325201155e-05, "logits/chosen": -17.069894790649414, "logits/rejected": -18.484230041503906, "logps/chosen": -229.96902465820312, "logps/rejected": -536.58544921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -15.63998794555664, "rewards/margins": 31.595870971679688, "rewards/rejected": -47.23585510253906, "step": 730 }, { "epoch": 0.5964912280701754, "grad_norm": 2.720908923947718e-06, "learning_rate": 5.7710885279158724e-05, "logits/chosen": -18.804073333740234, "logits/rejected": -19.33216094970703, "logps/chosen": -315.00018310546875, "logps/rejected": -674.2713012695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.90161895751953, "rewards/margins": 33.728511810302734, "rewards/rejected": -59.63013458251953, "step": 731 }, { "epoch": 0.5973072215422277, "grad_norm": 7.869150161743164, "learning_rate": 5.760000847806337e-05, "logits/chosen": -18.72123146057129, "logits/rejected": -19.75717544555664, "logps/chosen": -268.048583984375, "logps/rejected": -509.328857421875, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": -21.078407287597656, "rewards/margins": 24.54246711730957, "rewards/rejected": -45.620872497558594, "step": 732 }, { "epoch": 0.5981232150142799, "grad_norm": 0.018218757584691048, "learning_rate": 5.748909340704676e-05, "logits/chosen": -18.578039169311523, "logits/rejected": -19.348064422607422, "logps/chosen": -289.7190246582031, "logps/rejected": -508.92498779296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -24.00012969970703, "rewards/margins": 22.414514541625977, "rewards/rejected": -46.41464614868164, "step": 733 }, { "epoch": 0.5989392084863321, "grad_norm": 95.11345672607422, "learning_rate": 5.7378140624622886e-05, "logits/chosen": -19.445926666259766, "logits/rejected": -20.32268714904785, "logps/chosen": -300.4189453125, "logps/rejected": -612.0400390625, "loss": 0.8157, "rewards/accuracies": 0.875, "rewards/chosen": -24.280460357666016, "rewards/margins": 31.26093292236328, "rewards/rejected": -55.54138946533203, "step": 734 }, { "epoch": 0.5997552019583844, "grad_norm": 7.471778392791748, "learning_rate": 5.7267150689495644e-05, "logits/chosen": -18.66109848022461, "logits/rejected": -18.454605102539062, "logps/chosen": -350.6202392578125, "logps/rejected": -565.9701538085938, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -30.250057220458984, "rewards/margins": 21.19732093811035, "rewards/rejected": -51.4473762512207, "step": 735 }, { "epoch": 0.6005711954304366, "grad_norm": 0.0007178331725299358, "learning_rate": 5.715612416055598e-05, "logits/chosen": -15.953618049621582, "logits/rejected": -16.692729949951172, "logps/chosen": -195.35614013671875, "logps/rejected": -524.2161254882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.339428901672363, "rewards/margins": 30.64185905456543, "rewards/rejected": -45.98128890991211, "step": 736 }, { "epoch": 0.6013871889024888, "grad_norm": 0.022080259397625923, "learning_rate": 5.7045061596879134e-05, "logits/chosen": -15.040862083435059, "logits/rejected": -15.853734016418457, "logps/chosen": -188.75897216796875, "logps/rejected": -571.0548095703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.026379585266113, "rewards/margins": 36.248252868652344, "rewards/rejected": -51.27463150024414, "step": 737 }, { "epoch": 0.602203182374541, "grad_norm": 33.18916320800781, "learning_rate": 5.69339635577218e-05, "logits/chosen": -12.577797889709473, "logits/rejected": -13.517096519470215, "logps/chosen": -129.05654907226562, "logps/rejected": -400.83416748046875, "loss": 0.2083, "rewards/accuracies": 1.0, "rewards/chosen": -7.628040790557861, "rewards/margins": 26.562959671020508, "rewards/rejected": -34.191001892089844, "step": 738 }, { "epoch": 0.6030191758465933, "grad_norm": 2.553262220317265e-06, "learning_rate": 5.682283060251932e-05, "logits/chosen": -11.198391914367676, "logits/rejected": -12.711152076721191, "logps/chosen": -113.1657943725586, "logps/rejected": -473.80303955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.0209641456604, "rewards/margins": 34.48542785644531, "rewards/rejected": -40.50639343261719, "step": 739 }, { "epoch": 0.6038351693186454, "grad_norm": 2.6099180104210973e-05, "learning_rate": 5.6711663290882776e-05, "logits/chosen": -10.574487686157227, "logits/rejected": -12.706587791442871, "logps/chosen": -130.55972290039062, "logps/rejected": -516.2453002929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.753695487976074, "rewards/margins": 37.82324981689453, "rewards/rejected": -45.576942443847656, "step": 740 }, { "epoch": 0.6046511627906976, "grad_norm": 0.16564203798770905, "learning_rate": 5.660046218259638e-05, "logits/chosen": -11.352567672729492, "logits/rejected": -11.898149490356445, "logps/chosen": -157.41558837890625, "logps/rejected": -464.79669189453125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -10.304679870605469, "rewards/margins": 28.736717224121094, "rewards/rejected": -39.04139709472656, "step": 741 }, { "epoch": 0.6054671562627499, "grad_norm": 3.5200250749767292e-06, "learning_rate": 5.648922783761443e-05, "logits/chosen": -10.622819900512695, "logits/rejected": -11.174888610839844, "logps/chosen": -121.51255798339844, "logps/rejected": -419.0201416015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.388755798339844, "rewards/margins": 29.374099731445312, "rewards/rejected": -36.762855529785156, "step": 742 }, { "epoch": 0.6062831497348021, "grad_norm": 0.09036902338266373, "learning_rate": 5.637796081605863e-05, "logits/chosen": -10.73077392578125, "logits/rejected": -11.091413497924805, "logps/chosen": -93.78949737548828, "logps/rejected": -369.45159912109375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.285470485687256, "rewards/margins": 27.701129913330078, "rewards/rejected": -31.98659896850586, "step": 743 }, { "epoch": 0.6070991432068543, "grad_norm": 3.463153234406491e-06, "learning_rate": 5.6266661678215216e-05, "logits/chosen": -9.089672088623047, "logits/rejected": -11.263440132141113, "logps/chosen": -87.9820785522461, "logps/rejected": -427.64117431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.1418821811676025, "rewards/margins": 34.17840576171875, "rewards/rejected": -37.320289611816406, "step": 744 }, { "epoch": 0.6079151366789066, "grad_norm": 3.4210565900139045e-06, "learning_rate": 5.615533098453215e-05, "logits/chosen": -9.492667198181152, "logits/rejected": -12.23658561706543, "logps/chosen": -63.906185150146484, "logps/rejected": -458.38726806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1735647916793823, "rewards/margins": 40.015647888183594, "rewards/rejected": -41.18921661376953, "step": 745 }, { "epoch": 0.6087311301509588, "grad_norm": 1.1152980732731521e-05, "learning_rate": 5.6043969295616283e-05, "logits/chosen": -9.578241348266602, "logits/rejected": -11.328447341918945, "logps/chosen": -53.64385223388672, "logps/rejected": -396.85479736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.15947449207305908, "rewards/margins": 34.516441345214844, "rewards/rejected": -34.67591857910156, "step": 746 }, { "epoch": 0.609547123623011, "grad_norm": 1.1064738535127017e-11, "learning_rate": 5.593257717223055e-05, "logits/chosen": -7.834045886993408, "logits/rejected": -11.08991813659668, "logps/chosen": -90.27885437011719, "logps/rejected": -544.7114868164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.498511791229248, "rewards/margins": 45.19015121459961, "rewards/rejected": -47.688663482666016, "step": 747 }, { "epoch": 0.6103631170950632, "grad_norm": 0.0032993776258081198, "learning_rate": 5.582115517529114e-05, "logits/chosen": -7.72848653793335, "logits/rejected": -9.961610794067383, "logps/chosen": -98.85079956054688, "logps/rejected": -434.2631530761719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.1519815921783447, "rewards/margins": 33.55510711669922, "rewards/rejected": -36.70708465576172, "step": 748 }, { "epoch": 0.6111791105671155, "grad_norm": 0.0010963525855913758, "learning_rate": 5.570970386586469e-05, "logits/chosen": -9.00244426727295, "logits/rejected": -10.537528991699219, "logps/chosen": -92.90353393554688, "logps/rejected": -432.55389404296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.828710079193115, "rewards/margins": 32.07170867919922, "rewards/rejected": -36.900421142578125, "step": 749 }, { "epoch": 0.6119951040391677, "grad_norm": 81.38335418701172, "learning_rate": 5.559822380516539e-05, "logits/chosen": -9.029541015625, "logits/rejected": -11.702943801879883, "logps/chosen": -98.16416931152344, "logps/rejected": -458.5103759765625, "loss": 1.5606, "rewards/accuracies": 0.875, "rewards/chosen": -3.466797351837158, "rewards/margins": 37.27366638183594, "rewards/rejected": -40.74046325683594, "step": 750 }, { "epoch": 0.6128110975112199, "grad_norm": 9.563007097312948e-07, "learning_rate": 5.548671555455226e-05, "logits/chosen": -8.247505187988281, "logits/rejected": -10.626958847045898, "logps/chosen": -70.11160278320312, "logps/rejected": -366.97698974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.3445393443107605, "rewards/margins": 30.25238037109375, "rewards/rejected": -30.596921920776367, "step": 751 }, { "epoch": 0.6136270909832722, "grad_norm": 0.0331290178000927, "learning_rate": 5.537517967552626e-05, "logits/chosen": -8.172770500183105, "logits/rejected": -10.201312065124512, "logps/chosen": -114.45855712890625, "logps/rejected": -402.092041015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.933886528015137, "rewards/margins": 28.19131088256836, "rewards/rejected": -34.12519836425781, "step": 752 }, { "epoch": 0.6144430844553244, "grad_norm": 1.607028252692544e-06, "learning_rate": 5.5263616729727416e-05, "logits/chosen": -7.473128318786621, "logits/rejected": -10.2882080078125, "logps/chosen": -77.69697570800781, "logps/rejected": -400.3310241699219, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.8616001605987549, "rewards/margins": 30.95956039428711, "rewards/rejected": -32.82115936279297, "step": 753 }, { "epoch": 0.6152590779273766, "grad_norm": 0.4424353241920471, "learning_rate": 5.515202727893213e-05, "logits/chosen": -8.0835599899292, "logits/rejected": -9.621963500976562, "logps/chosen": -65.29817962646484, "logps/rejected": -301.6338806152344, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.7690684795379639, "rewards/margins": 23.558143615722656, "rewards/rejected": -24.327213287353516, "step": 754 }, { "epoch": 0.6160750713994289, "grad_norm": 1.1119167804718018, "learning_rate": 5.5040411885050225e-05, "logits/chosen": -8.358354568481445, "logits/rejected": -11.376543998718262, "logps/chosen": -46.665626525878906, "logps/rejected": -374.79412841796875, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -0.09066733717918396, "rewards/margins": 31.550947189331055, "rewards/rejected": -31.641613006591797, "step": 755 }, { "epoch": 0.616891064871481, "grad_norm": 0.00015398761024698615, "learning_rate": 5.492877111012218e-05, "logits/chosen": -8.275474548339844, "logits/rejected": -10.08199691772461, "logps/chosen": -76.5840072631836, "logps/rejected": -282.2994384765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.7996596097946167, "rewards/margins": 21.282493591308594, "rewards/rejected": -22.0821533203125, "step": 756 }, { "epoch": 0.6177070583435332, "grad_norm": 0.00011426692799432203, "learning_rate": 5.481710551631626e-05, "logits/chosen": -8.868736267089844, "logits/rejected": -10.391292572021484, "logps/chosen": -89.73383331298828, "logps/rejected": -354.6399230957031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.023981094360352, "rewards/margins": 26.095809936523438, "rewards/rejected": -30.119789123535156, "step": 757 }, { "epoch": 0.6185230518155854, "grad_norm": 2.8702104373223847e-06, "learning_rate": 5.470541566592573e-05, "logits/chosen": -9.809064865112305, "logits/rejected": -11.197547912597656, "logps/chosen": -51.27611541748047, "logps/rejected": -348.779541015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.6862881183624268, "rewards/margins": 28.72683334350586, "rewards/rejected": -29.413122177124023, "step": 758 }, { "epoch": 0.6193390452876377, "grad_norm": 0.00025165631086565554, "learning_rate": 5.4593702121365955e-05, "logits/chosen": -9.00564956665039, "logits/rejected": -11.374876976013184, "logps/chosen": -82.87354278564453, "logps/rejected": -373.47039794921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.5488061904907227, "rewards/margins": 28.114051818847656, "rewards/rejected": -30.662857055664062, "step": 759 }, { "epoch": 0.6201550387596899, "grad_norm": 0.24739865958690643, "learning_rate": 5.448196544517168e-05, "logits/chosen": -9.521352767944336, "logits/rejected": -10.373207092285156, "logps/chosen": -79.1142349243164, "logps/rejected": -341.4302978515625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -3.2578139305114746, "rewards/margins": 25.351892471313477, "rewards/rejected": -28.60970687866211, "step": 760 }, { "epoch": 0.6209710322317421, "grad_norm": 7.815536264388356e-06, "learning_rate": 5.437020619999408e-05, "logits/chosen": -9.594131469726562, "logits/rejected": -11.221623420715332, "logps/chosen": -78.61805725097656, "logps/rejected": -386.9607238769531, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4668216705322266, "rewards/margins": 30.472166061401367, "rewards/rejected": -32.93899154663086, "step": 761 }, { "epoch": 0.6217870257037944, "grad_norm": 8.99101308959871e-08, "learning_rate": 5.425842494859797e-05, "logits/chosen": -9.508732795715332, "logits/rejected": -12.686866760253906, "logps/chosen": -79.71096801757812, "logps/rejected": -415.14739990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.4119387865066528, "rewards/margins": 34.580196380615234, "rewards/rejected": -35.99213409423828, "step": 762 }, { "epoch": 0.6226030191758466, "grad_norm": 0.0008509016479365528, "learning_rate": 5.414662225385903e-05, "logits/chosen": -9.351231575012207, "logits/rejected": -11.777487754821777, "logps/chosen": -76.26289367675781, "logps/rejected": -406.05792236328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.0468807220458984, "rewards/margins": 32.75368118286133, "rewards/rejected": -33.800559997558594, "step": 763 }, { "epoch": 0.6234190126478988, "grad_norm": 0.827567458152771, "learning_rate": 5.403479867876087e-05, "logits/chosen": -10.325624465942383, "logits/rejected": -12.540379524230957, "logps/chosen": -90.98965454101562, "logps/rejected": -339.72540283203125, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -3.9926486015319824, "rewards/margins": 24.254657745361328, "rewards/rejected": -28.247304916381836, "step": 764 }, { "epoch": 0.6242350061199511, "grad_norm": 0.0011695167049765587, "learning_rate": 5.392295478639225e-05, "logits/chosen": -10.1301851272583, "logits/rejected": -10.870433807373047, "logps/chosen": -81.07695007324219, "logps/rejected": -294.32745361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.5955810546875, "rewards/margins": 20.33725357055664, "rewards/rejected": -22.932836532592773, "step": 765 }, { "epoch": 0.6250509995920033, "grad_norm": 4.191277548670769e-05, "learning_rate": 5.3811091139944255e-05, "logits/chosen": -10.039804458618164, "logits/rejected": -11.625326156616211, "logps/chosen": -92.372802734375, "logps/rejected": -355.3260498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.81292724609375, "rewards/margins": 26.89110565185547, "rewards/rejected": -29.704030990600586, "step": 766 }, { "epoch": 0.6258669930640555, "grad_norm": 0.18334592878818512, "learning_rate": 5.3699208302707435e-05, "logits/chosen": -9.264416694641113, "logits/rejected": -10.542755126953125, "logps/chosen": -124.18408203125, "logps/rejected": -301.03741455078125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.399246692657471, "rewards/margins": 17.026935577392578, "rewards/rejected": -24.42618179321289, "step": 767 }, { "epoch": 0.6266829865361077, "grad_norm": 0.10919523984193802, "learning_rate": 5.3587306838068964e-05, "logits/chosen": -8.847415924072266, "logits/rejected": -10.254568099975586, "logps/chosen": -107.11857604980469, "logps/rejected": -322.29541015625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -5.797715187072754, "rewards/margins": 20.70761489868164, "rewards/rejected": -26.505329132080078, "step": 768 }, { "epoch": 0.62749898000816, "grad_norm": 3.89959204767365e-05, "learning_rate": 5.347538730950984e-05, "logits/chosen": -10.037516593933105, "logits/rejected": -11.463859558105469, "logps/chosen": -85.08778381347656, "logps/rejected": -367.5113525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.690973997116089, "rewards/margins": 28.398147583007812, "rewards/rejected": -31.089122772216797, "step": 769 }, { "epoch": 0.6283149734802121, "grad_norm": 0.0027467836625874043, "learning_rate": 5.336345028060199e-05, "logits/chosen": -8.345975875854492, "logits/rejected": -10.558794021606445, "logps/chosen": -106.7115249633789, "logps/rejected": -396.1872863769531, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.8719892501831055, "rewards/margins": 31.008411407470703, "rewards/rejected": -33.880401611328125, "step": 770 }, { "epoch": 0.6291309669522643, "grad_norm": 3.49676656696829e-06, "learning_rate": 5.325149631500549e-05, "logits/chosen": -9.255264282226562, "logits/rejected": -10.921621322631836, "logps/chosen": -77.0183334350586, "logps/rejected": -406.372802734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.3182015419006348, "rewards/margins": 32.59929656982422, "rewards/rejected": -34.91749572753906, "step": 771 }, { "epoch": 0.6299469604243166, "grad_norm": 3.8174548535607755e-06, "learning_rate": 5.313952597646568e-05, "logits/chosen": -9.18436336517334, "logits/rejected": -11.19346809387207, "logps/chosen": -105.72712707519531, "logps/rejected": -410.27520751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.386946678161621, "rewards/margins": 30.05548858642578, "rewards/rejected": -36.44243621826172, "step": 772 }, { "epoch": 0.6307629538963688, "grad_norm": 0.021035680547356606, "learning_rate": 5.302753982881037e-05, "logits/chosen": -8.388370513916016, "logits/rejected": -9.557819366455078, "logps/chosen": -143.63894653320312, "logps/rejected": -359.4957275390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.62950325012207, "rewards/margins": 21.37269401550293, "rewards/rejected": -30.002197265625, "step": 773 }, { "epoch": 0.631578947368421, "grad_norm": 1.4868883226881735e-06, "learning_rate": 5.291553843594694e-05, "logits/chosen": -8.290831565856934, "logits/rejected": -9.927749633789062, "logps/chosen": -102.80081176757812, "logps/rejected": -451.5273742675781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.469058990478516, "rewards/margins": 32.9725341796875, "rewards/rejected": -38.441593170166016, "step": 774 }, { "epoch": 0.6323949408404733, "grad_norm": 0.01893387921154499, "learning_rate": 5.2803522361859594e-05, "logits/chosen": -9.258369445800781, "logits/rejected": -10.564496994018555, "logps/chosen": -86.58311462402344, "logps/rejected": -364.8173828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.535538196563721, "rewards/margins": 28.194107055664062, "rewards/rejected": -32.729644775390625, "step": 775 }, { "epoch": 0.6332109343125255, "grad_norm": 9.506152309768368e-06, "learning_rate": 5.2691492170606415e-05, "logits/chosen": -7.747804164886475, "logits/rejected": -9.483755111694336, "logps/chosen": -124.31104278564453, "logps/rejected": -358.4240417480469, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.70568323135376, "rewards/margins": 26.11667251586914, "rewards/rejected": -31.822355270385742, "step": 776 }, { "epoch": 0.6340269277845777, "grad_norm": 1.0673632459656801e-05, "learning_rate": 5.257944842631658e-05, "logits/chosen": -8.749338150024414, "logits/rejected": -9.847439765930176, "logps/chosen": -100.95051574707031, "logps/rejected": -361.4495849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.994298458099365, "rewards/margins": 26.348209381103516, "rewards/rejected": -31.34250831604004, "step": 777 }, { "epoch": 0.63484292125663, "grad_norm": 0.8416497707366943, "learning_rate": 5.246739169318756e-05, "logits/chosen": -8.024955749511719, "logits/rejected": -8.886615753173828, "logps/chosen": -107.07364654541016, "logps/rejected": -399.700439453125, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -5.2121405601501465, "rewards/margins": 29.134231567382812, "rewards/rejected": -34.34637451171875, "step": 778 }, { "epoch": 0.6356589147286822, "grad_norm": 7.304421887965873e-05, "learning_rate": 5.235532253548213e-05, "logits/chosen": -8.656824111938477, "logits/rejected": -9.996021270751953, "logps/chosen": -118.78260803222656, "logps/rejected": -423.2466735839844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.391486644744873, "rewards/margins": 30.329914093017578, "rewards/rejected": -35.721405029296875, "step": 779 }, { "epoch": 0.6364749082007344, "grad_norm": 0.00010505410318728536, "learning_rate": 5.2243241517525754e-05, "logits/chosen": -7.965946674346924, "logits/rejected": -9.85911750793457, "logps/chosen": -112.6383056640625, "logps/rejected": -406.2864685058594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.59121036529541, "rewards/margins": 30.136919021606445, "rewards/rejected": -35.72813034057617, "step": 780 }, { "epoch": 0.6372909016727866, "grad_norm": 1.4220893795879874e-08, "learning_rate": 5.213114920370352e-05, "logits/chosen": -8.160714149475098, "logits/rejected": -9.72645378112793, "logps/chosen": -108.76927185058594, "logps/rejected": -486.3230285644531, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.637323379516602, "rewards/margins": 36.84716033935547, "rewards/rejected": -42.4844856262207, "step": 781 }, { "epoch": 0.6381068951448389, "grad_norm": 0.0006915467092767358, "learning_rate": 5.201904615845743e-05, "logits/chosen": -7.835486888885498, "logits/rejected": -9.931282997131348, "logps/chosen": -92.83341217041016, "logps/rejected": -444.58880615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.194491386413574, "rewards/margins": 36.290794372558594, "rewards/rejected": -39.485286712646484, "step": 782 }, { "epoch": 0.6389228886168911, "grad_norm": 0.031152578070759773, "learning_rate": 5.190693294628355e-05, "logits/chosen": -6.807201385498047, "logits/rejected": -8.476959228515625, "logps/chosen": -142.13320922851562, "logps/rejected": -529.745849609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.342566967010498, "rewards/margins": 38.96146011352539, "rewards/rejected": -46.30402374267578, "step": 783 }, { "epoch": 0.6397388820889433, "grad_norm": 1.5170260667800903, "learning_rate": 5.179481013172912e-05, "logits/chosen": -7.459615230560303, "logits/rejected": -9.60389232635498, "logps/chosen": -138.58485412597656, "logps/rejected": -469.4046325683594, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -7.562735557556152, "rewards/margins": 33.23483657836914, "rewards/rejected": -40.797569274902344, "step": 784 }, { "epoch": 0.6405548755609956, "grad_norm": 1.3985472502309193e-11, "learning_rate": 5.168267827938971e-05, "logits/chosen": -7.745636463165283, "logits/rejected": -9.875961303710938, "logps/chosen": -140.57296752929688, "logps/rejected": -542.8450927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.638259887695312, "rewards/margins": 40.394596099853516, "rewards/rejected": -49.03285217285156, "step": 785 }, { "epoch": 0.6413708690330477, "grad_norm": 4.9471662877387246e-11, "learning_rate": 5.157053795390642e-05, "logits/chosen": -7.758460521697998, "logits/rejected": -9.400269508361816, "logps/chosen": -104.42796325683594, "logps/rejected": -564.0632934570312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.067353248596191, "rewards/margins": 43.71430587768555, "rewards/rejected": -48.78165817260742, "step": 786 }, { "epoch": 0.6421868625050999, "grad_norm": 2.753356431739462e-09, "learning_rate": 5.145838971996304e-05, "logits/chosen": -7.884222984313965, "logits/rejected": -9.315622329711914, "logps/chosen": -117.35763549804688, "logps/rejected": -529.4259033203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.236788749694824, "rewards/margins": 40.902034759521484, "rewards/rejected": -47.138824462890625, "step": 787 }, { "epoch": 0.6430028559771522, "grad_norm": 4.474728587666732e-09, "learning_rate": 5.1346234142283144e-05, "logits/chosen": -8.159672737121582, "logits/rejected": -10.590547561645508, "logps/chosen": -133.3588409423828, "logps/rejected": -564.2685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.393622875213623, "rewards/margins": 42.804935455322266, "rewards/rejected": -50.19855880737305, "step": 788 }, { "epoch": 0.6438188494492044, "grad_norm": 0.08255145698785782, "learning_rate": 5.123407178562731e-05, "logits/chosen": -8.601037979125977, "logits/rejected": -9.498760223388672, "logps/chosen": -159.9658660888672, "logps/rejected": -475.5217590332031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -10.939549446105957, "rewards/margins": 31.156360626220703, "rewards/rejected": -42.095909118652344, "step": 789 }, { "epoch": 0.6446348429212566, "grad_norm": 7.513835384997947e-07, "learning_rate": 5.112190321479026e-05, "logits/chosen": -8.243340492248535, "logits/rejected": -9.133132934570312, "logps/chosen": -165.07260131835938, "logps/rejected": -513.1398315429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.041038513183594, "rewards/margins": 34.29660415649414, "rewards/rejected": -46.33763885498047, "step": 790 }, { "epoch": 0.6454508363933088, "grad_norm": 9.050180604930347e-10, "learning_rate": 5.100972899459796e-05, "logits/chosen": -7.747093200683594, "logits/rejected": -9.707788467407227, "logps/chosen": -145.17782592773438, "logps/rejected": -568.8737182617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.105585098266602, "rewards/margins": 43.13270568847656, "rewards/rejected": -51.23828887939453, "step": 791 }, { "epoch": 0.6462668298653611, "grad_norm": 1.7704685717490065e-08, "learning_rate": 5.0897549689904865e-05, "logits/chosen": -8.872098922729492, "logits/rejected": -10.717086791992188, "logps/chosen": -137.7938232421875, "logps/rejected": -497.7420654296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.100292205810547, "rewards/margins": 35.972686767578125, "rewards/rejected": -45.072975158691406, "step": 792 }, { "epoch": 0.6470828233374133, "grad_norm": 0.029521528631448746, "learning_rate": 5.078536586559104e-05, "logits/chosen": -8.099203109741211, "logits/rejected": -10.706249237060547, "logps/chosen": -175.5059356689453, "logps/rejected": -708.2294921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.895895004272461, "rewards/margins": 52.371620178222656, "rewards/rejected": -64.26751708984375, "step": 793 }, { "epoch": 0.6478988168094655, "grad_norm": 6.292042371569551e-07, "learning_rate": 5.067317808655927e-05, "logits/chosen": -8.883793830871582, "logits/rejected": -11.62285041809082, "logps/chosen": -154.45059204101562, "logps/rejected": -596.55810546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.529207229614258, "rewards/margins": 43.11297607421875, "rewards/rejected": -54.642181396484375, "step": 794 }, { "epoch": 0.6487148102815178, "grad_norm": 0.00013201928231865168, "learning_rate": 5.05609869177323e-05, "logits/chosen": -8.060866355895996, "logits/rejected": -10.283324241638184, "logps/chosen": -141.92156982421875, "logps/rejected": -513.6624145507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.899789810180664, "rewards/margins": 37.389984130859375, "rewards/rejected": -47.289772033691406, "step": 795 }, { "epoch": 0.64953080375357, "grad_norm": 0.5458858013153076, "learning_rate": 5.0448792924049894e-05, "logits/chosen": -7.548958778381348, "logits/rejected": -8.89815616607666, "logps/chosen": -254.91067504882812, "logps/rejected": -581.77099609375, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -20.28685188293457, "rewards/margins": 31.795166015625, "rewards/rejected": -52.0820198059082, "step": 796 }, { "epoch": 0.6503467972256222, "grad_norm": 3.0023002750567684e-08, "learning_rate": 5.0336596670466094e-05, "logits/chosen": -8.475671768188477, "logits/rejected": -9.651651382446289, "logps/chosen": -169.65380859375, "logps/rejected": -623.7200927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.646023750305176, "rewards/margins": 45.70507049560547, "rewards/rejected": -56.35108947753906, "step": 797 }, { "epoch": 0.6511627906976745, "grad_norm": 1.1651698703096791e-11, "learning_rate": 5.022439872194629e-05, "logits/chosen": -7.791810512542725, "logits/rejected": -9.27805233001709, "logps/chosen": -153.15032958984375, "logps/rejected": -618.604248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.331062316894531, "rewards/margins": 45.14891052246094, "rewards/rejected": -55.47997283935547, "step": 798 }, { "epoch": 0.6519787841697267, "grad_norm": 0.00018264450773131102, "learning_rate": 5.0112199643464376e-05, "logits/chosen": -9.419665336608887, "logits/rejected": -10.790104866027832, "logps/chosen": -191.822021484375, "logps/rejected": -614.5564575195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.678794860839844, "rewards/margins": 41.986846923828125, "rewards/rejected": -56.66564178466797, "step": 799 }, { "epoch": 0.6527947776417788, "grad_norm": 1.2381454325804953e-05, "learning_rate": 5e-05, "logits/chosen": -8.845880508422852, "logits/rejected": -9.817625999450684, "logps/chosen": -147.36294555664062, "logps/rejected": -631.6409301757812, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -10.048043251037598, "rewards/margins": 47.40929412841797, "rewards/rejected": -57.45734405517578, "step": 800 }, { "epoch": 0.653610771113831, "grad_norm": 3.1041245962537545e-12, "learning_rate": 4.988780035653564e-05, "logits/chosen": -8.157445907592773, "logits/rejected": -10.751742362976074, "logps/chosen": -133.18453979492188, "logps/rejected": -624.149658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.53363037109375, "rewards/margins": 48.640838623046875, "rewards/rejected": -57.174468994140625, "step": 801 }, { "epoch": 0.6544267645858833, "grad_norm": 0.0002831698511727154, "learning_rate": 4.977560127805373e-05, "logits/chosen": -7.6384596824646, "logits/rejected": -10.149208068847656, "logps/chosen": -206.7364044189453, "logps/rejected": -560.6300048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.699421882629395, "rewards/margins": 36.49420166015625, "rewards/rejected": -51.19362258911133, "step": 802 }, { "epoch": 0.6552427580579355, "grad_norm": 7.134055846069742e-11, "learning_rate": 4.966340332953392e-05, "logits/chosen": -8.133301734924316, "logits/rejected": -9.579669952392578, "logps/chosen": -204.95457458496094, "logps/rejected": -720.709228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.84677791595459, "rewards/margins": 50.59303283691406, "rewards/rejected": -65.43981170654297, "step": 803 }, { "epoch": 0.6560587515299877, "grad_norm": 7.703653524555182e-14, "learning_rate": 4.9551207075950104e-05, "logits/chosen": -7.9884209632873535, "logits/rejected": -10.640478134155273, "logps/chosen": -185.38099670410156, "logps/rejected": -682.8763427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.018457412719727, "rewards/margins": 50.77614212036133, "rewards/rejected": -61.79460144042969, "step": 804 }, { "epoch": 0.65687474500204, "grad_norm": 6.278708497120533e-06, "learning_rate": 4.943901308226771e-05, "logits/chosen": -7.989889144897461, "logits/rejected": -10.223711967468262, "logps/chosen": -216.35162353515625, "logps/rejected": -595.9683227539062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.553007125854492, "rewards/margins": 37.99660873413086, "rewards/rejected": -54.549617767333984, "step": 805 }, { "epoch": 0.6576907384740922, "grad_norm": 41.52996063232422, "learning_rate": 4.9326821913440724e-05, "logits/chosen": -8.875541687011719, "logits/rejected": -10.234762191772461, "logps/chosen": -276.1763000488281, "logps/rejected": -637.7070922851562, "loss": 1.0384, "rewards/accuracies": 0.875, "rewards/chosen": -23.46758270263672, "rewards/margins": 34.02180862426758, "rewards/rejected": -57.4893913269043, "step": 806 }, { "epoch": 0.6585067319461444, "grad_norm": 6.613680056943849e-07, "learning_rate": 4.921463413440898e-05, "logits/chosen": -9.055868148803711, "logits/rejected": -10.125507354736328, "logps/chosen": -220.6768798828125, "logps/rejected": -616.9510498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.16687774658203, "rewards/margins": 39.53569030761719, "rewards/rejected": -56.70256805419922, "step": 807 }, { "epoch": 0.6593227254181967, "grad_norm": 2.4650046270835446e-06, "learning_rate": 4.9102450310095146e-05, "logits/chosen": -8.851944923400879, "logits/rejected": -9.045145988464355, "logps/chosen": -170.58380126953125, "logps/rejected": -628.5426635742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.111774444580078, "rewards/margins": 44.48540496826172, "rewards/rejected": -55.5971794128418, "step": 808 }, { "epoch": 0.6601387188902489, "grad_norm": 2.788188112390344e-06, "learning_rate": 4.8990271005402056e-05, "logits/chosen": -8.633882522583008, "logits/rejected": -9.582802772521973, "logps/chosen": -233.58409118652344, "logps/rejected": -649.826904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.829265594482422, "rewards/margins": 40.92922592163086, "rewards/rejected": -57.75849151611328, "step": 809 }, { "epoch": 0.6609547123623011, "grad_norm": 6.751749825895104e-09, "learning_rate": 4.887809678520976e-05, "logits/chosen": -7.74419641494751, "logits/rejected": -8.426321029663086, "logps/chosen": -215.8158721923828, "logps/rejected": -633.7783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.196149826049805, "rewards/margins": 42.187740325927734, "rewards/rejected": -57.383888244628906, "step": 810 }, { "epoch": 0.6617707058343534, "grad_norm": 5.846690492861129e-11, "learning_rate": 4.8765928214372685e-05, "logits/chosen": -8.150691032409668, "logits/rejected": -8.994494438171387, "logps/chosen": -239.41909790039062, "logps/rejected": -684.5789794921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.253833770751953, "rewards/margins": 44.811920166015625, "rewards/rejected": -63.065757751464844, "step": 811 }, { "epoch": 0.6625866993064056, "grad_norm": 1.1414474698767663e-08, "learning_rate": 4.865376585771687e-05, "logits/chosen": -7.623331546783447, "logits/rejected": -9.105491638183594, "logps/chosen": -122.3028564453125, "logps/rejected": -560.5870971679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.8151116371154785, "rewards/margins": 44.2843017578125, "rewards/rejected": -50.09941864013672, "step": 812 }, { "epoch": 0.6634026927784578, "grad_norm": 2.1366448887949474e-10, "learning_rate": 4.8541610280036984e-05, "logits/chosen": -8.031265258789062, "logits/rejected": -8.155567169189453, "logps/chosen": -173.91847229003906, "logps/rejected": -632.6617431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.678502082824707, "rewards/margins": 46.00237274169922, "rewards/rejected": -57.680877685546875, "step": 813 }, { "epoch": 0.66421868625051, "grad_norm": 8.216941296268487e-07, "learning_rate": 4.8429462046093585e-05, "logits/chosen": -8.866800308227539, "logits/rejected": -8.415535926818848, "logps/chosen": -183.0516815185547, "logps/rejected": -571.5326538085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.351606369018555, "rewards/margins": 38.77735900878906, "rewards/rejected": -52.128963470458984, "step": 814 }, { "epoch": 0.6650346797225622, "grad_norm": 8.236204271965994e-16, "learning_rate": 4.831732172061032e-05, "logits/chosen": -7.495810508728027, "logits/rejected": -9.102935791015625, "logps/chosen": -144.72093200683594, "logps/rejected": -696.8792724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.274097442626953, "rewards/margins": 53.992340087890625, "rewards/rejected": -62.26643371582031, "step": 815 }, { "epoch": 0.6658506731946144, "grad_norm": 8.386445045471191, "learning_rate": 4.820518986827089e-05, "logits/chosen": -8.588945388793945, "logits/rejected": -8.821585655212402, "logps/chosen": -162.8731689453125, "logps/rejected": -537.049072265625, "loss": 0.1076, "rewards/accuracies": 1.0, "rewards/chosen": -10.910384178161621, "rewards/margins": 37.85021209716797, "rewards/rejected": -48.760589599609375, "step": 816 }, { "epoch": 0.6666666666666666, "grad_norm": 5.678784873452969e-06, "learning_rate": 4.8093067053716456e-05, "logits/chosen": -8.299180030822754, "logits/rejected": -8.2591552734375, "logps/chosen": -184.364501953125, "logps/rejected": -641.7031860351562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.318891525268555, "rewards/margins": 44.2385139465332, "rewards/rejected": -57.557411193847656, "step": 817 }, { "epoch": 0.6674826601387189, "grad_norm": 0.18458124995231628, "learning_rate": 4.798095384154257e-05, "logits/chosen": -7.87662410736084, "logits/rejected": -7.830836296081543, "logps/chosen": -232.51498413085938, "logps/rejected": -543.65625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -16.5366268157959, "rewards/margins": 32.65376663208008, "rewards/rejected": -49.190391540527344, "step": 818 }, { "epoch": 0.6682986536107711, "grad_norm": 1.9460914000082874e-13, "learning_rate": 4.7868850796296495e-05, "logits/chosen": -8.426939964294434, "logits/rejected": -8.179827690124512, "logps/chosen": -184.18063354492188, "logps/rejected": -688.52197265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.214641571044922, "rewards/margins": 51.565879821777344, "rewards/rejected": -62.780517578125, "step": 819 }, { "epoch": 0.6691146470828233, "grad_norm": 0.0006061766180209816, "learning_rate": 4.775675848247427e-05, "logits/chosen": -8.197361946105957, "logits/rejected": -7.447135925292969, "logps/chosen": -218.30642700195312, "logps/rejected": -661.3826904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.557958602905273, "rewards/margins": 43.33123779296875, "rewards/rejected": -58.88919448852539, "step": 820 }, { "epoch": 0.6699306405548756, "grad_norm": 0.0003510330861900002, "learning_rate": 4.7644677464517874e-05, "logits/chosen": -8.737665176391602, "logits/rejected": -7.2622151374816895, "logps/chosen": -142.23233032226562, "logps/rejected": -561.4498901367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.453990936279297, "rewards/margins": 41.22915267944336, "rewards/rejected": -50.683143615722656, "step": 821 }, { "epoch": 0.6707466340269278, "grad_norm": 6.842033428711147e-08, "learning_rate": 4.7532608306812465e-05, "logits/chosen": -8.049604415893555, "logits/rejected": -6.444941520690918, "logps/chosen": -96.331787109375, "logps/rejected": -617.1983032226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.712611198425293, "rewards/margins": 49.455718994140625, "rewards/rejected": -54.1683349609375, "step": 822 }, { "epoch": 0.67156262749898, "grad_norm": 2.178394353791191e-08, "learning_rate": 4.742055157368341e-05, "logits/chosen": -7.140134811401367, "logits/rejected": -6.8011016845703125, "logps/chosen": -154.99362182617188, "logps/rejected": -618.7509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.31391429901123, "rewards/margins": 46.30226135253906, "rewards/rejected": -55.616180419921875, "step": 823 }, { "epoch": 0.6723786209710322, "grad_norm": 2.8535278033814393e-06, "learning_rate": 4.7308507829393597e-05, "logits/chosen": -8.211275100708008, "logits/rejected": -7.852006435394287, "logps/chosen": -143.39321899414062, "logps/rejected": -536.3853149414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.063182830810547, "rewards/margins": 38.942298889160156, "rewards/rejected": -49.00547790527344, "step": 824 }, { "epoch": 0.6731946144430845, "grad_norm": 8.235773729659002e-10, "learning_rate": 4.7196477638140404e-05, "logits/chosen": -7.837662220001221, "logits/rejected": -7.509067535400391, "logps/chosen": -143.4053497314453, "logps/rejected": -549.1807861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.07435131072998, "rewards/margins": 41.00175476074219, "rewards/rejected": -49.07610321044922, "step": 825 }, { "epoch": 0.6740106079151367, "grad_norm": 1.2828512581108953e-06, "learning_rate": 4.708446156405307e-05, "logits/chosen": -7.683688163757324, "logits/rejected": -6.568706512451172, "logps/chosen": -191.06048583984375, "logps/rejected": -641.7824096679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.17885971069336, "rewards/margins": 44.392845153808594, "rewards/rejected": -58.57170486450195, "step": 826 }, { "epoch": 0.6748266013871889, "grad_norm": 0.034840118139982224, "learning_rate": 4.697246017118966e-05, "logits/chosen": -7.0310959815979, "logits/rejected": -7.171036720275879, "logps/chosen": -131.5184783935547, "logps/rejected": -566.81884765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.92503833770752, "rewards/margins": 42.05824279785156, "rewards/rejected": -50.98328399658203, "step": 827 }, { "epoch": 0.6756425948592412, "grad_norm": 2.3145660179579863e-06, "learning_rate": 4.6860474023534335e-05, "logits/chosen": -7.298779487609863, "logits/rejected": -6.449517250061035, "logps/chosen": -161.4108428955078, "logps/rejected": -622.8397216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.91093635559082, "rewards/margins": 44.57374572753906, "rewards/rejected": -55.484683990478516, "step": 828 }, { "epoch": 0.6764585883312934, "grad_norm": 1.1278837863670788e-09, "learning_rate": 4.674850368499454e-05, "logits/chosen": -7.496706008911133, "logits/rejected": -6.936426162719727, "logps/chosen": -113.70095825195312, "logps/rejected": -656.2664794921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.190007209777832, "rewards/margins": 52.510032653808594, "rewards/rejected": -59.70003890991211, "step": 829 }, { "epoch": 0.6772745818033455, "grad_norm": 1.7511367510331089e-15, "learning_rate": 4.663654971939802e-05, "logits/chosen": -7.132686614990234, "logits/rejected": -6.148468017578125, "logps/chosen": -141.23211669921875, "logps/rejected": -631.0220947265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.752356052398682, "rewards/margins": 49.1510009765625, "rewards/rejected": -56.903358459472656, "step": 830 }, { "epoch": 0.6780905752753978, "grad_norm": 0.0007406626245938241, "learning_rate": 4.652461269049018e-05, "logits/chosen": -6.565459728240967, "logits/rejected": -6.821968078613281, "logps/chosen": -173.1432647705078, "logps/rejected": -578.760009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.590364456176758, "rewards/margins": 38.86769485473633, "rewards/rejected": -51.45806121826172, "step": 831 }, { "epoch": 0.67890656874745, "grad_norm": 2.317723328815191e-06, "learning_rate": 4.6412693161931034e-05, "logits/chosen": -7.6086602210998535, "logits/rejected": -8.080364227294922, "logps/chosen": -137.95706176757812, "logps/rejected": -493.3158874511719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.490351676940918, "rewards/margins": 35.856842041015625, "rewards/rejected": -44.34719467163086, "step": 832 }, { "epoch": 0.6797225622195022, "grad_norm": 0.36400219798088074, "learning_rate": 4.630079169729257e-05, "logits/chosen": -7.6919708251953125, "logits/rejected": -6.78069543838501, "logps/chosen": -146.6015167236328, "logps/rejected": -499.8291320800781, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -10.061351776123047, "rewards/margins": 34.82432556152344, "rewards/rejected": -44.885677337646484, "step": 833 }, { "epoch": 0.6805385556915544, "grad_norm": 1.081008773571061e-14, "learning_rate": 4.618890886005576e-05, "logits/chosen": -7.040790557861328, "logits/rejected": -7.351246356964111, "logps/chosen": -135.1257781982422, "logps/rejected": -622.68359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.27301025390625, "rewards/margins": 48.32360076904297, "rewards/rejected": -54.59661102294922, "step": 834 }, { "epoch": 0.6813545491636067, "grad_norm": 1.2921336178806087e-07, "learning_rate": 4.607704521360776e-05, "logits/chosen": -8.180876731872559, "logits/rejected": -7.0663862228393555, "logps/chosen": -171.4868927001953, "logps/rejected": -588.8698120117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.410932540893555, "rewards/margins": 43.401153564453125, "rewards/rejected": -52.81208801269531, "step": 835 }, { "epoch": 0.6821705426356589, "grad_norm": 1.8388897071730526e-09, "learning_rate": 4.5965201321239144e-05, "logits/chosen": -6.62940788269043, "logits/rejected": -7.676819324493408, "logps/chosen": -138.8271026611328, "logps/rejected": -524.061767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.428949356079102, "rewards/margins": 37.46607971191406, "rewards/rejected": -46.89502716064453, "step": 836 }, { "epoch": 0.6829865361077111, "grad_norm": 7.413092589558801e-06, "learning_rate": 4.585337774614097e-05, "logits/chosen": -7.1035051345825195, "logits/rejected": -6.215327262878418, "logps/chosen": -155.837646484375, "logps/rejected": -633.740478515625, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -8.786284446716309, "rewards/margins": 46.322547912597656, "rewards/rejected": -55.10883712768555, "step": 837 }, { "epoch": 0.6838025295797634, "grad_norm": 5.378480438800182e-10, "learning_rate": 4.574157505140204e-05, "logits/chosen": -7.743355751037598, "logits/rejected": -7.00079345703125, "logps/chosen": -166.73797607421875, "logps/rejected": -564.6064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.600069046020508, "rewards/margins": 38.94749450683594, "rewards/rejected": -50.54756546020508, "step": 838 }, { "epoch": 0.6846185230518156, "grad_norm": 6.750130069121951e-06, "learning_rate": 4.5629793800005945e-05, "logits/chosen": -7.740220546722412, "logits/rejected": -6.454562187194824, "logps/chosen": -138.62445068359375, "logps/rejected": -465.32415771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.695594787597656, "rewards/margins": 31.423675537109375, "rewards/rejected": -40.11927032470703, "step": 839 }, { "epoch": 0.6854345165238678, "grad_norm": 1.3931958885160522e-11, "learning_rate": 4.551803455482833e-05, "logits/chosen": -8.090571403503418, "logits/rejected": -7.976179122924805, "logps/chosen": -119.83372497558594, "logps/rejected": -557.6464233398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.020478248596191, "rewards/margins": 43.354759216308594, "rewards/rejected": -49.37523651123047, "step": 840 }, { "epoch": 0.6862505099959201, "grad_norm": 0.00027326869894750416, "learning_rate": 4.540629787863405e-05, "logits/chosen": -7.194211483001709, "logits/rejected": -6.746013641357422, "logps/chosen": -140.1172637939453, "logps/rejected": -590.36279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.8433518409729, "rewards/margins": 45.17737579345703, "rewards/rejected": -53.020729064941406, "step": 841 }, { "epoch": 0.6870665034679723, "grad_norm": 0.00023294484708458185, "learning_rate": 4.529458433407429e-05, "logits/chosen": -7.608976364135742, "logits/rejected": -7.163238525390625, "logps/chosen": -146.2188720703125, "logps/rejected": -635.0987548828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.804982662200928, "rewards/margins": 47.2724723815918, "rewards/rejected": -55.07745361328125, "step": 842 }, { "epoch": 0.6878824969400245, "grad_norm": 0.026997186243534088, "learning_rate": 4.518289448368376e-05, "logits/chosen": -8.350173950195312, "logits/rejected": -7.7360615730285645, "logps/chosen": -121.87615203857422, "logps/rejected": -382.8321228027344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.77719783782959, "rewards/margins": 25.324012756347656, "rewards/rejected": -33.1012077331543, "step": 843 }, { "epoch": 0.6886984904120768, "grad_norm": 0.00034344804589636624, "learning_rate": 4.5071228889877825e-05, "logits/chosen": -6.712033271789551, "logits/rejected": -7.283689975738525, "logps/chosen": -126.87998962402344, "logps/rejected": -604.7369995117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.244818687438965, "rewards/margins": 46.89167785644531, "rewards/rejected": -53.136497497558594, "step": 844 }, { "epoch": 0.689514483884129, "grad_norm": 2.545806232490122e-08, "learning_rate": 4.495958811494978e-05, "logits/chosen": -6.327035427093506, "logits/rejected": -7.320213317871094, "logps/chosen": -131.33580017089844, "logps/rejected": -494.96087646484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.747506618499756, "rewards/margins": 37.180572509765625, "rewards/rejected": -44.928077697753906, "step": 845 }, { "epoch": 0.6903304773561811, "grad_norm": 3.67160623682139e-07, "learning_rate": 4.484797272106789e-05, "logits/chosen": -7.369645118713379, "logits/rejected": -6.960336208343506, "logps/chosen": -104.50862884521484, "logps/rejected": -544.0094604492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.547873497009277, "rewards/margins": 42.925201416015625, "rewards/rejected": -48.47307586669922, "step": 846 }, { "epoch": 0.6911464708282333, "grad_norm": 2.5298256802678935e-14, "learning_rate": 4.473638327027259e-05, "logits/chosen": -6.968975067138672, "logits/rejected": -7.473151206970215, "logps/chosen": -113.45198059082031, "logps/rejected": -674.103759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.406398773193359, "rewards/margins": 54.792076110839844, "rewards/rejected": -60.19847869873047, "step": 847 }, { "epoch": 0.6919624643002856, "grad_norm": 1.980993857841895e-08, "learning_rate": 4.4624820324473766e-05, "logits/chosen": -7.110167503356934, "logits/rejected": -7.015293121337891, "logps/chosen": -117.6363525390625, "logps/rejected": -608.499755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.373560428619385, "rewards/margins": 48.20237731933594, "rewards/rejected": -54.5759391784668, "step": 848 }, { "epoch": 0.6927784577723378, "grad_norm": 5.337129550753161e-06, "learning_rate": 4.451328444544774e-05, "logits/chosen": -7.337632179260254, "logits/rejected": -6.783229827880859, "logps/chosen": -131.83309936523438, "logps/rejected": -479.0083923339844, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -8.729533195495605, "rewards/margins": 34.68309783935547, "rewards/rejected": -43.412628173828125, "step": 849 }, { "epoch": 0.69359445124439, "grad_norm": 29.21795082092285, "learning_rate": 4.4401776194834613e-05, "logits/chosen": -8.003349304199219, "logits/rejected": -7.750983238220215, "logps/chosen": -85.23335266113281, "logps/rejected": -467.602294921875, "loss": 0.1534, "rewards/accuracies": 1.0, "rewards/chosen": -4.3463134765625, "rewards/margins": 37.66285705566406, "rewards/rejected": -42.00917053222656, "step": 850 }, { "epoch": 0.6944104447164423, "grad_norm": 2.9535933663282776e-06, "learning_rate": 4.429029613413531e-05, "logits/chosen": -7.075855255126953, "logits/rejected": -6.969350814819336, "logps/chosen": -93.90969848632812, "logps/rejected": -419.8570556640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.640460968017578, "rewards/margins": 32.1026611328125, "rewards/rejected": -36.74311828613281, "step": 851 }, { "epoch": 0.6952264381884945, "grad_norm": 7.710588789677786e-08, "learning_rate": 4.4178844824708864e-05, "logits/chosen": -7.4922075271606445, "logits/rejected": -7.263906955718994, "logps/chosen": -121.39163208007812, "logps/rejected": -538.0001220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.651219844818115, "rewards/margins": 41.78778839111328, "rewards/rejected": -48.43901062011719, "step": 852 }, { "epoch": 0.6960424316605467, "grad_norm": 3.043614804632e-10, "learning_rate": 4.4067422827769464e-05, "logits/chosen": -7.650830268859863, "logits/rejected": -7.015204906463623, "logps/chosen": -168.35008239746094, "logps/rejected": -615.9232177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.127116203308105, "rewards/margins": 43.79975128173828, "rewards/rejected": -52.92686462402344, "step": 853 }, { "epoch": 0.696858425132599, "grad_norm": 0.056416623294353485, "learning_rate": 4.395603070438373e-05, "logits/chosen": -6.619339942932129, "logits/rejected": -6.843029975891113, "logps/chosen": -163.25120544433594, "logps/rejected": -560.0572509765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.531026840209961, "rewards/margins": 39.59891891479492, "rewards/rejected": -49.12994384765625, "step": 854 }, { "epoch": 0.6976744186046512, "grad_norm": 0.0005970962229184806, "learning_rate": 4.3844669015467864e-05, "logits/chosen": -7.395131587982178, "logits/rejected": -7.410223484039307, "logps/chosen": -116.5417251586914, "logps/rejected": -444.5098876953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.313534736633301, "rewards/margins": 33.982688903808594, "rewards/rejected": -39.29622268676758, "step": 855 }, { "epoch": 0.6984904120767034, "grad_norm": 0.25697022676467896, "learning_rate": 4.373333832178478e-05, "logits/chosen": -6.751603603363037, "logits/rejected": -6.251522541046143, "logps/chosen": -96.81986999511719, "logps/rejected": -482.9059753417969, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.8816299438476562, "rewards/margins": 37.79030990600586, "rewards/rejected": -41.671939849853516, "step": 856 }, { "epoch": 0.6993064055487556, "grad_norm": 2.162765042612591e-07, "learning_rate": 4.362203918394138e-05, "logits/chosen": -5.467316627502441, "logits/rejected": -5.970155239105225, "logps/chosen": -81.72679138183594, "logps/rejected": -450.90008544921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.2915961742401123, "rewards/margins": 38.56409454345703, "rewards/rejected": -39.855690002441406, "step": 857 }, { "epoch": 0.7001223990208079, "grad_norm": 1.6183012974124722e-07, "learning_rate": 4.3510772162385574e-05, "logits/chosen": -6.439509868621826, "logits/rejected": -6.519394397735596, "logps/chosen": -102.72280883789062, "logps/rejected": -520.75537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.9156742095947266, "rewards/margins": 42.454132080078125, "rewards/rejected": -45.369808197021484, "step": 858 }, { "epoch": 0.70093839249286, "grad_norm": 7.718450618396475e-16, "learning_rate": 4.339953781740363e-05, "logits/chosen": -6.819568634033203, "logits/rejected": -6.148873329162598, "logps/chosen": -97.97677612304688, "logps/rejected": -597.8478393554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.106709957122803, "rewards/margins": 48.6968994140625, "rewards/rejected": -52.80360412597656, "step": 859 }, { "epoch": 0.7017543859649122, "grad_norm": 1.0258190030754122e-07, "learning_rate": 4.328833670911724e-05, "logits/chosen": -7.423059463500977, "logits/rejected": -7.033263206481934, "logps/chosen": -94.74369812011719, "logps/rejected": -427.464111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.018950939178467, "rewards/margins": 33.13215255737305, "rewards/rejected": -38.151100158691406, "step": 860 }, { "epoch": 0.7025703794369645, "grad_norm": 0.0078487703576684, "learning_rate": 4.31771693974807e-05, "logits/chosen": -6.875002861022949, "logits/rejected": -7.20584774017334, "logps/chosen": -99.48585510253906, "logps/rejected": -569.9276733398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.53353214263916, "rewards/margins": 45.569583892822266, "rewards/rejected": -50.10311508178711, "step": 861 }, { "epoch": 0.7033863729090167, "grad_norm": 9.415711588189879e-08, "learning_rate": 4.3066036442278215e-05, "logits/chosen": -6.094510078430176, "logits/rejected": -6.225366592407227, "logps/chosen": -133.09552001953125, "logps/rejected": -463.66455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.175508499145508, "rewards/margins": 33.288639068603516, "rewards/rejected": -39.464149475097656, "step": 862 }, { "epoch": 0.7042023663810689, "grad_norm": 0.0006088624359108508, "learning_rate": 4.295493840312087e-05, "logits/chosen": -6.512539863586426, "logits/rejected": -6.893945693969727, "logps/chosen": -121.4056396484375, "logps/rejected": -422.765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.167640686035156, "rewards/margins": 30.20742416381836, "rewards/rejected": -38.37506103515625, "step": 863 }, { "epoch": 0.7050183598531212, "grad_norm": 1.0513620907204313e-07, "learning_rate": 4.284387583944403e-05, "logits/chosen": -6.056107521057129, "logits/rejected": -6.319377899169922, "logps/chosen": -95.38180541992188, "logps/rejected": -454.2421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.513369560241699, "rewards/margins": 36.397098541259766, "rewards/rejected": -39.91046905517578, "step": 864 }, { "epoch": 0.7058343533251734, "grad_norm": 5.997489397474576e-12, "learning_rate": 4.273284931050438e-05, "logits/chosen": -7.040292263031006, "logits/rejected": -7.341108322143555, "logps/chosen": -99.12557983398438, "logps/rejected": -460.82318115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.080049514770508, "rewards/margins": 37.24843978881836, "rewards/rejected": -40.328487396240234, "step": 865 }, { "epoch": 0.7066503467972256, "grad_norm": 1.4423278116737492e-05, "learning_rate": 4.2621859375377125e-05, "logits/chosen": -5.379884719848633, "logits/rejected": -5.387287616729736, "logps/chosen": -102.00119018554688, "logps/rejected": -458.8790588378906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.7066526412963867, "rewards/margins": 35.39707565307617, "rewards/rejected": -39.103729248046875, "step": 866 }, { "epoch": 0.7074663402692778, "grad_norm": 0.08894829452037811, "learning_rate": 4.251090659295326e-05, "logits/chosen": -6.650407791137695, "logits/rejected": -5.918703079223633, "logps/chosen": -96.23146057128906, "logps/rejected": -474.71307373046875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.406583309173584, "rewards/margins": 36.41285705566406, "rewards/rejected": -40.819435119628906, "step": 867 }, { "epoch": 0.7082823337413301, "grad_norm": 0.0006731871399097145, "learning_rate": 4.239999152193664e-05, "logits/chosen": -6.670350551605225, "logits/rejected": -6.667930603027344, "logps/chosen": -98.22769165039062, "logps/rejected": -466.30560302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.542698621749878, "rewards/margins": 37.4851188659668, "rewards/rejected": -41.02781677246094, "step": 868 }, { "epoch": 0.7090983272133823, "grad_norm": 7.4679219324025325e-06, "learning_rate": 4.228911472084129e-05, "logits/chosen": -6.146826267242432, "logits/rejected": -6.093161106109619, "logps/chosen": -89.6170883178711, "logps/rejected": -567.0143432617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.710592269897461, "rewards/margins": 48.04198455810547, "rewards/rejected": -49.7525749206543, "step": 869 }, { "epoch": 0.7099143206854345, "grad_norm": 1.0739873879117567e-08, "learning_rate": 4.2178276747988446e-05, "logits/chosen": -6.696559429168701, "logits/rejected": -6.709953308105469, "logps/chosen": -118.65386199951172, "logps/rejected": -510.9305419921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.372931003570557, "rewards/margins": 37.52901077270508, "rewards/rejected": -43.901947021484375, "step": 870 }, { "epoch": 0.7107303141574868, "grad_norm": 1.535057344881352e-05, "learning_rate": 4.206747816150392e-05, "logits/chosen": -6.4736175537109375, "logits/rejected": -5.734258651733398, "logps/chosen": -88.35476684570312, "logps/rejected": -431.44244384765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.115384817123413, "rewards/margins": 34.550296783447266, "rewards/rejected": -37.66568374633789, "step": 871 }, { "epoch": 0.711546307629539, "grad_norm": 2.2037198732505203e-07, "learning_rate": 4.195671951931509e-05, "logits/chosen": -6.336493015289307, "logits/rejected": -6.152273654937744, "logps/chosen": -78.913330078125, "logps/rejected": -450.0062255859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.931671142578125, "rewards/margins": 37.114288330078125, "rewards/rejected": -39.04595947265625, "step": 872 }, { "epoch": 0.7123623011015912, "grad_norm": 2.69335642677504e-09, "learning_rate": 4.1846001379148246e-05, "logits/chosen": -6.465977668762207, "logits/rejected": -6.654810905456543, "logps/chosen": -123.09996032714844, "logps/rejected": -451.44024658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.792114734649658, "rewards/margins": 33.90153503417969, "rewards/rejected": -39.69365310668945, "step": 873 }, { "epoch": 0.7131782945736435, "grad_norm": 7.265834511827052e-08, "learning_rate": 4.173532429852576e-05, "logits/chosen": -6.290567398071289, "logits/rejected": -6.016059398651123, "logps/chosen": -80.01792907714844, "logps/rejected": -510.28839111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.340167999267578, "rewards/margins": 42.79618835449219, "rewards/rejected": -45.1363525390625, "step": 874 }, { "epoch": 0.7139942880456956, "grad_norm": 4.880840265286679e-07, "learning_rate": 4.162468883476319e-05, "logits/chosen": -5.11698055267334, "logits/rejected": -5.5777387619018555, "logps/chosen": -103.00265502929688, "logps/rejected": -430.10784912109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9957778453826904, "rewards/margins": 33.6544075012207, "rewards/rejected": -37.650184631347656, "step": 875 }, { "epoch": 0.7148102815177478, "grad_norm": 0.202168807387352, "learning_rate": 4.151409554496656e-05, "logits/chosen": -6.268181324005127, "logits/rejected": -5.9254469871521, "logps/chosen": -95.7607192993164, "logps/rejected": -376.79046630859375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.372523307800293, "rewards/margins": 30.614748001098633, "rewards/rejected": -33.987274169921875, "step": 876 }, { "epoch": 0.7156262749898, "grad_norm": 8.427507025565717e-10, "learning_rate": 4.140354498602952e-05, "logits/chosen": -6.424466133117676, "logits/rejected": -7.27536678314209, "logps/chosen": -55.06889724731445, "logps/rejected": -481.1366882324219, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.2474559247493744, "rewards/margins": 42.588287353515625, "rewards/rejected": -42.34082794189453, "step": 877 }, { "epoch": 0.7164422684618523, "grad_norm": 1.924158823385369e-05, "learning_rate": 4.129303771463057e-05, "logits/chosen": -6.21418571472168, "logits/rejected": -6.7632951736450195, "logps/chosen": -116.23712158203125, "logps/rejected": -506.014892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.521279335021973, "rewards/margins": 38.94368362426758, "rewards/rejected": -44.464962005615234, "step": 878 }, { "epoch": 0.7172582619339045, "grad_norm": 8.078031896729954e-06, "learning_rate": 4.1182574287230224e-05, "logits/chosen": -7.022758960723877, "logits/rejected": -7.237202167510986, "logps/chosen": -88.9845199584961, "logps/rejected": -402.3362121582031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.17106819152832, "rewards/margins": 29.17300033569336, "rewards/rejected": -33.34407043457031, "step": 879 }, { "epoch": 0.7180742554059567, "grad_norm": 8.44977954272963e-09, "learning_rate": 4.107215526006817e-05, "logits/chosen": -6.238371849060059, "logits/rejected": -6.039397239685059, "logps/chosen": -107.80701446533203, "logps/rejected": -523.022216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.791708469390869, "rewards/margins": 40.48551940917969, "rewards/rejected": -45.27722930908203, "step": 880 }, { "epoch": 0.718890248878009, "grad_norm": 3.7642264942405745e-05, "learning_rate": 4.09617811891606e-05, "logits/chosen": -6.946268081665039, "logits/rejected": -6.117950916290283, "logps/chosen": -120.19419860839844, "logps/rejected": -423.740966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.860569953918457, "rewards/margins": 30.673368453979492, "rewards/rejected": -37.533939361572266, "step": 881 }, { "epoch": 0.7197062423500612, "grad_norm": 6.818916881456971e-05, "learning_rate": 4.085145263029726e-05, "logits/chosen": -6.302319526672363, "logits/rejected": -6.048935413360596, "logps/chosen": -117.46604919433594, "logps/rejected": -403.59832763671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.921346187591553, "rewards/margins": 29.40916633605957, "rewards/rejected": -35.33051300048828, "step": 882 }, { "epoch": 0.7205222358221134, "grad_norm": 9.561740625940729e-06, "learning_rate": 4.07411701390388e-05, "logits/chosen": -6.840497970581055, "logits/rejected": -7.067358016967773, "logps/chosen": -95.26100158691406, "logps/rejected": -404.50811767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.245852947235107, "rewards/margins": 31.50286865234375, "rewards/rejected": -35.748722076416016, "step": 883 }, { "epoch": 0.7213382292941657, "grad_norm": 0.02595667541027069, "learning_rate": 4.063093427071376e-05, "logits/chosen": -5.3986053466796875, "logits/rejected": -6.524201393127441, "logps/chosen": -129.4342498779297, "logps/rejected": -516.4373779296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.490121364593506, "rewards/margins": 37.951168060302734, "rewards/rejected": -44.44129180908203, "step": 884 }, { "epoch": 0.7221542227662179, "grad_norm": 7.650464795005973e-06, "learning_rate": 4.052074558041608e-05, "logits/chosen": -6.407112121582031, "logits/rejected": -6.20591402053833, "logps/chosen": -108.1146011352539, "logps/rejected": -446.91229248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.889533996582031, "rewards/margins": 33.90983581542969, "rewards/rejected": -38.79936981201172, "step": 885 }, { "epoch": 0.7229702162382701, "grad_norm": 0.00195497227832675, "learning_rate": 4.0410604623002e-05, "logits/chosen": -6.011791229248047, "logits/rejected": -6.134161472320557, "logps/chosen": -95.32693481445312, "logps/rejected": -414.846435546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.256114959716797, "rewards/margins": 31.399782180786133, "rewards/rejected": -35.6558952331543, "step": 886 }, { "epoch": 0.7237862097103224, "grad_norm": 3.2429720704385545e-06, "learning_rate": 4.0300511953087464e-05, "logits/chosen": -6.788949012756348, "logits/rejected": -6.580812454223633, "logps/chosen": -100.53335571289062, "logps/rejected": -377.67352294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.421197891235352, "rewards/margins": 28.677383422851562, "rewards/rejected": -33.09857940673828, "step": 887 }, { "epoch": 0.7246022031823746, "grad_norm": 4.951248047291301e-05, "learning_rate": 4.019046812504526e-05, "logits/chosen": -7.476909160614014, "logits/rejected": -6.847577095031738, "logps/chosen": -83.67095184326172, "logps/rejected": -482.6685791015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.3477773666381836, "rewards/margins": 40.61328887939453, "rewards/rejected": -42.96106719970703, "step": 888 }, { "epoch": 0.7254181966544268, "grad_norm": 1.9367826098459773e-05, "learning_rate": 4.008047369300218e-05, "logits/chosen": -6.544079780578613, "logits/rejected": -6.219208240509033, "logps/chosen": -162.87115478515625, "logps/rejected": -485.5908508300781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.101906776428223, "rewards/margins": 32.40758514404297, "rewards/rejected": -43.509490966796875, "step": 889 }, { "epoch": 0.7262341901264789, "grad_norm": 5.3919344683572845e-11, "learning_rate": 3.9970529210836366e-05, "logits/chosen": -5.744133472442627, "logits/rejected": -6.545658588409424, "logps/chosen": -89.33573150634766, "logps/rejected": -488.997314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.340379476547241, "rewards/margins": 38.56201934814453, "rewards/rejected": -41.902400970458984, "step": 890 }, { "epoch": 0.7270501835985312, "grad_norm": 1.0119215188009179e-10, "learning_rate": 3.986063523217439e-05, "logits/chosen": -6.763413429260254, "logits/rejected": -7.1626739501953125, "logps/chosen": -102.3887939453125, "logps/rejected": -506.45208740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.6166300773620605, "rewards/margins": 41.18091583251953, "rewards/rejected": -45.797550201416016, "step": 891 }, { "epoch": 0.7278661770705834, "grad_norm": 2.567889225701947e-07, "learning_rate": 3.9750792310388485e-05, "logits/chosen": -6.337133407592773, "logits/rejected": -6.714809894561768, "logps/chosen": -148.62257385253906, "logps/rejected": -505.3175354003906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.602746963500977, "rewards/margins": 35.59080505371094, "rewards/rejected": -45.19355010986328, "step": 892 }, { "epoch": 0.7286821705426356, "grad_norm": 3.334962173084932e-07, "learning_rate": 3.964100099859387e-05, "logits/chosen": -6.953942775726318, "logits/rejected": -6.88776969909668, "logps/chosen": -105.53307342529297, "logps/rejected": -492.0090637207031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.628922462463379, "rewards/margins": 36.62535095214844, "rewards/rejected": -43.2542724609375, "step": 893 }, { "epoch": 0.7294981640146879, "grad_norm": 1.1529420589795336e-05, "learning_rate": 3.953126184964577e-05, "logits/chosen": -7.395512104034424, "logits/rejected": -7.1778717041015625, "logps/chosen": -122.02226257324219, "logps/rejected": -521.3729248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.438019752502441, "rewards/margins": 39.197021484375, "rewards/rejected": -45.635040283203125, "step": 894 }, { "epoch": 0.7303141574867401, "grad_norm": 3.492830558116111e-08, "learning_rate": 3.9421575416136866e-05, "logits/chosen": -7.8807878494262695, "logits/rejected": -6.708286285400391, "logps/chosen": -147.91139221191406, "logps/rejected": -505.39642333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.824249267578125, "rewards/margins": 36.66569137573242, "rewards/rejected": -44.48994064331055, "step": 895 }, { "epoch": 0.7311301509587923, "grad_norm": 2.4081399985220742e-08, "learning_rate": 3.9311942250394276e-05, "logits/chosen": -5.608738899230957, "logits/rejected": -6.58112907409668, "logps/chosen": -92.02583312988281, "logps/rejected": -511.8350830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.050960540771484, "rewards/margins": 41.134559631347656, "rewards/rejected": -45.185516357421875, "step": 896 }, { "epoch": 0.7319461444308446, "grad_norm": 0.0034711207263171673, "learning_rate": 3.920236290447698e-05, "logits/chosen": -6.728615760803223, "logits/rejected": -7.2830095291137695, "logps/chosen": -132.10470581054688, "logps/rejected": -433.56060791015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.112365245819092, "rewards/margins": 30.713449478149414, "rewards/rejected": -36.8258171081543, "step": 897 }, { "epoch": 0.7327621379028968, "grad_norm": 1.493803275520733e-10, "learning_rate": 3.9092837930172884e-05, "logits/chosen": -6.587739944458008, "logits/rejected": -6.896650791168213, "logps/chosen": -113.66299438476562, "logps/rejected": -562.9949951171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.519837379455566, "rewards/margins": 44.81891632080078, "rewards/rejected": -50.3387565612793, "step": 898 }, { "epoch": 0.733578131374949, "grad_norm": 0.000774801301304251, "learning_rate": 3.898336787899612e-05, "logits/chosen": -7.429217338562012, "logits/rejected": -7.242953300476074, "logps/chosen": -96.09693145751953, "logps/rejected": -437.2388916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.31473445892334, "rewards/margins": 33.479888916015625, "rewards/rejected": -37.79462432861328, "step": 899 }, { "epoch": 0.7343941248470012, "grad_norm": 1.0266814065573726e-08, "learning_rate": 3.887395330218429e-05, "logits/chosen": -7.193870544433594, "logits/rejected": -6.417882919311523, "logps/chosen": -134.14715576171875, "logps/rejected": -608.3157958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.6384735107421875, "rewards/margins": 46.5177001953125, "rewards/rejected": -54.15617752075195, "step": 900 }, { "epoch": 0.7352101183190535, "grad_norm": 8.52175716659076e-08, "learning_rate": 3.8764594750695596e-05, "logits/chosen": -6.079996585845947, "logits/rejected": -6.04316520690918, "logps/chosen": -134.93020629882812, "logps/rejected": -563.1721801757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.334725379943848, "rewards/margins": 43.48246765136719, "rewards/rejected": -51.81718826293945, "step": 901 }, { "epoch": 0.7360261117911057, "grad_norm": 1.171654048448545e-06, "learning_rate": 3.865529277520619e-05, "logits/chosen": -6.879838466644287, "logits/rejected": -7.405745983123779, "logps/chosen": -107.6246337890625, "logps/rejected": -439.4381103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.358441352844238, "rewards/margins": 33.06073760986328, "rewards/rejected": -39.41918182373047, "step": 902 }, { "epoch": 0.7368421052631579, "grad_norm": 4.849014294450171e-05, "learning_rate": 3.8546047926107256e-05, "logits/chosen": -5.8017072677612305, "logits/rejected": -6.664381980895996, "logps/chosen": -134.54803466796875, "logps/rejected": -468.52679443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.771470546722412, "rewards/margins": 33.89118957519531, "rewards/rejected": -41.66265869140625, "step": 903 }, { "epoch": 0.7376580987352102, "grad_norm": 6.723491969751194e-05, "learning_rate": 3.84368607535024e-05, "logits/chosen": -6.042647838592529, "logits/rejected": -6.19908332824707, "logps/chosen": -115.90914916992188, "logps/rejected": -509.79034423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.896599292755127, "rewards/margins": 39.01707458496094, "rewards/rejected": -44.913673400878906, "step": 904 }, { "epoch": 0.7384740922072623, "grad_norm": 5.1262176725686004e-08, "learning_rate": 3.832773180720475e-05, "logits/chosen": -6.3213605880737305, "logits/rejected": -6.588250637054443, "logps/chosen": -137.34402465820312, "logps/rejected": -548.251220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.239797592163086, "rewards/margins": 39.091976165771484, "rewards/rejected": -48.33177185058594, "step": 905 }, { "epoch": 0.7392900856793145, "grad_norm": 1.1526545939943844e-08, "learning_rate": 3.821866163673421e-05, "logits/chosen": -6.193759918212891, "logits/rejected": -6.491817474365234, "logps/chosen": -138.96533203125, "logps/rejected": -620.6439208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.891239643096924, "rewards/margins": 46.65751647949219, "rewards/rejected": -54.54875946044922, "step": 906 }, { "epoch": 0.7401060791513668, "grad_norm": 4.470444707749266e-07, "learning_rate": 3.810965079131479e-05, "logits/chosen": -5.231653213500977, "logits/rejected": -6.007192134857178, "logps/chosen": -85.00543212890625, "logps/rejected": -445.06597900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5302791595458984, "rewards/margins": 35.40717315673828, "rewards/rejected": -38.93745040893555, "step": 907 }, { "epoch": 0.740922072623419, "grad_norm": 5.430155738395115e-07, "learning_rate": 3.8000699819871705e-05, "logits/chosen": -7.195486068725586, "logits/rejected": -7.078901767730713, "logps/chosen": -91.27706909179688, "logps/rejected": -513.847900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.050506591796875, "rewards/margins": 39.82109451293945, "rewards/rejected": -44.87160110473633, "step": 908 }, { "epoch": 0.7417380660954712, "grad_norm": 1.314502066224188e-11, "learning_rate": 3.789180927102872e-05, "logits/chosen": -7.6177897453308105, "logits/rejected": -7.639970779418945, "logps/chosen": -132.4476318359375, "logps/rejected": -570.739013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.065996170043945, "rewards/margins": 43.31053161621094, "rewards/rejected": -51.37653350830078, "step": 909 }, { "epoch": 0.7425540595675234, "grad_norm": 2.5250845858693083e-09, "learning_rate": 3.778297969310529e-05, "logits/chosen": -6.821011543273926, "logits/rejected": -6.454713821411133, "logps/chosen": -97.4455337524414, "logps/rejected": -498.6915283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.989928722381592, "rewards/margins": 39.97600173950195, "rewards/rejected": -42.9659309387207, "step": 910 }, { "epoch": 0.7433700530395757, "grad_norm": 3.534987413900126e-09, "learning_rate": 3.7674211634113926e-05, "logits/chosen": -6.024805068969727, "logits/rejected": -5.555134296417236, "logps/chosen": -104.98849487304688, "logps/rejected": -469.79718017578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.038840293884277, "rewards/margins": 37.03614044189453, "rewards/rejected": -42.074981689453125, "step": 911 }, { "epoch": 0.7441860465116279, "grad_norm": 0.0006416201940737665, "learning_rate": 3.756550564175727e-05, "logits/chosen": -6.1395487785339355, "logits/rejected": -6.831380844116211, "logps/chosen": -159.76773071289062, "logps/rejected": -498.30902099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.348998069763184, "rewards/margins": 34.71168518066406, "rewards/rejected": -44.0606803894043, "step": 912 }, { "epoch": 0.7450020399836801, "grad_norm": 3.0600444400619153e-10, "learning_rate": 3.745686226342547e-05, "logits/chosen": -7.09568977355957, "logits/rejected": -6.93174409866333, "logps/chosen": -112.81098937988281, "logps/rejected": -512.4292602539062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.672760963439941, "rewards/margins": 38.862060546875, "rewards/rejected": -44.53481674194336, "step": 913 }, { "epoch": 0.7458180334557324, "grad_norm": 1.310369680140866e-05, "learning_rate": 3.734828204619342e-05, "logits/chosen": -6.775852203369141, "logits/rejected": -7.099741458892822, "logps/chosen": -106.41658782958984, "logps/rejected": -471.3017578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.385287284851074, "rewards/margins": 36.888492584228516, "rewards/rejected": -41.273780822753906, "step": 914 }, { "epoch": 0.7466340269277846, "grad_norm": 1.332505550344365e-09, "learning_rate": 3.723976553681787e-05, "logits/chosen": -6.098475456237793, "logits/rejected": -6.86149787902832, "logps/chosen": -106.17916870117188, "logps/rejected": -591.1338500976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.460769176483154, "rewards/margins": 42.769203186035156, "rewards/rejected": -48.2299690246582, "step": 915 }, { "epoch": 0.7474500203998368, "grad_norm": 0.025519516319036484, "learning_rate": 3.713131328173489e-05, "logits/chosen": -6.184087753295898, "logits/rejected": -6.6484479904174805, "logps/chosen": -135.18014526367188, "logps/rejected": -523.1878051757812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.800558090209961, "rewards/margins": 39.67253875732422, "rewards/rejected": -47.47309494018555, "step": 916 }, { "epoch": 0.7482660138718891, "grad_norm": 0.0007377600995823741, "learning_rate": 3.7022925827056884e-05, "logits/chosen": -5.639071941375732, "logits/rejected": -6.163357257843018, "logps/chosen": -138.92127990722656, "logps/rejected": -389.41278076171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.604091644287109, "rewards/margins": 26.92858123779297, "rewards/rejected": -34.53266906738281, "step": 917 }, { "epoch": 0.7490820073439413, "grad_norm": 1.8063861517703117e-10, "learning_rate": 3.6914603718569995e-05, "logits/chosen": -6.0565619468688965, "logits/rejected": -6.171745300292969, "logps/chosen": -150.50352478027344, "logps/rejected": -599.7535400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.533790588378906, "rewards/margins": 45.6572380065918, "rewards/rejected": -54.19102478027344, "step": 918 }, { "epoch": 0.7498980008159934, "grad_norm": 1.7737984308041632e-06, "learning_rate": 3.680634750173137e-05, "logits/chosen": -6.384346961975098, "logits/rejected": -6.631587982177734, "logps/chosen": -104.18423461914062, "logps/rejected": -478.33575439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.047684669494629, "rewards/margins": 38.0599250793457, "rewards/rejected": -43.107608795166016, "step": 919 }, { "epoch": 0.7507139942880457, "grad_norm": 3.43396178159594e-09, "learning_rate": 3.6698157721666246e-05, "logits/chosen": -6.173569679260254, "logits/rejected": -6.073760986328125, "logps/chosen": -98.73519897460938, "logps/rejected": -446.0335693359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.134438991546631, "rewards/margins": 34.805091857910156, "rewards/rejected": -39.93953323364258, "step": 920 }, { "epoch": 0.7515299877600979, "grad_norm": 9.345766348312079e-13, "learning_rate": 3.659003492316543e-05, "logits/chosen": -5.787993431091309, "logits/rejected": -6.606307029724121, "logps/chosen": -133.37295532226562, "logps/rejected": -547.9935913085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.392078876495361, "rewards/margins": 43.40110397338867, "rewards/rejected": -49.793182373046875, "step": 921 }, { "epoch": 0.7523459812321501, "grad_norm": 0.000674067938234657, "learning_rate": 3.648197965068235e-05, "logits/chosen": -5.8972907066345215, "logits/rejected": -6.3043999671936035, "logps/chosen": -116.31842041015625, "logps/rejected": -370.31427001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.022809028625488, "rewards/margins": 26.329673767089844, "rewards/rejected": -33.352481842041016, "step": 922 }, { "epoch": 0.7531619747042023, "grad_norm": 0.0034074028953909874, "learning_rate": 3.6373992448330486e-05, "logits/chosen": -6.087894916534424, "logits/rejected": -6.696368217468262, "logps/chosen": -144.45420837402344, "logps/rejected": -568.333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.847084999084473, "rewards/margins": 41.63330841064453, "rewards/rejected": -49.48039627075195, "step": 923 }, { "epoch": 0.7539779681762546, "grad_norm": 5.726737413169758e-07, "learning_rate": 3.62660738598805e-05, "logits/chosen": -6.515054702758789, "logits/rejected": -6.088358402252197, "logps/chosen": -102.38128662109375, "logps/rejected": -475.35107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.950681209564209, "rewards/margins": 36.19157409667969, "rewards/rejected": -41.142250061035156, "step": 924 }, { "epoch": 0.7547939616483068, "grad_norm": 0.03446212783455849, "learning_rate": 3.6158224428757535e-05, "logits/chosen": -6.853287696838379, "logits/rejected": -7.364847183227539, "logps/chosen": -159.63449096679688, "logps/rejected": -520.2529907226562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.339559555053711, "rewards/margins": 35.07750701904297, "rewards/rejected": -45.41706848144531, "step": 925 }, { "epoch": 0.755609955120359, "grad_norm": 4.564771737686213e-12, "learning_rate": 3.605044469803854e-05, "logits/chosen": -7.497727870941162, "logits/rejected": -7.44624137878418, "logps/chosen": -104.48017883300781, "logps/rejected": -524.0081787109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.42478084564209, "rewards/margins": 43.50041198730469, "rewards/rejected": -46.925201416015625, "step": 926 }, { "epoch": 0.7564259485924113, "grad_norm": 1.4907300283084624e-05, "learning_rate": 3.594273521044945e-05, "logits/chosen": -7.409976959228516, "logits/rejected": -7.010684013366699, "logps/chosen": -94.30085754394531, "logps/rejected": -411.600830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5010626316070557, "rewards/margins": 31.78411865234375, "rewards/rejected": -35.285179138183594, "step": 927 }, { "epoch": 0.7572419420644635, "grad_norm": 0.00037591351429000497, "learning_rate": 3.5835096508362545e-05, "logits/chosen": -7.014759063720703, "logits/rejected": -6.60964298248291, "logps/chosen": -131.0952911376953, "logps/rejected": -512.6124267578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.673942565917969, "rewards/margins": 37.365074157714844, "rewards/rejected": -45.03900909423828, "step": 928 }, { "epoch": 0.7580579355365157, "grad_norm": 1.8412013233493063e-11, "learning_rate": 3.5727529133793535e-05, "logits/chosen": -6.309494495391846, "logits/rejected": -6.224315643310547, "logps/chosen": -125.03514099121094, "logps/rejected": -630.395751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.508886814117432, "rewards/margins": 49.76074981689453, "rewards/rejected": -56.26963806152344, "step": 929 }, { "epoch": 0.758873929008568, "grad_norm": 3.852505467416556e-11, "learning_rate": 3.562003362839914e-05, "logits/chosen": -6.60986328125, "logits/rejected": -6.245790004730225, "logps/chosen": -156.88491821289062, "logps/rejected": -545.2890014648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.818694114685059, "rewards/margins": 38.67414093017578, "rewards/rejected": -48.49283218383789, "step": 930 }, { "epoch": 0.7596899224806202, "grad_norm": 2.4641495954824677e-08, "learning_rate": 3.551261053347404e-05, "logits/chosen": -5.809235095977783, "logits/rejected": -6.329345226287842, "logps/chosen": -127.83232116699219, "logps/rejected": -463.9982604980469, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.119894027709961, "rewards/margins": 33.346370697021484, "rewards/rejected": -40.46626281738281, "step": 931 }, { "epoch": 0.7605059159526724, "grad_norm": 0.001204081461764872, "learning_rate": 3.5405260389948336e-05, "logits/chosen": -5.520548343658447, "logits/rejected": -6.623941898345947, "logps/chosen": -98.35289001464844, "logps/rejected": -565.7600708007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.137250900268555, "rewards/margins": 45.51465606689453, "rewards/rejected": -49.65190505981445, "step": 932 }, { "epoch": 0.7613219094247246, "grad_norm": 8.845460713717557e-13, "learning_rate": 3.529798373838481e-05, "logits/chosen": -6.518190860748291, "logits/rejected": -6.40873384475708, "logps/chosen": -129.38320922851562, "logps/rejected": -595.0855712890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.846559524536133, "rewards/margins": 47.65790557861328, "rewards/rejected": -53.50446701049805, "step": 933 }, { "epoch": 0.7621379028967769, "grad_norm": 8.066057489486411e-05, "learning_rate": 3.519078111897612e-05, "logits/chosen": -7.516792297363281, "logits/rejected": -6.79666805267334, "logps/chosen": -115.80352783203125, "logps/rejected": -436.8191833496094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.292628288269043, "rewards/margins": 30.08831787109375, "rewards/rejected": -37.380943298339844, "step": 934 }, { "epoch": 0.762953896368829, "grad_norm": 9.687690649684555e-10, "learning_rate": 3.50836530715422e-05, "logits/chosen": -7.0116658210754395, "logits/rejected": -7.082104206085205, "logps/chosen": -81.82879638671875, "logps/rejected": -516.3549194335938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.3657126426696777, "rewards/margins": 43.27186965942383, "rewards/rejected": -45.6375846862793, "step": 935 }, { "epoch": 0.7637698898408812, "grad_norm": 5.774102191935526e-06, "learning_rate": 3.4976600135527406e-05, "logits/chosen": -5.590629577636719, "logits/rejected": -6.1795654296875, "logps/chosen": -162.7818145751953, "logps/rejected": -467.62176513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.811158180236816, "rewards/margins": 30.590129852294922, "rewards/rejected": -40.40129089355469, "step": 936 }, { "epoch": 0.7645858833129335, "grad_norm": 6.719640310848263e-08, "learning_rate": 3.4869622849997924e-05, "logits/chosen": -6.435369968414307, "logits/rejected": -6.075799942016602, "logps/chosen": -89.81330108642578, "logps/rejected": -437.46673583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.761168003082275, "rewards/margins": 34.80940628051758, "rewards/rejected": -39.57057571411133, "step": 937 }, { "epoch": 0.7654018767849857, "grad_norm": 3.5575556012190646e-06, "learning_rate": 3.4762721753638995e-05, "logits/chosen": -7.256749153137207, "logits/rejected": -7.435452938079834, "logps/chosen": -127.18892669677734, "logps/rejected": -533.9754638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.837247848510742, "rewards/margins": 40.4743766784668, "rewards/rejected": -47.311622619628906, "step": 938 }, { "epoch": 0.7662178702570379, "grad_norm": 1.2851942621239232e-08, "learning_rate": 3.4655897384752146e-05, "logits/chosen": -7.355846405029297, "logits/rejected": -6.64986515045166, "logps/chosen": -118.23406219482422, "logps/rejected": -492.4130859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.215798854827881, "rewards/margins": 37.74570846557617, "rewards/rejected": -43.96150588989258, "step": 939 }, { "epoch": 0.7670338637290902, "grad_norm": 1.3217442695889758e-09, "learning_rate": 3.4549150281252636e-05, "logits/chosen": -7.672344207763672, "logits/rejected": -7.440163612365723, "logps/chosen": -107.23051452636719, "logps/rejected": -498.4793701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.1124587059021, "rewards/margins": 37.737464904785156, "rewards/rejected": -43.84992599487305, "step": 940 }, { "epoch": 0.7678498572011424, "grad_norm": 4.115680596328275e-08, "learning_rate": 3.4442480980666584e-05, "logits/chosen": -7.239082336425781, "logits/rejected": -7.135476112365723, "logps/chosen": -99.5657958984375, "logps/rejected": -521.332763671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.078187465667725, "rewards/margins": 41.213687896728516, "rewards/rejected": -46.2918701171875, "step": 941 }, { "epoch": 0.7686658506731946, "grad_norm": 2.3787335123870434e-07, "learning_rate": 3.433589002012838e-05, "logits/chosen": -6.7907609939575195, "logits/rejected": -6.807424068450928, "logps/chosen": -133.96127319335938, "logps/rejected": -620.9345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.054166316986084, "rewards/margins": 48.54234313964844, "rewards/rejected": -54.59651184082031, "step": 942 }, { "epoch": 0.7694818441452468, "grad_norm": 4.4496684381556406e-07, "learning_rate": 3.422937793637788e-05, "logits/chosen": -6.3998517990112305, "logits/rejected": -6.586563587188721, "logps/chosen": -94.34501647949219, "logps/rejected": -483.2471923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.42890739440918, "rewards/margins": 37.16135025024414, "rewards/rejected": -41.59025573730469, "step": 943 }, { "epoch": 0.7702978376172991, "grad_norm": 3.174153162532889e-10, "learning_rate": 3.412294526575779e-05, "logits/chosen": -6.859379768371582, "logits/rejected": -6.83840274810791, "logps/chosen": -75.93003845214844, "logps/rejected": -450.86553955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.940333843231201, "rewards/margins": 36.128929138183594, "rewards/rejected": -40.06926345825195, "step": 944 }, { "epoch": 0.7711138310893513, "grad_norm": 2.1513804071560116e-09, "learning_rate": 3.401659254421094e-05, "logits/chosen": -6.483255863189697, "logits/rejected": -6.337862968444824, "logps/chosen": -145.68348693847656, "logps/rejected": -549.2694091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.5956449508667, "rewards/margins": 40.36219024658203, "rewards/rejected": -48.95783233642578, "step": 945 }, { "epoch": 0.7719298245614035, "grad_norm": 0.023736214265227318, "learning_rate": 3.391032030727752e-05, "logits/chosen": -7.629997730255127, "logits/rejected": -8.208882331848145, "logps/chosen": -101.29032897949219, "logps/rejected": -425.86932373046875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.4962306022644043, "rewards/margins": 34.241756439208984, "rewards/rejected": -37.73798751831055, "step": 946 }, { "epoch": 0.7727458180334558, "grad_norm": 4.755860283497601e-14, "learning_rate": 3.380412909009254e-05, "logits/chosen": -6.82168436050415, "logits/rejected": -7.650664329528809, "logps/chosen": -93.79022216796875, "logps/rejected": -630.237060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.2281100749969482, "rewards/margins": 53.63298797607422, "rewards/rejected": -55.86109161376953, "step": 947 }, { "epoch": 0.773561811505508, "grad_norm": 2.3236850665853126e-08, "learning_rate": 3.369801942738291e-05, "logits/chosen": -5.0344414710998535, "logits/rejected": -6.345734119415283, "logps/chosen": -103.65696716308594, "logps/rejected": -516.2890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.632599353790283, "rewards/margins": 41.615692138671875, "rewards/rejected": -45.248291015625, "step": 948 }, { "epoch": 0.7743778049775601, "grad_norm": 0.007932160049676895, "learning_rate": 3.3591991853464965e-05, "logits/chosen": -7.163247108459473, "logits/rejected": -7.182155609130859, "logps/chosen": -92.78787994384766, "logps/rejected": -449.15301513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.4433584213256836, "rewards/margins": 35.12727737426758, "rewards/rejected": -38.57063293457031, "step": 949 }, { "epoch": 0.7751937984496124, "grad_norm": 0.021269695833325386, "learning_rate": 3.3486046902241664e-05, "logits/chosen": -6.366323947906494, "logits/rejected": -6.549004077911377, "logps/chosen": -102.34263610839844, "logps/rejected": -478.5224914550781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.168724536895752, "rewards/margins": 36.67898178100586, "rewards/rejected": -40.84770584106445, "step": 950 }, { "epoch": 0.7760097919216646, "grad_norm": 1.3969322587570332e-08, "learning_rate": 3.3380185107199855e-05, "logits/chosen": -6.805169582366943, "logits/rejected": -6.781885147094727, "logps/chosen": -108.28457641601562, "logps/rejected": -478.0218505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.881062030792236, "rewards/margins": 35.88161849975586, "rewards/rejected": -41.76268005371094, "step": 951 }, { "epoch": 0.7768257853937168, "grad_norm": 0.012178542092442513, "learning_rate": 3.3274407001407735e-05, "logits/chosen": -7.708365440368652, "logits/rejected": -6.285226345062256, "logps/chosen": -93.84080505371094, "logps/rejected": -467.5010986328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.605701446533203, "rewards/margins": 34.44078063964844, "rewards/rejected": -39.04648208618164, "step": 952 }, { "epoch": 0.7776417788657691, "grad_norm": 6.389609397139395e-10, "learning_rate": 3.316871311751205e-05, "logits/chosen": -6.648227214813232, "logits/rejected": -6.536336898803711, "logps/chosen": -109.59049987792969, "logps/rejected": -523.794921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.354750156402588, "rewards/margins": 40.318260192871094, "rewards/rejected": -46.67300796508789, "step": 953 }, { "epoch": 0.7784577723378213, "grad_norm": 2.978370503115002e-06, "learning_rate": 3.3063103987735433e-05, "logits/chosen": -5.946878433227539, "logits/rejected": -7.102060317993164, "logps/chosen": -122.75942993164062, "logps/rejected": -542.4523315429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.705526351928711, "rewards/margins": 40.298492431640625, "rewards/rejected": -46.0040168762207, "step": 954 }, { "epoch": 0.7792737658098735, "grad_norm": 0.012402746826410294, "learning_rate": 3.295758014387375e-05, "logits/chosen": -6.669669151306152, "logits/rejected": -7.047648906707764, "logps/chosen": -71.7738265991211, "logps/rejected": -452.60113525390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.478639841079712, "rewards/margins": 38.259910583496094, "rewards/rejected": -40.738548278808594, "step": 955 }, { "epoch": 0.7800897592819257, "grad_norm": 1.6948457414400764e-05, "learning_rate": 3.2852142117293435e-05, "logits/chosen": -6.19246244430542, "logits/rejected": -7.199228763580322, "logps/chosen": -101.36763000488281, "logps/rejected": -399.01239013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.926088333129883, "rewards/margins": 29.726661682128906, "rewards/rejected": -35.652748107910156, "step": 956 }, { "epoch": 0.780905752753978, "grad_norm": 2.7363980872885385e-11, "learning_rate": 3.274679043892872e-05, "logits/chosen": -6.732827663421631, "logits/rejected": -6.98160982131958, "logps/chosen": -126.03718566894531, "logps/rejected": -610.712890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.886098861694336, "rewards/margins": 48.69449996948242, "rewards/rejected": -54.580596923828125, "step": 957 }, { "epoch": 0.7817217462260302, "grad_norm": 8.924232730578296e-08, "learning_rate": 3.264152563927908e-05, "logits/chosen": -7.042593955993652, "logits/rejected": -7.319014072418213, "logps/chosen": -112.72335052490234, "logps/rejected": -460.43634033203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.306236743927002, "rewards/margins": 35.3431282043457, "rewards/rejected": -41.64936447143555, "step": 958 }, { "epoch": 0.7825377396980824, "grad_norm": 1.5653628224754357e-06, "learning_rate": 3.2536348248406534e-05, "logits/chosen": -6.985728740692139, "logits/rejected": -7.003339767456055, "logps/chosen": -113.25697326660156, "logps/rejected": -424.4642333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.081225872039795, "rewards/margins": 33.40571594238281, "rewards/rejected": -37.4869384765625, "step": 959 }, { "epoch": 0.7833537331701347, "grad_norm": 6.82928594317872e-13, "learning_rate": 3.243125879593286e-05, "logits/chosen": -6.564444541931152, "logits/rejected": -7.053368091583252, "logps/chosen": -146.9222412109375, "logps/rejected": -648.1365966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.189029693603516, "rewards/margins": 49.522701263427734, "rewards/rejected": -57.71173095703125, "step": 960 }, { "epoch": 0.7841697266421869, "grad_norm": 2.0974051949451678e-05, "learning_rate": 3.2326257811037155e-05, "logits/chosen": -7.3510637283325195, "logits/rejected": -6.6294755935668945, "logps/chosen": -100.06434631347656, "logps/rejected": -407.53350830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.073235988616943, "rewards/margins": 29.532882690429688, "rewards/rejected": -35.60612106323242, "step": 961 }, { "epoch": 0.7849857201142391, "grad_norm": 3.496042691453205e-11, "learning_rate": 3.22213458224529e-05, "logits/chosen": -6.882901668548584, "logits/rejected": -7.231308937072754, "logps/chosen": -152.94586181640625, "logps/rejected": -569.5340576171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.725696563720703, "rewards/margins": 40.934303283691406, "rewards/rejected": -50.660003662109375, "step": 962 }, { "epoch": 0.7858017135862914, "grad_norm": 1.0118065802089404e-05, "learning_rate": 3.2116523358465535e-05, "logits/chosen": -6.777507781982422, "logits/rejected": -6.198722839355469, "logps/chosen": -115.82669830322266, "logps/rejected": -480.455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.850943565368652, "rewards/margins": 36.300926208496094, "rewards/rejected": -43.15187072753906, "step": 963 }, { "epoch": 0.7866177070583436, "grad_norm": 5.044822781741232e-09, "learning_rate": 3.201179094690967e-05, "logits/chosen": -5.925457954406738, "logits/rejected": -6.078361988067627, "logps/chosen": -137.1419677734375, "logps/rejected": -500.67144775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.357913017272949, "rewards/margins": 37.240516662597656, "rewards/rejected": -44.59843063354492, "step": 964 }, { "epoch": 0.7874337005303957, "grad_norm": 1.308863156745943e-10, "learning_rate": 3.19071491151664e-05, "logits/chosen": -6.441374778747559, "logits/rejected": -6.7258405685424805, "logps/chosen": -133.44125366210938, "logps/rejected": -503.4341735839844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.786650657653809, "rewards/margins": 36.22782897949219, "rewards/rejected": -43.01448059082031, "step": 965 }, { "epoch": 0.7882496940024479, "grad_norm": 6.813405661887373e-07, "learning_rate": 3.1802598390160784e-05, "logits/chosen": -6.900813102722168, "logits/rejected": -7.219237804412842, "logps/chosen": -124.44815063476562, "logps/rejected": -499.9249267578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.855707168579102, "rewards/margins": 38.88752365112305, "rewards/rejected": -43.74323272705078, "step": 966 }, { "epoch": 0.7890656874745002, "grad_norm": 3.2056141208158806e-05, "learning_rate": 3.169813929835907e-05, "logits/chosen": -6.064820766448975, "logits/rejected": -6.788723468780518, "logps/chosen": -121.23538208007812, "logps/rejected": -491.2615966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.462198257446289, "rewards/margins": 37.86302947998047, "rewards/rejected": -44.32522964477539, "step": 967 }, { "epoch": 0.7898816809465524, "grad_norm": 2.975171541422239e-12, "learning_rate": 3.1593772365766105e-05, "logits/chosen": -6.81306266784668, "logits/rejected": -6.314748764038086, "logps/chosen": -142.203857421875, "logps/rejected": -598.1051635742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.341259956359863, "rewards/margins": 45.636436462402344, "rewards/rejected": -54.97769546508789, "step": 968 }, { "epoch": 0.7906976744186046, "grad_norm": 7.894003601904842e-07, "learning_rate": 3.148949811792266e-05, "logits/chosen": -6.544186592102051, "logits/rejected": -6.900391578674316, "logps/chosen": -99.3653793334961, "logps/rejected": -476.1439208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.016885757446289, "rewards/margins": 38.138671875, "rewards/rejected": -43.15556335449219, "step": 969 }, { "epoch": 0.7915136678906569, "grad_norm": 4.252393773640506e-05, "learning_rate": 3.1385317079902745e-05, "logits/chosen": -5.926723957061768, "logits/rejected": -5.716807842254639, "logps/chosen": -169.77813720703125, "logps/rejected": -469.9918212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.002962112426758, "rewards/margins": 30.52813720703125, "rewards/rejected": -41.53110122680664, "step": 970 }, { "epoch": 0.7923296613627091, "grad_norm": 2.6861282975687573e-08, "learning_rate": 3.1281229776311104e-05, "logits/chosen": -6.129620552062988, "logits/rejected": -7.411147117614746, "logps/chosen": -106.85725402832031, "logps/rejected": -487.8482666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.391049385070801, "rewards/margins": 37.51606750488281, "rewards/rejected": -43.90711975097656, "step": 971 }, { "epoch": 0.7931456548347613, "grad_norm": 9.456280736230838e-08, "learning_rate": 3.11772367312804e-05, "logits/chosen": -7.196463584899902, "logits/rejected": -7.111794471740723, "logps/chosen": -132.7318115234375, "logps/rejected": -590.0720825195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.911232948303223, "rewards/margins": 44.84159851074219, "rewards/rejected": -52.752830505371094, "step": 972 }, { "epoch": 0.7939616483068136, "grad_norm": 4.5364937500957936e-11, "learning_rate": 3.107333846846872e-05, "logits/chosen": -6.728616714477539, "logits/rejected": -7.6608099937438965, "logps/chosen": -85.05732727050781, "logps/rejected": -502.4952392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.585437297821045, "rewards/margins": 42.2752685546875, "rewards/rejected": -44.8607063293457, "step": 973 }, { "epoch": 0.7947776417788658, "grad_norm": 6.633839588765511e-10, "learning_rate": 3.096953551105679e-05, "logits/chosen": -7.153611660003662, "logits/rejected": -7.567729949951172, "logps/chosen": -156.99197387695312, "logps/rejected": -511.9169006347656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.794189453125, "rewards/margins": 35.64336395263672, "rewards/rejected": -45.43755340576172, "step": 974 }, { "epoch": 0.795593635250918, "grad_norm": 1.946807287822594e-06, "learning_rate": 3.086582838174551e-05, "logits/chosen": -6.412019729614258, "logits/rejected": -6.427597522735596, "logps/chosen": -130.62750244140625, "logps/rejected": -512.5863037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.816741943359375, "rewards/margins": 38.979949951171875, "rewards/rejected": -45.79669189453125, "step": 975 }, { "epoch": 0.7964096287229702, "grad_norm": 8.248854754810964e-08, "learning_rate": 3.076221760275321e-05, "logits/chosen": -5.720818519592285, "logits/rejected": -6.4293718338012695, "logps/chosen": -148.8586883544922, "logps/rejected": -550.3387451171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.371658325195312, "rewards/margins": 39.51685333251953, "rewards/rejected": -48.888511657714844, "step": 976 }, { "epoch": 0.7972256221950225, "grad_norm": 1.6177666845029748e-11, "learning_rate": 3.0658703695813e-05, "logits/chosen": -6.285127639770508, "logits/rejected": -6.961733818054199, "logps/chosen": -133.51919555664062, "logps/rejected": -558.3131103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.034435272216797, "rewards/margins": 41.700801849365234, "rewards/rejected": -49.73523712158203, "step": 977 }, { "epoch": 0.7980416156670747, "grad_norm": 0.005890270229429007, "learning_rate": 3.055528718217028e-05, "logits/chosen": -7.102115154266357, "logits/rejected": -6.729103088378906, "logps/chosen": -161.8472137451172, "logps/rejected": -487.29742431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.621862411499023, "rewards/margins": 32.74443054199219, "rewards/rejected": -43.366294860839844, "step": 978 }, { "epoch": 0.7988576091391268, "grad_norm": 0.006343854125589132, "learning_rate": 3.0451968582579915e-05, "logits/chosen": -6.874529838562012, "logits/rejected": -6.840769290924072, "logps/chosen": -113.38435363769531, "logps/rejected": -504.66162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.821788787841797, "rewards/margins": 39.48640060424805, "rewards/rejected": -44.308189392089844, "step": 979 }, { "epoch": 0.7996736026111791, "grad_norm": 2.554915852215345e-07, "learning_rate": 3.0348748417303823e-05, "logits/chosen": -6.671462535858154, "logits/rejected": -6.090724468231201, "logps/chosen": -111.85031127929688, "logps/rejected": -524.546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.246711730957031, "rewards/margins": 41.00428009033203, "rewards/rejected": -47.25099182128906, "step": 980 }, { "epoch": 0.8004895960832313, "grad_norm": 0.0005122366128489375, "learning_rate": 3.0245627206108196e-05, "logits/chosen": -6.821171760559082, "logits/rejected": -6.5710296630859375, "logps/chosen": -109.86365509033203, "logps/rejected": -472.9414978027344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.306456089019775, "rewards/margins": 35.45451354980469, "rewards/rejected": -41.76097106933594, "step": 981 }, { "epoch": 0.8013055895552835, "grad_norm": 7.515783551070854e-08, "learning_rate": 3.0142605468260978e-05, "logits/chosen": -5.931756019592285, "logits/rejected": -6.0338134765625, "logps/chosen": -78.42326354980469, "logps/rejected": -471.4119567871094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.6586294770240784, "rewards/margins": 40.63633728027344, "rewards/rejected": -41.29496765136719, "step": 982 }, { "epoch": 0.8021215830273358, "grad_norm": 4.1072414913578825e-10, "learning_rate": 3.0039683722529177e-05, "logits/chosen": -6.506806373596191, "logits/rejected": -5.770281791687012, "logps/chosen": -171.22882080078125, "logps/rejected": -552.769287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.893926620483398, "rewards/margins": 36.61451721191406, "rewards/rejected": -48.508445739746094, "step": 983 }, { "epoch": 0.802937576499388, "grad_norm": 0.02198401466012001, "learning_rate": 2.9936862487176293e-05, "logits/chosen": -6.683540344238281, "logits/rejected": -7.725393772125244, "logps/chosen": -84.85970306396484, "logps/rejected": -446.31427001953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.6724531650543213, "rewards/margins": 35.943359375, "rewards/rejected": -39.615814208984375, "step": 984 }, { "epoch": 0.8037535699714402, "grad_norm": 2.7578731533139944e-05, "learning_rate": 2.9834142279959752e-05, "logits/chosen": -6.99709415435791, "logits/rejected": -6.673989772796631, "logps/chosen": -136.29742431640625, "logps/rejected": -521.5691528320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.458311080932617, "rewards/margins": 38.033714294433594, "rewards/rejected": -46.49201965332031, "step": 985 }, { "epoch": 0.8045695634434924, "grad_norm": 5.782008884125389e-06, "learning_rate": 2.97315236181282e-05, "logits/chosen": -6.79514217376709, "logits/rejected": -6.760045051574707, "logps/chosen": -71.45242309570312, "logps/rejected": -477.0556640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4002585411071777, "rewards/margins": 40.2522087097168, "rewards/rejected": -42.6524658203125, "step": 986 }, { "epoch": 0.8053855569155447, "grad_norm": 6.411332797995328e-09, "learning_rate": 2.9629007018418985e-05, "logits/chosen": -7.658866882324219, "logits/rejected": -7.596493721008301, "logps/chosen": -119.3629150390625, "logps/rejected": -465.96881103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.849913120269775, "rewards/margins": 34.50267028808594, "rewards/rejected": -42.35258102416992, "step": 987 }, { "epoch": 0.8062015503875969, "grad_norm": 5.631753197121725e-07, "learning_rate": 2.9526592997055487e-05, "logits/chosen": -6.895620822906494, "logits/rejected": -6.973711967468262, "logps/chosen": -177.08522033691406, "logps/rejected": -546.2019653320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.723348617553711, "rewards/margins": 37.46473693847656, "rewards/rejected": -47.188079833984375, "step": 988 }, { "epoch": 0.8070175438596491, "grad_norm": 1.8739922325039515e-06, "learning_rate": 2.9424282069744564e-05, "logits/chosen": -6.7784013748168945, "logits/rejected": -7.510337829589844, "logps/chosen": -117.17430114746094, "logps/rejected": -488.8161315917969, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.3475341796875, "rewards/margins": 36.726985931396484, "rewards/rejected": -42.074520111083984, "step": 989 }, { "epoch": 0.8078335373317014, "grad_norm": 6.847155620270939e-10, "learning_rate": 2.932207475167398e-05, "logits/chosen": -7.26101016998291, "logits/rejected": -6.829400539398193, "logps/chosen": -103.95441436767578, "logps/rejected": -528.2484741210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.498802185058594, "rewards/margins": 40.6168327331543, "rewards/rejected": -47.115638732910156, "step": 990 }, { "epoch": 0.8086495308037536, "grad_norm": 0.000917157856747508, "learning_rate": 2.9219971557509695e-05, "logits/chosen": -6.840610504150391, "logits/rejected": -6.903287887573242, "logps/chosen": -96.43013000488281, "logps/rejected": -454.7883605957031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.685647964477539, "rewards/margins": 35.145206451416016, "rewards/rejected": -39.83085632324219, "step": 991 }, { "epoch": 0.8094655242758058, "grad_norm": 6.734600788149692e-07, "learning_rate": 2.911797300139345e-05, "logits/chosen": -7.673084259033203, "logits/rejected": -6.253875732421875, "logps/chosen": -112.98916625976562, "logps/rejected": -494.82452392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.263279914855957, "rewards/margins": 37.590110778808594, "rewards/rejected": -43.8533935546875, "step": 992 }, { "epoch": 0.8102815177478581, "grad_norm": 75.63589477539062, "learning_rate": 2.9016079596939992e-05, "logits/chosen": -7.453423023223877, "logits/rejected": -6.483057975769043, "logps/chosen": -89.85579681396484, "logps/rejected": -467.05865478515625, "loss": 0.4718, "rewards/accuracies": 0.875, "rewards/chosen": -4.052299976348877, "rewards/margins": 38.550315856933594, "rewards/rejected": -42.60261535644531, "step": 993 }, { "epoch": 0.8110975112199102, "grad_norm": 0.0007855792064219713, "learning_rate": 2.8914291857234636e-05, "logits/chosen": -6.585747718811035, "logits/rejected": -7.725704193115234, "logps/chosen": -137.987548828125, "logps/rejected": -474.99853515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.993085861206055, "rewards/margins": 32.95569610595703, "rewards/rejected": -40.94878387451172, "step": 994 }, { "epoch": 0.8119135046919624, "grad_norm": 1.3994384264696264e-13, "learning_rate": 2.8812610294830566e-05, "logits/chosen": -6.506470203399658, "logits/rejected": -6.4290924072265625, "logps/chosen": -128.68533325195312, "logps/rejected": -611.918212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.653985500335693, "rewards/margins": 48.12384796142578, "rewards/rejected": -54.77783203125, "step": 995 }, { "epoch": 0.8127294981640147, "grad_norm": 3.761631978127511e-11, "learning_rate": 2.8711035421746367e-05, "logits/chosen": -5.699454307556152, "logits/rejected": -6.604212760925293, "logps/chosen": -152.34646606445312, "logps/rejected": -644.341064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.258380889892578, "rewards/margins": 48.88677978515625, "rewards/rejected": -57.14516067504883, "step": 996 }, { "epoch": 0.8135454916360669, "grad_norm": 3.038953463874172e-11, "learning_rate": 2.860956774946337e-05, "logits/chosen": -5.788163661956787, "logits/rejected": -6.576289176940918, "logps/chosen": -125.37005615234375, "logps/rejected": -561.2939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.375026702880859, "rewards/margins": 44.243804931640625, "rewards/rejected": -51.618831634521484, "step": 997 }, { "epoch": 0.8143614851081191, "grad_norm": 1.3084628335491288e-05, "learning_rate": 2.8508207788923046e-05, "logits/chosen": -6.871591567993164, "logits/rejected": -6.5831756591796875, "logps/chosen": -178.50320434570312, "logps/rejected": -493.51251220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.422432899475098, "rewards/margins": 30.619062423706055, "rewards/rejected": -44.04149627685547, "step": 998 }, { "epoch": 0.8151774785801713, "grad_norm": 5.478078719534096e-07, "learning_rate": 2.840695605052458e-05, "logits/chosen": -5.899359703063965, "logits/rejected": -6.830013751983643, "logps/chosen": -112.04924011230469, "logps/rejected": -459.62744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.403657913208008, "rewards/margins": 34.44694519042969, "rewards/rejected": -40.85060119628906, "step": 999 }, { "epoch": 0.8159934720522236, "grad_norm": 0.23069968819618225, "learning_rate": 2.8305813044122097e-05, "logits/chosen": -7.509840965270996, "logits/rejected": -7.145511150360107, "logps/chosen": -158.78082275390625, "logps/rejected": -574.9197998046875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -11.612314224243164, "rewards/margins": 39.76613235473633, "rewards/rejected": -51.378440856933594, "step": 1000 }, { "epoch": 0.8168094655242758, "grad_norm": 6.036398190190084e-07, "learning_rate": 2.8204779279022276e-05, "logits/chosen": -6.909107685089111, "logits/rejected": -7.3856658935546875, "logps/chosen": -132.00091552734375, "logps/rejected": -594.7566528320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.20920181274414, "rewards/margins": 45.77363204956055, "rewards/rejected": -53.98283386230469, "step": 1001 }, { "epoch": 0.817625458996328, "grad_norm": 7.071452451379301e-11, "learning_rate": 2.8103855263981695e-05, "logits/chosen": -6.501651763916016, "logits/rejected": -7.322134494781494, "logps/chosen": -175.20005798339844, "logps/rejected": -591.263916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.024626731872559, "rewards/margins": 42.33715057373047, "rewards/rejected": -54.361778259277344, "step": 1002 }, { "epoch": 0.8184414524683803, "grad_norm": 0.00066294678254053, "learning_rate": 2.8003041507204242e-05, "logits/chosen": -6.352685451507568, "logits/rejected": -6.389235496520996, "logps/chosen": -210.2667694091797, "logps/rejected": -591.95751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.586164474487305, "rewards/margins": 35.631561279296875, "rewards/rejected": -51.21772766113281, "step": 1003 }, { "epoch": 0.8192574459404325, "grad_norm": 2.779444730549585e-05, "learning_rate": 2.7902338516338677e-05, "logits/chosen": -6.687412261962891, "logits/rejected": -6.993821144104004, "logps/chosen": -161.6879119873047, "logps/rejected": -553.9426879882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.339847564697266, "rewards/margins": 39.399566650390625, "rewards/rejected": -50.73941421508789, "step": 1004 }, { "epoch": 0.8200734394124847, "grad_norm": 6.585361944644319e-08, "learning_rate": 2.7801746798475904e-05, "logits/chosen": -6.197707176208496, "logits/rejected": -7.1416754722595215, "logps/chosen": -179.64633178710938, "logps/rejected": -588.009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.845651626586914, "rewards/margins": 40.079017639160156, "rewards/rejected": -51.9246711730957, "step": 1005 }, { "epoch": 0.820889432884537, "grad_norm": 2.77589094821451e-07, "learning_rate": 2.7701266860146575e-05, "logits/chosen": -6.377667427062988, "logits/rejected": -6.648372650146484, "logps/chosen": -200.26742553710938, "logps/rejected": -576.971923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.758617401123047, "rewards/margins": 37.1399040222168, "rewards/rejected": -51.89851760864258, "step": 1006 }, { "epoch": 0.8217054263565892, "grad_norm": 1.8425481584927184e-08, "learning_rate": 2.7600899207318465e-05, "logits/chosen": -7.7657270431518555, "logits/rejected": -7.846233367919922, "logps/chosen": -172.5588836669922, "logps/rejected": -613.8388671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.357542991638184, "rewards/margins": 42.958919525146484, "rewards/rejected": -55.31645965576172, "step": 1007 }, { "epoch": 0.8225214198286414, "grad_norm": 4.873114812653512e-05, "learning_rate": 2.7500644345393943e-05, "logits/chosen": -6.8896684646606445, "logits/rejected": -7.0943684577941895, "logps/chosen": -247.18185424804688, "logps/rejected": -725.7239990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.075212478637695, "rewards/margins": 46.07542037963867, "rewards/rejected": -66.150634765625, "step": 1008 }, { "epoch": 0.8233374133006935, "grad_norm": 0.23977914452552795, "learning_rate": 2.740050277920739e-05, "logits/chosen": -6.596702575683594, "logits/rejected": -6.719897270202637, "logps/chosen": -207.19403076171875, "logps/rejected": -541.3421630859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -15.249186515808105, "rewards/margins": 33.94136047363281, "rewards/rejected": -49.1905517578125, "step": 1009 }, { "epoch": 0.8241534067727458, "grad_norm": 2.3598133793711895e-06, "learning_rate": 2.7300475013022663e-05, "logits/chosen": -6.151098251342773, "logits/rejected": -6.6784281730651855, "logps/chosen": -195.1455535888672, "logps/rejected": -645.6377563476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.056802749633789, "rewards/margins": 44.679561614990234, "rewards/rejected": -58.736366271972656, "step": 1010 }, { "epoch": 0.824969400244798, "grad_norm": 0.004735945723950863, "learning_rate": 2.720056155053067e-05, "logits/chosen": -6.7774810791015625, "logits/rejected": -7.205092430114746, "logps/chosen": -189.6084747314453, "logps/rejected": -592.311279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.244738578796387, "rewards/margins": 41.359893798828125, "rewards/rejected": -53.604637145996094, "step": 1011 }, { "epoch": 0.8257853937168502, "grad_norm": 0.00018475265824235976, "learning_rate": 2.710076289484663e-05, "logits/chosen": -7.935247898101807, "logits/rejected": -7.562012672424316, "logps/chosen": -253.9874725341797, "logps/rejected": -648.1053466796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.248056411743164, "rewards/margins": 40.357215881347656, "rewards/rejected": -59.60527420043945, "step": 1012 }, { "epoch": 0.8266013871889025, "grad_norm": 9.074414464294023e-08, "learning_rate": 2.7001079548507736e-05, "logits/chosen": -6.137836933135986, "logits/rejected": -7.116020202636719, "logps/chosen": -186.0381317138672, "logps/rejected": -558.3424682617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.277799606323242, "rewards/margins": 38.11887741088867, "rewards/rejected": -51.39667892456055, "step": 1013 }, { "epoch": 0.8274173806609547, "grad_norm": 8.252191463498093e-08, "learning_rate": 2.690151201347052e-05, "logits/chosen": -6.299282073974609, "logits/rejected": -6.735664367675781, "logps/chosen": -228.14329528808594, "logps/rejected": -617.1948852539062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.776487350463867, "rewards/margins": 39.443138122558594, "rewards/rejected": -56.219627380371094, "step": 1014 }, { "epoch": 0.8282333741330069, "grad_norm": 5.358018029255618e-07, "learning_rate": 2.68020607911083e-05, "logits/chosen": -7.1445794105529785, "logits/rejected": -7.802563667297363, "logps/chosen": -166.76441955566406, "logps/rejected": -524.2116088867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.542205810546875, "rewards/margins": 35.72195053100586, "rewards/rejected": -48.26416015625, "step": 1015 }, { "epoch": 0.8290493676050592, "grad_norm": 1.6548820092054939e-09, "learning_rate": 2.6702726382208776e-05, "logits/chosen": -6.936863899230957, "logits/rejected": -6.811605453491211, "logps/chosen": -183.65451049804688, "logps/rejected": -597.7079467773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.897449493408203, "rewards/margins": 40.458587646484375, "rewards/rejected": -54.35603332519531, "step": 1016 }, { "epoch": 0.8298653610771114, "grad_norm": 0.0001746060879668221, "learning_rate": 2.660350928697134e-05, "logits/chosen": -6.347721099853516, "logits/rejected": -6.822726726531982, "logps/chosen": -230.40415954589844, "logps/rejected": -617.8380126953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.5299072265625, "rewards/margins": 39.13287353515625, "rewards/rejected": -57.66278076171875, "step": 1017 }, { "epoch": 0.8306813545491636, "grad_norm": 0.00010304812894901261, "learning_rate": 2.6504410005004732e-05, "logits/chosen": -6.2589945793151855, "logits/rejected": -7.4440155029296875, "logps/chosen": -180.2696990966797, "logps/rejected": -495.84075927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.568323135375977, "rewards/margins": 32.948997497558594, "rewards/rejected": -44.51732635498047, "step": 1018 }, { "epoch": 0.8314973480212158, "grad_norm": 9.778446941055563e-10, "learning_rate": 2.6405429035324403e-05, "logits/chosen": -7.340249061584473, "logits/rejected": -7.39459753036499, "logps/chosen": -219.20480346679688, "logps/rejected": -693.5477294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.443375587463379, "rewards/margins": 47.3580207824707, "rewards/rejected": -62.80139923095703, "step": 1019 }, { "epoch": 0.8323133414932681, "grad_norm": 1.3867566849512514e-05, "learning_rate": 2.630656687635007e-05, "logits/chosen": -6.6103081703186035, "logits/rejected": -7.811107158660889, "logps/chosen": -269.359130859375, "logps/rejected": -619.3856201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.179269790649414, "rewards/margins": 37.50752258300781, "rewards/rejected": -56.68679428100586, "step": 1020 }, { "epoch": 0.8331293349653203, "grad_norm": 6.245045824471163e-06, "learning_rate": 2.6207824025903137e-05, "logits/chosen": -7.133868217468262, "logits/rejected": -7.348158359527588, "logps/chosen": -234.26394653320312, "logps/rejected": -559.1795043945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.53624153137207, "rewards/margins": 33.32814025878906, "rewards/rejected": -51.864383697509766, "step": 1021 }, { "epoch": 0.8339453284373725, "grad_norm": 7.557321805506945e-05, "learning_rate": 2.610920098120424e-05, "logits/chosen": -8.637264251708984, "logits/rejected": -7.655214309692383, "logps/chosen": -216.28782653808594, "logps/rejected": -578.3019409179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.936962127685547, "rewards/margins": 35.635581970214844, "rewards/rejected": -53.572540283203125, "step": 1022 }, { "epoch": 0.8347613219094248, "grad_norm": 2.883229512917751e-07, "learning_rate": 2.6010698238870744e-05, "logits/chosen": -6.997838973999023, "logits/rejected": -7.420304775238037, "logps/chosen": -187.08010864257812, "logps/rejected": -585.591064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.593658447265625, "rewards/margins": 38.987911224365234, "rewards/rejected": -52.58156967163086, "step": 1023 }, { "epoch": 0.835577315381477, "grad_norm": 5.38575573028055e-10, "learning_rate": 2.591231629491423e-05, "logits/chosen": -6.2383623123168945, "logits/rejected": -6.285945892333984, "logps/chosen": -203.84246826171875, "logps/rejected": -710.2127685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.7962007522583, "rewards/margins": 49.64633560180664, "rewards/rejected": -63.44253921508789, "step": 1024 }, { "epoch": 0.8363933088535291, "grad_norm": 0.050582971423864365, "learning_rate": 2.581405564473801e-05, "logits/chosen": -8.263072967529297, "logits/rejected": -7.482296466827393, "logps/chosen": -244.24594116210938, "logps/rejected": -569.2033081054688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -18.568721771240234, "rewards/margins": 30.648107528686523, "rewards/rejected": -49.21683120727539, "step": 1025 }, { "epoch": 0.8372093023255814, "grad_norm": 2.962269984863042e-09, "learning_rate": 2.571591678313458e-05, "logits/chosen": -7.071425437927246, "logits/rejected": -8.126596450805664, "logps/chosen": -211.85545349121094, "logps/rejected": -614.5064697265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.865781784057617, "rewards/margins": 39.01493453979492, "rewards/rejected": -55.880714416503906, "step": 1026 }, { "epoch": 0.8380252957976336, "grad_norm": 3.1420064595740354e-20, "learning_rate": 2.561790020428322e-05, "logits/chosen": -6.428366661071777, "logits/rejected": -7.6387434005737305, "logps/chosen": -277.14398193359375, "logps/rejected": -861.443603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.27480697631836, "rewards/margins": 59.55926513671875, "rewards/rejected": -77.83407592773438, "step": 1027 }, { "epoch": 0.8388412892696858, "grad_norm": 0.020989205688238144, "learning_rate": 2.5520006401747398e-05, "logits/chosen": -7.236568450927734, "logits/rejected": -7.523683547973633, "logps/chosen": -210.26722717285156, "logps/rejected": -519.999755859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -16.286935806274414, "rewards/margins": 29.02237892150879, "rewards/rejected": -45.30931854248047, "step": 1028 }, { "epoch": 0.8396572827417381, "grad_norm": 0.0021759578958153725, "learning_rate": 2.5422235868472345e-05, "logits/chosen": -6.336637496948242, "logits/rejected": -7.280792713165283, "logps/chosen": -219.3902587890625, "logps/rejected": -674.1849975585938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.912830352783203, "rewards/margins": 44.29109191894531, "rewards/rejected": -61.203922271728516, "step": 1029 }, { "epoch": 0.8404732762137903, "grad_norm": 7.306962743314216e-06, "learning_rate": 2.5324589096782657e-05, "logits/chosen": -6.915951251983643, "logits/rejected": -8.023942947387695, "logps/chosen": -203.36729431152344, "logps/rejected": -628.8574829101562, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -15.889001846313477, "rewards/margins": 40.63396453857422, "rewards/rejected": -56.52296829223633, "step": 1030 }, { "epoch": 0.8412892696858425, "grad_norm": 1.9382240168397402e-07, "learning_rate": 2.522706657837962e-05, "logits/chosen": -7.251037120819092, "logits/rejected": -7.118707656860352, "logps/chosen": -187.84474182128906, "logps/rejected": -580.507080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.341388702392578, "rewards/margins": 37.08915328979492, "rewards/rejected": -50.4305419921875, "step": 1031 }, { "epoch": 0.8421052631578947, "grad_norm": 0.5779076814651489, "learning_rate": 2.5129668804338906e-05, "logits/chosen": -7.695659160614014, "logits/rejected": -7.274648666381836, "logps/chosen": -250.39236450195312, "logps/rejected": -553.095947265625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -20.188352584838867, "rewards/margins": 29.978565216064453, "rewards/rejected": -50.16691970825195, "step": 1032 }, { "epoch": 0.842921256629947, "grad_norm": 13.444469451904297, "learning_rate": 2.5032396265107984e-05, "logits/chosen": -6.322668075561523, "logits/rejected": -7.662363529205322, "logps/chosen": -221.02000427246094, "logps/rejected": -517.7114868164062, "loss": 0.181, "rewards/accuracies": 1.0, "rewards/chosen": -17.673383712768555, "rewards/margins": 28.631458282470703, "rewards/rejected": -46.304840087890625, "step": 1033 }, { "epoch": 0.8437372501019992, "grad_norm": 1.0638508683769032e-05, "learning_rate": 2.4935249450503762e-05, "logits/chosen": -6.71500301361084, "logits/rejected": -7.184361457824707, "logps/chosen": -178.41026306152344, "logps/rejected": -470.0269775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.083944320678711, "rewards/margins": 29.223073959350586, "rewards/rejected": -43.3070182800293, "step": 1034 }, { "epoch": 0.8445532435740514, "grad_norm": 2.4979987855999752e-08, "learning_rate": 2.483822884971e-05, "logits/chosen": -6.390749454498291, "logits/rejected": -6.967741012573242, "logps/chosen": -179.5550537109375, "logps/rejected": -618.778564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.518630981445312, "rewards/margins": 43.224361419677734, "rewards/rejected": -55.74298858642578, "step": 1035 }, { "epoch": 0.8453692370461037, "grad_norm": 2.0217243434927923e-09, "learning_rate": 2.4741334951274947e-05, "logits/chosen": -5.851958274841309, "logits/rejected": -6.83444356918335, "logps/chosen": -139.6371307373047, "logps/rejected": -574.4845581054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.5301513671875, "rewards/margins": 42.69941711425781, "rewards/rejected": -52.22956848144531, "step": 1036 }, { "epoch": 0.8461852305181559, "grad_norm": 6.467762858619608e-08, "learning_rate": 2.464456824310885e-05, "logits/chosen": -6.7824602127075195, "logits/rejected": -7.211301326751709, "logps/chosen": -182.1636505126953, "logps/rejected": -667.2701416015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.287744522094727, "rewards/margins": 49.01072311401367, "rewards/rejected": -60.298465728759766, "step": 1037 }, { "epoch": 0.847001223990208, "grad_norm": 9.961034811567515e-05, "learning_rate": 2.4547929212481435e-05, "logits/chosen": -7.249908924102783, "logits/rejected": -7.217072010040283, "logps/chosen": -195.3370361328125, "logps/rejected": -594.0701904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.448078155517578, "rewards/margins": 39.51043701171875, "rewards/rejected": -52.958518981933594, "step": 1038 }, { "epoch": 0.8478172174622604, "grad_norm": 0.007365425117313862, "learning_rate": 2.4451418346019576e-05, "logits/chosen": -5.92827033996582, "logits/rejected": -6.193601608276367, "logps/chosen": -182.54734802246094, "logps/rejected": -469.1607971191406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.553750991821289, "rewards/margins": 29.217559814453125, "rewards/rejected": -42.77130889892578, "step": 1039 }, { "epoch": 0.8486332109343125, "grad_norm": 0.00015885780157987028, "learning_rate": 2.43550361297047e-05, "logits/chosen": -7.335334777832031, "logits/rejected": -7.4644670486450195, "logps/chosen": -243.54428100585938, "logps/rejected": -706.3829956054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.847375869750977, "rewards/margins": 46.934165954589844, "rewards/rejected": -64.78153991699219, "step": 1040 }, { "epoch": 0.8494492044063647, "grad_norm": 1.676207051848691e-14, "learning_rate": 2.425878304887047e-05, "logits/chosen": -7.263794898986816, "logits/rejected": -7.413153648376465, "logps/chosen": -162.2875213623047, "logps/rejected": -691.3690185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.0809326171875, "rewards/margins": 51.845550537109375, "rewards/rejected": -61.926483154296875, "step": 1041 }, { "epoch": 0.8502651978784169, "grad_norm": 42.745853424072266, "learning_rate": 2.4162659588200288e-05, "logits/chosen": -6.3970770835876465, "logits/rejected": -7.109959125518799, "logps/chosen": -185.94493103027344, "logps/rejected": -496.4001159667969, "loss": 0.6031, "rewards/accuracies": 0.875, "rewards/chosen": -14.766996383666992, "rewards/margins": 30.115802764892578, "rewards/rejected": -44.88279724121094, "step": 1042 }, { "epoch": 0.8510811913504692, "grad_norm": 1.0544999318184978e-09, "learning_rate": 2.40666662317248e-05, "logits/chosen": -7.406240463256836, "logits/rejected": -7.956611633300781, "logps/chosen": -188.75694274902344, "logps/rejected": -617.62744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.42151927947998, "rewards/margins": 42.847999572753906, "rewards/rejected": -56.2695198059082, "step": 1043 }, { "epoch": 0.8518971848225214, "grad_norm": 9.188412241489677e-16, "learning_rate": 2.3970803462819584e-05, "logits/chosen": -6.319215774536133, "logits/rejected": -6.857836723327637, "logps/chosen": -206.92056274414062, "logps/rejected": -761.5103759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.594417572021484, "rewards/margins": 55.1982536315918, "rewards/rejected": -69.79267883300781, "step": 1044 }, { "epoch": 0.8527131782945736, "grad_norm": 0.09356626868247986, "learning_rate": 2.3875071764202563e-05, "logits/chosen": -6.076625347137451, "logits/rejected": -7.043323516845703, "logps/chosen": -122.97783660888672, "logps/rejected": -428.67608642578125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.493732452392578, "rewards/margins": 31.537185668945312, "rewards/rejected": -39.03091812133789, "step": 1045 }, { "epoch": 0.8535291717666259, "grad_norm": 1.6603208391474666e-11, "learning_rate": 2.377947161793171e-05, "logits/chosen": -6.532040119171143, "logits/rejected": -6.84799337387085, "logps/chosen": -133.5863494873047, "logps/rejected": -659.4168701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.364590644836426, "rewards/margins": 51.485286712646484, "rewards/rejected": -59.849876403808594, "step": 1046 }, { "epoch": 0.8543451652386781, "grad_norm": 7.705287998760468e-07, "learning_rate": 2.3684003505402574e-05, "logits/chosen": -6.462632179260254, "logits/rejected": -7.22099494934082, "logps/chosen": -106.50241088867188, "logps/rejected": -557.697021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.808804988861084, "rewards/margins": 45.52777862548828, "rewards/rejected": -50.336585998535156, "step": 1047 }, { "epoch": 0.8551611587107303, "grad_norm": 1.1483275663692893e-08, "learning_rate": 2.3588667907345786e-05, "logits/chosen": -7.148177623748779, "logits/rejected": -7.093587875366211, "logps/chosen": -134.43650817871094, "logps/rejected": -494.6029968261719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.982542037963867, "rewards/margins": 37.0305061340332, "rewards/rejected": -44.01304626464844, "step": 1048 }, { "epoch": 0.8559771521827826, "grad_norm": 1.9909478932026228e-14, "learning_rate": 2.3493465303824767e-05, "logits/chosen": -6.605038642883301, "logits/rejected": -6.835592746734619, "logps/chosen": -188.79400634765625, "logps/rejected": -691.7178955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.98828411102295, "rewards/margins": 48.91318893432617, "rewards/rejected": -62.90147399902344, "step": 1049 }, { "epoch": 0.8567931456548348, "grad_norm": 1.8135132067928494e-12, "learning_rate": 2.3398396174233178e-05, "logits/chosen": -5.776115894317627, "logits/rejected": -7.474608898162842, "logps/chosen": -176.7278594970703, "logps/rejected": -707.316650390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.541825294494629, "rewards/margins": 52.156070709228516, "rewards/rejected": -63.697898864746094, "step": 1050 }, { "epoch": 0.857609139126887, "grad_norm": 0.007139904424548149, "learning_rate": 2.3303460997292637e-05, "logits/chosen": -7.20109224319458, "logits/rejected": -7.5963053703308105, "logps/chosen": -160.20199584960938, "logps/rejected": -571.27783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.348814010620117, "rewards/margins": 40.888267517089844, "rewards/rejected": -51.237083435058594, "step": 1051 }, { "epoch": 0.8584251325989392, "grad_norm": 8.389431926936197e-10, "learning_rate": 2.3208660251050158e-05, "logits/chosen": -6.337010860443115, "logits/rejected": -6.931233882904053, "logps/chosen": -186.77816772460938, "logps/rejected": -731.113037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.048983573913574, "rewards/margins": 53.80959701538086, "rewards/rejected": -65.85858154296875, "step": 1052 }, { "epoch": 0.8592411260709915, "grad_norm": 2.481268371748573e-17, "learning_rate": 2.311399441287595e-05, "logits/chosen": -6.24257755279541, "logits/rejected": -6.849460601806641, "logps/chosen": -176.54330444335938, "logps/rejected": -806.7232666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.28794002532959, "rewards/margins": 60.77527618408203, "rewards/rejected": -73.06321716308594, "step": 1053 }, { "epoch": 0.8600571195430436, "grad_norm": 0.0005864492268301547, "learning_rate": 2.3019463959460787e-05, "logits/chosen": -6.671006202697754, "logits/rejected": -6.56172513961792, "logps/chosen": -126.14171600341797, "logps/rejected": -457.4732666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.774211883544922, "rewards/margins": 31.962364196777344, "rewards/rejected": -39.736576080322266, "step": 1054 }, { "epoch": 0.8608731130150958, "grad_norm": 9.804772815869e-08, "learning_rate": 2.2925069366813717e-05, "logits/chosen": -7.699621200561523, "logits/rejected": -8.359642028808594, "logps/chosen": -167.99261474609375, "logps/rejected": -714.1470336914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.00529670715332, "rewards/margins": 51.98234558105469, "rewards/rejected": -61.987640380859375, "step": 1055 }, { "epoch": 0.8616891064871481, "grad_norm": 2.858434982044855e-07, "learning_rate": 2.283081111025973e-05, "logits/chosen": -5.792584419250488, "logits/rejected": -7.200011253356934, "logps/chosen": -116.14237976074219, "logps/rejected": -486.2475891113281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.859138488769531, "rewards/margins": 37.26411056518555, "rewards/rejected": -43.12324905395508, "step": 1056 }, { "epoch": 0.8625050999592003, "grad_norm": 2.108328317262931e-06, "learning_rate": 2.2736689664437217e-05, "logits/chosen": -7.454646110534668, "logits/rejected": -7.285010814666748, "logps/chosen": -103.81887817382812, "logps/rejected": -463.3179626464844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.7618303298950195, "rewards/margins": 35.00215148925781, "rewards/rejected": -40.763980865478516, "step": 1057 }, { "epoch": 0.8633210934312525, "grad_norm": 4.932614956487669e-06, "learning_rate": 2.26427055032957e-05, "logits/chosen": -7.012362003326416, "logits/rejected": -5.77869987487793, "logps/chosen": -154.62757873535156, "logps/rejected": -527.4701538085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.02025032043457, "rewards/margins": 36.95254135131836, "rewards/rejected": -45.97278594970703, "step": 1058 }, { "epoch": 0.8641370869033048, "grad_norm": 2.8486046854681035e-09, "learning_rate": 2.2548859100093407e-05, "logits/chosen": -7.270592212677002, "logits/rejected": -7.4143290519714355, "logps/chosen": -127.08454895019531, "logps/rejected": -573.911865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.530942916870117, "rewards/margins": 44.4328727722168, "rewards/rejected": -51.96381378173828, "step": 1059 }, { "epoch": 0.864953080375357, "grad_norm": 5.194169148126093e-07, "learning_rate": 2.245515092739488e-05, "logits/chosen": -7.237967014312744, "logits/rejected": -7.346175670623779, "logps/chosen": -121.23512268066406, "logps/rejected": -561.6117553710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.756834030151367, "rewards/margins": 44.022064208984375, "rewards/rejected": -50.778900146484375, "step": 1060 }, { "epoch": 0.8657690738474092, "grad_norm": 3.9539072016481214e-10, "learning_rate": 2.2361581457068574e-05, "logits/chosen": -5.940120697021484, "logits/rejected": -6.884821891784668, "logps/chosen": -162.4795379638672, "logps/rejected": -677.3381958007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.761777877807617, "rewards/margins": 50.77714538574219, "rewards/rejected": -60.53892135620117, "step": 1061 }, { "epoch": 0.8665850673194615, "grad_norm": 4.209814960631775e-06, "learning_rate": 2.2268151160284506e-05, "logits/chosen": -5.878019332885742, "logits/rejected": -5.863801956176758, "logps/chosen": -173.04000854492188, "logps/rejected": -560.2630004882812, "loss": 0.3466, "rewards/accuracies": 1.0, "rewards/chosen": -11.771949768066406, "rewards/margins": 37.2284049987793, "rewards/rejected": -49.00035858154297, "step": 1062 }, { "epoch": 0.8674010607915137, "grad_norm": 4.953144980390789e-06, "learning_rate": 2.2174860507511924e-05, "logits/chosen": -6.1771626472473145, "logits/rejected": -6.435300827026367, "logps/chosen": -155.3844757080078, "logps/rejected": -523.6998291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.668807983398438, "rewards/margins": 37.01026916503906, "rewards/rejected": -48.6790771484375, "step": 1063 }, { "epoch": 0.8682170542635659, "grad_norm": 7.1342793894757506e-09, "learning_rate": 2.2081709968516866e-05, "logits/chosen": -6.788031101226807, "logits/rejected": -6.647252082824707, "logps/chosen": -162.41433715820312, "logps/rejected": -589.9866943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.159398078918457, "rewards/margins": 43.03309631347656, "rewards/rejected": -54.1924934387207, "step": 1064 }, { "epoch": 0.8690330477356181, "grad_norm": 2.256143289969259e-07, "learning_rate": 2.1988700012359862e-05, "logits/chosen": -6.5439653396606445, "logits/rejected": -7.532792568206787, "logps/chosen": -162.69049072265625, "logps/rejected": -559.7326049804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.02795696258545, "rewards/margins": 40.27193832397461, "rewards/rejected": -51.29989242553711, "step": 1065 }, { "epoch": 0.8698490412076704, "grad_norm": 1.7011054159266417e-14, "learning_rate": 2.1895831107393484e-05, "logits/chosen": -6.499828338623047, "logits/rejected": -7.074036121368408, "logps/chosen": -146.01454162597656, "logps/rejected": -615.845703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.524699211120605, "rewards/margins": 47.546173095703125, "rewards/rejected": -57.07087707519531, "step": 1066 }, { "epoch": 0.8706650346797226, "grad_norm": 0.0014108893228694797, "learning_rate": 2.180310372126005e-05, "logits/chosen": -7.0842413902282715, "logits/rejected": -7.003969192504883, "logps/chosen": -152.21731567382812, "logps/rejected": -581.183837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.377056121826172, "rewards/margins": 42.80815887451172, "rewards/rejected": -52.185211181640625, "step": 1067 }, { "epoch": 0.8714810281517748, "grad_norm": 4.0819239655043305e-10, "learning_rate": 2.1710518320889278e-05, "logits/chosen": -6.24017858505249, "logits/rejected": -6.060520172119141, "logps/chosen": -135.5904541015625, "logps/rejected": -556.2974853515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.023902893066406, "rewards/margins": 43.28565216064453, "rewards/rejected": -51.30955123901367, "step": 1068 }, { "epoch": 0.872297021623827, "grad_norm": 6.899627464918012e-07, "learning_rate": 2.1618075372495916e-05, "logits/chosen": -7.085262775421143, "logits/rejected": -7.3647918701171875, "logps/chosen": -126.08404541015625, "logps/rejected": -510.45709228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.333318710327148, "rewards/margins": 38.7557373046875, "rewards/rejected": -46.08905792236328, "step": 1069 }, { "epoch": 0.8731130150958792, "grad_norm": 1.0938968131313231e-07, "learning_rate": 2.1525775341577405e-05, "logits/chosen": -6.8497467041015625, "logits/rejected": -6.840420722961426, "logps/chosen": -176.23971557617188, "logps/rejected": -753.235595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.699023246765137, "rewards/margins": 57.07941818237305, "rewards/rejected": -68.7784423828125, "step": 1070 }, { "epoch": 0.8739290085679314, "grad_norm": 0.6048853397369385, "learning_rate": 2.1433618692911467e-05, "logits/chosen": -6.6051249504089355, "logits/rejected": -6.425624370574951, "logps/chosen": -173.2975616455078, "logps/rejected": -532.9370727539062, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -12.853692054748535, "rewards/margins": 34.506500244140625, "rewards/rejected": -47.36019515991211, "step": 1071 }, { "epoch": 0.8747450020399837, "grad_norm": 8.097737946854977e-08, "learning_rate": 2.1341605890553896e-05, "logits/chosen": -7.170252799987793, "logits/rejected": -7.930485248565674, "logps/chosen": -149.0162353515625, "logps/rejected": -520.9038696289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.66181755065918, "rewards/margins": 36.873130798339844, "rewards/rejected": -47.534950256347656, "step": 1072 }, { "epoch": 0.8755609955120359, "grad_norm": 0.01309832651168108, "learning_rate": 2.124973739783609e-05, "logits/chosen": -6.360030174255371, "logits/rejected": -6.531821250915527, "logps/chosen": -129.0981903076172, "logps/rejected": -530.7021484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.14389705657959, "rewards/margins": 40.84330368041992, "rewards/rejected": -47.98719787597656, "step": 1073 }, { "epoch": 0.8763769889840881, "grad_norm": 1.6493057586330906e-08, "learning_rate": 2.115801367736276e-05, "logits/chosen": -6.163723945617676, "logits/rejected": -7.623319149017334, "logps/chosen": -134.1580810546875, "logps/rejected": -582.4261474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.072746276855469, "rewards/margins": 44.928775787353516, "rewards/rejected": -52.00151824951172, "step": 1074 }, { "epoch": 0.8771929824561403, "grad_norm": 1.0998207899959311e-10, "learning_rate": 2.1066435191009715e-05, "logits/chosen": -6.52783727645874, "logits/rejected": -7.329387664794922, "logps/chosen": -119.3442153930664, "logps/rejected": -569.1777954101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.322475433349609, "rewards/margins": 44.59421920776367, "rewards/rejected": -50.91669464111328, "step": 1075 }, { "epoch": 0.8780089759281926, "grad_norm": 8.36433537187986e-06, "learning_rate": 2.0975002399921324e-05, "logits/chosen": -7.589158058166504, "logits/rejected": -7.865601539611816, "logps/chosen": -157.44847106933594, "logps/rejected": -556.7814331054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.03665542602539, "rewards/margins": 39.63075256347656, "rewards/rejected": -49.66740417480469, "step": 1076 }, { "epoch": 0.8788249694002448, "grad_norm": 4.011652166013846e-08, "learning_rate": 2.0883715764508383e-05, "logits/chosen": -7.3029937744140625, "logits/rejected": -7.299839019775391, "logps/chosen": -128.52310180664062, "logps/rejected": -579.0975341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.110122680664062, "rewards/margins": 43.79547119140625, "rewards/rejected": -52.905601501464844, "step": 1077 }, { "epoch": 0.879640962872297, "grad_norm": 2.6571080979920225e-07, "learning_rate": 2.0792575744445653e-05, "logits/chosen": -6.043122291564941, "logits/rejected": -6.996606826782227, "logps/chosen": -161.1346893310547, "logps/rejected": -534.6522216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.350149154663086, "rewards/margins": 35.942771911621094, "rewards/rejected": -46.29291534423828, "step": 1078 }, { "epoch": 0.8804569563443493, "grad_norm": 5.379956746764947e-07, "learning_rate": 2.0701582798669676e-05, "logits/chosen": -7.248783111572266, "logits/rejected": -7.757346153259277, "logps/chosen": -131.79029846191406, "logps/rejected": -543.0537719726562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.092224597930908, "rewards/margins": 42.69084167480469, "rewards/rejected": -48.7830696105957, "step": 1079 }, { "epoch": 0.8812729498164015, "grad_norm": 1.468813243775402e-12, "learning_rate": 2.061073738537635e-05, "logits/chosen": -6.72285795211792, "logits/rejected": -8.076656341552734, "logps/chosen": -111.46786499023438, "logps/rejected": -719.0148315429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.687182426452637, "rewards/margins": 59.80085372924805, "rewards/rejected": -65.488037109375, "step": 1080 }, { "epoch": 0.8820889432884537, "grad_norm": 0.0012002399889752269, "learning_rate": 2.0520039962018693e-05, "logits/chosen": -6.176737308502197, "logits/rejected": -7.30014705657959, "logps/chosen": -119.17707061767578, "logps/rejected": -613.3716430664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.654947757720947, "rewards/margins": 47.773536682128906, "rewards/rejected": -53.42848205566406, "step": 1081 }, { "epoch": 0.882904936760506, "grad_norm": 4.605929149192889e-08, "learning_rate": 2.0429490985304555e-05, "logits/chosen": -6.625246047973633, "logits/rejected": -7.343602180480957, "logps/chosen": -185.56549072265625, "logps/rejected": -566.9921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.205551147460938, "rewards/margins": 38.84259796142578, "rewards/rejected": -52.048152923583984, "step": 1082 }, { "epoch": 0.8837209302325582, "grad_norm": 7.361554921203606e-09, "learning_rate": 2.033909091119419e-05, "logits/chosen": -6.155369281768799, "logits/rejected": -7.763030052185059, "logps/chosen": -117.87129974365234, "logps/rejected": -626.9953002929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.583340644836426, "rewards/margins": 48.83264923095703, "rewards/rejected": -55.41598892211914, "step": 1083 }, { "epoch": 0.8845369237046103, "grad_norm": 1.3784876018441139e-11, "learning_rate": 2.0248840194898156e-05, "logits/chosen": -7.584636211395264, "logits/rejected": -7.552645683288574, "logps/chosen": -142.05650329589844, "logps/rejected": -657.962646484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.271238327026367, "rewards/margins": 48.960060119628906, "rewards/rejected": -57.23129653930664, "step": 1084 }, { "epoch": 0.8853529171766625, "grad_norm": 5.786867512957983e-11, "learning_rate": 2.015873929087482e-05, "logits/chosen": -7.70976448059082, "logits/rejected": -7.772372722625732, "logps/chosen": -119.65131378173828, "logps/rejected": -618.71630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.9706711769104, "rewards/margins": 48.60588455200195, "rewards/rejected": -56.576560974121094, "step": 1085 }, { "epoch": 0.8861689106487148, "grad_norm": 4.56409920707862e-13, "learning_rate": 2.0068788652828242e-05, "logits/chosen": -7.023508548736572, "logits/rejected": -7.735404014587402, "logps/chosen": -137.8614959716797, "logps/rejected": -588.4012451171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.09121036529541, "rewards/margins": 44.49663162231445, "rewards/rejected": -53.58784103393555, "step": 1086 }, { "epoch": 0.886984904120767, "grad_norm": 7.721093903123588e-11, "learning_rate": 1.9978988733705807e-05, "logits/chosen": -6.564457893371582, "logits/rejected": -7.07019567489624, "logps/chosen": -154.74632263183594, "logps/rejected": -658.6039428710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.322032928466797, "rewards/margins": 50.32207107543945, "rewards/rejected": -60.64410400390625, "step": 1087 }, { "epoch": 0.8878008975928192, "grad_norm": 1.580657826849574e-10, "learning_rate": 1.9889339985695893e-05, "logits/chosen": -6.757185935974121, "logits/rejected": -7.836286544799805, "logps/chosen": -161.19981384277344, "logps/rejected": -667.3612670898438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.971321105957031, "rewards/margins": 51.08329772949219, "rewards/rejected": -60.054622650146484, "step": 1088 }, { "epoch": 0.8886168910648715, "grad_norm": 7.982172246556729e-05, "learning_rate": 1.979984286022574e-05, "logits/chosen": -7.548434257507324, "logits/rejected": -7.564579963684082, "logps/chosen": -106.68846893310547, "logps/rejected": -482.2628173828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.329278945922852, "rewards/margins": 36.94706344604492, "rewards/rejected": -43.276344299316406, "step": 1089 }, { "epoch": 0.8894328845369237, "grad_norm": 0.01464772503823042, "learning_rate": 1.971049780795901e-05, "logits/chosen": -7.489234924316406, "logits/rejected": -6.909248352050781, "logps/chosen": -136.76539611816406, "logps/rejected": -549.3875732421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.13443374633789, "rewards/margins": 41.62824630737305, "rewards/rejected": -49.76268005371094, "step": 1090 }, { "epoch": 0.8902488780089759, "grad_norm": 6.513909816741943, "learning_rate": 1.9621305278793656e-05, "logits/chosen": -7.242232322692871, "logits/rejected": -7.105123519897461, "logps/chosen": -123.19844055175781, "logps/rejected": -519.712158203125, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": -7.769237041473389, "rewards/margins": 38.58500671386719, "rewards/rejected": -46.354248046875, "step": 1091 }, { "epoch": 0.8910648714810282, "grad_norm": 0.00445168511942029, "learning_rate": 1.95322657218596e-05, "logits/chosen": -7.330022811889648, "logits/rejected": -8.467903137207031, "logps/chosen": -83.82913970947266, "logps/rejected": -558.0216064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.423640727996826, "rewards/margins": 46.97949981689453, "rewards/rejected": -51.40314483642578, "step": 1092 }, { "epoch": 0.8918808649530804, "grad_norm": 0.4584499001502991, "learning_rate": 1.9443379585516414e-05, "logits/chosen": -7.029788970947266, "logits/rejected": -6.823813438415527, "logps/chosen": -157.59278869628906, "logps/rejected": -519.5042724609375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -10.691435813903809, "rewards/margins": 35.665531158447266, "rewards/rejected": -46.356971740722656, "step": 1093 }, { "epoch": 0.8926968584251326, "grad_norm": 0.10085583478212357, "learning_rate": 1.9354647317351188e-05, "logits/chosen": -7.527275085449219, "logits/rejected": -7.690697193145752, "logps/chosen": -124.20128631591797, "logps/rejected": -405.2897644042969, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.294943809509277, "rewards/margins": 27.936065673828125, "rewards/rejected": -36.23101043701172, "step": 1094 }, { "epoch": 0.8935128518971848, "grad_norm": 6.406218017218634e-05, "learning_rate": 1.926606936417614e-05, "logits/chosen": -6.964505195617676, "logits/rejected": -7.423449516296387, "logps/chosen": -105.63005065917969, "logps/rejected": -622.2555541992188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.626858234405518, "rewards/margins": 51.00697326660156, "rewards/rejected": -55.63383483886719, "step": 1095 }, { "epoch": 0.8943288453692371, "grad_norm": 1.5364855698862812e-06, "learning_rate": 1.9177646172026513e-05, "logits/chosen": -6.964550018310547, "logits/rejected": -7.298067092895508, "logps/chosen": -120.53507995605469, "logps/rejected": -566.617431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.763580322265625, "rewards/margins": 43.77137756347656, "rewards/rejected": -51.53495788574219, "step": 1096 }, { "epoch": 0.8951448388412893, "grad_norm": 7.084693152137334e-07, "learning_rate": 1.9089378186158154e-05, "logits/chosen": -6.247053623199463, "logits/rejected": -6.804183006286621, "logps/chosen": -128.2218475341797, "logps/rejected": -574.3028564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.5363383293151855, "rewards/margins": 43.97453689575195, "rewards/rejected": -51.51087951660156, "step": 1097 }, { "epoch": 0.8959608323133414, "grad_norm": 9.405973833054304e-05, "learning_rate": 1.900126585104547e-05, "logits/chosen": -6.141485214233398, "logits/rejected": -6.710798263549805, "logps/chosen": -152.0536651611328, "logps/rejected": -703.9891967773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.531711101531982, "rewards/margins": 55.47283935546875, "rewards/rejected": -62.00455093383789, "step": 1098 }, { "epoch": 0.8967768257853937, "grad_norm": 4.145594024312915e-12, "learning_rate": 1.8913309610379015e-05, "logits/chosen": -6.9206366539001465, "logits/rejected": -7.444928169250488, "logps/chosen": -122.89987182617188, "logps/rejected": -616.2235717773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.2836785316467285, "rewards/margins": 48.81224822998047, "rewards/rejected": -56.095924377441406, "step": 1099 }, { "epoch": 0.8975928192574459, "grad_norm": 1.92721860692302e-09, "learning_rate": 1.8825509907063327e-05, "logits/chosen": -6.271638870239258, "logits/rejected": -7.434557914733887, "logps/chosen": -130.59451293945312, "logps/rejected": -686.735107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.4360198974609375, "rewards/margins": 52.326473236083984, "rewards/rejected": -59.76249694824219, "step": 1100 }, { "epoch": 0.8984088127294981, "grad_norm": 5.670936742657555e-14, "learning_rate": 1.8737867183214757e-05, "logits/chosen": -6.653785705566406, "logits/rejected": -6.174010276794434, "logps/chosen": -112.35467529296875, "logps/rejected": -660.4691772460938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.7553815841674805, "rewards/margins": 55.008033752441406, "rewards/rejected": -60.76342010498047, "step": 1101 }, { "epoch": 0.8992248062015504, "grad_norm": 6.641878345058083e-13, "learning_rate": 1.8650381880159106e-05, "logits/chosen": -6.876583099365234, "logits/rejected": -7.2240142822265625, "logps/chosen": -167.15916442871094, "logps/rejected": -703.189697265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.357057571411133, "rewards/margins": 53.35264587402344, "rewards/rejected": -64.70970916748047, "step": 1102 }, { "epoch": 0.9000407996736026, "grad_norm": 3.306181283607712e-12, "learning_rate": 1.8563054438429545e-05, "logits/chosen": -6.590597629547119, "logits/rejected": -6.5098137855529785, "logps/chosen": -89.85095977783203, "logps/rejected": -527.5140380859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.041618585586548, "rewards/margins": 42.57188415527344, "rewards/rejected": -45.613502502441406, "step": 1103 }, { "epoch": 0.9008567931456548, "grad_norm": 3.3265878940369475e-10, "learning_rate": 1.8475885297764305e-05, "logits/chosen": -6.722969055175781, "logits/rejected": -7.50646448135376, "logps/chosen": -144.64962768554688, "logps/rejected": -603.6268310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.486413955688477, "rewards/margins": 44.75495147705078, "rewards/rejected": -54.24136734008789, "step": 1104 }, { "epoch": 0.9016727866177071, "grad_norm": 3.1862359046936035, "learning_rate": 1.838887489710452e-05, "logits/chosen": -6.886572360992432, "logits/rejected": -7.184734344482422, "logps/chosen": -124.53517150878906, "logps/rejected": -474.89617919921875, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": -6.605356693267822, "rewards/margins": 36.39944076538086, "rewards/rejected": -43.00479507446289, "step": 1105 }, { "epoch": 0.9024887800897593, "grad_norm": 2.5371835743470683e-09, "learning_rate": 1.8302023674591935e-05, "logits/chosen": -8.003538131713867, "logits/rejected": -7.283989906311035, "logps/chosen": -134.36886596679688, "logps/rejected": -575.0875854492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.103466033935547, "rewards/margins": 42.892120361328125, "rewards/rejected": -51.995582580566406, "step": 1106 }, { "epoch": 0.9033047735618115, "grad_norm": 9.597305589137406e-13, "learning_rate": 1.8215332067566764e-05, "logits/chosen": -7.231374740600586, "logits/rejected": -6.924656391143799, "logps/chosen": -123.76591491699219, "logps/rejected": -704.5792236328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.047192573547363, "rewards/margins": 58.187705993652344, "rewards/rejected": -64.23490142822266, "step": 1107 }, { "epoch": 0.9041207670338637, "grad_norm": 2.3438831171090913e-12, "learning_rate": 1.8128800512565513e-05, "logits/chosen": -6.903187274932861, "logits/rejected": -7.065579891204834, "logps/chosen": -131.59291076660156, "logps/rejected": -654.6253662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.1180524826049805, "rewards/margins": 53.143035888671875, "rewards/rejected": -59.26108932495117, "step": 1108 }, { "epoch": 0.904936760505916, "grad_norm": 1.5338572426082386e-11, "learning_rate": 1.804242944531872e-05, "logits/chosen": -7.565974235534668, "logits/rejected": -6.818000793457031, "logps/chosen": -132.14804077148438, "logps/rejected": -589.017333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.287580490112305, "rewards/margins": 45.58216094970703, "rewards/rejected": -52.8697395324707, "step": 1109 }, { "epoch": 0.9057527539779682, "grad_norm": 0.0005592904635705054, "learning_rate": 1.7956219300748793e-05, "logits/chosen": -7.785189151763916, "logits/rejected": -7.464883327484131, "logps/chosen": -147.01756286621094, "logps/rejected": -526.6019897460938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.037928581237793, "rewards/margins": 38.86616516113281, "rewards/rejected": -47.90409851074219, "step": 1110 }, { "epoch": 0.9065687474500204, "grad_norm": 2.899985854481601e-15, "learning_rate": 1.7870170512967786e-05, "logits/chosen": -6.504182815551758, "logits/rejected": -5.150595188140869, "logps/chosen": -138.79104614257812, "logps/rejected": -671.259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.208583831787109, "rewards/margins": 51.8308219909668, "rewards/rejected": -59.039405822753906, "step": 1111 }, { "epoch": 0.9073847409220727, "grad_norm": 1.5866433167310845e-11, "learning_rate": 1.778428351527529e-05, "logits/chosen": -7.994728088378906, "logits/rejected": -6.811821937561035, "logps/chosen": -121.04143524169922, "logps/rejected": -663.6031494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.746923446655273, "rewards/margins": 53.36548614501953, "rewards/rejected": -60.11241149902344, "step": 1112 }, { "epoch": 0.9082007343941249, "grad_norm": 2.250407760584494e-06, "learning_rate": 1.7698558740156135e-05, "logits/chosen": -7.287871360778809, "logits/rejected": -7.018043518066406, "logps/chosen": -145.76931762695312, "logps/rejected": -653.7904663085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.466585159301758, "rewards/margins": 49.52812194824219, "rewards/rejected": -58.99470520019531, "step": 1113 }, { "epoch": 0.909016727866177, "grad_norm": 0.0010312001686543226, "learning_rate": 1.7612996619278322e-05, "logits/chosen": -7.187777519226074, "logits/rejected": -6.483051776885986, "logps/chosen": -121.04930114746094, "logps/rejected": -592.9542846679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.347536563873291, "rewards/margins": 47.171630859375, "rewards/rejected": -53.5191650390625, "step": 1114 }, { "epoch": 0.9098327213382293, "grad_norm": 0.0006135955336503685, "learning_rate": 1.7527597583490822e-05, "logits/chosen": -6.956814765930176, "logits/rejected": -7.0183491706848145, "logps/chosen": -168.95904541015625, "logps/rejected": -651.545166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.846373558044434, "rewards/margins": 49.24065017700195, "rewards/rejected": -59.0870246887207, "step": 1115 }, { "epoch": 0.9106487148102815, "grad_norm": 8.86724981635778e-12, "learning_rate": 1.744236206282132e-05, "logits/chosen": -7.3057756423950195, "logits/rejected": -6.457116603851318, "logps/chosen": -118.28044128417969, "logps/rejected": -592.678955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.261096000671387, "rewards/margins": 46.05619812011719, "rewards/rejected": -53.317291259765625, "step": 1116 }, { "epoch": 0.9114647082823337, "grad_norm": 0.044849738478660583, "learning_rate": 1.7357290486474203e-05, "logits/chosen": -7.281096458435059, "logits/rejected": -6.53550386428833, "logps/chosen": -128.2425537109375, "logps/rejected": -682.0823974609375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.505494117736816, "rewards/margins": 52.98163604736328, "rewards/rejected": -61.48713684082031, "step": 1117 }, { "epoch": 0.9122807017543859, "grad_norm": 0.0002832544851116836, "learning_rate": 1.7272383282828253e-05, "logits/chosen": -7.709177017211914, "logits/rejected": -7.081418991088867, "logps/chosen": -153.88003540039062, "logps/rejected": -469.3575134277344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.785005569458008, "rewards/margins": 32.31218719482422, "rewards/rejected": -43.097190856933594, "step": 1118 }, { "epoch": 0.9130966952264382, "grad_norm": 1.2912568791989543e-09, "learning_rate": 1.7187640879434557e-05, "logits/chosen": -6.4012250900268555, "logits/rejected": -6.8440117835998535, "logps/chosen": -201.0283966064453, "logps/rejected": -645.6533203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.895970344543457, "rewards/margins": 46.06568145751953, "rewards/rejected": -58.96165466308594, "step": 1119 }, { "epoch": 0.9139126886984904, "grad_norm": 2.916928133345209e-05, "learning_rate": 1.7103063703014372e-05, "logits/chosen": -7.56565523147583, "logits/rejected": -7.459561347961426, "logps/chosen": -117.23164367675781, "logps/rejected": -503.66729736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.343488693237305, "rewards/margins": 38.929290771484375, "rewards/rejected": -46.27277755737305, "step": 1120 }, { "epoch": 0.9147286821705426, "grad_norm": 4.093767529411707e-06, "learning_rate": 1.7018652179456956e-05, "logits/chosen": -7.6530303955078125, "logits/rejected": -6.7796831130981445, "logps/chosen": -123.35159301757812, "logps/rejected": -579.0963134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.616498947143555, "rewards/margins": 47.2216682434082, "rewards/rejected": -52.838165283203125, "step": 1121 }, { "epoch": 0.9155446756425949, "grad_norm": 1.4549493554394388e-10, "learning_rate": 1.6934406733817414e-05, "logits/chosen": -7.356721878051758, "logits/rejected": -7.102889060974121, "logps/chosen": -162.35015869140625, "logps/rejected": -615.3013305664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.625631332397461, "rewards/margins": 44.225608825683594, "rewards/rejected": -55.85124206542969, "step": 1122 }, { "epoch": 0.9163606691146471, "grad_norm": 6.164853241612256e-13, "learning_rate": 1.685032779031453e-05, "logits/chosen": -8.18823528289795, "logits/rejected": -7.7823991775512695, "logps/chosen": -134.38963317871094, "logps/rejected": -605.0186767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.6466217041015625, "rewards/margins": 48.22352600097656, "rewards/rejected": -55.87015151977539, "step": 1123 }, { "epoch": 0.9171766625866993, "grad_norm": 8.733581566278126e-12, "learning_rate": 1.676641577232873e-05, "logits/chosen": -7.203513145446777, "logits/rejected": -5.572113513946533, "logps/chosen": -139.7464141845703, "logps/rejected": -722.9542236328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.206775665283203, "rewards/margins": 57.72871780395508, "rewards/rejected": -65.93549346923828, "step": 1124 }, { "epoch": 0.9179926560587516, "grad_norm": 2.97726387543662e-06, "learning_rate": 1.6682671102399805e-05, "logits/chosen": -7.392849445343018, "logits/rejected": -6.656040191650391, "logps/chosen": -133.380126953125, "logps/rejected": -601.580322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.270519256591797, "rewards/margins": 47.509674072265625, "rewards/rejected": -54.78018569946289, "step": 1125 }, { "epoch": 0.9188086495308038, "grad_norm": 0.00578769575804472, "learning_rate": 1.6599094202224934e-05, "logits/chosen": -6.7985124588012695, "logits/rejected": -7.019556045532227, "logps/chosen": -123.28785705566406, "logps/rejected": -699.7650756835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.399803638458252, "rewards/margins": 56.948570251464844, "rewards/rejected": -63.34837341308594, "step": 1126 }, { "epoch": 0.919624643002856, "grad_norm": 0.002885531634092331, "learning_rate": 1.6515685492656467e-05, "logits/chosen": -6.5552191734313965, "logits/rejected": -5.966639518737793, "logps/chosen": -108.62010955810547, "logps/rejected": -573.5120849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.737054347991943, "rewards/margins": 46.18852615356445, "rewards/rejected": -51.92558288574219, "step": 1127 }, { "epoch": 0.9204406364749081, "grad_norm": 0.012259893119335175, "learning_rate": 1.64324453936998e-05, "logits/chosen": -7.802640914916992, "logits/rejected": -7.5420241355896, "logps/chosen": -137.82090759277344, "logps/rejected": -628.7078247070312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.774868488311768, "rewards/margins": 47.603729248046875, "rewards/rejected": -55.378597259521484, "step": 1128 }, { "epoch": 0.9212566299469604, "grad_norm": 0.0017162562580779195, "learning_rate": 1.6349374324511345e-05, "logits/chosen": -8.000518798828125, "logits/rejected": -7.864659786224365, "logps/chosen": -171.76800537109375, "logps/rejected": -578.36767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.303716659545898, "rewards/margins": 39.53121566772461, "rewards/rejected": -51.83493423461914, "step": 1129 }, { "epoch": 0.9220726234190126, "grad_norm": 4.784208316308187e-16, "learning_rate": 1.6266472703396286e-05, "logits/chosen": -8.162973403930664, "logits/rejected": -7.72489070892334, "logps/chosen": -155.1001739501953, "logps/rejected": -778.3590087890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.033145904541016, "rewards/margins": 59.6159553527832, "rewards/rejected": -68.64910125732422, "step": 1130 }, { "epoch": 0.9228886168910648, "grad_norm": 7.524077931878037e-13, "learning_rate": 1.618374094780662e-05, "logits/chosen": -7.79013729095459, "logits/rejected": -6.463640213012695, "logps/chosen": -147.2064208984375, "logps/rejected": -606.9465942382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.996176719665527, "rewards/margins": 47.201576232910156, "rewards/rejected": -56.19775390625, "step": 1131 }, { "epoch": 0.9237046103631171, "grad_norm": 1.1817553805372558e-14, "learning_rate": 1.610117947433897e-05, "logits/chosen": -7.368673801422119, "logits/rejected": -6.567935943603516, "logps/chosen": -184.85122680664062, "logps/rejected": -683.149169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.347640991210938, "rewards/margins": 50.409568786621094, "rewards/rejected": -62.757205963134766, "step": 1132 }, { "epoch": 0.9245206038351693, "grad_norm": 1.2358280400803778e-05, "learning_rate": 1.6018788698732444e-05, "logits/chosen": -7.418164253234863, "logits/rejected": -7.040445327758789, "logps/chosen": -130.0794219970703, "logps/rejected": -616.0155639648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.998896598815918, "rewards/margins": 48.400390625, "rewards/rejected": -57.399288177490234, "step": 1133 }, { "epoch": 0.9253365973072215, "grad_norm": 0.00032175102387554944, "learning_rate": 1.59365690358667e-05, "logits/chosen": -7.855903625488281, "logits/rejected": -6.898977756500244, "logps/chosen": -204.4007568359375, "logps/rejected": -554.568359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.17025089263916, "rewards/margins": 35.45344924926758, "rewards/rejected": -50.62369918823242, "step": 1134 }, { "epoch": 0.9261525907792738, "grad_norm": 2.5940492809350957e-11, "learning_rate": 1.5854520899759657e-05, "logits/chosen": -7.19679069519043, "logits/rejected": -6.397531032562256, "logps/chosen": -173.95635986328125, "logps/rejected": -693.7006225585938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.606866836547852, "rewards/margins": 50.43309020996094, "rewards/rejected": -62.03996276855469, "step": 1135 }, { "epoch": 0.926968584251326, "grad_norm": 0.00011674805136863142, "learning_rate": 1.5772644703565565e-05, "logits/chosen": -8.1903076171875, "logits/rejected": -6.931133270263672, "logps/chosen": -182.76751708984375, "logps/rejected": -506.83251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.304527282714844, "rewards/margins": 33.9619140625, "rewards/rejected": -46.266441345214844, "step": 1136 }, { "epoch": 0.9277845777233782, "grad_norm": 1.5606094162182865e-13, "learning_rate": 1.5690940859572862e-05, "logits/chosen": -7.940272808074951, "logits/rejected": -6.599405765533447, "logps/chosen": -147.79598999023438, "logps/rejected": -670.20703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.589943885803223, "rewards/margins": 52.091331481933594, "rewards/rejected": -61.6812744140625, "step": 1137 }, { "epoch": 0.9286005711954305, "grad_norm": 5.2500619953863015e-14, "learning_rate": 1.5609409779202106e-05, "logits/chosen": -6.652703285217285, "logits/rejected": -5.996033668518066, "logps/chosen": -167.40296936035156, "logps/rejected": -835.5472412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.216285705566406, "rewards/margins": 65.1540298461914, "rewards/rejected": -75.37031555175781, "step": 1138 }, { "epoch": 0.9294165646674827, "grad_norm": 2.2538374722103072e-08, "learning_rate": 1.552805187300389e-05, "logits/chosen": -7.212708473205566, "logits/rejected": -7.101888656616211, "logps/chosen": -138.4595489501953, "logps/rejected": -572.2574462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.687419891357422, "rewards/margins": 44.059844970703125, "rewards/rejected": -51.74726867675781, "step": 1139 }, { "epoch": 0.9302325581395349, "grad_norm": 2.3028885113955677e-13, "learning_rate": 1.544686755065677e-05, "logits/chosen": -7.101231575012207, "logits/rejected": -6.739551544189453, "logps/chosen": -118.80558776855469, "logps/rejected": -716.164306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.070984840393066, "rewards/margins": 59.236934661865234, "rewards/rejected": -64.30792236328125, "step": 1140 }, { "epoch": 0.9310485516115871, "grad_norm": 4.8267141394831015e-09, "learning_rate": 1.5365857220965275e-05, "logits/chosen": -7.866279602050781, "logits/rejected": -6.624053001403809, "logps/chosen": -126.85305786132812, "logps/rejected": -568.106201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.591221809387207, "rewards/margins": 43.21352767944336, "rewards/rejected": -51.80474853515625, "step": 1141 }, { "epoch": 0.9318645450836394, "grad_norm": 6.565575461081608e-08, "learning_rate": 1.5285021291857705e-05, "logits/chosen": -7.716455459594727, "logits/rejected": -7.4706549644470215, "logps/chosen": -139.3629608154297, "logps/rejected": -562.4326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.973700523376465, "rewards/margins": 42.324005126953125, "rewards/rejected": -51.297706604003906, "step": 1142 }, { "epoch": 0.9326805385556916, "grad_norm": 2.5960464000718275e-08, "learning_rate": 1.5204360170384286e-05, "logits/chosen": -6.876330852508545, "logits/rejected": -6.53967809677124, "logps/chosen": -113.04824829101562, "logps/rejected": -524.0091552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.215922832489014, "rewards/margins": 40.17789840698242, "rewards/rejected": -46.393821716308594, "step": 1143 }, { "epoch": 0.9334965320277437, "grad_norm": 5.17938624389376e-16, "learning_rate": 1.5123874262714893e-05, "logits/chosen": -6.334572792053223, "logits/rejected": -7.4973344802856445, "logps/chosen": -103.38703918457031, "logps/rejected": -663.0143432617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.974421977996826, "rewards/margins": 55.393951416015625, "rewards/rejected": -59.368370056152344, "step": 1144 }, { "epoch": 0.934312525499796, "grad_norm": 3.6778904366485676e-09, "learning_rate": 1.504356397413713e-05, "logits/chosen": -6.88292121887207, "logits/rejected": -6.712118625640869, "logps/chosen": -196.62811279296875, "logps/rejected": -555.392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.058181762695312, "rewards/margins": 36.528472900390625, "rewards/rejected": -51.58665466308594, "step": 1145 }, { "epoch": 0.9351285189718482, "grad_norm": 1.3774513085706985e-08, "learning_rate": 1.4963429709054322e-05, "logits/chosen": -6.968238353729248, "logits/rejected": -7.149592399597168, "logps/chosen": -160.82444763183594, "logps/rejected": -661.8172607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.320356369018555, "rewards/margins": 49.76243209838867, "rewards/rejected": -60.082794189453125, "step": 1146 }, { "epoch": 0.9359445124439004, "grad_norm": 57.595462799072266, "learning_rate": 1.488347187098335e-05, "logits/chosen": -6.387712478637695, "logits/rejected": -6.201419830322266, "logps/chosen": -186.46096801757812, "logps/rejected": -582.3863525390625, "loss": 2.621, "rewards/accuracies": 0.875, "rewards/chosen": -12.121153831481934, "rewards/margins": 40.907657623291016, "rewards/rejected": -53.02880859375, "step": 1147 }, { "epoch": 0.9367605059159527, "grad_norm": 4.364372836111567e-14, "learning_rate": 1.4803690862552755e-05, "logits/chosen": -6.363498210906982, "logits/rejected": -6.402423858642578, "logps/chosen": -138.15243530273438, "logps/rejected": -687.40625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.487557411193848, "rewards/margins": 54.22333908081055, "rewards/rejected": -62.71089553833008, "step": 1148 }, { "epoch": 0.9375764993880049, "grad_norm": 0.00013180523819755763, "learning_rate": 1.4724087085500627e-05, "logits/chosen": -7.470310211181641, "logits/rejected": -7.133910655975342, "logps/chosen": -116.8299560546875, "logps/rejected": -507.7726745605469, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.724471092224121, "rewards/margins": 38.91522979736328, "rewards/rejected": -45.63970184326172, "step": 1149 }, { "epoch": 0.9383924928600571, "grad_norm": 7.351823039236649e-10, "learning_rate": 1.4644660940672627e-05, "logits/chosen": -7.736476898193359, "logits/rejected": -7.318112850189209, "logps/chosen": -108.68844604492188, "logps/rejected": -620.30859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.679969787597656, "rewards/margins": 50.542335510253906, "rewards/rejected": -57.22230529785156, "step": 1150 }, { "epoch": 0.9392084863321093, "grad_norm": 3.1635844134912704e-15, "learning_rate": 1.4565412828019914e-05, "logits/chosen": -7.490141868591309, "logits/rejected": -6.5377020835876465, "logps/chosen": -111.20030212402344, "logps/rejected": -655.0897216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.9962477684021, "rewards/margins": 53.97710037231445, "rewards/rejected": -59.973350524902344, "step": 1151 }, { "epoch": 0.9400244798041616, "grad_norm": 1.786983943929954e-06, "learning_rate": 1.4486343146597154e-05, "logits/chosen": -6.894463539123535, "logits/rejected": -6.610096454620361, "logps/chosen": -148.70208740234375, "logps/rejected": -614.1356811523438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.325251579284668, "rewards/margins": 47.48878479003906, "rewards/rejected": -55.81403350830078, "step": 1152 }, { "epoch": 0.9408404732762138, "grad_norm": 0.239919513463974, "learning_rate": 1.4407452294560569e-05, "logits/chosen": -7.477836608886719, "logits/rejected": -7.411771774291992, "logps/chosen": -156.60972595214844, "logps/rejected": -572.1832275390625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -11.323211669921875, "rewards/margins": 40.13239288330078, "rewards/rejected": -51.455604553222656, "step": 1153 }, { "epoch": 0.941656466748266, "grad_norm": 3.2741836585720163e-12, "learning_rate": 1.4328740669165857e-05, "logits/chosen": -8.476795196533203, "logits/rejected": -7.107411861419678, "logps/chosen": -110.84146881103516, "logps/rejected": -498.5096740722656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.696564674377441, "rewards/margins": 39.773563385009766, "rewards/rejected": -45.47012710571289, "step": 1154 }, { "epoch": 0.9424724602203183, "grad_norm": 4.893748709218493e-13, "learning_rate": 1.4250208666766235e-05, "logits/chosen": -7.452364921569824, "logits/rejected": -6.904690742492676, "logps/chosen": -158.56451416015625, "logps/rejected": -753.1148681640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.856307983398438, "rewards/margins": 57.11131286621094, "rewards/rejected": -65.96762084960938, "step": 1155 }, { "epoch": 0.9432884536923705, "grad_norm": 4.5783585811420835e-09, "learning_rate": 1.4171856682810386e-05, "logits/chosen": -7.446739196777344, "logits/rejected": -6.9744062423706055, "logps/chosen": -115.9423599243164, "logps/rejected": -488.9326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.02372932434082, "rewards/margins": 36.81303787231445, "rewards/rejected": -43.836769104003906, "step": 1156 }, { "epoch": 0.9441044471644227, "grad_norm": 1.1460034310406542e-10, "learning_rate": 1.4093685111840566e-05, "logits/chosen": -7.902061462402344, "logits/rejected": -7.0689191818237305, "logps/chosen": -124.55236053466797, "logps/rejected": -628.2977294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.823260307312012, "rewards/margins": 50.02910614013672, "rewards/rejected": -57.85236740112305, "step": 1157 }, { "epoch": 0.944920440636475, "grad_norm": 4.6523836866319e-11, "learning_rate": 1.401569434749051e-05, "logits/chosen": -6.825101852416992, "logits/rejected": -6.8212385177612305, "logps/chosen": -127.55630493164062, "logps/rejected": -551.5949096679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.588115692138672, "rewards/margins": 43.65840148925781, "rewards/rejected": -50.246517181396484, "step": 1158 }, { "epoch": 0.9457364341085271, "grad_norm": 5.5529279091581785e-12, "learning_rate": 1.3937884782483484e-05, "logits/chosen": -6.661940574645996, "logits/rejected": -6.778162956237793, "logps/chosen": -116.91413879394531, "logps/rejected": -544.560546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.329556465148926, "rewards/margins": 43.01204299926758, "rewards/rejected": -48.34159851074219, "step": 1159 }, { "epoch": 0.9465524275805793, "grad_norm": 6.555318510237385e-10, "learning_rate": 1.3860256808630428e-05, "logits/chosen": -6.4960036277771, "logits/rejected": -6.2599568367004395, "logps/chosen": -166.9423828125, "logps/rejected": -691.970703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.256244659423828, "rewards/margins": 52.41993713378906, "rewards/rejected": -62.676177978515625, "step": 1160 }, { "epoch": 0.9473684210526315, "grad_norm": 1.1647310720064308e-11, "learning_rate": 1.3782810816827751e-05, "logits/chosen": -6.808387756347656, "logits/rejected": -6.519811630249023, "logps/chosen": -146.38711547851562, "logps/rejected": -622.4691162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.816367149353027, "rewards/margins": 47.885902404785156, "rewards/rejected": -56.702266693115234, "step": 1161 }, { "epoch": 0.9481844145246838, "grad_norm": 1.932660052261781e-05, "learning_rate": 1.3705547197055584e-05, "logits/chosen": -7.7110185623168945, "logits/rejected": -7.4767069816589355, "logps/chosen": -102.76018524169922, "logps/rejected": -522.746337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.244107723236084, "rewards/margins": 41.5728759765625, "rewards/rejected": -47.816986083984375, "step": 1162 }, { "epoch": 0.949000407996736, "grad_norm": 7.445166705366546e-09, "learning_rate": 1.3628466338375661e-05, "logits/chosen": -8.329639434814453, "logits/rejected": -7.375041484832764, "logps/chosen": -131.94766235351562, "logps/rejected": -537.3768310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.906369209289551, "rewards/margins": 40.61806106567383, "rewards/rejected": -48.52442932128906, "step": 1163 }, { "epoch": 0.9498164014687882, "grad_norm": 6.105855820948136e-09, "learning_rate": 1.3551568628929434e-05, "logits/chosen": -7.5122175216674805, "logits/rejected": -6.8663434982299805, "logps/chosen": -117.64839935302734, "logps/rejected": -662.6881103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.863827705383301, "rewards/margins": 52.91696548461914, "rewards/rejected": -60.780792236328125, "step": 1164 }, { "epoch": 0.9506323949408405, "grad_norm": 1.10815729422592e-12, "learning_rate": 1.3474854455936126e-05, "logits/chosen": -7.421065330505371, "logits/rejected": -6.725497245788574, "logps/chosen": -130.6546173095703, "logps/rejected": -538.4093017578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.816342353820801, "rewards/margins": 41.19114685058594, "rewards/rejected": -49.00748825073242, "step": 1165 }, { "epoch": 0.9514483884128927, "grad_norm": 1.983127273508245e-10, "learning_rate": 1.3398324205690742e-05, "logits/chosen": -5.944937705993652, "logits/rejected": -7.573111534118652, "logps/chosen": -173.743896484375, "logps/rejected": -649.1138305664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.04800033569336, "rewards/margins": 46.36924362182617, "rewards/rejected": -56.41724395751953, "step": 1166 }, { "epoch": 0.9522643818849449, "grad_norm": 3.2041879457871125e-13, "learning_rate": 1.3321978263562174e-05, "logits/chosen": -6.576235771179199, "logits/rejected": -6.609488487243652, "logps/chosen": -146.61453247070312, "logps/rejected": -730.4959716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.096159934997559, "rewards/margins": 57.58885955810547, "rewards/rejected": -66.68502044677734, "step": 1167 }, { "epoch": 0.9530803753569972, "grad_norm": 3.2683349082685e-11, "learning_rate": 1.3245817013991162e-05, "logits/chosen": -6.691068649291992, "logits/rejected": -5.754236221313477, "logps/chosen": -159.89694213867188, "logps/rejected": -653.4036865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.657848358154297, "rewards/margins": 47.16819381713867, "rewards/rejected": -57.8260383605957, "step": 1168 }, { "epoch": 0.9538963688290494, "grad_norm": 0.00044441979844123125, "learning_rate": 1.3169840840488501e-05, "logits/chosen": -7.165187835693359, "logits/rejected": -6.763363838195801, "logps/chosen": -176.900390625, "logps/rejected": -563.9129638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.226061820983887, "rewards/margins": 38.74515914916992, "rewards/rejected": -51.971221923828125, "step": 1169 }, { "epoch": 0.9547123623011016, "grad_norm": 1.230113493705387e-11, "learning_rate": 1.3094050125632972e-05, "logits/chosen": -7.381021499633789, "logits/rejected": -7.072745323181152, "logps/chosen": -134.03907775878906, "logps/rejected": -658.152587890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.656729698181152, "rewards/margins": 50.3661003112793, "rewards/rejected": -59.022830963134766, "step": 1170 }, { "epoch": 0.9555283557731539, "grad_norm": 2.397080978155941e-09, "learning_rate": 1.301844525106951e-05, "logits/chosen": -7.412420272827148, "logits/rejected": -7.511662483215332, "logps/chosen": -142.014404296875, "logps/rejected": -546.6196899414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.8902997970581055, "rewards/margins": 41.77096176147461, "rewards/rejected": -49.661258697509766, "step": 1171 }, { "epoch": 0.9563443492452061, "grad_norm": 9.642330267567445e-10, "learning_rate": 1.2943026597507269e-05, "logits/chosen": -8.318841934204102, "logits/rejected": -7.3736982345581055, "logps/chosen": -178.55007934570312, "logps/rejected": -592.7235107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.71102237701416, "rewards/margins": 41.220458984375, "rewards/rejected": -53.931480407714844, "step": 1172 }, { "epoch": 0.9571603427172583, "grad_norm": 7.069894114586361e-10, "learning_rate": 1.2867794544717614e-05, "logits/chosen": -7.1330976486206055, "logits/rejected": -6.604722023010254, "logps/chosen": -132.3365936279297, "logps/rejected": -677.7643432617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.188941955566406, "rewards/margins": 54.23170471191406, "rewards/rejected": -61.420650482177734, "step": 1173 }, { "epoch": 0.9579763361893104, "grad_norm": 1.1347529166316761e-10, "learning_rate": 1.2792749471532362e-05, "logits/chosen": -7.588987350463867, "logits/rejected": -7.235241889953613, "logps/chosen": -115.8415298461914, "logps/rejected": -551.7445068359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.387536525726318, "rewards/margins": 44.25907897949219, "rewards/rejected": -50.646610260009766, "step": 1174 }, { "epoch": 0.9587923296613627, "grad_norm": 1.3131322695869585e-08, "learning_rate": 1.2717891755841722e-05, "logits/chosen": -8.00667953491211, "logits/rejected": -7.624996185302734, "logps/chosen": -79.44429016113281, "logps/rejected": -561.45263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.733358144760132, "rewards/margins": 46.25382995605469, "rewards/rejected": -48.98719024658203, "step": 1175 }, { "epoch": 0.9596083231334149, "grad_norm": 1.720240128122441e-08, "learning_rate": 1.2643221774592518e-05, "logits/chosen": -6.5198564529418945, "logits/rejected": -6.654117107391357, "logps/chosen": -144.39883422851562, "logps/rejected": -604.1370239257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.356054306030273, "rewards/margins": 45.760643005371094, "rewards/rejected": -54.116695404052734, "step": 1176 }, { "epoch": 0.9604243166054671, "grad_norm": 5.306274530880728e-09, "learning_rate": 1.2568739903786213e-05, "logits/chosen": -6.975401878356934, "logits/rejected": -7.278813362121582, "logps/chosen": -165.1144561767578, "logps/rejected": -683.4727783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.254161834716797, "rewards/margins": 48.53536605834961, "rewards/rejected": -59.78952407836914, "step": 1177 }, { "epoch": 0.9612403100775194, "grad_norm": 3.578601763365441e-06, "learning_rate": 1.2494446518477022e-05, "logits/chosen": -6.672177314758301, "logits/rejected": -6.891894340515137, "logps/chosen": -149.00595092773438, "logps/rejected": -570.6575927734375, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -8.847668647766113, "rewards/margins": 42.87446212768555, "rewards/rejected": -51.722129821777344, "step": 1178 }, { "epoch": 0.9620563035495716, "grad_norm": 2.9154505415754373e-13, "learning_rate": 1.242034199277008e-05, "logits/chosen": -7.5415239334106445, "logits/rejected": -7.203076362609863, "logps/chosen": -134.08590698242188, "logps/rejected": -718.6574096679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.6733551025390625, "rewards/margins": 59.024940490722656, "rewards/rejected": -65.69830322265625, "step": 1179 }, { "epoch": 0.9628722970216238, "grad_norm": 2.0049428783863732e-10, "learning_rate": 1.2346426699819458e-05, "logits/chosen": -7.3151421546936035, "logits/rejected": -7.364412307739258, "logps/chosen": -122.55911254882812, "logps/rejected": -536.5571899414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.423955917358398, "rewards/margins": 41.1040153503418, "rewards/rejected": -48.52796936035156, "step": 1180 }, { "epoch": 0.9636882904936761, "grad_norm": 3.157387880037277e-07, "learning_rate": 1.2272701011826392e-05, "logits/chosen": -7.301874160766602, "logits/rejected": -7.297002792358398, "logps/chosen": -169.69532775878906, "logps/rejected": -527.0130615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.295766830444336, "rewards/margins": 37.11810302734375, "rewards/rejected": -47.41386795043945, "step": 1181 }, { "epoch": 0.9645042839657283, "grad_norm": 1.5163230273174122e-05, "learning_rate": 1.2199165300037357e-05, "logits/chosen": -6.6611528396606445, "logits/rejected": -6.59444522857666, "logps/chosen": -129.31434631347656, "logps/rejected": -539.4876708984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.092068195343018, "rewards/margins": 41.8177490234375, "rewards/rejected": -48.90981674194336, "step": 1182 }, { "epoch": 0.9653202774377805, "grad_norm": 1.4835246986422135e-07, "learning_rate": 1.2125819934742188e-05, "logits/chosen": -6.827138900756836, "logits/rejected": -6.617217063903809, "logps/chosen": -172.8956298828125, "logps/rejected": -502.69268798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.353939056396484, "rewards/margins": 32.40199661254883, "rewards/rejected": -45.75593566894531, "step": 1183 }, { "epoch": 0.9661362709098327, "grad_norm": 0.00010350630327593535, "learning_rate": 1.205266528527223e-05, "logits/chosen": -6.993449687957764, "logits/rejected": -6.378485679626465, "logps/chosen": -132.8719940185547, "logps/rejected": -591.1376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.50363540649414, "rewards/margins": 45.11537170410156, "rewards/rejected": -53.61901092529297, "step": 1184 }, { "epoch": 0.966952264381885, "grad_norm": 5.134938030337821e-10, "learning_rate": 1.1979701719998453e-05, "logits/chosen": -7.4138712882995605, "logits/rejected": -7.599052429199219, "logps/chosen": -118.30844116210938, "logps/rejected": -602.02294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.423765182495117, "rewards/margins": 47.58564758300781, "rewards/rejected": -55.0094108581543, "step": 1185 }, { "epoch": 0.9677682578539372, "grad_norm": 5.2948667672581e-10, "learning_rate": 1.190692960632968e-05, "logits/chosen": -6.299568176269531, "logits/rejected": -7.079009056091309, "logps/chosen": -157.91970825195312, "logps/rejected": -576.1212158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.819087982177734, "rewards/margins": 41.50067901611328, "rewards/rejected": -52.319766998291016, "step": 1186 }, { "epoch": 0.9685842513259894, "grad_norm": 8.755216549616307e-05, "learning_rate": 1.1834349310710608e-05, "logits/chosen": -6.601018905639648, "logits/rejected": -6.8838276863098145, "logps/chosen": -189.55368041992188, "logps/rejected": -552.22216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.342903137207031, "rewards/margins": 36.28550338745117, "rewards/rejected": -49.6284065246582, "step": 1187 }, { "epoch": 0.9694002447980417, "grad_norm": 1.5229724220026247e-13, "learning_rate": 1.176196119862008e-05, "logits/chosen": -7.3868088722229, "logits/rejected": -7.764196872711182, "logps/chosen": -138.33419799804688, "logps/rejected": -706.6485595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.26193904876709, "rewards/margins": 54.693050384521484, "rewards/rejected": -63.954994201660156, "step": 1188 }, { "epoch": 0.9702162382700938, "grad_norm": 0.0005634170374833047, "learning_rate": 1.16897656345692e-05, "logits/chosen": -8.347718238830566, "logits/rejected": -6.573906898498535, "logps/chosen": -161.07168579101562, "logps/rejected": -618.697509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.68581771850586, "rewards/margins": 45.176273345947266, "rewards/rejected": -55.862091064453125, "step": 1189 }, { "epoch": 0.971032231742146, "grad_norm": 8.249015692740613e-09, "learning_rate": 1.1617762982099446e-05, "logits/chosen": -7.556419849395752, "logits/rejected": -6.703402042388916, "logps/chosen": -157.7227325439453, "logps/rejected": -583.4690551757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.660643577575684, "rewards/margins": 42.134910583496094, "rewards/rejected": -52.795555114746094, "step": 1190 }, { "epoch": 0.9718482252141983, "grad_norm": 3.958480244981022e-12, "learning_rate": 1.154595360378095e-05, "logits/chosen": -6.871547222137451, "logits/rejected": -6.272480010986328, "logps/chosen": -161.5709686279297, "logps/rejected": -560.5164794921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.906436920166016, "rewards/margins": 40.964317321777344, "rewards/rejected": -51.87075424194336, "step": 1191 }, { "epoch": 0.9726642186862505, "grad_norm": 2.543975519131436e-08, "learning_rate": 1.1474337861210543e-05, "logits/chosen": -7.109729766845703, "logits/rejected": -7.458901405334473, "logps/chosen": -107.54964447021484, "logps/rejected": -666.8688354492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.237298965454102, "rewards/margins": 55.90203857421875, "rewards/rejected": -61.13933563232422, "step": 1192 }, { "epoch": 0.9734802121583027, "grad_norm": 9.199165748271544e-15, "learning_rate": 1.140291611501006e-05, "logits/chosen": -6.861810207366943, "logits/rejected": -7.071361541748047, "logps/chosen": -108.58892059326172, "logps/rejected": -613.8567504882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.899258613586426, "rewards/margins": 49.370548248291016, "rewards/rejected": -55.269805908203125, "step": 1193 }, { "epoch": 0.9742962056303549, "grad_norm": 3.105233847833233e-09, "learning_rate": 1.133168872482444e-05, "logits/chosen": -8.018560409545898, "logits/rejected": -7.898011684417725, "logps/chosen": -105.75494384765625, "logps/rejected": -494.26763916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.550743579864502, "rewards/margins": 37.4456901550293, "rewards/rejected": -41.996437072753906, "step": 1194 }, { "epoch": 0.9751121991024072, "grad_norm": 2.60737635926489e-07, "learning_rate": 1.1260656049319957e-05, "logits/chosen": -6.26933479309082, "logits/rejected": -6.781866550445557, "logps/chosen": -139.738037109375, "logps/rejected": -577.3156127929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.975550651550293, "rewards/margins": 44.57779312133789, "rewards/rejected": -52.553340911865234, "step": 1195 }, { "epoch": 0.9759281925744594, "grad_norm": 1.3501046947084205e-09, "learning_rate": 1.118981844618236e-05, "logits/chosen": -8.681135177612305, "logits/rejected": -8.047524452209473, "logps/chosen": -126.64323425292969, "logps/rejected": -614.5921630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.046533584594727, "rewards/margins": 48.0692024230957, "rewards/rejected": -56.1157341003418, "step": 1196 }, { "epoch": 0.9767441860465116, "grad_norm": 7.885290185562221e-13, "learning_rate": 1.1119176272115128e-05, "logits/chosen": -7.519394397735596, "logits/rejected": -6.700594902038574, "logps/chosen": -111.2757339477539, "logps/rejected": -665.313720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.500655651092529, "rewards/margins": 55.557701110839844, "rewards/rejected": -60.05835723876953, "step": 1197 }, { "epoch": 0.9775601795185639, "grad_norm": 8.065640516585759e-11, "learning_rate": 1.1048729882837671e-05, "logits/chosen": -6.900045394897461, "logits/rejected": -7.19952917098999, "logps/chosen": -147.1121368408203, "logps/rejected": -652.808349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.093424797058105, "rewards/margins": 49.728519439697266, "rewards/rejected": -58.82194519042969, "step": 1198 }, { "epoch": 0.9783761729906161, "grad_norm": 2.7038841746650742e-15, "learning_rate": 1.097847963308351e-05, "logits/chosen": -6.887539386749268, "logits/rejected": -6.322705268859863, "logps/chosen": -153.5152587890625, "logps/rejected": -730.6309814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.233983993530273, "rewards/margins": 56.143245697021484, "rewards/rejected": -65.37722778320312, "step": 1199 }, { "epoch": 0.9791921664626683, "grad_norm": 1.3142435750790327e-10, "learning_rate": 1.090842587659851e-05, "logits/chosen": -8.432363510131836, "logits/rejected": -7.476741790771484, "logps/chosen": -121.04109191894531, "logps/rejected": -641.85009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.5229878425598145, "rewards/margins": 49.804107666015625, "rewards/rejected": -57.32709503173828, "step": 1200 }, { "epoch": 0.9800081599347206, "grad_norm": 2.1271162864399145e-17, "learning_rate": 1.0838568966139074e-05, "logits/chosen": -6.760540962219238, "logits/rejected": -5.69184684753418, "logps/chosen": -118.1841812133789, "logps/rejected": -713.193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.794841766357422, "rewards/margins": 58.77915954589844, "rewards/rejected": -65.57400512695312, "step": 1201 }, { "epoch": 0.9808241534067728, "grad_norm": 1.4891141653095019e-08, "learning_rate": 1.076890925347041e-05, "logits/chosen": -6.274911403656006, "logits/rejected": -6.333632469177246, "logps/chosen": -127.88924407958984, "logps/rejected": -569.3750610351562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.304553508758545, "rewards/margins": 43.45020294189453, "rewards/rejected": -49.754756927490234, "step": 1202 }, { "epoch": 0.981640146878825, "grad_norm": 5.892308763577603e-06, "learning_rate": 1.0699447089364712e-05, "logits/chosen": -6.731015205383301, "logits/rejected": -6.377777576446533, "logps/chosen": -148.1463623046875, "logps/rejected": -651.812255859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.421847343444824, "rewards/margins": 48.85506820678711, "rewards/rejected": -57.276912689208984, "step": 1203 }, { "epoch": 0.9824561403508771, "grad_norm": 2.674157073556671e-08, "learning_rate": 1.0630182823599399e-05, "logits/chosen": -6.38957405090332, "logits/rejected": -6.396829128265381, "logps/chosen": -155.8629913330078, "logps/rejected": -670.80712890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.158661842346191, "rewards/margins": 50.75440216064453, "rewards/rejected": -60.91306686401367, "step": 1204 }, { "epoch": 0.9832721338229294, "grad_norm": 6.047259589782306e-13, "learning_rate": 1.056111680495545e-05, "logits/chosen": -6.436957359313965, "logits/rejected": -5.812904357910156, "logps/chosen": -126.07466888427734, "logps/rejected": -615.0796508789062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.61015510559082, "rewards/margins": 48.11015319824219, "rewards/rejected": -53.72031021118164, "step": 1205 }, { "epoch": 0.9840881272949816, "grad_norm": 6.713815701303583e-12, "learning_rate": 1.049224938121548e-05, "logits/chosen": -8.130189895629883, "logits/rejected": -7.983138084411621, "logps/chosen": -104.77740478515625, "logps/rejected": -597.3709106445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.8045196533203125, "rewards/margins": 47.55481719970703, "rewards/rejected": -54.359336853027344, "step": 1206 }, { "epoch": 0.9849041207670338, "grad_norm": 1.245718522113748e-06, "learning_rate": 1.0423580899162133e-05, "logits/chosen": -6.6769633293151855, "logits/rejected": -6.845852851867676, "logps/chosen": -141.20864868164062, "logps/rejected": -575.0076904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.243289947509766, "rewards/margins": 43.26742935180664, "rewards/rejected": -50.51071548461914, "step": 1207 }, { "epoch": 0.9857201142390861, "grad_norm": 3.490556368251907e-13, "learning_rate": 1.0355111704576238e-05, "logits/chosen": -6.150079727172852, "logits/rejected": -7.331273078918457, "logps/chosen": -209.89373779296875, "logps/rejected": -760.2815551757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.5997896194458, "rewards/margins": 53.37818145751953, "rewards/rejected": -66.97798156738281, "step": 1208 }, { "epoch": 0.9865361077111383, "grad_norm": 6.48427800342688e-09, "learning_rate": 1.028684214223516e-05, "logits/chosen": -7.498891353607178, "logits/rejected": -6.306125640869141, "logps/chosen": -166.05792236328125, "logps/rejected": -615.0819091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.760988235473633, "rewards/margins": 43.83699035644531, "rewards/rejected": -54.597984313964844, "step": 1209 }, { "epoch": 0.9873521011831905, "grad_norm": 1.6110162093241343e-08, "learning_rate": 1.0218772555910955e-05, "logits/chosen": -7.548478126525879, "logits/rejected": -7.327568531036377, "logps/chosen": -120.59208679199219, "logps/rejected": -514.9014282226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.084129810333252, "rewards/margins": 38.792118072509766, "rewards/rejected": -45.876243591308594, "step": 1210 }, { "epoch": 0.9881680946552428, "grad_norm": 9.02234205568675e-07, "learning_rate": 1.0150903288368741e-05, "logits/chosen": -7.104111671447754, "logits/rejected": -6.9761576652526855, "logps/chosen": -139.26852416992188, "logps/rejected": -597.6561279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.47894287109375, "rewards/margins": 44.05740737915039, "rewards/rejected": -52.53635025024414, "step": 1211 }, { "epoch": 0.988984088127295, "grad_norm": 1.682142793113517e-11, "learning_rate": 1.0083234681364934e-05, "logits/chosen": -6.626518249511719, "logits/rejected": -6.173725605010986, "logps/chosen": -168.92864990234375, "logps/rejected": -612.1477661132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.10346794128418, "rewards/margins": 47.119117736816406, "rewards/rejected": -56.22257995605469, "step": 1212 }, { "epoch": 0.9898000815993472, "grad_norm": 3.943911508486053e-08, "learning_rate": 1.0015767075645471e-05, "logits/chosen": -7.167057514190674, "logits/rejected": -5.927870750427246, "logps/chosen": -182.6990966796875, "logps/rejected": -632.1954956054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.133193969726562, "rewards/margins": 44.82081604003906, "rewards/rejected": -57.954010009765625, "step": 1213 }, { "epoch": 0.9906160750713995, "grad_norm": 1.1014351757043794e-11, "learning_rate": 9.948500810944217e-06, "logits/chosen": -6.579416275024414, "logits/rejected": -7.133519172668457, "logps/chosen": -152.195068359375, "logps/rejected": -746.4486083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.919594764709473, "rewards/margins": 58.59280014038086, "rewards/rejected": -67.51239013671875, "step": 1214 }, { "epoch": 0.9914320685434517, "grad_norm": 4.788440130520932e-11, "learning_rate": 9.881436225981106e-06, "logits/chosen": -7.7801127433776855, "logits/rejected": -7.2415595054626465, "logps/chosen": -123.31609344482422, "logps/rejected": -634.88623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.8136186599731445, "rewards/margins": 49.739662170410156, "rewards/rejected": -56.553279876708984, "step": 1215 }, { "epoch": 0.9922480620155039, "grad_norm": 0.003723371308296919, "learning_rate": 9.814573658460562e-06, "logits/chosen": -7.096724033355713, "logits/rejected": -7.220358371734619, "logps/chosen": -116.85345458984375, "logps/rejected": -643.42626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.704996109008789, "rewards/margins": 52.087581634521484, "rewards/rejected": -58.792579650878906, "step": 1216 }, { "epoch": 0.993064055487556, "grad_norm": 2.8201320390053297e-08, "learning_rate": 9.747913445069751e-06, "logits/chosen": -7.058794021606445, "logits/rejected": -7.265246868133545, "logps/chosen": -132.57925415039062, "logps/rejected": -515.8441162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.205917358398438, "rewards/margins": 38.461082458496094, "rewards/rejected": -46.66699981689453, "step": 1217 }, { "epoch": 0.9938800489596084, "grad_norm": 3.6935496328105444e-10, "learning_rate": 9.681455921476839e-06, "logits/chosen": -6.473876953125, "logits/rejected": -6.360503196716309, "logps/chosen": -145.86105346679688, "logps/rejected": -543.614501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.06406307220459, "rewards/margins": 40.74214172363281, "rewards/rejected": -49.80619812011719, "step": 1218 }, { "epoch": 0.9946960424316605, "grad_norm": 5.3325999033404514e-05, "learning_rate": 9.615201422329406e-06, "logits/chosen": -7.338897705078125, "logits/rejected": -7.164212226867676, "logps/chosen": -128.2718048095703, "logps/rejected": -644.8291625976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.635087966918945, "rewards/margins": 50.407859802246094, "rewards/rejected": -58.042945861816406, "step": 1219 }, { "epoch": 0.9955120359037127, "grad_norm": 7.944923066877455e-14, "learning_rate": 9.549150281252633e-06, "logits/chosen": -6.915926456451416, "logits/rejected": -7.6222453117370605, "logps/chosen": -160.65782165527344, "logps/rejected": -753.6861572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.034055709838867, "rewards/margins": 57.10554504394531, "rewards/rejected": -68.13960266113281, "step": 1220 }, { "epoch": 0.996328029375765, "grad_norm": 1.6270222147340974e-07, "learning_rate": 9.48330283084774e-06, "logits/chosen": -6.638601779937744, "logits/rejected": -7.039539337158203, "logps/chosen": -160.74217224121094, "logps/rejected": -593.12890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.553316116333008, "rewards/margins": 44.80897521972656, "rewards/rejected": -53.36228942871094, "step": 1221 }, { "epoch": 0.9971440228478172, "grad_norm": 6.659919108642498e-06, "learning_rate": 9.417659402690253e-06, "logits/chosen": -7.088401794433594, "logits/rejected": -6.5077314376831055, "logps/chosen": -133.61317443847656, "logps/rejected": -566.8411865234375, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -8.338176727294922, "rewards/margins": 43.00834274291992, "rewards/rejected": -51.346519470214844, "step": 1222 }, { "epoch": 0.9979600163198694, "grad_norm": 3.0985180783996147e-13, "learning_rate": 9.35222032732831e-06, "logits/chosen": -6.8370442390441895, "logits/rejected": -6.942171096801758, "logps/chosen": -106.57246398925781, "logps/rejected": -566.3816528320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.329244613647461, "rewards/margins": 45.723201751708984, "rewards/rejected": -52.05244445800781, "step": 1223 }, { "epoch": 0.9987760097919217, "grad_norm": 9.852354878603364e-07, "learning_rate": 9.286985934281078e-06, "logits/chosen": -6.700427532196045, "logits/rejected": -6.074217319488525, "logps/chosen": -115.6702880859375, "logps/rejected": -542.1293334960938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.3618245124816895, "rewards/margins": 43.02632522583008, "rewards/rejected": -49.38814926147461, "step": 1224 }, { "epoch": 0.9995920032639739, "grad_norm": 3.020629943005157e-14, "learning_rate": 9.221956552036992e-06, "logits/chosen": -7.565042972564697, "logits/rejected": -7.349754333496094, "logps/chosen": -140.63417053222656, "logps/rejected": -682.6019287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.696273803710938, "rewards/margins": 52.91769790649414, "rewards/rejected": -61.613975524902344, "step": 1225 }, { "epoch": 1.0004079967360262, "grad_norm": 1.488035508145913e-07, "learning_rate": 9.157132508052208e-06, "logits/chosen": -7.140951633453369, "logits/rejected": -6.527490139007568, "logps/chosen": -171.91326904296875, "logps/rejected": -637.42333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.657440185546875, "rewards/margins": 46.52783966064453, "rewards/rejected": -57.185272216796875, "step": 1226 }, { "epoch": 1.0012239902080784, "grad_norm": 8.862591489207654e-16, "learning_rate": 9.09251412874882e-06, "logits/chosen": -6.085210800170898, "logits/rejected": -6.903650760650635, "logps/chosen": -132.61827087402344, "logps/rejected": -610.6260986328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.25225305557251, "rewards/margins": 48.20720672607422, "rewards/rejected": -55.45945739746094, "step": 1227 }, { "epoch": 1.0020399836801306, "grad_norm": 4.015427920722736e-11, "learning_rate": 9.028101739513406e-06, "logits/chosen": -6.275026798248291, "logits/rejected": -6.382984638214111, "logps/chosen": -130.1084747314453, "logps/rejected": -529.5577392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.645136833190918, "rewards/margins": 39.5250358581543, "rewards/rejected": -47.170169830322266, "step": 1228 }, { "epoch": 1.0028559771521828, "grad_norm": 1.4984636864667777e-09, "learning_rate": 8.963895664695187e-06, "logits/chosen": -7.1286234855651855, "logits/rejected": -6.8225836753845215, "logps/chosen": -138.27691650390625, "logps/rejected": -681.060302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.978051662445068, "rewards/margins": 53.708194732666016, "rewards/rejected": -61.686241149902344, "step": 1229 }, { "epoch": 1.003671970624235, "grad_norm": 1.7546362869325094e-06, "learning_rate": 8.899896227604509e-06, "logits/chosen": -6.760880470275879, "logits/rejected": -6.549430847167969, "logps/chosen": -130.531494140625, "logps/rejected": -625.9953002929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.333934783935547, "rewards/margins": 49.64347839355469, "rewards/rejected": -57.9774169921875, "step": 1230 }, { "epoch": 1.0044879640962872, "grad_norm": 1.8995329753579426e-08, "learning_rate": 8.836103750511215e-06, "logits/chosen": -7.572076797485352, "logits/rejected": -6.773073673248291, "logps/chosen": -112.94912719726562, "logps/rejected": -634.3078002929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.267877101898193, "rewards/margins": 50.016624450683594, "rewards/rejected": -57.28450012207031, "step": 1231 }, { "epoch": 1.0053039575683393, "grad_norm": 5.763019395033098e-09, "learning_rate": 8.772518554642973e-06, "logits/chosen": -7.916407108306885, "logits/rejected": -7.29710578918457, "logps/chosen": -155.41546630859375, "logps/rejected": -635.27880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.5095853805542, "rewards/margins": 48.90849304199219, "rewards/rejected": -57.41807556152344, "step": 1232 }, { "epoch": 1.0061199510403918, "grad_norm": 2.864141649544516e-12, "learning_rate": 8.709140960183686e-06, "logits/chosen": -7.281870365142822, "logits/rejected": -6.464560508728027, "logps/chosen": -115.35570526123047, "logps/rejected": -569.22412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.73897647857666, "rewards/margins": 43.10210037231445, "rewards/rejected": -50.84107971191406, "step": 1233 }, { "epoch": 1.006935944512444, "grad_norm": 4.679904019866399e-09, "learning_rate": 8.645971286271904e-06, "logits/chosen": -7.118518352508545, "logits/rejected": -6.249866962432861, "logps/chosen": -180.73731994628906, "logps/rejected": -683.6807861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.523308753967285, "rewards/margins": 49.3570671081543, "rewards/rejected": -61.88037872314453, "step": 1234 }, { "epoch": 1.0077519379844961, "grad_norm": 4.135182791742409e-07, "learning_rate": 8.58300985099918e-06, "logits/chosen": -6.528001308441162, "logits/rejected": -7.514505386352539, "logps/chosen": -140.39382934570312, "logps/rejected": -678.105224609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.688582420349121, "rewards/margins": 54.33555603027344, "rewards/rejected": -63.024139404296875, "step": 1235 }, { "epoch": 1.0085679314565483, "grad_norm": 2.1781064365317127e-12, "learning_rate": 8.520256971408453e-06, "logits/chosen": -7.003327369689941, "logits/rejected": -6.935664653778076, "logps/chosen": -160.90737915039062, "logps/rejected": -653.5100708007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.879807472229004, "rewards/margins": 49.23138427734375, "rewards/rejected": -59.11119079589844, "step": 1236 }, { "epoch": 1.0093839249286005, "grad_norm": 2.8544886845338624e-06, "learning_rate": 8.457712963492493e-06, "logits/chosen": -7.331467151641846, "logits/rejected": -7.315871238708496, "logps/chosen": -137.24066162109375, "logps/rejected": -610.6403198242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.597768783569336, "rewards/margins": 45.736412048339844, "rewards/rejected": -55.33417892456055, "step": 1237 }, { "epoch": 1.0101999184006527, "grad_norm": 0.013579295016825199, "learning_rate": 8.395378142192306e-06, "logits/chosen": -6.830997467041016, "logits/rejected": -6.696641445159912, "logps/chosen": -161.0510711669922, "logps/rejected": -604.6161499023438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.914960861206055, "rewards/margins": 43.792240142822266, "rewards/rejected": -54.70720672607422, "step": 1238 }, { "epoch": 1.0110159118727051, "grad_norm": 3.8810918340459466e-05, "learning_rate": 8.333252821395526e-06, "logits/chosen": -6.242073059082031, "logits/rejected": -6.882124423980713, "logps/chosen": -110.51914978027344, "logps/rejected": -566.4849243164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.251533508300781, "rewards/margins": 45.33150863647461, "rewards/rejected": -51.583045959472656, "step": 1239 }, { "epoch": 1.0118319053447573, "grad_norm": 3.768689353644827e-12, "learning_rate": 8.271337313934869e-06, "logits/chosen": -6.215544700622559, "logits/rejected": -6.061234474182129, "logps/chosen": -165.958740234375, "logps/rejected": -717.47705078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.189754486083984, "rewards/margins": 55.801090240478516, "rewards/rejected": -65.9908447265625, "step": 1240 }, { "epoch": 1.0126478988168095, "grad_norm": 4.804393327617618e-09, "learning_rate": 8.209631931586498e-06, "logits/chosen": -8.171195030212402, "logits/rejected": -7.504464626312256, "logps/chosen": -169.11199951171875, "logps/rejected": -598.2391357421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.800472259521484, "rewards/margins": 42.39384460449219, "rewards/rejected": -54.194313049316406, "step": 1241 }, { "epoch": 1.0134638922888617, "grad_norm": 2.6732990854891423e-15, "learning_rate": 8.148136985068488e-06, "logits/chosen": -7.378259658813477, "logits/rejected": -7.449681282043457, "logps/chosen": -126.22602844238281, "logps/rejected": -657.210205078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.895766735076904, "rewards/margins": 51.619319915771484, "rewards/rejected": -58.51508712768555, "step": 1242 }, { "epoch": 1.0142798857609139, "grad_norm": 0.0002938231045845896, "learning_rate": 8.086852784039289e-06, "logits/chosen": -7.326908111572266, "logits/rejected": -6.503884315490723, "logps/chosen": -135.04226684570312, "logps/rejected": -505.71185302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.90419864654541, "rewards/margins": 37.590415954589844, "rewards/rejected": -46.49460983276367, "step": 1243 }, { "epoch": 1.015095879232966, "grad_norm": 3.374212859623863e-20, "learning_rate": 8.025779637096137e-06, "logits/chosen": -7.523876190185547, "logits/rejected": -6.042084693908691, "logps/chosen": -109.65963745117188, "logps/rejected": -696.98193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.134560585021973, "rewards/margins": 57.415443420410156, "rewards/rejected": -62.55000686645508, "step": 1244 }, { "epoch": 1.0159118727050183, "grad_norm": 1.0010700601803535e-10, "learning_rate": 7.964917851773496e-06, "logits/chosen": -7.2686920166015625, "logits/rejected": -6.442315578460693, "logps/chosen": -147.14334106445312, "logps/rejected": -649.0411987304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.916247367858887, "rewards/margins": 50.746238708496094, "rewards/rejected": -58.6624870300293, "step": 1245 }, { "epoch": 1.0167278661770707, "grad_norm": 2.2799486032454297e-06, "learning_rate": 7.904267734541498e-06, "logits/chosen": -6.975922107696533, "logits/rejected": -7.319137096405029, "logps/chosen": -130.5023193359375, "logps/rejected": -497.48785400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.318998336791992, "rewards/margins": 37.781734466552734, "rewards/rejected": -45.10073471069336, "step": 1246 }, { "epoch": 1.0175438596491229, "grad_norm": 1.3104218510306964e-07, "learning_rate": 7.843829590804458e-06, "logits/chosen": -7.651801109313965, "logits/rejected": -7.2404279708862305, "logps/chosen": -101.60566711425781, "logps/rejected": -551.3975219726562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.832644462585449, "rewards/margins": 44.22163772583008, "rewards/rejected": -50.054283142089844, "step": 1247 }, { "epoch": 1.018359853121175, "grad_norm": 6.89001247955413e-15, "learning_rate": 7.783603724899257e-06, "logits/chosen": -7.056883811950684, "logits/rejected": -6.760322570800781, "logps/chosen": -100.06416320800781, "logps/rejected": -620.7057495117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.3399808406829834, "rewards/margins": 52.90917205810547, "rewards/rejected": -56.24915313720703, "step": 1248 }, { "epoch": 1.0191758465932272, "grad_norm": 4.8903432414704184e-09, "learning_rate": 7.723590440093848e-06, "logits/chosen": -7.529638290405273, "logits/rejected": -6.6077680587768555, "logps/chosen": -173.73001098632812, "logps/rejected": -632.2479858398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.134082794189453, "rewards/margins": 44.14993667602539, "rewards/rejected": -56.284019470214844, "step": 1249 }, { "epoch": 1.0199918400652794, "grad_norm": 2.9025928327541806e-08, "learning_rate": 7.663790038585793e-06, "logits/chosen": -6.736865997314453, "logits/rejected": -6.25760555267334, "logps/chosen": -139.98675537109375, "logps/rejected": -511.7694091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.742087364196777, "rewards/margins": 38.20545196533203, "rewards/rejected": -46.94753646850586, "step": 1250 }, { "epoch": 1.0208078335373316, "grad_norm": 3.6169719698714645e-14, "learning_rate": 7.604202821500595e-06, "logits/chosen": -6.731583118438721, "logits/rejected": -6.849190711975098, "logps/chosen": -136.27735900878906, "logps/rejected": -634.0584716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.792990684509277, "rewards/margins": 48.737693786621094, "rewards/rejected": -57.53068542480469, "step": 1251 }, { "epoch": 1.021623827009384, "grad_norm": 1.854680533597275e-07, "learning_rate": 7.544829088890326e-06, "logits/chosen": -7.062849998474121, "logits/rejected": -6.998011589050293, "logps/chosen": -112.9912109375, "logps/rejected": -617.6557006835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.8096394538879395, "rewards/margins": 50.53436279296875, "rewards/rejected": -56.3440055847168, "step": 1252 }, { "epoch": 1.0224398204814362, "grad_norm": 2.189764956028739e-07, "learning_rate": 7.485669139732004e-06, "logits/chosen": -7.231657028198242, "logits/rejected": -6.80997896194458, "logps/chosen": -102.28582000732422, "logps/rejected": -578.538330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.859968900680542, "rewards/margins": 48.70262908935547, "rewards/rejected": -52.562599182128906, "step": 1253 }, { "epoch": 1.0232558139534884, "grad_norm": 2.162344088507684e-12, "learning_rate": 7.426723271926195e-06, "logits/chosen": -8.323240280151367, "logits/rejected": -7.301581382751465, "logps/chosen": -100.53471374511719, "logps/rejected": -552.90087890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.490312099456787, "rewards/margins": 43.80995178222656, "rewards/rejected": -49.300262451171875, "step": 1254 }, { "epoch": 1.0240718074255406, "grad_norm": 1.3819954801874701e-05, "learning_rate": 7.367991782295391e-06, "logits/chosen": -6.909029006958008, "logits/rejected": -7.039897918701172, "logps/chosen": -135.10757446289062, "logps/rejected": -500.656005859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.783763885498047, "rewards/margins": 36.304046630859375, "rewards/rejected": -45.08781051635742, "step": 1255 }, { "epoch": 1.0248878008975928, "grad_norm": 1.3561731598832338e-11, "learning_rate": 7.309474966582636e-06, "logits/chosen": -6.901665210723877, "logits/rejected": -7.160006046295166, "logps/chosen": -95.193603515625, "logps/rejected": -575.2412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.663945198059082, "rewards/margins": 46.69297790527344, "rewards/rejected": -52.35692596435547, "step": 1256 }, { "epoch": 1.025703794369645, "grad_norm": 3.93613643020452e-16, "learning_rate": 7.251173119449972e-06, "logits/chosen": -6.713200569152832, "logits/rejected": -6.313168525695801, "logps/chosen": -108.49944305419922, "logps/rejected": -657.7601928710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.592751979827881, "rewards/margins": 53.90097427368164, "rewards/rejected": -59.49372863769531, "step": 1257 }, { "epoch": 1.0265197878416972, "grad_norm": 5.288460370577591e-14, "learning_rate": 7.193086534476923e-06, "logits/chosen": -7.09201717376709, "logits/rejected": -6.665772914886475, "logps/chosen": -113.93116760253906, "logps/rejected": -675.7772216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.490316390991211, "rewards/margins": 53.967742919921875, "rewards/rejected": -60.45806121826172, "step": 1258 }, { "epoch": 1.0273357813137496, "grad_norm": 6.417873464670265e-06, "learning_rate": 7.135215504159115e-06, "logits/chosen": -6.9400835037231445, "logits/rejected": -6.3951096534729, "logps/chosen": -143.84962463378906, "logps/rejected": -638.522705078125, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -9.302140235900879, "rewards/margins": 48.76149368286133, "rewards/rejected": -58.063629150390625, "step": 1259 }, { "epoch": 1.0281517747858018, "grad_norm": 5.737861574761638e-12, "learning_rate": 7.077560319906695e-06, "logits/chosen": -5.6803460121154785, "logits/rejected": -6.664228439331055, "logps/chosen": -179.250244140625, "logps/rejected": -636.8603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.295049667358398, "rewards/margins": 46.88385009765625, "rewards/rejected": -58.17889404296875, "step": 1260 }, { "epoch": 1.028967768257854, "grad_norm": 1.0777184402061338e-13, "learning_rate": 7.0201212720429564e-06, "logits/chosen": -6.523622035980225, "logits/rejected": -6.630192756652832, "logps/chosen": -148.3211212158203, "logps/rejected": -621.6093139648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.643310546875, "rewards/margins": 47.069183349609375, "rewards/rejected": -55.71249008178711, "step": 1261 }, { "epoch": 1.0297837617299062, "grad_norm": 2.8770001492262054e-08, "learning_rate": 6.962898649802823e-06, "logits/chosen": -6.340115547180176, "logits/rejected": -6.339421272277832, "logps/chosen": -165.112060546875, "logps/rejected": -613.7457275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.640644073486328, "rewards/margins": 43.977783203125, "rewards/rejected": -54.6184196472168, "step": 1262 }, { "epoch": 1.0305997552019583, "grad_norm": 1.2146722738748394e-08, "learning_rate": 6.905892741331382e-06, "logits/chosen": -6.785120487213135, "logits/rejected": -6.968717575073242, "logps/chosen": -185.65139770507812, "logps/rejected": -578.1284790039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.806476593017578, "rewards/margins": 37.801513671875, "rewards/rejected": -51.607994079589844, "step": 1263 }, { "epoch": 1.0314157486740105, "grad_norm": 0.00036588028888218105, "learning_rate": 6.849103833682491e-06, "logits/chosen": -6.707691192626953, "logits/rejected": -6.469372272491455, "logps/chosen": -144.3059844970703, "logps/rejected": -597.4542236328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.85089111328125, "rewards/margins": 45.782920837402344, "rewards/rejected": -54.633811950683594, "step": 1264 }, { "epoch": 1.0322317421460627, "grad_norm": 1.7540963126450038e-10, "learning_rate": 6.792532212817271e-06, "logits/chosen": -7.043671131134033, "logits/rejected": -6.654651165008545, "logps/chosen": -107.99763488769531, "logps/rejected": -605.3734130859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.5974225997924805, "rewards/margins": 49.83927917480469, "rewards/rejected": -55.436702728271484, "step": 1265 }, { "epoch": 1.0330477356181151, "grad_norm": 7.354391318159514e-09, "learning_rate": 6.7361781636027026e-06, "logits/chosen": -7.651730537414551, "logits/rejected": -7.3326897621154785, "logps/chosen": -126.12905883789062, "logps/rejected": -601.4541015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.067748069763184, "rewards/margins": 46.97990417480469, "rewards/rejected": -55.04765319824219, "step": 1266 }, { "epoch": 1.0338637290901673, "grad_norm": 1.4384800017808175e-08, "learning_rate": 6.680041969810203e-06, "logits/chosen": -6.8669891357421875, "logits/rejected": -6.735611915588379, "logps/chosen": -134.50833129882812, "logps/rejected": -571.616455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.477274894714355, "rewards/margins": 41.97629928588867, "rewards/rejected": -50.45357131958008, "step": 1267 }, { "epoch": 1.0346797225622195, "grad_norm": 1.1979814917850717e-09, "learning_rate": 6.624123914114122e-06, "logits/chosen": -7.452213287353516, "logits/rejected": -7.573553562164307, "logps/chosen": -146.62088012695312, "logps/rejected": -667.9913330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.2955322265625, "rewards/margins": 50.36605453491211, "rewards/rejected": -59.661590576171875, "step": 1268 }, { "epoch": 1.0354957160342717, "grad_norm": 3.136537429782038e-08, "learning_rate": 6.568424278090446e-06, "logits/chosen": -6.477704048156738, "logits/rejected": -6.795453071594238, "logps/chosen": -141.1036834716797, "logps/rejected": -712.71875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.321407318115234, "rewards/margins": 54.39638900756836, "rewards/rejected": -62.71778869628906, "step": 1269 }, { "epoch": 1.036311709506324, "grad_norm": 1.0284268370241989e-07, "learning_rate": 6.5129433422152334e-06, "logits/chosen": -6.801608562469482, "logits/rejected": -7.125922203063965, "logps/chosen": -149.4530487060547, "logps/rejected": -624.6044311523438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.234405517578125, "rewards/margins": 46.10502243041992, "rewards/rejected": -56.33942413330078, "step": 1270 }, { "epoch": 1.037127702978376, "grad_norm": 7.064359692776634e-07, "learning_rate": 6.457681385863335e-06, "logits/chosen": -6.848394870758057, "logits/rejected": -6.830493927001953, "logps/chosen": -105.46536254882812, "logps/rejected": -517.76171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.488734245300293, "rewards/margins": 40.67006301879883, "rewards/rejected": -47.15879821777344, "step": 1271 }, { "epoch": 1.0379436964504285, "grad_norm": 8.562732771097425e-13, "learning_rate": 6.402638687306872e-06, "logits/chosen": -6.7898383140563965, "logits/rejected": -6.751334190368652, "logps/chosen": -92.716552734375, "logps/rejected": -594.9876708984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.62199068069458, "rewards/margins": 50.97905731201172, "rewards/rejected": -54.60104751586914, "step": 1272 }, { "epoch": 1.0387596899224807, "grad_norm": 2.8519500649970553e-10, "learning_rate": 6.347815523713968e-06, "logits/chosen": -7.249602794647217, "logits/rejected": -6.971109390258789, "logps/chosen": -161.10946655273438, "logps/rejected": -620.442138671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.707296371459961, "rewards/margins": 45.18893051147461, "rewards/rejected": -56.89622497558594, "step": 1273 }, { "epoch": 1.0395756833945329, "grad_norm": 4.2669635186287636e-11, "learning_rate": 6.293212171147206e-06, "logits/chosen": -7.2602081298828125, "logits/rejected": -6.084102630615234, "logps/chosen": -132.20359802246094, "logps/rejected": -569.264892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.324295997619629, "rewards/margins": 44.54783630371094, "rewards/rejected": -52.872135162353516, "step": 1274 }, { "epoch": 1.040391676866585, "grad_norm": 8.380567351196078e-09, "learning_rate": 6.238828904562316e-06, "logits/chosen": -6.6179280281066895, "logits/rejected": -6.061271667480469, "logps/chosen": -116.01106262207031, "logps/rejected": -531.9620971679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.056171417236328, "rewards/margins": 40.565303802490234, "rewards/rejected": -47.62147903442383, "step": 1275 }, { "epoch": 1.0412076703386373, "grad_norm": 6.526759133151927e-09, "learning_rate": 6.184665997806832e-06, "logits/chosen": -6.634053707122803, "logits/rejected": -6.409177780151367, "logps/chosen": -110.38566589355469, "logps/rejected": -572.5557861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.669288158416748, "rewards/margins": 46.55924987792969, "rewards/rejected": -51.228538513183594, "step": 1276 }, { "epoch": 1.0420236638106894, "grad_norm": 1.2857521037440423e-13, "learning_rate": 6.130723723618598e-06, "logits/chosen": -6.758490562438965, "logits/rejected": -6.5875654220581055, "logps/chosen": -133.45028686523438, "logps/rejected": -702.3355102539062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.154435157775879, "rewards/margins": 57.972991943359375, "rewards/rejected": -65.12742614746094, "step": 1277 }, { "epoch": 1.0428396572827416, "grad_norm": 5.5615259043406695e-05, "learning_rate": 6.0770023536245055e-06, "logits/chosen": -7.580227851867676, "logits/rejected": -6.957072734832764, "logps/chosen": -133.67507934570312, "logps/rejected": -506.2093200683594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.84096622467041, "rewards/margins": 36.852874755859375, "rewards/rejected": -45.69384002685547, "step": 1278 }, { "epoch": 1.043655650754794, "grad_norm": 4.5924528069285486e-10, "learning_rate": 6.023502158339078e-06, "logits/chosen": -7.058837413787842, "logits/rejected": -7.28770637512207, "logps/chosen": -92.02406311035156, "logps/rejected": -520.9585571289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.0568125247955322, "rewards/margins": 42.560203552246094, "rewards/rejected": -45.61701583862305, "step": 1279 }, { "epoch": 1.0444716442268462, "grad_norm": 4.962010403258832e-12, "learning_rate": 5.9702234071631e-06, "logits/chosen": -7.999358177185059, "logits/rejected": -7.415945529937744, "logps/chosen": -174.0209197998047, "logps/rejected": -637.833740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.742218017578125, "rewards/margins": 46.15385437011719, "rewards/rejected": -55.89607238769531, "step": 1280 }, { "epoch": 1.0452876376988984, "grad_norm": 1.2357251723618679e-12, "learning_rate": 5.917166368382277e-06, "logits/chosen": -7.721870422363281, "logits/rejected": -6.973611354827881, "logps/chosen": -160.8148193359375, "logps/rejected": -651.025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.72314453125, "rewards/margins": 46.19377517700195, "rewards/rejected": -56.91691589355469, "step": 1281 }, { "epoch": 1.0461036311709506, "grad_norm": 1.7506075755679973e-10, "learning_rate": 5.864331309165849e-06, "logits/chosen": -8.41287612915039, "logits/rejected": -7.847235202789307, "logps/chosen": -158.83763122558594, "logps/rejected": -646.763671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.083454132080078, "rewards/margins": 47.57320785522461, "rewards/rejected": -57.65666198730469, "step": 1282 }, { "epoch": 1.0469196246430028, "grad_norm": 4.059654955951286e-12, "learning_rate": 5.811718495565327e-06, "logits/chosen": -6.756931304931641, "logits/rejected": -6.429799556732178, "logps/chosen": -101.17091369628906, "logps/rejected": -607.6238403320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.375924110412598, "rewards/margins": 49.77903747558594, "rewards/rejected": -54.15496063232422, "step": 1283 }, { "epoch": 1.047735618115055, "grad_norm": 1.8872503559919096e-08, "learning_rate": 5.759328192513075e-06, "logits/chosen": -5.732082366943359, "logits/rejected": -7.007308006286621, "logps/chosen": -98.90265655517578, "logps/rejected": -504.832763671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.1756911277771, "rewards/margins": 41.068214416503906, "rewards/rejected": -45.2439079284668, "step": 1284 }, { "epoch": 1.0485516115871074, "grad_norm": 1.0792163247997288e-15, "learning_rate": 5.70716066382101e-06, "logits/chosen": -7.71282434463501, "logits/rejected": -7.51558780670166, "logps/chosen": -103.4247055053711, "logps/rejected": -636.751708984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.8538172245025635, "rewards/margins": 54.88804244995117, "rewards/rejected": -58.741859436035156, "step": 1285 }, { "epoch": 1.0493676050591596, "grad_norm": 7.946249214185741e-10, "learning_rate": 5.655216172179245e-06, "logits/chosen": -6.886788368225098, "logits/rejected": -7.419835090637207, "logps/chosen": -138.86219787597656, "logps/rejected": -689.3318481445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.199654579162598, "rewards/margins": 53.4631462097168, "rewards/rejected": -62.662803649902344, "step": 1286 }, { "epoch": 1.0501835985312118, "grad_norm": 2.934782814987755e-14, "learning_rate": 5.603494979154828e-06, "logits/chosen": -5.774219989776611, "logits/rejected": -5.830315113067627, "logps/chosen": -147.22637939453125, "logps/rejected": -713.2928466796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.373225212097168, "rewards/margins": 56.510475158691406, "rewards/rejected": -64.88369750976562, "step": 1287 }, { "epoch": 1.050999592003264, "grad_norm": 1.270959682983519e-12, "learning_rate": 5.5519973451903405e-06, "logits/chosen": -6.8602447509765625, "logits/rejected": -7.13714599609375, "logps/chosen": -122.59303283691406, "logps/rejected": -649.278076171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.051909923553467, "rewards/margins": 50.984527587890625, "rewards/rejected": -58.03643035888672, "step": 1288 }, { "epoch": 1.0518155854753162, "grad_norm": 0.00037876571877859533, "learning_rate": 5.500723529602653e-06, "logits/chosen": -6.9784674644470215, "logits/rejected": -6.665844440460205, "logps/chosen": -148.5310516357422, "logps/rejected": -544.6571044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.869091033935547, "rewards/margins": 39.909217834472656, "rewards/rejected": -48.7783088684082, "step": 1289 }, { "epoch": 1.0526315789473684, "grad_norm": 3.660414193973338e-10, "learning_rate": 5.449673790581611e-06, "logits/chosen": -6.801142692565918, "logits/rejected": -6.624506950378418, "logps/chosen": -167.72865295410156, "logps/rejected": -642.856689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.006564140319824, "rewards/margins": 45.78586959838867, "rewards/rejected": -55.79243469238281, "step": 1290 }, { "epoch": 1.0534475724194206, "grad_norm": 0.0001935799518832937, "learning_rate": 5.398848385188682e-06, "logits/chosen": -6.742382049560547, "logits/rejected": -6.6320319175720215, "logps/chosen": -146.31826782226562, "logps/rejected": -593.7434692382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.098531723022461, "rewards/margins": 44.47930908203125, "rewards/rejected": -53.577842712402344, "step": 1291 }, { "epoch": 1.054263565891473, "grad_norm": 5.435327836818488e-16, "learning_rate": 5.348247569355735e-06, "logits/chosen": -6.460098743438721, "logits/rejected": -5.686362266540527, "logps/chosen": -82.7742919921875, "logps/rejected": -624.9664306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4574272632598877, "rewards/margins": 53.82773971557617, "rewards/rejected": -56.28516387939453, "step": 1292 }, { "epoch": 1.0550795593635252, "grad_norm": 6.461491466325242e-07, "learning_rate": 5.297871597883697e-06, "logits/chosen": -7.62399435043335, "logits/rejected": -7.08998966217041, "logps/chosen": -189.7083740234375, "logps/rejected": -563.51611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.207935333251953, "rewards/margins": 38.742469787597656, "rewards/rejected": -51.950401306152344, "step": 1293 }, { "epoch": 1.0558955528355773, "grad_norm": 1.1262137888934376e-08, "learning_rate": 5.2477207244412855e-06, "logits/chosen": -6.1972975730896, "logits/rejected": -6.527198791503906, "logps/chosen": -128.52813720703125, "logps/rejected": -602.6407470703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.695379257202148, "rewards/margins": 47.67113494873047, "rewards/rejected": -53.36650848388672, "step": 1294 }, { "epoch": 1.0567115463076295, "grad_norm": 4.578953571993851e-17, "learning_rate": 5.197795201563743e-06, "logits/chosen": -6.854223728179932, "logits/rejected": -7.399696350097656, "logps/chosen": -120.68740844726562, "logps/rejected": -639.338623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.824126243591309, "rewards/margins": 53.154388427734375, "rewards/rejected": -58.97850799560547, "step": 1295 }, { "epoch": 1.0575275397796817, "grad_norm": 0.0008280192269012332, "learning_rate": 5.148095280651566e-06, "logits/chosen": -6.9975080490112305, "logits/rejected": -7.389623641967773, "logps/chosen": -173.7050323486328, "logps/rejected": -608.052978515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.452108383178711, "rewards/margins": 43.582454681396484, "rewards/rejected": -55.03456115722656, "step": 1296 }, { "epoch": 1.058343533251734, "grad_norm": 2.371754703744955e-08, "learning_rate": 5.098621211969223e-06, "logits/chosen": -7.077836036682129, "logits/rejected": -7.522392272949219, "logps/chosen": -145.26809692382812, "logps/rejected": -600.076416015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.732061386108398, "rewards/margins": 46.20988845825195, "rewards/rejected": -54.941951751708984, "step": 1297 }, { "epoch": 1.059159526723786, "grad_norm": 1.657982155430296e-13, "learning_rate": 5.049373244643879e-06, "logits/chosen": -7.534359931945801, "logits/rejected": -7.254509925842285, "logps/chosen": -140.92276000976562, "logps/rejected": -591.04052734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.810919761657715, "rewards/margins": 44.93915939331055, "rewards/rejected": -53.750083923339844, "step": 1298 }, { "epoch": 1.0599755201958385, "grad_norm": 1.7811247268739727e-11, "learning_rate": 5.000351626664207e-06, "logits/chosen": -7.103813171386719, "logits/rejected": -6.2386345863342285, "logps/chosen": -146.3072509765625, "logps/rejected": -620.8831787109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.360313415527344, "rewards/margins": 47.369991302490234, "rewards/rejected": -55.73030471801758, "step": 1299 }, { "epoch": 1.0607915136678907, "grad_norm": 1.5740588423795998e-05, "learning_rate": 4.951556604879048e-06, "logits/chosen": -7.089349269866943, "logits/rejected": -6.7973246574401855, "logps/chosen": -165.58914184570312, "logps/rejected": -514.875244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.819570541381836, "rewards/margins": 36.08820724487305, "rewards/rejected": -47.90777587890625, "step": 1300 }, { "epoch": 1.061607507139943, "grad_norm": 0.001051057712174952, "learning_rate": 4.902988424996247e-06, "logits/chosen": -7.347461223602295, "logits/rejected": -6.732455730438232, "logps/chosen": -141.98789978027344, "logps/rejected": -517.6405639648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.006295204162598, "rewards/margins": 37.83008575439453, "rewards/rejected": -47.83638000488281, "step": 1301 }, { "epoch": 1.062423500611995, "grad_norm": 1.1316436069469527e-12, "learning_rate": 4.8546473315813856e-06, "logits/chosen": -7.417614459991455, "logits/rejected": -6.864349365234375, "logps/chosen": -126.371337890625, "logps/rejected": -657.9120483398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.789726257324219, "rewards/margins": 52.77532196044922, "rewards/rejected": -60.56504821777344, "step": 1302 }, { "epoch": 1.0632394940840473, "grad_norm": 1.6362311043849331e-06, "learning_rate": 4.80653356805652e-06, "logits/chosen": -7.456138610839844, "logits/rejected": -7.959969520568848, "logps/chosen": -197.0839080810547, "logps/rejected": -497.1072998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.326505661010742, "rewards/margins": 31.5373477935791, "rewards/rejected": -44.86385726928711, "step": 1303 }, { "epoch": 1.0640554875560995, "grad_norm": 2.5929063583729484e-12, "learning_rate": 4.758647376699032e-06, "logits/chosen": -7.007823944091797, "logits/rejected": -6.9029860496521, "logps/chosen": -191.32595825195312, "logps/rejected": -689.2511596679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.922852516174316, "rewards/margins": 49.037452697753906, "rewards/rejected": -61.960304260253906, "step": 1304 }, { "epoch": 1.0648714810281519, "grad_norm": 4.82609356615432e-16, "learning_rate": 4.710988998640298e-06, "logits/chosen": -7.005453109741211, "logits/rejected": -7.238670349121094, "logps/chosen": -121.38983917236328, "logps/rejected": -679.8306274414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.878935813903809, "rewards/margins": 54.70369338989258, "rewards/rejected": -61.58262634277344, "step": 1305 }, { "epoch": 1.065687474500204, "grad_norm": 1.7466027235624182e-10, "learning_rate": 4.663558673864599e-06, "logits/chosen": -8.177976608276367, "logits/rejected": -7.000204086303711, "logps/chosen": -153.12545776367188, "logps/rejected": -620.7991943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.055038452148438, "rewards/margins": 46.896575927734375, "rewards/rejected": -55.95161437988281, "step": 1306 }, { "epoch": 1.0665034679722563, "grad_norm": 4.478241919514403e-07, "learning_rate": 4.61635664120782e-06, "logits/chosen": -7.05510139465332, "logits/rejected": -6.754891872406006, "logps/chosen": -213.55032348632812, "logps/rejected": -657.4791259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.00579833984375, "rewards/margins": 44.136871337890625, "rewards/rejected": -59.142669677734375, "step": 1307 }, { "epoch": 1.0673194614443084, "grad_norm": 4.40920303740322e-13, "learning_rate": 4.569383138356276e-06, "logits/chosen": -6.566649913787842, "logits/rejected": -5.976454734802246, "logps/chosen": -156.32398986816406, "logps/rejected": -727.3472900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.926319122314453, "rewards/margins": 57.34102249145508, "rewards/rejected": -67.26734161376953, "step": 1308 }, { "epoch": 1.0681354549163606, "grad_norm": 2.4099431339408284e-09, "learning_rate": 4.522638401845547e-06, "logits/chosen": -7.104426383972168, "logits/rejected": -6.831494331359863, "logps/chosen": -92.45146179199219, "logps/rejected": -544.6063842773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.598711013793945, "rewards/margins": 44.36261749267578, "rewards/rejected": -48.96133041381836, "step": 1309 }, { "epoch": 1.0689514483884128, "grad_norm": 4.9749937526499496e-17, "learning_rate": 4.476122667059207e-06, "logits/chosen": -7.822164058685303, "logits/rejected": -7.387905120849609, "logps/chosen": -137.52886962890625, "logps/rejected": -682.9971923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.642409801483154, "rewards/margins": 54.344383239746094, "rewards/rejected": -61.986793518066406, "step": 1310 }, { "epoch": 1.069767441860465, "grad_norm": 9.534122894384184e-14, "learning_rate": 4.429836168227735e-06, "logits/chosen": -7.413878440856934, "logits/rejected": -7.205953598022461, "logps/chosen": -146.4038543701172, "logps/rejected": -649.8685302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.315534591674805, "rewards/margins": 51.68547058105469, "rewards/rejected": -59.000999450683594, "step": 1311 }, { "epoch": 1.0705834353325174, "grad_norm": 3.5871583481394964e-10, "learning_rate": 4.3837791384272745e-06, "logits/chosen": -7.755704879760742, "logits/rejected": -7.121979236602783, "logps/chosen": -97.26911163330078, "logps/rejected": -523.6900634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.8668930530548096, "rewards/margins": 43.98244094848633, "rewards/rejected": -47.849334716796875, "step": 1312 }, { "epoch": 1.0713994288045696, "grad_norm": 7.801862125234038e-07, "learning_rate": 4.3379518095784886e-06, "logits/chosen": -6.679169654846191, "logits/rejected": -6.938194274902344, "logps/chosen": -129.7191162109375, "logps/rejected": -519.5830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.256112575531006, "rewards/margins": 38.93998718261719, "rewards/rejected": -46.19609832763672, "step": 1313 }, { "epoch": 1.0722154222766218, "grad_norm": 61.97478485107422, "learning_rate": 4.292354412445349e-06, "logits/chosen": -6.825686454772949, "logits/rejected": -6.774442672729492, "logps/chosen": -116.50499725341797, "logps/rejected": -494.1399230957031, "loss": 0.4112, "rewards/accuracies": 0.875, "rewards/chosen": -7.742458343505859, "rewards/margins": 37.10887145996094, "rewards/rejected": -44.85133361816406, "step": 1314 }, { "epoch": 1.073031415748674, "grad_norm": 3.343645672837514e-12, "learning_rate": 4.2469871766340095e-06, "logits/chosen": -5.811534881591797, "logits/rejected": -6.224266529083252, "logps/chosen": -102.34471893310547, "logps/rejected": -700.416259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.942553997039795, "rewards/margins": 60.0830192565918, "rewards/rejected": -64.02557373046875, "step": 1315 }, { "epoch": 1.0738474092207262, "grad_norm": 1.8209042051964275e-09, "learning_rate": 4.2018503305916775e-06, "logits/chosen": -7.993265628814697, "logits/rejected": -7.030299186706543, "logps/chosen": -159.11390686035156, "logps/rejected": -538.3910522460938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.730499267578125, "rewards/margins": 38.361289978027344, "rewards/rejected": -50.09178924560547, "step": 1316 }, { "epoch": 1.0746634026927784, "grad_norm": 2.4160835550117443e-13, "learning_rate": 4.156944101605387e-06, "logits/chosen": -7.832879066467285, "logits/rejected": -6.903683662414551, "logps/chosen": -122.98483276367188, "logps/rejected": -574.9373779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.684086322784424, "rewards/margins": 44.42658615112305, "rewards/rejected": -50.11067199707031, "step": 1317 }, { "epoch": 1.0754793961648308, "grad_norm": 3.9527279227513645e-09, "learning_rate": 4.112268715800943e-06, "logits/chosen": -5.625957489013672, "logits/rejected": -6.327333450317383, "logps/chosen": -154.0369873046875, "logps/rejected": -540.814208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.366653442382812, "rewards/margins": 39.769142150878906, "rewards/rejected": -48.13579559326172, "step": 1318 }, { "epoch": 1.076295389636883, "grad_norm": 2.2443579439368477e-07, "learning_rate": 4.067824398141701e-06, "logits/chosen": -7.063338279724121, "logits/rejected": -6.086772441864014, "logps/chosen": -137.77252197265625, "logps/rejected": -665.8982543945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.767029762268066, "rewards/margins": 52.25366973876953, "rewards/rejected": -60.02069854736328, "step": 1319 }, { "epoch": 1.0771113831089352, "grad_norm": 3.60649254601067e-13, "learning_rate": 4.023611372427471e-06, "logits/chosen": -6.70237922668457, "logits/rejected": -6.1104607582092285, "logps/chosen": -188.96630859375, "logps/rejected": -740.0675048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.014466285705566, "rewards/margins": 55.09973907470703, "rewards/rejected": -66.11420440673828, "step": 1320 }, { "epoch": 1.0779273765809874, "grad_norm": 9.822174615692347e-05, "learning_rate": 3.979629861293415e-06, "logits/chosen": -6.734999656677246, "logits/rejected": -6.78219747543335, "logps/chosen": -135.0784912109375, "logps/rejected": -546.2601318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.5994157791137695, "rewards/margins": 41.76702117919922, "rewards/rejected": -49.36643600463867, "step": 1321 }, { "epoch": 1.0787433700530396, "grad_norm": 1.1946119116146292e-07, "learning_rate": 3.9358800862088816e-06, "logits/chosen": -6.288086414337158, "logits/rejected": -7.356182098388672, "logps/chosen": -125.85601806640625, "logps/rejected": -554.187255859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.085792541503906, "rewards/margins": 41.52323532104492, "rewards/rejected": -48.60902404785156, "step": 1322 }, { "epoch": 1.0795593635250917, "grad_norm": 1.3229296769168286e-07, "learning_rate": 3.892362267476313e-06, "logits/chosen": -6.812233924865723, "logits/rejected": -6.488239288330078, "logps/chosen": -134.8046875, "logps/rejected": -593.3858642578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.12581205368042, "rewards/margins": 46.973854064941406, "rewards/rejected": -54.099666595458984, "step": 1323 }, { "epoch": 1.080375356997144, "grad_norm": 1.2389892909041105e-15, "learning_rate": 3.8490766242301355e-06, "logits/chosen": -6.549201011657715, "logits/rejected": -6.909786701202393, "logps/chosen": -142.5798797607422, "logps/rejected": -704.3746948242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.573823928833008, "rewards/margins": 52.526607513427734, "rewards/rejected": -61.100433349609375, "step": 1324 }, { "epoch": 1.0811913504691963, "grad_norm": 7.962143964979074e-12, "learning_rate": 3.8060233744356633e-06, "logits/chosen": -6.888712406158447, "logits/rejected": -6.400815010070801, "logps/chosen": -141.83021545410156, "logps/rejected": -590.1830444335938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.546143531799316, "rewards/margins": 43.34699249267578, "rewards/rejected": -53.89313507080078, "step": 1325 }, { "epoch": 1.0820073439412485, "grad_norm": 5.080270284452126e-07, "learning_rate": 3.7632027348879775e-06, "logits/chosen": -6.936678409576416, "logits/rejected": -6.622475624084473, "logps/chosen": -180.89065551757812, "logps/rejected": -549.1580200195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.460354804992676, "rewards/margins": 38.44303894042969, "rewards/rejected": -49.90338897705078, "step": 1326 }, { "epoch": 1.0828233374133007, "grad_norm": 1.0263459353232474e-11, "learning_rate": 3.7206149212108356e-06, "logits/chosen": -6.626966953277588, "logits/rejected": -6.319972991943359, "logps/chosen": -138.00332641601562, "logps/rejected": -579.404296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.825298309326172, "rewards/margins": 43.19071960449219, "rewards/rejected": -53.016021728515625, "step": 1327 }, { "epoch": 1.083639330885353, "grad_norm": 4.086580156581476e-05, "learning_rate": 3.6782601478556278e-06, "logits/chosen": -6.410050392150879, "logits/rejected": -7.240342140197754, "logps/chosen": -199.96749877929688, "logps/rejected": -627.3638916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.0472412109375, "rewards/margins": 44.37053680419922, "rewards/rejected": -57.41777801513672, "step": 1328 }, { "epoch": 1.084455324357405, "grad_norm": 2.8836331422810277e-16, "learning_rate": 3.6361386281002495e-06, "logits/chosen": -6.589963912963867, "logits/rejected": -6.807948112487793, "logps/chosen": -164.53353881835938, "logps/rejected": -764.1002197265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.568780899047852, "rewards/margins": 59.36898422241211, "rewards/rejected": -67.9377670288086, "step": 1329 }, { "epoch": 1.0852713178294573, "grad_norm": 3.9084975863169346e-15, "learning_rate": 3.5942505740480582e-06, "logits/chosen": -6.752612590789795, "logits/rejected": -6.233998775482178, "logps/chosen": -142.34451293945312, "logps/rejected": -649.8759155273438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.794673919677734, "rewards/margins": 51.360198974609375, "rewards/rejected": -59.15487289428711, "step": 1330 }, { "epoch": 1.0860873113015095, "grad_norm": 6.813080517531489e-07, "learning_rate": 3.552596196626762e-06, "logits/chosen": -6.578749656677246, "logits/rejected": -6.312310695648193, "logps/chosen": -246.589599609375, "logps/rejected": -699.802978515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.692384719848633, "rewards/margins": 46.48286819458008, "rewards/rejected": -63.175254821777344, "step": 1331 }, { "epoch": 1.086903304773562, "grad_norm": 3.814738738583401e-05, "learning_rate": 3.511175705587433e-06, "logits/chosen": -7.12091064453125, "logits/rejected": -7.37393045425415, "logps/chosen": -103.87776947021484, "logps/rejected": -556.3834838867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.081623554229736, "rewards/margins": 44.88936996459961, "rewards/rejected": -49.97099304199219, "step": 1332 }, { "epoch": 1.087719298245614, "grad_norm": 1.4840666889837095e-11, "learning_rate": 3.4699893095033687e-06, "logits/chosen": -6.967960357666016, "logits/rejected": -7.276474475860596, "logps/chosen": -120.31454467773438, "logps/rejected": -613.4080810546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.426430702209473, "rewards/margins": 49.2888069152832, "rewards/rejected": -56.715232849121094, "step": 1333 }, { "epoch": 1.0885352917176663, "grad_norm": 0.00462407898157835, "learning_rate": 3.429037215769082e-06, "logits/chosen": -7.277599334716797, "logits/rejected": -6.449504852294922, "logps/chosen": -191.55825805664062, "logps/rejected": -647.0009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.274214744567871, "rewards/margins": 44.619110107421875, "rewards/rejected": -57.89332962036133, "step": 1334 }, { "epoch": 1.0893512851897185, "grad_norm": 3.5084840588339716e-10, "learning_rate": 3.3883196305992903e-06, "logits/chosen": -6.118390083312988, "logits/rejected": -7.132074356079102, "logps/chosen": -130.0224151611328, "logps/rejected": -629.50927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.5180182456970215, "rewards/margins": 50.407875061035156, "rewards/rejected": -56.92588806152344, "step": 1335 }, { "epoch": 1.0901672786617707, "grad_norm": 2.8972192467335844e-07, "learning_rate": 3.3478367590277894e-06, "logits/chosen": -7.159361839294434, "logits/rejected": -7.34248161315918, "logps/chosen": -153.20220947265625, "logps/rejected": -606.5865478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.59972095489502, "rewards/margins": 45.45381546020508, "rewards/rejected": -54.05353546142578, "step": 1336 }, { "epoch": 1.0909832721338228, "grad_norm": 1.3792325148642703e-07, "learning_rate": 3.3075888049065196e-06, "logits/chosen": -7.999736785888672, "logits/rejected": -7.194777965545654, "logps/chosen": -153.45008850097656, "logps/rejected": -703.9371337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.794160842895508, "rewards/margins": 52.980010986328125, "rewards/rejected": -62.77417755126953, "step": 1337 }, { "epoch": 1.091799265605875, "grad_norm": 1.8634941680029105e-14, "learning_rate": 3.2675759709044573e-06, "logits/chosen": -6.370276927947998, "logits/rejected": -6.840634346008301, "logps/chosen": -141.6243896484375, "logps/rejected": -632.9337158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.086795806884766, "rewards/margins": 48.44490051269531, "rewards/rejected": -57.531700134277344, "step": 1338 }, { "epoch": 1.0926152590779274, "grad_norm": 0.00211515324190259, "learning_rate": 3.2277984585066366e-06, "logits/chosen": -7.6291608810424805, "logits/rejected": -7.2938055992126465, "logps/chosen": -113.1148681640625, "logps/rejected": -531.7252807617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.82069730758667, "rewards/margins": 40.966712951660156, "rewards/rejected": -47.78740692138672, "step": 1339 }, { "epoch": 1.0934312525499796, "grad_norm": 1.0464306504642806e-11, "learning_rate": 3.18825646801314e-06, "logits/chosen": -6.543098449707031, "logits/rejected": -5.825587749481201, "logps/chosen": -155.95236206054688, "logps/rejected": -725.613037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.141462326049805, "rewards/margins": 57.59455490112305, "rewards/rejected": -66.73601531982422, "step": 1340 }, { "epoch": 1.0942472460220318, "grad_norm": 3.7904069927208184e-07, "learning_rate": 3.14895019853807e-06, "logits/chosen": -7.192318439483643, "logits/rejected": -6.9366912841796875, "logps/chosen": -131.99783325195312, "logps/rejected": -472.931884765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.594539642333984, "rewards/margins": 34.1910400390625, "rewards/rejected": -42.78558349609375, "step": 1341 }, { "epoch": 1.095063239494084, "grad_norm": 6.007489355397411e-05, "learning_rate": 3.1098798480085565e-06, "logits/chosen": -6.910605430603027, "logits/rejected": -7.150279998779297, "logps/chosen": -148.96026611328125, "logps/rejected": -586.5947875976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.989523887634277, "rewards/margins": 42.69891357421875, "rewards/rejected": -52.68843460083008, "step": 1342 }, { "epoch": 1.0958792329661362, "grad_norm": 2.87025557099696e-07, "learning_rate": 3.071045613163742e-06, "logits/chosen": -7.517772674560547, "logits/rejected": -6.138071537017822, "logps/chosen": -120.69732666015625, "logps/rejected": -581.4812622070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.839153289794922, "rewards/margins": 45.38450622558594, "rewards/rejected": -51.22365951538086, "step": 1343 }, { "epoch": 1.0966952264381884, "grad_norm": 1.3429687584221028e-08, "learning_rate": 3.03244768955383e-06, "logits/chosen": -6.491483688354492, "logits/rejected": -6.447216033935547, "logps/chosen": -177.46182250976562, "logps/rejected": -655.6626586914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.32175350189209, "rewards/margins": 47.150020599365234, "rewards/rejected": -59.47177505493164, "step": 1344 }, { "epoch": 1.0975112199102408, "grad_norm": 7.751244958387193e-12, "learning_rate": 2.9940862715390485e-06, "logits/chosen": -6.557646751403809, "logits/rejected": -6.8856329917907715, "logps/chosen": -95.80010986328125, "logps/rejected": -557.75537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.98654842376709, "rewards/margins": 46.49822998046875, "rewards/rejected": -51.484779357910156, "step": 1345 }, { "epoch": 1.098327213382293, "grad_norm": 1.2381449643328324e-08, "learning_rate": 2.9559615522887273e-06, "logits/chosen": -7.366596698760986, "logits/rejected": -6.671658515930176, "logps/chosen": -112.96949768066406, "logps/rejected": -509.6144104003906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.187186241149902, "rewards/margins": 39.3734130859375, "rewards/rejected": -45.56060028076172, "step": 1346 }, { "epoch": 1.0991432068543452, "grad_norm": 1.536436644578032e-09, "learning_rate": 2.9180737237802848e-06, "logits/chosen": -7.271457672119141, "logits/rejected": -6.508647441864014, "logps/chosen": -141.52157592773438, "logps/rejected": -623.2091674804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.643208503723145, "rewards/margins": 45.74219512939453, "rewards/rejected": -55.385398864746094, "step": 1347 }, { "epoch": 1.0999592003263974, "grad_norm": 5.805142402648926, "learning_rate": 2.8804229767982637e-06, "logits/chosen": -6.653270244598389, "logits/rejected": -7.365847587585449, "logps/chosen": -140.13165283203125, "logps/rejected": -566.0771484375, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": -8.080198287963867, "rewards/margins": 42.47087097167969, "rewards/rejected": -50.55106735229492, "step": 1348 }, { "epoch": 1.1007751937984496, "grad_norm": 2.1122693851793883e-06, "learning_rate": 2.8430095009333967e-06, "logits/chosen": -6.978106498718262, "logits/rejected": -7.219250202178955, "logps/chosen": -106.8791275024414, "logps/rejected": -564.51611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.590892791748047, "rewards/margins": 45.831912994384766, "rewards/rejected": -51.42280578613281, "step": 1349 }, { "epoch": 1.1015911872705018, "grad_norm": 6.683707454158139e-08, "learning_rate": 2.8058334845816213e-06, "logits/chosen": -6.187528610229492, "logits/rejected": -6.337064743041992, "logps/chosen": -213.26089477539062, "logps/rejected": -602.0040283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.576634407043457, "rewards/margins": 38.557533264160156, "rewards/rejected": -53.1341667175293, "step": 1350 }, { "epoch": 1.1024071807425542, "grad_norm": 2.9338018681102085e-09, "learning_rate": 2.7688951149431595e-06, "logits/chosen": -7.4647722244262695, "logits/rejected": -6.3180131912231445, "logps/chosen": -176.97271728515625, "logps/rejected": -608.466552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.300027847290039, "rewards/margins": 42.46051788330078, "rewards/rejected": -54.76055145263672, "step": 1351 }, { "epoch": 1.1032231742146064, "grad_norm": 5.85194641189446e-07, "learning_rate": 2.7321945780215573e-06, "logits/chosen": -7.631089210510254, "logits/rejected": -6.709107875823975, "logps/chosen": -172.5901641845703, "logps/rejected": -699.7327270507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.353005409240723, "rewards/margins": 52.404449462890625, "rewards/rejected": -64.75745391845703, "step": 1352 }, { "epoch": 1.1040391676866586, "grad_norm": 0.073719322681427, "learning_rate": 2.695732058622735e-06, "logits/chosen": -7.694441318511963, "logits/rejected": -5.684350490570068, "logps/chosen": -148.33224487304688, "logps/rejected": -578.156494140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.184056282043457, "rewards/margins": 43.33380126953125, "rewards/rejected": -51.51785659790039, "step": 1353 }, { "epoch": 1.1048551611587107, "grad_norm": 9.68534692447065e-08, "learning_rate": 2.6595077403541002e-06, "logits/chosen": -6.998529434204102, "logits/rejected": -6.671814441680908, "logps/chosen": -205.21231079101562, "logps/rejected": -687.8218994140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.466357231140137, "rewards/margins": 46.97118377685547, "rewards/rejected": -61.43754196166992, "step": 1354 }, { "epoch": 1.105671154630763, "grad_norm": 1.3601691489426287e-14, "learning_rate": 2.6235218056235634e-06, "logits/chosen": -7.126516342163086, "logits/rejected": -6.677074432373047, "logps/chosen": -165.14556884765625, "logps/rejected": -730.9562377929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.521970748901367, "rewards/margins": 56.99715805053711, "rewards/rejected": -67.51912689208984, "step": 1355 }, { "epoch": 1.1064871481028151, "grad_norm": 1.155569771071896e-05, "learning_rate": 2.587774435638679e-06, "logits/chosen": -7.162045478820801, "logits/rejected": -7.1714863777160645, "logps/chosen": -183.38717651367188, "logps/rejected": -690.43994140625, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -12.565495491027832, "rewards/margins": 50.28551483154297, "rewards/rejected": -62.85100555419922, "step": 1356 }, { "epoch": 1.1073031415748673, "grad_norm": 2.713325397696309e-11, "learning_rate": 2.552265810405707e-06, "logits/chosen": -6.9862284660339355, "logits/rejected": -7.438005447387695, "logps/chosen": -157.0361785888672, "logps/rejected": -534.9445190429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.867998123168945, "rewards/margins": 38.50843048095703, "rewards/rejected": -48.376426696777344, "step": 1357 }, { "epoch": 1.1081191350469197, "grad_norm": 1.1453635349184532e-12, "learning_rate": 2.5169961087286974e-06, "logits/chosen": -6.986154079437256, "logits/rejected": -6.355813980102539, "logps/chosen": -123.95381927490234, "logps/rejected": -754.7601318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.475018501281738, "rewards/margins": 61.96601104736328, "rewards/rejected": -68.44103240966797, "step": 1358 }, { "epoch": 1.108935128518972, "grad_norm": 4.673999728921352e-11, "learning_rate": 2.4819655082085835e-06, "logits/chosen": -7.483128547668457, "logits/rejected": -6.9669718742370605, "logps/chosen": -191.71783447265625, "logps/rejected": -712.9083251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.943183898925781, "rewards/margins": 52.54689025878906, "rewards/rejected": -65.49007415771484, "step": 1359 }, { "epoch": 1.109751121991024, "grad_norm": 3.1474566009048743e-11, "learning_rate": 2.4471741852423237e-06, "logits/chosen": -5.959547996520996, "logits/rejected": -6.470186233520508, "logps/chosen": -147.38006591796875, "logps/rejected": -732.9388427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.796573638916016, "rewards/margins": 58.34242248535156, "rewards/rejected": -67.13899230957031, "step": 1360 }, { "epoch": 1.1105671154630763, "grad_norm": 8.150230490500121e-10, "learning_rate": 2.4126223150219896e-06, "logits/chosen": -6.5018534660339355, "logits/rejected": -6.652813911437988, "logps/chosen": -117.17147827148438, "logps/rejected": -610.2732543945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.421075344085693, "rewards/margins": 49.80326461791992, "rewards/rejected": -56.22433853149414, "step": 1361 }, { "epoch": 1.1113831089351285, "grad_norm": 1.4486449817496805e-09, "learning_rate": 2.3783100715338624e-06, "logits/chosen": -7.420388221740723, "logits/rejected": -7.014531135559082, "logps/chosen": -118.04415893554688, "logps/rejected": -551.5057373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.635969638824463, "rewards/margins": 43.48454666137695, "rewards/rejected": -50.120521545410156, "step": 1362 }, { "epoch": 1.1121991024071807, "grad_norm": 9.688324448253738e-11, "learning_rate": 2.344237627557622e-06, "logits/chosen": -8.207910537719727, "logits/rejected": -7.925335884094238, "logps/chosen": -110.51051330566406, "logps/rejected": -574.4498291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.476245880126953, "rewards/margins": 45.72639846801758, "rewards/rejected": -52.20264434814453, "step": 1363 }, { "epoch": 1.1130150958792329, "grad_norm": 6.38241542949558e-12, "learning_rate": 2.3104051546654013e-06, "logits/chosen": -6.970721244812012, "logits/rejected": -6.598348140716553, "logps/chosen": -140.68527221679688, "logps/rejected": -580.5308837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.641879081726074, "rewards/margins": 44.55473709106445, "rewards/rejected": -53.196617126464844, "step": 1364 }, { "epoch": 1.1138310893512853, "grad_norm": 9.852700895862654e-05, "learning_rate": 2.276812823220964e-06, "logits/chosen": -7.853436470031738, "logits/rejected": -7.638522148132324, "logps/chosen": -135.22976684570312, "logps/rejected": -570.5036010742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.18052864074707, "rewards/margins": 42.800437927246094, "rewards/rejected": -51.98096466064453, "step": 1365 }, { "epoch": 1.1146470828233375, "grad_norm": 2.573146502728857e-14, "learning_rate": 2.2434608023788496e-06, "logits/chosen": -7.265067100524902, "logits/rejected": -7.5999555587768555, "logps/chosen": -74.11612701416016, "logps/rejected": -579.5839233398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.0383620262145996, "rewards/margins": 49.94358825683594, "rewards/rejected": -52.98194885253906, "step": 1366 }, { "epoch": 1.1154630762953897, "grad_norm": 1.954130260384268e-11, "learning_rate": 2.210349260083494e-06, "logits/chosen": -7.4111480712890625, "logits/rejected": -7.593049049377441, "logps/chosen": -102.6999282836914, "logps/rejected": -571.2298583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.295522689819336, "rewards/margins": 46.08027648925781, "rewards/rejected": -51.375797271728516, "step": 1367 }, { "epoch": 1.1162790697674418, "grad_norm": 9.530021910508779e-11, "learning_rate": 2.177478363068425e-06, "logits/chosen": -5.889284610748291, "logits/rejected": -5.850528717041016, "logps/chosen": -117.6241226196289, "logps/rejected": -696.3713989257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.509321689605713, "rewards/margins": 58.071128845214844, "rewards/rejected": -63.580448150634766, "step": 1368 }, { "epoch": 1.117095063239494, "grad_norm": 0.0002491323684807867, "learning_rate": 2.1448482768553656e-06, "logits/chosen": -6.426095962524414, "logits/rejected": -6.587791442871094, "logps/chosen": -144.3266143798828, "logps/rejected": -599.5411987304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.668416023254395, "rewards/margins": 46.06491470336914, "rewards/rejected": -54.73332977294922, "step": 1369 }, { "epoch": 1.1179110567115462, "grad_norm": 1.3017912081636496e-08, "learning_rate": 2.1124591657534774e-06, "logits/chosen": -6.839224815368652, "logits/rejected": -6.551403999328613, "logps/chosen": -138.53271484375, "logps/rejected": -624.2384033203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.039055824279785, "rewards/margins": 48.2911376953125, "rewards/rejected": -56.33019256591797, "step": 1370 }, { "epoch": 1.1187270501835984, "grad_norm": 1.4347099964767858e-14, "learning_rate": 2.0803111928584473e-06, "logits/chosen": -5.744645595550537, "logits/rejected": -6.5367560386657715, "logps/chosen": -123.75282287597656, "logps/rejected": -610.892333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.481036186218262, "rewards/margins": 48.13214111328125, "rewards/rejected": -54.61317825317383, "step": 1371 }, { "epoch": 1.1195430436556508, "grad_norm": 4.075702619843469e-08, "learning_rate": 2.048404520051722e-06, "logits/chosen": -7.102990627288818, "logits/rejected": -6.554357528686523, "logps/chosen": -150.3870391845703, "logps/rejected": -625.9649658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.811220169067383, "rewards/margins": 47.81748580932617, "rewards/rejected": -57.62870407104492, "step": 1372 }, { "epoch": 1.120359037127703, "grad_norm": 1.8201877505230613e-11, "learning_rate": 2.016739307999688e-06, "logits/chosen": -5.984044075012207, "logits/rejected": -7.479698657989502, "logps/chosen": -87.61055755615234, "logps/rejected": -629.461669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.8480937480926514, "rewards/margins": 52.86693572998047, "rewards/rejected": -56.71503448486328, "step": 1373 }, { "epoch": 1.1211750305997552, "grad_norm": 8.470354195821983e-09, "learning_rate": 1.985315716152847e-06, "logits/chosen": -6.690721035003662, "logits/rejected": -6.789357662200928, "logps/chosen": -154.04852294921875, "logps/rejected": -658.8170166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.358824729919434, "rewards/margins": 49.69442367553711, "rewards/rejected": -59.053245544433594, "step": 1374 }, { "epoch": 1.1219910240718074, "grad_norm": 2.798740572629299e-09, "learning_rate": 1.9541339027450256e-06, "logits/chosen": -6.816305160522461, "logits/rejected": -6.423028469085693, "logps/chosen": -136.04278564453125, "logps/rejected": -580.320068359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.601543426513672, "rewards/margins": 43.743675231933594, "rewards/rejected": -52.34521484375, "step": 1375 }, { "epoch": 1.1228070175438596, "grad_norm": 3.233549250580836e-08, "learning_rate": 1.9231940247925573e-06, "logits/chosen": -7.709749698638916, "logits/rejected": -7.56063175201416, "logps/chosen": -117.11778259277344, "logps/rejected": -616.9591064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.599471092224121, "rewards/margins": 49.96296310424805, "rewards/rejected": -56.56243133544922, "step": 1376 }, { "epoch": 1.1236230110159118, "grad_norm": 2.768138438113965e-05, "learning_rate": 1.892496238093533e-06, "logits/chosen": -7.508174896240234, "logits/rejected": -7.053256988525391, "logps/chosen": -202.67791748046875, "logps/rejected": -666.01953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.382746696472168, "rewards/margins": 45.82094955444336, "rewards/rejected": -60.203697204589844, "step": 1377 }, { "epoch": 1.1244390044879642, "grad_norm": 2.566464831555207e-16, "learning_rate": 1.8620406972269577e-06, "logits/chosen": -7.301523685455322, "logits/rejected": -7.003870010375977, "logps/chosen": -123.62962341308594, "logps/rejected": -646.554931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.033031463623047, "rewards/margins": 52.606834411621094, "rewards/rejected": -60.639869689941406, "step": 1378 }, { "epoch": 1.1252549979600164, "grad_norm": 6.568584010047118e-10, "learning_rate": 1.8318275555520237e-06, "logits/chosen": -7.683860778808594, "logits/rejected": -7.020798683166504, "logps/chosen": -136.38189697265625, "logps/rejected": -570.594970703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.967794418334961, "rewards/margins": 43.57072067260742, "rewards/rejected": -51.53851318359375, "step": 1379 }, { "epoch": 1.1260709914320686, "grad_norm": 5.488362512551248e-05, "learning_rate": 1.8018569652073381e-06, "logits/chosen": -7.780178546905518, "logits/rejected": -6.899590492248535, "logps/chosen": -124.86117553710938, "logps/rejected": -581.653564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.678009986877441, "rewards/margins": 44.5987434387207, "rewards/rejected": -52.276756286621094, "step": 1380 }, { "epoch": 1.1268869849041208, "grad_norm": 5.350547144189477e-05, "learning_rate": 1.7721290771100961e-06, "logits/chosen": -7.744278907775879, "logits/rejected": -6.647613525390625, "logps/chosen": -189.38162231445312, "logps/rejected": -594.0230712890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.877735137939453, "rewards/margins": 40.180564880371094, "rewards/rejected": -54.05829620361328, "step": 1381 }, { "epoch": 1.127702978376173, "grad_norm": 9.90877380147026e-10, "learning_rate": 1.742644040955399e-06, "logits/chosen": -6.334100246429443, "logits/rejected": -6.01815128326416, "logps/chosen": -157.49095153808594, "logps/rejected": -831.3177490234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.084609031677246, "rewards/margins": 64.67049407958984, "rewards/rejected": -74.75511169433594, "step": 1382 }, { "epoch": 1.1285189718482251, "grad_norm": 1.1630632457126922e-07, "learning_rate": 1.7134020052154364e-06, "logits/chosen": -7.312596321105957, "logits/rejected": -6.932892799377441, "logps/chosen": -163.933349609375, "logps/rejected": -699.619384765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.900765419006348, "rewards/margins": 51.64457702636719, "rewards/rejected": -62.54534149169922, "step": 1383 }, { "epoch": 1.1293349653202776, "grad_norm": 8.244023275453746e-14, "learning_rate": 1.6844031171388053e-06, "logits/chosen": -6.439624786376953, "logits/rejected": -6.9004082679748535, "logps/chosen": -143.3874969482422, "logps/rejected": -680.0242919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.232898712158203, "rewards/margins": 53.60955810546875, "rewards/rejected": -62.84246063232422, "step": 1384 }, { "epoch": 1.1301509587923297, "grad_norm": 6.674422126096147e-11, "learning_rate": 1.6556475227496814e-06, "logits/chosen": -6.612742900848389, "logits/rejected": -6.721435070037842, "logps/chosen": -129.48387145996094, "logps/rejected": -544.435546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.873552322387695, "rewards/margins": 43.109371185302734, "rewards/rejected": -49.98291778564453, "step": 1385 }, { "epoch": 1.130966952264382, "grad_norm": 1.2689954881177357e-13, "learning_rate": 1.6271353668471655e-06, "logits/chosen": -7.953449726104736, "logits/rejected": -7.119565010070801, "logps/chosen": -139.11361694335938, "logps/rejected": -651.1995849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.391294479370117, "rewards/margins": 50.11161422729492, "rewards/rejected": -59.502906799316406, "step": 1386 }, { "epoch": 1.1317829457364341, "grad_norm": 1.4688331930953757e-12, "learning_rate": 1.5988667930045276e-06, "logits/chosen": -6.8732523918151855, "logits/rejected": -6.74839448928833, "logps/chosen": -166.56295776367188, "logps/rejected": -655.7608642578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.183759689331055, "rewards/margins": 48.910037994384766, "rewards/rejected": -59.09379959106445, "step": 1387 }, { "epoch": 1.1325989392084863, "grad_norm": 3.574586315835404e-08, "learning_rate": 1.5708419435684462e-06, "logits/chosen": -6.425674915313721, "logits/rejected": -5.033251762390137, "logps/chosen": -171.6023712158203, "logps/rejected": -621.5147705078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.57868766784668, "rewards/margins": 44.44709014892578, "rewards/rejected": -56.025779724121094, "step": 1388 }, { "epoch": 1.1334149326805385, "grad_norm": 1.500116350583447e-11, "learning_rate": 1.543060959658349e-06, "logits/chosen": -6.793994426727295, "logits/rejected": -6.534900665283203, "logps/chosen": -172.32638549804688, "logps/rejected": -675.1092529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.338159561157227, "rewards/margins": 51.10370635986328, "rewards/rejected": -61.441864013671875, "step": 1389 }, { "epoch": 1.1342309261525907, "grad_norm": 2.1848714244071532e-12, "learning_rate": 1.5155239811656563e-06, "logits/chosen": -7.665908336639404, "logits/rejected": -6.050355911254883, "logps/chosen": -150.23655700683594, "logps/rejected": -673.5358276367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.2523775100708, "rewards/margins": 53.27577590942383, "rewards/rejected": -61.52815246582031, "step": 1390 }, { "epoch": 1.135046919624643, "grad_norm": 7.827619441513889e-09, "learning_rate": 1.4882311467531219e-06, "logits/chosen": -7.5488786697387695, "logits/rejected": -7.547455310821533, "logps/chosen": -121.10052490234375, "logps/rejected": -572.550048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.743788242340088, "rewards/margins": 45.133544921875, "rewards/rejected": -51.87732696533203, "step": 1391 }, { "epoch": 1.1358629130966953, "grad_norm": 6.814595053583616e-06, "learning_rate": 1.4611825938540935e-06, "logits/chosen": -7.726937294006348, "logits/rejected": -7.1999311447143555, "logps/chosen": -164.6455535888672, "logps/rejected": -532.2927856445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.516263961791992, "rewards/margins": 38.1191291809082, "rewards/rejected": -46.63539505004883, "step": 1392 }, { "epoch": 1.1366789065687475, "grad_norm": 6.005728891977924e-07, "learning_rate": 1.4343784586718311e-06, "logits/chosen": -7.3039093017578125, "logits/rejected": -6.8622636795043945, "logps/chosen": -194.39120483398438, "logps/rejected": -621.060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.576262474060059, "rewards/margins": 43.686370849609375, "rewards/rejected": -57.26262664794922, "step": 1393 }, { "epoch": 1.1374949000407997, "grad_norm": 5.191831405682024e-06, "learning_rate": 1.4078188761788402e-06, "logits/chosen": -7.674655914306641, "logits/rejected": -7.540920257568359, "logps/chosen": -120.20027923583984, "logps/rejected": -575.6889038085938, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -7.367013931274414, "rewards/margins": 45.176551818847656, "rewards/rejected": -52.5435676574707, "step": 1394 }, { "epoch": 1.1383108935128519, "grad_norm": 8.79484586853251e-19, "learning_rate": 1.3815039801161721e-06, "logits/chosen": -5.807469367980957, "logits/rejected": -6.48643159866333, "logps/chosen": -137.63388061523438, "logps/rejected": -738.3067626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.880917549133301, "rewards/margins": 60.162227630615234, "rewards/rejected": -67.04314422607422, "step": 1395 }, { "epoch": 1.139126886984904, "grad_norm": 5.727693874746365e-10, "learning_rate": 1.3554339029927532e-06, "logits/chosen": -6.493084907531738, "logits/rejected": -6.500147342681885, "logps/chosen": -136.05606079101562, "logps/rejected": -499.30609130859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.337793827056885, "rewards/margins": 36.82730484008789, "rewards/rejected": -44.16510009765625, "step": 1396 }, { "epoch": 1.1399428804569562, "grad_norm": 3.5045355506468923e-09, "learning_rate": 1.3296087760847397e-06, "logits/chosen": -7.7784624099731445, "logits/rejected": -6.912225723266602, "logps/chosen": -127.39422607421875, "logps/rejected": -526.8031005859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.745199203491211, "rewards/margins": 41.485382080078125, "rewards/rejected": -48.23058319091797, "step": 1397 }, { "epoch": 1.1407588739290087, "grad_norm": 4.5957003749208525e-05, "learning_rate": 1.304028729434803e-06, "logits/chosen": -7.631582736968994, "logits/rejected": -7.024820804595947, "logps/chosen": -136.10989379882812, "logps/rejected": -538.728515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.475414276123047, "rewards/margins": 40.16012954711914, "rewards/rejected": -48.63554000854492, "step": 1398 }, { "epoch": 1.1415748674010608, "grad_norm": 3.8736042151654715e-11, "learning_rate": 1.2786938918515568e-06, "logits/chosen": -6.940373420715332, "logits/rejected": -7.113255023956299, "logps/chosen": -174.7807159423828, "logps/rejected": -637.8775634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.390000343322754, "rewards/margins": 45.62245178222656, "rewards/rejected": -59.012454986572266, "step": 1399 }, { "epoch": 1.142390860873113, "grad_norm": 9.625704746474408e-18, "learning_rate": 1.2536043909088191e-06, "logits/chosen": -8.253847122192383, "logits/rejected": -6.726228713989258, "logps/chosen": -147.69866943359375, "logps/rejected": -701.7841186523438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.444489479064941, "rewards/margins": 55.84718704223633, "rewards/rejected": -64.29167938232422, "step": 1400 }, { "epoch": 1.1432068543451652, "grad_norm": 3.304974837640279e-13, "learning_rate": 1.2287603529450465e-06, "logits/chosen": -7.109292984008789, "logits/rejected": -6.8303327560424805, "logps/chosen": -145.93441772460938, "logps/rejected": -611.11181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.595412254333496, "rewards/margins": 48.12872314453125, "rewards/rejected": -55.7241325378418, "step": 1401 }, { "epoch": 1.1440228478172174, "grad_norm": 0.0017781425267457962, "learning_rate": 1.2041619030626284e-06, "logits/chosen": -6.998717784881592, "logits/rejected": -7.402974605560303, "logps/chosen": -141.16921997070312, "logps/rejected": -603.8748779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.481037139892578, "rewards/margins": 46.058834075927734, "rewards/rejected": -54.53987503051758, "step": 1402 }, { "epoch": 1.1448388412892696, "grad_norm": 5.917521680998128e-12, "learning_rate": 1.1798091651273324e-06, "logits/chosen": -7.415061950683594, "logits/rejected": -7.5644330978393555, "logps/chosen": -108.39695739746094, "logps/rejected": -636.609130859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.601627349853516, "rewards/margins": 51.87712097167969, "rewards/rejected": -57.4787483215332, "step": 1403 }, { "epoch": 1.1456548347613218, "grad_norm": 0.0005348153063096106, "learning_rate": 1.1557022617676215e-06, "logits/chosen": -7.293082237243652, "logits/rejected": -6.208313941955566, "logps/chosen": -174.6320343017578, "logps/rejected": -558.2731323242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.462747573852539, "rewards/margins": 37.54373550415039, "rewards/rejected": -50.0064811706543, "step": 1404 }, { "epoch": 1.1464708282333742, "grad_norm": 1.0400062972273916e-12, "learning_rate": 1.1318413143740437e-06, "logits/chosen": -6.703773498535156, "logits/rejected": -6.764033794403076, "logps/chosen": -163.498291015625, "logps/rejected": -595.5675048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.61797046661377, "rewards/margins": 43.60625457763672, "rewards/rejected": -55.22422409057617, "step": 1405 }, { "epoch": 1.1472868217054264, "grad_norm": 3.003353565844691e-09, "learning_rate": 1.1082264430986532e-06, "logits/chosen": -7.557116508483887, "logits/rejected": -6.573292255401611, "logps/chosen": -219.23196411132812, "logps/rejected": -610.458251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.767312049865723, "rewards/margins": 40.45979309082031, "rewards/rejected": -55.227108001708984, "step": 1406 }, { "epoch": 1.1481028151774786, "grad_norm": 1.5411609766147194e-08, "learning_rate": 1.0848577668543802e-06, "logits/chosen": -6.855121612548828, "logits/rejected": -6.774405002593994, "logps/chosen": -202.903076171875, "logps/rejected": -671.4000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.291632652282715, "rewards/margins": 46.72636795043945, "rewards/rejected": -61.01799774169922, "step": 1407 }, { "epoch": 1.1489188086495308, "grad_norm": 7.698245099163614e-07, "learning_rate": 1.061735403314429e-06, "logits/chosen": -6.983760833740234, "logits/rejected": -6.861172199249268, "logps/chosen": -194.3118133544922, "logps/rejected": -700.386962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.841279029846191, "rewards/margins": 50.57574462890625, "rewards/rejected": -63.417022705078125, "step": 1408 }, { "epoch": 1.149734802121583, "grad_norm": 0.047490157186985016, "learning_rate": 1.0388594689117071e-06, "logits/chosen": -7.076789855957031, "logits/rejected": -7.306351661682129, "logps/chosen": -137.280029296875, "logps/rejected": -404.2222900390625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.832366466522217, "rewards/margins": 27.966175079345703, "rewards/rejected": -35.79854202270508, "step": 1409 }, { "epoch": 1.1505507955936352, "grad_norm": 4.374629131165264e-12, "learning_rate": 1.016230078838226e-06, "logits/chosen": -6.980110168457031, "logits/rejected": -6.709958553314209, "logps/chosen": -152.86911010742188, "logps/rejected": -685.3109741210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.623022079467773, "rewards/margins": 53.51324462890625, "rewards/rejected": -62.136268615722656, "step": 1410 }, { "epoch": 1.1513667890656876, "grad_norm": 1.5194716318500667e-10, "learning_rate": 9.938473470444964e-07, "logits/chosen": -6.542700290679932, "logits/rejected": -6.745017051696777, "logps/chosen": -176.6524200439453, "logps/rejected": -669.013427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.316869735717773, "rewards/margins": 49.98406219482422, "rewards/rejected": -61.30093002319336, "step": 1411 }, { "epoch": 1.1521827825377398, "grad_norm": 2.2252150565015916e-15, "learning_rate": 9.717113862389992e-07, "logits/chosen": -6.436588764190674, "logits/rejected": -6.252602577209473, "logps/chosen": -192.0948028564453, "logps/rejected": -793.2886962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.175811767578125, "rewards/margins": 57.91781997680664, "rewards/rejected": -71.0936279296875, "step": 1412 }, { "epoch": 1.152998776009792, "grad_norm": 3.0416867247140544e-08, "learning_rate": 9.498223078876045e-07, "logits/chosen": -6.552713394165039, "logits/rejected": -6.63987398147583, "logps/chosen": -199.1394500732422, "logps/rejected": -615.8759155273438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.133045196533203, "rewards/margins": 41.53655242919922, "rewards/rejected": -54.66960144042969, "step": 1413 }, { "epoch": 1.1538147694818441, "grad_norm": 5.216232393939757e-12, "learning_rate": 9.281802222129765e-07, "logits/chosen": -6.037303447723389, "logits/rejected": -6.855700492858887, "logps/chosen": -129.6443328857422, "logps/rejected": -784.7827758789062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.098645210266113, "rewards/margins": 64.52433776855469, "rewards/rejected": -71.62298583984375, "step": 1414 }, { "epoch": 1.1546307629538963, "grad_norm": 1.522761517575022e-12, "learning_rate": 9.0678523819408e-07, "logits/chosen": -7.128828525543213, "logits/rejected": -7.012771129608154, "logps/chosen": -106.97381591796875, "logps/rejected": -599.3585205078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2444117069244385, "rewards/margins": 50.019710540771484, "rewards/rejected": -53.264122009277344, "step": 1415 }, { "epoch": 1.1554467564259485, "grad_norm": 2.4726762304361216e-10, "learning_rate": 8.856374635655695e-07, "logits/chosen": -7.299508094787598, "logits/rejected": -6.20573616027832, "logps/chosen": -148.82852172851562, "logps/rejected": -585.37353515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.483789443969727, "rewards/margins": 42.693355560302734, "rewards/rejected": -52.177146911621094, "step": 1416 }, { "epoch": 1.156262749898001, "grad_norm": 3.8194130524971115e-08, "learning_rate": 8.647370048172787e-07, "logits/chosen": -6.757617473602295, "logits/rejected": -6.347906112670898, "logps/chosen": -147.23883056640625, "logps/rejected": -837.4776611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.519922733306885, "rewards/margins": 67.88613891601562, "rewards/rejected": -75.40606689453125, "step": 1417 }, { "epoch": 1.1570787433700531, "grad_norm": 8.495653310092166e-05, "learning_rate": 8.440839671936818e-07, "logits/chosen": -7.492063522338867, "logits/rejected": -6.9073896408081055, "logps/chosen": -133.06234741210938, "logps/rejected": -700.318115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.676192760467529, "rewards/margins": 54.38591384887695, "rewards/rejected": -62.06210708618164, "step": 1418 }, { "epoch": 1.1578947368421053, "grad_norm": 1.0172617685277191e-13, "learning_rate": 8.236784546933718e-07, "logits/chosen": -6.805829048156738, "logits/rejected": -6.280318737030029, "logps/chosen": -172.830810546875, "logps/rejected": -672.336669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.396626472473145, "rewards/margins": 50.62590408325195, "rewards/rejected": -60.02253341674805, "step": 1419 }, { "epoch": 1.1587107303141575, "grad_norm": 1.1262590252775206e-12, "learning_rate": 8.035205700685167e-07, "logits/chosen": -7.912688255310059, "logits/rejected": -6.60399055480957, "logps/chosen": -134.02578735351562, "logps/rejected": -601.2265014648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.13432788848877, "rewards/margins": 47.24880599975586, "rewards/rejected": -55.38312911987305, "step": 1420 }, { "epoch": 1.1595267237862097, "grad_norm": 1.9940056235881632e-20, "learning_rate": 7.836104148243484e-07, "logits/chosen": -5.542054176330566, "logits/rejected": -5.428903102874756, "logps/chosen": -187.38125610351562, "logps/rejected": -751.6656494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.4711332321167, "rewards/margins": 55.75421142578125, "rewards/rejected": -69.22534942626953, "step": 1421 }, { "epoch": 1.1603427172582619, "grad_norm": 0.0004978177603334188, "learning_rate": 7.639480892186634e-07, "logits/chosen": -6.878686428070068, "logits/rejected": -7.095165252685547, "logps/chosen": -150.25233459472656, "logps/rejected": -593.0003662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.731071472167969, "rewards/margins": 43.83512878417969, "rewards/rejected": -53.56620407104492, "step": 1422 }, { "epoch": 1.161158710730314, "grad_norm": 7.816903234925121e-06, "learning_rate": 7.445336922613067e-07, "logits/chosen": -7.816350936889648, "logits/rejected": -7.284532070159912, "logps/chosen": -111.10909271240234, "logps/rejected": -555.1173706054688, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -5.880964279174805, "rewards/margins": 43.250335693359375, "rewards/rejected": -49.13129425048828, "step": 1423 }, { "epoch": 1.1619747042023665, "grad_norm": 3.098478771335067e-07, "learning_rate": 7.253673217136658e-07, "logits/chosen": -7.111577033996582, "logits/rejected": -6.995243072509766, "logps/chosen": -186.25137329101562, "logps/rejected": -612.030029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.190237045288086, "rewards/margins": 43.149200439453125, "rewards/rejected": -56.339439392089844, "step": 1424 }, { "epoch": 1.1627906976744187, "grad_norm": 6.681885133730248e-05, "learning_rate": 7.064490740882057e-07, "logits/chosen": -7.3498687744140625, "logits/rejected": -6.362187385559082, "logps/chosen": -188.18206787109375, "logps/rejected": -662.1502685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.241829872131348, "rewards/margins": 48.0961799621582, "rewards/rejected": -59.338008880615234, "step": 1425 }, { "epoch": 1.1636066911464709, "grad_norm": 1.697831632230251e-11, "learning_rate": 6.87779044647957e-07, "logits/chosen": -7.192269802093506, "logits/rejected": -6.2168378829956055, "logps/chosen": -154.1925048828125, "logps/rejected": -687.8056640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.071752548217773, "rewards/margins": 51.47259521484375, "rewards/rejected": -60.544342041015625, "step": 1426 }, { "epoch": 1.164422684618523, "grad_norm": 0.001965973060578108, "learning_rate": 6.693573274060449e-07, "logits/chosen": -7.869913101196289, "logits/rejected": -6.283721923828125, "logps/chosen": -214.25086975097656, "logps/rejected": -522.88232421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.749266624450684, "rewards/margins": 31.52469253540039, "rewards/rejected": -47.27396011352539, "step": 1427 }, { "epoch": 1.1652386780905752, "grad_norm": 5.294193972105177e-10, "learning_rate": 6.511840151252169e-07, "logits/chosen": -6.9407525062561035, "logits/rejected": -6.591971397399902, "logps/chosen": -152.37318420410156, "logps/rejected": -603.3731689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.322097778320312, "rewards/margins": 45.1480827331543, "rewards/rejected": -55.47018051147461, "step": 1428 }, { "epoch": 1.1660546715626274, "grad_norm": 3.4948538996104617e-06, "learning_rate": 6.332591993173764e-07, "logits/chosen": -7.287535667419434, "logits/rejected": -5.932552814483643, "logps/chosen": -103.14602661132812, "logps/rejected": -595.3433837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.364408016204834, "rewards/margins": 48.606239318847656, "rewards/rejected": -54.97064971923828, "step": 1429 }, { "epoch": 1.1668706650346796, "grad_norm": 4.910275745034776e-13, "learning_rate": 6.15582970243117e-07, "logits/chosen": -7.634973049163818, "logits/rejected": -6.699948310852051, "logps/chosen": -171.0742950439453, "logps/rejected": -678.8221435546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.23150634765625, "rewards/margins": 49.75706481933594, "rewards/rejected": -61.98857116699219, "step": 1430 }, { "epoch": 1.167686658506732, "grad_norm": 5.215850418238688e-09, "learning_rate": 5.981554169112668e-07, "logits/chosen": -7.39681339263916, "logits/rejected": -7.760202884674072, "logps/chosen": -176.0970458984375, "logps/rejected": -650.0439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.59067153930664, "rewards/margins": 47.32304000854492, "rewards/rejected": -58.91371154785156, "step": 1431 }, { "epoch": 1.1685026519787842, "grad_norm": 3.004218740443321e-08, "learning_rate": 5.809766270784666e-07, "logits/chosen": -6.951590538024902, "logits/rejected": -7.163557052612305, "logps/chosen": -154.7957000732422, "logps/rejected": -590.4446411132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.04997444152832, "rewards/margins": 43.217044830322266, "rewards/rejected": -53.26701736450195, "step": 1432 }, { "epoch": 1.1693186454508364, "grad_norm": 8.577821341681363e-14, "learning_rate": 5.64046687248676e-07, "logits/chosen": -7.844363212585449, "logits/rejected": -6.918219089508057, "logps/chosen": -134.86135864257812, "logps/rejected": -682.9410400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.381171226501465, "rewards/margins": 54.622196197509766, "rewards/rejected": -62.00336837768555, "step": 1433 }, { "epoch": 1.1701346389228886, "grad_norm": 1.577631636440202e-10, "learning_rate": 5.473656826727847e-07, "logits/chosen": -6.682950496673584, "logits/rejected": -6.310460090637207, "logps/chosen": -178.9345703125, "logps/rejected": -585.7257080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.832657814025879, "rewards/margins": 43.826507568359375, "rewards/rejected": -53.65916442871094, "step": 1434 }, { "epoch": 1.1709506323949408, "grad_norm": 2.81667294739843e-13, "learning_rate": 5.309336973481683e-07, "logits/chosen": -7.4770121574401855, "logits/rejected": -6.27863073348999, "logps/chosen": -151.7655029296875, "logps/rejected": -705.0848388671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.867023468017578, "rewards/margins": 55.5274658203125, "rewards/rejected": -64.39448547363281, "step": 1435 }, { "epoch": 1.171766625866993, "grad_norm": 1.5111700406749199e-16, "learning_rate": 5.147508140182556e-07, "logits/chosen": -7.900677680969238, "logits/rejected": -7.309189796447754, "logps/chosen": -178.32208251953125, "logps/rejected": -726.4885864257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.373639106750488, "rewards/margins": 52.673370361328125, "rewards/rejected": -66.04701232910156, "step": 1436 }, { "epoch": 1.1725826193390452, "grad_norm": 9.463443362278667e-09, "learning_rate": 4.988171141721232e-07, "logits/chosen": -6.995291233062744, "logits/rejected": -6.445014953613281, "logps/chosen": -138.62815856933594, "logps/rejected": -624.4122314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.828766822814941, "rewards/margins": 47.50064468383789, "rewards/rejected": -55.32940673828125, "step": 1437 }, { "epoch": 1.1733986128110976, "grad_norm": 2.654034460647381e-06, "learning_rate": 4.831326780440792e-07, "logits/chosen": -7.566240310668945, "logits/rejected": -6.324095249176025, "logps/chosen": -213.53701782226562, "logps/rejected": -674.297607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.93654727935791, "rewards/margins": 46.89471435546875, "rewards/rejected": -61.831260681152344, "step": 1438 }, { "epoch": 1.1742146062831498, "grad_norm": 2.20523516114568e-10, "learning_rate": 4.676975846132692e-07, "logits/chosen": -7.0202250480651855, "logits/rejected": -6.459344863891602, "logps/chosen": -122.93045043945312, "logps/rejected": -699.0527954101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.92543363571167, "rewards/margins": 55.92243576049805, "rewards/rejected": -62.847869873046875, "step": 1439 }, { "epoch": 1.175030599755202, "grad_norm": 0.0008135532261803746, "learning_rate": 4.52511911603265e-07, "logits/chosen": -6.971234321594238, "logits/rejected": -7.363770008087158, "logps/chosen": -180.8470458984375, "logps/rejected": -629.9547119140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.81660270690918, "rewards/margins": 45.945316314697266, "rewards/rejected": -57.76191711425781, "step": 1440 }, { "epoch": 1.1758465932272542, "grad_norm": 1.3846632174185913e-10, "learning_rate": 4.375757354816712e-07, "logits/chosen": -7.736919403076172, "logits/rejected": -6.730690002441406, "logps/chosen": -108.01109313964844, "logps/rejected": -639.632080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.703548431396484, "rewards/margins": 53.31228256225586, "rewards/rejected": -58.01582717895508, "step": 1441 }, { "epoch": 1.1766625866993063, "grad_norm": 2.1111558453412727e-05, "learning_rate": 4.228891314597694e-07, "logits/chosen": -6.719990253448486, "logits/rejected": -7.023519992828369, "logps/chosen": -156.74777221679688, "logps/rejected": -627.3797607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.799367904663086, "rewards/margins": 46.754913330078125, "rewards/rejected": -57.554283142089844, "step": 1442 }, { "epoch": 1.1774785801713585, "grad_norm": 9.122391210480973e-12, "learning_rate": 4.084521734920965e-07, "logits/chosen": -7.553009510040283, "logits/rejected": -7.264677047729492, "logps/chosen": -126.76387786865234, "logps/rejected": -561.379150390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.970125198364258, "rewards/margins": 44.48455047607422, "rewards/rejected": -52.454673767089844, "step": 1443 }, { "epoch": 1.178294573643411, "grad_norm": 4.665279122219312e-13, "learning_rate": 3.9426493427611177e-07, "logits/chosen": -6.633243560791016, "logits/rejected": -6.496610164642334, "logps/chosen": -120.4695053100586, "logps/rejected": -708.5932006835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.104903221130371, "rewards/margins": 59.40156555175781, "rewards/rejected": -65.5064697265625, "step": 1444 }, { "epoch": 1.1791105671154631, "grad_norm": 7.861369340389501e-06, "learning_rate": 3.8032748525179685e-07, "logits/chosen": -6.531682014465332, "logits/rejected": -5.943619728088379, "logps/chosen": -161.71917724609375, "logps/rejected": -547.408447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.246952056884766, "rewards/margins": 37.3834228515625, "rewards/rejected": -48.630374908447266, "step": 1445 }, { "epoch": 1.1799265605875153, "grad_norm": 3.127425212247695e-14, "learning_rate": 3.6663989660132293e-07, "logits/chosen": -5.532962799072266, "logits/rejected": -6.631799697875977, "logps/chosen": -138.6743927001953, "logps/rejected": -631.07177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.089829444885254, "rewards/margins": 48.86381149291992, "rewards/rejected": -55.95363998413086, "step": 1446 }, { "epoch": 1.1807425540595675, "grad_norm": 0.15143999457359314, "learning_rate": 3.532022372486843e-07, "logits/chosen": -6.249359130859375, "logits/rejected": -5.887520790100098, "logps/chosen": -195.6981964111328, "logps/rejected": -588.0567626953125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -14.306573867797852, "rewards/margins": 39.23350524902344, "rewards/rejected": -53.540077209472656, "step": 1447 }, { "epoch": 1.1815585475316197, "grad_norm": 3.301723840485422e-15, "learning_rate": 3.4001457485935416e-07, "logits/chosen": -5.716670513153076, "logits/rejected": -6.067047595977783, "logps/chosen": -167.8014373779297, "logps/rejected": -761.193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.070441246032715, "rewards/margins": 58.79222106933594, "rewards/rejected": -68.86265563964844, "step": 1448 }, { "epoch": 1.182374541003672, "grad_norm": 2.2841546432061932e-09, "learning_rate": 3.2707697583995167e-07, "logits/chosen": -7.436946868896484, "logits/rejected": -7.573231220245361, "logps/chosen": -136.04400634765625, "logps/rejected": -603.6605224609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.007290840148926, "rewards/margins": 46.83333206176758, "rewards/rejected": -55.84062194824219, "step": 1449 }, { "epoch": 1.1831905344757243, "grad_norm": 2.7899799137642844e-10, "learning_rate": 3.143895053378698e-07, "logits/chosen": -7.88786506652832, "logits/rejected": -7.324092864990234, "logps/chosen": -138.43408203125, "logps/rejected": -530.7945556640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.963135719299316, "rewards/margins": 39.91547393798828, "rewards/rejected": -48.878604888916016, "step": 1450 }, { "epoch": 1.1840065279477765, "grad_norm": 1.8595774919916153e-09, "learning_rate": 3.019522272410202e-07, "logits/chosen": -7.215117454528809, "logits/rejected": -7.34201717376709, "logps/chosen": -114.86835479736328, "logps/rejected": -524.16015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.608964920043945, "rewards/margins": 40.4965705871582, "rewards/rejected": -48.10553741455078, "step": 1451 }, { "epoch": 1.1848225214198287, "grad_norm": 6.124876605895224e-09, "learning_rate": 2.8976520417742794e-07, "logits/chosen": -6.581459045410156, "logits/rejected": -5.993125915527344, "logps/chosen": -177.42010498046875, "logps/rejected": -699.097412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.861839294433594, "rewards/margins": 49.600887298583984, "rewards/rejected": -62.46272277832031, "step": 1452 }, { "epoch": 1.1856385148918809, "grad_norm": 2.1768800650967535e-10, "learning_rate": 2.7782849751497586e-07, "logits/chosen": -8.071474075317383, "logits/rejected": -7.066542625427246, "logps/chosen": -138.3693084716797, "logps/rejected": -566.2166748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.34572982788086, "rewards/margins": 41.65818786621094, "rewards/rejected": -51.00392150878906, "step": 1453 }, { "epoch": 1.186454508363933, "grad_norm": 7.972340235889996e-13, "learning_rate": 2.661421673610831e-07, "logits/chosen": -6.874843597412109, "logits/rejected": -6.493813991546631, "logps/chosen": -196.16018676757812, "logps/rejected": -714.6113891601562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.562015533447266, "rewards/margins": 51.89096450805664, "rewards/rejected": -65.4529800415039, "step": 1454 }, { "epoch": 1.1872705018359853, "grad_norm": 2.8052579637005692e-06, "learning_rate": 2.5470627256238277e-07, "logits/chosen": -6.9577484130859375, "logits/rejected": -6.546022891998291, "logps/chosen": -138.14517211914062, "logps/rejected": -636.2645263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.701485633850098, "rewards/margins": 50.453887939453125, "rewards/rejected": -58.15538024902344, "step": 1455 }, { "epoch": 1.1880864953080374, "grad_norm": 4.788777881181705e-13, "learning_rate": 2.4352087070443895e-07, "logits/chosen": -7.351794242858887, "logits/rejected": -6.228594779968262, "logps/chosen": -170.7940216064453, "logps/rejected": -652.2719116210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.127017974853516, "rewards/margins": 47.4102783203125, "rewards/rejected": -58.53730010986328, "step": 1456 }, { "epoch": 1.1889024887800899, "grad_norm": 2.1567227292962343e-07, "learning_rate": 2.3258601811145808e-07, "logits/chosen": -7.0824360847473145, "logits/rejected": -7.01284646987915, "logps/chosen": -138.24822998046875, "logps/rejected": -611.5413818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.904139995574951, "rewards/margins": 47.69293212890625, "rewards/rejected": -55.59707260131836, "step": 1457 }, { "epoch": 1.189718482252142, "grad_norm": 1.4794318303756882e-07, "learning_rate": 2.219017698460002e-07, "logits/chosen": -7.1891021728515625, "logits/rejected": -6.746156215667725, "logps/chosen": -113.65567016601562, "logps/rejected": -542.9784545898438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.066556930541992, "rewards/margins": 41.87728500366211, "rewards/rejected": -48.943843841552734, "step": 1458 }, { "epoch": 1.1905344757241942, "grad_norm": 1.236309579016881e-11, "learning_rate": 2.1146817970871258e-07, "logits/chosen": -6.396811485290527, "logits/rejected": -6.911467552185059, "logps/chosen": -107.62168884277344, "logps/rejected": -704.0970458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.0700154304504395, "rewards/margins": 57.358646392822266, "rewards/rejected": -62.42866516113281, "step": 1459 }, { "epoch": 1.1913504691962464, "grad_norm": 1.1112322173545408e-10, "learning_rate": 2.012853002380466e-07, "logits/chosen": -6.651047706604004, "logits/rejected": -6.8758769035339355, "logps/chosen": -157.0769805908203, "logps/rejected": -631.6320190429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.772658348083496, "rewards/margins": 48.698081970214844, "rewards/rejected": -57.470741271972656, "step": 1460 }, { "epoch": 1.1921664626682986, "grad_norm": 0.0030640955083072186, "learning_rate": 1.913531827099968e-07, "logits/chosen": -7.167802333831787, "logits/rejected": -6.333366870880127, "logps/chosen": -109.49198150634766, "logps/rejected": -454.552978515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.85483455657959, "rewards/margins": 34.4827880859375, "rewards/rejected": -40.337615966796875, "step": 1461 }, { "epoch": 1.1929824561403508, "grad_norm": 9.303727788179117e-10, "learning_rate": 1.816718771378456e-07, "logits/chosen": -8.066367149353027, "logits/rejected": -6.933481216430664, "logps/chosen": -198.8236541748047, "logps/rejected": -587.4581298828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.876151084899902, "rewards/margins": 39.317832946777344, "rewards/rejected": -54.1939811706543, "step": 1462 }, { "epoch": 1.193798449612403, "grad_norm": 4.2057243104798303e-13, "learning_rate": 1.7224143227190236e-07, "logits/chosen": -7.586519241333008, "logits/rejected": -5.314624786376953, "logps/chosen": -161.70004272460938, "logps/rejected": -637.3812255859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.494690895080566, "rewards/margins": 48.4356803894043, "rewards/rejected": -56.93037033081055, "step": 1463 }, { "epoch": 1.1946144430844554, "grad_norm": 7.790096345772213e-10, "learning_rate": 1.630618955992702e-07, "logits/chosen": -6.837246894836426, "logits/rejected": -6.620534420013428, "logps/chosen": -174.30332946777344, "logps/rejected": -731.3538818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.216611862182617, "rewards/margins": 53.85016632080078, "rewards/rejected": -66.0667724609375, "step": 1464 }, { "epoch": 1.1954304365565076, "grad_norm": 2.0301089534768835e-05, "learning_rate": 1.5413331334360182e-07, "logits/chosen": -7.482601165771484, "logits/rejected": -6.778335094451904, "logps/chosen": -205.1571807861328, "logps/rejected": -596.3362426757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.971944808959961, "rewards/margins": 39.57579803466797, "rewards/rejected": -54.54774475097656, "step": 1465 }, { "epoch": 1.1962464300285598, "grad_norm": 3.689703589770943e-06, "learning_rate": 1.4545573046486627e-07, "logits/chosen": -6.80306339263916, "logits/rejected": -6.506601810455322, "logps/chosen": -129.42520141601562, "logps/rejected": -645.8232421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.925424098968506, "rewards/margins": 50.53900909423828, "rewards/rejected": -57.464439392089844, "step": 1466 }, { "epoch": 1.197062423500612, "grad_norm": 1.3154737644072156e-05, "learning_rate": 1.3702919065912144e-07, "logits/chosen": -7.577611446380615, "logits/rejected": -6.47293758392334, "logps/chosen": -138.76539611816406, "logps/rejected": -608.870361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.731707572937012, "rewards/margins": 46.230186462402344, "rewards/rejected": -55.96189498901367, "step": 1467 }, { "epoch": 1.1978784169726642, "grad_norm": 4.307822848437226e-11, "learning_rate": 1.2885373635829755e-07, "logits/chosen": -7.378691673278809, "logits/rejected": -6.292264938354492, "logps/chosen": -193.0967254638672, "logps/rejected": -594.2042236328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.010083198547363, "rewards/margins": 41.552913665771484, "rewards/rejected": -54.56300354003906, "step": 1468 }, { "epoch": 1.1986944104447164, "grad_norm": 7.704667931212814e-14, "learning_rate": 1.209294087299806e-07, "logits/chosen": -6.774802207946777, "logits/rejected": -7.3687567710876465, "logps/chosen": -86.86824035644531, "logps/rejected": -575.9573364257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.554805755615234, "rewards/margins": 48.3052978515625, "rewards/rejected": -52.860103607177734, "step": 1469 }, { "epoch": 1.1995104039167686, "grad_norm": 6.731353213544622e-14, "learning_rate": 1.132562476771959e-07, "logits/chosen": -6.994156837463379, "logits/rejected": -5.938295364379883, "logps/chosen": -153.54168701171875, "logps/rejected": -715.8099365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.359964370727539, "rewards/margins": 56.77783966064453, "rewards/rejected": -65.13780212402344, "step": 1470 }, { "epoch": 1.200326397388821, "grad_norm": 6.295162741781768e-12, "learning_rate": 1.0583429183823047e-07, "logits/chosen": -5.392746925354004, "logits/rejected": -6.317188739776611, "logps/chosen": -180.0477752685547, "logps/rejected": -728.8651123046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.049480438232422, "rewards/margins": 55.85816192626953, "rewards/rejected": -66.90763854980469, "step": 1471 }, { "epoch": 1.2011423908608732, "grad_norm": 3.9243847083292037e-17, "learning_rate": 9.866357858642205e-08, "logits/chosen": -6.797751426696777, "logits/rejected": -6.496039867401123, "logps/chosen": -109.09124755859375, "logps/rejected": -734.0556030273438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.236407279968262, "rewards/margins": 61.14521408081055, "rewards/rejected": -66.38162231445312, "step": 1472 }, { "epoch": 1.2019583843329253, "grad_norm": 1.749255740257638e-09, "learning_rate": 9.174414402997044e-08, "logits/chosen": -7.139665603637695, "logits/rejected": -6.281843185424805, "logps/chosen": -140.19937133789062, "logps/rejected": -617.9854736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.861648559570312, "rewards/margins": 46.78270721435547, "rewards/rejected": -55.64435958862305, "step": 1473 }, { "epoch": 1.2027743778049775, "grad_norm": 3.674497415743874e-17, "learning_rate": 8.507602301175421e-08, "logits/chosen": -6.92472505569458, "logits/rejected": -7.503504753112793, "logps/chosen": -115.74481964111328, "logps/rejected": -670.098876953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.059145927429199, "rewards/margins": 55.3023567199707, "rewards/rejected": -61.361507415771484, "step": 1474 }, { "epoch": 1.2035903712770297, "grad_norm": 8.389650751894351e-09, "learning_rate": 7.865924910916977e-08, "logits/chosen": -6.8183512687683105, "logits/rejected": -7.377598762512207, "logps/chosen": -167.7454833984375, "logps/rejected": -543.501220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.682921409606934, "rewards/margins": 37.61346435546875, "rewards/rejected": -50.29638671875, "step": 1475 }, { "epoch": 1.204406364749082, "grad_norm": 1.3657678884932138e-15, "learning_rate": 7.249385463395375e-08, "logits/chosen": -7.4980340003967285, "logits/rejected": -7.965002536773682, "logps/chosen": -146.94863891601562, "logps/rejected": -773.0965576171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.208826065063477, "rewards/margins": 62.295494079589844, "rewards/rejected": -70.50431823730469, "step": 1476 }, { "epoch": 1.2052223582211343, "grad_norm": 2.1836949837348435e-13, "learning_rate": 6.657987063200533e-08, "logits/chosen": -6.861485958099365, "logits/rejected": -7.369841575622559, "logps/chosen": -135.78558349609375, "logps/rejected": -591.0440673828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.547868728637695, "rewards/margins": 46.92558288574219, "rewards/rejected": -55.473453521728516, "step": 1477 }, { "epoch": 1.2060383516931865, "grad_norm": 1.673580718453138e-11, "learning_rate": 6.091732688325302e-08, "logits/chosen": -7.81682014465332, "logits/rejected": -7.455563068389893, "logps/chosen": -107.51118469238281, "logps/rejected": -551.3150024414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.320724964141846, "rewards/margins": 44.1546516418457, "rewards/rejected": -49.475372314453125, "step": 1478 }, { "epoch": 1.2068543451652387, "grad_norm": 1.0600093247603581e-07, "learning_rate": 5.550625190150483e-08, "logits/chosen": -7.309029579162598, "logits/rejected": -6.735969543457031, "logps/chosen": -137.33294677734375, "logps/rejected": -587.5722045898438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.880069732666016, "rewards/margins": 44.857032775878906, "rewards/rejected": -53.73710250854492, "step": 1479 }, { "epoch": 1.207670338637291, "grad_norm": 2.204751714529607e-09, "learning_rate": 5.0346672934270534e-08, "logits/chosen": -7.531141757965088, "logits/rejected": -7.22315788269043, "logps/chosen": -123.70738220214844, "logps/rejected": -593.9208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.743195533752441, "rewards/margins": 46.78544235229492, "rewards/rejected": -53.52864074707031, "step": 1480 }, { "epoch": 1.208486332109343, "grad_norm": 0.4200381934642792, "learning_rate": 4.543861596266741e-08, "logits/chosen": -6.521172523498535, "logits/rejected": -6.673562526702881, "logps/chosen": -104.3756103515625, "logps/rejected": -602.6777954101562, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -5.486727237701416, "rewards/margins": 49.47715377807617, "rewards/rejected": -54.96388244628906, "step": 1481 }, { "epoch": 1.2093023255813953, "grad_norm": 5.2924036680890296e-11, "learning_rate": 4.078210570127028e-08, "logits/chosen": -7.190937042236328, "logits/rejected": -6.765682220458984, "logps/chosen": -152.60385131835938, "logps/rejected": -660.5419311523438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.079051971435547, "rewards/margins": 51.778324127197266, "rewards/rejected": -59.85737609863281, "step": 1482 }, { "epoch": 1.2101183190534477, "grad_norm": 3.3718386698196323e-13, "learning_rate": 3.637716559798388e-08, "logits/chosen": -6.875710487365723, "logits/rejected": -6.298351764678955, "logps/chosen": -130.72438049316406, "logps/rejected": -606.9263305664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.393056869506836, "rewards/margins": 47.21990203857422, "rewards/rejected": -55.61296081542969, "step": 1483 }, { "epoch": 1.2109343125254999, "grad_norm": 2.6925879061112656e-13, "learning_rate": 3.2223817833931805e-08, "logits/chosen": -7.398077487945557, "logits/rejected": -7.597370624542236, "logps/chosen": -139.3125762939453, "logps/rejected": -690.7651977539062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.25338077545166, "rewards/margins": 54.778831481933594, "rewards/rejected": -62.0322151184082, "step": 1484 }, { "epoch": 1.211750305997552, "grad_norm": 1.212114501651651e-12, "learning_rate": 2.8322083323334413e-08, "logits/chosen": -7.3977155685424805, "logits/rejected": -6.963332176208496, "logps/chosen": -133.6049041748047, "logps/rejected": -612.2484741210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.551057815551758, "rewards/margins": 48.418758392333984, "rewards/rejected": -56.969818115234375, "step": 1485 }, { "epoch": 1.2125662994696043, "grad_norm": 0.0050335172563791275, "learning_rate": 2.467198171342e-08, "logits/chosen": -7.222658157348633, "logits/rejected": -6.848958492279053, "logps/chosen": -235.92703247070312, "logps/rejected": -564.3464965820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.859344482421875, "rewards/margins": 33.03126907348633, "rewards/rejected": -51.89061737060547, "step": 1486 }, { "epoch": 1.2133822929416564, "grad_norm": 4.838442328036763e-05, "learning_rate": 2.127353138431376e-08, "logits/chosen": -7.194638252258301, "logits/rejected": -7.175111770629883, "logps/chosen": -149.1226806640625, "logps/rejected": -612.5123291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.877973556518555, "rewards/margins": 45.490875244140625, "rewards/rejected": -54.36884307861328, "step": 1487 }, { "epoch": 1.2141982864137086, "grad_norm": 1.1357517010202045e-10, "learning_rate": 1.8126749448943437e-08, "logits/chosen": -6.515019416809082, "logits/rejected": -6.859869003295898, "logps/chosen": -161.1650390625, "logps/rejected": -580.9207153320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.05063533782959, "rewards/margins": 42.46636962890625, "rewards/rejected": -53.517005920410156, "step": 1488 }, { "epoch": 1.2150142798857608, "grad_norm": 2.685799716309334e-13, "learning_rate": 1.5231651752967147e-08, "logits/chosen": -7.6082444190979, "logits/rejected": -7.6985321044921875, "logps/chosen": -177.09568786621094, "logps/rejected": -602.12841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.227948188781738, "rewards/margins": 44.078094482421875, "rewards/rejected": -55.30603790283203, "step": 1489 }, { "epoch": 1.2158302733578132, "grad_norm": 3.343459731297571e-09, "learning_rate": 1.2588252874673468e-08, "logits/chosen": -6.898858070373535, "logits/rejected": -7.111300468444824, "logps/chosen": -113.2388916015625, "logps/rejected": -598.3736572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.259777545928955, "rewards/margins": 48.28276824951172, "rewards/rejected": -54.54254150390625, "step": 1490 }, { "epoch": 1.2166462668298654, "grad_norm": 1.1832387292187718e-09, "learning_rate": 1.019656612492592e-08, "logits/chosen": -6.428921699523926, "logits/rejected": -7.04888916015625, "logps/chosen": -174.02569580078125, "logps/rejected": -650.0184326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.784832000732422, "rewards/margins": 48.02080154418945, "rewards/rejected": -58.805633544921875, "step": 1491 }, { "epoch": 1.2174622603019176, "grad_norm": 1.44727755137014e-16, "learning_rate": 8.056603547090813e-09, "logits/chosen": -7.00877046585083, "logits/rejected": -6.881664752960205, "logps/chosen": -120.50566101074219, "logps/rejected": -644.47412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.684418201446533, "rewards/margins": 50.90438461303711, "rewards/rejected": -58.58879852294922, "step": 1492 }, { "epoch": 1.2182782537739698, "grad_norm": 2.4544415282434784e-05, "learning_rate": 6.1683759169706146e-09, "logits/chosen": -7.646150588989258, "logits/rejected": -6.649076461791992, "logps/chosen": -145.93173217773438, "logps/rejected": -537.1747436523438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.013772010803223, "rewards/margins": 39.8564338684082, "rewards/rejected": -48.87020492553711, "step": 1493 }, { "epoch": 1.219094247246022, "grad_norm": 4.491123792305096e-11, "learning_rate": 4.531892742754007e-09, "logits/chosen": -6.223931312561035, "logits/rejected": -6.679912567138672, "logps/chosen": -194.38128662109375, "logps/rejected": -711.8040771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.332050323486328, "rewards/margins": 52.30227279663086, "rewards/rejected": -65.63432312011719, "step": 1494 }, { "epoch": 1.2199102407180742, "grad_norm": 5.183977918932214e-06, "learning_rate": 3.1471622649714703e-09, "logits/chosen": -6.627361297607422, "logits/rejected": -6.217928886413574, "logps/chosen": -184.6697998046875, "logps/rejected": -702.2725830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.514232635498047, "rewards/margins": 52.21017074584961, "rewards/rejected": -63.72439956665039, "step": 1495 }, { "epoch": 1.2207262341901264, "grad_norm": 4.83949023077912e-08, "learning_rate": 2.0141914564453244e-09, "logits/chosen": -7.303642272949219, "logits/rejected": -7.500290393829346, "logps/chosen": -172.6779022216797, "logps/rejected": -597.4942016601562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.725220680236816, "rewards/margins": 42.158138275146484, "rewards/rejected": -55.88335418701172, "step": 1496 }, { "epoch": 1.2215422276621788, "grad_norm": 9.274863863040217e-12, "learning_rate": 1.1329860222619726e-09, "logits/chosen": -7.435050010681152, "logits/rejected": -6.186545372009277, "logps/chosen": -159.94981384277344, "logps/rejected": -581.063232421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.284784317016602, "rewards/margins": 43.292388916015625, "rewards/rejected": -53.57717514038086, "step": 1497 }, { "epoch": 1.222358221134231, "grad_norm": 2.3961367787705967e-06, "learning_rate": 5.035503997385949e-10, "logits/chosen": -6.3945770263671875, "logits/rejected": -7.151654243469238, "logps/chosen": -131.2822723388672, "logps/rejected": -554.4712524414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.033185005187988, "rewards/margins": 42.355472564697266, "rewards/rejected": -49.38865661621094, "step": 1498 }, { "epoch": 1.2231742146062832, "grad_norm": 3.805320947236623e-09, "learning_rate": 1.2588775841204658e-10, "logits/chosen": -7.1991801261901855, "logits/rejected": -7.507455348968506, "logps/chosen": -149.15185546875, "logps/rejected": -735.0498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.630273342132568, "rewards/margins": 57.29672622680664, "rewards/rejected": -64.927001953125, "step": 1499 }, { "epoch": 1.2239902080783354, "grad_norm": 3.645789004025346e-09, "learning_rate": 0.0, "logits/chosen": -6.253324508666992, "logits/rejected": -6.954267501831055, "logps/chosen": -196.44046020507812, "logps/rejected": -725.673583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.12429428100586, "rewards/margins": 53.6168327331543, "rewards/rejected": -65.74111938476562, "step": 1500 } ], "logging_steps": 1, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }