{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.85819455926932, "eval_steps": 2000, "global_step": 5900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003149482303846305, "grad_norm": 3.109375, "learning_rate": 5.000000000000001e-07, "logits/chosen": -0.45210394263267517, "logits/rejected": -0.3446429371833801, "logps/chosen": -213.57180786132812, "logps/rejected": -191.9154052734375, "loss": 0.6941, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.005887002218514681, "rewards/margins": -0.0011509137693792582, "rewards/rejected": 0.00703791668638587, "step": 10 }, { "epoch": 0.00629896460769261, "grad_norm": 2.640625, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -0.46016186475753784, "logits/rejected": -0.2938859760761261, "logps/chosen": -203.55313110351562, "logps/rejected": -186.91030883789062, "loss": 0.6917, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.006860991008579731, "rewards/margins": 0.004207834601402283, "rewards/rejected": 0.002653155941516161, "step": 20 }, { "epoch": 0.009448446911538916, "grad_norm": 2.703125, "learning_rate": 1.5e-06, "logits/chosen": -0.5505405068397522, "logits/rejected": -0.32200556993484497, "logps/chosen": -206.43130493164062, "logps/rejected": -159.43423461914062, "loss": 0.6978, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.004765630234032869, "rewards/margins": -0.00844950508326292, "rewards/rejected": 0.003683874849230051, "step": 30 }, { "epoch": 0.01259792921538522, "grad_norm": 3.078125, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -0.5206310749053955, "logits/rejected": -0.44945794343948364, "logps/chosen": -192.1409454345703, "logps/rejected": -195.43496704101562, "loss": 0.6938, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.014343030750751495, "rewards/margins": 0.0002199936716351658, "rewards/rejected": 0.014123037457466125, "step": 40 }, { "epoch": 0.015747411519231525, "grad_norm": 2.953125, "learning_rate": 2.5e-06, "logits/chosen": -0.5311203002929688, "logits/rejected": -0.4016449451446533, "logps/chosen": -208.75094604492188, "logps/rejected": -167.02542114257812, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.009670769795775414, "rewards/margins": 0.001526160747744143, "rewards/rejected": 0.00814460963010788, "step": 50 }, { "epoch": 0.018896893823077833, "grad_norm": 2.734375, "learning_rate": 3e-06, "logits/chosen": -0.5485053062438965, "logits/rejected": -0.43868058919906616, "logps/chosen": -196.2797088623047, "logps/rejected": -174.60232543945312, "loss": 0.6959, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.008773775771260262, "rewards/margins": -0.004444376099854708, "rewards/rejected": 0.013218151405453682, "step": 60 }, { "epoch": 0.022046376126924137, "grad_norm": 3.921875, "learning_rate": 3.5e-06, "logits/chosen": -0.5210541486740112, "logits/rejected": -0.37378597259521484, "logps/chosen": -205.04330444335938, "logps/rejected": -167.24517822265625, "loss": 0.6964, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.006323327776044607, "rewards/margins": -0.00504258181899786, "rewards/rejected": 0.011365910060703754, "step": 70 }, { "epoch": 0.02519585843077044, "grad_norm": 2.84375, "learning_rate": 4.000000000000001e-06, "logits/chosen": -0.4273603558540344, "logits/rejected": -0.31517690420150757, "logps/chosen": -206.41354370117188, "logps/rejected": -190.51402282714844, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": 0.012041259557008743, "rewards/margins": 0.001850623870268464, "rewards/rejected": 0.010190634056925774, "step": 80 }, { "epoch": 0.028345340734616746, "grad_norm": 3.015625, "learning_rate": 4.5e-06, "logits/chosen": -0.47311750054359436, "logits/rejected": -0.3815276026725769, "logps/chosen": -201.52731323242188, "logps/rejected": -173.32937622070312, "loss": 0.6893, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.016733495518565178, "rewards/margins": 0.009016195312142372, "rewards/rejected": 0.007717301603406668, "step": 90 }, { "epoch": 0.03149482303846305, "grad_norm": 2.953125, "learning_rate": 5e-06, "logits/chosen": -0.44015175104141235, "logits/rejected": -0.29738959670066833, "logps/chosen": -233.11642456054688, "logps/rejected": -192.25820922851562, "loss": 0.6937, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.02164524421095848, "rewards/margins": 0.0006244674441404641, "rewards/rejected": 0.021020779386162758, "step": 100 }, { "epoch": 0.034644305342309355, "grad_norm": 3.015625, "learning_rate": 4.999994443042687e-06, "logits/chosen": -0.4323144853115082, "logits/rejected": -0.325883150100708, "logps/chosen": -224.80685424804688, "logps/rejected": -206.5443572998047, "loss": 0.697, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.016110900789499283, "rewards/margins": -0.005870751105248928, "rewards/rejected": 0.021981652826070786, "step": 110 }, { "epoch": 0.037793787646155666, "grad_norm": 4.4375, "learning_rate": 4.999977772195451e-06, "logits/chosen": -0.47046566009521484, "logits/rejected": -0.31014284491539, "logps/chosen": -213.2732696533203, "logps/rejected": -178.27183532714844, "loss": 0.6867, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.034097231924533844, "rewards/margins": 0.01459626667201519, "rewards/rejected": 0.019500967115163803, "step": 120 }, { "epoch": 0.04094326995000197, "grad_norm": 3.46875, "learning_rate": 4.999949987532405e-06, "logits/chosen": -0.5328488349914551, "logits/rejected": -0.3397727906703949, "logps/chosen": -212.8068084716797, "logps/rejected": -165.73016357421875, "loss": 0.6904, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.03291865810751915, "rewards/margins": 0.0070328498259186745, "rewards/rejected": 0.02588580921292305, "step": 130 }, { "epoch": 0.044092752253848275, "grad_norm": 2.84375, "learning_rate": 4.9999110891770655e-06, "logits/chosen": -0.5087035894393921, "logits/rejected": -0.37333738803863525, "logps/chosen": -197.49069213867188, "logps/rejected": -164.01104736328125, "loss": 0.6848, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04153250902891159, "rewards/margins": 0.018097227439284325, "rewards/rejected": 0.023435279726982117, "step": 140 }, { "epoch": 0.04724223455769458, "grad_norm": 2.671875, "learning_rate": 4.999861077302358e-06, "logits/chosen": -0.5480197668075562, "logits/rejected": -0.39153584837913513, "logps/chosen": -197.01953125, "logps/rejected": -172.3303985595703, "loss": 0.6895, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.04789372906088829, "rewards/margins": 0.008711261674761772, "rewards/rejected": 0.03918246552348137, "step": 150 }, { "epoch": 0.05039171686154088, "grad_norm": 4.0, "learning_rate": 4.999799952130615e-06, "logits/chosen": -0.5040096044540405, "logits/rejected": -0.39294344186782837, "logps/chosen": -214.4684600830078, "logps/rejected": -185.05201721191406, "loss": 0.6902, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.043327994644641876, "rewards/margins": 0.007260638289153576, "rewards/rejected": 0.03606735169887543, "step": 160 }, { "epoch": 0.05354119916538719, "grad_norm": 2.328125, "learning_rate": 4.999727713933572e-06, "logits/chosen": -0.5509570837020874, "logits/rejected": -0.37693116068840027, "logps/chosen": -184.16673278808594, "logps/rejected": -162.15756225585938, "loss": 0.6923, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.047002751380205154, "rewards/margins": 0.003170955926179886, "rewards/rejected": 0.04383179172873497, "step": 170 }, { "epoch": 0.05669068146923349, "grad_norm": 2.8125, "learning_rate": 4.999644363032367e-06, "logits/chosen": -0.5072035193443298, "logits/rejected": -0.37213388085365295, "logps/chosen": -195.76577758789062, "logps/rejected": -164.03848266601562, "loss": 0.6903, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.05497163534164429, "rewards/margins": 0.007314900867640972, "rewards/rejected": 0.04765673726797104, "step": 180 }, { "epoch": 0.0598401637730798, "grad_norm": 3.734375, "learning_rate": 4.999549899797544e-06, "logits/chosen": -0.45937657356262207, "logits/rejected": -0.31247037649154663, "logps/chosen": -218.7798309326172, "logps/rejected": -182.95201110839844, "loss": 0.6905, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.052850522100925446, "rewards/margins": 0.006988201290369034, "rewards/rejected": 0.04586231708526611, "step": 190 }, { "epoch": 0.0629896460769261, "grad_norm": 2.8125, "learning_rate": 4.999444324649045e-06, "logits/chosen": -0.5390416979789734, "logits/rejected": -0.35482490062713623, "logps/chosen": -199.8936767578125, "logps/rejected": -165.85488891601562, "loss": 0.6856, "rewards/accuracies": 0.5625, "rewards/chosen": 0.07901380211114883, "rewards/margins": 0.01698421686887741, "rewards/rejected": 0.062029581516981125, "step": 200 }, { "epoch": 0.0661391283807724, "grad_norm": 3.046875, "learning_rate": 4.999327638056212e-06, "logits/chosen": -0.4826637804508209, "logits/rejected": -0.3051258623600006, "logps/chosen": -221.1772003173828, "logps/rejected": -173.49583435058594, "loss": 0.6906, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.07877618074417114, "rewards/margins": 0.007990067824721336, "rewards/rejected": 0.07078611105680466, "step": 210 }, { "epoch": 0.06928861068461871, "grad_norm": 2.6875, "learning_rate": 4.999199840537781e-06, "logits/chosen": -0.4253220558166504, "logits/rejected": -0.25771626830101013, "logps/chosen": -218.5417938232422, "logps/rejected": -180.00387573242188, "loss": 0.6865, "rewards/accuracies": 0.5, "rewards/chosen": 0.09241167455911636, "rewards/margins": 0.01505853421986103, "rewards/rejected": 0.07735314220190048, "step": 220 }, { "epoch": 0.07243809298846503, "grad_norm": 3.0, "learning_rate": 4.9990609326618845e-06, "logits/chosen": -0.468860924243927, "logits/rejected": -0.416407972574234, "logps/chosen": -227.7265625, "logps/rejected": -207.37020874023438, "loss": 0.689, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.10431376844644547, "rewards/margins": 0.01053541712462902, "rewards/rejected": 0.09377835690975189, "step": 230 }, { "epoch": 0.07558757529231133, "grad_norm": 3.046875, "learning_rate": 4.998910915046048e-06, "logits/chosen": -0.47573143243789673, "logits/rejected": -0.3487725853919983, "logps/chosen": -218.22030639648438, "logps/rejected": -186.4251708984375, "loss": 0.6898, "rewards/accuracies": 0.625, "rewards/chosen": 0.11783305555582047, "rewards/margins": 0.009454838000237942, "rewards/rejected": 0.10837821662425995, "step": 240 }, { "epoch": 0.07873705759615764, "grad_norm": 3.328125, "learning_rate": 4.998749788357184e-06, "logits/chosen": -0.491966187953949, "logits/rejected": -0.42401209473609924, "logps/chosen": -209.5353240966797, "logps/rejected": -191.00352478027344, "loss": 0.6781, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12841267883777618, "rewards/margins": 0.032561883330345154, "rewards/rejected": 0.09585078805685043, "step": 250 }, { "epoch": 0.08188653990000394, "grad_norm": 2.578125, "learning_rate": 4.998577553311592e-06, "logits/chosen": -0.5209980010986328, "logits/rejected": -0.3212158679962158, "logps/chosen": -207.7479705810547, "logps/rejected": -174.1891632080078, "loss": 0.681, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.12082584202289581, "rewards/margins": 0.02722765877842903, "rewards/rejected": 0.09359817951917648, "step": 260 }, { "epoch": 0.08503602220385025, "grad_norm": 2.671875, "learning_rate": 4.998394210674954e-06, "logits/chosen": -0.4800085127353668, "logits/rejected": -0.4280025064945221, "logps/chosen": -193.07815551757812, "logps/rejected": -190.64801025390625, "loss": 0.6958, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1191275492310524, "rewards/margins": -0.0025443979538977146, "rewards/rejected": 0.12167193740606308, "step": 270 }, { "epoch": 0.08818550450769655, "grad_norm": 3.015625, "learning_rate": 4.998199761262332e-06, "logits/chosen": -0.4940645694732666, "logits/rejected": -0.3298795521259308, "logps/chosen": -200.9434814453125, "logps/rejected": -179.53814697265625, "loss": 0.6842, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.13147035241127014, "rewards/margins": 0.02113470621407032, "rewards/rejected": 0.11033564805984497, "step": 280 }, { "epoch": 0.09133498681154285, "grad_norm": 2.375, "learning_rate": 4.997994205938164e-06, "logits/chosen": -0.5327965617179871, "logits/rejected": -0.38405364751815796, "logps/chosen": -214.1449432373047, "logps/rejected": -185.3954620361328, "loss": 0.6731, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16222958266735077, "rewards/margins": 0.0439109206199646, "rewards/rejected": 0.11831866204738617, "step": 290 }, { "epoch": 0.09448446911538916, "grad_norm": 2.671875, "learning_rate": 4.997777545616258e-06, "logits/chosen": -0.4295479655265808, "logits/rejected": -0.2999122738838196, "logps/chosen": -206.5615997314453, "logps/rejected": -172.51394653320312, "loss": 0.6781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.15420152246952057, "rewards/margins": 0.033724717795848846, "rewards/rejected": 0.12047679722309113, "step": 300 }, { "epoch": 0.09763395141923546, "grad_norm": 3.109375, "learning_rate": 4.9975497812597935e-06, "logits/chosen": -0.47075533866882324, "logits/rejected": -0.38670963048934937, "logps/chosen": -209.8960418701172, "logps/rejected": -182.66079711914062, "loss": 0.6837, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.18156033754348755, "rewards/margins": 0.02306472323834896, "rewards/rejected": 0.15849560499191284, "step": 310 }, { "epoch": 0.10078343372308177, "grad_norm": 3.21875, "learning_rate": 4.997310913881312e-06, "logits/chosen": -0.4988747239112854, "logits/rejected": -0.363254576921463, "logps/chosen": -202.0031280517578, "logps/rejected": -175.33151245117188, "loss": 0.6791, "rewards/accuracies": 0.5, "rewards/chosen": 0.1774149388074875, "rewards/margins": 0.03209572285413742, "rewards/rejected": 0.14531922340393066, "step": 320 }, { "epoch": 0.10393291602692807, "grad_norm": 2.875, "learning_rate": 4.997060944542713e-06, "logits/chosen": -0.5674048662185669, "logits/rejected": -0.4282529950141907, "logps/chosen": -204.34375, "logps/rejected": -169.42828369140625, "loss": 0.6705, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18822543323040009, "rewards/margins": 0.0504903681576252, "rewards/rejected": 0.13773508369922638, "step": 330 }, { "epoch": 0.10708239833077438, "grad_norm": 3.578125, "learning_rate": 4.996799874355253e-06, "logits/chosen": -0.548692524433136, "logits/rejected": -0.42939552664756775, "logps/chosen": -204.91748046875, "logps/rejected": -187.4248046875, "loss": 0.686, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.18558073043823242, "rewards/margins": 0.018990959972143173, "rewards/rejected": 0.16658978164196014, "step": 340 }, { "epoch": 0.11023188063462068, "grad_norm": 4.09375, "learning_rate": 4.996527704479535e-06, "logits/chosen": -0.43033042550086975, "logits/rejected": -0.3002139627933502, "logps/chosen": -215.57754516601562, "logps/rejected": -193.66397094726562, "loss": 0.6858, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.19084212183952332, "rewards/margins": 0.018678244203329086, "rewards/rejected": 0.17216388881206512, "step": 350 }, { "epoch": 0.11338136293846698, "grad_norm": 2.921875, "learning_rate": 4.9962444361255095e-06, "logits/chosen": -0.5352093577384949, "logits/rejected": -0.3528750538825989, "logps/chosen": -202.76707458496094, "logps/rejected": -155.83926391601562, "loss": 0.6774, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.20079512894153595, "rewards/margins": 0.03638342395424843, "rewards/rejected": 0.164411723613739, "step": 360 }, { "epoch": 0.11653084524231329, "grad_norm": 3.640625, "learning_rate": 4.995950070552464e-06, "logits/chosen": -0.5055617094039917, "logits/rejected": -0.3472587764263153, "logps/chosen": -225.160888671875, "logps/rejected": -181.47703552246094, "loss": 0.6912, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.21311064064502716, "rewards/margins": 0.00886852853000164, "rewards/rejected": 0.20424208045005798, "step": 370 }, { "epoch": 0.1196803275461596, "grad_norm": 2.828125, "learning_rate": 4.995644609069021e-06, "logits/chosen": -0.48927387595176697, "logits/rejected": -0.42719903588294983, "logps/chosen": -185.412109375, "logps/rejected": -181.981201171875, "loss": 0.6891, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1894209086894989, "rewards/margins": 0.012292629107832909, "rewards/rejected": 0.17712828516960144, "step": 380 }, { "epoch": 0.12282980985000591, "grad_norm": 3.1875, "learning_rate": 4.995328053033129e-06, "logits/chosen": -0.4682087302207947, "logits/rejected": -0.25708621740341187, "logps/chosen": -217.36709594726562, "logps/rejected": -174.3680877685547, "loss": 0.6665, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.22537508606910706, "rewards/margins": 0.057967256754636765, "rewards/rejected": 0.1674078404903412, "step": 390 }, { "epoch": 0.1259792921538522, "grad_norm": 3.515625, "learning_rate": 4.995000403852057e-06, "logits/chosen": -0.5313188433647156, "logits/rejected": -0.39059966802597046, "logps/chosen": -197.9477081298828, "logps/rejected": -174.9998779296875, "loss": 0.6761, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.19948481023311615, "rewards/margins": 0.038825444877147675, "rewards/rejected": 0.16065934300422668, "step": 400 }, { "epoch": 0.12912877445769852, "grad_norm": 2.4375, "learning_rate": 4.994661662982393e-06, "logits/chosen": -0.533206045627594, "logits/rejected": -0.4300476610660553, "logps/chosen": -192.5688934326172, "logps/rejected": -167.0782928466797, "loss": 0.684, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.19937366247177124, "rewards/margins": 0.022861123085021973, "rewards/rejected": 0.17651252448558807, "step": 410 }, { "epoch": 0.1322782567615448, "grad_norm": 3.796875, "learning_rate": 4.994311831930032e-06, "logits/chosen": -0.4812285006046295, "logits/rejected": -0.35317444801330566, "logps/chosen": -189.956787109375, "logps/rejected": -159.92453002929688, "loss": 0.6706, "rewards/accuracies": 0.625, "rewards/chosen": 0.23123323917388916, "rewards/margins": 0.05047137662768364, "rewards/rejected": 0.18076185882091522, "step": 420 }, { "epoch": 0.13542773906539113, "grad_norm": 3.1875, "learning_rate": 4.993950912250171e-06, "logits/chosen": -0.4360167384147644, "logits/rejected": -0.3796977400779724, "logps/chosen": -203.8537139892578, "logps/rejected": -175.59356689453125, "loss": 0.6806, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.21009023487567902, "rewards/margins": 0.030164580792188644, "rewards/rejected": 0.17992563545703888, "step": 430 }, { "epoch": 0.13857722136923742, "grad_norm": 2.921875, "learning_rate": 4.9935789055473e-06, "logits/chosen": -0.4947914183139801, "logits/rejected": -0.36814266443252563, "logps/chosen": -189.90982055664062, "logps/rejected": -170.28982543945312, "loss": 0.6788, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.22383634746074677, "rewards/margins": 0.033211078494787216, "rewards/rejected": 0.19062528014183044, "step": 440 }, { "epoch": 0.14172670367308374, "grad_norm": 2.734375, "learning_rate": 4.993195813475202e-06, "logits/chosen": -0.5398550629615784, "logits/rejected": -0.3718551993370056, "logps/chosen": -193.36532592773438, "logps/rejected": -169.635986328125, "loss": 0.6801, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2529357373714447, "rewards/margins": 0.03174880892038345, "rewards/rejected": 0.22118695080280304, "step": 450 }, { "epoch": 0.14487618597693006, "grad_norm": 2.859375, "learning_rate": 4.992801637736937e-06, "logits/chosen": -0.5186210870742798, "logits/rejected": -0.35956451296806335, "logps/chosen": -189.38392639160156, "logps/rejected": -168.08277893066406, "loss": 0.6769, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.2704620957374573, "rewards/margins": 0.03909246250987053, "rewards/rejected": 0.23136961460113525, "step": 460 }, { "epoch": 0.14802566828077635, "grad_norm": 3.609375, "learning_rate": 4.992396380084839e-06, "logits/chosen": -0.481611967086792, "logits/rejected": -0.3885895907878876, "logps/chosen": -212.15335083007812, "logps/rejected": -216.1449737548828, "loss": 0.6683, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.29815778136253357, "rewards/margins": 0.05739093944430351, "rewards/rejected": 0.24076685309410095, "step": 470 }, { "epoch": 0.15117515058462266, "grad_norm": 3.671875, "learning_rate": 4.991980042320507e-06, "logits/chosen": -0.515384316444397, "logits/rejected": -0.3729632794857025, "logps/chosen": -199.055908203125, "logps/rejected": -176.830322265625, "loss": 0.6822, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.27753692865371704, "rewards/margins": 0.03093332052230835, "rewards/rejected": 0.2466035783290863, "step": 480 }, { "epoch": 0.15432463288846895, "grad_norm": 3.171875, "learning_rate": 4.991552626294799e-06, "logits/chosen": -0.477985680103302, "logits/rejected": -0.3193301260471344, "logps/chosen": -205.2687530517578, "logps/rejected": -178.9701385498047, "loss": 0.6786, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.275654137134552, "rewards/margins": 0.03465648740530014, "rewards/rejected": 0.24099759757518768, "step": 490 }, { "epoch": 0.15747411519231527, "grad_norm": 2.953125, "learning_rate": 4.991114133907822e-06, "logits/chosen": -0.5235245823860168, "logits/rejected": -0.3859252631664276, "logps/chosen": -212.98876953125, "logps/rejected": -178.56101989746094, "loss": 0.6856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2918005585670471, "rewards/margins": 0.020267702639102936, "rewards/rejected": 0.2715328335762024, "step": 500 }, { "epoch": 0.16062359749616156, "grad_norm": 2.671875, "learning_rate": 4.99066456710892e-06, "logits/chosen": -0.5184639096260071, "logits/rejected": -0.3862496316432953, "logps/chosen": -194.21218872070312, "logps/rejected": -152.38967895507812, "loss": 0.6722, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.28834104537963867, "rewards/margins": 0.05140441656112671, "rewards/rejected": 0.23693661391735077, "step": 510 }, { "epoch": 0.16377307980000788, "grad_norm": 2.953125, "learning_rate": 4.990203927896674e-06, "logits/chosen": -0.5369669198989868, "logits/rejected": -0.42427778244018555, "logps/chosen": -195.22592163085938, "logps/rejected": -173.0403289794922, "loss": 0.6737, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.282554566860199, "rewards/margins": 0.04619991034269333, "rewards/rejected": 0.23635463416576385, "step": 520 }, { "epoch": 0.16692256210385417, "grad_norm": 2.609375, "learning_rate": 4.9897322183188855e-06, "logits/chosen": -0.49566903710365295, "logits/rejected": -0.42590633034706116, "logps/chosen": -193.32513427734375, "logps/rejected": -163.94113159179688, "loss": 0.68, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3135472536087036, "rewards/margins": 0.0341959223151207, "rewards/rejected": 0.2793513536453247, "step": 530 }, { "epoch": 0.1700720444077005, "grad_norm": 2.609375, "learning_rate": 4.989249440472569e-06, "logits/chosen": -0.4742864668369293, "logits/rejected": -0.361924409866333, "logps/chosen": -206.1411590576172, "logps/rejected": -177.32366943359375, "loss": 0.6616, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2787955701351166, "rewards/margins": 0.07065565139055252, "rewards/rejected": 0.20813994109630585, "step": 540 }, { "epoch": 0.17322152671154678, "grad_norm": 2.640625, "learning_rate": 4.988755596503948e-06, "logits/chosen": -0.47193408012390137, "logits/rejected": -0.32936495542526245, "logps/chosen": -212.5970916748047, "logps/rejected": -184.22251892089844, "loss": 0.6785, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.3025556802749634, "rewards/margins": 0.03635026142001152, "rewards/rejected": 0.26620543003082275, "step": 550 }, { "epoch": 0.1763710090153931, "grad_norm": 2.671875, "learning_rate": 4.988250688608436e-06, "logits/chosen": -0.5082842111587524, "logits/rejected": -0.38267362117767334, "logps/chosen": -196.55685424804688, "logps/rejected": -176.552734375, "loss": 0.6866, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.312252402305603, "rewards/margins": 0.022432830184698105, "rewards/rejected": 0.2898195683956146, "step": 560 }, { "epoch": 0.1795204913192394, "grad_norm": 3.203125, "learning_rate": 4.9877347190306354e-06, "logits/chosen": -0.44343939423561096, "logits/rejected": -0.38272932171821594, "logps/chosen": -187.27857971191406, "logps/rejected": -173.09884643554688, "loss": 0.7007, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.28175923228263855, "rewards/margins": -0.002676494885236025, "rewards/rejected": 0.2844357490539551, "step": 570 }, { "epoch": 0.1826699736230857, "grad_norm": 3.03125, "learning_rate": 4.987207690064323e-06, "logits/chosen": -0.47279614210128784, "logits/rejected": -0.31698504090309143, "logps/chosen": -226.852783203125, "logps/rejected": -187.92398071289062, "loss": 0.6668, "rewards/accuracies": 0.625, "rewards/chosen": 0.3296729326248169, "rewards/margins": 0.06299933046102524, "rewards/rejected": 0.26667362451553345, "step": 580 }, { "epoch": 0.185819455926932, "grad_norm": 2.796875, "learning_rate": 4.98666960405244e-06, "logits/chosen": -0.5369702577590942, "logits/rejected": -0.4153470993041992, "logps/chosen": -185.39688110351562, "logps/rejected": -150.71243286132812, "loss": 0.6733, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2983696460723877, "rewards/margins": 0.04920379817485809, "rewards/rejected": 0.24916581809520721, "step": 590 }, { "epoch": 0.18896893823077832, "grad_norm": 2.734375, "learning_rate": 4.986120463387084e-06, "logits/chosen": -0.4622046947479248, "logits/rejected": -0.3503243923187256, "logps/chosen": -191.92813110351562, "logps/rejected": -176.70423889160156, "loss": 0.6848, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.309493750333786, "rewards/margins": 0.02766551449894905, "rewards/rejected": 0.28182822465896606, "step": 600 }, { "epoch": 0.19211842053462463, "grad_norm": 2.96875, "learning_rate": 4.985560270509496e-06, "logits/chosen": -0.5258822441101074, "logits/rejected": -0.3977143168449402, "logps/chosen": -215.99563598632812, "logps/rejected": -186.25816345214844, "loss": 0.6636, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3478468358516693, "rewards/margins": 0.06890861690044403, "rewards/rejected": 0.2789382338523865, "step": 610 }, { "epoch": 0.19526790283847092, "grad_norm": 2.4375, "learning_rate": 4.9849890279100495e-06, "logits/chosen": -0.49815624952316284, "logits/rejected": -0.4273989796638489, "logps/chosen": -206.10385131835938, "logps/rejected": -180.3357391357422, "loss": 0.6795, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.3327508568763733, "rewards/margins": 0.03660760074853897, "rewards/rejected": 0.29614323377609253, "step": 620 }, { "epoch": 0.19841738514231724, "grad_norm": 3.140625, "learning_rate": 4.984406738128241e-06, "logits/chosen": -0.4096647799015045, "logits/rejected": -0.3559405505657196, "logps/chosen": -206.38064575195312, "logps/rejected": -185.5215301513672, "loss": 0.6733, "rewards/accuracies": 0.625, "rewards/chosen": 0.3558829426765442, "rewards/margins": 0.049683526158332825, "rewards/rejected": 0.3061993718147278, "step": 630 }, { "epoch": 0.20156686744616353, "grad_norm": 3.015625, "learning_rate": 4.9838134037526795e-06, "logits/chosen": -0.4695689082145691, "logits/rejected": -0.39185652136802673, "logps/chosen": -194.5779266357422, "logps/rejected": -173.6152801513672, "loss": 0.6702, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.33483070135116577, "rewards/margins": 0.05525987595319748, "rewards/rejected": 0.27957087755203247, "step": 640 }, { "epoch": 0.20471634975000985, "grad_norm": 2.453125, "learning_rate": 4.983209027421072e-06, "logits/chosen": -0.5027821063995361, "logits/rejected": -0.3183223009109497, "logps/chosen": -209.20291137695312, "logps/rejected": -171.07730102539062, "loss": 0.6453, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3705558776855469, "rewards/margins": 0.10951970517635345, "rewards/rejected": 0.2610361576080322, "step": 650 }, { "epoch": 0.20786583205385614, "grad_norm": 2.96875, "learning_rate": 4.982593611820211e-06, "logits/chosen": -0.5004459023475647, "logits/rejected": -0.40845292806625366, "logps/chosen": -183.89920043945312, "logps/rejected": -157.65817260742188, "loss": 0.6759, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.3168241083621979, "rewards/margins": 0.04429393634200096, "rewards/rejected": 0.27253013849258423, "step": 660 }, { "epoch": 0.21101531435770246, "grad_norm": 3.328125, "learning_rate": 4.981967159685969e-06, "logits/chosen": -0.5235536694526672, "logits/rejected": -0.3312370181083679, "logps/chosen": -212.1456298828125, "logps/rejected": -173.4841766357422, "loss": 0.6682, "rewards/accuracies": 0.625, "rewards/chosen": 0.37052249908447266, "rewards/margins": 0.06635678559541702, "rewards/rejected": 0.30416572093963623, "step": 670 }, { "epoch": 0.21416479666154875, "grad_norm": 2.90625, "learning_rate": 4.98132967380328e-06, "logits/chosen": -0.4822749197483063, "logits/rejected": -0.37933364510536194, "logps/chosen": -184.20840454101562, "logps/rejected": -165.41285705566406, "loss": 0.6753, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3370289206504822, "rewards/margins": 0.04510311037302017, "rewards/rejected": 0.2919257879257202, "step": 680 }, { "epoch": 0.21731427896539507, "grad_norm": 2.5625, "learning_rate": 4.980681157006129e-06, "logits/chosen": -0.45780807733535767, "logits/rejected": -0.3043513894081116, "logps/chosen": -217.8817901611328, "logps/rejected": -184.9659423828125, "loss": 0.6575, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.38531795144081116, "rewards/margins": 0.08670790493488312, "rewards/rejected": 0.29861006140708923, "step": 690 }, { "epoch": 0.22046376126924136, "grad_norm": 3.34375, "learning_rate": 4.9800216121775404e-06, "logits/chosen": -0.4495162069797516, "logits/rejected": -0.3232493996620178, "logps/chosen": -209.5262908935547, "logps/rejected": -175.12301635742188, "loss": 0.6529, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4109676778316498, "rewards/margins": 0.093905009329319, "rewards/rejected": 0.3170626759529114, "step": 700 }, { "epoch": 0.22361324357308768, "grad_norm": 2.40625, "learning_rate": 4.979351042249564e-06, "logits/chosen": -0.5166658163070679, "logits/rejected": -0.36778515577316284, "logps/chosen": -202.32473754882812, "logps/rejected": -175.3853759765625, "loss": 0.6596, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.3800323009490967, "rewards/margins": 0.07695108652114868, "rewards/rejected": 0.303081214427948, "step": 710 }, { "epoch": 0.22676272587693397, "grad_norm": 3.75, "learning_rate": 4.978669450203263e-06, "logits/chosen": -0.5254799127578735, "logits/rejected": -0.3579130470752716, "logps/chosen": -201.89544677734375, "logps/rejected": -170.9855194091797, "loss": 0.6548, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.41179266571998596, "rewards/margins": 0.09266375005245209, "rewards/rejected": 0.31912893056869507, "step": 720 }, { "epoch": 0.2299122081807803, "grad_norm": 2.5, "learning_rate": 4.977976839068699e-06, "logits/chosen": -0.41213518381118774, "logits/rejected": -0.37237733602523804, "logps/chosen": -193.40603637695312, "logps/rejected": -176.70425415039062, "loss": 0.6763, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3635534644126892, "rewards/margins": 0.05010410025715828, "rewards/rejected": 0.31344935297966003, "step": 730 }, { "epoch": 0.23306169048462658, "grad_norm": 3.140625, "learning_rate": 4.977273211924921e-06, "logits/chosen": -0.5329135060310364, "logits/rejected": -0.387803316116333, "logps/chosen": -230.49569702148438, "logps/rejected": -199.40280151367188, "loss": 0.6588, "rewards/accuracies": 0.625, "rewards/chosen": 0.4647158682346344, "rewards/margins": 0.08450464904308319, "rewards/rejected": 0.38021120429039, "step": 740 }, { "epoch": 0.2362111727884729, "grad_norm": 2.875, "learning_rate": 4.97655857189995e-06, "logits/chosen": -0.4675888419151306, "logits/rejected": -0.3698478639125824, "logps/chosen": -200.2988739013672, "logps/rejected": -172.2423858642578, "loss": 0.6704, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.38973450660705566, "rewards/margins": 0.05968126654624939, "rewards/rejected": 0.3300532102584839, "step": 750 }, { "epoch": 0.2393606550923192, "grad_norm": 3.5, "learning_rate": 4.975832922170765e-06, "logits/chosen": -0.48995572328567505, "logits/rejected": -0.3825104236602783, "logps/chosen": -199.5481414794922, "logps/rejected": -186.9451904296875, "loss": 0.6902, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.40951433777809143, "rewards/margins": 0.019469773396849632, "rewards/rejected": 0.39004451036453247, "step": 760 }, { "epoch": 0.2425101373961655, "grad_norm": 2.84375, "learning_rate": 4.9750962659632886e-06, "logits/chosen": -0.46572384238243103, "logits/rejected": -0.25734081864356995, "logps/chosen": -213.49441528320312, "logps/rejected": -179.05862426757812, "loss": 0.6589, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4372657835483551, "rewards/margins": 0.08767595142126083, "rewards/rejected": 0.34958982467651367, "step": 770 }, { "epoch": 0.24565961970001182, "grad_norm": 2.609375, "learning_rate": 4.974348606552377e-06, "logits/chosen": -0.49957194924354553, "logits/rejected": -0.44457465410232544, "logps/chosen": -174.76776123046875, "logps/rejected": -162.83163452148438, "loss": 0.6713, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.37218689918518066, "rewards/margins": 0.05890881270170212, "rewards/rejected": 0.31327807903289795, "step": 780 }, { "epoch": 0.2488091020038581, "grad_norm": 2.484375, "learning_rate": 4.973589947261797e-06, "logits/chosen": -0.43002453446388245, "logits/rejected": -0.26697424054145813, "logps/chosen": -214.3047637939453, "logps/rejected": -164.3812713623047, "loss": 0.6533, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4211522042751312, "rewards/margins": 0.09669376909732819, "rewards/rejected": 0.324458509683609, "step": 790 }, { "epoch": 0.2519585843077044, "grad_norm": 3.0625, "learning_rate": 4.972820291464219e-06, "logits/chosen": -0.5306814312934875, "logits/rejected": -0.3864821195602417, "logps/chosen": -212.0864715576172, "logps/rejected": -191.2584991455078, "loss": 0.6768, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4268336296081543, "rewards/margins": 0.04500560089945793, "rewards/rejected": 0.3818280100822449, "step": 800 }, { "epoch": 0.2551080666115507, "grad_norm": 2.78125, "learning_rate": 4.972039642581199e-06, "logits/chosen": -0.44598865509033203, "logits/rejected": -0.3495904505252838, "logps/chosen": -206.1003875732422, "logps/rejected": -180.69091796875, "loss": 0.6704, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4352218508720398, "rewards/margins": 0.06349250674247742, "rewards/rejected": 0.37172931432724, "step": 810 }, { "epoch": 0.25825754891539704, "grad_norm": 2.90625, "learning_rate": 4.9712480040831626e-06, "logits/chosen": -0.4507770538330078, "logits/rejected": -0.4034315049648285, "logps/chosen": -201.1985626220703, "logps/rejected": -188.18038940429688, "loss": 0.6691, "rewards/accuracies": 0.625, "rewards/chosen": 0.4719354212284088, "rewards/margins": 0.06965798884630203, "rewards/rejected": 0.40227746963500977, "step": 820 }, { "epoch": 0.26140703121924336, "grad_norm": 2.859375, "learning_rate": 4.9704453794893905e-06, "logits/chosen": -0.4594174921512604, "logits/rejected": -0.3546017110347748, "logps/chosen": -208.15328979492188, "logps/rejected": -175.2351531982422, "loss": 0.6596, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.4524504244327545, "rewards/margins": 0.08146923780441284, "rewards/rejected": 0.3709811568260193, "step": 830 }, { "epoch": 0.2645565135230896, "grad_norm": 2.9375, "learning_rate": 4.969631772368005e-06, "logits/chosen": -0.44061025977134705, "logits/rejected": -0.3553173840045929, "logps/chosen": -204.01336669921875, "logps/rejected": -194.84194946289062, "loss": 0.683, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4293643534183502, "rewards/margins": 0.03097158670425415, "rewards/rejected": 0.39839276671409607, "step": 840 }, { "epoch": 0.26770599582693594, "grad_norm": 2.734375, "learning_rate": 4.968807186335948e-06, "logits/chosen": -0.45513710379600525, "logits/rejected": -0.35907578468322754, "logps/chosen": -187.94784545898438, "logps/rejected": -163.60385131835938, "loss": 0.6449, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.4533543586730957, "rewards/margins": 0.11502598226070404, "rewards/rejected": 0.3383283317089081, "step": 850 }, { "epoch": 0.27085547813078226, "grad_norm": 2.34375, "learning_rate": 4.9679716250589726e-06, "logits/chosen": -0.53230220079422, "logits/rejected": -0.36705273389816284, "logps/chosen": -214.5038299560547, "logps/rejected": -173.00045776367188, "loss": 0.6439, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4622381627559662, "rewards/margins": 0.11802558600902557, "rewards/rejected": 0.3442125916481018, "step": 860 }, { "epoch": 0.2740049604346286, "grad_norm": 2.703125, "learning_rate": 4.96712509225162e-06, "logits/chosen": -0.4010530412197113, "logits/rejected": -0.40078672766685486, "logps/chosen": -189.293212890625, "logps/rejected": -190.86239624023438, "loss": 0.7079, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.3847641050815582, "rewards/margins": -0.01615377888083458, "rewards/rejected": 0.4009179174900055, "step": 870 }, { "epoch": 0.27715444273847484, "grad_norm": 2.3125, "learning_rate": 4.966267591677209e-06, "logits/chosen": -0.47001272439956665, "logits/rejected": -0.36287882924079895, "logps/chosen": -193.36114501953125, "logps/rejected": -163.00997924804688, "loss": 0.6561, "rewards/accuracies": 0.625, "rewards/chosen": 0.412198543548584, "rewards/margins": 0.08650766313076019, "rewards/rejected": 0.3256909251213074, "step": 880 }, { "epoch": 0.28030392504232116, "grad_norm": 3.40625, "learning_rate": 4.965399127147814e-06, "logits/chosen": -0.5800660848617554, "logits/rejected": -0.4225196838378906, "logps/chosen": -199.9794158935547, "logps/rejected": -175.31942749023438, "loss": 0.6812, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4509803354740143, "rewards/margins": 0.04268043860793114, "rewards/rejected": 0.40829986333847046, "step": 890 }, { "epoch": 0.2834534073461675, "grad_norm": 2.890625, "learning_rate": 4.964519702524251e-06, "logits/chosen": -0.48715677857398987, "logits/rejected": -0.3676280677318573, "logps/chosen": -206.2698974609375, "logps/rejected": -180.06561279296875, "loss": 0.6616, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4541000425815582, "rewards/margins": 0.08144226670265198, "rewards/rejected": 0.37265777587890625, "step": 900 }, { "epoch": 0.2866028896500138, "grad_norm": 2.75, "learning_rate": 4.9636293217160615e-06, "logits/chosen": -0.4000244736671448, "logits/rejected": -0.37658295035362244, "logps/chosen": -201.72274780273438, "logps/rejected": -197.0117645263672, "loss": 0.708, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.4207178056240082, "rewards/margins": -0.009672733955085278, "rewards/rejected": 0.43039053678512573, "step": 910 }, { "epoch": 0.2897523719538601, "grad_norm": 2.15625, "learning_rate": 4.96272798868149e-06, "logits/chosen": -0.5054947137832642, "logits/rejected": -0.3827149271965027, "logps/chosen": -193.49334716796875, "logps/rejected": -166.5660858154297, "loss": 0.6608, "rewards/accuracies": 0.625, "rewards/chosen": 0.4493057131767273, "rewards/margins": 0.08539381623268127, "rewards/rejected": 0.363911896944046, "step": 920 }, { "epoch": 0.2929018542577064, "grad_norm": 2.90625, "learning_rate": 4.961815707427473e-06, "logits/chosen": -0.4878782629966736, "logits/rejected": -0.34200000762939453, "logps/chosen": -192.65780639648438, "logps/rejected": -171.5297393798828, "loss": 0.6809, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.43513813614845276, "rewards/margins": 0.04086482524871826, "rewards/rejected": 0.3942733108997345, "step": 930 }, { "epoch": 0.2960513365615527, "grad_norm": 3.046875, "learning_rate": 4.960892482009617e-06, "logits/chosen": -0.5498200058937073, "logits/rejected": -0.3890989422798157, "logps/chosen": -217.5938262939453, "logps/rejected": -185.64468383789062, "loss": 0.6572, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4587416648864746, "rewards/margins": 0.08828563988208771, "rewards/rejected": 0.3704560697078705, "step": 940 }, { "epoch": 0.299200818865399, "grad_norm": 3.1875, "learning_rate": 4.959958316532181e-06, "logits/chosen": -0.4381844103336334, "logits/rejected": -0.39938193559646606, "logps/chosen": -202.2256622314453, "logps/rejected": -184.60812377929688, "loss": 0.6555, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.47266441583633423, "rewards/margins": 0.08841115981340408, "rewards/rejected": 0.38425326347351074, "step": 950 }, { "epoch": 0.30235030116924533, "grad_norm": 2.875, "learning_rate": 4.959013215148059e-06, "logits/chosen": -0.516729474067688, "logits/rejected": -0.37376007437705994, "logps/chosen": -206.91470336914062, "logps/rejected": -165.06771850585938, "loss": 0.6483, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.48910826444625854, "rewards/margins": 0.11284583806991577, "rewards/rejected": 0.3762624263763428, "step": 960 }, { "epoch": 0.3054997834730916, "grad_norm": 2.265625, "learning_rate": 4.958057182058763e-06, "logits/chosen": -0.4985506534576416, "logits/rejected": -0.3460918366909027, "logps/chosen": -204.8501434326172, "logps/rejected": -156.81150817871094, "loss": 0.6317, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5237672924995422, "rewards/margins": 0.14426377415657043, "rewards/rejected": 0.3795034885406494, "step": 970 }, { "epoch": 0.3086492657769379, "grad_norm": 2.421875, "learning_rate": 4.957090221514399e-06, "logits/chosen": -0.44721898436546326, "logits/rejected": -0.3173277676105499, "logps/chosen": -204.72488403320312, "logps/rejected": -172.4529571533203, "loss": 0.6822, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.43838948011398315, "rewards/margins": 0.04825422167778015, "rewards/rejected": 0.3901353180408478, "step": 980 }, { "epoch": 0.3117987480807842, "grad_norm": 3.53125, "learning_rate": 4.956112337813655e-06, "logits/chosen": -0.5343712568283081, "logits/rejected": -0.42165470123291016, "logps/chosen": -196.4682159423828, "logps/rejected": -159.38626098632812, "loss": 0.6411, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.49275222420692444, "rewards/margins": 0.12367204576730728, "rewards/rejected": 0.369080126285553, "step": 990 }, { "epoch": 0.31494823038463055, "grad_norm": 3.046875, "learning_rate": 4.955123535303775e-06, "logits/chosen": -0.5008007884025574, "logits/rejected": -0.32471761107444763, "logps/chosen": -215.82382202148438, "logps/rejected": -170.16177368164062, "loss": 0.6278, "rewards/accuracies": 0.6875, "rewards/chosen": 0.555659294128418, "rewards/margins": 0.15915410220623016, "rewards/rejected": 0.3965051770210266, "step": 1000 }, { "epoch": 0.3180977126884768, "grad_norm": 2.8125, "learning_rate": 4.95412381838055e-06, "logits/chosen": -0.45208463072776794, "logits/rejected": -0.3023605942726135, "logps/chosen": -211.84561157226562, "logps/rejected": -175.75242614746094, "loss": 0.6482, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5254601836204529, "rewards/margins": 0.1151600107550621, "rewards/rejected": 0.41030019521713257, "step": 1010 }, { "epoch": 0.3212471949923231, "grad_norm": 3.34375, "learning_rate": 4.953113191488284e-06, "logits/chosen": -0.5526930093765259, "logits/rejected": -0.4031393527984619, "logps/chosen": -200.52316284179688, "logps/rejected": -160.85092163085938, "loss": 0.6635, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.47668904066085815, "rewards/margins": 0.08161365985870361, "rewards/rejected": 0.3950754404067993, "step": 1020 }, { "epoch": 0.32439667729616944, "grad_norm": 2.375, "learning_rate": 4.9520916591197865e-06, "logits/chosen": -0.49668893218040466, "logits/rejected": -0.3756228983402252, "logps/chosen": -201.33413696289062, "logps/rejected": -169.83847045898438, "loss": 0.6651, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5008379220962524, "rewards/margins": 0.08502224832773209, "rewards/rejected": 0.41581565141677856, "step": 1030 }, { "epoch": 0.32754615960001576, "grad_norm": 3.078125, "learning_rate": 4.951059225816347e-06, "logits/chosen": -0.4749979078769684, "logits/rejected": -0.30828791856765747, "logps/chosen": -224.30178833007812, "logps/rejected": -178.66122436523438, "loss": 0.6517, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5890859961509705, "rewards/margins": 0.12044923007488251, "rewards/rejected": 0.46863681077957153, "step": 1040 }, { "epoch": 0.330695641903862, "grad_norm": 2.203125, "learning_rate": 4.950015896167716e-06, "logits/chosen": -0.47923216223716736, "logits/rejected": -0.28145602345466614, "logps/chosen": -189.4966583251953, "logps/rejected": -161.24319458007812, "loss": 0.6675, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4908905029296875, "rewards/margins": 0.07855306565761566, "rewards/rejected": 0.41233739256858826, "step": 1050 }, { "epoch": 0.33384512420770834, "grad_norm": 2.484375, "learning_rate": 4.948961674812083e-06, "logits/chosen": -0.49384012818336487, "logits/rejected": -0.36533522605895996, "logps/chosen": -198.85769653320312, "logps/rejected": -174.79847717285156, "loss": 0.6573, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5101815462112427, "rewards/margins": 0.08699540048837662, "rewards/rejected": 0.42318612337112427, "step": 1060 }, { "epoch": 0.33699460651155466, "grad_norm": 2.46875, "learning_rate": 4.9478965664360595e-06, "logits/chosen": -0.5174117088317871, "logits/rejected": -0.3790570795536041, "logps/chosen": -199.65078735351562, "logps/rejected": -159.61801147460938, "loss": 0.6417, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5138002634048462, "rewards/margins": 0.12827709317207336, "rewards/rejected": 0.38552325963974, "step": 1070 }, { "epoch": 0.340144088815401, "grad_norm": 2.34375, "learning_rate": 4.946820575774654e-06, "logits/chosen": -0.4343532919883728, "logits/rejected": -0.31658726930618286, "logps/chosen": -199.5583038330078, "logps/rejected": -162.34683227539062, "loss": 0.6508, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5402163863182068, "rewards/margins": 0.12262473255395889, "rewards/rejected": 0.41759172081947327, "step": 1080 }, { "epoch": 0.3432935711192473, "grad_norm": 2.65625, "learning_rate": 4.945733707611256e-06, "logits/chosen": -0.42169666290283203, "logits/rejected": -0.27337223291397095, "logps/chosen": -210.4072265625, "logps/rejected": -176.20767211914062, "loss": 0.6267, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6182594299316406, "rewards/margins": 0.16625744104385376, "rewards/rejected": 0.45200204849243164, "step": 1090 }, { "epoch": 0.34644305342309356, "grad_norm": 2.234375, "learning_rate": 4.944635966777607e-06, "logits/chosen": -0.5145076513290405, "logits/rejected": -0.3866254985332489, "logps/chosen": -186.8591766357422, "logps/rejected": -147.5457305908203, "loss": 0.6267, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5492256283760071, "rewards/margins": 0.15818223357200623, "rewards/rejected": 0.39104336500167847, "step": 1100 }, { "epoch": 0.3495925357269399, "grad_norm": 2.796875, "learning_rate": 4.943527358153787e-06, "logits/chosen": -0.4312458634376526, "logits/rejected": -0.2944316267967224, "logps/chosen": -176.56130981445312, "logps/rejected": -144.69210815429688, "loss": 0.6711, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5271092057228088, "rewards/margins": 0.074883833527565, "rewards/rejected": 0.45222535729408264, "step": 1110 }, { "epoch": 0.3527420180307862, "grad_norm": 3.25, "learning_rate": 4.942407886668189e-06, "logits/chosen": -0.4668591022491455, "logits/rejected": -0.4000996947288513, "logps/chosen": -192.39215087890625, "logps/rejected": -181.41018676757812, "loss": 0.6692, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5816969871520996, "rewards/margins": 0.07639260590076447, "rewards/rejected": 0.5053043365478516, "step": 1120 }, { "epoch": 0.3558915003346325, "grad_norm": 2.421875, "learning_rate": 4.941277557297497e-06, "logits/chosen": -0.5000630617141724, "logits/rejected": -0.3792869448661804, "logps/chosen": -201.1351776123047, "logps/rejected": -163.50692749023438, "loss": 0.6669, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5100800395011902, "rewards/margins": 0.07874707877635956, "rewards/rejected": 0.4313329756259918, "step": 1130 }, { "epoch": 0.3590409826384788, "grad_norm": 3.015625, "learning_rate": 4.940136375066664e-06, "logits/chosen": -0.5313334465026855, "logits/rejected": -0.4073900580406189, "logps/chosen": -195.81436157226562, "logps/rejected": -165.0396728515625, "loss": 0.6488, "rewards/accuracies": 0.625, "rewards/chosen": 0.5570641756057739, "rewards/margins": 0.12459216266870499, "rewards/rejected": 0.4324720501899719, "step": 1140 }, { "epoch": 0.3621904649423251, "grad_norm": 2.921875, "learning_rate": 4.938984345048892e-06, "logits/chosen": -0.47552186250686646, "logits/rejected": -0.3426175117492676, "logps/chosen": -226.5735626220703, "logps/rejected": -185.11488342285156, "loss": 0.6607, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6292687654495239, "rewards/margins": 0.10300488770008087, "rewards/rejected": 0.5262638926506042, "step": 1150 }, { "epoch": 0.3653399472461714, "grad_norm": 2.890625, "learning_rate": 4.937821472365606e-06, "logits/chosen": -0.45469918847084045, "logits/rejected": -0.2649703621864319, "logps/chosen": -203.91030883789062, "logps/rejected": -154.42904663085938, "loss": 0.6364, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.614439845085144, "rewards/margins": 0.14579293131828308, "rewards/rejected": 0.46864691376686096, "step": 1160 }, { "epoch": 0.36848942955001773, "grad_norm": 2.546875, "learning_rate": 4.9366477621864325e-06, "logits/chosen": -0.45428067445755005, "logits/rejected": -0.38431745767593384, "logps/chosen": -193.93759155273438, "logps/rejected": -166.46226501464844, "loss": 0.6829, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.5578614473342896, "rewards/margins": 0.04849938303232193, "rewards/rejected": 0.5093621015548706, "step": 1170 }, { "epoch": 0.371638911853864, "grad_norm": 2.984375, "learning_rate": 4.935463219729178e-06, "logits/chosen": -0.43662381172180176, "logits/rejected": -0.3796136975288391, "logps/chosen": -208.486083984375, "logps/rejected": -186.07241821289062, "loss": 0.6391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5909144878387451, "rewards/margins": 0.12893392145633698, "rewards/rejected": 0.46198058128356934, "step": 1180 }, { "epoch": 0.3747883941577103, "grad_norm": 2.640625, "learning_rate": 4.934267850259802e-06, "logits/chosen": -0.5133141875267029, "logits/rejected": -0.36373409628868103, "logps/chosen": -194.3529815673828, "logps/rejected": -159.5540771484375, "loss": 0.6617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5327507853507996, "rewards/margins": 0.08738649636507034, "rewards/rejected": 0.4453642964363098, "step": 1190 }, { "epoch": 0.37793787646155663, "grad_norm": 3.15625, "learning_rate": 4.933061659092401e-06, "logits/chosen": -0.45688313245773315, "logits/rejected": -0.3101183772087097, "logps/chosen": -197.06399536132812, "logps/rejected": -178.8423309326172, "loss": 0.6921, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.5263506174087524, "rewards/margins": 0.027934294193983078, "rewards/rejected": 0.4984162747859955, "step": 1200 }, { "epoch": 0.38108735876540295, "grad_norm": 2.953125, "learning_rate": 4.931844651589176e-06, "logits/chosen": -0.3927622437477112, "logits/rejected": -0.3346686065196991, "logps/chosen": -192.22146606445312, "logps/rejected": -177.29522705078125, "loss": 0.6793, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5813957452774048, "rewards/margins": 0.05229301005601883, "rewards/rejected": 0.5291028022766113, "step": 1210 }, { "epoch": 0.38423684106924927, "grad_norm": 2.71875, "learning_rate": 4.930616833160414e-06, "logits/chosen": -0.503675639629364, "logits/rejected": -0.3028663694858551, "logps/chosen": -209.7771759033203, "logps/rejected": -166.1737060546875, "loss": 0.6183, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.6333845853805542, "rewards/margins": 0.17845922708511353, "rewards/rejected": 0.45492544770240784, "step": 1220 }, { "epoch": 0.38738632337309553, "grad_norm": 2.359375, "learning_rate": 4.929378209264464e-06, "logits/chosen": -0.4029599130153656, "logits/rejected": -0.2958356738090515, "logps/chosen": -201.47354125976562, "logps/rejected": -175.35316467285156, "loss": 0.6546, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6188668012619019, "rewards/margins": 0.10686089843511581, "rewards/rejected": 0.5120059251785278, "step": 1230 }, { "epoch": 0.39053580567694185, "grad_norm": 3.421875, "learning_rate": 4.9281287854077075e-06, "logits/chosen": -0.5093222856521606, "logits/rejected": -0.3469759523868561, "logps/chosen": -203.4248809814453, "logps/rejected": -168.985595703125, "loss": 0.6675, "rewards/accuracies": 0.625, "rewards/chosen": 0.5747894048690796, "rewards/margins": 0.08418375253677368, "rewards/rejected": 0.4906056821346283, "step": 1240 }, { "epoch": 0.39368528798078817, "grad_norm": 3.140625, "learning_rate": 4.926868567144543e-06, "logits/chosen": -0.43734902143478394, "logits/rejected": -0.27301496267318726, "logps/chosen": -217.2354736328125, "logps/rejected": -179.50477600097656, "loss": 0.6439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.602939784526825, "rewards/margins": 0.13366705179214478, "rewards/rejected": 0.46927279233932495, "step": 1250 }, { "epoch": 0.3968347702846345, "grad_norm": 2.015625, "learning_rate": 4.9255975600773506e-06, "logits/chosen": -0.46365243196487427, "logits/rejected": -0.369037002325058, "logps/chosen": -193.8307647705078, "logps/rejected": -167.16226196289062, "loss": 0.6528, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5885978937149048, "rewards/margins": 0.11199425160884857, "rewards/rejected": 0.4766036570072174, "step": 1260 }, { "epoch": 0.39998425258848075, "grad_norm": 2.859375, "learning_rate": 4.92431576985648e-06, "logits/chosen": -0.5200837850570679, "logits/rejected": -0.3553524315357208, "logps/chosen": -207.01754760742188, "logps/rejected": -175.571533203125, "loss": 0.6289, "rewards/accuracies": 0.625, "rewards/chosen": 0.6721639633178711, "rewards/margins": 0.16469912230968475, "rewards/rejected": 0.5074647665023804, "step": 1270 }, { "epoch": 0.40313373489232707, "grad_norm": 2.828125, "learning_rate": 4.9230232021802116e-06, "logits/chosen": -0.4531930088996887, "logits/rejected": -0.33995673060417175, "logps/chosen": -182.60243225097656, "logps/rejected": -167.54733276367188, "loss": 0.6632, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5690507292747498, "rewards/margins": 0.08255408704280853, "rewards/rejected": 0.4864966869354248, "step": 1280 }, { "epoch": 0.4062832171961734, "grad_norm": 2.421875, "learning_rate": 4.921719862794741e-06, "logits/chosen": -0.5172183513641357, "logits/rejected": -0.43691587448120117, "logps/chosen": -198.16488647460938, "logps/rejected": -176.88308715820312, "loss": 0.6577, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5772544741630554, "rewards/margins": 0.09580488502979279, "rewards/rejected": 0.4814496636390686, "step": 1290 }, { "epoch": 0.4094326995000197, "grad_norm": 3.09375, "learning_rate": 4.920405757494147e-06, "logits/chosen": -0.5242056846618652, "logits/rejected": -0.41546088457107544, "logps/chosen": -205.5856475830078, "logps/rejected": -165.4510040283203, "loss": 0.6403, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6132642030715942, "rewards/margins": 0.13079342246055603, "rewards/rejected": 0.4824707508087158, "step": 1300 }, { "epoch": 0.41258218180386597, "grad_norm": 3.78125, "learning_rate": 4.919080892120375e-06, "logits/chosen": -0.4625665247440338, "logits/rejected": -0.36404842138290405, "logps/chosen": -186.5456085205078, "logps/rejected": -172.53269958496094, "loss": 0.6757, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.5738548040390015, "rewards/margins": 0.07151280343532562, "rewards/rejected": 0.5023420453071594, "step": 1310 }, { "epoch": 0.4157316641077123, "grad_norm": 2.21875, "learning_rate": 4.917745272563198e-06, "logits/chosen": -0.514171302318573, "logits/rejected": -0.4131143093109131, "logps/chosen": -190.60104370117188, "logps/rejected": -163.16876220703125, "loss": 0.6377, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.644982635974884, "rewards/margins": 0.149316668510437, "rewards/rejected": 0.49566593766212463, "step": 1320 }, { "epoch": 0.4188811464115586, "grad_norm": 3.25, "learning_rate": 4.916398904760202e-06, "logits/chosen": -0.407987117767334, "logits/rejected": -0.3450384736061096, "logps/chosen": -204.9993133544922, "logps/rejected": -177.96536254882812, "loss": 0.6839, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.6174831390380859, "rewards/margins": 0.06308461725711823, "rewards/rejected": 0.5543986558914185, "step": 1330 }, { "epoch": 0.4220306287154049, "grad_norm": 3.359375, "learning_rate": 4.915041794696755e-06, "logits/chosen": -0.4585428833961487, "logits/rejected": -0.32048022747039795, "logps/chosen": -205.7061004638672, "logps/rejected": -188.31765747070312, "loss": 0.6532, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6447404623031616, "rewards/margins": 0.11661551147699356, "rewards/rejected": 0.5281249284744263, "step": 1340 }, { "epoch": 0.4251801110192512, "grad_norm": 2.921875, "learning_rate": 4.913673948405977e-06, "logits/chosen": -0.425508975982666, "logits/rejected": -0.3008427619934082, "logps/chosen": -188.8653564453125, "logps/rejected": -154.46444702148438, "loss": 0.6551, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.5882565379142761, "rewards/margins": 0.11206309497356415, "rewards/rejected": 0.4761934280395508, "step": 1350 }, { "epoch": 0.4283295933230975, "grad_norm": 2.484375, "learning_rate": 4.91229537196872e-06, "logits/chosen": -0.466022253036499, "logits/rejected": -0.40916579961776733, "logps/chosen": -188.9302978515625, "logps/rejected": -176.31539916992188, "loss": 0.6762, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.5576030015945435, "rewards/margins": 0.059089891612529755, "rewards/rejected": 0.4985131323337555, "step": 1360 }, { "epoch": 0.4314790756269438, "grad_norm": 2.203125, "learning_rate": 4.910906071513536e-06, "logits/chosen": -0.468529611825943, "logits/rejected": -0.3725103735923767, "logps/chosen": -189.84571838378906, "logps/rejected": -157.89552307128906, "loss": 0.6576, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.5729373693466187, "rewards/margins": 0.10841979831457138, "rewards/rejected": 0.46451759338378906, "step": 1370 }, { "epoch": 0.43462855793079014, "grad_norm": 2.578125, "learning_rate": 4.9095060532166515e-06, "logits/chosen": -0.4988088607788086, "logits/rejected": -0.3795704245567322, "logps/chosen": -187.31626892089844, "logps/rejected": -166.83358764648438, "loss": 0.6576, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5874306559562683, "rewards/margins": 0.10258360207080841, "rewards/rejected": 0.4848470687866211, "step": 1380 }, { "epoch": 0.43777804023463646, "grad_norm": 2.875, "learning_rate": 4.90809532330194e-06, "logits/chosen": -0.44124871492385864, "logits/rejected": -0.33731183409690857, "logps/chosen": -193.7071990966797, "logps/rejected": -182.87266540527344, "loss": 0.6483, "rewards/accuracies": 0.625, "rewards/chosen": 0.6297098398208618, "rewards/margins": 0.11017205566167831, "rewards/rejected": 0.5195378065109253, "step": 1390 }, { "epoch": 0.4409275225384827, "grad_norm": 2.75, "learning_rate": 4.906673888040895e-06, "logits/chosen": -0.470440149307251, "logits/rejected": -0.40894460678100586, "logps/chosen": -201.374755859375, "logps/rejected": -162.78060913085938, "loss": 0.6548, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6208814382553101, "rewards/margins": 0.12041078507900238, "rewards/rejected": 0.5004706382751465, "step": 1400 }, { "epoch": 0.44407700484232904, "grad_norm": 3.234375, "learning_rate": 4.905241753752599e-06, "logits/chosen": -0.45281878113746643, "logits/rejected": -0.3816523253917694, "logps/chosen": -187.10189819335938, "logps/rejected": -180.3949737548828, "loss": 0.6594, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6287654638290405, "rewards/margins": 0.09312457591295242, "rewards/rejected": 0.5356410145759583, "step": 1410 }, { "epoch": 0.44722648714617536, "grad_norm": 2.390625, "learning_rate": 4.903798926803701e-06, "logits/chosen": -0.5540148615837097, "logits/rejected": -0.31469932198524475, "logps/chosen": -195.7555694580078, "logps/rejected": -160.02955627441406, "loss": 0.6663, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5776636004447937, "rewards/margins": 0.08535424619913101, "rewards/rejected": 0.4923093318939209, "step": 1420 }, { "epoch": 0.4503759694500217, "grad_norm": 3.03125, "learning_rate": 4.902345413608382e-06, "logits/chosen": -0.5317307710647583, "logits/rejected": -0.4287118911743164, "logps/chosen": -212.67172241210938, "logps/rejected": -184.7850799560547, "loss": 0.6522, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6378628015518188, "rewards/margins": 0.11098313331604004, "rewards/rejected": 0.5268796682357788, "step": 1430 }, { "epoch": 0.45352545175386794, "grad_norm": 2.890625, "learning_rate": 4.900881220628332e-06, "logits/chosen": -0.4813712537288666, "logits/rejected": -0.39985647797584534, "logps/chosen": -198.0911407470703, "logps/rejected": -183.64688110351562, "loss": 0.6738, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.603533148765564, "rewards/margins": 0.07522164285182953, "rewards/rejected": 0.528311550617218, "step": 1440 }, { "epoch": 0.45667493405771425, "grad_norm": 2.703125, "learning_rate": 4.899406354372716e-06, "logits/chosen": -0.48485565185546875, "logits/rejected": -0.335989385843277, "logps/chosen": -213.813720703125, "logps/rejected": -169.41334533691406, "loss": 0.6284, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.667735755443573, "rewards/margins": 0.18240274488925934, "rewards/rejected": 0.48533302545547485, "step": 1450 }, { "epoch": 0.4598244163615606, "grad_norm": 3.015625, "learning_rate": 4.897920821398149e-06, "logits/chosen": -0.36127400398254395, "logits/rejected": -0.33091285824775696, "logps/chosen": -198.51376342773438, "logps/rejected": -179.6190643310547, "loss": 0.6766, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6157578825950623, "rewards/margins": 0.06863512098789215, "rewards/rejected": 0.5471227765083313, "step": 1460 }, { "epoch": 0.4629738986654069, "grad_norm": 2.34375, "learning_rate": 4.896424628308666e-06, "logits/chosen": -0.49601975083351135, "logits/rejected": -0.3733476996421814, "logps/chosen": -201.9625701904297, "logps/rejected": -163.45664978027344, "loss": 0.646, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6690108180046082, "rewards/margins": 0.130323126912117, "rewards/rejected": 0.5386877059936523, "step": 1470 }, { "epoch": 0.46612338096925315, "grad_norm": 2.625, "learning_rate": 4.894917781755693e-06, "logits/chosen": -0.40988796949386597, "logits/rejected": -0.2908838391304016, "logps/chosen": -174.8596954345703, "logps/rejected": -152.51849365234375, "loss": 0.6666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5752810835838318, "rewards/margins": 0.07995191216468811, "rewards/rejected": 0.4953291416168213, "step": 1480 }, { "epoch": 0.46927286327309947, "grad_norm": 3.203125, "learning_rate": 4.893400288438013e-06, "logits/chosen": -0.4711819291114807, "logits/rejected": -0.34161874651908875, "logps/chosen": -210.2778778076172, "logps/rejected": -188.28152465820312, "loss": 0.652, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6171534061431885, "rewards/margins": 0.11316442489624023, "rewards/rejected": 0.5039889216423035, "step": 1490 }, { "epoch": 0.4724223455769458, "grad_norm": 2.546875, "learning_rate": 4.891872155101746e-06, "logits/chosen": -0.45833712816238403, "logits/rejected": -0.3544694781303406, "logps/chosen": -205.4825439453125, "logps/rejected": -174.62255859375, "loss": 0.6504, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6301782131195068, "rewards/margins": 0.11798008531332016, "rewards/rejected": 0.5121980905532837, "step": 1500 }, { "epoch": 0.4755718278807921, "grad_norm": 3.3125, "learning_rate": 4.890333388540306e-06, "logits/chosen": -0.3727184236049652, "logits/rejected": -0.2826997637748718, "logps/chosen": -219.3084716796875, "logps/rejected": -186.0581512451172, "loss": 0.6574, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6568127870559692, "rewards/margins": 0.10541248321533203, "rewards/rejected": 0.551400363445282, "step": 1510 }, { "epoch": 0.4787213101846384, "grad_norm": 4.21875, "learning_rate": 4.888783995594383e-06, "logits/chosen": -0.5030876398086548, "logits/rejected": -0.41888195276260376, "logps/chosen": -210.5947723388672, "logps/rejected": -189.0829620361328, "loss": 0.6691, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6370301842689514, "rewards/margins": 0.07603044807910919, "rewards/rejected": 0.5609997510910034, "step": 1520 }, { "epoch": 0.4818707924884847, "grad_norm": 2.828125, "learning_rate": 4.887223983151905e-06, "logits/chosen": -0.45775994658470154, "logits/rejected": -0.3528696894645691, "logps/chosen": -205.0533447265625, "logps/rejected": -169.550048828125, "loss": 0.6478, "rewards/accuracies": 0.625, "rewards/chosen": 0.6256412863731384, "rewards/margins": 0.12434478104114532, "rewards/rejected": 0.5012965202331543, "step": 1530 }, { "epoch": 0.485020274792331, "grad_norm": 2.46875, "learning_rate": 4.88565335814801e-06, "logits/chosen": -0.4910427927970886, "logits/rejected": -0.3841659128665924, "logps/chosen": -224.6226348876953, "logps/rejected": -191.9573516845703, "loss": 0.6349, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6829046010971069, "rewards/margins": 0.15164795517921448, "rewards/rejected": 0.5312565565109253, "step": 1540 }, { "epoch": 0.4881697570961773, "grad_norm": 2.5625, "learning_rate": 4.884072127565015e-06, "logits/chosen": -0.5355531573295593, "logits/rejected": -0.4093754291534424, "logps/chosen": -190.81552124023438, "logps/rejected": -157.70872497558594, "loss": 0.6334, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6082229018211365, "rewards/margins": 0.15607663989067078, "rewards/rejected": 0.4521462321281433, "step": 1550 }, { "epoch": 0.49131923940002364, "grad_norm": 3.078125, "learning_rate": 4.882480298432384e-06, "logits/chosen": -0.4936140179634094, "logits/rejected": -0.3383339047431946, "logps/chosen": -200.6589813232422, "logps/rejected": -171.00064086914062, "loss": 0.6331, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6500834226608276, "rewards/margins": 0.15458612143993378, "rewards/rejected": 0.49549728631973267, "step": 1560 }, { "epoch": 0.4944687217038699, "grad_norm": 3.578125, "learning_rate": 4.8808778778266985e-06, "logits/chosen": -0.4825199246406555, "logits/rejected": -0.3065822124481201, "logps/chosen": -222.0049285888672, "logps/rejected": -180.4205780029297, "loss": 0.6483, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7011119723320007, "rewards/margins": 0.13294753432273865, "rewards/rejected": 0.5681644678115845, "step": 1570 }, { "epoch": 0.4976182040077162, "grad_norm": 2.890625, "learning_rate": 4.879264872871625e-06, "logits/chosen": -0.47797495126724243, "logits/rejected": -0.3094409704208374, "logps/chosen": -211.0563201904297, "logps/rejected": -168.85647583007812, "loss": 0.6436, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.65972501039505, "rewards/margins": 0.13198061287403107, "rewards/rejected": 0.5277442932128906, "step": 1580 }, { "epoch": 0.5007676863115625, "grad_norm": 2.4375, "learning_rate": 4.8776412907378845e-06, "logits/chosen": -0.4951102137565613, "logits/rejected": -0.4047602117061615, "logps/chosen": -200.99998474121094, "logps/rejected": -183.0837860107422, "loss": 0.6603, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6028062105178833, "rewards/margins": 0.10012507438659668, "rewards/rejected": 0.5026811361312866, "step": 1590 }, { "epoch": 0.5039171686154088, "grad_norm": 2.953125, "learning_rate": 4.876007138643216e-06, "logits/chosen": -0.47414493560791016, "logits/rejected": -0.40259408950805664, "logps/chosen": -184.4724884033203, "logps/rejected": -163.58192443847656, "loss": 0.6457, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6546593904495239, "rewards/margins": 0.1281435191631317, "rewards/rejected": 0.5265159010887146, "step": 1600 }, { "epoch": 0.5070666509192552, "grad_norm": 3.34375, "learning_rate": 4.874362423852352e-06, "logits/chosen": -0.5160477161407471, "logits/rejected": -0.43918365240097046, "logps/chosen": -185.5178985595703, "logps/rejected": -165.6835479736328, "loss": 0.6668, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6197860836982727, "rewards/margins": 0.08066993951797485, "rewards/rejected": 0.5391160845756531, "step": 1610 }, { "epoch": 0.5102161332231014, "grad_norm": 3.40625, "learning_rate": 4.872707153676979e-06, "logits/chosen": -0.466305673122406, "logits/rejected": -0.3604885935783386, "logps/chosen": -213.26834106445312, "logps/rejected": -181.22439575195312, "loss": 0.6655, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6425263285636902, "rewards/margins": 0.08319854736328125, "rewards/rejected": 0.5593277812004089, "step": 1620 }, { "epoch": 0.5133656155269477, "grad_norm": 3.046875, "learning_rate": 4.871041335475712e-06, "logits/chosen": -0.5054647922515869, "logits/rejected": -0.4364451467990875, "logps/chosen": -186.38011169433594, "logps/rejected": -157.52462768554688, "loss": 0.674, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.5790429711341858, "rewards/margins": 0.07125049829483032, "rewards/rejected": 0.5077924728393555, "step": 1630 }, { "epoch": 0.5165150978307941, "grad_norm": 3.421875, "learning_rate": 4.869364976654052e-06, "logits/chosen": -0.5144768953323364, "logits/rejected": -0.4074745774269104, "logps/chosen": -209.39431762695312, "logps/rejected": -187.3798065185547, "loss": 0.6849, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.655154824256897, "rewards/margins": 0.0526747927069664, "rewards/rejected": 0.6024800539016724, "step": 1640 }, { "epoch": 0.5196645801346403, "grad_norm": 3.140625, "learning_rate": 4.867678084664365e-06, "logits/chosen": -0.4594438672065735, "logits/rejected": -0.3518297076225281, "logps/chosen": -200.11427307128906, "logps/rejected": -160.406494140625, "loss": 0.6205, "rewards/accuracies": 0.625, "rewards/chosen": 0.6651702523231506, "rewards/margins": 0.18755197525024414, "rewards/rejected": 0.4776183068752289, "step": 1650 }, { "epoch": 0.5228140624384867, "grad_norm": 2.515625, "learning_rate": 4.865980667005839e-06, "logits/chosen": -0.5308347940444946, "logits/rejected": -0.38161686062812805, "logps/chosen": -228.52413940429688, "logps/rejected": -180.50119018554688, "loss": 0.6231, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.7223028540611267, "rewards/margins": 0.1841433346271515, "rewards/rejected": 0.5381595492362976, "step": 1660 }, { "epoch": 0.525963544742333, "grad_norm": 3.28125, "learning_rate": 4.864272731224457e-06, "logits/chosen": -0.4550582468509674, "logits/rejected": -0.36829763650894165, "logps/chosen": -199.5106964111328, "logps/rejected": -177.867919921875, "loss": 0.6481, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7055026888847351, "rewards/margins": 0.13270851969718933, "rewards/rejected": 0.5727940797805786, "step": 1670 }, { "epoch": 0.5291130270461792, "grad_norm": 2.40625, "learning_rate": 4.862554284912961e-06, "logits/chosen": -0.5029420256614685, "logits/rejected": -0.37519973516464233, "logps/chosen": -193.17857360839844, "logps/rejected": -151.67193603515625, "loss": 0.6445, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.5841437578201294, "rewards/margins": 0.12721844017505646, "rewards/rejected": 0.4569253921508789, "step": 1680 }, { "epoch": 0.5322625093500256, "grad_norm": 2.546875, "learning_rate": 4.860825335710815e-06, "logits/chosen": -0.4503244459629059, "logits/rejected": -0.33839210867881775, "logps/chosen": -221.6410675048828, "logps/rejected": -179.1236114501953, "loss": 0.6475, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7080546617507935, "rewards/margins": 0.12341801822185516, "rewards/rejected": 0.5846366286277771, "step": 1690 }, { "epoch": 0.5354119916538719, "grad_norm": 2.859375, "learning_rate": 4.8590858913041775e-06, "logits/chosen": -0.5011571645736694, "logits/rejected": -0.34209686517715454, "logps/chosen": -194.4629364013672, "logps/rejected": -162.49404907226562, "loss": 0.6226, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6779108643531799, "rewards/margins": 0.17823724448680878, "rewards/rejected": 0.49967360496520996, "step": 1700 }, { "epoch": 0.5385614739577183, "grad_norm": 2.484375, "learning_rate": 4.857335959425864e-06, "logits/chosen": -0.47282689809799194, "logits/rejected": -0.387722373008728, "logps/chosen": -188.14022827148438, "logps/rejected": -163.66702270507812, "loss": 0.6431, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6209579110145569, "rewards/margins": 0.12763172388076782, "rewards/rejected": 0.49332618713378906, "step": 1710 }, { "epoch": 0.5417109562615645, "grad_norm": 3.125, "learning_rate": 4.85557554785531e-06, "logits/chosen": -0.5164914727210999, "logits/rejected": -0.4268696904182434, "logps/chosen": -194.6099090576172, "logps/rejected": -171.3253631591797, "loss": 0.6742, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6653293371200562, "rewards/margins": 0.08088545501232147, "rewards/rejected": 0.5844438672065735, "step": 1720 }, { "epoch": 0.5448604385654108, "grad_norm": 2.390625, "learning_rate": 4.853804664418543e-06, "logits/chosen": -0.5048812627792358, "logits/rejected": -0.36949628591537476, "logps/chosen": -186.7891845703125, "logps/rejected": -177.64492797851562, "loss": 0.6929, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6204543113708496, "rewards/margins": 0.03395826369524002, "rewards/rejected": 0.5864960551261902, "step": 1730 }, { "epoch": 0.5480099208692571, "grad_norm": 2.75, "learning_rate": 4.85202331698814e-06, "logits/chosen": -0.5082224011421204, "logits/rejected": -0.3899040222167969, "logps/chosen": -189.59127807617188, "logps/rejected": -176.00732421875, "loss": 0.6808, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.5799592733383179, "rewards/margins": 0.04768219217658043, "rewards/rejected": 0.5322771072387695, "step": 1740 }, { "epoch": 0.5511594031731034, "grad_norm": 2.796875, "learning_rate": 4.8502315134832e-06, "logits/chosen": -0.45675116777420044, "logits/rejected": -0.35309940576553345, "logps/chosen": -182.9220733642578, "logps/rejected": -161.3640899658203, "loss": 0.6708, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.605731189250946, "rewards/margins": 0.07132869213819504, "rewards/rejected": 0.5344024896621704, "step": 1750 }, { "epoch": 0.5543088854769497, "grad_norm": 3.015625, "learning_rate": 4.848429261869303e-06, "logits/chosen": -0.47061842679977417, "logits/rejected": -0.32226455211639404, "logps/chosen": -206.2819366455078, "logps/rejected": -163.6657257080078, "loss": 0.627, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7078389525413513, "rewards/margins": 0.1701158881187439, "rewards/rejected": 0.5377230644226074, "step": 1760 }, { "epoch": 0.557458367780796, "grad_norm": 2.5, "learning_rate": 4.8466165701584766e-06, "logits/chosen": -0.46921786665916443, "logits/rejected": -0.33290085196495056, "logps/chosen": -185.49661254882812, "logps/rejected": -155.57052612304688, "loss": 0.6234, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6777461767196655, "rewards/margins": 0.18197762966156006, "rewards/rejected": 0.49576863646507263, "step": 1770 }, { "epoch": 0.5606078500846423, "grad_norm": 3.53125, "learning_rate": 4.844793446409162e-06, "logits/chosen": -0.4977359175682068, "logits/rejected": -0.3592739999294281, "logps/chosen": -234.7196807861328, "logps/rejected": -196.9207763671875, "loss": 0.6667, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.725782573223114, "rewards/margins": 0.08639247715473175, "rewards/rejected": 0.6393901109695435, "step": 1780 }, { "epoch": 0.5637573323884887, "grad_norm": 3.703125, "learning_rate": 4.842959898726175e-06, "logits/chosen": -0.42303353548049927, "logits/rejected": -0.34682708978652954, "logps/chosen": -233.24038696289062, "logps/rejected": -207.0811309814453, "loss": 0.6478, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.75257807970047, "rewards/margins": 0.12484292685985565, "rewards/rejected": 0.6277351975440979, "step": 1790 }, { "epoch": 0.566906814692335, "grad_norm": 2.671875, "learning_rate": 4.8411159352606735e-06, "logits/chosen": -0.49121102690696716, "logits/rejected": -0.3855198621749878, "logps/chosen": -201.4905548095703, "logps/rejected": -184.22596740722656, "loss": 0.6408, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6799174547195435, "rewards/margins": 0.1396731436252594, "rewards/rejected": 0.5402444005012512, "step": 1800 }, { "epoch": 0.5700562969961812, "grad_norm": 3.21875, "learning_rate": 4.839261564210118e-06, "logits/chosen": -0.4372677206993103, "logits/rejected": -0.3174059987068176, "logps/chosen": -182.1688995361328, "logps/rejected": -167.50186157226562, "loss": 0.6455, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5665633678436279, "rewards/margins": 0.12508925795555115, "rewards/rejected": 0.441474050283432, "step": 1810 }, { "epoch": 0.5732057793000276, "grad_norm": 2.453125, "learning_rate": 4.837396793818237e-06, "logits/chosen": -0.5189486742019653, "logits/rejected": -0.4490106701850891, "logps/chosen": -169.6758575439453, "logps/rejected": -160.53372192382812, "loss": 0.6999, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.5530275106430054, "rewards/margins": 0.015836771577596664, "rewards/rejected": 0.5371907353401184, "step": 1820 }, { "epoch": 0.5763552616038738, "grad_norm": 2.625, "learning_rate": 4.83552163237499e-06, "logits/chosen": -0.44089236855506897, "logits/rejected": -0.31842055916786194, "logps/chosen": -189.9591064453125, "logps/rejected": -159.47850036621094, "loss": 0.6333, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6711987257003784, "rewards/margins": 0.16579605638980865, "rewards/rejected": 0.505402684211731, "step": 1830 }, { "epoch": 0.5795047439077202, "grad_norm": 2.390625, "learning_rate": 4.8336360882165315e-06, "logits/chosen": -0.4447326064109802, "logits/rejected": -0.3511047661304474, "logps/chosen": -189.64212036132812, "logps/rejected": -163.2635955810547, "loss": 0.6612, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6063627600669861, "rewards/margins": 0.10564740747213364, "rewards/rejected": 0.5007153749465942, "step": 1840 }, { "epoch": 0.5826542262115665, "grad_norm": 2.71875, "learning_rate": 4.831740169725172e-06, "logits/chosen": -0.37459006905555725, "logits/rejected": -0.27248674631118774, "logps/chosen": -191.74412536621094, "logps/rejected": -170.22665405273438, "loss": 0.6765, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.6292241811752319, "rewards/margins": 0.06548759341239929, "rewards/rejected": 0.5637365579605103, "step": 1850 }, { "epoch": 0.5858037085154127, "grad_norm": 3.171875, "learning_rate": 4.829833885329341e-06, "logits/chosen": -0.5164798498153687, "logits/rejected": -0.3869698643684387, "logps/chosen": -195.4289093017578, "logps/rejected": -161.2754364013672, "loss": 0.6763, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.5856376886367798, "rewards/margins": 0.06769655644893646, "rewards/rejected": 0.5179411172866821, "step": 1860 }, { "epoch": 0.5889531908192591, "grad_norm": 2.71875, "learning_rate": 4.827917243503552e-06, "logits/chosen": -0.4844232499599457, "logits/rejected": -0.38613173365592957, "logps/chosen": -212.82217407226562, "logps/rejected": -174.99215698242188, "loss": 0.6457, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6664649844169617, "rewards/margins": 0.13369064033031464, "rewards/rejected": 0.5327743291854858, "step": 1870 }, { "epoch": 0.5921026731231054, "grad_norm": 3.6875, "learning_rate": 4.825990252768362e-06, "logits/chosen": -0.4278056025505066, "logits/rejected": -0.3540639579296112, "logps/chosen": -191.68344116210938, "logps/rejected": -180.2418670654297, "loss": 0.6856, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6736811399459839, "rewards/margins": 0.05576536804437637, "rewards/rejected": 0.6179158091545105, "step": 1880 }, { "epoch": 0.5952521554269516, "grad_norm": 2.703125, "learning_rate": 4.824052921690337e-06, "logits/chosen": -0.5202184915542603, "logits/rejected": -0.38284236192703247, "logps/chosen": -211.123046875, "logps/rejected": -174.93304443359375, "loss": 0.6467, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7286884188652039, "rewards/margins": 0.13865350186824799, "rewards/rejected": 0.5900349020957947, "step": 1890 }, { "epoch": 0.598401637730798, "grad_norm": 3.03125, "learning_rate": 4.822105258882007e-06, "logits/chosen": -0.553580105304718, "logits/rejected": -0.40661874413490295, "logps/chosen": -231.7763214111328, "logps/rejected": -190.30320739746094, "loss": 0.6755, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7072332501411438, "rewards/margins": 0.06532811373472214, "rewards/rejected": 0.6419050693511963, "step": 1900 }, { "epoch": 0.6015511200346443, "grad_norm": 3.1875, "learning_rate": 4.8201472730018386e-06, "logits/chosen": -0.4518907964229584, "logits/rejected": -0.3795849680900574, "logps/chosen": -205.5015411376953, "logps/rejected": -180.77366638183594, "loss": 0.6682, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6349667310714722, "rewards/margins": 0.07740394026041031, "rewards/rejected": 0.5575627684593201, "step": 1910 }, { "epoch": 0.6047006023384907, "grad_norm": 2.484375, "learning_rate": 4.818178972754184e-06, "logits/chosen": -0.43323999643325806, "logits/rejected": -0.4050220549106598, "logps/chosen": -190.7349090576172, "logps/rejected": -176.9745330810547, "loss": 0.6589, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.6163308024406433, "rewards/margins": 0.09825930744409561, "rewards/rejected": 0.5180714726448059, "step": 1920 }, { "epoch": 0.6078500846423369, "grad_norm": 2.390625, "learning_rate": 4.816200366889252e-06, "logits/chosen": -0.48036471009254456, "logits/rejected": -0.34876304864883423, "logps/chosen": -202.17532348632812, "logps/rejected": -171.40567016601562, "loss": 0.6655, "rewards/accuracies": 0.625, "rewards/chosen": 0.6224693059921265, "rewards/margins": 0.08968141674995422, "rewards/rejected": 0.5327879190444946, "step": 1930 }, { "epoch": 0.6109995669461832, "grad_norm": 2.53125, "learning_rate": 4.8142114642030665e-06, "logits/chosen": -0.5331605672836304, "logits/rejected": -0.35749322175979614, "logps/chosen": -200.44296264648438, "logps/rejected": -155.3129119873047, "loss": 0.6277, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.686188817024231, "rewards/margins": 0.17992666363716125, "rewards/rejected": 0.5062621831893921, "step": 1940 }, { "epoch": 0.6141490492500296, "grad_norm": 3.65625, "learning_rate": 4.812212273537426e-06, "logits/chosen": -0.5703829526901245, "logits/rejected": -0.3986208736896515, "logps/chosen": -207.4144287109375, "logps/rejected": -166.04586791992188, "loss": 0.635, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6602171659469604, "rewards/margins": 0.1550987958908081, "rewards/rejected": 0.5051184296607971, "step": 1950 }, { "epoch": 0.6172985315538758, "grad_norm": 3.46875, "learning_rate": 4.810202803779862e-06, "logits/chosen": -0.49680987000465393, "logits/rejected": -0.3343147039413452, "logps/chosen": -194.55484008789062, "logps/rejected": -171.62588500976562, "loss": 0.6414, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6576918959617615, "rewards/margins": 0.14642241597175598, "rewards/rejected": 0.5112695693969727, "step": 1960 }, { "epoch": 0.6204480138577222, "grad_norm": 3.015625, "learning_rate": 4.808183063863606e-06, "logits/chosen": -0.5439049005508423, "logits/rejected": -0.4092562794685364, "logps/chosen": -223.9301300048828, "logps/rejected": -191.12759399414062, "loss": 0.6458, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7495763897895813, "rewards/margins": 0.13388705253601074, "rewards/rejected": 0.6156893372535706, "step": 1970 }, { "epoch": 0.6235974961615685, "grad_norm": 2.703125, "learning_rate": 4.806153062767544e-06, "logits/chosen": -0.4681766629219055, "logits/rejected": -0.3462735116481781, "logps/chosen": -192.7504425048828, "logps/rejected": -170.0450897216797, "loss": 0.6479, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6624137163162231, "rewards/margins": 0.12909933924674988, "rewards/rejected": 0.5333144068717957, "step": 1980 }, { "epoch": 0.6267469784654147, "grad_norm": 2.78125, "learning_rate": 4.804112809516181e-06, "logits/chosen": -0.5108489394187927, "logits/rejected": -0.3038786053657532, "logps/chosen": -207.41226196289062, "logps/rejected": -163.39027404785156, "loss": 0.6386, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6961920261383057, "rewards/margins": 0.14503511786460876, "rewards/rejected": 0.5511568188667297, "step": 1990 }, { "epoch": 0.6298964607692611, "grad_norm": 2.171875, "learning_rate": 4.802062313179595e-06, "logits/chosen": -0.4810725748538971, "logits/rejected": -0.40924152731895447, "logps/chosen": -187.09451293945312, "logps/rejected": -162.91554260253906, "loss": 0.661, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5915666818618774, "rewards/margins": 0.10626377910375595, "rewards/rejected": 0.4853029251098633, "step": 2000 }, { "epoch": 0.6298964607692611, "eval_logits/chosen": -0.6020957231521606, "eval_logits/rejected": -0.4771311283111572, "eval_logps/chosen": -243.46115112304688, "eval_logps/rejected": -222.5963897705078, "eval_loss": 0.6673439145088196, "eval_rewards/accuracies": 0.5848915576934814, "eval_rewards/chosen": 0.7640087604522705, "eval_rewards/margins": 0.09086808562278748, "eval_rewards/rejected": 0.6731407046318054, "eval_runtime": 3657.8705, "eval_samples_per_second": 0.366, "eval_steps_per_second": 0.366, "step": 2000 }, { "epoch": 0.6330459430731074, "grad_norm": 2.984375, "learning_rate": 4.800001582873405e-06, "logits/chosen": -0.42106738686561584, "logits/rejected": -0.35292476415634155, "logps/chosen": -202.25643920898438, "logps/rejected": -198.75399780273438, "loss": 0.6953, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6455667614936829, "rewards/margins": 0.026390869170427322, "rewards/rejected": 0.6191757917404175, "step": 2010 }, { "epoch": 0.6361954253769536, "grad_norm": 3.3125, "learning_rate": 4.797930627758721e-06, "logits/chosen": -0.3941357731819153, "logits/rejected": -0.3840845823287964, "logps/chosen": -193.46145629882812, "logps/rejected": -182.74398803710938, "loss": 0.6844, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6705250144004822, "rewards/margins": 0.047097403556108475, "rewards/rejected": 0.6234275698661804, "step": 2020 }, { "epoch": 0.6393449076808, "grad_norm": 2.1875, "learning_rate": 4.795849457042112e-06, "logits/chosen": -0.5017488598823547, "logits/rejected": -0.40243926644325256, "logps/chosen": -190.13934326171875, "logps/rejected": -166.44187927246094, "loss": 0.6766, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6177536845207214, "rewards/margins": 0.07091078162193298, "rewards/rejected": 0.5468429327011108, "step": 2030 }, { "epoch": 0.6424943899846463, "grad_norm": 2.671875, "learning_rate": 4.793758079975559e-06, "logits/chosen": -0.47853583097457886, "logits/rejected": -0.3796117901802063, "logps/chosen": -183.77645874023438, "logps/rejected": -150.51734924316406, "loss": 0.6283, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6325706243515015, "rewards/margins": 0.16535897552967072, "rewards/rejected": 0.46721166372299194, "step": 2040 }, { "epoch": 0.6456438722884926, "grad_norm": 2.859375, "learning_rate": 4.791656505856416e-06, "logits/chosen": -0.49306803941726685, "logits/rejected": -0.37095504999160767, "logps/chosen": -193.6649932861328, "logps/rejected": -169.91397094726562, "loss": 0.6596, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6382758617401123, "rewards/margins": 0.09585778415203094, "rewards/rejected": 0.5424180030822754, "step": 2050 }, { "epoch": 0.6487933545923389, "grad_norm": 4.375, "learning_rate": 4.789544744027369e-06, "logits/chosen": -0.456062376499176, "logits/rejected": -0.326261430978775, "logps/chosen": -202.9215087890625, "logps/rejected": -158.33755493164062, "loss": 0.6307, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6835106015205383, "rewards/margins": 0.16993892192840576, "rewards/rejected": 0.5135716199874878, "step": 2060 }, { "epoch": 0.6519428368961852, "grad_norm": 2.90625, "learning_rate": 4.787422803876394e-06, "logits/chosen": -0.4271954596042633, "logits/rejected": -0.32812589406967163, "logps/chosen": -217.31643676757812, "logps/rejected": -189.05111694335938, "loss": 0.654, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7037121057510376, "rewards/margins": 0.1129276379942894, "rewards/rejected": 0.59078449010849, "step": 2070 }, { "epoch": 0.6550923192000315, "grad_norm": 2.984375, "learning_rate": 4.785290694836719e-06, "logits/chosen": -0.4718199670314789, "logits/rejected": -0.29942384362220764, "logps/chosen": -194.9972686767578, "logps/rejected": -162.9912872314453, "loss": 0.6402, "rewards/accuracies": 0.625, "rewards/chosen": 0.7091890573501587, "rewards/margins": 0.13689157366752625, "rewards/rejected": 0.5722974538803101, "step": 2080 }, { "epoch": 0.6582418015038778, "grad_norm": 2.40625, "learning_rate": 4.783148426386771e-06, "logits/chosen": -0.4268282353878021, "logits/rejected": -0.2675458788871765, "logps/chosen": -199.43051147460938, "logps/rejected": -162.5355987548828, "loss": 0.6125, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.665716826915741, "rewards/margins": 0.20410867035388947, "rewards/rejected": 0.4616081714630127, "step": 2090 }, { "epoch": 0.661391283807724, "grad_norm": 2.265625, "learning_rate": 4.7809960080501464e-06, "logits/chosen": -0.4748079776763916, "logits/rejected": -0.4310288429260254, "logps/chosen": -201.29617309570312, "logps/rejected": -184.75961303710938, "loss": 0.6435, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7009934186935425, "rewards/margins": 0.13489434123039246, "rewards/rejected": 0.5660991668701172, "step": 2100 }, { "epoch": 0.6645407661115704, "grad_norm": 2.671875, "learning_rate": 4.778833449395563e-06, "logits/chosen": -0.4732258915901184, "logits/rejected": -0.3779997229576111, "logps/chosen": -211.35055541992188, "logps/rejected": -177.94529724121094, "loss": 0.6271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6552049517631531, "rewards/margins": 0.1688978374004364, "rewards/rejected": 0.48630720376968384, "step": 2110 }, { "epoch": 0.6676902484154167, "grad_norm": 3.046875, "learning_rate": 4.77666076003682e-06, "logits/chosen": -0.5109846591949463, "logits/rejected": -0.4116583466529846, "logps/chosen": -201.0147705078125, "logps/rejected": -173.7718048095703, "loss": 0.6458, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6840208172798157, "rewards/margins": 0.1273798793554306, "rewards/rejected": 0.5566409230232239, "step": 2120 }, { "epoch": 0.6708397307192631, "grad_norm": 2.84375, "learning_rate": 4.774477949632747e-06, "logits/chosen": -0.4809556007385254, "logits/rejected": -0.37704578042030334, "logps/chosen": -215.3496551513672, "logps/rejected": -185.5915985107422, "loss": 0.6738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6900166273117065, "rewards/margins": 0.08503197878599167, "rewards/rejected": 0.604984700679779, "step": 2130 }, { "epoch": 0.6739892130231093, "grad_norm": 2.8125, "learning_rate": 4.772285027887174e-06, "logits/chosen": -0.4865281581878662, "logits/rejected": -0.3789558410644531, "logps/chosen": -212.9844512939453, "logps/rejected": -181.038818359375, "loss": 0.6399, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6945775747299194, "rewards/margins": 0.15178106725215912, "rewards/rejected": 0.5427964329719543, "step": 2140 }, { "epoch": 0.6771386953269556, "grad_norm": 2.28125, "learning_rate": 4.770082004548878e-06, "logits/chosen": -0.5607801675796509, "logits/rejected": -0.4424077868461609, "logps/chosen": -197.34896850585938, "logps/rejected": -168.6945343017578, "loss": 0.6788, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6424630880355835, "rewards/margins": 0.06348511576652527, "rewards/rejected": 0.5789780020713806, "step": 2150 }, { "epoch": 0.680288177630802, "grad_norm": 2.890625, "learning_rate": 4.767868889411545e-06, "logits/chosen": -0.37069040536880493, "logits/rejected": -0.29210925102233887, "logps/chosen": -189.8524169921875, "logps/rejected": -173.9405517578125, "loss": 0.6838, "rewards/accuracies": 0.5, "rewards/chosen": 0.648335337638855, "rewards/margins": 0.061441101133823395, "rewards/rejected": 0.586894154548645, "step": 2160 }, { "epoch": 0.6834376599346482, "grad_norm": 2.53125, "learning_rate": 4.765645692313724e-06, "logits/chosen": -0.4717496931552887, "logits/rejected": -0.38448435068130493, "logps/chosen": -197.7617645263672, "logps/rejected": -176.4130401611328, "loss": 0.6377, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6932880282402039, "rewards/margins": 0.14973357319831848, "rewards/rejected": 0.543554425239563, "step": 2170 }, { "epoch": 0.6865871422384946, "grad_norm": 3.265625, "learning_rate": 4.763412423138784e-06, "logits/chosen": -0.46070584654808044, "logits/rejected": -0.37532466650009155, "logps/chosen": -211.4508514404297, "logps/rejected": -192.87716674804688, "loss": 0.6801, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6881066560745239, "rewards/margins": 0.05569648742675781, "rewards/rejected": 0.6324101686477661, "step": 2180 }, { "epoch": 0.6897366245423409, "grad_norm": 3.328125, "learning_rate": 4.761169091814869e-06, "logits/chosen": -0.44056111574172974, "logits/rejected": -0.3600524961948395, "logps/chosen": -212.5589599609375, "logps/rejected": -194.01473999023438, "loss": 0.66, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7078835368156433, "rewards/margins": 0.10578237473964691, "rewards/rejected": 0.60210120677948, "step": 2190 }, { "epoch": 0.6928861068461871, "grad_norm": 2.671875, "learning_rate": 4.758915708314858e-06, "logits/chosen": -0.528985857963562, "logits/rejected": -0.3610820770263672, "logps/chosen": -214.4971923828125, "logps/rejected": -172.77456665039062, "loss": 0.6279, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7492777705192566, "rewards/margins": 0.17471325397491455, "rewards/rejected": 0.5745645761489868, "step": 2200 }, { "epoch": 0.6960355891500335, "grad_norm": 3.328125, "learning_rate": 4.756652282656314e-06, "logits/chosen": -0.46792277693748474, "logits/rejected": -0.3924694061279297, "logps/chosen": -189.278076171875, "logps/rejected": -168.0936737060547, "loss": 0.6298, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6673378944396973, "rewards/margins": 0.18005801737308502, "rewards/rejected": 0.48727989196777344, "step": 2210 }, { "epoch": 0.6991850714538798, "grad_norm": 2.765625, "learning_rate": 4.754378824901447e-06, "logits/chosen": -0.42372870445251465, "logits/rejected": -0.3045392334461212, "logps/chosen": -217.94894409179688, "logps/rejected": -181.64312744140625, "loss": 0.6352, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7333195209503174, "rewards/margins": 0.15828455984592438, "rewards/rejected": 0.5750349760055542, "step": 2220 }, { "epoch": 0.702334553757726, "grad_norm": 2.59375, "learning_rate": 4.752095345157062e-06, "logits/chosen": -0.5427747368812561, "logits/rejected": -0.3744064271450043, "logps/chosen": -196.52786254882812, "logps/rejected": -179.39456176757812, "loss": 0.6659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7010904550552368, "rewards/margins": 0.10356839001178741, "rewards/rejected": 0.5975220203399658, "step": 2230 }, { "epoch": 0.7054840360615724, "grad_norm": 2.875, "learning_rate": 4.7498018535745175e-06, "logits/chosen": -0.45889267325401306, "logits/rejected": -0.3741144835948944, "logps/chosen": -194.2584228515625, "logps/rejected": -169.21490478515625, "loss": 0.6514, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6648105382919312, "rewards/margins": 0.12509003281593323, "rewards/rejected": 0.5397205352783203, "step": 2240 }, { "epoch": 0.7086335183654187, "grad_norm": 2.953125, "learning_rate": 4.747498360349681e-06, "logits/chosen": -0.4748617708683014, "logits/rejected": -0.3516277074813843, "logps/chosen": -219.1868438720703, "logps/rejected": -182.24143981933594, "loss": 0.6369, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7704683542251587, "rewards/margins": 0.16334478557109833, "rewards/rejected": 0.6071235537528992, "step": 2250 }, { "epoch": 0.711783000669265, "grad_norm": 2.59375, "learning_rate": 4.745184875722887e-06, "logits/chosen": -0.5461896657943726, "logits/rejected": -0.4441626965999603, "logps/chosen": -177.56777954101562, "logps/rejected": -164.29269409179688, "loss": 0.6576, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.63064044713974, "rewards/margins": 0.11562252044677734, "rewards/rejected": 0.5150178670883179, "step": 2260 }, { "epoch": 0.7149324829731113, "grad_norm": 2.625, "learning_rate": 4.7428614099788804e-06, "logits/chosen": -0.4357683062553406, "logits/rejected": -0.40783005952835083, "logps/chosen": -180.0326690673828, "logps/rejected": -161.7758331298828, "loss": 0.6844, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.5830633640289307, "rewards/margins": 0.04567595571279526, "rewards/rejected": 0.5373873710632324, "step": 2270 }, { "epoch": 0.7180819652769576, "grad_norm": 2.953125, "learning_rate": 4.740527973446782e-06, "logits/chosen": -0.4672257900238037, "logits/rejected": -0.38863813877105713, "logps/chosen": -172.70326232910156, "logps/rejected": -155.18206787109375, "loss": 0.6836, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.5959118008613586, "rewards/margins": 0.06164885684847832, "rewards/rejected": 0.5342629551887512, "step": 2280 }, { "epoch": 0.7212314475808039, "grad_norm": 2.53125, "learning_rate": 4.738184576500038e-06, "logits/chosen": -0.4562186300754547, "logits/rejected": -0.3595152199268341, "logps/chosen": -202.3937530517578, "logps/rejected": -184.3506622314453, "loss": 0.6476, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6859248280525208, "rewards/margins": 0.13335470855236053, "rewards/rejected": 0.5525700449943542, "step": 2290 }, { "epoch": 0.7243809298846502, "grad_norm": 3.390625, "learning_rate": 4.735831229556374e-06, "logits/chosen": -0.5141324996948242, "logits/rejected": -0.4093483090400696, "logps/chosen": -219.1161651611328, "logps/rejected": -183.11634826660156, "loss": 0.6545, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7390071153640747, "rewards/margins": 0.13933688402175903, "rewards/rejected": 0.5996701717376709, "step": 2300 }, { "epoch": 0.7275304121884966, "grad_norm": 3.75, "learning_rate": 4.733467943077747e-06, "logits/chosen": -0.43999338150024414, "logits/rejected": -0.3529604375362396, "logps/chosen": -179.51895141601562, "logps/rejected": -177.3287811279297, "loss": 0.7027, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.5721325278282166, "rewards/margins": 0.017483506351709366, "rewards/rejected": 0.5546489953994751, "step": 2310 }, { "epoch": 0.7306798944923428, "grad_norm": 3.71875, "learning_rate": 4.731094727570305e-06, "logits/chosen": -0.39424124360084534, "logits/rejected": -0.32468560338020325, "logps/chosen": -201.04409790039062, "logps/rejected": -186.39602661132812, "loss": 0.6709, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.615720808506012, "rewards/margins": 0.08357678353786469, "rewards/rejected": 0.5321440100669861, "step": 2320 }, { "epoch": 0.7338293767961891, "grad_norm": 2.65625, "learning_rate": 4.7287115935843335e-06, "logits/chosen": -0.4321005940437317, "logits/rejected": -0.3664671778678894, "logps/chosen": -183.7452392578125, "logps/rejected": -170.99435424804688, "loss": 0.6807, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6428938508033752, "rewards/margins": 0.06662772595882416, "rewards/rejected": 0.5762661695480347, "step": 2330 }, { "epoch": 0.7369788591000355, "grad_norm": 2.6875, "learning_rate": 4.72631855171421e-06, "logits/chosen": -0.407909095287323, "logits/rejected": -0.3485127389431, "logps/chosen": -189.66488647460938, "logps/rejected": -168.92410278320312, "loss": 0.6538, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6725813150405884, "rewards/margins": 0.11997097730636597, "rewards/rejected": 0.5526103377342224, "step": 2340 }, { "epoch": 0.7401283414038817, "grad_norm": 3.765625, "learning_rate": 4.72391561259836e-06, "logits/chosen": -0.4830864369869232, "logits/rejected": -0.39991340041160583, "logps/chosen": -196.81080627441406, "logps/rejected": -173.09869384765625, "loss": 0.6644, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.6581705808639526, "rewards/margins": 0.10404298454523087, "rewards/rejected": 0.55412757396698, "step": 2350 }, { "epoch": 0.743277823707728, "grad_norm": 2.9375, "learning_rate": 4.721502786919209e-06, "logits/chosen": -0.5260214805603027, "logits/rejected": -0.45939525961875916, "logps/chosen": -191.50491333007812, "logps/rejected": -162.9628143310547, "loss": 0.6571, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6285362243652344, "rewards/margins": 0.10894250869750977, "rewards/rejected": 0.5195937156677246, "step": 2360 }, { "epoch": 0.7464273060115744, "grad_norm": 2.390625, "learning_rate": 4.719080085403131e-06, "logits/chosen": -0.4764222204685211, "logits/rejected": -0.37835368514060974, "logps/chosen": -207.5322723388672, "logps/rejected": -173.14564514160156, "loss": 0.6403, "rewards/accuracies": 0.625, "rewards/chosen": 0.7107085585594177, "rewards/margins": 0.15078091621398926, "rewards/rejected": 0.5599276423454285, "step": 2370 }, { "epoch": 0.7495767883154206, "grad_norm": 3.703125, "learning_rate": 4.716647518820406e-06, "logits/chosen": -0.51595538854599, "logits/rejected": -0.3755626380443573, "logps/chosen": -191.32260131835938, "logps/rejected": -156.11282348632812, "loss": 0.676, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.6388931274414062, "rewards/margins": 0.06408585608005524, "rewards/rejected": 0.5748072862625122, "step": 2380 }, { "epoch": 0.752726270619267, "grad_norm": 3.140625, "learning_rate": 4.714205097985169e-06, "logits/chosen": -0.4605388045310974, "logits/rejected": -0.31849172711372375, "logps/chosen": -183.9641876220703, "logps/rejected": -168.69387817382812, "loss": 0.6416, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7079667448997498, "rewards/margins": 0.15685173869132996, "rewards/rejected": 0.5511150360107422, "step": 2390 }, { "epoch": 0.7558757529231133, "grad_norm": 3.5625, "learning_rate": 4.711752833755362e-06, "logits/chosen": -0.4923486113548279, "logits/rejected": -0.36627882719039917, "logps/chosen": -211.5367431640625, "logps/rejected": -170.20559692382812, "loss": 0.6441, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7119361162185669, "rewards/margins": 0.14048628509044647, "rewards/rejected": 0.571449875831604, "step": 2400 }, { "epoch": 0.7590252352269595, "grad_norm": 3.328125, "learning_rate": 4.7092907370326876e-06, "logits/chosen": -0.4083434045314789, "logits/rejected": -0.3242022693157196, "logps/chosen": -195.62118530273438, "logps/rejected": -170.5220489501953, "loss": 0.6861, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6857455968856812, "rewards/margins": 0.0565875768661499, "rewards/rejected": 0.6291579604148865, "step": 2410 }, { "epoch": 0.7621747175308059, "grad_norm": 2.65625, "learning_rate": 4.706818818762558e-06, "logits/chosen": -0.5046082735061646, "logits/rejected": -0.35835257172584534, "logps/chosen": -212.45217895507812, "logps/rejected": -180.62155151367188, "loss": 0.6265, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7465167045593262, "rewards/margins": 0.1813477873802185, "rewards/rejected": 0.5651688575744629, "step": 2420 }, { "epoch": 0.7653241998346522, "grad_norm": 2.453125, "learning_rate": 4.7043370899340505e-06, "logits/chosen": -0.4568845331668854, "logits/rejected": -0.4346873164176941, "logps/chosen": -202.90444946289062, "logps/rejected": -197.64659118652344, "loss": 0.6894, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.6897009015083313, "rewards/margins": 0.04634975641965866, "rewards/rejected": 0.6433511972427368, "step": 2430 }, { "epoch": 0.7684736821384985, "grad_norm": 3.109375, "learning_rate": 4.701845561579853e-06, "logits/chosen": -0.4123512804508209, "logits/rejected": -0.3234286606311798, "logps/chosen": -199.5214385986328, "logps/rejected": -182.79100036621094, "loss": 0.6436, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6898704767227173, "rewards/margins": 0.13404139876365662, "rewards/rejected": 0.5558291077613831, "step": 2440 }, { "epoch": 0.7716231644423448, "grad_norm": 2.390625, "learning_rate": 4.6993442447762185e-06, "logits/chosen": -0.4709036350250244, "logits/rejected": -0.3212384283542633, "logps/chosen": -214.52877807617188, "logps/rejected": -172.2939910888672, "loss": 0.6349, "rewards/accuracies": 0.625, "rewards/chosen": 0.7454115152359009, "rewards/margins": 0.16362550854682922, "rewards/rejected": 0.5817859768867493, "step": 2450 }, { "epoch": 0.7747726467461911, "grad_norm": 3.234375, "learning_rate": 4.696833150642916e-06, "logits/chosen": -0.4889756143093109, "logits/rejected": -0.3762696385383606, "logps/chosen": -200.27764892578125, "logps/rejected": -167.56094360351562, "loss": 0.646, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6864475011825562, "rewards/margins": 0.13881604373455048, "rewards/rejected": 0.5476315021514893, "step": 2460 }, { "epoch": 0.7779221290500374, "grad_norm": 2.359375, "learning_rate": 4.694312290343178e-06, "logits/chosen": -0.5132132768630981, "logits/rejected": -0.38872334361076355, "logps/chosen": -200.6832733154297, "logps/rejected": -171.80210876464844, "loss": 0.6265, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.709212601184845, "rewards/margins": 0.17675474286079407, "rewards/rejected": 0.5324578881263733, "step": 2470 }, { "epoch": 0.7810716113538837, "grad_norm": 2.734375, "learning_rate": 4.691781675083658e-06, "logits/chosen": -0.543319582939148, "logits/rejected": -0.4530103802680969, "logps/chosen": -193.91519165039062, "logps/rejected": -169.05667114257812, "loss": 0.6525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6816273927688599, "rewards/margins": 0.11835716664791107, "rewards/rejected": 0.5632702112197876, "step": 2480 }, { "epoch": 0.78422109365773, "grad_norm": 3.359375, "learning_rate": 4.689241316114373e-06, "logits/chosen": -0.42033663392066956, "logits/rejected": -0.35506805777549744, "logps/chosen": -197.7043914794922, "logps/rejected": -181.24746704101562, "loss": 0.6528, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.761360764503479, "rewards/margins": 0.1297047734260559, "rewards/rejected": 0.6316559910774231, "step": 2490 }, { "epoch": 0.7873705759615763, "grad_norm": 2.828125, "learning_rate": 4.686691224728652e-06, "logits/chosen": -0.5081063508987427, "logits/rejected": -0.3558959364891052, "logps/chosen": -215.3639373779297, "logps/rejected": -175.36788940429688, "loss": 0.6667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7302805781364441, "rewards/margins": 0.10294453799724579, "rewards/rejected": 0.6273361444473267, "step": 2500 }, { "epoch": 0.7905200582654226, "grad_norm": 3.0, "learning_rate": 4.684131412263098e-06, "logits/chosen": -0.35183241963386536, "logits/rejected": -0.3214506208896637, "logps/chosen": -177.6040496826172, "logps/rejected": -171.63784790039062, "loss": 0.6771, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.6120871901512146, "rewards/margins": 0.06991194188594818, "rewards/rejected": 0.5421752333641052, "step": 2510 }, { "epoch": 0.793669540569269, "grad_norm": 2.28125, "learning_rate": 4.681561890097525e-06, "logits/chosen": -0.4272761344909668, "logits/rejected": -0.35165831446647644, "logps/chosen": -195.2444305419922, "logps/rejected": -184.37887573242188, "loss": 0.6776, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.68397456407547, "rewards/margins": 0.0739150270819664, "rewards/rejected": 0.6100595593452454, "step": 2520 }, { "epoch": 0.7968190228731152, "grad_norm": 2.265625, "learning_rate": 4.678982669654912e-06, "logits/chosen": -0.423833429813385, "logits/rejected": -0.37776055932044983, "logps/chosen": -202.28225708007812, "logps/rejected": -188.2001190185547, "loss": 0.6442, "rewards/accuracies": 0.625, "rewards/chosen": 0.6994912624359131, "rewards/margins": 0.13222193717956543, "rewards/rejected": 0.5672692656517029, "step": 2530 }, { "epoch": 0.7999685051769615, "grad_norm": 2.8125, "learning_rate": 4.676393762401354e-06, "logits/chosen": -0.4404567778110504, "logits/rejected": -0.32071155309677124, "logps/chosen": -197.72959899902344, "logps/rejected": -161.87881469726562, "loss": 0.6561, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6411703824996948, "rewards/margins": 0.11388187110424042, "rewards/rejected": 0.527288556098938, "step": 2540 }, { "epoch": 0.8031179874808079, "grad_norm": 2.765625, "learning_rate": 4.673795179846008e-06, "logits/chosen": -0.4287866950035095, "logits/rejected": -0.33532360196113586, "logps/chosen": -207.0975341796875, "logps/rejected": -175.67420959472656, "loss": 0.6759, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.6876527667045593, "rewards/margins": 0.06763847172260284, "rewards/rejected": 0.6200142502784729, "step": 2550 }, { "epoch": 0.8062674697846541, "grad_norm": 2.90625, "learning_rate": 4.671186933541044e-06, "logits/chosen": -0.4286137521266937, "logits/rejected": -0.37905916571617126, "logps/chosen": -191.0441131591797, "logps/rejected": -179.77825927734375, "loss": 0.695, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.6264538168907166, "rewards/margins": 0.020585432648658752, "rewards/rejected": 0.6058684587478638, "step": 2560 }, { "epoch": 0.8094169520885005, "grad_norm": 2.96875, "learning_rate": 4.668569035081594e-06, "logits/chosen": -0.4553300440311432, "logits/rejected": -0.4031829833984375, "logps/chosen": -182.02127075195312, "logps/rejected": -171.6331787109375, "loss": 0.6973, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.6412879228591919, "rewards/margins": 0.04937012866139412, "rewards/rejected": 0.5919178128242493, "step": 2570 }, { "epoch": 0.8125664343923468, "grad_norm": 2.828125, "learning_rate": 4.665941496105697e-06, "logits/chosen": -0.5085742473602295, "logits/rejected": -0.3790132701396942, "logps/chosen": -187.68478393554688, "logps/rejected": -153.02972412109375, "loss": 0.6575, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6509996056556702, "rewards/margins": 0.11632029712200165, "rewards/rejected": 0.5346792936325073, "step": 2580 }, { "epoch": 0.815715916696193, "grad_norm": 2.9375, "learning_rate": 4.663304328294251e-06, "logits/chosen": -0.4928790032863617, "logits/rejected": -0.31242626905441284, "logps/chosen": -204.3889617919922, "logps/rejected": -167.4778594970703, "loss": 0.645, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6917664408683777, "rewards/margins": 0.1491299867630005, "rewards/rejected": 0.5426364541053772, "step": 2590 }, { "epoch": 0.8188653990000394, "grad_norm": 3.609375, "learning_rate": 4.660657543370958e-06, "logits/chosen": -0.4446497857570648, "logits/rejected": -0.35457903146743774, "logps/chosen": -181.77471923828125, "logps/rejected": -157.31527709960938, "loss": 0.6608, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6617356538772583, "rewards/margins": 0.09582933783531189, "rewards/rejected": 0.5659063458442688, "step": 2600 }, { "epoch": 0.8220148813038857, "grad_norm": 2.96875, "learning_rate": 4.658001153102276e-06, "logits/chosen": -0.44886890053749084, "logits/rejected": -0.43147921562194824, "logps/chosen": -186.99864196777344, "logps/rejected": -188.51278686523438, "loss": 0.6707, "rewards/accuracies": 0.625, "rewards/chosen": 0.6480501890182495, "rewards/margins": 0.08241890370845795, "rewards/rejected": 0.5656312108039856, "step": 2610 }, { "epoch": 0.8251643636077319, "grad_norm": 2.4375, "learning_rate": 4.655335169297363e-06, "logits/chosen": -0.4991195797920227, "logits/rejected": -0.3592751622200012, "logps/chosen": -185.76898193359375, "logps/rejected": -163.68087768554688, "loss": 0.6537, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6216106414794922, "rewards/margins": 0.11423293501138687, "rewards/rejected": 0.5073777437210083, "step": 2620 }, { "epoch": 0.8283138459115783, "grad_norm": 2.484375, "learning_rate": 4.652659603808024e-06, "logits/chosen": -0.46877604722976685, "logits/rejected": -0.3875640332698822, "logps/chosen": -194.6293182373047, "logps/rejected": -162.6995391845703, "loss": 0.6556, "rewards/accuracies": 0.625, "rewards/chosen": 0.6514769792556763, "rewards/margins": 0.10939432680606842, "rewards/rejected": 0.542082667350769, "step": 2630 }, { "epoch": 0.8314633282154246, "grad_norm": 3.0625, "learning_rate": 4.6499744685286626e-06, "logits/chosen": -0.4762224555015564, "logits/rejected": -0.3948310911655426, "logps/chosen": -176.2687530517578, "logps/rejected": -162.93235778808594, "loss": 0.678, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6167873740196228, "rewards/margins": 0.05972397327423096, "rewards/rejected": 0.5570634007453918, "step": 2640 }, { "epoch": 0.8346128105192709, "grad_norm": 2.4375, "learning_rate": 4.6472797753962255e-06, "logits/chosen": -0.46065980195999146, "logits/rejected": -0.4272928237915039, "logps/chosen": -203.83956909179688, "logps/rejected": -183.5655059814453, "loss": 0.6521, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7077460289001465, "rewards/margins": 0.13481923937797546, "rewards/rejected": 0.5729268193244934, "step": 2650 }, { "epoch": 0.8377622928231172, "grad_norm": 2.046875, "learning_rate": 4.6445755363901465e-06, "logits/chosen": -0.4757087826728821, "logits/rejected": -0.38740482926368713, "logps/chosen": -201.98423767089844, "logps/rejected": -168.28964233398438, "loss": 0.662, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6773720979690552, "rewards/margins": 0.10439164936542511, "rewards/rejected": 0.5729804039001465, "step": 2660 }, { "epoch": 0.8409117751269635, "grad_norm": 3.09375, "learning_rate": 4.641861763532299e-06, "logits/chosen": -0.4650692939758301, "logits/rejected": -0.3750719130039215, "logps/chosen": -202.7363739013672, "logps/rejected": -168.4525909423828, "loss": 0.6652, "rewards/accuracies": 0.625, "rewards/chosen": 0.6195183992385864, "rewards/margins": 0.08996371924877167, "rewards/rejected": 0.529554545879364, "step": 2670 }, { "epoch": 0.8440612574308098, "grad_norm": 3.0, "learning_rate": 4.639138468886939e-06, "logits/chosen": -0.49985313415527344, "logits/rejected": -0.4105320870876312, "logps/chosen": -200.1018524169922, "logps/rejected": -182.53408813476562, "loss": 0.6707, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.68363356590271, "rewards/margins": 0.08553630113601685, "rewards/rejected": 0.5980972051620483, "step": 2680 }, { "epoch": 0.8472107397346561, "grad_norm": 3.390625, "learning_rate": 4.636405664560652e-06, "logits/chosen": -0.5170512199401855, "logits/rejected": -0.4552704691886902, "logps/chosen": -205.1244354248047, "logps/rejected": -176.17617797851562, "loss": 0.6613, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.7613251805305481, "rewards/margins": 0.1176910400390625, "rewards/rejected": 0.6436341404914856, "step": 2690 }, { "epoch": 0.8503602220385024, "grad_norm": 2.65625, "learning_rate": 4.6336633627023e-06, "logits/chosen": -0.5023754835128784, "logits/rejected": -0.3591773808002472, "logps/chosen": -198.65286254882812, "logps/rejected": -173.5179443359375, "loss": 0.6609, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6720755696296692, "rewards/margins": 0.10540957748889923, "rewards/rejected": 0.5666660070419312, "step": 2700 }, { "epoch": 0.8535097043423487, "grad_norm": 2.53125, "learning_rate": 4.630911575502967e-06, "logits/chosen": -0.4533337652683258, "logits/rejected": -0.3400491774082184, "logps/chosen": -190.54476928710938, "logps/rejected": -167.88975524902344, "loss": 0.6551, "rewards/accuracies": 0.625, "rewards/chosen": 0.662422776222229, "rewards/margins": 0.11403118073940277, "rewards/rejected": 0.5483915209770203, "step": 2710 }, { "epoch": 0.856659186646195, "grad_norm": 2.90625, "learning_rate": 4.628150315195902e-06, "logits/chosen": -0.44884181022644043, "logits/rejected": -0.3520192801952362, "logps/chosen": -192.61509704589844, "logps/rejected": -169.34854125976562, "loss": 0.6865, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6516313552856445, "rewards/margins": 0.05064528435468674, "rewards/rejected": 0.6009860634803772, "step": 2720 }, { "epoch": 0.8598086689500414, "grad_norm": 2.84375, "learning_rate": 4.625379594056472e-06, "logits/chosen": -0.43172770738601685, "logits/rejected": -0.3771997094154358, "logps/chosen": -181.5345916748047, "logps/rejected": -166.65841674804688, "loss": 0.6848, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6033766865730286, "rewards/margins": 0.052743762731552124, "rewards/rejected": 0.5506329536437988, "step": 2730 }, { "epoch": 0.8629581512538876, "grad_norm": 2.25, "learning_rate": 4.6225994244020984e-06, "logits/chosen": -0.4917459487915039, "logits/rejected": -0.37200021743774414, "logps/chosen": -209.8813018798828, "logps/rejected": -180.4649658203125, "loss": 0.6435, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7169169187545776, "rewards/margins": 0.15931591391563416, "rewards/rejected": 0.5576010346412659, "step": 2740 }, { "epoch": 0.8661076335577339, "grad_norm": 2.953125, "learning_rate": 4.61980981859221e-06, "logits/chosen": -0.5261596441268921, "logits/rejected": -0.4045354425907135, "logps/chosen": -197.52328491210938, "logps/rejected": -175.49851989746094, "loss": 0.6605, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6951828598976135, "rewards/margins": 0.11784076690673828, "rewards/rejected": 0.5773420929908752, "step": 2750 }, { "epoch": 0.8692571158615803, "grad_norm": 3.234375, "learning_rate": 4.6170107890281826e-06, "logits/chosen": -0.4892495274543762, "logits/rejected": -0.3943815231323242, "logps/chosen": -214.80078125, "logps/rejected": -192.01844787597656, "loss": 0.6471, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7895753979682922, "rewards/margins": 0.12512117624282837, "rewards/rejected": 0.6644541025161743, "step": 2760 }, { "epoch": 0.8724065981654265, "grad_norm": 3.5625, "learning_rate": 4.614202348153285e-06, "logits/chosen": -0.506980299949646, "logits/rejected": -0.37280508875846863, "logps/chosen": -206.6781005859375, "logps/rejected": -187.54193115234375, "loss": 0.6726, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6904038190841675, "rewards/margins": 0.0823880210518837, "rewards/rejected": 0.6080158352851868, "step": 2770 }, { "epoch": 0.8755560804692729, "grad_norm": 2.421875, "learning_rate": 4.611384508452629e-06, "logits/chosen": -0.47033706307411194, "logits/rejected": -0.3309301435947418, "logps/chosen": -221.22421264648438, "logps/rejected": -190.77188110351562, "loss": 0.6516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.759658932685852, "rewards/margins": 0.12068722397089005, "rewards/rejected": 0.638971745967865, "step": 2780 }, { "epoch": 0.8787055627731192, "grad_norm": 2.796875, "learning_rate": 4.608557282453104e-06, "logits/chosen": -0.5095947980880737, "logits/rejected": -0.45932531356811523, "logps/chosen": -189.14663696289062, "logps/rejected": -171.67837524414062, "loss": 0.6528, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6966843008995056, "rewards/margins": 0.13209667801856995, "rewards/rejected": 0.5645877122879028, "step": 2790 }, { "epoch": 0.8818550450769654, "grad_norm": 2.390625, "learning_rate": 4.605720682723331e-06, "logits/chosen": -0.46834999322891235, "logits/rejected": -0.3549017608165741, "logps/chosen": -210.1326446533203, "logps/rejected": -179.3318634033203, "loss": 0.6451, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.761681318283081, "rewards/margins": 0.15264348685741425, "rewards/rejected": 0.6090378761291504, "step": 2800 }, { "epoch": 0.8850045273808118, "grad_norm": 2.140625, "learning_rate": 4.602874721873599e-06, "logits/chosen": -0.5375205874443054, "logits/rejected": -0.34866297245025635, "logps/chosen": -188.57321166992188, "logps/rejected": -155.67984008789062, "loss": 0.6321, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6381895542144775, "rewards/margins": 0.16713955998420715, "rewards/rejected": 0.47105008363723755, "step": 2810 }, { "epoch": 0.8881540096846581, "grad_norm": 3.390625, "learning_rate": 4.600019412555816e-06, "logits/chosen": -0.4357992112636566, "logits/rejected": -0.28150540590286255, "logps/chosen": -199.49620056152344, "logps/rejected": -153.14419555664062, "loss": 0.6476, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6509748101234436, "rewards/margins": 0.14667873084545135, "rewards/rejected": 0.5042960047721863, "step": 2820 }, { "epoch": 0.8913034919885043, "grad_norm": 3.1875, "learning_rate": 4.597154767463448e-06, "logits/chosen": -0.4881827235221863, "logits/rejected": -0.3464208245277405, "logps/chosen": -201.01864624023438, "logps/rejected": -183.83456420898438, "loss": 0.6627, "rewards/accuracies": 0.625, "rewards/chosen": 0.6923896074295044, "rewards/margins": 0.09767551720142365, "rewards/rejected": 0.5947140455245972, "step": 2830 }, { "epoch": 0.8944529742923507, "grad_norm": 2.203125, "learning_rate": 4.594280799331461e-06, "logits/chosen": -0.516327977180481, "logits/rejected": -0.4000956416130066, "logps/chosen": -177.13763427734375, "logps/rejected": -154.5478515625, "loss": 0.6463, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6087929010391235, "rewards/margins": 0.1329253911972046, "rewards/rejected": 0.47586750984191895, "step": 2840 }, { "epoch": 0.897602456596197, "grad_norm": 2.59375, "learning_rate": 4.591397520936271e-06, "logits/chosen": -0.4523008465766907, "logits/rejected": -0.32660526037216187, "logps/chosen": -190.58114624023438, "logps/rejected": -168.5622100830078, "loss": 0.6439, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6767908930778503, "rewards/margins": 0.1442452073097229, "rewards/rejected": 0.5325456857681274, "step": 2850 }, { "epoch": 0.9007519389000433, "grad_norm": 2.46875, "learning_rate": 4.588504945095684e-06, "logits/chosen": -0.4655417799949646, "logits/rejected": -0.3397763669490814, "logps/chosen": -184.3746337890625, "logps/rejected": -158.1368408203125, "loss": 0.6466, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6702816486358643, "rewards/margins": 0.1449626088142395, "rewards/rejected": 0.5253190398216248, "step": 2860 }, { "epoch": 0.9039014212038896, "grad_norm": 2.75, "learning_rate": 4.585603084668833e-06, "logits/chosen": -0.48204731941223145, "logits/rejected": -0.3869457542896271, "logps/chosen": -211.1409149169922, "logps/rejected": -177.21664428710938, "loss": 0.6278, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.693871796131134, "rewards/margins": 0.17502956092357635, "rewards/rejected": 0.5188421607017517, "step": 2870 }, { "epoch": 0.9070509035077359, "grad_norm": 3.1875, "learning_rate": 4.582691952556131e-06, "logits/chosen": -0.516753077507019, "logits/rejected": -0.3798757791519165, "logps/chosen": -185.08062744140625, "logps/rejected": -157.95521545410156, "loss": 0.6113, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.6912366151809692, "rewards/margins": 0.21181932091712952, "rewards/rejected": 0.4794173240661621, "step": 2880 }, { "epoch": 0.9102003858115822, "grad_norm": 2.375, "learning_rate": 4.579771561699208e-06, "logits/chosen": -0.5301073789596558, "logits/rejected": -0.37922126054763794, "logps/chosen": -200.29745483398438, "logps/rejected": -163.3575439453125, "loss": 0.6527, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.655502200126648, "rewards/margins": 0.1373116374015808, "rewards/rejected": 0.5181905627250671, "step": 2890 }, { "epoch": 0.9133498681154285, "grad_norm": 3.28125, "learning_rate": 4.576841925080853e-06, "logits/chosen": -0.3888477683067322, "logits/rejected": -0.32840999960899353, "logps/chosen": -185.23165893554688, "logps/rejected": -163.85369873046875, "loss": 0.6592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6471211314201355, "rewards/margins": 0.11747239530086517, "rewards/rejected": 0.5296487808227539, "step": 2900 }, { "epoch": 0.9164993504192749, "grad_norm": 2.234375, "learning_rate": 4.5739030557249595e-06, "logits/chosen": -0.5069789886474609, "logits/rejected": -0.3604618310928345, "logps/chosen": -196.76856994628906, "logps/rejected": -165.517822265625, "loss": 0.6391, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6826483607292175, "rewards/margins": 0.1527707278728485, "rewards/rejected": 0.5298776030540466, "step": 2910 }, { "epoch": 0.9196488327231211, "grad_norm": 2.375, "learning_rate": 4.570954966696464e-06, "logits/chosen": -0.43340712785720825, "logits/rejected": -0.3375625014305115, "logps/chosen": -195.20223999023438, "logps/rejected": -172.97694396972656, "loss": 0.6641, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6797399520874023, "rewards/margins": 0.09967435896396637, "rewards/rejected": 0.5800655484199524, "step": 2920 }, { "epoch": 0.9227983150269674, "grad_norm": 2.6875, "learning_rate": 4.56799767110129e-06, "logits/chosen": -0.4303937554359436, "logits/rejected": -0.38990846276283264, "logps/chosen": -192.70545959472656, "logps/rejected": -176.0408172607422, "loss": 0.6566, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6560736894607544, "rewards/margins": 0.10354320704936981, "rewards/rejected": 0.5525304079055786, "step": 2930 }, { "epoch": 0.9259477973308138, "grad_norm": 3.296875, "learning_rate": 4.565031182086291e-06, "logits/chosen": -0.50087571144104, "logits/rejected": -0.3773635923862457, "logps/chosen": -194.03750610351562, "logps/rejected": -171.54971313476562, "loss": 0.6346, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7052168846130371, "rewards/margins": 0.16069751977920532, "rewards/rejected": 0.544519305229187, "step": 2940 }, { "epoch": 0.92909727963466, "grad_norm": 2.765625, "learning_rate": 4.562055512839189e-06, "logits/chosen": -0.46178698539733887, "logits/rejected": -0.37201589345932007, "logps/chosen": -205.4782257080078, "logps/rejected": -171.72808837890625, "loss": 0.6217, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7175599336624146, "rewards/margins": 0.19239577651023865, "rewards/rejected": 0.5251641273498535, "step": 2950 }, { "epoch": 0.9322467619385063, "grad_norm": 2.484375, "learning_rate": 4.559070676588516e-06, "logits/chosen": -0.4568074643611908, "logits/rejected": -0.37853866815567017, "logps/chosen": -214.366455078125, "logps/rejected": -184.67774963378906, "loss": 0.6514, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7769818305969238, "rewards/margins": 0.1274883896112442, "rewards/rejected": 0.6494934558868408, "step": 2960 }, { "epoch": 0.9353962442423527, "grad_norm": 3.078125, "learning_rate": 4.55607668660356e-06, "logits/chosen": -0.48704949021339417, "logits/rejected": -0.4139239192008972, "logps/chosen": -193.26736450195312, "logps/rejected": -175.77593994140625, "loss": 0.6544, "rewards/accuracies": 0.625, "rewards/chosen": 0.6955569982528687, "rewards/margins": 0.11222921311855316, "rewards/rejected": 0.5833277702331543, "step": 2970 }, { "epoch": 0.9385457265461989, "grad_norm": 2.546875, "learning_rate": 4.5530735561943e-06, "logits/chosen": -0.4977632462978363, "logits/rejected": -0.3273804187774658, "logps/chosen": -193.0499725341797, "logps/rejected": -158.284423828125, "loss": 0.6462, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6608245968818665, "rewards/margins": 0.13379625976085663, "rewards/rejected": 0.5270283222198486, "step": 2980 }, { "epoch": 0.9416952088500453, "grad_norm": 2.640625, "learning_rate": 4.55006129871135e-06, "logits/chosen": -0.5545817017555237, "logits/rejected": -0.43782949447631836, "logps/chosen": -191.93934631347656, "logps/rejected": -167.23953247070312, "loss": 0.6625, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6882120370864868, "rewards/margins": 0.1209985762834549, "rewards/rejected": 0.5672134160995483, "step": 2990 }, { "epoch": 0.9448446911538916, "grad_norm": 2.859375, "learning_rate": 4.547039927545899e-06, "logits/chosen": -0.4707905650138855, "logits/rejected": -0.3295817971229553, "logps/chosen": -186.739501953125, "logps/rejected": -162.34422302246094, "loss": 0.6588, "rewards/accuracies": 0.625, "rewards/chosen": 0.579537034034729, "rewards/margins": 0.0972566232085228, "rewards/rejected": 0.48228034377098083, "step": 3000 }, { "epoch": 0.9479941734577378, "grad_norm": 2.578125, "learning_rate": 4.544009456129651e-06, "logits/chosen": -0.40460291504859924, "logits/rejected": -0.36005955934524536, "logps/chosen": -191.5205841064453, "logps/rejected": -164.66702270507812, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.615665078163147, "rewards/margins": 0.033144764602184296, "rewards/rejected": 0.5825203061103821, "step": 3010 }, { "epoch": 0.9511436557615842, "grad_norm": 3.125, "learning_rate": 4.540969897934767e-06, "logits/chosen": -0.4697691798210144, "logits/rejected": -0.3581236004829407, "logps/chosen": -205.1107940673828, "logps/rejected": -161.09967041015625, "loss": 0.6513, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6768245697021484, "rewards/margins": 0.13728924095630646, "rewards/rejected": 0.5395353436470032, "step": 3020 }, { "epoch": 0.9542931380654305, "grad_norm": 3.3125, "learning_rate": 4.537921266473802e-06, "logits/chosen": -0.4509238302707672, "logits/rejected": -0.41182202100753784, "logps/chosen": -198.67684936523438, "logps/rejected": -188.21649169921875, "loss": 0.7044, "rewards/accuracies": 0.5625, "rewards/chosen": 0.7039377093315125, "rewards/margins": 0.03385835513472557, "rewards/rejected": 0.6700793504714966, "step": 3030 }, { "epoch": 0.9574426203692769, "grad_norm": 3.3125, "learning_rate": 4.53486357529965e-06, "logits/chosen": -0.48828214406967163, "logits/rejected": -0.3500337302684784, "logps/chosen": -187.39865112304688, "logps/rejected": -156.00173950195312, "loss": 0.64, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6695148944854736, "rewards/margins": 0.1509798765182495, "rewards/rejected": 0.5185350179672241, "step": 3040 }, { "epoch": 0.9605921026731231, "grad_norm": 2.40625, "learning_rate": 4.531796838005477e-06, "logits/chosen": -0.5069050192832947, "logits/rejected": -0.3991912305355072, "logps/chosen": -197.77066040039062, "logps/rejected": -163.296875, "loss": 0.6187, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7524086236953735, "rewards/margins": 0.1966829001903534, "rewards/rejected": 0.5557257533073425, "step": 3050 }, { "epoch": 0.9637415849769694, "grad_norm": 2.5, "learning_rate": 4.5287210682246655e-06, "logits/chosen": -0.42384999990463257, "logits/rejected": -0.3603217601776123, "logps/chosen": -183.27471923828125, "logps/rejected": -170.30142211914062, "loss": 0.6832, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.5444117784500122, "rewards/margins": 0.05068939924240112, "rewards/rejected": 0.49372243881225586, "step": 3060 }, { "epoch": 0.9668910672808158, "grad_norm": 6.25, "learning_rate": 4.525636279630752e-06, "logits/chosen": -0.4649208188056946, "logits/rejected": -0.3279884159564972, "logps/chosen": -204.52191162109375, "logps/rejected": -171.454833984375, "loss": 0.6164, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7678558826446533, "rewards/margins": 0.21700289845466614, "rewards/rejected": 0.5508529543876648, "step": 3070 }, { "epoch": 0.970040549584662, "grad_norm": 2.65625, "learning_rate": 4.522542485937369e-06, "logits/chosen": -0.48648127913475037, "logits/rejected": -0.3391122817993164, "logps/chosen": -191.50942993164062, "logps/rejected": -159.47317504882812, "loss": 0.6609, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6665128469467163, "rewards/margins": 0.10518161207437515, "rewards/rejected": 0.5613312721252441, "step": 3080 }, { "epoch": 0.9731900318885083, "grad_norm": 2.796875, "learning_rate": 4.519439700898179e-06, "logits/chosen": -0.5188948512077332, "logits/rejected": -0.4177464544773102, "logps/chosen": -205.6922607421875, "logps/rejected": -177.05718994140625, "loss": 0.6589, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7047790288925171, "rewards/margins": 0.11938399076461792, "rewards/rejected": 0.5853949785232544, "step": 3090 }, { "epoch": 0.9763395141923547, "grad_norm": 3.078125, "learning_rate": 4.516327938306818e-06, "logits/chosen": -0.49799761176109314, "logits/rejected": -0.3481315076351166, "logps/chosen": -192.59019470214844, "logps/rejected": -155.11587524414062, "loss": 0.6338, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6473811268806458, "rewards/margins": 0.15366099774837494, "rewards/rejected": 0.4937201142311096, "step": 3100 }, { "epoch": 0.9794889964962009, "grad_norm": 2.671875, "learning_rate": 4.513207211996831e-06, "logits/chosen": -0.43314170837402344, "logits/rejected": -0.3531089425086975, "logps/chosen": -195.55174255371094, "logps/rejected": -162.95797729492188, "loss": 0.6499, "rewards/accuracies": 0.625, "rewards/chosen": 0.6635746955871582, "rewards/margins": 0.1176295131444931, "rewards/rejected": 0.5459452271461487, "step": 3110 }, { "epoch": 0.9826384788000473, "grad_norm": 3.015625, "learning_rate": 4.510077535841612e-06, "logits/chosen": -0.4261040687561035, "logits/rejected": -0.3906673192977905, "logps/chosen": -192.201416015625, "logps/rejected": -177.6776123046875, "loss": 0.6474, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7501270771026611, "rewards/margins": 0.1394282579421997, "rewards/rejected": 0.6106988191604614, "step": 3120 }, { "epoch": 0.9857879611038936, "grad_norm": 4.0625, "learning_rate": 4.506938923754342e-06, "logits/chosen": -0.47423991560935974, "logits/rejected": -0.36756032705307007, "logps/chosen": -209.3015594482422, "logps/rejected": -172.35110473632812, "loss": 0.6551, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.6946024894714355, "rewards/margins": 0.12966330349445343, "rewards/rejected": 0.5649392008781433, "step": 3130 }, { "epoch": 0.9889374434077398, "grad_norm": 2.84375, "learning_rate": 4.50379138968793e-06, "logits/chosen": -0.4364834725856781, "logits/rejected": -0.28871777653694153, "logps/chosen": -206.86984252929688, "logps/rejected": -161.70379638671875, "loss": 0.6198, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7661986351013184, "rewards/margins": 0.20797491073608398, "rewards/rejected": 0.5582237839698792, "step": 3140 }, { "epoch": 0.9920869257115862, "grad_norm": 3.046875, "learning_rate": 4.500634947634943e-06, "logits/chosen": -0.4126051962375641, "logits/rejected": -0.3056962192058563, "logps/chosen": -204.4892578125, "logps/rejected": -179.78683471679688, "loss": 0.6801, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.730519711971283, "rewards/margins": 0.09220778197050095, "rewards/rejected": 0.6383119821548462, "step": 3150 }, { "epoch": 0.9952364080154324, "grad_norm": 2.421875, "learning_rate": 4.497469611627554e-06, "logits/chosen": -0.4934718608856201, "logits/rejected": -0.3333396911621094, "logps/chosen": -200.66720581054688, "logps/rejected": -161.30099487304688, "loss": 0.6013, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7085192203521729, "rewards/margins": 0.23979604244232178, "rewards/rejected": 0.46872320771217346, "step": 3160 }, { "epoch": 0.9983858903192788, "grad_norm": 3.453125, "learning_rate": 4.4942953957374724e-06, "logits/chosen": -0.5716500878334045, "logits/rejected": -0.4639313220977783, "logps/chosen": -209.87631225585938, "logps/rejected": -187.9324493408203, "loss": 0.6631, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7570088505744934, "rewards/margins": 0.10784139484167099, "rewards/rejected": 0.6491674184799194, "step": 3170 }, { "epoch": 1.001535372623125, "grad_norm": 2.984375, "learning_rate": 4.491112314075883e-06, "logits/chosen": -0.4782622456550598, "logits/rejected": -0.406808078289032, "logps/chosen": -200.47714233398438, "logps/rejected": -190.09597778320312, "loss": 0.6675, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7260812520980835, "rewards/margins": 0.08043357729911804, "rewards/rejected": 0.6456476449966431, "step": 3180 }, { "epoch": 1.0046848549269713, "grad_norm": 2.984375, "learning_rate": 4.487920380793386e-06, "logits/chosen": -0.5357908010482788, "logits/rejected": -0.40378838777542114, "logps/chosen": -195.0863494873047, "logps/rejected": -157.54249572753906, "loss": 0.6263, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7121897339820862, "rewards/margins": 0.17122754454612732, "rewards/rejected": 0.5409621596336365, "step": 3190 }, { "epoch": 1.0078343372308176, "grad_norm": 2.8125, "learning_rate": 4.4847196100799305e-06, "logits/chosen": -0.5046489834785461, "logits/rejected": -0.38591229915618896, "logps/chosen": -206.5364227294922, "logps/rejected": -177.38302612304688, "loss": 0.6648, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.698761522769928, "rewards/margins": 0.10818042606115341, "rewards/rejected": 0.5905810594558716, "step": 3200 }, { "epoch": 1.010983819534664, "grad_norm": 2.84375, "learning_rate": 4.481510016164753e-06, "logits/chosen": -0.4204939007759094, "logits/rejected": -0.32506507635116577, "logps/chosen": -186.22238159179688, "logps/rejected": -157.1585235595703, "loss": 0.6338, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7055295705795288, "rewards/margins": 0.16552086174488068, "rewards/rejected": 0.5400087237358093, "step": 3210 }, { "epoch": 1.0141333018385104, "grad_norm": 3.28125, "learning_rate": 4.478291613316316e-06, "logits/chosen": -0.526237428188324, "logits/rejected": -0.5045801997184753, "logps/chosen": -179.8779754638672, "logps/rejected": -179.01283264160156, "loss": 0.6786, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.6495916247367859, "rewards/margins": 0.07142230123281479, "rewards/rejected": 0.5781692862510681, "step": 3220 }, { "epoch": 1.0172827841423566, "grad_norm": 2.4375, "learning_rate": 4.47506441584224e-06, "logits/chosen": -0.47785645723342896, "logits/rejected": -0.37066784501075745, "logps/chosen": -176.42306518554688, "logps/rejected": -158.9258575439453, "loss": 0.6722, "rewards/accuracies": 0.625, "rewards/chosen": 0.6187280416488647, "rewards/margins": 0.08662554621696472, "rewards/rejected": 0.5321024656295776, "step": 3230 }, { "epoch": 1.0204322664462029, "grad_norm": 2.96875, "learning_rate": 4.471828438089245e-06, "logits/chosen": -0.5045980215072632, "logits/rejected": -0.3146094083786011, "logps/chosen": -192.99447631835938, "logps/rejected": -160.2574462890625, "loss": 0.6403, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6895080208778381, "rewards/margins": 0.1523585319519043, "rewards/rejected": 0.5371494293212891, "step": 3240 }, { "epoch": 1.0235817487500491, "grad_norm": 2.609375, "learning_rate": 4.4685836944430815e-06, "logits/chosen": -0.45219331979751587, "logits/rejected": -0.3365115821361542, "logps/chosen": -212.85543823242188, "logps/rejected": -181.6865692138672, "loss": 0.6293, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7488403916358948, "rewards/margins": 0.16951295733451843, "rewards/rejected": 0.579327404499054, "step": 3250 }, { "epoch": 1.0267312310538954, "grad_norm": 3.046875, "learning_rate": 4.465330199328473e-06, "logits/chosen": -0.46741557121276855, "logits/rejected": -0.4280362129211426, "logps/chosen": -201.90011596679688, "logps/rejected": -174.54420471191406, "loss": 0.6809, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.692274808883667, "rewards/margins": 0.0669262483716011, "rewards/rejected": 0.625348687171936, "step": 3260 }, { "epoch": 1.029880713357742, "grad_norm": 2.53125, "learning_rate": 4.462067967209045e-06, "logits/chosen": -0.46599286794662476, "logits/rejected": -0.40415382385253906, "logps/chosen": -191.61109924316406, "logps/rejected": -170.8994140625, "loss": 0.637, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.694964587688446, "rewards/margins": 0.15309689939022064, "rewards/rejected": 0.541867733001709, "step": 3270 }, { "epoch": 1.0330301956615882, "grad_norm": 2.46875, "learning_rate": 4.458797012587266e-06, "logits/chosen": -0.45959001779556274, "logits/rejected": -0.3480526804924011, "logps/chosen": -194.1427764892578, "logps/rejected": -164.58615112304688, "loss": 0.6511, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6329681277275085, "rewards/margins": 0.11931748688220978, "rewards/rejected": 0.51365065574646, "step": 3280 }, { "epoch": 1.0361796779654344, "grad_norm": 2.5, "learning_rate": 4.455517350004379e-06, "logits/chosen": -0.4861987233161926, "logits/rejected": -0.3476967513561249, "logps/chosen": -218.5071563720703, "logps/rejected": -177.85232543945312, "loss": 0.6175, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.8144248723983765, "rewards/margins": 0.19891203939914703, "rewards/rejected": 0.6155128479003906, "step": 3290 }, { "epoch": 1.0393291602692807, "grad_norm": 2.078125, "learning_rate": 4.452228994040341e-06, "logits/chosen": -0.4951728284358978, "logits/rejected": -0.3645640015602112, "logps/chosen": -196.14259338378906, "logps/rejected": -167.96095275878906, "loss": 0.6494, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7171255946159363, "rewards/margins": 0.12575490772724152, "rewards/rejected": 0.591370701789856, "step": 3300 }, { "epoch": 1.042478642573127, "grad_norm": 2.21875, "learning_rate": 4.448931959313754e-06, "logits/chosen": -0.4978647232055664, "logits/rejected": -0.3358718454837799, "logps/chosen": -200.82778930664062, "logps/rejected": -170.21578979492188, "loss": 0.6559, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6936062574386597, "rewards/margins": 0.12191645056009293, "rewards/rejected": 0.5716897249221802, "step": 3310 }, { "epoch": 1.0456281248769734, "grad_norm": 3.0625, "learning_rate": 4.4456262604818044e-06, "logits/chosen": -0.5448322892189026, "logits/rejected": -0.4078589379787445, "logps/chosen": -202.68072509765625, "logps/rejected": -172.15322875976562, "loss": 0.65, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6884664297103882, "rewards/margins": 0.12657414376735687, "rewards/rejected": 0.5618923306465149, "step": 3320 }, { "epoch": 1.0487776071808197, "grad_norm": 4.0, "learning_rate": 4.442311912240194e-06, "logits/chosen": -0.46120721101760864, "logits/rejected": -0.31904077529907227, "logps/chosen": -204.01699829101562, "logps/rejected": -172.23001098632812, "loss": 0.6343, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6871546506881714, "rewards/margins": 0.15920737385749817, "rewards/rejected": 0.5279473066329956, "step": 3330 }, { "epoch": 1.051927089484666, "grad_norm": 4.0625, "learning_rate": 4.438988929323075e-06, "logits/chosen": -0.4875815808773041, "logits/rejected": -0.37348097562789917, "logps/chosen": -197.5282745361328, "logps/rejected": -167.55035400390625, "loss": 0.6422, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7295669317245483, "rewards/margins": 0.15396803617477417, "rewards/rejected": 0.5755988955497742, "step": 3340 }, { "epoch": 1.0550765717885122, "grad_norm": 2.984375, "learning_rate": 4.435657326502986e-06, "logits/chosen": -0.49304842948913574, "logits/rejected": -0.3635968565940857, "logps/chosen": -194.97134399414062, "logps/rejected": -163.18759155273438, "loss": 0.615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7523232698440552, "rewards/margins": 0.2145862579345703, "rewards/rejected": 0.5377371311187744, "step": 3350 }, { "epoch": 1.0582260540923585, "grad_norm": 2.546875, "learning_rate": 4.432317118590789e-06, "logits/chosen": -0.48891234397888184, "logits/rejected": -0.3675716519355774, "logps/chosen": -206.11074829101562, "logps/rejected": -187.20938110351562, "loss": 0.6612, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6792635917663574, "rewards/margins": 0.10087893903255463, "rewards/rejected": 0.5783846974372864, "step": 3360 }, { "epoch": 1.061375536396205, "grad_norm": 2.546875, "learning_rate": 4.428968320435597e-06, "logits/chosen": -0.44766363501548767, "logits/rejected": -0.442352294921875, "logps/chosen": -195.1834716796875, "logps/rejected": -194.13803100585938, "loss": 0.6908, "rewards/accuracies": 0.5625, "rewards/chosen": 0.7168688178062439, "rewards/margins": 0.04094035550951958, "rewards/rejected": 0.6759284734725952, "step": 3370 }, { "epoch": 1.0645250187000512, "grad_norm": 2.65625, "learning_rate": 4.425610946924714e-06, "logits/chosen": -0.5080258250236511, "logits/rejected": -0.3179413378238678, "logps/chosen": -209.2963409423828, "logps/rejected": -164.95855712890625, "loss": 0.6388, "rewards/accuracies": 0.625, "rewards/chosen": 0.693896472454071, "rewards/margins": 0.1626008152961731, "rewards/rejected": 0.5312955975532532, "step": 3380 }, { "epoch": 1.0676745010038975, "grad_norm": 2.6875, "learning_rate": 4.422245012983563e-06, "logits/chosen": -0.43999892473220825, "logits/rejected": -0.34846392273902893, "logps/chosen": -202.3621826171875, "logps/rejected": -172.74575805664062, "loss": 0.656, "rewards/accuracies": 0.5625, "rewards/chosen": 0.7476739287376404, "rewards/margins": 0.1216663271188736, "rewards/rejected": 0.6260076761245728, "step": 3390 }, { "epoch": 1.0708239833077438, "grad_norm": 2.875, "learning_rate": 4.418870533575626e-06, "logits/chosen": -0.4500795006752014, "logits/rejected": -0.35547685623168945, "logps/chosen": -194.93258666992188, "logps/rejected": -169.08042907714844, "loss": 0.6452, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6677473783493042, "rewards/margins": 0.13091039657592773, "rewards/rejected": 0.5368369221687317, "step": 3400 }, { "epoch": 1.07397346561159, "grad_norm": 2.21875, "learning_rate": 4.4154875237023725e-06, "logits/chosen": -0.5026477575302124, "logits/rejected": -0.42180952429771423, "logps/chosen": -190.4978790283203, "logps/rejected": -170.85133361816406, "loss": 0.6539, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6496149301528931, "rewards/margins": 0.1192195862531662, "rewards/rejected": 0.5303953289985657, "step": 3410 }, { "epoch": 1.0771229479154365, "grad_norm": 2.546875, "learning_rate": 4.412095998403198e-06, "logits/chosen": -0.467219740152359, "logits/rejected": -0.36950141191482544, "logps/chosen": -184.17282104492188, "logps/rejected": -159.8118133544922, "loss": 0.6296, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6559081673622131, "rewards/margins": 0.16591888666152954, "rewards/rejected": 0.4899892210960388, "step": 3420 }, { "epoch": 1.0802724302192828, "grad_norm": 2.0, "learning_rate": 4.4086959727553484e-06, "logits/chosen": -0.43744510412216187, "logits/rejected": -0.4274655282497406, "logps/chosen": -184.89468383789062, "logps/rejected": -174.7428436279297, "loss": 0.6674, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6194896101951599, "rewards/margins": 0.08610032498836517, "rewards/rejected": 0.5333893299102783, "step": 3430 }, { "epoch": 1.083421912523129, "grad_norm": 3.078125, "learning_rate": 4.4052874618738645e-06, "logits/chosen": -0.42463669180870056, "logits/rejected": -0.32110992074012756, "logps/chosen": -198.18606567382812, "logps/rejected": -166.31546020507812, "loss": 0.6371, "rewards/accuracies": 0.625, "rewards/chosen": 0.7485095262527466, "rewards/margins": 0.16625897586345673, "rewards/rejected": 0.5822504758834839, "step": 3440 }, { "epoch": 1.0865713948269753, "grad_norm": 2.671875, "learning_rate": 4.401870480911505e-06, "logits/chosen": -0.5295459032058716, "logits/rejected": -0.3508453369140625, "logps/chosen": -202.53451538085938, "logps/rejected": -164.10360717773438, "loss": 0.6324, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7244966626167297, "rewards/margins": 0.17259523272514343, "rewards/rejected": 0.5519014000892639, "step": 3450 }, { "epoch": 1.0897208771308216, "grad_norm": 2.171875, "learning_rate": 4.398445045058682e-06, "logits/chosen": -0.39355480670928955, "logits/rejected": -0.3064250349998474, "logps/chosen": -198.85391235351562, "logps/rejected": -193.22476196289062, "loss": 0.6711, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7315341234207153, "rewards/margins": 0.08587940782308578, "rewards/rejected": 0.6456546783447266, "step": 3460 }, { "epoch": 1.0928703594346678, "grad_norm": 2.640625, "learning_rate": 4.395011169543398e-06, "logits/chosen": -0.5193358659744263, "logits/rejected": -0.43463462591171265, "logps/chosen": -221.0367431640625, "logps/rejected": -189.76303100585938, "loss": 0.6429, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.7745602130889893, "rewards/margins": 0.15965554118156433, "rewards/rejected": 0.6149047017097473, "step": 3470 }, { "epoch": 1.0960198417385143, "grad_norm": 2.5, "learning_rate": 4.3915688696311734e-06, "logits/chosen": -0.47026944160461426, "logits/rejected": -0.3491131067276001, "logps/chosen": -193.53817749023438, "logps/rejected": -180.67047119140625, "loss": 0.6465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7157790660858154, "rewards/margins": 0.1319868266582489, "rewards/rejected": 0.5837923288345337, "step": 3480 }, { "epoch": 1.0991693240423606, "grad_norm": 2.78125, "learning_rate": 4.3881181606249775e-06, "logits/chosen": -0.4417082369327545, "logits/rejected": -0.33755919337272644, "logps/chosen": -205.1912841796875, "logps/rejected": -168.96607971191406, "loss": 0.6527, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7606759071350098, "rewards/margins": 0.125040203332901, "rewards/rejected": 0.6356357336044312, "step": 3490 }, { "epoch": 1.1023188063462068, "grad_norm": 3.5, "learning_rate": 4.384659057865165e-06, "logits/chosen": -0.4954513609409332, "logits/rejected": -0.41869044303894043, "logps/chosen": -207.7197265625, "logps/rejected": -187.3145751953125, "loss": 0.6409, "rewards/accuracies": 0.625, "rewards/chosen": 0.7566835880279541, "rewards/margins": 0.13307683169841766, "rewards/rejected": 0.6236067414283752, "step": 3500 }, { "epoch": 1.105468288650053, "grad_norm": 3.09375, "learning_rate": 4.381191576729404e-06, "logits/chosen": -0.465837299823761, "logits/rejected": -0.3935115933418274, "logps/chosen": -189.66842651367188, "logps/rejected": -181.11953735351562, "loss": 0.67, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6702840328216553, "rewards/margins": 0.08853673934936523, "rewards/rejected": 0.58174729347229, "step": 3510 }, { "epoch": 1.1086177709538994, "grad_norm": 2.546875, "learning_rate": 4.377715732632613e-06, "logits/chosen": -0.4160510003566742, "logits/rejected": -0.34267672896385193, "logps/chosen": -190.46127319335938, "logps/rejected": -164.63645935058594, "loss": 0.6486, "rewards/accuracies": 0.625, "rewards/chosen": 0.7083535194396973, "rewards/margins": 0.13883116841316223, "rewards/rejected": 0.5695223808288574, "step": 3520 }, { "epoch": 1.1117672532577458, "grad_norm": 2.90625, "learning_rate": 4.374231541026883e-06, "logits/chosen": -0.5327466130256653, "logits/rejected": -0.38102999329566956, "logps/chosen": -208.6250457763672, "logps/rejected": -187.94937133789062, "loss": 0.6461, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7454079985618591, "rewards/margins": 0.1365070343017578, "rewards/rejected": 0.6089010238647461, "step": 3530 }, { "epoch": 1.114916735561592, "grad_norm": 3.703125, "learning_rate": 4.370739017401417e-06, "logits/chosen": -0.5027688145637512, "logits/rejected": -0.4032517373561859, "logps/chosen": -195.99122619628906, "logps/rejected": -161.76515197753906, "loss": 0.6612, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7000411152839661, "rewards/margins": 0.11141884326934814, "rewards/rejected": 0.5886222720146179, "step": 3540 }, { "epoch": 1.1180662178654384, "grad_norm": 4.1875, "learning_rate": 4.367238177282462e-06, "logits/chosen": -0.4465219974517822, "logits/rejected": -0.3524473309516907, "logps/chosen": -191.51556396484375, "logps/rejected": -171.40370178222656, "loss": 0.6437, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6995719075202942, "rewards/margins": 0.14049354195594788, "rewards/rejected": 0.5590783953666687, "step": 3550 }, { "epoch": 1.1212157001692846, "grad_norm": 2.53125, "learning_rate": 4.363729036233231e-06, "logits/chosen": -0.440678209066391, "logits/rejected": -0.35682061314582825, "logps/chosen": -233.2520751953125, "logps/rejected": -197.86093139648438, "loss": 0.6151, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7932190895080566, "rewards/margins": 0.20204809308052063, "rewards/rejected": 0.5911709666252136, "step": 3560 }, { "epoch": 1.1243651824731309, "grad_norm": 2.90625, "learning_rate": 4.360211609853841e-06, "logits/chosen": -0.46588149666786194, "logits/rejected": -0.3290489614009857, "logps/chosen": -189.09005737304688, "logps/rejected": -161.47216796875, "loss": 0.6275, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7199033498764038, "rewards/margins": 0.18104612827301025, "rewards/rejected": 0.5388572216033936, "step": 3570 }, { "epoch": 1.1275146647769774, "grad_norm": 3.53125, "learning_rate": 4.356685913781243e-06, "logits/chosen": -0.4571777284145355, "logits/rejected": -0.2775833010673523, "logps/chosen": -212.7926788330078, "logps/rejected": -178.75543212890625, "loss": 0.6815, "rewards/accuracies": 0.5625, "rewards/chosen": 0.7048720717430115, "rewards/margins": 0.06307940185070038, "rewards/rejected": 0.6417926549911499, "step": 3580 }, { "epoch": 1.1306641470808236, "grad_norm": 1.96875, "learning_rate": 4.353151963689153e-06, "logits/chosen": -0.4465222954750061, "logits/rejected": -0.33635538816452026, "logps/chosen": -193.8217315673828, "logps/rejected": -170.3684844970703, "loss": 0.656, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6877092719078064, "rewards/margins": 0.12182395160198212, "rewards/rejected": 0.5658854246139526, "step": 3590 }, { "epoch": 1.13381362938467, "grad_norm": 2.4375, "learning_rate": 4.349609775287977e-06, "logits/chosen": -0.45926880836486816, "logits/rejected": -0.3848228454589844, "logps/chosen": -188.42308044433594, "logps/rejected": -178.51895141601562, "loss": 0.6831, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.625271201133728, "rewards/margins": 0.08226005733013153, "rewards/rejected": 0.5430110692977905, "step": 3600 }, { "epoch": 1.1369631116885162, "grad_norm": 2.84375, "learning_rate": 4.346059364324747e-06, "logits/chosen": -0.5314878821372986, "logits/rejected": -0.4037748873233795, "logps/chosen": -198.21347045898438, "logps/rejected": -172.4250946044922, "loss": 0.6247, "rewards/accuracies": 0.625, "rewards/chosen": 0.7435242533683777, "rewards/margins": 0.1861993968486786, "rewards/rejected": 0.5573248267173767, "step": 3610 }, { "epoch": 1.1401125939923624, "grad_norm": 3.0, "learning_rate": 4.342500746583049e-06, "logits/chosen": -0.4643464982509613, "logits/rejected": -0.39244017004966736, "logps/chosen": -199.8033447265625, "logps/rejected": -178.8501739501953, "loss": 0.6568, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.7149983644485474, "rewards/margins": 0.11290229856967926, "rewards/rejected": 0.6020959615707397, "step": 3620 }, { "epoch": 1.143262076296209, "grad_norm": 2.171875, "learning_rate": 4.338933937882952e-06, "logits/chosen": -0.48191675543785095, "logits/rejected": -0.34128910303115845, "logps/chosen": -192.5113983154297, "logps/rejected": -171.77792358398438, "loss": 0.6556, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7115253210067749, "rewards/margins": 0.11348304897546768, "rewards/rejected": 0.5980421900749207, "step": 3630 }, { "epoch": 1.1464115586000552, "grad_norm": 3.046875, "learning_rate": 4.335358954080939e-06, "logits/chosen": -0.5026925802230835, "logits/rejected": -0.3849731683731079, "logps/chosen": -195.01315307617188, "logps/rejected": -153.3354949951172, "loss": 0.6349, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.664111316204071, "rewards/margins": 0.1614895761013031, "rewards/rejected": 0.5026217699050903, "step": 3640 }, { "epoch": 1.1495610409039014, "grad_norm": 2.421875, "learning_rate": 4.331775811069837e-06, "logits/chosen": -0.3999708294868469, "logits/rejected": -0.3556813895702362, "logps/chosen": -193.3236083984375, "logps/rejected": -170.455810546875, "loss": 0.6737, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.6599856019020081, "rewards/margins": 0.08706656098365784, "rewards/rejected": 0.5729190111160278, "step": 3650 }, { "epoch": 1.1527105232077477, "grad_norm": 3.34375, "learning_rate": 4.328184524778743e-06, "logits/chosen": -0.5279411673545837, "logits/rejected": -0.4057609438896179, "logps/chosen": -195.410400390625, "logps/rejected": -158.7968292236328, "loss": 0.6285, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6674174070358276, "rewards/margins": 0.166324645280838, "rewards/rejected": 0.501092791557312, "step": 3660 }, { "epoch": 1.155860005511594, "grad_norm": 3.28125, "learning_rate": 4.324585111172959e-06, "logits/chosen": -0.46349841356277466, "logits/rejected": -0.3997943699359894, "logps/chosen": -198.0673370361328, "logps/rejected": -179.5006103515625, "loss": 0.6457, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7249336242675781, "rewards/margins": 0.15420496463775635, "rewards/rejected": 0.5707286596298218, "step": 3670 }, { "epoch": 1.1590094878154402, "grad_norm": 2.0, "learning_rate": 4.320977586253911e-06, "logits/chosen": -0.4722752571105957, "logits/rejected": -0.3240264058113098, "logps/chosen": -202.7415771484375, "logps/rejected": -171.6900177001953, "loss": 0.6431, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.669415295124054, "rewards/margins": 0.1358814686536789, "rewards/rejected": 0.5335337519645691, "step": 3680 }, { "epoch": 1.1621589701192867, "grad_norm": 2.4375, "learning_rate": 4.317361966059092e-06, "logits/chosen": -0.4404812753200531, "logits/rejected": -0.32088351249694824, "logps/chosen": -207.16323852539062, "logps/rejected": -176.3754119873047, "loss": 0.6309, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6894677877426147, "rewards/margins": 0.16316859424114227, "rewards/rejected": 0.5262991786003113, "step": 3690 }, { "epoch": 1.165308452423133, "grad_norm": 6.875, "learning_rate": 4.313738266661979e-06, "logits/chosen": -0.4920511841773987, "logits/rejected": -0.34299007058143616, "logps/chosen": -201.88381958007812, "logps/rejected": -172.03121948242188, "loss": 0.6224, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7374047636985779, "rewards/margins": 0.18349048495292664, "rewards/rejected": 0.5539143681526184, "step": 3700 }, { "epoch": 1.1684579347269792, "grad_norm": 3.0, "learning_rate": 4.310106504171966e-06, "logits/chosen": -0.45534056425094604, "logits/rejected": -0.35874494910240173, "logps/chosen": -190.65248107910156, "logps/rejected": -158.1346893310547, "loss": 0.6446, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6649999618530273, "rewards/margins": 0.14345432817935944, "rewards/rejected": 0.5215457081794739, "step": 3710 }, { "epoch": 1.1716074170308255, "grad_norm": 3.28125, "learning_rate": 4.306466694734292e-06, "logits/chosen": -0.4667026102542877, "logits/rejected": -0.33577385544776917, "logps/chosen": -194.20590209960938, "logps/rejected": -177.63400268554688, "loss": 0.6922, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6832542419433594, "rewards/margins": 0.034148164093494415, "rewards/rejected": 0.6491062045097351, "step": 3720 }, { "epoch": 1.174756899334672, "grad_norm": 2.59375, "learning_rate": 4.302818854529969e-06, "logits/chosen": -0.5350615382194519, "logits/rejected": -0.42588871717453003, "logps/chosen": -198.84042358398438, "logps/rejected": -164.7383270263672, "loss": 0.6171, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7313185334205627, "rewards/margins": 0.19767650961875916, "rewards/rejected": 0.5336421132087708, "step": 3730 }, { "epoch": 1.1779063816385182, "grad_norm": 4.53125, "learning_rate": 4.299162999775712e-06, "logits/chosen": -0.40838590264320374, "logits/rejected": -0.37127965688705444, "logps/chosen": -196.0567169189453, "logps/rejected": -196.9461669921875, "loss": 0.69, "rewards/accuracies": 0.5625, "rewards/chosen": 0.7238589525222778, "rewards/margins": 0.06422169506549835, "rewards/rejected": 0.6596371531486511, "step": 3740 }, { "epoch": 1.1810558639423645, "grad_norm": 2.5625, "learning_rate": 4.295499146723864e-06, "logits/chosen": -0.5344425439834595, "logits/rejected": -0.35419797897338867, "logps/chosen": -202.3170623779297, "logps/rejected": -161.15969848632812, "loss": 0.6066, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7712113261222839, "rewards/margins": 0.229976087808609, "rewards/rejected": 0.5412352681159973, "step": 3750 }, { "epoch": 1.1842053462462108, "grad_norm": 2.453125, "learning_rate": 4.2918273116623245e-06, "logits/chosen": -0.39341261982917786, "logits/rejected": -0.3262158930301666, "logps/chosen": -178.946533203125, "logps/rejected": -171.49209594726562, "loss": 0.6667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6648285388946533, "rewards/margins": 0.08790337294340134, "rewards/rejected": 0.5769251585006714, "step": 3760 }, { "epoch": 1.187354828550057, "grad_norm": 2.359375, "learning_rate": 4.288147510914477e-06, "logits/chosen": -0.46092867851257324, "logits/rejected": -0.44449153542518616, "logps/chosen": -199.6840362548828, "logps/rejected": -191.94036865234375, "loss": 0.6985, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.6821948885917664, "rewards/margins": 0.02965412102639675, "rewards/rejected": 0.6525408029556274, "step": 3770 }, { "epoch": 1.1905043108539033, "grad_norm": 2.453125, "learning_rate": 4.284459760839122e-06, "logits/chosen": -0.5228853225708008, "logits/rejected": -0.3966136574745178, "logps/chosen": -197.607177734375, "logps/rejected": -169.21734619140625, "loss": 0.6785, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.6453790664672852, "rewards/margins": 0.06496746838092804, "rewards/rejected": 0.5804116129875183, "step": 3780 }, { "epoch": 1.1936537931577498, "grad_norm": 2.015625, "learning_rate": 4.28076407783039e-06, "logits/chosen": -0.4347180724143982, "logits/rejected": -0.3091747760772705, "logps/chosen": -183.70968627929688, "logps/rejected": -151.46176147460938, "loss": 0.5928, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7321251034736633, "rewards/margins": 0.25292515754699707, "rewards/rejected": 0.4791998863220215, "step": 3790 }, { "epoch": 1.196803275461596, "grad_norm": 2.375, "learning_rate": 4.277060478317687e-06, "logits/chosen": -0.5005819797515869, "logits/rejected": -0.36348724365234375, "logps/chosen": -205.44021606445312, "logps/rejected": -170.33944702148438, "loss": 0.6353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6576865911483765, "rewards/margins": 0.1621154546737671, "rewards/rejected": 0.49557122588157654, "step": 3800 }, { "epoch": 1.1999527577654423, "grad_norm": 2.9375, "learning_rate": 4.2733489787656075e-06, "logits/chosen": -0.4653560221195221, "logits/rejected": -0.3435734808444977, "logps/chosen": -185.8004913330078, "logps/rejected": -159.63597106933594, "loss": 0.6312, "rewards/accuracies": 0.625, "rewards/chosen": 0.6951313018798828, "rewards/margins": 0.1663820743560791, "rewards/rejected": 0.5287492871284485, "step": 3810 }, { "epoch": 1.2031022400692886, "grad_norm": 3.03125, "learning_rate": 4.269629595673867e-06, "logits/chosen": -0.478889524936676, "logits/rejected": -0.4129001200199127, "logps/chosen": -221.73828125, "logps/rejected": -191.03372192382812, "loss": 0.6142, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8134725689888, "rewards/margins": 0.20523671805858612, "rewards/rejected": 0.6082358360290527, "step": 3820 }, { "epoch": 1.2062517223731348, "grad_norm": 2.515625, "learning_rate": 4.265902345577227e-06, "logits/chosen": -0.4593692421913147, "logits/rejected": -0.3384065628051758, "logps/chosen": -186.61167907714844, "logps/rejected": -161.26182556152344, "loss": 0.6454, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.660686731338501, "rewards/margins": 0.13643305003643036, "rewards/rejected": 0.5242536664009094, "step": 3830 }, { "epoch": 1.2094012046769813, "grad_norm": 2.46875, "learning_rate": 4.262167245045424e-06, "logits/chosen": -0.503960132598877, "logits/rejected": -0.3568868339061737, "logps/chosen": -189.79818725585938, "logps/rejected": -160.52206420898438, "loss": 0.6303, "rewards/accuracies": 0.625, "rewards/chosen": 0.7322565317153931, "rewards/margins": 0.18888680636882782, "rewards/rejected": 0.5433696508407593, "step": 3840 }, { "epoch": 1.2125506869808276, "grad_norm": 3.015625, "learning_rate": 4.258424310683094e-06, "logits/chosen": -0.5178459882736206, "logits/rejected": -0.4422483444213867, "logps/chosen": -202.4190216064453, "logps/rejected": -191.25418090820312, "loss": 0.6634, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6625019311904907, "rewards/margins": 0.09516273438930511, "rewards/rejected": 0.5673390626907349, "step": 3850 }, { "epoch": 1.2157001692846738, "grad_norm": 2.4375, "learning_rate": 4.254673559129698e-06, "logits/chosen": -0.4428611695766449, "logits/rejected": -0.3638356328010559, "logps/chosen": -177.161376953125, "logps/rejected": -158.397216796875, "loss": 0.6791, "rewards/accuracies": 0.5625, "rewards/chosen": 0.660545825958252, "rewards/margins": 0.06867508590221405, "rewards/rejected": 0.5918707251548767, "step": 3860 }, { "epoch": 1.21884965158852, "grad_norm": 2.484375, "learning_rate": 4.250915007059448e-06, "logits/chosen": -0.4426344037055969, "logits/rejected": -0.39190584421157837, "logps/chosen": -182.5616455078125, "logps/rejected": -166.43875122070312, "loss": 0.6467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6761131286621094, "rewards/margins": 0.147123783826828, "rewards/rejected": 0.5289894342422485, "step": 3870 }, { "epoch": 1.2219991338923664, "grad_norm": 2.546875, "learning_rate": 4.247148671181237e-06, "logits/chosen": -0.5092406868934631, "logits/rejected": -0.41020458936691284, "logps/chosen": -190.69708251953125, "logps/rejected": -165.26943969726562, "loss": 0.6436, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.7014779448509216, "rewards/margins": 0.13978223502635956, "rewards/rejected": 0.5616958141326904, "step": 3880 }, { "epoch": 1.2251486161962126, "grad_norm": 2.0625, "learning_rate": 4.243374568238556e-06, "logits/chosen": -0.49269500374794006, "logits/rejected": -0.34552276134490967, "logps/chosen": -197.5285186767578, "logps/rejected": -163.1229248046875, "loss": 0.6266, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.686238706111908, "rewards/margins": 0.17949660122394562, "rewards/rejected": 0.5067421197891235, "step": 3890 }, { "epoch": 1.228298098500059, "grad_norm": 2.453125, "learning_rate": 4.23959271500943e-06, "logits/chosen": -0.4887842535972595, "logits/rejected": -0.32892632484436035, "logps/chosen": -213.9662322998047, "logps/rejected": -175.28543090820312, "loss": 0.6495, "rewards/accuracies": 0.625, "rewards/chosen": 0.7332077026367188, "rewards/margins": 0.14412468671798706, "rewards/rejected": 0.5890830755233765, "step": 3900 }, { "epoch": 1.2314475808039054, "grad_norm": 3.515625, "learning_rate": 4.235803128306337e-06, "logits/chosen": -0.4606234133243561, "logits/rejected": -0.3164953291416168, "logps/chosen": -217.72933959960938, "logps/rejected": -188.64517211914062, "loss": 0.6548, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7374159693717957, "rewards/margins": 0.12077488750219345, "rewards/rejected": 0.6166411638259888, "step": 3910 }, { "epoch": 1.2345970631077516, "grad_norm": 2.78125, "learning_rate": 4.232005824976133e-06, "logits/chosen": -0.4276387095451355, "logits/rejected": -0.34980887174606323, "logps/chosen": -212.1448516845703, "logps/rejected": -174.04776000976562, "loss": 0.6255, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7644809484481812, "rewards/margins": 0.19336572289466858, "rewards/rejected": 0.5711151361465454, "step": 3920 }, { "epoch": 1.237746545411598, "grad_norm": 2.265625, "learning_rate": 4.22820082189998e-06, "logits/chosen": -0.4039751887321472, "logits/rejected": -0.44209712743759155, "logps/chosen": -198.69827270507812, "logps/rejected": -201.2398681640625, "loss": 0.6825, "rewards/accuracies": 0.625, "rewards/chosen": 0.6999807357788086, "rewards/margins": 0.07114030420780182, "rewards/rejected": 0.6288403868675232, "step": 3930 }, { "epoch": 1.2408960277154444, "grad_norm": 3.25, "learning_rate": 4.224388135993271e-06, "logits/chosen": -0.4334556460380554, "logits/rejected": -0.30388978123664856, "logps/chosen": -206.3880157470703, "logps/rejected": -177.33868408203125, "loss": 0.6462, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.745646595954895, "rewards/margins": 0.1241118311882019, "rewards/rejected": 0.6215347051620483, "step": 3940 }, { "epoch": 1.2440455100192906, "grad_norm": 3.5, "learning_rate": 4.220567784205551e-06, "logits/chosen": -0.38715043663978577, "logits/rejected": -0.2959139049053192, "logps/chosen": -204.772216796875, "logps/rejected": -177.6510772705078, "loss": 0.649, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7557042837142944, "rewards/margins": 0.1426253467798233, "rewards/rejected": 0.6130789518356323, "step": 3950 }, { "epoch": 1.247194992323137, "grad_norm": 2.765625, "learning_rate": 4.216739783520447e-06, "logits/chosen": -0.44497212767601013, "logits/rejected": -0.3760547935962677, "logps/chosen": -193.57406616210938, "logps/rejected": -169.6040802001953, "loss": 0.6622, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6512136459350586, "rewards/margins": 0.10468528419733047, "rewards/rejected": 0.5465283989906311, "step": 3960 }, { "epoch": 1.2503444746269832, "grad_norm": 2.734375, "learning_rate": 4.212904150955587e-06, "logits/chosen": -0.512496292591095, "logits/rejected": -0.40937352180480957, "logps/chosen": -215.65908813476562, "logps/rejected": -184.07162475585938, "loss": 0.6385, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.8019892573356628, "rewards/margins": 0.16033987700939178, "rewards/rejected": 0.6416494250297546, "step": 3970 }, { "epoch": 1.2534939569308294, "grad_norm": 3.03125, "learning_rate": 4.209060903562528e-06, "logits/chosen": -0.4832407832145691, "logits/rejected": -0.37063470482826233, "logps/chosen": -211.16928100585938, "logps/rejected": -176.49264526367188, "loss": 0.6414, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7144185900688171, "rewards/margins": 0.13941380381584167, "rewards/rejected": 0.5750047564506531, "step": 3980 }, { "epoch": 1.2566434392346757, "grad_norm": 2.703125, "learning_rate": 4.20521005842668e-06, "logits/chosen": -0.4679936468601227, "logits/rejected": -0.320305734872818, "logps/chosen": -198.4337615966797, "logps/rejected": -156.4415283203125, "loss": 0.615, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.717679500579834, "rewards/margins": 0.20881009101867676, "rewards/rejected": 0.5088694095611572, "step": 3990 }, { "epoch": 1.2597929215385222, "grad_norm": 2.984375, "learning_rate": 4.201351632667227e-06, "logits/chosen": -0.4357661306858063, "logits/rejected": -0.33136463165283203, "logps/chosen": -215.5205535888672, "logps/rejected": -194.84225463867188, "loss": 0.6584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7862467169761658, "rewards/margins": 0.1381584107875824, "rewards/rejected": 0.6480883359909058, "step": 4000 }, { "epoch": 1.2597929215385222, "eval_logits/chosen": -0.6012239456176758, "eval_logits/rejected": -0.4762186110019684, "eval_logps/chosen": -243.07638549804688, "eval_logps/rejected": -222.3102569580078, "eval_loss": 0.6651261448860168, "eval_rewards/accuracies": 0.5946148037910461, "eval_rewards/chosen": 0.8024876117706299, "eval_rewards/margins": 0.10073534399271011, "eval_rewards/rejected": 0.701752245426178, "eval_runtime": 3657.1235, "eval_samples_per_second": 0.366, "eval_steps_per_second": 0.366, "step": 4000 }, { "epoch": 1.2629424038423684, "grad_norm": 2.78125, "learning_rate": 4.197485643437058e-06, "logits/chosen": -0.45790332555770874, "logits/rejected": -0.3080021142959595, "logps/chosen": -194.04660034179688, "logps/rejected": -152.27664184570312, "loss": 0.6051, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7683263421058655, "rewards/margins": 0.22982630133628845, "rewards/rejected": 0.5384999513626099, "step": 4010 }, { "epoch": 1.2660918861462147, "grad_norm": 3.765625, "learning_rate": 4.19361210792268e-06, "logits/chosen": -0.45886415243148804, "logits/rejected": -0.36696118116378784, "logps/chosen": -197.91018676757812, "logps/rejected": -165.89205932617188, "loss": 0.6525, "rewards/accuracies": 0.625, "rewards/chosen": 0.649300217628479, "rewards/margins": 0.12859824299812317, "rewards/rejected": 0.5207020044326782, "step": 4020 }, { "epoch": 1.269241368450061, "grad_norm": 2.0625, "learning_rate": 4.189731043344151e-06, "logits/chosen": -0.5488325953483582, "logits/rejected": -0.3945949673652649, "logps/chosen": -198.27198791503906, "logps/rejected": -160.57528686523438, "loss": 0.6517, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.6978427767753601, "rewards/margins": 0.1346282958984375, "rewards/rejected": 0.5632144808769226, "step": 4030 }, { "epoch": 1.2723908507539075, "grad_norm": 2.59375, "learning_rate": 4.185842466954998e-06, "logits/chosen": -0.5013604760169983, "logits/rejected": -0.40894627571105957, "logps/chosen": -192.42051696777344, "logps/rejected": -167.13221740722656, "loss": 0.6322, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6890965700149536, "rewards/margins": 0.17353561520576477, "rewards/rejected": 0.5155609846115112, "step": 4040 }, { "epoch": 1.2755403330577537, "grad_norm": 3.265625, "learning_rate": 4.181946396042146e-06, "logits/chosen": -0.46542900800704956, "logits/rejected": -0.3752228319644928, "logps/chosen": -204.59176635742188, "logps/rejected": -184.3190155029297, "loss": 0.646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.712192952632904, "rewards/margins": 0.12876209616661072, "rewards/rejected": 0.5834308862686157, "step": 4050 }, { "epoch": 1.2786898153616, "grad_norm": 3.28125, "learning_rate": 4.178042847925833e-06, "logits/chosen": -0.4636574387550354, "logits/rejected": -0.4096450209617615, "logps/chosen": -207.9044952392578, "logps/rejected": -191.27566528320312, "loss": 0.6599, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7198086380958557, "rewards/margins": 0.10640084743499756, "rewards/rejected": 0.6134077906608582, "step": 4060 }, { "epoch": 1.2818392976654462, "grad_norm": 2.609375, "learning_rate": 4.174131839959539e-06, "logits/chosen": -0.5049449801445007, "logits/rejected": -0.37073415517807007, "logps/chosen": -201.08135986328125, "logps/rejected": -178.99465942382812, "loss": 0.6637, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7069438099861145, "rewards/margins": 0.11059415340423584, "rewards/rejected": 0.5963497161865234, "step": 4070 }, { "epoch": 1.2849887799692925, "grad_norm": 2.421875, "learning_rate": 4.170213389529908e-06, "logits/chosen": -0.4413929879665375, "logits/rejected": -0.37840536236763, "logps/chosen": -186.3639678955078, "logps/rejected": -180.01528930664062, "loss": 0.657, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.664527416229248, "rewards/margins": 0.11193283647298813, "rewards/rejected": 0.5525946617126465, "step": 4080 }, { "epoch": 1.2881382622731388, "grad_norm": 2.359375, "learning_rate": 4.16628751405667e-06, "logits/chosen": -0.4986829161643982, "logits/rejected": -0.34226298332214355, "logps/chosen": -184.44613647460938, "logps/rejected": -149.0967254638672, "loss": 0.6311, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6766608953475952, "rewards/margins": 0.16855625808238983, "rewards/rejected": 0.508104681968689, "step": 4090 }, { "epoch": 1.291287744576985, "grad_norm": 2.375, "learning_rate": 4.162354230992562e-06, "logits/chosen": -0.5013774633407593, "logits/rejected": -0.4048996865749359, "logps/chosen": -198.087646484375, "logps/rejected": -170.59884643554688, "loss": 0.6539, "rewards/accuracies": 0.5625, "rewards/chosen": 0.7187263369560242, "rewards/margins": 0.12955203652381897, "rewards/rejected": 0.5891742706298828, "step": 4100 }, { "epoch": 1.2944372268808315, "grad_norm": 2.625, "learning_rate": 4.158413557823253e-06, "logits/chosen": -0.5318306684494019, "logits/rejected": -0.3859057128429413, "logps/chosen": -198.9587860107422, "logps/rejected": -168.4477081298828, "loss": 0.638, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7599405646324158, "rewards/margins": 0.16434063017368317, "rewards/rejected": 0.5955999493598938, "step": 4110 }, { "epoch": 1.2975867091846778, "grad_norm": 2.875, "learning_rate": 4.154465512067266e-06, "logits/chosen": -0.4665352702140808, "logits/rejected": -0.3361702561378479, "logps/chosen": -219.12088012695312, "logps/rejected": -174.34353637695312, "loss": 0.6203, "rewards/accuracies": 0.75, "rewards/chosen": 0.7928240895271301, "rewards/margins": 0.19742821156978607, "rewards/rejected": 0.5953959226608276, "step": 4120 }, { "epoch": 1.300736191488524, "grad_norm": 2.59375, "learning_rate": 4.1505101112758975e-06, "logits/chosen": -0.5339746475219727, "logits/rejected": -0.45769819617271423, "logps/chosen": -180.9729766845703, "logps/rejected": -161.6527557373047, "loss": 0.666, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6436579823493958, "rewards/margins": 0.08750168979167938, "rewards/rejected": 0.5561562180519104, "step": 4130 }, { "epoch": 1.3038856737923703, "grad_norm": 2.625, "learning_rate": 4.146547373033142e-06, "logits/chosen": -0.44753965735435486, "logits/rejected": -0.33734267950057983, "logps/chosen": -184.26077270507812, "logps/rejected": -155.27151489257812, "loss": 0.6196, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7223949432373047, "rewards/margins": 0.19382601976394653, "rewards/rejected": 0.5285689234733582, "step": 4140 }, { "epoch": 1.3070351560962168, "grad_norm": 2.265625, "learning_rate": 4.142577314955614e-06, "logits/chosen": -0.4315119683742523, "logits/rejected": -0.27412423491477966, "logps/chosen": -203.50393676757812, "logps/rejected": -160.0498504638672, "loss": 0.6151, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7396818995475769, "rewards/margins": 0.2151108682155609, "rewards/rejected": 0.5245710015296936, "step": 4150 }, { "epoch": 1.310184638400063, "grad_norm": 2.921875, "learning_rate": 4.138599954692467e-06, "logits/chosen": -0.4760667681694031, "logits/rejected": -0.4465053677558899, "logps/chosen": -204.943359375, "logps/rejected": -194.9350128173828, "loss": 0.6528, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7579629421234131, "rewards/margins": 0.136540025472641, "rewards/rejected": 0.6214228868484497, "step": 4160 }, { "epoch": 1.3133341207039093, "grad_norm": 2.78125, "learning_rate": 4.13461530992532e-06, "logits/chosen": -0.5714303255081177, "logits/rejected": -0.39394524693489075, "logps/chosen": -186.2847137451172, "logps/rejected": -156.30838012695312, "loss": 0.6444, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6628353595733643, "rewards/margins": 0.14596715569496155, "rewards/rejected": 0.5168682336807251, "step": 4170 }, { "epoch": 1.3164836030077556, "grad_norm": 2.484375, "learning_rate": 4.130623398368171e-06, "logits/chosen": -0.4178102910518646, "logits/rejected": -0.3646093010902405, "logps/chosen": -174.4610595703125, "logps/rejected": -152.55731201171875, "loss": 0.6464, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6118752360343933, "rewards/margins": 0.13116849958896637, "rewards/rejected": 0.48070669174194336, "step": 4180 }, { "epoch": 1.3196330853116018, "grad_norm": 2.21875, "learning_rate": 4.126624237767328e-06, "logits/chosen": -0.4415621757507324, "logits/rejected": -0.3622170686721802, "logps/chosen": -186.65716552734375, "logps/rejected": -164.0797576904297, "loss": 0.6271, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6536270380020142, "rewards/margins": 0.1805739849805832, "rewards/rejected": 0.47305306792259216, "step": 4190 }, { "epoch": 1.322782567615448, "grad_norm": 3.09375, "learning_rate": 4.122617845901322e-06, "logits/chosen": -0.5048421025276184, "logits/rejected": -0.41297560930252075, "logps/chosen": -190.35025024414062, "logps/rejected": -161.85830688476562, "loss": 0.6162, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.731613278388977, "rewards/margins": 0.20245853066444397, "rewards/rejected": 0.5291547179222107, "step": 4200 }, { "epoch": 1.3259320499192946, "grad_norm": 2.6875, "learning_rate": 4.118604240580832e-06, "logits/chosen": -0.40836066007614136, "logits/rejected": -0.35586100816726685, "logps/chosen": -199.27098083496094, "logps/rejected": -172.96218872070312, "loss": 0.6595, "rewards/accuracies": 0.625, "rewards/chosen": 0.7109326720237732, "rewards/margins": 0.11674892902374268, "rewards/rejected": 0.5941838026046753, "step": 4210 }, { "epoch": 1.3290815322231408, "grad_norm": 2.46875, "learning_rate": 4.114583439648604e-06, "logits/chosen": -0.47792333364486694, "logits/rejected": -0.3548354506492615, "logps/chosen": -212.23178100585938, "logps/rejected": -169.51315307617188, "loss": 0.6391, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6976093053817749, "rewards/margins": 0.15473678708076477, "rewards/rejected": 0.5428725481033325, "step": 4220 }, { "epoch": 1.3322310145269871, "grad_norm": 2.546875, "learning_rate": 4.110555460979374e-06, "logits/chosen": -0.5251237154006958, "logits/rejected": -0.4099500775337219, "logps/chosen": -200.9913787841797, "logps/rejected": -171.32870483398438, "loss": 0.6298, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7174925804138184, "rewards/margins": 0.1686524748802185, "rewards/rejected": 0.5488401055335999, "step": 4230 }, { "epoch": 1.3353804968308334, "grad_norm": 2.90625, "learning_rate": 4.106520322479786e-06, "logits/chosen": -0.4651850163936615, "logits/rejected": -0.34185513854026794, "logps/chosen": -196.94589233398438, "logps/rejected": -159.74307250976562, "loss": 0.6523, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6821771860122681, "rewards/margins": 0.11817701160907745, "rewards/rejected": 0.564000129699707, "step": 4240 }, { "epoch": 1.3385299791346799, "grad_norm": 2.671875, "learning_rate": 4.102478042088315e-06, "logits/chosen": -0.49223145842552185, "logits/rejected": -0.35867035388946533, "logps/chosen": -202.394775390625, "logps/rejected": -171.31448364257812, "loss": 0.6207, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.725380539894104, "rewards/margins": 0.2017165720462799, "rewards/rejected": 0.5236639380455017, "step": 4250 }, { "epoch": 1.3416794614385261, "grad_norm": 2.765625, "learning_rate": 4.098428637775183e-06, "logits/chosen": -0.5037646293640137, "logits/rejected": -0.46415749192237854, "logps/chosen": -181.4263916015625, "logps/rejected": -168.53955078125, "loss": 0.674, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.6651412844657898, "rewards/margins": 0.08603055775165558, "rewards/rejected": 0.5791107416152954, "step": 4260 }, { "epoch": 1.3448289437423724, "grad_norm": 2.3125, "learning_rate": 4.094372127542285e-06, "logits/chosen": -0.44330787658691406, "logits/rejected": -0.3730103075504303, "logps/chosen": -190.1846923828125, "logps/rejected": -176.09255981445312, "loss": 0.6639, "rewards/accuracies": 0.625, "rewards/chosen": 0.6509902477264404, "rewards/margins": 0.11924519389867783, "rewards/rejected": 0.5317450761795044, "step": 4270 }, { "epoch": 1.3479784260462186, "grad_norm": 2.78125, "learning_rate": 4.0903085294231035e-06, "logits/chosen": -0.4798402190208435, "logits/rejected": -0.27211225032806396, "logps/chosen": -221.6403045654297, "logps/rejected": -164.9480743408203, "loss": 0.5944, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7979739308357239, "rewards/margins": 0.26428526639938354, "rewards/rejected": 0.5336886644363403, "step": 4280 }, { "epoch": 1.351127908350065, "grad_norm": 2.84375, "learning_rate": 4.086237861482632e-06, "logits/chosen": -0.45586076378822327, "logits/rejected": -0.37053224444389343, "logps/chosen": -212.56216430664062, "logps/rejected": -183.00344848632812, "loss": 0.6408, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7277430295944214, "rewards/margins": 0.1431419998407364, "rewards/rejected": 0.5846010446548462, "step": 4290 }, { "epoch": 1.3542773906539112, "grad_norm": 2.421875, "learning_rate": 4.0821601418172926e-06, "logits/chosen": -0.4372914731502533, "logits/rejected": -0.362470418214798, "logps/chosen": -194.07884216308594, "logps/rejected": -176.6485137939453, "loss": 0.6393, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6592785716056824, "rewards/margins": 0.14654000103473663, "rewards/rejected": 0.5127385854721069, "step": 4300 }, { "epoch": 1.3574268729577574, "grad_norm": 3.078125, "learning_rate": 4.078075388554857e-06, "logits/chosen": -0.4022819995880127, "logits/rejected": -0.3265857398509979, "logps/chosen": -197.20260620117188, "logps/rejected": -163.478515625, "loss": 0.6253, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.741704523563385, "rewards/margins": 0.17545071244239807, "rewards/rejected": 0.5662537217140198, "step": 4310 }, { "epoch": 1.360576355261604, "grad_norm": 3.046875, "learning_rate": 4.0739836198543634e-06, "logits/chosen": -0.5151673555374146, "logits/rejected": -0.3456025719642639, "logps/chosen": -217.0493927001953, "logps/rejected": -183.4969024658203, "loss": 0.619, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.7682887315750122, "rewards/margins": 0.19945594668388367, "rewards/rejected": 0.5688328146934509, "step": 4320 }, { "epoch": 1.3637258375654502, "grad_norm": 3.140625, "learning_rate": 4.069884853906041e-06, "logits/chosen": -0.5158268809318542, "logits/rejected": -0.333678662776947, "logps/chosen": -213.3292236328125, "logps/rejected": -159.3480987548828, "loss": 0.6005, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.801051914691925, "rewards/margins": 0.23268017172813416, "rewards/rejected": 0.5683717131614685, "step": 4330 }, { "epoch": 1.3668753198692964, "grad_norm": 4.34375, "learning_rate": 4.065779108931222e-06, "logits/chosen": -0.4823933243751526, "logits/rejected": -0.3266783654689789, "logps/chosen": -199.21498107910156, "logps/rejected": -166.51553344726562, "loss": 0.6243, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7152196764945984, "rewards/margins": 0.1862681806087494, "rewards/rejected": 0.5289515256881714, "step": 4340 }, { "epoch": 1.3700248021731427, "grad_norm": 2.640625, "learning_rate": 4.0616664031822686e-06, "logits/chosen": -0.43618011474609375, "logits/rejected": -0.28981930017471313, "logps/chosen": -192.85598754882812, "logps/rejected": -157.85415649414062, "loss": 0.6548, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6323038935661316, "rewards/margins": 0.11133924871683121, "rewards/rejected": 0.5209646821022034, "step": 4350 }, { "epoch": 1.3731742844769892, "grad_norm": 2.5, "learning_rate": 4.057546754942482e-06, "logits/chosen": -0.5119596719741821, "logits/rejected": -0.3471509516239166, "logps/chosen": -196.14657592773438, "logps/rejected": -154.673583984375, "loss": 0.6247, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6863905191421509, "rewards/margins": 0.18415386974811554, "rewards/rejected": 0.5022366642951965, "step": 4360 }, { "epoch": 1.3763237667808355, "grad_norm": 2.640625, "learning_rate": 4.053420182526031e-06, "logits/chosen": -0.4351939260959625, "logits/rejected": -0.35209694504737854, "logps/chosen": -194.9961700439453, "logps/rejected": -161.25656127929688, "loss": 0.6406, "rewards/accuracies": 0.625, "rewards/chosen": 0.725360095500946, "rewards/margins": 0.15019366145133972, "rewards/rejected": 0.5751665234565735, "step": 4370 }, { "epoch": 1.3794732490846817, "grad_norm": 2.78125, "learning_rate": 4.049286704277865e-06, "logits/chosen": -0.4273145794868469, "logits/rejected": -0.32500097155570984, "logps/chosen": -198.8387908935547, "logps/rejected": -177.99790954589844, "loss": 0.6608, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6882764101028442, "rewards/margins": 0.1112000122666359, "rewards/rejected": 0.5770763158798218, "step": 4380 }, { "epoch": 1.382622731388528, "grad_norm": 2.78125, "learning_rate": 4.045146338573634e-06, "logits/chosen": -0.4316592216491699, "logits/rejected": -0.33414119482040405, "logps/chosen": -183.42718505859375, "logps/rejected": -173.8665313720703, "loss": 0.6702, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6961274743080139, "rewards/margins": 0.10534685850143433, "rewards/rejected": 0.5907806158065796, "step": 4390 }, { "epoch": 1.3857722136923742, "grad_norm": 3.125, "learning_rate": 4.040999103819606e-06, "logits/chosen": -0.5050019025802612, "logits/rejected": -0.3844815194606781, "logps/chosen": -205.3455352783203, "logps/rejected": -164.32203674316406, "loss": 0.6283, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7454142570495605, "rewards/margins": 0.17193658649921417, "rewards/rejected": 0.5734776854515076, "step": 4400 }, { "epoch": 1.3889216959962205, "grad_norm": 2.265625, "learning_rate": 4.036845018452586e-06, "logits/chosen": -0.4365948736667633, "logits/rejected": -0.3728296458721161, "logps/chosen": -188.38137817382812, "logps/rejected": -169.47296142578125, "loss": 0.6519, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.65342777967453, "rewards/margins": 0.1305466592311859, "rewards/rejected": 0.5228811502456665, "step": 4410 }, { "epoch": 1.392071178300067, "grad_norm": 2.859375, "learning_rate": 4.0326841009398354e-06, "logits/chosen": -0.46066349744796753, "logits/rejected": -0.38210171461105347, "logps/chosen": -189.90225219726562, "logps/rejected": -175.32156372070312, "loss": 0.6905, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6591070890426636, "rewards/margins": 0.048674892634153366, "rewards/rejected": 0.6104320883750916, "step": 4420 }, { "epoch": 1.3952206606039133, "grad_norm": 2.59375, "learning_rate": 4.028516369778987e-06, "logits/chosen": -0.43766292929649353, "logits/rejected": -0.29349422454833984, "logps/chosen": -188.58241271972656, "logps/rejected": -164.57627868652344, "loss": 0.655, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6774150729179382, "rewards/margins": 0.10568390041589737, "rewards/rejected": 0.5717312693595886, "step": 4430 }, { "epoch": 1.3983701429077595, "grad_norm": 2.703125, "learning_rate": 4.0243418434979605e-06, "logits/chosen": -0.43426451086997986, "logits/rejected": -0.3612423539161682, "logps/chosen": -201.9038543701172, "logps/rejected": -177.84640502929688, "loss": 0.6486, "rewards/accuracies": 0.625, "rewards/chosen": 0.7348337173461914, "rewards/margins": 0.1346462517976761, "rewards/rejected": 0.6001874208450317, "step": 4440 }, { "epoch": 1.4015196252116058, "grad_norm": 2.40625, "learning_rate": 4.020160540654892e-06, "logits/chosen": -0.42761626839637756, "logits/rejected": -0.31236857175827026, "logps/chosen": -192.8739471435547, "logps/rejected": -171.3946990966797, "loss": 0.6802, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.6831879615783691, "rewards/margins": 0.06777564436197281, "rewards/rejected": 0.6154123544692993, "step": 4450 }, { "epoch": 1.4046691075154523, "grad_norm": 3.046875, "learning_rate": 4.015972479838035e-06, "logits/chosen": -0.4452191889286041, "logits/rejected": -0.3907052278518677, "logps/chosen": -210.70761108398438, "logps/rejected": -191.74087524414062, "loss": 0.6637, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7690273523330688, "rewards/margins": 0.09637213498353958, "rewards/rejected": 0.6726552248001099, "step": 4460 }, { "epoch": 1.4078185898192985, "grad_norm": 2.515625, "learning_rate": 4.011777679665693e-06, "logits/chosen": -0.5560199022293091, "logits/rejected": -0.4294883608818054, "logps/chosen": -188.9731903076172, "logps/rejected": -155.7578125, "loss": 0.6421, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6779214143753052, "rewards/margins": 0.14832261204719543, "rewards/rejected": 0.5295988321304321, "step": 4470 }, { "epoch": 1.4109680721231448, "grad_norm": 2.75, "learning_rate": 4.007576158786123e-06, "logits/chosen": -0.5267634987831116, "logits/rejected": -0.4555323123931885, "logps/chosen": -215.6820526123047, "logps/rejected": -179.27700805664062, "loss": 0.6235, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7598739266395569, "rewards/margins": 0.1784120500087738, "rewards/rejected": 0.5814618468284607, "step": 4480 }, { "epoch": 1.414117554426991, "grad_norm": 2.765625, "learning_rate": 4.003367935877466e-06, "logits/chosen": -0.49042144417762756, "logits/rejected": -0.40049901604652405, "logps/chosen": -205.4383544921875, "logps/rejected": -174.7517547607422, "loss": 0.6269, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.725433349609375, "rewards/margins": 0.18734149634838104, "rewards/rejected": 0.538091778755188, "step": 4490 }, { "epoch": 1.4172670367308373, "grad_norm": 2.484375, "learning_rate": 3.999153029647651e-06, "logits/chosen": -0.43651509284973145, "logits/rejected": -0.32142752408981323, "logps/chosen": -216.54519653320312, "logps/rejected": -169.64065551757812, "loss": 0.6045, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8090263605117798, "rewards/margins": 0.24071991443634033, "rewards/rejected": 0.5683062076568604, "step": 4500 }, { "epoch": 1.4204165190346836, "grad_norm": 3.109375, "learning_rate": 3.994931458834323e-06, "logits/chosen": -0.4635355472564697, "logits/rejected": -0.374891459941864, "logps/chosen": -176.76364135742188, "logps/rejected": -150.61795043945312, "loss": 0.6502, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6052500605583191, "rewards/margins": 0.12052911520004272, "rewards/rejected": 0.48472094535827637, "step": 4510 }, { "epoch": 1.4235660013385298, "grad_norm": 3.484375, "learning_rate": 3.990703242204754e-06, "logits/chosen": -0.4779212474822998, "logits/rejected": -0.3155384659767151, "logps/chosen": -203.7576904296875, "logps/rejected": -177.79849243164062, "loss": 0.6564, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.805811882019043, "rewards/margins": 0.1246214359998703, "rewards/rejected": 0.6811904907226562, "step": 4520 }, { "epoch": 1.4267154836423763, "grad_norm": 2.53125, "learning_rate": 3.986468398555758e-06, "logits/chosen": -0.4679221212863922, "logits/rejected": -0.3618343472480774, "logps/chosen": -198.53536987304688, "logps/rejected": -171.15133666992188, "loss": 0.6479, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7284259796142578, "rewards/margins": 0.14506401121616364, "rewards/rejected": 0.5833619832992554, "step": 4530 }, { "epoch": 1.4298649659462226, "grad_norm": 2.78125, "learning_rate": 3.98222694671361e-06, "logits/chosen": -0.45645904541015625, "logits/rejected": -0.36011576652526855, "logps/chosen": -218.33425903320312, "logps/rejected": -187.9898681640625, "loss": 0.6464, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.8267934918403625, "rewards/margins": 0.15051567554473877, "rewards/rejected": 0.6762778162956238, "step": 4540 }, { "epoch": 1.4330144482500689, "grad_norm": 3.453125, "learning_rate": 3.977978905533966e-06, "logits/chosen": -0.49182215332984924, "logits/rejected": -0.40443873405456543, "logps/chosen": -203.3308868408203, "logps/rejected": -182.2737274169922, "loss": 0.6645, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6901066303253174, "rewards/margins": 0.09435725957155228, "rewards/rejected": 0.5957493185997009, "step": 4550 }, { "epoch": 1.4361639305539153, "grad_norm": 2.125, "learning_rate": 3.973724293901772e-06, "logits/chosen": -0.4056168496608734, "logits/rejected": -0.38391467928886414, "logps/chosen": -195.6358184814453, "logps/rejected": -183.57254028320312, "loss": 0.6455, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.682805061340332, "rewards/margins": 0.12637227773666382, "rewards/rejected": 0.5564327836036682, "step": 4560 }, { "epoch": 1.4393134128577616, "grad_norm": 2.703125, "learning_rate": 3.969463130731183e-06, "logits/chosen": -0.4418698847293854, "logits/rejected": -0.39611127972602844, "logps/chosen": -185.42196655273438, "logps/rejected": -161.41970825195312, "loss": 0.6608, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6986794471740723, "rewards/margins": 0.10915372520685196, "rewards/rejected": 0.5895256996154785, "step": 4570 }, { "epoch": 1.4424628951616079, "grad_norm": 2.28125, "learning_rate": 3.965195434965482e-06, "logits/chosen": -0.49892979860305786, "logits/rejected": -0.3777271807193756, "logps/chosen": -214.09317016601562, "logps/rejected": -188.13845825195312, "loss": 0.6567, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7749571800231934, "rewards/margins": 0.1210152879357338, "rewards/rejected": 0.6539419889450073, "step": 4580 }, { "epoch": 1.4456123774654541, "grad_norm": 3.453125, "learning_rate": 3.960921225576991e-06, "logits/chosen": -0.5049037933349609, "logits/rejected": -0.34426528215408325, "logps/chosen": -215.4214630126953, "logps/rejected": -190.46994018554688, "loss": 0.6566, "rewards/accuracies": 0.625, "rewards/chosen": 0.7792149782180786, "rewards/margins": 0.11181477457284927, "rewards/rejected": 0.6674002408981323, "step": 4590 }, { "epoch": 1.4487618597693004, "grad_norm": 2.578125, "learning_rate": 3.956640521566989e-06, "logits/chosen": -0.48397397994995117, "logits/rejected": -0.4080958962440491, "logps/chosen": -187.16818237304688, "logps/rejected": -166.5101318359375, "loss": 0.6367, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7400224804878235, "rewards/margins": 0.15180279314517975, "rewards/rejected": 0.5882197022438049, "step": 4600 }, { "epoch": 1.4519113420731466, "grad_norm": 2.75, "learning_rate": 3.952353341965628e-06, "logits/chosen": -0.44713473320007324, "logits/rejected": -0.3727934956550598, "logps/chosen": -202.4422607421875, "logps/rejected": -174.40975952148438, "loss": 0.6574, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6708841323852539, "rewards/margins": 0.10317204892635345, "rewards/rejected": 0.5677120089530945, "step": 4610 }, { "epoch": 1.455060824376993, "grad_norm": 2.765625, "learning_rate": 3.948059705831847e-06, "logits/chosen": -0.4646250605583191, "logits/rejected": -0.34977811574935913, "logps/chosen": -175.93081665039062, "logps/rejected": -161.12667846679688, "loss": 0.6457, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6609789729118347, "rewards/margins": 0.13796725869178772, "rewards/rejected": 0.5230117440223694, "step": 4620 }, { "epoch": 1.4582103066808394, "grad_norm": 2.90625, "learning_rate": 3.943759632253289e-06, "logits/chosen": -0.44996100664138794, "logits/rejected": -0.3352479934692383, "logps/chosen": -191.62132263183594, "logps/rejected": -161.776611328125, "loss": 0.6196, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7225370407104492, "rewards/margins": 0.20392043888568878, "rewards/rejected": 0.5186166167259216, "step": 4630 }, { "epoch": 1.4613597889846857, "grad_norm": 2.421875, "learning_rate": 3.939453140346212e-06, "logits/chosen": -0.4620143473148346, "logits/rejected": -0.2916569113731384, "logps/chosen": -198.8416748046875, "logps/rejected": -158.47796630859375, "loss": 0.6241, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6989172697067261, "rewards/margins": 0.20297345519065857, "rewards/rejected": 0.4959437847137451, "step": 4640 }, { "epoch": 1.464509271288532, "grad_norm": 2.4375, "learning_rate": 3.935140249255412e-06, "logits/chosen": -0.4222196638584137, "logits/rejected": -0.3922198414802551, "logps/chosen": -191.1973114013672, "logps/rejected": -182.5751495361328, "loss": 0.7209, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.6637114882469177, "rewards/margins": 0.014695653691887856, "rewards/rejected": 0.6490157842636108, "step": 4650 }, { "epoch": 1.4676587535923782, "grad_norm": 4.71875, "learning_rate": 3.930820978154129e-06, "logits/chosen": -0.4839719831943512, "logits/rejected": -0.36961930990219116, "logps/chosen": -199.48463439941406, "logps/rejected": -167.02774047851562, "loss": 0.6264, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7042496204376221, "rewards/margins": 0.18996970355510712, "rewards/rejected": 0.5142799615859985, "step": 4660 }, { "epoch": 1.4708082358962247, "grad_norm": 2.78125, "learning_rate": 3.926495346243967e-06, "logits/chosen": -0.5004483461380005, "logits/rejected": -0.40153947472572327, "logps/chosen": -210.6992645263672, "logps/rejected": -190.9122314453125, "loss": 0.6394, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7591875791549683, "rewards/margins": 0.1497870534658432, "rewards/rejected": 0.6094005107879639, "step": 4670 }, { "epoch": 1.473957718200071, "grad_norm": 3.078125, "learning_rate": 3.922163372754807e-06, "logits/chosen": -0.41180485486984253, "logits/rejected": -0.31235820055007935, "logps/chosen": -203.12744140625, "logps/rejected": -167.7096710205078, "loss": 0.6562, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6816965937614441, "rewards/margins": 0.12079987674951553, "rewards/rejected": 0.5608968138694763, "step": 4680 }, { "epoch": 1.4771072005039172, "grad_norm": 1.78125, "learning_rate": 3.9178250769447245e-06, "logits/chosen": -0.4602780342102051, "logits/rejected": -0.352649986743927, "logps/chosen": -178.3946075439453, "logps/rejected": -165.46885681152344, "loss": 0.6532, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6400049924850464, "rewards/margins": 0.12722846865653992, "rewards/rejected": 0.5127764940261841, "step": 4690 }, { "epoch": 1.4802566828077635, "grad_norm": 2.359375, "learning_rate": 3.913480478099898e-06, "logits/chosen": -0.4489854872226715, "logits/rejected": -0.3600943684577942, "logps/chosen": -215.1140594482422, "logps/rejected": -183.04026794433594, "loss": 0.6386, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7823598980903625, "rewards/margins": 0.1600048840045929, "rewards/rejected": 0.6223549842834473, "step": 4700 }, { "epoch": 1.4834061651116097, "grad_norm": 2.734375, "learning_rate": 3.909129595534527e-06, "logits/chosen": -0.4606572091579437, "logits/rejected": -0.3265830874443054, "logps/chosen": -197.79678344726562, "logps/rejected": -165.95912170410156, "loss": 0.6541, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.6801806688308716, "rewards/margins": 0.11523494869470596, "rewards/rejected": 0.5649456977844238, "step": 4710 }, { "epoch": 1.486555647415456, "grad_norm": 3.625, "learning_rate": 3.904772448590747e-06, "logits/chosen": -0.46349745988845825, "logits/rejected": -0.4300110936164856, "logps/chosen": -190.7307891845703, "logps/rejected": -179.4709014892578, "loss": 0.6772, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6604156494140625, "rewards/margins": 0.06815408170223236, "rewards/rejected": 0.5922616124153137, "step": 4720 }, { "epoch": 1.4897051297193025, "grad_norm": 3.015625, "learning_rate": 3.900409056638542e-06, "logits/chosen": -0.4786251485347748, "logits/rejected": -0.3198348581790924, "logps/chosen": -210.8201904296875, "logps/rejected": -178.6912078857422, "loss": 0.6503, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7678909301757812, "rewards/margins": 0.12642471492290497, "rewards/rejected": 0.6414662003517151, "step": 4730 }, { "epoch": 1.4928546120231487, "grad_norm": 3.015625, "learning_rate": 3.896039439075659e-06, "logits/chosen": -0.5011542439460754, "logits/rejected": -0.36163219809532166, "logps/chosen": -197.447021484375, "logps/rejected": -147.70782470703125, "loss": 0.629, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6751397848129272, "rewards/margins": 0.1756085753440857, "rewards/rejected": 0.49953117966651917, "step": 4740 }, { "epoch": 1.496004094326995, "grad_norm": 2.734375, "learning_rate": 3.891663615327518e-06, "logits/chosen": -0.47268587350845337, "logits/rejected": -0.3960368037223816, "logps/chosen": -188.75637817382812, "logps/rejected": -167.4852752685547, "loss": 0.6569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6986013054847717, "rewards/margins": 0.12062957137823105, "rewards/rejected": 0.5779717564582825, "step": 4750 }, { "epoch": 1.4991535766308413, "grad_norm": 2.703125, "learning_rate": 3.887281604847134e-06, "logits/chosen": -0.510342001914978, "logits/rejected": -0.36850231885910034, "logps/chosen": -192.31849670410156, "logps/rejected": -166.56289672851562, "loss": 0.64, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6647813320159912, "rewards/margins": 0.15008948743343353, "rewards/rejected": 0.5146918296813965, "step": 4760 }, { "epoch": 1.5023030589346877, "grad_norm": 2.96875, "learning_rate": 3.8828934271150225e-06, "logits/chosen": -0.460064172744751, "logits/rejected": -0.4005635678768158, "logps/chosen": -222.4634246826172, "logps/rejected": -201.12998962402344, "loss": 0.6611, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7676762938499451, "rewards/margins": 0.10998652130365372, "rewards/rejected": 0.6576897501945496, "step": 4770 }, { "epoch": 1.505452541238534, "grad_norm": 3.21875, "learning_rate": 3.878499101639116e-06, "logits/chosen": -0.5471469759941101, "logits/rejected": -0.4315733313560486, "logps/chosen": -210.2429962158203, "logps/rejected": -175.91360473632812, "loss": 0.6623, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7174965143203735, "rewards/margins": 0.10746297985315323, "rewards/rejected": 0.6100335717201233, "step": 4780 }, { "epoch": 1.5086020235423803, "grad_norm": 3.4375, "learning_rate": 3.8740986479546796e-06, "logits/chosen": -0.45277899503707886, "logits/rejected": -0.4165739119052887, "logps/chosen": -194.2645263671875, "logps/rejected": -179.887939453125, "loss": 0.6491, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7052034139633179, "rewards/margins": 0.1254056990146637, "rewards/rejected": 0.5797977447509766, "step": 4790 }, { "epoch": 1.5117515058462265, "grad_norm": 2.796875, "learning_rate": 3.869692085624218e-06, "logits/chosen": -0.45139655470848083, "logits/rejected": -0.3898164629936218, "logps/chosen": -209.1693572998047, "logps/rejected": -194.74021911621094, "loss": 0.6479, "rewards/accuracies": 0.625, "rewards/chosen": 0.7721225619316101, "rewards/margins": 0.12638486921787262, "rewards/rejected": 0.6457376480102539, "step": 4800 }, { "epoch": 1.5149009881500728, "grad_norm": 2.859375, "learning_rate": 3.865279434237394e-06, "logits/chosen": -0.4812788963317871, "logits/rejected": -0.39241549372673035, "logps/chosen": -190.20193481445312, "logps/rejected": -166.25454711914062, "loss": 0.6459, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6281577944755554, "rewards/margins": 0.12559270858764648, "rewards/rejected": 0.5025650858879089, "step": 4810 }, { "epoch": 1.518050470453919, "grad_norm": 3.015625, "learning_rate": 3.860860713410941e-06, "logits/chosen": -0.38653573393821716, "logits/rejected": -0.361195832490921, "logps/chosen": -204.06520080566406, "logps/rejected": -209.9178009033203, "loss": 0.6876, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.7754232287406921, "rewards/margins": 0.045936040580272675, "rewards/rejected": 0.7294871211051941, "step": 4820 }, { "epoch": 1.5211999527577653, "grad_norm": 2.453125, "learning_rate": 3.8564359427885735e-06, "logits/chosen": -0.469553142786026, "logits/rejected": -0.41318267583847046, "logps/chosen": -207.17117309570312, "logps/rejected": -177.3662872314453, "loss": 0.6248, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7327437400817871, "rewards/margins": 0.1847001016139984, "rewards/rejected": 0.5480436086654663, "step": 4830 }, { "epoch": 1.5243494350616118, "grad_norm": 2.984375, "learning_rate": 3.852005142040901e-06, "logits/chosen": -0.4887300133705139, "logits/rejected": -0.38054460287094116, "logps/chosen": -189.93447875976562, "logps/rejected": -159.81796264648438, "loss": 0.635, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6666679978370667, "rewards/margins": 0.1536942720413208, "rewards/rejected": 0.5129736661911011, "step": 4840 }, { "epoch": 1.527498917365458, "grad_norm": 2.453125, "learning_rate": 3.8475683308653385e-06, "logits/chosen": -0.4928087294101715, "logits/rejected": -0.3735254406929016, "logps/chosen": -204.97787475585938, "logps/rejected": -168.03756713867188, "loss": 0.6341, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.772538423538208, "rewards/margins": 0.1797189563512802, "rewards/rejected": 0.5928195118904114, "step": 4850 }, { "epoch": 1.5306483996693043, "grad_norm": 2.375, "learning_rate": 3.8431255289860225e-06, "logits/chosen": -0.47978100180625916, "logits/rejected": -0.3051653504371643, "logps/chosen": -217.0726776123047, "logps/rejected": -169.8887939453125, "loss": 0.5989, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.799135148525238, "rewards/margins": 0.24375374615192413, "rewards/rejected": 0.5553814172744751, "step": 4860 }, { "epoch": 1.5337978819731508, "grad_norm": 3.1875, "learning_rate": 3.838676756153723e-06, "logits/chosen": -0.4916785657405853, "logits/rejected": -0.4126752018928528, "logps/chosen": -204.36715698242188, "logps/rejected": -171.4047088623047, "loss": 0.6373, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7680164575576782, "rewards/margins": 0.1488504558801651, "rewards/rejected": 0.6191660761833191, "step": 4870 }, { "epoch": 1.536947364276997, "grad_norm": 2.328125, "learning_rate": 3.834222032145751e-06, "logits/chosen": -0.4622599184513092, "logits/rejected": -0.3681351840496063, "logps/chosen": -192.53379821777344, "logps/rejected": -161.97409057617188, "loss": 0.6518, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.6855158805847168, "rewards/margins": 0.11872188746929169, "rewards/rejected": 0.5667939782142639, "step": 4880 }, { "epoch": 1.5400968465808433, "grad_norm": 3.296875, "learning_rate": 3.829761376765875e-06, "logits/chosen": -0.5002883076667786, "logits/rejected": -0.37738004326820374, "logps/chosen": -208.83547973632812, "logps/rejected": -180.82960510253906, "loss": 0.6302, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7079477906227112, "rewards/margins": 0.172250896692276, "rewards/rejected": 0.5356968641281128, "step": 4890 }, { "epoch": 1.5432463288846896, "grad_norm": 2.640625, "learning_rate": 3.825294809844234e-06, "logits/chosen": -0.4677095413208008, "logits/rejected": -0.43684762716293335, "logps/chosen": -204.42538452148438, "logps/rejected": -189.5218505859375, "loss": 0.6914, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.697405219078064, "rewards/margins": 0.04717016965150833, "rewards/rejected": 0.6502350568771362, "step": 4900 }, { "epoch": 1.5463958111885359, "grad_norm": 3.234375, "learning_rate": 3.820822351237245e-06, "logits/chosen": -0.44419798254966736, "logits/rejected": -0.3508889377117157, "logps/chosen": -192.20458984375, "logps/rejected": -182.95726013183594, "loss": 0.665, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6904107928276062, "rewards/margins": 0.09415493160486221, "rewards/rejected": 0.596255898475647, "step": 4910 }, { "epoch": 1.5495452934923821, "grad_norm": 2.59375, "learning_rate": 3.816344020827516e-06, "logits/chosen": -0.5454775094985962, "logits/rejected": -0.39591822028160095, "logps/chosen": -196.39208984375, "logps/rejected": -159.05348205566406, "loss": 0.6543, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6435393691062927, "rewards/margins": 0.11705289781093597, "rewards/rejected": 0.5264865159988403, "step": 4920 }, { "epoch": 1.5526947757962284, "grad_norm": 3.359375, "learning_rate": 3.8118598385237604e-06, "logits/chosen": -0.439689576625824, "logits/rejected": -0.4039355218410492, "logps/chosen": -186.1417236328125, "logps/rejected": -170.55740356445312, "loss": 0.6797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6710829138755798, "rewards/margins": 0.06706185638904572, "rewards/rejected": 0.6040210127830505, "step": 4930 }, { "epoch": 1.5558442581000747, "grad_norm": 2.28125, "learning_rate": 3.807369824260706e-06, "logits/chosen": -0.45275792479515076, "logits/rejected": -0.3387320935726166, "logps/chosen": -205.11349487304688, "logps/rejected": -165.2199249267578, "loss": 0.626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7621325850486755, "rewards/margins": 0.1830775886774063, "rewards/rejected": 0.5790549516677856, "step": 4940 }, { "epoch": 1.5589937404039211, "grad_norm": 2.828125, "learning_rate": 3.8028739979990072e-06, "logits/chosen": -0.4740076959133148, "logits/rejected": -0.36817610263824463, "logps/chosen": -217.1376190185547, "logps/rejected": -184.6923065185547, "loss": 0.617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7812771797180176, "rewards/margins": 0.19556982815265656, "rewards/rejected": 0.5857073068618774, "step": 4950 }, { "epoch": 1.5621432227077674, "grad_norm": 2.859375, "learning_rate": 3.798372379725155e-06, "logits/chosen": -0.47232404351234436, "logits/rejected": -0.30332669615745544, "logps/chosen": -179.11349487304688, "logps/rejected": -152.26527404785156, "loss": 0.6515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6055221557617188, "rewards/margins": 0.1263987123966217, "rewards/rejected": 0.47912344336509705, "step": 4960 }, { "epoch": 1.5652927050116137, "grad_norm": 3.4375, "learning_rate": 3.79386498945139e-06, "logits/chosen": -0.512363612651825, "logits/rejected": -0.3525320291519165, "logps/chosen": -199.22694396972656, "logps/rejected": -158.5713348388672, "loss": 0.6495, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7140517234802246, "rewards/margins": 0.13541573286056519, "rewards/rejected": 0.5786360502243042, "step": 4970 }, { "epoch": 1.5684421873154601, "grad_norm": 3.125, "learning_rate": 3.789351847215613e-06, "logits/chosen": -0.4590677320957184, "logits/rejected": -0.2966582179069519, "logps/chosen": -211.01083374023438, "logps/rejected": -171.15493774414062, "loss": 0.6124, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.7449513673782349, "rewards/margins": 0.21209721267223358, "rewards/rejected": 0.5328541994094849, "step": 4980 }, { "epoch": 1.5715916696193064, "grad_norm": 3.046875, "learning_rate": 3.784832973081295e-06, "logits/chosen": -0.4136572778224945, "logits/rejected": -0.3735829293727875, "logps/chosen": -190.76473999023438, "logps/rejected": -176.0132293701172, "loss": 0.6683, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.7110625505447388, "rewards/margins": 0.08584681898355484, "rewards/rejected": 0.6252157688140869, "step": 4990 }, { "epoch": 1.5747411519231527, "grad_norm": 2.828125, "learning_rate": 3.7803083871373876e-06, "logits/chosen": -0.42973631620407104, "logits/rejected": -0.2951328754425049, "logps/chosen": -206.06332397460938, "logps/rejected": -174.61618041992188, "loss": 0.6725, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.7120364308357239, "rewards/margins": 0.0878576785326004, "rewards/rejected": 0.6241787672042847, "step": 5000 }, { "epoch": 1.577890634226999, "grad_norm": 3.296875, "learning_rate": 3.775778109498237e-06, "logits/chosen": -0.391795814037323, "logits/rejected": -0.2272360622882843, "logps/chosen": -214.9050750732422, "logps/rejected": -175.29550170898438, "loss": 0.6073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7749017477035522, "rewards/margins": 0.23047152161598206, "rewards/rejected": 0.544430136680603, "step": 5010 }, { "epoch": 1.5810401165308452, "grad_norm": 2.578125, "learning_rate": 3.7712421603034894e-06, "logits/chosen": -0.43912237882614136, "logits/rejected": -0.34138351678848267, "logps/chosen": -204.3086395263672, "logps/rejected": -182.76370239257812, "loss": 0.7119, "rewards/accuracies": 0.5, "rewards/chosen": 0.6654126048088074, "rewards/margins": 0.0014205619227141142, "rewards/rejected": 0.6639919877052307, "step": 5020 }, { "epoch": 1.5841895988346915, "grad_norm": 2.765625, "learning_rate": 3.766700559718006e-06, "logits/chosen": -0.5014868974685669, "logits/rejected": -0.4088328778743744, "logps/chosen": -185.99476623535156, "logps/rejected": -165.3468017578125, "loss": 0.6596, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6833370923995972, "rewards/margins": 0.10189273208379745, "rewards/rejected": 0.5814443826675415, "step": 5030 }, { "epoch": 1.5873390811385377, "grad_norm": 2.390625, "learning_rate": 3.762153327931772e-06, "logits/chosen": -0.44292283058166504, "logits/rejected": -0.3582010269165039, "logps/chosen": -201.4560546875, "logps/rejected": -181.69656372070312, "loss": 0.6431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.720178484916687, "rewards/margins": 0.14207497239112854, "rewards/rejected": 0.5781034231185913, "step": 5040 }, { "epoch": 1.5904885634423842, "grad_norm": 3.375, "learning_rate": 3.7576004851598052e-06, "logits/chosen": -0.4733700156211853, "logits/rejected": -0.3716031610965729, "logps/chosen": -199.3379364013672, "logps/rejected": -178.63943481445312, "loss": 0.6621, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.75123131275177, "rewards/margins": 0.10580404102802277, "rewards/rejected": 0.6454272866249084, "step": 5050 }, { "epoch": 1.5936380457462305, "grad_norm": 2.6875, "learning_rate": 3.7530420516420676e-06, "logits/chosen": -0.4269101023674011, "logits/rejected": -0.37862318754196167, "logps/chosen": -199.61886596679688, "logps/rejected": -173.53884887695312, "loss": 0.6456, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7329145073890686, "rewards/margins": 0.1574871987104416, "rewards/rejected": 0.575427234172821, "step": 5060 }, { "epoch": 1.5967875280500767, "grad_norm": 2.546875, "learning_rate": 3.7484780476433764e-06, "logits/chosen": -0.4474611282348633, "logits/rejected": -0.35605159401893616, "logps/chosen": -188.90847778320312, "logps/rejected": -159.92965698242188, "loss": 0.6346, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6905893087387085, "rewards/margins": 0.16502177715301514, "rewards/rejected": 0.5255674123764038, "step": 5070 }, { "epoch": 1.5999370103539232, "grad_norm": 2.75, "learning_rate": 3.743908493453311e-06, "logits/chosen": -0.43630313873291016, "logits/rejected": -0.37972143292427063, "logps/chosen": -223.48648071289062, "logps/rejected": -194.86001586914062, "loss": 0.6523, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7133353352546692, "rewards/margins": 0.11560231447219849, "rewards/rejected": 0.5977329611778259, "step": 5080 }, { "epoch": 1.6030864926577695, "grad_norm": 3.0625, "learning_rate": 3.739333409386126e-06, "logits/chosen": -0.44998350739479065, "logits/rejected": -0.3347667157649994, "logps/chosen": -207.3821563720703, "logps/rejected": -182.54971313476562, "loss": 0.6532, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7008602619171143, "rewards/margins": 0.14106692373752594, "rewards/rejected": 0.5597933530807495, "step": 5090 }, { "epoch": 1.6062359749616157, "grad_norm": 2.25, "learning_rate": 3.734752815780659e-06, "logits/chosen": -0.514176070690155, "logits/rejected": -0.3695998787879944, "logps/chosen": -196.74203491210938, "logps/rejected": -162.3852081298828, "loss": 0.6246, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7342701554298401, "rewards/margins": 0.20258764922618866, "rewards/rejected": 0.5316824913024902, "step": 5100 }, { "epoch": 1.609385457265462, "grad_norm": 2.5, "learning_rate": 3.7301667330002408e-06, "logits/chosen": -0.439274400472641, "logits/rejected": -0.3606041669845581, "logps/chosen": -202.26397705078125, "logps/rejected": -176.84947204589844, "loss": 0.641, "rewards/accuracies": 0.625, "rewards/chosen": 0.7874661684036255, "rewards/margins": 0.1603676974773407, "rewards/rejected": 0.6270985007286072, "step": 5110 }, { "epoch": 1.6125349395693083, "grad_norm": 2.625, "learning_rate": 3.7255751814326035e-06, "logits/chosen": -0.4579714238643646, "logits/rejected": -0.3923242390155792, "logps/chosen": -189.69895935058594, "logps/rejected": -181.8014373779297, "loss": 0.6762, "rewards/accuracies": 0.625, "rewards/chosen": 0.6748474836349487, "rewards/margins": 0.07303015887737274, "rewards/rejected": 0.6018173098564148, "step": 5120 }, { "epoch": 1.6156844218731545, "grad_norm": 2.734375, "learning_rate": 3.720978181489792e-06, "logits/chosen": -0.4986580014228821, "logits/rejected": -0.4134606719017029, "logps/chosen": -189.92276000976562, "logps/rejected": -189.25656127929688, "loss": 0.6926, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.7121160626411438, "rewards/margins": 0.05977436155080795, "rewards/rejected": 0.6523416042327881, "step": 5130 }, { "epoch": 1.6188339041770008, "grad_norm": 3.046875, "learning_rate": 3.716375753608073e-06, "logits/chosen": -0.45538026094436646, "logits/rejected": -0.29467564821243286, "logps/chosen": -217.4430694580078, "logps/rejected": -182.09951782226562, "loss": 0.6399, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.804099440574646, "rewards/margins": 0.17206081748008728, "rewards/rejected": 0.6320386528968811, "step": 5140 }, { "epoch": 1.621983386480847, "grad_norm": 2.34375, "learning_rate": 3.7117679182478415e-06, "logits/chosen": -0.45015448331832886, "logits/rejected": -0.3417063355445862, "logps/chosen": -196.57412719726562, "logps/rejected": -183.40467834472656, "loss": 0.669, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6573584675788879, "rewards/margins": 0.0827714204788208, "rewards/rejected": 0.5745870471000671, "step": 5150 }, { "epoch": 1.6251328687846935, "grad_norm": 3.359375, "learning_rate": 3.707154695893535e-06, "logits/chosen": -0.49352359771728516, "logits/rejected": -0.3199608027935028, "logps/chosen": -191.95172119140625, "logps/rejected": -168.58969116210938, "loss": 0.6372, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.743454098701477, "rewards/margins": 0.16557399928569794, "rewards/rejected": 0.5778801441192627, "step": 5160 }, { "epoch": 1.6282823510885398, "grad_norm": 2.6875, "learning_rate": 3.702536107053536e-06, "logits/chosen": -0.402625173330307, "logits/rejected": -0.3243730366230011, "logps/chosen": -182.0989532470703, "logps/rejected": -162.49978637695312, "loss": 0.6791, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6483687162399292, "rewards/margins": 0.07241274416446686, "rewards/rejected": 0.5759559869766235, "step": 5170 }, { "epoch": 1.6314318333923863, "grad_norm": 2.5625, "learning_rate": 3.697912172260085e-06, "logits/chosen": -0.49576109647750854, "logits/rejected": -0.39149078726768494, "logps/chosen": -205.7623291015625, "logps/rejected": -176.04811096191406, "loss": 0.6461, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7142454385757446, "rewards/margins": 0.13653890788555145, "rewards/rejected": 0.5777064561843872, "step": 5180 }, { "epoch": 1.6345813156962326, "grad_norm": 3.140625, "learning_rate": 3.693282912069189e-06, "logits/chosen": -0.4688878655433655, "logits/rejected": -0.3151419162750244, "logps/chosen": -222.38955688476562, "logps/rejected": -187.0220489501953, "loss": 0.6532, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7705121636390686, "rewards/margins": 0.11784696578979492, "rewards/rejected": 0.6526652574539185, "step": 5190 }, { "epoch": 1.6377307980000788, "grad_norm": 3.21875, "learning_rate": 3.6886483470605293e-06, "logits/chosen": -0.4141194224357605, "logits/rejected": -0.33433422446250916, "logps/chosen": -192.88711547851562, "logps/rejected": -161.22756958007812, "loss": 0.6707, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.645660400390625, "rewards/margins": 0.0918336808681488, "rewards/rejected": 0.5538267493247986, "step": 5200 }, { "epoch": 1.640880280303925, "grad_norm": 2.65625, "learning_rate": 3.6840084978373704e-06, "logits/chosen": -0.4529247283935547, "logits/rejected": -0.23250219225883484, "logps/chosen": -212.8038330078125, "logps/rejected": -176.3964385986328, "loss": 0.6297, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7746697664260864, "rewards/margins": 0.16910018026828766, "rewards/rejected": 0.6055695414543152, "step": 5210 }, { "epoch": 1.6440297626077713, "grad_norm": 2.328125, "learning_rate": 3.6793633850264655e-06, "logits/chosen": -0.4927656650543213, "logits/rejected": -0.3135763704776764, "logps/chosen": -213.3108673095703, "logps/rejected": -167.95083618164062, "loss": 0.6254, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7371099591255188, "rewards/margins": 0.19389715790748596, "rewards/rejected": 0.5432127714157104, "step": 5220 }, { "epoch": 1.6471792449116176, "grad_norm": 2.59375, "learning_rate": 3.6747130292779715e-06, "logits/chosen": -0.4238489270210266, "logits/rejected": -0.3129653036594391, "logps/chosen": -194.534912109375, "logps/rejected": -174.8766632080078, "loss": 0.6662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7266419529914856, "rewards/margins": 0.09835498034954071, "rewards/rejected": 0.6282869577407837, "step": 5230 }, { "epoch": 1.6503287272154639, "grad_norm": 2.78125, "learning_rate": 3.6700574512653497e-06, "logits/chosen": -0.4622649550437927, "logits/rejected": -0.37696540355682373, "logps/chosen": -199.86097717285156, "logps/rejected": -181.8938446044922, "loss": 0.6482, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7182890176773071, "rewards/margins": 0.12708035111427307, "rewards/rejected": 0.5912087559700012, "step": 5240 }, { "epoch": 1.6534782095193101, "grad_norm": 2.5, "learning_rate": 3.66539667168528e-06, "logits/chosen": -0.4936433732509613, "logits/rejected": -0.3448127210140228, "logps/chosen": -187.07192993164062, "logps/rejected": -149.51683044433594, "loss": 0.6323, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6617879867553711, "rewards/margins": 0.18173685669898987, "rewards/rejected": 0.480051189661026, "step": 5250 }, { "epoch": 1.6566276918231566, "grad_norm": 3.109375, "learning_rate": 3.6607307112575646e-06, "logits/chosen": -0.5180322527885437, "logits/rejected": -0.3746757507324219, "logps/chosen": -201.63467407226562, "logps/rejected": -166.1622314453125, "loss": 0.5896, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.7969163656234741, "rewards/margins": 0.26014357805252075, "rewards/rejected": 0.5367728471755981, "step": 5260 }, { "epoch": 1.6597771741270029, "grad_norm": 3.15625, "learning_rate": 3.6560595907250375e-06, "logits/chosen": -0.3662557005882263, "logits/rejected": -0.3333319425582886, "logps/chosen": -191.13694763183594, "logps/rejected": -189.7360382080078, "loss": 0.6766, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.7387022972106934, "rewards/margins": 0.08876340091228485, "rewards/rejected": 0.6499389410018921, "step": 5270 }, { "epoch": 1.6629266564308491, "grad_norm": 3.265625, "learning_rate": 3.651383330853472e-06, "logits/chosen": -0.4906235635280609, "logits/rejected": -0.36935657262802124, "logps/chosen": -217.0959014892578, "logps/rejected": -180.4376220703125, "loss": 0.6515, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7411099076271057, "rewards/margins": 0.1661885380744934, "rewards/rejected": 0.5749213695526123, "step": 5280 }, { "epoch": 1.6660761387346956, "grad_norm": 2.21875, "learning_rate": 3.6467019524314905e-06, "logits/chosen": -0.49627685546875, "logits/rejected": -0.3914474844932556, "logps/chosen": -189.37979125976562, "logps/rejected": -160.41397094726562, "loss": 0.6208, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7144301533699036, "rewards/margins": 0.19551965594291687, "rewards/rejected": 0.5189104676246643, "step": 5290 }, { "epoch": 1.6692256210385419, "grad_norm": 2.515625, "learning_rate": 3.6420154762704685e-06, "logits/chosen": -0.5125707983970642, "logits/rejected": -0.3919418752193451, "logps/chosen": -194.13682556152344, "logps/rejected": -161.27906799316406, "loss": 0.6436, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6802747249603271, "rewards/margins": 0.1379631608724594, "rewards/rejected": 0.5423115491867065, "step": 5300 }, { "epoch": 1.6723751033423881, "grad_norm": 3.0625, "learning_rate": 3.6373239232044445e-06, "logits/chosen": -0.5417731404304504, "logits/rejected": -0.3268618881702423, "logps/chosen": -193.62232971191406, "logps/rejected": -159.81134033203125, "loss": 0.6213, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6962756514549255, "rewards/margins": 0.18256238102912903, "rewards/rejected": 0.5137132406234741, "step": 5310 }, { "epoch": 1.6755245856462344, "grad_norm": 2.359375, "learning_rate": 3.632627314090026e-06, "logits/chosen": -0.4504339098930359, "logits/rejected": -0.38121432065963745, "logps/chosen": -208.55001831054688, "logps/rejected": -187.78811645507812, "loss": 0.6453, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7359059453010559, "rewards/margins": 0.14206233620643616, "rewards/rejected": 0.5938436388969421, "step": 5320 }, { "epoch": 1.6786740679500807, "grad_norm": 3.03125, "learning_rate": 3.6279256698062986e-06, "logits/chosen": -0.5013277530670166, "logits/rejected": -0.33975881338119507, "logps/chosen": -206.8636474609375, "logps/rejected": -179.7592315673828, "loss": 0.6196, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7417197227478027, "rewards/margins": 0.20961704850196838, "rewards/rejected": 0.5321027040481567, "step": 5330 }, { "epoch": 1.681823550253927, "grad_norm": 2.984375, "learning_rate": 3.6232190112547324e-06, "logits/chosen": -0.5261915922164917, "logits/rejected": -0.4154892861843109, "logps/chosen": -196.72494506835938, "logps/rejected": -167.34524536132812, "loss": 0.6448, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6392444968223572, "rewards/margins": 0.13428232073783875, "rewards/rejected": 0.5049622058868408, "step": 5340 }, { "epoch": 1.6849730325577732, "grad_norm": 2.28125, "learning_rate": 3.6185073593590868e-06, "logits/chosen": -0.5182952880859375, "logits/rejected": -0.32755130529403687, "logps/chosen": -193.91754150390625, "logps/rejected": -147.9218292236328, "loss": 0.6283, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6746249794960022, "rewards/margins": 0.181734099984169, "rewards/rejected": 0.4928908944129944, "step": 5350 }, { "epoch": 1.6881225148616195, "grad_norm": 2.84375, "learning_rate": 3.613790735065321e-06, "logits/chosen": -0.397797167301178, "logits/rejected": -0.34379732608795166, "logps/chosen": -196.30372619628906, "logps/rejected": -173.0957489013672, "loss": 0.653, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7253559827804565, "rewards/margins": 0.13321705162525177, "rewards/rejected": 0.592138946056366, "step": 5360 }, { "epoch": 1.691271997165466, "grad_norm": 2.46875, "learning_rate": 3.6090691593414978e-06, "logits/chosen": -0.49348416924476624, "logits/rejected": -0.33678576350212097, "logps/chosen": -201.0540313720703, "logps/rejected": -170.12416076660156, "loss": 0.611, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.7913371324539185, "rewards/margins": 0.21138879656791687, "rewards/rejected": 0.5799483060836792, "step": 5370 }, { "epoch": 1.6944214794693122, "grad_norm": 2.546875, "learning_rate": 3.604342653177695e-06, "logits/chosen": -0.48641714453697205, "logits/rejected": -0.40380674600601196, "logps/chosen": -179.4705810546875, "logps/rejected": -162.80154418945312, "loss": 0.6501, "rewards/accuracies": 0.625, "rewards/chosen": 0.6522781252861023, "rewards/margins": 0.11866960674524307, "rewards/rejected": 0.5336084961891174, "step": 5380 }, { "epoch": 1.6975709617731587, "grad_norm": 2.4375, "learning_rate": 3.599611237585906e-06, "logits/chosen": -0.45630472898483276, "logits/rejected": -0.3485426902770996, "logps/chosen": -171.63662719726562, "logps/rejected": -156.93072509765625, "loss": 0.6609, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.5976822972297668, "rewards/margins": 0.1124814972281456, "rewards/rejected": 0.48520079255104065, "step": 5390 }, { "epoch": 1.700720444077005, "grad_norm": 2.640625, "learning_rate": 3.5948749335999493e-06, "logits/chosen": -0.5297515988349915, "logits/rejected": -0.44055309891700745, "logps/chosen": -192.01625061035156, "logps/rejected": -184.41848754882812, "loss": 0.6709, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6303820610046387, "rewards/margins": 0.07556124031543732, "rewards/rejected": 0.5548208951950073, "step": 5400 }, { "epoch": 1.7038699263808512, "grad_norm": 2.890625, "learning_rate": 3.590133762275378e-06, "logits/chosen": -0.5689431428909302, "logits/rejected": -0.3852362334728241, "logps/chosen": -212.37899780273438, "logps/rejected": -171.7864227294922, "loss": 0.6305, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7726091742515564, "rewards/margins": 0.1817570924758911, "rewards/rejected": 0.5908521413803101, "step": 5410 }, { "epoch": 1.7070194086846975, "grad_norm": 2.4375, "learning_rate": 3.5853877446893802e-06, "logits/chosen": -0.496450811624527, "logits/rejected": -0.39372554421424866, "logps/chosen": -183.16973876953125, "logps/rejected": -166.51968383789062, "loss": 0.6613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6342931985855103, "rewards/margins": 0.08918163925409317, "rewards/rejected": 0.5451115369796753, "step": 5420 }, { "epoch": 1.7101688909885437, "grad_norm": 2.578125, "learning_rate": 3.5806369019406906e-06, "logits/chosen": -0.5108271837234497, "logits/rejected": -0.33603325486183167, "logps/chosen": -210.8227996826172, "logps/rejected": -167.770263671875, "loss": 0.6393, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7098419666290283, "rewards/margins": 0.17859862744808197, "rewards/rejected": 0.5312432646751404, "step": 5430 }, { "epoch": 1.71331837329239, "grad_norm": 2.90625, "learning_rate": 3.5758812551494926e-06, "logits/chosen": -0.4778234362602234, "logits/rejected": -0.3700777590274811, "logps/chosen": -218.7685546875, "logps/rejected": -184.45120239257812, "loss": 0.6412, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.798811674118042, "rewards/margins": 0.14647333323955536, "rewards/rejected": 0.6523382663726807, "step": 5440 }, { "epoch": 1.7164678555962363, "grad_norm": 2.4375, "learning_rate": 3.571120825457327e-06, "logits/chosen": -0.4903620779514313, "logits/rejected": -0.37475109100341797, "logps/chosen": -196.15283203125, "logps/rejected": -187.24850463867188, "loss": 0.6469, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6965973377227783, "rewards/margins": 0.13229303061962128, "rewards/rejected": 0.5643042922019958, "step": 5450 }, { "epoch": 1.7196173379000825, "grad_norm": 3.171875, "learning_rate": 3.5663556340269984e-06, "logits/chosen": -0.5170494318008423, "logits/rejected": -0.3688809871673584, "logps/chosen": -198.2430419921875, "logps/rejected": -165.7610321044922, "loss": 0.6394, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7186397314071655, "rewards/margins": 0.14983563125133514, "rewards/rejected": 0.568804144859314, "step": 5460 }, { "epoch": 1.722766820203929, "grad_norm": 3.265625, "learning_rate": 3.5615857020424786e-06, "logits/chosen": -0.48056459426879883, "logits/rejected": -0.42547035217285156, "logps/chosen": -225.29281616210938, "logps/rejected": -195.69827270507812, "loss": 0.6488, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7470188140869141, "rewards/margins": 0.1338168829679489, "rewards/rejected": 0.613201916217804, "step": 5470 }, { "epoch": 1.7259163025077753, "grad_norm": 3.21875, "learning_rate": 3.5568110507088146e-06, "logits/chosen": -0.42114323377609253, "logits/rejected": -0.29093047976493835, "logps/chosen": -188.66677856445312, "logps/rejected": -163.27169799804688, "loss": 0.6448, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6538271307945251, "rewards/margins": 0.14478826522827148, "rewards/rejected": 0.5090388059616089, "step": 5480 }, { "epoch": 1.7290657848116215, "grad_norm": 2.75, "learning_rate": 3.5520317012520327e-06, "logits/chosen": -0.4472483694553375, "logits/rejected": -0.3266296684741974, "logps/chosen": -198.31227111816406, "logps/rejected": -158.91233825683594, "loss": 0.6354, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6903390884399414, "rewards/margins": 0.15532377362251282, "rewards/rejected": 0.5350152850151062, "step": 5490 }, { "epoch": 1.732215267115468, "grad_norm": 1.9765625, "learning_rate": 3.5472476749190465e-06, "logits/chosen": -0.44366365671157837, "logits/rejected": -0.3832937180995941, "logps/chosen": -178.91236877441406, "logps/rejected": -161.47250366210938, "loss": 0.6579, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6258020401000977, "rewards/margins": 0.1169646754860878, "rewards/rejected": 0.5088373422622681, "step": 5500 }, { "epoch": 1.7353647494193143, "grad_norm": 3.53125, "learning_rate": 3.5424589929775593e-06, "logits/chosen": -0.4584302306175232, "logits/rejected": -0.3267679810523987, "logps/chosen": -193.6768798828125, "logps/rejected": -170.361572265625, "loss": 0.6402, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7419657707214355, "rewards/margins": 0.14626576006412506, "rewards/rejected": 0.5956999063491821, "step": 5510 }, { "epoch": 1.7385142317231606, "grad_norm": 2.859375, "learning_rate": 3.5376656767159724e-06, "logits/chosen": -0.42630109190940857, "logits/rejected": -0.33150094747543335, "logps/chosen": -188.36764526367188, "logps/rejected": -165.72805786132812, "loss": 0.6567, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.6277583837509155, "rewards/margins": 0.11112833023071289, "rewards/rejected": 0.5166300535202026, "step": 5520 }, { "epoch": 1.7416637140270068, "grad_norm": 2.40625, "learning_rate": 3.5328677474432893e-06, "logits/chosen": -0.4710637629032135, "logits/rejected": -0.36444777250289917, "logps/chosen": -193.68409729003906, "logps/rejected": -158.17857360839844, "loss": 0.6465, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6602168083190918, "rewards/margins": 0.13958847522735596, "rewards/rejected": 0.5206283330917358, "step": 5530 }, { "epoch": 1.744813196330853, "grad_norm": 2.640625, "learning_rate": 3.5280652264890197e-06, "logits/chosen": -0.5480708479881287, "logits/rejected": -0.4107402265071869, "logps/chosen": -198.30174255371094, "logps/rejected": -155.7358856201172, "loss": 0.6093, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7575246095657349, "rewards/margins": 0.22439555823802948, "rewards/rejected": 0.5331289768218994, "step": 5540 }, { "epoch": 1.7479626786346993, "grad_norm": 2.375, "learning_rate": 3.523258135203087e-06, "logits/chosen": -0.49628472328186035, "logits/rejected": -0.39903074502944946, "logps/chosen": -206.2187957763672, "logps/rejected": -172.8035888671875, "loss": 0.6493, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7126896977424622, "rewards/margins": 0.13072232902050018, "rewards/rejected": 0.5819673538208008, "step": 5550 }, { "epoch": 1.7511121609385456, "grad_norm": 3.015625, "learning_rate": 3.518446494955732e-06, "logits/chosen": -0.5156094431877136, "logits/rejected": -0.38701295852661133, "logps/chosen": -179.8013153076172, "logps/rejected": -152.9771728515625, "loss": 0.6506, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6560216546058655, "rewards/margins": 0.12969034910202026, "rewards/rejected": 0.52633136510849, "step": 5560 }, { "epoch": 1.7542616432423919, "grad_norm": 2.65625, "learning_rate": 3.5136303271374185e-06, "logits/chosen": -0.46732720732688904, "logits/rejected": -0.33687490224838257, "logps/chosen": -198.79708862304688, "logps/rejected": -176.80389404296875, "loss": 0.6617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7469144463539124, "rewards/margins": 0.11790307611227036, "rewards/rejected": 0.629011332988739, "step": 5570 }, { "epoch": 1.7574111255462384, "grad_norm": 3.203125, "learning_rate": 3.5088096531587377e-06, "logits/chosen": -0.4696858823299408, "logits/rejected": -0.378410279750824, "logps/chosen": -198.86094665527344, "logps/rejected": -171.12442016601562, "loss": 0.6509, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6952942609786987, "rewards/margins": 0.12442521750926971, "rewards/rejected": 0.5708690285682678, "step": 5580 }, { "epoch": 1.7605606078500846, "grad_norm": 2.265625, "learning_rate": 3.5039844944503137e-06, "logits/chosen": -0.44871068000793457, "logits/rejected": -0.23819032311439514, "logps/chosen": -208.89993286132812, "logps/rejected": -163.7926025390625, "loss": 0.6338, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7412741184234619, "rewards/margins": 0.17887642979621887, "rewards/rejected": 0.5623977780342102, "step": 5590 }, { "epoch": 1.763710090153931, "grad_norm": 2.484375, "learning_rate": 3.4991548724627054e-06, "logits/chosen": -0.5238919258117676, "logits/rejected": -0.3585182726383209, "logps/chosen": -222.5647430419922, "logps/rejected": -178.90841674804688, "loss": 0.5866, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7905155420303345, "rewards/margins": 0.27365249395370483, "rewards/rejected": 0.5168629884719849, "step": 5600 }, { "epoch": 1.7668595724577774, "grad_norm": 2.46875, "learning_rate": 3.4943208086663183e-06, "logits/chosen": -0.4847659170627594, "logits/rejected": -0.33793026208877563, "logps/chosen": -197.34933471679688, "logps/rejected": -174.9829559326172, "loss": 0.646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7014733552932739, "rewards/margins": 0.12839707732200623, "rewards/rejected": 0.5730762481689453, "step": 5610 }, { "epoch": 1.7700090547616236, "grad_norm": 3.578125, "learning_rate": 3.4894823245512986e-06, "logits/chosen": -0.506749153137207, "logits/rejected": -0.45556968450546265, "logps/chosen": -197.71902465820312, "logps/rejected": -186.50241088867188, "loss": 0.6803, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.7117626070976257, "rewards/margins": 0.06737571209669113, "rewards/rejected": 0.644386887550354, "step": 5620 }, { "epoch": 1.7731585370654699, "grad_norm": 2.578125, "learning_rate": 3.484639441627448e-06, "logits/chosen": -0.5070594549179077, "logits/rejected": -0.3329693078994751, "logps/chosen": -220.60986328125, "logps/rejected": -183.98416137695312, "loss": 0.6042, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.7875211834907532, "rewards/margins": 0.2286391705274582, "rewards/rejected": 0.5588821172714233, "step": 5630 }, { "epoch": 1.7763080193693161, "grad_norm": 2.546875, "learning_rate": 3.4797921814241196e-06, "logits/chosen": -0.48938584327697754, "logits/rejected": -0.37643399834632874, "logps/chosen": -194.7692413330078, "logps/rejected": -171.0836944580078, "loss": 0.6345, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7208179235458374, "rewards/margins": 0.17952939867973328, "rewards/rejected": 0.5412884950637817, "step": 5640 }, { "epoch": 1.7794575016731624, "grad_norm": 2.71875, "learning_rate": 3.4749405654901297e-06, "logits/chosen": -0.5021311044692993, "logits/rejected": -0.3592470586299896, "logps/chosen": -203.04798889160156, "logps/rejected": -170.28916931152344, "loss": 0.6468, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7304830551147461, "rewards/margins": 0.14200101792812347, "rewards/rejected": 0.5884820222854614, "step": 5650 }, { "epoch": 1.7826069839770087, "grad_norm": 1.8125, "learning_rate": 3.470084615393655e-06, "logits/chosen": -0.5099314451217651, "logits/rejected": -0.36777496337890625, "logps/chosen": -188.96286010742188, "logps/rejected": -158.13487243652344, "loss": 0.5854, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.7638787031173706, "rewards/margins": 0.25533777475357056, "rewards/rejected": 0.5085408687591553, "step": 5660 }, { "epoch": 1.785756466280855, "grad_norm": 2.71875, "learning_rate": 3.4652243527221423e-06, "logits/chosen": -0.4756031632423401, "logits/rejected": -0.44920986890792847, "logps/chosen": -185.1388397216797, "logps/rejected": -172.55137634277344, "loss": 0.6583, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.7094627618789673, "rewards/margins": 0.13025884330272675, "rewards/rejected": 0.5792039036750793, "step": 5670 }, { "epoch": 1.7889059485847014, "grad_norm": 3.171875, "learning_rate": 3.460359799082209e-06, "logits/chosen": -0.47689515352249146, "logits/rejected": -0.34241801500320435, "logps/chosen": -204.8109588623047, "logps/rejected": -166.13514709472656, "loss": 0.615, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7595565915107727, "rewards/margins": 0.21238622069358826, "rewards/rejected": 0.5471702814102173, "step": 5680 }, { "epoch": 1.7920554308885477, "grad_norm": 3.765625, "learning_rate": 3.4554909760995485e-06, "logits/chosen": -0.5418170094490051, "logits/rejected": -0.41362690925598145, "logps/chosen": -187.98043823242188, "logps/rejected": -167.5854034423828, "loss": 0.6338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7204712629318237, "rewards/margins": 0.1737706959247589, "rewards/rejected": 0.5467005968093872, "step": 5690 }, { "epoch": 1.795204913192394, "grad_norm": 3.3125, "learning_rate": 3.450617905418834e-06, "logits/chosen": -0.442087322473526, "logits/rejected": -0.3480719029903412, "logps/chosen": -205.0787353515625, "logps/rejected": -176.585693359375, "loss": 0.6078, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7968889474868774, "rewards/margins": 0.2236328423023224, "rewards/rejected": 0.5732561349868774, "step": 5700 }, { "epoch": 1.7983543954962404, "grad_norm": 3.125, "learning_rate": 3.4457406087036233e-06, "logits/chosen": -0.4669428765773773, "logits/rejected": -0.379183828830719, "logps/chosen": -183.84532165527344, "logps/rejected": -169.44937133789062, "loss": 0.6755, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6309347748756409, "rewards/margins": 0.07157482206821442, "rewards/rejected": 0.5593599081039429, "step": 5710 }, { "epoch": 1.8015038778000867, "grad_norm": 2.984375, "learning_rate": 3.4408591076362585e-06, "logits/chosen": -0.5323187112808228, "logits/rejected": -0.45780545473098755, "logps/chosen": -205.9134521484375, "logps/rejected": -180.65916442871094, "loss": 0.6566, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7317408323287964, "rewards/margins": 0.11702696233987808, "rewards/rejected": 0.6147138476371765, "step": 5720 }, { "epoch": 1.804653360103933, "grad_norm": 2.859375, "learning_rate": 3.435973423917774e-06, "logits/chosen": -0.48551005125045776, "logits/rejected": -0.40477806329727173, "logps/chosen": -195.50228881835938, "logps/rejected": -173.91912841796875, "loss": 0.6842, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.7036144137382507, "rewards/margins": 0.06239970773458481, "rewards/rejected": 0.6412147283554077, "step": 5730 }, { "epoch": 1.8078028424077792, "grad_norm": 2.40625, "learning_rate": 3.4310835792677995e-06, "logits/chosen": -0.4431411623954773, "logits/rejected": -0.3337770104408264, "logps/chosen": -198.4442138671875, "logps/rejected": -162.93258666992188, "loss": 0.6348, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6712988018989563, "rewards/margins": 0.1662341058254242, "rewards/rejected": 0.5050647854804993, "step": 5740 }, { "epoch": 1.8109523247116255, "grad_norm": 3.015625, "learning_rate": 3.4261895954244613e-06, "logits/chosen": -0.4226387143135071, "logits/rejected": -0.3787776827812195, "logps/chosen": -173.4969024658203, "logps/rejected": -161.3011932373047, "loss": 0.6435, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6374837160110474, "rewards/margins": 0.13156263530254364, "rewards/rejected": 0.5059210658073425, "step": 5750 }, { "epoch": 1.8141018070154717, "grad_norm": 3.09375, "learning_rate": 3.4212914941442866e-06, "logits/chosen": -0.48183003067970276, "logits/rejected": -0.3869970142841339, "logps/chosen": -199.9102020263672, "logps/rejected": -183.46273803710938, "loss": 0.6739, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.7144922614097595, "rewards/margins": 0.07754186540842056, "rewards/rejected": 0.6369503736495972, "step": 5760 }, { "epoch": 1.817251289319318, "grad_norm": 2.796875, "learning_rate": 3.416389297202107e-06, "logits/chosen": -0.435200035572052, "logits/rejected": -0.273305743932724, "logps/chosen": -200.13018798828125, "logps/rejected": -172.42526245117188, "loss": 0.6273, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7225381135940552, "rewards/margins": 0.19170936942100525, "rewards/rejected": 0.5308286547660828, "step": 5770 }, { "epoch": 1.8204007716231645, "grad_norm": 3.203125, "learning_rate": 3.4114830263909615e-06, "logits/chosen": -0.488565593957901, "logits/rejected": -0.3196925222873688, "logps/chosen": -203.71237182617188, "logps/rejected": -175.8201141357422, "loss": 0.6425, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6835566759109497, "rewards/margins": 0.13755542039871216, "rewards/rejected": 0.5460013151168823, "step": 5780 }, { "epoch": 1.8235502539270108, "grad_norm": 2.84375, "learning_rate": 3.4065727035220013e-06, "logits/chosen": -0.48802971839904785, "logits/rejected": -0.401599645614624, "logps/chosen": -203.4430694580078, "logps/rejected": -178.24978637695312, "loss": 0.6509, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7013251185417175, "rewards/margins": 0.12661480903625488, "rewards/rejected": 0.5747103095054626, "step": 5790 }, { "epoch": 1.826699736230857, "grad_norm": 3.15625, "learning_rate": 3.4016583504243892e-06, "logits/chosen": -0.39509814977645874, "logits/rejected": -0.3049541115760803, "logps/chosen": -193.34628295898438, "logps/rejected": -168.88990783691406, "loss": 0.6467, "rewards/accuracies": 0.625, "rewards/chosen": 0.6939215660095215, "rewards/margins": 0.13865116238594055, "rewards/rejected": 0.5552703738212585, "step": 5800 }, { "epoch": 1.8298492185347035, "grad_norm": 2.609375, "learning_rate": 3.3967399889452056e-06, "logits/chosen": -0.5302572250366211, "logits/rejected": -0.42114171385765076, "logps/chosen": -187.310791015625, "logps/rejected": -158.18551635742188, "loss": 0.62, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6706022024154663, "rewards/margins": 0.18203167617321014, "rewards/rejected": 0.48857051134109497, "step": 5810 }, { "epoch": 1.8329987008385498, "grad_norm": 2.359375, "learning_rate": 3.3918176409493498e-06, "logits/chosen": -0.4302283227443695, "logits/rejected": -0.3126838207244873, "logps/chosen": -207.9413604736328, "logps/rejected": -186.14862060546875, "loss": 0.6106, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.8021620512008667, "rewards/margins": 0.22595825791358948, "rewards/rejected": 0.5762038826942444, "step": 5820 }, { "epoch": 1.836148183142396, "grad_norm": 3.15625, "learning_rate": 3.3868913283194445e-06, "logits/chosen": -0.4245404303073883, "logits/rejected": -0.3099447190761566, "logps/chosen": -215.6573486328125, "logps/rejected": -180.88473510742188, "loss": 0.6243, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.8092790842056274, "rewards/margins": 0.21156442165374756, "rewards/rejected": 0.5977145433425903, "step": 5830 }, { "epoch": 1.8392976654462423, "grad_norm": 2.203125, "learning_rate": 3.381961072955737e-06, "logits/chosen": -0.4956479072570801, "logits/rejected": -0.4022194743156433, "logps/chosen": -181.72386169433594, "logps/rejected": -157.3038330078125, "loss": 0.6444, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6013648509979248, "rewards/margins": 0.13577811419963837, "rewards/rejected": 0.46558675169944763, "step": 5840 }, { "epoch": 1.8424471477500886, "grad_norm": 2.828125, "learning_rate": 3.3770268967760026e-06, "logits/chosen": -0.4699929356575012, "logits/rejected": -0.38960105180740356, "logps/chosen": -190.84512329101562, "logps/rejected": -165.31561279296875, "loss": 0.6521, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7368890047073364, "rewards/margins": 0.1246052160859108, "rewards/rejected": 0.6122837662696838, "step": 5850 }, { "epoch": 1.8455966300539348, "grad_norm": 3.640625, "learning_rate": 3.372088821715446e-06, "logits/chosen": -0.5164574384689331, "logits/rejected": -0.40460482239723206, "logps/chosen": -215.09130859375, "logps/rejected": -181.18551635742188, "loss": 0.6583, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7538167238235474, "rewards/margins": 0.11776645481586456, "rewards/rejected": 0.636050283908844, "step": 5860 }, { "epoch": 1.848746112357781, "grad_norm": 2.65625, "learning_rate": 3.3671468697266048e-06, "logits/chosen": -0.486356645822525, "logits/rejected": -0.45697417855262756, "logps/chosen": -189.52955627441406, "logps/rejected": -172.86190795898438, "loss": 0.6822, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.6347873210906982, "rewards/margins": 0.054320335388183594, "rewards/rejected": 0.5804670453071594, "step": 5870 }, { "epoch": 1.8518955946616273, "grad_norm": 3.375, "learning_rate": 3.3622010627792513e-06, "logits/chosen": -0.5492820143699646, "logits/rejected": -0.38086193799972534, "logps/chosen": -194.9511260986328, "logps/rejected": -161.57528686523438, "loss": 0.6699, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.6926398873329163, "rewards/margins": 0.09596933424472809, "rewards/rejected": 0.5966705083847046, "step": 5880 }, { "epoch": 1.8550450769654738, "grad_norm": 2.84375, "learning_rate": 3.3572514228602977e-06, "logits/chosen": -0.4424726366996765, "logits/rejected": -0.35579612851142883, "logps/chosen": -196.1681671142578, "logps/rejected": -165.40811157226562, "loss": 0.6129, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7343538999557495, "rewards/margins": 0.21051523089408875, "rewards/rejected": 0.5238386392593384, "step": 5890 }, { "epoch": 1.85819455926932, "grad_norm": 2.96875, "learning_rate": 3.3522979719736923e-06, "logits/chosen": -0.4300655722618103, "logits/rejected": -0.23585304617881775, "logps/chosen": -209.92355346679688, "logps/rejected": -173.3553924560547, "loss": 0.639, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7228736877441406, "rewards/margins": 0.16360947489738464, "rewards/rejected": 0.5592643022537231, "step": 5900 } ], "logging_steps": 10, "max_steps": 15000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }