diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8903 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.85819455926932, + "eval_steps": 2000, + "global_step": 5900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003149482303846305, + "grad_norm": 3.109375, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": -0.45210394263267517, + "logits/rejected": -0.3446429371833801, + "logps/chosen": -213.57180786132812, + "logps/rejected": -191.9154052734375, + "loss": 0.6941, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.005887002218514681, + "rewards/margins": -0.0011509137693792582, + "rewards/rejected": 0.00703791668638587, + "step": 10 + }, + { + "epoch": 0.00629896460769261, + "grad_norm": 2.640625, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -0.46016186475753784, + "logits/rejected": -0.2938859760761261, + "logps/chosen": -203.55313110351562, + "logps/rejected": -186.91030883789062, + "loss": 0.6917, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.006860991008579731, + "rewards/margins": 0.004207834601402283, + "rewards/rejected": 0.002653155941516161, + "step": 20 + }, + { + "epoch": 0.009448446911538916, + "grad_norm": 2.703125, + "learning_rate": 1.5e-06, + "logits/chosen": -0.5505405068397522, + "logits/rejected": -0.32200556993484497, + "logps/chosen": -206.43130493164062, + "logps/rejected": -159.43423461914062, + "loss": 0.6978, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.004765630234032869, + "rewards/margins": -0.00844950508326292, + "rewards/rejected": 0.003683874849230051, + "step": 30 + }, + { + "epoch": 0.01259792921538522, + "grad_norm": 3.078125, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -0.5206310749053955, + "logits/rejected": -0.44945794343948364, + "logps/chosen": -192.1409454345703, + "logps/rejected": -195.43496704101562, + "loss": 0.6938, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.014343030750751495, + "rewards/margins": 0.0002199936716351658, + "rewards/rejected": 0.014123037457466125, + "step": 40 + }, + { + "epoch": 0.015747411519231525, + "grad_norm": 2.953125, + "learning_rate": 2.5e-06, + "logits/chosen": -0.5311203002929688, + "logits/rejected": -0.4016449451446533, + "logps/chosen": -208.75094604492188, + "logps/rejected": -167.02542114257812, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.009670769795775414, + "rewards/margins": 0.001526160747744143, + "rewards/rejected": 0.00814460963010788, + "step": 50 + }, + { + "epoch": 0.018896893823077833, + "grad_norm": 2.734375, + "learning_rate": 3e-06, + "logits/chosen": -0.5485053062438965, + "logits/rejected": -0.43868058919906616, + "logps/chosen": -196.2797088623047, + "logps/rejected": -174.60232543945312, + "loss": 0.6959, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.008773775771260262, + "rewards/margins": -0.004444376099854708, + "rewards/rejected": 0.013218151405453682, + "step": 60 + }, + { + "epoch": 0.022046376126924137, + "grad_norm": 3.921875, + "learning_rate": 3.5e-06, + "logits/chosen": -0.5210541486740112, + "logits/rejected": -0.37378597259521484, + "logps/chosen": -205.04330444335938, + "logps/rejected": -167.24517822265625, + "loss": 0.6964, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.006323327776044607, + "rewards/margins": -0.00504258181899786, + "rewards/rejected": 0.011365910060703754, + "step": 70 + }, + { + "epoch": 0.02519585843077044, + "grad_norm": 2.84375, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -0.4273603558540344, + "logits/rejected": -0.31517690420150757, + "logps/chosen": -206.41354370117188, + "logps/rejected": -190.51402282714844, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012041259557008743, + "rewards/margins": 0.001850623870268464, + "rewards/rejected": 0.010190634056925774, + "step": 80 + }, + { + "epoch": 0.028345340734616746, + "grad_norm": 3.015625, + "learning_rate": 4.5e-06, + "logits/chosen": -0.47311750054359436, + "logits/rejected": -0.3815276026725769, + "logps/chosen": -201.52731323242188, + "logps/rejected": -173.32937622070312, + "loss": 0.6893, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.016733495518565178, + "rewards/margins": 0.009016195312142372, + "rewards/rejected": 0.007717301603406668, + "step": 90 + }, + { + "epoch": 0.03149482303846305, + "grad_norm": 2.953125, + "learning_rate": 5e-06, + "logits/chosen": -0.44015175104141235, + "logits/rejected": -0.29738959670066833, + "logps/chosen": -233.11642456054688, + "logps/rejected": -192.25820922851562, + "loss": 0.6937, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.02164524421095848, + "rewards/margins": 0.0006244674441404641, + "rewards/rejected": 0.021020779386162758, + "step": 100 + }, + { + "epoch": 0.034644305342309355, + "grad_norm": 3.015625, + "learning_rate": 4.999994443042687e-06, + "logits/chosen": -0.4323144853115082, + "logits/rejected": -0.325883150100708, + "logps/chosen": -224.80685424804688, + "logps/rejected": -206.5443572998047, + "loss": 0.697, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.016110900789499283, + "rewards/margins": -0.005870751105248928, + "rewards/rejected": 0.021981652826070786, + "step": 110 + }, + { + "epoch": 0.037793787646155666, + "grad_norm": 4.4375, + "learning_rate": 4.999977772195451e-06, + "logits/chosen": -0.47046566009521484, + "logits/rejected": -0.31014284491539, + "logps/chosen": -213.2732696533203, + "logps/rejected": -178.27183532714844, + "loss": 0.6867, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.034097231924533844, + "rewards/margins": 0.01459626667201519, + "rewards/rejected": 0.019500967115163803, + "step": 120 + }, + { + "epoch": 0.04094326995000197, + "grad_norm": 3.46875, + "learning_rate": 4.999949987532405e-06, + "logits/chosen": -0.5328488349914551, + "logits/rejected": -0.3397727906703949, + "logps/chosen": -212.8068084716797, + "logps/rejected": -165.73016357421875, + "loss": 0.6904, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.03291865810751915, + "rewards/margins": 0.0070328498259186745, + "rewards/rejected": 0.02588580921292305, + "step": 130 + }, + { + "epoch": 0.044092752253848275, + "grad_norm": 2.84375, + "learning_rate": 4.9999110891770655e-06, + "logits/chosen": -0.5087035894393921, + "logits/rejected": -0.37333738803863525, + "logps/chosen": -197.49069213867188, + "logps/rejected": -164.01104736328125, + "loss": 0.6848, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04153250902891159, + "rewards/margins": 0.018097227439284325, + "rewards/rejected": 0.023435279726982117, + "step": 140 + }, + { + "epoch": 0.04724223455769458, + "grad_norm": 2.671875, + "learning_rate": 4.999861077302358e-06, + "logits/chosen": -0.5480197668075562, + "logits/rejected": -0.39153584837913513, + "logps/chosen": -197.01953125, + "logps/rejected": -172.3303985595703, + "loss": 0.6895, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.04789372906088829, + "rewards/margins": 0.008711261674761772, + "rewards/rejected": 0.03918246552348137, + "step": 150 + }, + { + "epoch": 0.05039171686154088, + "grad_norm": 4.0, + "learning_rate": 4.999799952130615e-06, + "logits/chosen": -0.5040096044540405, + "logits/rejected": -0.39294344186782837, + "logps/chosen": -214.4684600830078, + "logps/rejected": -185.05201721191406, + "loss": 0.6902, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.043327994644641876, + "rewards/margins": 0.007260638289153576, + "rewards/rejected": 0.03606735169887543, + "step": 160 + }, + { + "epoch": 0.05354119916538719, + "grad_norm": 2.328125, + "learning_rate": 4.999727713933572e-06, + "logits/chosen": -0.5509570837020874, + "logits/rejected": -0.37693116068840027, + "logps/chosen": -184.16673278808594, + "logps/rejected": -162.15756225585938, + "loss": 0.6923, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.047002751380205154, + "rewards/margins": 0.003170955926179886, + "rewards/rejected": 0.04383179172873497, + "step": 170 + }, + { + "epoch": 0.05669068146923349, + "grad_norm": 2.8125, + "learning_rate": 4.999644363032367e-06, + "logits/chosen": -0.5072035193443298, + "logits/rejected": -0.37213388085365295, + "logps/chosen": -195.76577758789062, + "logps/rejected": -164.03848266601562, + "loss": 0.6903, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.05497163534164429, + "rewards/margins": 0.007314900867640972, + "rewards/rejected": 0.04765673726797104, + "step": 180 + }, + { + "epoch": 0.0598401637730798, + "grad_norm": 3.734375, + "learning_rate": 4.999549899797544e-06, + "logits/chosen": -0.45937657356262207, + "logits/rejected": -0.31247037649154663, + "logps/chosen": -218.7798309326172, + "logps/rejected": -182.95201110839844, + "loss": 0.6905, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.052850522100925446, + "rewards/margins": 0.006988201290369034, + "rewards/rejected": 0.04586231708526611, + "step": 190 + }, + { + "epoch": 0.0629896460769261, + "grad_norm": 2.8125, + "learning_rate": 4.999444324649045e-06, + "logits/chosen": -0.5390416979789734, + "logits/rejected": -0.35482490062713623, + "logps/chosen": -199.8936767578125, + "logps/rejected": -165.85488891601562, + "loss": 0.6856, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07901380211114883, + "rewards/margins": 0.01698421686887741, + "rewards/rejected": 0.062029581516981125, + "step": 200 + }, + { + "epoch": 0.0661391283807724, + "grad_norm": 3.046875, + "learning_rate": 4.999327638056212e-06, + "logits/chosen": -0.4826637804508209, + "logits/rejected": -0.3051258623600006, + "logps/chosen": -221.1772003173828, + "logps/rejected": -173.49583435058594, + "loss": 0.6906, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.07877618074417114, + "rewards/margins": 0.007990067824721336, + "rewards/rejected": 0.07078611105680466, + "step": 210 + }, + { + "epoch": 0.06928861068461871, + "grad_norm": 2.6875, + "learning_rate": 4.999199840537781e-06, + "logits/chosen": -0.4253220558166504, + "logits/rejected": -0.25771626830101013, + "logps/chosen": -218.5417938232422, + "logps/rejected": -180.00387573242188, + "loss": 0.6865, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09241167455911636, + "rewards/margins": 0.01505853421986103, + "rewards/rejected": 0.07735314220190048, + "step": 220 + }, + { + "epoch": 0.07243809298846503, + "grad_norm": 3.0, + "learning_rate": 4.9990609326618845e-06, + "logits/chosen": -0.468860924243927, + "logits/rejected": -0.416407972574234, + "logps/chosen": -227.7265625, + "logps/rejected": -207.37020874023438, + "loss": 0.689, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.10431376844644547, + "rewards/margins": 0.01053541712462902, + "rewards/rejected": 0.09377835690975189, + "step": 230 + }, + { + "epoch": 0.07558757529231133, + "grad_norm": 3.046875, + "learning_rate": 4.998910915046048e-06, + "logits/chosen": -0.47573143243789673, + "logits/rejected": -0.3487725853919983, + "logps/chosen": -218.22030639648438, + "logps/rejected": -186.4251708984375, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11783305555582047, + "rewards/margins": 0.009454838000237942, + "rewards/rejected": 0.10837821662425995, + "step": 240 + }, + { + "epoch": 0.07873705759615764, + "grad_norm": 3.328125, + "learning_rate": 4.998749788357184e-06, + "logits/chosen": -0.491966187953949, + "logits/rejected": -0.42401209473609924, + "logps/chosen": -209.5353240966797, + "logps/rejected": -191.00352478027344, + "loss": 0.6781, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.12841267883777618, + "rewards/margins": 0.032561883330345154, + "rewards/rejected": 0.09585078805685043, + "step": 250 + }, + { + "epoch": 0.08188653990000394, + "grad_norm": 2.578125, + "learning_rate": 4.998577553311592e-06, + "logits/chosen": -0.5209980010986328, + "logits/rejected": -0.3212158679962158, + "logps/chosen": -207.7479705810547, + "logps/rejected": -174.1891632080078, + "loss": 0.681, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.12082584202289581, + "rewards/margins": 0.02722765877842903, + "rewards/rejected": 0.09359817951917648, + "step": 260 + }, + { + "epoch": 0.08503602220385025, + "grad_norm": 2.671875, + "learning_rate": 4.998394210674954e-06, + "logits/chosen": -0.4800085127353668, + "logits/rejected": -0.4280025064945221, + "logps/chosen": -193.07815551757812, + "logps/rejected": -190.64801025390625, + "loss": 0.6958, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1191275492310524, + "rewards/margins": -0.0025443979538977146, + "rewards/rejected": 0.12167193740606308, + "step": 270 + }, + { + "epoch": 0.08818550450769655, + "grad_norm": 3.015625, + "learning_rate": 4.998199761262332e-06, + "logits/chosen": -0.4940645694732666, + "logits/rejected": -0.3298795521259308, + "logps/chosen": -200.9434814453125, + "logps/rejected": -179.53814697265625, + "loss": 0.6842, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.13147035241127014, + "rewards/margins": 0.02113470621407032, + "rewards/rejected": 0.11033564805984497, + "step": 280 + }, + { + "epoch": 0.09133498681154285, + "grad_norm": 2.375, + "learning_rate": 4.997994205938164e-06, + "logits/chosen": -0.5327965617179871, + "logits/rejected": -0.38405364751815796, + "logps/chosen": -214.1449432373047, + "logps/rejected": -185.3954620361328, + "loss": 0.6731, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.16222958266735077, + "rewards/margins": 0.0439109206199646, + "rewards/rejected": 0.11831866204738617, + "step": 290 + }, + { + "epoch": 0.09448446911538916, + "grad_norm": 2.671875, + "learning_rate": 4.997777545616258e-06, + "logits/chosen": -0.4295479655265808, + "logits/rejected": -0.2999122738838196, + "logps/chosen": -206.5615997314453, + "logps/rejected": -172.51394653320312, + "loss": 0.6781, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.15420152246952057, + "rewards/margins": 0.033724717795848846, + "rewards/rejected": 0.12047679722309113, + "step": 300 + }, + { + "epoch": 0.09763395141923546, + "grad_norm": 3.109375, + "learning_rate": 4.9975497812597935e-06, + "logits/chosen": -0.47075533866882324, + "logits/rejected": -0.38670963048934937, + "logps/chosen": -209.8960418701172, + "logps/rejected": -182.66079711914062, + "loss": 0.6837, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.18156033754348755, + "rewards/margins": 0.02306472323834896, + "rewards/rejected": 0.15849560499191284, + "step": 310 + }, + { + "epoch": 0.10078343372308177, + "grad_norm": 3.21875, + "learning_rate": 4.997310913881312e-06, + "logits/chosen": -0.4988747239112854, + "logits/rejected": -0.363254576921463, + "logps/chosen": -202.0031280517578, + "logps/rejected": -175.33151245117188, + "loss": 0.6791, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1774149388074875, + "rewards/margins": 0.03209572285413742, + "rewards/rejected": 0.14531922340393066, + "step": 320 + }, + { + "epoch": 0.10393291602692807, + "grad_norm": 2.875, + "learning_rate": 4.997060944542713e-06, + "logits/chosen": -0.5674048662185669, + "logits/rejected": -0.4282529950141907, + "logps/chosen": -204.34375, + "logps/rejected": -169.42828369140625, + "loss": 0.6705, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18822543323040009, + "rewards/margins": 0.0504903681576252, + "rewards/rejected": 0.13773508369922638, + "step": 330 + }, + { + "epoch": 0.10708239833077438, + "grad_norm": 3.578125, + "learning_rate": 4.996799874355253e-06, + "logits/chosen": -0.548692524433136, + "logits/rejected": -0.42939552664756775, + "logps/chosen": -204.91748046875, + "logps/rejected": -187.4248046875, + "loss": 0.686, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.18558073043823242, + "rewards/margins": 0.018990959972143173, + "rewards/rejected": 0.16658978164196014, + "step": 340 + }, + { + "epoch": 0.11023188063462068, + "grad_norm": 4.09375, + "learning_rate": 4.996527704479535e-06, + "logits/chosen": -0.43033042550086975, + "logits/rejected": -0.3002139627933502, + "logps/chosen": -215.57754516601562, + "logps/rejected": -193.66397094726562, + "loss": 0.6858, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.19084212183952332, + "rewards/margins": 0.018678244203329086, + "rewards/rejected": 0.17216388881206512, + "step": 350 + }, + { + "epoch": 0.11338136293846698, + "grad_norm": 2.921875, + "learning_rate": 4.9962444361255095e-06, + "logits/chosen": -0.5352093577384949, + "logits/rejected": -0.3528750538825989, + "logps/chosen": -202.76707458496094, + "logps/rejected": -155.83926391601562, + "loss": 0.6774, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.20079512894153595, + "rewards/margins": 0.03638342395424843, + "rewards/rejected": 0.164411723613739, + "step": 360 + }, + { + "epoch": 0.11653084524231329, + "grad_norm": 3.640625, + "learning_rate": 4.995950070552464e-06, + "logits/chosen": -0.5055617094039917, + "logits/rejected": -0.3472587764263153, + "logps/chosen": -225.160888671875, + "logps/rejected": -181.47703552246094, + "loss": 0.6912, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.21311064064502716, + "rewards/margins": 0.00886852853000164, + "rewards/rejected": 0.20424208045005798, + "step": 370 + }, + { + "epoch": 0.1196803275461596, + "grad_norm": 2.828125, + "learning_rate": 4.995644609069021e-06, + "logits/chosen": -0.48927387595176697, + "logits/rejected": -0.42719903588294983, + "logps/chosen": -185.412109375, + "logps/rejected": -181.981201171875, + "loss": 0.6891, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1894209086894989, + "rewards/margins": 0.012292629107832909, + "rewards/rejected": 0.17712828516960144, + "step": 380 + }, + { + "epoch": 0.12282980985000591, + "grad_norm": 3.1875, + "learning_rate": 4.995328053033129e-06, + "logits/chosen": -0.4682087302207947, + "logits/rejected": -0.25708621740341187, + "logps/chosen": -217.36709594726562, + "logps/rejected": -174.3680877685547, + "loss": 0.6665, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.22537508606910706, + "rewards/margins": 0.057967256754636765, + "rewards/rejected": 0.1674078404903412, + "step": 390 + }, + { + "epoch": 0.1259792921538522, + "grad_norm": 3.515625, + "learning_rate": 4.995000403852057e-06, + "logits/chosen": -0.5313188433647156, + "logits/rejected": -0.39059966802597046, + "logps/chosen": -197.9477081298828, + "logps/rejected": -174.9998779296875, + "loss": 0.6761, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.19948481023311615, + "rewards/margins": 0.038825444877147675, + "rewards/rejected": 0.16065934300422668, + "step": 400 + }, + { + "epoch": 0.12912877445769852, + "grad_norm": 2.4375, + "learning_rate": 4.994661662982393e-06, + "logits/chosen": -0.533206045627594, + "logits/rejected": -0.4300476610660553, + "logps/chosen": -192.5688934326172, + "logps/rejected": -167.0782928466797, + "loss": 0.684, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.19937366247177124, + "rewards/margins": 0.022861123085021973, + "rewards/rejected": 0.17651252448558807, + "step": 410 + }, + { + "epoch": 0.1322782567615448, + "grad_norm": 3.796875, + "learning_rate": 4.994311831930032e-06, + "logits/chosen": -0.4812285006046295, + "logits/rejected": -0.35317444801330566, + "logps/chosen": -189.956787109375, + "logps/rejected": -159.92453002929688, + "loss": 0.6706, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23123323917388916, + "rewards/margins": 0.05047137662768364, + "rewards/rejected": 0.18076185882091522, + "step": 420 + }, + { + "epoch": 0.13542773906539113, + "grad_norm": 3.1875, + "learning_rate": 4.993950912250171e-06, + "logits/chosen": -0.4360167384147644, + "logits/rejected": -0.3796977400779724, + "logps/chosen": -203.8537139892578, + "logps/rejected": -175.59356689453125, + "loss": 0.6806, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.21009023487567902, + "rewards/margins": 0.030164580792188644, + "rewards/rejected": 0.17992563545703888, + "step": 430 + }, + { + "epoch": 0.13857722136923742, + "grad_norm": 2.921875, + "learning_rate": 4.9935789055473e-06, + "logits/chosen": -0.4947914183139801, + "logits/rejected": -0.36814266443252563, + "logps/chosen": -189.90982055664062, + "logps/rejected": -170.28982543945312, + "loss": 0.6788, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.22383634746074677, + "rewards/margins": 0.033211078494787216, + "rewards/rejected": 0.19062528014183044, + "step": 440 + }, + { + "epoch": 0.14172670367308374, + "grad_norm": 2.734375, + "learning_rate": 4.993195813475202e-06, + "logits/chosen": -0.5398550629615784, + "logits/rejected": -0.3718551993370056, + "logps/chosen": -193.36532592773438, + "logps/rejected": -169.635986328125, + "loss": 0.6801, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2529357373714447, + "rewards/margins": 0.03174880892038345, + "rewards/rejected": 0.22118695080280304, + "step": 450 + }, + { + "epoch": 0.14487618597693006, + "grad_norm": 2.859375, + "learning_rate": 4.992801637736937e-06, + "logits/chosen": -0.5186210870742798, + "logits/rejected": -0.35956451296806335, + "logps/chosen": -189.38392639160156, + "logps/rejected": -168.08277893066406, + "loss": 0.6769, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.2704620957374573, + "rewards/margins": 0.03909246250987053, + "rewards/rejected": 0.23136961460113525, + "step": 460 + }, + { + "epoch": 0.14802566828077635, + "grad_norm": 3.609375, + "learning_rate": 4.992396380084839e-06, + "logits/chosen": -0.481611967086792, + "logits/rejected": -0.3885895907878876, + "logps/chosen": -212.15335083007812, + "logps/rejected": -216.1449737548828, + "loss": 0.6683, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.29815778136253357, + "rewards/margins": 0.05739093944430351, + "rewards/rejected": 0.24076685309410095, + "step": 470 + }, + { + "epoch": 0.15117515058462266, + "grad_norm": 3.671875, + "learning_rate": 4.991980042320507e-06, + "logits/chosen": -0.515384316444397, + "logits/rejected": -0.3729632794857025, + "logps/chosen": -199.055908203125, + "logps/rejected": -176.830322265625, + "loss": 0.6822, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.27753692865371704, + "rewards/margins": 0.03093332052230835, + "rewards/rejected": 0.2466035783290863, + "step": 480 + }, + { + "epoch": 0.15432463288846895, + "grad_norm": 3.171875, + "learning_rate": 4.991552626294799e-06, + "logits/chosen": -0.477985680103302, + "logits/rejected": -0.3193301260471344, + "logps/chosen": -205.2687530517578, + "logps/rejected": -178.9701385498047, + "loss": 0.6786, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.275654137134552, + "rewards/margins": 0.03465648740530014, + "rewards/rejected": 0.24099759757518768, + "step": 490 + }, + { + "epoch": 0.15747411519231527, + "grad_norm": 2.953125, + "learning_rate": 4.991114133907822e-06, + "logits/chosen": -0.5235245823860168, + "logits/rejected": -0.3859252631664276, + "logps/chosen": -212.98876953125, + "logps/rejected": -178.56101989746094, + "loss": 0.6856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.2918005585670471, + "rewards/margins": 0.020267702639102936, + "rewards/rejected": 0.2715328335762024, + "step": 500 + }, + { + "epoch": 0.16062359749616156, + "grad_norm": 2.671875, + "learning_rate": 4.99066456710892e-06, + "logits/chosen": -0.5184639096260071, + "logits/rejected": -0.3862496316432953, + "logps/chosen": -194.21218872070312, + "logps/rejected": -152.38967895507812, + "loss": 0.6722, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.28834104537963867, + "rewards/margins": 0.05140441656112671, + "rewards/rejected": 0.23693661391735077, + "step": 510 + }, + { + "epoch": 0.16377307980000788, + "grad_norm": 2.953125, + "learning_rate": 4.990203927896674e-06, + "logits/chosen": -0.5369669198989868, + "logits/rejected": -0.42427778244018555, + "logps/chosen": -195.22592163085938, + "logps/rejected": -173.0403289794922, + "loss": 0.6737, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.282554566860199, + "rewards/margins": 0.04619991034269333, + "rewards/rejected": 0.23635463416576385, + "step": 520 + }, + { + "epoch": 0.16692256210385417, + "grad_norm": 2.609375, + "learning_rate": 4.9897322183188855e-06, + "logits/chosen": -0.49566903710365295, + "logits/rejected": -0.42590633034706116, + "logps/chosen": -193.32513427734375, + "logps/rejected": -163.94113159179688, + "loss": 0.68, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3135472536087036, + "rewards/margins": 0.0341959223151207, + "rewards/rejected": 0.2793513536453247, + "step": 530 + }, + { + "epoch": 0.1700720444077005, + "grad_norm": 2.609375, + "learning_rate": 4.989249440472569e-06, + "logits/chosen": -0.4742864668369293, + "logits/rejected": -0.361924409866333, + "logps/chosen": -206.1411590576172, + "logps/rejected": -177.32366943359375, + "loss": 0.6616, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2787955701351166, + "rewards/margins": 0.07065565139055252, + "rewards/rejected": 0.20813994109630585, + "step": 540 + }, + { + "epoch": 0.17322152671154678, + "grad_norm": 2.640625, + "learning_rate": 4.988755596503948e-06, + "logits/chosen": -0.47193408012390137, + "logits/rejected": -0.32936495542526245, + "logps/chosen": -212.5970916748047, + "logps/rejected": -184.22251892089844, + "loss": 0.6785, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.3025556802749634, + "rewards/margins": 0.03635026142001152, + "rewards/rejected": 0.26620543003082275, + "step": 550 + }, + { + "epoch": 0.1763710090153931, + "grad_norm": 2.671875, + "learning_rate": 4.988250688608436e-06, + "logits/chosen": -0.5082842111587524, + "logits/rejected": -0.38267362117767334, + "logps/chosen": -196.55685424804688, + "logps/rejected": -176.552734375, + "loss": 0.6866, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.312252402305603, + "rewards/margins": 0.022432830184698105, + "rewards/rejected": 0.2898195683956146, + "step": 560 + }, + { + "epoch": 0.1795204913192394, + "grad_norm": 3.203125, + "learning_rate": 4.9877347190306354e-06, + "logits/chosen": -0.44343939423561096, + "logits/rejected": -0.38272932171821594, + "logps/chosen": -187.27857971191406, + "logps/rejected": -173.09884643554688, + "loss": 0.7007, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.28175923228263855, + "rewards/margins": -0.002676494885236025, + "rewards/rejected": 0.2844357490539551, + "step": 570 + }, + { + "epoch": 0.1826699736230857, + "grad_norm": 3.03125, + "learning_rate": 4.987207690064323e-06, + "logits/chosen": -0.47279614210128784, + "logits/rejected": -0.31698504090309143, + "logps/chosen": -226.852783203125, + "logps/rejected": -187.92398071289062, + "loss": 0.6668, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3296729326248169, + "rewards/margins": 0.06299933046102524, + "rewards/rejected": 0.26667362451553345, + "step": 580 + }, + { + "epoch": 0.185819455926932, + "grad_norm": 2.796875, + "learning_rate": 4.98666960405244e-06, + "logits/chosen": -0.5369702577590942, + "logits/rejected": -0.4153470993041992, + "logps/chosen": -185.39688110351562, + "logps/rejected": -150.71243286132812, + "loss": 0.6733, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.2983696460723877, + "rewards/margins": 0.04920379817485809, + "rewards/rejected": 0.24916581809520721, + "step": 590 + }, + { + "epoch": 0.18896893823077832, + "grad_norm": 2.734375, + "learning_rate": 4.986120463387084e-06, + "logits/chosen": -0.4622046947479248, + "logits/rejected": -0.3503243923187256, + "logps/chosen": -191.92813110351562, + "logps/rejected": -176.70423889160156, + "loss": 0.6848, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.309493750333786, + "rewards/margins": 0.02766551449894905, + "rewards/rejected": 0.28182822465896606, + "step": 600 + }, + { + "epoch": 0.19211842053462463, + "grad_norm": 2.96875, + "learning_rate": 4.985560270509496e-06, + "logits/chosen": -0.5258822441101074, + "logits/rejected": -0.3977143168449402, + "logps/chosen": -215.99563598632812, + "logps/rejected": -186.25816345214844, + "loss": 0.6636, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3478468358516693, + "rewards/margins": 0.06890861690044403, + "rewards/rejected": 0.2789382338523865, + "step": 610 + }, + { + "epoch": 0.19526790283847092, + "grad_norm": 2.4375, + "learning_rate": 4.9849890279100495e-06, + "logits/chosen": -0.49815624952316284, + "logits/rejected": -0.4273989796638489, + "logps/chosen": -206.10385131835938, + "logps/rejected": -180.3357391357422, + "loss": 0.6795, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.3327508568763733, + "rewards/margins": 0.03660760074853897, + "rewards/rejected": 0.29614323377609253, + "step": 620 + }, + { + "epoch": 0.19841738514231724, + "grad_norm": 3.140625, + "learning_rate": 4.984406738128241e-06, + "logits/chosen": -0.4096647799015045, + "logits/rejected": -0.3559405505657196, + "logps/chosen": -206.38064575195312, + "logps/rejected": -185.5215301513672, + "loss": 0.6733, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3558829426765442, + "rewards/margins": 0.049683526158332825, + "rewards/rejected": 0.3061993718147278, + "step": 630 + }, + { + "epoch": 0.20156686744616353, + "grad_norm": 3.015625, + "learning_rate": 4.9838134037526795e-06, + "logits/chosen": -0.4695689082145691, + "logits/rejected": -0.39185652136802673, + "logps/chosen": -194.5779266357422, + "logps/rejected": -173.6152801513672, + "loss": 0.6702, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.33483070135116577, + "rewards/margins": 0.05525987595319748, + "rewards/rejected": 0.27957087755203247, + "step": 640 + }, + { + "epoch": 0.20471634975000985, + "grad_norm": 2.453125, + "learning_rate": 4.983209027421072e-06, + "logits/chosen": -0.5027821063995361, + "logits/rejected": -0.3183223009109497, + "logps/chosen": -209.20291137695312, + "logps/rejected": -171.07730102539062, + "loss": 0.6453, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.3705558776855469, + "rewards/margins": 0.10951970517635345, + "rewards/rejected": 0.2610361576080322, + "step": 650 + }, + { + "epoch": 0.20786583205385614, + "grad_norm": 2.96875, + "learning_rate": 4.982593611820211e-06, + "logits/chosen": -0.5004459023475647, + "logits/rejected": -0.40845292806625366, + "logps/chosen": -183.89920043945312, + "logps/rejected": -157.65817260742188, + "loss": 0.6759, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.3168241083621979, + "rewards/margins": 0.04429393634200096, + "rewards/rejected": 0.27253013849258423, + "step": 660 + }, + { + "epoch": 0.21101531435770246, + "grad_norm": 3.328125, + "learning_rate": 4.981967159685969e-06, + "logits/chosen": -0.5235536694526672, + "logits/rejected": -0.3312370181083679, + "logps/chosen": -212.1456298828125, + "logps/rejected": -173.4841766357422, + "loss": 0.6682, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37052249908447266, + "rewards/margins": 0.06635678559541702, + "rewards/rejected": 0.30416572093963623, + "step": 670 + }, + { + "epoch": 0.21416479666154875, + "grad_norm": 2.90625, + "learning_rate": 4.98132967380328e-06, + "logits/chosen": -0.4822749197483063, + "logits/rejected": -0.37933364510536194, + "logps/chosen": -184.20840454101562, + "logps/rejected": -165.41285705566406, + "loss": 0.6753, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.3370289206504822, + "rewards/margins": 0.04510311037302017, + "rewards/rejected": 0.2919257879257202, + "step": 680 + }, + { + "epoch": 0.21731427896539507, + "grad_norm": 2.5625, + "learning_rate": 4.980681157006129e-06, + "logits/chosen": -0.45780807733535767, + "logits/rejected": -0.3043513894081116, + "logps/chosen": -217.8817901611328, + "logps/rejected": -184.9659423828125, + "loss": 0.6575, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.38531795144081116, + "rewards/margins": 0.08670790493488312, + "rewards/rejected": 0.29861006140708923, + "step": 690 + }, + { + "epoch": 0.22046376126924136, + "grad_norm": 3.34375, + "learning_rate": 4.9800216121775404e-06, + "logits/chosen": -0.4495162069797516, + "logits/rejected": -0.3232493996620178, + "logps/chosen": -209.5262908935547, + "logps/rejected": -175.12301635742188, + "loss": 0.6529, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.4109676778316498, + "rewards/margins": 0.093905009329319, + "rewards/rejected": 0.3170626759529114, + "step": 700 + }, + { + "epoch": 0.22361324357308768, + "grad_norm": 2.40625, + "learning_rate": 4.979351042249564e-06, + "logits/chosen": -0.5166658163070679, + "logits/rejected": -0.36778515577316284, + "logps/chosen": -202.32473754882812, + "logps/rejected": -175.3853759765625, + "loss": 0.6596, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.3800323009490967, + "rewards/margins": 0.07695108652114868, + "rewards/rejected": 0.303081214427948, + "step": 710 + }, + { + "epoch": 0.22676272587693397, + "grad_norm": 3.75, + "learning_rate": 4.978669450203263e-06, + "logits/chosen": -0.5254799127578735, + "logits/rejected": -0.3579130470752716, + "logps/chosen": -201.89544677734375, + "logps/rejected": -170.9855194091797, + "loss": 0.6548, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.41179266571998596, + "rewards/margins": 0.09266375005245209, + "rewards/rejected": 0.31912893056869507, + "step": 720 + }, + { + "epoch": 0.2299122081807803, + "grad_norm": 2.5, + "learning_rate": 4.977976839068699e-06, + "logits/chosen": -0.41213518381118774, + "logits/rejected": -0.37237733602523804, + "logps/chosen": -193.40603637695312, + "logps/rejected": -176.70425415039062, + "loss": 0.6763, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3635534644126892, + "rewards/margins": 0.05010410025715828, + "rewards/rejected": 0.31344935297966003, + "step": 730 + }, + { + "epoch": 0.23306169048462658, + "grad_norm": 3.140625, + "learning_rate": 4.977273211924921e-06, + "logits/chosen": -0.5329135060310364, + "logits/rejected": -0.387803316116333, + "logps/chosen": -230.49569702148438, + "logps/rejected": -199.40280151367188, + "loss": 0.6588, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4647158682346344, + "rewards/margins": 0.08450464904308319, + "rewards/rejected": 0.38021120429039, + "step": 740 + }, + { + "epoch": 0.2362111727884729, + "grad_norm": 2.875, + "learning_rate": 4.97655857189995e-06, + "logits/chosen": -0.4675888419151306, + "logits/rejected": -0.3698478639125824, + "logps/chosen": -200.2988739013672, + "logps/rejected": -172.2423858642578, + "loss": 0.6704, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.38973450660705566, + "rewards/margins": 0.05968126654624939, + "rewards/rejected": 0.3300532102584839, + "step": 750 + }, + { + "epoch": 0.2393606550923192, + "grad_norm": 3.5, + "learning_rate": 4.975832922170765e-06, + "logits/chosen": -0.48995572328567505, + "logits/rejected": -0.3825104236602783, + "logps/chosen": -199.5481414794922, + "logps/rejected": -186.9451904296875, + "loss": 0.6902, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.40951433777809143, + "rewards/margins": 0.019469773396849632, + "rewards/rejected": 0.39004451036453247, + "step": 760 + }, + { + "epoch": 0.2425101373961655, + "grad_norm": 2.84375, + "learning_rate": 4.9750962659632886e-06, + "logits/chosen": -0.46572384238243103, + "logits/rejected": -0.25734081864356995, + "logps/chosen": -213.49441528320312, + "logps/rejected": -179.05862426757812, + "loss": 0.6589, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.4372657835483551, + "rewards/margins": 0.08767595142126083, + "rewards/rejected": 0.34958982467651367, + "step": 770 + }, + { + "epoch": 0.24565961970001182, + "grad_norm": 2.609375, + "learning_rate": 4.974348606552377e-06, + "logits/chosen": -0.49957194924354553, + "logits/rejected": -0.44457465410232544, + "logps/chosen": -174.76776123046875, + "logps/rejected": -162.83163452148438, + "loss": 0.6713, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.37218689918518066, + "rewards/margins": 0.05890881270170212, + "rewards/rejected": 0.31327807903289795, + "step": 780 + }, + { + "epoch": 0.2488091020038581, + "grad_norm": 2.484375, + "learning_rate": 4.973589947261797e-06, + "logits/chosen": -0.43002453446388245, + "logits/rejected": -0.26697424054145813, + "logps/chosen": -214.3047637939453, + "logps/rejected": -164.3812713623047, + "loss": 0.6533, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4211522042751312, + "rewards/margins": 0.09669376909732819, + "rewards/rejected": 0.324458509683609, + "step": 790 + }, + { + "epoch": 0.2519585843077044, + "grad_norm": 3.0625, + "learning_rate": 4.972820291464219e-06, + "logits/chosen": -0.5306814312934875, + "logits/rejected": -0.3864821195602417, + "logps/chosen": -212.0864715576172, + "logps/rejected": -191.2584991455078, + "loss": 0.6768, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.4268336296081543, + "rewards/margins": 0.04500560089945793, + "rewards/rejected": 0.3818280100822449, + "step": 800 + }, + { + "epoch": 0.2551080666115507, + "grad_norm": 2.78125, + "learning_rate": 4.972039642581199e-06, + "logits/chosen": -0.44598865509033203, + "logits/rejected": -0.3495904505252838, + "logps/chosen": -206.1003875732422, + "logps/rejected": -180.69091796875, + "loss": 0.6704, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.4352218508720398, + "rewards/margins": 0.06349250674247742, + "rewards/rejected": 0.37172931432724, + "step": 810 + }, + { + "epoch": 0.25825754891539704, + "grad_norm": 2.90625, + "learning_rate": 4.9712480040831626e-06, + "logits/chosen": -0.4507770538330078, + "logits/rejected": -0.4034315049648285, + "logps/chosen": -201.1985626220703, + "logps/rejected": -188.18038940429688, + "loss": 0.6691, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4719354212284088, + "rewards/margins": 0.06965798884630203, + "rewards/rejected": 0.40227746963500977, + "step": 820 + }, + { + "epoch": 0.26140703121924336, + "grad_norm": 2.859375, + "learning_rate": 4.9704453794893905e-06, + "logits/chosen": -0.4594174921512604, + "logits/rejected": -0.3546017110347748, + "logps/chosen": -208.15328979492188, + "logps/rejected": -175.2351531982422, + "loss": 0.6596, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.4524504244327545, + "rewards/margins": 0.08146923780441284, + "rewards/rejected": 0.3709811568260193, + "step": 830 + }, + { + "epoch": 0.2645565135230896, + "grad_norm": 2.9375, + "learning_rate": 4.969631772368005e-06, + "logits/chosen": -0.44061025977134705, + "logits/rejected": -0.3553173840045929, + "logps/chosen": -204.01336669921875, + "logps/rejected": -194.84194946289062, + "loss": 0.683, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.4293643534183502, + "rewards/margins": 0.03097158670425415, + "rewards/rejected": 0.39839276671409607, + "step": 840 + }, + { + "epoch": 0.26770599582693594, + "grad_norm": 2.734375, + "learning_rate": 4.968807186335948e-06, + "logits/chosen": -0.45513710379600525, + "logits/rejected": -0.35907578468322754, + "logps/chosen": -187.94784545898438, + "logps/rejected": -163.60385131835938, + "loss": 0.6449, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.4533543586730957, + "rewards/margins": 0.11502598226070404, + "rewards/rejected": 0.3383283317089081, + "step": 850 + }, + { + "epoch": 0.27085547813078226, + "grad_norm": 2.34375, + "learning_rate": 4.9679716250589726e-06, + "logits/chosen": -0.53230220079422, + "logits/rejected": -0.36705273389816284, + "logps/chosen": -214.5038299560547, + "logps/rejected": -173.00045776367188, + "loss": 0.6439, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4622381627559662, + "rewards/margins": 0.11802558600902557, + "rewards/rejected": 0.3442125916481018, + "step": 860 + }, + { + "epoch": 0.2740049604346286, + "grad_norm": 2.703125, + "learning_rate": 4.96712509225162e-06, + "logits/chosen": -0.4010530412197113, + "logits/rejected": -0.40078672766685486, + "logps/chosen": -189.293212890625, + "logps/rejected": -190.86239624023438, + "loss": 0.7079, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.3847641050815582, + "rewards/margins": -0.01615377888083458, + "rewards/rejected": 0.4009179174900055, + "step": 870 + }, + { + "epoch": 0.27715444273847484, + "grad_norm": 2.3125, + "learning_rate": 4.966267591677209e-06, + "logits/chosen": -0.47001272439956665, + "logits/rejected": -0.36287882924079895, + "logps/chosen": -193.36114501953125, + "logps/rejected": -163.00997924804688, + "loss": 0.6561, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.412198543548584, + "rewards/margins": 0.08650766313076019, + "rewards/rejected": 0.3256909251213074, + "step": 880 + }, + { + "epoch": 0.28030392504232116, + "grad_norm": 3.40625, + "learning_rate": 4.965399127147814e-06, + "logits/chosen": -0.5800660848617554, + "logits/rejected": -0.4225196838378906, + "logps/chosen": -199.9794158935547, + "logps/rejected": -175.31942749023438, + "loss": 0.6812, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.4509803354740143, + "rewards/margins": 0.04268043860793114, + "rewards/rejected": 0.40829986333847046, + "step": 890 + }, + { + "epoch": 0.2834534073461675, + "grad_norm": 2.890625, + "learning_rate": 4.964519702524251e-06, + "logits/chosen": -0.48715677857398987, + "logits/rejected": -0.3676280677318573, + "logps/chosen": -206.2698974609375, + "logps/rejected": -180.06561279296875, + "loss": 0.6616, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.4541000425815582, + "rewards/margins": 0.08144226670265198, + "rewards/rejected": 0.37265777587890625, + "step": 900 + }, + { + "epoch": 0.2866028896500138, + "grad_norm": 2.75, + "learning_rate": 4.9636293217160615e-06, + "logits/chosen": -0.4000244736671448, + "logits/rejected": -0.37658295035362244, + "logps/chosen": -201.72274780273438, + "logps/rejected": -197.0117645263672, + "loss": 0.708, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.4207178056240082, + "rewards/margins": -0.009672733955085278, + "rewards/rejected": 0.43039053678512573, + "step": 910 + }, + { + "epoch": 0.2897523719538601, + "grad_norm": 2.15625, + "learning_rate": 4.96272798868149e-06, + "logits/chosen": -0.5054947137832642, + "logits/rejected": -0.3827149271965027, + "logps/chosen": -193.49334716796875, + "logps/rejected": -166.5660858154297, + "loss": 0.6608, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4493057131767273, + "rewards/margins": 0.08539381623268127, + "rewards/rejected": 0.363911896944046, + "step": 920 + }, + { + "epoch": 0.2929018542577064, + "grad_norm": 2.90625, + "learning_rate": 4.961815707427473e-06, + "logits/chosen": -0.4878782629966736, + "logits/rejected": -0.34200000762939453, + "logps/chosen": -192.65780639648438, + "logps/rejected": -171.5297393798828, + "loss": 0.6809, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.43513813614845276, + "rewards/margins": 0.04086482524871826, + "rewards/rejected": 0.3942733108997345, + "step": 930 + }, + { + "epoch": 0.2960513365615527, + "grad_norm": 3.046875, + "learning_rate": 4.960892482009617e-06, + "logits/chosen": -0.5498200058937073, + "logits/rejected": -0.3890989422798157, + "logps/chosen": -217.5938262939453, + "logps/rejected": -185.64468383789062, + "loss": 0.6572, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4587416648864746, + "rewards/margins": 0.08828563988208771, + "rewards/rejected": 0.3704560697078705, + "step": 940 + }, + { + "epoch": 0.299200818865399, + "grad_norm": 3.1875, + "learning_rate": 4.959958316532181e-06, + "logits/chosen": -0.4381844103336334, + "logits/rejected": -0.39938193559646606, + "logps/chosen": -202.2256622314453, + "logps/rejected": -184.60812377929688, + "loss": 0.6555, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.47266441583633423, + "rewards/margins": 0.08841115981340408, + "rewards/rejected": 0.38425326347351074, + "step": 950 + }, + { + "epoch": 0.30235030116924533, + "grad_norm": 2.875, + "learning_rate": 4.959013215148059e-06, + "logits/chosen": -0.516729474067688, + "logits/rejected": -0.37376007437705994, + "logps/chosen": -206.91470336914062, + "logps/rejected": -165.06771850585938, + "loss": 0.6483, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.48910826444625854, + "rewards/margins": 0.11284583806991577, + "rewards/rejected": 0.3762624263763428, + "step": 960 + }, + { + "epoch": 0.3054997834730916, + "grad_norm": 2.265625, + "learning_rate": 4.958057182058763e-06, + "logits/chosen": -0.4985506534576416, + "logits/rejected": -0.3460918366909027, + "logps/chosen": -204.8501434326172, + "logps/rejected": -156.81150817871094, + "loss": 0.6317, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5237672924995422, + "rewards/margins": 0.14426377415657043, + "rewards/rejected": 0.3795034885406494, + "step": 970 + }, + { + "epoch": 0.3086492657769379, + "grad_norm": 2.421875, + "learning_rate": 4.957090221514399e-06, + "logits/chosen": -0.44721898436546326, + "logits/rejected": -0.3173277676105499, + "logps/chosen": -204.72488403320312, + "logps/rejected": -172.4529571533203, + "loss": 0.6822, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.43838948011398315, + "rewards/margins": 0.04825422167778015, + "rewards/rejected": 0.3901353180408478, + "step": 980 + }, + { + "epoch": 0.3117987480807842, + "grad_norm": 3.53125, + "learning_rate": 4.956112337813655e-06, + "logits/chosen": -0.5343712568283081, + "logits/rejected": -0.42165470123291016, + "logps/chosen": -196.4682159423828, + "logps/rejected": -159.38626098632812, + "loss": 0.6411, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.49275222420692444, + "rewards/margins": 0.12367204576730728, + "rewards/rejected": 0.369080126285553, + "step": 990 + }, + { + "epoch": 0.31494823038463055, + "grad_norm": 3.046875, + "learning_rate": 4.955123535303775e-06, + "logits/chosen": -0.5008007884025574, + "logits/rejected": -0.32471761107444763, + "logps/chosen": -215.82382202148438, + "logps/rejected": -170.16177368164062, + "loss": 0.6278, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.555659294128418, + "rewards/margins": 0.15915410220623016, + "rewards/rejected": 0.3965051770210266, + "step": 1000 + }, + { + "epoch": 0.3180977126884768, + "grad_norm": 2.8125, + "learning_rate": 4.95412381838055e-06, + "logits/chosen": -0.45208463072776794, + "logits/rejected": -0.3023605942726135, + "logps/chosen": -211.84561157226562, + "logps/rejected": -175.75242614746094, + "loss": 0.6482, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5254601836204529, + "rewards/margins": 0.1151600107550621, + "rewards/rejected": 0.41030019521713257, + "step": 1010 + }, + { + "epoch": 0.3212471949923231, + "grad_norm": 3.34375, + "learning_rate": 4.953113191488284e-06, + "logits/chosen": -0.5526930093765259, + "logits/rejected": -0.4031393527984619, + "logps/chosen": -200.52316284179688, + "logps/rejected": -160.85092163085938, + "loss": 0.6635, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.47668904066085815, + "rewards/margins": 0.08161365985870361, + "rewards/rejected": 0.3950754404067993, + "step": 1020 + }, + { + "epoch": 0.32439667729616944, + "grad_norm": 2.375, + "learning_rate": 4.9520916591197865e-06, + "logits/chosen": -0.49668893218040466, + "logits/rejected": -0.3756228983402252, + "logps/chosen": -201.33413696289062, + "logps/rejected": -169.83847045898438, + "loss": 0.6651, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.5008379220962524, + "rewards/margins": 0.08502224832773209, + "rewards/rejected": 0.41581565141677856, + "step": 1030 + }, + { + "epoch": 0.32754615960001576, + "grad_norm": 3.078125, + "learning_rate": 4.951059225816347e-06, + "logits/chosen": -0.4749979078769684, + "logits/rejected": -0.30828791856765747, + "logps/chosen": -224.30178833007812, + "logps/rejected": -178.66122436523438, + "loss": 0.6517, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5890859961509705, + "rewards/margins": 0.12044923007488251, + "rewards/rejected": 0.46863681077957153, + "step": 1040 + }, + { + "epoch": 0.330695641903862, + "grad_norm": 2.203125, + "learning_rate": 4.950015896167716e-06, + "logits/chosen": -0.47923216223716736, + "logits/rejected": -0.28145602345466614, + "logps/chosen": -189.4966583251953, + "logps/rejected": -161.24319458007812, + "loss": 0.6675, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.4908905029296875, + "rewards/margins": 0.07855306565761566, + "rewards/rejected": 0.41233739256858826, + "step": 1050 + }, + { + "epoch": 0.33384512420770834, + "grad_norm": 2.484375, + "learning_rate": 4.948961674812083e-06, + "logits/chosen": -0.49384012818336487, + "logits/rejected": -0.36533522605895996, + "logps/chosen": -198.85769653320312, + "logps/rejected": -174.79847717285156, + "loss": 0.6573, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5101815462112427, + "rewards/margins": 0.08699540048837662, + "rewards/rejected": 0.42318612337112427, + "step": 1060 + }, + { + "epoch": 0.33699460651155466, + "grad_norm": 2.46875, + "learning_rate": 4.9478965664360595e-06, + "logits/chosen": -0.5174117088317871, + "logits/rejected": -0.3790570795536041, + "logps/chosen": -199.65078735351562, + "logps/rejected": -159.61801147460938, + "loss": 0.6417, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5138002634048462, + "rewards/margins": 0.12827709317207336, + "rewards/rejected": 0.38552325963974, + "step": 1070 + }, + { + "epoch": 0.340144088815401, + "grad_norm": 2.34375, + "learning_rate": 4.946820575774654e-06, + "logits/chosen": -0.4343532919883728, + "logits/rejected": -0.31658726930618286, + "logps/chosen": -199.5583038330078, + "logps/rejected": -162.34683227539062, + "loss": 0.6508, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.5402163863182068, + "rewards/margins": 0.12262473255395889, + "rewards/rejected": 0.41759172081947327, + "step": 1080 + }, + { + "epoch": 0.3432935711192473, + "grad_norm": 2.65625, + "learning_rate": 4.945733707611256e-06, + "logits/chosen": -0.42169666290283203, + "logits/rejected": -0.27337223291397095, + "logps/chosen": -210.4072265625, + "logps/rejected": -176.20767211914062, + "loss": 0.6267, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.6182594299316406, + "rewards/margins": 0.16625744104385376, + "rewards/rejected": 0.45200204849243164, + "step": 1090 + }, + { + "epoch": 0.34644305342309356, + "grad_norm": 2.234375, + "learning_rate": 4.944635966777607e-06, + "logits/chosen": -0.5145076513290405, + "logits/rejected": -0.3866254985332489, + "logps/chosen": -186.8591766357422, + "logps/rejected": -147.5457305908203, + "loss": 0.6267, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.5492256283760071, + "rewards/margins": 0.15818223357200623, + "rewards/rejected": 0.39104336500167847, + "step": 1100 + }, + { + "epoch": 0.3495925357269399, + "grad_norm": 2.796875, + "learning_rate": 4.943527358153787e-06, + "logits/chosen": -0.4312458634376526, + "logits/rejected": -0.2944316267967224, + "logps/chosen": -176.56130981445312, + "logps/rejected": -144.69210815429688, + "loss": 0.6711, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5271092057228088, + "rewards/margins": 0.074883833527565, + "rewards/rejected": 0.45222535729408264, + "step": 1110 + }, + { + "epoch": 0.3527420180307862, + "grad_norm": 3.25, + "learning_rate": 4.942407886668189e-06, + "logits/chosen": -0.4668591022491455, + "logits/rejected": -0.4000996947288513, + "logps/chosen": -192.39215087890625, + "logps/rejected": -181.41018676757812, + "loss": 0.6692, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5816969871520996, + "rewards/margins": 0.07639260590076447, + "rewards/rejected": 0.5053043365478516, + "step": 1120 + }, + { + "epoch": 0.3558915003346325, + "grad_norm": 2.421875, + "learning_rate": 4.941277557297497e-06, + "logits/chosen": -0.5000630617141724, + "logits/rejected": -0.3792869448661804, + "logps/chosen": -201.1351776123047, + "logps/rejected": -163.50692749023438, + "loss": 0.6669, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5100800395011902, + "rewards/margins": 0.07874707877635956, + "rewards/rejected": 0.4313329756259918, + "step": 1130 + }, + { + "epoch": 0.3590409826384788, + "grad_norm": 3.015625, + "learning_rate": 4.940136375066664e-06, + "logits/chosen": -0.5313334465026855, + "logits/rejected": -0.4073900580406189, + "logps/chosen": -195.81436157226562, + "logps/rejected": -165.0396728515625, + "loss": 0.6488, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5570641756057739, + "rewards/margins": 0.12459216266870499, + "rewards/rejected": 0.4324720501899719, + "step": 1140 + }, + { + "epoch": 0.3621904649423251, + "grad_norm": 2.921875, + "learning_rate": 4.938984345048892e-06, + "logits/chosen": -0.47552186250686646, + "logits/rejected": -0.3426175117492676, + "logps/chosen": -226.5735626220703, + "logps/rejected": -185.11488342285156, + "loss": 0.6607, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6292687654495239, + "rewards/margins": 0.10300488770008087, + "rewards/rejected": 0.5262638926506042, + "step": 1150 + }, + { + "epoch": 0.3653399472461714, + "grad_norm": 2.890625, + "learning_rate": 4.937821472365606e-06, + "logits/chosen": -0.45469918847084045, + "logits/rejected": -0.2649703621864319, + "logps/chosen": -203.91030883789062, + "logps/rejected": -154.42904663085938, + "loss": 0.6364, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.614439845085144, + "rewards/margins": 0.14579293131828308, + "rewards/rejected": 0.46864691376686096, + "step": 1160 + }, + { + "epoch": 0.36848942955001773, + "grad_norm": 2.546875, + "learning_rate": 4.9366477621864325e-06, + "logits/chosen": -0.45428067445755005, + "logits/rejected": -0.38431745767593384, + "logps/chosen": -193.93759155273438, + "logps/rejected": -166.46226501464844, + "loss": 0.6829, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.5578614473342896, + "rewards/margins": 0.04849938303232193, + "rewards/rejected": 0.5093621015548706, + "step": 1170 + }, + { + "epoch": 0.371638911853864, + "grad_norm": 2.984375, + "learning_rate": 4.935463219729178e-06, + "logits/chosen": -0.43662381172180176, + "logits/rejected": -0.3796136975288391, + "logps/chosen": -208.486083984375, + "logps/rejected": -186.07241821289062, + "loss": 0.6391, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5909144878387451, + "rewards/margins": 0.12893392145633698, + "rewards/rejected": 0.46198058128356934, + "step": 1180 + }, + { + "epoch": 0.3747883941577103, + "grad_norm": 2.640625, + "learning_rate": 4.934267850259802e-06, + "logits/chosen": -0.5133141875267029, + "logits/rejected": -0.36373409628868103, + "logps/chosen": -194.3529815673828, + "logps/rejected": -159.5540771484375, + "loss": 0.6617, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5327507853507996, + "rewards/margins": 0.08738649636507034, + "rewards/rejected": 0.4453642964363098, + "step": 1190 + }, + { + "epoch": 0.37793787646155663, + "grad_norm": 3.15625, + "learning_rate": 4.933061659092401e-06, + "logits/chosen": -0.45688313245773315, + "logits/rejected": -0.3101183772087097, + "logps/chosen": -197.06399536132812, + "logps/rejected": -178.8423309326172, + "loss": 0.6921, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.5263506174087524, + "rewards/margins": 0.027934294193983078, + "rewards/rejected": 0.4984162747859955, + "step": 1200 + }, + { + "epoch": 0.38108735876540295, + "grad_norm": 2.953125, + "learning_rate": 4.931844651589176e-06, + "logits/chosen": -0.3927622437477112, + "logits/rejected": -0.3346686065196991, + "logps/chosen": -192.22146606445312, + "logps/rejected": -177.29522705078125, + "loss": 0.6793, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5813957452774048, + "rewards/margins": 0.05229301005601883, + "rewards/rejected": 0.5291028022766113, + "step": 1210 + }, + { + "epoch": 0.38423684106924927, + "grad_norm": 2.71875, + "learning_rate": 4.930616833160414e-06, + "logits/chosen": -0.503675639629364, + "logits/rejected": -0.3028663694858551, + "logps/chosen": -209.7771759033203, + "logps/rejected": -166.1737060546875, + "loss": 0.6183, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.6333845853805542, + "rewards/margins": 0.17845922708511353, + "rewards/rejected": 0.45492544770240784, + "step": 1220 + }, + { + "epoch": 0.38738632337309553, + "grad_norm": 2.359375, + "learning_rate": 4.929378209264464e-06, + "logits/chosen": -0.4029599130153656, + "logits/rejected": -0.2958356738090515, + "logps/chosen": -201.47354125976562, + "logps/rejected": -175.35316467285156, + "loss": 0.6546, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6188668012619019, + "rewards/margins": 0.10686089843511581, + "rewards/rejected": 0.5120059251785278, + "step": 1230 + }, + { + "epoch": 0.39053580567694185, + "grad_norm": 3.421875, + "learning_rate": 4.9281287854077075e-06, + "logits/chosen": -0.5093222856521606, + "logits/rejected": -0.3469759523868561, + "logps/chosen": -203.4248809814453, + "logps/rejected": -168.985595703125, + "loss": 0.6675, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5747894048690796, + "rewards/margins": 0.08418375253677368, + "rewards/rejected": 0.4906056821346283, + "step": 1240 + }, + { + "epoch": 0.39368528798078817, + "grad_norm": 3.140625, + "learning_rate": 4.926868567144543e-06, + "logits/chosen": -0.43734902143478394, + "logits/rejected": -0.27301496267318726, + "logps/chosen": -217.2354736328125, + "logps/rejected": -179.50477600097656, + "loss": 0.6439, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.602939784526825, + "rewards/margins": 0.13366705179214478, + "rewards/rejected": 0.46927279233932495, + "step": 1250 + }, + { + "epoch": 0.3968347702846345, + "grad_norm": 2.015625, + "learning_rate": 4.9255975600773506e-06, + "logits/chosen": -0.46365243196487427, + "logits/rejected": -0.369037002325058, + "logps/chosen": -193.8307647705078, + "logps/rejected": -167.16226196289062, + "loss": 0.6528, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5885978937149048, + "rewards/margins": 0.11199425160884857, + "rewards/rejected": 0.4766036570072174, + "step": 1260 + }, + { + "epoch": 0.39998425258848075, + "grad_norm": 2.859375, + "learning_rate": 4.92431576985648e-06, + "logits/chosen": -0.5200837850570679, + "logits/rejected": -0.3553524315357208, + "logps/chosen": -207.01754760742188, + "logps/rejected": -175.571533203125, + "loss": 0.6289, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6721639633178711, + "rewards/margins": 0.16469912230968475, + "rewards/rejected": 0.5074647665023804, + "step": 1270 + }, + { + "epoch": 0.40313373489232707, + "grad_norm": 2.828125, + "learning_rate": 4.9230232021802116e-06, + "logits/chosen": -0.4531930088996887, + "logits/rejected": -0.33995673060417175, + "logps/chosen": -182.60243225097656, + "logps/rejected": -167.54733276367188, + "loss": 0.6632, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.5690507292747498, + "rewards/margins": 0.08255408704280853, + "rewards/rejected": 0.4864966869354248, + "step": 1280 + }, + { + "epoch": 0.4062832171961734, + "grad_norm": 2.421875, + "learning_rate": 4.921719862794741e-06, + "logits/chosen": -0.5172183513641357, + "logits/rejected": -0.43691587448120117, + "logps/chosen": -198.16488647460938, + "logps/rejected": -176.88308715820312, + "loss": 0.6577, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5772544741630554, + "rewards/margins": 0.09580488502979279, + "rewards/rejected": 0.4814496636390686, + "step": 1290 + }, + { + "epoch": 0.4094326995000197, + "grad_norm": 3.09375, + "learning_rate": 4.920405757494147e-06, + "logits/chosen": -0.5242056846618652, + "logits/rejected": -0.41546088457107544, + "logps/chosen": -205.5856475830078, + "logps/rejected": -165.4510040283203, + "loss": 0.6403, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6132642030715942, + "rewards/margins": 0.13079342246055603, + "rewards/rejected": 0.4824707508087158, + "step": 1300 + }, + { + "epoch": 0.41258218180386597, + "grad_norm": 3.78125, + "learning_rate": 4.919080892120375e-06, + "logits/chosen": -0.4625665247440338, + "logits/rejected": -0.36404842138290405, + "logps/chosen": -186.5456085205078, + "logps/rejected": -172.53269958496094, + "loss": 0.6757, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.5738548040390015, + "rewards/margins": 0.07151280343532562, + "rewards/rejected": 0.5023420453071594, + "step": 1310 + }, + { + "epoch": 0.4157316641077123, + "grad_norm": 2.21875, + "learning_rate": 4.917745272563198e-06, + "logits/chosen": -0.514171302318573, + "logits/rejected": -0.4131143093109131, + "logps/chosen": -190.60104370117188, + "logps/rejected": -163.16876220703125, + "loss": 0.6377, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.644982635974884, + "rewards/margins": 0.149316668510437, + "rewards/rejected": 0.49566593766212463, + "step": 1320 + }, + { + "epoch": 0.4188811464115586, + "grad_norm": 3.25, + "learning_rate": 4.916398904760202e-06, + "logits/chosen": -0.407987117767334, + "logits/rejected": -0.3450384736061096, + "logps/chosen": -204.9993133544922, + "logps/rejected": -177.96536254882812, + "loss": 0.6839, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.6174831390380859, + "rewards/margins": 0.06308461725711823, + "rewards/rejected": 0.5543986558914185, + "step": 1330 + }, + { + "epoch": 0.4220306287154049, + "grad_norm": 3.359375, + "learning_rate": 4.915041794696755e-06, + "logits/chosen": -0.4585428833961487, + "logits/rejected": -0.32048022747039795, + "logps/chosen": -205.7061004638672, + "logps/rejected": -188.31765747070312, + "loss": 0.6532, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6447404623031616, + "rewards/margins": 0.11661551147699356, + "rewards/rejected": 0.5281249284744263, + "step": 1340 + }, + { + "epoch": 0.4251801110192512, + "grad_norm": 2.921875, + "learning_rate": 4.913673948405977e-06, + "logits/chosen": -0.425508975982666, + "logits/rejected": -0.3008427619934082, + "logps/chosen": -188.8653564453125, + "logps/rejected": -154.46444702148438, + "loss": 0.6551, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.5882565379142761, + "rewards/margins": 0.11206309497356415, + "rewards/rejected": 0.4761934280395508, + "step": 1350 + }, + { + "epoch": 0.4283295933230975, + "grad_norm": 2.484375, + "learning_rate": 4.91229537196872e-06, + "logits/chosen": -0.466022253036499, + "logits/rejected": -0.40916579961776733, + "logps/chosen": -188.9302978515625, + "logps/rejected": -176.31539916992188, + "loss": 0.6762, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.5576030015945435, + "rewards/margins": 0.059089891612529755, + "rewards/rejected": 0.4985131323337555, + "step": 1360 + }, + { + "epoch": 0.4314790756269438, + "grad_norm": 2.203125, + "learning_rate": 4.910906071513536e-06, + "logits/chosen": -0.468529611825943, + "logits/rejected": -0.3725103735923767, + "logps/chosen": -189.84571838378906, + "logps/rejected": -157.89552307128906, + "loss": 0.6576, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.5729373693466187, + "rewards/margins": 0.10841979831457138, + "rewards/rejected": 0.46451759338378906, + "step": 1370 + }, + { + "epoch": 0.43462855793079014, + "grad_norm": 2.578125, + "learning_rate": 4.9095060532166515e-06, + "logits/chosen": -0.4988088607788086, + "logits/rejected": -0.3795704245567322, + "logps/chosen": -187.31626892089844, + "logps/rejected": -166.83358764648438, + "loss": 0.6576, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5874306559562683, + "rewards/margins": 0.10258360207080841, + "rewards/rejected": 0.4848470687866211, + "step": 1380 + }, + { + "epoch": 0.43777804023463646, + "grad_norm": 2.875, + "learning_rate": 4.90809532330194e-06, + "logits/chosen": -0.44124871492385864, + "logits/rejected": -0.33731183409690857, + "logps/chosen": -193.7071990966797, + "logps/rejected": -182.87266540527344, + "loss": 0.6483, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6297098398208618, + "rewards/margins": 0.11017205566167831, + "rewards/rejected": 0.5195378065109253, + "step": 1390 + }, + { + "epoch": 0.4409275225384827, + "grad_norm": 2.75, + "learning_rate": 4.906673888040895e-06, + "logits/chosen": -0.470440149307251, + "logits/rejected": -0.40894460678100586, + "logps/chosen": -201.374755859375, + "logps/rejected": -162.78060913085938, + "loss": 0.6548, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6208814382553101, + "rewards/margins": 0.12041078507900238, + "rewards/rejected": 0.5004706382751465, + "step": 1400 + }, + { + "epoch": 0.44407700484232904, + "grad_norm": 3.234375, + "learning_rate": 4.905241753752599e-06, + "logits/chosen": -0.45281878113746643, + "logits/rejected": -0.3816523253917694, + "logps/chosen": -187.10189819335938, + "logps/rejected": -180.3949737548828, + "loss": 0.6594, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6287654638290405, + "rewards/margins": 0.09312457591295242, + "rewards/rejected": 0.5356410145759583, + "step": 1410 + }, + { + "epoch": 0.44722648714617536, + "grad_norm": 2.390625, + "learning_rate": 4.903798926803701e-06, + "logits/chosen": -0.5540148615837097, + "logits/rejected": -0.31469932198524475, + "logps/chosen": -195.7555694580078, + "logps/rejected": -160.02955627441406, + "loss": 0.6663, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5776636004447937, + "rewards/margins": 0.08535424619913101, + "rewards/rejected": 0.4923093318939209, + "step": 1420 + }, + { + "epoch": 0.4503759694500217, + "grad_norm": 3.03125, + "learning_rate": 4.902345413608382e-06, + "logits/chosen": -0.5317307710647583, + "logits/rejected": -0.4287118911743164, + "logps/chosen": -212.67172241210938, + "logps/rejected": -184.7850799560547, + "loss": 0.6522, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6378628015518188, + "rewards/margins": 0.11098313331604004, + "rewards/rejected": 0.5268796682357788, + "step": 1430 + }, + { + "epoch": 0.45352545175386794, + "grad_norm": 2.890625, + "learning_rate": 4.900881220628332e-06, + "logits/chosen": -0.4813712537288666, + "logits/rejected": -0.39985647797584534, + "logps/chosen": -198.0911407470703, + "logps/rejected": -183.64688110351562, + "loss": 0.6738, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.603533148765564, + "rewards/margins": 0.07522164285182953, + "rewards/rejected": 0.528311550617218, + "step": 1440 + }, + { + "epoch": 0.45667493405771425, + "grad_norm": 2.703125, + "learning_rate": 4.899406354372716e-06, + "logits/chosen": -0.48485565185546875, + "logits/rejected": -0.335989385843277, + "logps/chosen": -213.813720703125, + "logps/rejected": -169.41334533691406, + "loss": 0.6284, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.667735755443573, + "rewards/margins": 0.18240274488925934, + "rewards/rejected": 0.48533302545547485, + "step": 1450 + }, + { + "epoch": 0.4598244163615606, + "grad_norm": 3.015625, + "learning_rate": 4.897920821398149e-06, + "logits/chosen": -0.36127400398254395, + "logits/rejected": -0.33091285824775696, + "logps/chosen": -198.51376342773438, + "logps/rejected": -179.6190643310547, + "loss": 0.6766, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6157578825950623, + "rewards/margins": 0.06863512098789215, + "rewards/rejected": 0.5471227765083313, + "step": 1460 + }, + { + "epoch": 0.4629738986654069, + "grad_norm": 2.34375, + "learning_rate": 4.896424628308666e-06, + "logits/chosen": -0.49601975083351135, + "logits/rejected": -0.3733476996421814, + "logps/chosen": -201.9625701904297, + "logps/rejected": -163.45664978027344, + "loss": 0.646, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6690108180046082, + "rewards/margins": 0.130323126912117, + "rewards/rejected": 0.5386877059936523, + "step": 1470 + }, + { + "epoch": 0.46612338096925315, + "grad_norm": 2.625, + "learning_rate": 4.894917781755693e-06, + "logits/chosen": -0.40988796949386597, + "logits/rejected": -0.2908838391304016, + "logps/chosen": -174.8596954345703, + "logps/rejected": -152.51849365234375, + "loss": 0.6666, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5752810835838318, + "rewards/margins": 0.07995191216468811, + "rewards/rejected": 0.4953291416168213, + "step": 1480 + }, + { + "epoch": 0.46927286327309947, + "grad_norm": 3.203125, + "learning_rate": 4.893400288438013e-06, + "logits/chosen": -0.4711819291114807, + "logits/rejected": -0.34161874651908875, + "logps/chosen": -210.2778778076172, + "logps/rejected": -188.28152465820312, + "loss": 0.652, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6171534061431885, + "rewards/margins": 0.11316442489624023, + "rewards/rejected": 0.5039889216423035, + "step": 1490 + }, + { + "epoch": 0.4724223455769458, + "grad_norm": 2.546875, + "learning_rate": 4.891872155101746e-06, + "logits/chosen": -0.45833712816238403, + "logits/rejected": -0.3544694781303406, + "logps/chosen": -205.4825439453125, + "logps/rejected": -174.62255859375, + "loss": 0.6504, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6301782131195068, + "rewards/margins": 0.11798008531332016, + "rewards/rejected": 0.5121980905532837, + "step": 1500 + }, + { + "epoch": 0.4755718278807921, + "grad_norm": 3.3125, + "learning_rate": 4.890333388540306e-06, + "logits/chosen": -0.3727184236049652, + "logits/rejected": -0.2826997637748718, + "logps/chosen": -219.3084716796875, + "logps/rejected": -186.0581512451172, + "loss": 0.6574, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6568127870559692, + "rewards/margins": 0.10541248321533203, + "rewards/rejected": 0.551400363445282, + "step": 1510 + }, + { + "epoch": 0.4787213101846384, + "grad_norm": 4.21875, + "learning_rate": 4.888783995594383e-06, + "logits/chosen": -0.5030876398086548, + "logits/rejected": -0.41888195276260376, + "logps/chosen": -210.5947723388672, + "logps/rejected": -189.0829620361328, + "loss": 0.6691, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6370301842689514, + "rewards/margins": 0.07603044807910919, + "rewards/rejected": 0.5609997510910034, + "step": 1520 + }, + { + "epoch": 0.4818707924884847, + "grad_norm": 2.828125, + "learning_rate": 4.887223983151905e-06, + "logits/chosen": -0.45775994658470154, + "logits/rejected": -0.3528696894645691, + "logps/chosen": -205.0533447265625, + "logps/rejected": -169.550048828125, + "loss": 0.6478, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6256412863731384, + "rewards/margins": 0.12434478104114532, + "rewards/rejected": 0.5012965202331543, + "step": 1530 + }, + { + "epoch": 0.485020274792331, + "grad_norm": 2.46875, + "learning_rate": 4.88565335814801e-06, + "logits/chosen": -0.4910427927970886, + "logits/rejected": -0.3841659128665924, + "logps/chosen": -224.6226348876953, + "logps/rejected": -191.9573516845703, + "loss": 0.6349, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6829046010971069, + "rewards/margins": 0.15164795517921448, + "rewards/rejected": 0.5312565565109253, + "step": 1540 + }, + { + "epoch": 0.4881697570961773, + "grad_norm": 2.5625, + "learning_rate": 4.884072127565015e-06, + "logits/chosen": -0.5355531573295593, + "logits/rejected": -0.4093754291534424, + "logps/chosen": -190.81552124023438, + "logps/rejected": -157.70872497558594, + "loss": 0.6334, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6082229018211365, + "rewards/margins": 0.15607663989067078, + "rewards/rejected": 0.4521462321281433, + "step": 1550 + }, + { + "epoch": 0.49131923940002364, + "grad_norm": 3.078125, + "learning_rate": 4.882480298432384e-06, + "logits/chosen": -0.4936140179634094, + "logits/rejected": -0.3383339047431946, + "logps/chosen": -200.6589813232422, + "logps/rejected": -171.00064086914062, + "loss": 0.6331, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6500834226608276, + "rewards/margins": 0.15458612143993378, + "rewards/rejected": 0.49549728631973267, + "step": 1560 + }, + { + "epoch": 0.4944687217038699, + "grad_norm": 3.578125, + "learning_rate": 4.8808778778266985e-06, + "logits/chosen": -0.4825199246406555, + "logits/rejected": -0.3065822124481201, + "logps/chosen": -222.0049285888672, + "logps/rejected": -180.4205780029297, + "loss": 0.6483, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7011119723320007, + "rewards/margins": 0.13294753432273865, + "rewards/rejected": 0.5681644678115845, + "step": 1570 + }, + { + "epoch": 0.4976182040077162, + "grad_norm": 2.890625, + "learning_rate": 4.879264872871625e-06, + "logits/chosen": -0.47797495126724243, + "logits/rejected": -0.3094409704208374, + "logps/chosen": -211.0563201904297, + "logps/rejected": -168.85647583007812, + "loss": 0.6436, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.65972501039505, + "rewards/margins": 0.13198061287403107, + "rewards/rejected": 0.5277442932128906, + "step": 1580 + }, + { + "epoch": 0.5007676863115625, + "grad_norm": 2.4375, + "learning_rate": 4.8776412907378845e-06, + "logits/chosen": -0.4951102137565613, + "logits/rejected": -0.4047602117061615, + "logps/chosen": -200.99998474121094, + "logps/rejected": -183.0837860107422, + "loss": 0.6603, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6028062105178833, + "rewards/margins": 0.10012507438659668, + "rewards/rejected": 0.5026811361312866, + "step": 1590 + }, + { + "epoch": 0.5039171686154088, + "grad_norm": 2.953125, + "learning_rate": 4.876007138643216e-06, + "logits/chosen": -0.47414493560791016, + "logits/rejected": -0.40259408950805664, + "logps/chosen": -184.4724884033203, + "logps/rejected": -163.58192443847656, + "loss": 0.6457, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6546593904495239, + "rewards/margins": 0.1281435191631317, + "rewards/rejected": 0.5265159010887146, + "step": 1600 + }, + { + "epoch": 0.5070666509192552, + "grad_norm": 3.34375, + "learning_rate": 4.874362423852352e-06, + "logits/chosen": -0.5160477161407471, + "logits/rejected": -0.43918365240097046, + "logps/chosen": -185.5178985595703, + "logps/rejected": -165.6835479736328, + "loss": 0.6668, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6197860836982727, + "rewards/margins": 0.08066993951797485, + "rewards/rejected": 0.5391160845756531, + "step": 1610 + }, + { + "epoch": 0.5102161332231014, + "grad_norm": 3.40625, + "learning_rate": 4.872707153676979e-06, + "logits/chosen": -0.466305673122406, + "logits/rejected": -0.3604885935783386, + "logps/chosen": -213.26834106445312, + "logps/rejected": -181.22439575195312, + "loss": 0.6655, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6425263285636902, + "rewards/margins": 0.08319854736328125, + "rewards/rejected": 0.5593277812004089, + "step": 1620 + }, + { + "epoch": 0.5133656155269477, + "grad_norm": 3.046875, + "learning_rate": 4.871041335475712e-06, + "logits/chosen": -0.5054647922515869, + "logits/rejected": -0.4364451467990875, + "logps/chosen": -186.38011169433594, + "logps/rejected": -157.52462768554688, + "loss": 0.674, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.5790429711341858, + "rewards/margins": 0.07125049829483032, + "rewards/rejected": 0.5077924728393555, + "step": 1630 + }, + { + "epoch": 0.5165150978307941, + "grad_norm": 3.421875, + "learning_rate": 4.869364976654052e-06, + "logits/chosen": -0.5144768953323364, + "logits/rejected": -0.4074745774269104, + "logps/chosen": -209.39431762695312, + "logps/rejected": -187.3798065185547, + "loss": 0.6849, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.655154824256897, + "rewards/margins": 0.0526747927069664, + "rewards/rejected": 0.6024800539016724, + "step": 1640 + }, + { + "epoch": 0.5196645801346403, + "grad_norm": 3.140625, + "learning_rate": 4.867678084664365e-06, + "logits/chosen": -0.4594438672065735, + "logits/rejected": -0.3518297076225281, + "logps/chosen": -200.11427307128906, + "logps/rejected": -160.406494140625, + "loss": 0.6205, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6651702523231506, + "rewards/margins": 0.18755197525024414, + "rewards/rejected": 0.4776183068752289, + "step": 1650 + }, + { + "epoch": 0.5228140624384867, + "grad_norm": 2.515625, + "learning_rate": 4.865980667005839e-06, + "logits/chosen": -0.5308347940444946, + "logits/rejected": -0.38161686062812805, + "logps/chosen": -228.52413940429688, + "logps/rejected": -180.50119018554688, + "loss": 0.6231, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.7223028540611267, + "rewards/margins": 0.1841433346271515, + "rewards/rejected": 0.5381595492362976, + "step": 1660 + }, + { + "epoch": 0.525963544742333, + "grad_norm": 3.28125, + "learning_rate": 4.864272731224457e-06, + "logits/chosen": -0.4550582468509674, + "logits/rejected": -0.36829763650894165, + "logps/chosen": -199.5106964111328, + "logps/rejected": -177.867919921875, + "loss": 0.6481, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7055026888847351, + "rewards/margins": 0.13270851969718933, + "rewards/rejected": 0.5727940797805786, + "step": 1670 + }, + { + "epoch": 0.5291130270461792, + "grad_norm": 2.40625, + "learning_rate": 4.862554284912961e-06, + "logits/chosen": -0.5029420256614685, + "logits/rejected": -0.37519973516464233, + "logps/chosen": -193.17857360839844, + "logps/rejected": -151.67193603515625, + "loss": 0.6445, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.5841437578201294, + "rewards/margins": 0.12721844017505646, + "rewards/rejected": 0.4569253921508789, + "step": 1680 + }, + { + "epoch": 0.5322625093500256, + "grad_norm": 2.546875, + "learning_rate": 4.860825335710815e-06, + "logits/chosen": -0.4503244459629059, + "logits/rejected": -0.33839210867881775, + "logps/chosen": -221.6410675048828, + "logps/rejected": -179.1236114501953, + "loss": 0.6475, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7080546617507935, + "rewards/margins": 0.12341801822185516, + "rewards/rejected": 0.5846366286277771, + "step": 1690 + }, + { + "epoch": 0.5354119916538719, + "grad_norm": 2.859375, + "learning_rate": 4.8590858913041775e-06, + "logits/chosen": -0.5011571645736694, + "logits/rejected": -0.34209686517715454, + "logps/chosen": -194.4629364013672, + "logps/rejected": -162.49404907226562, + "loss": 0.6226, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6779108643531799, + "rewards/margins": 0.17823724448680878, + "rewards/rejected": 0.49967360496520996, + "step": 1700 + }, + { + "epoch": 0.5385614739577183, + "grad_norm": 2.484375, + "learning_rate": 4.857335959425864e-06, + "logits/chosen": -0.47282689809799194, + "logits/rejected": -0.387722373008728, + "logps/chosen": -188.14022827148438, + "logps/rejected": -163.66702270507812, + "loss": 0.6431, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6209579110145569, + "rewards/margins": 0.12763172388076782, + "rewards/rejected": 0.49332618713378906, + "step": 1710 + }, + { + "epoch": 0.5417109562615645, + "grad_norm": 3.125, + "learning_rate": 4.85557554785531e-06, + "logits/chosen": -0.5164914727210999, + "logits/rejected": -0.4268696904182434, + "logps/chosen": -194.6099090576172, + "logps/rejected": -171.3253631591797, + "loss": 0.6742, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6653293371200562, + "rewards/margins": 0.08088545501232147, + "rewards/rejected": 0.5844438672065735, + "step": 1720 + }, + { + "epoch": 0.5448604385654108, + "grad_norm": 2.390625, + "learning_rate": 4.853804664418543e-06, + "logits/chosen": -0.5048812627792358, + "logits/rejected": -0.36949628591537476, + "logps/chosen": -186.7891845703125, + "logps/rejected": -177.64492797851562, + "loss": 0.6929, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6204543113708496, + "rewards/margins": 0.03395826369524002, + "rewards/rejected": 0.5864960551261902, + "step": 1730 + }, + { + "epoch": 0.5480099208692571, + "grad_norm": 2.75, + "learning_rate": 4.85202331698814e-06, + "logits/chosen": -0.5082224011421204, + "logits/rejected": -0.3899040222167969, + "logps/chosen": -189.59127807617188, + "logps/rejected": -176.00732421875, + "loss": 0.6808, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.5799592733383179, + "rewards/margins": 0.04768219217658043, + "rewards/rejected": 0.5322771072387695, + "step": 1740 + }, + { + "epoch": 0.5511594031731034, + "grad_norm": 2.796875, + "learning_rate": 4.8502315134832e-06, + "logits/chosen": -0.45675116777420044, + "logits/rejected": -0.35309940576553345, + "logps/chosen": -182.9220733642578, + "logps/rejected": -161.3640899658203, + "loss": 0.6708, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.605731189250946, + "rewards/margins": 0.07132869213819504, + "rewards/rejected": 0.5344024896621704, + "step": 1750 + }, + { + "epoch": 0.5543088854769497, + "grad_norm": 3.015625, + "learning_rate": 4.848429261869303e-06, + "logits/chosen": -0.47061842679977417, + "logits/rejected": -0.32226455211639404, + "logps/chosen": -206.2819366455078, + "logps/rejected": -163.6657257080078, + "loss": 0.627, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7078389525413513, + "rewards/margins": 0.1701158881187439, + "rewards/rejected": 0.5377230644226074, + "step": 1760 + }, + { + "epoch": 0.557458367780796, + "grad_norm": 2.5, + "learning_rate": 4.8466165701584766e-06, + "logits/chosen": -0.46921786665916443, + "logits/rejected": -0.33290085196495056, + "logps/chosen": -185.49661254882812, + "logps/rejected": -155.57052612304688, + "loss": 0.6234, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6777461767196655, + "rewards/margins": 0.18197762966156006, + "rewards/rejected": 0.49576863646507263, + "step": 1770 + }, + { + "epoch": 0.5606078500846423, + "grad_norm": 3.53125, + "learning_rate": 4.844793446409162e-06, + "logits/chosen": -0.4977359175682068, + "logits/rejected": -0.3592739999294281, + "logps/chosen": -234.7196807861328, + "logps/rejected": -196.9207763671875, + "loss": 0.6667, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.725782573223114, + "rewards/margins": 0.08639247715473175, + "rewards/rejected": 0.6393901109695435, + "step": 1780 + }, + { + "epoch": 0.5637573323884887, + "grad_norm": 3.703125, + "learning_rate": 4.842959898726175e-06, + "logits/chosen": -0.42303353548049927, + "logits/rejected": -0.34682708978652954, + "logps/chosen": -233.24038696289062, + "logps/rejected": -207.0811309814453, + "loss": 0.6478, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.75257807970047, + "rewards/margins": 0.12484292685985565, + "rewards/rejected": 0.6277351975440979, + "step": 1790 + }, + { + "epoch": 0.566906814692335, + "grad_norm": 2.671875, + "learning_rate": 4.8411159352606735e-06, + "logits/chosen": -0.49121102690696716, + "logits/rejected": -0.3855198621749878, + "logps/chosen": -201.4905548095703, + "logps/rejected": -184.22596740722656, + "loss": 0.6408, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6799174547195435, + "rewards/margins": 0.1396731436252594, + "rewards/rejected": 0.5402444005012512, + "step": 1800 + }, + { + "epoch": 0.5700562969961812, + "grad_norm": 3.21875, + "learning_rate": 4.839261564210118e-06, + "logits/chosen": -0.4372677206993103, + "logits/rejected": -0.3174059987068176, + "logps/chosen": -182.1688995361328, + "logps/rejected": -167.50186157226562, + "loss": 0.6455, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5665633678436279, + "rewards/margins": 0.12508925795555115, + "rewards/rejected": 0.441474050283432, + "step": 1810 + }, + { + "epoch": 0.5732057793000276, + "grad_norm": 2.453125, + "learning_rate": 4.837396793818237e-06, + "logits/chosen": -0.5189486742019653, + "logits/rejected": -0.4490106701850891, + "logps/chosen": -169.6758575439453, + "logps/rejected": -160.53372192382812, + "loss": 0.6999, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.5530275106430054, + "rewards/margins": 0.015836771577596664, + "rewards/rejected": 0.5371907353401184, + "step": 1820 + }, + { + "epoch": 0.5763552616038738, + "grad_norm": 2.625, + "learning_rate": 4.83552163237499e-06, + "logits/chosen": -0.44089236855506897, + "logits/rejected": -0.31842055916786194, + "logps/chosen": -189.9591064453125, + "logps/rejected": -159.47850036621094, + "loss": 0.6333, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6711987257003784, + "rewards/margins": 0.16579605638980865, + "rewards/rejected": 0.505402684211731, + "step": 1830 + }, + { + "epoch": 0.5795047439077202, + "grad_norm": 2.390625, + "learning_rate": 4.8336360882165315e-06, + "logits/chosen": -0.4447326064109802, + "logits/rejected": -0.3511047661304474, + "logps/chosen": -189.64212036132812, + "logps/rejected": -163.2635955810547, + "loss": 0.6612, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6063627600669861, + "rewards/margins": 0.10564740747213364, + "rewards/rejected": 0.5007153749465942, + "step": 1840 + }, + { + "epoch": 0.5826542262115665, + "grad_norm": 2.71875, + "learning_rate": 4.831740169725172e-06, + "logits/chosen": -0.37459006905555725, + "logits/rejected": -0.27248674631118774, + "logps/chosen": -191.74412536621094, + "logps/rejected": -170.22665405273438, + "loss": 0.6765, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.6292241811752319, + "rewards/margins": 0.06548759341239929, + "rewards/rejected": 0.5637365579605103, + "step": 1850 + }, + { + "epoch": 0.5858037085154127, + "grad_norm": 3.171875, + "learning_rate": 4.829833885329341e-06, + "logits/chosen": -0.5164798498153687, + "logits/rejected": -0.3869698643684387, + "logps/chosen": -195.4289093017578, + "logps/rejected": -161.2754364013672, + "loss": 0.6763, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.5856376886367798, + "rewards/margins": 0.06769655644893646, + "rewards/rejected": 0.5179411172866821, + "step": 1860 + }, + { + "epoch": 0.5889531908192591, + "grad_norm": 2.71875, + "learning_rate": 4.827917243503552e-06, + "logits/chosen": -0.4844232499599457, + "logits/rejected": -0.38613173365592957, + "logps/chosen": -212.82217407226562, + "logps/rejected": -174.99215698242188, + "loss": 0.6457, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6664649844169617, + "rewards/margins": 0.13369064033031464, + "rewards/rejected": 0.5327743291854858, + "step": 1870 + }, + { + "epoch": 0.5921026731231054, + "grad_norm": 3.6875, + "learning_rate": 4.825990252768362e-06, + "logits/chosen": -0.4278056025505066, + "logits/rejected": -0.3540639579296112, + "logps/chosen": -191.68344116210938, + "logps/rejected": -180.2418670654297, + "loss": 0.6856, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6736811399459839, + "rewards/margins": 0.05576536804437637, + "rewards/rejected": 0.6179158091545105, + "step": 1880 + }, + { + "epoch": 0.5952521554269516, + "grad_norm": 2.703125, + "learning_rate": 4.824052921690337e-06, + "logits/chosen": -0.5202184915542603, + "logits/rejected": -0.38284236192703247, + "logps/chosen": -211.123046875, + "logps/rejected": -174.93304443359375, + "loss": 0.6467, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7286884188652039, + "rewards/margins": 0.13865350186824799, + "rewards/rejected": 0.5900349020957947, + "step": 1890 + }, + { + "epoch": 0.598401637730798, + "grad_norm": 3.03125, + "learning_rate": 4.822105258882007e-06, + "logits/chosen": -0.553580105304718, + "logits/rejected": -0.40661874413490295, + "logps/chosen": -231.7763214111328, + "logps/rejected": -190.30320739746094, + "loss": 0.6755, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7072332501411438, + "rewards/margins": 0.06532811373472214, + "rewards/rejected": 0.6419050693511963, + "step": 1900 + }, + { + "epoch": 0.6015511200346443, + "grad_norm": 3.1875, + "learning_rate": 4.8201472730018386e-06, + "logits/chosen": -0.4518907964229584, + "logits/rejected": -0.3795849680900574, + "logps/chosen": -205.5015411376953, + "logps/rejected": -180.77366638183594, + "loss": 0.6682, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6349667310714722, + "rewards/margins": 0.07740394026041031, + "rewards/rejected": 0.5575627684593201, + "step": 1910 + }, + { + "epoch": 0.6047006023384907, + "grad_norm": 2.484375, + "learning_rate": 4.818178972754184e-06, + "logits/chosen": -0.43323999643325806, + "logits/rejected": -0.4050220549106598, + "logps/chosen": -190.7349090576172, + "logps/rejected": -176.9745330810547, + "loss": 0.6589, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.6163308024406433, + "rewards/margins": 0.09825930744409561, + "rewards/rejected": 0.5180714726448059, + "step": 1920 + }, + { + "epoch": 0.6078500846423369, + "grad_norm": 2.390625, + "learning_rate": 4.816200366889252e-06, + "logits/chosen": -0.48036471009254456, + "logits/rejected": -0.34876304864883423, + "logps/chosen": -202.17532348632812, + "logps/rejected": -171.40567016601562, + "loss": 0.6655, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6224693059921265, + "rewards/margins": 0.08968141674995422, + "rewards/rejected": 0.5327879190444946, + "step": 1930 + }, + { + "epoch": 0.6109995669461832, + "grad_norm": 2.53125, + "learning_rate": 4.8142114642030665e-06, + "logits/chosen": -0.5331605672836304, + "logits/rejected": -0.35749322175979614, + "logps/chosen": -200.44296264648438, + "logps/rejected": -155.3129119873047, + "loss": 0.6277, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.686188817024231, + "rewards/margins": 0.17992666363716125, + "rewards/rejected": 0.5062621831893921, + "step": 1940 + }, + { + "epoch": 0.6141490492500296, + "grad_norm": 3.65625, + "learning_rate": 4.812212273537426e-06, + "logits/chosen": -0.5703829526901245, + "logits/rejected": -0.3986208736896515, + "logps/chosen": -207.4144287109375, + "logps/rejected": -166.04586791992188, + "loss": 0.635, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6602171659469604, + "rewards/margins": 0.1550987958908081, + "rewards/rejected": 0.5051184296607971, + "step": 1950 + }, + { + "epoch": 0.6172985315538758, + "grad_norm": 3.46875, + "learning_rate": 4.810202803779862e-06, + "logits/chosen": -0.49680987000465393, + "logits/rejected": -0.3343147039413452, + "logps/chosen": -194.55484008789062, + "logps/rejected": -171.62588500976562, + "loss": 0.6414, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6576918959617615, + "rewards/margins": 0.14642241597175598, + "rewards/rejected": 0.5112695693969727, + "step": 1960 + }, + { + "epoch": 0.6204480138577222, + "grad_norm": 3.015625, + "learning_rate": 4.808183063863606e-06, + "logits/chosen": -0.5439049005508423, + "logits/rejected": -0.4092562794685364, + "logps/chosen": -223.9301300048828, + "logps/rejected": -191.12759399414062, + "loss": 0.6458, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7495763897895813, + "rewards/margins": 0.13388705253601074, + "rewards/rejected": 0.6156893372535706, + "step": 1970 + }, + { + "epoch": 0.6235974961615685, + "grad_norm": 2.703125, + "learning_rate": 4.806153062767544e-06, + "logits/chosen": -0.4681766629219055, + "logits/rejected": -0.3462735116481781, + "logps/chosen": -192.7504425048828, + "logps/rejected": -170.0450897216797, + "loss": 0.6479, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6624137163162231, + "rewards/margins": 0.12909933924674988, + "rewards/rejected": 0.5333144068717957, + "step": 1980 + }, + { + "epoch": 0.6267469784654147, + "grad_norm": 2.78125, + "learning_rate": 4.804112809516181e-06, + "logits/chosen": -0.5108489394187927, + "logits/rejected": -0.3038786053657532, + "logps/chosen": -207.41226196289062, + "logps/rejected": -163.39027404785156, + "loss": 0.6386, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6961920261383057, + "rewards/margins": 0.14503511786460876, + "rewards/rejected": 0.5511568188667297, + "step": 1990 + }, + { + "epoch": 0.6298964607692611, + "grad_norm": 2.171875, + "learning_rate": 4.802062313179595e-06, + "logits/chosen": -0.4810725748538971, + "logits/rejected": -0.40924152731895447, + "logps/chosen": -187.09451293945312, + "logps/rejected": -162.91554260253906, + "loss": 0.661, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.5915666818618774, + "rewards/margins": 0.10626377910375595, + "rewards/rejected": 0.4853029251098633, + "step": 2000 + }, + { + "epoch": 0.6298964607692611, + "eval_logits/chosen": -0.6020957231521606, + "eval_logits/rejected": -0.4771311283111572, + "eval_logps/chosen": -243.46115112304688, + "eval_logps/rejected": -222.5963897705078, + "eval_loss": 0.6673439145088196, + "eval_rewards/accuracies": 0.5848915576934814, + "eval_rewards/chosen": 0.7640087604522705, + "eval_rewards/margins": 0.09086808562278748, + "eval_rewards/rejected": 0.6731407046318054, + "eval_runtime": 3657.8705, + "eval_samples_per_second": 0.366, + "eval_steps_per_second": 0.366, + "step": 2000 + }, + { + "epoch": 0.6330459430731074, + "grad_norm": 2.984375, + "learning_rate": 4.800001582873405e-06, + "logits/chosen": -0.42106738686561584, + "logits/rejected": -0.35292476415634155, + "logps/chosen": -202.25643920898438, + "logps/rejected": -198.75399780273438, + "loss": 0.6953, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6455667614936829, + "rewards/margins": 0.026390869170427322, + "rewards/rejected": 0.6191757917404175, + "step": 2010 + }, + { + "epoch": 0.6361954253769536, + "grad_norm": 3.3125, + "learning_rate": 4.797930627758721e-06, + "logits/chosen": -0.3941357731819153, + "logits/rejected": -0.3840845823287964, + "logps/chosen": -193.46145629882812, + "logps/rejected": -182.74398803710938, + "loss": 0.6844, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6705250144004822, + "rewards/margins": 0.047097403556108475, + "rewards/rejected": 0.6234275698661804, + "step": 2020 + }, + { + "epoch": 0.6393449076808, + "grad_norm": 2.1875, + "learning_rate": 4.795849457042112e-06, + "logits/chosen": -0.5017488598823547, + "logits/rejected": -0.40243926644325256, + "logps/chosen": -190.13934326171875, + "logps/rejected": -166.44187927246094, + "loss": 0.6766, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6177536845207214, + "rewards/margins": 0.07091078162193298, + "rewards/rejected": 0.5468429327011108, + "step": 2030 + }, + { + "epoch": 0.6424943899846463, + "grad_norm": 2.671875, + "learning_rate": 4.793758079975559e-06, + "logits/chosen": -0.47853583097457886, + "logits/rejected": -0.3796117901802063, + "logps/chosen": -183.77645874023438, + "logps/rejected": -150.51734924316406, + "loss": 0.6283, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6325706243515015, + "rewards/margins": 0.16535897552967072, + "rewards/rejected": 0.46721166372299194, + "step": 2040 + }, + { + "epoch": 0.6456438722884926, + "grad_norm": 2.859375, + "learning_rate": 4.791656505856416e-06, + "logits/chosen": -0.49306803941726685, + "logits/rejected": -0.37095504999160767, + "logps/chosen": -193.6649932861328, + "logps/rejected": -169.91397094726562, + "loss": 0.6596, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6382758617401123, + "rewards/margins": 0.09585778415203094, + "rewards/rejected": 0.5424180030822754, + "step": 2050 + }, + { + "epoch": 0.6487933545923389, + "grad_norm": 4.375, + "learning_rate": 4.789544744027369e-06, + "logits/chosen": -0.456062376499176, + "logits/rejected": -0.326261430978775, + "logps/chosen": -202.9215087890625, + "logps/rejected": -158.33755493164062, + "loss": 0.6307, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6835106015205383, + "rewards/margins": 0.16993892192840576, + "rewards/rejected": 0.5135716199874878, + "step": 2060 + }, + { + "epoch": 0.6519428368961852, + "grad_norm": 2.90625, + "learning_rate": 4.787422803876394e-06, + "logits/chosen": -0.4271954596042633, + "logits/rejected": -0.32812589406967163, + "logps/chosen": -217.31643676757812, + "logps/rejected": -189.05111694335938, + "loss": 0.654, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7037121057510376, + "rewards/margins": 0.1129276379942894, + "rewards/rejected": 0.59078449010849, + "step": 2070 + }, + { + "epoch": 0.6550923192000315, + "grad_norm": 2.984375, + "learning_rate": 4.785290694836719e-06, + "logits/chosen": -0.4718199670314789, + "logits/rejected": -0.29942384362220764, + "logps/chosen": -194.9972686767578, + "logps/rejected": -162.9912872314453, + "loss": 0.6402, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7091890573501587, + "rewards/margins": 0.13689157366752625, + "rewards/rejected": 0.5722974538803101, + "step": 2080 + }, + { + "epoch": 0.6582418015038778, + "grad_norm": 2.40625, + "learning_rate": 4.783148426386771e-06, + "logits/chosen": -0.4268282353878021, + "logits/rejected": -0.2675458788871765, + "logps/chosen": -199.43051147460938, + "logps/rejected": -162.5355987548828, + "loss": 0.6125, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.665716826915741, + "rewards/margins": 0.20410867035388947, + "rewards/rejected": 0.4616081714630127, + "step": 2090 + }, + { + "epoch": 0.661391283807724, + "grad_norm": 2.265625, + "learning_rate": 4.7809960080501464e-06, + "logits/chosen": -0.4748079776763916, + "logits/rejected": -0.4310288429260254, + "logps/chosen": -201.29617309570312, + "logps/rejected": -184.75961303710938, + "loss": 0.6435, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.7009934186935425, + "rewards/margins": 0.13489434123039246, + "rewards/rejected": 0.5660991668701172, + "step": 2100 + }, + { + "epoch": 0.6645407661115704, + "grad_norm": 2.671875, + "learning_rate": 4.778833449395563e-06, + "logits/chosen": -0.4732258915901184, + "logits/rejected": -0.3779997229576111, + "logps/chosen": -211.35055541992188, + "logps/rejected": -177.94529724121094, + "loss": 0.6271, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.6552049517631531, + "rewards/margins": 0.1688978374004364, + "rewards/rejected": 0.48630720376968384, + "step": 2110 + }, + { + "epoch": 0.6676902484154167, + "grad_norm": 3.046875, + "learning_rate": 4.77666076003682e-06, + "logits/chosen": -0.5109846591949463, + "logits/rejected": -0.4116583466529846, + "logps/chosen": -201.0147705078125, + "logps/rejected": -173.7718048095703, + "loss": 0.6458, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6840208172798157, + "rewards/margins": 0.1273798793554306, + "rewards/rejected": 0.5566409230232239, + "step": 2120 + }, + { + "epoch": 0.6708397307192631, + "grad_norm": 2.84375, + "learning_rate": 4.774477949632747e-06, + "logits/chosen": -0.4809556007385254, + "logits/rejected": -0.37704578042030334, + "logps/chosen": -215.3496551513672, + "logps/rejected": -185.5915985107422, + "loss": 0.6738, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6900166273117065, + "rewards/margins": 0.08503197878599167, + "rewards/rejected": 0.604984700679779, + "step": 2130 + }, + { + "epoch": 0.6739892130231093, + "grad_norm": 2.8125, + "learning_rate": 4.772285027887174e-06, + "logits/chosen": -0.4865281581878662, + "logits/rejected": -0.3789558410644531, + "logps/chosen": -212.9844512939453, + "logps/rejected": -181.038818359375, + "loss": 0.6399, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6945775747299194, + "rewards/margins": 0.15178106725215912, + "rewards/rejected": 0.5427964329719543, + "step": 2140 + }, + { + "epoch": 0.6771386953269556, + "grad_norm": 2.28125, + "learning_rate": 4.770082004548878e-06, + "logits/chosen": -0.5607801675796509, + "logits/rejected": -0.4424077868461609, + "logps/chosen": -197.34896850585938, + "logps/rejected": -168.6945343017578, + "loss": 0.6788, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6424630880355835, + "rewards/margins": 0.06348511576652527, + "rewards/rejected": 0.5789780020713806, + "step": 2150 + }, + { + "epoch": 0.680288177630802, + "grad_norm": 2.890625, + "learning_rate": 4.767868889411545e-06, + "logits/chosen": -0.37069040536880493, + "logits/rejected": -0.29210925102233887, + "logps/chosen": -189.8524169921875, + "logps/rejected": -173.9405517578125, + "loss": 0.6838, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.648335337638855, + "rewards/margins": 0.061441101133823395, + "rewards/rejected": 0.586894154548645, + "step": 2160 + }, + { + "epoch": 0.6834376599346482, + "grad_norm": 2.53125, + "learning_rate": 4.765645692313724e-06, + "logits/chosen": -0.4717496931552887, + "logits/rejected": -0.38448435068130493, + "logps/chosen": -197.7617645263672, + "logps/rejected": -176.4130401611328, + "loss": 0.6377, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6932880282402039, + "rewards/margins": 0.14973357319831848, + "rewards/rejected": 0.543554425239563, + "step": 2170 + }, + { + "epoch": 0.6865871422384946, + "grad_norm": 3.265625, + "learning_rate": 4.763412423138784e-06, + "logits/chosen": -0.46070584654808044, + "logits/rejected": -0.37532466650009155, + "logps/chosen": -211.4508514404297, + "logps/rejected": -192.87716674804688, + "loss": 0.6801, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6881066560745239, + "rewards/margins": 0.05569648742675781, + "rewards/rejected": 0.6324101686477661, + "step": 2180 + }, + { + "epoch": 0.6897366245423409, + "grad_norm": 3.328125, + "learning_rate": 4.761169091814869e-06, + "logits/chosen": -0.44056111574172974, + "logits/rejected": -0.3600524961948395, + "logps/chosen": -212.5589599609375, + "logps/rejected": -194.01473999023438, + "loss": 0.66, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7078835368156433, + "rewards/margins": 0.10578237473964691, + "rewards/rejected": 0.60210120677948, + "step": 2190 + }, + { + "epoch": 0.6928861068461871, + "grad_norm": 2.671875, + "learning_rate": 4.758915708314858e-06, + "logits/chosen": -0.528985857963562, + "logits/rejected": -0.3610820770263672, + "logps/chosen": -214.4971923828125, + "logps/rejected": -172.77456665039062, + "loss": 0.6279, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7492777705192566, + "rewards/margins": 0.17471325397491455, + "rewards/rejected": 0.5745645761489868, + "step": 2200 + }, + { + "epoch": 0.6960355891500335, + "grad_norm": 3.328125, + "learning_rate": 4.756652282656314e-06, + "logits/chosen": -0.46792277693748474, + "logits/rejected": -0.3924694061279297, + "logps/chosen": -189.278076171875, + "logps/rejected": -168.0936737060547, + "loss": 0.6298, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6673378944396973, + "rewards/margins": 0.18005801737308502, + "rewards/rejected": 0.48727989196777344, + "step": 2210 + }, + { + "epoch": 0.6991850714538798, + "grad_norm": 2.765625, + "learning_rate": 4.754378824901447e-06, + "logits/chosen": -0.42372870445251465, + "logits/rejected": -0.3045392334461212, + "logps/chosen": -217.94894409179688, + "logps/rejected": -181.64312744140625, + "loss": 0.6352, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7333195209503174, + "rewards/margins": 0.15828455984592438, + "rewards/rejected": 0.5750349760055542, + "step": 2220 + }, + { + "epoch": 0.702334553757726, + "grad_norm": 2.59375, + "learning_rate": 4.752095345157062e-06, + "logits/chosen": -0.5427747368812561, + "logits/rejected": -0.3744064271450043, + "logps/chosen": -196.52786254882812, + "logps/rejected": -179.39456176757812, + "loss": 0.6659, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7010904550552368, + "rewards/margins": 0.10356839001178741, + "rewards/rejected": 0.5975220203399658, + "step": 2230 + }, + { + "epoch": 0.7054840360615724, + "grad_norm": 2.875, + "learning_rate": 4.7498018535745175e-06, + "logits/chosen": -0.45889267325401306, + "logits/rejected": -0.3741144835948944, + "logps/chosen": -194.2584228515625, + "logps/rejected": -169.21490478515625, + "loss": 0.6514, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6648105382919312, + "rewards/margins": 0.12509003281593323, + "rewards/rejected": 0.5397205352783203, + "step": 2240 + }, + { + "epoch": 0.7086335183654187, + "grad_norm": 2.953125, + "learning_rate": 4.747498360349681e-06, + "logits/chosen": -0.4748617708683014, + "logits/rejected": -0.3516277074813843, + "logps/chosen": -219.1868438720703, + "logps/rejected": -182.24143981933594, + "loss": 0.6369, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7704683542251587, + "rewards/margins": 0.16334478557109833, + "rewards/rejected": 0.6071235537528992, + "step": 2250 + }, + { + "epoch": 0.711783000669265, + "grad_norm": 2.59375, + "learning_rate": 4.745184875722887e-06, + "logits/chosen": -0.5461896657943726, + "logits/rejected": -0.4441626965999603, + "logps/chosen": -177.56777954101562, + "logps/rejected": -164.29269409179688, + "loss": 0.6576, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.63064044713974, + "rewards/margins": 0.11562252044677734, + "rewards/rejected": 0.5150178670883179, + "step": 2260 + }, + { + "epoch": 0.7149324829731113, + "grad_norm": 2.625, + "learning_rate": 4.7428614099788804e-06, + "logits/chosen": -0.4357683062553406, + "logits/rejected": -0.40783005952835083, + "logps/chosen": -180.0326690673828, + "logps/rejected": -161.7758331298828, + "loss": 0.6844, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.5830633640289307, + "rewards/margins": 0.04567595571279526, + "rewards/rejected": 0.5373873710632324, + "step": 2270 + }, + { + "epoch": 0.7180819652769576, + "grad_norm": 2.953125, + "learning_rate": 4.740527973446782e-06, + "logits/chosen": -0.4672257900238037, + "logits/rejected": -0.38863813877105713, + "logps/chosen": -172.70326232910156, + "logps/rejected": -155.18206787109375, + "loss": 0.6836, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.5959118008613586, + "rewards/margins": 0.06164885684847832, + "rewards/rejected": 0.5342629551887512, + "step": 2280 + }, + { + "epoch": 0.7212314475808039, + "grad_norm": 2.53125, + "learning_rate": 4.738184576500038e-06, + "logits/chosen": -0.4562186300754547, + "logits/rejected": -0.3595152199268341, + "logps/chosen": -202.3937530517578, + "logps/rejected": -184.3506622314453, + "loss": 0.6476, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6859248280525208, + "rewards/margins": 0.13335470855236053, + "rewards/rejected": 0.5525700449943542, + "step": 2290 + }, + { + "epoch": 0.7243809298846502, + "grad_norm": 3.390625, + "learning_rate": 4.735831229556374e-06, + "logits/chosen": -0.5141324996948242, + "logits/rejected": -0.4093483090400696, + "logps/chosen": -219.1161651611328, + "logps/rejected": -183.11634826660156, + "loss": 0.6545, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7390071153640747, + "rewards/margins": 0.13933688402175903, + "rewards/rejected": 0.5996701717376709, + "step": 2300 + }, + { + "epoch": 0.7275304121884966, + "grad_norm": 3.75, + "learning_rate": 4.733467943077747e-06, + "logits/chosen": -0.43999338150024414, + "logits/rejected": -0.3529604375362396, + "logps/chosen": -179.51895141601562, + "logps/rejected": -177.3287811279297, + "loss": 0.7027, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.5721325278282166, + "rewards/margins": 0.017483506351709366, + "rewards/rejected": 0.5546489953994751, + "step": 2310 + }, + { + "epoch": 0.7306798944923428, + "grad_norm": 3.71875, + "learning_rate": 4.731094727570305e-06, + "logits/chosen": -0.39424124360084534, + "logits/rejected": -0.32468560338020325, + "logps/chosen": -201.04409790039062, + "logps/rejected": -186.39602661132812, + "loss": 0.6709, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.615720808506012, + "rewards/margins": 0.08357678353786469, + "rewards/rejected": 0.5321440100669861, + "step": 2320 + }, + { + "epoch": 0.7338293767961891, + "grad_norm": 2.65625, + "learning_rate": 4.7287115935843335e-06, + "logits/chosen": -0.4321005940437317, + "logits/rejected": -0.3664671778678894, + "logps/chosen": -183.7452392578125, + "logps/rejected": -170.99435424804688, + "loss": 0.6807, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6428938508033752, + "rewards/margins": 0.06662772595882416, + "rewards/rejected": 0.5762661695480347, + "step": 2330 + }, + { + "epoch": 0.7369788591000355, + "grad_norm": 2.6875, + "learning_rate": 4.72631855171421e-06, + "logits/chosen": -0.407909095287323, + "logits/rejected": -0.3485127389431, + "logps/chosen": -189.66488647460938, + "logps/rejected": -168.92410278320312, + "loss": 0.6538, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6725813150405884, + "rewards/margins": 0.11997097730636597, + "rewards/rejected": 0.5526103377342224, + "step": 2340 + }, + { + "epoch": 0.7401283414038817, + "grad_norm": 3.765625, + "learning_rate": 4.72391561259836e-06, + "logits/chosen": -0.4830864369869232, + "logits/rejected": -0.39991340041160583, + "logps/chosen": -196.81080627441406, + "logps/rejected": -173.09869384765625, + "loss": 0.6644, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.6581705808639526, + "rewards/margins": 0.10404298454523087, + "rewards/rejected": 0.55412757396698, + "step": 2350 + }, + { + "epoch": 0.743277823707728, + "grad_norm": 2.9375, + "learning_rate": 4.721502786919209e-06, + "logits/chosen": -0.5260214805603027, + "logits/rejected": -0.45939525961875916, + "logps/chosen": -191.50491333007812, + "logps/rejected": -162.9628143310547, + "loss": 0.6571, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6285362243652344, + "rewards/margins": 0.10894250869750977, + "rewards/rejected": 0.5195937156677246, + "step": 2360 + }, + { + "epoch": 0.7464273060115744, + "grad_norm": 2.390625, + "learning_rate": 4.719080085403131e-06, + "logits/chosen": -0.4764222204685211, + "logits/rejected": -0.37835368514060974, + "logps/chosen": -207.5322723388672, + "logps/rejected": -173.14564514160156, + "loss": 0.6403, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7107085585594177, + "rewards/margins": 0.15078091621398926, + "rewards/rejected": 0.5599276423454285, + "step": 2370 + }, + { + "epoch": 0.7495767883154206, + "grad_norm": 3.703125, + "learning_rate": 4.716647518820406e-06, + "logits/chosen": -0.51595538854599, + "logits/rejected": -0.3755626380443573, + "logps/chosen": -191.32260131835938, + "logps/rejected": -156.11282348632812, + "loss": 0.676, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.6388931274414062, + "rewards/margins": 0.06408585608005524, + "rewards/rejected": 0.5748072862625122, + "step": 2380 + }, + { + "epoch": 0.752726270619267, + "grad_norm": 3.140625, + "learning_rate": 4.714205097985169e-06, + "logits/chosen": -0.4605388045310974, + "logits/rejected": -0.31849172711372375, + "logps/chosen": -183.9641876220703, + "logps/rejected": -168.69387817382812, + "loss": 0.6416, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7079667448997498, + "rewards/margins": 0.15685173869132996, + "rewards/rejected": 0.5511150360107422, + "step": 2390 + }, + { + "epoch": 0.7558757529231133, + "grad_norm": 3.5625, + "learning_rate": 4.711752833755362e-06, + "logits/chosen": -0.4923486113548279, + "logits/rejected": -0.36627882719039917, + "logps/chosen": -211.5367431640625, + "logps/rejected": -170.20559692382812, + "loss": 0.6441, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7119361162185669, + "rewards/margins": 0.14048628509044647, + "rewards/rejected": 0.571449875831604, + "step": 2400 + }, + { + "epoch": 0.7590252352269595, + "grad_norm": 3.328125, + "learning_rate": 4.7092907370326876e-06, + "logits/chosen": -0.4083434045314789, + "logits/rejected": -0.3242022693157196, + "logps/chosen": -195.62118530273438, + "logps/rejected": -170.5220489501953, + "loss": 0.6861, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6857455968856812, + "rewards/margins": 0.0565875768661499, + "rewards/rejected": 0.6291579604148865, + "step": 2410 + }, + { + "epoch": 0.7621747175308059, + "grad_norm": 2.65625, + "learning_rate": 4.706818818762558e-06, + "logits/chosen": -0.5046082735061646, + "logits/rejected": -0.35835257172584534, + "logps/chosen": -212.45217895507812, + "logps/rejected": -180.62155151367188, + "loss": 0.6265, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7465167045593262, + "rewards/margins": 0.1813477873802185, + "rewards/rejected": 0.5651688575744629, + "step": 2420 + }, + { + "epoch": 0.7653241998346522, + "grad_norm": 2.453125, + "learning_rate": 4.7043370899340505e-06, + "logits/chosen": -0.4568845331668854, + "logits/rejected": -0.4346873164176941, + "logps/chosen": -202.90444946289062, + "logps/rejected": -197.64659118652344, + "loss": 0.6894, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.6897009015083313, + "rewards/margins": 0.04634975641965866, + "rewards/rejected": 0.6433511972427368, + "step": 2430 + }, + { + "epoch": 0.7684736821384985, + "grad_norm": 3.109375, + "learning_rate": 4.701845561579853e-06, + "logits/chosen": -0.4123512804508209, + "logits/rejected": -0.3234286606311798, + "logps/chosen": -199.5214385986328, + "logps/rejected": -182.79100036621094, + "loss": 0.6436, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6898704767227173, + "rewards/margins": 0.13404139876365662, + "rewards/rejected": 0.5558291077613831, + "step": 2440 + }, + { + "epoch": 0.7716231644423448, + "grad_norm": 2.390625, + "learning_rate": 4.6993442447762185e-06, + "logits/chosen": -0.4709036350250244, + "logits/rejected": -0.3212384283542633, + "logps/chosen": -214.52877807617188, + "logps/rejected": -172.2939910888672, + "loss": 0.6349, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7454115152359009, + "rewards/margins": 0.16362550854682922, + "rewards/rejected": 0.5817859768867493, + "step": 2450 + }, + { + "epoch": 0.7747726467461911, + "grad_norm": 3.234375, + "learning_rate": 4.696833150642916e-06, + "logits/chosen": -0.4889756143093109, + "logits/rejected": -0.3762696385383606, + "logps/chosen": -200.27764892578125, + "logps/rejected": -167.56094360351562, + "loss": 0.646, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6864475011825562, + "rewards/margins": 0.13881604373455048, + "rewards/rejected": 0.5476315021514893, + "step": 2460 + }, + { + "epoch": 0.7779221290500374, + "grad_norm": 2.359375, + "learning_rate": 4.694312290343178e-06, + "logits/chosen": -0.5132132768630981, + "logits/rejected": -0.38872334361076355, + "logps/chosen": -200.6832733154297, + "logps/rejected": -171.80210876464844, + "loss": 0.6265, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.709212601184845, + "rewards/margins": 0.17675474286079407, + "rewards/rejected": 0.5324578881263733, + "step": 2470 + }, + { + "epoch": 0.7810716113538837, + "grad_norm": 2.734375, + "learning_rate": 4.691781675083658e-06, + "logits/chosen": -0.543319582939148, + "logits/rejected": -0.4530103802680969, + "logps/chosen": -193.91519165039062, + "logps/rejected": -169.05667114257812, + "loss": 0.6525, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6816273927688599, + "rewards/margins": 0.11835716664791107, + "rewards/rejected": 0.5632702112197876, + "step": 2480 + }, + { + "epoch": 0.78422109365773, + "grad_norm": 3.359375, + "learning_rate": 4.689241316114373e-06, + "logits/chosen": -0.42033663392066956, + "logits/rejected": -0.35506805777549744, + "logps/chosen": -197.7043914794922, + "logps/rejected": -181.24746704101562, + "loss": 0.6528, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.761360764503479, + "rewards/margins": 0.1297047734260559, + "rewards/rejected": 0.6316559910774231, + "step": 2490 + }, + { + "epoch": 0.7873705759615763, + "grad_norm": 2.828125, + "learning_rate": 4.686691224728652e-06, + "logits/chosen": -0.5081063508987427, + "logits/rejected": -0.3558959364891052, + "logps/chosen": -215.3639373779297, + "logps/rejected": -175.36788940429688, + "loss": 0.6667, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7302805781364441, + "rewards/margins": 0.10294453799724579, + "rewards/rejected": 0.6273361444473267, + "step": 2500 + }, + { + "epoch": 0.7905200582654226, + "grad_norm": 3.0, + "learning_rate": 4.684131412263098e-06, + "logits/chosen": -0.35183241963386536, + "logits/rejected": -0.3214506208896637, + "logps/chosen": -177.6040496826172, + "logps/rejected": -171.63784790039062, + "loss": 0.6771, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.6120871901512146, + "rewards/margins": 0.06991194188594818, + "rewards/rejected": 0.5421752333641052, + "step": 2510 + }, + { + "epoch": 0.793669540569269, + "grad_norm": 2.28125, + "learning_rate": 4.681561890097525e-06, + "logits/chosen": -0.4272761344909668, + "logits/rejected": -0.35165831446647644, + "logps/chosen": -195.2444305419922, + "logps/rejected": -184.37887573242188, + "loss": 0.6776, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.68397456407547, + "rewards/margins": 0.0739150270819664, + "rewards/rejected": 0.6100595593452454, + "step": 2520 + }, + { + "epoch": 0.7968190228731152, + "grad_norm": 2.265625, + "learning_rate": 4.678982669654912e-06, + "logits/chosen": -0.423833429813385, + "logits/rejected": -0.37776055932044983, + "logps/chosen": -202.28225708007812, + "logps/rejected": -188.2001190185547, + "loss": 0.6442, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6994912624359131, + "rewards/margins": 0.13222193717956543, + "rewards/rejected": 0.5672692656517029, + "step": 2530 + }, + { + "epoch": 0.7999685051769615, + "grad_norm": 2.8125, + "learning_rate": 4.676393762401354e-06, + "logits/chosen": -0.4404567778110504, + "logits/rejected": -0.32071155309677124, + "logps/chosen": -197.72959899902344, + "logps/rejected": -161.87881469726562, + "loss": 0.6561, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6411703824996948, + "rewards/margins": 0.11388187110424042, + "rewards/rejected": 0.527288556098938, + "step": 2540 + }, + { + "epoch": 0.8031179874808079, + "grad_norm": 2.765625, + "learning_rate": 4.673795179846008e-06, + "logits/chosen": -0.4287866950035095, + "logits/rejected": -0.33532360196113586, + "logps/chosen": -207.0975341796875, + "logps/rejected": -175.67420959472656, + "loss": 0.6759, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.6876527667045593, + "rewards/margins": 0.06763847172260284, + "rewards/rejected": 0.6200142502784729, + "step": 2550 + }, + { + "epoch": 0.8062674697846541, + "grad_norm": 2.90625, + "learning_rate": 4.671186933541044e-06, + "logits/chosen": -0.4286137521266937, + "logits/rejected": -0.37905916571617126, + "logps/chosen": -191.0441131591797, + "logps/rejected": -179.77825927734375, + "loss": 0.695, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.6264538168907166, + "rewards/margins": 0.020585432648658752, + "rewards/rejected": 0.6058684587478638, + "step": 2560 + }, + { + "epoch": 0.8094169520885005, + "grad_norm": 2.96875, + "learning_rate": 4.668569035081594e-06, + "logits/chosen": -0.4553300440311432, + "logits/rejected": -0.4031829833984375, + "logps/chosen": -182.02127075195312, + "logps/rejected": -171.6331787109375, + "loss": 0.6973, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.6412879228591919, + "rewards/margins": 0.04937012866139412, + "rewards/rejected": 0.5919178128242493, + "step": 2570 + }, + { + "epoch": 0.8125664343923468, + "grad_norm": 2.828125, + "learning_rate": 4.665941496105697e-06, + "logits/chosen": -0.5085742473602295, + "logits/rejected": -0.3790132701396942, + "logps/chosen": -187.68478393554688, + "logps/rejected": -153.02972412109375, + "loss": 0.6575, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6509996056556702, + "rewards/margins": 0.11632029712200165, + "rewards/rejected": 0.5346792936325073, + "step": 2580 + }, + { + "epoch": 0.815715916696193, + "grad_norm": 2.9375, + "learning_rate": 4.663304328294251e-06, + "logits/chosen": -0.4928790032863617, + "logits/rejected": -0.31242626905441284, + "logps/chosen": -204.3889617919922, + "logps/rejected": -167.4778594970703, + "loss": 0.645, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6917664408683777, + "rewards/margins": 0.1491299867630005, + "rewards/rejected": 0.5426364541053772, + "step": 2590 + }, + { + "epoch": 0.8188653990000394, + "grad_norm": 3.609375, + "learning_rate": 4.660657543370958e-06, + "logits/chosen": -0.4446497857570648, + "logits/rejected": -0.35457903146743774, + "logps/chosen": -181.77471923828125, + "logps/rejected": -157.31527709960938, + "loss": 0.6608, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6617356538772583, + "rewards/margins": 0.09582933783531189, + "rewards/rejected": 0.5659063458442688, + "step": 2600 + }, + { + "epoch": 0.8220148813038857, + "grad_norm": 2.96875, + "learning_rate": 4.658001153102276e-06, + "logits/chosen": -0.44886890053749084, + "logits/rejected": -0.43147921562194824, + "logps/chosen": -186.99864196777344, + "logps/rejected": -188.51278686523438, + "loss": 0.6707, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6480501890182495, + "rewards/margins": 0.08241890370845795, + "rewards/rejected": 0.5656312108039856, + "step": 2610 + }, + { + "epoch": 0.8251643636077319, + "grad_norm": 2.4375, + "learning_rate": 4.655335169297363e-06, + "logits/chosen": -0.4991195797920227, + "logits/rejected": -0.3592751622200012, + "logps/chosen": -185.76898193359375, + "logps/rejected": -163.68087768554688, + "loss": 0.6537, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6216106414794922, + "rewards/margins": 0.11423293501138687, + "rewards/rejected": 0.5073777437210083, + "step": 2620 + }, + { + "epoch": 0.8283138459115783, + "grad_norm": 2.484375, + "learning_rate": 4.652659603808024e-06, + "logits/chosen": -0.46877604722976685, + "logits/rejected": -0.3875640332698822, + "logps/chosen": -194.6293182373047, + "logps/rejected": -162.6995391845703, + "loss": 0.6556, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6514769792556763, + "rewards/margins": 0.10939432680606842, + "rewards/rejected": 0.542082667350769, + "step": 2630 + }, + { + "epoch": 0.8314633282154246, + "grad_norm": 3.0625, + "learning_rate": 4.6499744685286626e-06, + "logits/chosen": -0.4762224555015564, + "logits/rejected": -0.3948310911655426, + "logps/chosen": -176.2687530517578, + "logps/rejected": -162.93235778808594, + "loss": 0.678, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6167873740196228, + "rewards/margins": 0.05972397327423096, + "rewards/rejected": 0.5570634007453918, + "step": 2640 + }, + { + "epoch": 0.8346128105192709, + "grad_norm": 2.4375, + "learning_rate": 4.6472797753962255e-06, + "logits/chosen": -0.46065980195999146, + "logits/rejected": -0.4272928237915039, + "logps/chosen": -203.83956909179688, + "logps/rejected": -183.5655059814453, + "loss": 0.6521, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7077460289001465, + "rewards/margins": 0.13481923937797546, + "rewards/rejected": 0.5729268193244934, + "step": 2650 + }, + { + "epoch": 0.8377622928231172, + "grad_norm": 2.046875, + "learning_rate": 4.6445755363901465e-06, + "logits/chosen": -0.4757087826728821, + "logits/rejected": -0.38740482926368713, + "logps/chosen": -201.98423767089844, + "logps/rejected": -168.28964233398438, + "loss": 0.662, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6773720979690552, + "rewards/margins": 0.10439164936542511, + "rewards/rejected": 0.5729804039001465, + "step": 2660 + }, + { + "epoch": 0.8409117751269635, + "grad_norm": 3.09375, + "learning_rate": 4.641861763532299e-06, + "logits/chosen": -0.4650692939758301, + "logits/rejected": -0.3750719130039215, + "logps/chosen": -202.7363739013672, + "logps/rejected": -168.4525909423828, + "loss": 0.6652, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6195183992385864, + "rewards/margins": 0.08996371924877167, + "rewards/rejected": 0.529554545879364, + "step": 2670 + }, + { + "epoch": 0.8440612574308098, + "grad_norm": 3.0, + "learning_rate": 4.639138468886939e-06, + "logits/chosen": -0.49985313415527344, + "logits/rejected": -0.4105320870876312, + "logps/chosen": -200.1018524169922, + "logps/rejected": -182.53408813476562, + "loss": 0.6707, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.68363356590271, + "rewards/margins": 0.08553630113601685, + "rewards/rejected": 0.5980972051620483, + "step": 2680 + }, + { + "epoch": 0.8472107397346561, + "grad_norm": 3.390625, + "learning_rate": 4.636405664560652e-06, + "logits/chosen": -0.5170512199401855, + "logits/rejected": -0.4552704691886902, + "logps/chosen": -205.1244354248047, + "logps/rejected": -176.17617797851562, + "loss": 0.6613, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.7613251805305481, + "rewards/margins": 0.1176910400390625, + "rewards/rejected": 0.6436341404914856, + "step": 2690 + }, + { + "epoch": 0.8503602220385024, + "grad_norm": 2.65625, + "learning_rate": 4.6336633627023e-06, + "logits/chosen": -0.5023754835128784, + "logits/rejected": -0.3591773808002472, + "logps/chosen": -198.65286254882812, + "logps/rejected": -173.5179443359375, + "loss": 0.6609, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6720755696296692, + "rewards/margins": 0.10540957748889923, + "rewards/rejected": 0.5666660070419312, + "step": 2700 + }, + { + "epoch": 0.8535097043423487, + "grad_norm": 2.53125, + "learning_rate": 4.630911575502967e-06, + "logits/chosen": -0.4533337652683258, + "logits/rejected": -0.3400491774082184, + "logps/chosen": -190.54476928710938, + "logps/rejected": -167.88975524902344, + "loss": 0.6551, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.662422776222229, + "rewards/margins": 0.11403118073940277, + "rewards/rejected": 0.5483915209770203, + "step": 2710 + }, + { + "epoch": 0.856659186646195, + "grad_norm": 2.90625, + "learning_rate": 4.628150315195902e-06, + "logits/chosen": -0.44884181022644043, + "logits/rejected": -0.3520192801952362, + "logps/chosen": -192.61509704589844, + "logps/rejected": -169.34854125976562, + "loss": 0.6865, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6516313552856445, + "rewards/margins": 0.05064528435468674, + "rewards/rejected": 0.6009860634803772, + "step": 2720 + }, + { + "epoch": 0.8598086689500414, + "grad_norm": 2.84375, + "learning_rate": 4.625379594056472e-06, + "logits/chosen": -0.43172770738601685, + "logits/rejected": -0.3771997094154358, + "logps/chosen": -181.5345916748047, + "logps/rejected": -166.65841674804688, + "loss": 0.6848, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6033766865730286, + "rewards/margins": 0.052743762731552124, + "rewards/rejected": 0.5506329536437988, + "step": 2730 + }, + { + "epoch": 0.8629581512538876, + "grad_norm": 2.25, + "learning_rate": 4.6225994244020984e-06, + "logits/chosen": -0.4917459487915039, + "logits/rejected": -0.37200021743774414, + "logps/chosen": -209.8813018798828, + "logps/rejected": -180.4649658203125, + "loss": 0.6435, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7169169187545776, + "rewards/margins": 0.15931591391563416, + "rewards/rejected": 0.5576010346412659, + "step": 2740 + }, + { + "epoch": 0.8661076335577339, + "grad_norm": 2.953125, + "learning_rate": 4.61980981859221e-06, + "logits/chosen": -0.5261596441268921, + "logits/rejected": -0.4045354425907135, + "logps/chosen": -197.52328491210938, + "logps/rejected": -175.49851989746094, + "loss": 0.6605, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6951828598976135, + "rewards/margins": 0.11784076690673828, + "rewards/rejected": 0.5773420929908752, + "step": 2750 + }, + { + "epoch": 0.8692571158615803, + "grad_norm": 3.234375, + "learning_rate": 4.6170107890281826e-06, + "logits/chosen": -0.4892495274543762, + "logits/rejected": -0.3943815231323242, + "logps/chosen": -214.80078125, + "logps/rejected": -192.01844787597656, + "loss": 0.6471, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7895753979682922, + "rewards/margins": 0.12512117624282837, + "rewards/rejected": 0.6644541025161743, + "step": 2760 + }, + { + "epoch": 0.8724065981654265, + "grad_norm": 3.5625, + "learning_rate": 4.614202348153285e-06, + "logits/chosen": -0.506980299949646, + "logits/rejected": -0.37280508875846863, + "logps/chosen": -206.6781005859375, + "logps/rejected": -187.54193115234375, + "loss": 0.6726, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6904038190841675, + "rewards/margins": 0.0823880210518837, + "rewards/rejected": 0.6080158352851868, + "step": 2770 + }, + { + "epoch": 0.8755560804692729, + "grad_norm": 2.421875, + "learning_rate": 4.611384508452629e-06, + "logits/chosen": -0.47033706307411194, + "logits/rejected": -0.3309301435947418, + "logps/chosen": -221.22421264648438, + "logps/rejected": -190.77188110351562, + "loss": 0.6516, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.759658932685852, + "rewards/margins": 0.12068722397089005, + "rewards/rejected": 0.638971745967865, + "step": 2780 + }, + { + "epoch": 0.8787055627731192, + "grad_norm": 2.796875, + "learning_rate": 4.608557282453104e-06, + "logits/chosen": -0.5095947980880737, + "logits/rejected": -0.45932531356811523, + "logps/chosen": -189.14663696289062, + "logps/rejected": -171.67837524414062, + "loss": 0.6528, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6966843008995056, + "rewards/margins": 0.13209667801856995, + "rewards/rejected": 0.5645877122879028, + "step": 2790 + }, + { + "epoch": 0.8818550450769654, + "grad_norm": 2.390625, + "learning_rate": 4.605720682723331e-06, + "logits/chosen": -0.46834999322891235, + "logits/rejected": -0.3549017608165741, + "logps/chosen": -210.1326446533203, + "logps/rejected": -179.3318634033203, + "loss": 0.6451, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.761681318283081, + "rewards/margins": 0.15264348685741425, + "rewards/rejected": 0.6090378761291504, + "step": 2800 + }, + { + "epoch": 0.8850045273808118, + "grad_norm": 2.140625, + "learning_rate": 4.602874721873599e-06, + "logits/chosen": -0.5375205874443054, + "logits/rejected": -0.34866297245025635, + "logps/chosen": -188.57321166992188, + "logps/rejected": -155.67984008789062, + "loss": 0.6321, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6381895542144775, + "rewards/margins": 0.16713955998420715, + "rewards/rejected": 0.47105008363723755, + "step": 2810 + }, + { + "epoch": 0.8881540096846581, + "grad_norm": 3.390625, + "learning_rate": 4.600019412555816e-06, + "logits/chosen": -0.4357992112636566, + "logits/rejected": -0.28150540590286255, + "logps/chosen": -199.49620056152344, + "logps/rejected": -153.14419555664062, + "loss": 0.6476, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6509748101234436, + "rewards/margins": 0.14667873084545135, + "rewards/rejected": 0.5042960047721863, + "step": 2820 + }, + { + "epoch": 0.8913034919885043, + "grad_norm": 3.1875, + "learning_rate": 4.597154767463448e-06, + "logits/chosen": -0.4881827235221863, + "logits/rejected": -0.3464208245277405, + "logps/chosen": -201.01864624023438, + "logps/rejected": -183.83456420898438, + "loss": 0.6627, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6923896074295044, + "rewards/margins": 0.09767551720142365, + "rewards/rejected": 0.5947140455245972, + "step": 2830 + }, + { + "epoch": 0.8944529742923507, + "grad_norm": 2.203125, + "learning_rate": 4.594280799331461e-06, + "logits/chosen": -0.516327977180481, + "logits/rejected": -0.4000956416130066, + "logps/chosen": -177.13763427734375, + "logps/rejected": -154.5478515625, + "loss": 0.6463, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6087929010391235, + "rewards/margins": 0.1329253911972046, + "rewards/rejected": 0.47586750984191895, + "step": 2840 + }, + { + "epoch": 0.897602456596197, + "grad_norm": 2.59375, + "learning_rate": 4.591397520936271e-06, + "logits/chosen": -0.4523008465766907, + "logits/rejected": -0.32660526037216187, + "logps/chosen": -190.58114624023438, + "logps/rejected": -168.5622100830078, + "loss": 0.6439, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6767908930778503, + "rewards/margins": 0.1442452073097229, + "rewards/rejected": 0.5325456857681274, + "step": 2850 + }, + { + "epoch": 0.9007519389000433, + "grad_norm": 2.46875, + "learning_rate": 4.588504945095684e-06, + "logits/chosen": -0.4655417799949646, + "logits/rejected": -0.3397763669490814, + "logps/chosen": -184.3746337890625, + "logps/rejected": -158.1368408203125, + "loss": 0.6466, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6702816486358643, + "rewards/margins": 0.1449626088142395, + "rewards/rejected": 0.5253190398216248, + "step": 2860 + }, + { + "epoch": 0.9039014212038896, + "grad_norm": 2.75, + "learning_rate": 4.585603084668833e-06, + "logits/chosen": -0.48204731941223145, + "logits/rejected": -0.3869457542896271, + "logps/chosen": -211.1409149169922, + "logps/rejected": -177.21664428710938, + "loss": 0.6278, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.693871796131134, + "rewards/margins": 0.17502956092357635, + "rewards/rejected": 0.5188421607017517, + "step": 2870 + }, + { + "epoch": 0.9070509035077359, + "grad_norm": 3.1875, + "learning_rate": 4.582691952556131e-06, + "logits/chosen": -0.516753077507019, + "logits/rejected": -0.3798757791519165, + "logps/chosen": -185.08062744140625, + "logps/rejected": -157.95521545410156, + "loss": 0.6113, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.6912366151809692, + "rewards/margins": 0.21181932091712952, + "rewards/rejected": 0.4794173240661621, + "step": 2880 + }, + { + "epoch": 0.9102003858115822, + "grad_norm": 2.375, + "learning_rate": 4.579771561699208e-06, + "logits/chosen": -0.5301073789596558, + "logits/rejected": -0.37922126054763794, + "logps/chosen": -200.29745483398438, + "logps/rejected": -163.3575439453125, + "loss": 0.6527, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.655502200126648, + "rewards/margins": 0.1373116374015808, + "rewards/rejected": 0.5181905627250671, + "step": 2890 + }, + { + "epoch": 0.9133498681154285, + "grad_norm": 3.28125, + "learning_rate": 4.576841925080853e-06, + "logits/chosen": -0.3888477683067322, + "logits/rejected": -0.32840999960899353, + "logps/chosen": -185.23165893554688, + "logps/rejected": -163.85369873046875, + "loss": 0.6592, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6471211314201355, + "rewards/margins": 0.11747239530086517, + "rewards/rejected": 0.5296487808227539, + "step": 2900 + }, + { + "epoch": 0.9164993504192749, + "grad_norm": 2.234375, + "learning_rate": 4.5739030557249595e-06, + "logits/chosen": -0.5069789886474609, + "logits/rejected": -0.3604618310928345, + "logps/chosen": -196.76856994628906, + "logps/rejected": -165.517822265625, + "loss": 0.6391, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6826483607292175, + "rewards/margins": 0.1527707278728485, + "rewards/rejected": 0.5298776030540466, + "step": 2910 + }, + { + "epoch": 0.9196488327231211, + "grad_norm": 2.375, + "learning_rate": 4.570954966696464e-06, + "logits/chosen": -0.43340712785720825, + "logits/rejected": -0.3375625014305115, + "logps/chosen": -195.20223999023438, + "logps/rejected": -172.97694396972656, + "loss": 0.6641, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6797399520874023, + "rewards/margins": 0.09967435896396637, + "rewards/rejected": 0.5800655484199524, + "step": 2920 + }, + { + "epoch": 0.9227983150269674, + "grad_norm": 2.6875, + "learning_rate": 4.56799767110129e-06, + "logits/chosen": -0.4303937554359436, + "logits/rejected": -0.38990846276283264, + "logps/chosen": -192.70545959472656, + "logps/rejected": -176.0408172607422, + "loss": 0.6566, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6560736894607544, + "rewards/margins": 0.10354320704936981, + "rewards/rejected": 0.5525304079055786, + "step": 2930 + }, + { + "epoch": 0.9259477973308138, + "grad_norm": 3.296875, + "learning_rate": 4.565031182086291e-06, + "logits/chosen": -0.50087571144104, + "logits/rejected": -0.3773635923862457, + "logps/chosen": -194.03750610351562, + "logps/rejected": -171.54971313476562, + "loss": 0.6346, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7052168846130371, + "rewards/margins": 0.16069751977920532, + "rewards/rejected": 0.544519305229187, + "step": 2940 + }, + { + "epoch": 0.92909727963466, + "grad_norm": 2.765625, + "learning_rate": 4.562055512839189e-06, + "logits/chosen": -0.46178698539733887, + "logits/rejected": -0.37201589345932007, + "logps/chosen": -205.4782257080078, + "logps/rejected": -171.72808837890625, + "loss": 0.6217, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7175599336624146, + "rewards/margins": 0.19239577651023865, + "rewards/rejected": 0.5251641273498535, + "step": 2950 + }, + { + "epoch": 0.9322467619385063, + "grad_norm": 2.484375, + "learning_rate": 4.559070676588516e-06, + "logits/chosen": -0.4568074643611908, + "logits/rejected": -0.37853866815567017, + "logps/chosen": -214.366455078125, + "logps/rejected": -184.67774963378906, + "loss": 0.6514, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7769818305969238, + "rewards/margins": 0.1274883896112442, + "rewards/rejected": 0.6494934558868408, + "step": 2960 + }, + { + "epoch": 0.9353962442423527, + "grad_norm": 3.078125, + "learning_rate": 4.55607668660356e-06, + "logits/chosen": -0.48704949021339417, + "logits/rejected": -0.4139239192008972, + "logps/chosen": -193.26736450195312, + "logps/rejected": -175.77593994140625, + "loss": 0.6544, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6955569982528687, + "rewards/margins": 0.11222921311855316, + "rewards/rejected": 0.5833277702331543, + "step": 2970 + }, + { + "epoch": 0.9385457265461989, + "grad_norm": 2.546875, + "learning_rate": 4.5530735561943e-06, + "logits/chosen": -0.4977632462978363, + "logits/rejected": -0.3273804187774658, + "logps/chosen": -193.0499725341797, + "logps/rejected": -158.284423828125, + "loss": 0.6462, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6608245968818665, + "rewards/margins": 0.13379625976085663, + "rewards/rejected": 0.5270283222198486, + "step": 2980 + }, + { + "epoch": 0.9416952088500453, + "grad_norm": 2.640625, + "learning_rate": 4.55006129871135e-06, + "logits/chosen": -0.5545817017555237, + "logits/rejected": -0.43782949447631836, + "logps/chosen": -191.93934631347656, + "logps/rejected": -167.23953247070312, + "loss": 0.6625, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6882120370864868, + "rewards/margins": 0.1209985762834549, + "rewards/rejected": 0.5672134160995483, + "step": 2990 + }, + { + "epoch": 0.9448446911538916, + "grad_norm": 2.859375, + "learning_rate": 4.547039927545899e-06, + "logits/chosen": -0.4707905650138855, + "logits/rejected": -0.3295817971229553, + "logps/chosen": -186.739501953125, + "logps/rejected": -162.34422302246094, + "loss": 0.6588, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.579537034034729, + "rewards/margins": 0.0972566232085228, + "rewards/rejected": 0.48228034377098083, + "step": 3000 + }, + { + "epoch": 0.9479941734577378, + "grad_norm": 2.578125, + "learning_rate": 4.544009456129651e-06, + "logits/chosen": -0.40460291504859924, + "logits/rejected": -0.36005955934524536, + "logps/chosen": -191.5205841064453, + "logps/rejected": -164.66702270507812, + "loss": 0.6927, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.615665078163147, + "rewards/margins": 0.033144764602184296, + "rewards/rejected": 0.5825203061103821, + "step": 3010 + }, + { + "epoch": 0.9511436557615842, + "grad_norm": 3.125, + "learning_rate": 4.540969897934767e-06, + "logits/chosen": -0.4697691798210144, + "logits/rejected": -0.3581236004829407, + "logps/chosen": -205.1107940673828, + "logps/rejected": -161.09967041015625, + "loss": 0.6513, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6768245697021484, + "rewards/margins": 0.13728924095630646, + "rewards/rejected": 0.5395353436470032, + "step": 3020 + }, + { + "epoch": 0.9542931380654305, + "grad_norm": 3.3125, + "learning_rate": 4.537921266473802e-06, + "logits/chosen": -0.4509238302707672, + "logits/rejected": -0.41182202100753784, + "logps/chosen": -198.67684936523438, + "logps/rejected": -188.21649169921875, + "loss": 0.7044, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7039377093315125, + "rewards/margins": 0.03385835513472557, + "rewards/rejected": 0.6700793504714966, + "step": 3030 + }, + { + "epoch": 0.9574426203692769, + "grad_norm": 3.3125, + "learning_rate": 4.53486357529965e-06, + "logits/chosen": -0.48828214406967163, + "logits/rejected": -0.3500337302684784, + "logps/chosen": -187.39865112304688, + "logps/rejected": -156.00173950195312, + "loss": 0.64, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6695148944854736, + "rewards/margins": 0.1509798765182495, + "rewards/rejected": 0.5185350179672241, + "step": 3040 + }, + { + "epoch": 0.9605921026731231, + "grad_norm": 2.40625, + "learning_rate": 4.531796838005477e-06, + "logits/chosen": -0.5069050192832947, + "logits/rejected": -0.3991912305355072, + "logps/chosen": -197.77066040039062, + "logps/rejected": -163.296875, + "loss": 0.6187, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7524086236953735, + "rewards/margins": 0.1966829001903534, + "rewards/rejected": 0.5557257533073425, + "step": 3050 + }, + { + "epoch": 0.9637415849769694, + "grad_norm": 2.5, + "learning_rate": 4.5287210682246655e-06, + "logits/chosen": -0.42384999990463257, + "logits/rejected": -0.3603217601776123, + "logps/chosen": -183.27471923828125, + "logps/rejected": -170.30142211914062, + "loss": 0.6832, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.5444117784500122, + "rewards/margins": 0.05068939924240112, + "rewards/rejected": 0.49372243881225586, + "step": 3060 + }, + { + "epoch": 0.9668910672808158, + "grad_norm": 6.25, + "learning_rate": 4.525636279630752e-06, + "logits/chosen": -0.4649208188056946, + "logits/rejected": -0.3279884159564972, + "logps/chosen": -204.52191162109375, + "logps/rejected": -171.454833984375, + "loss": 0.6164, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7678558826446533, + "rewards/margins": 0.21700289845466614, + "rewards/rejected": 0.5508529543876648, + "step": 3070 + }, + { + "epoch": 0.970040549584662, + "grad_norm": 2.65625, + "learning_rate": 4.522542485937369e-06, + "logits/chosen": -0.48648127913475037, + "logits/rejected": -0.3391122817993164, + "logps/chosen": -191.50942993164062, + "logps/rejected": -159.47317504882812, + "loss": 0.6609, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6665128469467163, + "rewards/margins": 0.10518161207437515, + "rewards/rejected": 0.5613312721252441, + "step": 3080 + }, + { + "epoch": 0.9731900318885083, + "grad_norm": 2.796875, + "learning_rate": 4.519439700898179e-06, + "logits/chosen": -0.5188948512077332, + "logits/rejected": -0.4177464544773102, + "logps/chosen": -205.6922607421875, + "logps/rejected": -177.05718994140625, + "loss": 0.6589, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.7047790288925171, + "rewards/margins": 0.11938399076461792, + "rewards/rejected": 0.5853949785232544, + "step": 3090 + }, + { + "epoch": 0.9763395141923547, + "grad_norm": 3.078125, + "learning_rate": 4.516327938306818e-06, + "logits/chosen": -0.49799761176109314, + "logits/rejected": -0.3481315076351166, + "logps/chosen": -192.59019470214844, + "logps/rejected": -155.11587524414062, + "loss": 0.6338, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6473811268806458, + "rewards/margins": 0.15366099774837494, + "rewards/rejected": 0.4937201142311096, + "step": 3100 + }, + { + "epoch": 0.9794889964962009, + "grad_norm": 2.671875, + "learning_rate": 4.513207211996831e-06, + "logits/chosen": -0.43314170837402344, + "logits/rejected": -0.3531089425086975, + "logps/chosen": -195.55174255371094, + "logps/rejected": -162.95797729492188, + "loss": 0.6499, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6635746955871582, + "rewards/margins": 0.1176295131444931, + "rewards/rejected": 0.5459452271461487, + "step": 3110 + }, + { + "epoch": 0.9826384788000473, + "grad_norm": 3.015625, + "learning_rate": 4.510077535841612e-06, + "logits/chosen": -0.4261040687561035, + "logits/rejected": -0.3906673192977905, + "logps/chosen": -192.201416015625, + "logps/rejected": -177.6776123046875, + "loss": 0.6474, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7501270771026611, + "rewards/margins": 0.1394282579421997, + "rewards/rejected": 0.6106988191604614, + "step": 3120 + }, + { + "epoch": 0.9857879611038936, + "grad_norm": 4.0625, + "learning_rate": 4.506938923754342e-06, + "logits/chosen": -0.47423991560935974, + "logits/rejected": -0.36756032705307007, + "logps/chosen": -209.3015594482422, + "logps/rejected": -172.35110473632812, + "loss": 0.6551, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.6946024894714355, + "rewards/margins": 0.12966330349445343, + "rewards/rejected": 0.5649392008781433, + "step": 3130 + }, + { + "epoch": 0.9889374434077398, + "grad_norm": 2.84375, + "learning_rate": 4.50379138968793e-06, + "logits/chosen": -0.4364834725856781, + "logits/rejected": -0.28871777653694153, + "logps/chosen": -206.86984252929688, + "logps/rejected": -161.70379638671875, + "loss": 0.6198, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7661986351013184, + "rewards/margins": 0.20797491073608398, + "rewards/rejected": 0.5582237839698792, + "step": 3140 + }, + { + "epoch": 0.9920869257115862, + "grad_norm": 3.046875, + "learning_rate": 4.500634947634943e-06, + "logits/chosen": -0.4126051962375641, + "logits/rejected": -0.3056962192058563, + "logps/chosen": -204.4892578125, + "logps/rejected": -179.78683471679688, + "loss": 0.6801, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.730519711971283, + "rewards/margins": 0.09220778197050095, + "rewards/rejected": 0.6383119821548462, + "step": 3150 + }, + { + "epoch": 0.9952364080154324, + "grad_norm": 2.421875, + "learning_rate": 4.497469611627554e-06, + "logits/chosen": -0.4934718608856201, + "logits/rejected": -0.3333396911621094, + "logps/chosen": -200.66720581054688, + "logps/rejected": -161.30099487304688, + "loss": 0.6013, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7085192203521729, + "rewards/margins": 0.23979604244232178, + "rewards/rejected": 0.46872320771217346, + "step": 3160 + }, + { + "epoch": 0.9983858903192788, + "grad_norm": 3.453125, + "learning_rate": 4.4942953957374724e-06, + "logits/chosen": -0.5716500878334045, + "logits/rejected": -0.4639313220977783, + "logps/chosen": -209.87631225585938, + "logps/rejected": -187.9324493408203, + "loss": 0.6631, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.7570088505744934, + "rewards/margins": 0.10784139484167099, + "rewards/rejected": 0.6491674184799194, + "step": 3170 + }, + { + "epoch": 1.001535372623125, + "grad_norm": 2.984375, + "learning_rate": 4.491112314075883e-06, + "logits/chosen": -0.4782622456550598, + "logits/rejected": -0.406808078289032, + "logps/chosen": -200.47714233398438, + "logps/rejected": -190.09597778320312, + "loss": 0.6675, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7260812520980835, + "rewards/margins": 0.08043357729911804, + "rewards/rejected": 0.6456476449966431, + "step": 3180 + }, + { + "epoch": 1.0046848549269713, + "grad_norm": 2.984375, + "learning_rate": 4.487920380793386e-06, + "logits/chosen": -0.5357908010482788, + "logits/rejected": -0.40378838777542114, + "logps/chosen": -195.0863494873047, + "logps/rejected": -157.54249572753906, + "loss": 0.6263, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7121897339820862, + "rewards/margins": 0.17122754454612732, + "rewards/rejected": 0.5409621596336365, + "step": 3190 + }, + { + "epoch": 1.0078343372308176, + "grad_norm": 2.8125, + "learning_rate": 4.4847196100799305e-06, + "logits/chosen": -0.5046489834785461, + "logits/rejected": -0.38591229915618896, + "logps/chosen": -206.5364227294922, + "logps/rejected": -177.38302612304688, + "loss": 0.6648, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.698761522769928, + "rewards/margins": 0.10818042606115341, + "rewards/rejected": 0.5905810594558716, + "step": 3200 + }, + { + "epoch": 1.010983819534664, + "grad_norm": 2.84375, + "learning_rate": 4.481510016164753e-06, + "logits/chosen": -0.4204939007759094, + "logits/rejected": -0.32506507635116577, + "logps/chosen": -186.22238159179688, + "logps/rejected": -157.1585235595703, + "loss": 0.6338, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.7055295705795288, + "rewards/margins": 0.16552086174488068, + "rewards/rejected": 0.5400087237358093, + "step": 3210 + }, + { + "epoch": 1.0141333018385104, + "grad_norm": 3.28125, + "learning_rate": 4.478291613316316e-06, + "logits/chosen": -0.526237428188324, + "logits/rejected": -0.5045801997184753, + "logps/chosen": -179.8779754638672, + "logps/rejected": -179.01283264160156, + "loss": 0.6786, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.6495916247367859, + "rewards/margins": 0.07142230123281479, + "rewards/rejected": 0.5781692862510681, + "step": 3220 + }, + { + "epoch": 1.0172827841423566, + "grad_norm": 2.4375, + "learning_rate": 4.47506441584224e-06, + "logits/chosen": -0.47785645723342896, + "logits/rejected": -0.37066784501075745, + "logps/chosen": -176.42306518554688, + "logps/rejected": -158.9258575439453, + "loss": 0.6722, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6187280416488647, + "rewards/margins": 0.08662554621696472, + "rewards/rejected": 0.5321024656295776, + "step": 3230 + }, + { + "epoch": 1.0204322664462029, + "grad_norm": 2.96875, + "learning_rate": 4.471828438089245e-06, + "logits/chosen": -0.5045980215072632, + "logits/rejected": -0.3146094083786011, + "logps/chosen": -192.99447631835938, + "logps/rejected": -160.2574462890625, + "loss": 0.6403, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6895080208778381, + "rewards/margins": 0.1523585319519043, + "rewards/rejected": 0.5371494293212891, + "step": 3240 + }, + { + "epoch": 1.0235817487500491, + "grad_norm": 2.609375, + "learning_rate": 4.4685836944430815e-06, + "logits/chosen": -0.45219331979751587, + "logits/rejected": -0.3365115821361542, + "logps/chosen": -212.85543823242188, + "logps/rejected": -181.6865692138672, + "loss": 0.6293, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7488403916358948, + "rewards/margins": 0.16951295733451843, + "rewards/rejected": 0.579327404499054, + "step": 3250 + }, + { + "epoch": 1.0267312310538954, + "grad_norm": 3.046875, + "learning_rate": 4.465330199328473e-06, + "logits/chosen": -0.46741557121276855, + "logits/rejected": -0.4280362129211426, + "logps/chosen": -201.90011596679688, + "logps/rejected": -174.54420471191406, + "loss": 0.6809, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.692274808883667, + "rewards/margins": 0.0669262483716011, + "rewards/rejected": 0.625348687171936, + "step": 3260 + }, + { + "epoch": 1.029880713357742, + "grad_norm": 2.53125, + "learning_rate": 4.462067967209045e-06, + "logits/chosen": -0.46599286794662476, + "logits/rejected": -0.40415382385253906, + "logps/chosen": -191.61109924316406, + "logps/rejected": -170.8994140625, + "loss": 0.637, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.694964587688446, + "rewards/margins": 0.15309689939022064, + "rewards/rejected": 0.541867733001709, + "step": 3270 + }, + { + "epoch": 1.0330301956615882, + "grad_norm": 2.46875, + "learning_rate": 4.458797012587266e-06, + "logits/chosen": -0.45959001779556274, + "logits/rejected": -0.3480526804924011, + "logps/chosen": -194.1427764892578, + "logps/rejected": -164.58615112304688, + "loss": 0.6511, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.6329681277275085, + "rewards/margins": 0.11931748688220978, + "rewards/rejected": 0.51365065574646, + "step": 3280 + }, + { + "epoch": 1.0361796779654344, + "grad_norm": 2.5, + "learning_rate": 4.455517350004379e-06, + "logits/chosen": -0.4861987233161926, + "logits/rejected": -0.3476967513561249, + "logps/chosen": -218.5071563720703, + "logps/rejected": -177.85232543945312, + "loss": 0.6175, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.8144248723983765, + "rewards/margins": 0.19891203939914703, + "rewards/rejected": 0.6155128479003906, + "step": 3290 + }, + { + "epoch": 1.0393291602692807, + "grad_norm": 2.078125, + "learning_rate": 4.452228994040341e-06, + "logits/chosen": -0.4951728284358978, + "logits/rejected": -0.3645640015602112, + "logps/chosen": -196.14259338378906, + "logps/rejected": -167.96095275878906, + "loss": 0.6494, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7171255946159363, + "rewards/margins": 0.12575490772724152, + "rewards/rejected": 0.591370701789856, + "step": 3300 + }, + { + "epoch": 1.042478642573127, + "grad_norm": 2.21875, + "learning_rate": 4.448931959313754e-06, + "logits/chosen": -0.4978647232055664, + "logits/rejected": -0.3358718454837799, + "logps/chosen": -200.82778930664062, + "logps/rejected": -170.21578979492188, + "loss": 0.6559, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6936062574386597, + "rewards/margins": 0.12191645056009293, + "rewards/rejected": 0.5716897249221802, + "step": 3310 + }, + { + "epoch": 1.0456281248769734, + "grad_norm": 3.0625, + "learning_rate": 4.4456262604818044e-06, + "logits/chosen": -0.5448322892189026, + "logits/rejected": -0.4078589379787445, + "logps/chosen": -202.68072509765625, + "logps/rejected": -172.15322875976562, + "loss": 0.65, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6884664297103882, + "rewards/margins": 0.12657414376735687, + "rewards/rejected": 0.5618923306465149, + "step": 3320 + }, + { + "epoch": 1.0487776071808197, + "grad_norm": 4.0, + "learning_rate": 4.442311912240194e-06, + "logits/chosen": -0.46120721101760864, + "logits/rejected": -0.31904077529907227, + "logps/chosen": -204.01699829101562, + "logps/rejected": -172.23001098632812, + "loss": 0.6343, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6871546506881714, + "rewards/margins": 0.15920737385749817, + "rewards/rejected": 0.5279473066329956, + "step": 3330 + }, + { + "epoch": 1.051927089484666, + "grad_norm": 4.0625, + "learning_rate": 4.438988929323075e-06, + "logits/chosen": -0.4875815808773041, + "logits/rejected": -0.37348097562789917, + "logps/chosen": -197.5282745361328, + "logps/rejected": -167.55035400390625, + "loss": 0.6422, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7295669317245483, + "rewards/margins": 0.15396803617477417, + "rewards/rejected": 0.5755988955497742, + "step": 3340 + }, + { + "epoch": 1.0550765717885122, + "grad_norm": 2.984375, + "learning_rate": 4.435657326502986e-06, + "logits/chosen": -0.49304842948913574, + "logits/rejected": -0.3635968565940857, + "logps/chosen": -194.97134399414062, + "logps/rejected": -163.18759155273438, + "loss": 0.615, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7523232698440552, + "rewards/margins": 0.2145862579345703, + "rewards/rejected": 0.5377371311187744, + "step": 3350 + }, + { + "epoch": 1.0582260540923585, + "grad_norm": 2.546875, + "learning_rate": 4.432317118590789e-06, + "logits/chosen": -0.48891234397888184, + "logits/rejected": -0.3675716519355774, + "logps/chosen": -206.11074829101562, + "logps/rejected": -187.20938110351562, + "loss": 0.6612, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6792635917663574, + "rewards/margins": 0.10087893903255463, + "rewards/rejected": 0.5783846974372864, + "step": 3360 + }, + { + "epoch": 1.061375536396205, + "grad_norm": 2.546875, + "learning_rate": 4.428968320435597e-06, + "logits/chosen": -0.44766363501548767, + "logits/rejected": -0.442352294921875, + "logps/chosen": -195.1834716796875, + "logps/rejected": -194.13803100585938, + "loss": 0.6908, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7168688178062439, + "rewards/margins": 0.04094035550951958, + "rewards/rejected": 0.6759284734725952, + "step": 3370 + }, + { + "epoch": 1.0645250187000512, + "grad_norm": 2.65625, + "learning_rate": 4.425610946924714e-06, + "logits/chosen": -0.5080258250236511, + "logits/rejected": -0.3179413378238678, + "logps/chosen": -209.2963409423828, + "logps/rejected": -164.95855712890625, + "loss": 0.6388, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.693896472454071, + "rewards/margins": 0.1626008152961731, + "rewards/rejected": 0.5312955975532532, + "step": 3380 + }, + { + "epoch": 1.0676745010038975, + "grad_norm": 2.6875, + "learning_rate": 4.422245012983563e-06, + "logits/chosen": -0.43999892473220825, + "logits/rejected": -0.34846392273902893, + "logps/chosen": -202.3621826171875, + "logps/rejected": -172.74575805664062, + "loss": 0.656, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7476739287376404, + "rewards/margins": 0.1216663271188736, + "rewards/rejected": 0.6260076761245728, + "step": 3390 + }, + { + "epoch": 1.0708239833077438, + "grad_norm": 2.875, + "learning_rate": 4.418870533575626e-06, + "logits/chosen": -0.4500795006752014, + "logits/rejected": -0.35547685623168945, + "logps/chosen": -194.93258666992188, + "logps/rejected": -169.08042907714844, + "loss": 0.6452, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6677473783493042, + "rewards/margins": 0.13091039657592773, + "rewards/rejected": 0.5368369221687317, + "step": 3400 + }, + { + "epoch": 1.07397346561159, + "grad_norm": 2.21875, + "learning_rate": 4.4154875237023725e-06, + "logits/chosen": -0.5026477575302124, + "logits/rejected": -0.42180952429771423, + "logps/chosen": -190.4978790283203, + "logps/rejected": -170.85133361816406, + "loss": 0.6539, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6496149301528931, + "rewards/margins": 0.1192195862531662, + "rewards/rejected": 0.5303953289985657, + "step": 3410 + }, + { + "epoch": 1.0771229479154365, + "grad_norm": 2.546875, + "learning_rate": 4.412095998403198e-06, + "logits/chosen": -0.467219740152359, + "logits/rejected": -0.36950141191482544, + "logps/chosen": -184.17282104492188, + "logps/rejected": -159.8118133544922, + "loss": 0.6296, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6559081673622131, + "rewards/margins": 0.16591888666152954, + "rewards/rejected": 0.4899892210960388, + "step": 3420 + }, + { + "epoch": 1.0802724302192828, + "grad_norm": 2.0, + "learning_rate": 4.4086959727553484e-06, + "logits/chosen": -0.43744510412216187, + "logits/rejected": -0.4274655282497406, + "logps/chosen": -184.89468383789062, + "logps/rejected": -174.7428436279297, + "loss": 0.6674, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6194896101951599, + "rewards/margins": 0.08610032498836517, + "rewards/rejected": 0.5333893299102783, + "step": 3430 + }, + { + "epoch": 1.083421912523129, + "grad_norm": 3.078125, + "learning_rate": 4.4052874618738645e-06, + "logits/chosen": -0.42463669180870056, + "logits/rejected": -0.32110992074012756, + "logps/chosen": -198.18606567382812, + "logps/rejected": -166.31546020507812, + "loss": 0.6371, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7485095262527466, + "rewards/margins": 0.16625897586345673, + "rewards/rejected": 0.5822504758834839, + "step": 3440 + }, + { + "epoch": 1.0865713948269753, + "grad_norm": 2.671875, + "learning_rate": 4.401870480911505e-06, + "logits/chosen": -0.5295459032058716, + "logits/rejected": -0.3508453369140625, + "logps/chosen": -202.53451538085938, + "logps/rejected": -164.10360717773438, + "loss": 0.6324, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7244966626167297, + "rewards/margins": 0.17259523272514343, + "rewards/rejected": 0.5519014000892639, + "step": 3450 + }, + { + "epoch": 1.0897208771308216, + "grad_norm": 2.171875, + "learning_rate": 4.398445045058682e-06, + "logits/chosen": -0.39355480670928955, + "logits/rejected": -0.3064250349998474, + "logps/chosen": -198.85391235351562, + "logps/rejected": -193.22476196289062, + "loss": 0.6711, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7315341234207153, + "rewards/margins": 0.08587940782308578, + "rewards/rejected": 0.6456546783447266, + "step": 3460 + }, + { + "epoch": 1.0928703594346678, + "grad_norm": 2.640625, + "learning_rate": 4.395011169543398e-06, + "logits/chosen": -0.5193358659744263, + "logits/rejected": -0.43463462591171265, + "logps/chosen": -221.0367431640625, + "logps/rejected": -189.76303100585938, + "loss": 0.6429, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.7745602130889893, + "rewards/margins": 0.15965554118156433, + "rewards/rejected": 0.6149047017097473, + "step": 3470 + }, + { + "epoch": 1.0960198417385143, + "grad_norm": 2.5, + "learning_rate": 4.3915688696311734e-06, + "logits/chosen": -0.47026944160461426, + "logits/rejected": -0.3491131067276001, + "logps/chosen": -193.53817749023438, + "logps/rejected": -180.67047119140625, + "loss": 0.6465, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7157790660858154, + "rewards/margins": 0.1319868266582489, + "rewards/rejected": 0.5837923288345337, + "step": 3480 + }, + { + "epoch": 1.0991693240423606, + "grad_norm": 2.78125, + "learning_rate": 4.3881181606249775e-06, + "logits/chosen": -0.4417082369327545, + "logits/rejected": -0.33755919337272644, + "logps/chosen": -205.1912841796875, + "logps/rejected": -168.96607971191406, + "loss": 0.6527, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7606759071350098, + "rewards/margins": 0.125040203332901, + "rewards/rejected": 0.6356357336044312, + "step": 3490 + }, + { + "epoch": 1.1023188063462068, + "grad_norm": 3.5, + "learning_rate": 4.384659057865165e-06, + "logits/chosen": -0.4954513609409332, + "logits/rejected": -0.41869044303894043, + "logps/chosen": -207.7197265625, + "logps/rejected": -187.3145751953125, + "loss": 0.6409, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7566835880279541, + "rewards/margins": 0.13307683169841766, + "rewards/rejected": 0.6236067414283752, + "step": 3500 + }, + { + "epoch": 1.105468288650053, + "grad_norm": 3.09375, + "learning_rate": 4.381191576729404e-06, + "logits/chosen": -0.465837299823761, + "logits/rejected": -0.3935115933418274, + "logps/chosen": -189.66842651367188, + "logps/rejected": -181.11953735351562, + "loss": 0.67, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6702840328216553, + "rewards/margins": 0.08853673934936523, + "rewards/rejected": 0.58174729347229, + "step": 3510 + }, + { + "epoch": 1.1086177709538994, + "grad_norm": 2.546875, + "learning_rate": 4.377715732632613e-06, + "logits/chosen": -0.4160510003566742, + "logits/rejected": -0.34267672896385193, + "logps/chosen": -190.46127319335938, + "logps/rejected": -164.63645935058594, + "loss": 0.6486, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7083535194396973, + "rewards/margins": 0.13883116841316223, + "rewards/rejected": 0.5695223808288574, + "step": 3520 + }, + { + "epoch": 1.1117672532577458, + "grad_norm": 2.90625, + "learning_rate": 4.374231541026883e-06, + "logits/chosen": -0.5327466130256653, + "logits/rejected": -0.38102999329566956, + "logps/chosen": -208.6250457763672, + "logps/rejected": -187.94937133789062, + "loss": 0.6461, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7454079985618591, + "rewards/margins": 0.1365070343017578, + "rewards/rejected": 0.6089010238647461, + "step": 3530 + }, + { + "epoch": 1.114916735561592, + "grad_norm": 3.703125, + "learning_rate": 4.370739017401417e-06, + "logits/chosen": -0.5027688145637512, + "logits/rejected": -0.4032517373561859, + "logps/chosen": -195.99122619628906, + "logps/rejected": -161.76515197753906, + "loss": 0.6612, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7000411152839661, + "rewards/margins": 0.11141884326934814, + "rewards/rejected": 0.5886222720146179, + "step": 3540 + }, + { + "epoch": 1.1180662178654384, + "grad_norm": 4.1875, + "learning_rate": 4.367238177282462e-06, + "logits/chosen": -0.4465219974517822, + "logits/rejected": -0.3524473309516907, + "logps/chosen": -191.51556396484375, + "logps/rejected": -171.40370178222656, + "loss": 0.6437, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.6995719075202942, + "rewards/margins": 0.14049354195594788, + "rewards/rejected": 0.5590783953666687, + "step": 3550 + }, + { + "epoch": 1.1212157001692846, + "grad_norm": 2.53125, + "learning_rate": 4.363729036233231e-06, + "logits/chosen": -0.440678209066391, + "logits/rejected": -0.35682061314582825, + "logps/chosen": -233.2520751953125, + "logps/rejected": -197.86093139648438, + "loss": 0.6151, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.7932190895080566, + "rewards/margins": 0.20204809308052063, + "rewards/rejected": 0.5911709666252136, + "step": 3560 + }, + { + "epoch": 1.1243651824731309, + "grad_norm": 2.90625, + "learning_rate": 4.360211609853841e-06, + "logits/chosen": -0.46588149666786194, + "logits/rejected": -0.3290489614009857, + "logps/chosen": -189.09005737304688, + "logps/rejected": -161.47216796875, + "loss": 0.6275, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7199033498764038, + "rewards/margins": 0.18104612827301025, + "rewards/rejected": 0.5388572216033936, + "step": 3570 + }, + { + "epoch": 1.1275146647769774, + "grad_norm": 3.53125, + "learning_rate": 4.356685913781243e-06, + "logits/chosen": -0.4571777284145355, + "logits/rejected": -0.2775833010673523, + "logps/chosen": -212.7926788330078, + "logps/rejected": -178.75543212890625, + "loss": 0.6815, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7048720717430115, + "rewards/margins": 0.06307940185070038, + "rewards/rejected": 0.6417926549911499, + "step": 3580 + }, + { + "epoch": 1.1306641470808236, + "grad_norm": 1.96875, + "learning_rate": 4.353151963689153e-06, + "logits/chosen": -0.4465222954750061, + "logits/rejected": -0.33635538816452026, + "logps/chosen": -193.8217315673828, + "logps/rejected": -170.3684844970703, + "loss": 0.656, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6877092719078064, + "rewards/margins": 0.12182395160198212, + "rewards/rejected": 0.5658854246139526, + "step": 3590 + }, + { + "epoch": 1.13381362938467, + "grad_norm": 2.4375, + "learning_rate": 4.349609775287977e-06, + "logits/chosen": -0.45926880836486816, + "logits/rejected": -0.3848228454589844, + "logps/chosen": -188.42308044433594, + "logps/rejected": -178.51895141601562, + "loss": 0.6831, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.625271201133728, + "rewards/margins": 0.08226005733013153, + "rewards/rejected": 0.5430110692977905, + "step": 3600 + }, + { + "epoch": 1.1369631116885162, + "grad_norm": 2.84375, + "learning_rate": 4.346059364324747e-06, + "logits/chosen": -0.5314878821372986, + "logits/rejected": -0.4037748873233795, + "logps/chosen": -198.21347045898438, + "logps/rejected": -172.4250946044922, + "loss": 0.6247, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7435242533683777, + "rewards/margins": 0.1861993968486786, + "rewards/rejected": 0.5573248267173767, + "step": 3610 + }, + { + "epoch": 1.1401125939923624, + "grad_norm": 3.0, + "learning_rate": 4.342500746583049e-06, + "logits/chosen": -0.4643464982509613, + "logits/rejected": -0.39244017004966736, + "logps/chosen": -199.8033447265625, + "logps/rejected": -178.8501739501953, + "loss": 0.6568, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.7149983644485474, + "rewards/margins": 0.11290229856967926, + "rewards/rejected": 0.6020959615707397, + "step": 3620 + }, + { + "epoch": 1.143262076296209, + "grad_norm": 2.171875, + "learning_rate": 4.338933937882952e-06, + "logits/chosen": -0.48191675543785095, + "logits/rejected": -0.34128910303115845, + "logps/chosen": -192.5113983154297, + "logps/rejected": -171.77792358398438, + "loss": 0.6556, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7115253210067749, + "rewards/margins": 0.11348304897546768, + "rewards/rejected": 0.5980421900749207, + "step": 3630 + }, + { + "epoch": 1.1464115586000552, + "grad_norm": 3.046875, + "learning_rate": 4.335358954080939e-06, + "logits/chosen": -0.5026925802230835, + "logits/rejected": -0.3849731683731079, + "logps/chosen": -195.01315307617188, + "logps/rejected": -153.3354949951172, + "loss": 0.6349, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.664111316204071, + "rewards/margins": 0.1614895761013031, + "rewards/rejected": 0.5026217699050903, + "step": 3640 + }, + { + "epoch": 1.1495610409039014, + "grad_norm": 2.421875, + "learning_rate": 4.331775811069837e-06, + "logits/chosen": -0.3999708294868469, + "logits/rejected": -0.3556813895702362, + "logps/chosen": -193.3236083984375, + "logps/rejected": -170.455810546875, + "loss": 0.6737, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.6599856019020081, + "rewards/margins": 0.08706656098365784, + "rewards/rejected": 0.5729190111160278, + "step": 3650 + }, + { + "epoch": 1.1527105232077477, + "grad_norm": 3.34375, + "learning_rate": 4.328184524778743e-06, + "logits/chosen": -0.5279411673545837, + "logits/rejected": -0.4057609438896179, + "logps/chosen": -195.410400390625, + "logps/rejected": -158.7968292236328, + "loss": 0.6285, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.6674174070358276, + "rewards/margins": 0.166324645280838, + "rewards/rejected": 0.501092791557312, + "step": 3660 + }, + { + "epoch": 1.155860005511594, + "grad_norm": 3.28125, + "learning_rate": 4.324585111172959e-06, + "logits/chosen": -0.46349841356277466, + "logits/rejected": -0.3997943699359894, + "logps/chosen": -198.0673370361328, + "logps/rejected": -179.5006103515625, + "loss": 0.6457, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.7249336242675781, + "rewards/margins": 0.15420496463775635, + "rewards/rejected": 0.5707286596298218, + "step": 3670 + }, + { + "epoch": 1.1590094878154402, + "grad_norm": 2.0, + "learning_rate": 4.320977586253911e-06, + "logits/chosen": -0.4722752571105957, + "logits/rejected": -0.3240264058113098, + "logps/chosen": -202.7415771484375, + "logps/rejected": -171.6900177001953, + "loss": 0.6431, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.669415295124054, + "rewards/margins": 0.1358814686536789, + "rewards/rejected": 0.5335337519645691, + "step": 3680 + }, + { + "epoch": 1.1621589701192867, + "grad_norm": 2.4375, + "learning_rate": 4.317361966059092e-06, + "logits/chosen": -0.4404812753200531, + "logits/rejected": -0.32088351249694824, + "logps/chosen": -207.16323852539062, + "logps/rejected": -176.3754119873047, + "loss": 0.6309, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6894677877426147, + "rewards/margins": 0.16316859424114227, + "rewards/rejected": 0.5262991786003113, + "step": 3690 + }, + { + "epoch": 1.165308452423133, + "grad_norm": 6.875, + "learning_rate": 4.313738266661979e-06, + "logits/chosen": -0.4920511841773987, + "logits/rejected": -0.34299007058143616, + "logps/chosen": -201.88381958007812, + "logps/rejected": -172.03121948242188, + "loss": 0.6224, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.7374047636985779, + "rewards/margins": 0.18349048495292664, + "rewards/rejected": 0.5539143681526184, + "step": 3700 + }, + { + "epoch": 1.1684579347269792, + "grad_norm": 3.0, + "learning_rate": 4.310106504171966e-06, + "logits/chosen": -0.45534056425094604, + "logits/rejected": -0.35874494910240173, + "logps/chosen": -190.65248107910156, + "logps/rejected": -158.1346893310547, + "loss": 0.6446, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6649999618530273, + "rewards/margins": 0.14345432817935944, + "rewards/rejected": 0.5215457081794739, + "step": 3710 + }, + { + "epoch": 1.1716074170308255, + "grad_norm": 3.28125, + "learning_rate": 4.306466694734292e-06, + "logits/chosen": -0.4667026102542877, + "logits/rejected": -0.33577385544776917, + "logps/chosen": -194.20590209960938, + "logps/rejected": -177.63400268554688, + "loss": 0.6922, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6832542419433594, + "rewards/margins": 0.034148164093494415, + "rewards/rejected": 0.6491062045097351, + "step": 3720 + }, + { + "epoch": 1.174756899334672, + "grad_norm": 2.59375, + "learning_rate": 4.302818854529969e-06, + "logits/chosen": -0.5350615382194519, + "logits/rejected": -0.42588871717453003, + "logps/chosen": -198.84042358398438, + "logps/rejected": -164.7383270263672, + "loss": 0.6171, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.7313185334205627, + "rewards/margins": 0.19767650961875916, + "rewards/rejected": 0.5336421132087708, + "step": 3730 + }, + { + "epoch": 1.1779063816385182, + "grad_norm": 4.53125, + "learning_rate": 4.299162999775712e-06, + "logits/chosen": -0.40838590264320374, + "logits/rejected": -0.37127965688705444, + "logps/chosen": -196.0567169189453, + "logps/rejected": -196.9461669921875, + "loss": 0.69, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7238589525222778, + "rewards/margins": 0.06422169506549835, + "rewards/rejected": 0.6596371531486511, + "step": 3740 + }, + { + "epoch": 1.1810558639423645, + "grad_norm": 2.5625, + "learning_rate": 4.295499146723864e-06, + "logits/chosen": -0.5344425439834595, + "logits/rejected": -0.35419797897338867, + "logps/chosen": -202.3170623779297, + "logps/rejected": -161.15969848632812, + "loss": 0.6066, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7712113261222839, + "rewards/margins": 0.229976087808609, + "rewards/rejected": 0.5412352681159973, + "step": 3750 + }, + { + "epoch": 1.1842053462462108, + "grad_norm": 2.453125, + "learning_rate": 4.2918273116623245e-06, + "logits/chosen": -0.39341261982917786, + "logits/rejected": -0.3262158930301666, + "logps/chosen": -178.946533203125, + "logps/rejected": -171.49209594726562, + "loss": 0.6667, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6648285388946533, + "rewards/margins": 0.08790337294340134, + "rewards/rejected": 0.5769251585006714, + "step": 3760 + }, + { + "epoch": 1.187354828550057, + "grad_norm": 2.359375, + "learning_rate": 4.288147510914477e-06, + "logits/chosen": -0.46092867851257324, + "logits/rejected": -0.44449153542518616, + "logps/chosen": -199.6840362548828, + "logps/rejected": -191.94036865234375, + "loss": 0.6985, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.6821948885917664, + "rewards/margins": 0.02965412102639675, + "rewards/rejected": 0.6525408029556274, + "step": 3770 + }, + { + "epoch": 1.1905043108539033, + "grad_norm": 2.453125, + "learning_rate": 4.284459760839122e-06, + "logits/chosen": -0.5228853225708008, + "logits/rejected": -0.3966136574745178, + "logps/chosen": -197.607177734375, + "logps/rejected": -169.21734619140625, + "loss": 0.6785, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.6453790664672852, + "rewards/margins": 0.06496746838092804, + "rewards/rejected": 0.5804116129875183, + "step": 3780 + }, + { + "epoch": 1.1936537931577498, + "grad_norm": 2.015625, + "learning_rate": 4.28076407783039e-06, + "logits/chosen": -0.4347180724143982, + "logits/rejected": -0.3091747760772705, + "logps/chosen": -183.70968627929688, + "logps/rejected": -151.46176147460938, + "loss": 0.5928, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.7321251034736633, + "rewards/margins": 0.25292515754699707, + "rewards/rejected": 0.4791998863220215, + "step": 3790 + }, + { + "epoch": 1.196803275461596, + "grad_norm": 2.375, + "learning_rate": 4.277060478317687e-06, + "logits/chosen": -0.5005819797515869, + "logits/rejected": -0.36348724365234375, + "logps/chosen": -205.44021606445312, + "logps/rejected": -170.33944702148438, + "loss": 0.6353, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.6576865911483765, + "rewards/margins": 0.1621154546737671, + "rewards/rejected": 0.49557122588157654, + "step": 3800 + }, + { + "epoch": 1.1999527577654423, + "grad_norm": 2.9375, + "learning_rate": 4.2733489787656075e-06, + "logits/chosen": -0.4653560221195221, + "logits/rejected": -0.3435734808444977, + "logps/chosen": -185.8004913330078, + "logps/rejected": -159.63597106933594, + "loss": 0.6312, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6951313018798828, + "rewards/margins": 0.1663820743560791, + "rewards/rejected": 0.5287492871284485, + "step": 3810 + }, + { + "epoch": 1.2031022400692886, + "grad_norm": 3.03125, + "learning_rate": 4.269629595673867e-06, + "logits/chosen": -0.478889524936676, + "logits/rejected": -0.4129001200199127, + "logps/chosen": -221.73828125, + "logps/rejected": -191.03372192382812, + "loss": 0.6142, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.8134725689888, + "rewards/margins": 0.20523671805858612, + "rewards/rejected": 0.6082358360290527, + "step": 3820 + }, + { + "epoch": 1.2062517223731348, + "grad_norm": 2.515625, + "learning_rate": 4.265902345577227e-06, + "logits/chosen": -0.4593692421913147, + "logits/rejected": -0.3384065628051758, + "logps/chosen": -186.61167907714844, + "logps/rejected": -161.26182556152344, + "loss": 0.6454, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.660686731338501, + "rewards/margins": 0.13643305003643036, + "rewards/rejected": 0.5242536664009094, + "step": 3830 + }, + { + "epoch": 1.2094012046769813, + "grad_norm": 2.46875, + "learning_rate": 4.262167245045424e-06, + "logits/chosen": -0.503960132598877, + "logits/rejected": -0.3568868339061737, + "logps/chosen": -189.79818725585938, + "logps/rejected": -160.52206420898438, + "loss": 0.6303, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7322565317153931, + "rewards/margins": 0.18888680636882782, + "rewards/rejected": 0.5433696508407593, + "step": 3840 + }, + { + "epoch": 1.2125506869808276, + "grad_norm": 3.015625, + "learning_rate": 4.258424310683094e-06, + "logits/chosen": -0.5178459882736206, + "logits/rejected": -0.4422483444213867, + "logps/chosen": -202.4190216064453, + "logps/rejected": -191.25418090820312, + "loss": 0.6634, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6625019311904907, + "rewards/margins": 0.09516273438930511, + "rewards/rejected": 0.5673390626907349, + "step": 3850 + }, + { + "epoch": 1.2157001692846738, + "grad_norm": 2.4375, + "learning_rate": 4.254673559129698e-06, + "logits/chosen": -0.4428611695766449, + "logits/rejected": -0.3638356328010559, + "logps/chosen": -177.161376953125, + "logps/rejected": -158.397216796875, + "loss": 0.6791, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.660545825958252, + "rewards/margins": 0.06867508590221405, + "rewards/rejected": 0.5918707251548767, + "step": 3860 + }, + { + "epoch": 1.21884965158852, + "grad_norm": 2.484375, + "learning_rate": 4.250915007059448e-06, + "logits/chosen": -0.4426344037055969, + "logits/rejected": -0.39190584421157837, + "logps/chosen": -182.5616455078125, + "logps/rejected": -166.43875122070312, + "loss": 0.6467, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6761131286621094, + "rewards/margins": 0.147123783826828, + "rewards/rejected": 0.5289894342422485, + "step": 3870 + }, + { + "epoch": 1.2219991338923664, + "grad_norm": 2.546875, + "learning_rate": 4.247148671181237e-06, + "logits/chosen": -0.5092406868934631, + "logits/rejected": -0.41020458936691284, + "logps/chosen": -190.69708251953125, + "logps/rejected": -165.26943969726562, + "loss": 0.6436, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.7014779448509216, + "rewards/margins": 0.13978223502635956, + "rewards/rejected": 0.5616958141326904, + "step": 3880 + }, + { + "epoch": 1.2251486161962126, + "grad_norm": 2.0625, + "learning_rate": 4.243374568238556e-06, + "logits/chosen": -0.49269500374794006, + "logits/rejected": -0.34552276134490967, + "logps/chosen": -197.5285186767578, + "logps/rejected": -163.1229248046875, + "loss": 0.6266, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.686238706111908, + "rewards/margins": 0.17949660122394562, + "rewards/rejected": 0.5067421197891235, + "step": 3890 + }, + { + "epoch": 1.228298098500059, + "grad_norm": 2.453125, + "learning_rate": 4.23959271500943e-06, + "logits/chosen": -0.4887842535972595, + "logits/rejected": -0.32892632484436035, + "logps/chosen": -213.9662322998047, + "logps/rejected": -175.28543090820312, + "loss": 0.6495, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7332077026367188, + "rewards/margins": 0.14412468671798706, + "rewards/rejected": 0.5890830755233765, + "step": 3900 + }, + { + "epoch": 1.2314475808039054, + "grad_norm": 3.515625, + "learning_rate": 4.235803128306337e-06, + "logits/chosen": -0.4606234133243561, + "logits/rejected": -0.3164953291416168, + "logps/chosen": -217.72933959960938, + "logps/rejected": -188.64517211914062, + "loss": 0.6548, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7374159693717957, + "rewards/margins": 0.12077488750219345, + "rewards/rejected": 0.6166411638259888, + "step": 3910 + }, + { + "epoch": 1.2345970631077516, + "grad_norm": 2.78125, + "learning_rate": 4.232005824976133e-06, + "logits/chosen": -0.4276387095451355, + "logits/rejected": -0.34980887174606323, + "logps/chosen": -212.1448516845703, + "logps/rejected": -174.04776000976562, + "loss": 0.6255, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7644809484481812, + "rewards/margins": 0.19336572289466858, + "rewards/rejected": 0.5711151361465454, + "step": 3920 + }, + { + "epoch": 1.237746545411598, + "grad_norm": 2.265625, + "learning_rate": 4.22820082189998e-06, + "logits/chosen": -0.4039751887321472, + "logits/rejected": -0.44209712743759155, + "logps/chosen": -198.69827270507812, + "logps/rejected": -201.2398681640625, + "loss": 0.6825, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6999807357788086, + "rewards/margins": 0.07114030420780182, + "rewards/rejected": 0.6288403868675232, + "step": 3930 + }, + { + "epoch": 1.2408960277154444, + "grad_norm": 3.25, + "learning_rate": 4.224388135993271e-06, + "logits/chosen": -0.4334556460380554, + "logits/rejected": -0.30388978123664856, + "logps/chosen": -206.3880157470703, + "logps/rejected": -177.33868408203125, + "loss": 0.6462, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.745646595954895, + "rewards/margins": 0.1241118311882019, + "rewards/rejected": 0.6215347051620483, + "step": 3940 + }, + { + "epoch": 1.2440455100192906, + "grad_norm": 3.5, + "learning_rate": 4.220567784205551e-06, + "logits/chosen": -0.38715043663978577, + "logits/rejected": -0.2959139049053192, + "logps/chosen": -204.772216796875, + "logps/rejected": -177.6510772705078, + "loss": 0.649, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7557042837142944, + "rewards/margins": 0.1426253467798233, + "rewards/rejected": 0.6130789518356323, + "step": 3950 + }, + { + "epoch": 1.247194992323137, + "grad_norm": 2.765625, + "learning_rate": 4.216739783520447e-06, + "logits/chosen": -0.44497212767601013, + "logits/rejected": -0.3760547935962677, + "logps/chosen": -193.57406616210938, + "logps/rejected": -169.6040802001953, + "loss": 0.6622, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6512136459350586, + "rewards/margins": 0.10468528419733047, + "rewards/rejected": 0.5465283989906311, + "step": 3960 + }, + { + "epoch": 1.2503444746269832, + "grad_norm": 2.734375, + "learning_rate": 4.212904150955587e-06, + "logits/chosen": -0.512496292591095, + "logits/rejected": -0.40937352180480957, + "logps/chosen": -215.65908813476562, + "logps/rejected": -184.07162475585938, + "loss": 0.6385, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.8019892573356628, + "rewards/margins": 0.16033987700939178, + "rewards/rejected": 0.6416494250297546, + "step": 3970 + }, + { + "epoch": 1.2534939569308294, + "grad_norm": 3.03125, + "learning_rate": 4.209060903562528e-06, + "logits/chosen": -0.4832407832145691, + "logits/rejected": -0.37063470482826233, + "logps/chosen": -211.16928100585938, + "logps/rejected": -176.49264526367188, + "loss": 0.6414, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7144185900688171, + "rewards/margins": 0.13941380381584167, + "rewards/rejected": 0.5750047564506531, + "step": 3980 + }, + { + "epoch": 1.2566434392346757, + "grad_norm": 2.703125, + "learning_rate": 4.20521005842668e-06, + "logits/chosen": -0.4679936468601227, + "logits/rejected": -0.320305734872818, + "logps/chosen": -198.4337615966797, + "logps/rejected": -156.4415283203125, + "loss": 0.615, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.717679500579834, + "rewards/margins": 0.20881009101867676, + "rewards/rejected": 0.5088694095611572, + "step": 3990 + }, + { + "epoch": 1.2597929215385222, + "grad_norm": 2.984375, + "learning_rate": 4.201351632667227e-06, + "logits/chosen": -0.4357661306858063, + "logits/rejected": -0.33136463165283203, + "logps/chosen": -215.5205535888672, + "logps/rejected": -194.84225463867188, + "loss": 0.6584, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7862467169761658, + "rewards/margins": 0.1381584107875824, + "rewards/rejected": 0.6480883359909058, + "step": 4000 + }, + { + "epoch": 1.2597929215385222, + "eval_logits/chosen": -0.6012239456176758, + "eval_logits/rejected": -0.4762186110019684, + "eval_logps/chosen": -243.07638549804688, + "eval_logps/rejected": -222.3102569580078, + "eval_loss": 0.6651261448860168, + "eval_rewards/accuracies": 0.5946148037910461, + "eval_rewards/chosen": 0.8024876117706299, + "eval_rewards/margins": 0.10073534399271011, + "eval_rewards/rejected": 0.701752245426178, + "eval_runtime": 3657.1235, + "eval_samples_per_second": 0.366, + "eval_steps_per_second": 0.366, + "step": 4000 + }, + { + "epoch": 1.2629424038423684, + "grad_norm": 2.78125, + "learning_rate": 4.197485643437058e-06, + "logits/chosen": -0.45790332555770874, + "logits/rejected": -0.3080021142959595, + "logps/chosen": -194.04660034179688, + "logps/rejected": -152.27664184570312, + "loss": 0.6051, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7683263421058655, + "rewards/margins": 0.22982630133628845, + "rewards/rejected": 0.5384999513626099, + "step": 4010 + }, + { + "epoch": 1.2660918861462147, + "grad_norm": 3.765625, + "learning_rate": 4.19361210792268e-06, + "logits/chosen": -0.45886415243148804, + "logits/rejected": -0.36696118116378784, + "logps/chosen": -197.91018676757812, + "logps/rejected": -165.89205932617188, + "loss": 0.6525, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.649300217628479, + "rewards/margins": 0.12859824299812317, + "rewards/rejected": 0.5207020044326782, + "step": 4020 + }, + { + "epoch": 1.269241368450061, + "grad_norm": 2.0625, + "learning_rate": 4.189731043344151e-06, + "logits/chosen": -0.5488325953483582, + "logits/rejected": -0.3945949673652649, + "logps/chosen": -198.27198791503906, + "logps/rejected": -160.57528686523438, + "loss": 0.6517, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.6978427767753601, + "rewards/margins": 0.1346282958984375, + "rewards/rejected": 0.5632144808769226, + "step": 4030 + }, + { + "epoch": 1.2723908507539075, + "grad_norm": 2.59375, + "learning_rate": 4.185842466954998e-06, + "logits/chosen": -0.5013604760169983, + "logits/rejected": -0.40894627571105957, + "logps/chosen": -192.42051696777344, + "logps/rejected": -167.13221740722656, + "loss": 0.6322, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6890965700149536, + "rewards/margins": 0.17353561520576477, + "rewards/rejected": 0.5155609846115112, + "step": 4040 + }, + { + "epoch": 1.2755403330577537, + "grad_norm": 3.265625, + "learning_rate": 4.181946396042146e-06, + "logits/chosen": -0.46542900800704956, + "logits/rejected": -0.3752228319644928, + "logps/chosen": -204.59176635742188, + "logps/rejected": -184.3190155029297, + "loss": 0.646, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.712192952632904, + "rewards/margins": 0.12876209616661072, + "rewards/rejected": 0.5834308862686157, + "step": 4050 + }, + { + "epoch": 1.2786898153616, + "grad_norm": 3.28125, + "learning_rate": 4.178042847925833e-06, + "logits/chosen": -0.4636574387550354, + "logits/rejected": -0.4096450209617615, + "logps/chosen": -207.9044952392578, + "logps/rejected": -191.27566528320312, + "loss": 0.6599, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7198086380958557, + "rewards/margins": 0.10640084743499756, + "rewards/rejected": 0.6134077906608582, + "step": 4060 + }, + { + "epoch": 1.2818392976654462, + "grad_norm": 2.609375, + "learning_rate": 4.174131839959539e-06, + "logits/chosen": -0.5049449801445007, + "logits/rejected": -0.37073415517807007, + "logps/chosen": -201.08135986328125, + "logps/rejected": -178.99465942382812, + "loss": 0.6637, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7069438099861145, + "rewards/margins": 0.11059415340423584, + "rewards/rejected": 0.5963497161865234, + "step": 4070 + }, + { + "epoch": 1.2849887799692925, + "grad_norm": 2.421875, + "learning_rate": 4.170213389529908e-06, + "logits/chosen": -0.4413929879665375, + "logits/rejected": -0.37840536236763, + "logps/chosen": -186.3639678955078, + "logps/rejected": -180.01528930664062, + "loss": 0.657, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.664527416229248, + "rewards/margins": 0.11193283647298813, + "rewards/rejected": 0.5525946617126465, + "step": 4080 + }, + { + "epoch": 1.2881382622731388, + "grad_norm": 2.359375, + "learning_rate": 4.16628751405667e-06, + "logits/chosen": -0.4986829161643982, + "logits/rejected": -0.34226298332214355, + "logps/chosen": -184.44613647460938, + "logps/rejected": -149.0967254638672, + "loss": 0.6311, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6766608953475952, + "rewards/margins": 0.16855625808238983, + "rewards/rejected": 0.508104681968689, + "step": 4090 + }, + { + "epoch": 1.291287744576985, + "grad_norm": 2.375, + "learning_rate": 4.162354230992562e-06, + "logits/chosen": -0.5013774633407593, + "logits/rejected": -0.4048996865749359, + "logps/chosen": -198.087646484375, + "logps/rejected": -170.59884643554688, + "loss": 0.6539, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7187263369560242, + "rewards/margins": 0.12955203652381897, + "rewards/rejected": 0.5891742706298828, + "step": 4100 + }, + { + "epoch": 1.2944372268808315, + "grad_norm": 2.625, + "learning_rate": 4.158413557823253e-06, + "logits/chosen": -0.5318306684494019, + "logits/rejected": -0.3859057128429413, + "logps/chosen": -198.9587860107422, + "logps/rejected": -168.4477081298828, + "loss": 0.638, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7599405646324158, + "rewards/margins": 0.16434063017368317, + "rewards/rejected": 0.5955999493598938, + "step": 4110 + }, + { + "epoch": 1.2975867091846778, + "grad_norm": 2.875, + "learning_rate": 4.154465512067266e-06, + "logits/chosen": -0.4665352702140808, + "logits/rejected": -0.3361702561378479, + "logps/chosen": -219.12088012695312, + "logps/rejected": -174.34353637695312, + "loss": 0.6203, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7928240895271301, + "rewards/margins": 0.19742821156978607, + "rewards/rejected": 0.5953959226608276, + "step": 4120 + }, + { + "epoch": 1.300736191488524, + "grad_norm": 2.59375, + "learning_rate": 4.1505101112758975e-06, + "logits/chosen": -0.5339746475219727, + "logits/rejected": -0.45769819617271423, + "logps/chosen": -180.9729766845703, + "logps/rejected": -161.6527557373047, + "loss": 0.666, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6436579823493958, + "rewards/margins": 0.08750168979167938, + "rewards/rejected": 0.5561562180519104, + "step": 4130 + }, + { + "epoch": 1.3038856737923703, + "grad_norm": 2.625, + "learning_rate": 4.146547373033142e-06, + "logits/chosen": -0.44753965735435486, + "logits/rejected": -0.33734267950057983, + "logps/chosen": -184.26077270507812, + "logps/rejected": -155.27151489257812, + "loss": 0.6196, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7223949432373047, + "rewards/margins": 0.19382601976394653, + "rewards/rejected": 0.5285689234733582, + "step": 4140 + }, + { + "epoch": 1.3070351560962168, + "grad_norm": 2.265625, + "learning_rate": 4.142577314955614e-06, + "logits/chosen": -0.4315119683742523, + "logits/rejected": -0.27412423491477966, + "logps/chosen": -203.50393676757812, + "logps/rejected": -160.0498504638672, + "loss": 0.6151, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7396818995475769, + "rewards/margins": 0.2151108682155609, + "rewards/rejected": 0.5245710015296936, + "step": 4150 + }, + { + "epoch": 1.310184638400063, + "grad_norm": 2.921875, + "learning_rate": 4.138599954692467e-06, + "logits/chosen": -0.4760667681694031, + "logits/rejected": -0.4465053677558899, + "logps/chosen": -204.943359375, + "logps/rejected": -194.9350128173828, + "loss": 0.6528, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7579629421234131, + "rewards/margins": 0.136540025472641, + "rewards/rejected": 0.6214228868484497, + "step": 4160 + }, + { + "epoch": 1.3133341207039093, + "grad_norm": 2.78125, + "learning_rate": 4.13461530992532e-06, + "logits/chosen": -0.5714303255081177, + "logits/rejected": -0.39394524693489075, + "logps/chosen": -186.2847137451172, + "logps/rejected": -156.30838012695312, + "loss": 0.6444, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6628353595733643, + "rewards/margins": 0.14596715569496155, + "rewards/rejected": 0.5168682336807251, + "step": 4170 + }, + { + "epoch": 1.3164836030077556, + "grad_norm": 2.484375, + "learning_rate": 4.130623398368171e-06, + "logits/chosen": -0.4178102910518646, + "logits/rejected": -0.3646093010902405, + "logps/chosen": -174.4610595703125, + "logps/rejected": -152.55731201171875, + "loss": 0.6464, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6118752360343933, + "rewards/margins": 0.13116849958896637, + "rewards/rejected": 0.48070669174194336, + "step": 4180 + }, + { + "epoch": 1.3196330853116018, + "grad_norm": 2.21875, + "learning_rate": 4.126624237767328e-06, + "logits/chosen": -0.4415621757507324, + "logits/rejected": -0.3622170686721802, + "logps/chosen": -186.65716552734375, + "logps/rejected": -164.0797576904297, + "loss": 0.6271, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6536270380020142, + "rewards/margins": 0.1805739849805832, + "rewards/rejected": 0.47305306792259216, + "step": 4190 + }, + { + "epoch": 1.322782567615448, + "grad_norm": 3.09375, + "learning_rate": 4.122617845901322e-06, + "logits/chosen": -0.5048421025276184, + "logits/rejected": -0.41297560930252075, + "logps/chosen": -190.35025024414062, + "logps/rejected": -161.85830688476562, + "loss": 0.6162, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.731613278388977, + "rewards/margins": 0.20245853066444397, + "rewards/rejected": 0.5291547179222107, + "step": 4200 + }, + { + "epoch": 1.3259320499192946, + "grad_norm": 2.6875, + "learning_rate": 4.118604240580832e-06, + "logits/chosen": -0.40836066007614136, + "logits/rejected": -0.35586100816726685, + "logps/chosen": -199.27098083496094, + "logps/rejected": -172.96218872070312, + "loss": 0.6595, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7109326720237732, + "rewards/margins": 0.11674892902374268, + "rewards/rejected": 0.5941838026046753, + "step": 4210 + }, + { + "epoch": 1.3290815322231408, + "grad_norm": 2.46875, + "learning_rate": 4.114583439648604e-06, + "logits/chosen": -0.47792333364486694, + "logits/rejected": -0.3548354506492615, + "logps/chosen": -212.23178100585938, + "logps/rejected": -169.51315307617188, + "loss": 0.6391, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6976093053817749, + "rewards/margins": 0.15473678708076477, + "rewards/rejected": 0.5428725481033325, + "step": 4220 + }, + { + "epoch": 1.3322310145269871, + "grad_norm": 2.546875, + "learning_rate": 4.110555460979374e-06, + "logits/chosen": -0.5251237154006958, + "logits/rejected": -0.4099500775337219, + "logps/chosen": -200.9913787841797, + "logps/rejected": -171.32870483398438, + "loss": 0.6298, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7174925804138184, + "rewards/margins": 0.1686524748802185, + "rewards/rejected": 0.5488401055335999, + "step": 4230 + }, + { + "epoch": 1.3353804968308334, + "grad_norm": 2.90625, + "learning_rate": 4.106520322479786e-06, + "logits/chosen": -0.4651850163936615, + "logits/rejected": -0.34185513854026794, + "logps/chosen": -196.94589233398438, + "logps/rejected": -159.74307250976562, + "loss": 0.6523, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6821771860122681, + "rewards/margins": 0.11817701160907745, + "rewards/rejected": 0.564000129699707, + "step": 4240 + }, + { + "epoch": 1.3385299791346799, + "grad_norm": 2.671875, + "learning_rate": 4.102478042088315e-06, + "logits/chosen": -0.49223145842552185, + "logits/rejected": -0.35867035388946533, + "logps/chosen": -202.394775390625, + "logps/rejected": -171.31448364257812, + "loss": 0.6207, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.725380539894104, + "rewards/margins": 0.2017165720462799, + "rewards/rejected": 0.5236639380455017, + "step": 4250 + }, + { + "epoch": 1.3416794614385261, + "grad_norm": 2.765625, + "learning_rate": 4.098428637775183e-06, + "logits/chosen": -0.5037646293640137, + "logits/rejected": -0.46415749192237854, + "logps/chosen": -181.4263916015625, + "logps/rejected": -168.53955078125, + "loss": 0.674, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.6651412844657898, + "rewards/margins": 0.08603055775165558, + "rewards/rejected": 0.5791107416152954, + "step": 4260 + }, + { + "epoch": 1.3448289437423724, + "grad_norm": 2.3125, + "learning_rate": 4.094372127542285e-06, + "logits/chosen": -0.44330787658691406, + "logits/rejected": -0.3730103075504303, + "logps/chosen": -190.1846923828125, + "logps/rejected": -176.09255981445312, + "loss": 0.6639, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6509902477264404, + "rewards/margins": 0.11924519389867783, + "rewards/rejected": 0.5317450761795044, + "step": 4270 + }, + { + "epoch": 1.3479784260462186, + "grad_norm": 2.78125, + "learning_rate": 4.0903085294231035e-06, + "logits/chosen": -0.4798402190208435, + "logits/rejected": -0.27211225032806396, + "logps/chosen": -221.6403045654297, + "logps/rejected": -164.9480743408203, + "loss": 0.5944, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7979739308357239, + "rewards/margins": 0.26428526639938354, + "rewards/rejected": 0.5336886644363403, + "step": 4280 + }, + { + "epoch": 1.351127908350065, + "grad_norm": 2.84375, + "learning_rate": 4.086237861482632e-06, + "logits/chosen": -0.45586076378822327, + "logits/rejected": -0.37053224444389343, + "logps/chosen": -212.56216430664062, + "logps/rejected": -183.00344848632812, + "loss": 0.6408, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7277430295944214, + "rewards/margins": 0.1431419998407364, + "rewards/rejected": 0.5846010446548462, + "step": 4290 + }, + { + "epoch": 1.3542773906539112, + "grad_norm": 2.421875, + "learning_rate": 4.0821601418172926e-06, + "logits/chosen": -0.4372914731502533, + "logits/rejected": -0.362470418214798, + "logps/chosen": -194.07884216308594, + "logps/rejected": -176.6485137939453, + "loss": 0.6393, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6592785716056824, + "rewards/margins": 0.14654000103473663, + "rewards/rejected": 0.5127385854721069, + "step": 4300 + }, + { + "epoch": 1.3574268729577574, + "grad_norm": 3.078125, + "learning_rate": 4.078075388554857e-06, + "logits/chosen": -0.4022819995880127, + "logits/rejected": -0.3265857398509979, + "logps/chosen": -197.20260620117188, + "logps/rejected": -163.478515625, + "loss": 0.6253, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.741704523563385, + "rewards/margins": 0.17545071244239807, + "rewards/rejected": 0.5662537217140198, + "step": 4310 + }, + { + "epoch": 1.360576355261604, + "grad_norm": 3.046875, + "learning_rate": 4.0739836198543634e-06, + "logits/chosen": -0.5151673555374146, + "logits/rejected": -0.3456025719642639, + "logps/chosen": -217.0493927001953, + "logps/rejected": -183.4969024658203, + "loss": 0.619, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.7682887315750122, + "rewards/margins": 0.19945594668388367, + "rewards/rejected": 0.5688328146934509, + "step": 4320 + }, + { + "epoch": 1.3637258375654502, + "grad_norm": 3.140625, + "learning_rate": 4.069884853906041e-06, + "logits/chosen": -0.5158268809318542, + "logits/rejected": -0.333678662776947, + "logps/chosen": -213.3292236328125, + "logps/rejected": -159.3480987548828, + "loss": 0.6005, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.801051914691925, + "rewards/margins": 0.23268017172813416, + "rewards/rejected": 0.5683717131614685, + "step": 4330 + }, + { + "epoch": 1.3668753198692964, + "grad_norm": 4.34375, + "learning_rate": 4.065779108931222e-06, + "logits/chosen": -0.4823933243751526, + "logits/rejected": -0.3266783654689789, + "logps/chosen": -199.21498107910156, + "logps/rejected": -166.51553344726562, + "loss": 0.6243, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7152196764945984, + "rewards/margins": 0.1862681806087494, + "rewards/rejected": 0.5289515256881714, + "step": 4340 + }, + { + "epoch": 1.3700248021731427, + "grad_norm": 2.640625, + "learning_rate": 4.0616664031822686e-06, + "logits/chosen": -0.43618011474609375, + "logits/rejected": -0.28981930017471313, + "logps/chosen": -192.85598754882812, + "logps/rejected": -157.85415649414062, + "loss": 0.6548, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6323038935661316, + "rewards/margins": 0.11133924871683121, + "rewards/rejected": 0.5209646821022034, + "step": 4350 + }, + { + "epoch": 1.3731742844769892, + "grad_norm": 2.5, + "learning_rate": 4.057546754942482e-06, + "logits/chosen": -0.5119596719741821, + "logits/rejected": -0.3471509516239166, + "logps/chosen": -196.14657592773438, + "logps/rejected": -154.673583984375, + "loss": 0.6247, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6863905191421509, + "rewards/margins": 0.18415386974811554, + "rewards/rejected": 0.5022366642951965, + "step": 4360 + }, + { + "epoch": 1.3763237667808355, + "grad_norm": 2.640625, + "learning_rate": 4.053420182526031e-06, + "logits/chosen": -0.4351939260959625, + "logits/rejected": -0.35209694504737854, + "logps/chosen": -194.9961700439453, + "logps/rejected": -161.25656127929688, + "loss": 0.6406, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.725360095500946, + "rewards/margins": 0.15019366145133972, + "rewards/rejected": 0.5751665234565735, + "step": 4370 + }, + { + "epoch": 1.3794732490846817, + "grad_norm": 2.78125, + "learning_rate": 4.049286704277865e-06, + "logits/chosen": -0.4273145794868469, + "logits/rejected": -0.32500097155570984, + "logps/chosen": -198.8387908935547, + "logps/rejected": -177.99790954589844, + "loss": 0.6608, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6882764101028442, + "rewards/margins": 0.1112000122666359, + "rewards/rejected": 0.5770763158798218, + "step": 4380 + }, + { + "epoch": 1.382622731388528, + "grad_norm": 2.78125, + "learning_rate": 4.045146338573634e-06, + "logits/chosen": -0.4316592216491699, + "logits/rejected": -0.33414119482040405, + "logps/chosen": -183.42718505859375, + "logps/rejected": -173.8665313720703, + "loss": 0.6702, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6961274743080139, + "rewards/margins": 0.10534685850143433, + "rewards/rejected": 0.5907806158065796, + "step": 4390 + }, + { + "epoch": 1.3857722136923742, + "grad_norm": 3.125, + "learning_rate": 4.040999103819606e-06, + "logits/chosen": -0.5050019025802612, + "logits/rejected": -0.3844815194606781, + "logps/chosen": -205.3455352783203, + "logps/rejected": -164.32203674316406, + "loss": 0.6283, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7454142570495605, + "rewards/margins": 0.17193658649921417, + "rewards/rejected": 0.5734776854515076, + "step": 4400 + }, + { + "epoch": 1.3889216959962205, + "grad_norm": 2.265625, + "learning_rate": 4.036845018452586e-06, + "logits/chosen": -0.4365948736667633, + "logits/rejected": -0.3728296458721161, + "logps/chosen": -188.38137817382812, + "logps/rejected": -169.47296142578125, + "loss": 0.6519, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.65342777967453, + "rewards/margins": 0.1305466592311859, + "rewards/rejected": 0.5228811502456665, + "step": 4410 + }, + { + "epoch": 1.392071178300067, + "grad_norm": 2.859375, + "learning_rate": 4.0326841009398354e-06, + "logits/chosen": -0.46066349744796753, + "logits/rejected": -0.38210171461105347, + "logps/chosen": -189.90225219726562, + "logps/rejected": -175.32156372070312, + "loss": 0.6905, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6591070890426636, + "rewards/margins": 0.048674892634153366, + "rewards/rejected": 0.6104320883750916, + "step": 4420 + }, + { + "epoch": 1.3952206606039133, + "grad_norm": 2.59375, + "learning_rate": 4.028516369778987e-06, + "logits/chosen": -0.43766292929649353, + "logits/rejected": -0.29349422454833984, + "logps/chosen": -188.58241271972656, + "logps/rejected": -164.57627868652344, + "loss": 0.655, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6774150729179382, + "rewards/margins": 0.10568390041589737, + "rewards/rejected": 0.5717312693595886, + "step": 4430 + }, + { + "epoch": 1.3983701429077595, + "grad_norm": 2.703125, + "learning_rate": 4.0243418434979605e-06, + "logits/chosen": -0.43426451086997986, + "logits/rejected": -0.3612423539161682, + "logps/chosen": -201.9038543701172, + "logps/rejected": -177.84640502929688, + "loss": 0.6486, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7348337173461914, + "rewards/margins": 0.1346462517976761, + "rewards/rejected": 0.6001874208450317, + "step": 4440 + }, + { + "epoch": 1.4015196252116058, + "grad_norm": 2.40625, + "learning_rate": 4.020160540654892e-06, + "logits/chosen": -0.42761626839637756, + "logits/rejected": -0.31236857175827026, + "logps/chosen": -192.8739471435547, + "logps/rejected": -171.3946990966797, + "loss": 0.6802, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.6831879615783691, + "rewards/margins": 0.06777564436197281, + "rewards/rejected": 0.6154123544692993, + "step": 4450 + }, + { + "epoch": 1.4046691075154523, + "grad_norm": 3.046875, + "learning_rate": 4.015972479838035e-06, + "logits/chosen": -0.4452191889286041, + "logits/rejected": -0.3907052278518677, + "logps/chosen": -210.70761108398438, + "logps/rejected": -191.74087524414062, + "loss": 0.6637, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.7690273523330688, + "rewards/margins": 0.09637213498353958, + "rewards/rejected": 0.6726552248001099, + "step": 4460 + }, + { + "epoch": 1.4078185898192985, + "grad_norm": 2.515625, + "learning_rate": 4.011777679665693e-06, + "logits/chosen": -0.5560199022293091, + "logits/rejected": -0.4294883608818054, + "logps/chosen": -188.9731903076172, + "logps/rejected": -155.7578125, + "loss": 0.6421, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6779214143753052, + "rewards/margins": 0.14832261204719543, + "rewards/rejected": 0.5295988321304321, + "step": 4470 + }, + { + "epoch": 1.4109680721231448, + "grad_norm": 2.75, + "learning_rate": 4.007576158786123e-06, + "logits/chosen": -0.5267634987831116, + "logits/rejected": -0.4555323123931885, + "logps/chosen": -215.6820526123047, + "logps/rejected": -179.27700805664062, + "loss": 0.6235, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7598739266395569, + "rewards/margins": 0.1784120500087738, + "rewards/rejected": 0.5814618468284607, + "step": 4480 + }, + { + "epoch": 1.414117554426991, + "grad_norm": 2.765625, + "learning_rate": 4.003367935877466e-06, + "logits/chosen": -0.49042144417762756, + "logits/rejected": -0.40049901604652405, + "logps/chosen": -205.4383544921875, + "logps/rejected": -174.7517547607422, + "loss": 0.6269, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.725433349609375, + "rewards/margins": 0.18734149634838104, + "rewards/rejected": 0.538091778755188, + "step": 4490 + }, + { + "epoch": 1.4172670367308373, + "grad_norm": 2.484375, + "learning_rate": 3.999153029647651e-06, + "logits/chosen": -0.43651509284973145, + "logits/rejected": -0.32142752408981323, + "logps/chosen": -216.54519653320312, + "logps/rejected": -169.64065551757812, + "loss": 0.6045, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.8090263605117798, + "rewards/margins": 0.24071991443634033, + "rewards/rejected": 0.5683062076568604, + "step": 4500 + }, + { + "epoch": 1.4204165190346836, + "grad_norm": 3.109375, + "learning_rate": 3.994931458834323e-06, + "logits/chosen": -0.4635355472564697, + "logits/rejected": -0.374891459941864, + "logps/chosen": -176.76364135742188, + "logps/rejected": -150.61795043945312, + "loss": 0.6502, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6052500605583191, + "rewards/margins": 0.12052911520004272, + "rewards/rejected": 0.48472094535827637, + "step": 4510 + }, + { + "epoch": 1.4235660013385298, + "grad_norm": 3.484375, + "learning_rate": 3.990703242204754e-06, + "logits/chosen": -0.4779212474822998, + "logits/rejected": -0.3155384659767151, + "logps/chosen": -203.7576904296875, + "logps/rejected": -177.79849243164062, + "loss": 0.6564, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.805811882019043, + "rewards/margins": 0.1246214359998703, + "rewards/rejected": 0.6811904907226562, + "step": 4520 + }, + { + "epoch": 1.4267154836423763, + "grad_norm": 2.53125, + "learning_rate": 3.986468398555758e-06, + "logits/chosen": -0.4679221212863922, + "logits/rejected": -0.3618343472480774, + "logps/chosen": -198.53536987304688, + "logps/rejected": -171.15133666992188, + "loss": 0.6479, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7284259796142578, + "rewards/margins": 0.14506401121616364, + "rewards/rejected": 0.5833619832992554, + "step": 4530 + }, + { + "epoch": 1.4298649659462226, + "grad_norm": 2.78125, + "learning_rate": 3.98222694671361e-06, + "logits/chosen": -0.45645904541015625, + "logits/rejected": -0.36011576652526855, + "logps/chosen": -218.33425903320312, + "logps/rejected": -187.9898681640625, + "loss": 0.6464, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.8267934918403625, + "rewards/margins": 0.15051567554473877, + "rewards/rejected": 0.6762778162956238, + "step": 4540 + }, + { + "epoch": 1.4330144482500689, + "grad_norm": 3.453125, + "learning_rate": 3.977978905533966e-06, + "logits/chosen": -0.49182215332984924, + "logits/rejected": -0.40443873405456543, + "logps/chosen": -203.3308868408203, + "logps/rejected": -182.2737274169922, + "loss": 0.6645, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6901066303253174, + "rewards/margins": 0.09435725957155228, + "rewards/rejected": 0.5957493185997009, + "step": 4550 + }, + { + "epoch": 1.4361639305539153, + "grad_norm": 2.125, + "learning_rate": 3.973724293901772e-06, + "logits/chosen": -0.4056168496608734, + "logits/rejected": -0.38391467928886414, + "logps/chosen": -195.6358184814453, + "logps/rejected": -183.57254028320312, + "loss": 0.6455, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.682805061340332, + "rewards/margins": 0.12637227773666382, + "rewards/rejected": 0.5564327836036682, + "step": 4560 + }, + { + "epoch": 1.4393134128577616, + "grad_norm": 2.703125, + "learning_rate": 3.969463130731183e-06, + "logits/chosen": -0.4418698847293854, + "logits/rejected": -0.39611127972602844, + "logps/chosen": -185.42196655273438, + "logps/rejected": -161.41970825195312, + "loss": 0.6608, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6986794471740723, + "rewards/margins": 0.10915372520685196, + "rewards/rejected": 0.5895256996154785, + "step": 4570 + }, + { + "epoch": 1.4424628951616079, + "grad_norm": 2.28125, + "learning_rate": 3.965195434965482e-06, + "logits/chosen": -0.49892979860305786, + "logits/rejected": -0.3777271807193756, + "logps/chosen": -214.09317016601562, + "logps/rejected": -188.13845825195312, + "loss": 0.6567, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7749571800231934, + "rewards/margins": 0.1210152879357338, + "rewards/rejected": 0.6539419889450073, + "step": 4580 + }, + { + "epoch": 1.4456123774654541, + "grad_norm": 3.453125, + "learning_rate": 3.960921225576991e-06, + "logits/chosen": -0.5049037933349609, + "logits/rejected": -0.34426528215408325, + "logps/chosen": -215.4214630126953, + "logps/rejected": -190.46994018554688, + "loss": 0.6566, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7792149782180786, + "rewards/margins": 0.11181477457284927, + "rewards/rejected": 0.6674002408981323, + "step": 4590 + }, + { + "epoch": 1.4487618597693004, + "grad_norm": 2.578125, + "learning_rate": 3.956640521566989e-06, + "logits/chosen": -0.48397397994995117, + "logits/rejected": -0.4080958962440491, + "logps/chosen": -187.16818237304688, + "logps/rejected": -166.5101318359375, + "loss": 0.6367, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7400224804878235, + "rewards/margins": 0.15180279314517975, + "rewards/rejected": 0.5882197022438049, + "step": 4600 + }, + { + "epoch": 1.4519113420731466, + "grad_norm": 2.75, + "learning_rate": 3.952353341965628e-06, + "logits/chosen": -0.44713473320007324, + "logits/rejected": -0.3727934956550598, + "logps/chosen": -202.4422607421875, + "logps/rejected": -174.40975952148438, + "loss": 0.6574, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6708841323852539, + "rewards/margins": 0.10317204892635345, + "rewards/rejected": 0.5677120089530945, + "step": 4610 + }, + { + "epoch": 1.455060824376993, + "grad_norm": 2.765625, + "learning_rate": 3.948059705831847e-06, + "logits/chosen": -0.4646250605583191, + "logits/rejected": -0.34977811574935913, + "logps/chosen": -175.93081665039062, + "logps/rejected": -161.12667846679688, + "loss": 0.6457, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.6609789729118347, + "rewards/margins": 0.13796725869178772, + "rewards/rejected": 0.5230117440223694, + "step": 4620 + }, + { + "epoch": 1.4582103066808394, + "grad_norm": 2.90625, + "learning_rate": 3.943759632253289e-06, + "logits/chosen": -0.44996100664138794, + "logits/rejected": -0.3352479934692383, + "logps/chosen": -191.62132263183594, + "logps/rejected": -161.776611328125, + "loss": 0.6196, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7225370407104492, + "rewards/margins": 0.20392043888568878, + "rewards/rejected": 0.5186166167259216, + "step": 4630 + }, + { + "epoch": 1.4613597889846857, + "grad_norm": 2.421875, + "learning_rate": 3.939453140346212e-06, + "logits/chosen": -0.4620143473148346, + "logits/rejected": -0.2916569113731384, + "logps/chosen": -198.8416748046875, + "logps/rejected": -158.47796630859375, + "loss": 0.6241, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6989172697067261, + "rewards/margins": 0.20297345519065857, + "rewards/rejected": 0.4959437847137451, + "step": 4640 + }, + { + "epoch": 1.464509271288532, + "grad_norm": 2.4375, + "learning_rate": 3.935140249255412e-06, + "logits/chosen": -0.4222196638584137, + "logits/rejected": -0.3922198414802551, + "logps/chosen": -191.1973114013672, + "logps/rejected": -182.5751495361328, + "loss": 0.7209, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.6637114882469177, + "rewards/margins": 0.014695653691887856, + "rewards/rejected": 0.6490157842636108, + "step": 4650 + }, + { + "epoch": 1.4676587535923782, + "grad_norm": 4.71875, + "learning_rate": 3.930820978154129e-06, + "logits/chosen": -0.4839719831943512, + "logits/rejected": -0.36961930990219116, + "logps/chosen": -199.48463439941406, + "logps/rejected": -167.02774047851562, + "loss": 0.6264, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7042496204376221, + "rewards/margins": 0.18996970355510712, + "rewards/rejected": 0.5142799615859985, + "step": 4660 + }, + { + "epoch": 1.4708082358962247, + "grad_norm": 2.78125, + "learning_rate": 3.926495346243967e-06, + "logits/chosen": -0.5004483461380005, + "logits/rejected": -0.40153947472572327, + "logps/chosen": -210.6992645263672, + "logps/rejected": -190.9122314453125, + "loss": 0.6394, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7591875791549683, + "rewards/margins": 0.1497870534658432, + "rewards/rejected": 0.6094005107879639, + "step": 4670 + }, + { + "epoch": 1.473957718200071, + "grad_norm": 3.078125, + "learning_rate": 3.922163372754807e-06, + "logits/chosen": -0.41180485486984253, + "logits/rejected": -0.31235820055007935, + "logps/chosen": -203.12744140625, + "logps/rejected": -167.7096710205078, + "loss": 0.6562, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6816965937614441, + "rewards/margins": 0.12079987674951553, + "rewards/rejected": 0.5608968138694763, + "step": 4680 + }, + { + "epoch": 1.4771072005039172, + "grad_norm": 1.78125, + "learning_rate": 3.9178250769447245e-06, + "logits/chosen": -0.4602780342102051, + "logits/rejected": -0.352649986743927, + "logps/chosen": -178.3946075439453, + "logps/rejected": -165.46885681152344, + "loss": 0.6532, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6400049924850464, + "rewards/margins": 0.12722846865653992, + "rewards/rejected": 0.5127764940261841, + "step": 4690 + }, + { + "epoch": 1.4802566828077635, + "grad_norm": 2.359375, + "learning_rate": 3.913480478099898e-06, + "logits/chosen": -0.4489854872226715, + "logits/rejected": -0.3600943684577942, + "logps/chosen": -215.1140594482422, + "logps/rejected": -183.04026794433594, + "loss": 0.6386, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7823598980903625, + "rewards/margins": 0.1600048840045929, + "rewards/rejected": 0.6223549842834473, + "step": 4700 + }, + { + "epoch": 1.4834061651116097, + "grad_norm": 2.734375, + "learning_rate": 3.909129595534527e-06, + "logits/chosen": -0.4606572091579437, + "logits/rejected": -0.3265830874443054, + "logps/chosen": -197.79678344726562, + "logps/rejected": -165.95912170410156, + "loss": 0.6541, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.6801806688308716, + "rewards/margins": 0.11523494869470596, + "rewards/rejected": 0.5649456977844238, + "step": 4710 + }, + { + "epoch": 1.486555647415456, + "grad_norm": 3.625, + "learning_rate": 3.904772448590747e-06, + "logits/chosen": -0.46349745988845825, + "logits/rejected": -0.4300110936164856, + "logps/chosen": -190.7307891845703, + "logps/rejected": -179.4709014892578, + "loss": 0.6772, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6604156494140625, + "rewards/margins": 0.06815408170223236, + "rewards/rejected": 0.5922616124153137, + "step": 4720 + }, + { + "epoch": 1.4897051297193025, + "grad_norm": 3.015625, + "learning_rate": 3.900409056638542e-06, + "logits/chosen": -0.4786251485347748, + "logits/rejected": -0.3198348581790924, + "logps/chosen": -210.8201904296875, + "logps/rejected": -178.6912078857422, + "loss": 0.6503, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7678909301757812, + "rewards/margins": 0.12642471492290497, + "rewards/rejected": 0.6414662003517151, + "step": 4730 + }, + { + "epoch": 1.4928546120231487, + "grad_norm": 3.015625, + "learning_rate": 3.896039439075659e-06, + "logits/chosen": -0.5011542439460754, + "logits/rejected": -0.36163219809532166, + "logps/chosen": -197.447021484375, + "logps/rejected": -147.70782470703125, + "loss": 0.629, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6751397848129272, + "rewards/margins": 0.1756085753440857, + "rewards/rejected": 0.49953117966651917, + "step": 4740 + }, + { + "epoch": 1.496004094326995, + "grad_norm": 2.734375, + "learning_rate": 3.891663615327518e-06, + "logits/chosen": -0.47268587350845337, + "logits/rejected": -0.3960368037223816, + "logps/chosen": -188.75637817382812, + "logps/rejected": -167.4852752685547, + "loss": 0.6569, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6986013054847717, + "rewards/margins": 0.12062957137823105, + "rewards/rejected": 0.5779717564582825, + "step": 4750 + }, + { + "epoch": 1.4991535766308413, + "grad_norm": 2.703125, + "learning_rate": 3.887281604847134e-06, + "logits/chosen": -0.510342001914978, + "logits/rejected": -0.36850231885910034, + "logps/chosen": -192.31849670410156, + "logps/rejected": -166.56289672851562, + "loss": 0.64, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6647813320159912, + "rewards/margins": 0.15008948743343353, + "rewards/rejected": 0.5146918296813965, + "step": 4760 + }, + { + "epoch": 1.5023030589346877, + "grad_norm": 2.96875, + "learning_rate": 3.8828934271150225e-06, + "logits/chosen": -0.460064172744751, + "logits/rejected": -0.4005635678768158, + "logps/chosen": -222.4634246826172, + "logps/rejected": -201.12998962402344, + "loss": 0.6611, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7676762938499451, + "rewards/margins": 0.10998652130365372, + "rewards/rejected": 0.6576897501945496, + "step": 4770 + }, + { + "epoch": 1.505452541238534, + "grad_norm": 3.21875, + "learning_rate": 3.878499101639116e-06, + "logits/chosen": -0.5471469759941101, + "logits/rejected": -0.4315733313560486, + "logps/chosen": -210.2429962158203, + "logps/rejected": -175.91360473632812, + "loss": 0.6623, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7174965143203735, + "rewards/margins": 0.10746297985315323, + "rewards/rejected": 0.6100335717201233, + "step": 4780 + }, + { + "epoch": 1.5086020235423803, + "grad_norm": 3.4375, + "learning_rate": 3.8740986479546796e-06, + "logits/chosen": -0.45277899503707886, + "logits/rejected": -0.4165739119052887, + "logps/chosen": -194.2645263671875, + "logps/rejected": -179.887939453125, + "loss": 0.6491, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7052034139633179, + "rewards/margins": 0.1254056990146637, + "rewards/rejected": 0.5797977447509766, + "step": 4790 + }, + { + "epoch": 1.5117515058462265, + "grad_norm": 2.796875, + "learning_rate": 3.869692085624218e-06, + "logits/chosen": -0.45139655470848083, + "logits/rejected": -0.3898164629936218, + "logps/chosen": -209.1693572998047, + "logps/rejected": -194.74021911621094, + "loss": 0.6479, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7721225619316101, + "rewards/margins": 0.12638486921787262, + "rewards/rejected": 0.6457376480102539, + "step": 4800 + }, + { + "epoch": 1.5149009881500728, + "grad_norm": 2.859375, + "learning_rate": 3.865279434237394e-06, + "logits/chosen": -0.4812788963317871, + "logits/rejected": -0.39241549372673035, + "logps/chosen": -190.20193481445312, + "logps/rejected": -166.25454711914062, + "loss": 0.6459, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6281577944755554, + "rewards/margins": 0.12559270858764648, + "rewards/rejected": 0.5025650858879089, + "step": 4810 + }, + { + "epoch": 1.518050470453919, + "grad_norm": 3.015625, + "learning_rate": 3.860860713410941e-06, + "logits/chosen": -0.38653573393821716, + "logits/rejected": -0.361195832490921, + "logps/chosen": -204.06520080566406, + "logps/rejected": -209.9178009033203, + "loss": 0.6876, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.7754232287406921, + "rewards/margins": 0.045936040580272675, + "rewards/rejected": 0.7294871211051941, + "step": 4820 + }, + { + "epoch": 1.5211999527577653, + "grad_norm": 2.453125, + "learning_rate": 3.8564359427885735e-06, + "logits/chosen": -0.469553142786026, + "logits/rejected": -0.41318267583847046, + "logps/chosen": -207.17117309570312, + "logps/rejected": -177.3662872314453, + "loss": 0.6248, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7327437400817871, + "rewards/margins": 0.1847001016139984, + "rewards/rejected": 0.5480436086654663, + "step": 4830 + }, + { + "epoch": 1.5243494350616118, + "grad_norm": 2.984375, + "learning_rate": 3.852005142040901e-06, + "logits/chosen": -0.4887300133705139, + "logits/rejected": -0.38054460287094116, + "logps/chosen": -189.93447875976562, + "logps/rejected": -159.81796264648438, + "loss": 0.635, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6666679978370667, + "rewards/margins": 0.1536942720413208, + "rewards/rejected": 0.5129736661911011, + "step": 4840 + }, + { + "epoch": 1.527498917365458, + "grad_norm": 2.453125, + "learning_rate": 3.8475683308653385e-06, + "logits/chosen": -0.4928087294101715, + "logits/rejected": -0.3735254406929016, + "logps/chosen": -204.97787475585938, + "logps/rejected": -168.03756713867188, + "loss": 0.6341, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.772538423538208, + "rewards/margins": 0.1797189563512802, + "rewards/rejected": 0.5928195118904114, + "step": 4850 + }, + { + "epoch": 1.5306483996693043, + "grad_norm": 2.375, + "learning_rate": 3.8431255289860225e-06, + "logits/chosen": -0.47978100180625916, + "logits/rejected": -0.3051653504371643, + "logps/chosen": -217.0726776123047, + "logps/rejected": -169.8887939453125, + "loss": 0.5989, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.799135148525238, + "rewards/margins": 0.24375374615192413, + "rewards/rejected": 0.5553814172744751, + "step": 4860 + }, + { + "epoch": 1.5337978819731508, + "grad_norm": 3.1875, + "learning_rate": 3.838676756153723e-06, + "logits/chosen": -0.4916785657405853, + "logits/rejected": -0.4126752018928528, + "logps/chosen": -204.36715698242188, + "logps/rejected": -171.4047088623047, + "loss": 0.6373, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7680164575576782, + "rewards/margins": 0.1488504558801651, + "rewards/rejected": 0.6191660761833191, + "step": 4870 + }, + { + "epoch": 1.536947364276997, + "grad_norm": 2.328125, + "learning_rate": 3.834222032145751e-06, + "logits/chosen": -0.4622599184513092, + "logits/rejected": -0.3681351840496063, + "logps/chosen": -192.53379821777344, + "logps/rejected": -161.97409057617188, + "loss": 0.6518, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.6855158805847168, + "rewards/margins": 0.11872188746929169, + "rewards/rejected": 0.5667939782142639, + "step": 4880 + }, + { + "epoch": 1.5400968465808433, + "grad_norm": 3.296875, + "learning_rate": 3.829761376765875e-06, + "logits/chosen": -0.5002883076667786, + "logits/rejected": -0.37738004326820374, + "logps/chosen": -208.83547973632812, + "logps/rejected": -180.82960510253906, + "loss": 0.6302, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7079477906227112, + "rewards/margins": 0.172250896692276, + "rewards/rejected": 0.5356968641281128, + "step": 4890 + }, + { + "epoch": 1.5432463288846896, + "grad_norm": 2.640625, + "learning_rate": 3.825294809844234e-06, + "logits/chosen": -0.4677095413208008, + "logits/rejected": -0.43684762716293335, + "logps/chosen": -204.42538452148438, + "logps/rejected": -189.5218505859375, + "loss": 0.6914, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.697405219078064, + "rewards/margins": 0.04717016965150833, + "rewards/rejected": 0.6502350568771362, + "step": 4900 + }, + { + "epoch": 1.5463958111885359, + "grad_norm": 3.234375, + "learning_rate": 3.820822351237245e-06, + "logits/chosen": -0.44419798254966736, + "logits/rejected": -0.3508889377117157, + "logps/chosen": -192.20458984375, + "logps/rejected": -182.95726013183594, + "loss": 0.665, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6904107928276062, + "rewards/margins": 0.09415493160486221, + "rewards/rejected": 0.596255898475647, + "step": 4910 + }, + { + "epoch": 1.5495452934923821, + "grad_norm": 2.59375, + "learning_rate": 3.816344020827516e-06, + "logits/chosen": -0.5454775094985962, + "logits/rejected": -0.39591822028160095, + "logps/chosen": -196.39208984375, + "logps/rejected": -159.05348205566406, + "loss": 0.6543, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6435393691062927, + "rewards/margins": 0.11705289781093597, + "rewards/rejected": 0.5264865159988403, + "step": 4920 + }, + { + "epoch": 1.5526947757962284, + "grad_norm": 3.359375, + "learning_rate": 3.8118598385237604e-06, + "logits/chosen": -0.439689576625824, + "logits/rejected": -0.4039355218410492, + "logps/chosen": -186.1417236328125, + "logps/rejected": -170.55740356445312, + "loss": 0.6797, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6710829138755798, + "rewards/margins": 0.06706185638904572, + "rewards/rejected": 0.6040210127830505, + "step": 4930 + }, + { + "epoch": 1.5558442581000747, + "grad_norm": 2.28125, + "learning_rate": 3.807369824260706e-06, + "logits/chosen": -0.45275792479515076, + "logits/rejected": -0.3387320935726166, + "logps/chosen": -205.11349487304688, + "logps/rejected": -165.2199249267578, + "loss": 0.626, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7621325850486755, + "rewards/margins": 0.1830775886774063, + "rewards/rejected": 0.5790549516677856, + "step": 4940 + }, + { + "epoch": 1.5589937404039211, + "grad_norm": 2.828125, + "learning_rate": 3.8028739979990072e-06, + "logits/chosen": -0.4740076959133148, + "logits/rejected": -0.36817610263824463, + "logps/chosen": -217.1376190185547, + "logps/rejected": -184.6923065185547, + "loss": 0.617, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7812771797180176, + "rewards/margins": 0.19556982815265656, + "rewards/rejected": 0.5857073068618774, + "step": 4950 + }, + { + "epoch": 1.5621432227077674, + "grad_norm": 2.859375, + "learning_rate": 3.798372379725155e-06, + "logits/chosen": -0.47232404351234436, + "logits/rejected": -0.30332669615745544, + "logps/chosen": -179.11349487304688, + "logps/rejected": -152.26527404785156, + "loss": 0.6515, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6055221557617188, + "rewards/margins": 0.1263987123966217, + "rewards/rejected": 0.47912344336509705, + "step": 4960 + }, + { + "epoch": 1.5652927050116137, + "grad_norm": 3.4375, + "learning_rate": 3.79386498945139e-06, + "logits/chosen": -0.512363612651825, + "logits/rejected": -0.3525320291519165, + "logps/chosen": -199.22694396972656, + "logps/rejected": -158.5713348388672, + "loss": 0.6495, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7140517234802246, + "rewards/margins": 0.13541573286056519, + "rewards/rejected": 0.5786360502243042, + "step": 4970 + }, + { + "epoch": 1.5684421873154601, + "grad_norm": 3.125, + "learning_rate": 3.789351847215613e-06, + "logits/chosen": -0.4590677320957184, + "logits/rejected": -0.2966582179069519, + "logps/chosen": -211.01083374023438, + "logps/rejected": -171.15493774414062, + "loss": 0.6124, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.7449513673782349, + "rewards/margins": 0.21209721267223358, + "rewards/rejected": 0.5328541994094849, + "step": 4980 + }, + { + "epoch": 1.5715916696193064, + "grad_norm": 3.046875, + "learning_rate": 3.784832973081295e-06, + "logits/chosen": -0.4136572778224945, + "logits/rejected": -0.3735829293727875, + "logps/chosen": -190.76473999023438, + "logps/rejected": -176.0132293701172, + "loss": 0.6683, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.7110625505447388, + "rewards/margins": 0.08584681898355484, + "rewards/rejected": 0.6252157688140869, + "step": 4990 + }, + { + "epoch": 1.5747411519231527, + "grad_norm": 2.828125, + "learning_rate": 3.7803083871373876e-06, + "logits/chosen": -0.42973631620407104, + "logits/rejected": -0.2951328754425049, + "logps/chosen": -206.06332397460938, + "logps/rejected": -174.61618041992188, + "loss": 0.6725, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.7120364308357239, + "rewards/margins": 0.0878576785326004, + "rewards/rejected": 0.6241787672042847, + "step": 5000 + }, + { + "epoch": 1.577890634226999, + "grad_norm": 3.296875, + "learning_rate": 3.775778109498237e-06, + "logits/chosen": -0.391795814037323, + "logits/rejected": -0.2272360622882843, + "logps/chosen": -214.9050750732422, + "logps/rejected": -175.29550170898438, + "loss": 0.6073, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7749017477035522, + "rewards/margins": 0.23047152161598206, + "rewards/rejected": 0.544430136680603, + "step": 5010 + }, + { + "epoch": 1.5810401165308452, + "grad_norm": 2.578125, + "learning_rate": 3.7712421603034894e-06, + "logits/chosen": -0.43912237882614136, + "logits/rejected": -0.34138351678848267, + "logps/chosen": -204.3086395263672, + "logps/rejected": -182.76370239257812, + "loss": 0.7119, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6654126048088074, + "rewards/margins": 0.0014205619227141142, + "rewards/rejected": 0.6639919877052307, + "step": 5020 + }, + { + "epoch": 1.5841895988346915, + "grad_norm": 2.765625, + "learning_rate": 3.766700559718006e-06, + "logits/chosen": -0.5014868974685669, + "logits/rejected": -0.4088328778743744, + "logps/chosen": -185.99476623535156, + "logps/rejected": -165.3468017578125, + "loss": 0.6596, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6833370923995972, + "rewards/margins": 0.10189273208379745, + "rewards/rejected": 0.5814443826675415, + "step": 5030 + }, + { + "epoch": 1.5873390811385377, + "grad_norm": 2.390625, + "learning_rate": 3.762153327931772e-06, + "logits/chosen": -0.44292283058166504, + "logits/rejected": -0.3582010269165039, + "logps/chosen": -201.4560546875, + "logps/rejected": -181.69656372070312, + "loss": 0.6431, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.720178484916687, + "rewards/margins": 0.14207497239112854, + "rewards/rejected": 0.5781034231185913, + "step": 5040 + }, + { + "epoch": 1.5904885634423842, + "grad_norm": 3.375, + "learning_rate": 3.7576004851598052e-06, + "logits/chosen": -0.4733700156211853, + "logits/rejected": -0.3716031610965729, + "logps/chosen": -199.3379364013672, + "logps/rejected": -178.63943481445312, + "loss": 0.6621, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.75123131275177, + "rewards/margins": 0.10580404102802277, + "rewards/rejected": 0.6454272866249084, + "step": 5050 + }, + { + "epoch": 1.5936380457462305, + "grad_norm": 2.6875, + "learning_rate": 3.7530420516420676e-06, + "logits/chosen": -0.4269101023674011, + "logits/rejected": -0.37862318754196167, + "logps/chosen": -199.61886596679688, + "logps/rejected": -173.53884887695312, + "loss": 0.6456, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7329145073890686, + "rewards/margins": 0.1574871987104416, + "rewards/rejected": 0.575427234172821, + "step": 5060 + }, + { + "epoch": 1.5967875280500767, + "grad_norm": 2.546875, + "learning_rate": 3.7484780476433764e-06, + "logits/chosen": -0.4474611282348633, + "logits/rejected": -0.35605159401893616, + "logps/chosen": -188.90847778320312, + "logps/rejected": -159.92965698242188, + "loss": 0.6346, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.6905893087387085, + "rewards/margins": 0.16502177715301514, + "rewards/rejected": 0.5255674123764038, + "step": 5070 + }, + { + "epoch": 1.5999370103539232, + "grad_norm": 2.75, + "learning_rate": 3.743908493453311e-06, + "logits/chosen": -0.43630313873291016, + "logits/rejected": -0.37972143292427063, + "logps/chosen": -223.48648071289062, + "logps/rejected": -194.86001586914062, + "loss": 0.6523, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7133353352546692, + "rewards/margins": 0.11560231447219849, + "rewards/rejected": 0.5977329611778259, + "step": 5080 + }, + { + "epoch": 1.6030864926577695, + "grad_norm": 3.0625, + "learning_rate": 3.739333409386126e-06, + "logits/chosen": -0.44998350739479065, + "logits/rejected": -0.3347667157649994, + "logps/chosen": -207.3821563720703, + "logps/rejected": -182.54971313476562, + "loss": 0.6532, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7008602619171143, + "rewards/margins": 0.14106692373752594, + "rewards/rejected": 0.5597933530807495, + "step": 5090 + }, + { + "epoch": 1.6062359749616157, + "grad_norm": 2.25, + "learning_rate": 3.734752815780659e-06, + "logits/chosen": -0.514176070690155, + "logits/rejected": -0.3695998787879944, + "logps/chosen": -196.74203491210938, + "logps/rejected": -162.3852081298828, + "loss": 0.6246, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.7342701554298401, + "rewards/margins": 0.20258764922618866, + "rewards/rejected": 0.5316824913024902, + "step": 5100 + }, + { + "epoch": 1.609385457265462, + "grad_norm": 2.5, + "learning_rate": 3.7301667330002408e-06, + "logits/chosen": -0.439274400472641, + "logits/rejected": -0.3606041669845581, + "logps/chosen": -202.26397705078125, + "logps/rejected": -176.84947204589844, + "loss": 0.641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7874661684036255, + "rewards/margins": 0.1603676974773407, + "rewards/rejected": 0.6270985007286072, + "step": 5110 + }, + { + "epoch": 1.6125349395693083, + "grad_norm": 2.625, + "learning_rate": 3.7255751814326035e-06, + "logits/chosen": -0.4579714238643646, + "logits/rejected": -0.3923242390155792, + "logps/chosen": -189.69895935058594, + "logps/rejected": -181.8014373779297, + "loss": 0.6762, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6748474836349487, + "rewards/margins": 0.07303015887737274, + "rewards/rejected": 0.6018173098564148, + "step": 5120 + }, + { + "epoch": 1.6156844218731545, + "grad_norm": 2.734375, + "learning_rate": 3.720978181489792e-06, + "logits/chosen": -0.4986580014228821, + "logits/rejected": -0.4134606719017029, + "logps/chosen": -189.92276000976562, + "logps/rejected": -189.25656127929688, + "loss": 0.6926, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.7121160626411438, + "rewards/margins": 0.05977436155080795, + "rewards/rejected": 0.6523416042327881, + "step": 5130 + }, + { + "epoch": 1.6188339041770008, + "grad_norm": 3.046875, + "learning_rate": 3.716375753608073e-06, + "logits/chosen": -0.45538026094436646, + "logits/rejected": -0.29467564821243286, + "logps/chosen": -217.4430694580078, + "logps/rejected": -182.09951782226562, + "loss": 0.6399, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.804099440574646, + "rewards/margins": 0.17206081748008728, + "rewards/rejected": 0.6320386528968811, + "step": 5140 + }, + { + "epoch": 1.621983386480847, + "grad_norm": 2.34375, + "learning_rate": 3.7117679182478415e-06, + "logits/chosen": -0.45015448331832886, + "logits/rejected": -0.3417063355445862, + "logps/chosen": -196.57412719726562, + "logps/rejected": -183.40467834472656, + "loss": 0.669, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6573584675788879, + "rewards/margins": 0.0827714204788208, + "rewards/rejected": 0.5745870471000671, + "step": 5150 + }, + { + "epoch": 1.6251328687846935, + "grad_norm": 3.359375, + "learning_rate": 3.707154695893535e-06, + "logits/chosen": -0.49352359771728516, + "logits/rejected": -0.3199608027935028, + "logps/chosen": -191.95172119140625, + "logps/rejected": -168.58969116210938, + "loss": 0.6372, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.743454098701477, + "rewards/margins": 0.16557399928569794, + "rewards/rejected": 0.5778801441192627, + "step": 5160 + }, + { + "epoch": 1.6282823510885398, + "grad_norm": 2.6875, + "learning_rate": 3.702536107053536e-06, + "logits/chosen": -0.402625173330307, + "logits/rejected": -0.3243730366230011, + "logps/chosen": -182.0989532470703, + "logps/rejected": -162.49978637695312, + "loss": 0.6791, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6483687162399292, + "rewards/margins": 0.07241274416446686, + "rewards/rejected": 0.5759559869766235, + "step": 5170 + }, + { + "epoch": 1.6314318333923863, + "grad_norm": 2.5625, + "learning_rate": 3.697912172260085e-06, + "logits/chosen": -0.49576109647750854, + "logits/rejected": -0.39149078726768494, + "logps/chosen": -205.7623291015625, + "logps/rejected": -176.04811096191406, + "loss": 0.6461, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7142454385757446, + "rewards/margins": 0.13653890788555145, + "rewards/rejected": 0.5777064561843872, + "step": 5180 + }, + { + "epoch": 1.6345813156962326, + "grad_norm": 3.140625, + "learning_rate": 3.693282912069189e-06, + "logits/chosen": -0.4688878655433655, + "logits/rejected": -0.3151419162750244, + "logps/chosen": -222.38955688476562, + "logps/rejected": -187.0220489501953, + "loss": 0.6532, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7705121636390686, + "rewards/margins": 0.11784696578979492, + "rewards/rejected": 0.6526652574539185, + "step": 5190 + }, + { + "epoch": 1.6377307980000788, + "grad_norm": 3.21875, + "learning_rate": 3.6886483470605293e-06, + "logits/chosen": -0.4141194224357605, + "logits/rejected": -0.33433422446250916, + "logps/chosen": -192.88711547851562, + "logps/rejected": -161.22756958007812, + "loss": 0.6707, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.645660400390625, + "rewards/margins": 0.0918336808681488, + "rewards/rejected": 0.5538267493247986, + "step": 5200 + }, + { + "epoch": 1.640880280303925, + "grad_norm": 2.65625, + "learning_rate": 3.6840084978373704e-06, + "logits/chosen": -0.4529247283935547, + "logits/rejected": -0.23250219225883484, + "logps/chosen": -212.8038330078125, + "logps/rejected": -176.3964385986328, + "loss": 0.6297, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7746697664260864, + "rewards/margins": 0.16910018026828766, + "rewards/rejected": 0.6055695414543152, + "step": 5210 + }, + { + "epoch": 1.6440297626077713, + "grad_norm": 2.328125, + "learning_rate": 3.6793633850264655e-06, + "logits/chosen": -0.4927656650543213, + "logits/rejected": -0.3135763704776764, + "logps/chosen": -213.3108673095703, + "logps/rejected": -167.95083618164062, + "loss": 0.6254, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7371099591255188, + "rewards/margins": 0.19389715790748596, + "rewards/rejected": 0.5432127714157104, + "step": 5220 + }, + { + "epoch": 1.6471792449116176, + "grad_norm": 2.59375, + "learning_rate": 3.6747130292779715e-06, + "logits/chosen": -0.4238489270210266, + "logits/rejected": -0.3129653036594391, + "logps/chosen": -194.534912109375, + "logps/rejected": -174.8766632080078, + "loss": 0.6662, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7266419529914856, + "rewards/margins": 0.09835498034954071, + "rewards/rejected": 0.6282869577407837, + "step": 5230 + }, + { + "epoch": 1.6503287272154639, + "grad_norm": 2.78125, + "learning_rate": 3.6700574512653497e-06, + "logits/chosen": -0.4622649550437927, + "logits/rejected": -0.37696540355682373, + "logps/chosen": -199.86097717285156, + "logps/rejected": -181.8938446044922, + "loss": 0.6482, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7182890176773071, + "rewards/margins": 0.12708035111427307, + "rewards/rejected": 0.5912087559700012, + "step": 5240 + }, + { + "epoch": 1.6534782095193101, + "grad_norm": 2.5, + "learning_rate": 3.66539667168528e-06, + "logits/chosen": -0.4936433732509613, + "logits/rejected": -0.3448127210140228, + "logps/chosen": -187.07192993164062, + "logps/rejected": -149.51683044433594, + "loss": 0.6323, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6617879867553711, + "rewards/margins": 0.18173685669898987, + "rewards/rejected": 0.480051189661026, + "step": 5250 + }, + { + "epoch": 1.6566276918231566, + "grad_norm": 3.109375, + "learning_rate": 3.6607307112575646e-06, + "logits/chosen": -0.5180322527885437, + "logits/rejected": -0.3746757507324219, + "logps/chosen": -201.63467407226562, + "logps/rejected": -166.1622314453125, + "loss": 0.5896, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.7969163656234741, + "rewards/margins": 0.26014357805252075, + "rewards/rejected": 0.5367728471755981, + "step": 5260 + }, + { + "epoch": 1.6597771741270029, + "grad_norm": 3.15625, + "learning_rate": 3.6560595907250375e-06, + "logits/chosen": -0.3662557005882263, + "logits/rejected": -0.3333319425582886, + "logps/chosen": -191.13694763183594, + "logps/rejected": -189.7360382080078, + "loss": 0.6766, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.7387022972106934, + "rewards/margins": 0.08876340091228485, + "rewards/rejected": 0.6499389410018921, + "step": 5270 + }, + { + "epoch": 1.6629266564308491, + "grad_norm": 3.265625, + "learning_rate": 3.651383330853472e-06, + "logits/chosen": -0.4906235635280609, + "logits/rejected": -0.36935657262802124, + "logps/chosen": -217.0959014892578, + "logps/rejected": -180.4376220703125, + "loss": 0.6515, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7411099076271057, + "rewards/margins": 0.1661885380744934, + "rewards/rejected": 0.5749213695526123, + "step": 5280 + }, + { + "epoch": 1.6660761387346956, + "grad_norm": 2.21875, + "learning_rate": 3.6467019524314905e-06, + "logits/chosen": -0.49627685546875, + "logits/rejected": -0.3914474844932556, + "logps/chosen": -189.37979125976562, + "logps/rejected": -160.41397094726562, + "loss": 0.6208, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.7144301533699036, + "rewards/margins": 0.19551965594291687, + "rewards/rejected": 0.5189104676246643, + "step": 5290 + }, + { + "epoch": 1.6692256210385419, + "grad_norm": 2.515625, + "learning_rate": 3.6420154762704685e-06, + "logits/chosen": -0.5125707983970642, + "logits/rejected": -0.3919418752193451, + "logps/chosen": -194.13682556152344, + "logps/rejected": -161.27906799316406, + "loss": 0.6436, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6802747249603271, + "rewards/margins": 0.1379631608724594, + "rewards/rejected": 0.5423115491867065, + "step": 5300 + }, + { + "epoch": 1.6723751033423881, + "grad_norm": 3.0625, + "learning_rate": 3.6373239232044445e-06, + "logits/chosen": -0.5417731404304504, + "logits/rejected": -0.3268618881702423, + "logps/chosen": -193.62232971191406, + "logps/rejected": -159.81134033203125, + "loss": 0.6213, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.6962756514549255, + "rewards/margins": 0.18256238102912903, + "rewards/rejected": 0.5137132406234741, + "step": 5310 + }, + { + "epoch": 1.6755245856462344, + "grad_norm": 2.359375, + "learning_rate": 3.632627314090026e-06, + "logits/chosen": -0.4504339098930359, + "logits/rejected": -0.38121432065963745, + "logps/chosen": -208.55001831054688, + "logps/rejected": -187.78811645507812, + "loss": 0.6453, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7359059453010559, + "rewards/margins": 0.14206233620643616, + "rewards/rejected": 0.5938436388969421, + "step": 5320 + }, + { + "epoch": 1.6786740679500807, + "grad_norm": 3.03125, + "learning_rate": 3.6279256698062986e-06, + "logits/chosen": -0.5013277530670166, + "logits/rejected": -0.33975881338119507, + "logps/chosen": -206.8636474609375, + "logps/rejected": -179.7592315673828, + "loss": 0.6196, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7417197227478027, + "rewards/margins": 0.20961704850196838, + "rewards/rejected": 0.5321027040481567, + "step": 5330 + }, + { + "epoch": 1.681823550253927, + "grad_norm": 2.984375, + "learning_rate": 3.6232190112547324e-06, + "logits/chosen": -0.5261915922164917, + "logits/rejected": -0.4154892861843109, + "logps/chosen": -196.72494506835938, + "logps/rejected": -167.34524536132812, + "loss": 0.6448, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6392444968223572, + "rewards/margins": 0.13428232073783875, + "rewards/rejected": 0.5049622058868408, + "step": 5340 + }, + { + "epoch": 1.6849730325577732, + "grad_norm": 2.28125, + "learning_rate": 3.6185073593590868e-06, + "logits/chosen": -0.5182952880859375, + "logits/rejected": -0.32755130529403687, + "logps/chosen": -193.91754150390625, + "logps/rejected": -147.9218292236328, + "loss": 0.6283, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6746249794960022, + "rewards/margins": 0.181734099984169, + "rewards/rejected": 0.4928908944129944, + "step": 5350 + }, + { + "epoch": 1.6881225148616195, + "grad_norm": 2.84375, + "learning_rate": 3.613790735065321e-06, + "logits/chosen": -0.397797167301178, + "logits/rejected": -0.34379732608795166, + "logps/chosen": -196.30372619628906, + "logps/rejected": -173.0957489013672, + "loss": 0.653, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7253559827804565, + "rewards/margins": 0.13321705162525177, + "rewards/rejected": 0.592138946056366, + "step": 5360 + }, + { + "epoch": 1.691271997165466, + "grad_norm": 2.46875, + "learning_rate": 3.6090691593414978e-06, + "logits/chosen": -0.49348416924476624, + "logits/rejected": -0.33678576350212097, + "logps/chosen": -201.0540313720703, + "logps/rejected": -170.12416076660156, + "loss": 0.611, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.7913371324539185, + "rewards/margins": 0.21138879656791687, + "rewards/rejected": 0.5799483060836792, + "step": 5370 + }, + { + "epoch": 1.6944214794693122, + "grad_norm": 2.546875, + "learning_rate": 3.604342653177695e-06, + "logits/chosen": -0.48641714453697205, + "logits/rejected": -0.40380674600601196, + "logps/chosen": -179.4705810546875, + "logps/rejected": -162.80154418945312, + "loss": 0.6501, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6522781252861023, + "rewards/margins": 0.11866960674524307, + "rewards/rejected": 0.5336084961891174, + "step": 5380 + }, + { + "epoch": 1.6975709617731587, + "grad_norm": 2.4375, + "learning_rate": 3.599611237585906e-06, + "logits/chosen": -0.45630472898483276, + "logits/rejected": -0.3485426902770996, + "logps/chosen": -171.63662719726562, + "logps/rejected": -156.93072509765625, + "loss": 0.6609, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.5976822972297668, + "rewards/margins": 0.1124814972281456, + "rewards/rejected": 0.48520079255104065, + "step": 5390 + }, + { + "epoch": 1.700720444077005, + "grad_norm": 2.640625, + "learning_rate": 3.5948749335999493e-06, + "logits/chosen": -0.5297515988349915, + "logits/rejected": -0.44055309891700745, + "logps/chosen": -192.01625061035156, + "logps/rejected": -184.41848754882812, + "loss": 0.6709, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6303820610046387, + "rewards/margins": 0.07556124031543732, + "rewards/rejected": 0.5548208951950073, + "step": 5400 + }, + { + "epoch": 1.7038699263808512, + "grad_norm": 2.890625, + "learning_rate": 3.590133762275378e-06, + "logits/chosen": -0.5689431428909302, + "logits/rejected": -0.3852362334728241, + "logps/chosen": -212.37899780273438, + "logps/rejected": -171.7864227294922, + "loss": 0.6305, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7726091742515564, + "rewards/margins": 0.1817570924758911, + "rewards/rejected": 0.5908521413803101, + "step": 5410 + }, + { + "epoch": 1.7070194086846975, + "grad_norm": 2.4375, + "learning_rate": 3.5853877446893802e-06, + "logits/chosen": -0.496450811624527, + "logits/rejected": -0.39372554421424866, + "logps/chosen": -183.16973876953125, + "logps/rejected": -166.51968383789062, + "loss": 0.6613, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6342931985855103, + "rewards/margins": 0.08918163925409317, + "rewards/rejected": 0.5451115369796753, + "step": 5420 + }, + { + "epoch": 1.7101688909885437, + "grad_norm": 2.578125, + "learning_rate": 3.5806369019406906e-06, + "logits/chosen": -0.5108271837234497, + "logits/rejected": -0.33603325486183167, + "logps/chosen": -210.8227996826172, + "logps/rejected": -167.770263671875, + "loss": 0.6393, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7098419666290283, + "rewards/margins": 0.17859862744808197, + "rewards/rejected": 0.5312432646751404, + "step": 5430 + }, + { + "epoch": 1.71331837329239, + "grad_norm": 2.90625, + "learning_rate": 3.5758812551494926e-06, + "logits/chosen": -0.4778234362602234, + "logits/rejected": -0.3700777590274811, + "logps/chosen": -218.7685546875, + "logps/rejected": -184.45120239257812, + "loss": 0.6412, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.798811674118042, + "rewards/margins": 0.14647333323955536, + "rewards/rejected": 0.6523382663726807, + "step": 5440 + }, + { + "epoch": 1.7164678555962363, + "grad_norm": 2.4375, + "learning_rate": 3.571120825457327e-06, + "logits/chosen": -0.4903620779514313, + "logits/rejected": -0.37475109100341797, + "logps/chosen": -196.15283203125, + "logps/rejected": -187.24850463867188, + "loss": 0.6469, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6965973377227783, + "rewards/margins": 0.13229303061962128, + "rewards/rejected": 0.5643042922019958, + "step": 5450 + }, + { + "epoch": 1.7196173379000825, + "grad_norm": 3.171875, + "learning_rate": 3.5663556340269984e-06, + "logits/chosen": -0.5170494318008423, + "logits/rejected": -0.3688809871673584, + "logps/chosen": -198.2430419921875, + "logps/rejected": -165.7610321044922, + "loss": 0.6394, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7186397314071655, + "rewards/margins": 0.14983563125133514, + "rewards/rejected": 0.568804144859314, + "step": 5460 + }, + { + "epoch": 1.722766820203929, + "grad_norm": 3.265625, + "learning_rate": 3.5615857020424786e-06, + "logits/chosen": -0.48056459426879883, + "logits/rejected": -0.42547035217285156, + "logps/chosen": -225.29281616210938, + "logps/rejected": -195.69827270507812, + "loss": 0.6488, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7470188140869141, + "rewards/margins": 0.1338168829679489, + "rewards/rejected": 0.613201916217804, + "step": 5470 + }, + { + "epoch": 1.7259163025077753, + "grad_norm": 3.21875, + "learning_rate": 3.5568110507088146e-06, + "logits/chosen": -0.42114323377609253, + "logits/rejected": -0.29093047976493835, + "logps/chosen": -188.66677856445312, + "logps/rejected": -163.27169799804688, + "loss": 0.6448, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6538271307945251, + "rewards/margins": 0.14478826522827148, + "rewards/rejected": 0.5090388059616089, + "step": 5480 + }, + { + "epoch": 1.7290657848116215, + "grad_norm": 2.75, + "learning_rate": 3.5520317012520327e-06, + "logits/chosen": -0.4472483694553375, + "logits/rejected": -0.3266296684741974, + "logps/chosen": -198.31227111816406, + "logps/rejected": -158.91233825683594, + "loss": 0.6354, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6903390884399414, + "rewards/margins": 0.15532377362251282, + "rewards/rejected": 0.5350152850151062, + "step": 5490 + }, + { + "epoch": 1.732215267115468, + "grad_norm": 1.9765625, + "learning_rate": 3.5472476749190465e-06, + "logits/chosen": -0.44366365671157837, + "logits/rejected": -0.3832937180995941, + "logps/chosen": -178.91236877441406, + "logps/rejected": -161.47250366210938, + "loss": 0.6579, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6258020401000977, + "rewards/margins": 0.1169646754860878, + "rewards/rejected": 0.5088373422622681, + "step": 5500 + }, + { + "epoch": 1.7353647494193143, + "grad_norm": 3.53125, + "learning_rate": 3.5424589929775593e-06, + "logits/chosen": -0.4584302306175232, + "logits/rejected": -0.3267679810523987, + "logps/chosen": -193.6768798828125, + "logps/rejected": -170.361572265625, + "loss": 0.6402, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7419657707214355, + "rewards/margins": 0.14626576006412506, + "rewards/rejected": 0.5956999063491821, + "step": 5510 + }, + { + "epoch": 1.7385142317231606, + "grad_norm": 2.859375, + "learning_rate": 3.5376656767159724e-06, + "logits/chosen": -0.42630109190940857, + "logits/rejected": -0.33150094747543335, + "logps/chosen": -188.36764526367188, + "logps/rejected": -165.72805786132812, + "loss": 0.6567, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.6277583837509155, + "rewards/margins": 0.11112833023071289, + "rewards/rejected": 0.5166300535202026, + "step": 5520 + }, + { + "epoch": 1.7416637140270068, + "grad_norm": 2.40625, + "learning_rate": 3.5328677474432893e-06, + "logits/chosen": -0.4710637629032135, + "logits/rejected": -0.36444777250289917, + "logps/chosen": -193.68409729003906, + "logps/rejected": -158.17857360839844, + "loss": 0.6465, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6602168083190918, + "rewards/margins": 0.13958847522735596, + "rewards/rejected": 0.5206283330917358, + "step": 5530 + }, + { + "epoch": 1.744813196330853, + "grad_norm": 2.640625, + "learning_rate": 3.5280652264890197e-06, + "logits/chosen": -0.5480708479881287, + "logits/rejected": -0.4107402265071869, + "logps/chosen": -198.30174255371094, + "logps/rejected": -155.7358856201172, + "loss": 0.6093, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7575246095657349, + "rewards/margins": 0.22439555823802948, + "rewards/rejected": 0.5331289768218994, + "step": 5540 + }, + { + "epoch": 1.7479626786346993, + "grad_norm": 2.375, + "learning_rate": 3.523258135203087e-06, + "logits/chosen": -0.49628472328186035, + "logits/rejected": -0.39903074502944946, + "logps/chosen": -206.2187957763672, + "logps/rejected": -172.8035888671875, + "loss": 0.6493, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7126896977424622, + "rewards/margins": 0.13072232902050018, + "rewards/rejected": 0.5819673538208008, + "step": 5550 + }, + { + "epoch": 1.7511121609385456, + "grad_norm": 3.015625, + "learning_rate": 3.518446494955732e-06, + "logits/chosen": -0.5156094431877136, + "logits/rejected": -0.38701295852661133, + "logps/chosen": -179.8013153076172, + "logps/rejected": -152.9771728515625, + "loss": 0.6506, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6560216546058655, + "rewards/margins": 0.12969034910202026, + "rewards/rejected": 0.52633136510849, + "step": 5560 + }, + { + "epoch": 1.7542616432423919, + "grad_norm": 2.65625, + "learning_rate": 3.5136303271374185e-06, + "logits/chosen": -0.46732720732688904, + "logits/rejected": -0.33687490224838257, + "logps/chosen": -198.79708862304688, + "logps/rejected": -176.80389404296875, + "loss": 0.6617, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7469144463539124, + "rewards/margins": 0.11790307611227036, + "rewards/rejected": 0.629011332988739, + "step": 5570 + }, + { + "epoch": 1.7574111255462384, + "grad_norm": 3.203125, + "learning_rate": 3.5088096531587377e-06, + "logits/chosen": -0.4696858823299408, + "logits/rejected": -0.378410279750824, + "logps/chosen": -198.86094665527344, + "logps/rejected": -171.12442016601562, + "loss": 0.6509, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6952942609786987, + "rewards/margins": 0.12442521750926971, + "rewards/rejected": 0.5708690285682678, + "step": 5580 + }, + { + "epoch": 1.7605606078500846, + "grad_norm": 2.265625, + "learning_rate": 3.5039844944503137e-06, + "logits/chosen": -0.44871068000793457, + "logits/rejected": -0.23819032311439514, + "logps/chosen": -208.89993286132812, + "logps/rejected": -163.7926025390625, + "loss": 0.6338, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7412741184234619, + "rewards/margins": 0.17887642979621887, + "rewards/rejected": 0.5623977780342102, + "step": 5590 + }, + { + "epoch": 1.763710090153931, + "grad_norm": 2.484375, + "learning_rate": 3.4991548724627054e-06, + "logits/chosen": -0.5238919258117676, + "logits/rejected": -0.3585182726383209, + "logps/chosen": -222.5647430419922, + "logps/rejected": -178.90841674804688, + "loss": 0.5866, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.7905155420303345, + "rewards/margins": 0.27365249395370483, + "rewards/rejected": 0.5168629884719849, + "step": 5600 + }, + { + "epoch": 1.7668595724577774, + "grad_norm": 2.46875, + "learning_rate": 3.4943208086663183e-06, + "logits/chosen": -0.4847659170627594, + "logits/rejected": -0.33793026208877563, + "logps/chosen": -197.34933471679688, + "logps/rejected": -174.9829559326172, + "loss": 0.646, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7014733552932739, + "rewards/margins": 0.12839707732200623, + "rewards/rejected": 0.5730762481689453, + "step": 5610 + }, + { + "epoch": 1.7700090547616236, + "grad_norm": 3.578125, + "learning_rate": 3.4894823245512986e-06, + "logits/chosen": -0.506749153137207, + "logits/rejected": -0.45556968450546265, + "logps/chosen": -197.71902465820312, + "logps/rejected": -186.50241088867188, + "loss": 0.6803, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.7117626070976257, + "rewards/margins": 0.06737571209669113, + "rewards/rejected": 0.644386887550354, + "step": 5620 + }, + { + "epoch": 1.7731585370654699, + "grad_norm": 2.578125, + "learning_rate": 3.484639441627448e-06, + "logits/chosen": -0.5070594549179077, + "logits/rejected": -0.3329693078994751, + "logps/chosen": -220.60986328125, + "logps/rejected": -183.98416137695312, + "loss": 0.6042, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.7875211834907532, + "rewards/margins": 0.2286391705274582, + "rewards/rejected": 0.5588821172714233, + "step": 5630 + }, + { + "epoch": 1.7763080193693161, + "grad_norm": 2.546875, + "learning_rate": 3.4797921814241196e-06, + "logits/chosen": -0.48938584327697754, + "logits/rejected": -0.37643399834632874, + "logps/chosen": -194.7692413330078, + "logps/rejected": -171.0836944580078, + "loss": 0.6345, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7208179235458374, + "rewards/margins": 0.17952939867973328, + "rewards/rejected": 0.5412884950637817, + "step": 5640 + }, + { + "epoch": 1.7794575016731624, + "grad_norm": 2.71875, + "learning_rate": 3.4749405654901297e-06, + "logits/chosen": -0.5021311044692993, + "logits/rejected": -0.3592470586299896, + "logps/chosen": -203.04798889160156, + "logps/rejected": -170.28916931152344, + "loss": 0.6468, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7304830551147461, + "rewards/margins": 0.14200101792812347, + "rewards/rejected": 0.5884820222854614, + "step": 5650 + }, + { + "epoch": 1.7826069839770087, + "grad_norm": 1.8125, + "learning_rate": 3.470084615393655e-06, + "logits/chosen": -0.5099314451217651, + "logits/rejected": -0.36777496337890625, + "logps/chosen": -188.96286010742188, + "logps/rejected": -158.13487243652344, + "loss": 0.5854, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.7638787031173706, + "rewards/margins": 0.25533777475357056, + "rewards/rejected": 0.5085408687591553, + "step": 5660 + }, + { + "epoch": 1.785756466280855, + "grad_norm": 2.71875, + "learning_rate": 3.4652243527221423e-06, + "logits/chosen": -0.4756031632423401, + "logits/rejected": -0.44920986890792847, + "logps/chosen": -185.1388397216797, + "logps/rejected": -172.55137634277344, + "loss": 0.6583, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.7094627618789673, + "rewards/margins": 0.13025884330272675, + "rewards/rejected": 0.5792039036750793, + "step": 5670 + }, + { + "epoch": 1.7889059485847014, + "grad_norm": 3.171875, + "learning_rate": 3.460359799082209e-06, + "logits/chosen": -0.47689515352249146, + "logits/rejected": -0.34241801500320435, + "logps/chosen": -204.8109588623047, + "logps/rejected": -166.13514709472656, + "loss": 0.615, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.7595565915107727, + "rewards/margins": 0.21238622069358826, + "rewards/rejected": 0.5471702814102173, + "step": 5680 + }, + { + "epoch": 1.7920554308885477, + "grad_norm": 3.765625, + "learning_rate": 3.4554909760995485e-06, + "logits/chosen": -0.5418170094490051, + "logits/rejected": -0.41362690925598145, + "logps/chosen": -187.98043823242188, + "logps/rejected": -167.5854034423828, + "loss": 0.6338, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7204712629318237, + "rewards/margins": 0.1737706959247589, + "rewards/rejected": 0.5467005968093872, + "step": 5690 + }, + { + "epoch": 1.795204913192394, + "grad_norm": 3.3125, + "learning_rate": 3.450617905418834e-06, + "logits/chosen": -0.442087322473526, + "logits/rejected": -0.3480719029903412, + "logps/chosen": -205.0787353515625, + "logps/rejected": -176.585693359375, + "loss": 0.6078, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7968889474868774, + "rewards/margins": 0.2236328423023224, + "rewards/rejected": 0.5732561349868774, + "step": 5700 + }, + { + "epoch": 1.7983543954962404, + "grad_norm": 3.125, + "learning_rate": 3.4457406087036233e-06, + "logits/chosen": -0.4669428765773773, + "logits/rejected": -0.379183828830719, + "logps/chosen": -183.84532165527344, + "logps/rejected": -169.44937133789062, + "loss": 0.6755, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6309347748756409, + "rewards/margins": 0.07157482206821442, + "rewards/rejected": 0.5593599081039429, + "step": 5710 + }, + { + "epoch": 1.8015038778000867, + "grad_norm": 2.984375, + "learning_rate": 3.4408591076362585e-06, + "logits/chosen": -0.5323187112808228, + "logits/rejected": -0.45780545473098755, + "logps/chosen": -205.9134521484375, + "logps/rejected": -180.65916442871094, + "loss": 0.6566, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7317408323287964, + "rewards/margins": 0.11702696233987808, + "rewards/rejected": 0.6147138476371765, + "step": 5720 + }, + { + "epoch": 1.804653360103933, + "grad_norm": 2.859375, + "learning_rate": 3.435973423917774e-06, + "logits/chosen": -0.48551005125045776, + "logits/rejected": -0.40477806329727173, + "logps/chosen": -195.50228881835938, + "logps/rejected": -173.91912841796875, + "loss": 0.6842, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.7036144137382507, + "rewards/margins": 0.06239970773458481, + "rewards/rejected": 0.6412147283554077, + "step": 5730 + }, + { + "epoch": 1.8078028424077792, + "grad_norm": 2.40625, + "learning_rate": 3.4310835792677995e-06, + "logits/chosen": -0.4431411623954773, + "logits/rejected": -0.3337770104408264, + "logps/chosen": -198.4442138671875, + "logps/rejected": -162.93258666992188, + "loss": 0.6348, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6712988018989563, + "rewards/margins": 0.1662341058254242, + "rewards/rejected": 0.5050647854804993, + "step": 5740 + }, + { + "epoch": 1.8109523247116255, + "grad_norm": 3.015625, + "learning_rate": 3.4261895954244613e-06, + "logits/chosen": -0.4226387143135071, + "logits/rejected": -0.3787776827812195, + "logps/chosen": -173.4969024658203, + "logps/rejected": -161.3011932373047, + "loss": 0.6435, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6374837160110474, + "rewards/margins": 0.13156263530254364, + "rewards/rejected": 0.5059210658073425, + "step": 5750 + }, + { + "epoch": 1.8141018070154717, + "grad_norm": 3.09375, + "learning_rate": 3.4212914941442866e-06, + "logits/chosen": -0.48183003067970276, + "logits/rejected": -0.3869970142841339, + "logps/chosen": -199.9102020263672, + "logps/rejected": -183.46273803710938, + "loss": 0.6739, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.7144922614097595, + "rewards/margins": 0.07754186540842056, + "rewards/rejected": 0.6369503736495972, + "step": 5760 + }, + { + "epoch": 1.817251289319318, + "grad_norm": 2.796875, + "learning_rate": 3.416389297202107e-06, + "logits/chosen": -0.435200035572052, + "logits/rejected": -0.273305743932724, + "logps/chosen": -200.13018798828125, + "logps/rejected": -172.42526245117188, + "loss": 0.6273, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7225381135940552, + "rewards/margins": 0.19170936942100525, + "rewards/rejected": 0.5308286547660828, + "step": 5770 + }, + { + "epoch": 1.8204007716231645, + "grad_norm": 3.203125, + "learning_rate": 3.4114830263909615e-06, + "logits/chosen": -0.488565593957901, + "logits/rejected": -0.3196925222873688, + "logps/chosen": -203.71237182617188, + "logps/rejected": -175.8201141357422, + "loss": 0.6425, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6835566759109497, + "rewards/margins": 0.13755542039871216, + "rewards/rejected": 0.5460013151168823, + "step": 5780 + }, + { + "epoch": 1.8235502539270108, + "grad_norm": 2.84375, + "learning_rate": 3.4065727035220013e-06, + "logits/chosen": -0.48802971839904785, + "logits/rejected": -0.401599645614624, + "logps/chosen": -203.4430694580078, + "logps/rejected": -178.24978637695312, + "loss": 0.6509, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.7013251185417175, + "rewards/margins": 0.12661480903625488, + "rewards/rejected": 0.5747103095054626, + "step": 5790 + }, + { + "epoch": 1.826699736230857, + "grad_norm": 3.15625, + "learning_rate": 3.4016583504243892e-06, + "logits/chosen": -0.39509814977645874, + "logits/rejected": -0.3049541115760803, + "logps/chosen": -193.34628295898438, + "logps/rejected": -168.88990783691406, + "loss": 0.6467, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6939215660095215, + "rewards/margins": 0.13865116238594055, + "rewards/rejected": 0.5552703738212585, + "step": 5800 + }, + { + "epoch": 1.8298492185347035, + "grad_norm": 2.609375, + "learning_rate": 3.3967399889452056e-06, + "logits/chosen": -0.5302572250366211, + "logits/rejected": -0.42114171385765076, + "logps/chosen": -187.310791015625, + "logps/rejected": -158.18551635742188, + "loss": 0.62, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.6706022024154663, + "rewards/margins": 0.18203167617321014, + "rewards/rejected": 0.48857051134109497, + "step": 5810 + }, + { + "epoch": 1.8329987008385498, + "grad_norm": 2.359375, + "learning_rate": 3.3918176409493498e-06, + "logits/chosen": -0.4302283227443695, + "logits/rejected": -0.3126838207244873, + "logps/chosen": -207.9413604736328, + "logps/rejected": -186.14862060546875, + "loss": 0.6106, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.8021620512008667, + "rewards/margins": 0.22595825791358948, + "rewards/rejected": 0.5762038826942444, + "step": 5820 + }, + { + "epoch": 1.836148183142396, + "grad_norm": 3.15625, + "learning_rate": 3.3868913283194445e-06, + "logits/chosen": -0.4245404303073883, + "logits/rejected": -0.3099447190761566, + "logps/chosen": -215.6573486328125, + "logps/rejected": -180.88473510742188, + "loss": 0.6243, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.8092790842056274, + "rewards/margins": 0.21156442165374756, + "rewards/rejected": 0.5977145433425903, + "step": 5830 + }, + { + "epoch": 1.8392976654462423, + "grad_norm": 2.203125, + "learning_rate": 3.381961072955737e-06, + "logits/chosen": -0.4956479072570801, + "logits/rejected": -0.4022194743156433, + "logps/chosen": -181.72386169433594, + "logps/rejected": -157.3038330078125, + "loss": 0.6444, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.6013648509979248, + "rewards/margins": 0.13577811419963837, + "rewards/rejected": 0.46558675169944763, + "step": 5840 + }, + { + "epoch": 1.8424471477500886, + "grad_norm": 2.828125, + "learning_rate": 3.3770268967760026e-06, + "logits/chosen": -0.4699929356575012, + "logits/rejected": -0.38960105180740356, + "logps/chosen": -190.84512329101562, + "logps/rejected": -165.31561279296875, + "loss": 0.6521, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7368890047073364, + "rewards/margins": 0.1246052160859108, + "rewards/rejected": 0.6122837662696838, + "step": 5850 + }, + { + "epoch": 1.8455966300539348, + "grad_norm": 3.640625, + "learning_rate": 3.372088821715446e-06, + "logits/chosen": -0.5164574384689331, + "logits/rejected": -0.40460482239723206, + "logps/chosen": -215.09130859375, + "logps/rejected": -181.18551635742188, + "loss": 0.6583, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7538167238235474, + "rewards/margins": 0.11776645481586456, + "rewards/rejected": 0.636050283908844, + "step": 5860 + }, + { + "epoch": 1.848746112357781, + "grad_norm": 2.65625, + "learning_rate": 3.3671468697266048e-06, + "logits/chosen": -0.486356645822525, + "logits/rejected": -0.45697417855262756, + "logps/chosen": -189.52955627441406, + "logps/rejected": -172.86190795898438, + "loss": 0.6822, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.6347873210906982, + "rewards/margins": 0.054320335388183594, + "rewards/rejected": 0.5804670453071594, + "step": 5870 + }, + { + "epoch": 1.8518955946616273, + "grad_norm": 3.375, + "learning_rate": 3.3622010627792513e-06, + "logits/chosen": -0.5492820143699646, + "logits/rejected": -0.38086193799972534, + "logps/chosen": -194.9511260986328, + "logps/rejected": -161.57528686523438, + "loss": 0.6699, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.6926398873329163, + "rewards/margins": 0.09596933424472809, + "rewards/rejected": 0.5966705083847046, + "step": 5880 + }, + { + "epoch": 1.8550450769654738, + "grad_norm": 2.84375, + "learning_rate": 3.3572514228602977e-06, + "logits/chosen": -0.4424726366996765, + "logits/rejected": -0.35579612851142883, + "logps/chosen": -196.1681671142578, + "logps/rejected": -165.40811157226562, + "loss": 0.6129, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7343538999557495, + "rewards/margins": 0.21051523089408875, + "rewards/rejected": 0.5238386392593384, + "step": 5890 + }, + { + "epoch": 1.85819455926932, + "grad_norm": 2.96875, + "learning_rate": 3.3522979719736923e-06, + "logits/chosen": -0.4300655722618103, + "logits/rejected": -0.23585304617881775, + "logps/chosen": -209.92355346679688, + "logps/rejected": -173.3553924560547, + "loss": 0.639, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7228736877441406, + "rewards/margins": 0.16360947489738464, + "rewards/rejected": 0.5592643022537231, + "step": 5900 + } + ], + "logging_steps": 10, + "max_steps": 15000, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}