{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982631930527722, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01068804275217101, "grad_norm": 47.923506570215594, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -1.0110366344451904, "logits/rejected": -0.9818881750106812, "logps/chosen": -0.27409863471984863, "logps/rejected": -0.27151164412498474, "loss": 3.0607, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -2.7409865856170654, "rewards/margins": -0.025869915261864662, "rewards/rejected": -2.715116500854492, "step": 5 }, { "epoch": 0.02137608550434202, "grad_norm": 39.987585891736785, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.0418651103973389, "logits/rejected": -0.9748126864433289, "logps/chosen": -0.2945522964000702, "logps/rejected": -0.29994362592697144, "loss": 3.0104, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.9455230236053467, "rewards/margins": 0.05391312763094902, "rewards/rejected": -2.999436378479004, "step": 10 }, { "epoch": 0.03206412825651302, "grad_norm": 52.07278122268582, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.963701069355011, "logits/rejected": -0.9835487604141235, "logps/chosen": -0.2644619345664978, "logps/rejected": -0.3007102608680725, "loss": 3.0162, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.6446194648742676, "rewards/margins": 0.362483412027359, "rewards/rejected": -3.0071024894714355, "step": 15 }, { "epoch": 0.04275217100868404, "grad_norm": 93.33861075914483, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.9671205282211304, "logits/rejected": -0.9406957626342773, "logps/chosen": -0.27761051058769226, "logps/rejected": -0.2907746732234955, "loss": 2.9342, "rewards/accuracies": 0.5, "rewards/chosen": -2.7761049270629883, "rewards/margins": 0.13164177536964417, "rewards/rejected": -2.9077467918395996, "step": 20 }, { "epoch": 0.053440213760855046, "grad_norm": 52.349708457694014, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.015834093093872, "logits/rejected": -0.9864752888679504, "logps/chosen": -0.2717323899269104, "logps/rejected": -0.27839282155036926, "loss": 3.1216, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.7173242568969727, "rewards/margins": 0.06660404056310654, "rewards/rejected": -2.783928394317627, "step": 25 }, { "epoch": 0.06412825651302605, "grad_norm": 45.104515251326376, "learning_rate": 6.382978723404255e-07, "logits/chosen": -0.9981824159622192, "logits/rejected": -0.9536676406860352, "logps/chosen": -0.2733208239078522, "logps/rejected": -0.2788906693458557, "loss": 2.9453, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -2.733208179473877, "rewards/margins": 0.055698495358228683, "rewards/rejected": -2.7889065742492676, "step": 30 }, { "epoch": 0.07481629926519706, "grad_norm": 61.54928932943931, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.051733136177063, "logits/rejected": -0.9763606190681458, "logps/chosen": -0.2938762605190277, "logps/rejected": -0.3207188844680786, "loss": 2.9156, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.938762664794922, "rewards/margins": 0.26842620968818665, "rewards/rejected": -3.2071890830993652, "step": 35 }, { "epoch": 0.08550434201736808, "grad_norm": 55.913783341396325, "learning_rate": 8.51063829787234e-07, "logits/chosen": -1.0160491466522217, "logits/rejected": -0.9717121124267578, "logps/chosen": -0.27992749214172363, "logps/rejected": -0.32374969124794006, "loss": 2.9079, "rewards/accuracies": 0.59375, "rewards/chosen": -2.7992749214172363, "rewards/margins": 0.43822187185287476, "rewards/rejected": -3.237496852874756, "step": 40 }, { "epoch": 0.09619238476953908, "grad_norm": 38.79733201252679, "learning_rate": 9.574468085106384e-07, "logits/chosen": -1.0506359338760376, "logits/rejected": -1.0073621273040771, "logps/chosen": -0.3326144218444824, "logps/rejected": -0.38409319519996643, "loss": 2.9658, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.3261444568634033, "rewards/margins": 0.5147874355316162, "rewards/rejected": -3.8409321308135986, "step": 45 }, { "epoch": 0.10688042752171009, "grad_norm": 101.77454221179983, "learning_rate": 9.998741174712533e-07, "logits/chosen": -1.028257131576538, "logits/rejected": -0.9783049821853638, "logps/chosen": -0.3342127203941345, "logps/rejected": -0.3756522536277771, "loss": 2.9987, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -3.342127561569214, "rewards/margins": 0.4143945574760437, "rewards/rejected": -3.7565224170684814, "step": 50 }, { "epoch": 0.11756847027388109, "grad_norm": 70.06029649060484, "learning_rate": 9.991050648838675e-07, "logits/chosen": -1.0614262819290161, "logits/rejected": -1.025525689125061, "logps/chosen": -0.2905944287776947, "logps/rejected": -0.35211512446403503, "loss": 2.7815, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.905944347381592, "rewards/margins": 0.6152070164680481, "rewards/rejected": -3.521151065826416, "step": 55 }, { "epoch": 0.1282565130260521, "grad_norm": 49.123079394299815, "learning_rate": 9.97637968732563e-07, "logits/chosen": -1.0964637994766235, "logits/rejected": -1.061679720878601, "logps/chosen": -0.3209289014339447, "logps/rejected": -0.3418692350387573, "loss": 2.8596, "rewards/accuracies": 0.5625, "rewards/chosen": -3.209289073944092, "rewards/margins": 0.2094031274318695, "rewards/rejected": -3.4186923503875732, "step": 60 }, { "epoch": 0.13894455577822312, "grad_norm": 53.59523574650431, "learning_rate": 9.954748808839674e-07, "logits/chosen": -1.0083563327789307, "logits/rejected": -0.9795120358467102, "logps/chosen": -0.3694208264350891, "logps/rejected": -0.4273703694343567, "loss": 2.7899, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -3.6942081451416016, "rewards/margins": 0.5794947743415833, "rewards/rejected": -4.273703098297119, "step": 65 }, { "epoch": 0.14963259853039412, "grad_norm": 39.11989937521066, "learning_rate": 9.926188266120295e-07, "logits/chosen": -1.02675461769104, "logits/rejected": -1.0018466711044312, "logps/chosen": -0.35180264711380005, "logps/rejected": -0.4284419119358063, "loss": 2.8671, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.518026828765869, "rewards/margins": 0.7663925290107727, "rewards/rejected": -4.284419059753418, "step": 70 }, { "epoch": 0.16032064128256512, "grad_norm": 51.11281867224414, "learning_rate": 9.890738003669027e-07, "logits/chosen": -0.9933602213859558, "logits/rejected": -0.9224111437797546, "logps/chosen": -0.3594875931739807, "logps/rejected": -0.40996867418289185, "loss": 2.7704, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.5948760509490967, "rewards/margins": 0.5048106908798218, "rewards/rejected": -4.099686622619629, "step": 75 }, { "epoch": 0.17100868403473615, "grad_norm": 48.022103189017436, "learning_rate": 9.848447601883433e-07, "logits/chosen": -0.9609634280204773, "logits/rejected": -0.9471040964126587, "logps/chosen": -0.35821908712387085, "logps/rejected": -0.45667845010757446, "loss": 2.6966, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.582190990447998, "rewards/margins": 0.984593391418457, "rewards/rejected": -4.566784858703613, "step": 80 }, { "epoch": 0.18169672678690715, "grad_norm": 54.03450562178558, "learning_rate": 9.799376207714444e-07, "logits/chosen": -0.9785356521606445, "logits/rejected": -0.9566847085952759, "logps/chosen": -0.3405897319316864, "logps/rejected": -0.4017128050327301, "loss": 2.6144, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.405897617340088, "rewards/margins": 0.6112309098243713, "rewards/rejected": -4.0171284675598145, "step": 85 }, { "epoch": 0.19238476953907815, "grad_norm": 62.750052897303675, "learning_rate": 9.743592451943998e-07, "logits/chosen": -1.0191900730133057, "logits/rejected": -0.9845901727676392, "logps/chosen": -0.4232923090457916, "logps/rejected": -0.5109944939613342, "loss": 2.866, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.23292350769043, "rewards/margins": 0.8770216107368469, "rewards/rejected": -5.109944820404053, "step": 90 }, { "epoch": 0.20307281229124916, "grad_norm": 55.61240306403997, "learning_rate": 9.681174353198686e-07, "logits/chosen": -1.1014890670776367, "logits/rejected": -1.0177241563796997, "logps/chosen": -0.4533822536468506, "logps/rejected": -0.4995104670524597, "loss": 2.7432, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.533822536468506, "rewards/margins": 0.4612821042537689, "rewards/rejected": -4.995104789733887, "step": 95 }, { "epoch": 0.21376085504342018, "grad_norm": 80.5027346612393, "learning_rate": 9.612209208833646e-07, "logits/chosen": -0.9957372546195984, "logits/rejected": -0.9701834917068481, "logps/chosen": -0.43816161155700684, "logps/rejected": -0.5128804445266724, "loss": 2.7813, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.381616592407227, "rewards/margins": 0.7471875548362732, "rewards/rejected": -5.128803253173828, "step": 100 }, { "epoch": 0.22444889779559118, "grad_norm": 66.31806821536476, "learning_rate": 9.536793472839324e-07, "logits/chosen": -0.9997787475585938, "logits/rejected": -0.947482705116272, "logps/chosen": -0.4254922866821289, "logps/rejected": -0.5347083806991577, "loss": 2.7046, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.254923343658447, "rewards/margins": 1.092160701751709, "rewards/rejected": -5.347084045410156, "step": 105 }, { "epoch": 0.23513694054776219, "grad_norm": 61.1266120827584, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.9583929181098938, "logits/rejected": -0.8993922472000122, "logps/chosen": -0.4909549355506897, "logps/rejected": -0.620493471622467, "loss": 2.6559, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.909549713134766, "rewards/margins": 1.2953848838806152, "rewards/rejected": -6.204934120178223, "step": 110 }, { "epoch": 0.2458249832999332, "grad_norm": 66.56145340935555, "learning_rate": 9.367041003085648e-07, "logits/chosen": -1.019431471824646, "logits/rejected": -0.9595627784729004, "logps/chosen": -0.5270282030105591, "logps/rejected": -0.600238025188446, "loss": 2.4928, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -5.27028226852417, "rewards/margins": 0.7320979833602905, "rewards/rejected": -6.00238037109375, "step": 115 }, { "epoch": 0.2565130260521042, "grad_norm": 66.16205862286387, "learning_rate": 9.272941683504808e-07, "logits/chosen": -0.9745362997055054, "logits/rejected": -0.8843653798103333, "logps/chosen": -0.5472803115844727, "logps/rejected": -0.7492850422859192, "loss": 2.3982, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.472803115844727, "rewards/margins": 2.020047187805176, "rewards/rejected": -7.492850303649902, "step": 120 }, { "epoch": 0.26720106880427524, "grad_norm": 70.88843943146098, "learning_rate": 9.172866268606513e-07, "logits/chosen": -1.0500959157943726, "logits/rejected": -1.007611632347107, "logps/chosen": -0.6212247610092163, "logps/rejected": -0.7247714996337891, "loss": 2.3233, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -6.212247371673584, "rewards/margins": 1.0354671478271484, "rewards/rejected": -7.247714042663574, "step": 125 }, { "epoch": 0.27788911155644624, "grad_norm": 94.40161191780366, "learning_rate": 9.066954722907638e-07, "logits/chosen": -1.0675666332244873, "logits/rejected": -1.0614221096038818, "logps/chosen": -0.6142371892929077, "logps/rejected": -0.8813148736953735, "loss": 2.1102, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.1423726081848145, "rewards/margins": 2.670776844024658, "rewards/rejected": -8.813148498535156, "step": 130 }, { "epoch": 0.28857715430861725, "grad_norm": 71.42739738901432, "learning_rate": 8.955355173281707e-07, "logits/chosen": -1.0529481172561646, "logits/rejected": -1.0047996044158936, "logps/chosen": -0.7235802412033081, "logps/rejected": -0.8823626637458801, "loss": 2.1377, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -7.235803127288818, "rewards/margins": 1.5878244638442993, "rewards/rejected": -8.823626518249512, "step": 135 }, { "epoch": 0.29926519706078825, "grad_norm": 87.9759333714625, "learning_rate": 8.838223701790055e-07, "logits/chosen": -1.1124293804168701, "logits/rejected": -1.0896517038345337, "logps/chosen": -0.862978458404541, "logps/rejected": -1.0037717819213867, "loss": 2.1017, "rewards/accuracies": 0.75, "rewards/chosen": -8.62978458404541, "rewards/margins": 1.4079326391220093, "rewards/rejected": -10.037717819213867, "step": 140 }, { "epoch": 0.30995323981295925, "grad_norm": 78.07225371686874, "learning_rate": 8.71572412738697e-07, "logits/chosen": -1.030829906463623, "logits/rejected": -1.0042556524276733, "logps/chosen": -0.8588500022888184, "logps/rejected": -1.1039783954620361, "loss": 2.0002, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.588499069213867, "rewards/margins": 2.4512839317321777, "rewards/rejected": -11.039785385131836, "step": 145 }, { "epoch": 0.32064128256513025, "grad_norm": 70.30730129459549, "learning_rate": 8.588027776804058e-07, "logits/chosen": -1.060490369796753, "logits/rejected": -1.0404036045074463, "logps/chosen": -0.9423840641975403, "logps/rejected": -1.1874125003814697, "loss": 1.9455, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.42384147644043, "rewards/margins": 2.4502837657928467, "rewards/rejected": -11.874125480651855, "step": 150 }, { "epoch": 0.33132932531730125, "grad_norm": 81.54625041986957, "learning_rate": 8.455313244934324e-07, "logits/chosen": -1.0910407304763794, "logits/rejected": -1.0684020519256592, "logps/chosen": -0.9991434812545776, "logps/rejected": -1.3156726360321045, "loss": 2.0451, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.991434097290039, "rewards/margins": 3.165290355682373, "rewards/rejected": -13.15672492980957, "step": 155 }, { "epoch": 0.3420173680694723, "grad_norm": 78.5490421908409, "learning_rate": 8.317766145051057e-07, "logits/chosen": -1.109403371810913, "logits/rejected": -1.090001106262207, "logps/chosen": -1.1215949058532715, "logps/rejected": -1.5121821165084839, "loss": 1.9436, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -11.215949058532715, "rewards/margins": 3.9058711528778076, "rewards/rejected": -15.121821403503418, "step": 160 }, { "epoch": 0.3527054108216433, "grad_norm": 59.08371857927558, "learning_rate": 8.175578849210894e-07, "logits/chosen": -1.1232795715332031, "logits/rejected": -1.0980435609817505, "logps/chosen": -1.0903780460357666, "logps/rejected": -1.459205150604248, "loss": 1.8384, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -10.903780937194824, "rewards/margins": 3.688269853591919, "rewards/rejected": -14.59205150604248, "step": 165 }, { "epoch": 0.3633934535738143, "grad_norm": 85.71218468828272, "learning_rate": 8.028950219204099e-07, "logits/chosen": -1.1307361125946045, "logits/rejected": -1.1074953079223633, "logps/chosen": -1.0654685497283936, "logps/rejected": -1.4472792148590088, "loss": 1.7884, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -10.654685020446777, "rewards/margins": 3.818106174468994, "rewards/rejected": -14.47279167175293, "step": 170 }, { "epoch": 0.3740814963259853, "grad_norm": 92.85957749639208, "learning_rate": 7.878085328428368e-07, "logits/chosen": -1.1518357992172241, "logits/rejected": -1.102372407913208, "logps/chosen": -1.1460392475128174, "logps/rejected": -1.4155685901641846, "loss": 1.6771, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -11.460393905639648, "rewards/margins": 2.695291757583618, "rewards/rejected": -14.155685424804688, "step": 175 }, { "epoch": 0.3847695390781563, "grad_norm": 75.98858315922392, "learning_rate": 7.723195175075135e-07, "logits/chosen": -1.0996112823486328, "logits/rejected": -1.0788969993591309, "logps/chosen": -1.1098445653915405, "logps/rejected": -1.476881504058838, "loss": 1.6011, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -11.098443984985352, "rewards/margins": 3.6703686714172363, "rewards/rejected": -14.76881217956543, "step": 180 }, { "epoch": 0.3954575818303273, "grad_norm": 88.81502196023631, "learning_rate": 7.564496387029531e-07, "logits/chosen": -1.1378796100616455, "logits/rejected": -1.0828906297683716, "logps/chosen": -1.1474685668945312, "logps/rejected": -1.5796287059783936, "loss": 1.6663, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -11.474684715270996, "rewards/margins": 4.321602821350098, "rewards/rejected": -15.796287536621094, "step": 185 }, { "epoch": 0.4061456245824983, "grad_norm": 90.03714203036446, "learning_rate": 7.402210918896689e-07, "logits/chosen": -1.1278326511383057, "logits/rejected": -1.1358839273452759, "logps/chosen": -1.2729408740997314, "logps/rejected": -1.7558482885360718, "loss": 1.5442, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -12.729410171508789, "rewards/margins": 4.829073905944824, "rewards/rejected": -17.558483123779297, "step": 190 }, { "epoch": 0.4168336673346693, "grad_norm": 72.89600233357321, "learning_rate": 7.236565741578162e-07, "logits/chosen": -1.0958189964294434, "logits/rejected": -1.076554775238037, "logps/chosen": -1.2896816730499268, "logps/rejected": -1.6636635065078735, "loss": 1.6021, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -12.896817207336426, "rewards/margins": 3.7398200035095215, "rewards/rejected": -16.636634826660156, "step": 195 }, { "epoch": 0.42752171008684037, "grad_norm": 93.9340667463585, "learning_rate": 7.067792524832603e-07, "logits/chosen": -1.0816549062728882, "logits/rejected": -1.0706536769866943, "logps/chosen": -1.3197344541549683, "logps/rejected": -1.7450058460235596, "loss": 1.5092, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -13.197346687316895, "rewards/margins": 4.252710819244385, "rewards/rejected": -17.450056076049805, "step": 200 }, { "epoch": 0.43820975283901137, "grad_norm": 91.15403821743105, "learning_rate": 6.896127313264642e-07, "logits/chosen": -1.1295057535171509, "logits/rejected": -1.0786478519439697, "logps/chosen": -1.3944091796875, "logps/rejected": -1.8417927026748657, "loss": 1.7223, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -13.944093704223633, "rewards/margins": 4.473834037780762, "rewards/rejected": -18.417926788330078, "step": 205 }, { "epoch": 0.44889779559118237, "grad_norm": 99.88773415242756, "learning_rate": 6.721810196195174e-07, "logits/chosen": -1.1591789722442627, "logits/rejected": -1.147062063217163, "logps/chosen": -1.3990533351898193, "logps/rejected": -1.8112404346466064, "loss": 1.6082, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -13.990533828735352, "rewards/margins": 4.121870040893555, "rewards/rejected": -18.112403869628906, "step": 210 }, { "epoch": 0.45958583834335337, "grad_norm": 108.24791172325133, "learning_rate": 6.545084971874736e-07, "logits/chosen": -1.1198530197143555, "logits/rejected": -1.101109504699707, "logps/chosen": -1.390649437904358, "logps/rejected": -1.8630450963974, "loss": 1.4791, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -13.906494140625, "rewards/margins": 4.7239580154418945, "rewards/rejected": -18.630451202392578, "step": 215 }, { "epoch": 0.47027388109552437, "grad_norm": 95.83911690989143, "learning_rate": 6.3661988065096e-07, "logits/chosen": -1.1780140399932861, "logits/rejected": -1.1579878330230713, "logps/chosen": -1.4568861722946167, "logps/rejected": -1.9470503330230713, "loss": 1.4586, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -14.56886100769043, "rewards/margins": 4.901640892028809, "rewards/rejected": -19.470502853393555, "step": 220 }, { "epoch": 0.48096192384769537, "grad_norm": 69.10471107204022, "learning_rate": 6.185401888577487e-07, "logits/chosen": -1.1474467515945435, "logits/rejected": -1.1124647855758667, "logps/chosen": -1.48002028465271, "logps/rejected": -1.9400886297225952, "loss": 1.4409, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -14.800203323364258, "rewards/margins": 4.600685119628906, "rewards/rejected": -19.400888442993164, "step": 225 }, { "epoch": 0.4916499665998664, "grad_norm": 80.62484140193865, "learning_rate": 6.002947078916364e-07, "logits/chosen": -1.2124546766281128, "logits/rejected": -1.161115050315857, "logps/chosen": -1.4423153400421143, "logps/rejected": -1.9036369323730469, "loss": 1.3817, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -14.423154830932617, "rewards/margins": 4.613214015960693, "rewards/rejected": -19.03636932373047, "step": 230 }, { "epoch": 0.5023380093520374, "grad_norm": 84.05912123531321, "learning_rate": 5.819089557075688e-07, "logits/chosen": -1.2311934232711792, "logits/rejected": -1.2020883560180664, "logps/chosen": -1.4844694137573242, "logps/rejected": -1.9821853637695312, "loss": 1.4172, "rewards/accuracies": 0.78125, "rewards/chosen": -14.844694137573242, "rewards/margins": 4.97715950012207, "rewards/rejected": -19.821855545043945, "step": 235 }, { "epoch": 0.5130260521042084, "grad_norm": 84.00316536161533, "learning_rate": 5.634086464424742e-07, "logits/chosen": -1.198540449142456, "logits/rejected": -1.1992590427398682, "logps/chosen": -1.3957428932189941, "logps/rejected": -1.8944737911224365, "loss": 1.4343, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -13.957429885864258, "rewards/margins": 4.987309455871582, "rewards/rejected": -18.944738388061523, "step": 240 }, { "epoch": 0.5237140948563794, "grad_norm": 137.49078119090206, "learning_rate": 5.448196544517167e-07, "logits/chosen": -1.2955886125564575, "logits/rejected": -1.23685622215271, "logps/chosen": -1.460442066192627, "logps/rejected": -2.0612359046936035, "loss": 1.3532, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -14.60442066192627, "rewards/margins": 6.007939338684082, "rewards/rejected": -20.612361907958984, "step": 245 }, { "epoch": 0.5344021376085505, "grad_norm": 129.54289500612722, "learning_rate": 5.26167978121472e-07, "logits/chosen": -1.2238231897354126, "logits/rejected": -1.2080833911895752, "logps/chosen": -1.5243932008743286, "logps/rejected": -2.1077561378479004, "loss": 1.3459, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.243929862976074, "rewards/margins": 5.833629608154297, "rewards/rejected": -21.077558517456055, "step": 250 }, { "epoch": 0.5450901803607214, "grad_norm": 102.89768684384153, "learning_rate": 5.074797035076318e-07, "logits/chosen": -1.267345905303955, "logits/rejected": -1.2413192987442017, "logps/chosen": -1.6330616474151611, "logps/rejected": -2.104926824569702, "loss": 1.4434, "rewards/accuracies": 0.8125, "rewards/chosen": -16.330615997314453, "rewards/margins": 4.718654155731201, "rewards/rejected": -21.049266815185547, "step": 255 }, { "epoch": 0.5557782231128925, "grad_norm": 89.20095630673174, "learning_rate": 4.887809678520975e-07, "logits/chosen": -1.245793104171753, "logits/rejected": -1.214970350265503, "logps/chosen": -1.554158091545105, "logps/rejected": -2.0427088737487793, "loss": 1.4276, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -15.541582107543945, "rewards/margins": 4.8855085372924805, "rewards/rejected": -20.42708969116211, "step": 260 }, { "epoch": 0.5664662658650634, "grad_norm": 76.055827552827, "learning_rate": 4.700979230274829e-07, "logits/chosen": -1.2102077007293701, "logits/rejected": -1.1913068294525146, "logps/chosen": -1.6448841094970703, "logps/rejected": -2.155822277069092, "loss": 1.3609, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -16.448841094970703, "rewards/margins": 5.109385967254639, "rewards/rejected": -21.5582275390625, "step": 265 }, { "epoch": 0.5771543086172345, "grad_norm": 129.8909118017969, "learning_rate": 4.514566989613559e-07, "logits/chosen": -1.2157796621322632, "logits/rejected": -1.186073899269104, "logps/chosen": -1.4407769441604614, "logps/rejected": -1.9774402379989624, "loss": 1.2996, "rewards/accuracies": 0.84375, "rewards/chosen": -14.407770156860352, "rewards/margins": 5.366633415222168, "rewards/rejected": -19.774402618408203, "step": 270 }, { "epoch": 0.5878423513694054, "grad_norm": 74.47995587471961, "learning_rate": 4.328833670911724e-07, "logits/chosen": -1.177504301071167, "logits/rejected": -1.1408427953720093, "logps/chosen": -1.4323005676269531, "logps/rejected": -1.8869625329971313, "loss": 1.4405, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -14.323003768920898, "rewards/margins": 4.546619892120361, "rewards/rejected": -18.869625091552734, "step": 275 }, { "epoch": 0.5985303941215765, "grad_norm": 96.7891504750656, "learning_rate": 4.144039039010124e-07, "logits/chosen": -1.262804627418518, "logits/rejected": -1.2378443479537964, "logps/chosen": -1.4722181558609009, "logps/rejected": -2.023758888244629, "loss": 1.3202, "rewards/accuracies": 0.8125, "rewards/chosen": -14.72218132019043, "rewards/margins": 5.515408515930176, "rewards/rejected": -20.23758888244629, "step": 280 }, { "epoch": 0.6092184368737475, "grad_norm": 98.51578082175142, "learning_rate": 3.960441545911204e-07, "logits/chosen": -1.2408018112182617, "logits/rejected": -1.2075875997543335, "logps/chosen": -1.5188751220703125, "logps/rejected": -2.0878236293792725, "loss": 1.0977, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -15.188751220703125, "rewards/margins": 5.689483642578125, "rewards/rejected": -20.878236770629883, "step": 285 }, { "epoch": 0.6199064796259185, "grad_norm": 77.81254701258105, "learning_rate": 3.778297969310529e-07, "logits/chosen": -1.2707680463790894, "logits/rejected": -1.2261282205581665, "logps/chosen": -1.5314843654632568, "logps/rejected": -2.0101191997528076, "loss": 1.3577, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -15.314845085144043, "rewards/margins": 4.786349296569824, "rewards/rejected": -20.101192474365234, "step": 290 }, { "epoch": 0.6305945223780896, "grad_norm": 87.64994632483507, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -1.199864387512207, "logits/rejected": -1.1842243671417236, "logps/chosen": -1.545689344406128, "logps/rejected": -2.0575714111328125, "loss": 1.1919, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -15.456893920898438, "rewards/margins": 5.118819713592529, "rewards/rejected": -20.575714111328125, "step": 295 }, { "epoch": 0.6412825651302605, "grad_norm": 84.61392252215398, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -1.1328258514404297, "logits/rejected": -1.1063092947006226, "logps/chosen": -1.631317138671875, "logps/rejected": -2.079132556915283, "loss": 1.6288, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -16.31317138671875, "rewards/margins": 4.478152275085449, "rewards/rejected": -20.791322708129883, "step": 300 }, { "epoch": 0.6519706078824316, "grad_norm": 84.46013666927763, "learning_rate": 3.243125879593286e-07, "logits/chosen": -1.2454413175582886, "logits/rejected": -1.1997601985931396, "logps/chosen": -1.6037687063217163, "logps/rejected": -2.0645315647125244, "loss": 1.3155, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -16.03768539428711, "rewards/margins": 4.6076273918151855, "rewards/rejected": -20.645313262939453, "step": 305 }, { "epoch": 0.6626586506346025, "grad_norm": 91.59670184758677, "learning_rate": 3.069319753571269e-07, "logits/chosen": -1.2738150358200073, "logits/rejected": -1.253278136253357, "logps/chosen": -1.6317838430404663, "logps/rejected": -2.138291835784912, "loss": 1.372, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -16.31783676147461, "rewards/margins": 5.065082550048828, "rewards/rejected": -21.382923126220703, "step": 310 }, { "epoch": 0.6733466933867736, "grad_norm": 87.02614481244046, "learning_rate": 2.898213858452173e-07, "logits/chosen": -1.284517526626587, "logits/rejected": -1.2254732847213745, "logps/chosen": -1.618208885192871, "logps/rejected": -2.1373062133789062, "loss": 1.3415, "rewards/accuracies": 0.84375, "rewards/chosen": -16.182092666625977, "rewards/margins": 5.190975189208984, "rewards/rejected": -21.373065948486328, "step": 315 }, { "epoch": 0.6840347361389446, "grad_norm": 104.27219685818618, "learning_rate": 2.730047501302266e-07, "logits/chosen": -1.2704033851623535, "logits/rejected": -1.2657862901687622, "logps/chosen": -1.6442874670028687, "logps/rejected": -2.2715744972229004, "loss": 1.2614, "rewards/accuracies": 0.875, "rewards/chosen": -16.4428768157959, "rewards/margins": 6.2728681564331055, "rewards/rejected": -22.715742111206055, "step": 320 }, { "epoch": 0.6947227788911156, "grad_norm": 78.77445808060149, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -1.2807691097259521, "logits/rejected": -1.2303869724273682, "logps/chosen": -1.6990268230438232, "logps/rejected": -2.368220329284668, "loss": 1.3078, "rewards/accuracies": 0.84375, "rewards/chosen": -16.99026870727539, "rewards/margins": 6.6919355392456055, "rewards/rejected": -23.682205200195312, "step": 325 }, { "epoch": 0.7054108216432866, "grad_norm": 70.04351714156043, "learning_rate": 2.403469744184154e-07, "logits/chosen": -1.1783530712127686, "logits/rejected": -1.136584758758545, "logps/chosen": -1.6521613597869873, "logps/rejected": -2.1305041313171387, "loss": 1.3592, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.5216121673584, "rewards/margins": 4.783430099487305, "rewards/rejected": -21.305042266845703, "step": 330 }, { "epoch": 0.7160988643954576, "grad_norm": 75.03379354143011, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -1.2156535387039185, "logits/rejected": -1.1975212097167969, "logps/chosen": -1.6360639333724976, "logps/rejected": -2.187391757965088, "loss": 1.1952, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.360637664794922, "rewards/margins": 5.513278484344482, "rewards/rejected": -21.873918533325195, "step": 335 }, { "epoch": 0.7267869071476286, "grad_norm": 103.30210442360509, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -1.2379086017608643, "logits/rejected": -1.2029554843902588, "logps/chosen": -1.5814708471298218, "logps/rejected": -2.1416497230529785, "loss": 1.3219, "rewards/accuracies": 0.84375, "rewards/chosen": -15.814706802368164, "rewards/margins": 5.601790428161621, "rewards/rejected": -21.41649627685547, "step": 340 }, { "epoch": 0.7374749498997996, "grad_norm": 87.41940209533863, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -1.254070520401001, "logits/rejected": -1.2306808233261108, "logps/chosen": -1.665123701095581, "logps/rejected": -2.303457021713257, "loss": 1.3788, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -16.65123748779297, "rewards/margins": 6.3833327293396, "rewards/rejected": -23.034570693969727, "step": 345 }, { "epoch": 0.7481629926519706, "grad_norm": 147.74903637059256, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -1.2324841022491455, "logits/rejected": -1.2356057167053223, "logps/chosen": -1.5469470024108887, "logps/rejected": -2.0821375846862793, "loss": 1.2883, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -15.469470024108887, "rewards/margins": 5.351906776428223, "rewards/rejected": -20.82137680053711, "step": 350 }, { "epoch": 0.7588510354041417, "grad_norm": 66.59688038674247, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -1.1590429544448853, "logits/rejected": -1.1739274263381958, "logps/chosen": -1.5381479263305664, "logps/rejected": -2.0882415771484375, "loss": 1.181, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.381479263305664, "rewards/margins": 5.500934600830078, "rewards/rejected": -20.882413864135742, "step": 355 }, { "epoch": 0.7695390781563126, "grad_norm": 79.30848988409956, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -1.2186603546142578, "logits/rejected": -1.2177612781524658, "logps/chosen": -1.5483216047286987, "logps/rejected": -2.190535306930542, "loss": 1.3007, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -15.48321533203125, "rewards/margins": 6.422137260437012, "rewards/rejected": -21.905353546142578, "step": 360 }, { "epoch": 0.7802271209084837, "grad_norm": 71.1995833686848, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -1.2522964477539062, "logits/rejected": -1.1880736351013184, "logps/chosen": -1.599200963973999, "logps/rejected": -2.2274394035339355, "loss": 1.294, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -15.992010116577148, "rewards/margins": 6.28238582611084, "rewards/rejected": -22.274394989013672, "step": 365 }, { "epoch": 0.7909151636606546, "grad_norm": 97.88071644925103, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -1.1726973056793213, "logits/rejected": -1.1615909337997437, "logps/chosen": -1.5849040746688843, "logps/rejected": -2.063690662384033, "loss": 1.2653, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.849041938781738, "rewards/margins": 4.787867546081543, "rewards/rejected": -20.63690948486328, "step": 370 }, { "epoch": 0.8016032064128257, "grad_norm": 93.22521656476633, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -1.2195771932601929, "logits/rejected": -1.223771095275879, "logps/chosen": -1.7087455987930298, "logps/rejected": -2.2848830223083496, "loss": 1.2583, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -17.087453842163086, "rewards/margins": 5.76137638092041, "rewards/rejected": -22.848833084106445, "step": 375 }, { "epoch": 0.8122912491649966, "grad_norm": 122.65282224004734, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -1.2245140075683594, "logits/rejected": -1.2064614295959473, "logps/chosen": -1.5752016305923462, "logps/rejected": -2.1021199226379395, "loss": 1.4127, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.752016067504883, "rewards/margins": 5.2691850662231445, "rewards/rejected": -21.021198272705078, "step": 380 }, { "epoch": 0.8229792919171677, "grad_norm": 79.05677313510314, "learning_rate": 9.114030716778432e-08, "logits/chosen": -1.2155396938323975, "logits/rejected": -1.194136142730713, "logps/chosen": -1.5979677438735962, "logps/rejected": -2.291325330734253, "loss": 1.0803, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -15.9796781539917, "rewards/margins": 6.9335784912109375, "rewards/rejected": -22.91325569152832, "step": 385 }, { "epoch": 0.8336673346693386, "grad_norm": 83.81957692142457, "learning_rate": 8.066471602728803e-08, "logits/chosen": -1.223331093788147, "logits/rejected": -1.209160327911377, "logps/chosen": -1.668593406677246, "logps/rejected": -2.259793519973755, "loss": 1.2346, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -16.68593406677246, "rewards/margins": 5.911999702453613, "rewards/rejected": -22.59793472290039, "step": 390 }, { "epoch": 0.8443553774215097, "grad_norm": 76.36054598990746, "learning_rate": 7.077560319906694e-08, "logits/chosen": -1.2313239574432373, "logits/rejected": -1.211395502090454, "logps/chosen": -1.5763094425201416, "logps/rejected": -2.1317121982574463, "loss": 1.2849, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -15.763093948364258, "rewards/margins": 5.554028511047363, "rewards/rejected": -21.317119598388672, "step": 395 }, { "epoch": 0.8550434201736807, "grad_norm": 62.72495111163961, "learning_rate": 6.148679950161672e-08, "logits/chosen": -1.2403868436813354, "logits/rejected": -1.2243949174880981, "logps/chosen": -1.6198228597640991, "logps/rejected": -2.1331706047058105, "loss": 1.2445, "rewards/accuracies": 0.8125, "rewards/chosen": -16.19822883605957, "rewards/margins": 5.133477687835693, "rewards/rejected": -21.331707000732422, "step": 400 }, { "epoch": 0.8550434201736807, "eval_logits/chosen": -1.4456316232681274, "eval_logits/rejected": -1.4547516107559204, "eval_logps/chosen": -1.623605728149414, "eval_logps/rejected": -2.176786422729492, "eval_loss": 1.3307912349700928, "eval_rewards/accuracies": 0.8353658318519592, "eval_rewards/chosen": -16.23605728149414, "eval_rewards/margins": 5.531808376312256, "eval_rewards/rejected": -21.767864227294922, "eval_runtime": 94.8719, "eval_samples_per_second": 20.67, "eval_steps_per_second": 1.296, "step": 400 }, { "epoch": 0.8657314629258517, "grad_norm": 99.36460974571031, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -1.2008370161056519, "logits/rejected": -1.2194417715072632, "logps/chosen": -1.7075388431549072, "logps/rejected": -2.2549824714660645, "loss": 1.2048, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -17.075387954711914, "rewards/margins": 5.474437713623047, "rewards/rejected": -22.54982566833496, "step": 405 }, { "epoch": 0.8764195056780227, "grad_norm": 138.34780783301264, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -1.216778039932251, "logits/rejected": -1.2035914659500122, "logps/chosen": -1.6350791454315186, "logps/rejected": -2.172778844833374, "loss": 1.3538, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -16.350793838500977, "rewards/margins": 5.376997947692871, "rewards/rejected": -21.727787017822266, "step": 410 }, { "epoch": 0.8871075484301937, "grad_norm": 72.98512679113071, "learning_rate": 3.734784976300165e-08, "logits/chosen": -1.2200865745544434, "logits/rejected": -1.1671762466430664, "logps/chosen": -1.5793800354003906, "logps/rejected": -2.232057571411133, "loss": 1.3478, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.793802261352539, "rewards/margins": 6.5267744064331055, "rewards/rejected": -22.32057762145996, "step": 415 }, { "epoch": 0.8977955911823647, "grad_norm": 98.07615613251582, "learning_rate": 3.058153372200695e-08, "logits/chosen": -1.2454715967178345, "logits/rejected": -1.195953607559204, "logps/chosen": -1.5281785726547241, "logps/rejected": -2.1256656646728516, "loss": 1.2056, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.28178596496582, "rewards/margins": 5.974873065948486, "rewards/rejected": -21.25665855407715, "step": 420 }, { "epoch": 0.9084836339345357, "grad_norm": 106.16711447135498, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -1.2416163682937622, "logits/rejected": -1.231783390045166, "logps/chosen": -1.7100231647491455, "logps/rejected": -2.2363858222961426, "loss": 1.4487, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.100229263305664, "rewards/margins": 5.263625621795654, "rewards/rejected": -22.36385726928711, "step": 425 }, { "epoch": 0.9191716766867067, "grad_norm": 104.40358802274892, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -1.2276403903961182, "logits/rejected": -1.211700439453125, "logps/chosen": -1.6992714405059814, "logps/rejected": -2.2929625511169434, "loss": 1.2603, "rewards/accuracies": 0.875, "rewards/chosen": -16.992717742919922, "rewards/margins": 5.9369096755981445, "rewards/rejected": -22.92962646484375, "step": 430 }, { "epoch": 0.9298597194388778, "grad_norm": 76.71410236428167, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -1.1829754114151, "logits/rejected": -1.1444637775421143, "logps/chosen": -1.5611233711242676, "logps/rejected": -2.1490211486816406, "loss": 1.2066, "rewards/accuracies": 0.875, "rewards/chosen": -15.611233711242676, "rewards/margins": 5.8789777755737305, "rewards/rejected": -21.490211486816406, "step": 435 }, { "epoch": 0.9405477621910487, "grad_norm": 89.37849635838704, "learning_rate": 1.016230078838226e-08, "logits/chosen": -1.2062740325927734, "logits/rejected": -1.148478627204895, "logps/chosen": -1.6622101068496704, "logps/rejected": -2.1926727294921875, "loss": 1.253, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.622098922729492, "rewards/margins": 5.304628849029541, "rewards/rejected": -21.926727294921875, "step": 440 }, { "epoch": 0.9512358049432198, "grad_norm": 78.38263983661439, "learning_rate": 6.754703038239329e-09, "logits/chosen": -1.1661369800567627, "logits/rejected": -1.1493126153945923, "logps/chosen": -1.6586837768554688, "logps/rejected": -2.293992280960083, "loss": 1.0815, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -16.58683967590332, "rewards/margins": 6.353082180023193, "rewards/rejected": -22.939918518066406, "step": 445 }, { "epoch": 0.9619238476953907, "grad_norm": 85.39108439896182, "learning_rate": 4.036953436716895e-09, "logits/chosen": -1.2724522352218628, "logits/rejected": -1.2523143291473389, "logps/chosen": -1.607690453529358, "logps/rejected": -2.168273448944092, "loss": 1.3282, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -16.076906204223633, "rewards/margins": 5.605828285217285, "rewards/rejected": -21.682735443115234, "step": 450 }, { "epoch": 0.9726118904475618, "grad_norm": 102.09796631455698, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -1.2248659133911133, "logits/rejected": -1.1913433074951172, "logps/chosen": -1.6112314462661743, "logps/rejected": -2.2650115489959717, "loss": 1.0445, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -16.112314224243164, "rewards/margins": 6.537802696228027, "rewards/rejected": -22.650117874145508, "step": 455 }, { "epoch": 0.9832999331997327, "grad_norm": 90.8560495778911, "learning_rate": 6.852326227130833e-10, "logits/chosen": -1.2340514659881592, "logits/rejected": -1.2225282192230225, "logps/chosen": -1.6787292957305908, "logps/rejected": -2.2820496559143066, "loss": 1.1894, "rewards/accuracies": 0.84375, "rewards/chosen": -16.787290573120117, "rewards/margins": 6.033202648162842, "rewards/rejected": -22.82049560546875, "step": 460 }, { "epoch": 0.9939879759519038, "grad_norm": 86.47798441930765, "learning_rate": 5.594909486328348e-11, "logits/chosen": -1.2089763879776, "logits/rejected": -1.2146103382110596, "logps/chosen": -1.703181266784668, "logps/rejected": -2.318962335586548, "loss": 1.3219, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.03181266784668, "rewards/margins": 6.157810688018799, "rewards/rejected": -23.189624786376953, "step": 465 }, { "epoch": 0.9982631930527722, "step": 467, "total_flos": 0.0, "train_loss": 1.8032665589636858, "train_runtime": 11474.0462, "train_samples_per_second": 5.218, "train_steps_per_second": 0.041 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }