{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 805, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 6.172839506172839e-09, "logits/chosen": -2.8421168327331543, "logits/rejected": -2.6747336387634277, "logps/chosen": -92.33953094482422, "logps/rejected": -44.262760162353516, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 6.172839506172839e-08, "logits/chosen": -2.9818191528320312, "logits/rejected": -2.9740567207336426, "logps/chosen": -197.3586883544922, "logps/rejected": -149.28749084472656, "loss": 0.6901, "rewards/accuracies": 0.3611111044883728, "rewards/chosen": -0.005469343159347773, "rewards/margins": 0.00850688572973013, "rewards/rejected": -0.01397622935473919, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.2345679012345677e-07, "logits/chosen": -2.9317967891693115, "logits/rejected": -2.8763492107391357, "logps/chosen": -172.7858428955078, "logps/rejected": -133.58245849609375, "loss": 0.6353, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1074294000864029, "rewards/margins": 0.16630074381828308, "rewards/rejected": -0.05887135863304138, "step": 20 }, { "epoch": 0.04, "learning_rate": 1.8518518518518516e-07, "logits/chosen": -3.0448741912841797, "logits/rejected": -2.999143362045288, "logps/chosen": -179.38113403320312, "logps/rejected": -146.7749786376953, "loss": 0.5047, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.08539465069770813, "rewards/margins": 0.540686845779419, "rewards/rejected": -0.4552922248840332, "step": 30 }, { "epoch": 0.05, "learning_rate": 2.4691358024691354e-07, "logits/chosen": -2.9631247520446777, "logits/rejected": -2.9552507400512695, "logps/chosen": -174.05288696289062, "logps/rejected": -143.798583984375, "loss": 0.3928, "rewards/accuracies": 0.875, "rewards/chosen": 0.11930576711893082, "rewards/margins": 0.9224799871444702, "rewards/rejected": -0.8031741976737976, "step": 40 }, { "epoch": 0.06, "learning_rate": 3.086419753086419e-07, "logits/chosen": -2.955972194671631, "logits/rejected": -2.880552053451538, "logps/chosen": -183.1387481689453, "logps/rejected": -157.20669555664062, "loss": 0.2956, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8193867802619934, "rewards/margins": 1.485733985900879, "rewards/rejected": -2.3051209449768066, "step": 50 }, { "epoch": 0.07, "learning_rate": 3.703703703703703e-07, "logits/chosen": -2.960036516189575, "logits/rejected": -2.8513035774230957, "logps/chosen": -154.39926147460938, "logps/rejected": -133.40078735351562, "loss": 0.2919, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9133367538452148, "rewards/margins": 1.8072048425674438, "rewards/rejected": -2.720541477203369, "step": 60 }, { "epoch": 0.09, "learning_rate": 4.320987654320987e-07, "logits/chosen": -2.8771309852600098, "logits/rejected": -2.797616481781006, "logps/chosen": -165.77328491210938, "logps/rejected": -142.69815063476562, "loss": 0.2689, "rewards/accuracies": 0.875, "rewards/chosen": -0.7793342471122742, "rewards/margins": 2.3448145389556885, "rewards/rejected": -3.1241488456726074, "step": 70 }, { "epoch": 0.1, "learning_rate": 4.938271604938271e-07, "logits/chosen": -2.7670953273773193, "logits/rejected": -2.723829984664917, "logps/chosen": -185.12596130371094, "logps/rejected": -174.61465454101562, "loss": 0.2633, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7798545360565186, "rewards/margins": 2.6085076332092285, "rewards/rejected": -4.388362884521484, "step": 80 }, { "epoch": 0.11, "learning_rate": 4.937845303867402e-07, "logits/chosen": -2.933378219604492, "logits/rejected": -2.9017395973205566, "logps/chosen": -183.1439208984375, "logps/rejected": -171.66964721679688, "loss": 0.2236, "rewards/accuracies": 0.875, "rewards/chosen": -1.4862909317016602, "rewards/margins": 2.482652187347412, "rewards/rejected": -3.968942642211914, "step": 90 }, { "epoch": 0.12, "learning_rate": 4.868784530386741e-07, "logits/chosen": -2.7947914600372314, "logits/rejected": -2.823068618774414, "logps/chosen": -189.07904052734375, "logps/rejected": -181.06781005859375, "loss": 0.2324, "rewards/accuracies": 0.875, "rewards/chosen": -0.5773736238479614, "rewards/margins": 3.1609904766082764, "rewards/rejected": -3.738363742828369, "step": 100 }, { "epoch": 0.14, "learning_rate": 4.799723756906077e-07, "logits/chosen": -2.972888946533203, "logits/rejected": -2.856055498123169, "logps/chosen": -194.32847595214844, "logps/rejected": -180.61575317382812, "loss": 0.2595, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5956058502197266, "rewards/margins": 2.7098982334136963, "rewards/rejected": -4.305504322052002, "step": 110 }, { "epoch": 0.15, "learning_rate": 4.730662983425414e-07, "logits/chosen": -2.8165183067321777, "logits/rejected": -2.7924342155456543, "logps/chosen": -171.48434448242188, "logps/rejected": -174.36788940429688, "loss": 0.2177, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.069062352180481, "rewards/margins": 3.2766518592834473, "rewards/rejected": -4.345714092254639, "step": 120 }, { "epoch": 0.16, "learning_rate": 4.661602209944751e-07, "logits/chosen": -2.941241502761841, "logits/rejected": -2.8772006034851074, "logps/chosen": -206.2366180419922, "logps/rejected": -196.18931579589844, "loss": 0.1857, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.6614869832992554, "rewards/margins": 3.376704454421997, "rewards/rejected": -5.038191318511963, "step": 130 }, { "epoch": 0.17, "learning_rate": 4.592541436464088e-07, "logits/chosen": -2.7895777225494385, "logits/rejected": -2.7228431701660156, "logps/chosen": -185.4031524658203, "logps/rejected": -193.98040771484375, "loss": 0.174, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8083890080451965, "rewards/margins": 4.341578960418701, "rewards/rejected": -5.149968147277832, "step": 140 }, { "epoch": 0.19, "learning_rate": 4.5234806629834255e-07, "logits/chosen": -2.73811674118042, "logits/rejected": -2.7120604515075684, "logps/chosen": -214.88565063476562, "logps/rejected": -221.6265411376953, "loss": 0.185, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.5129263401031494, "rewards/margins": 3.6045918464660645, "rewards/rejected": -6.117517948150635, "step": 150 }, { "epoch": 0.2, "learning_rate": 4.454419889502762e-07, "logits/chosen": -2.9045376777648926, "logits/rejected": -2.8451740741729736, "logps/chosen": -202.41903686523438, "logps/rejected": -205.6617889404297, "loss": 0.208, "rewards/accuracies": 0.75, "rewards/chosen": -2.077017307281494, "rewards/margins": 4.358880996704102, "rewards/rejected": -6.4358978271484375, "step": 160 }, { "epoch": 0.21, "learning_rate": 4.3853591160220993e-07, "logits/chosen": -2.8927552700042725, "logits/rejected": -2.7776336669921875, "logps/chosen": -203.76724243164062, "logps/rejected": -200.79864501953125, "loss": 0.1703, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2595643997192383, "rewards/margins": 3.9248855113983154, "rewards/rejected": -6.184449672698975, "step": 170 }, { "epoch": 0.22, "learning_rate": 4.3162983425414365e-07, "logits/chosen": -2.7550511360168457, "logits/rejected": -2.7225847244262695, "logps/chosen": -225.6936798095703, "logps/rejected": -231.35400390625, "loss": 0.1733, "rewards/accuracies": 0.75, "rewards/chosen": -3.379578113555908, "rewards/margins": 3.530996322631836, "rewards/rejected": -6.910574436187744, "step": 180 }, { "epoch": 0.24, "learning_rate": 4.247237569060773e-07, "logits/chosen": -2.8363826274871826, "logits/rejected": -2.7442939281463623, "logps/chosen": -212.888427734375, "logps/rejected": -198.71542358398438, "loss": 0.218, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.968935251235962, "rewards/margins": 3.221897602081299, "rewards/rejected": -6.19083309173584, "step": 190 }, { "epoch": 0.25, "learning_rate": 4.1781767955801103e-07, "logits/chosen": -2.8280460834503174, "logits/rejected": -2.7499425411224365, "logps/chosen": -182.047607421875, "logps/rejected": -188.87193298339844, "loss": 0.1708, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.369425058364868, "rewards/margins": 4.022088050842285, "rewards/rejected": -6.391513824462891, "step": 200 }, { "epoch": 0.26, "learning_rate": 4.1091160220994475e-07, "logits/chosen": -2.7316782474517822, "logits/rejected": -2.6802334785461426, "logps/chosen": -212.96212768554688, "logps/rejected": -203.3765106201172, "loss": 0.1689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.15626859664917, "rewards/margins": 3.610233783721924, "rewards/rejected": -6.766502380371094, "step": 210 }, { "epoch": 0.27, "learning_rate": 4.0400552486187846e-07, "logits/chosen": -2.8573451042175293, "logits/rejected": -2.751213550567627, "logps/chosen": -225.369140625, "logps/rejected": -213.1463623046875, "loss": 0.2091, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.183094024658203, "rewards/margins": 3.5506489276885986, "rewards/rejected": -6.733743190765381, "step": 220 }, { "epoch": 0.29, "learning_rate": 3.970994475138121e-07, "logits/chosen": -2.7918601036071777, "logits/rejected": -2.731823444366455, "logps/chosen": -194.12893676757812, "logps/rejected": -183.81556701660156, "loss": 0.1979, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6381051540374756, "rewards/margins": 2.9515299797058105, "rewards/rejected": -5.589634895324707, "step": 230 }, { "epoch": 0.3, "learning_rate": 3.9019337016574584e-07, "logits/chosen": -2.699711561203003, "logits/rejected": -2.6905601024627686, "logps/chosen": -212.59793090820312, "logps/rejected": -232.6545867919922, "loss": 0.1849, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5139076709747314, "rewards/margins": 4.449800491333008, "rewards/rejected": -7.96370792388916, "step": 240 }, { "epoch": 0.31, "learning_rate": 3.832872928176795e-07, "logits/chosen": -2.721381425857544, "logits/rejected": -2.7046854496002197, "logps/chosen": -200.67886352539062, "logps/rejected": -220.4629669189453, "loss": 0.1447, "rewards/accuracies": 0.875, "rewards/chosen": -3.1384024620056152, "rewards/margins": 4.899154186248779, "rewards/rejected": -8.037556648254395, "step": 250 }, { "epoch": 0.32, "learning_rate": 3.763812154696133e-07, "logits/chosen": -2.8270747661590576, "logits/rejected": -2.741473913192749, "logps/chosen": -208.8101043701172, "logps/rejected": -218.16537475585938, "loss": 0.1567, "rewards/accuracies": 0.875, "rewards/chosen": -3.2333245277404785, "rewards/margins": 4.820733547210693, "rewards/rejected": -8.054059028625488, "step": 260 }, { "epoch": 0.34, "learning_rate": 3.6947513812154694e-07, "logits/chosen": -2.8693325519561768, "logits/rejected": -2.7638630867004395, "logps/chosen": -211.0121612548828, "logps/rejected": -217.35910034179688, "loss": 0.1731, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.540541410446167, "rewards/margins": 4.472687721252441, "rewards/rejected": -8.013228416442871, "step": 270 }, { "epoch": 0.35, "learning_rate": 3.6256906077348066e-07, "logits/chosen": -2.7264175415039062, "logits/rejected": -2.7106542587280273, "logps/chosen": -221.5767822265625, "logps/rejected": -234.4220733642578, "loss": 0.2261, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.8089592456817627, "rewards/margins": 4.285206317901611, "rewards/rejected": -8.094165802001953, "step": 280 }, { "epoch": 0.36, "learning_rate": 3.556629834254143e-07, "logits/chosen": -2.709073305130005, "logits/rejected": -2.650305986404419, "logps/chosen": -183.47171020507812, "logps/rejected": -189.16053771972656, "loss": 0.1949, "rewards/accuracies": 0.75, "rewards/chosen": -3.39518666267395, "rewards/margins": 3.255284070968628, "rewards/rejected": -6.6504716873168945, "step": 290 }, { "epoch": 0.37, "learning_rate": 3.4875690607734804e-07, "logits/chosen": -2.742584228515625, "logits/rejected": -2.634887456893921, "logps/chosen": -212.4958953857422, "logps/rejected": -214.77978515625, "loss": 0.1577, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.987830400466919, "rewards/margins": 4.540104389190674, "rewards/rejected": -7.527935028076172, "step": 300 }, { "epoch": 0.39, "learning_rate": 3.418508287292817e-07, "logits/chosen": -2.803213119506836, "logits/rejected": -2.705699920654297, "logps/chosen": -201.23739624023438, "logps/rejected": -210.5272979736328, "loss": 0.1819, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.2409796714782715, "rewards/margins": 4.629876136779785, "rewards/rejected": -6.870855808258057, "step": 310 }, { "epoch": 0.4, "learning_rate": 3.3494475138121547e-07, "logits/chosen": -2.821776866912842, "logits/rejected": -2.6838371753692627, "logps/chosen": -205.13998413085938, "logps/rejected": -210.2687225341797, "loss": 0.1284, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8670909404754639, "rewards/margins": 5.2015180587768555, "rewards/rejected": -7.068609714508057, "step": 320 }, { "epoch": 0.41, "learning_rate": 3.280386740331492e-07, "logits/chosen": -2.717777729034424, "logits/rejected": -2.6688733100891113, "logps/chosen": -181.1573028564453, "logps/rejected": -195.77450561523438, "loss": 0.2068, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.757702589035034, "rewards/margins": 4.233901023864746, "rewards/rejected": -6.991603851318359, "step": 330 }, { "epoch": 0.42, "learning_rate": 3.2113259668508285e-07, "logits/chosen": -2.834904193878174, "logits/rejected": -2.751018524169922, "logps/chosen": -236.5355224609375, "logps/rejected": -249.8627166748047, "loss": 0.1873, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.415268898010254, "rewards/margins": 4.573099136352539, "rewards/rejected": -8.988368034362793, "step": 340 }, { "epoch": 0.43, "learning_rate": 3.1422651933701657e-07, "logits/chosen": -2.702388048171997, "logits/rejected": -2.5783984661102295, "logps/chosen": -205.3250274658203, "logps/rejected": -226.51025390625, "loss": 0.1513, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.3364059925079346, "rewards/margins": 5.607662200927734, "rewards/rejected": -8.94406795501709, "step": 350 }, { "epoch": 0.45, "learning_rate": 3.0732044198895023e-07, "logits/chosen": -2.719581127166748, "logits/rejected": -2.6670548915863037, "logps/chosen": -215.56069946289062, "logps/rejected": -238.80691528320312, "loss": 0.1319, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.7043700218200684, "rewards/margins": 5.375087261199951, "rewards/rejected": -9.07945728302002, "step": 360 }, { "epoch": 0.46, "learning_rate": 3.00414364640884e-07, "logits/chosen": -2.687743902206421, "logits/rejected": -2.534635305404663, "logps/chosen": -197.8555145263672, "logps/rejected": -212.0418243408203, "loss": 0.1153, "rewards/accuracies": 0.875, "rewards/chosen": -2.731755495071411, "rewards/margins": 6.006522178649902, "rewards/rejected": -8.738277435302734, "step": 370 }, { "epoch": 0.47, "learning_rate": 2.9350828729281767e-07, "logits/chosen": -2.8031535148620605, "logits/rejected": -2.7039589881896973, "logps/chosen": -196.32485961914062, "logps/rejected": -220.0038604736328, "loss": 0.1482, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.087531805038452, "rewards/margins": 5.792913436889648, "rewards/rejected": -8.88044548034668, "step": 380 }, { "epoch": 0.48, "learning_rate": 2.866022099447514e-07, "logits/chosen": -2.762528657913208, "logits/rejected": -2.7139079570770264, "logps/chosen": -245.5567169189453, "logps/rejected": -268.78729248046875, "loss": 0.1335, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.6741371154785156, "rewards/margins": 6.558190822601318, "rewards/rejected": -10.232328414916992, "step": 390 }, { "epoch": 0.5, "learning_rate": 2.7969613259668505e-07, "logits/chosen": -2.8232762813568115, "logits/rejected": -2.6880855560302734, "logps/chosen": -210.9452362060547, "logps/rejected": -228.576416015625, "loss": 0.1482, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.587395191192627, "rewards/margins": 6.285602569580078, "rewards/rejected": -9.87299919128418, "step": 400 }, { "epoch": 0.51, "learning_rate": 2.7279005524861877e-07, "logits/chosen": -2.968181610107422, "logits/rejected": -2.783268690109253, "logps/chosen": -250.2484893798828, "logps/rejected": -248.7700653076172, "loss": 0.16, "rewards/accuracies": 0.875, "rewards/chosen": -4.302699565887451, "rewards/margins": 5.751161098480225, "rewards/rejected": -10.053861618041992, "step": 410 }, { "epoch": 0.52, "learning_rate": 2.6588397790055243e-07, "logits/chosen": -2.843967914581299, "logits/rejected": -2.7779390811920166, "logps/chosen": -220.41262817382812, "logps/rejected": -247.0951690673828, "loss": 0.131, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.6894760131835938, "rewards/margins": 5.2908501625061035, "rewards/rejected": -8.980325698852539, "step": 420 }, { "epoch": 0.53, "learning_rate": 2.589779005524862e-07, "logits/chosen": -2.895799398422241, "logits/rejected": -2.795949935913086, "logps/chosen": -243.14785766601562, "logps/rejected": -262.537109375, "loss": 0.1558, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.353732109069824, "rewards/margins": 6.2553300857543945, "rewards/rejected": -10.609061241149902, "step": 430 }, { "epoch": 0.55, "learning_rate": 2.5207182320441986e-07, "logits/chosen": -2.8478636741638184, "logits/rejected": -2.782703399658203, "logps/chosen": -251.97689819335938, "logps/rejected": -279.1159362792969, "loss": 0.1489, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.981522083282471, "rewards/margins": 6.379393100738525, "rewards/rejected": -11.360913276672363, "step": 440 }, { "epoch": 0.56, "learning_rate": 2.451657458563536e-07, "logits/chosen": -2.9137637615203857, "logits/rejected": -2.846818447113037, "logps/chosen": -231.48171997070312, "logps/rejected": -252.91494750976562, "loss": 0.1211, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.0656819343566895, "rewards/margins": 6.312979221343994, "rewards/rejected": -10.378661155700684, "step": 450 }, { "epoch": 0.57, "learning_rate": 2.3825966850828727e-07, "logits/chosen": -2.820435047149658, "logits/rejected": -2.7222158908843994, "logps/chosen": -176.95230102539062, "logps/rejected": -202.005859375, "loss": 0.1487, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8066993951797485, "rewards/margins": 6.806242942810059, "rewards/rejected": -8.612942695617676, "step": 460 }, { "epoch": 0.58, "learning_rate": 2.31353591160221e-07, "logits/chosen": -2.861485004425049, "logits/rejected": -2.7676100730895996, "logps/chosen": -223.93276977539062, "logps/rejected": -244.85665893554688, "loss": 0.1293, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.4348251819610596, "rewards/margins": 6.165853023529053, "rewards/rejected": -9.600679397583008, "step": 470 }, { "epoch": 0.6, "learning_rate": 2.2444751381215468e-07, "logits/chosen": -2.933964967727661, "logits/rejected": -2.788942337036133, "logps/chosen": -226.66796875, "logps/rejected": -244.5904998779297, "loss": 0.1224, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.5399329662323, "rewards/margins": 5.981089115142822, "rewards/rejected": -9.521021842956543, "step": 480 }, { "epoch": 0.61, "learning_rate": 2.175414364640884e-07, "logits/chosen": -2.798732280731201, "logits/rejected": -2.7525930404663086, "logps/chosen": -224.1324462890625, "logps/rejected": -260.4334411621094, "loss": 0.1676, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.89032244682312, "rewards/margins": 7.569252967834473, "rewards/rejected": -11.459574699401855, "step": 490 }, { "epoch": 0.62, "learning_rate": 2.1063535911602208e-07, "logits/chosen": -2.837965726852417, "logits/rejected": -2.7505483627319336, "logps/chosen": -236.68588256835938, "logps/rejected": -258.0523681640625, "loss": 0.1537, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -5.027956008911133, "rewards/margins": 6.000524997711182, "rewards/rejected": -11.028480529785156, "step": 500 }, { "epoch": 0.62, "eval_logits/chosen": -2.780914545059204, "eval_logits/rejected": -2.70768666267395, "eval_logps/chosen": -219.7342071533203, "eval_logps/rejected": -246.59507751464844, "eval_loss": 0.1479674130678177, "eval_rewards/accuracies": 0.8563829660415649, "eval_rewards/chosen": -3.757824182510376, "eval_rewards/margins": 6.220169544219971, "eval_rewards/rejected": -9.97799301147461, "eval_runtime": 240.8343, "eval_samples_per_second": 6.208, "eval_steps_per_second": 0.195, "step": 500 }, { "epoch": 0.63, "learning_rate": 2.0372928176795578e-07, "logits/chosen": -2.8163836002349854, "logits/rejected": -2.7235684394836426, "logps/chosen": -206.2197265625, "logps/rejected": -240.59561157226562, "loss": 0.1149, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.1149652004241943, "rewards/margins": 7.466650485992432, "rewards/rejected": -10.58161449432373, "step": 510 }, { "epoch": 0.65, "learning_rate": 1.968232044198895e-07, "logits/chosen": -2.895784854888916, "logits/rejected": -2.8176522254943848, "logps/chosen": -211.858642578125, "logps/rejected": -227.81741333007812, "loss": 0.1775, "rewards/accuracies": 0.875, "rewards/chosen": -4.783520698547363, "rewards/margins": 4.954631805419922, "rewards/rejected": -9.738151550292969, "step": 520 }, { "epoch": 0.66, "learning_rate": 1.899171270718232e-07, "logits/chosen": -2.9654347896575928, "logits/rejected": -2.8510003089904785, "logps/chosen": -215.161376953125, "logps/rejected": -225.93490600585938, "loss": 0.1701, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.928018093109131, "rewards/margins": 5.275222301483154, "rewards/rejected": -9.203241348266602, "step": 530 }, { "epoch": 0.67, "learning_rate": 1.830110497237569e-07, "logits/chosen": -2.834559202194214, "logits/rejected": -2.7768056392669678, "logps/chosen": -248.0672149658203, "logps/rejected": -277.980224609375, "loss": 0.1456, "rewards/accuracies": 0.875, "rewards/chosen": -4.741249084472656, "rewards/margins": 5.934549331665039, "rewards/rejected": -10.675798416137695, "step": 540 }, { "epoch": 0.68, "learning_rate": 1.7610497237569062e-07, "logits/chosen": -2.878166913986206, "logits/rejected": -2.8250679969787598, "logps/chosen": -214.7487030029297, "logps/rejected": -249.8466339111328, "loss": 0.1641, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.848902940750122, "rewards/margins": 6.1274847984313965, "rewards/rejected": -9.976387023925781, "step": 550 }, { "epoch": 0.7, "learning_rate": 1.691988950276243e-07, "logits/chosen": -2.9045345783233643, "logits/rejected": -2.85447096824646, "logps/chosen": -237.44955444335938, "logps/rejected": -270.6697998046875, "loss": 0.1439, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.506590843200684, "rewards/margins": 6.747040748596191, "rewards/rejected": -11.253631591796875, "step": 560 }, { "epoch": 0.71, "learning_rate": 1.62292817679558e-07, "logits/chosen": -2.8267338275909424, "logits/rejected": -2.673027515411377, "logps/chosen": -219.37026977539062, "logps/rejected": -235.9922332763672, "loss": 0.1133, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.315529823303223, "rewards/margins": 6.053648471832275, "rewards/rejected": -10.369178771972656, "step": 570 }, { "epoch": 0.72, "learning_rate": 1.5538674033149171e-07, "logits/chosen": -2.883451223373413, "logits/rejected": -2.7439939975738525, "logps/chosen": -228.4202117919922, "logps/rejected": -242.2844696044922, "loss": 0.1627, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.730469226837158, "rewards/margins": 6.399707317352295, "rewards/rejected": -11.130178451538086, "step": 580 }, { "epoch": 0.73, "learning_rate": 1.484806629834254e-07, "logits/chosen": -2.9942538738250732, "logits/rejected": -2.868765115737915, "logps/chosen": -260.10406494140625, "logps/rejected": -253.8542938232422, "loss": 0.1594, "rewards/accuracies": 0.875, "rewards/chosen": -5.60601282119751, "rewards/margins": 4.876471519470215, "rewards/rejected": -10.482483863830566, "step": 590 }, { "epoch": 0.75, "learning_rate": 1.4157458563535912e-07, "logits/chosen": -2.9481163024902344, "logits/rejected": -2.8333523273468018, "logps/chosen": -226.2001953125, "logps/rejected": -235.05014038085938, "loss": 0.1784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.186473846435547, "rewards/margins": 4.799227714538574, "rewards/rejected": -9.985700607299805, "step": 600 }, { "epoch": 0.76, "learning_rate": 1.346685082872928e-07, "logits/chosen": -2.820510149002075, "logits/rejected": -2.734377384185791, "logps/chosen": -261.7950134277344, "logps/rejected": -286.0494689941406, "loss": 0.104, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.037507057189941, "rewards/margins": 6.992823600769043, "rewards/rejected": -12.030329704284668, "step": 610 }, { "epoch": 0.77, "learning_rate": 1.277624309392265e-07, "logits/chosen": -2.909327507019043, "logits/rejected": -2.750497817993164, "logps/chosen": -224.2764129638672, "logps/rejected": -231.7129364013672, "loss": 0.1504, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.132498741149902, "rewards/margins": 5.400217056274414, "rewards/rejected": -10.532715797424316, "step": 620 }, { "epoch": 0.78, "learning_rate": 1.2085635359116022e-07, "logits/chosen": -2.9544167518615723, "logits/rejected": -2.8220763206481934, "logps/chosen": -245.1588134765625, "logps/rejected": -261.6687927246094, "loss": 0.1255, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.218487739562988, "rewards/margins": 6.255653381347656, "rewards/rejected": -11.474142074584961, "step": 630 }, { "epoch": 0.8, "learning_rate": 1.1395027624309392e-07, "logits/chosen": -2.8086042404174805, "logits/rejected": -2.785409927368164, "logps/chosen": -253.0203857421875, "logps/rejected": -281.2375793457031, "loss": 0.16, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.011744976043701, "rewards/margins": 5.636991024017334, "rewards/rejected": -11.648736000061035, "step": 640 }, { "epoch": 0.81, "learning_rate": 1.0704419889502763e-07, "logits/chosen": -2.9435603618621826, "logits/rejected": -2.7380692958831787, "logps/chosen": -261.493896484375, "logps/rejected": -278.2936096191406, "loss": 0.1174, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.648413181304932, "rewards/margins": 7.05194616317749, "rewards/rejected": -11.700358390808105, "step": 650 }, { "epoch": 0.82, "learning_rate": 1.0013812154696132e-07, "logits/chosen": -2.9581801891326904, "logits/rejected": -2.8110265731811523, "logps/chosen": -244.45413208007812, "logps/rejected": -264.9123840332031, "loss": 0.1429, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.337543487548828, "rewards/margins": 6.183696746826172, "rewards/rejected": -11.521239280700684, "step": 660 }, { "epoch": 0.83, "learning_rate": 9.323204419889502e-08, "logits/chosen": -2.905041217803955, "logits/rejected": -2.7901289463043213, "logps/chosen": -200.6727752685547, "logps/rejected": -228.93032836914062, "loss": 0.1408, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.679917812347412, "rewards/margins": 6.9505295753479, "rewards/rejected": -9.630447387695312, "step": 670 }, { "epoch": 0.84, "learning_rate": 8.632596685082872e-08, "logits/chosen": -2.856194257736206, "logits/rejected": -2.761082410812378, "logps/chosen": -218.3007049560547, "logps/rejected": -234.4198455810547, "loss": 0.1398, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.6605896949768066, "rewards/margins": 6.374427795410156, "rewards/rejected": -10.035017013549805, "step": 680 }, { "epoch": 0.86, "learning_rate": 7.941988950276243e-08, "logits/chosen": -2.896955966949463, "logits/rejected": -2.786186933517456, "logps/chosen": -197.8193817138672, "logps/rejected": -214.23001098632812, "loss": 0.1442, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.441520690917969, "rewards/margins": 5.363109588623047, "rewards/rejected": -9.804631233215332, "step": 690 }, { "epoch": 0.87, "learning_rate": 7.251381215469612e-08, "logits/chosen": -2.957908868789673, "logits/rejected": -2.8377201557159424, "logps/chosen": -270.6792907714844, "logps/rejected": -286.62213134765625, "loss": 0.1486, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.179241180419922, "rewards/margins": 6.042364597320557, "rewards/rejected": -12.22160530090332, "step": 700 }, { "epoch": 0.88, "learning_rate": 6.560773480662984e-08, "logits/chosen": -2.774834156036377, "logits/rejected": -2.703329563140869, "logps/chosen": -202.3549041748047, "logps/rejected": -244.16653442382812, "loss": 0.1335, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.156593322753906, "rewards/margins": 6.949100494384766, "rewards/rejected": -11.105693817138672, "step": 710 }, { "epoch": 0.89, "learning_rate": 5.870165745856354e-08, "logits/chosen": -2.8945531845092773, "logits/rejected": -2.7444348335266113, "logps/chosen": -190.70558166503906, "logps/rejected": -232.6694793701172, "loss": 0.1084, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.4744460582733154, "rewards/margins": 8.159748077392578, "rewards/rejected": -10.634195327758789, "step": 720 }, { "epoch": 0.91, "learning_rate": 5.1795580110497236e-08, "logits/chosen": -2.9436967372894287, "logits/rejected": -2.8356709480285645, "logps/chosen": -236.69949340820312, "logps/rejected": -253.46145629882812, "loss": 0.1855, "rewards/accuracies": 0.875, "rewards/chosen": -4.680401802062988, "rewards/margins": 6.0413007736206055, "rewards/rejected": -10.72170352935791, "step": 730 }, { "epoch": 0.92, "learning_rate": 4.488950276243094e-08, "logits/chosen": -2.7878644466400146, "logits/rejected": -2.745734930038452, "logps/chosen": -197.5469970703125, "logps/rejected": -246.12252807617188, "loss": 0.1106, "rewards/accuracies": 0.875, "rewards/chosen": -3.669739246368408, "rewards/margins": 6.831077575683594, "rewards/rejected": -10.500818252563477, "step": 740 }, { "epoch": 0.93, "learning_rate": 3.7983425414364637e-08, "logits/chosen": -2.918865203857422, "logits/rejected": -2.7714521884918213, "logps/chosen": -224.0564727783203, "logps/rejected": -241.874267578125, "loss": 0.1357, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.486175537109375, "rewards/margins": 6.323982238769531, "rewards/rejected": -10.810157775878906, "step": 750 }, { "epoch": 0.94, "learning_rate": 3.107734806629834e-08, "logits/chosen": -2.789961338043213, "logits/rejected": -2.702083110809326, "logps/chosen": -197.3814697265625, "logps/rejected": -234.6428680419922, "loss": 0.1653, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.8847968578338623, "rewards/margins": 6.971263885498047, "rewards/rejected": -10.856060981750488, "step": 760 }, { "epoch": 0.96, "learning_rate": 2.4171270718232044e-08, "logits/chosen": -2.886162042617798, "logits/rejected": -2.782167673110962, "logps/chosen": -251.7931365966797, "logps/rejected": -276.71856689453125, "loss": 0.1811, "rewards/accuracies": 0.75, "rewards/chosen": -5.165857791900635, "rewards/margins": 6.922626495361328, "rewards/rejected": -12.088483810424805, "step": 770 }, { "epoch": 0.97, "learning_rate": 1.7265193370165747e-08, "logits/chosen": -2.998176336288452, "logits/rejected": -2.831453800201416, "logps/chosen": -238.40634155273438, "logps/rejected": -264.1795349121094, "loss": 0.1153, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.043548345565796, "rewards/margins": 8.052096366882324, "rewards/rejected": -11.0956449508667, "step": 780 }, { "epoch": 0.98, "learning_rate": 1.0359116022099446e-08, "logits/chosen": -2.931680202484131, "logits/rejected": -2.758117198944092, "logps/chosen": -215.9856719970703, "logps/rejected": -238.5784912109375, "loss": 0.133, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.6980485916137695, "rewards/margins": 6.217514991760254, "rewards/rejected": -10.915563583374023, "step": 790 }, { "epoch": 0.99, "learning_rate": 3.453038674033149e-09, "logits/chosen": -2.8354744911193848, "logits/rejected": -2.7321650981903076, "logps/chosen": -240.4857940673828, "logps/rejected": -271.0428771972656, "loss": 0.1495, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.8582940101623535, "rewards/margins": 7.768431186676025, "rewards/rejected": -11.626726150512695, "step": 800 }, { "epoch": 1.0, "step": 805, "total_flos": 0.0, "train_loss": 0.18541636852003773, "train_runtime": 4261.3779, "train_samples_per_second": 3.022, "train_steps_per_second": 0.189 } ], "logging_steps": 10, "max_steps": 805, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }