{ "best_metric": 0.15150243043899536, "best_model_checkpoint": "saves/Llama-3.1-8B-Instruct/lora/saa-900/checkpoint-200", "epoch": 9.876543209876543, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.19753086419753085, "grad_norm": 5.683405876159668, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -0.4294208884239197, "logits/rejected": -0.5036857724189758, "logps/chosen": -1.7164901494979858, "logps/rejected": -2.1387689113616943, "loss": 1.77, "odds_ratio_loss": 15.52961540222168, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.1716490387916565, "rewards/margins": 0.0422278493642807, "rewards/rejected": -0.2138768881559372, "sft_loss": 0.21705082058906555, "step": 10 }, { "epoch": 0.3950617283950617, "grad_norm": 7.8601579666137695, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -0.41924962401390076, "logits/rejected": -0.4839371144771576, "logps/chosen": -1.7584301233291626, "logps/rejected": -2.1139349937438965, "loss": 1.8145, "odds_ratio_loss": 15.842885971069336, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.1758430302143097, "rewards/margins": 0.03555048629641533, "rewards/rejected": -0.21139350533485413, "sft_loss": 0.23018595576286316, "step": 20 }, { "epoch": 0.5925925925925926, "grad_norm": 8.10688591003418, "learning_rate": 3e-06, "logits/chosen": -0.4261442720890045, "logits/rejected": -0.5029150247573853, "logps/chosen": -1.730046033859253, "logps/rejected": -2.217639684677124, "loss": 1.781, "odds_ratio_loss": 15.661776542663574, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.173004612326622, "rewards/margins": 0.048759374767541885, "rewards/rejected": -0.2217639982700348, "sft_loss": 0.21483230590820312, "step": 30 }, { "epoch": 0.7901234567901234, "grad_norm": 6.699493885040283, "learning_rate": 4.000000000000001e-06, "logits/chosen": -0.41258639097213745, "logits/rejected": -0.4953138828277588, "logps/chosen": -1.566969633102417, "logps/rejected": -2.103066921234131, "loss": 1.6158, "odds_ratio_loss": 14.229395866394043, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.15669694542884827, "rewards/margins": 0.053609758615493774, "rewards/rejected": -0.21030668914318085, "sft_loss": 0.19286662340164185, "step": 40 }, { "epoch": 0.9876543209876543, "grad_norm": 9.212828636169434, "learning_rate": 5e-06, "logits/chosen": -0.4587825834751129, "logits/rejected": -0.5460661053657532, "logps/chosen": -1.524173378944397, "logps/rejected": -1.9827760457992554, "loss": 1.5773, "odds_ratio_loss": 13.699198722839355, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.15241733193397522, "rewards/margins": 0.04586028307676315, "rewards/rejected": -0.19827762246131897, "sft_loss": 0.20735812187194824, "step": 50 }, { "epoch": 0.9876543209876543, "eval_logits/chosen": -0.4034404754638672, "eval_logits/rejected": -0.46630287170410156, "eval_logps/chosen": -1.3146759271621704, "eval_logps/rejected": -1.754432201385498, "eval_loss": 1.3696272373199463, "eval_odds_ratio_loss": 11.865678787231445, "eval_rewards/accuracies": 0.7666666507720947, "eval_rewards/chosen": -0.13146759569644928, "eval_rewards/margins": 0.04397563263773918, "eval_rewards/rejected": -0.17544323205947876, "eval_runtime": 3.7458, "eval_samples_per_second": 24.027, "eval_sft_loss": 0.18305927515029907, "eval_steps_per_second": 12.013, "step": 50 }, { "epoch": 1.1851851851851851, "grad_norm": 4.2851243019104, "learning_rate": 4.993910125649561e-06, "logits/chosen": -0.42754077911376953, "logits/rejected": -0.49272727966308594, "logps/chosen": -1.14907968044281, "logps/rejected": -1.5600894689559937, "loss": 1.2014, "odds_ratio_loss": 10.684017181396484, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.11490797996520996, "rewards/margins": 0.041100967675447464, "rewards/rejected": -0.15600894391536713, "sft_loss": 0.13294857740402222, "step": 60 }, { "epoch": 1.382716049382716, "grad_norm": 2.6753902435302734, "learning_rate": 4.975670171853926e-06, "logits/chosen": -0.42295628786087036, "logits/rejected": -0.5050143003463745, "logps/chosen": -0.8449110984802246, "logps/rejected": -1.3062922954559326, "loss": 0.8939, "odds_ratio_loss": 7.972494602203369, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.08449110388755798, "rewards/margins": 0.04613811895251274, "rewards/rejected": -0.13062922656536102, "sft_loss": 0.09666236490011215, "step": 70 }, { "epoch": 1.5802469135802468, "grad_norm": 2.68009090423584, "learning_rate": 4.9453690018345144e-06, "logits/chosen": -0.41366061568260193, "logits/rejected": -0.5027936100959778, "logps/chosen": -0.5736119151115417, "logps/rejected": -1.0685169696807861, "loss": 0.6186, "odds_ratio_loss": 5.562970161437988, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.057361193001270294, "rewards/margins": 0.04949050396680832, "rewards/rejected": -0.10685168206691742, "sft_loss": 0.06228654831647873, "step": 80 }, { "epoch": 1.7777777777777777, "grad_norm": 2.270496129989624, "learning_rate": 4.903154239845798e-06, "logits/chosen": -0.4178565442562103, "logits/rejected": -0.48920226097106934, "logps/chosen": -0.3355949819087982, "logps/rejected": -0.902151882648468, "loss": 0.3717, "odds_ratio_loss": 3.370701551437378, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.03355949744582176, "rewards/margins": 0.05665569379925728, "rewards/rejected": -0.09021519124507904, "sft_loss": 0.03466098755598068, "step": 90 }, { "epoch": 1.9753086419753085, "grad_norm": 2.1452736854553223, "learning_rate": 4.849231551964771e-06, "logits/chosen": -0.39813241362571716, "logits/rejected": -0.4744625985622406, "logps/chosen": -0.22052116692066193, "logps/rejected": -0.8172726631164551, "loss": 0.2518, "odds_ratio_loss": 2.2916011810302734, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.02205212041735649, "rewards/margins": 0.05967513844370842, "rewards/rejected": -0.08172726631164551, "sft_loss": 0.022620398551225662, "step": 100 }, { "epoch": 1.9753086419753085, "eval_logits/chosen": -0.3780798316001892, "eval_logits/rejected": -0.448334664106369, "eval_logps/chosen": -0.18976576626300812, "eval_logps/rejected": -0.7320783734321594, "eval_loss": 0.23485496640205383, "eval_odds_ratio_loss": 2.1322858333587646, "eval_rewards/accuracies": 0.8111110925674438, "eval_rewards/chosen": -0.018976576626300812, "eval_rewards/margins": 0.05423126742243767, "eval_rewards/rejected": -0.07320784032344818, "eval_runtime": 3.7439, "eval_samples_per_second": 24.039, "eval_sft_loss": 0.02162640169262886, "eval_steps_per_second": 12.02, "step": 100 }, { "epoch": 2.1728395061728394, "grad_norm": 2.2345616817474365, "learning_rate": 4.783863644106502e-06, "logits/chosen": -0.43218794465065, "logits/rejected": -0.4891538619995117, "logps/chosen": -0.18616971373558044, "logps/rejected": -0.6744140386581421, "loss": 0.2306, "odds_ratio_loss": 2.0546927452087402, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.018616970628499985, "rewards/margins": 0.048824433237314224, "rewards/rejected": -0.06744140386581421, "sft_loss": 0.025166219100356102, "step": 110 }, { "epoch": 2.3703703703703702, "grad_norm": 1.977693796157837, "learning_rate": 4.707368982147318e-06, "logits/chosen": -0.4379637837409973, "logits/rejected": -0.5003925561904907, "logps/chosen": -0.11895088851451874, "logps/rejected": -0.6111856698989868, "loss": 0.1586, "odds_ratio_loss": 1.449549674987793, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.011895090341567993, "rewards/margins": 0.04922347515821457, "rewards/rejected": -0.06111856549978256, "sft_loss": 0.01368994265794754, "step": 120 }, { "epoch": 2.567901234567901, "grad_norm": 0.721815288066864, "learning_rate": 4.620120240391065e-06, "logits/chosen": -0.3538777828216553, "logits/rejected": -0.4279879927635193, "logps/chosen": -0.09486258774995804, "logps/rejected": -0.7161315679550171, "loss": 0.1321, "odds_ratio_loss": 1.2123935222625732, "rewards/accuracies": 0.84375, "rewards/chosen": -0.009486258029937744, "rewards/margins": 0.062126897275447845, "rewards/rejected": -0.07161315530538559, "sft_loss": 0.010894850827753544, "step": 130 }, { "epoch": 2.765432098765432, "grad_norm": 1.1547597646713257, "learning_rate": 4.522542485937369e-06, "logits/chosen": -0.38979417085647583, "logits/rejected": -0.45057815313339233, "logps/chosen": -0.0999944657087326, "logps/rejected": -0.7246894836425781, "loss": 0.1326, "odds_ratio_loss": 1.1918091773986816, "rewards/accuracies": 0.84375, "rewards/chosen": -0.009999445639550686, "rewards/margins": 0.06246950477361679, "rewards/rejected": -0.07246895134449005, "sft_loss": 0.013394972309470177, "step": 140 }, { "epoch": 2.962962962962963, "grad_norm": 1.2962913513183594, "learning_rate": 4.415111107797445e-06, "logits/chosen": -0.3816613554954529, "logits/rejected": -0.43519797921180725, "logps/chosen": -0.09587013721466064, "logps/rejected": -0.5622029304504395, "loss": 0.1304, "odds_ratio_loss": 1.182822823524475, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.009587015025317669, "rewards/margins": 0.0466332733631134, "rewards/rejected": -0.05622028559446335, "sft_loss": 0.012101124040782452, "step": 150 }, { "epoch": 2.962962962962963, "eval_logits/chosen": -0.345350980758667, "eval_logits/rejected": -0.4031663239002228, "eval_logps/chosen": -0.10941138863563538, "eval_logps/rejected": -0.6116734147071838, "eval_loss": 0.15299223363399506, "eval_odds_ratio_loss": 1.3987985849380493, "eval_rewards/accuracies": 0.8111110925674438, "eval_rewards/chosen": -0.010941138491034508, "eval_rewards/margins": 0.050226207822561264, "eval_rewards/rejected": -0.061167340725660324, "eval_runtime": 3.741, "eval_samples_per_second": 24.058, "eval_sft_loss": 0.013112363405525684, "eval_steps_per_second": 12.029, "step": 150 }, { "epoch": 3.1604938271604937, "grad_norm": 1.6140797138214111, "learning_rate": 4.2983495008466285e-06, "logits/chosen": -0.3475257158279419, "logits/rejected": -0.4172907769680023, "logps/chosen": -0.0872744545340538, "logps/rejected": -0.547534167766571, "loss": 0.1283, "odds_ratio_loss": 1.1653860807418823, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.008727445267140865, "rewards/margins": 0.046025972813367844, "rewards/rejected": -0.05475341156125069, "sft_loss": 0.011779388412833214, "step": 160 }, { "epoch": 3.3580246913580245, "grad_norm": 1.6337134838104248, "learning_rate": 4.172826515897146e-06, "logits/chosen": -0.3779260814189911, "logits/rejected": -0.44266828894615173, "logps/chosen": -0.08930256962776184, "logps/rejected": -0.6787833571434021, "loss": 0.1222, "odds_ratio_loss": 1.113948106765747, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.008930256590247154, "rewards/margins": 0.058948077261447906, "rewards/rejected": -0.06787833571434021, "sft_loss": 0.01077614352107048, "step": 170 }, { "epoch": 3.5555555555555554, "grad_norm": 1.329077959060669, "learning_rate": 4.039153688314146e-06, "logits/chosen": -0.3644777834415436, "logits/rejected": -0.4096454977989197, "logps/chosen": -0.07825501263141632, "logps/rejected": -0.5899345874786377, "loss": 0.1135, "odds_ratio_loss": 1.0353591442108154, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.007825501263141632, "rewards/margins": 0.05116795748472214, "rewards/rejected": -0.05899345874786377, "sft_loss": 0.009976235218346119, "step": 180 }, { "epoch": 3.753086419753086, "grad_norm": 2.1350457668304443, "learning_rate": 3.897982258676867e-06, "logits/chosen": -0.342695415019989, "logits/rejected": -0.41270384192466736, "logps/chosen": -0.09099350869655609, "logps/rejected": -0.6971117854118347, "loss": 0.1184, "odds_ratio_loss": 1.0633059740066528, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.009099351242184639, "rewards/margins": 0.06061183288693428, "rewards/rejected": -0.06971118599176407, "sft_loss": 0.012052880600094795, "step": 190 }, { "epoch": 3.950617283950617, "grad_norm": 1.1367535591125488, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -0.3970497250556946, "logits/rejected": -0.4613776206970215, "logps/chosen": -0.07991655170917511, "logps/rejected": -0.6025404334068298, "loss": 0.1129, "odds_ratio_loss": 1.0367439985275269, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.007991654798388481, "rewards/margins": 0.05226239562034607, "rewards/rejected": -0.0602540448307991, "sft_loss": 0.009189181961119175, "step": 200 }, { "epoch": 3.950617283950617, "eval_logits/chosen": -0.34801948070526123, "eval_logits/rejected": -0.4030517041683197, "eval_logps/chosen": -0.10840340703725815, "eval_logps/rejected": -0.5819076299667358, "eval_loss": 0.15150243043899536, "eval_odds_ratio_loss": 1.3828411102294922, "eval_rewards/accuracies": 0.8222222328186035, "eval_rewards/chosen": -0.010840341448783875, "eval_rewards/margins": 0.04735042154788971, "eval_rewards/rejected": -0.058190762996673584, "eval_runtime": 3.7401, "eval_samples_per_second": 24.064, "eval_sft_loss": 0.013218320906162262, "eval_steps_per_second": 12.032, "step": 200 }, { "epoch": 4.148148148148148, "grad_norm": 1.6960663795471191, "learning_rate": 3.595927866972694e-06, "logits/chosen": -0.3732069432735443, "logits/rejected": -0.4440130293369293, "logps/chosen": -0.06717614084482193, "logps/rejected": -0.516498327255249, "loss": 0.0942, "odds_ratio_loss": 0.8479182124137878, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.006717613898217678, "rewards/margins": 0.04493222385644913, "rewards/rejected": -0.05164983123540878, "sft_loss": 0.009402485564351082, "step": 210 }, { "epoch": 4.345679012345679, "grad_norm": 0.9651530385017395, "learning_rate": 3.436516483539781e-06, "logits/chosen": -0.3677094578742981, "logits/rejected": -0.4239886403083801, "logps/chosen": -0.09318893402814865, "logps/rejected": -0.6319376230239868, "loss": 0.13, "odds_ratio_loss": 1.189330816268921, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.009318893775343895, "rewards/margins": 0.053874872624874115, "rewards/rejected": -0.06319376826286316, "sft_loss": 0.011091579683125019, "step": 220 }, { "epoch": 4.54320987654321, "grad_norm": 1.0926239490509033, "learning_rate": 3.272542485937369e-06, "logits/chosen": -0.41018205881118774, "logits/rejected": -0.48241281509399414, "logps/chosen": -0.06124984472990036, "logps/rejected": -0.6271657347679138, "loss": 0.0887, "odds_ratio_loss": 0.8028218150138855, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.006124985404312611, "rewards/margins": 0.056591589003801346, "rewards/rejected": -0.06271658092737198, "sft_loss": 0.008464094251394272, "step": 230 }, { "epoch": 4.7407407407407405, "grad_norm": 1.5143941640853882, "learning_rate": 3.1048047389991693e-06, "logits/chosen": -0.36270713806152344, "logits/rejected": -0.4348696172237396, "logps/chosen": -0.05753596872091293, "logps/rejected": -0.6398877501487732, "loss": 0.0825, "odds_ratio_loss": 0.7441199421882629, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.005753597244620323, "rewards/margins": 0.058235179632902145, "rewards/rejected": -0.06398877501487732, "sft_loss": 0.008074344135820866, "step": 240 }, { "epoch": 4.938271604938271, "grad_norm": 1.6572201251983643, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -0.3572860658168793, "logits/rejected": -0.41670694947242737, "logps/chosen": -0.09173234552145004, "logps/rejected": -0.7026180028915405, "loss": 0.1194, "odds_ratio_loss": 1.0817253589630127, "rewards/accuracies": 0.84375, "rewards/chosen": -0.009173234924674034, "rewards/margins": 0.061088573187589645, "rewards/rejected": -0.07026181370019913, "sft_loss": 0.011259505525231361, "step": 250 }, { "epoch": 4.938271604938271, "eval_logits/chosen": -0.3416781425476074, "eval_logits/rejected": -0.3981509208679199, "eval_logps/chosen": -0.10881069302558899, "eval_logps/rejected": -0.641668975353241, "eval_loss": 0.15224343538284302, "eval_odds_ratio_loss": 1.3890637159347534, "eval_rewards/accuracies": 0.8222222328186035, "eval_rewards/chosen": -0.01088106818497181, "eval_rewards/margins": 0.053285837173461914, "eval_rewards/rejected": -0.06416690349578857, "eval_runtime": 3.7429, "eval_samples_per_second": 24.045, "eval_sft_loss": 0.01333706360310316, "eval_steps_per_second": 12.023, "step": 250 }, { "epoch": 5.135802469135802, "grad_norm": 0.6316437721252441, "learning_rate": 2.761321158169134e-06, "logits/chosen": -0.38472387194633484, "logits/rejected": -0.43845945596694946, "logps/chosen": -0.08237903565168381, "logps/rejected": -0.6897364854812622, "loss": 0.1098, "odds_ratio_loss": 1.019060730934143, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.00823790393769741, "rewards/margins": 0.06073574349284172, "rewards/rejected": -0.06897365301847458, "sft_loss": 0.007918295450508595, "step": 260 }, { "epoch": 5.333333333333333, "grad_norm": 2.247709035873413, "learning_rate": 2.587248741756253e-06, "logits/chosen": -0.33236485719680786, "logits/rejected": -0.41399604082107544, "logps/chosen": -0.06683196127414703, "logps/rejected": -0.745326578617096, "loss": 0.0981, "odds_ratio_loss": 0.897613525390625, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.0066831959411501884, "rewards/margins": 0.06784946471452713, "rewards/rejected": -0.07453266531229019, "sft_loss": 0.008373981341719627, "step": 270 }, { "epoch": 5.530864197530864, "grad_norm": 3.0054218769073486, "learning_rate": 2.4127512582437486e-06, "logits/chosen": -0.38011887669563293, "logits/rejected": -0.4398444592952728, "logps/chosen": -0.07033652812242508, "logps/rejected": -0.6755497455596924, "loss": 0.0967, "odds_ratio_loss": 0.8789058923721313, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.007033653557300568, "rewards/margins": 0.06052132323384285, "rewards/rejected": -0.06755497306585312, "sft_loss": 0.008822395466268063, "step": 280 }, { "epoch": 5.728395061728395, "grad_norm": 1.2473162412643433, "learning_rate": 2.238678841830867e-06, "logits/chosen": -0.35760438442230225, "logits/rejected": -0.432979017496109, "logps/chosen": -0.05806203559041023, "logps/rejected": -0.6580389142036438, "loss": 0.0822, "odds_ratio_loss": 0.7390257716178894, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.005806203465908766, "rewards/margins": 0.05999768525362015, "rewards/rejected": -0.0658038854598999, "sft_loss": 0.008302577771246433, "step": 290 }, { "epoch": 5.925925925925926, "grad_norm": 1.7050279378890991, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -0.39829307794570923, "logits/rejected": -0.4691620469093323, "logps/chosen": -0.06409647315740585, "logps/rejected": -0.6852135062217712, "loss": 0.0898, "odds_ratio_loss": 0.8166918754577637, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.00640964787453413, "rewards/margins": 0.062111712992191315, "rewards/rejected": -0.06852135807275772, "sft_loss": 0.00808151438832283, "step": 300 }, { "epoch": 5.925925925925926, "eval_logits/chosen": -0.3401709198951721, "eval_logits/rejected": -0.39598190784454346, "eval_logps/chosen": -0.11005988717079163, "eval_logps/rejected": -0.6839298605918884, "eval_loss": 0.153494730591774, "eval_odds_ratio_loss": 1.3989084959030151, "eval_rewards/accuracies": 0.8111110925674438, "eval_rewards/chosen": -0.011005990207195282, "eval_rewards/margins": 0.05738700181245804, "eval_rewards/rejected": -0.06839299947023392, "eval_runtime": 3.7412, "eval_samples_per_second": 24.056, "eval_sft_loss": 0.0136038763448596, "eval_steps_per_second": 12.028, "step": 300 }, { "epoch": 6.1234567901234565, "grad_norm": 0.6541029214859009, "learning_rate": 1.895195261000831e-06, "logits/chosen": -0.3678016662597656, "logits/rejected": -0.4242876470088959, "logps/chosen": -0.07081776112318039, "logps/rejected": -0.746510922908783, "loss": 0.0894, "odds_ratio_loss": 0.7998540997505188, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.007081775926053524, "rewards/margins": 0.0675693154335022, "rewards/rejected": -0.0746510922908783, "sft_loss": 0.009410968981683254, "step": 310 }, { "epoch": 6.320987654320987, "grad_norm": 1.3137905597686768, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -0.38049232959747314, "logits/rejected": -0.4344969391822815, "logps/chosen": -0.06233459711074829, "logps/rejected": -0.6675801277160645, "loss": 0.0862, "odds_ratio_loss": 0.7855814695358276, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.006233459338545799, "rewards/margins": 0.060524553060531616, "rewards/rejected": -0.06675801426172256, "sft_loss": 0.007671413477510214, "step": 320 }, { "epoch": 6.518518518518518, "grad_norm": 1.7016428709030151, "learning_rate": 1.56348351646022e-06, "logits/chosen": -0.35415560007095337, "logits/rejected": -0.4285738468170166, "logps/chosen": -0.06706033647060394, "logps/rejected": -0.7278292775154114, "loss": 0.0952, "odds_ratio_loss": 0.8702393770217896, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.0067060342989861965, "rewards/margins": 0.06607688963413239, "rewards/rejected": -0.07278293371200562, "sft_loss": 0.008213361725211143, "step": 330 }, { "epoch": 6.716049382716049, "grad_norm": 1.7326641082763672, "learning_rate": 1.4040721330273063e-06, "logits/chosen": -0.355478435754776, "logits/rejected": -0.42061376571655273, "logps/chosen": -0.057823676615953445, "logps/rejected": -0.6569600105285645, "loss": 0.0825, "odds_ratio_loss": 0.7589547634124756, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.005782367195934057, "rewards/margins": 0.05991363525390625, "rewards/rejected": -0.06569599360227585, "sft_loss": 0.006640643812716007, "step": 340 }, { "epoch": 6.91358024691358, "grad_norm": 2.150254011154175, "learning_rate": 1.2500000000000007e-06, "logits/chosen": -0.3611661493778229, "logits/rejected": -0.43402963876724243, "logps/chosen": -0.06616374105215073, "logps/rejected": -0.7352410554885864, "loss": 0.0928, "odds_ratio_loss": 0.8426556587219238, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.00661637494340539, "rewards/margins": 0.06690772622823715, "rewards/rejected": -0.0735241025686264, "sft_loss": 0.00857949536293745, "step": 350 }, { "epoch": 6.91358024691358, "eval_logits/chosen": -0.33941933512687683, "eval_logits/rejected": -0.39487287402153015, "eval_logps/chosen": -0.11251324415206909, "eval_logps/rejected": -0.679356038570404, "eval_loss": 0.1572149693965912, "eval_odds_ratio_loss": 1.431775450706482, "eval_rewards/accuracies": 0.7888888716697693, "eval_rewards/chosen": -0.011251323856413364, "eval_rewards/margins": 0.05668427795171738, "eval_rewards/rejected": -0.06793560087680817, "eval_runtime": 3.7421, "eval_samples_per_second": 24.05, "eval_sft_loss": 0.014037418179214, "eval_steps_per_second": 12.025, "step": 350 }, { "epoch": 7.111111111111111, "grad_norm": 2.0509181022644043, "learning_rate": 1.1020177413231334e-06, "logits/chosen": -0.3650658130645752, "logits/rejected": -0.4295966625213623, "logps/chosen": -0.057928264141082764, "logps/rejected": -0.6719712615013123, "loss": 0.084, "odds_ratio_loss": 0.771135687828064, "rewards/accuracies": 0.875, "rewards/chosen": -0.005792826414108276, "rewards/margins": 0.061404310166835785, "rewards/rejected": -0.06719712913036346, "sft_loss": 0.006877593696117401, "step": 360 }, { "epoch": 7.308641975308642, "grad_norm": 3.04136323928833, "learning_rate": 9.608463116858544e-07, "logits/chosen": -0.4004891812801361, "logits/rejected": -0.45741385221481323, "logps/chosen": -0.06307969242334366, "logps/rejected": -0.67022705078125, "loss": 0.0872, "odds_ratio_loss": 0.7940786480903625, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.006307969335466623, "rewards/margins": 0.060714732855558395, "rewards/rejected": -0.06702270358800888, "sft_loss": 0.00781711470335722, "step": 370 }, { "epoch": 7.506172839506172, "grad_norm": 1.3740649223327637, "learning_rate": 8.271734841028553e-07, "logits/chosen": -0.3516360819339752, "logits/rejected": -0.4145506024360657, "logps/chosen": -0.06131374090909958, "logps/rejected": -0.8121838569641113, "loss": 0.084, "odds_ratio_loss": 0.779072642326355, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.006131374277174473, "rewards/margins": 0.07508701831102371, "rewards/rejected": -0.08121839910745621, "sft_loss": 0.006131037604063749, "step": 380 }, { "epoch": 7.703703703703704, "grad_norm": 1.9797685146331787, "learning_rate": 7.016504991533727e-07, "logits/chosen": -0.37762802839279175, "logits/rejected": -0.4341215491294861, "logps/chosen": -0.06214147061109543, "logps/rejected": -0.7485532164573669, "loss": 0.0882, "odds_ratio_loss": 0.8045924305915833, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.006214146967977285, "rewards/margins": 0.06864118576049805, "rewards/rejected": -0.07485532760620117, "sft_loss": 0.007714971899986267, "step": 390 }, { "epoch": 7.901234567901234, "grad_norm": 1.4376161098480225, "learning_rate": 5.848888922025553e-07, "logits/chosen": -0.3702755868434906, "logits/rejected": -0.43015074729919434, "logps/chosen": -0.06369648873806, "logps/rejected": -0.6736232042312622, "loss": 0.0855, "odds_ratio_loss": 0.7752313613891602, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.0063696494325995445, "rewards/margins": 0.06099267676472664, "rewards/rejected": -0.06736232340335846, "sft_loss": 0.007961621508002281, "step": 400 }, { "epoch": 7.901234567901234, "eval_logits/chosen": -0.3374575674533844, "eval_logits/rejected": -0.39353418350219727, "eval_logps/chosen": -0.11246351897716522, "eval_logps/rejected": -0.7215471267700195, "eval_loss": 0.1577637493610382, "eval_odds_ratio_loss": 1.4393588304519653, "eval_rewards/accuracies": 0.800000011920929, "eval_rewards/chosen": -0.011246351525187492, "eval_rewards/margins": 0.060908351093530655, "eval_rewards/rejected": -0.072154700756073, "eval_runtime": 3.7403, "eval_samples_per_second": 24.062, "eval_sft_loss": 0.013827885501086712, "eval_steps_per_second": 12.031, "step": 400 }, { "epoch": 8.098765432098766, "grad_norm": 0.5824891328811646, "learning_rate": 4.774575140626317e-07, "logits/chosen": -0.3890485167503357, "logits/rejected": -0.4562603533267975, "logps/chosen": -0.052860356867313385, "logps/rejected": -0.8017857670783997, "loss": 0.0769, "odds_ratio_loss": 0.7033621072769165, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.005286035593599081, "rewards/margins": 0.07489253580570221, "rewards/rejected": -0.08017858117818832, "sft_loss": 0.00657792529091239, "step": 410 }, { "epoch": 8.296296296296296, "grad_norm": 2.3090314865112305, "learning_rate": 3.798797596089351e-07, "logits/chosen": -0.36521631479263306, "logits/rejected": -0.43496161699295044, "logps/chosen": -0.052145786583423615, "logps/rejected": -0.6766245365142822, "loss": 0.0738, "odds_ratio_loss": 0.6751989722251892, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.0052145784720778465, "rewards/margins": 0.062447868287563324, "rewards/rejected": -0.06766244769096375, "sft_loss": 0.0063272216357290745, "step": 420 }, { "epoch": 8.493827160493828, "grad_norm": 3.1166369915008545, "learning_rate": 2.9263101785268253e-07, "logits/chosen": -0.3338077664375305, "logits/rejected": -0.4001466631889343, "logps/chosen": -0.05268881469964981, "logps/rejected": -0.8022252321243286, "loss": 0.0766, "odds_ratio_loss": 0.6915222406387329, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.005268882028758526, "rewards/margins": 0.07495363801717758, "rewards/rejected": -0.08022252470254898, "sft_loss": 0.007409657351672649, "step": 430 }, { "epoch": 8.691358024691358, "grad_norm": 2.2428250312805176, "learning_rate": 2.1613635589349756e-07, "logits/chosen": -0.3932591676712036, "logits/rejected": -0.4431510865688324, "logps/chosen": -0.05874771624803543, "logps/rejected": -0.8061612844467163, "loss": 0.0783, "odds_ratio_loss": 0.7208496928215027, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.005874771624803543, "rewards/margins": 0.07474134862422943, "rewards/rejected": -0.08061611652374268, "sft_loss": 0.006212843116372824, "step": 440 }, { "epoch": 8.88888888888889, "grad_norm": 2.0012757778167725, "learning_rate": 1.507684480352292e-07, "logits/chosen": -0.3309887647628784, "logits/rejected": -0.3963066637516022, "logps/chosen": -0.07408946752548218, "logps/rejected": -0.7123581171035767, "loss": 0.0985, "odds_ratio_loss": 0.9039106369018555, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.007408946752548218, "rewards/margins": 0.06382687389850616, "rewards/rejected": -0.07123581320047379, "sft_loss": 0.008154405280947685, "step": 450 }, { "epoch": 8.88888888888889, "eval_logits/chosen": -0.33720430731773376, "eval_logits/rejected": -0.3934166729450226, "eval_logps/chosen": -0.11222676187753677, "eval_logps/rejected": -0.7204592227935791, "eval_loss": 0.1574162095785141, "eval_odds_ratio_loss": 1.4357547760009766, "eval_rewards/accuracies": 0.800000011920929, "eval_rewards/chosen": -0.011222676374018192, "eval_rewards/margins": 0.06082325428724289, "eval_rewards/rejected": -0.07204592972993851, "eval_runtime": 3.7588, "eval_samples_per_second": 23.944, "eval_sft_loss": 0.013840759173035622, "eval_steps_per_second": 11.972, "step": 450 }, { "epoch": 9.08641975308642, "grad_norm": 1.9788808822631836, "learning_rate": 9.684576015420277e-08, "logits/chosen": -0.3609127700328827, "logits/rejected": -0.41957467794418335, "logps/chosen": -0.05120778828859329, "logps/rejected": -0.6972073316574097, "loss": 0.0681, "odds_ratio_loss": 0.6181837320327759, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.005120779387652874, "rewards/margins": 0.06459995359182358, "rewards/rejected": -0.06972073018550873, "sft_loss": 0.006232709623873234, "step": 460 }, { "epoch": 9.283950617283951, "grad_norm": 0.5987529158592224, "learning_rate": 5.463099816548578e-08, "logits/chosen": -0.3785928189754486, "logits/rejected": -0.44030579924583435, "logps/chosen": -0.06789538264274597, "logps/rejected": -0.718836784362793, "loss": 0.0919, "odds_ratio_loss": 0.8372634649276733, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.006789539009332657, "rewards/margins": 0.06509413570165634, "rewards/rejected": -0.0718836709856987, "sft_loss": 0.008134867064654827, "step": 470 }, { "epoch": 9.481481481481481, "grad_norm": 2.4453930854797363, "learning_rate": 2.4329828146074096e-08, "logits/chosen": -0.3719639182090759, "logits/rejected": -0.44119659066200256, "logps/chosen": -0.06199900060892105, "logps/rejected": -0.9121496081352234, "loss": 0.0796, "odds_ratio_loss": 0.7194961905479431, "rewards/accuracies": 0.90625, "rewards/chosen": -0.006199900526553392, "rewards/margins": 0.08501505851745605, "rewards/rejected": -0.09121496230363846, "sft_loss": 0.0076431455090641975, "step": 480 }, { "epoch": 9.679012345679013, "grad_norm": 2.6841046810150146, "learning_rate": 6.089874350439507e-09, "logits/chosen": -0.3508923053741455, "logits/rejected": -0.410422146320343, "logps/chosen": -0.050847820937633514, "logps/rejected": -0.717828631401062, "loss": 0.0729, "odds_ratio_loss": 0.6642852425575256, "rewards/accuracies": 0.90625, "rewards/chosen": -0.005084782373160124, "rewards/margins": 0.06669807434082031, "rewards/rejected": -0.07178286463022232, "sft_loss": 0.006511971354484558, "step": 490 }, { "epoch": 9.876543209876543, "grad_norm": 1.9767255783081055, "learning_rate": 0.0, "logits/chosen": -0.3572847545146942, "logits/rejected": -0.43199190497398376, "logps/chosen": -0.0603826642036438, "logps/rejected": -0.6694537401199341, "loss": 0.0859, "odds_ratio_loss": 0.7898350954055786, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.006038266699761152, "rewards/margins": 0.060907114297151566, "rewards/rejected": -0.066945381462574, "sft_loss": 0.006888846401125193, "step": 500 }, { "epoch": 9.876543209876543, "eval_logits/chosen": -0.33729952573776245, "eval_logits/rejected": -0.39373528957366943, "eval_logps/chosen": -0.11289726942777634, "eval_logps/rejected": -0.7238631248474121, "eval_loss": 0.15817420184612274, "eval_odds_ratio_loss": 1.4418776035308838, "eval_rewards/accuracies": 0.7888888716697693, "eval_rewards/chosen": -0.011289726011455059, "eval_rewards/margins": 0.061096593737602234, "eval_rewards/rejected": -0.07238632440567017, "eval_runtime": 3.7425, "eval_samples_per_second": 24.048, "eval_sft_loss": 0.013986458070576191, "eval_steps_per_second": 12.024, "step": 500 }, { "epoch": 9.876543209876543, "step": 500, "total_flos": 8.823432944104243e+16, "train_loss": 0.3188942909240723, "train_runtime": 1116.7498, "train_samples_per_second": 7.253, "train_steps_per_second": 0.448 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.823432944104243e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }