{ "best_metric": 0.1529119908809662, "best_model_checkpoint": "saves/Llama-3.1-8B-Instruct/lora/saa-800/checkpoint-450", "epoch": 10.0, "eval_steps": 50, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2222222222222222, "grad_norm": 6.713047504425049, "learning_rate": 1.111111111111111e-06, "logits/chosen": -0.4419662356376648, "logits/rejected": -0.5169282555580139, "logps/chosen": -1.7502187490463257, "logps/rejected": -2.110954523086548, "loss": 1.8058, "odds_ratio_loss": 15.742452621459961, "rewards/accuracies": 0.75, "rewards/chosen": -0.17502185702323914, "rewards/margins": 0.03607357665896416, "rewards/rejected": -0.21109545230865479, "sft_loss": 0.23150739073753357, "step": 10 }, { "epoch": 0.4444444444444444, "grad_norm": 7.350757122039795, "learning_rate": 2.222222222222222e-06, "logits/chosen": -0.390705406665802, "logits/rejected": -0.4706195294857025, "logps/chosen": -1.6888424158096313, "logps/rejected": -2.1693763732910156, "loss": 1.7416, "odds_ratio_loss": 15.1643705368042, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.16888423264026642, "rewards/margins": 0.048053398728370667, "rewards/rejected": -0.21693763136863708, "sft_loss": 0.22513218224048615, "step": 20 }, { "epoch": 0.6666666666666666, "grad_norm": 8.413761138916016, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -0.42878788709640503, "logits/rejected": -0.5077282190322876, "logps/chosen": -1.698947548866272, "logps/rejected": -2.1924192905426025, "loss": 1.7505, "odds_ratio_loss": 15.268714904785156, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1698947548866272, "rewards/margins": 0.04934718459844589, "rewards/rejected": -0.2192419320344925, "sft_loss": 0.22367195785045624, "step": 30 }, { "epoch": 0.8888888888888888, "grad_norm": 6.453053951263428, "learning_rate": 4.444444444444444e-06, "logits/chosen": -0.40729817748069763, "logits/rejected": -0.480067640542984, "logps/chosen": -1.677170753479004, "logps/rejected": -1.9914417266845703, "loss": 1.736, "odds_ratio_loss": 15.256231307983398, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.1677170693874359, "rewards/margins": 0.03142711520195007, "rewards/rejected": -0.1991441696882248, "sft_loss": 0.21038159728050232, "step": 40 }, { "epoch": 1.1111111111111112, "grad_norm": 5.584766864776611, "learning_rate": 4.998119881260576e-06, "logits/chosen": -0.4123212695121765, "logits/rejected": -0.49063047766685486, "logps/chosen": -1.4437320232391357, "logps/rejected": -1.951922059059143, "loss": 1.4945, "odds_ratio_loss": 13.130969047546387, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.14437320828437805, "rewards/margins": 0.050819020718336105, "rewards/rejected": -0.19519223272800446, "sft_loss": 0.18137237429618835, "step": 50 }, { "epoch": 1.1111111111111112, "eval_logits/chosen": -0.3793938159942627, "eval_logits/rejected": -0.45482414960861206, "eval_logps/chosen": -1.228076696395874, "eval_logps/rejected": -1.7094627618789673, "eval_loss": 1.276250958442688, "eval_odds_ratio_loss": 11.260908126831055, "eval_rewards/accuracies": 0.7875000238418579, "eval_rewards/chosen": -0.12280768156051636, "eval_rewards/margins": 0.04813859239220619, "eval_rewards/rejected": -0.17094627022743225, "eval_runtime": 3.3465, "eval_samples_per_second": 23.905, "eval_sft_loss": 0.15016020834445953, "eval_steps_per_second": 11.953, "step": 50 }, { "epoch": 1.3333333333333333, "grad_norm": 3.5215771198272705, "learning_rate": 4.983095894354858e-06, "logits/chosen": -0.4166947901248932, "logits/rejected": -0.49852150678634644, "logps/chosen": -1.0756200551986694, "logps/rejected": -1.5570634603500366, "loss": 1.1242, "odds_ratio_loss": 9.97706127166748, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.10756200551986694, "rewards/margins": 0.04814432933926582, "rewards/rejected": -0.15570633113384247, "sft_loss": 0.1265355795621872, "step": 60 }, { "epoch": 1.5555555555555556, "grad_norm": 3.005864381790161, "learning_rate": 4.953138276568462e-06, "logits/chosen": -0.45257633924484253, "logits/rejected": -0.5196300148963928, "logps/chosen": -0.6806862950325012, "logps/rejected": -1.1254621744155884, "loss": 0.7274, "odds_ratio_loss": 6.491485595703125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06806863099336624, "rewards/margins": 0.044477589428424835, "rewards/rejected": -0.11254620552062988, "sft_loss": 0.07826091349124908, "step": 70 }, { "epoch": 1.7777777777777777, "grad_norm": 3.241006851196289, "learning_rate": 4.908427196539701e-06, "logits/chosen": -0.4250833988189697, "logits/rejected": -0.49341732263565063, "logps/chosen": -0.586665153503418, "logps/rejected": -0.9428589940071106, "loss": 0.6374, "odds_ratio_loss": 5.749706268310547, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.05866651609539986, "rewards/margins": 0.03561938554048538, "rewards/rejected": -0.09428589791059494, "sft_loss": 0.06239296868443489, "step": 80 }, { "epoch": 2.0, "grad_norm": 3.3662965297698975, "learning_rate": 4.849231551964771e-06, "logits/chosen": -0.39320772886276245, "logits/rejected": -0.48544201254844666, "logps/chosen": -0.3357451558113098, "logps/rejected": -0.9199529886245728, "loss": 0.3774, "odds_ratio_loss": 3.414803981781006, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.03357451409101486, "rewards/margins": 0.05842079594731331, "rewards/rejected": -0.09199531376361847, "sft_loss": 0.03593815118074417, "step": 90 }, { "epoch": 2.2222222222222223, "grad_norm": 1.9536067247390747, "learning_rate": 4.775907352415367e-06, "logits/chosen": -0.3746485710144043, "logits/rejected": -0.46515822410583496, "logps/chosen": -0.22609436511993408, "logps/rejected": -0.7698042392730713, "loss": 0.2666, "odds_ratio_loss": 2.4158308506011963, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.022609438747167587, "rewards/margins": 0.05437099188566208, "rewards/rejected": -0.07698042690753937, "sft_loss": 0.02498885616660118, "step": 100 }, { "epoch": 2.2222222222222223, "eval_logits/chosen": -0.36658304929733276, "eval_logits/rejected": -0.4319043755531311, "eval_logps/chosen": -0.2119404524564743, "eval_logps/rejected": -0.7287805676460266, "eval_loss": 0.24907442927360535, "eval_odds_ratio_loss": 2.2483649253845215, "eval_rewards/accuracies": 0.824999988079071, "eval_rewards/chosen": -0.02119404636323452, "eval_rewards/margins": 0.05168401449918747, "eval_rewards/rejected": -0.07287804782390594, "eval_runtime": 3.3336, "eval_samples_per_second": 23.998, "eval_sft_loss": 0.02423796057701111, "eval_steps_per_second": 11.999, "step": 100 }, { "epoch": 2.4444444444444446, "grad_norm": 1.7341715097427368, "learning_rate": 4.688895578255228e-06, "logits/chosen": -0.38399261236190796, "logits/rejected": -0.45104750990867615, "logps/chosen": -0.1595337688922882, "logps/rejected": -0.6476808190345764, "loss": 0.1984, "odds_ratio_loss": 1.7913240194320679, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.01595337688922882, "rewards/margins": 0.048814695328474045, "rewards/rejected": -0.06476806849241257, "sft_loss": 0.0192393995821476, "step": 110 }, { "epoch": 2.6666666666666665, "grad_norm": 1.6577801704406738, "learning_rate": 4.588719528532342e-06, "logits/chosen": -0.42996135354042053, "logits/rejected": -0.5020943880081177, "logps/chosen": -0.11305193603038788, "logps/rejected": -0.6292703747749329, "loss": 0.1528, "odds_ratio_loss": 1.378528356552124, "rewards/accuracies": 0.8125, "rewards/chosen": -0.011305193416774273, "rewards/margins": 0.05162184312939644, "rewards/rejected": -0.06292703002691269, "sft_loss": 0.014913685619831085, "step": 120 }, { "epoch": 2.888888888888889, "grad_norm": 0.804415762424469, "learning_rate": 4.475981673796899e-06, "logits/chosen": -0.4010156989097595, "logits/rejected": -0.4532565474510193, "logps/chosen": -0.11344721168279648, "logps/rejected": -0.5929852724075317, "loss": 0.1557, "odds_ratio_loss": 1.430558204650879, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.011344721540808678, "rewards/margins": 0.047953806817531586, "rewards/rejected": -0.059298526495695114, "sft_loss": 0.012685336172580719, "step": 130 }, { "epoch": 3.111111111111111, "grad_norm": 1.188767910003662, "learning_rate": 4.351360032772512e-06, "logits/chosen": -0.37222054600715637, "logits/rejected": -0.4317198395729065, "logps/chosen": -0.10829365253448486, "logps/rejected": -0.6532029509544373, "loss": 0.1518, "odds_ratio_loss": 1.385451078414917, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.010829364880919456, "rewards/margins": 0.05449093505740166, "rewards/rejected": -0.06532029807567596, "sft_loss": 0.013293206691741943, "step": 140 }, { "epoch": 3.3333333333333335, "grad_norm": 1.1802352666854858, "learning_rate": 4.215604094671835e-06, "logits/chosen": -0.3585512936115265, "logits/rejected": -0.4357479512691498, "logps/chosen": -0.07403306663036346, "logps/rejected": -0.7158663868904114, "loss": 0.1014, "odds_ratio_loss": 0.9041417837142944, "rewards/accuracies": 0.875, "rewards/chosen": -0.007403307594358921, "rewards/margins": 0.06418333202600479, "rewards/rejected": -0.07158664613962173, "sft_loss": 0.010936584323644638, "step": 150 }, { "epoch": 3.3333333333333335, "eval_logits/chosen": -0.3283754289150238, "eval_logits/rejected": -0.3820287585258484, "eval_logps/chosen": -0.1292387694120407, "eval_logps/rejected": -0.6057685017585754, "eval_loss": 0.16315264999866486, "eval_odds_ratio_loss": 1.4634513854980469, "eval_rewards/accuracies": 0.8125, "eval_rewards/chosen": -0.012923876754939556, "eval_rewards/margins": 0.047652970999479294, "eval_rewards/rejected": -0.060576848685741425, "eval_runtime": 3.3355, "eval_samples_per_second": 23.984, "eval_sft_loss": 0.016807515174150467, "eval_steps_per_second": 11.992, "step": 150 }, { "epoch": 3.5555555555555554, "grad_norm": 2.5364763736724854, "learning_rate": 4.069530311680247e-06, "logits/chosen": -0.37899985909461975, "logits/rejected": -0.4429406225681305, "logps/chosen": -0.09646956622600555, "logps/rejected": -0.5016245245933533, "loss": 0.1441, "odds_ratio_loss": 1.3272716999053955, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.00964695680886507, "rewards/margins": 0.04051550105214119, "rewards/rejected": -0.050162456929683685, "sft_loss": 0.011343193240463734, "step": 160 }, { "epoch": 3.7777777777777777, "grad_norm": 1.1225122213363647, "learning_rate": 3.914017188716347e-06, "logits/chosen": -0.32853808999061584, "logits/rejected": -0.3947625160217285, "logps/chosen": -0.0880068689584732, "logps/rejected": -0.5245143175125122, "loss": 0.1251, "odds_ratio_loss": 1.1624243259429932, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.00880068726837635, "rewards/margins": 0.04365074634552002, "rewards/rejected": -0.05245143175125122, "sft_loss": 0.00884676817804575, "step": 170 }, { "epoch": 4.0, "grad_norm": 2.3566975593566895, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -0.394605427980423, "logits/rejected": -0.4463408589363098, "logps/chosen": -0.09484803676605225, "logps/rejected": -0.5903218984603882, "loss": 0.1306, "odds_ratio_loss": 1.1676770448684692, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.00948480237275362, "rewards/margins": 0.049547381699085236, "rewards/rejected": -0.05903219059109688, "sft_loss": 0.013791071251034737, "step": 180 }, { "epoch": 4.222222222222222, "grad_norm": 1.111855387687683, "learning_rate": 3.578465164203134e-06, "logits/chosen": -0.3551129698753357, "logits/rejected": -0.43426722288131714, "logps/chosen": -0.06811969727277756, "logps/rejected": -0.6758677959442139, "loss": 0.0949, "odds_ratio_loss": 0.8656018376350403, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.006811970379203558, "rewards/margins": 0.06077481061220169, "rewards/rejected": -0.06758677959442139, "sft_loss": 0.008338207378983498, "step": 190 }, { "epoch": 4.444444444444445, "grad_norm": 1.3708534240722656, "learning_rate": 3.400444312011776e-06, "logits/chosen": -0.345875084400177, "logits/rejected": -0.3952234089374542, "logps/chosen": -0.10109534114599228, "logps/rejected": -0.5074071288108826, "loss": 0.1429, "odds_ratio_loss": 1.325073003768921, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.010109534487128258, "rewards/margins": 0.04063117876648903, "rewards/rejected": -0.05074070766568184, "sft_loss": 0.010358814150094986, "step": 200 }, { "epoch": 4.444444444444445, "eval_logits/chosen": -0.32978400588035583, "eval_logits/rejected": -0.38182884454727173, "eval_logps/chosen": -0.121058389544487, "eval_logps/rejected": -0.5841361284255981, "eval_loss": 0.1533508002758026, "eval_odds_ratio_loss": 1.3751581907272339, "eval_rewards/accuracies": 0.8125, "eval_rewards/chosen": -0.01210583932697773, "eval_rewards/margins": 0.04630777984857559, "eval_rewards/rejected": -0.058413613587617874, "eval_runtime": 3.3378, "eval_samples_per_second": 23.968, "eval_sft_loss": 0.015834979712963104, "eval_steps_per_second": 11.984, "step": 200 }, { "epoch": 4.666666666666667, "grad_norm": 1.4620966911315918, "learning_rate": 3.217008081777726e-06, "logits/chosen": -0.3843366503715515, "logits/rejected": -0.4375368058681488, "logps/chosen": -0.08334760367870331, "logps/rejected": -0.5660797357559204, "loss": 0.1171, "odds_ratio_loss": 1.0661273002624512, "rewards/accuracies": 0.84375, "rewards/chosen": -0.00833476148545742, "rewards/margins": 0.04827320575714111, "rewards/rejected": -0.05660796910524368, "sft_loss": 0.010473220609128475, "step": 210 }, { "epoch": 4.888888888888889, "grad_norm": 1.2518880367279053, "learning_rate": 3.0292596805735275e-06, "logits/chosen": -0.3598805367946625, "logits/rejected": -0.4199501872062683, "logps/chosen": -0.07187429815530777, "logps/rejected": -0.5510466694831848, "loss": 0.1029, "odds_ratio_loss": 0.9251865148544312, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.007187430746853352, "rewards/margins": 0.047917239367961884, "rewards/rejected": -0.05510466545820236, "sft_loss": 0.010374334640800953, "step": 220 }, { "epoch": 5.111111111111111, "grad_norm": 1.3117250204086304, "learning_rate": 2.8383282493753282e-06, "logits/chosen": -0.3635903596878052, "logits/rejected": -0.43442487716674805, "logps/chosen": -0.06123008579015732, "logps/rejected": -0.7030965089797974, "loss": 0.0878, "odds_ratio_loss": 0.7977678179740906, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.006123009137809277, "rewards/margins": 0.06418663263320923, "rewards/rejected": -0.07030965387821198, "sft_loss": 0.007975312881171703, "step": 230 }, { "epoch": 5.333333333333333, "grad_norm": 2.640739679336548, "learning_rate": 2.6453620722761897e-06, "logits/chosen": -0.350188285112381, "logits/rejected": -0.4023573398590088, "logps/chosen": -0.08658941090106964, "logps/rejected": -0.6677496433258057, "loss": 0.1183, "odds_ratio_loss": 1.068228006362915, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.00865894090384245, "rewards/margins": 0.05811602994799614, "rewards/rejected": -0.06677497178316116, "sft_loss": 0.011527000926434994, "step": 240 }, { "epoch": 5.555555555555555, "grad_norm": 1.3426445722579956, "learning_rate": 2.4515216705704396e-06, "logits/chosen": -0.3609291911125183, "logits/rejected": -0.4190797209739685, "logps/chosen": -0.0702720507979393, "logps/rejected": -0.596028745174408, "loss": 0.1007, "odds_ratio_loss": 0.925524115562439, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.00702720507979393, "rewards/margins": 0.052575670182704926, "rewards/rejected": -0.059602878987789154, "sft_loss": 0.008168576285243034, "step": 250 }, { "epoch": 5.555555555555555, "eval_logits/chosen": -0.32348352670669556, "eval_logits/rejected": -0.37402915954589844, "eval_logps/chosen": -0.12062982469797134, "eval_logps/rejected": -0.6407424807548523, "eval_loss": 0.15295907855033875, "eval_odds_ratio_loss": 1.3704458475112915, "eval_rewards/accuracies": 0.8125, "eval_rewards/chosen": -0.012062981724739075, "eval_rewards/margins": 0.05201127007603645, "eval_rewards/rejected": -0.06407425552606583, "eval_runtime": 3.3317, "eval_samples_per_second": 24.012, "eval_sft_loss": 0.01591448113322258, "eval_steps_per_second": 12.006, "step": 250 }, { "epoch": 5.777777777777778, "grad_norm": 1.4032063484191895, "learning_rate": 2.2579728232420524e-06, "logits/chosen": -0.40077710151672363, "logits/rejected": -0.4527947008609772, "logps/chosen": -0.0916273444890976, "logps/rejected": -0.5904611349105835, "loss": 0.127, "odds_ratio_loss": 1.1558631658554077, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.009162734262645245, "rewards/margins": 0.049883387982845306, "rewards/rejected": -0.05904611945152283, "sft_loss": 0.011430850252509117, "step": 260 }, { "epoch": 6.0, "grad_norm": 1.6447800397872925, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -0.36904090642929077, "logits/rejected": -0.43377822637557983, "logps/chosen": -0.05662701651453972, "logps/rejected": -0.6641786098480225, "loss": 0.0778, "odds_ratio_loss": 0.7118644714355469, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.005662702023983002, "rewards/margins": 0.060755155980587006, "rewards/rejected": -0.06641785800457001, "sft_loss": 0.006575153209269047, "step": 270 }, { "epoch": 6.222222222222222, "grad_norm": 2.223752021789551, "learning_rate": 1.876397139855047e-06, "logits/chosen": -0.3556548058986664, "logits/rejected": -0.4135330319404602, "logps/chosen": -0.0746704488992691, "logps/rejected": -0.6784394979476929, "loss": 0.1018, "odds_ratio_loss": 0.9240021705627441, "rewards/accuracies": 0.875, "rewards/chosen": -0.007467044983059168, "rewards/margins": 0.06037690117955208, "rewards/rejected": -0.0678439512848854, "sft_loss": 0.009404648095369339, "step": 280 }, { "epoch": 6.444444444444445, "grad_norm": 1.75334894657135, "learning_rate": 1.6906651448541977e-06, "logits/chosen": -0.3690106272697449, "logits/rejected": -0.4269256591796875, "logps/chosen": -0.05620593950152397, "logps/rejected": -0.6589001417160034, "loss": 0.0805, "odds_ratio_loss": 0.7411076426506042, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.005620594136416912, "rewards/margins": 0.060269422829151154, "rewards/rejected": -0.06589001417160034, "sft_loss": 0.006437716074287891, "step": 290 }, { "epoch": 6.666666666666667, "grad_norm": 4.5506911277771, "learning_rate": 1.509800584902108e-06, "logits/chosen": -0.3519323468208313, "logits/rejected": -0.41471752524375916, "logps/chosen": -0.10020919144153595, "logps/rejected": -0.6135331392288208, "loss": 0.1385, "odds_ratio_loss": 1.263484001159668, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.010020919144153595, "rewards/margins": 0.051332395523786545, "rewards/rejected": -0.06135330721735954, "sft_loss": 0.012116280384361744, "step": 300 }, { "epoch": 6.666666666666667, "eval_logits/chosen": -0.32135990262031555, "eval_logits/rejected": -0.3725373148918152, "eval_logps/chosen": -0.12169794738292694, "eval_logps/rejected": -0.6881048083305359, "eval_loss": 0.15343782305717468, "eval_odds_ratio_loss": 1.3728852272033691, "eval_rewards/accuracies": 0.800000011920929, "eval_rewards/chosen": -0.012169795110821724, "eval_rewards/margins": 0.05664069205522537, "eval_rewards/rejected": -0.06881048530340195, "eval_runtime": 3.3342, "eval_samples_per_second": 23.994, "eval_sft_loss": 0.016149301081895828, "eval_steps_per_second": 11.997, "step": 300 }, { "epoch": 6.888888888888889, "grad_norm": 1.4514155387878418, "learning_rate": 1.3348912007436538e-06, "logits/chosen": -0.33361396193504333, "logits/rejected": -0.3964681923389435, "logps/chosen": -0.05151065066456795, "logps/rejected": -0.6935869455337524, "loss": 0.075, "odds_ratio_loss": 0.6938644051551819, "rewards/accuracies": 0.90625, "rewards/chosen": -0.005151065066456795, "rewards/margins": 0.06420762091875076, "rewards/rejected": -0.069358691573143, "sft_loss": 0.005607123486697674, "step": 310 }, { "epoch": 7.111111111111111, "grad_norm": 1.1120833158493042, "learning_rate": 1.1669889179957725e-06, "logits/chosen": -0.34897491335868835, "logits/rejected": -0.4280971884727478, "logps/chosen": -0.06502507627010345, "logps/rejected": -0.724215030670166, "loss": 0.0902, "odds_ratio_loss": 0.8162932395935059, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.006502507720142603, "rewards/margins": 0.06591899693012238, "rewards/rejected": -0.07242151349782944, "sft_loss": 0.008545474149286747, "step": 320 }, { "epoch": 7.333333333333333, "grad_norm": 0.9275766611099243, "learning_rate": 1.0071035207430352e-06, "logits/chosen": -0.34552061557769775, "logits/rejected": -0.4186931550502777, "logps/chosen": -0.057514738291502, "logps/rejected": -0.7304258942604065, "loss": 0.0812, "odds_ratio_loss": 0.7468503713607788, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.005751474294811487, "rewards/margins": 0.06729111820459366, "rewards/rejected": -0.07304258644580841, "sft_loss": 0.006484520621597767, "step": 330 }, { "epoch": 7.555555555555555, "grad_norm": 1.7526174783706665, "learning_rate": 8.561965785773413e-07, "logits/chosen": -0.35446950793266296, "logits/rejected": -0.41398343443870544, "logps/chosen": -0.04472974315285683, "logps/rejected": -0.6881515979766846, "loss": 0.0655, "odds_ratio_loss": 0.5992408394813538, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.004472973756492138, "rewards/margins": 0.0643421858549118, "rewards/rejected": -0.06881515681743622, "sft_loss": 0.0055578285828232765, "step": 340 }, { "epoch": 7.777777777777778, "grad_norm": 1.4603219032287598, "learning_rate": 7.151756636052529e-07, "logits/chosen": -0.35244011878967285, "logits/rejected": -0.4057006239891052, "logps/chosen": -0.06629346311092377, "logps/rejected": -0.6709402203559875, "loss": 0.0918, "odds_ratio_loss": 0.841974139213562, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.0066293468698859215, "rewards/margins": 0.060464680194854736, "rewards/rejected": -0.06709402799606323, "sft_loss": 0.0076185790821909904, "step": 350 }, { "epoch": 7.777777777777778, "eval_logits/chosen": -0.31914329528808594, "eval_logits/rejected": -0.3697816729545593, "eval_logps/chosen": -0.12174425274133682, "eval_logps/rejected": -0.6888757944107056, "eval_loss": 0.1536635458469391, "eval_odds_ratio_loss": 1.3741592168807983, "eval_rewards/accuracies": 0.8125, "eval_rewards/chosen": -0.012174425646662712, "eval_rewards/margins": 0.05671315640211105, "eval_rewards/rejected": -0.06888757646083832, "eval_runtime": 3.3381, "eval_samples_per_second": 23.966, "eval_sft_loss": 0.016247618943452835, "eval_steps_per_second": 11.983, "step": 350 }, { "epoch": 8.0, "grad_norm": 0.9194740056991577, "learning_rate": 5.848888922025553e-07, "logits/chosen": -0.3591064214706421, "logits/rejected": -0.4297390878200531, "logps/chosen": -0.08391134440898895, "logps/rejected": -0.7468418478965759, "loss": 0.118, "odds_ratio_loss": 1.0900821685791016, "rewards/accuracies": 0.84375, "rewards/chosen": -0.00839113537222147, "rewards/margins": 0.06629304587841034, "rewards/rejected": -0.07468418776988983, "sft_loss": 0.008979488164186478, "step": 360 }, { "epoch": 8.222222222222221, "grad_norm": 1.1211438179016113, "learning_rate": 4.661198243425813e-07, "logits/chosen": -0.3469446301460266, "logits/rejected": -0.416546493768692, "logps/chosen": -0.06295167654752731, "logps/rejected": -0.6733905076980591, "loss": 0.0877, "odds_ratio_loss": 0.7965422868728638, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.006295167841017246, "rewards/margins": 0.0610438771545887, "rewards/rejected": -0.06733904778957367, "sft_loss": 0.008044925518333912, "step": 370 }, { "epoch": 8.444444444444445, "grad_norm": 2.9023804664611816, "learning_rate": 3.595827511743341e-07, "logits/chosen": -0.36501026153564453, "logits/rejected": -0.4163384437561035, "logps/chosen": -0.0608031265437603, "logps/rejected": -0.7410011291503906, "loss": 0.0857, "odds_ratio_loss": 0.7958894371986389, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.006080311723053455, "rewards/margins": 0.068019799888134, "rewards/rejected": -0.07410011440515518, "sft_loss": 0.006128143519163132, "step": 380 }, { "epoch": 8.666666666666666, "grad_norm": 1.2382421493530273, "learning_rate": 2.6591839919146963e-07, "logits/chosen": -0.3580993711948395, "logits/rejected": -0.4169589579105377, "logps/chosen": -0.07410109043121338, "logps/rejected": -0.7789124250411987, "loss": 0.1054, "odds_ratio_loss": 0.9693442583084106, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.0074101099744439125, "rewards/margins": 0.07048113644123077, "rewards/rejected": -0.07789124548435211, "sft_loss": 0.008436056785285473, "step": 390 }, { "epoch": 8.88888888888889, "grad_norm": 1.7495219707489014, "learning_rate": 1.8569007682777417e-07, "logits/chosen": -0.37370580434799194, "logits/rejected": -0.43429917097091675, "logps/chosen": -0.05244039371609688, "logps/rejected": -0.7023984789848328, "loss": 0.0752, "odds_ratio_loss": 0.6945605874061584, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.005244039464741945, "rewards/margins": 0.06499581038951874, "rewards/rejected": -0.0702398419380188, "sft_loss": 0.005755658261477947, "step": 400 }, { "epoch": 8.88888888888889, "eval_logits/chosen": -0.3194546103477478, "eval_logits/rejected": -0.3705773651599884, "eval_logps/chosen": -0.1213468462228775, "eval_logps/rejected": -0.689638614654541, "eval_loss": 0.15339744091033936, "eval_odds_ratio_loss": 1.3723235130310059, "eval_rewards/accuracies": 0.800000011920929, "eval_rewards/chosen": -0.01213468424975872, "eval_rewards/margins": 0.056829191744327545, "eval_rewards/rejected": -0.06896387040615082, "eval_runtime": 3.3351, "eval_samples_per_second": 23.987, "eval_sft_loss": 0.016165101900696754, "eval_steps_per_second": 11.994, "step": 400 }, { "epoch": 9.11111111111111, "grad_norm": 2.9914894104003906, "learning_rate": 1.1938028665396172e-07, "logits/chosen": -0.3595274090766907, "logits/rejected": -0.41625022888183594, "logps/chosen": -0.07939668744802475, "logps/rejected": -0.6182006597518921, "loss": 0.1064, "odds_ratio_loss": 0.9655207395553589, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.007939668372273445, "rewards/margins": 0.05388040468096733, "rewards/rejected": -0.06182006746530533, "sft_loss": 0.009880236349999905, "step": 410 }, { "epoch": 9.333333333333334, "grad_norm": 1.7583844661712646, "learning_rate": 6.738782355044048e-08, "logits/chosen": -0.35592731833457947, "logits/rejected": -0.41578736901283264, "logps/chosen": -0.06436184048652649, "logps/rejected": -0.6822584867477417, "loss": 0.0919, "odds_ratio_loss": 0.8448025584220886, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.006436183117330074, "rewards/margins": 0.06178967282176018, "rewards/rejected": -0.06822585314512253, "sft_loss": 0.007466461509466171, "step": 420 }, { "epoch": 9.555555555555555, "grad_norm": 0.7222520112991333, "learning_rate": 3.0025376307977474e-08, "logits/chosen": -0.3527710437774658, "logits/rejected": -0.41858869791030884, "logps/chosen": -0.042861782014369965, "logps/rejected": -0.7103925943374634, "loss": 0.0601, "odds_ratio_loss": 0.5496730804443359, "rewards/accuracies": 0.9375, "rewards/chosen": -0.004286178387701511, "rewards/margins": 0.0667530745267868, "rewards/rejected": -0.07103925198316574, "sft_loss": 0.005130757577717304, "step": 430 }, { "epoch": 9.777777777777779, "grad_norm": 1.2489662170410156, "learning_rate": 7.517647080519941e-09, "logits/chosen": -0.34115296602249146, "logits/rejected": -0.39803385734558105, "logps/chosen": -0.05217872932553291, "logps/rejected": -0.8210271000862122, "loss": 0.0779, "odds_ratio_loss": 0.7251914739608765, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.0052178725600242615, "rewards/margins": 0.07688483595848083, "rewards/rejected": -0.0821027159690857, "sft_loss": 0.0053517939522862434, "step": 440 }, { "epoch": 10.0, "grad_norm": 1.9128329753875732, "learning_rate": 0.0, "logits/chosen": -0.3783785402774811, "logits/rejected": -0.4389713406562805, "logps/chosen": -0.07910243421792984, "logps/rejected": -0.6643815040588379, "loss": 0.1052, "odds_ratio_loss": 0.9564045667648315, "rewards/accuracies": 0.875, "rewards/chosen": -0.007910243235528469, "rewards/margins": 0.05852789804339409, "rewards/rejected": -0.06643815338611603, "sft_loss": 0.009510872885584831, "step": 450 }, { "epoch": 10.0, "eval_logits/chosen": -0.3184443712234497, "eval_logits/rejected": -0.36903008818626404, "eval_logps/chosen": -0.1208958849310875, "eval_logps/rejected": -0.6922038793563843, "eval_loss": 0.1529119908809662, "eval_odds_ratio_loss": 1.3675727844238281, "eval_rewards/accuracies": 0.8125, "eval_rewards/chosen": -0.012089588679373264, "eval_rewards/margins": 0.05713079497218132, "eval_rewards/rejected": -0.06922038644552231, "eval_runtime": 3.3378, "eval_samples_per_second": 23.968, "eval_sft_loss": 0.016154715791344643, "eval_steps_per_second": 11.984, "step": 450 }, { "epoch": 10.0, "step": 450, "total_flos": 8.074648418018918e+16, "train_loss": 0.3428536836306254, "train_runtime": 994.0751, "train_samples_per_second": 7.243, "train_steps_per_second": 0.453 } ], "logging_steps": 10, "max_steps": 450, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.074648418018918e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }