{ "best_metric": 0.8505691885948181, "best_model_checkpoint": "saves/Mistral-7B-Instruct-v0.3/lora/orpo-salt-half/checkpoint-1500", "epoch": 2.9974597798475866, "eval_steps": 500, "global_step": 1770, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01693480101608806, "grad_norm": 8.384780883789062, "learning_rate": 4.999614014035063e-06, "logits/chosen": -2.9335973262786865, "logits/rejected": -2.9718575477600098, "logps/chosen": -1.0935328006744385, "logps/rejected": -1.5320154428482056, "loss": 1.1602, "odds_ratio_loss": 0.6671324968338013, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.10935328900814056, "rewards/margins": 0.04384826496243477, "rewards/rejected": -0.15320155024528503, "sft_loss": 1.0935328006744385, "step": 10 }, { "epoch": 0.03386960203217612, "grad_norm": 5.07673454284668, "learning_rate": 4.998440543386042e-06, "logits/chosen": -2.899705648422241, "logits/rejected": -2.8991312980651855, "logps/chosen": -1.0815098285675049, "logps/rejected": -1.5599451065063477, "loss": 1.1396, "odds_ratio_loss": 0.5810434818267822, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10815098136663437, "rewards/margins": 0.04784352704882622, "rewards/rejected": -0.1559945046901703, "sft_loss": 1.0815098285675049, "step": 20 }, { "epoch": 0.05080440304826418, "grad_norm": 4.302060127258301, "learning_rate": 4.996479918381253e-06, "logits/chosen": -2.944632053375244, "logits/rejected": -2.975834846496582, "logps/chosen": -1.056970238685608, "logps/rejected": -1.281690001487732, "loss": 1.1267, "odds_ratio_loss": 0.6977876424789429, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.10569703578948975, "rewards/margins": 0.022471977397799492, "rewards/rejected": -0.1281690150499344, "sft_loss": 1.056970238685608, "step": 30 }, { "epoch": 0.06773920406435224, "grad_norm": 3.221508026123047, "learning_rate": 4.993732756731818e-06, "logits/chosen": -2.925711154937744, "logits/rejected": -2.972975730895996, "logps/chosen": -0.8646748661994934, "logps/rejected": -1.3530806303024292, "loss": 0.9216, "odds_ratio_loss": 0.5692964792251587, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08646748960018158, "rewards/margins": 0.04884058237075806, "rewards/rejected": -0.13530807197093964, "sft_loss": 0.8646748661994934, "step": 40 }, { "epoch": 0.0846740050804403, "grad_norm": 11.413677215576172, "learning_rate": 4.9901999239537345e-06, "logits/chosen": -2.9809317588806152, "logits/rejected": -2.983664035797119, "logps/chosen": -0.9910508990287781, "logps/rejected": -1.344020962715149, "loss": 1.0558, "odds_ratio_loss": 0.6479853987693787, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0991050973534584, "rewards/margins": 0.035297006368637085, "rewards/rejected": -0.1344021111726761, "sft_loss": 0.9910508990287781, "step": 50 }, { "epoch": 0.10160880609652836, "grad_norm": 1.7445276975631714, "learning_rate": 4.985882533095186e-06, "logits/chosen": -2.9525933265686035, "logits/rejected": -2.955658435821533, "logps/chosen": -0.9359370470046997, "logps/rejected": -1.2753798961639404, "loss": 1.0023, "odds_ratio_loss": 0.6634014248847961, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.09359370172023773, "rewards/margins": 0.03394431248307228, "rewards/rejected": -0.12753799557685852, "sft_loss": 0.9359370470046997, "step": 60 }, { "epoch": 0.11854360711261643, "grad_norm": 3.4384100437164307, "learning_rate": 4.9807819443858705e-06, "logits/chosen": -2.956145763397217, "logits/rejected": -2.9750401973724365, "logps/chosen": -0.8652639389038086, "logps/rejected": -1.2253761291503906, "loss": 0.9227, "odds_ratio_loss": 0.5748167634010315, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08652639389038086, "rewards/margins": 0.0360112190246582, "rewards/rejected": -0.12253761291503906, "sft_loss": 0.8652639389038086, "step": 70 }, { "epoch": 0.1354784081287045, "grad_norm": 2.6011013984680176, "learning_rate": 4.9748997648084404e-06, "logits/chosen": -2.927016496658325, "logits/rejected": -2.961350917816162, "logps/chosen": -0.9362251162528992, "logps/rejected": -1.1120071411132812, "loss": 1.0068, "odds_ratio_loss": 0.7062079906463623, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.09362251311540604, "rewards/margins": 0.017578203231096268, "rewards/rejected": -0.111200712621212, "sft_loss": 0.9362251162528992, "step": 80 }, { "epoch": 0.15241320914479256, "grad_norm": 2.413475751876831, "learning_rate": 4.96823784759222e-06, "logits/chosen": -2.869145154953003, "logits/rejected": -2.916862964630127, "logps/chosen": -0.8781692385673523, "logps/rejected": -1.2194924354553223, "loss": 0.9334, "odds_ratio_loss": 0.552145779132843, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.08781692385673523, "rewards/margins": 0.034132327884435654, "rewards/rejected": -0.12194924056529999, "sft_loss": 0.8781692385673523, "step": 90 }, { "epoch": 0.1693480101608806, "grad_norm": 1.9281786680221558, "learning_rate": 4.960798291629323e-06, "logits/chosen": -2.9603519439697266, "logits/rejected": -2.9720394611358643, "logps/chosen": -0.8728249669075012, "logps/rejected": -1.0510112047195435, "loss": 0.9387, "odds_ratio_loss": 0.658331036567688, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08728249371051788, "rewards/margins": 0.017818626016378403, "rewards/rejected": -0.10510112345218658, "sft_loss": 0.8728249669075012, "step": 100 }, { "epoch": 0.18628281117696868, "grad_norm": 2.8832967281341553, "learning_rate": 4.952583440813383e-06, "logits/chosen": -2.9769511222839355, "logits/rejected": -2.991858720779419, "logps/chosen": -0.9942096471786499, "logps/rejected": -1.25453519821167, "loss": 1.0667, "odds_ratio_loss": 0.7251978516578674, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.09942097961902618, "rewards/margins": 0.026032552123069763, "rewards/rejected": -0.12545353174209595, "sft_loss": 0.9942096471786499, "step": 110 }, { "epoch": 0.20321761219305673, "grad_norm": 1.2281321287155151, "learning_rate": 4.943595883301086e-06, "logits/chosen": -2.9461495876312256, "logits/rejected": -2.988556385040283, "logps/chosen": -0.8882713317871094, "logps/rejected": -1.1232919692993164, "loss": 0.9522, "odds_ratio_loss": 0.6390038728713989, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08882713317871094, "rewards/margins": 0.02350207045674324, "rewards/rejected": -0.11232920736074448, "sft_loss": 0.8882713317871094, "step": 120 }, { "epoch": 0.2201524132091448, "grad_norm": 2.2648568153381348, "learning_rate": 4.933838450696757e-06, "logits/chosen": -2.9817254543304443, "logits/rejected": -3.0018677711486816, "logps/chosen": -0.8987566828727722, "logps/rejected": -1.0799884796142578, "loss": 0.9707, "odds_ratio_loss": 0.7198494672775269, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08987566828727722, "rewards/margins": 0.018123187124729156, "rewards/rejected": -0.10799884796142578, "sft_loss": 0.8987566828727722, "step": 130 }, { "epoch": 0.23708721422523285, "grad_norm": 3.667508840560913, "learning_rate": 4.923314217160234e-06, "logits/chosen": -2.988133192062378, "logits/rejected": -3.033163070678711, "logps/chosen": -0.9288301467895508, "logps/rejected": -1.1756031513214111, "loss": 0.9973, "odds_ratio_loss": 0.6846222877502441, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09288302063941956, "rewards/margins": 0.024677302688360214, "rewards/rejected": -0.11756031215190887, "sft_loss": 0.9288301467895508, "step": 140 }, { "epoch": 0.2540220152413209, "grad_norm": 1.4715845584869385, "learning_rate": 4.9120264984383285e-06, "logits/chosen": -2.997676134109497, "logits/rejected": -2.9935667514801025, "logps/chosen": -0.8239797353744507, "logps/rejected": -0.9983611106872559, "loss": 0.8869, "odds_ratio_loss": 0.6288361549377441, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08239796757698059, "rewards/margins": 0.017438137903809547, "rewards/rejected": -0.09983611106872559, "sft_loss": 0.8239797353744507, "step": 150 }, { "epoch": 0.270956816257409, "grad_norm": 1.3789633512496948, "learning_rate": 4.899978850820176e-06, "logits/chosen": -3.004080295562744, "logits/rejected": -3.0049595832824707, "logps/chosen": -0.9372480511665344, "logps/rejected": -1.147570252418518, "loss": 1.0008, "odds_ratio_loss": 0.635485053062439, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0937248021364212, "rewards/margins": 0.021032210439443588, "rewards/rejected": -0.11475701630115509, "sft_loss": 0.9372480511665344, "step": 160 }, { "epoch": 0.28789161727349705, "grad_norm": 2.1128056049346924, "learning_rate": 4.887175070016795e-06, "logits/chosen": -3.027259349822998, "logits/rejected": -3.0453531742095947, "logps/chosen": -0.7962668538093567, "logps/rejected": -1.0727083683013916, "loss": 0.8585, "odds_ratio_loss": 0.6222477555274963, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07962668687105179, "rewards/margins": 0.027644142508506775, "rewards/rejected": -0.10727082192897797, "sft_loss": 0.7962668538093567, "step": 170 }, { "epoch": 0.3048264182895851, "grad_norm": 4.011683464050293, "learning_rate": 4.873619189965217e-06, "logits/chosen": -2.973634719848633, "logits/rejected": -2.989631414413452, "logps/chosen": -0.8350585103034973, "logps/rejected": -1.051026463508606, "loss": 0.9, "odds_ratio_loss": 0.6493778824806213, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08350586146116257, "rewards/margins": 0.021596785634756088, "rewards/rejected": -0.10510264337062836, "sft_loss": 0.8350585103034973, "step": 180 }, { "epoch": 0.32176121930567314, "grad_norm": 1.2796882390975952, "learning_rate": 4.859315481557563e-06, "logits/chosen": -2.9721431732177734, "logits/rejected": -2.9849448204040527, "logps/chosen": -0.821880042552948, "logps/rejected": -1.0275905132293701, "loss": 0.8867, "odds_ratio_loss": 0.648421585559845, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08218801021575928, "rewards/margins": 0.0205710306763649, "rewards/rejected": -0.10275904089212418, "sft_loss": 0.821880042552948, "step": 190 }, { "epoch": 0.3386960203217612, "grad_norm": 1.8488657474517822, "learning_rate": 4.84426845129546e-06, "logits/chosen": -3.0222136974334717, "logits/rejected": -3.031564235687256, "logps/chosen": -0.8574220538139343, "logps/rejected": -0.9888635873794556, "loss": 0.9247, "odds_ratio_loss": 0.6725481748580933, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08574221283197403, "rewards/margins": 0.013144141063094139, "rewards/rejected": -0.09888636320829391, "sft_loss": 0.8574220538139343, "step": 200 }, { "epoch": 0.3556308213378493, "grad_norm": 1.5179060697555542, "learning_rate": 4.828482839870233e-06, "logits/chosen": -3.010849714279175, "logits/rejected": -3.0288453102111816, "logps/chosen": -0.8511277437210083, "logps/rejected": -0.9780422449111938, "loss": 0.9215, "odds_ratio_loss": 0.7037326693534851, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08511276543140411, "rewards/margins": 0.012691453099250793, "rewards/rejected": -0.0978042259812355, "sft_loss": 0.8511277437210083, "step": 210 }, { "epoch": 0.37256562235393736, "grad_norm": 1.2089393138885498, "learning_rate": 4.811963620669314e-06, "logits/chosen": -3.0581696033477783, "logits/rejected": -3.099684476852417, "logps/chosen": -0.8706866502761841, "logps/rejected": -1.0248219966888428, "loss": 0.9377, "odds_ratio_loss": 0.6702378988265991, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08706866204738617, "rewards/margins": 0.015413539484143257, "rewards/rejected": -0.10248219966888428, "sft_loss": 0.8706866502761841, "step": 220 }, { "epoch": 0.3895004233700254, "grad_norm": 2.3151497840881348, "learning_rate": 4.794715998209328e-06, "logits/chosen": -2.9359335899353027, "logits/rejected": -2.9507675170898438, "logps/chosen": -0.8457564115524292, "logps/rejected": -1.0168864727020264, "loss": 0.9116, "odds_ratio_loss": 0.6584563255310059, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08457564562559128, "rewards/margins": 0.017113011330366135, "rewards/rejected": -0.10168864578008652, "sft_loss": 0.8457564115524292, "step": 230 }, { "epoch": 0.40643522438611346, "grad_norm": 2.3710029125213623, "learning_rate": 4.7767454064963724e-06, "logits/chosen": -2.9894230365753174, "logits/rejected": -3.022892475128174, "logps/chosen": -0.8459660410881042, "logps/rejected": -1.015751838684082, "loss": 0.9112, "odds_ratio_loss": 0.6523627042770386, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.08459659665822983, "rewards/margins": 0.016978587955236435, "rewards/rejected": -0.10157518088817596, "sft_loss": 0.8459660410881042, "step": 240 }, { "epoch": 0.42337002540220153, "grad_norm": 2.1464998722076416, "learning_rate": 4.758057507313987e-06, "logits/chosen": -3.024229049682617, "logits/rejected": -3.034865617752075, "logps/chosen": -0.8055688142776489, "logps/rejected": -1.001123070716858, "loss": 0.869, "odds_ratio_loss": 0.6340595483779907, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08055686950683594, "rewards/margins": 0.019555436447262764, "rewards/rejected": -0.10011231899261475, "sft_loss": 0.8055688142776489, "step": 250 }, { "epoch": 0.4403048264182896, "grad_norm": 3.3724582195281982, "learning_rate": 4.73865818843936e-06, "logits/chosen": -3.042086124420166, "logits/rejected": -3.0638880729675293, "logps/chosen": -0.912920355796814, "logps/rejected": -1.1694436073303223, "loss": 0.9795, "odds_ratio_loss": 0.6657846570014954, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.09129203855991364, "rewards/margins": 0.025652330368757248, "rewards/rejected": -0.11694437265396118, "sft_loss": 0.912920355796814, "step": 260 }, { "epoch": 0.4572396274343776, "grad_norm": 1.6793408393859863, "learning_rate": 4.718553561788339e-06, "logits/chosen": -3.0312306880950928, "logits/rejected": -3.060295820236206, "logps/chosen": -0.836656928062439, "logps/rejected": -0.986899733543396, "loss": 0.9016, "odds_ratio_loss": 0.6491778492927551, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08366570621728897, "rewards/margins": 0.015024276450276375, "rewards/rejected": -0.0986899808049202, "sft_loss": 0.836656928062439, "step": 270 }, { "epoch": 0.4741744284504657, "grad_norm": 2.6274185180664062, "learning_rate": 4.697749961489822e-06, "logits/chosen": -3.061577558517456, "logits/rejected": -3.0702619552612305, "logps/chosen": -0.8848905563354492, "logps/rejected": -1.0645692348480225, "loss": 0.9531, "odds_ratio_loss": 0.6824666857719421, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08848904818296432, "rewards/margins": 0.017967868596315384, "rewards/rejected": -0.10645692050457001, "sft_loss": 0.8848905563354492, "step": 280 }, { "epoch": 0.4911092294665538, "grad_norm": 2.089768886566162, "learning_rate": 4.67625394189013e-06, "logits/chosen": -3.0883841514587402, "logits/rejected": -3.090940475463867, "logps/chosen": -0.7822630405426025, "logps/rejected": -1.0061594247817993, "loss": 0.8425, "odds_ratio_loss": 0.6027771830558777, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07822629809379578, "rewards/margins": 0.022389648482203484, "rewards/rejected": -0.100615955889225, "sft_loss": 0.7822630405426025, "step": 290 }, { "epoch": 0.5080440304826418, "grad_norm": 2.2402989864349365, "learning_rate": 4.654072275488016e-06, "logits/chosen": -3.1045384407043457, "logits/rejected": -3.1042561531066895, "logps/chosen": -0.7530822157859802, "logps/rejected": -0.9303094744682312, "loss": 0.8152, "odds_ratio_loss": 0.621616542339325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07530822604894638, "rewards/margins": 0.017722725868225098, "rewards/rejected": -0.09303095191717148, "sft_loss": 0.7530822157859802, "step": 300 }, { "epoch": 0.5249788314987299, "grad_norm": 7.859750747680664, "learning_rate": 4.631211950800925e-06, "logits/chosen": -3.0885701179504395, "logits/rejected": -3.1314892768859863, "logps/chosen": -0.8193107843399048, "logps/rejected": -1.0114855766296387, "loss": 0.8855, "odds_ratio_loss": 0.6622810959815979, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08193108439445496, "rewards/margins": 0.01921747997403145, "rewards/rejected": -0.1011485680937767, "sft_loss": 0.8193107843399048, "step": 310 }, { "epoch": 0.541913632514818, "grad_norm": 3.1328227519989014, "learning_rate": 4.6076801701632095e-06, "logits/chosen": -3.087184190750122, "logits/rejected": -3.1436970233917236, "logps/chosen": -0.801365852355957, "logps/rejected": -0.9046095609664917, "loss": 0.8686, "odds_ratio_loss": 0.6719350814819336, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08013658970594406, "rewards/margins": 0.010324367322027683, "rewards/rejected": -0.09046096354722977, "sft_loss": 0.801365852355957, "step": 320 }, { "epoch": 0.558848433530906, "grad_norm": 2.4766950607299805, "learning_rate": 4.583484347456972e-06, "logits/chosen": -3.0817341804504395, "logits/rejected": -3.114269256591797, "logps/chosen": -0.8467103242874146, "logps/rejected": -0.9794108271598816, "loss": 0.9136, "odds_ratio_loss": 0.6689561009407043, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0846710279583931, "rewards/margins": 0.013270048424601555, "rewards/rejected": -0.0979410782456398, "sft_loss": 0.8467103242874146, "step": 330 }, { "epoch": 0.5757832345469941, "grad_norm": 1.4625582695007324, "learning_rate": 4.55863210577626e-06, "logits/chosen": -3.123152256011963, "logits/rejected": -3.1361846923828125, "logps/chosen": -0.8682034611701965, "logps/rejected": -1.090419888496399, "loss": 0.9349, "odds_ratio_loss": 0.6673767566680908, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08682034909725189, "rewards/margins": 0.022221634164452553, "rewards/rejected": -0.10904198884963989, "sft_loss": 0.8682034611701965, "step": 340 }, { "epoch": 0.5927180355630821, "grad_norm": 1.6330158710479736, "learning_rate": 4.5331312750253465e-06, "logits/chosen": -3.079641103744507, "logits/rejected": -3.1221771240234375, "logps/chosen": -0.7776955962181091, "logps/rejected": -0.978277325630188, "loss": 0.8441, "odds_ratio_loss": 0.6638715267181396, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.07776956260204315, "rewards/margins": 0.02005816623568535, "rewards/rejected": -0.0978277251124382, "sft_loss": 0.7776955962181091, "step": 350 }, { "epoch": 0.6096528365791702, "grad_norm": 1.9444369077682495, "learning_rate": 4.506989889451858e-06, "logits/chosen": -3.1103367805480957, "logits/rejected": -3.1434414386749268, "logps/chosen": -0.8445068597793579, "logps/rejected": -0.9869591593742371, "loss": 0.9108, "odds_ratio_loss": 0.662727952003479, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08445067703723907, "rewards/margins": 0.014245236292481422, "rewards/rejected": -0.09869591891765594, "sft_loss": 0.8445068597793579, "step": 360 }, { "epoch": 0.6265876375952583, "grad_norm": 2.251150369644165, "learning_rate": 4.480216185115512e-06, "logits/chosen": -3.0998404026031494, "logits/rejected": -3.087653398513794, "logps/chosen": -0.8066253662109375, "logps/rejected": -0.9696222543716431, "loss": 0.8704, "odds_ratio_loss": 0.6379208564758301, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08066253364086151, "rewards/margins": 0.016299689188599586, "rewards/rejected": -0.09696222841739655, "sft_loss": 0.8066253662109375, "step": 370 }, { "epoch": 0.6435224386113463, "grad_norm": 1.8241935968399048, "learning_rate": 4.4528185972932856e-06, "logits/chosen": -3.019221782684326, "logits/rejected": -3.083693265914917, "logps/chosen": -0.8054075241088867, "logps/rejected": -1.0394740104675293, "loss": 0.8728, "odds_ratio_loss": 0.6744040846824646, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08054076135158539, "rewards/margins": 0.023406637832522392, "rewards/rejected": -0.10394741594791412, "sft_loss": 0.8054075241088867, "step": 380 }, { "epoch": 0.6604572396274344, "grad_norm": 7.428930759429932, "learning_rate": 4.424805757821803e-06, "logits/chosen": -3.0259501934051514, "logits/rejected": -3.0692214965820312, "logps/chosen": -0.8603572845458984, "logps/rejected": -0.9688261151313782, "loss": 0.9297, "odds_ratio_loss": 0.6930958032608032, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08603573590517044, "rewards/margins": 0.010846875607967377, "rewards/rejected": -0.09688261151313782, "sft_loss": 0.8603572845458984, "step": 390 }, { "epoch": 0.6773920406435224, "grad_norm": 1.7724531888961792, "learning_rate": 4.396186492377812e-06, "logits/chosen": -3.049534797668457, "logits/rejected": -3.0946240425109863, "logps/chosen": -0.804740309715271, "logps/rejected": -1.0287699699401855, "loss": 0.8638, "odds_ratio_loss": 0.5903798341751099, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08047403395175934, "rewards/margins": 0.02240295708179474, "rewards/rejected": -0.10287699848413467, "sft_loss": 0.804740309715271, "step": 400 }, { "epoch": 0.6943268416596104, "grad_norm": 2.2618014812469482, "learning_rate": 4.366969817697578e-06, "logits/chosen": -3.0235114097595215, "logits/rejected": -3.083573341369629, "logps/chosen": -0.7958801984786987, "logps/rejected": -0.9750314950942993, "loss": 0.8637, "odds_ratio_loss": 0.6784581542015076, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.07958801835775375, "rewards/margins": 0.01791512593626976, "rewards/rejected": -0.09750314056873322, "sft_loss": 0.7958801984786987, "step": 410 }, { "epoch": 0.7112616426756986, "grad_norm": 2.504657506942749, "learning_rate": 4.337164938736086e-06, "logits/chosen": -3.049118757247925, "logits/rejected": -3.05595326423645, "logps/chosen": -0.8237883448600769, "logps/rejected": -0.918091893196106, "loss": 0.8939, "odds_ratio_loss": 0.7009984254837036, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.08237884193658829, "rewards/margins": 0.00943034328520298, "rewards/rejected": -0.09180918335914612, "sft_loss": 0.8237883448600769, "step": 420 }, { "epoch": 0.7281964436917866, "grad_norm": 1.2601932287216187, "learning_rate": 4.306781245766945e-06, "logits/chosen": -3.0380876064300537, "logits/rejected": -3.0915472507476807, "logps/chosen": -0.7913134098052979, "logps/rejected": -1.0263410806655884, "loss": 0.8569, "odds_ratio_loss": 0.6553653478622437, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0791313424706459, "rewards/margins": 0.023502767086029053, "rewards/rejected": -0.10263410955667496, "sft_loss": 0.7913134098052979, "step": 430 }, { "epoch": 0.7451312447078747, "grad_norm": 4.184472560882568, "learning_rate": 4.275828311423903e-06, "logits/chosen": -3.093045473098755, "logits/rejected": -3.094749927520752, "logps/chosen": -0.8900951147079468, "logps/rejected": -0.9482099413871765, "loss": 0.9611, "odds_ratio_loss": 0.7105128765106201, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08900952339172363, "rewards/margins": 0.005811482667922974, "rewards/rejected": -0.09482099860906601, "sft_loss": 0.8900951147079468, "step": 440 }, { "epoch": 0.7620660457239627, "grad_norm": 5.296163082122803, "learning_rate": 4.244315887684912e-06, "logits/chosen": -3.063938617706299, "logits/rejected": -3.0689697265625, "logps/chosen": -0.7619583010673523, "logps/rejected": -0.9321343302726746, "loss": 0.8277, "odds_ratio_loss": 0.6576007008552551, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.07619582116603851, "rewards/margins": 0.017017606645822525, "rewards/rejected": -0.09321344643831253, "sft_loss": 0.7619583010673523, "step": 450 }, { "epoch": 0.7790008467400508, "grad_norm": 1.0935202836990356, "learning_rate": 4.212253902799685e-06, "logits/chosen": -3.107138156890869, "logits/rejected": -3.1228718757629395, "logps/chosen": -0.828906238079071, "logps/rejected": -1.0854886770248413, "loss": 0.8936, "odds_ratio_loss": 0.6471681594848633, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08289062976837158, "rewards/margins": 0.02565823495388031, "rewards/rejected": -0.10854886472225189, "sft_loss": 0.828906238079071, "step": 460 }, { "epoch": 0.7959356477561389, "grad_norm": 2.112194538116455, "learning_rate": 4.179652458161718e-06, "logits/chosen": -3.0884511470794678, "logits/rejected": -3.086930751800537, "logps/chosen": -0.8770621418952942, "logps/rejected": -0.9670013189315796, "loss": 0.9476, "odds_ratio_loss": 0.7056951522827148, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08770622313022614, "rewards/margins": 0.008993919007480145, "rewards/rejected": -0.09670013934373856, "sft_loss": 0.8770621418952942, "step": 470 }, { "epoch": 0.8128704487722269, "grad_norm": 1.4803324937820435, "learning_rate": 4.146521825125765e-06, "logits/chosen": -3.1123909950256348, "logits/rejected": -3.127878189086914, "logps/chosen": -0.8552261590957642, "logps/rejected": -0.9841393232345581, "loss": 0.9226, "odds_ratio_loss": 0.6738197803497314, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0855226144194603, "rewards/margins": 0.012891319580376148, "rewards/rejected": -0.09841393679380417, "sft_loss": 0.8552261590957642, "step": 480 }, { "epoch": 0.8298052497883149, "grad_norm": 1.639560580253601, "learning_rate": 4.11287244177176e-06, "logits/chosen": -3.1404106616973877, "logits/rejected": -3.1268608570098877, "logps/chosen": -0.7970486283302307, "logps/rejected": -1.0359351634979248, "loss": 0.8576, "odds_ratio_loss": 0.6055651307106018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07970486581325531, "rewards/margins": 0.02388865128159523, "rewards/rejected": -0.10359351336956024, "sft_loss": 0.7970486283302307, "step": 490 }, { "epoch": 0.8467400508044031, "grad_norm": 7.098996162414551, "learning_rate": 4.078714909616215e-06, "logits/chosen": -3.1334495544433594, "logits/rejected": -3.127417802810669, "logps/chosen": -0.8159440755844116, "logps/rejected": -1.052442193031311, "loss": 0.8758, "odds_ratio_loss": 0.5985492467880249, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.08159441500902176, "rewards/margins": 0.02364981174468994, "rewards/rejected": -0.10524420440196991, "sft_loss": 0.8159440755844116, "step": 500 }, { "epoch": 0.8467400508044031, "eval_logits/chosen": -3.1088695526123047, "eval_logits/rejected": -3.1276028156280518, "eval_logps/chosen": -0.8053962588310242, "eval_logps/rejected": -1.0085769891738892, "eval_loss": 0.8691067099571228, "eval_odds_ratio_loss": 0.6371051073074341, "eval_rewards/accuracies": 0.5704761743545532, "eval_rewards/chosen": -0.08053962886333466, "eval_rewards/margins": 0.02031807415187359, "eval_rewards/rejected": -0.1008576974272728, "eval_runtime": 194.1916, "eval_samples_per_second": 5.407, "eval_sft_loss": 0.8053962588310242, "eval_steps_per_second": 2.704, "step": 500 }, { "epoch": 0.8636748518204911, "grad_norm": 5.1792216300964355, "learning_rate": 4.044059990272125e-06, "logits/chosen": -3.1320395469665527, "logits/rejected": -3.1584267616271973, "logps/chosen": -0.8421246409416199, "logps/rejected": -1.076226830482483, "loss": 0.9067, "odds_ratio_loss": 0.64554762840271, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08421246707439423, "rewards/margins": 0.02341020107269287, "rewards/rejected": -0.10762268304824829, "sft_loss": 0.8421246409416199, "step": 510 }, { "epoch": 0.8806096528365792, "grad_norm": 2.52073073387146, "learning_rate": 4.0089186020584345e-06, "logits/chosen": -3.1114721298217773, "logits/rejected": -3.1454081535339355, "logps/chosen": -0.8953489065170288, "logps/rejected": -1.0218431949615479, "loss": 0.9597, "odds_ratio_loss": 0.6433413028717041, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08953489363193512, "rewards/margins": 0.012649421580135822, "rewards/rejected": -0.10218431800603867, "sft_loss": 0.8953489065170288, "step": 520 }, { "epoch": 0.8975444538526672, "grad_norm": 3.2529776096343994, "learning_rate": 3.973301816560124e-06, "logits/chosen": -3.1164424419403076, "logits/rejected": -3.0959460735321045, "logps/chosen": -0.8098430633544922, "logps/rejected": -1.0107625722885132, "loss": 0.871, "odds_ratio_loss": 0.6116595268249512, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08098430931568146, "rewards/margins": 0.0200919471681118, "rewards/rejected": -0.10107626020908356, "sft_loss": 0.8098430633544922, "step": 530 }, { "epoch": 0.9144792548687553, "grad_norm": 2.4143612384796143, "learning_rate": 3.937220855140021e-06, "logits/chosen": -3.1310832500457764, "logits/rejected": -3.1453144550323486, "logps/chosen": -0.8188160061836243, "logps/rejected": -0.9051122665405273, "loss": 0.8891, "odds_ratio_loss": 0.7029509544372559, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08188159763813019, "rewards/margins": 0.008629636839032173, "rewards/rejected": -0.09051123261451721, "sft_loss": 0.8188160061836243, "step": 540 }, { "epoch": 0.9314140558848434, "grad_norm": 1.3889896869659424, "learning_rate": 3.900687085403418e-06, "logits/chosen": -3.1416361331939697, "logits/rejected": -3.169236421585083, "logps/chosen": -0.8118529319763184, "logps/rejected": -0.8874411582946777, "loss": 0.8817, "odds_ratio_loss": 0.6982238292694092, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.08118529617786407, "rewards/margins": 0.007558824960142374, "rewards/rejected": -0.08874412626028061, "sft_loss": 0.8118529319763184, "step": 550 }, { "epoch": 0.9483488569009314, "grad_norm": 2.055143356323242, "learning_rate": 3.863712017616614e-06, "logits/chosen": -3.131472110748291, "logits/rejected": -3.160679340362549, "logps/chosen": -0.8101698160171509, "logps/rejected": -0.9761570692062378, "loss": 0.8764, "odds_ratio_loss": 0.6618725657463074, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08101697266101837, "rewards/margins": 0.016598742455244064, "rewards/rejected": -0.09761571884155273, "sft_loss": 0.8101698160171509, "step": 560 }, { "epoch": 0.9652836579170194, "grad_norm": 14.660100936889648, "learning_rate": 3.826307301080504e-06, "logits/chosen": -3.0291128158569336, "logits/rejected": -3.0481247901916504, "logps/chosen": -0.8100768327713013, "logps/rejected": -1.162418007850647, "loss": 0.8764, "odds_ratio_loss": 0.6627860069274902, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08100768178701401, "rewards/margins": 0.03523411601781845, "rewards/rejected": -0.11624179035425186, "sft_loss": 0.8100768327713013, "step": 570 }, { "epoch": 0.9822184589331076, "grad_norm": 2.563514471054077, "learning_rate": 3.7884847204603775e-06, "logits/chosen": -3.1155200004577637, "logits/rejected": -3.15079665184021, "logps/chosen": -0.8116699457168579, "logps/rejected": -0.9339970350265503, "loss": 0.8812, "odds_ratio_loss": 0.6949950456619263, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.08116699010133743, "rewards/margins": 0.012232715263962746, "rewards/rejected": -0.09339970350265503, "sft_loss": 0.8116699457168579, "step": 580 }, { "epoch": 0.9991532599491956, "grad_norm": 2.7657859325408936, "learning_rate": 3.750256192073058e-06, "logits/chosen": -3.172693967819214, "logits/rejected": -3.1799886226654053, "logps/chosen": -0.9269296526908875, "logps/rejected": -0.9760646820068359, "loss": 1.0009, "odds_ratio_loss": 0.7394477128982544, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.09269297122955322, "rewards/margins": 0.004913498647511005, "rewards/rejected": -0.09760646522045135, "sft_loss": 0.9269296526908875, "step": 590 }, { "epoch": 1.0160880609652836, "grad_norm": 7.485799789428711, "learning_rate": 3.7116337601325715e-06, "logits/chosen": -3.1055843830108643, "logits/rejected": -3.129669189453125, "logps/chosen": -0.7523837089538574, "logps/rejected": -0.8988542556762695, "loss": 0.8166, "odds_ratio_loss": 0.6426426768302917, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07523836940526962, "rewards/margins": 0.014647054485976696, "rewards/rejected": -0.08988542854785919, "sft_loss": 0.7523837089538574, "step": 600 }, { "epoch": 1.0330228619813717, "grad_norm": 2.2944157123565674, "learning_rate": 3.6726295929555154e-06, "logits/chosen": -3.077573776245117, "logits/rejected": -3.11928391456604, "logps/chosen": -0.7445582747459412, "logps/rejected": -0.9014002680778503, "loss": 0.8117, "odds_ratio_loss": 0.6712278723716736, "rewards/accuracies": 0.53125, "rewards/chosen": -0.07445583492517471, "rewards/margins": 0.015684202313423157, "rewards/rejected": -0.09014002978801727, "sft_loss": 0.7445582747459412, "step": 610 }, { "epoch": 1.0499576629974599, "grad_norm": 2.4759469032287598, "learning_rate": 3.6332559791273307e-06, "logits/chosen": -3.070753812789917, "logits/rejected": -3.133881092071533, "logps/chosen": -0.7787492871284485, "logps/rejected": -0.9435569643974304, "loss": 0.843, "odds_ratio_loss": 0.6423131823539734, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07787492126226425, "rewards/margins": 0.016480756923556328, "rewards/rejected": -0.09435568749904633, "sft_loss": 0.7787492871284485, "step": 620 }, { "epoch": 1.0668924640135478, "grad_norm": 2.397857666015625, "learning_rate": 3.593525323630681e-06, "logits/chosen": -3.0695629119873047, "logits/rejected": -3.0961527824401855, "logps/chosen": -0.8072765469551086, "logps/rejected": -0.9367998838424683, "loss": 0.8715, "odds_ratio_loss": 0.6417396664619446, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08072765171527863, "rewards/margins": 0.012952342629432678, "rewards/rejected": -0.0936800017952919, "sft_loss": 0.8072765469551086, "step": 630 }, { "epoch": 1.083827265029636, "grad_norm": 2.2752127647399902, "learning_rate": 3.5534501439371615e-06, "logits/chosen": -3.1115057468414307, "logits/rejected": -3.1277289390563965, "logps/chosen": -0.7920553088188171, "logps/rejected": -0.9753093719482422, "loss": 0.8548, "odds_ratio_loss": 0.6271573901176453, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.07920553535223007, "rewards/margins": 0.018325407058000565, "rewards/rejected": -0.09753094613552094, "sft_loss": 0.7920553088188171, "step": 640 }, { "epoch": 1.100762066045724, "grad_norm": 1.311848521232605, "learning_rate": 3.5130430660635633e-06, "logits/chosen": -3.1020348072052, "logits/rejected": -3.1430366039276123, "logps/chosen": -0.8032411336898804, "logps/rejected": -0.9853434562683105, "loss": 0.864, "odds_ratio_loss": 0.6071646809577942, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08032412827014923, "rewards/margins": 0.018210221081972122, "rewards/rejected": -0.09853433817625046, "sft_loss": 0.8032411336898804, "step": 650 }, { "epoch": 1.117696867061812, "grad_norm": 1.6412379741668701, "learning_rate": 3.4723168205939444e-06, "logits/chosen": -3.0930168628692627, "logits/rejected": -3.1322126388549805, "logps/chosen": -0.7815112471580505, "logps/rejected": -0.9042918086051941, "loss": 0.8483, "odds_ratio_loss": 0.6679799556732178, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07815112918615341, "rewards/margins": 0.012278061360120773, "rewards/rejected": -0.09042918682098389, "sft_loss": 0.7815112471580505, "step": 660 }, { "epoch": 1.1346316680779, "grad_norm": 2.1591804027557373, "learning_rate": 3.431284238668754e-06, "logits/chosen": -3.062398910522461, "logits/rejected": -3.1151247024536133, "logps/chosen": -0.8405235409736633, "logps/rejected": -1.0043154954910278, "loss": 0.9059, "odds_ratio_loss": 0.6538293957710266, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08405234664678574, "rewards/margins": 0.01637919992208481, "rewards/rejected": -0.10043156147003174, "sft_loss": 0.8405235409736633, "step": 670 }, { "epoch": 1.1515664690939882, "grad_norm": 4.811310291290283, "learning_rate": 3.389958247942274e-06, "logits/chosen": -3.057506561279297, "logits/rejected": -3.1269962787628174, "logps/chosen": -0.8411120176315308, "logps/rejected": -1.1102626323699951, "loss": 0.9091, "odds_ratio_loss": 0.6803519129753113, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08411119878292084, "rewards/margins": 0.026915064081549644, "rewards/rejected": -0.11102626472711563, "sft_loss": 0.8411120176315308, "step": 680 }, { "epoch": 1.168501270110076, "grad_norm": 2.754120111465454, "learning_rate": 3.3483518685096588e-06, "logits/chosen": -3.08880615234375, "logits/rejected": -3.109083890914917, "logps/chosen": -0.8396803140640259, "logps/rejected": -0.9775651693344116, "loss": 0.9066, "odds_ratio_loss": 0.6687448620796204, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08396803587675095, "rewards/margins": 0.013788496144115925, "rewards/rejected": -0.097756527364254, "sft_loss": 0.8396803140640259, "step": 690 }, { "epoch": 1.1854360711261642, "grad_norm": 1.9642765522003174, "learning_rate": 3.306478208804839e-06, "logits/chosen": -3.0659680366516113, "logits/rejected": -3.1026992797851562, "logps/chosen": -0.7790535092353821, "logps/rejected": -0.9482936859130859, "loss": 0.8451, "odds_ratio_loss": 0.659988284111023, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.07790535688400269, "rewards/margins": 0.016924021765589714, "rewards/rejected": -0.09482936561107635, "sft_loss": 0.7790535092353821, "step": 700 }, { "epoch": 1.2023708721422524, "grad_norm": 2.6194725036621094, "learning_rate": 3.264350461470608e-06, "logits/chosen": -3.0373668670654297, "logits/rejected": -3.0688533782958984, "logps/chosen": -0.7216525077819824, "logps/rejected": -1.0178143978118896, "loss": 0.7814, "odds_ratio_loss": 0.597113311290741, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07216525077819824, "rewards/margins": 0.02961619198322296, "rewards/rejected": -0.1017814427614212, "sft_loss": 0.7216525077819824, "step": 710 }, { "epoch": 1.2193056731583405, "grad_norm": 2.312389373779297, "learning_rate": 3.2219818992021685e-06, "logits/chosen": -3.03488826751709, "logits/rejected": -3.087043285369873, "logps/chosen": -0.7246071696281433, "logps/rejected": -1.047911286354065, "loss": 0.7828, "odds_ratio_loss": 0.5823417901992798, "rewards/accuracies": 0.625, "rewards/chosen": -0.07246071845293045, "rewards/margins": 0.03233041241765022, "rewards/rejected": -0.10479112714529037, "sft_loss": 0.7246071696281433, "step": 720 }, { "epoch": 1.2362404741744284, "grad_norm": 7.234289169311523, "learning_rate": 3.1793858705654595e-06, "logits/chosen": -3.0948994159698486, "logits/rejected": -3.1080453395843506, "logps/chosen": -0.7130419611930847, "logps/rejected": -0.9181682467460632, "loss": 0.7733, "odds_ratio_loss": 0.6023129820823669, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07130420207977295, "rewards/margins": 0.020512625575065613, "rewards/rejected": -0.09181682765483856, "sft_loss": 0.7130419611930847, "step": 730 }, { "epoch": 1.2531752751905165, "grad_norm": 2.101661443710327, "learning_rate": 3.1365757957915787e-06, "logits/chosen": -3.0874876976013184, "logits/rejected": -3.123832941055298, "logps/chosen": -0.8178297877311707, "logps/rejected": -0.9593319892883301, "loss": 0.8832, "odds_ratio_loss": 0.6534532308578491, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0817829817533493, "rewards/margins": 0.01415021438151598, "rewards/rejected": -0.09593319892883301, "sft_loss": 0.8178297877311707, "step": 740 }, { "epoch": 1.2701100762066047, "grad_norm": 3.2394182682037354, "learning_rate": 3.093565162548633e-06, "logits/chosen": -3.0456290245056152, "logits/rejected": -3.0783424377441406, "logps/chosen": -0.8610566854476929, "logps/rejected": -1.0578378438949585, "loss": 0.9282, "odds_ratio_loss": 0.6714938879013062, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08610567450523376, "rewards/margins": 0.019678115844726562, "rewards/rejected": -0.10578378289937973, "sft_loss": 0.8610566854476929, "step": 750 }, { "epoch": 1.2870448772226926, "grad_norm": 3.4462602138519287, "learning_rate": 3.0503675216923294e-06, "logits/chosen": -3.112204074859619, "logits/rejected": -3.1115825176239014, "logps/chosen": -0.7481369376182556, "logps/rejected": -0.9443623423576355, "loss": 0.8093, "odds_ratio_loss": 0.6117558479309082, "rewards/accuracies": 0.625, "rewards/chosen": -0.07481369376182556, "rewards/margins": 0.019622545689344406, "rewards/rejected": -0.09443624317646027, "sft_loss": 0.7481369376182556, "step": 760 }, { "epoch": 1.3039796782387807, "grad_norm": 2.5497653484344482, "learning_rate": 3.0069964829966748e-06, "logits/chosen": -3.083761692047119, "logits/rejected": -3.1253674030303955, "logps/chosen": -0.7777606248855591, "logps/rejected": -0.9011049270629883, "loss": 0.844, "odds_ratio_loss": 0.6621277928352356, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.07777605950832367, "rewards/margins": 0.01233443059027195, "rewards/rejected": -0.09011048078536987, "sft_loss": 0.7777606248855591, "step": 770 }, { "epoch": 1.3209144792548688, "grad_norm": 2.1115994453430176, "learning_rate": 2.963465710866094e-06, "logits/chosen": -3.098053455352783, "logits/rejected": -3.1173479557037354, "logps/chosen": -0.7589127421379089, "logps/rejected": -1.0739343166351318, "loss": 0.8183, "odds_ratio_loss": 0.5940018892288208, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07589127868413925, "rewards/margins": 0.03150214999914169, "rewards/rejected": -0.10739342123270035, "sft_loss": 0.7589127421379089, "step": 780 }, { "epoch": 1.337849280270957, "grad_norm": 7.320650577545166, "learning_rate": 2.919788920030357e-06, "logits/chosen": -3.143812894821167, "logits/rejected": -3.1576976776123047, "logps/chosen": -0.8158448934555054, "logps/rejected": -0.9654959440231323, "loss": 0.8826, "odds_ratio_loss": 0.6670835614204407, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08158449828624725, "rewards/margins": 0.014965096488595009, "rewards/rejected": -0.09654959291219711, "sft_loss": 0.8158448934555054, "step": 790 }, { "epoch": 1.3547840812870449, "grad_norm": 1.3826831579208374, "learning_rate": 2.8759798712236303e-06, "logits/chosen": -3.1377153396606445, "logits/rejected": -3.138549566268921, "logps/chosen": -0.7528073191642761, "logps/rejected": -1.0746774673461914, "loss": 0.8173, "odds_ratio_loss": 0.6448089480400085, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07528072595596313, "rewards/margins": 0.032187022268772125, "rewards/rejected": -0.10746775567531586, "sft_loss": 0.7528073191642761, "step": 800 }, { "epoch": 1.371718882303133, "grad_norm": 2.1327664852142334, "learning_rate": 2.8320523668490507e-06, "logits/chosen": -3.095376968383789, "logits/rejected": -3.129220485687256, "logps/chosen": -0.8166864514350891, "logps/rejected": -0.9957429766654968, "loss": 0.8841, "odds_ratio_loss": 0.6738199591636658, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08166865259408951, "rewards/margins": 0.01790565252304077, "rewards/rejected": -0.09957430511713028, "sft_loss": 0.8166864514350891, "step": 810 }, { "epoch": 1.388653683319221, "grad_norm": 2.5101046562194824, "learning_rate": 2.7880202466301597e-06, "logits/chosen": -3.075023651123047, "logits/rejected": -3.1098039150238037, "logps/chosen": -0.8058909177780151, "logps/rejected": -0.9401804208755493, "loss": 0.8735, "odds_ratio_loss": 0.6758350133895874, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08058909326791763, "rewards/margins": 0.013428950682282448, "rewards/rejected": -0.09401804953813553, "sft_loss": 0.8058909177780151, "step": 820 }, { "epoch": 1.405588484335309, "grad_norm": 2.202185869216919, "learning_rate": 2.7438973832505854e-06, "logits/chosen": -3.060824155807495, "logits/rejected": -3.0545971393585205, "logps/chosen": -0.7594717741012573, "logps/rejected": -0.9759091138839722, "loss": 0.8235, "odds_ratio_loss": 0.6401799321174622, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07594718039035797, "rewards/margins": 0.021643735468387604, "rewards/rejected": -0.09759090840816498, "sft_loss": 0.7594717741012573, "step": 830 }, { "epoch": 1.4225232853513972, "grad_norm": 5.100254058837891, "learning_rate": 2.699697677983341e-06, "logits/chosen": -3.1682240962982178, "logits/rejected": -3.182861804962158, "logps/chosen": -0.8106738924980164, "logps/rejected": -0.8703418970108032, "loss": 0.8831, "odds_ratio_loss": 0.7241007685661316, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08106739819049835, "rewards/margins": 0.0059667956084012985, "rewards/rejected": -0.0870341882109642, "sft_loss": 0.8106738924980164, "step": 840 }, { "epoch": 1.4394580863674853, "grad_norm": 7.667366981506348, "learning_rate": 2.6554350563111115e-06, "logits/chosen": -3.0851314067840576, "logits/rejected": -3.116276502609253, "logps/chosen": -0.8307350873947144, "logps/rejected": -0.9147516489028931, "loss": 0.9054, "odds_ratio_loss": 0.7461589574813843, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08307350426912308, "rewards/margins": 0.0084016602486372, "rewards/rejected": -0.09147517383098602, "sft_loss": 0.8307350873947144, "step": 850 }, { "epoch": 1.4563928873835732, "grad_norm": 6.640262126922607, "learning_rate": 2.611123463538913e-06, "logits/chosen": -3.0749902725219727, "logits/rejected": -3.0911591053009033, "logps/chosen": -0.7142345309257507, "logps/rejected": -0.9272225499153137, "loss": 0.7775, "odds_ratio_loss": 0.6324297189712524, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07142344862222672, "rewards/margins": 0.02129879966378212, "rewards/rejected": -0.09272225201129913, "sft_loss": 0.7142345309257507, "step": 860 }, { "epoch": 1.4733276883996613, "grad_norm": 8.2167329788208, "learning_rate": 2.566776860400514e-06, "logits/chosen": -3.1082987785339355, "logits/rejected": -3.1257712841033936, "logps/chosen": -0.8350755572319031, "logps/rejected": -1.056910753250122, "loss": 0.898, "odds_ratio_loss": 0.6290403604507446, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08350756019353867, "rewards/margins": 0.022183528169989586, "rewards/rejected": -0.10569107532501221, "sft_loss": 0.8350755572319031, "step": 870 }, { "epoch": 1.4902624894157492, "grad_norm": 1.4808557033538818, "learning_rate": 2.522409218659989e-06, "logits/chosen": -3.143266201019287, "logits/rejected": -3.170633316040039, "logps/chosen": -0.7854605913162231, "logps/rejected": -0.9068530797958374, "loss": 0.8512, "odds_ratio_loss": 0.6573347449302673, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.07854606211185455, "rewards/margins": 0.012139257043600082, "rewards/rejected": -0.09068530797958374, "sft_loss": 0.7854605913162231, "step": 880 }, { "epoch": 1.5071972904318374, "grad_norm": 3.6185755729675293, "learning_rate": 2.4780345167097976e-06, "logits/chosen": -3.0972650051116943, "logits/rejected": -3.0835044384002686, "logps/chosen": -0.7852433919906616, "logps/rejected": -1.0855656862258911, "loss": 0.8459, "odds_ratio_loss": 0.6065645813941956, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07852433621883392, "rewards/margins": 0.030032237991690636, "rewards/rejected": -0.1085565835237503, "sft_loss": 0.7852433919906616, "step": 890 }, { "epoch": 1.5241320914479255, "grad_norm": 4.639472961425781, "learning_rate": 2.4336667351667747e-06, "logits/chosen": -3.114197015762329, "logits/rejected": -3.124145269393921, "logps/chosen": -0.8203206062316895, "logps/rejected": -1.0489810705184937, "loss": 0.8792, "odds_ratio_loss": 0.5887311697006226, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08203206956386566, "rewards/margins": 0.022866051644086838, "rewards/rejected": -0.1048981174826622, "sft_loss": 0.8203206062316895, "step": 900 }, { "epoch": 1.5410668924640136, "grad_norm": 3.153228282928467, "learning_rate": 2.3893198524674264e-06, "logits/chosen": -3.086516857147217, "logits/rejected": -3.104675769805908, "logps/chosen": -0.7846948504447937, "logps/rejected": -0.9825633764266968, "loss": 0.8478, "odds_ratio_loss": 0.6309666037559509, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07846949249505997, "rewards/margins": 0.019786860793828964, "rewards/rejected": -0.09825634956359863, "sft_loss": 0.7846948504447937, "step": 910 }, { "epoch": 1.5580016934801018, "grad_norm": 3.532456874847412, "learning_rate": 2.345007840463904e-06, "logits/chosen": -3.0608856678009033, "logits/rejected": -3.0965914726257324, "logps/chosen": -0.8089407682418823, "logps/rejected": -0.9399329423904419, "loss": 0.8759, "odds_ratio_loss": 0.6692665815353394, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.08089407533407211, "rewards/margins": 0.013099217787384987, "rewards/rejected": -0.09399329125881195, "sft_loss": 0.8089407682418823, "step": 920 }, { "epoch": 1.5749364944961897, "grad_norm": 3.778723955154419, "learning_rate": 2.3007446600220572e-06, "logits/chosen": -3.122668504714966, "logits/rejected": -3.1052489280700684, "logps/chosen": -0.799084484577179, "logps/rejected": -0.9807415008544922, "loss": 0.8639, "odds_ratio_loss": 0.6483135223388672, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07990844547748566, "rewards/margins": 0.018165703862905502, "rewards/rejected": -0.09807415306568146, "sft_loss": 0.799084484577179, "step": 930 }, { "epoch": 1.5918712955122776, "grad_norm": 3.475341558456421, "learning_rate": 2.2565442566229507e-06, "logits/chosen": -3.0740559101104736, "logits/rejected": -3.106633424758911, "logps/chosen": -0.7921947240829468, "logps/rejected": -0.8962046504020691, "loss": 0.862, "odds_ratio_loss": 0.6985131502151489, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07921947538852692, "rewards/margins": 0.01040099561214447, "rewards/rejected": -0.08962046355009079, "sft_loss": 0.7921947240829468, "step": 940 }, { "epoch": 1.6088060965283657, "grad_norm": 2.1668930053710938, "learning_rate": 2.2124205559692195e-06, "logits/chosen": -3.064317226409912, "logits/rejected": -3.1167845726013184, "logps/chosen": -0.7817971706390381, "logps/rejected": -1.0080362558364868, "loss": 0.8399, "odds_ratio_loss": 0.5807359218597412, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0781797245144844, "rewards/margins": 0.02262391336262226, "rewards/rejected": -0.10080362856388092, "sft_loss": 0.7817971706390381, "step": 950 }, { "epoch": 1.6257408975444538, "grad_norm": 1.8939071893692017, "learning_rate": 2.168387459597666e-06, "logits/chosen": -3.092590808868408, "logits/rejected": -3.138596773147583, "logps/chosen": -0.7956336140632629, "logps/rejected": -1.0347424745559692, "loss": 0.8565, "odds_ratio_loss": 0.6086810827255249, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07956336438655853, "rewards/margins": 0.02391088381409645, "rewards/rejected": -0.10347424447536469, "sft_loss": 0.7956336140632629, "step": 960 }, { "epoch": 1.642675698560542, "grad_norm": 2.1293139457702637, "learning_rate": 2.1244588404994648e-06, "logits/chosen": -3.0621867179870605, "logits/rejected": -3.0614800453186035, "logps/chosen": -0.7734932899475098, "logps/rejected": -0.9287079572677612, "loss": 0.8408, "odds_ratio_loss": 0.6727171540260315, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07734932750463486, "rewards/margins": 0.015521461144089699, "rewards/rejected": -0.0928707867860794, "sft_loss": 0.7734932899475098, "step": 970 }, { "epoch": 1.65961049957663, "grad_norm": 5.466368675231934, "learning_rate": 2.08064853874936e-06, "logits/chosen": -3.0892724990844727, "logits/rejected": -3.1403141021728516, "logps/chosen": -0.8142075538635254, "logps/rejected": -0.9884878396987915, "loss": 0.8786, "odds_ratio_loss": 0.6434410810470581, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08142076432704926, "rewards/margins": 0.017428018152713776, "rewards/rejected": -0.09884877502918243, "sft_loss": 0.8142075538635254, "step": 980 }, { "epoch": 1.676545300592718, "grad_norm": 12.251886367797852, "learning_rate": 2.0369703571452387e-06, "logits/chosen": -3.055989980697632, "logits/rejected": -3.0450901985168457, "logps/chosen": -0.716150164604187, "logps/rejected": -0.9780189394950867, "loss": 0.7743, "odds_ratio_loss": 0.5815138816833496, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07161502540111542, "rewards/margins": 0.026186879724264145, "rewards/rejected": -0.09780190140008926, "sft_loss": 0.716150164604187, "step": 990 }, { "epoch": 1.6934801016088061, "grad_norm": 3.6831836700439453, "learning_rate": 1.993438056859441e-06, "logits/chosen": -3.1155529022216797, "logits/rejected": -3.1060287952423096, "logps/chosen": -0.7484423518180847, "logps/rejected": -0.929480254650116, "loss": 0.8098, "odds_ratio_loss": 0.6138982772827148, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07484424859285355, "rewards/margins": 0.018103793263435364, "rewards/rejected": -0.09294802695512772, "sft_loss": 0.7484423518180847, "step": 1000 }, { "epoch": 1.6934801016088061, "eval_logits/chosen": -3.0966453552246094, "eval_logits/rejected": -3.117032051086426, "eval_logps/chosen": -0.7911127805709839, "eval_logps/rejected": -0.9985377192497253, "eval_loss": 0.8548597693443298, "eval_odds_ratio_loss": 0.6374707221984863, "eval_rewards/accuracies": 0.5676190257072449, "eval_rewards/chosen": -0.07911127805709839, "eval_rewards/margins": 0.020742492750287056, "eval_rewards/rejected": -0.0998537689447403, "eval_runtime": 195.0995, "eval_samples_per_second": 5.382, "eval_sft_loss": 0.7911127805709839, "eval_steps_per_second": 2.691, "step": 1000 }, { "epoch": 1.710414902624894, "grad_norm": 2.5681636333465576, "learning_rate": 1.9500653531031917e-06, "logits/chosen": -3.116891384124756, "logits/rejected": -3.1181325912475586, "logps/chosen": -0.8085691332817078, "logps/rejected": -1.0325000286102295, "loss": 0.8768, "odds_ratio_loss": 0.6825646162033081, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08085691183805466, "rewards/margins": 0.022393101826310158, "rewards/rejected": -0.10325001180171967, "sft_loss": 0.8085691332817078, "step": 1010 }, { "epoch": 1.7273497036409822, "grad_norm": 2.5795137882232666, "learning_rate": 1.9068659108055117e-06, "logits/chosen": -3.1321749687194824, "logits/rejected": -3.155836582183838, "logps/chosen": -0.7755477428436279, "logps/rejected": -0.9410096406936646, "loss": 0.8406, "odds_ratio_loss": 0.6507992744445801, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07755477726459503, "rewards/margins": 0.016546186059713364, "rewards/rejected": -0.0941009670495987, "sft_loss": 0.7755477428436279, "step": 1020 }, { "epoch": 1.7442845046570703, "grad_norm": 2.360201835632324, "learning_rate": 1.863853340307962e-06, "logits/chosen": -3.078691005706787, "logits/rejected": -3.1044058799743652, "logps/chosen": -0.683570146560669, "logps/rejected": -0.9945551753044128, "loss": 0.7414, "odds_ratio_loss": 0.5782071352005005, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.06835701316595078, "rewards/margins": 0.03109849989414215, "rewards/rejected": -0.09945552051067352, "sft_loss": 0.683570146560669, "step": 1030 }, { "epoch": 1.7612193056731584, "grad_norm": 2.180799961090088, "learning_rate": 1.8210411930766019e-06, "logits/chosen": -3.0688931941986084, "logits/rejected": -3.096926212310791, "logps/chosen": -0.7589991688728333, "logps/rejected": -0.9845136404037476, "loss": 0.8196, "odds_ratio_loss": 0.6064754128456116, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07589991390705109, "rewards/margins": 0.02255145087838173, "rewards/rejected": -0.09845136106014252, "sft_loss": 0.7589991688728333, "step": 1040 }, { "epoch": 1.7781541066892466, "grad_norm": 2.4373152256011963, "learning_rate": 1.7784429574324803e-06, "logits/chosen": -3.079857349395752, "logits/rejected": -3.1024343967437744, "logps/chosen": -0.738146960735321, "logps/rejected": -0.9651134610176086, "loss": 0.7991, "odds_ratio_loss": 0.6093234419822693, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.07381470501422882, "rewards/margins": 0.022696642205119133, "rewards/rejected": -0.0965113490819931, "sft_loss": 0.738146960735321, "step": 1050 }, { "epoch": 1.7950889077053345, "grad_norm": 1.4168922901153564, "learning_rate": 1.7360720543020327e-06, "logits/chosen": -3.158327102661133, "logits/rejected": -3.1482691764831543, "logps/chosen": -0.7429002523422241, "logps/rejected": -0.9451411366462708, "loss": 0.8021, "odds_ratio_loss": 0.5917203426361084, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07429002225399017, "rewards/margins": 0.020224085077643394, "rewards/rejected": -0.09451410919427872, "sft_loss": 0.7429002523422241, "step": 1060 }, { "epoch": 1.8120237087214224, "grad_norm": 7.384251594543457, "learning_rate": 1.6939418329887042e-06, "logits/chosen": -3.1223366260528564, "logits/rejected": -3.157349109649658, "logps/chosen": -0.7720141410827637, "logps/rejected": -1.004219651222229, "loss": 0.8344, "odds_ratio_loss": 0.623358428478241, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07720141112804413, "rewards/margins": 0.023220548406243324, "rewards/rejected": -0.1004219651222229, "sft_loss": 0.7720141410827637, "step": 1070 }, { "epoch": 1.8289585097375105, "grad_norm": 1.976076602935791, "learning_rate": 1.6520655669671467e-06, "logits/chosen": -3.151508331298828, "logits/rejected": -3.174203395843506, "logps/chosen": -0.7854975461959839, "logps/rejected": -0.9663504362106323, "loss": 0.8499, "odds_ratio_loss": 0.6441539525985718, "rewards/accuracies": 0.5, "rewards/chosen": -0.07854975759983063, "rewards/margins": 0.01808529533445835, "rewards/rejected": -0.09663505107164383, "sft_loss": 0.7854975461959839, "step": 1080 }, { "epoch": 1.8458933107535986, "grad_norm": 2.3342175483703613, "learning_rate": 1.610456449701294e-06, "logits/chosen": -3.110039234161377, "logits/rejected": -3.1346383094787598, "logps/chosen": -0.7955976724624634, "logps/rejected": -1.0117195844650269, "loss": 0.8634, "odds_ratio_loss": 0.6780521273612976, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07955978065729141, "rewards/margins": 0.02161218598484993, "rewards/rejected": -0.10117195546627045, "sft_loss": 0.7955976724624634, "step": 1090 }, { "epoch": 1.8628281117696868, "grad_norm": 2.040372133255005, "learning_rate": 1.5691275904876545e-06, "logits/chosen": -3.11810302734375, "logits/rejected": -3.0907464027404785, "logps/chosen": -0.8171418905258179, "logps/rejected": -1.049744963645935, "loss": 0.8794, "odds_ratio_loss": 0.6221813559532166, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0817141979932785, "rewards/margins": 0.02326030097901821, "rewards/rejected": -0.10497449338436127, "sft_loss": 0.8171418905258179, "step": 1100 }, { "epoch": 1.879762912785775, "grad_norm": 1.1363812685012817, "learning_rate": 1.5280920103251235e-06, "logits/chosen": -3.121904134750366, "logits/rejected": -3.1400184631347656, "logps/chosen": -0.7571579217910767, "logps/rejected": -0.9578372240066528, "loss": 0.8213, "odds_ratio_loss": 0.6416117548942566, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.07571578770875931, "rewards/margins": 0.020067930221557617, "rewards/rejected": -0.09578372538089752, "sft_loss": 0.7571579217910767, "step": 1110 }, { "epoch": 1.8966977138018628, "grad_norm": 3.5236151218414307, "learning_rate": 1.4873626378126015e-06, "logits/chosen": -3.109051465988159, "logits/rejected": -3.123108386993408, "logps/chosen": -0.7997997403144836, "logps/rejected": -1.0338528156280518, "loss": 0.8642, "odds_ratio_loss": 0.6441462635993958, "rewards/accuracies": 0.625, "rewards/chosen": -0.07997997850179672, "rewards/margins": 0.02340531349182129, "rewards/rejected": -0.10338529199361801, "sft_loss": 0.7997997403144836, "step": 1120 }, { "epoch": 1.913632514817951, "grad_norm": 4.680517673492432, "learning_rate": 1.446952305075738e-06, "logits/chosen": -3.1131389141082764, "logits/rejected": -3.151978015899658, "logps/chosen": -0.7748031616210938, "logps/rejected": -0.8985496759414673, "loss": 0.8402, "odds_ratio_loss": 0.6542429327964783, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.07748032361268997, "rewards/margins": 0.012374645099043846, "rewards/rejected": -0.08985497057437897, "sft_loss": 0.7748031616210938, "step": 1130 }, { "epoch": 1.9305673158340388, "grad_norm": 1.990546703338623, "learning_rate": 1.406873743724065e-06, "logits/chosen": -3.134593963623047, "logits/rejected": -3.088059186935425, "logps/chosen": -0.8194819688796997, "logps/rejected": -1.0895836353302002, "loss": 0.8842, "odds_ratio_loss": 0.6472653746604919, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08194819837808609, "rewards/margins": 0.02701016701757908, "rewards/rejected": -0.10895836353302002, "sft_loss": 0.8194819688796997, "step": 1140 }, { "epoch": 1.947502116850127, "grad_norm": 2.0966198444366455, "learning_rate": 1.3671395808397898e-06, "logits/chosen": -3.1003191471099854, "logits/rejected": -3.159738779067993, "logps/chosen": -0.7502952218055725, "logps/rejected": -0.8763904571533203, "loss": 0.8142, "odds_ratio_loss": 0.6395031213760376, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07502951472997665, "rewards/margins": 0.012609531171619892, "rewards/rejected": -0.08763904869556427, "sft_loss": 0.7502952218055725, "step": 1150 }, { "epoch": 1.964436917866215, "grad_norm": 6.19160795211792, "learning_rate": 1.3277623349995418e-06, "logits/chosen": -3.1115336418151855, "logits/rejected": -3.1329808235168457, "logps/chosen": -0.7726483941078186, "logps/rejected": -0.9365378618240356, "loss": 0.8381, "odds_ratio_loss": 0.6545372605323792, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.07726484537124634, "rewards/margins": 0.016388945281505585, "rewards/rejected": -0.09365378320217133, "sft_loss": 0.7726483941078186, "step": 1160 }, { "epoch": 1.9813717188823032, "grad_norm": 2.1759722232818604, "learning_rate": 1.2887544123302781e-06, "logits/chosen": -3.1343424320220947, "logits/rejected": -3.145904064178467, "logps/chosen": -0.795665442943573, "logps/rejected": -0.9278801679611206, "loss": 0.863, "odds_ratio_loss": 0.6731692552566528, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07956655323505402, "rewards/margins": 0.01322146225720644, "rewards/rejected": -0.09278801828622818, "sft_loss": 0.795665442943573, "step": 1170 }, { "epoch": 1.9983065198983911, "grad_norm": 2.340391159057617, "learning_rate": 1.2501281026006393e-06, "logits/chosen": -3.113882064819336, "logits/rejected": -3.1493611335754395, "logps/chosen": -0.7624078989028931, "logps/rejected": -0.9082571268081665, "loss": 0.8289, "odds_ratio_loss": 0.6652835607528687, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.07624078541994095, "rewards/margins": 0.014584928750991821, "rewards/rejected": -0.09082571417093277, "sft_loss": 0.7624078989028931, "step": 1180 }, { "epoch": 2.015241320914479, "grad_norm": 2.1316022872924805, "learning_rate": 1.2118955753489523e-06, "logits/chosen": -3.1328041553497314, "logits/rejected": -3.119529962539673, "logps/chosen": -0.7868901491165161, "logps/rejected": -0.9673662185668945, "loss": 0.8514, "odds_ratio_loss": 0.6448970437049866, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.07868902385234833, "rewards/margins": 0.018047606572508812, "rewards/rejected": -0.09673662483692169, "sft_loss": 0.7868901491165161, "step": 1190 }, { "epoch": 2.032176121930567, "grad_norm": 2.219909906387329, "learning_rate": 1.1740688760491189e-06, "logits/chosen": -3.1113944053649902, "logits/rejected": -3.1537253856658936, "logps/chosen": -0.7722674608230591, "logps/rejected": -1.0175268650054932, "loss": 0.8299, "odds_ratio_loss": 0.5762220621109009, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07722674310207367, "rewards/margins": 0.02452593669295311, "rewards/rejected": -0.10175268352031708, "sft_loss": 0.7722674608230591, "step": 1200 }, { "epoch": 2.0491109229466553, "grad_norm": 6.625383377075195, "learning_rate": 1.1366599223155847e-06, "logits/chosen": -3.092228651046753, "logits/rejected": -3.137305498123169, "logps/chosen": -0.7458280324935913, "logps/rejected": -1.1138603687286377, "loss": 0.8056, "odds_ratio_loss": 0.5978988409042358, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07458280026912689, "rewards/margins": 0.0368032269179821, "rewards/rejected": -0.1113860234618187, "sft_loss": 0.7458280324935913, "step": 1210 }, { "epoch": 2.0660457239627434, "grad_norm": 3.234821081161499, "learning_rate": 1.0996805001486067e-06, "logits/chosen": -3.109764575958252, "logits/rejected": -3.1344008445739746, "logps/chosen": -0.7534822225570679, "logps/rejected": -0.9295659065246582, "loss": 0.8161, "odds_ratio_loss": 0.625883162021637, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07534822076559067, "rewards/margins": 0.017608370631933212, "rewards/rejected": -0.09295658767223358, "sft_loss": 0.7534822225570679, "step": 1220 }, { "epoch": 2.0829805249788316, "grad_norm": 3.5898754596710205, "learning_rate": 1.0631422602209608e-06, "logits/chosen": -3.1376945972442627, "logits/rejected": -3.1564555168151855, "logps/chosen": -0.7927883863449097, "logps/rejected": -0.9650642275810242, "loss": 0.8569, "odds_ratio_loss": 0.6410055160522461, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0792788416147232, "rewards/margins": 0.017227591946721077, "rewards/rejected": -0.09650643169879913, "sft_loss": 0.7927883863449097, "step": 1230 }, { "epoch": 2.0999153259949197, "grad_norm": 1.6415690183639526, "learning_rate": 1.027056714207319e-06, "logits/chosen": -3.1497724056243896, "logits/rejected": -3.1502339839935303, "logps/chosen": -0.8299382925033569, "logps/rejected": -1.0621525049209595, "loss": 0.8939, "odds_ratio_loss": 0.6398864984512329, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08299383521080017, "rewards/margins": 0.023221401497721672, "rewards/rejected": -0.10621523857116699, "sft_loss": 0.8299382925033569, "step": 1240 }, { "epoch": 2.116850127011008, "grad_norm": 2.1568691730499268, "learning_rate": 9.914352311573838e-07, "logits/chosen": -3.1183249950408936, "logits/rejected": -3.130068302154541, "logps/chosen": -0.680639386177063, "logps/rejected": -0.9202540516853333, "loss": 0.7394, "odds_ratio_loss": 0.5878284573554993, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06806393712759018, "rewards/margins": 0.02396145835518837, "rewards/rejected": -0.09202539175748825, "sft_loss": 0.680639386177063, "step": 1250 }, { "epoch": 2.1337849280270955, "grad_norm": 1.6225751638412476, "learning_rate": 9.562890339139877e-07, "logits/chosen": -3.0915961265563965, "logits/rejected": -3.1580090522766113, "logps/chosen": -0.7393635511398315, "logps/rejected": -0.9581485986709595, "loss": 0.8032, "odds_ratio_loss": 0.6382402181625366, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07393636554479599, "rewards/margins": 0.021878493949770927, "rewards/rejected": -0.09581486135721207, "sft_loss": 0.7393635511398315, "step": 1260 }, { "epoch": 2.1507197290431836, "grad_norm": 1.8676691055297852, "learning_rate": 9.216291955772374e-07, "logits/chosen": -3.0996224880218506, "logits/rejected": -3.153738021850586, "logps/chosen": -0.7607396841049194, "logps/rejected": -0.9290043115615845, "loss": 0.8238, "odds_ratio_loss": 0.6303194761276245, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07607396692037582, "rewards/margins": 0.016826456412672997, "rewards/rejected": -0.09290042519569397, "sft_loss": 0.7607396841049194, "step": 1270 }, { "epoch": 2.167654530059272, "grad_norm": 2.096705198287964, "learning_rate": 8.874666360158457e-07, "logits/chosen": -3.082366466522217, "logits/rejected": -3.0917139053344727, "logps/chosen": -0.7106717824935913, "logps/rejected": -0.9525697827339172, "loss": 0.774, "odds_ratio_loss": 0.6332431435585022, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.07106717675924301, "rewards/margins": 0.024189796298742294, "rewards/rejected": -0.0952569767832756, "sft_loss": 0.7106717824935913, "step": 1280 }, { "epoch": 2.18458933107536, "grad_norm": 2.435957431793213, "learning_rate": 8.538121184267315e-07, "logits/chosen": -3.1009063720703125, "logits/rejected": -3.121425151824951, "logps/chosen": -0.6681427955627441, "logps/rejected": -0.9359544515609741, "loss": 0.7277, "odds_ratio_loss": 0.595305323600769, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.06681428104639053, "rewards/margins": 0.026781165972352028, "rewards/rejected": -0.09359544515609741, "sft_loss": 0.6681427955627441, "step": 1290 }, { "epoch": 2.201524132091448, "grad_norm": 2.3414995670318604, "learning_rate": 8.206762459439907e-07, "logits/chosen": -3.1007769107818604, "logits/rejected": -3.142437696456909, "logps/chosen": -0.7944619059562683, "logps/rejected": -0.9959940910339355, "loss": 0.8604, "odds_ratio_loss": 0.659730076789856, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.07944619655609131, "rewards/margins": 0.020153220742940903, "rewards/rejected": -0.09959942102432251, "sft_loss": 0.7944619059562683, "step": 1300 }, { "epoch": 2.218458933107536, "grad_norm": 2.9054501056671143, "learning_rate": 7.880694582982898e-07, "logits/chosen": -3.165544271469116, "logits/rejected": -3.1803879737854004, "logps/chosen": -0.8287284970283508, "logps/rejected": -1.0228365659713745, "loss": 0.8926, "odds_ratio_loss": 0.6389774084091187, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08287284523248672, "rewards/margins": 0.019410807639360428, "rewards/rejected": -0.10228364169597626, "sft_loss": 0.8287284970283508, "step": 1310 }, { "epoch": 2.235393734123624, "grad_norm": 3.0612242221832275, "learning_rate": 7.560020285277401e-07, "logits/chosen": -3.0883891582489014, "logits/rejected": -3.142484426498413, "logps/chosen": -0.7654698491096497, "logps/rejected": -0.8691753149032593, "loss": 0.8363, "odds_ratio_loss": 0.708137035369873, "rewards/accuracies": 0.53125, "rewards/chosen": -0.07654698193073273, "rewards/margins": 0.010370554402470589, "rewards/rejected": -0.08691753447055817, "sft_loss": 0.7654698491096497, "step": 1320 }, { "epoch": 2.252328535139712, "grad_norm": 3.074390172958374, "learning_rate": 7.244840597412956e-07, "logits/chosen": -3.0840373039245605, "logits/rejected": -3.113865613937378, "logps/chosen": -0.7826686501502991, "logps/rejected": -0.8837703466415405, "loss": 0.8534, "odds_ratio_loss": 0.7077327370643616, "rewards/accuracies": 0.53125, "rewards/chosen": -0.07826686650514603, "rewards/margins": 0.010110177099704742, "rewards/rejected": -0.08837703615427017, "sft_loss": 0.7826686501502991, "step": 1330 }, { "epoch": 2.2692633361558, "grad_norm": 6.461457252502441, "learning_rate": 6.935254819356796e-07, "logits/chosen": -3.1297388076782227, "logits/rejected": -3.1470894813537598, "logps/chosen": -0.7817297577857971, "logps/rejected": -0.9597527384757996, "loss": 0.8464, "odds_ratio_loss": 0.6471723318099976, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07817298173904419, "rewards/margins": 0.017802301794290543, "rewards/rejected": -0.09597527980804443, "sft_loss": 0.7817297577857971, "step": 1340 }, { "epoch": 2.2861981371718882, "grad_norm": 1.7192656993865967, "learning_rate": 6.631360488668662e-07, "logits/chosen": -3.138521194458008, "logits/rejected": -3.150005578994751, "logps/chosen": -0.7190467715263367, "logps/rejected": -0.9147864580154419, "loss": 0.7824, "odds_ratio_loss": 0.6330953240394592, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.07190467417240143, "rewards/margins": 0.019573967903852463, "rewards/rejected": -0.09147863835096359, "sft_loss": 0.7190467715263367, "step": 1350 }, { "epoch": 2.3031329381879764, "grad_norm": 2.849228620529175, "learning_rate": 6.333253349770672e-07, "logits/chosen": -3.1443207263946533, "logits/rejected": -3.1502537727355957, "logps/chosen": -0.8002703785896301, "logps/rejected": -0.9851358532905579, "loss": 0.8656, "odds_ratio_loss": 0.6537164449691772, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0800270363688469, "rewards/margins": 0.018486548215150833, "rewards/rejected": -0.09851358830928802, "sft_loss": 0.8002703785896301, "step": 1360 }, { "epoch": 2.3200677392040645, "grad_norm": 1.3985761404037476, "learning_rate": 6.041027323782364e-07, "logits/chosen": -3.136557102203369, "logits/rejected": -3.1552491188049316, "logps/chosen": -0.7381452918052673, "logps/rejected": -0.9497495889663696, "loss": 0.7993, "odds_ratio_loss": 0.611694872379303, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07381454110145569, "rewards/margins": 0.021160420030355453, "rewards/rejected": -0.09497495740652084, "sft_loss": 0.7381452918052673, "step": 1370 }, { "epoch": 2.337002540220152, "grad_norm": 2.8737893104553223, "learning_rate": 5.754774478929969e-07, "logits/chosen": -3.1531028747558594, "logits/rejected": -3.1830787658691406, "logps/chosen": -0.7512658834457397, "logps/rejected": -0.9563377499580383, "loss": 0.81, "odds_ratio_loss": 0.5872438549995422, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07512658089399338, "rewards/margins": 0.020507195964455605, "rewards/rejected": -0.09563378244638443, "sft_loss": 0.7512658834457397, "step": 1380 }, { "epoch": 2.3539373412362403, "grad_norm": 3.9828193187713623, "learning_rate": 5.474585001539634e-07, "logits/chosen": -3.159769296646118, "logits/rejected": -3.1827354431152344, "logps/chosen": -0.716742753982544, "logps/rejected": -0.9266722798347473, "loss": 0.7742, "odds_ratio_loss": 0.5742050409317017, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07167427241802216, "rewards/margins": 0.020992957055568695, "rewards/rejected": -0.09266723692417145, "sft_loss": 0.716742753982544, "step": 1390 }, { "epoch": 2.3708721422523285, "grad_norm": 1.6741605997085571, "learning_rate": 5.200547167623424e-07, "logits/chosen": -3.172938823699951, "logits/rejected": -3.1454200744628906, "logps/chosen": -0.7799001932144165, "logps/rejected": -1.040466070175171, "loss": 0.8399, "odds_ratio_loss": 0.5997284650802612, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07799001783132553, "rewards/margins": 0.026056593284010887, "rewards/rejected": -0.10404660552740097, "sft_loss": 0.7799001932144165, "step": 1400 }, { "epoch": 2.3878069432684166, "grad_norm": 1.3470762968063354, "learning_rate": 4.932747315067271e-07, "logits/chosen": -3.15238356590271, "logits/rejected": -3.1575927734375, "logps/chosen": -0.7618133425712585, "logps/rejected": -1.033060073852539, "loss": 0.8193, "odds_ratio_loss": 0.574641764163971, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0761813372373581, "rewards/margins": 0.027124667540192604, "rewards/rejected": -0.10330601036548615, "sft_loss": 0.7618133425712585, "step": 1410 }, { "epoch": 2.4047417442845047, "grad_norm": 4.0449371337890625, "learning_rate": 4.6712698164294553e-07, "logits/chosen": -3.1169888973236084, "logits/rejected": -3.131412982940674, "logps/chosen": -0.7451744079589844, "logps/rejected": -0.9645574688911438, "loss": 0.8029, "odds_ratio_loss": 0.5777136087417603, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07451744377613068, "rewards/margins": 0.021938303485512733, "rewards/rejected": -0.09645574539899826, "sft_loss": 0.7451744079589844, "step": 1420 }, { "epoch": 2.421676545300593, "grad_norm": 1.97295343875885, "learning_rate": 4.41619705235842e-07, "logits/chosen": -3.1363072395324707, "logits/rejected": -3.1507456302642822, "logps/chosen": -0.7545329332351685, "logps/rejected": -1.082715392112732, "loss": 0.8124, "odds_ratio_loss": 0.5782482624053955, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.07545328885316849, "rewards/margins": 0.0328182615339756, "rewards/rejected": -0.10827155411243439, "sft_loss": 0.7545329332351685, "step": 1430 }, { "epoch": 2.438611346316681, "grad_norm": 2.880293846130371, "learning_rate": 4.167609385637961e-07, "logits/chosen": -3.182002544403076, "logits/rejected": -3.165118455886841, "logps/chosen": -0.7962235808372498, "logps/rejected": -0.975197970867157, "loss": 0.8596, "odds_ratio_loss": 0.634021520614624, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07962235808372498, "rewards/margins": 0.017897438257932663, "rewards/rejected": -0.09751980006694794, "sft_loss": 0.7962235808372498, "step": 1440 }, { "epoch": 2.4555461473327687, "grad_norm": 2.545621871948242, "learning_rate": 3.9255851358683567e-07, "logits/chosen": -3.1265337467193604, "logits/rejected": -3.140825033187866, "logps/chosen": -0.7502083778381348, "logps/rejected": -0.9492311477661133, "loss": 0.817, "odds_ratio_loss": 0.6676734685897827, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.07502084225416183, "rewards/margins": 0.019902262836694717, "rewards/rejected": -0.09492311626672745, "sft_loss": 0.7502083778381348, "step": 1450 }, { "epoch": 2.472480948348857, "grad_norm": 2.91165828704834, "learning_rate": 3.690200554791082e-07, "logits/chosen": -3.093761444091797, "logits/rejected": -3.0926525592803955, "logps/chosen": -0.7423545122146606, "logps/rejected": -0.9597676992416382, "loss": 0.8001, "odds_ratio_loss": 0.5777243375778198, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0742354542016983, "rewards/margins": 0.02174132689833641, "rewards/rejected": -0.09597676992416382, "sft_loss": 0.7423545122146606, "step": 1460 }, { "epoch": 2.489415749364945, "grad_norm": 5.057043075561523, "learning_rate": 3.461529802265079e-07, "logits/chosen": -3.153536796569824, "logits/rejected": -3.16294264793396, "logps/chosen": -0.7584110498428345, "logps/rejected": -0.9484812617301941, "loss": 0.8201, "odds_ratio_loss": 0.6163991093635559, "rewards/accuracies": 0.625, "rewards/chosen": -0.07584110647439957, "rewards/margins": 0.01900702901184559, "rewards/rejected": -0.0948481336236, "sft_loss": 0.7584110498428345, "step": 1470 }, { "epoch": 2.506350550381033, "grad_norm": 2.0298070907592773, "learning_rate": 3.2396449229020883e-07, "logits/chosen": -3.18571400642395, "logits/rejected": -3.1676132678985596, "logps/chosen": -0.7937291264533997, "logps/rejected": -0.935733437538147, "loss": 0.8599, "odds_ratio_loss": 0.6614922881126404, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07937291264533997, "rewards/margins": 0.014200428500771523, "rewards/rejected": -0.09357334673404694, "sft_loss": 0.7937291264533997, "step": 1480 }, { "epoch": 2.523285351397121, "grad_norm": 4.000706195831299, "learning_rate": 3.024615823368371e-07, "logits/chosen": -3.1206421852111816, "logits/rejected": -3.152639150619507, "logps/chosen": -0.7629178166389465, "logps/rejected": -0.9828959703445435, "loss": 0.8252, "odds_ratio_loss": 0.622687041759491, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.07629178464412689, "rewards/margins": 0.021997807547450066, "rewards/rejected": -0.09828958660364151, "sft_loss": 0.7629178166389465, "step": 1490 }, { "epoch": 2.5402201524132093, "grad_norm": 2.449856758117676, "learning_rate": 2.8165102503600716e-07, "logits/chosen": -3.095520496368408, "logits/rejected": -3.1078646183013916, "logps/chosen": -0.7517341375350952, "logps/rejected": -0.9984272718429565, "loss": 0.8135, "odds_ratio_loss": 0.6172657012939453, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07517342269420624, "rewards/margins": 0.024669310078024864, "rewards/rejected": -0.09984272718429565, "sft_loss": 0.7517341375350952, "step": 1500 }, { "epoch": 2.5402201524132093, "eval_logits/chosen": -3.130527973175049, "eval_logits/rejected": -3.1507296562194824, "eval_logps/chosen": -0.7867480516433716, "eval_logps/rejected": -0.9955620169639587, "eval_loss": 0.8505691885948181, "eval_odds_ratio_loss": 0.638211727142334, "eval_rewards/accuracies": 0.5723809599876404, "eval_rewards/chosen": -0.0786748081445694, "eval_rewards/margins": 0.020881392061710358, "eval_rewards/rejected": -0.09955620020627975, "eval_runtime": 194.4899, "eval_samples_per_second": 5.399, "eval_sft_loss": 0.7867480516433716, "eval_steps_per_second": 2.699, "step": 1500 }, { "epoch": 2.557154953429297, "grad_norm": 1.5426675081253052, "learning_rate": 2.615393769259039e-07, "logits/chosen": -3.1200222969055176, "logits/rejected": -3.151517152786255, "logps/chosen": -0.845578670501709, "logps/rejected": -0.9381099939346313, "loss": 0.9194, "odds_ratio_loss": 0.7380752563476562, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.08455787599086761, "rewards/margins": 0.00925312377512455, "rewards/rejected": -0.09381099790334702, "sft_loss": 0.845578670501709, "step": 1510 }, { "epoch": 2.574089754445385, "grad_norm": 4.487434387207031, "learning_rate": 2.421329743475917e-07, "logits/chosen": -3.1358139514923096, "logits/rejected": -3.157161235809326, "logps/chosen": -0.7332046627998352, "logps/rejected": -0.9182003140449524, "loss": 0.7981, "odds_ratio_loss": 0.6488706469535828, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07332046329975128, "rewards/margins": 0.018499553203582764, "rewards/rejected": -0.09182002395391464, "sft_loss": 0.7332046627998352, "step": 1520 }, { "epoch": 2.5910245554614733, "grad_norm": 2.3630266189575195, "learning_rate": 2.234379314486973e-07, "logits/chosen": -3.1165411472320557, "logits/rejected": -3.1539194583892822, "logps/chosen": -0.8104802370071411, "logps/rejected": -0.9215306043624878, "loss": 0.8756, "odds_ratio_loss": 0.6507223844528198, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08104802668094635, "rewards/margins": 0.011105035431683064, "rewards/rejected": -0.09215305000543594, "sft_loss": 0.8104802370071411, "step": 1530 }, { "epoch": 2.6079593564775614, "grad_norm": 1.243202805519104, "learning_rate": 2.0546013825709783e-07, "logits/chosen": -3.1084282398223877, "logits/rejected": -3.129692792892456, "logps/chosen": -0.7565353512763977, "logps/rejected": -1.0974582433700562, "loss": 0.8149, "odds_ratio_loss": 0.5834510326385498, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07565353065729141, "rewards/margins": 0.034092292189598083, "rewards/rejected": -0.1097458228468895, "sft_loss": 0.7565353512763977, "step": 1540 }, { "epoch": 2.6248941574936495, "grad_norm": 1.8778706789016724, "learning_rate": 1.88205258825217e-07, "logits/chosen": -3.0997517108917236, "logits/rejected": -3.0937392711639404, "logps/chosen": -0.672328770160675, "logps/rejected": -0.9460701942443848, "loss": 0.7295, "odds_ratio_loss": 0.5718902349472046, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06723286956548691, "rewards/margins": 0.02737414464354515, "rewards/rejected": -0.09460701793432236, "sft_loss": 0.672328770160675, "step": 1550 }, { "epoch": 2.6418289585097376, "grad_norm": 3.3697926998138428, "learning_rate": 1.7167872944552245e-07, "logits/chosen": -3.0951905250549316, "logits/rejected": -3.143221378326416, "logps/chosen": -0.6983746290206909, "logps/rejected": -0.9046236276626587, "loss": 0.7572, "odds_ratio_loss": 0.5877906084060669, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.06983745843172073, "rewards/margins": 0.020624909549951553, "rewards/rejected": -0.09046236425638199, "sft_loss": 0.6983746290206909, "step": 1560 }, { "epoch": 2.6587637595258258, "grad_norm": 2.3217389583587646, "learning_rate": 1.5588575693777142e-07, "logits/chosen": -3.1411678791046143, "logits/rejected": -3.1680665016174316, "logps/chosen": -0.7495251297950745, "logps/rejected": -0.8962133526802063, "loss": 0.8135, "odds_ratio_loss": 0.6393758058547974, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.07495252043008804, "rewards/margins": 0.01466882973909378, "rewards/rejected": -0.08962134271860123, "sft_loss": 0.7495251297950745, "step": 1570 }, { "epoch": 2.675698560541914, "grad_norm": 2.695345401763916, "learning_rate": 1.4083131700856428e-07, "logits/chosen": -3.093736410140991, "logits/rejected": -3.154810905456543, "logps/chosen": -0.7956855893135071, "logps/rejected": -0.9356697201728821, "loss": 0.8605, "odds_ratio_loss": 0.6480044722557068, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.07956856489181519, "rewards/margins": 0.013998406007885933, "rewards/rejected": -0.09356696903705597, "sft_loss": 0.7956855893135071, "step": 1580 }, { "epoch": 2.6926333615580016, "grad_norm": 4.358800888061523, "learning_rate": 1.2652015268370315e-07, "logits/chosen": -3.150224208831787, "logits/rejected": -3.183022975921631, "logps/chosen": -0.7435088753700256, "logps/rejected": -0.9757458567619324, "loss": 0.8044, "odds_ratio_loss": 0.6088230013847351, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07435088604688644, "rewards/margins": 0.02322370745241642, "rewards/rejected": -0.09757460653781891, "sft_loss": 0.7435088753700256, "step": 1590 }, { "epoch": 2.7095681625740897, "grad_norm": 1.9499818086624146, "learning_rate": 1.1295677281386502e-07, "logits/chosen": -3.1664371490478516, "logits/rejected": -3.1615631580352783, "logps/chosen": -0.8278031349182129, "logps/rejected": -1.062387228012085, "loss": 0.8914, "odds_ratio_loss": 0.6362180113792419, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08278031647205353, "rewards/margins": 0.02345840446650982, "rewards/rejected": -0.1062387228012085, "sft_loss": 0.8278031349182129, "step": 1600 }, { "epoch": 2.726502963590178, "grad_norm": 1.277694821357727, "learning_rate": 1.0014545065404973e-07, "logits/chosen": -3.12839674949646, "logits/rejected": -3.1691813468933105, "logps/chosen": -0.8073375821113586, "logps/rejected": -1.0353208780288696, "loss": 0.8777, "odds_ratio_loss": 0.7032862305641174, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0807337611913681, "rewards/margins": 0.022798333317041397, "rewards/rejected": -0.1035320907831192, "sft_loss": 0.8073375821113586, "step": 1610 }, { "epoch": 2.743437764606266, "grad_norm": 1.525190830230713, "learning_rate": 8.809022251725502e-08, "logits/chosen": -3.1842474937438965, "logits/rejected": -3.156172752380371, "logps/chosen": -0.7726050615310669, "logps/rejected": -1.0311490297317505, "loss": 0.8348, "odds_ratio_loss": 0.6221681833267212, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.07726050913333893, "rewards/margins": 0.02585439942777157, "rewards/rejected": -0.10311490297317505, "sft_loss": 0.7726050615310669, "step": 1620 }, { "epoch": 2.7603725656223537, "grad_norm": 2.015005111694336, "learning_rate": 7.679488650280509e-08, "logits/chosen": -3.192863702774048, "logits/rejected": -3.213048219680786, "logps/chosen": -0.7830113172531128, "logps/rejected": -0.9780160188674927, "loss": 0.8429, "odds_ratio_loss": 0.5990433096885681, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.07830111682415009, "rewards/margins": 0.019500473514199257, "rewards/rejected": -0.09780160337686539, "sft_loss": 0.7830113172531128, "step": 1630 }, { "epoch": 2.777307366638442, "grad_norm": 2.5389175415039062, "learning_rate": 6.626300129972563e-08, "logits/chosen": -3.122981071472168, "logits/rejected": -3.214785099029541, "logps/chosen": -0.7296528220176697, "logps/rejected": -0.955167293548584, "loss": 0.792, "odds_ratio_loss": 0.6230596303939819, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.07296527922153473, "rewards/margins": 0.02255145087838173, "rewards/rejected": -0.09551674872636795, "sft_loss": 0.7296528220176697, "step": 1640 }, { "epoch": 2.79424216765453, "grad_norm": 1.7503160238265991, "learning_rate": 5.649788506555065e-08, "logits/chosen": -3.115635871887207, "logits/rejected": -3.166555881500244, "logps/chosen": -0.7383006811141968, "logps/rejected": -0.9469397664070129, "loss": 0.797, "odds_ratio_loss": 0.5870878100395203, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07383007556200027, "rewards/margins": 0.020863894373178482, "rewards/rejected": -0.09469397366046906, "sft_loss": 0.7383006811141968, "step": 1650 }, { "epoch": 2.811176968670618, "grad_norm": 5.490494728088379, "learning_rate": 4.7502614380908474e-08, "logits/chosen": -3.1543877124786377, "logits/rejected": -3.134704113006592, "logps/chosen": -0.7694907188415527, "logps/rejected": -0.9553998112678528, "loss": 0.8333, "odds_ratio_loss": 0.6379188895225525, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07694907486438751, "rewards/margins": 0.018590910360217094, "rewards/rejected": -0.09553998708724976, "sft_loss": 0.7694907188415527, "step": 1660 }, { "epoch": 2.828111769686706, "grad_norm": 3.784432888031006, "learning_rate": 3.9280023280222066e-08, "logits/chosen": -3.1404221057891846, "logits/rejected": -3.1622607707977295, "logps/chosen": -0.757176399230957, "logps/rejected": -0.9332196116447449, "loss": 0.8222, "odds_ratio_loss": 0.6507169008255005, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07571764290332794, "rewards/margins": 0.017604324966669083, "rewards/rejected": -0.09332196414470673, "sft_loss": 0.757176399230957, "step": 1670 }, { "epoch": 2.8450465707027943, "grad_norm": 2.088921546936035, "learning_rate": 3.1832702358818855e-08, "logits/chosen": -3.116088390350342, "logits/rejected": -3.1639227867126465, "logps/chosen": -0.8132543563842773, "logps/rejected": -1.068440556526184, "loss": 0.8739, "odds_ratio_loss": 0.6062491536140442, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08132544159889221, "rewards/margins": 0.025518611073493958, "rewards/rejected": -0.10684405267238617, "sft_loss": 0.8132543563842773, "step": 1680 }, { "epoch": 2.8619813717188824, "grad_norm": 2.5282764434814453, "learning_rate": 2.5162997956746647e-08, "logits/chosen": -3.1500442028045654, "logits/rejected": -3.171433210372925, "logps/chosen": -0.7479134202003479, "logps/rejected": -0.9974180459976196, "loss": 0.8076, "odds_ratio_loss": 0.596734344959259, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0747913345694542, "rewards/margins": 0.02495047077536583, "rewards/rejected": -0.09974180907011032, "sft_loss": 0.7479134202003479, "step": 1690 }, { "epoch": 2.8789161727349706, "grad_norm": 3.469899892807007, "learning_rate": 1.9273011419536914e-08, "logits/chosen": -3.140916585922241, "logits/rejected": -3.16453218460083, "logps/chosen": -0.7706011533737183, "logps/rejected": -0.9274336695671082, "loss": 0.8378, "odds_ratio_loss": 0.6717150211334229, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07706011831760406, "rewards/margins": 0.0156832467764616, "rewards/rejected": -0.09274337440729141, "sft_loss": 0.7706011533737183, "step": 1700 }, { "epoch": 2.8958509737510583, "grad_norm": 1.3444654941558838, "learning_rate": 1.4164598436159083e-08, "logits/chosen": -3.1658730506896973, "logits/rejected": -3.1828174591064453, "logps/chosen": -0.7568970322608948, "logps/rejected": -0.9151817560195923, "loss": 0.82, "odds_ratio_loss": 0.630837082862854, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07568971067667007, "rewards/margins": 0.01582847535610199, "rewards/rejected": -0.09151817858219147, "sft_loss": 0.7568970322608948, "step": 1710 }, { "epoch": 2.9127857747671464, "grad_norm": 2.7227256298065186, "learning_rate": 9.839368454371556e-09, "logits/chosen": -3.1114795207977295, "logits/rejected": -3.11403751373291, "logps/chosen": -0.7409245371818542, "logps/rejected": -1.0069682598114014, "loss": 0.8046, "odds_ratio_loss": 0.6370644569396973, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07409246265888214, "rewards/margins": 0.026604369282722473, "rewards/rejected": -0.10069682449102402, "sft_loss": 0.7409245371818542, "step": 1720 }, { "epoch": 2.9297205757832345, "grad_norm": 1.9127988815307617, "learning_rate": 6.298684173650649e-09, "logits/chosen": -3.0886847972869873, "logits/rejected": -3.123133897781372, "logps/chosen": -0.7336040735244751, "logps/rejected": -1.0557795763015747, "loss": 0.7977, "odds_ratio_loss": 0.640912652015686, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.07336040586233139, "rewards/margins": 0.03221754729747772, "rewards/rejected": -0.10557796061038971, "sft_loss": 0.7336040735244751, "step": 1730 }, { "epoch": 2.9466553767993227, "grad_norm": 4.324622631072998, "learning_rate": 3.543661115860686e-09, "logits/chosen": -3.103651523590088, "logits/rejected": -3.136838674545288, "logps/chosen": -0.790323793888092, "logps/rejected": -1.0052763223648071, "loss": 0.8552, "odds_ratio_loss": 0.6486952900886536, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07903237640857697, "rewards/margins": 0.021495262160897255, "rewards/rejected": -0.10052764415740967, "sft_loss": 0.790323793888092, "step": 1740 }, { "epoch": 2.963590177815411, "grad_norm": 2.1066298484802246, "learning_rate": 1.575167273800693e-09, "logits/chosen": -3.1322402954101562, "logits/rejected": -3.141758680343628, "logps/chosen": -0.7295509576797485, "logps/rejected": -0.8775620460510254, "loss": 0.7914, "odds_ratio_loss": 0.6189672350883484, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.07295509427785873, "rewards/margins": 0.01480111014097929, "rewards/rejected": -0.0877562090754509, "sft_loss": 0.7295509576797485, "step": 1750 }, { "epoch": 2.9805249788314985, "grad_norm": 2.692870855331421, "learning_rate": 3.9382283773564676e-10, "logits/chosen": -3.153014659881592, "logits/rejected": -3.1698267459869385, "logps/chosen": -0.8286467790603638, "logps/rejected": -1.0176646709442139, "loss": 0.8971, "odds_ratio_loss": 0.6844185590744019, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0828646719455719, "rewards/margins": 0.018901783972978592, "rewards/rejected": -0.10176645219326019, "sft_loss": 0.8286467790603638, "step": 1760 }, { "epoch": 2.9974597798475866, "grad_norm": 11.2490234375, "learning_rate": 0.0, "logits/chosen": -3.122014284133911, "logits/rejected": -3.142989158630371, "logps/chosen": -0.8531309962272644, "logps/rejected": -1.0983434915542603, "loss": 0.9206, "odds_ratio_loss": 0.6742203235626221, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0853130966424942, "rewards/margins": 0.024521255865693092, "rewards/rejected": -0.10983435064554214, "sft_loss": 0.8531309962272644, "step": 1770 }, { "epoch": 2.9974597798475866, "step": 1770, "total_flos": 2.0399855839629804e+18, "train_loss": 0.8663494454938813, "train_runtime": 17638.0542, "train_samples_per_second": 1.607, "train_steps_per_second": 0.1 } ], "logging_steps": 10, "max_steps": 1770, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2.0399855839629804e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }