chchen's picture
End of training
002e210 verified
{
"best_metric": 0.8505691885948181,
"best_model_checkpoint": "saves/Mistral-7B-Instruct-v0.3/lora/orpo-salt-half/checkpoint-1500",
"epoch": 2.9974597798475866,
"eval_steps": 500,
"global_step": 1770,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01693480101608806,
"grad_norm": 8.384780883789062,
"learning_rate": 4.999614014035063e-06,
"logits/chosen": -2.9335973262786865,
"logits/rejected": -2.9718575477600098,
"logps/chosen": -1.0935328006744385,
"logps/rejected": -1.5320154428482056,
"loss": 1.1602,
"odds_ratio_loss": 0.6671324968338013,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.10935328900814056,
"rewards/margins": 0.04384826496243477,
"rewards/rejected": -0.15320155024528503,
"sft_loss": 1.0935328006744385,
"step": 10
},
{
"epoch": 0.03386960203217612,
"grad_norm": 5.07673454284668,
"learning_rate": 4.998440543386042e-06,
"logits/chosen": -2.899705648422241,
"logits/rejected": -2.8991312980651855,
"logps/chosen": -1.0815098285675049,
"logps/rejected": -1.5599451065063477,
"loss": 1.1396,
"odds_ratio_loss": 0.5810434818267822,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.10815098136663437,
"rewards/margins": 0.04784352704882622,
"rewards/rejected": -0.1559945046901703,
"sft_loss": 1.0815098285675049,
"step": 20
},
{
"epoch": 0.05080440304826418,
"grad_norm": 4.302060127258301,
"learning_rate": 4.996479918381253e-06,
"logits/chosen": -2.944632053375244,
"logits/rejected": -2.975834846496582,
"logps/chosen": -1.056970238685608,
"logps/rejected": -1.281690001487732,
"loss": 1.1267,
"odds_ratio_loss": 0.6977876424789429,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.10569703578948975,
"rewards/margins": 0.022471977397799492,
"rewards/rejected": -0.1281690150499344,
"sft_loss": 1.056970238685608,
"step": 30
},
{
"epoch": 0.06773920406435224,
"grad_norm": 3.221508026123047,
"learning_rate": 4.993732756731818e-06,
"logits/chosen": -2.925711154937744,
"logits/rejected": -2.972975730895996,
"logps/chosen": -0.8646748661994934,
"logps/rejected": -1.3530806303024292,
"loss": 0.9216,
"odds_ratio_loss": 0.5692964792251587,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08646748960018158,
"rewards/margins": 0.04884058237075806,
"rewards/rejected": -0.13530807197093964,
"sft_loss": 0.8646748661994934,
"step": 40
},
{
"epoch": 0.0846740050804403,
"grad_norm": 11.413677215576172,
"learning_rate": 4.9901999239537345e-06,
"logits/chosen": -2.9809317588806152,
"logits/rejected": -2.983664035797119,
"logps/chosen": -0.9910508990287781,
"logps/rejected": -1.344020962715149,
"loss": 1.0558,
"odds_ratio_loss": 0.6479853987693787,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.0991050973534584,
"rewards/margins": 0.035297006368637085,
"rewards/rejected": -0.1344021111726761,
"sft_loss": 0.9910508990287781,
"step": 50
},
{
"epoch": 0.10160880609652836,
"grad_norm": 1.7445276975631714,
"learning_rate": 4.985882533095186e-06,
"logits/chosen": -2.9525933265686035,
"logits/rejected": -2.955658435821533,
"logps/chosen": -0.9359370470046997,
"logps/rejected": -1.2753798961639404,
"loss": 1.0023,
"odds_ratio_loss": 0.6634014248847961,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.09359370172023773,
"rewards/margins": 0.03394431248307228,
"rewards/rejected": -0.12753799557685852,
"sft_loss": 0.9359370470046997,
"step": 60
},
{
"epoch": 0.11854360711261643,
"grad_norm": 3.4384100437164307,
"learning_rate": 4.9807819443858705e-06,
"logits/chosen": -2.956145763397217,
"logits/rejected": -2.9750401973724365,
"logps/chosen": -0.8652639389038086,
"logps/rejected": -1.2253761291503906,
"loss": 0.9227,
"odds_ratio_loss": 0.5748167634010315,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.08652639389038086,
"rewards/margins": 0.0360112190246582,
"rewards/rejected": -0.12253761291503906,
"sft_loss": 0.8652639389038086,
"step": 70
},
{
"epoch": 0.1354784081287045,
"grad_norm": 2.6011013984680176,
"learning_rate": 4.9748997648084404e-06,
"logits/chosen": -2.927016496658325,
"logits/rejected": -2.961350917816162,
"logps/chosen": -0.9362251162528992,
"logps/rejected": -1.1120071411132812,
"loss": 1.0068,
"odds_ratio_loss": 0.7062079906463623,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.09362251311540604,
"rewards/margins": 0.017578203231096268,
"rewards/rejected": -0.111200712621212,
"sft_loss": 0.9362251162528992,
"step": 80
},
{
"epoch": 0.15241320914479256,
"grad_norm": 2.413475751876831,
"learning_rate": 4.96823784759222e-06,
"logits/chosen": -2.869145154953003,
"logits/rejected": -2.916862964630127,
"logps/chosen": -0.8781692385673523,
"logps/rejected": -1.2194924354553223,
"loss": 0.9334,
"odds_ratio_loss": 0.552145779132843,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.08781692385673523,
"rewards/margins": 0.034132327884435654,
"rewards/rejected": -0.12194924056529999,
"sft_loss": 0.8781692385673523,
"step": 90
},
{
"epoch": 0.1693480101608806,
"grad_norm": 1.9281786680221558,
"learning_rate": 4.960798291629323e-06,
"logits/chosen": -2.9603519439697266,
"logits/rejected": -2.9720394611358643,
"logps/chosen": -0.8728249669075012,
"logps/rejected": -1.0510112047195435,
"loss": 0.9387,
"odds_ratio_loss": 0.658331036567688,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.08728249371051788,
"rewards/margins": 0.017818626016378403,
"rewards/rejected": -0.10510112345218658,
"sft_loss": 0.8728249669075012,
"step": 100
},
{
"epoch": 0.18628281117696868,
"grad_norm": 2.8832967281341553,
"learning_rate": 4.952583440813383e-06,
"logits/chosen": -2.9769511222839355,
"logits/rejected": -2.991858720779419,
"logps/chosen": -0.9942096471786499,
"logps/rejected": -1.25453519821167,
"loss": 1.0667,
"odds_ratio_loss": 0.7251978516578674,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.09942097961902618,
"rewards/margins": 0.026032552123069763,
"rewards/rejected": -0.12545353174209595,
"sft_loss": 0.9942096471786499,
"step": 110
},
{
"epoch": 0.20321761219305673,
"grad_norm": 1.2281321287155151,
"learning_rate": 4.943595883301086e-06,
"logits/chosen": -2.9461495876312256,
"logits/rejected": -2.988556385040283,
"logps/chosen": -0.8882713317871094,
"logps/rejected": -1.1232919692993164,
"loss": 0.9522,
"odds_ratio_loss": 0.6390038728713989,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08882713317871094,
"rewards/margins": 0.02350207045674324,
"rewards/rejected": -0.11232920736074448,
"sft_loss": 0.8882713317871094,
"step": 120
},
{
"epoch": 0.2201524132091448,
"grad_norm": 2.2648568153381348,
"learning_rate": 4.933838450696757e-06,
"logits/chosen": -2.9817254543304443,
"logits/rejected": -3.0018677711486816,
"logps/chosen": -0.8987566828727722,
"logps/rejected": -1.0799884796142578,
"loss": 0.9707,
"odds_ratio_loss": 0.7198494672775269,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.08987566828727722,
"rewards/margins": 0.018123187124729156,
"rewards/rejected": -0.10799884796142578,
"sft_loss": 0.8987566828727722,
"step": 130
},
{
"epoch": 0.23708721422523285,
"grad_norm": 3.667508840560913,
"learning_rate": 4.923314217160234e-06,
"logits/chosen": -2.988133192062378,
"logits/rejected": -3.033163070678711,
"logps/chosen": -0.9288301467895508,
"logps/rejected": -1.1756031513214111,
"loss": 0.9973,
"odds_ratio_loss": 0.6846222877502441,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.09288302063941956,
"rewards/margins": 0.024677302688360214,
"rewards/rejected": -0.11756031215190887,
"sft_loss": 0.9288301467895508,
"step": 140
},
{
"epoch": 0.2540220152413209,
"grad_norm": 1.4715845584869385,
"learning_rate": 4.9120264984383285e-06,
"logits/chosen": -2.997676134109497,
"logits/rejected": -2.9935667514801025,
"logps/chosen": -0.8239797353744507,
"logps/rejected": -0.9983611106872559,
"loss": 0.8869,
"odds_ratio_loss": 0.6288361549377441,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.08239796757698059,
"rewards/margins": 0.017438137903809547,
"rewards/rejected": -0.09983611106872559,
"sft_loss": 0.8239797353744507,
"step": 150
},
{
"epoch": 0.270956816257409,
"grad_norm": 1.3789633512496948,
"learning_rate": 4.899978850820176e-06,
"logits/chosen": -3.004080295562744,
"logits/rejected": -3.0049595832824707,
"logps/chosen": -0.9372480511665344,
"logps/rejected": -1.147570252418518,
"loss": 1.0008,
"odds_ratio_loss": 0.635485053062439,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.0937248021364212,
"rewards/margins": 0.021032210439443588,
"rewards/rejected": -0.11475701630115509,
"sft_loss": 0.9372480511665344,
"step": 160
},
{
"epoch": 0.28789161727349705,
"grad_norm": 2.1128056049346924,
"learning_rate": 4.887175070016795e-06,
"logits/chosen": -3.027259349822998,
"logits/rejected": -3.0453531742095947,
"logps/chosen": -0.7962668538093567,
"logps/rejected": -1.0727083683013916,
"loss": 0.8585,
"odds_ratio_loss": 0.6222477555274963,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07962668687105179,
"rewards/margins": 0.027644142508506775,
"rewards/rejected": -0.10727082192897797,
"sft_loss": 0.7962668538093567,
"step": 170
},
{
"epoch": 0.3048264182895851,
"grad_norm": 4.011683464050293,
"learning_rate": 4.873619189965217e-06,
"logits/chosen": -2.973634719848633,
"logits/rejected": -2.989631414413452,
"logps/chosen": -0.8350585103034973,
"logps/rejected": -1.051026463508606,
"loss": 0.9,
"odds_ratio_loss": 0.6493778824806213,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08350586146116257,
"rewards/margins": 0.021596785634756088,
"rewards/rejected": -0.10510264337062836,
"sft_loss": 0.8350585103034973,
"step": 180
},
{
"epoch": 0.32176121930567314,
"grad_norm": 1.2796882390975952,
"learning_rate": 4.859315481557563e-06,
"logits/chosen": -2.9721431732177734,
"logits/rejected": -2.9849448204040527,
"logps/chosen": -0.821880042552948,
"logps/rejected": -1.0275905132293701,
"loss": 0.8867,
"odds_ratio_loss": 0.648421585559845,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08218801021575928,
"rewards/margins": 0.0205710306763649,
"rewards/rejected": -0.10275904089212418,
"sft_loss": 0.821880042552948,
"step": 190
},
{
"epoch": 0.3386960203217612,
"grad_norm": 1.8488657474517822,
"learning_rate": 4.84426845129546e-06,
"logits/chosen": -3.0222136974334717,
"logits/rejected": -3.031564235687256,
"logps/chosen": -0.8574220538139343,
"logps/rejected": -0.9888635873794556,
"loss": 0.9247,
"odds_ratio_loss": 0.6725481748580933,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.08574221283197403,
"rewards/margins": 0.013144141063094139,
"rewards/rejected": -0.09888636320829391,
"sft_loss": 0.8574220538139343,
"step": 200
},
{
"epoch": 0.3556308213378493,
"grad_norm": 1.5179060697555542,
"learning_rate": 4.828482839870233e-06,
"logits/chosen": -3.010849714279175,
"logits/rejected": -3.0288453102111816,
"logps/chosen": -0.8511277437210083,
"logps/rejected": -0.9780422449111938,
"loss": 0.9215,
"odds_ratio_loss": 0.7037326693534851,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.08511276543140411,
"rewards/margins": 0.012691453099250793,
"rewards/rejected": -0.0978042259812355,
"sft_loss": 0.8511277437210083,
"step": 210
},
{
"epoch": 0.37256562235393736,
"grad_norm": 1.2089393138885498,
"learning_rate": 4.811963620669314e-06,
"logits/chosen": -3.0581696033477783,
"logits/rejected": -3.099684476852417,
"logps/chosen": -0.8706866502761841,
"logps/rejected": -1.0248219966888428,
"loss": 0.9377,
"odds_ratio_loss": 0.6702378988265991,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.08706866204738617,
"rewards/margins": 0.015413539484143257,
"rewards/rejected": -0.10248219966888428,
"sft_loss": 0.8706866502761841,
"step": 220
},
{
"epoch": 0.3895004233700254,
"grad_norm": 2.3151497840881348,
"learning_rate": 4.794715998209328e-06,
"logits/chosen": -2.9359335899353027,
"logits/rejected": -2.9507675170898438,
"logps/chosen": -0.8457564115524292,
"logps/rejected": -1.0168864727020264,
"loss": 0.9116,
"odds_ratio_loss": 0.6584563255310059,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.08457564562559128,
"rewards/margins": 0.017113011330366135,
"rewards/rejected": -0.10168864578008652,
"sft_loss": 0.8457564115524292,
"step": 230
},
{
"epoch": 0.40643522438611346,
"grad_norm": 2.3710029125213623,
"learning_rate": 4.7767454064963724e-06,
"logits/chosen": -2.9894230365753174,
"logits/rejected": -3.022892475128174,
"logps/chosen": -0.8459660410881042,
"logps/rejected": -1.015751838684082,
"loss": 0.9112,
"odds_ratio_loss": 0.6523627042770386,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.08459659665822983,
"rewards/margins": 0.016978587955236435,
"rewards/rejected": -0.10157518088817596,
"sft_loss": 0.8459660410881042,
"step": 240
},
{
"epoch": 0.42337002540220153,
"grad_norm": 2.1464998722076416,
"learning_rate": 4.758057507313987e-06,
"logits/chosen": -3.024229049682617,
"logits/rejected": -3.034865617752075,
"logps/chosen": -0.8055688142776489,
"logps/rejected": -1.001123070716858,
"loss": 0.869,
"odds_ratio_loss": 0.6340595483779907,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08055686950683594,
"rewards/margins": 0.019555436447262764,
"rewards/rejected": -0.10011231899261475,
"sft_loss": 0.8055688142776489,
"step": 250
},
{
"epoch": 0.4403048264182896,
"grad_norm": 3.3724582195281982,
"learning_rate": 4.73865818843936e-06,
"logits/chosen": -3.042086124420166,
"logits/rejected": -3.0638880729675293,
"logps/chosen": -0.912920355796814,
"logps/rejected": -1.1694436073303223,
"loss": 0.9795,
"odds_ratio_loss": 0.6657846570014954,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.09129203855991364,
"rewards/margins": 0.025652330368757248,
"rewards/rejected": -0.11694437265396118,
"sft_loss": 0.912920355796814,
"step": 260
},
{
"epoch": 0.4572396274343776,
"grad_norm": 1.6793408393859863,
"learning_rate": 4.718553561788339e-06,
"logits/chosen": -3.0312306880950928,
"logits/rejected": -3.060295820236206,
"logps/chosen": -0.836656928062439,
"logps/rejected": -0.986899733543396,
"loss": 0.9016,
"odds_ratio_loss": 0.6491778492927551,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.08366570621728897,
"rewards/margins": 0.015024276450276375,
"rewards/rejected": -0.0986899808049202,
"sft_loss": 0.836656928062439,
"step": 270
},
{
"epoch": 0.4741744284504657,
"grad_norm": 2.6274185180664062,
"learning_rate": 4.697749961489822e-06,
"logits/chosen": -3.061577558517456,
"logits/rejected": -3.0702619552612305,
"logps/chosen": -0.8848905563354492,
"logps/rejected": -1.0645692348480225,
"loss": 0.9531,
"odds_ratio_loss": 0.6824666857719421,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.08848904818296432,
"rewards/margins": 0.017967868596315384,
"rewards/rejected": -0.10645692050457001,
"sft_loss": 0.8848905563354492,
"step": 280
},
{
"epoch": 0.4911092294665538,
"grad_norm": 2.089768886566162,
"learning_rate": 4.67625394189013e-06,
"logits/chosen": -3.0883841514587402,
"logits/rejected": -3.090940475463867,
"logps/chosen": -0.7822630405426025,
"logps/rejected": -1.0061594247817993,
"loss": 0.8425,
"odds_ratio_loss": 0.6027771830558777,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07822629809379578,
"rewards/margins": 0.022389648482203484,
"rewards/rejected": -0.100615955889225,
"sft_loss": 0.7822630405426025,
"step": 290
},
{
"epoch": 0.5080440304826418,
"grad_norm": 2.2402989864349365,
"learning_rate": 4.654072275488016e-06,
"logits/chosen": -3.1045384407043457,
"logits/rejected": -3.1042561531066895,
"logps/chosen": -0.7530822157859802,
"logps/rejected": -0.9303094744682312,
"loss": 0.8152,
"odds_ratio_loss": 0.621616542339325,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07530822604894638,
"rewards/margins": 0.017722725868225098,
"rewards/rejected": -0.09303095191717148,
"sft_loss": 0.7530822157859802,
"step": 300
},
{
"epoch": 0.5249788314987299,
"grad_norm": 7.859750747680664,
"learning_rate": 4.631211950800925e-06,
"logits/chosen": -3.0885701179504395,
"logits/rejected": -3.1314892768859863,
"logps/chosen": -0.8193107843399048,
"logps/rejected": -1.0114855766296387,
"loss": 0.8855,
"odds_ratio_loss": 0.6622810959815979,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08193108439445496,
"rewards/margins": 0.01921747997403145,
"rewards/rejected": -0.1011485680937767,
"sft_loss": 0.8193107843399048,
"step": 310
},
{
"epoch": 0.541913632514818,
"grad_norm": 3.1328227519989014,
"learning_rate": 4.6076801701632095e-06,
"logits/chosen": -3.087184190750122,
"logits/rejected": -3.1436970233917236,
"logps/chosen": -0.801365852355957,
"logps/rejected": -0.9046095609664917,
"loss": 0.8686,
"odds_ratio_loss": 0.6719350814819336,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08013658970594406,
"rewards/margins": 0.010324367322027683,
"rewards/rejected": -0.09046096354722977,
"sft_loss": 0.801365852355957,
"step": 320
},
{
"epoch": 0.558848433530906,
"grad_norm": 2.4766950607299805,
"learning_rate": 4.583484347456972e-06,
"logits/chosen": -3.0817341804504395,
"logits/rejected": -3.114269256591797,
"logps/chosen": -0.8467103242874146,
"logps/rejected": -0.9794108271598816,
"loss": 0.9136,
"odds_ratio_loss": 0.6689561009407043,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.0846710279583931,
"rewards/margins": 0.013270048424601555,
"rewards/rejected": -0.0979410782456398,
"sft_loss": 0.8467103242874146,
"step": 330
},
{
"epoch": 0.5757832345469941,
"grad_norm": 1.4625582695007324,
"learning_rate": 4.55863210577626e-06,
"logits/chosen": -3.123152256011963,
"logits/rejected": -3.1361846923828125,
"logps/chosen": -0.8682034611701965,
"logps/rejected": -1.090419888496399,
"loss": 0.9349,
"odds_ratio_loss": 0.6673767566680908,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08682034909725189,
"rewards/margins": 0.022221634164452553,
"rewards/rejected": -0.10904198884963989,
"sft_loss": 0.8682034611701965,
"step": 340
},
{
"epoch": 0.5927180355630821,
"grad_norm": 1.6330158710479736,
"learning_rate": 4.5331312750253465e-06,
"logits/chosen": -3.079641103744507,
"logits/rejected": -3.1221771240234375,
"logps/chosen": -0.7776955962181091,
"logps/rejected": -0.978277325630188,
"loss": 0.8441,
"odds_ratio_loss": 0.6638715267181396,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.07776956260204315,
"rewards/margins": 0.02005816623568535,
"rewards/rejected": -0.0978277251124382,
"sft_loss": 0.7776955962181091,
"step": 350
},
{
"epoch": 0.6096528365791702,
"grad_norm": 1.9444369077682495,
"learning_rate": 4.506989889451858e-06,
"logits/chosen": -3.1103367805480957,
"logits/rejected": -3.1434414386749268,
"logps/chosen": -0.8445068597793579,
"logps/rejected": -0.9869591593742371,
"loss": 0.9108,
"odds_ratio_loss": 0.662727952003479,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.08445067703723907,
"rewards/margins": 0.014245236292481422,
"rewards/rejected": -0.09869591891765594,
"sft_loss": 0.8445068597793579,
"step": 360
},
{
"epoch": 0.6265876375952583,
"grad_norm": 2.251150369644165,
"learning_rate": 4.480216185115512e-06,
"logits/chosen": -3.0998404026031494,
"logits/rejected": -3.087653398513794,
"logps/chosen": -0.8066253662109375,
"logps/rejected": -0.9696222543716431,
"loss": 0.8704,
"odds_ratio_loss": 0.6379208564758301,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08066253364086151,
"rewards/margins": 0.016299689188599586,
"rewards/rejected": -0.09696222841739655,
"sft_loss": 0.8066253662109375,
"step": 370
},
{
"epoch": 0.6435224386113463,
"grad_norm": 1.8241935968399048,
"learning_rate": 4.4528185972932856e-06,
"logits/chosen": -3.019221782684326,
"logits/rejected": -3.083693265914917,
"logps/chosen": -0.8054075241088867,
"logps/rejected": -1.0394740104675293,
"loss": 0.8728,
"odds_ratio_loss": 0.6744040846824646,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08054076135158539,
"rewards/margins": 0.023406637832522392,
"rewards/rejected": -0.10394741594791412,
"sft_loss": 0.8054075241088867,
"step": 380
},
{
"epoch": 0.6604572396274344,
"grad_norm": 7.428930759429932,
"learning_rate": 4.424805757821803e-06,
"logits/chosen": -3.0259501934051514,
"logits/rejected": -3.0692214965820312,
"logps/chosen": -0.8603572845458984,
"logps/rejected": -0.9688261151313782,
"loss": 0.9297,
"odds_ratio_loss": 0.6930958032608032,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.08603573590517044,
"rewards/margins": 0.010846875607967377,
"rewards/rejected": -0.09688261151313782,
"sft_loss": 0.8603572845458984,
"step": 390
},
{
"epoch": 0.6773920406435224,
"grad_norm": 1.7724531888961792,
"learning_rate": 4.396186492377812e-06,
"logits/chosen": -3.049534797668457,
"logits/rejected": -3.0946240425109863,
"logps/chosen": -0.804740309715271,
"logps/rejected": -1.0287699699401855,
"loss": 0.8638,
"odds_ratio_loss": 0.5903798341751099,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08047403395175934,
"rewards/margins": 0.02240295708179474,
"rewards/rejected": -0.10287699848413467,
"sft_loss": 0.804740309715271,
"step": 400
},
{
"epoch": 0.6943268416596104,
"grad_norm": 2.2618014812469482,
"learning_rate": 4.366969817697578e-06,
"logits/chosen": -3.0235114097595215,
"logits/rejected": -3.083573341369629,
"logps/chosen": -0.7958801984786987,
"logps/rejected": -0.9750314950942993,
"loss": 0.8637,
"odds_ratio_loss": 0.6784581542015076,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.07958801835775375,
"rewards/margins": 0.01791512593626976,
"rewards/rejected": -0.09750314056873322,
"sft_loss": 0.7958801984786987,
"step": 410
},
{
"epoch": 0.7112616426756986,
"grad_norm": 2.504657506942749,
"learning_rate": 4.337164938736086e-06,
"logits/chosen": -3.049118757247925,
"logits/rejected": -3.05595326423645,
"logps/chosen": -0.8237883448600769,
"logps/rejected": -0.918091893196106,
"loss": 0.8939,
"odds_ratio_loss": 0.7009984254837036,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.08237884193658829,
"rewards/margins": 0.00943034328520298,
"rewards/rejected": -0.09180918335914612,
"sft_loss": 0.8237883448600769,
"step": 420
},
{
"epoch": 0.7281964436917866,
"grad_norm": 1.2601932287216187,
"learning_rate": 4.306781245766945e-06,
"logits/chosen": -3.0380876064300537,
"logits/rejected": -3.0915472507476807,
"logps/chosen": -0.7913134098052979,
"logps/rejected": -1.0263410806655884,
"loss": 0.8569,
"odds_ratio_loss": 0.6553653478622437,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.0791313424706459,
"rewards/margins": 0.023502767086029053,
"rewards/rejected": -0.10263410955667496,
"sft_loss": 0.7913134098052979,
"step": 430
},
{
"epoch": 0.7451312447078747,
"grad_norm": 4.184472560882568,
"learning_rate": 4.275828311423903e-06,
"logits/chosen": -3.093045473098755,
"logits/rejected": -3.094749927520752,
"logps/chosen": -0.8900951147079468,
"logps/rejected": -0.9482099413871765,
"loss": 0.9611,
"odds_ratio_loss": 0.7105128765106201,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.08900952339172363,
"rewards/margins": 0.005811482667922974,
"rewards/rejected": -0.09482099860906601,
"sft_loss": 0.8900951147079468,
"step": 440
},
{
"epoch": 0.7620660457239627,
"grad_norm": 5.296163082122803,
"learning_rate": 4.244315887684912e-06,
"logits/chosen": -3.063938617706299,
"logits/rejected": -3.0689697265625,
"logps/chosen": -0.7619583010673523,
"logps/rejected": -0.9321343302726746,
"loss": 0.8277,
"odds_ratio_loss": 0.6576007008552551,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07619582116603851,
"rewards/margins": 0.017017606645822525,
"rewards/rejected": -0.09321344643831253,
"sft_loss": 0.7619583010673523,
"step": 450
},
{
"epoch": 0.7790008467400508,
"grad_norm": 1.0935202836990356,
"learning_rate": 4.212253902799685e-06,
"logits/chosen": -3.107138156890869,
"logits/rejected": -3.1228718757629395,
"logps/chosen": -0.828906238079071,
"logps/rejected": -1.0854886770248413,
"loss": 0.8936,
"odds_ratio_loss": 0.6471681594848633,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.08289062976837158,
"rewards/margins": 0.02565823495388031,
"rewards/rejected": -0.10854886472225189,
"sft_loss": 0.828906238079071,
"step": 460
},
{
"epoch": 0.7959356477561389,
"grad_norm": 2.112194538116455,
"learning_rate": 4.179652458161718e-06,
"logits/chosen": -3.0884511470794678,
"logits/rejected": -3.086930751800537,
"logps/chosen": -0.8770621418952942,
"logps/rejected": -0.9670013189315796,
"loss": 0.9476,
"odds_ratio_loss": 0.7056951522827148,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.08770622313022614,
"rewards/margins": 0.008993919007480145,
"rewards/rejected": -0.09670013934373856,
"sft_loss": 0.8770621418952942,
"step": 470
},
{
"epoch": 0.8128704487722269,
"grad_norm": 1.4803324937820435,
"learning_rate": 4.146521825125765e-06,
"logits/chosen": -3.1123909950256348,
"logits/rejected": -3.127878189086914,
"logps/chosen": -0.8552261590957642,
"logps/rejected": -0.9841393232345581,
"loss": 0.9226,
"odds_ratio_loss": 0.6738197803497314,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.0855226144194603,
"rewards/margins": 0.012891319580376148,
"rewards/rejected": -0.09841393679380417,
"sft_loss": 0.8552261590957642,
"step": 480
},
{
"epoch": 0.8298052497883149,
"grad_norm": 1.639560580253601,
"learning_rate": 4.11287244177176e-06,
"logits/chosen": -3.1404106616973877,
"logits/rejected": -3.1268608570098877,
"logps/chosen": -0.7970486283302307,
"logps/rejected": -1.0359351634979248,
"loss": 0.8576,
"odds_ratio_loss": 0.6055651307106018,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07970486581325531,
"rewards/margins": 0.02388865128159523,
"rewards/rejected": -0.10359351336956024,
"sft_loss": 0.7970486283302307,
"step": 490
},
{
"epoch": 0.8467400508044031,
"grad_norm": 7.098996162414551,
"learning_rate": 4.078714909616215e-06,
"logits/chosen": -3.1334495544433594,
"logits/rejected": -3.127417802810669,
"logps/chosen": -0.8159440755844116,
"logps/rejected": -1.052442193031311,
"loss": 0.8758,
"odds_ratio_loss": 0.5985492467880249,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.08159441500902176,
"rewards/margins": 0.02364981174468994,
"rewards/rejected": -0.10524420440196991,
"sft_loss": 0.8159440755844116,
"step": 500
},
{
"epoch": 0.8467400508044031,
"eval_logits/chosen": -3.1088695526123047,
"eval_logits/rejected": -3.1276028156280518,
"eval_logps/chosen": -0.8053962588310242,
"eval_logps/rejected": -1.0085769891738892,
"eval_loss": 0.8691067099571228,
"eval_odds_ratio_loss": 0.6371051073074341,
"eval_rewards/accuracies": 0.5704761743545532,
"eval_rewards/chosen": -0.08053962886333466,
"eval_rewards/margins": 0.02031807415187359,
"eval_rewards/rejected": -0.1008576974272728,
"eval_runtime": 194.1916,
"eval_samples_per_second": 5.407,
"eval_sft_loss": 0.8053962588310242,
"eval_steps_per_second": 2.704,
"step": 500
},
{
"epoch": 0.8636748518204911,
"grad_norm": 5.1792216300964355,
"learning_rate": 4.044059990272125e-06,
"logits/chosen": -3.1320395469665527,
"logits/rejected": -3.1584267616271973,
"logps/chosen": -0.8421246409416199,
"logps/rejected": -1.076226830482483,
"loss": 0.9067,
"odds_ratio_loss": 0.64554762840271,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08421246707439423,
"rewards/margins": 0.02341020107269287,
"rewards/rejected": -0.10762268304824829,
"sft_loss": 0.8421246409416199,
"step": 510
},
{
"epoch": 0.8806096528365792,
"grad_norm": 2.52073073387146,
"learning_rate": 4.0089186020584345e-06,
"logits/chosen": -3.1114721298217773,
"logits/rejected": -3.1454081535339355,
"logps/chosen": -0.8953489065170288,
"logps/rejected": -1.0218431949615479,
"loss": 0.9597,
"odds_ratio_loss": 0.6433413028717041,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.08953489363193512,
"rewards/margins": 0.012649421580135822,
"rewards/rejected": -0.10218431800603867,
"sft_loss": 0.8953489065170288,
"step": 520
},
{
"epoch": 0.8975444538526672,
"grad_norm": 3.2529776096343994,
"learning_rate": 3.973301816560124e-06,
"logits/chosen": -3.1164424419403076,
"logits/rejected": -3.0959460735321045,
"logps/chosen": -0.8098430633544922,
"logps/rejected": -1.0107625722885132,
"loss": 0.871,
"odds_ratio_loss": 0.6116595268249512,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08098430931568146,
"rewards/margins": 0.0200919471681118,
"rewards/rejected": -0.10107626020908356,
"sft_loss": 0.8098430633544922,
"step": 530
},
{
"epoch": 0.9144792548687553,
"grad_norm": 2.4143612384796143,
"learning_rate": 3.937220855140021e-06,
"logits/chosen": -3.1310832500457764,
"logits/rejected": -3.1453144550323486,
"logps/chosen": -0.8188160061836243,
"logps/rejected": -0.9051122665405273,
"loss": 0.8891,
"odds_ratio_loss": 0.7029509544372559,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.08188159763813019,
"rewards/margins": 0.008629636839032173,
"rewards/rejected": -0.09051123261451721,
"sft_loss": 0.8188160061836243,
"step": 540
},
{
"epoch": 0.9314140558848434,
"grad_norm": 1.3889896869659424,
"learning_rate": 3.900687085403418e-06,
"logits/chosen": -3.1416361331939697,
"logits/rejected": -3.169236421585083,
"logps/chosen": -0.8118529319763184,
"logps/rejected": -0.8874411582946777,
"loss": 0.8817,
"odds_ratio_loss": 0.6982238292694092,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.08118529617786407,
"rewards/margins": 0.007558824960142374,
"rewards/rejected": -0.08874412626028061,
"sft_loss": 0.8118529319763184,
"step": 550
},
{
"epoch": 0.9483488569009314,
"grad_norm": 2.055143356323242,
"learning_rate": 3.863712017616614e-06,
"logits/chosen": -3.131472110748291,
"logits/rejected": -3.160679340362549,
"logps/chosen": -0.8101698160171509,
"logps/rejected": -0.9761570692062378,
"loss": 0.8764,
"odds_ratio_loss": 0.6618725657463074,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08101697266101837,
"rewards/margins": 0.016598742455244064,
"rewards/rejected": -0.09761571884155273,
"sft_loss": 0.8101698160171509,
"step": 560
},
{
"epoch": 0.9652836579170194,
"grad_norm": 14.660100936889648,
"learning_rate": 3.826307301080504e-06,
"logits/chosen": -3.0291128158569336,
"logits/rejected": -3.0481247901916504,
"logps/chosen": -0.8100768327713013,
"logps/rejected": -1.162418007850647,
"loss": 0.8764,
"odds_ratio_loss": 0.6627860069274902,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08100768178701401,
"rewards/margins": 0.03523411601781845,
"rewards/rejected": -0.11624179035425186,
"sft_loss": 0.8100768327713013,
"step": 570
},
{
"epoch": 0.9822184589331076,
"grad_norm": 2.563514471054077,
"learning_rate": 3.7884847204603775e-06,
"logits/chosen": -3.1155200004577637,
"logits/rejected": -3.15079665184021,
"logps/chosen": -0.8116699457168579,
"logps/rejected": -0.9339970350265503,
"loss": 0.8812,
"odds_ratio_loss": 0.6949950456619263,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.08116699010133743,
"rewards/margins": 0.012232715263962746,
"rewards/rejected": -0.09339970350265503,
"sft_loss": 0.8116699457168579,
"step": 580
},
{
"epoch": 0.9991532599491956,
"grad_norm": 2.7657859325408936,
"learning_rate": 3.750256192073058e-06,
"logits/chosen": -3.172693967819214,
"logits/rejected": -3.1799886226654053,
"logps/chosen": -0.9269296526908875,
"logps/rejected": -0.9760646820068359,
"loss": 1.0009,
"odds_ratio_loss": 0.7394477128982544,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.09269297122955322,
"rewards/margins": 0.004913498647511005,
"rewards/rejected": -0.09760646522045135,
"sft_loss": 0.9269296526908875,
"step": 590
},
{
"epoch": 1.0160880609652836,
"grad_norm": 7.485799789428711,
"learning_rate": 3.7116337601325715e-06,
"logits/chosen": -3.1055843830108643,
"logits/rejected": -3.129669189453125,
"logps/chosen": -0.7523837089538574,
"logps/rejected": -0.8988542556762695,
"loss": 0.8166,
"odds_ratio_loss": 0.6426426768302917,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.07523836940526962,
"rewards/margins": 0.014647054485976696,
"rewards/rejected": -0.08988542854785919,
"sft_loss": 0.7523837089538574,
"step": 600
},
{
"epoch": 1.0330228619813717,
"grad_norm": 2.2944157123565674,
"learning_rate": 3.6726295929555154e-06,
"logits/chosen": -3.077573776245117,
"logits/rejected": -3.11928391456604,
"logps/chosen": -0.7445582747459412,
"logps/rejected": -0.9014002680778503,
"loss": 0.8117,
"odds_ratio_loss": 0.6712278723716736,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.07445583492517471,
"rewards/margins": 0.015684202313423157,
"rewards/rejected": -0.09014002978801727,
"sft_loss": 0.7445582747459412,
"step": 610
},
{
"epoch": 1.0499576629974599,
"grad_norm": 2.4759469032287598,
"learning_rate": 3.6332559791273307e-06,
"logits/chosen": -3.070753812789917,
"logits/rejected": -3.133881092071533,
"logps/chosen": -0.7787492871284485,
"logps/rejected": -0.9435569643974304,
"loss": 0.843,
"odds_ratio_loss": 0.6423131823539734,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07787492126226425,
"rewards/margins": 0.016480756923556328,
"rewards/rejected": -0.09435568749904633,
"sft_loss": 0.7787492871284485,
"step": 620
},
{
"epoch": 1.0668924640135478,
"grad_norm": 2.397857666015625,
"learning_rate": 3.593525323630681e-06,
"logits/chosen": -3.0695629119873047,
"logits/rejected": -3.0961527824401855,
"logps/chosen": -0.8072765469551086,
"logps/rejected": -0.9367998838424683,
"loss": 0.8715,
"odds_ratio_loss": 0.6417396664619446,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.08072765171527863,
"rewards/margins": 0.012952342629432678,
"rewards/rejected": -0.0936800017952919,
"sft_loss": 0.8072765469551086,
"step": 630
},
{
"epoch": 1.083827265029636,
"grad_norm": 2.2752127647399902,
"learning_rate": 3.5534501439371615e-06,
"logits/chosen": -3.1115057468414307,
"logits/rejected": -3.1277289390563965,
"logps/chosen": -0.7920553088188171,
"logps/rejected": -0.9753093719482422,
"loss": 0.8548,
"odds_ratio_loss": 0.6271573901176453,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07920553535223007,
"rewards/margins": 0.018325407058000565,
"rewards/rejected": -0.09753094613552094,
"sft_loss": 0.7920553088188171,
"step": 640
},
{
"epoch": 1.100762066045724,
"grad_norm": 1.311848521232605,
"learning_rate": 3.5130430660635633e-06,
"logits/chosen": -3.1020348072052,
"logits/rejected": -3.1430366039276123,
"logps/chosen": -0.8032411336898804,
"logps/rejected": -0.9853434562683105,
"loss": 0.864,
"odds_ratio_loss": 0.6071646809577942,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.08032412827014923,
"rewards/margins": 0.018210221081972122,
"rewards/rejected": -0.09853433817625046,
"sft_loss": 0.8032411336898804,
"step": 650
},
{
"epoch": 1.117696867061812,
"grad_norm": 1.6412379741668701,
"learning_rate": 3.4723168205939444e-06,
"logits/chosen": -3.0930168628692627,
"logits/rejected": -3.1322126388549805,
"logps/chosen": -0.7815112471580505,
"logps/rejected": -0.9042918086051941,
"loss": 0.8483,
"odds_ratio_loss": 0.6679799556732178,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.07815112918615341,
"rewards/margins": 0.012278061360120773,
"rewards/rejected": -0.09042918682098389,
"sft_loss": 0.7815112471580505,
"step": 660
},
{
"epoch": 1.1346316680779,
"grad_norm": 2.1591804027557373,
"learning_rate": 3.431284238668754e-06,
"logits/chosen": -3.062398910522461,
"logits/rejected": -3.1151247024536133,
"logps/chosen": -0.8405235409736633,
"logps/rejected": -1.0043154954910278,
"loss": 0.9059,
"odds_ratio_loss": 0.6538293957710266,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08405234664678574,
"rewards/margins": 0.01637919992208481,
"rewards/rejected": -0.10043156147003174,
"sft_loss": 0.8405235409736633,
"step": 670
},
{
"epoch": 1.1515664690939882,
"grad_norm": 4.811310291290283,
"learning_rate": 3.389958247942274e-06,
"logits/chosen": -3.057506561279297,
"logits/rejected": -3.1269962787628174,
"logps/chosen": -0.8411120176315308,
"logps/rejected": -1.1102626323699951,
"loss": 0.9091,
"odds_ratio_loss": 0.6803519129753113,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.08411119878292084,
"rewards/margins": 0.026915064081549644,
"rewards/rejected": -0.11102626472711563,
"sft_loss": 0.8411120176315308,
"step": 680
},
{
"epoch": 1.168501270110076,
"grad_norm": 2.754120111465454,
"learning_rate": 3.3483518685096588e-06,
"logits/chosen": -3.08880615234375,
"logits/rejected": -3.109083890914917,
"logps/chosen": -0.8396803140640259,
"logps/rejected": -0.9775651693344116,
"loss": 0.9066,
"odds_ratio_loss": 0.6687448620796204,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08396803587675095,
"rewards/margins": 0.013788496144115925,
"rewards/rejected": -0.097756527364254,
"sft_loss": 0.8396803140640259,
"step": 690
},
{
"epoch": 1.1854360711261642,
"grad_norm": 1.9642765522003174,
"learning_rate": 3.306478208804839e-06,
"logits/chosen": -3.0659680366516113,
"logits/rejected": -3.1026992797851562,
"logps/chosen": -0.7790535092353821,
"logps/rejected": -0.9482936859130859,
"loss": 0.8451,
"odds_ratio_loss": 0.659988284111023,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07790535688400269,
"rewards/margins": 0.016924021765589714,
"rewards/rejected": -0.09482936561107635,
"sft_loss": 0.7790535092353821,
"step": 700
},
{
"epoch": 1.2023708721422524,
"grad_norm": 2.6194725036621094,
"learning_rate": 3.264350461470608e-06,
"logits/chosen": -3.0373668670654297,
"logits/rejected": -3.0688533782958984,
"logps/chosen": -0.7216525077819824,
"logps/rejected": -1.0178143978118896,
"loss": 0.7814,
"odds_ratio_loss": 0.597113311290741,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07216525077819824,
"rewards/margins": 0.02961619198322296,
"rewards/rejected": -0.1017814427614212,
"sft_loss": 0.7216525077819824,
"step": 710
},
{
"epoch": 1.2193056731583405,
"grad_norm": 2.312389373779297,
"learning_rate": 3.2219818992021685e-06,
"logits/chosen": -3.03488826751709,
"logits/rejected": -3.087043285369873,
"logps/chosen": -0.7246071696281433,
"logps/rejected": -1.047911286354065,
"loss": 0.7828,
"odds_ratio_loss": 0.5823417901992798,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07246071845293045,
"rewards/margins": 0.03233041241765022,
"rewards/rejected": -0.10479112714529037,
"sft_loss": 0.7246071696281433,
"step": 720
},
{
"epoch": 1.2362404741744284,
"grad_norm": 7.234289169311523,
"learning_rate": 3.1793858705654595e-06,
"logits/chosen": -3.0948994159698486,
"logits/rejected": -3.1080453395843506,
"logps/chosen": -0.7130419611930847,
"logps/rejected": -0.9181682467460632,
"loss": 0.7733,
"odds_ratio_loss": 0.6023129820823669,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07130420207977295,
"rewards/margins": 0.020512625575065613,
"rewards/rejected": -0.09181682765483856,
"sft_loss": 0.7130419611930847,
"step": 730
},
{
"epoch": 1.2531752751905165,
"grad_norm": 2.101661443710327,
"learning_rate": 3.1365757957915787e-06,
"logits/chosen": -3.0874876976013184,
"logits/rejected": -3.123832941055298,
"logps/chosen": -0.8178297877311707,
"logps/rejected": -0.9593319892883301,
"loss": 0.8832,
"odds_ratio_loss": 0.6534532308578491,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.0817829817533493,
"rewards/margins": 0.01415021438151598,
"rewards/rejected": -0.09593319892883301,
"sft_loss": 0.8178297877311707,
"step": 740
},
{
"epoch": 1.2701100762066047,
"grad_norm": 3.2394182682037354,
"learning_rate": 3.093565162548633e-06,
"logits/chosen": -3.0456290245056152,
"logits/rejected": -3.0783424377441406,
"logps/chosen": -0.8610566854476929,
"logps/rejected": -1.0578378438949585,
"loss": 0.9282,
"odds_ratio_loss": 0.6714938879013062,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.08610567450523376,
"rewards/margins": 0.019678115844726562,
"rewards/rejected": -0.10578378289937973,
"sft_loss": 0.8610566854476929,
"step": 750
},
{
"epoch": 1.2870448772226926,
"grad_norm": 3.4462602138519287,
"learning_rate": 3.0503675216923294e-06,
"logits/chosen": -3.112204074859619,
"logits/rejected": -3.1115825176239014,
"logps/chosen": -0.7481369376182556,
"logps/rejected": -0.9443623423576355,
"loss": 0.8093,
"odds_ratio_loss": 0.6117558479309082,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07481369376182556,
"rewards/margins": 0.019622545689344406,
"rewards/rejected": -0.09443624317646027,
"sft_loss": 0.7481369376182556,
"step": 760
},
{
"epoch": 1.3039796782387807,
"grad_norm": 2.5497653484344482,
"learning_rate": 3.0069964829966748e-06,
"logits/chosen": -3.083761692047119,
"logits/rejected": -3.1253674030303955,
"logps/chosen": -0.7777606248855591,
"logps/rejected": -0.9011049270629883,
"loss": 0.844,
"odds_ratio_loss": 0.6621277928352356,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.07777605950832367,
"rewards/margins": 0.01233443059027195,
"rewards/rejected": -0.09011048078536987,
"sft_loss": 0.7777606248855591,
"step": 770
},
{
"epoch": 1.3209144792548688,
"grad_norm": 2.1115994453430176,
"learning_rate": 2.963465710866094e-06,
"logits/chosen": -3.098053455352783,
"logits/rejected": -3.1173479557037354,
"logps/chosen": -0.7589127421379089,
"logps/rejected": -1.0739343166351318,
"loss": 0.8183,
"odds_ratio_loss": 0.5940018892288208,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07589127868413925,
"rewards/margins": 0.03150214999914169,
"rewards/rejected": -0.10739342123270035,
"sft_loss": 0.7589127421379089,
"step": 780
},
{
"epoch": 1.337849280270957,
"grad_norm": 7.320650577545166,
"learning_rate": 2.919788920030357e-06,
"logits/chosen": -3.143812894821167,
"logits/rejected": -3.1576976776123047,
"logps/chosen": -0.8158448934555054,
"logps/rejected": -0.9654959440231323,
"loss": 0.8826,
"odds_ratio_loss": 0.6670835614204407,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08158449828624725,
"rewards/margins": 0.014965096488595009,
"rewards/rejected": -0.09654959291219711,
"sft_loss": 0.8158448934555054,
"step": 790
},
{
"epoch": 1.3547840812870449,
"grad_norm": 1.3826831579208374,
"learning_rate": 2.8759798712236303e-06,
"logits/chosen": -3.1377153396606445,
"logits/rejected": -3.138549566268921,
"logps/chosen": -0.7528073191642761,
"logps/rejected": -1.0746774673461914,
"loss": 0.8173,
"odds_ratio_loss": 0.6448089480400085,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07528072595596313,
"rewards/margins": 0.032187022268772125,
"rewards/rejected": -0.10746775567531586,
"sft_loss": 0.7528073191642761,
"step": 800
},
{
"epoch": 1.371718882303133,
"grad_norm": 2.1327664852142334,
"learning_rate": 2.8320523668490507e-06,
"logits/chosen": -3.095376968383789,
"logits/rejected": -3.129220485687256,
"logps/chosen": -0.8166864514350891,
"logps/rejected": -0.9957429766654968,
"loss": 0.8841,
"odds_ratio_loss": 0.6738199591636658,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08166865259408951,
"rewards/margins": 0.01790565252304077,
"rewards/rejected": -0.09957430511713028,
"sft_loss": 0.8166864514350891,
"step": 810
},
{
"epoch": 1.388653683319221,
"grad_norm": 2.5101046562194824,
"learning_rate": 2.7880202466301597e-06,
"logits/chosen": -3.075023651123047,
"logits/rejected": -3.1098039150238037,
"logps/chosen": -0.8058909177780151,
"logps/rejected": -0.9401804208755493,
"loss": 0.8735,
"odds_ratio_loss": 0.6758350133895874,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.08058909326791763,
"rewards/margins": 0.013428950682282448,
"rewards/rejected": -0.09401804953813553,
"sft_loss": 0.8058909177780151,
"step": 820
},
{
"epoch": 1.405588484335309,
"grad_norm": 2.202185869216919,
"learning_rate": 2.7438973832505854e-06,
"logits/chosen": -3.060824155807495,
"logits/rejected": -3.0545971393585205,
"logps/chosen": -0.7594717741012573,
"logps/rejected": -0.9759091138839722,
"loss": 0.8235,
"odds_ratio_loss": 0.6401799321174622,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07594718039035797,
"rewards/margins": 0.021643735468387604,
"rewards/rejected": -0.09759090840816498,
"sft_loss": 0.7594717741012573,
"step": 830
},
{
"epoch": 1.4225232853513972,
"grad_norm": 5.100254058837891,
"learning_rate": 2.699697677983341e-06,
"logits/chosen": -3.1682240962982178,
"logits/rejected": -3.182861804962158,
"logps/chosen": -0.8106738924980164,
"logps/rejected": -0.8703418970108032,
"loss": 0.8831,
"odds_ratio_loss": 0.7241007685661316,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.08106739819049835,
"rewards/margins": 0.0059667956084012985,
"rewards/rejected": -0.0870341882109642,
"sft_loss": 0.8106738924980164,
"step": 840
},
{
"epoch": 1.4394580863674853,
"grad_norm": 7.667366981506348,
"learning_rate": 2.6554350563111115e-06,
"logits/chosen": -3.0851314067840576,
"logits/rejected": -3.116276502609253,
"logps/chosen": -0.8307350873947144,
"logps/rejected": -0.9147516489028931,
"loss": 0.9054,
"odds_ratio_loss": 0.7461589574813843,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.08307350426912308,
"rewards/margins": 0.0084016602486372,
"rewards/rejected": -0.09147517383098602,
"sft_loss": 0.8307350873947144,
"step": 850
},
{
"epoch": 1.4563928873835732,
"grad_norm": 6.640262126922607,
"learning_rate": 2.611123463538913e-06,
"logits/chosen": -3.0749902725219727,
"logits/rejected": -3.0911591053009033,
"logps/chosen": -0.7142345309257507,
"logps/rejected": -0.9272225499153137,
"loss": 0.7775,
"odds_ratio_loss": 0.6324297189712524,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07142344862222672,
"rewards/margins": 0.02129879966378212,
"rewards/rejected": -0.09272225201129913,
"sft_loss": 0.7142345309257507,
"step": 860
},
{
"epoch": 1.4733276883996613,
"grad_norm": 8.2167329788208,
"learning_rate": 2.566776860400514e-06,
"logits/chosen": -3.1082987785339355,
"logits/rejected": -3.1257712841033936,
"logps/chosen": -0.8350755572319031,
"logps/rejected": -1.056910753250122,
"loss": 0.898,
"odds_ratio_loss": 0.6290403604507446,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08350756019353867,
"rewards/margins": 0.022183528169989586,
"rewards/rejected": -0.10569107532501221,
"sft_loss": 0.8350755572319031,
"step": 870
},
{
"epoch": 1.4902624894157492,
"grad_norm": 1.4808557033538818,
"learning_rate": 2.522409218659989e-06,
"logits/chosen": -3.143266201019287,
"logits/rejected": -3.170633316040039,
"logps/chosen": -0.7854605913162231,
"logps/rejected": -0.9068530797958374,
"loss": 0.8512,
"odds_ratio_loss": 0.6573347449302673,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.07854606211185455,
"rewards/margins": 0.012139257043600082,
"rewards/rejected": -0.09068530797958374,
"sft_loss": 0.7854605913162231,
"step": 880
},
{
"epoch": 1.5071972904318374,
"grad_norm": 3.6185755729675293,
"learning_rate": 2.4780345167097976e-06,
"logits/chosen": -3.0972650051116943,
"logits/rejected": -3.0835044384002686,
"logps/chosen": -0.7852433919906616,
"logps/rejected": -1.0855656862258911,
"loss": 0.8459,
"odds_ratio_loss": 0.6065645813941956,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.07852433621883392,
"rewards/margins": 0.030032237991690636,
"rewards/rejected": -0.1085565835237503,
"sft_loss": 0.7852433919906616,
"step": 890
},
{
"epoch": 1.5241320914479255,
"grad_norm": 4.639472961425781,
"learning_rate": 2.4336667351667747e-06,
"logits/chosen": -3.114197015762329,
"logits/rejected": -3.124145269393921,
"logps/chosen": -0.8203206062316895,
"logps/rejected": -1.0489810705184937,
"loss": 0.8792,
"odds_ratio_loss": 0.5887311697006226,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.08203206956386566,
"rewards/margins": 0.022866051644086838,
"rewards/rejected": -0.1048981174826622,
"sft_loss": 0.8203206062316895,
"step": 900
},
{
"epoch": 1.5410668924640136,
"grad_norm": 3.153228282928467,
"learning_rate": 2.3893198524674264e-06,
"logits/chosen": -3.086516857147217,
"logits/rejected": -3.104675769805908,
"logps/chosen": -0.7846948504447937,
"logps/rejected": -0.9825633764266968,
"loss": 0.8478,
"odds_ratio_loss": 0.6309666037559509,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07846949249505997,
"rewards/margins": 0.019786860793828964,
"rewards/rejected": -0.09825634956359863,
"sft_loss": 0.7846948504447937,
"step": 910
},
{
"epoch": 1.5580016934801018,
"grad_norm": 3.532456874847412,
"learning_rate": 2.345007840463904e-06,
"logits/chosen": -3.0608856678009033,
"logits/rejected": -3.0965914726257324,
"logps/chosen": -0.8089407682418823,
"logps/rejected": -0.9399329423904419,
"loss": 0.8759,
"odds_ratio_loss": 0.6692665815353394,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.08089407533407211,
"rewards/margins": 0.013099217787384987,
"rewards/rejected": -0.09399329125881195,
"sft_loss": 0.8089407682418823,
"step": 920
},
{
"epoch": 1.5749364944961897,
"grad_norm": 3.778723955154419,
"learning_rate": 2.3007446600220572e-06,
"logits/chosen": -3.122668504714966,
"logits/rejected": -3.1052489280700684,
"logps/chosen": -0.799084484577179,
"logps/rejected": -0.9807415008544922,
"loss": 0.8639,
"odds_ratio_loss": 0.6483135223388672,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07990844547748566,
"rewards/margins": 0.018165703862905502,
"rewards/rejected": -0.09807415306568146,
"sft_loss": 0.799084484577179,
"step": 930
},
{
"epoch": 1.5918712955122776,
"grad_norm": 3.475341558456421,
"learning_rate": 2.2565442566229507e-06,
"logits/chosen": -3.0740559101104736,
"logits/rejected": -3.106633424758911,
"logps/chosen": -0.7921947240829468,
"logps/rejected": -0.8962046504020691,
"loss": 0.862,
"odds_ratio_loss": 0.6985131502151489,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.07921947538852692,
"rewards/margins": 0.01040099561214447,
"rewards/rejected": -0.08962046355009079,
"sft_loss": 0.7921947240829468,
"step": 940
},
{
"epoch": 1.6088060965283657,
"grad_norm": 2.1668930053710938,
"learning_rate": 2.2124205559692195e-06,
"logits/chosen": -3.064317226409912,
"logits/rejected": -3.1167845726013184,
"logps/chosen": -0.7817971706390381,
"logps/rejected": -1.0080362558364868,
"loss": 0.8399,
"odds_ratio_loss": 0.5807359218597412,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0781797245144844,
"rewards/margins": 0.02262391336262226,
"rewards/rejected": -0.10080362856388092,
"sft_loss": 0.7817971706390381,
"step": 950
},
{
"epoch": 1.6257408975444538,
"grad_norm": 1.8939071893692017,
"learning_rate": 2.168387459597666e-06,
"logits/chosen": -3.092590808868408,
"logits/rejected": -3.138596773147583,
"logps/chosen": -0.7956336140632629,
"logps/rejected": -1.0347424745559692,
"loss": 0.8565,
"odds_ratio_loss": 0.6086810827255249,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07956336438655853,
"rewards/margins": 0.02391088381409645,
"rewards/rejected": -0.10347424447536469,
"sft_loss": 0.7956336140632629,
"step": 960
},
{
"epoch": 1.642675698560542,
"grad_norm": 2.1293139457702637,
"learning_rate": 2.1244588404994648e-06,
"logits/chosen": -3.0621867179870605,
"logits/rejected": -3.0614800453186035,
"logps/chosen": -0.7734932899475098,
"logps/rejected": -0.9287079572677612,
"loss": 0.8408,
"odds_ratio_loss": 0.6727171540260315,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07734932750463486,
"rewards/margins": 0.015521461144089699,
"rewards/rejected": -0.0928707867860794,
"sft_loss": 0.7734932899475098,
"step": 970
},
{
"epoch": 1.65961049957663,
"grad_norm": 5.466368675231934,
"learning_rate": 2.08064853874936e-06,
"logits/chosen": -3.0892724990844727,
"logits/rejected": -3.1403141021728516,
"logps/chosen": -0.8142075538635254,
"logps/rejected": -0.9884878396987915,
"loss": 0.8786,
"odds_ratio_loss": 0.6434410810470581,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.08142076432704926,
"rewards/margins": 0.017428018152713776,
"rewards/rejected": -0.09884877502918243,
"sft_loss": 0.8142075538635254,
"step": 980
},
{
"epoch": 1.676545300592718,
"grad_norm": 12.251886367797852,
"learning_rate": 2.0369703571452387e-06,
"logits/chosen": -3.055989980697632,
"logits/rejected": -3.0450901985168457,
"logps/chosen": -0.716150164604187,
"logps/rejected": -0.9780189394950867,
"loss": 0.7743,
"odds_ratio_loss": 0.5815138816833496,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.07161502540111542,
"rewards/margins": 0.026186879724264145,
"rewards/rejected": -0.09780190140008926,
"sft_loss": 0.716150164604187,
"step": 990
},
{
"epoch": 1.6934801016088061,
"grad_norm": 3.6831836700439453,
"learning_rate": 1.993438056859441e-06,
"logits/chosen": -3.1155529022216797,
"logits/rejected": -3.1060287952423096,
"logps/chosen": -0.7484423518180847,
"logps/rejected": -0.929480254650116,
"loss": 0.8098,
"odds_ratio_loss": 0.6138982772827148,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07484424859285355,
"rewards/margins": 0.018103793263435364,
"rewards/rejected": -0.09294802695512772,
"sft_loss": 0.7484423518180847,
"step": 1000
},
{
"epoch": 1.6934801016088061,
"eval_logits/chosen": -3.0966453552246094,
"eval_logits/rejected": -3.117032051086426,
"eval_logps/chosen": -0.7911127805709839,
"eval_logps/rejected": -0.9985377192497253,
"eval_loss": 0.8548597693443298,
"eval_odds_ratio_loss": 0.6374707221984863,
"eval_rewards/accuracies": 0.5676190257072449,
"eval_rewards/chosen": -0.07911127805709839,
"eval_rewards/margins": 0.020742492750287056,
"eval_rewards/rejected": -0.0998537689447403,
"eval_runtime": 195.0995,
"eval_samples_per_second": 5.382,
"eval_sft_loss": 0.7911127805709839,
"eval_steps_per_second": 2.691,
"step": 1000
},
{
"epoch": 1.710414902624894,
"grad_norm": 2.5681636333465576,
"learning_rate": 1.9500653531031917e-06,
"logits/chosen": -3.116891384124756,
"logits/rejected": -3.1181325912475586,
"logps/chosen": -0.8085691332817078,
"logps/rejected": -1.0325000286102295,
"loss": 0.8768,
"odds_ratio_loss": 0.6825646162033081,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08085691183805466,
"rewards/margins": 0.022393101826310158,
"rewards/rejected": -0.10325001180171967,
"sft_loss": 0.8085691332817078,
"step": 1010
},
{
"epoch": 1.7273497036409822,
"grad_norm": 2.5795137882232666,
"learning_rate": 1.9068659108055117e-06,
"logits/chosen": -3.1321749687194824,
"logits/rejected": -3.155836582183838,
"logps/chosen": -0.7755477428436279,
"logps/rejected": -0.9410096406936646,
"loss": 0.8406,
"odds_ratio_loss": 0.6507992744445801,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07755477726459503,
"rewards/margins": 0.016546186059713364,
"rewards/rejected": -0.0941009670495987,
"sft_loss": 0.7755477428436279,
"step": 1020
},
{
"epoch": 1.7442845046570703,
"grad_norm": 2.360201835632324,
"learning_rate": 1.863853340307962e-06,
"logits/chosen": -3.078691005706787,
"logits/rejected": -3.1044058799743652,
"logps/chosen": -0.683570146560669,
"logps/rejected": -0.9945551753044128,
"loss": 0.7414,
"odds_ratio_loss": 0.5782071352005005,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.06835701316595078,
"rewards/margins": 0.03109849989414215,
"rewards/rejected": -0.09945552051067352,
"sft_loss": 0.683570146560669,
"step": 1030
},
{
"epoch": 1.7612193056731584,
"grad_norm": 2.180799961090088,
"learning_rate": 1.8210411930766019e-06,
"logits/chosen": -3.0688931941986084,
"logits/rejected": -3.096926212310791,
"logps/chosen": -0.7589991688728333,
"logps/rejected": -0.9845136404037476,
"loss": 0.8196,
"odds_ratio_loss": 0.6064754128456116,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07589991390705109,
"rewards/margins": 0.02255145087838173,
"rewards/rejected": -0.09845136106014252,
"sft_loss": 0.7589991688728333,
"step": 1040
},
{
"epoch": 1.7781541066892466,
"grad_norm": 2.4373152256011963,
"learning_rate": 1.7784429574324803e-06,
"logits/chosen": -3.079857349395752,
"logits/rejected": -3.1024343967437744,
"logps/chosen": -0.738146960735321,
"logps/rejected": -0.9651134610176086,
"loss": 0.7991,
"odds_ratio_loss": 0.6093234419822693,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07381470501422882,
"rewards/margins": 0.022696642205119133,
"rewards/rejected": -0.0965113490819931,
"sft_loss": 0.738146960735321,
"step": 1050
},
{
"epoch": 1.7950889077053345,
"grad_norm": 1.4168922901153564,
"learning_rate": 1.7360720543020327e-06,
"logits/chosen": -3.158327102661133,
"logits/rejected": -3.1482691764831543,
"logps/chosen": -0.7429002523422241,
"logps/rejected": -0.9451411366462708,
"loss": 0.8021,
"odds_ratio_loss": 0.5917203426361084,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07429002225399017,
"rewards/margins": 0.020224085077643394,
"rewards/rejected": -0.09451410919427872,
"sft_loss": 0.7429002523422241,
"step": 1060
},
{
"epoch": 1.8120237087214224,
"grad_norm": 7.384251594543457,
"learning_rate": 1.6939418329887042e-06,
"logits/chosen": -3.1223366260528564,
"logits/rejected": -3.157349109649658,
"logps/chosen": -0.7720141410827637,
"logps/rejected": -1.004219651222229,
"loss": 0.8344,
"odds_ratio_loss": 0.623358428478241,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07720141112804413,
"rewards/margins": 0.023220548406243324,
"rewards/rejected": -0.1004219651222229,
"sft_loss": 0.7720141410827637,
"step": 1070
},
{
"epoch": 1.8289585097375105,
"grad_norm": 1.976076602935791,
"learning_rate": 1.6520655669671467e-06,
"logits/chosen": -3.151508331298828,
"logits/rejected": -3.174203395843506,
"logps/chosen": -0.7854975461959839,
"logps/rejected": -0.9663504362106323,
"loss": 0.8499,
"odds_ratio_loss": 0.6441539525985718,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.07854975759983063,
"rewards/margins": 0.01808529533445835,
"rewards/rejected": -0.09663505107164383,
"sft_loss": 0.7854975461959839,
"step": 1080
},
{
"epoch": 1.8458933107535986,
"grad_norm": 2.3342175483703613,
"learning_rate": 1.610456449701294e-06,
"logits/chosen": -3.110039234161377,
"logits/rejected": -3.1346383094787598,
"logps/chosen": -0.7955976724624634,
"logps/rejected": -1.0117195844650269,
"loss": 0.8634,
"odds_ratio_loss": 0.6780521273612976,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07955978065729141,
"rewards/margins": 0.02161218598484993,
"rewards/rejected": -0.10117195546627045,
"sft_loss": 0.7955976724624634,
"step": 1090
},
{
"epoch": 1.8628281117696868,
"grad_norm": 2.040372133255005,
"learning_rate": 1.5691275904876545e-06,
"logits/chosen": -3.11810302734375,
"logits/rejected": -3.0907464027404785,
"logps/chosen": -0.8171418905258179,
"logps/rejected": -1.049744963645935,
"loss": 0.8794,
"odds_ratio_loss": 0.6221813559532166,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.0817141979932785,
"rewards/margins": 0.02326030097901821,
"rewards/rejected": -0.10497449338436127,
"sft_loss": 0.8171418905258179,
"step": 1100
},
{
"epoch": 1.879762912785775,
"grad_norm": 1.1363812685012817,
"learning_rate": 1.5280920103251235e-06,
"logits/chosen": -3.121904134750366,
"logits/rejected": -3.1400184631347656,
"logps/chosen": -0.7571579217910767,
"logps/rejected": -0.9578372240066528,
"loss": 0.8213,
"odds_ratio_loss": 0.6416117548942566,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.07571578770875931,
"rewards/margins": 0.020067930221557617,
"rewards/rejected": -0.09578372538089752,
"sft_loss": 0.7571579217910767,
"step": 1110
},
{
"epoch": 1.8966977138018628,
"grad_norm": 3.5236151218414307,
"learning_rate": 1.4873626378126015e-06,
"logits/chosen": -3.109051465988159,
"logits/rejected": -3.123108386993408,
"logps/chosen": -0.7997997403144836,
"logps/rejected": -1.0338528156280518,
"loss": 0.8642,
"odds_ratio_loss": 0.6441462635993958,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07997997850179672,
"rewards/margins": 0.02340531349182129,
"rewards/rejected": -0.10338529199361801,
"sft_loss": 0.7997997403144836,
"step": 1120
},
{
"epoch": 1.913632514817951,
"grad_norm": 4.680517673492432,
"learning_rate": 1.446952305075738e-06,
"logits/chosen": -3.1131389141082764,
"logits/rejected": -3.151978015899658,
"logps/chosen": -0.7748031616210938,
"logps/rejected": -0.8985496759414673,
"loss": 0.8402,
"odds_ratio_loss": 0.6542429327964783,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07748032361268997,
"rewards/margins": 0.012374645099043846,
"rewards/rejected": -0.08985497057437897,
"sft_loss": 0.7748031616210938,
"step": 1130
},
{
"epoch": 1.9305673158340388,
"grad_norm": 1.990546703338623,
"learning_rate": 1.406873743724065e-06,
"logits/chosen": -3.134593963623047,
"logits/rejected": -3.088059186935425,
"logps/chosen": -0.8194819688796997,
"logps/rejected": -1.0895836353302002,
"loss": 0.8842,
"odds_ratio_loss": 0.6472653746604919,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.08194819837808609,
"rewards/margins": 0.02701016701757908,
"rewards/rejected": -0.10895836353302002,
"sft_loss": 0.8194819688796997,
"step": 1140
},
{
"epoch": 1.947502116850127,
"grad_norm": 2.0966198444366455,
"learning_rate": 1.3671395808397898e-06,
"logits/chosen": -3.1003191471099854,
"logits/rejected": -3.159738779067993,
"logps/chosen": -0.7502952218055725,
"logps/rejected": -0.8763904571533203,
"loss": 0.8142,
"odds_ratio_loss": 0.6395031213760376,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07502951472997665,
"rewards/margins": 0.012609531171619892,
"rewards/rejected": -0.08763904869556427,
"sft_loss": 0.7502952218055725,
"step": 1150
},
{
"epoch": 1.964436917866215,
"grad_norm": 6.19160795211792,
"learning_rate": 1.3277623349995418e-06,
"logits/chosen": -3.1115336418151855,
"logits/rejected": -3.1329808235168457,
"logps/chosen": -0.7726483941078186,
"logps/rejected": -0.9365378618240356,
"loss": 0.8381,
"odds_ratio_loss": 0.6545372605323792,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07726484537124634,
"rewards/margins": 0.016388945281505585,
"rewards/rejected": -0.09365378320217133,
"sft_loss": 0.7726483941078186,
"step": 1160
},
{
"epoch": 1.9813717188823032,
"grad_norm": 2.1759722232818604,
"learning_rate": 1.2887544123302781e-06,
"logits/chosen": -3.1343424320220947,
"logits/rejected": -3.145904064178467,
"logps/chosen": -0.795665442943573,
"logps/rejected": -0.9278801679611206,
"loss": 0.863,
"odds_ratio_loss": 0.6731692552566528,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.07956655323505402,
"rewards/margins": 0.01322146225720644,
"rewards/rejected": -0.09278801828622818,
"sft_loss": 0.795665442943573,
"step": 1170
},
{
"epoch": 1.9983065198983911,
"grad_norm": 2.340391159057617,
"learning_rate": 1.2501281026006393e-06,
"logits/chosen": -3.113882064819336,
"logits/rejected": -3.1493611335754395,
"logps/chosen": -0.7624078989028931,
"logps/rejected": -0.9082571268081665,
"loss": 0.8289,
"odds_ratio_loss": 0.6652835607528687,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.07624078541994095,
"rewards/margins": 0.014584928750991821,
"rewards/rejected": -0.09082571417093277,
"sft_loss": 0.7624078989028931,
"step": 1180
},
{
"epoch": 2.015241320914479,
"grad_norm": 2.1316022872924805,
"learning_rate": 1.2118955753489523e-06,
"logits/chosen": -3.1328041553497314,
"logits/rejected": -3.119529962539673,
"logps/chosen": -0.7868901491165161,
"logps/rejected": -0.9673662185668945,
"loss": 0.8514,
"odds_ratio_loss": 0.6448970437049866,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07868902385234833,
"rewards/margins": 0.018047606572508812,
"rewards/rejected": -0.09673662483692169,
"sft_loss": 0.7868901491165161,
"step": 1190
},
{
"epoch": 2.032176121930567,
"grad_norm": 2.219909906387329,
"learning_rate": 1.1740688760491189e-06,
"logits/chosen": -3.1113944053649902,
"logits/rejected": -3.1537253856658936,
"logps/chosen": -0.7722674608230591,
"logps/rejected": -1.0175268650054932,
"loss": 0.8299,
"odds_ratio_loss": 0.5762220621109009,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07722674310207367,
"rewards/margins": 0.02452593669295311,
"rewards/rejected": -0.10175268352031708,
"sft_loss": 0.7722674608230591,
"step": 1200
},
{
"epoch": 2.0491109229466553,
"grad_norm": 6.625383377075195,
"learning_rate": 1.1366599223155847e-06,
"logits/chosen": -3.092228651046753,
"logits/rejected": -3.137305498123169,
"logps/chosen": -0.7458280324935913,
"logps/rejected": -1.1138603687286377,
"loss": 0.8056,
"odds_ratio_loss": 0.5978988409042358,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07458280026912689,
"rewards/margins": 0.0368032269179821,
"rewards/rejected": -0.1113860234618187,
"sft_loss": 0.7458280324935913,
"step": 1210
},
{
"epoch": 2.0660457239627434,
"grad_norm": 3.234821081161499,
"learning_rate": 1.0996805001486067e-06,
"logits/chosen": -3.109764575958252,
"logits/rejected": -3.1344008445739746,
"logps/chosen": -0.7534822225570679,
"logps/rejected": -0.9295659065246582,
"loss": 0.8161,
"odds_ratio_loss": 0.625883162021637,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07534822076559067,
"rewards/margins": 0.017608370631933212,
"rewards/rejected": -0.09295658767223358,
"sft_loss": 0.7534822225570679,
"step": 1220
},
{
"epoch": 2.0829805249788316,
"grad_norm": 3.5898754596710205,
"learning_rate": 1.0631422602209608e-06,
"logits/chosen": -3.1376945972442627,
"logits/rejected": -3.1564555168151855,
"logps/chosen": -0.7927883863449097,
"logps/rejected": -0.9650642275810242,
"loss": 0.8569,
"odds_ratio_loss": 0.6410055160522461,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.0792788416147232,
"rewards/margins": 0.017227591946721077,
"rewards/rejected": -0.09650643169879913,
"sft_loss": 0.7927883863449097,
"step": 1230
},
{
"epoch": 2.0999153259949197,
"grad_norm": 1.6415690183639526,
"learning_rate": 1.027056714207319e-06,
"logits/chosen": -3.1497724056243896,
"logits/rejected": -3.1502339839935303,
"logps/chosen": -0.8299382925033569,
"logps/rejected": -1.0621525049209595,
"loss": 0.8939,
"odds_ratio_loss": 0.6398864984512329,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08299383521080017,
"rewards/margins": 0.023221401497721672,
"rewards/rejected": -0.10621523857116699,
"sft_loss": 0.8299382925033569,
"step": 1240
},
{
"epoch": 2.116850127011008,
"grad_norm": 2.1568691730499268,
"learning_rate": 9.914352311573838e-07,
"logits/chosen": -3.1183249950408936,
"logits/rejected": -3.130068302154541,
"logps/chosen": -0.680639386177063,
"logps/rejected": -0.9202540516853333,
"loss": 0.7394,
"odds_ratio_loss": 0.5878284573554993,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.06806393712759018,
"rewards/margins": 0.02396145835518837,
"rewards/rejected": -0.09202539175748825,
"sft_loss": 0.680639386177063,
"step": 1250
},
{
"epoch": 2.1337849280270955,
"grad_norm": 1.6225751638412476,
"learning_rate": 9.562890339139877e-07,
"logits/chosen": -3.0915961265563965,
"logits/rejected": -3.1580090522766113,
"logps/chosen": -0.7393635511398315,
"logps/rejected": -0.9581485986709595,
"loss": 0.8032,
"odds_ratio_loss": 0.6382402181625366,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07393636554479599,
"rewards/margins": 0.021878493949770927,
"rewards/rejected": -0.09581486135721207,
"sft_loss": 0.7393635511398315,
"step": 1260
},
{
"epoch": 2.1507197290431836,
"grad_norm": 1.8676691055297852,
"learning_rate": 9.216291955772374e-07,
"logits/chosen": -3.0996224880218506,
"logits/rejected": -3.153738021850586,
"logps/chosen": -0.7607396841049194,
"logps/rejected": -0.9290043115615845,
"loss": 0.8238,
"odds_ratio_loss": 0.6303194761276245,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07607396692037582,
"rewards/margins": 0.016826456412672997,
"rewards/rejected": -0.09290042519569397,
"sft_loss": 0.7607396841049194,
"step": 1270
},
{
"epoch": 2.167654530059272,
"grad_norm": 2.096705198287964,
"learning_rate": 8.874666360158457e-07,
"logits/chosen": -3.082366466522217,
"logits/rejected": -3.0917139053344727,
"logps/chosen": -0.7106717824935913,
"logps/rejected": -0.9525697827339172,
"loss": 0.774,
"odds_ratio_loss": 0.6332431435585022,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07106717675924301,
"rewards/margins": 0.024189796298742294,
"rewards/rejected": -0.0952569767832756,
"sft_loss": 0.7106717824935913,
"step": 1280
},
{
"epoch": 2.18458933107536,
"grad_norm": 2.435957431793213,
"learning_rate": 8.538121184267315e-07,
"logits/chosen": -3.1009063720703125,
"logits/rejected": -3.121425151824951,
"logps/chosen": -0.6681427955627441,
"logps/rejected": -0.9359544515609741,
"loss": 0.7277,
"odds_ratio_loss": 0.595305323600769,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.06681428104639053,
"rewards/margins": 0.026781165972352028,
"rewards/rejected": -0.09359544515609741,
"sft_loss": 0.6681427955627441,
"step": 1290
},
{
"epoch": 2.201524132091448,
"grad_norm": 2.3414995670318604,
"learning_rate": 8.206762459439907e-07,
"logits/chosen": -3.1007769107818604,
"logits/rejected": -3.142437696456909,
"logps/chosen": -0.7944619059562683,
"logps/rejected": -0.9959940910339355,
"loss": 0.8604,
"odds_ratio_loss": 0.659730076789856,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.07944619655609131,
"rewards/margins": 0.020153220742940903,
"rewards/rejected": -0.09959942102432251,
"sft_loss": 0.7944619059562683,
"step": 1300
},
{
"epoch": 2.218458933107536,
"grad_norm": 2.9054501056671143,
"learning_rate": 7.880694582982898e-07,
"logits/chosen": -3.165544271469116,
"logits/rejected": -3.1803879737854004,
"logps/chosen": -0.8287284970283508,
"logps/rejected": -1.0228365659713745,
"loss": 0.8926,
"odds_ratio_loss": 0.6389774084091187,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08287284523248672,
"rewards/margins": 0.019410807639360428,
"rewards/rejected": -0.10228364169597626,
"sft_loss": 0.8287284970283508,
"step": 1310
},
{
"epoch": 2.235393734123624,
"grad_norm": 3.0612242221832275,
"learning_rate": 7.560020285277401e-07,
"logits/chosen": -3.0883891582489014,
"logits/rejected": -3.142484426498413,
"logps/chosen": -0.7654698491096497,
"logps/rejected": -0.8691753149032593,
"loss": 0.8363,
"odds_ratio_loss": 0.708137035369873,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.07654698193073273,
"rewards/margins": 0.010370554402470589,
"rewards/rejected": -0.08691753447055817,
"sft_loss": 0.7654698491096497,
"step": 1320
},
{
"epoch": 2.252328535139712,
"grad_norm": 3.074390172958374,
"learning_rate": 7.244840597412956e-07,
"logits/chosen": -3.0840373039245605,
"logits/rejected": -3.113865613937378,
"logps/chosen": -0.7826686501502991,
"logps/rejected": -0.8837703466415405,
"loss": 0.8534,
"odds_ratio_loss": 0.7077327370643616,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.07826686650514603,
"rewards/margins": 0.010110177099704742,
"rewards/rejected": -0.08837703615427017,
"sft_loss": 0.7826686501502991,
"step": 1330
},
{
"epoch": 2.2692633361558,
"grad_norm": 6.461457252502441,
"learning_rate": 6.935254819356796e-07,
"logits/chosen": -3.1297388076782227,
"logits/rejected": -3.1470894813537598,
"logps/chosen": -0.7817297577857971,
"logps/rejected": -0.9597527384757996,
"loss": 0.8464,
"odds_ratio_loss": 0.6471723318099976,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07817298173904419,
"rewards/margins": 0.017802301794290543,
"rewards/rejected": -0.09597527980804443,
"sft_loss": 0.7817297577857971,
"step": 1340
},
{
"epoch": 2.2861981371718882,
"grad_norm": 1.7192656993865967,
"learning_rate": 6.631360488668662e-07,
"logits/chosen": -3.138521194458008,
"logits/rejected": -3.150005578994751,
"logps/chosen": -0.7190467715263367,
"logps/rejected": -0.9147864580154419,
"loss": 0.7824,
"odds_ratio_loss": 0.6330953240394592,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07190467417240143,
"rewards/margins": 0.019573967903852463,
"rewards/rejected": -0.09147863835096359,
"sft_loss": 0.7190467715263367,
"step": 1350
},
{
"epoch": 2.3031329381879764,
"grad_norm": 2.849228620529175,
"learning_rate": 6.333253349770672e-07,
"logits/chosen": -3.1443207263946533,
"logits/rejected": -3.1502537727355957,
"logps/chosen": -0.8002703785896301,
"logps/rejected": -0.9851358532905579,
"loss": 0.8656,
"odds_ratio_loss": 0.6537164449691772,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.0800270363688469,
"rewards/margins": 0.018486548215150833,
"rewards/rejected": -0.09851358830928802,
"sft_loss": 0.8002703785896301,
"step": 1360
},
{
"epoch": 2.3200677392040645,
"grad_norm": 1.3985761404037476,
"learning_rate": 6.041027323782364e-07,
"logits/chosen": -3.136557102203369,
"logits/rejected": -3.1552491188049316,
"logps/chosen": -0.7381452918052673,
"logps/rejected": -0.9497495889663696,
"loss": 0.7993,
"odds_ratio_loss": 0.611694872379303,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07381454110145569,
"rewards/margins": 0.021160420030355453,
"rewards/rejected": -0.09497495740652084,
"sft_loss": 0.7381452918052673,
"step": 1370
},
{
"epoch": 2.337002540220152,
"grad_norm": 2.8737893104553223,
"learning_rate": 5.754774478929969e-07,
"logits/chosen": -3.1531028747558594,
"logits/rejected": -3.1830787658691406,
"logps/chosen": -0.7512658834457397,
"logps/rejected": -0.9563377499580383,
"loss": 0.81,
"odds_ratio_loss": 0.5872438549995422,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07512658089399338,
"rewards/margins": 0.020507195964455605,
"rewards/rejected": -0.09563378244638443,
"sft_loss": 0.7512658834457397,
"step": 1380
},
{
"epoch": 2.3539373412362403,
"grad_norm": 3.9828193187713623,
"learning_rate": 5.474585001539634e-07,
"logits/chosen": -3.159769296646118,
"logits/rejected": -3.1827354431152344,
"logps/chosen": -0.716742753982544,
"logps/rejected": -0.9266722798347473,
"loss": 0.7742,
"odds_ratio_loss": 0.5742050409317017,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.07167427241802216,
"rewards/margins": 0.020992957055568695,
"rewards/rejected": -0.09266723692417145,
"sft_loss": 0.716742753982544,
"step": 1390
},
{
"epoch": 2.3708721422523285,
"grad_norm": 1.6741605997085571,
"learning_rate": 5.200547167623424e-07,
"logits/chosen": -3.172938823699951,
"logits/rejected": -3.1454200744628906,
"logps/chosen": -0.7799001932144165,
"logps/rejected": -1.040466070175171,
"loss": 0.8399,
"odds_ratio_loss": 0.5997284650802612,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07799001783132553,
"rewards/margins": 0.026056593284010887,
"rewards/rejected": -0.10404660552740097,
"sft_loss": 0.7799001932144165,
"step": 1400
},
{
"epoch": 2.3878069432684166,
"grad_norm": 1.3470762968063354,
"learning_rate": 4.932747315067271e-07,
"logits/chosen": -3.15238356590271,
"logits/rejected": -3.1575927734375,
"logps/chosen": -0.7618133425712585,
"logps/rejected": -1.033060073852539,
"loss": 0.8193,
"odds_ratio_loss": 0.574641764163971,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.0761813372373581,
"rewards/margins": 0.027124667540192604,
"rewards/rejected": -0.10330601036548615,
"sft_loss": 0.7618133425712585,
"step": 1410
},
{
"epoch": 2.4047417442845047,
"grad_norm": 4.0449371337890625,
"learning_rate": 4.6712698164294553e-07,
"logits/chosen": -3.1169888973236084,
"logits/rejected": -3.131412982940674,
"logps/chosen": -0.7451744079589844,
"logps/rejected": -0.9645574688911438,
"loss": 0.8029,
"odds_ratio_loss": 0.5777136087417603,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.07451744377613068,
"rewards/margins": 0.021938303485512733,
"rewards/rejected": -0.09645574539899826,
"sft_loss": 0.7451744079589844,
"step": 1420
},
{
"epoch": 2.421676545300593,
"grad_norm": 1.97295343875885,
"learning_rate": 4.41619705235842e-07,
"logits/chosen": -3.1363072395324707,
"logits/rejected": -3.1507456302642822,
"logps/chosen": -0.7545329332351685,
"logps/rejected": -1.082715392112732,
"loss": 0.8124,
"odds_ratio_loss": 0.5782482624053955,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07545328885316849,
"rewards/margins": 0.0328182615339756,
"rewards/rejected": -0.10827155411243439,
"sft_loss": 0.7545329332351685,
"step": 1430
},
{
"epoch": 2.438611346316681,
"grad_norm": 2.880293846130371,
"learning_rate": 4.167609385637961e-07,
"logits/chosen": -3.182002544403076,
"logits/rejected": -3.165118455886841,
"logps/chosen": -0.7962235808372498,
"logps/rejected": -0.975197970867157,
"loss": 0.8596,
"odds_ratio_loss": 0.634021520614624,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07962235808372498,
"rewards/margins": 0.017897438257932663,
"rewards/rejected": -0.09751980006694794,
"sft_loss": 0.7962235808372498,
"step": 1440
},
{
"epoch": 2.4555461473327687,
"grad_norm": 2.545621871948242,
"learning_rate": 3.9255851358683567e-07,
"logits/chosen": -3.1265337467193604,
"logits/rejected": -3.140825033187866,
"logps/chosen": -0.7502083778381348,
"logps/rejected": -0.9492311477661133,
"loss": 0.817,
"odds_ratio_loss": 0.6676734685897827,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.07502084225416183,
"rewards/margins": 0.019902262836694717,
"rewards/rejected": -0.09492311626672745,
"sft_loss": 0.7502083778381348,
"step": 1450
},
{
"epoch": 2.472480948348857,
"grad_norm": 2.91165828704834,
"learning_rate": 3.690200554791082e-07,
"logits/chosen": -3.093761444091797,
"logits/rejected": -3.0926525592803955,
"logps/chosen": -0.7423545122146606,
"logps/rejected": -0.9597676992416382,
"loss": 0.8001,
"odds_ratio_loss": 0.5777243375778198,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.0742354542016983,
"rewards/margins": 0.02174132689833641,
"rewards/rejected": -0.09597676992416382,
"sft_loss": 0.7423545122146606,
"step": 1460
},
{
"epoch": 2.489415749364945,
"grad_norm": 5.057043075561523,
"learning_rate": 3.461529802265079e-07,
"logits/chosen": -3.153536796569824,
"logits/rejected": -3.16294264793396,
"logps/chosen": -0.7584110498428345,
"logps/rejected": -0.9484812617301941,
"loss": 0.8201,
"odds_ratio_loss": 0.6163991093635559,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07584110647439957,
"rewards/margins": 0.01900702901184559,
"rewards/rejected": -0.0948481336236,
"sft_loss": 0.7584110498428345,
"step": 1470
},
{
"epoch": 2.506350550381033,
"grad_norm": 2.0298070907592773,
"learning_rate": 3.2396449229020883e-07,
"logits/chosen": -3.18571400642395,
"logits/rejected": -3.1676132678985596,
"logps/chosen": -0.7937291264533997,
"logps/rejected": -0.935733437538147,
"loss": 0.8599,
"odds_ratio_loss": 0.6614922881126404,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.07937291264533997,
"rewards/margins": 0.014200428500771523,
"rewards/rejected": -0.09357334673404694,
"sft_loss": 0.7937291264533997,
"step": 1480
},
{
"epoch": 2.523285351397121,
"grad_norm": 4.000706195831299,
"learning_rate": 3.024615823368371e-07,
"logits/chosen": -3.1206421852111816,
"logits/rejected": -3.152639150619507,
"logps/chosen": -0.7629178166389465,
"logps/rejected": -0.9828959703445435,
"loss": 0.8252,
"odds_ratio_loss": 0.622687041759491,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07629178464412689,
"rewards/margins": 0.021997807547450066,
"rewards/rejected": -0.09828958660364151,
"sft_loss": 0.7629178166389465,
"step": 1490
},
{
"epoch": 2.5402201524132093,
"grad_norm": 2.449856758117676,
"learning_rate": 2.8165102503600716e-07,
"logits/chosen": -3.095520496368408,
"logits/rejected": -3.1078646183013916,
"logps/chosen": -0.7517341375350952,
"logps/rejected": -0.9984272718429565,
"loss": 0.8135,
"odds_ratio_loss": 0.6172657012939453,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07517342269420624,
"rewards/margins": 0.024669310078024864,
"rewards/rejected": -0.09984272718429565,
"sft_loss": 0.7517341375350952,
"step": 1500
},
{
"epoch": 2.5402201524132093,
"eval_logits/chosen": -3.130527973175049,
"eval_logits/rejected": -3.1507296562194824,
"eval_logps/chosen": -0.7867480516433716,
"eval_logps/rejected": -0.9955620169639587,
"eval_loss": 0.8505691885948181,
"eval_odds_ratio_loss": 0.638211727142334,
"eval_rewards/accuracies": 0.5723809599876404,
"eval_rewards/chosen": -0.0786748081445694,
"eval_rewards/margins": 0.020881392061710358,
"eval_rewards/rejected": -0.09955620020627975,
"eval_runtime": 194.4899,
"eval_samples_per_second": 5.399,
"eval_sft_loss": 0.7867480516433716,
"eval_steps_per_second": 2.699,
"step": 1500
},
{
"epoch": 2.557154953429297,
"grad_norm": 1.5426675081253052,
"learning_rate": 2.615393769259039e-07,
"logits/chosen": -3.1200222969055176,
"logits/rejected": -3.151517152786255,
"logps/chosen": -0.845578670501709,
"logps/rejected": -0.9381099939346313,
"loss": 0.9194,
"odds_ratio_loss": 0.7380752563476562,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.08455787599086761,
"rewards/margins": 0.00925312377512455,
"rewards/rejected": -0.09381099790334702,
"sft_loss": 0.845578670501709,
"step": 1510
},
{
"epoch": 2.574089754445385,
"grad_norm": 4.487434387207031,
"learning_rate": 2.421329743475917e-07,
"logits/chosen": -3.1358139514923096,
"logits/rejected": -3.157161235809326,
"logps/chosen": -0.7332046627998352,
"logps/rejected": -0.9182003140449524,
"loss": 0.7981,
"odds_ratio_loss": 0.6488706469535828,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07332046329975128,
"rewards/margins": 0.018499553203582764,
"rewards/rejected": -0.09182002395391464,
"sft_loss": 0.7332046627998352,
"step": 1520
},
{
"epoch": 2.5910245554614733,
"grad_norm": 2.3630266189575195,
"learning_rate": 2.234379314486973e-07,
"logits/chosen": -3.1165411472320557,
"logits/rejected": -3.1539194583892822,
"logps/chosen": -0.8104802370071411,
"logps/rejected": -0.9215306043624878,
"loss": 0.8756,
"odds_ratio_loss": 0.6507223844528198,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08104802668094635,
"rewards/margins": 0.011105035431683064,
"rewards/rejected": -0.09215305000543594,
"sft_loss": 0.8104802370071411,
"step": 1530
},
{
"epoch": 2.6079593564775614,
"grad_norm": 1.243202805519104,
"learning_rate": 2.0546013825709783e-07,
"logits/chosen": -3.1084282398223877,
"logits/rejected": -3.129692792892456,
"logps/chosen": -0.7565353512763977,
"logps/rejected": -1.0974582433700562,
"loss": 0.8149,
"odds_ratio_loss": 0.5834510326385498,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.07565353065729141,
"rewards/margins": 0.034092292189598083,
"rewards/rejected": -0.1097458228468895,
"sft_loss": 0.7565353512763977,
"step": 1540
},
{
"epoch": 2.6248941574936495,
"grad_norm": 1.8778706789016724,
"learning_rate": 1.88205258825217e-07,
"logits/chosen": -3.0997517108917236,
"logits/rejected": -3.0937392711639404,
"logps/chosen": -0.672328770160675,
"logps/rejected": -0.9460701942443848,
"loss": 0.7295,
"odds_ratio_loss": 0.5718902349472046,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.06723286956548691,
"rewards/margins": 0.02737414464354515,
"rewards/rejected": -0.09460701793432236,
"sft_loss": 0.672328770160675,
"step": 1550
},
{
"epoch": 2.6418289585097376,
"grad_norm": 3.3697926998138428,
"learning_rate": 1.7167872944552245e-07,
"logits/chosen": -3.0951905250549316,
"logits/rejected": -3.143221378326416,
"logps/chosen": -0.6983746290206909,
"logps/rejected": -0.9046236276626587,
"loss": 0.7572,
"odds_ratio_loss": 0.5877906084060669,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.06983745843172073,
"rewards/margins": 0.020624909549951553,
"rewards/rejected": -0.09046236425638199,
"sft_loss": 0.6983746290206909,
"step": 1560
},
{
"epoch": 2.6587637595258258,
"grad_norm": 2.3217389583587646,
"learning_rate": 1.5588575693777142e-07,
"logits/chosen": -3.1411678791046143,
"logits/rejected": -3.1680665016174316,
"logps/chosen": -0.7495251297950745,
"logps/rejected": -0.8962133526802063,
"loss": 0.8135,
"odds_ratio_loss": 0.6393758058547974,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.07495252043008804,
"rewards/margins": 0.01466882973909378,
"rewards/rejected": -0.08962134271860123,
"sft_loss": 0.7495251297950745,
"step": 1570
},
{
"epoch": 2.675698560541914,
"grad_norm": 2.695345401763916,
"learning_rate": 1.4083131700856428e-07,
"logits/chosen": -3.093736410140991,
"logits/rejected": -3.154810905456543,
"logps/chosen": -0.7956855893135071,
"logps/rejected": -0.9356697201728821,
"loss": 0.8605,
"odds_ratio_loss": 0.6480044722557068,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.07956856489181519,
"rewards/margins": 0.013998406007885933,
"rewards/rejected": -0.09356696903705597,
"sft_loss": 0.7956855893135071,
"step": 1580
},
{
"epoch": 2.6926333615580016,
"grad_norm": 4.358800888061523,
"learning_rate": 1.2652015268370315e-07,
"logits/chosen": -3.150224208831787,
"logits/rejected": -3.183022975921631,
"logps/chosen": -0.7435088753700256,
"logps/rejected": -0.9757458567619324,
"loss": 0.8044,
"odds_ratio_loss": 0.6088230013847351,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07435088604688644,
"rewards/margins": 0.02322370745241642,
"rewards/rejected": -0.09757460653781891,
"sft_loss": 0.7435088753700256,
"step": 1590
},
{
"epoch": 2.7095681625740897,
"grad_norm": 1.9499818086624146,
"learning_rate": 1.1295677281386502e-07,
"logits/chosen": -3.1664371490478516,
"logits/rejected": -3.1615631580352783,
"logps/chosen": -0.8278031349182129,
"logps/rejected": -1.062387228012085,
"loss": 0.8914,
"odds_ratio_loss": 0.6362180113792419,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08278031647205353,
"rewards/margins": 0.02345840446650982,
"rewards/rejected": -0.1062387228012085,
"sft_loss": 0.8278031349182129,
"step": 1600
},
{
"epoch": 2.726502963590178,
"grad_norm": 1.277694821357727,
"learning_rate": 1.0014545065404973e-07,
"logits/chosen": -3.12839674949646,
"logits/rejected": -3.1691813468933105,
"logps/chosen": -0.8073375821113586,
"logps/rejected": -1.0353208780288696,
"loss": 0.8777,
"odds_ratio_loss": 0.7032862305641174,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0807337611913681,
"rewards/margins": 0.022798333317041397,
"rewards/rejected": -0.1035320907831192,
"sft_loss": 0.8073375821113586,
"step": 1610
},
{
"epoch": 2.743437764606266,
"grad_norm": 1.525190830230713,
"learning_rate": 8.809022251725502e-08,
"logits/chosen": -3.1842474937438965,
"logits/rejected": -3.156172752380371,
"logps/chosen": -0.7726050615310669,
"logps/rejected": -1.0311490297317505,
"loss": 0.8348,
"odds_ratio_loss": 0.6221681833267212,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07726050913333893,
"rewards/margins": 0.02585439942777157,
"rewards/rejected": -0.10311490297317505,
"sft_loss": 0.7726050615310669,
"step": 1620
},
{
"epoch": 2.7603725656223537,
"grad_norm": 2.015005111694336,
"learning_rate": 7.679488650280509e-08,
"logits/chosen": -3.192863702774048,
"logits/rejected": -3.213048219680786,
"logps/chosen": -0.7830113172531128,
"logps/rejected": -0.9780160188674927,
"loss": 0.8429,
"odds_ratio_loss": 0.5990433096885681,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07830111682415009,
"rewards/margins": 0.019500473514199257,
"rewards/rejected": -0.09780160337686539,
"sft_loss": 0.7830113172531128,
"step": 1630
},
{
"epoch": 2.777307366638442,
"grad_norm": 2.5389175415039062,
"learning_rate": 6.626300129972563e-08,
"logits/chosen": -3.122981071472168,
"logits/rejected": -3.214785099029541,
"logps/chosen": -0.7296528220176697,
"logps/rejected": -0.955167293548584,
"loss": 0.792,
"odds_ratio_loss": 0.6230596303939819,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.07296527922153473,
"rewards/margins": 0.02255145087838173,
"rewards/rejected": -0.09551674872636795,
"sft_loss": 0.7296528220176697,
"step": 1640
},
{
"epoch": 2.79424216765453,
"grad_norm": 1.7503160238265991,
"learning_rate": 5.649788506555065e-08,
"logits/chosen": -3.115635871887207,
"logits/rejected": -3.166555881500244,
"logps/chosen": -0.7383006811141968,
"logps/rejected": -0.9469397664070129,
"loss": 0.797,
"odds_ratio_loss": 0.5870878100395203,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.07383007556200027,
"rewards/margins": 0.020863894373178482,
"rewards/rejected": -0.09469397366046906,
"sft_loss": 0.7383006811141968,
"step": 1650
},
{
"epoch": 2.811176968670618,
"grad_norm": 5.490494728088379,
"learning_rate": 4.7502614380908474e-08,
"logits/chosen": -3.1543877124786377,
"logits/rejected": -3.134704113006592,
"logps/chosen": -0.7694907188415527,
"logps/rejected": -0.9553998112678528,
"loss": 0.8333,
"odds_ratio_loss": 0.6379188895225525,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07694907486438751,
"rewards/margins": 0.018590910360217094,
"rewards/rejected": -0.09553998708724976,
"sft_loss": 0.7694907188415527,
"step": 1660
},
{
"epoch": 2.828111769686706,
"grad_norm": 3.784432888031006,
"learning_rate": 3.9280023280222066e-08,
"logits/chosen": -3.1404221057891846,
"logits/rejected": -3.1622607707977295,
"logps/chosen": -0.757176399230957,
"logps/rejected": -0.9332196116447449,
"loss": 0.8222,
"odds_ratio_loss": 0.6507169008255005,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07571764290332794,
"rewards/margins": 0.017604324966669083,
"rewards/rejected": -0.09332196414470673,
"sft_loss": 0.757176399230957,
"step": 1670
},
{
"epoch": 2.8450465707027943,
"grad_norm": 2.088921546936035,
"learning_rate": 3.1832702358818855e-08,
"logits/chosen": -3.116088390350342,
"logits/rejected": -3.1639227867126465,
"logps/chosen": -0.8132543563842773,
"logps/rejected": -1.068440556526184,
"loss": 0.8739,
"odds_ratio_loss": 0.6062491536140442,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08132544159889221,
"rewards/margins": 0.025518611073493958,
"rewards/rejected": -0.10684405267238617,
"sft_loss": 0.8132543563842773,
"step": 1680
},
{
"epoch": 2.8619813717188824,
"grad_norm": 2.5282764434814453,
"learning_rate": 2.5162997956746647e-08,
"logits/chosen": -3.1500442028045654,
"logits/rejected": -3.171433210372925,
"logps/chosen": -0.7479134202003479,
"logps/rejected": -0.9974180459976196,
"loss": 0.8076,
"odds_ratio_loss": 0.596734344959259,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0747913345694542,
"rewards/margins": 0.02495047077536583,
"rewards/rejected": -0.09974180907011032,
"sft_loss": 0.7479134202003479,
"step": 1690
},
{
"epoch": 2.8789161727349706,
"grad_norm": 3.469899892807007,
"learning_rate": 1.9273011419536914e-08,
"logits/chosen": -3.140916585922241,
"logits/rejected": -3.16453218460083,
"logps/chosen": -0.7706011533737183,
"logps/rejected": -0.9274336695671082,
"loss": 0.8378,
"odds_ratio_loss": 0.6717150211334229,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07706011831760406,
"rewards/margins": 0.0156832467764616,
"rewards/rejected": -0.09274337440729141,
"sft_loss": 0.7706011533737183,
"step": 1700
},
{
"epoch": 2.8958509737510583,
"grad_norm": 1.3444654941558838,
"learning_rate": 1.4164598436159083e-08,
"logits/chosen": -3.1658730506896973,
"logits/rejected": -3.1828174591064453,
"logps/chosen": -0.7568970322608948,
"logps/rejected": -0.9151817560195923,
"loss": 0.82,
"odds_ratio_loss": 0.630837082862854,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07568971067667007,
"rewards/margins": 0.01582847535610199,
"rewards/rejected": -0.09151817858219147,
"sft_loss": 0.7568970322608948,
"step": 1710
},
{
"epoch": 2.9127857747671464,
"grad_norm": 2.7227256298065186,
"learning_rate": 9.839368454371556e-09,
"logits/chosen": -3.1114795207977295,
"logits/rejected": -3.11403751373291,
"logps/chosen": -0.7409245371818542,
"logps/rejected": -1.0069682598114014,
"loss": 0.8046,
"odds_ratio_loss": 0.6370644569396973,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07409246265888214,
"rewards/margins": 0.026604369282722473,
"rewards/rejected": -0.10069682449102402,
"sft_loss": 0.7409245371818542,
"step": 1720
},
{
"epoch": 2.9297205757832345,
"grad_norm": 1.9127988815307617,
"learning_rate": 6.298684173650649e-09,
"logits/chosen": -3.0886847972869873,
"logits/rejected": -3.123133897781372,
"logps/chosen": -0.7336040735244751,
"logps/rejected": -1.0557795763015747,
"loss": 0.7977,
"odds_ratio_loss": 0.640912652015686,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07336040586233139,
"rewards/margins": 0.03221754729747772,
"rewards/rejected": -0.10557796061038971,
"sft_loss": 0.7336040735244751,
"step": 1730
},
{
"epoch": 2.9466553767993227,
"grad_norm": 4.324622631072998,
"learning_rate": 3.543661115860686e-09,
"logits/chosen": -3.103651523590088,
"logits/rejected": -3.136838674545288,
"logps/chosen": -0.790323793888092,
"logps/rejected": -1.0052763223648071,
"loss": 0.8552,
"odds_ratio_loss": 0.6486952900886536,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07903237640857697,
"rewards/margins": 0.021495262160897255,
"rewards/rejected": -0.10052764415740967,
"sft_loss": 0.790323793888092,
"step": 1740
},
{
"epoch": 2.963590177815411,
"grad_norm": 2.1066298484802246,
"learning_rate": 1.575167273800693e-09,
"logits/chosen": -3.1322402954101562,
"logits/rejected": -3.141758680343628,
"logps/chosen": -0.7295509576797485,
"logps/rejected": -0.8775620460510254,
"loss": 0.7914,
"odds_ratio_loss": 0.6189672350883484,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07295509427785873,
"rewards/margins": 0.01480111014097929,
"rewards/rejected": -0.0877562090754509,
"sft_loss": 0.7295509576797485,
"step": 1750
},
{
"epoch": 2.9805249788314985,
"grad_norm": 2.692870855331421,
"learning_rate": 3.9382283773564676e-10,
"logits/chosen": -3.153014659881592,
"logits/rejected": -3.1698267459869385,
"logps/chosen": -0.8286467790603638,
"logps/rejected": -1.0176646709442139,
"loss": 0.8971,
"odds_ratio_loss": 0.6844185590744019,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0828646719455719,
"rewards/margins": 0.018901783972978592,
"rewards/rejected": -0.10176645219326019,
"sft_loss": 0.8286467790603638,
"step": 1760
},
{
"epoch": 2.9974597798475866,
"grad_norm": 11.2490234375,
"learning_rate": 0.0,
"logits/chosen": -3.122014284133911,
"logits/rejected": -3.142989158630371,
"logps/chosen": -0.8531309962272644,
"logps/rejected": -1.0983434915542603,
"loss": 0.9206,
"odds_ratio_loss": 0.6742203235626221,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.0853130966424942,
"rewards/margins": 0.024521255865693092,
"rewards/rejected": -0.10983435064554214,
"sft_loss": 0.8531309962272644,
"step": 1770
},
{
"epoch": 2.9974597798475866,
"step": 1770,
"total_flos": 2.0399855839629804e+18,
"train_loss": 0.8663494454938813,
"train_runtime": 17638.0542,
"train_samples_per_second": 1.607,
"train_steps_per_second": 0.1
}
],
"logging_steps": 10,
"max_steps": 1770,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 2.0399855839629804e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}