phi3-sto-iter0 / trainer_state.json
LordNoah's picture
update
9162499
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9978094194961664,
"eval_steps": 50000,
"global_step": 1216,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008214676889375685,
"grad_norm": 47.48785366410319,
"learning_rate": 4.0983606557377046e-08,
"logits/chosen": 26.403932571411133,
"logits/rejected": 25.755094528198242,
"logps/chosen": -185.5782928466797,
"logps/rejected": -79.66442108154297,
"loss": 1.7879,
"rewards/accuracies": 0.30666670203208923,
"rewards/chosen": 0.008285612799227238,
"rewards/margins": 0.017053820192813873,
"rewards/rejected": -0.008768204599618912,
"sft_loss": 0.6387583017349243,
"step": 5
},
{
"epoch": 0.01642935377875137,
"grad_norm": 36.134481992571715,
"learning_rate": 8.196721311475409e-08,
"logits/chosen": 25.775484085083008,
"logits/rejected": 25.31159210205078,
"logps/chosen": -152.4672088623047,
"logps/rejected": -72.757080078125,
"loss": 1.6789,
"rewards/accuracies": 0.7333334684371948,
"rewards/chosen": -0.026889141649007797,
"rewards/margins": 0.14848218858242035,
"rewards/rejected": -0.17537136375904083,
"sft_loss": 0.6469724774360657,
"step": 10
},
{
"epoch": 0.024644030668127054,
"grad_norm": 19.978551205164187,
"learning_rate": 1.2295081967213113e-07,
"logits/chosen": 26.670787811279297,
"logits/rejected": 26.257781982421875,
"logps/chosen": -176.73304748535156,
"logps/rejected": -84.2028579711914,
"loss": 1.4459,
"rewards/accuracies": 0.8666666746139526,
"rewards/chosen": -0.1812039315700531,
"rewards/margins": 0.5640282034873962,
"rewards/rejected": -0.7452322244644165,
"sft_loss": 0.6364741921424866,
"step": 15
},
{
"epoch": 0.03285870755750274,
"grad_norm": 20.48799482343835,
"learning_rate": 1.6393442622950818e-07,
"logits/chosen": 26.263166427612305,
"logits/rejected": 26.03022003173828,
"logps/chosen": -214.57823181152344,
"logps/rejected": -111.45527648925781,
"loss": 1.316,
"rewards/accuracies": 0.9200000166893005,
"rewards/chosen": -0.642541766166687,
"rewards/margins": 1.2724699974060059,
"rewards/rejected": -1.9150116443634033,
"sft_loss": 0.7241686582565308,
"step": 20
},
{
"epoch": 0.04107338444687842,
"grad_norm": 24.43893120773317,
"learning_rate": 2.0491803278688524e-07,
"logits/chosen": 25.63840103149414,
"logits/rejected": 25.88968849182129,
"logps/chosen": -180.67430114746094,
"logps/rejected": -108.99486541748047,
"loss": 1.26,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -0.9794896245002747,
"rewards/margins": 1.706125020980835,
"rewards/rejected": -2.6856143474578857,
"sft_loss": 0.7140628695487976,
"step": 25
},
{
"epoch": 0.04928806133625411,
"grad_norm": 15.575922163206743,
"learning_rate": 2.4590163934426226e-07,
"logits/chosen": 25.174482345581055,
"logits/rejected": 25.23969841003418,
"logps/chosen": -213.48123168945312,
"logps/rejected": -114.4116439819336,
"loss": 1.1511,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -0.9717932343482971,
"rewards/margins": 2.3237061500549316,
"rewards/rejected": -3.295499563217163,
"sft_loss": 0.6879211664199829,
"step": 30
},
{
"epoch": 0.05750273822562979,
"grad_norm": 12.317269413176323,
"learning_rate": 2.868852459016393e-07,
"logits/chosen": 24.615764617919922,
"logits/rejected": 24.808069229125977,
"logps/chosen": -202.15489196777344,
"logps/rejected": -124.00420379638672,
"loss": 1.0435,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -1.0643211603164673,
"rewards/margins": 2.588770627975464,
"rewards/rejected": -3.6530916690826416,
"sft_loss": 0.7430208325386047,
"step": 35
},
{
"epoch": 0.06571741511500548,
"grad_norm": 11.98913328054039,
"learning_rate": 3.2786885245901637e-07,
"logits/chosen": 24.245140075683594,
"logits/rejected": 24.268098831176758,
"logps/chosen": -207.348876953125,
"logps/rejected": -116.2168960571289,
"loss": 0.9343,
"rewards/accuracies": 0.9333333969116211,
"rewards/chosen": -1.0519558191299438,
"rewards/margins": 2.5677475929260254,
"rewards/rejected": -3.619703531265259,
"sft_loss": 0.7124413251876831,
"step": 40
},
{
"epoch": 0.07393209200438117,
"grad_norm": 11.849547311712948,
"learning_rate": 3.6885245901639347e-07,
"logits/chosen": 22.61182403564453,
"logits/rejected": 22.616382598876953,
"logps/chosen": -222.93838500976562,
"logps/rejected": -123.43074798583984,
"loss": 0.8683,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -1.442903995513916,
"rewards/margins": 2.7517101764678955,
"rewards/rejected": -4.194613456726074,
"sft_loss": 0.702341616153717,
"step": 45
},
{
"epoch": 0.08214676889375684,
"grad_norm": 11.859772629239353,
"learning_rate": 4.0983606557377047e-07,
"logits/chosen": 20.62839126586914,
"logits/rejected": 20.336801528930664,
"logps/chosen": -241.59852600097656,
"logps/rejected": -132.82681274414062,
"loss": 0.7963,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -2.2198777198791504,
"rewards/margins": 3.0024592876434326,
"rewards/rejected": -5.2223358154296875,
"sft_loss": 0.7061720490455627,
"step": 50
},
{
"epoch": 0.09036144578313253,
"grad_norm": 9.406942779681957,
"learning_rate": 4.508196721311475e-07,
"logits/chosen": 19.715351104736328,
"logits/rejected": 20.35331153869629,
"logps/chosen": -208.7209930419922,
"logps/rejected": -150.72914123535156,
"loss": 0.8148,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -2.6436386108398438,
"rewards/margins": 3.832695722579956,
"rewards/rejected": -6.476334571838379,
"sft_loss": 0.7786983251571655,
"step": 55
},
{
"epoch": 0.09857612267250822,
"grad_norm": 10.934185099116656,
"learning_rate": 4.918032786885245e-07,
"logits/chosen": 20.9300537109375,
"logits/rejected": 21.388505935668945,
"logps/chosen": -192.5828399658203,
"logps/rejected": -125.1326904296875,
"loss": 0.8114,
"rewards/accuracies": 0.9200000166893005,
"rewards/chosen": -2.3978421688079834,
"rewards/margins": 3.1992502212524414,
"rewards/rejected": -5.597092628479004,
"sft_loss": 0.698898434638977,
"step": 60
},
{
"epoch": 0.10679079956188389,
"grad_norm": 12.361759850691927,
"learning_rate": 4.999852034151641e-07,
"logits/chosen": 19.11568832397461,
"logits/rejected": 19.857196807861328,
"logps/chosen": -242.90460205078125,
"logps/rejected": -149.67938232421875,
"loss": 0.7666,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -2.5540611743927,
"rewards/margins": 3.71470308303833,
"rewards/rejected": -6.268764019012451,
"sft_loss": 0.7993389368057251,
"step": 65
},
{
"epoch": 0.11500547645125958,
"grad_norm": 14.492049698010485,
"learning_rate": 4.999250952911133e-07,
"logits/chosen": 20.96298599243164,
"logits/rejected": 20.906280517578125,
"logps/chosen": -236.47763061523438,
"logps/rejected": -142.59445190429688,
"loss": 0.6927,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -2.4225642681121826,
"rewards/margins": 4.021193981170654,
"rewards/rejected": -6.4437575340271,
"sft_loss": 0.8038942217826843,
"step": 70
},
{
"epoch": 0.12322015334063527,
"grad_norm": 17.551745284718073,
"learning_rate": 4.998187619501184e-07,
"logits/chosen": 20.637529373168945,
"logits/rejected": 21.148029327392578,
"logps/chosen": -266.9391784667969,
"logps/rejected": -173.1654510498047,
"loss": 0.6651,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -3.129103660583496,
"rewards/margins": 5.091865062713623,
"rewards/rejected": -8.220968246459961,
"sft_loss": 0.8789225816726685,
"step": 75
},
{
"epoch": 0.13143483023001096,
"grad_norm": 21.266109357356587,
"learning_rate": 4.996662230591989e-07,
"logits/chosen": 18.540781021118164,
"logits/rejected": 19.185565948486328,
"logps/chosen": -252.1251983642578,
"logps/rejected": -169.13851928710938,
"loss": 0.706,
"rewards/accuracies": 0.9333333969116211,
"rewards/chosen": -3.4190313816070557,
"rewards/margins": 4.7408881187438965,
"rewards/rejected": -8.159918785095215,
"sft_loss": 0.8200284242630005,
"step": 80
},
{
"epoch": 0.13964950711938665,
"grad_norm": 14.68921798619268,
"learning_rate": 4.994675068313813e-07,
"logits/chosen": 17.844524383544922,
"logits/rejected": 19.307209014892578,
"logps/chosen": -235.93295288085938,
"logps/rejected": -164.65467834472656,
"loss": 0.6425,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -3.202953577041626,
"rewards/margins": 4.453563213348389,
"rewards/rejected": -7.656517028808594,
"sft_loss": 0.8084096908569336,
"step": 85
},
{
"epoch": 0.14786418400876233,
"grad_norm": 9.391954380287526,
"learning_rate": 4.992226500204806e-07,
"logits/chosen": 18.810604095458984,
"logits/rejected": 19.509326934814453,
"logps/chosen": -239.79638671875,
"logps/rejected": -149.21372985839844,
"loss": 0.6741,
"rewards/accuracies": 0.9466666579246521,
"rewards/chosen": -2.8668696880340576,
"rewards/margins": 4.099938869476318,
"rewards/rejected": -6.966808795928955,
"sft_loss": 0.8505186438560486,
"step": 90
},
{
"epoch": 0.156078860898138,
"grad_norm": 8.292078325061183,
"learning_rate": 4.989316979143029e-07,
"logits/chosen": 19.036439895629883,
"logits/rejected": 18.50504493713379,
"logps/chosen": -243.55430603027344,
"logps/rejected": -141.56640625,
"loss": 0.7786,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -2.8055150508880615,
"rewards/margins": 4.023467540740967,
"rewards/rejected": -6.828982830047607,
"sft_loss": 0.8537193536758423,
"step": 95
},
{
"epoch": 0.16429353778751368,
"grad_norm": 11.759496127210191,
"learning_rate": 4.985947043262686e-07,
"logits/chosen": 18.438268661499023,
"logits/rejected": 18.92384147644043,
"logps/chosen": -256.82135009765625,
"logps/rejected": -162.3760223388672,
"loss": 0.656,
"rewards/accuracies": 0.9200000166893005,
"rewards/chosen": -3.251594066619873,
"rewards/margins": 4.7661452293396,
"rewards/rejected": -8.017740249633789,
"sft_loss": 0.8523219227790833,
"step": 100
},
{
"epoch": 0.17250821467688937,
"grad_norm": 13.225983298656475,
"learning_rate": 4.982117315854593e-07,
"logits/chosen": 19.018491744995117,
"logits/rejected": 19.4432373046875,
"logps/chosen": -242.88742065429688,
"logps/rejected": -160.6437225341797,
"loss": 0.6173,
"rewards/accuracies": 0.9466666579246521,
"rewards/chosen": -3.371706962585449,
"rewards/margins": 4.9150261878967285,
"rewards/rejected": -8.286733627319336,
"sft_loss": 0.8633176684379578,
"step": 105
},
{
"epoch": 0.18072289156626506,
"grad_norm": 33.60275949690272,
"learning_rate": 4.977828505250903e-07,
"logits/chosen": 18.26275062561035,
"logits/rejected": 18.561012268066406,
"logps/chosen": -232.76333618164062,
"logps/rejected": -153.5156707763672,
"loss": 0.6725,
"rewards/accuracies": 0.9466666579246521,
"rewards/chosen": -3.7783102989196777,
"rewards/margins": 4.282144069671631,
"rewards/rejected": -8.060454368591309,
"sft_loss": 0.8514001369476318,
"step": 110
},
{
"epoch": 0.18893756845564075,
"grad_norm": 29.7322754671122,
"learning_rate": 4.973081404694087e-07,
"logits/chosen": 17.40985679626465,
"logits/rejected": 18.532135009765625,
"logps/chosen": -263.5098571777344,
"logps/rejected": -179.07461547851562,
"loss": 0.6416,
"rewards/accuracies": 0.9599999785423279,
"rewards/chosen": -4.028925895690918,
"rewards/margins": 5.305994033813477,
"rewards/rejected": -9.334918975830078,
"sft_loss": 0.9138454794883728,
"step": 115
},
{
"epoch": 0.19715224534501644,
"grad_norm": 11.64226677208211,
"learning_rate": 4.967876892190227e-07,
"logits/chosen": 18.535491943359375,
"logits/rejected": 18.528560638427734,
"logps/chosen": -261.1396484375,
"logps/rejected": -164.66261291503906,
"loss": 0.6327,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -3.7333872318267822,
"rewards/margins": 4.9736409187316895,
"rewards/rejected": -8.707027435302734,
"sft_loss": 0.8873167634010315,
"step": 120
},
{
"epoch": 0.20536692223439212,
"grad_norm": 10.419489404925384,
"learning_rate": 4.962215930346614e-07,
"logits/chosen": 18.076738357543945,
"logits/rejected": 18.797412872314453,
"logps/chosen": -240.43885803222656,
"logps/rejected": -170.57994079589844,
"loss": 0.6021,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -3.8788936138153076,
"rewards/margins": 5.037622928619385,
"rewards/rejected": -8.91651725769043,
"sft_loss": 0.8787587285041809,
"step": 125
},
{
"epoch": 0.21358159912376778,
"grad_norm": 13.538147865005195,
"learning_rate": 4.956099566193716e-07,
"logits/chosen": 17.794748306274414,
"logits/rejected": 18.117393493652344,
"logps/chosen": -263.0421447753906,
"logps/rejected": -180.68548583984375,
"loss": 0.5662,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.157100677490234,
"rewards/margins": 5.220449924468994,
"rewards/rejected": -9.377551078796387,
"sft_loss": 0.8972741961479187,
"step": 130
},
{
"epoch": 0.22179627601314347,
"grad_norm": 14.579358533691158,
"learning_rate": 4.949528930991521e-07,
"logits/chosen": 17.554058074951172,
"logits/rejected": 18.180675506591797,
"logps/chosen": -265.0473327636719,
"logps/rejected": -177.85751342773438,
"loss": 0.6399,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -3.8493740558624268,
"rewards/margins": 5.235028266906738,
"rewards/rejected": -9.084402084350586,
"sft_loss": 0.8204969167709351,
"step": 135
},
{
"epoch": 0.23001095290251916,
"grad_norm": 12.455186291161809,
"learning_rate": 4.9425052400203e-07,
"logits/chosen": 17.611921310424805,
"logits/rejected": 17.878339767456055,
"logps/chosen": -265.25787353515625,
"logps/rejected": -185.50375366210938,
"loss": 0.6103,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -4.896492958068848,
"rewards/margins": 4.968528747558594,
"rewards/rejected": -9.865021705627441,
"sft_loss": 0.8832098245620728,
"step": 140
},
{
"epoch": 0.23822562979189485,
"grad_norm": 12.455826945227212,
"learning_rate": 4.935029792355834e-07,
"logits/chosen": 17.996692657470703,
"logits/rejected": 18.594377517700195,
"logps/chosen": -286.6059875488281,
"logps/rejected": -200.05149841308594,
"loss": 0.543,
"rewards/accuracies": 0.9466666579246521,
"rewards/chosen": -5.263542175292969,
"rewards/margins": 5.605571269989014,
"rewards/rejected": -10.86911392211914,
"sft_loss": 0.8996745944023132,
"step": 145
},
{
"epoch": 0.24644030668127054,
"grad_norm": 15.362573644388778,
"learning_rate": 4.927103970629147e-07,
"logits/chosen": 18.072965621948242,
"logits/rejected": 18.25052261352539,
"logps/chosen": -269.8097839355469,
"logps/rejected": -185.32431030273438,
"loss": 0.6219,
"rewards/accuracies": 0.9333333969116211,
"rewards/chosen": -4.9397172927856445,
"rewards/margins": 5.206496715545654,
"rewards/rejected": -10.14621353149414,
"sft_loss": 0.7995728254318237,
"step": 150
},
{
"epoch": 0.2546549835706462,
"grad_norm": 11.01295684823168,
"learning_rate": 4.918729240770775e-07,
"logits/chosen": 17.353046417236328,
"logits/rejected": 18.587129592895508,
"logps/chosen": -240.89488220214844,
"logps/rejected": -173.4665069580078,
"loss": 0.5702,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -4.678676128387451,
"rewards/margins": 5.200152397155762,
"rewards/rejected": -9.878829002380371,
"sft_loss": 0.9399448037147522,
"step": 155
},
{
"epoch": 0.2628696604600219,
"grad_norm": 19.41550454155631,
"learning_rate": 4.909907151739633e-07,
"logits/chosen": 18.130189895629883,
"logits/rejected": 18.379247665405273,
"logps/chosen": -292.39990234375,
"logps/rejected": -188.62220764160156,
"loss": 0.6561,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -4.744537353515625,
"rewards/margins": 5.8243794441223145,
"rewards/rejected": -10.568917274475098,
"sft_loss": 0.8947219848632812,
"step": 160
},
{
"epoch": 0.2710843373493976,
"grad_norm": 11.498941704903908,
"learning_rate": 4.900639335236526e-07,
"logits/chosen": 18.79334259033203,
"logits/rejected": 19.24587059020996,
"logps/chosen": -271.9427185058594,
"logps/rejected": -179.41293334960938,
"loss": 0.607,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -4.425381183624268,
"rewards/margins": 5.343040943145752,
"rewards/rejected": -9.76842212677002,
"sft_loss": 0.9064626097679138,
"step": 165
},
{
"epoch": 0.2792990142387733,
"grad_norm": 10.84500529690262,
"learning_rate": 4.890927505402359e-07,
"logits/chosen": 16.892650604248047,
"logits/rejected": 17.597482681274414,
"logps/chosen": -238.55162048339844,
"logps/rejected": -170.00466918945312,
"loss": 0.5889,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -4.459685802459717,
"rewards/margins": 4.842031002044678,
"rewards/rejected": -9.301715850830078,
"sft_loss": 0.8489271402359009,
"step": 170
},
{
"epoch": 0.28751369112814895,
"grad_norm": 16.18327307978759,
"learning_rate": 4.880773458501089e-07,
"logits/chosen": 19.4614315032959,
"logits/rejected": 19.801300048828125,
"logps/chosen": -232.73573303222656,
"logps/rejected": -165.04103088378906,
"loss": 0.5662,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -4.241008758544922,
"rewards/margins": 4.859863758087158,
"rewards/rejected": -9.100872993469238,
"sft_loss": 0.8601513504981995,
"step": 175
},
{
"epoch": 0.29572836801752467,
"grad_norm": 10.178119222467004,
"learning_rate": 4.870179072587498e-07,
"logits/chosen": 17.228599548339844,
"logits/rejected": 17.30803871154785,
"logps/chosen": -250.42587280273438,
"logps/rejected": -171.54635620117188,
"loss": 0.6129,
"rewards/accuracies": 0.9333333373069763,
"rewards/chosen": -5.068057537078857,
"rewards/margins": 5.1040239334106445,
"rewards/rejected": -10.172082901000977,
"sft_loss": 0.9672516584396362,
"step": 180
},
{
"epoch": 0.30394304490690033,
"grad_norm": 8.317024863573055,
"learning_rate": 4.859146307159841e-07,
"logits/chosen": 18.039478302001953,
"logits/rejected": 18.52968406677246,
"logps/chosen": -248.23155212402344,
"logps/rejected": -179.2881317138672,
"loss": 0.5417,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -5.143929481506348,
"rewards/margins": 5.1267170906066895,
"rewards/rejected": -10.270648002624512,
"sft_loss": 0.8881379961967468,
"step": 185
},
{
"epoch": 0.312157721796276,
"grad_norm": 11.89542898588366,
"learning_rate": 4.847677202797414e-07,
"logits/chosen": 18.8001708984375,
"logits/rejected": 19.126699447631836,
"logps/chosen": -263.02789306640625,
"logps/rejected": -183.99911499023438,
"loss": 0.5551,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -5.0858845710754395,
"rewards/margins": 5.6055006980896,
"rewards/rejected": -10.691385269165039,
"sft_loss": 0.8070122599601746,
"step": 190
},
{
"epoch": 0.3203723986856517,
"grad_norm": 12.1886798786308,
"learning_rate": 4.835773880783144e-07,
"logits/chosen": 16.390464782714844,
"logits/rejected": 17.854284286499023,
"logps/chosen": -269.9723815917969,
"logps/rejected": -200.60789489746094,
"loss": 0.5446,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -5.6444902420043945,
"rewards/margins": 6.389484882354736,
"rewards/rejected": -12.033974647521973,
"sft_loss": 0.8605390191078186,
"step": 195
},
{
"epoch": 0.32858707557502737,
"grad_norm": 11.13911341274398,
"learning_rate": 4.823438542711238e-07,
"logits/chosen": 17.828205108642578,
"logits/rejected": 18.60173797607422,
"logps/chosen": -277.97259521484375,
"logps/rejected": -203.9155731201172,
"loss": 0.5444,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -5.445572853088379,
"rewards/margins": 6.231374740600586,
"rewards/rejected": -11.676946640014648,
"sft_loss": 0.9524543881416321,
"step": 200
},
{
"epoch": 0.3368017524644031,
"grad_norm": 59.69351759377879,
"learning_rate": 4.81067347007999e-07,
"logits/chosen": 18.93602752685547,
"logits/rejected": 19.728424072265625,
"logps/chosen": -247.34567260742188,
"logps/rejected": -173.0783233642578,
"loss": 0.6075,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -4.630053997039795,
"rewards/margins": 4.911181449890137,
"rewards/rejected": -9.54123592376709,
"sft_loss": 0.9002848863601685,
"step": 205
},
{
"epoch": 0.34501642935377874,
"grad_norm": 8.657351709118194,
"learning_rate": 4.797481023869801e-07,
"logits/chosen": 18.50823974609375,
"logits/rejected": 18.78363037109375,
"logps/chosen": -245.55979919433594,
"logps/rejected": -182.1737518310547,
"loss": 0.5425,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.3583760261535645,
"rewards/margins": 5.406437397003174,
"rewards/rejected": -10.764813423156738,
"sft_loss": 0.9510916471481323,
"step": 210
},
{
"epoch": 0.35323110624315446,
"grad_norm": 19.28978220217397,
"learning_rate": 4.783863644106502e-07,
"logits/chosen": 17.9003849029541,
"logits/rejected": 19.15799903869629,
"logps/chosen": -240.30958557128906,
"logps/rejected": -187.32284545898438,
"loss": 0.546,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -5.292669296264648,
"rewards/margins": 5.617033004760742,
"rewards/rejected": -10.909701347351074,
"sft_loss": 0.9965067505836487,
"step": 215
},
{
"epoch": 0.3614457831325301,
"grad_norm": 8.529326956491959,
"learning_rate": 4.769823849410053e-07,
"logits/chosen": 15.990920066833496,
"logits/rejected": 17.267040252685547,
"logps/chosen": -283.7446594238281,
"logps/rejected": -209.57525634765625,
"loss": 0.5062,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.516228675842285,
"rewards/margins": 6.538068771362305,
"rewards/rejected": -12.054296493530273,
"sft_loss": 0.9376140832901001,
"step": 220
},
{
"epoch": 0.3696604600219058,
"grad_norm": 14.616903394027977,
"learning_rate": 4.7553642365287127e-07,
"logits/chosen": 16.816274642944336,
"logits/rejected": 17.819963455200195,
"logps/chosen": -245.84878540039062,
"logps/rejected": -188.35284423828125,
"loss": 0.5832,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -5.176213264465332,
"rewards/margins": 5.302474498748779,
"rewards/rejected": -10.478687286376953,
"sft_loss": 1.0134352445602417,
"step": 225
},
{
"epoch": 0.3778751369112815,
"grad_norm": 15.650965079934865,
"learning_rate": 4.7404874798587493e-07,
"logits/chosen": 18.04664421081543,
"logits/rejected": 19.232574462890625,
"logps/chosen": -268.1763610839844,
"logps/rejected": -193.1671600341797,
"loss": 0.5248,
"rewards/accuracies": 0.9599999785423279,
"rewards/chosen": -5.173998832702637,
"rewards/margins": 5.888847827911377,
"rewards/rejected": -11.062848091125488,
"sft_loss": 0.9188562035560608,
"step": 230
},
{
"epoch": 0.38608981380065716,
"grad_norm": 9.155968536476317,
"learning_rate": 4.7251963309497965e-07,
"logits/chosen": 17.16444206237793,
"logits/rejected": 18.188404083251953,
"logps/chosen": -281.6944580078125,
"logps/rejected": -214.91883850097656,
"loss": 0.5831,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -6.146281719207764,
"rewards/margins": 6.598486423492432,
"rewards/rejected": -12.744769096374512,
"sft_loss": 1.0649549961090088,
"step": 235
},
{
"epoch": 0.39430449069003287,
"grad_norm": 13.480064050397614,
"learning_rate": 4.709493617995938e-07,
"logits/chosen": 18.09016227722168,
"logits/rejected": 18.207592010498047,
"logps/chosen": -278.3957214355469,
"logps/rejected": -195.16822814941406,
"loss": 0.4846,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -5.534964084625244,
"rewards/margins": 6.029869079589844,
"rewards/rejected": -11.564833641052246,
"sft_loss": 0.9166081547737122,
"step": 240
},
{
"epoch": 0.40251916757940853,
"grad_norm": 8.853519364453792,
"learning_rate": 4.6933822453126114e-07,
"logits/chosen": 17.334672927856445,
"logits/rejected": 18.275968551635742,
"logps/chosen": -229.73594665527344,
"logps/rejected": -182.89251708984375,
"loss": 0.5795,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -5.739324569702148,
"rewards/margins": 5.62018346786499,
"rewards/rejected": -11.35950756072998,
"sft_loss": 1.0507081747055054,
"step": 245
},
{
"epoch": 0.41073384446878425,
"grad_norm": 23.105340732527253,
"learning_rate": 4.676865192799443e-07,
"logits/chosen": 18.659299850463867,
"logits/rejected": 19.426942825317383,
"logps/chosen": -310.3028869628906,
"logps/rejected": -233.80967712402344,
"loss": 0.5041,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -7.381309509277344,
"rewards/margins": 6.830047607421875,
"rewards/rejected": -14.211358070373535,
"sft_loss": 0.9847605228424072,
"step": 250
},
{
"epoch": 0.4189485213581599,
"grad_norm": 12.714869052495171,
"learning_rate": 4.65994551538909e-07,
"logits/chosen": 17.69913101196289,
"logits/rejected": 17.626365661621094,
"logps/chosen": -286.1001892089844,
"logps/rejected": -213.40573120117188,
"loss": 0.5671,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -6.464048385620117,
"rewards/margins": 6.595151424407959,
"rewards/rejected": -13.059199333190918,
"sft_loss": 1.0706934928894043,
"step": 255
},
{
"epoch": 0.42716319824753557,
"grad_norm": 18.758504329716533,
"learning_rate": 4.642626342482215e-07,
"logits/chosen": 17.131309509277344,
"logits/rejected": 17.48920440673828,
"logps/chosen": -231.87130737304688,
"logps/rejected": -174.91970825195312,
"loss": 0.5728,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -4.785408973693848,
"rewards/margins": 5.311648368835449,
"rewards/rejected": -10.097058296203613,
"sft_loss": 0.9056914448738098,
"step": 260
},
{
"epoch": 0.4353778751369113,
"grad_norm": 16.34948060980786,
"learning_rate": 4.624910877368684e-07,
"logits/chosen": 17.2136287689209,
"logits/rejected": 18.958431243896484,
"logps/chosen": -265.6873474121094,
"logps/rejected": -200.37913513183594,
"loss": 0.5359,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -5.0484795570373535,
"rewards/margins": 6.235151767730713,
"rewards/rejected": -11.283629417419434,
"sft_loss": 0.897827684879303,
"step": 265
},
{
"epoch": 0.44359255202628695,
"grad_norm": 8.468377967197654,
"learning_rate": 4.606802396635098e-07,
"logits/chosen": 18.035551071166992,
"logits/rejected": 19.360517501831055,
"logps/chosen": -279.75555419921875,
"logps/rejected": -217.06832885742188,
"loss": 0.4866,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -6.324933052062988,
"rewards/margins": 6.823997974395752,
"rewards/rejected": -13.148929595947266,
"sft_loss": 0.9062218070030212,
"step": 270
},
{
"epoch": 0.45180722891566266,
"grad_norm": 8.67268227774844,
"learning_rate": 4.588304249558763e-07,
"logits/chosen": 17.523601531982422,
"logits/rejected": 17.99420166015625,
"logps/chosen": -290.8741760253906,
"logps/rejected": -215.23924255371094,
"loss": 0.5245,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -6.641848087310791,
"rewards/margins": 6.469297409057617,
"rewards/rejected": -13.11114501953125,
"sft_loss": 0.9921270608901978,
"step": 275
},
{
"epoch": 0.4600219058050383,
"grad_norm": 12.040362030861928,
"learning_rate": 4.569419857488228e-07,
"logits/chosen": 17.7161808013916,
"logits/rejected": 17.987571716308594,
"logps/chosen": -297.76318359375,
"logps/rejected": -205.46383666992188,
"loss": 0.5407,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -5.587214469909668,
"rewards/margins": 6.473574638366699,
"rewards/rejected": -12.060790061950684,
"sft_loss": 0.9882974028587341,
"step": 280
},
{
"epoch": 0.46823658269441404,
"grad_norm": 18.26018008195662,
"learning_rate": 4.550152713210478e-07,
"logits/chosen": 17.55337905883789,
"logits/rejected": 18.636327743530273,
"logps/chosen": -247.40650939941406,
"logps/rejected": -190.1718292236328,
"loss": 0.5136,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -5.608924865722656,
"rewards/margins": 5.688971996307373,
"rewards/rejected": -11.297897338867188,
"sft_loss": 0.9460915327072144,
"step": 285
},
{
"epoch": 0.4764512595837897,
"grad_norm": 12.950141797441761,
"learning_rate": 4.530506380304925e-07,
"logits/chosen": 16.12598419189453,
"logits/rejected": 16.963117599487305,
"logps/chosen": -315.90838623046875,
"logps/rejected": -234.9226837158203,
"loss": 0.5254,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -7.075807094573975,
"rewards/margins": 7.303346157073975,
"rewards/rejected": -14.37915325164795,
"sft_loss": 1.0791391134262085,
"step": 290
},
{
"epoch": 0.4846659364731654,
"grad_norm": 8.510727267783134,
"learning_rate": 4.510484492484301e-07,
"logits/chosen": 16.052139282226562,
"logits/rejected": 18.621992111206055,
"logps/chosen": -293.8525695800781,
"logps/rejected": -249.84762573242188,
"loss": 0.502,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.701572895050049,
"rewards/margins": 7.963890552520752,
"rewards/rejected": -15.665464401245117,
"sft_loss": 1.0291332006454468,
"step": 295
},
{
"epoch": 0.4928806133625411,
"grad_norm": 11.486611534499634,
"learning_rate": 4.4900907529225797e-07,
"logits/chosen": 15.679919242858887,
"logits/rejected": 16.096633911132812,
"logps/chosen": -295.52557373046875,
"logps/rejected": -208.35264587402344,
"loss": 0.5684,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -5.8986592292785645,
"rewards/margins": 6.808130264282227,
"rewards/rejected": -12.70678997039795,
"sft_loss": 0.9438207149505615,
"step": 300
},
{
"epoch": 0.5010952902519168,
"grad_norm": 11.462228668252441,
"learning_rate": 4.46932893357005e-07,
"logits/chosen": 17.438947677612305,
"logits/rejected": 18.582027435302734,
"logps/chosen": -282.1226501464844,
"logps/rejected": -213.70437622070312,
"loss": 0.4316,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -5.921773433685303,
"rewards/margins": 6.760589599609375,
"rewards/rejected": -12.68236255645752,
"sft_loss": 0.9546439051628113,
"step": 305
},
{
"epoch": 0.5093099671412924,
"grad_norm": 25.401018773660823,
"learning_rate": 4.448202874455672e-07,
"logits/chosen": 16.973630905151367,
"logits/rejected": 17.916053771972656,
"logps/chosen": -303.2902526855469,
"logps/rejected": -214.27862548828125,
"loss": 0.5904,
"rewards/accuracies": 0.9200000762939453,
"rewards/chosen": -6.163903713226318,
"rewards/margins": 6.488450050354004,
"rewards/rejected": -12.65235424041748,
"sft_loss": 1.065365195274353,
"step": 310
},
{
"epoch": 0.5175246440306681,
"grad_norm": 9.834689967221987,
"learning_rate": 4.426716482976838e-07,
"logits/chosen": 18.023340225219727,
"logits/rejected": 19.0910587310791,
"logps/chosen": -296.31610107421875,
"logps/rejected": -209.1268768310547,
"loss": 0.5109,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -5.737242221832275,
"rewards/margins": 6.57784366607666,
"rewards/rejected": -12.315085411071777,
"sft_loss": 0.966189444065094,
"step": 315
},
{
"epoch": 0.5257393209200438,
"grad_norm": 12.349530912080413,
"learning_rate": 4.4048737331766774e-07,
"logits/chosen": 19.084957122802734,
"logits/rejected": 19.039499282836914,
"logps/chosen": -273.5611877441406,
"logps/rejected": -193.39707946777344,
"loss": 0.5342,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -5.792872428894043,
"rewards/margins": 5.747686386108398,
"rewards/rejected": -11.540557861328125,
"sft_loss": 0.8884872198104858,
"step": 320
},
{
"epoch": 0.5339539978094195,
"grad_norm": 13.644842027024733,
"learning_rate": 4.3826786650090273e-07,
"logits/chosen": 15.30917739868164,
"logits/rejected": 16.686445236206055,
"logps/chosen": -261.4600524902344,
"logps/rejected": -197.12661743164062,
"loss": 0.5439,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -6.051290988922119,
"rewards/margins": 5.911606311798096,
"rewards/rejected": -11.962896347045898,
"sft_loss": 1.0287508964538574,
"step": 325
},
{
"epoch": 0.5421686746987951,
"grad_norm": 14.919840859492023,
"learning_rate": 4.3601353835912235e-07,
"logits/chosen": 17.14605712890625,
"logits/rejected": 18.71445655822754,
"logps/chosen": -240.4210968017578,
"logps/rejected": -191.06373596191406,
"loss": 0.5566,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -6.086501121520996,
"rewards/margins": 5.632846355438232,
"rewards/rejected": -11.719347953796387,
"sft_loss": 0.9403523206710815,
"step": 330
},
{
"epoch": 0.5503833515881709,
"grad_norm": 34.32717833778974,
"learning_rate": 4.337248058444831e-07,
"logits/chosen": 15.827594757080078,
"logits/rejected": 16.74897575378418,
"logps/chosen": -327.0185852050781,
"logps/rejected": -250.9954376220703,
"loss": 0.5323,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -8.161806106567383,
"rewards/margins": 7.682919979095459,
"rewards/rejected": -15.844725608825684,
"sft_loss": 1.1408532857894897,
"step": 335
},
{
"epoch": 0.5585980284775466,
"grad_norm": 13.092331667096667,
"learning_rate": 4.3140209227244617e-07,
"logits/chosen": 17.278669357299805,
"logits/rejected": 18.425344467163086,
"logps/chosen": -254.86746215820312,
"logps/rejected": -201.89547729492188,
"loss": 0.5321,
"rewards/accuracies": 0.9200000762939453,
"rewards/chosen": -6.691502094268799,
"rewards/margins": 6.277873516082764,
"rewards/rejected": -12.969375610351562,
"sft_loss": 1.0744267702102661,
"step": 340
},
{
"epoch": 0.5668127053669222,
"grad_norm": 12.663182255141733,
"learning_rate": 4.2904582724348316e-07,
"logits/chosen": 16.910207748413086,
"logits/rejected": 17.029691696166992,
"logps/chosen": -287.6109313964844,
"logps/rejected": -202.47837829589844,
"loss": 0.4913,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -5.874098777770996,
"rewards/margins": 6.561362266540527,
"rewards/rejected": -12.43545913696289,
"sft_loss": 1.1810688972473145,
"step": 345
},
{
"epoch": 0.5750273822562979,
"grad_norm": 18.101790487079715,
"learning_rate": 4.266564465636182e-07,
"logits/chosen": 17.891399383544922,
"logits/rejected": 19.3447208404541,
"logps/chosen": -306.7535705566406,
"logps/rejected": -237.83753967285156,
"loss": 0.482,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -7.050681114196777,
"rewards/margins": 7.2094807624816895,
"rewards/rejected": -14.260162353515625,
"sft_loss": 0.964589536190033,
"step": 350
},
{
"epoch": 0.5832420591456736,
"grad_norm": 9.764394722665243,
"learning_rate": 4.242343921638234e-07,
"logits/chosen": 17.71145248413086,
"logits/rejected": 18.48440170288086,
"logps/chosen": -317.6193542480469,
"logps/rejected": -230.48606872558594,
"loss": 0.45,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -6.803144931793213,
"rewards/margins": 7.825214862823486,
"rewards/rejected": -14.6283597946167,
"sft_loss": 1.0540062189102173,
"step": 355
},
{
"epoch": 0.5914567360350493,
"grad_norm": 10.821777272670147,
"learning_rate": 4.2178011201828044e-07,
"logits/chosen": 17.3190975189209,
"logits/rejected": 17.47244644165039,
"logps/chosen": -288.40374755859375,
"logps/rejected": -211.689453125,
"loss": 0.5051,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -6.330109596252441,
"rewards/margins": 6.8288254737854,
"rewards/rejected": -13.158934593200684,
"sft_loss": 1.0400768518447876,
"step": 360
},
{
"epoch": 0.5996714129244249,
"grad_norm": 17.857419277834527,
"learning_rate": 4.1929406006152546e-07,
"logits/chosen": 18.516992568969727,
"logits/rejected": 19.116985321044922,
"logps/chosen": -281.31695556640625,
"logps/rejected": -213.61634826660156,
"loss": 0.5566,
"rewards/accuracies": 0.9466666579246521,
"rewards/chosen": -6.528250217437744,
"rewards/margins": 6.995584487915039,
"rewards/rejected": -13.523836135864258,
"sft_loss": 1.0151982307434082,
"step": 365
},
{
"epoch": 0.6078860898138007,
"grad_norm": 14.30267906956284,
"learning_rate": 4.167766961044906e-07,
"logits/chosen": 18.10727882385254,
"logits/rejected": 18.658222198486328,
"logps/chosen": -276.7471923828125,
"logps/rejected": -210.30068969726562,
"loss": 0.4918,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -5.930126667022705,
"rewards/margins": 6.7833099365234375,
"rewards/rejected": -12.7134370803833,
"sft_loss": 0.8878603577613831,
"step": 370
},
{
"epoch": 0.6161007667031764,
"grad_norm": 13.544123616662414,
"learning_rate": 4.1422848574945923e-07,
"logits/chosen": 18.04473876953125,
"logits/rejected": 18.60536003112793,
"logps/chosen": -297.9788513183594,
"logps/rejected": -217.53721618652344,
"loss": 0.486,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -5.85421085357666,
"rewards/margins": 7.5605010986328125,
"rewards/rejected": -13.414711952209473,
"sft_loss": 1.005669355392456,
"step": 375
},
{
"epoch": 0.624315443592552,
"grad_norm": 21.50695855504068,
"learning_rate": 4.1164990030394985e-07,
"logits/chosen": 17.071107864379883,
"logits/rejected": 18.0479679107666,
"logps/chosen": -287.5808410644531,
"logps/rejected": -229.66249084472656,
"loss": 0.5873,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -7.32340145111084,
"rewards/margins": 7.0957746505737305,
"rewards/rejected": -14.419175148010254,
"sft_loss": 0.9805389046669006,
"step": 380
},
{
"epoch": 0.6325301204819277,
"grad_norm": 8.192201815991636,
"learning_rate": 4.09041416693545e-07,
"logits/chosen": 17.63469886779785,
"logits/rejected": 18.505117416381836,
"logps/chosen": -279.4613342285156,
"logps/rejected": -218.5486297607422,
"loss": 0.5224,
"rewards/accuracies": 0.9599999785423279,
"rewards/chosen": -7.260526657104492,
"rewards/margins": 6.764527320861816,
"rewards/rejected": -14.025053024291992,
"sft_loss": 1.06680166721344,
"step": 385
},
{
"epoch": 0.6407447973713034,
"grad_norm": 47.97635903653397,
"learning_rate": 4.064035173736804e-07,
"logits/chosen": 15.768574714660645,
"logits/rejected": 16.24512481689453,
"logps/chosen": -303.8434753417969,
"logps/rejected": -227.7042999267578,
"loss": 0.5142,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -6.973790168762207,
"rewards/margins": 7.414584159851074,
"rewards/rejected": -14.388375282287598,
"sft_loss": 1.1620056629180908,
"step": 390
},
{
"epoch": 0.6489594742606791,
"grad_norm": 22.56750049467428,
"learning_rate": 4.0373669024041225e-07,
"logits/chosen": 17.480152130126953,
"logits/rejected": 19.36970329284668,
"logps/chosen": -268.9180908203125,
"logps/rejected": -223.1499786376953,
"loss": 0.48,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -7.060788154602051,
"rewards/margins": 7.270442485809326,
"rewards/rejected": -14.331231117248535,
"sft_loss": 1.0084587335586548,
"step": 395
},
{
"epoch": 0.6571741511500547,
"grad_norm": 14.852542656702267,
"learning_rate": 4.010414285401776e-07,
"logits/chosen": 19.486713409423828,
"logits/rejected": 19.6448917388916,
"logps/chosen": -278.3014831542969,
"logps/rejected": -204.4377899169922,
"loss": 0.4865,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.484006404876709,
"rewards/margins": 6.507460594177246,
"rewards/rejected": -12.991467475891113,
"sft_loss": 1.0000892877578735,
"step": 400
},
{
"epoch": 0.6653888280394304,
"grad_norm": 10.619044380244409,
"learning_rate": 3.9831823077856565e-07,
"logits/chosen": 16.79458236694336,
"logits/rejected": 17.91153907775879,
"logps/chosen": -281.0224304199219,
"logps/rejected": -210.1667022705078,
"loss": 0.5159,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -6.140867710113525,
"rewards/margins": 6.682094097137451,
"rewards/rejected": -12.822961807250977,
"sft_loss": 1.0717185735702515,
"step": 405
},
{
"epoch": 0.6736035049288062,
"grad_norm": 47.72028643830778,
"learning_rate": 3.95567600628115e-07,
"logits/chosen": 17.3284912109375,
"logits/rejected": 17.72430419921875,
"logps/chosen": -275.4824523925781,
"logps/rejected": -210.34494018554688,
"loss": 0.4746,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -6.135570049285889,
"rewards/margins": 6.734328746795654,
"rewards/rejected": -12.86989974975586,
"sft_loss": 0.9496582746505737,
"step": 410
},
{
"epoch": 0.6818181818181818,
"grad_norm": 13.659449625348234,
"learning_rate": 3.9279004683515783e-07,
"logits/chosen": 17.051794052124023,
"logits/rejected": 18.201574325561523,
"logps/chosen": -283.5098876953125,
"logps/rejected": -217.13720703125,
"loss": 0.4834,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.5031657218933105,
"rewards/margins": 6.981623649597168,
"rewards/rejected": -13.484789848327637,
"sft_loss": 1.008143663406372,
"step": 415
},
{
"epoch": 0.6900328587075575,
"grad_norm": 10.688936022811054,
"learning_rate": 3.8998608312572234e-07,
"logits/chosen": 18.112707138061523,
"logits/rejected": 18.169342041015625,
"logps/chosen": -316.6014709472656,
"logps/rejected": -224.16824340820312,
"loss": 0.4278,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.8555474281311035,
"rewards/margins": 7.1582255363464355,
"rewards/rejected": -14.013773918151855,
"sft_loss": 0.9130622148513794,
"step": 420
},
{
"epoch": 0.6982475355969332,
"grad_norm": 13.038331606353893,
"learning_rate": 3.8715622811051753e-07,
"logits/chosen": 17.96015739440918,
"logits/rejected": 18.90926742553711,
"logps/chosen": -330.01348876953125,
"logps/rejected": -245.21107482910156,
"loss": 0.4744,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -7.965524196624756,
"rewards/margins": 7.482056617736816,
"rewards/rejected": -15.44758129119873,
"sft_loss": 0.9873117208480835,
"step": 425
},
{
"epoch": 0.7064622124863089,
"grad_norm": 14.058645020755897,
"learning_rate": 3.843010051890114e-07,
"logits/chosen": 16.319496154785156,
"logits/rejected": 16.970029830932617,
"logps/chosen": -317.0173645019531,
"logps/rejected": -243.7187042236328,
"loss": 0.5166,
"rewards/accuracies": 0.9466666579246521,
"rewards/chosen": -7.986619472503662,
"rewards/margins": 7.993711471557617,
"rewards/rejected": -15.980331420898438,
"sft_loss": 1.082601547241211,
"step": 430
},
{
"epoch": 0.7146768893756845,
"grad_norm": 19.82457118000122,
"learning_rate": 3.8142094245262615e-07,
"logits/chosen": 17.59951400756836,
"logits/rejected": 17.434412002563477,
"logps/chosen": -294.1492919921875,
"logps/rejected": -218.65521240234375,
"loss": 0.5787,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -7.534255504608154,
"rewards/margins": 6.933282852172852,
"rewards/rejected": -14.467540740966797,
"sft_loss": 1.660515308380127,
"step": 435
},
{
"epoch": 0.7228915662650602,
"grad_norm": 11.251537042250078,
"learning_rate": 3.785165725870637e-07,
"logits/chosen": 17.26852798461914,
"logits/rejected": 17.4658203125,
"logps/chosen": -318.1449279785156,
"logps/rejected": -243.87478637695312,
"loss": 0.4501,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -7.651638507843018,
"rewards/margins": 7.704569339752197,
"rewards/rejected": -15.356207847595215,
"sft_loss": 1.0064337253570557,
"step": 440
},
{
"epoch": 0.731106243154436,
"grad_norm": 13.388962596712888,
"learning_rate": 3.7558843277378203e-07,
"logits/chosen": 17.070295333862305,
"logits/rejected": 17.869474411010742,
"logps/chosen": -280.3146057128906,
"logps/rejected": -216.09710693359375,
"loss": 0.4821,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -6.6985931396484375,
"rewards/margins": 7.212075233459473,
"rewards/rejected": -13.91066837310791,
"sft_loss": 0.9864783883094788,
"step": 445
},
{
"epoch": 0.7393209200438116,
"grad_norm": 14.813083085351893,
"learning_rate": 3.726370645906407e-07,
"logits/chosen": 16.521230697631836,
"logits/rejected": 17.734365463256836,
"logps/chosen": -294.2370300292969,
"logps/rejected": -221.69178771972656,
"loss": 0.4907,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -7.402077674865723,
"rewards/margins": 6.9355292320251465,
"rewards/rejected": -14.337605476379395,
"sft_loss": 1.1839743852615356,
"step": 450
},
{
"epoch": 0.7475355969331873,
"grad_norm": 12.059854940698536,
"learning_rate": 3.6966301391173204e-07,
"logits/chosen": 17.135530471801758,
"logits/rejected": 19.162967681884766,
"logps/chosen": -284.18438720703125,
"logps/rejected": -233.11984252929688,
"loss": 0.5102,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -7.429366111755371,
"rewards/margins": 7.891510009765625,
"rewards/rejected": -15.320878028869629,
"sft_loss": 1.079641580581665,
"step": 455
},
{
"epoch": 0.755750273822563,
"grad_norm": 22.260550575758643,
"learning_rate": 3.6666683080641843e-07,
"logits/chosen": 15.536272048950195,
"logits/rejected": 16.60968780517578,
"logps/chosen": -310.630859375,
"logps/rejected": -241.0422821044922,
"loss": 0.4597,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.208867073059082,
"rewards/margins": 7.248252868652344,
"rewards/rejected": -15.457121849060059,
"sft_loss": 1.0596128702163696,
"step": 460
},
{
"epoch": 0.7639649507119387,
"grad_norm": 12.19610863222244,
"learning_rate": 3.636490694375937e-07,
"logits/chosen": 17.03879165649414,
"logits/rejected": 17.748197555541992,
"logps/chosen": -308.9512023925781,
"logps/rejected": -236.08970642089844,
"loss": 0.4273,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -7.292715549468994,
"rewards/margins": 8.177362442016602,
"rewards/rejected": -15.470076560974121,
"sft_loss": 1.0068012475967407,
"step": 465
},
{
"epoch": 0.7721796276013143,
"grad_norm": 13.22565269945024,
"learning_rate": 3.6061028795918734e-07,
"logits/chosen": 17.87092399597168,
"logits/rejected": 18.572694778442383,
"logps/chosen": -314.8690490722656,
"logps/rejected": -240.42343139648438,
"loss": 0.5971,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.011045455932617,
"rewards/margins": 7.702009677886963,
"rewards/rejected": -15.713056564331055,
"sft_loss": 1.0346639156341553,
"step": 470
},
{
"epoch": 0.78039430449069,
"grad_norm": 23.36877627131626,
"learning_rate": 3.5755104841292974e-07,
"logits/chosen": 16.52726936340332,
"logits/rejected": 18.124269485473633,
"logps/chosen": -261.4451599121094,
"logps/rejected": -216.3064727783203,
"loss": 0.5188,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -7.079422950744629,
"rewards/margins": 6.967349052429199,
"rewards/rejected": -14.046771049499512,
"sft_loss": 1.0945566892623901,
"step": 475
},
{
"epoch": 0.7886089813800657,
"grad_norm": 12.346922738371601,
"learning_rate": 3.544719166243998e-07,
"logits/chosen": 17.161659240722656,
"logits/rejected": 18.612253189086914,
"logps/chosen": -295.6679992675781,
"logps/rejected": -228.33984375,
"loss": 0.4422,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -6.870236873626709,
"rewards/margins": 7.495952129364014,
"rewards/rejected": -14.36618709564209,
"sft_loss": 0.9808112382888794,
"step": 480
},
{
"epoch": 0.7968236582694413,
"grad_norm": 14.120403338012792,
"learning_rate": 3.513734620983716e-07,
"logits/chosen": 17.235340118408203,
"logits/rejected": 18.787269592285156,
"logps/chosen": -289.2434997558594,
"logps/rejected": -240.0524444580078,
"loss": 0.4205,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -7.261631965637207,
"rewards/margins": 8.297459602355957,
"rewards/rejected": -15.55909252166748,
"sft_loss": 0.9492250084877014,
"step": 485
},
{
"epoch": 0.8050383351588171,
"grad_norm": 14.978501832234636,
"learning_rate": 3.482562579134809e-07,
"logits/chosen": 15.85843276977539,
"logits/rejected": 17.14594268798828,
"logps/chosen": -256.8265380859375,
"logps/rejected": -214.51412963867188,
"loss": 0.466,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -7.612859725952148,
"rewards/margins": 6.866227626800537,
"rewards/rejected": -14.479085922241211,
"sft_loss": 1.0439454317092896,
"step": 490
},
{
"epoch": 0.8132530120481928,
"grad_norm": 13.645938681155632,
"learning_rate": 3.4512088061623073e-07,
"logits/chosen": 17.91840171813965,
"logits/rejected": 18.105796813964844,
"logps/chosen": -344.9450378417969,
"logps/rejected": -257.0929870605469,
"loss": 0.434,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -8.392577171325684,
"rewards/margins": 8.40063762664795,
"rewards/rejected": -16.793216705322266,
"sft_loss": 1.052524447441101,
"step": 495
},
{
"epoch": 0.8214676889375685,
"grad_norm": 11.793731298003246,
"learning_rate": 3.419679101143555e-07,
"logits/chosen": 16.95572280883789,
"logits/rejected": 18.109580993652344,
"logps/chosen": -257.8283996582031,
"logps/rejected": -217.70062255859375,
"loss": 0.4059,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -7.232075214385986,
"rewards/margins": 7.084912300109863,
"rewards/rejected": -14.316986083984375,
"sft_loss": 1.070483684539795,
"step": 500
},
{
"epoch": 0.8296823658269441,
"grad_norm": 18.160358009772516,
"learning_rate": 3.387979295695632e-07,
"logits/chosen": 17.402151107788086,
"logits/rejected": 17.819072723388672,
"logps/chosen": -284.08599853515625,
"logps/rejected": -228.4375,
"loss": 0.4832,
"rewards/accuracies": 0.9333333969116211,
"rewards/chosen": -7.799540042877197,
"rewards/margins": 7.30112886428833,
"rewards/rejected": -15.100667953491211,
"sft_loss": 1.0201059579849243,
"step": 505
},
{
"epoch": 0.8378970427163198,
"grad_norm": 24.681797914609845,
"learning_rate": 3.356115252896764e-07,
"logits/chosen": 16.481372833251953,
"logits/rejected": 17.393707275390625,
"logps/chosen": -318.48956298828125,
"logps/rejected": -238.67076110839844,
"loss": 0.4569,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -7.8467302322387695,
"rewards/margins": 7.6555867195129395,
"rewards/rejected": -15.502315521240234,
"sft_loss": 1.1412904262542725,
"step": 510
},
{
"epoch": 0.8461117196056955,
"grad_norm": 11.780066809990752,
"learning_rate": 3.3240928662019043e-07,
"logits/chosen": 14.776932716369629,
"logits/rejected": 16.346778869628906,
"logps/chosen": -313.47589111328125,
"logps/rejected": -242.91506958007812,
"loss": 0.4196,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -7.860676288604736,
"rewards/margins": 8.015517234802246,
"rewards/rejected": -15.876194953918457,
"sft_loss": 1.059720516204834,
"step": 515
},
{
"epoch": 0.8543263964950711,
"grad_norm": 14.114725277489077,
"learning_rate": 3.291918058352706e-07,
"logits/chosen": 16.27129554748535,
"logits/rejected": 17.153289794921875,
"logps/chosen": -306.25506591796875,
"logps/rejected": -249.3704071044922,
"loss": 0.5092,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -9.551309585571289,
"rewards/margins": 7.208839416503906,
"rewards/rejected": -16.760149002075195,
"sft_loss": 1.1138993501663208,
"step": 520
},
{
"epoch": 0.8625410733844469,
"grad_norm": 27.760604608400726,
"learning_rate": 3.259596780282074e-07,
"logits/chosen": 18.246183395385742,
"logits/rejected": 18.89859390258789,
"logps/chosen": -346.7146301269531,
"logps/rejected": -260.1651916503906,
"loss": 0.4395,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.043220520019531,
"rewards/margins": 8.856348991394043,
"rewards/rejected": -16.899568557739258,
"sft_loss": 1.1765520572662354,
"step": 525
},
{
"epoch": 0.8707557502738226,
"grad_norm": 15.402410015157315,
"learning_rate": 3.2271350100134975e-07,
"logits/chosen": 17.567943572998047,
"logits/rejected": 17.768869400024414,
"logps/chosen": -298.6788024902344,
"logps/rejected": -236.3932647705078,
"loss": 0.4193,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -7.671787738800049,
"rewards/margins": 7.834874629974365,
"rewards/rejected": -15.506662368774414,
"sft_loss": 1.071178913116455,
"step": 530
},
{
"epoch": 0.8789704271631983,
"grad_norm": 18.947114003342495,
"learning_rate": 3.1945387515553843e-07,
"logits/chosen": 17.647369384765625,
"logits/rejected": 18.73533821105957,
"logps/chosen": -310.0240478515625,
"logps/rejected": -251.67193603515625,
"loss": 0.441,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -7.7946672439575195,
"rewards/margins": 9.0897798538208,
"rewards/rejected": -16.88444709777832,
"sft_loss": 1.0311574935913086,
"step": 535
},
{
"epoch": 0.8871851040525739,
"grad_norm": 11.041389503496823,
"learning_rate": 3.1618140337905764e-07,
"logits/chosen": 17.451311111450195,
"logits/rejected": 18.353700637817383,
"logps/chosen": -297.8014831542969,
"logps/rejected": -240.24606323242188,
"loss": 0.4126,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.106889724731445,
"rewards/margins": 7.919802188873291,
"rewards/rejected": -16.02669334411621,
"sft_loss": 1.1384520530700684,
"step": 540
},
{
"epoch": 0.8953997809419496,
"grad_norm": 9.858801498232785,
"learning_rate": 3.128966909361271e-07,
"logits/chosen": 16.695926666259766,
"logits/rejected": 18.67499351501465,
"logps/chosen": -320.1283874511719,
"logps/rejected": -254.82162475585938,
"loss": 0.3699,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -8.027070999145508,
"rewards/margins": 8.428789138793945,
"rewards/rejected": -16.455860137939453,
"sft_loss": 1.0505129098892212,
"step": 545
},
{
"epoch": 0.9036144578313253,
"grad_norm": 16.103503361054337,
"learning_rate": 3.096003453549549e-07,
"logits/chosen": 17.31558609008789,
"logits/rejected": 17.725223541259766,
"logps/chosen": -345.3844299316406,
"logps/rejected": -261.2863464355469,
"loss": 0.4497,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.239363670349121,
"rewards/margins": 9.474674224853516,
"rewards/rejected": -17.714040756225586,
"sft_loss": 1.020671010017395,
"step": 550
},
{
"epoch": 0.911829134720701,
"grad_norm": 12.01821136380653,
"learning_rate": 3.06292976315371e-07,
"logits/chosen": 16.277523040771484,
"logits/rejected": 17.34755516052246,
"logps/chosen": -304.7778625488281,
"logps/rejected": -241.48277282714844,
"loss": 0.4126,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -7.762810230255127,
"rewards/margins": 8.414238929748535,
"rewards/rejected": -16.17704963684082,
"sft_loss": 1.1222290992736816,
"step": 555
},
{
"epoch": 0.9200438116100766,
"grad_norm": 11.41112495788056,
"learning_rate": 3.0297519553606324e-07,
"logits/chosen": 17.731529235839844,
"logits/rejected": 18.088359832763672,
"logps/chosen": -305.7876281738281,
"logps/rejected": -246.57879638671875,
"loss": 0.4401,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -7.948428630828857,
"rewards/margins": 8.658875465393066,
"rewards/rejected": -16.60730743408203,
"sft_loss": 1.067797064781189,
"step": 560
},
{
"epoch": 0.9282584884994524,
"grad_norm": 21.985722962679343,
"learning_rate": 2.996476166614363e-07,
"logits/chosen": 15.972024917602539,
"logits/rejected": 16.38096809387207,
"logps/chosen": -330.54388427734375,
"logps/rejected": -267.4414367675781,
"loss": 0.5027,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.933554649353027,
"rewards/margins": 9.095858573913574,
"rewards/rejected": -18.0294132232666,
"sft_loss": 1.1063634157180786,
"step": 565
},
{
"epoch": 0.9364731653888281,
"grad_norm": 10.308382930028264,
"learning_rate": 2.963108551481142e-07,
"logits/chosen": 17.77937889099121,
"logits/rejected": 18.134130477905273,
"logps/chosen": -339.63079833984375,
"logps/rejected": -260.2466735839844,
"loss": 0.4519,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.120518684387207,
"rewards/margins": 9.045032501220703,
"rewards/rejected": -17.165552139282227,
"sft_loss": 1.072819471359253,
"step": 570
},
{
"epoch": 0.9446878422782037,
"grad_norm": 15.634526225587425,
"learning_rate": 2.929655281511075e-07,
"logits/chosen": 16.544097900390625,
"logits/rejected": 17.375316619873047,
"logps/chosen": -319.2738037109375,
"logps/rejected": -257.0357971191406,
"loss": 0.4126,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.096101760864258,
"rewards/margins": 8.786704063415527,
"rewards/rejected": -16.8828067779541,
"sft_loss": 1.0927081108093262,
"step": 575
},
{
"epoch": 0.9529025191675794,
"grad_norm": 8.788361215925173,
"learning_rate": 2.896122544096667e-07,
"logits/chosen": 16.77577018737793,
"logits/rejected": 17.813331604003906,
"logps/chosen": -297.43548583984375,
"logps/rejected": -240.00099182128906,
"loss": 0.4592,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -7.802213191986084,
"rewards/margins": 8.326114654541016,
"rewards/rejected": -16.12833023071289,
"sft_loss": 1.088619589805603,
"step": 580
},
{
"epoch": 0.9611171960569551,
"grad_norm": 20.34248392425272,
"learning_rate": 2.8625165413284307e-07,
"logits/chosen": 16.004566192626953,
"logits/rejected": 17.70891761779785,
"logps/chosen": -328.6180725097656,
"logps/rejected": -263.9577941894531,
"loss": 0.5055,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.101932525634766,
"rewards/margins": 9.129469871520996,
"rewards/rejected": -17.23140525817871,
"sft_loss": 1.0326135158538818,
"step": 585
},
{
"epoch": 0.9693318729463308,
"grad_norm": 13.09030046415886,
"learning_rate": 2.8288434888477626e-07,
"logits/chosen": 18.028348922729492,
"logits/rejected": 17.76748275756836,
"logps/chosen": -287.28692626953125,
"logps/rejected": -231.44729614257812,
"loss": 0.3908,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -7.940645217895508,
"rewards/margins": 7.91243839263916,
"rewards/rejected": -15.853084564208984,
"sft_loss": 1.0779129266738892,
"step": 590
},
{
"epoch": 0.9775465498357064,
"grad_norm": 20.95262748964158,
"learning_rate": 2.795109614697326e-07,
"logits/chosen": 17.00741195678711,
"logits/rejected": 18.209590911865234,
"logps/chosen": -275.52880859375,
"logps/rejected": -232.07052612304688,
"loss": 0.4225,
"rewards/accuracies": 0.9466666579246521,
"rewards/chosen": -7.112081050872803,
"rewards/margins": 8.281967163085938,
"rewards/rejected": -15.394047737121582,
"sft_loss": 1.0076452493667603,
"step": 595
},
{
"epoch": 0.9857612267250822,
"grad_norm": 13.158949539443237,
"learning_rate": 2.761321158169134e-07,
"logits/chosen": 18.07162094116211,
"logits/rejected": 19.637807846069336,
"logps/chosen": -307.5865478515625,
"logps/rejected": -249.9253387451172,
"loss": 0.4339,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.806564807891846,
"rewards/margins": 8.727023124694824,
"rewards/rejected": -16.53359031677246,
"sft_loss": 1.06932532787323,
"step": 600
},
{
"epoch": 0.9939759036144579,
"grad_norm": 13.610109275739992,
"learning_rate": 2.727484368650553e-07,
"logits/chosen": 15.262972831726074,
"logits/rejected": 16.486412048339844,
"logps/chosen": -305.6347351074219,
"logps/rejected": -252.50546264648438,
"loss": 0.4625,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.769743919372559,
"rewards/margins": 8.321878433227539,
"rewards/rejected": -17.091623306274414,
"sft_loss": 1.1903793811798096,
"step": 605
},
{
"epoch": 1.0021905805038336,
"grad_norm": 9.988555947945434,
"learning_rate": 2.6936055044684425e-07,
"logits/chosen": 17.130857467651367,
"logits/rejected": 17.868497848510742,
"logps/chosen": -278.2147216796875,
"logps/rejected": -229.0367889404297,
"loss": 0.4205,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.578054428100586,
"rewards/margins": 7.36480188369751,
"rewards/rejected": -15.942855834960938,
"sft_loss": 1.0933631658554077,
"step": 610
},
{
"epoch": 1.0104052573932092,
"grad_norm": 11.824094414048218,
"learning_rate": 2.659690831731631e-07,
"logits/chosen": 17.553348541259766,
"logits/rejected": 18.92648696899414,
"logps/chosen": -317.8105163574219,
"logps/rejected": -263.2023620605469,
"loss": 0.3385,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.87080192565918,
"rewards/margins": 9.268915176391602,
"rewards/rejected": -18.13971710205078,
"sft_loss": 1.0447877645492554,
"step": 615
},
{
"epoch": 1.0186199342825848,
"grad_norm": 15.737059861781074,
"learning_rate": 2.6257466231719676e-07,
"logits/chosen": 15.165780067443848,
"logits/rejected": 16.453243255615234,
"logps/chosen": -338.23773193359375,
"logps/rejected": -283.7428283691406,
"loss": 0.3123,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.231574058532715,
"rewards/margins": 10.158638954162598,
"rewards/rejected": -19.390214920043945,
"sft_loss": 1.2299811840057373,
"step": 620
},
{
"epoch": 1.0268346111719606,
"grad_norm": 11.900623330243908,
"learning_rate": 2.591779156984137e-07,
"logits/chosen": 16.764328002929688,
"logits/rejected": 16.837923049926758,
"logps/chosen": -322.6804504394531,
"logps/rejected": -269.0111999511719,
"loss": 0.3671,
"rewards/accuracies": 0.9466666579246521,
"rewards/chosen": -9.284835815429688,
"rewards/margins": 9.551142692565918,
"rewards/rejected": -18.83597755432129,
"sft_loss": 1.0855733156204224,
"step": 625
},
{
"epoch": 1.0350492880613362,
"grad_norm": 18.88025576733879,
"learning_rate": 2.557794715664465e-07,
"logits/chosen": 15.582106590270996,
"logits/rejected": 16.574077606201172,
"logps/chosen": -330.9181213378906,
"logps/rejected": -281.83709716796875,
"loss": 0.4083,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.849162101745605,
"rewards/margins": 10.020377159118652,
"rewards/rejected": -19.86954116821289,
"sft_loss": 1.11058509349823,
"step": 630
},
{
"epoch": 1.0432639649507118,
"grad_norm": 22.56812145625195,
"learning_rate": 2.5237995848489417e-07,
"logits/chosen": 16.257413864135742,
"logits/rejected": 16.71412467956543,
"logps/chosen": -332.62506103515625,
"logps/rejected": -271.0566101074219,
"loss": 0.4569,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.809398651123047,
"rewards/margins": 10.076090812683105,
"rewards/rejected": -18.88549041748047,
"sft_loss": 1.1897673606872559,
"step": 635
},
{
"epoch": 1.0514786418400877,
"grad_norm": 10.647617140402389,
"learning_rate": 2.48980005215064e-07,
"logits/chosen": 16.611183166503906,
"logits/rejected": 17.89920425415039,
"logps/chosen": -271.6616516113281,
"logps/rejected": -231.13978576660156,
"loss": 0.4444,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -8.08376407623291,
"rewards/margins": 8.19190502166748,
"rewards/rejected": -16.27566909790039,
"sft_loss": 1.3704915046691895,
"step": 640
},
{
"epoch": 1.0596933187294633,
"grad_norm": 19.247491047471033,
"learning_rate": 2.45580240599679e-07,
"logits/chosen": 16.49073028564453,
"logits/rejected": 17.990306854248047,
"logps/chosen": -358.3551025390625,
"logps/rejected": -288.8968505859375,
"loss": 0.3691,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.500346183776855,
"rewards/margins": 10.489169120788574,
"rewards/rejected": -18.98951530456543,
"sft_loss": 1.2408881187438965,
"step": 645
},
{
"epoch": 1.067907995618839,
"grad_norm": 13.44526599292449,
"learning_rate": 2.421812934465696e-07,
"logits/chosen": 17.065837860107422,
"logits/rejected": 17.75263214111328,
"logps/chosen": -308.9762878417969,
"logps/rejected": -256.1690979003906,
"loss": 0.3945,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -8.130703926086426,
"rewards/margins": 9.368220329284668,
"rewards/rejected": -17.49892234802246,
"sft_loss": 1.1205145120620728,
"step": 650
},
{
"epoch": 1.0761226725082147,
"grad_norm": 10.753673167776007,
"learning_rate": 2.3878379241237134e-07,
"logits/chosen": 16.457183837890625,
"logits/rejected": 17.42021942138672,
"logps/chosen": -312.5380554199219,
"logps/rejected": -251.23977661132812,
"loss": 0.3696,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.389382362365723,
"rewards/margins": 8.924761772155762,
"rewards/rejected": -17.314144134521484,
"sft_loss": 1.2173506021499634,
"step": 655
},
{
"epoch": 1.0843373493975903,
"grad_norm": 23.82423804722956,
"learning_rate": 2.3538836588625077e-07,
"logits/chosen": 15.20209789276123,
"logits/rejected": 15.774395942687988,
"logps/chosen": -297.73260498046875,
"logps/rejected": -246.4073944091797,
"loss": 0.4032,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -8.610387802124023,
"rewards/margins": 8.70635986328125,
"rewards/rejected": -17.31674575805664,
"sft_loss": 1.3788336515426636,
"step": 660
},
{
"epoch": 1.0925520262869661,
"grad_norm": 7.166962184073318,
"learning_rate": 2.3199564187368153e-07,
"logits/chosen": 15.194981575012207,
"logits/rejected": 17.136018753051758,
"logps/chosen": -328.6063537597656,
"logps/rejected": -288.6786804199219,
"loss": 0.366,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.775790214538574,
"rewards/margins": 10.185883522033691,
"rewards/rejected": -19.9616756439209,
"sft_loss": 1.1107780933380127,
"step": 665
},
{
"epoch": 1.1007667031763417,
"grad_norm": 13.216204703949911,
"learning_rate": 2.2860624788029013e-07,
"logits/chosen": 16.70530891418457,
"logits/rejected": 17.76304817199707,
"logps/chosen": -289.44476318359375,
"logps/rejected": -245.6142120361328,
"loss": 0.4321,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -8.357013702392578,
"rewards/margins": 8.433321952819824,
"rewards/rejected": -16.790334701538086,
"sft_loss": 1.1908717155456543,
"step": 670
},
{
"epoch": 1.1089813800657173,
"grad_norm": 26.032896310058877,
"learning_rate": 2.2522081079579497e-07,
"logits/chosen": 15.079482078552246,
"logits/rejected": 16.43825340270996,
"logps/chosen": -327.8377380371094,
"logps/rejected": -283.44158935546875,
"loss": 0.389,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -9.565984725952148,
"rewards/margins": 10.288603782653809,
"rewards/rejected": -19.854589462280273,
"sft_loss": 1.4105526208877563,
"step": 675
},
{
"epoch": 1.1171960569550932,
"grad_norm": 7.35341298145847,
"learning_rate": 2.2183995677805967e-07,
"logits/chosen": 15.347798347473145,
"logits/rejected": 16.887144088745117,
"logps/chosen": -343.8727722167969,
"logps/rejected": -289.7627258300781,
"loss": 0.3343,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.85545539855957,
"rewards/margins": 10.418365478515625,
"rewards/rejected": -20.273822784423828,
"sft_loss": 1.2016042470932007,
"step": 680
},
{
"epoch": 1.1254107338444688,
"grad_norm": 13.095979555911432,
"learning_rate": 2.1846431113728062e-07,
"logits/chosen": 15.633400917053223,
"logits/rejected": 17.45536994934082,
"logps/chosen": -328.1496887207031,
"logps/rejected": -281.7301025390625,
"loss": 0.3718,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.103165626525879,
"rewards/margins": 10.700647354125977,
"rewards/rejected": -19.80381202697754,
"sft_loss": 1.198488473892212,
"step": 685
},
{
"epoch": 1.1336254107338444,
"grad_norm": 17.038758672339643,
"learning_rate": 2.1509449822033205e-07,
"logits/chosen": 16.633058547973633,
"logits/rejected": 17.105684280395508,
"logps/chosen": -340.9743957519531,
"logps/rejected": -273.4366455078125,
"loss": 0.3328,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.657751083374023,
"rewards/margins": 9.9077787399292,
"rewards/rejected": -18.565532684326172,
"sft_loss": 1.1622406244277954,
"step": 690
},
{
"epoch": 1.1418400876232202,
"grad_norm": 13.181289081121232,
"learning_rate": 2.1173114129528957e-07,
"logits/chosen": 16.235170364379883,
"logits/rejected": 17.971439361572266,
"logps/chosen": -289.8466491699219,
"logps/rejected": -249.1376495361328,
"loss": 0.3625,
"rewards/accuracies": 0.9333333373069763,
"rewards/chosen": -8.312536239624023,
"rewards/margins": 9.367281913757324,
"rewards/rejected": -17.679819107055664,
"sft_loss": 1.2810382843017578,
"step": 695
},
{
"epoch": 1.1500547645125958,
"grad_norm": 13.226133090678903,
"learning_rate": 2.0837486243615226e-07,
"logits/chosen": 16.742103576660156,
"logits/rejected": 17.46257781982422,
"logps/chosen": -364.11041259765625,
"logps/rejected": -300.90618896484375,
"loss": 0.3981,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.691442489624023,
"rewards/margins": 11.124072074890137,
"rewards/rejected": -20.81551742553711,
"sft_loss": 1.0426690578460693,
"step": 700
},
{
"epoch": 1.1582694414019716,
"grad_norm": 16.747134822775763,
"learning_rate": 2.0502628240778653e-07,
"logits/chosen": 17.3011474609375,
"logits/rejected": 19.28099822998047,
"logps/chosen": -329.4310607910156,
"logps/rejected": -291.73443603515625,
"loss": 0.3664,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.230022430419922,
"rewards/margins": 11.087077140808105,
"rewards/rejected": -20.31709861755371,
"sft_loss": 1.0452929735183716,
"step": 705
},
{
"epoch": 1.1664841182913472,
"grad_norm": 11.712195080946406,
"learning_rate": 2.0168602055111173e-07,
"logits/chosen": 16.063915252685547,
"logits/rejected": 17.033220291137695,
"logps/chosen": -324.21099853515625,
"logps/rejected": -281.9880065917969,
"loss": 0.3326,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.243894577026367,
"rewards/margins": 10.908008575439453,
"rewards/rejected": -20.15190315246582,
"sft_loss": 1.1959102153778076,
"step": 710
},
{
"epoch": 1.1746987951807228,
"grad_norm": 19.52291295321317,
"learning_rate": 1.9835469466854887e-07,
"logits/chosen": 14.572199821472168,
"logits/rejected": 16.15847396850586,
"logps/chosen": -322.0695495605469,
"logps/rejected": -283.8585205078125,
"loss": 0.3275,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -9.329268455505371,
"rewards/margins": 10.466280937194824,
"rewards/rejected": -19.795551300048828,
"sft_loss": 1.1618155241012573,
"step": 715
},
{
"epoch": 1.1829134720700987,
"grad_norm": 14.04137372253548,
"learning_rate": 1.9503292090975454e-07,
"logits/chosen": 16.88302993774414,
"logits/rejected": 17.57504653930664,
"logps/chosen": -292.8112487792969,
"logps/rejected": -249.99221801757812,
"loss": 0.3841,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.16586685180664,
"rewards/margins": 9.12321662902832,
"rewards/rejected": -18.28908348083496,
"sft_loss": 1.2042182683944702,
"step": 720
},
{
"epoch": 1.1911281489594743,
"grad_norm": 12.34681171872866,
"learning_rate": 1.917213136576602e-07,
"logits/chosen": 16.656551361083984,
"logits/rejected": 17.51203155517578,
"logps/chosen": -327.6507568359375,
"logps/rejected": -284.38262939453125,
"loss": 0.3207,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.919057846069336,
"rewards/margins": 10.376687049865723,
"rewards/rejected": -20.295743942260742,
"sft_loss": 1.18035089969635,
"step": 725
},
{
"epoch": 1.1993428258488499,
"grad_norm": 10.050794300712155,
"learning_rate": 1.8842048541483756e-07,
"logits/chosen": 18.090221405029297,
"logits/rejected": 18.187620162963867,
"logps/chosen": -322.1310119628906,
"logps/rejected": -253.3239288330078,
"loss": 0.3945,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -8.852034568786621,
"rewards/margins": 9.027352333068848,
"rewards/rejected": -17.879384994506836,
"sft_loss": 1.199164628982544,
"step": 730
},
{
"epoch": 1.2075575027382257,
"grad_norm": 11.698463225887238,
"learning_rate": 1.8513104669021314e-07,
"logits/chosen": 15.768450736999512,
"logits/rejected": 17.4649715423584,
"logps/chosen": -315.5854797363281,
"logps/rejected": -270.3199462890625,
"loss": 0.3727,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -9.129364967346191,
"rewards/margins": 9.496920585632324,
"rewards/rejected": -18.626283645629883,
"sft_loss": 1.1171187162399292,
"step": 735
},
{
"epoch": 1.2157721796276013,
"grad_norm": 15.670433550127342,
"learning_rate": 1.8185360588615057e-07,
"logits/chosen": 17.373594284057617,
"logits/rejected": 18.17388916015625,
"logps/chosen": -349.6602478027344,
"logps/rejected": -286.2644958496094,
"loss": 0.3583,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.793112754821777,
"rewards/margins": 10.82769775390625,
"rewards/rejected": -19.620811462402344,
"sft_loss": 1.1327273845672607,
"step": 740
},
{
"epoch": 1.223986856516977,
"grad_norm": 17.513419996090132,
"learning_rate": 1.7858876918592232e-07,
"logits/chosen": 15.862748146057129,
"logits/rejected": 17.21187400817871,
"logps/chosen": -301.255859375,
"logps/rejected": -256.63555908203125,
"loss": 0.3533,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -8.769881248474121,
"rewards/margins": 9.70648193359375,
"rewards/rejected": -18.476362228393555,
"sft_loss": 1.1204417943954468,
"step": 745
},
{
"epoch": 1.2322015334063527,
"grad_norm": 19.125724690968823,
"learning_rate": 1.7533714044159299e-07,
"logits/chosen": 15.58492374420166,
"logits/rejected": 16.52800941467285,
"logps/chosen": -298.8733215332031,
"logps/rejected": -268.4566650390625,
"loss": 0.4265,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -9.785615921020508,
"rewards/margins": 9.117348670959473,
"rewards/rejected": -18.902963638305664,
"sft_loss": 1.6064594984054565,
"step": 750
},
{
"epoch": 1.2404162102957283,
"grad_norm": 17.968784609019274,
"learning_rate": 1.7209932106233264e-07,
"logits/chosen": 15.145374298095703,
"logits/rejected": 17.433292388916016,
"logps/chosen": -342.9417724609375,
"logps/rejected": -296.39654541015625,
"loss": 0.3766,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -9.673720359802246,
"rewards/margins": 10.815841674804688,
"rewards/rejected": -20.489561080932617,
"sft_loss": 1.145885944366455,
"step": 755
},
{
"epoch": 1.248630887185104,
"grad_norm": 13.684786311914898,
"learning_rate": 1.688759099031824e-07,
"logits/chosen": 15.70371150970459,
"logits/rejected": 16.69938087463379,
"logps/chosen": -361.2178955078125,
"logps/rejected": -309.79150390625,
"loss": 0.3508,
"rewards/accuracies": 0.9599999785423279,
"rewards/chosen": -10.506484031677246,
"rewards/margins": 11.59350872039795,
"rewards/rejected": -22.099994659423828,
"sft_loss": 1.1850322484970093,
"step": 760
},
{
"epoch": 1.2568455640744798,
"grad_norm": 14.244960468313039,
"learning_rate": 1.656675031542925e-07,
"logits/chosen": 17.195899963378906,
"logits/rejected": 18.426219940185547,
"logps/chosen": -363.3425598144531,
"logps/rejected": -301.96063232421875,
"loss": 0.3397,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.729473114013672,
"rewards/margins": 11.475974082946777,
"rewards/rejected": -21.205448150634766,
"sft_loss": 1.1794105768203735,
"step": 765
},
{
"epoch": 1.2650602409638554,
"grad_norm": 8.622276211404598,
"learning_rate": 1.6247469423065343e-07,
"logits/chosen": 16.508113861083984,
"logits/rejected": 17.097890853881836,
"logps/chosen": -305.1572570800781,
"logps/rejected": -249.677001953125,
"loss": 0.3759,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.539449691772461,
"rewards/margins": 8.976499557495117,
"rewards/rejected": -17.515949249267578,
"sft_loss": 1.196576714515686,
"step": 770
},
{
"epoch": 1.273274917853231,
"grad_norm": 12.358403358119775,
"learning_rate": 1.5929807366233977e-07,
"logits/chosen": 16.241657257080078,
"logits/rejected": 17.03815269470215,
"logps/chosen": -369.39556884765625,
"logps/rejected": -303.64337158203125,
"loss": 0.3163,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.143651008605957,
"rewards/margins": 11.741597175598145,
"rewards/rejected": -20.885250091552734,
"sft_loss": 1.1366469860076904,
"step": 775
},
{
"epoch": 1.2814895947426068,
"grad_norm": 16.14061914284979,
"learning_rate": 1.5613822898528794e-07,
"logits/chosen": 16.795856475830078,
"logits/rejected": 17.53175163269043,
"logps/chosen": -345.46929931640625,
"logps/rejected": -292.4604187011719,
"loss": 0.3369,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.835062026977539,
"rewards/margins": 11.170233726501465,
"rewards/rejected": -21.005298614501953,
"sft_loss": 1.3101640939712524,
"step": 780
},
{
"epoch": 1.2897042716319824,
"grad_norm": 12.538981244658086,
"learning_rate": 1.5299574463262794e-07,
"logits/chosen": 15.523879051208496,
"logits/rejected": 16.796798706054688,
"logps/chosen": -377.0471496582031,
"logps/rejected": -319.5939025878906,
"loss": 0.4028,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.438889503479004,
"rewards/margins": 12.12649917602539,
"rewards/rejected": -22.565387725830078,
"sft_loss": 1.1697484254837036,
"step": 785
},
{
"epoch": 1.297918948521358,
"grad_norm": 13.959611183566771,
"learning_rate": 1.4987120182658877e-07,
"logits/chosen": 15.972567558288574,
"logits/rejected": 18.35633659362793,
"logps/chosen": -330.76104736328125,
"logps/rejected": -282.9498291015625,
"loss": 0.3757,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.672747611999512,
"rewards/margins": 10.330121994018555,
"rewards/rejected": -20.002866744995117,
"sft_loss": 1.1246702671051025,
"step": 790
},
{
"epoch": 1.3061336254107339,
"grad_norm": 12.65020419545928,
"learning_rate": 1.4676517847099745e-07,
"logits/chosen": 16.62309455871582,
"logits/rejected": 17.682994842529297,
"logps/chosen": -309.1587829589844,
"logps/rejected": -255.12290954589844,
"loss": 0.3603,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.413141250610352,
"rewards/margins": 9.33283805847168,
"rewards/rejected": -17.7459774017334,
"sft_loss": 1.1139575242996216,
"step": 795
},
{
"epoch": 1.3143483023001095,
"grad_norm": 11.10720994563204,
"learning_rate": 1.4367824904439242e-07,
"logits/chosen": 17.087141036987305,
"logits/rejected": 17.25540542602539,
"logps/chosen": -336.4616394042969,
"logps/rejected": -273.6061096191406,
"loss": 0.371,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.36069393157959,
"rewards/margins": 10.413783073425293,
"rewards/rejected": -18.774477005004883,
"sft_loss": 1.0689451694488525,
"step": 800
},
{
"epoch": 1.3225629791894853,
"grad_norm": 12.783247596774917,
"learning_rate": 1.4061098449376985e-07,
"logits/chosen": 15.60853099822998,
"logits/rejected": 17.57704734802246,
"logps/chosen": -362.2177734375,
"logps/rejected": -308.759765625,
"loss": 0.3288,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.05817699432373,
"rewards/margins": 11.833967208862305,
"rewards/rejected": -20.89214324951172,
"sft_loss": 1.2039010524749756,
"step": 805
},
{
"epoch": 1.330777656078861,
"grad_norm": 8.359077848319595,
"learning_rate": 1.375639521289836e-07,
"logits/chosen": 15.683825492858887,
"logits/rejected": 16.602642059326172,
"logps/chosen": -332.6221008300781,
"logps/rejected": -278.2598571777344,
"loss": 0.3387,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -9.19861888885498,
"rewards/margins": 10.37482738494873,
"rewards/rejected": -19.57344627380371,
"sft_loss": 1.17559015750885,
"step": 810
},
{
"epoch": 1.3389923329682367,
"grad_norm": 13.496245877040751,
"learning_rate": 1.3453771551781756e-07,
"logits/chosen": 16.44358253479004,
"logits/rejected": 17.437644958496094,
"logps/chosen": -307.6462707519531,
"logps/rejected": -271.81683349609375,
"loss": 0.3318,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.48726749420166,
"rewards/margins": 10.40507698059082,
"rewards/rejected": -18.892345428466797,
"sft_loss": 1.1855844259262085,
"step": 815
},
{
"epoch": 1.3472070098576123,
"grad_norm": 14.433148985359804,
"learning_rate": 1.3153283438175034e-07,
"logits/chosen": 15.872283935546875,
"logits/rejected": 16.650604248046875,
"logps/chosen": -324.4306945800781,
"logps/rejected": -276.83929443359375,
"loss": 0.3743,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.549914360046387,
"rewards/margins": 10.544774055480957,
"rewards/rejected": -20.094688415527344,
"sft_loss": 1.1671338081359863,
"step": 820
},
{
"epoch": 1.355421686746988,
"grad_norm": 22.22401225520154,
"learning_rate": 1.2854986449243124e-07,
"logits/chosen": 16.34712028503418,
"logits/rejected": 16.94756317138672,
"logps/chosen": -331.7503662109375,
"logps/rejected": -286.41705322265625,
"loss": 0.3285,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.810836791992188,
"rewards/margins": 10.913623809814453,
"rewards/rejected": -20.724462509155273,
"sft_loss": 1.0781916379928589,
"step": 825
},
{
"epoch": 1.3636363636363638,
"grad_norm": 11.973621147714551,
"learning_rate": 1.2558935756888675e-07,
"logits/chosen": 15.828746795654297,
"logits/rejected": 16.91975212097168,
"logps/chosen": -322.3880310058594,
"logps/rejected": -279.2362365722656,
"loss": 0.3542,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.630596160888672,
"rewards/margins": 10.590496063232422,
"rewards/rejected": -20.221094131469727,
"sft_loss": 1.1420843601226807,
"step": 830
},
{
"epoch": 1.3718510405257394,
"grad_norm": 12.897955971332296,
"learning_rate": 1.226518611754767e-07,
"logits/chosen": 17.223234176635742,
"logits/rejected": 18.44441795349121,
"logps/chosen": -314.6831970214844,
"logps/rejected": -273.42083740234375,
"loss": 0.3494,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -8.791934967041016,
"rewards/margins": 10.404925346374512,
"rewards/rejected": -19.196863174438477,
"sft_loss": 1.11257803440094,
"step": 835
},
{
"epoch": 1.380065717415115,
"grad_norm": 14.822041663775748,
"learning_rate": 1.1973791862061871e-07,
"logits/chosen": 15.981986045837402,
"logits/rejected": 16.508832931518555,
"logps/chosen": -357.4217529296875,
"logps/rejected": -279.4723815917969,
"loss": 0.4071,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -8.533044815063477,
"rewards/margins": 10.845856666564941,
"rewards/rejected": -19.378902435302734,
"sft_loss": 1.071024775505066,
"step": 840
},
{
"epoch": 1.3882803943044908,
"grad_norm": 12.166531109745293,
"learning_rate": 1.1684806885630003e-07,
"logits/chosen": 17.19085693359375,
"logits/rejected": 18.22423553466797,
"logps/chosen": -336.6310729980469,
"logps/rejected": -288.2579040527344,
"loss": 0.3543,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.727254867553711,
"rewards/margins": 11.067011833190918,
"rewards/rejected": -19.794267654418945,
"sft_loss": 1.0941708087921143,
"step": 845
},
{
"epoch": 1.3964950711938664,
"grad_norm": 19.61480809183216,
"learning_rate": 1.1398284637839486e-07,
"logits/chosen": 17.393543243408203,
"logits/rejected": 17.97818946838379,
"logps/chosen": -290.88043212890625,
"logps/rejected": -248.78334045410156,
"loss": 0.3532,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.74341869354248,
"rewards/margins": 8.96402359008789,
"rewards/rejected": -17.707439422607422,
"sft_loss": 1.3463881015777588,
"step": 850
},
{
"epoch": 1.404709748083242,
"grad_norm": 13.04687226615894,
"learning_rate": 1.1114278112780601e-07,
"logits/chosen": 16.697458267211914,
"logits/rejected": 17.817760467529297,
"logps/chosen": -376.94256591796875,
"logps/rejected": -319.7321472167969,
"loss": 0.308,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -10.05405044555664,
"rewards/margins": 12.681156158447266,
"rewards/rejected": -22.735204696655273,
"sft_loss": 1.1224801540374756,
"step": 855
},
{
"epoch": 1.4129244249726178,
"grad_norm": 13.443707852848624,
"learning_rate": 1.08328398392449e-07,
"logits/chosen": 17.408639907836914,
"logits/rejected": 17.620332717895508,
"logps/chosen": -365.28131103515625,
"logps/rejected": -308.3528137207031,
"loss": 0.3755,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -10.822293281555176,
"rewards/margins": 11.438949584960938,
"rewards/rejected": -22.261241912841797,
"sft_loss": 1.178871750831604,
"step": 860
},
{
"epoch": 1.4211391018619934,
"grad_norm": 21.58859732751979,
"learning_rate": 1.0554021871009677e-07,
"logits/chosen": 16.947927474975586,
"logits/rejected": 17.420812606811523,
"logps/chosen": -340.0753479003906,
"logps/rejected": -297.9937438964844,
"loss": 0.3588,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -9.554227828979492,
"rewards/margins": 12.149679183959961,
"rewards/rejected": -21.70391082763672,
"sft_loss": 1.3246734142303467,
"step": 865
},
{
"epoch": 1.429353778751369,
"grad_norm": 13.8734601142875,
"learning_rate": 1.0277875777210299e-07,
"logits/chosen": 14.887709617614746,
"logits/rejected": 15.843902587890625,
"logps/chosen": -324.3350830078125,
"logps/rejected": -275.6741943359375,
"loss": 0.3712,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.113089561462402,
"rewards/margins": 10.9337158203125,
"rewards/rejected": -20.046804428100586,
"sft_loss": 1.2739402055740356,
"step": 870
},
{
"epoch": 1.4375684556407449,
"grad_norm": 13.714339626163223,
"learning_rate": 1.0004452632802158e-07,
"logits/chosen": 17.476552963256836,
"logits/rejected": 17.923315048217773,
"logps/chosen": -338.1813049316406,
"logps/rejected": -277.5501403808594,
"loss": 0.3129,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.576931953430176,
"rewards/margins": 10.827016830444336,
"rewards/rejected": -19.403947830200195,
"sft_loss": 1.1658498048782349,
"step": 875
},
{
"epoch": 1.4457831325301205,
"grad_norm": 13.805309365020234,
"learning_rate": 9.733803009114044e-08,
"logits/chosen": 16.891300201416016,
"logits/rejected": 17.32049560546875,
"logps/chosen": -322.0257263183594,
"logps/rejected": -274.27691650390625,
"loss": 0.316,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.144042015075684,
"rewards/margins": 10.516057014465332,
"rewards/rejected": -18.660099029541016,
"sft_loss": 1.110759973526001,
"step": 880
},
{
"epoch": 1.453997809419496,
"grad_norm": 29.77032111690104,
"learning_rate": 9.465976964494682e-08,
"logits/chosen": 16.620283126831055,
"logits/rejected": 17.72939682006836,
"logps/chosen": -300.1767578125,
"logps/rejected": -261.1438903808594,
"loss": 0.361,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.963627815246582,
"rewards/margins": 9.891008377075195,
"rewards/rejected": -18.854639053344727,
"sft_loss": 1.2920080423355103,
"step": 885
},
{
"epoch": 1.462212486308872,
"grad_norm": 9.782780560332286,
"learning_rate": 9.201024035054053e-08,
"logits/chosen": 17.15985107421875,
"logits/rejected": 17.535512924194336,
"logps/chosen": -286.6101379394531,
"logps/rejected": -247.57127380371094,
"loss": 0.3835,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -9.331561088562012,
"rewards/margins": 9.115301132202148,
"rewards/rejected": -18.446863174438477,
"sft_loss": 1.3567354679107666,
"step": 890
},
{
"epoch": 1.4704271631982475,
"grad_norm": 8.768405187815805,
"learning_rate": 8.938993225501495e-08,
"logits/chosen": 17.89764976501465,
"logits/rejected": 18.452497482299805,
"logps/chosen": -351.6549987792969,
"logps/rejected": -302.9189453125,
"loss": 0.3592,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.618634223937988,
"rewards/margins": 11.819962501525879,
"rewards/rejected": -21.4385986328125,
"sft_loss": 1.0768134593963623,
"step": 895
},
{
"epoch": 1.4786418400876231,
"grad_norm": 21.82788195022886,
"learning_rate": 8.679933000081879e-08,
"logits/chosen": 15.745450019836426,
"logits/rejected": 17.15949249267578,
"logps/chosen": -307.5598449707031,
"logps/rejected": -271.531494140625,
"loss": 0.3801,
"rewards/accuracies": 0.9599999785423279,
"rewards/chosen": -8.928607940673828,
"rewards/margins": 10.275431632995605,
"rewards/rejected": -19.204038619995117,
"sft_loss": 1.1987248659133911,
"step": 900
},
{
"epoch": 1.486856516976999,
"grad_norm": 12.077209434939249,
"learning_rate": 8.423891273611855e-08,
"logits/chosen": 16.016569137573242,
"logits/rejected": 16.249284744262695,
"logps/chosen": -311.76934814453125,
"logps/rejected": -261.8121643066406,
"loss": 0.3799,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.412449836730957,
"rewards/margins": 10.322086334228516,
"rewards/rejected": -18.734539031982422,
"sft_loss": 1.2180228233337402,
"step": 905
},
{
"epoch": 1.4950711938663745,
"grad_norm": 20.15671717033895,
"learning_rate": 8.170915402617739e-08,
"logits/chosen": 15.889266014099121,
"logits/rejected": 17.218164443969727,
"logps/chosen": -335.0419921875,
"logps/rejected": -293.2705078125,
"loss": 0.4051,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.390657424926758,
"rewards/margins": 11.274243354797363,
"rewards/rejected": -20.664899826049805,
"sft_loss": 1.1832726001739502,
"step": 910
},
{
"epoch": 1.5032858707557502,
"grad_norm": 11.069682914043863,
"learning_rate": 7.921052176576643e-08,
"logits/chosen": 17.052453994750977,
"logits/rejected": 17.67256736755371,
"logps/chosen": -305.6400146484375,
"logps/rejected": -266.4335632324219,
"loss": 0.3165,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.719382286071777,
"rewards/margins": 10.110651016235352,
"rewards/rejected": -18.830034255981445,
"sft_loss": 1.0706188678741455,
"step": 915
},
{
"epoch": 1.511500547645126,
"grad_norm": 27.258481926608287,
"learning_rate": 7.674347809262377e-08,
"logits/chosen": 16.615238189697266,
"logits/rejected": 17.932260513305664,
"logps/chosen": -288.8174743652344,
"logps/rejected": -250.63177490234375,
"loss": 0.3758,
"rewards/accuracies": 0.9466666579246521,
"rewards/chosen": -7.821852684020996,
"rewards/margins": 9.426899909973145,
"rewards/rejected": -17.24875259399414,
"sft_loss": 1.12588369846344,
"step": 920
},
{
"epoch": 1.5197152245345018,
"grad_norm": 8.415837096456798,
"learning_rate": 7.430847930198009e-08,
"logits/chosen": 16.921852111816406,
"logits/rejected": 17.39198875427246,
"logps/chosen": -329.8725891113281,
"logps/rejected": -274.1763000488281,
"loss": 0.3708,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.946272850036621,
"rewards/margins": 11.179486274719238,
"rewards/rejected": -19.12575912475586,
"sft_loss": 1.286713719367981,
"step": 925
},
{
"epoch": 1.5279299014238772,
"grad_norm": 7.8387923698583295,
"learning_rate": 7.190597576216384e-08,
"logits/chosen": 15.69840145111084,
"logits/rejected": 17.983213424682617,
"logps/chosen": -329.1253967285156,
"logps/rejected": -290.71051025390625,
"loss": 0.3144,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -8.661620140075684,
"rewards/margins": 11.160269737243652,
"rewards/rejected": -19.821889877319336,
"sft_loss": 1.1312789916992188,
"step": 930
},
{
"epoch": 1.536144578313253,
"grad_norm": 14.005325625629936,
"learning_rate": 6.953641183130224e-08,
"logits/chosen": 16.529827117919922,
"logits/rejected": 16.534809112548828,
"logps/chosen": -333.02813720703125,
"logps/rejected": -275.6182556152344,
"loss": 0.3675,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.715841293334961,
"rewards/margins": 9.975454330444336,
"rewards/rejected": -19.691295623779297,
"sft_loss": 1.2341707944869995,
"step": 935
},
{
"epoch": 1.5443592552026288,
"grad_norm": 11.238181780972436,
"learning_rate": 6.720022577513507e-08,
"logits/chosen": 15.408208847045898,
"logits/rejected": 16.01373291015625,
"logps/chosen": -350.6366882324219,
"logps/rejected": -291.2669677734375,
"loss": 0.3381,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -10.044300079345703,
"rewards/margins": 10.780247688293457,
"rewards/rejected": -20.82455062866211,
"sft_loss": 1.26254141330719,
"step": 940
},
{
"epoch": 1.5525739320920042,
"grad_norm": 11.413642178268471,
"learning_rate": 6.489784968595444e-08,
"logits/chosen": 15.467609405517578,
"logits/rejected": 16.952180862426758,
"logps/chosen": -346.5306091308594,
"logps/rejected": -312.6312561035156,
"loss": 0.3402,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -9.993128776550293,
"rewards/margins": 12.808844566345215,
"rewards/rejected": -22.801973342895508,
"sft_loss": 1.1826088428497314,
"step": 945
},
{
"epoch": 1.56078860898138,
"grad_norm": 22.79199458890795,
"learning_rate": 6.262970940268652e-08,
"logits/chosen": 16.051044464111328,
"logits/rejected": 17.10271453857422,
"logps/chosen": -313.6996765136719,
"logps/rejected": -278.2881774902344,
"loss": 0.333,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -9.396943092346191,
"rewards/margins": 10.23829460144043,
"rewards/rejected": -19.635236740112305,
"sft_loss": 1.1279245615005493,
"step": 950
},
{
"epoch": 1.5690032858707559,
"grad_norm": 11.668850401054987,
"learning_rate": 6.039622443213008e-08,
"logits/chosen": 16.13634490966797,
"logits/rejected": 17.919300079345703,
"logps/chosen": -325.7288513183594,
"logps/rejected": -289.1236267089844,
"loss": 0.3346,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.695369720458984,
"rewards/margins": 11.087947845458984,
"rewards/rejected": -20.78331756591797,
"sft_loss": 1.1951278448104858,
"step": 955
},
{
"epoch": 1.5772179627601315,
"grad_norm": 13.415709297062323,
"learning_rate": 5.8197807871366e-08,
"logits/chosen": 15.244779586791992,
"logits/rejected": 16.526262283325195,
"logps/chosen": -370.6669616699219,
"logps/rejected": -322.87847900390625,
"loss": 0.3428,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.789223670959473,
"rewards/margins": 12.587509155273438,
"rewards/rejected": -22.376733779907227,
"sft_loss": 2.1045873165130615,
"step": 960
},
{
"epoch": 1.585432639649507,
"grad_norm": 13.58873079620651,
"learning_rate": 5.6034866331352376e-08,
"logits/chosen": 15.409506797790527,
"logits/rejected": 16.128753662109375,
"logps/chosen": -322.9807434082031,
"logps/rejected": -271.06378173828125,
"loss": 0.347,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.693291664123535,
"rewards/margins": 10.159637451171875,
"rewards/rejected": -19.852930068969727,
"sft_loss": 1.1238617897033691,
"step": 965
},
{
"epoch": 1.593647316538883,
"grad_norm": 16.504268173121613,
"learning_rate": 5.390779986171934e-08,
"logits/chosen": 15.72015380859375,
"logits/rejected": 17.518657684326172,
"logps/chosen": -337.39349365234375,
"logps/rejected": -302.06109619140625,
"loss": 0.3214,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.937000274658203,
"rewards/margins": 11.78524112701416,
"rewards/rejected": -20.72224235534668,
"sft_loss": 1.129492998123169,
"step": 970
},
{
"epoch": 1.6018619934282585,
"grad_norm": 14.941336561605484,
"learning_rate": 5.1817001876777314e-08,
"logits/chosen": 15.710195541381836,
"logits/rejected": 16.9680233001709,
"logps/chosen": -324.51251220703125,
"logps/rejected": -286.7372741699219,
"loss": 0.3363,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.67973518371582,
"rewards/margins": 11.069150924682617,
"rewards/rejected": -19.74888801574707,
"sft_loss": 1.168811559677124,
"step": 975
},
{
"epoch": 1.6100766703176341,
"grad_norm": 11.368129107493246,
"learning_rate": 4.9762859082752464e-08,
"logits/chosen": 17.196496963500977,
"logits/rejected": 18.05078125,
"logps/chosen": -340.8441162109375,
"logps/rejected": -291.5513610839844,
"loss": 0.332,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.99682903289795,
"rewards/margins": 11.247660636901855,
"rewards/rejected": -20.244489669799805,
"sft_loss": 1.040310025215149,
"step": 980
},
{
"epoch": 1.61829134720701,
"grad_norm": 17.375398637805176,
"learning_rate": 4.774575140626316e-08,
"logits/chosen": 15.612386703491211,
"logits/rejected": 17.049909591674805,
"logps/chosen": -315.4412841796875,
"logps/rejected": -273.022216796875,
"loss": 0.2981,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.78695011138916,
"rewards/margins": 10.45934772491455,
"rewards/rejected": -19.24629783630371,
"sft_loss": 1.122090458869934,
"step": 985
},
{
"epoch": 1.6265060240963856,
"grad_norm": 18.391059447329464,
"learning_rate": 4.5766051924049975e-08,
"logits/chosen": 19.033084869384766,
"logits/rejected": 19.09506607055664,
"logps/chosen": -344.99224853515625,
"logps/rejected": -281.4374084472656,
"loss": 0.4023,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.408563613891602,
"rewards/margins": 11.344144821166992,
"rewards/rejected": -19.752708435058594,
"sft_loss": 1.2188175916671753,
"step": 990
},
{
"epoch": 1.6347207009857612,
"grad_norm": 4.7857387318547255,
"learning_rate": 4.3824126793972934e-08,
"logits/chosen": 15.44153118133545,
"logits/rejected": 16.74248504638672,
"logps/chosen": -348.91326904296875,
"logps/rejected": -291.33905029296875,
"loss": 0.3604,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.16443157196045,
"rewards/margins": 12.226564407348633,
"rewards/rejected": -20.390995025634766,
"sft_loss": 1.1215661764144897,
"step": 995
},
{
"epoch": 1.642935377875137,
"grad_norm": 8.425137005894317,
"learning_rate": 4.192033518728819e-08,
"logits/chosen": 16.596193313598633,
"logits/rejected": 16.706600189208984,
"logps/chosen": -337.87109375,
"logps/rejected": -279.28277587890625,
"loss": 0.3546,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.45007038116455,
"rewards/margins": 11.166516304016113,
"rewards/rejected": -19.616586685180664,
"sft_loss": 1.3097057342529297,
"step": 1000
},
{
"epoch": 1.6511500547645126,
"grad_norm": 10.216638281397124,
"learning_rate": 4.0055029222217125e-08,
"logits/chosen": 16.447404861450195,
"logits/rejected": 16.960412979125977,
"logps/chosen": -313.47698974609375,
"logps/rejected": -269.1077880859375,
"loss": 0.3193,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.434925079345703,
"rewards/margins": 10.487255096435547,
"rewards/rejected": -19.92218017578125,
"sft_loss": 1.099938988685608,
"step": 1005
},
{
"epoch": 1.6593647316538882,
"grad_norm": 10.435302161258754,
"learning_rate": 3.8228553898819904e-08,
"logits/chosen": 17.95560073852539,
"logits/rejected": 19.009355545043945,
"logps/chosen": -340.97222900390625,
"logps/rejected": -298.7887268066406,
"loss": 0.3949,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -9.733698844909668,
"rewards/margins": 11.381470680236816,
"rewards/rejected": -21.115171432495117,
"sft_loss": 1.1103211641311646,
"step": 1010
},
{
"epoch": 1.667579408543264,
"grad_norm": 11.907900026431262,
"learning_rate": 3.6441247035185416e-08,
"logits/chosen": 16.81635284423828,
"logits/rejected": 17.959022521972656,
"logps/chosen": -361.63812255859375,
"logps/rejected": -303.6453552246094,
"loss": 0.3353,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.310194969177246,
"rewards/margins": 11.503978729248047,
"rewards/rejected": -20.814172744750977,
"sft_loss": 1.1227651834487915,
"step": 1015
},
{
"epoch": 1.6757940854326396,
"grad_norm": 10.027831680647658,
"learning_rate": 3.4693439204949855e-08,
"logits/chosen": 15.768338203430176,
"logits/rejected": 17.33998680114746,
"logps/chosen": -292.4506530761719,
"logps/rejected": -263.465087890625,
"loss": 0.3701,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.007841110229492,
"rewards/margins": 10.145407676696777,
"rewards/rejected": -19.153249740600586,
"sft_loss": 1.1951355934143066,
"step": 1020
},
{
"epoch": 1.6840087623220152,
"grad_norm": 19.083513373797096,
"learning_rate": 3.298545367615493e-08,
"logits/chosen": 17.174707412719727,
"logits/rejected": 17.86057472229004,
"logps/chosen": -288.18280029296875,
"logps/rejected": -254.59439086914062,
"loss": 0.4406,
"rewards/accuracies": 0.9200000166893005,
"rewards/chosen": -8.990920066833496,
"rewards/margins": 9.506522178649902,
"rewards/rejected": -18.4974422454834,
"sft_loss": 1.2072545289993286,
"step": 1025
},
{
"epoch": 1.692223439211391,
"grad_norm": 12.02229671131509,
"learning_rate": 3.13176063514575e-08,
"logits/chosen": 17.051944732666016,
"logits/rejected": 17.904996871948242,
"logps/chosen": -359.4859619140625,
"logps/rejected": -295.76361083984375,
"loss": 0.3592,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -8.758131980895996,
"rewards/margins": 11.989044189453125,
"rewards/rejected": -20.747175216674805,
"sft_loss": 1.2417008876800537,
"step": 1030
},
{
"epoch": 1.7004381161007667,
"grad_norm": 14.595666831687033,
"learning_rate": 2.96902057097011e-08,
"logits/chosen": 16.427305221557617,
"logits/rejected": 17.641498565673828,
"logps/chosen": -320.2253723144531,
"logps/rejected": -269.6889953613281,
"loss": 0.3571,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -8.658390998840332,
"rewards/margins": 10.258994102478027,
"rewards/rejected": -18.91738510131836,
"sft_loss": 1.332204818725586,
"step": 1035
},
{
"epoch": 1.7086527929901423,
"grad_norm": 13.068829943035729,
"learning_rate": 2.8103552748861475e-08,
"logits/chosen": 15.954511642456055,
"logits/rejected": 16.74055290222168,
"logps/chosen": -331.81707763671875,
"logps/rejected": -280.3811950683594,
"loss": 0.335,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.056927680969238,
"rewards/margins": 10.4561767578125,
"rewards/rejected": -19.51310157775879,
"sft_loss": 1.1305441856384277,
"step": 1040
},
{
"epoch": 1.716867469879518,
"grad_norm": 14.364271003384296,
"learning_rate": 2.65579409303745e-08,
"logits/chosen": 17.06740951538086,
"logits/rejected": 17.10344886779785,
"logps/chosen": -364.3813171386719,
"logps/rejected": -293.8392333984375,
"loss": 0.3632,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.905304908752441,
"rewards/margins": 11.902792930603027,
"rewards/rejected": -20.808101654052734,
"sft_loss": 1.149087905883789,
"step": 1045
},
{
"epoch": 1.7250821467688937,
"grad_norm": 21.44861485257077,
"learning_rate": 2.505365612485874e-08,
"logits/chosen": 14.690909385681152,
"logits/rejected": 15.39016056060791,
"logps/chosen": -310.1071472167969,
"logps/rejected": -257.1431884765625,
"loss": 0.3935,
"rewards/accuracies": 0.9466666579246521,
"rewards/chosen": -9.03943157196045,
"rewards/margins": 9.36133098602295,
"rewards/rejected": -18.4007625579834,
"sft_loss": 1.492135763168335,
"step": 1050
},
{
"epoch": 1.7332968236582693,
"grad_norm": 10.165639822250112,
"learning_rate": 2.3590976559242275e-08,
"logits/chosen": 16.5327091217041,
"logits/rejected": 17.50569725036621,
"logps/chosen": -327.5498962402344,
"logps/rejected": -288.2828674316406,
"loss": 0.3287,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -9.587510108947754,
"rewards/margins": 10.448949813842773,
"rewards/rejected": -20.036460876464844,
"sft_loss": 1.2338570356369019,
"step": 1055
},
{
"epoch": 1.7415115005476451,
"grad_norm": 9.87040734328389,
"learning_rate": 2.21701727653025e-08,
"logits/chosen": 15.633200645446777,
"logits/rejected": 16.086591720581055,
"logps/chosen": -352.7239990234375,
"logps/rejected": -294.7661437988281,
"loss": 0.3506,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.915904998779297,
"rewards/margins": 11.233012199401855,
"rewards/rejected": -21.14891815185547,
"sft_loss": 1.2049648761749268,
"step": 1060
},
{
"epoch": 1.749726177437021,
"grad_norm": 12.834737803326664,
"learning_rate": 2.0791507529629522e-08,
"logits/chosen": 16.351898193359375,
"logits/rejected": 17.47950553894043,
"logps/chosen": -281.7489318847656,
"logps/rejected": -243.97483825683594,
"loss": 0.3882,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -9.137645721435547,
"rewards/margins": 8.15616226196289,
"rewards/rejected": -17.29380989074707,
"sft_loss": 1.157172679901123,
"step": 1065
},
{
"epoch": 1.7579408543263964,
"grad_norm": 17.205116768747743,
"learning_rate": 1.945523584502262e-08,
"logits/chosen": 17.508634567260742,
"logits/rejected": 17.94008445739746,
"logps/chosen": -381.6427917480469,
"logps/rejected": -311.2584228515625,
"loss": 0.277,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.13695240020752,
"rewards/margins": 12.883686065673828,
"rewards/rejected": -22.02063751220703,
"sft_loss": 1.055487036705017,
"step": 1070
},
{
"epoch": 1.7661555312157722,
"grad_norm": 20.851515512896743,
"learning_rate": 1.8161604863327072e-08,
"logits/chosen": 15.488776206970215,
"logits/rejected": 16.223703384399414,
"logps/chosen": -325.0180358886719,
"logps/rejected": -262.5523376464844,
"loss": 0.3441,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -8.707998275756836,
"rewards/margins": 9.961923599243164,
"rewards/rejected": -18.669921875,
"sft_loss": 1.1589832305908203,
"step": 1075
},
{
"epoch": 1.774370208105148,
"grad_norm": 17.972861201786518,
"learning_rate": 1.691085384972235e-08,
"logits/chosen": 14.909817695617676,
"logits/rejected": 15.637177467346191,
"logps/chosen": -278.62322998046875,
"logps/rejected": -248.10516357421875,
"loss": 0.3273,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.496481895446777,
"rewards/margins": 9.37110424041748,
"rewards/rejected": -17.867582321166992,
"sft_loss": 1.2477223873138428,
"step": 1080
},
{
"epoch": 1.7825848849945234,
"grad_norm": 14.287110123465489,
"learning_rate": 1.570321413846845e-08,
"logits/chosen": 15.394953727722168,
"logits/rejected": 17.261220932006836,
"logps/chosen": -303.1915588378906,
"logps/rejected": -277.51458740234375,
"loss": 0.2832,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -8.813228607177734,
"rewards/margins": 10.778543472290039,
"rewards/rejected": -19.59177017211914,
"sft_loss": 1.2371479272842407,
"step": 1085
},
{
"epoch": 1.7907995618838992,
"grad_norm": 25.07441398024989,
"learning_rate": 1.4538909090118846e-08,
"logits/chosen": 16.854040145874023,
"logits/rejected": 16.584880828857422,
"logps/chosen": -322.2169494628906,
"logps/rejected": -270.48895263671875,
"loss": 0.3503,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.387935638427734,
"rewards/margins": 10.15947437286377,
"rewards/rejected": -19.547407150268555,
"sft_loss": 1.2250884771347046,
"step": 1090
},
{
"epoch": 1.799014238773275,
"grad_norm": 12.397083886048673,
"learning_rate": 1.3418154050208936e-08,
"logits/chosen": 15.345029830932617,
"logits/rejected": 16.834665298461914,
"logps/chosen": -297.9521484375,
"logps/rejected": -269.69659423828125,
"loss": 0.3526,
"rewards/accuracies": 0.9066667556762695,
"rewards/chosen": -8.90621280670166,
"rewards/margins": 10.480603218078613,
"rewards/rejected": -19.386816024780273,
"sft_loss": 1.1300204992294312,
"step": 1095
},
{
"epoch": 1.8072289156626506,
"grad_norm": 11.032455088728524,
"learning_rate": 1.2341156309426447e-08,
"logits/chosen": 14.950087547302246,
"logits/rejected": 16.54684829711914,
"logps/chosen": -332.92596435546875,
"logps/rejected": -291.2406005859375,
"loss": 0.289,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.32970905303955,
"rewards/margins": 11.054911613464355,
"rewards/rejected": -20.384618759155273,
"sft_loss": 1.0616583824157715,
"step": 1100
},
{
"epoch": 1.8154435925520263,
"grad_norm": 12.222936203639813,
"learning_rate": 1.130811506527149e-08,
"logits/chosen": 16.257431030273438,
"logits/rejected": 17.80784034729004,
"logps/chosen": -374.90716552734375,
"logps/rejected": -309.1212158203125,
"loss": 0.2761,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.983473777770996,
"rewards/margins": 11.696609497070312,
"rewards/rejected": -21.680082321166992,
"sft_loss": 1.1142687797546387,
"step": 1105
},
{
"epoch": 1.823658269441402,
"grad_norm": 8.603199609340459,
"learning_rate": 1.0319221385213934e-08,
"logits/chosen": 15.376051902770996,
"logits/rejected": 16.714609146118164,
"logps/chosen": -314.27996826171875,
"logps/rejected": -280.79901123046875,
"loss": 0.3201,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -9.61314868927002,
"rewards/margins": 9.986039161682129,
"rewards/rejected": -19.59918785095215,
"sft_loss": 1.2583483457565308,
"step": 1110
},
{
"epoch": 1.8318729463307777,
"grad_norm": 19.393082549696974,
"learning_rate": 9.374658171354411e-09,
"logits/chosen": 16.10991859436035,
"logits/rejected": 17.19182586669922,
"logps/chosen": -335.8138122558594,
"logps/rejected": -285.86859130859375,
"loss": 0.3573,
"rewards/accuracies": 0.9466666579246521,
"rewards/chosen": -9.598699569702148,
"rewards/margins": 11.110600471496582,
"rewards/rejected": -20.709299087524414,
"sft_loss": 1.2626595497131348,
"step": 1115
},
{
"epoch": 1.8400876232201533,
"grad_norm": 11.87556668069316,
"learning_rate": 8.474600126594983e-09,
"logits/chosen": 16.182172775268555,
"logits/rejected": 17.73249053955078,
"logps/chosen": -327.0877685546875,
"logps/rejected": -281.38848876953125,
"loss": 0.3247,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.166089057922363,
"rewards/margins": 11.149109840393066,
"rewards/rejected": -20.315196990966797,
"sft_loss": 1.3075504302978516,
"step": 1120
},
{
"epoch": 1.8483023001095291,
"grad_norm": 14.892979384936938,
"learning_rate": 7.619213722327184e-09,
"logits/chosen": 16.07329750061035,
"logits/rejected": 16.353158950805664,
"logps/chosen": -328.3527526855469,
"logps/rejected": -281.48565673828125,
"loss": 0.3187,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.420902252197266,
"rewards/margins": 10.730939865112305,
"rewards/rejected": -20.151844024658203,
"sft_loss": 1.2091686725616455,
"step": 1125
},
{
"epoch": 1.8565169769989047,
"grad_norm": 12.380319924456133,
"learning_rate": 6.808657167641896e-09,
"logits/chosen": 15.801959037780762,
"logits/rejected": 16.7104434967041,
"logps/chosen": -357.0127258300781,
"logps/rejected": -303.44989013671875,
"loss": 0.3863,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.63782024383545,
"rewards/margins": 12.076054573059082,
"rewards/rejected": -21.71387481689453,
"sft_loss": 1.1681187152862549,
"step": 1130
},
{
"epoch": 1.8647316538882803,
"grad_norm": 15.785691804360567,
"learning_rate": 6.043080380067539e-09,
"logits/chosen": 15.678844451904297,
"logits/rejected": 16.41909408569336,
"logps/chosen": -383.7453918457031,
"logps/rejected": -308.8125915527344,
"loss": 0.3156,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.041726112365723,
"rewards/margins": 12.560318946838379,
"rewards/rejected": -21.602046966552734,
"sft_loss": 1.186676263809204,
"step": 1135
},
{
"epoch": 1.8729463307776562,
"grad_norm": 19.758267623181617,
"learning_rate": 5.322624957841998e-09,
"logits/chosen": 16.686138153076172,
"logits/rejected": 17.78066062927246,
"logps/chosen": -342.8313293457031,
"logps/rejected": -297.6686096191406,
"loss": 0.38,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.914877891540527,
"rewards/margins": 11.297541618347168,
"rewards/rejected": -21.21242332458496,
"sft_loss": 1.1149108409881592,
"step": 1140
},
{
"epoch": 1.8811610076670318,
"grad_norm": 14.356418798551582,
"learning_rate": 4.647424153723101e-09,
"logits/chosen": 16.441852569580078,
"logits/rejected": 16.586217880249023,
"logps/chosen": -318.8826599121094,
"logps/rejected": -271.4314880371094,
"loss": 0.367,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -9.366458892822266,
"rewards/margins": 10.299457550048828,
"rewards/rejected": -19.665918350219727,
"sft_loss": 1.2187005281448364,
"step": 1145
},
{
"epoch": 1.8893756845564074,
"grad_norm": 18.826459574147577,
"learning_rate": 4.0176028503425826e-09,
"logits/chosen": 15.749044418334961,
"logits/rejected": 16.83735466003418,
"logps/chosen": -308.5406188964844,
"logps/rejected": -271.7100830078125,
"loss": 0.3801,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -9.177282333374023,
"rewards/margins": 10.317461967468262,
"rewards/rejected": -19.4947452545166,
"sft_loss": 1.252463698387146,
"step": 1150
},
{
"epoch": 1.8975903614457832,
"grad_norm": 16.226959543929514,
"learning_rate": 3.433277537108481e-09,
"logits/chosen": 15.832767486572266,
"logits/rejected": 17.746004104614258,
"logps/chosen": -343.33447265625,
"logps/rejected": -305.2869873046875,
"loss": 0.335,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.902791976928711,
"rewards/margins": 11.3204345703125,
"rewards/rejected": -21.223228454589844,
"sft_loss": 1.2560192346572876,
"step": 1155
},
{
"epoch": 1.9058050383351588,
"grad_norm": 14.860604119401957,
"learning_rate": 2.8945562886593944e-09,
"logits/chosen": 14.95615005493164,
"logits/rejected": 16.35462760925293,
"logps/chosen": -287.0328369140625,
"logps/rejected": -257.26080322265625,
"loss": 0.3677,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -9.02625560760498,
"rewards/margins": 9.515448570251465,
"rewards/rejected": -18.541706085205078,
"sft_loss": 1.1147348880767822,
"step": 1160
},
{
"epoch": 1.9140197152245344,
"grad_norm": 12.84326688048793,
"learning_rate": 2.4015387448756976e-09,
"logits/chosen": 15.258326530456543,
"logits/rejected": 16.413604736328125,
"logps/chosen": -337.6728820800781,
"logps/rejected": -276.5948181152344,
"loss": 0.333,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -8.658801078796387,
"rewards/margins": 11.077197074890137,
"rewards/rejected": -19.736000061035156,
"sft_loss": 1.313868761062622,
"step": 1165
},
{
"epoch": 1.9222343921139102,
"grad_norm": 9.16948186267524,
"learning_rate": 1.954316092450281e-09,
"logits/chosen": 16.7126522064209,
"logits/rejected": 16.963319778442383,
"logps/chosen": -349.0697326660156,
"logps/rejected": -294.1761169433594,
"loss": 0.299,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.689640998840332,
"rewards/margins": 11.0868558883667,
"rewards/rejected": -20.77649688720703,
"sft_loss": 1.2454497814178467,
"step": 1170
},
{
"epoch": 1.9304490690032858,
"grad_norm": 19.811301652971856,
"learning_rate": 1.5529710480231272e-09,
"logits/chosen": 17.24116325378418,
"logits/rejected": 16.968626022338867,
"logps/chosen": -310.8689270019531,
"logps/rejected": -274.0096740722656,
"loss": 0.3,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.42411994934082,
"rewards/margins": 10.301589012145996,
"rewards/rejected": -19.725709915161133,
"sft_loss": 1.0850669145584106,
"step": 1175
},
{
"epoch": 1.9386637458926614,
"grad_norm": 12.455631194759059,
"learning_rate": 1.1975778428823524e-09,
"logits/chosen": 15.130066871643066,
"logits/rejected": 16.740190505981445,
"logps/chosen": -351.4178466796875,
"logps/rejected": -299.97235107421875,
"loss": 0.3093,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -9.884756088256836,
"rewards/margins": 11.209836959838867,
"rewards/rejected": -21.094594955444336,
"sft_loss": 1.0997297763824463,
"step": 1180
},
{
"epoch": 1.9468784227820373,
"grad_norm": 12.90904121827512,
"learning_rate": 8.882022092346064e-10,
"logits/chosen": 16.643354415893555,
"logits/rejected": 16.99618148803711,
"logps/chosen": -355.08087158203125,
"logps/rejected": -291.8462219238281,
"loss": 0.3245,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.204696655273438,
"rewards/margins": 11.567381858825684,
"rewards/rejected": -20.77208137512207,
"sft_loss": 1.2387458086013794,
"step": 1185
},
{
"epoch": 1.9550930996714129,
"grad_norm": 10.02772673186922,
"learning_rate": 6.249013680474368e-10,
"logits/chosen": 16.724010467529297,
"logits/rejected": 16.2373104095459,
"logps/chosen": -319.0643310546875,
"logps/rejected": -268.2914123535156,
"loss": 0.3367,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.252176284790039,
"rewards/margins": 9.910921096801758,
"rewards/rejected": -19.163097381591797,
"sft_loss": 1.1700729131698608,
"step": 1190
},
{
"epoch": 1.9633077765607885,
"grad_norm": 16.734218614778506,
"learning_rate": 4.0772401846608794e-10,
"logits/chosen": 17.680179595947266,
"logits/rejected": 17.80653190612793,
"logps/chosen": -305.4862060546875,
"logps/rejected": -267.6892395019531,
"loss": 0.4133,
"rewards/accuracies": 0.9466667175292969,
"rewards/chosen": -9.576127052307129,
"rewards/margins": 9.75358772277832,
"rewards/rejected": -19.329715728759766,
"sft_loss": 1.1736282110214233,
"step": 1195
},
{
"epoch": 1.9715224534501643,
"grad_norm": 19.267511912968715,
"learning_rate": 2.367103288061223e-10,
"logits/chosen": 16.904399871826172,
"logits/rejected": 16.482337951660156,
"logps/chosen": -316.0256652832031,
"logps/rejected": -265.80157470703125,
"loss": 0.3574,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.74451732635498,
"rewards/margins": 9.55905532836914,
"rewards/rejected": -19.303569793701172,
"sft_loss": 1.2237191200256348,
"step": 1200
},
{
"epoch": 1.9797371303395401,
"grad_norm": 11.51131736701793,
"learning_rate": 1.1189192912416933e-10,
"logits/chosen": 15.607586860656738,
"logits/rejected": 16.690214157104492,
"logps/chosen": -370.86328125,
"logps/rejected": -313.1533508300781,
"loss": 0.2989,
"rewards/accuracies": 0.9866666793823242,
"rewards/chosen": -9.272278785705566,
"rewards/margins": 12.769195556640625,
"rewards/rejected": -22.041475296020508,
"sft_loss": 1.1835730075836182,
"step": 1205
},
{
"epoch": 1.9879518072289155,
"grad_norm": 11.903861482033115,
"learning_rate": 3.329190536757731e-11,
"logits/chosen": 17.456689834594727,
"logits/rejected": 18.812978744506836,
"logps/chosen": -314.75823974609375,
"logps/rejected": -271.9325256347656,
"loss": 0.3344,
"rewards/accuracies": 0.9733333587646484,
"rewards/chosen": -9.277753829956055,
"rewards/margins": 10.492895126342773,
"rewards/rejected": -19.770648956298828,
"sft_loss": 1.1376186609268188,
"step": 1210
},
{
"epoch": 1.9961664841182913,
"grad_norm": 14.744908012876596,
"learning_rate": 9.247951046897906e-13,
"logits/chosen": 16.54582977294922,
"logits/rejected": 18.33929443359375,
"logps/chosen": -319.89813232421875,
"logps/rejected": -279.7975769042969,
"loss": 0.352,
"rewards/accuracies": 0.9600000381469727,
"rewards/chosen": -9.205850601196289,
"rewards/margins": 10.397418975830078,
"rewards/rejected": -19.603271484375,
"sft_loss": 1.1400221586227417,
"step": 1215
},
{
"epoch": 1.9978094194961664,
"step": 1216,
"total_flos": 200111899688960.0,
"train_loss": 0.4716386401069988,
"train_runtime": 41653.1021,
"train_samples_per_second": 1.753,
"train_steps_per_second": 0.029
}
],
"logging_steps": 5,
"max_steps": 1216,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 200111899688960.0,
"train_batch_size": 5,
"trial_name": null,
"trial_params": null
}