{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9978094194961664, "eval_steps": 50000, "global_step": 1216, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008214676889375685, "grad_norm": 47.48785366410319, "learning_rate": 4.0983606557377046e-08, "logits/chosen": 26.403932571411133, "logits/rejected": 25.755094528198242, "logps/chosen": -185.5782928466797, "logps/rejected": -79.66442108154297, "loss": 1.7879, "rewards/accuracies": 0.30666670203208923, "rewards/chosen": 0.008285612799227238, "rewards/margins": 0.017053820192813873, "rewards/rejected": -0.008768204599618912, "sft_loss": 0.6387583017349243, "step": 5 }, { "epoch": 0.01642935377875137, "grad_norm": 36.134481992571715, "learning_rate": 8.196721311475409e-08, "logits/chosen": 25.775484085083008, "logits/rejected": 25.31159210205078, "logps/chosen": -152.4672088623047, "logps/rejected": -72.757080078125, "loss": 1.6789, "rewards/accuracies": 0.7333334684371948, "rewards/chosen": -0.026889141649007797, "rewards/margins": 0.14848218858242035, "rewards/rejected": -0.17537136375904083, "sft_loss": 0.6469724774360657, "step": 10 }, { "epoch": 0.024644030668127054, "grad_norm": 19.978551205164187, "learning_rate": 1.2295081967213113e-07, "logits/chosen": 26.670787811279297, "logits/rejected": 26.257781982421875, "logps/chosen": -176.73304748535156, "logps/rejected": -84.2028579711914, "loss": 1.4459, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -0.1812039315700531, "rewards/margins": 0.5640282034873962, "rewards/rejected": -0.7452322244644165, "sft_loss": 0.6364741921424866, "step": 15 }, { "epoch": 0.03285870755750274, "grad_norm": 20.48799482343835, "learning_rate": 1.6393442622950818e-07, "logits/chosen": 26.263166427612305, "logits/rejected": 26.03022003173828, "logps/chosen": -214.57823181152344, "logps/rejected": -111.45527648925781, "loss": 1.316, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -0.642541766166687, "rewards/margins": 1.2724699974060059, "rewards/rejected": -1.9150116443634033, "sft_loss": 0.7241686582565308, "step": 20 }, { "epoch": 0.04107338444687842, "grad_norm": 24.43893120773317, "learning_rate": 2.0491803278688524e-07, "logits/chosen": 25.63840103149414, "logits/rejected": 25.88968849182129, "logps/chosen": -180.67430114746094, "logps/rejected": -108.99486541748047, "loss": 1.26, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -0.9794896245002747, "rewards/margins": 1.706125020980835, "rewards/rejected": -2.6856143474578857, "sft_loss": 0.7140628695487976, "step": 25 }, { "epoch": 0.04928806133625411, "grad_norm": 15.575922163206743, "learning_rate": 2.4590163934426226e-07, "logits/chosen": 25.174482345581055, "logits/rejected": 25.23969841003418, "logps/chosen": -213.48123168945312, "logps/rejected": -114.4116439819336, "loss": 1.1511, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -0.9717932343482971, "rewards/margins": 2.3237061500549316, "rewards/rejected": -3.295499563217163, "sft_loss": 0.6879211664199829, "step": 30 }, { "epoch": 0.05750273822562979, "grad_norm": 12.317269413176323, "learning_rate": 2.868852459016393e-07, "logits/chosen": 24.615764617919922, "logits/rejected": 24.808069229125977, "logps/chosen": -202.15489196777344, "logps/rejected": -124.00420379638672, "loss": 1.0435, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -1.0643211603164673, "rewards/margins": 2.588770627975464, "rewards/rejected": -3.6530916690826416, "sft_loss": 0.7430208325386047, "step": 35 }, { "epoch": 0.06571741511500548, "grad_norm": 11.98913328054039, "learning_rate": 3.2786885245901637e-07, "logits/chosen": 24.245140075683594, "logits/rejected": 24.268098831176758, "logps/chosen": -207.348876953125, "logps/rejected": -116.2168960571289, "loss": 0.9343, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -1.0519558191299438, "rewards/margins": 2.5677475929260254, "rewards/rejected": -3.619703531265259, "sft_loss": 0.7124413251876831, "step": 40 }, { "epoch": 0.07393209200438117, "grad_norm": 11.849547311712948, "learning_rate": 3.6885245901639347e-07, "logits/chosen": 22.61182403564453, "logits/rejected": 22.616382598876953, "logps/chosen": -222.93838500976562, "logps/rejected": -123.43074798583984, "loss": 0.8683, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -1.442903995513916, "rewards/margins": 2.7517101764678955, "rewards/rejected": -4.194613456726074, "sft_loss": 0.702341616153717, "step": 45 }, { "epoch": 0.08214676889375684, "grad_norm": 11.859772629239353, "learning_rate": 4.0983606557377047e-07, "logits/chosen": 20.62839126586914, "logits/rejected": 20.336801528930664, "logps/chosen": -241.59852600097656, "logps/rejected": -132.82681274414062, "loss": 0.7963, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -2.2198777198791504, "rewards/margins": 3.0024592876434326, "rewards/rejected": -5.2223358154296875, "sft_loss": 0.7061720490455627, "step": 50 }, { "epoch": 0.09036144578313253, "grad_norm": 9.406942779681957, "learning_rate": 4.508196721311475e-07, "logits/chosen": 19.715351104736328, "logits/rejected": 20.35331153869629, "logps/chosen": -208.7209930419922, "logps/rejected": -150.72914123535156, "loss": 0.8148, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -2.6436386108398438, "rewards/margins": 3.832695722579956, "rewards/rejected": -6.476334571838379, "sft_loss": 0.7786983251571655, "step": 55 }, { "epoch": 0.09857612267250822, "grad_norm": 10.934185099116656, "learning_rate": 4.918032786885245e-07, "logits/chosen": 20.9300537109375, "logits/rejected": 21.388505935668945, "logps/chosen": -192.5828399658203, "logps/rejected": -125.1326904296875, "loss": 0.8114, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -2.3978421688079834, "rewards/margins": 3.1992502212524414, "rewards/rejected": -5.597092628479004, "sft_loss": 0.698898434638977, "step": 60 }, { "epoch": 0.10679079956188389, "grad_norm": 12.361759850691927, "learning_rate": 4.999852034151641e-07, "logits/chosen": 19.11568832397461, "logits/rejected": 19.857196807861328, "logps/chosen": -242.90460205078125, "logps/rejected": -149.67938232421875, "loss": 0.7666, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -2.5540611743927, "rewards/margins": 3.71470308303833, "rewards/rejected": -6.268764019012451, "sft_loss": 0.7993389368057251, "step": 65 }, { "epoch": 0.11500547645125958, "grad_norm": 14.492049698010485, "learning_rate": 4.999250952911133e-07, "logits/chosen": 20.96298599243164, "logits/rejected": 20.906280517578125, "logps/chosen": -236.47763061523438, "logps/rejected": -142.59445190429688, "loss": 0.6927, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -2.4225642681121826, "rewards/margins": 4.021193981170654, "rewards/rejected": -6.4437575340271, "sft_loss": 0.8038942217826843, "step": 70 }, { "epoch": 0.12322015334063527, "grad_norm": 17.551745284718073, "learning_rate": 4.998187619501184e-07, "logits/chosen": 20.637529373168945, "logits/rejected": 21.148029327392578, "logps/chosen": -266.9391784667969, "logps/rejected": -173.1654510498047, "loss": 0.6651, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -3.129103660583496, "rewards/margins": 5.091865062713623, "rewards/rejected": -8.220968246459961, "sft_loss": 0.8789225816726685, "step": 75 }, { "epoch": 0.13143483023001096, "grad_norm": 21.266109357356587, "learning_rate": 4.996662230591989e-07, "logits/chosen": 18.540781021118164, "logits/rejected": 19.185565948486328, "logps/chosen": -252.1251983642578, "logps/rejected": -169.13851928710938, "loss": 0.706, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -3.4190313816070557, "rewards/margins": 4.7408881187438965, "rewards/rejected": -8.159918785095215, "sft_loss": 0.8200284242630005, "step": 80 }, { "epoch": 0.13964950711938665, "grad_norm": 14.68921798619268, "learning_rate": 4.994675068313813e-07, "logits/chosen": 17.844524383544922, "logits/rejected": 19.307209014892578, "logps/chosen": -235.93295288085938, "logps/rejected": -164.65467834472656, "loss": 0.6425, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -3.202953577041626, "rewards/margins": 4.453563213348389, "rewards/rejected": -7.656517028808594, "sft_loss": 0.8084096908569336, "step": 85 }, { "epoch": 0.14786418400876233, "grad_norm": 9.391954380287526, "learning_rate": 4.992226500204806e-07, "logits/chosen": 18.810604095458984, "logits/rejected": 19.509326934814453, "logps/chosen": -239.79638671875, "logps/rejected": -149.21372985839844, "loss": 0.6741, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -2.8668696880340576, "rewards/margins": 4.099938869476318, "rewards/rejected": -6.966808795928955, "sft_loss": 0.8505186438560486, "step": 90 }, { "epoch": 0.156078860898138, "grad_norm": 8.292078325061183, "learning_rate": 4.989316979143029e-07, "logits/chosen": 19.036439895629883, "logits/rejected": 18.50504493713379, "logps/chosen": -243.55430603027344, "logps/rejected": -141.56640625, "loss": 0.7786, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -2.8055150508880615, "rewards/margins": 4.023467540740967, "rewards/rejected": -6.828982830047607, "sft_loss": 0.8537193536758423, "step": 95 }, { "epoch": 0.16429353778751368, "grad_norm": 11.759496127210191, "learning_rate": 4.985947043262686e-07, "logits/chosen": 18.438268661499023, "logits/rejected": 18.92384147644043, "logps/chosen": -256.82135009765625, "logps/rejected": -162.3760223388672, "loss": 0.656, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -3.251594066619873, "rewards/margins": 4.7661452293396, "rewards/rejected": -8.017740249633789, "sft_loss": 0.8523219227790833, "step": 100 }, { "epoch": 0.17250821467688937, "grad_norm": 13.225983298656475, "learning_rate": 4.982117315854593e-07, "logits/chosen": 19.018491744995117, "logits/rejected": 19.4432373046875, "logps/chosen": -242.88742065429688, "logps/rejected": -160.6437225341797, "loss": 0.6173, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -3.371706962585449, "rewards/margins": 4.9150261878967285, "rewards/rejected": -8.286733627319336, "sft_loss": 0.8633176684379578, "step": 105 }, { "epoch": 0.18072289156626506, "grad_norm": 33.60275949690272, "learning_rate": 4.977828505250903e-07, "logits/chosen": 18.26275062561035, "logits/rejected": 18.561012268066406, "logps/chosen": -232.76333618164062, "logps/rejected": -153.5156707763672, "loss": 0.6725, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -3.7783102989196777, "rewards/margins": 4.282144069671631, "rewards/rejected": -8.060454368591309, "sft_loss": 0.8514001369476318, "step": 110 }, { "epoch": 0.18893756845564075, "grad_norm": 29.7322754671122, "learning_rate": 4.973081404694087e-07, "logits/chosen": 17.40985679626465, "logits/rejected": 18.532135009765625, "logps/chosen": -263.5098571777344, "logps/rejected": -179.07461547851562, "loss": 0.6416, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -4.028925895690918, "rewards/margins": 5.305994033813477, "rewards/rejected": -9.334918975830078, "sft_loss": 0.9138454794883728, "step": 115 }, { "epoch": 0.19715224534501644, "grad_norm": 11.64226677208211, "learning_rate": 4.967876892190227e-07, "logits/chosen": 18.535491943359375, "logits/rejected": 18.528560638427734, "logps/chosen": -261.1396484375, "logps/rejected": -164.66261291503906, "loss": 0.6327, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -3.7333872318267822, "rewards/margins": 4.9736409187316895, "rewards/rejected": -8.707027435302734, "sft_loss": 0.8873167634010315, "step": 120 }, { "epoch": 0.20536692223439212, "grad_norm": 10.419489404925384, "learning_rate": 4.962215930346614e-07, "logits/chosen": 18.076738357543945, "logits/rejected": 18.797412872314453, "logps/chosen": -240.43885803222656, "logps/rejected": -170.57994079589844, "loss": 0.6021, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -3.8788936138153076, "rewards/margins": 5.037622928619385, "rewards/rejected": -8.91651725769043, "sft_loss": 0.8787587285041809, "step": 125 }, { "epoch": 0.21358159912376778, "grad_norm": 13.538147865005195, "learning_rate": 4.956099566193716e-07, "logits/chosen": 17.794748306274414, "logits/rejected": 18.117393493652344, "logps/chosen": -263.0421447753906, "logps/rejected": -180.68548583984375, "loss": 0.5662, "rewards/accuracies": 1.0, "rewards/chosen": -4.157100677490234, "rewards/margins": 5.220449924468994, "rewards/rejected": -9.377551078796387, "sft_loss": 0.8972741961479187, "step": 130 }, { "epoch": 0.22179627601314347, "grad_norm": 14.579358533691158, "learning_rate": 4.949528930991521e-07, "logits/chosen": 17.554058074951172, "logits/rejected": 18.180675506591797, "logps/chosen": -265.0473327636719, "logps/rejected": -177.85751342773438, "loss": 0.6399, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -3.8493740558624268, "rewards/margins": 5.235028266906738, "rewards/rejected": -9.084402084350586, "sft_loss": 0.8204969167709351, "step": 135 }, { "epoch": 0.23001095290251916, "grad_norm": 12.455186291161809, "learning_rate": 4.9425052400203e-07, "logits/chosen": 17.611921310424805, "logits/rejected": 17.878339767456055, "logps/chosen": -265.25787353515625, "logps/rejected": -185.50375366210938, "loss": 0.6103, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.896492958068848, "rewards/margins": 4.968528747558594, "rewards/rejected": -9.865021705627441, "sft_loss": 0.8832098245620728, "step": 140 }, { "epoch": 0.23822562979189485, "grad_norm": 12.455826945227212, "learning_rate": 4.935029792355834e-07, "logits/chosen": 17.996692657470703, "logits/rejected": 18.594377517700195, "logps/chosen": -286.6059875488281, "logps/rejected": -200.05149841308594, "loss": 0.543, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -5.263542175292969, "rewards/margins": 5.605571269989014, "rewards/rejected": -10.86911392211914, "sft_loss": 0.8996745944023132, "step": 145 }, { "epoch": 0.24644030668127054, "grad_norm": 15.362573644388778, "learning_rate": 4.927103970629147e-07, "logits/chosen": 18.072965621948242, "logits/rejected": 18.25052261352539, "logps/chosen": -269.8097839355469, "logps/rejected": -185.32431030273438, "loss": 0.6219, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -4.9397172927856445, "rewards/margins": 5.206496715545654, "rewards/rejected": -10.14621353149414, "sft_loss": 0.7995728254318237, "step": 150 }, { "epoch": 0.2546549835706462, "grad_norm": 11.01295684823168, "learning_rate": 4.918729240770775e-07, "logits/chosen": 17.353046417236328, "logits/rejected": 18.587129592895508, "logps/chosen": -240.89488220214844, "logps/rejected": -173.4665069580078, "loss": 0.5702, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -4.678676128387451, "rewards/margins": 5.200152397155762, "rewards/rejected": -9.878829002380371, "sft_loss": 0.9399448037147522, "step": 155 }, { "epoch": 0.2628696604600219, "grad_norm": 19.41550454155631, "learning_rate": 4.909907151739633e-07, "logits/chosen": 18.130189895629883, "logits/rejected": 18.379247665405273, "logps/chosen": -292.39990234375, "logps/rejected": -188.62220764160156, "loss": 0.6561, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.744537353515625, "rewards/margins": 5.8243794441223145, "rewards/rejected": -10.568917274475098, "sft_loss": 0.8947219848632812, "step": 160 }, { "epoch": 0.2710843373493976, "grad_norm": 11.498941704903908, "learning_rate": 4.900639335236526e-07, "logits/chosen": 18.79334259033203, "logits/rejected": 19.24587059020996, "logps/chosen": -271.9427185058594, "logps/rejected": -179.41293334960938, "loss": 0.607, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.425381183624268, "rewards/margins": 5.343040943145752, "rewards/rejected": -9.76842212677002, "sft_loss": 0.9064626097679138, "step": 165 }, { "epoch": 0.2792990142387733, "grad_norm": 10.84500529690262, "learning_rate": 4.890927505402359e-07, "logits/chosen": 16.892650604248047, "logits/rejected": 17.597482681274414, "logps/chosen": -238.55162048339844, "logps/rejected": -170.00466918945312, "loss": 0.5889, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.459685802459717, "rewards/margins": 4.842031002044678, "rewards/rejected": -9.301715850830078, "sft_loss": 0.8489271402359009, "step": 170 }, { "epoch": 0.28751369112814895, "grad_norm": 16.18327307978759, "learning_rate": 4.880773458501089e-07, "logits/chosen": 19.4614315032959, "logits/rejected": 19.801300048828125, "logps/chosen": -232.73573303222656, "logps/rejected": -165.04103088378906, "loss": 0.5662, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -4.241008758544922, "rewards/margins": 4.859863758087158, "rewards/rejected": -9.100872993469238, "sft_loss": 0.8601513504981995, "step": 175 }, { "epoch": 0.29572836801752467, "grad_norm": 10.178119222467004, "learning_rate": 4.870179072587498e-07, "logits/chosen": 17.228599548339844, "logits/rejected": 17.30803871154785, "logps/chosen": -250.42587280273438, "logps/rejected": -171.54635620117188, "loss": 0.6129, "rewards/accuracies": 0.9333333373069763, "rewards/chosen": -5.068057537078857, "rewards/margins": 5.1040239334106445, "rewards/rejected": -10.172082901000977, "sft_loss": 0.9672516584396362, "step": 180 }, { "epoch": 0.30394304490690033, "grad_norm": 8.317024863573055, "learning_rate": 4.859146307159841e-07, "logits/chosen": 18.039478302001953, "logits/rejected": 18.52968406677246, "logps/chosen": -248.23155212402344, "logps/rejected": -179.2881317138672, "loss": 0.5417, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -5.143929481506348, "rewards/margins": 5.1267170906066895, "rewards/rejected": -10.270648002624512, "sft_loss": 0.8881379961967468, "step": 185 }, { "epoch": 0.312157721796276, "grad_norm": 11.89542898588366, "learning_rate": 4.847677202797414e-07, "logits/chosen": 18.8001708984375, "logits/rejected": 19.126699447631836, "logps/chosen": -263.02789306640625, "logps/rejected": -183.99911499023438, "loss": 0.5551, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -5.0858845710754395, "rewards/margins": 5.6055006980896, "rewards/rejected": -10.691385269165039, "sft_loss": 0.8070122599601746, "step": 190 }, { "epoch": 0.3203723986856517, "grad_norm": 12.1886798786308, "learning_rate": 4.835773880783144e-07, "logits/chosen": 16.390464782714844, "logits/rejected": 17.854284286499023, "logps/chosen": -269.9723815917969, "logps/rejected": -200.60789489746094, "loss": 0.5446, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -5.6444902420043945, "rewards/margins": 6.389484882354736, "rewards/rejected": -12.033974647521973, "sft_loss": 0.8605390191078186, "step": 195 }, { "epoch": 0.32858707557502737, "grad_norm": 11.13911341274398, "learning_rate": 4.823438542711238e-07, "logits/chosen": 17.828205108642578, "logits/rejected": 18.60173797607422, "logps/chosen": -277.97259521484375, "logps/rejected": -203.9155731201172, "loss": 0.5444, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -5.445572853088379, "rewards/margins": 6.231374740600586, "rewards/rejected": -11.676946640014648, "sft_loss": 0.9524543881416321, "step": 200 }, { "epoch": 0.3368017524644031, "grad_norm": 59.69351759377879, "learning_rate": 4.81067347007999e-07, "logits/chosen": 18.93602752685547, "logits/rejected": 19.728424072265625, "logps/chosen": -247.34567260742188, "logps/rejected": -173.0783233642578, "loss": 0.6075, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.630053997039795, "rewards/margins": 4.911181449890137, "rewards/rejected": -9.54123592376709, "sft_loss": 0.9002848863601685, "step": 205 }, { "epoch": 0.34501642935377874, "grad_norm": 8.657351709118194, "learning_rate": 4.797481023869801e-07, "logits/chosen": 18.50823974609375, "logits/rejected": 18.78363037109375, "logps/chosen": -245.55979919433594, "logps/rejected": -182.1737518310547, "loss": 0.5425, "rewards/accuracies": 1.0, "rewards/chosen": -5.3583760261535645, "rewards/margins": 5.406437397003174, "rewards/rejected": -10.764813423156738, "sft_loss": 0.9510916471481323, "step": 210 }, { "epoch": 0.35323110624315446, "grad_norm": 19.28978220217397, "learning_rate": 4.783863644106502e-07, "logits/chosen": 17.9003849029541, "logits/rejected": 19.15799903869629, "logps/chosen": -240.30958557128906, "logps/rejected": -187.32284545898438, "loss": 0.546, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -5.292669296264648, "rewards/margins": 5.617033004760742, "rewards/rejected": -10.909701347351074, "sft_loss": 0.9965067505836487, "step": 215 }, { "epoch": 0.3614457831325301, "grad_norm": 8.529326956491959, "learning_rate": 4.769823849410053e-07, "logits/chosen": 15.990920066833496, "logits/rejected": 17.267040252685547, "logps/chosen": -283.7446594238281, "logps/rejected": -209.57525634765625, "loss": 0.5062, "rewards/accuracies": 1.0, "rewards/chosen": -5.516228675842285, "rewards/margins": 6.538068771362305, "rewards/rejected": -12.054296493530273, "sft_loss": 0.9376140832901001, "step": 220 }, { "epoch": 0.3696604600219058, "grad_norm": 14.616903394027977, "learning_rate": 4.7553642365287127e-07, "logits/chosen": 16.816274642944336, "logits/rejected": 17.819963455200195, "logps/chosen": -245.84878540039062, "logps/rejected": -188.35284423828125, "loss": 0.5832, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -5.176213264465332, "rewards/margins": 5.302474498748779, "rewards/rejected": -10.478687286376953, "sft_loss": 1.0134352445602417, "step": 225 }, { "epoch": 0.3778751369112815, "grad_norm": 15.650965079934865, "learning_rate": 4.7404874798587493e-07, "logits/chosen": 18.04664421081543, "logits/rejected": 19.232574462890625, "logps/chosen": -268.1763610839844, "logps/rejected": -193.1671600341797, "loss": 0.5248, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -5.173998832702637, "rewards/margins": 5.888847827911377, "rewards/rejected": -11.062848091125488, "sft_loss": 0.9188562035560608, "step": 230 }, { "epoch": 0.38608981380065716, "grad_norm": 9.155968536476317, "learning_rate": 4.7251963309497965e-07, "logits/chosen": 17.16444206237793, "logits/rejected": 18.188404083251953, "logps/chosen": -281.6944580078125, "logps/rejected": -214.91883850097656, "loss": 0.5831, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -6.146281719207764, "rewards/margins": 6.598486423492432, "rewards/rejected": -12.744769096374512, "sft_loss": 1.0649549961090088, "step": 235 }, { "epoch": 0.39430449069003287, "grad_norm": 13.480064050397614, "learning_rate": 4.709493617995938e-07, "logits/chosen": 18.09016227722168, "logits/rejected": 18.207592010498047, "logps/chosen": -278.3957214355469, "logps/rejected": -195.16822814941406, "loss": 0.4846, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -5.534964084625244, "rewards/margins": 6.029869079589844, "rewards/rejected": -11.564833641052246, "sft_loss": 0.9166081547737122, "step": 240 }, { "epoch": 0.40251916757940853, "grad_norm": 8.853519364453792, "learning_rate": 4.6933822453126114e-07, "logits/chosen": 17.334672927856445, "logits/rejected": 18.275968551635742, "logps/chosen": -229.73594665527344, "logps/rejected": -182.89251708984375, "loss": 0.5795, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -5.739324569702148, "rewards/margins": 5.62018346786499, "rewards/rejected": -11.35950756072998, "sft_loss": 1.0507081747055054, "step": 245 }, { "epoch": 0.41073384446878425, "grad_norm": 23.105340732527253, "learning_rate": 4.676865192799443e-07, "logits/chosen": 18.659299850463867, "logits/rejected": 19.426942825317383, "logps/chosen": -310.3028869628906, "logps/rejected": -233.80967712402344, "loss": 0.5041, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -7.381309509277344, "rewards/margins": 6.830047607421875, "rewards/rejected": -14.211358070373535, "sft_loss": 0.9847605228424072, "step": 250 }, { "epoch": 0.4189485213581599, "grad_norm": 12.714869052495171, "learning_rate": 4.65994551538909e-07, "logits/chosen": 17.69913101196289, "logits/rejected": 17.626365661621094, "logps/chosen": -286.1001892089844, "logps/rejected": -213.40573120117188, "loss": 0.5671, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -6.464048385620117, "rewards/margins": 6.595151424407959, "rewards/rejected": -13.059199333190918, "sft_loss": 1.0706934928894043, "step": 255 }, { "epoch": 0.42716319824753557, "grad_norm": 18.758504329716533, "learning_rate": 4.642626342482215e-07, "logits/chosen": 17.131309509277344, "logits/rejected": 17.48920440673828, "logps/chosen": -231.87130737304688, "logps/rejected": -174.91970825195312, "loss": 0.5728, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.785408973693848, "rewards/margins": 5.311648368835449, "rewards/rejected": -10.097058296203613, "sft_loss": 0.9056914448738098, "step": 260 }, { "epoch": 0.4353778751369113, "grad_norm": 16.34948060980786, "learning_rate": 4.624910877368684e-07, "logits/chosen": 17.2136287689209, "logits/rejected": 18.958431243896484, "logps/chosen": -265.6873474121094, "logps/rejected": -200.37913513183594, "loss": 0.5359, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -5.0484795570373535, "rewards/margins": 6.235151767730713, "rewards/rejected": -11.283629417419434, "sft_loss": 0.897827684879303, "step": 265 }, { "epoch": 0.44359255202628695, "grad_norm": 8.468377967197654, "learning_rate": 4.606802396635098e-07, "logits/chosen": 18.035551071166992, "logits/rejected": 19.360517501831055, "logps/chosen": -279.75555419921875, "logps/rejected": -217.06832885742188, "loss": 0.4866, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -6.324933052062988, "rewards/margins": 6.823997974395752, "rewards/rejected": -13.148929595947266, "sft_loss": 0.9062218070030212, "step": 270 }, { "epoch": 0.45180722891566266, "grad_norm": 8.67268227774844, "learning_rate": 4.588304249558763e-07, "logits/chosen": 17.523601531982422, "logits/rejected": 17.99420166015625, "logps/chosen": -290.8741760253906, "logps/rejected": -215.23924255371094, "loss": 0.5245, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -6.641848087310791, "rewards/margins": 6.469297409057617, "rewards/rejected": -13.11114501953125, "sft_loss": 0.9921270608901978, "step": 275 }, { "epoch": 0.4600219058050383, "grad_norm": 12.040362030861928, "learning_rate": 4.569419857488228e-07, "logits/chosen": 17.7161808013916, "logits/rejected": 17.987571716308594, "logps/chosen": -297.76318359375, "logps/rejected": -205.46383666992188, "loss": 0.5407, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -5.587214469909668, "rewards/margins": 6.473574638366699, "rewards/rejected": -12.060790061950684, "sft_loss": 0.9882974028587341, "step": 280 }, { "epoch": 0.46823658269441404, "grad_norm": 18.26018008195662, "learning_rate": 4.550152713210478e-07, "logits/chosen": 17.55337905883789, "logits/rejected": 18.636327743530273, "logps/chosen": -247.40650939941406, "logps/rejected": -190.1718292236328, "loss": 0.5136, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -5.608924865722656, "rewards/margins": 5.688971996307373, "rewards/rejected": -11.297897338867188, "sft_loss": 0.9460915327072144, "step": 285 }, { "epoch": 0.4764512595837897, "grad_norm": 12.950141797441761, "learning_rate": 4.530506380304925e-07, "logits/chosen": 16.12598419189453, "logits/rejected": 16.963117599487305, "logps/chosen": -315.90838623046875, "logps/rejected": -234.9226837158203, "loss": 0.5254, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.075807094573975, "rewards/margins": 7.303346157073975, "rewards/rejected": -14.37915325164795, "sft_loss": 1.0791391134262085, "step": 290 }, { "epoch": 0.4846659364731654, "grad_norm": 8.510727267783134, "learning_rate": 4.510484492484301e-07, "logits/chosen": 16.052139282226562, "logits/rejected": 18.621992111206055, "logps/chosen": -293.8525695800781, "logps/rejected": -249.84762573242188, "loss": 0.502, "rewards/accuracies": 1.0, "rewards/chosen": -7.701572895050049, "rewards/margins": 7.963890552520752, "rewards/rejected": -15.665464401245117, "sft_loss": 1.0291332006454468, "step": 295 }, { "epoch": 0.4928806133625411, "grad_norm": 11.486611534499634, "learning_rate": 4.4900907529225797e-07, "logits/chosen": 15.679919242858887, "logits/rejected": 16.096633911132812, "logps/chosen": -295.52557373046875, "logps/rejected": -208.35264587402344, "loss": 0.5684, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -5.8986592292785645, "rewards/margins": 6.808130264282227, "rewards/rejected": -12.70678997039795, "sft_loss": 0.9438207149505615, "step": 300 }, { "epoch": 0.5010952902519168, "grad_norm": 11.462228668252441, "learning_rate": 4.46932893357005e-07, "logits/chosen": 17.438947677612305, "logits/rejected": 18.582027435302734, "logps/chosen": -282.1226501464844, "logps/rejected": -213.70437622070312, "loss": 0.4316, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -5.921773433685303, "rewards/margins": 6.760589599609375, "rewards/rejected": -12.68236255645752, "sft_loss": 0.9546439051628113, "step": 305 }, { "epoch": 0.5093099671412924, "grad_norm": 25.401018773660823, "learning_rate": 4.448202874455672e-07, "logits/chosen": 16.973630905151367, "logits/rejected": 17.916053771972656, "logps/chosen": -303.2902526855469, "logps/rejected": -214.27862548828125, "loss": 0.5904, "rewards/accuracies": 0.9200000762939453, "rewards/chosen": -6.163903713226318, "rewards/margins": 6.488450050354004, "rewards/rejected": -12.65235424041748, "sft_loss": 1.065365195274353, "step": 310 }, { "epoch": 0.5175246440306681, "grad_norm": 9.834689967221987, "learning_rate": 4.426716482976838e-07, "logits/chosen": 18.023340225219727, "logits/rejected": 19.0910587310791, "logps/chosen": -296.31610107421875, "logps/rejected": -209.1268768310547, "loss": 0.5109, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -5.737242221832275, "rewards/margins": 6.57784366607666, "rewards/rejected": -12.315085411071777, "sft_loss": 0.966189444065094, "step": 315 }, { "epoch": 0.5257393209200438, "grad_norm": 12.349530912080413, "learning_rate": 4.4048737331766774e-07, "logits/chosen": 19.084957122802734, "logits/rejected": 19.039499282836914, "logps/chosen": -273.5611877441406, "logps/rejected": -193.39707946777344, "loss": 0.5342, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -5.792872428894043, "rewards/margins": 5.747686386108398, "rewards/rejected": -11.540557861328125, "sft_loss": 0.8884872198104858, "step": 320 }, { "epoch": 0.5339539978094195, "grad_norm": 13.644842027024733, "learning_rate": 4.3826786650090273e-07, "logits/chosen": 15.30917739868164, "logits/rejected": 16.686445236206055, "logps/chosen": -261.4600524902344, "logps/rejected": -197.12661743164062, "loss": 0.5439, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -6.051290988922119, "rewards/margins": 5.911606311798096, "rewards/rejected": -11.962896347045898, "sft_loss": 1.0287508964538574, "step": 325 }, { "epoch": 0.5421686746987951, "grad_norm": 14.919840859492023, "learning_rate": 4.3601353835912235e-07, "logits/chosen": 17.14605712890625, "logits/rejected": 18.71445655822754, "logps/chosen": -240.4210968017578, "logps/rejected": -191.06373596191406, "loss": 0.5566, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -6.086501121520996, "rewards/margins": 5.632846355438232, "rewards/rejected": -11.719347953796387, "sft_loss": 0.9403523206710815, "step": 330 }, { "epoch": 0.5503833515881709, "grad_norm": 34.32717833778974, "learning_rate": 4.337248058444831e-07, "logits/chosen": 15.827594757080078, "logits/rejected": 16.74897575378418, "logps/chosen": -327.0185852050781, "logps/rejected": -250.9954376220703, "loss": 0.5323, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.161806106567383, "rewards/margins": 7.682919979095459, "rewards/rejected": -15.844725608825684, "sft_loss": 1.1408532857894897, "step": 335 }, { "epoch": 0.5585980284775466, "grad_norm": 13.092331667096667, "learning_rate": 4.3140209227244617e-07, "logits/chosen": 17.278669357299805, "logits/rejected": 18.425344467163086, "logps/chosen": -254.86746215820312, "logps/rejected": -201.89547729492188, "loss": 0.5321, "rewards/accuracies": 0.9200000762939453, "rewards/chosen": -6.691502094268799, "rewards/margins": 6.277873516082764, "rewards/rejected": -12.969375610351562, "sft_loss": 1.0744267702102661, "step": 340 }, { "epoch": 0.5668127053669222, "grad_norm": 12.663182255141733, "learning_rate": 4.2904582724348316e-07, "logits/chosen": 16.910207748413086, "logits/rejected": 17.029691696166992, "logps/chosen": -287.6109313964844, "logps/rejected": -202.47837829589844, "loss": 0.4913, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -5.874098777770996, "rewards/margins": 6.561362266540527, "rewards/rejected": -12.43545913696289, "sft_loss": 1.1810688972473145, "step": 345 }, { "epoch": 0.5750273822562979, "grad_norm": 18.101790487079715, "learning_rate": 4.266564465636182e-07, "logits/chosen": 17.891399383544922, "logits/rejected": 19.3447208404541, "logps/chosen": -306.7535705566406, "logps/rejected": -237.83753967285156, "loss": 0.482, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.050681114196777, "rewards/margins": 7.2094807624816895, "rewards/rejected": -14.260162353515625, "sft_loss": 0.964589536190033, "step": 350 }, { "epoch": 0.5832420591456736, "grad_norm": 9.764394722665243, "learning_rate": 4.242343921638234e-07, "logits/chosen": 17.71145248413086, "logits/rejected": 18.48440170288086, "logps/chosen": -317.6193542480469, "logps/rejected": -230.48606872558594, "loss": 0.45, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -6.803144931793213, "rewards/margins": 7.825214862823486, "rewards/rejected": -14.6283597946167, "sft_loss": 1.0540062189102173, "step": 355 }, { "epoch": 0.5914567360350493, "grad_norm": 10.821777272670147, "learning_rate": 4.2178011201828044e-07, "logits/chosen": 17.3190975189209, "logits/rejected": 17.47244644165039, "logps/chosen": -288.40374755859375, "logps/rejected": -211.689453125, "loss": 0.5051, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -6.330109596252441, "rewards/margins": 6.8288254737854, "rewards/rejected": -13.158934593200684, "sft_loss": 1.0400768518447876, "step": 360 }, { "epoch": 0.5996714129244249, "grad_norm": 17.857419277834527, "learning_rate": 4.1929406006152546e-07, "logits/chosen": 18.516992568969727, "logits/rejected": 19.116985321044922, "logps/chosen": -281.31695556640625, "logps/rejected": -213.61634826660156, "loss": 0.5566, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -6.528250217437744, "rewards/margins": 6.995584487915039, "rewards/rejected": -13.523836135864258, "sft_loss": 1.0151982307434082, "step": 365 }, { "epoch": 0.6078860898138007, "grad_norm": 14.30267906956284, "learning_rate": 4.167766961044906e-07, "logits/chosen": 18.10727882385254, "logits/rejected": 18.658222198486328, "logps/chosen": -276.7471923828125, "logps/rejected": -210.30068969726562, "loss": 0.4918, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -5.930126667022705, "rewards/margins": 6.7833099365234375, "rewards/rejected": -12.7134370803833, "sft_loss": 0.8878603577613831, "step": 370 }, { "epoch": 0.6161007667031764, "grad_norm": 13.544123616662414, "learning_rate": 4.1422848574945923e-07, "logits/chosen": 18.04473876953125, "logits/rejected": 18.60536003112793, "logps/chosen": -297.9788513183594, "logps/rejected": -217.53721618652344, "loss": 0.486, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -5.85421085357666, "rewards/margins": 7.5605010986328125, "rewards/rejected": -13.414711952209473, "sft_loss": 1.005669355392456, "step": 375 }, { "epoch": 0.624315443592552, "grad_norm": 21.50695855504068, "learning_rate": 4.1164990030394985e-07, "logits/chosen": 17.071107864379883, "logits/rejected": 18.0479679107666, "logps/chosen": -287.5808410644531, "logps/rejected": -229.66249084472656, "loss": 0.5873, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -7.32340145111084, "rewards/margins": 7.0957746505737305, "rewards/rejected": -14.419175148010254, "sft_loss": 0.9805389046669006, "step": 380 }, { "epoch": 0.6325301204819277, "grad_norm": 8.192201815991636, "learning_rate": 4.09041416693545e-07, "logits/chosen": 17.63469886779785, "logits/rejected": 18.505117416381836, "logps/chosen": -279.4613342285156, "logps/rejected": -218.5486297607422, "loss": 0.5224, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.260526657104492, "rewards/margins": 6.764527320861816, "rewards/rejected": -14.025053024291992, "sft_loss": 1.06680166721344, "step": 385 }, { "epoch": 0.6407447973713034, "grad_norm": 47.97635903653397, "learning_rate": 4.064035173736804e-07, "logits/chosen": 15.768574714660645, "logits/rejected": 16.24512481689453, "logps/chosen": -303.8434753417969, "logps/rejected": -227.7042999267578, "loss": 0.5142, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -6.973790168762207, "rewards/margins": 7.414584159851074, "rewards/rejected": -14.388375282287598, "sft_loss": 1.1620056629180908, "step": 390 }, { "epoch": 0.6489594742606791, "grad_norm": 22.56750049467428, "learning_rate": 4.0373669024041225e-07, "logits/chosen": 17.480152130126953, "logits/rejected": 19.36970329284668, "logps/chosen": -268.9180908203125, "logps/rejected": -223.1499786376953, "loss": 0.48, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.060788154602051, "rewards/margins": 7.270442485809326, "rewards/rejected": -14.331231117248535, "sft_loss": 1.0084587335586548, "step": 395 }, { "epoch": 0.6571741511500547, "grad_norm": 14.852542656702267, "learning_rate": 4.010414285401776e-07, "logits/chosen": 19.486713409423828, "logits/rejected": 19.6448917388916, "logps/chosen": -278.3014831542969, "logps/rejected": -204.4377899169922, "loss": 0.4865, "rewards/accuracies": 1.0, "rewards/chosen": -6.484006404876709, "rewards/margins": 6.507460594177246, "rewards/rejected": -12.991467475891113, "sft_loss": 1.0000892877578735, "step": 400 }, { "epoch": 0.6653888280394304, "grad_norm": 10.619044380244409, "learning_rate": 3.9831823077856565e-07, "logits/chosen": 16.79458236694336, "logits/rejected": 17.91153907775879, "logps/chosen": -281.0224304199219, "logps/rejected": -210.1667022705078, "loss": 0.5159, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -6.140867710113525, "rewards/margins": 6.682094097137451, "rewards/rejected": -12.822961807250977, "sft_loss": 1.0717185735702515, "step": 405 }, { "epoch": 0.6736035049288062, "grad_norm": 47.72028643830778, "learning_rate": 3.95567600628115e-07, "logits/chosen": 17.3284912109375, "logits/rejected": 17.72430419921875, "logps/chosen": -275.4824523925781, "logps/rejected": -210.34494018554688, "loss": 0.4746, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -6.135570049285889, "rewards/margins": 6.734328746795654, "rewards/rejected": -12.86989974975586, "sft_loss": 0.9496582746505737, "step": 410 }, { "epoch": 0.6818181818181818, "grad_norm": 13.659449625348234, "learning_rate": 3.9279004683515783e-07, "logits/chosen": 17.051794052124023, "logits/rejected": 18.201574325561523, "logps/chosen": -283.5098876953125, "logps/rejected": -217.13720703125, "loss": 0.4834, "rewards/accuracies": 1.0, "rewards/chosen": -6.5031657218933105, "rewards/margins": 6.981623649597168, "rewards/rejected": -13.484789848327637, "sft_loss": 1.008143663406372, "step": 415 }, { "epoch": 0.6900328587075575, "grad_norm": 10.688936022811054, "learning_rate": 3.8998608312572234e-07, "logits/chosen": 18.112707138061523, "logits/rejected": 18.169342041015625, "logps/chosen": -316.6014709472656, "logps/rejected": -224.16824340820312, "loss": 0.4278, "rewards/accuracies": 1.0, "rewards/chosen": -6.8555474281311035, "rewards/margins": 7.1582255363464355, "rewards/rejected": -14.013773918151855, "sft_loss": 0.9130622148513794, "step": 420 }, { "epoch": 0.6982475355969332, "grad_norm": 13.038331606353893, "learning_rate": 3.8715622811051753e-07, "logits/chosen": 17.96015739440918, "logits/rejected": 18.90926742553711, "logps/chosen": -330.01348876953125, "logps/rejected": -245.21107482910156, "loss": 0.4744, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.965524196624756, "rewards/margins": 7.482056617736816, "rewards/rejected": -15.44758129119873, "sft_loss": 0.9873117208480835, "step": 425 }, { "epoch": 0.7064622124863089, "grad_norm": 14.058645020755897, "learning_rate": 3.843010051890114e-07, "logits/chosen": 16.319496154785156, "logits/rejected": 16.970029830932617, "logps/chosen": -317.0173645019531, "logps/rejected": -243.7187042236328, "loss": 0.5166, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -7.986619472503662, "rewards/margins": 7.993711471557617, "rewards/rejected": -15.980331420898438, "sft_loss": 1.082601547241211, "step": 430 }, { "epoch": 0.7146768893756845, "grad_norm": 19.82457118000122, "learning_rate": 3.8142094245262615e-07, "logits/chosen": 17.59951400756836, "logits/rejected": 17.434412002563477, "logps/chosen": -294.1492919921875, "logps/rejected": -218.65521240234375, "loss": 0.5787, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -7.534255504608154, "rewards/margins": 6.933282852172852, "rewards/rejected": -14.467540740966797, "sft_loss": 1.660515308380127, "step": 435 }, { "epoch": 0.7228915662650602, "grad_norm": 11.251537042250078, "learning_rate": 3.785165725870637e-07, "logits/chosen": 17.26852798461914, "logits/rejected": 17.4658203125, "logps/chosen": -318.1449279785156, "logps/rejected": -243.87478637695312, "loss": 0.4501, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.651638507843018, "rewards/margins": 7.704569339752197, "rewards/rejected": -15.356207847595215, "sft_loss": 1.0064337253570557, "step": 440 }, { "epoch": 0.731106243154436, "grad_norm": 13.388962596712888, "learning_rate": 3.7558843277378203e-07, "logits/chosen": 17.070295333862305, "logits/rejected": 17.869474411010742, "logps/chosen": -280.3146057128906, "logps/rejected": -216.09710693359375, "loss": 0.4821, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -6.6985931396484375, "rewards/margins": 7.212075233459473, "rewards/rejected": -13.91066837310791, "sft_loss": 0.9864783883094788, "step": 445 }, { "epoch": 0.7393209200438116, "grad_norm": 14.813083085351893, "learning_rate": 3.726370645906407e-07, "logits/chosen": 16.521230697631836, "logits/rejected": 17.734365463256836, "logps/chosen": -294.2370300292969, "logps/rejected": -221.69178771972656, "loss": 0.4907, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.402077674865723, "rewards/margins": 6.9355292320251465, "rewards/rejected": -14.337605476379395, "sft_loss": 1.1839743852615356, "step": 450 }, { "epoch": 0.7475355969331873, "grad_norm": 12.059854940698536, "learning_rate": 3.6966301391173204e-07, "logits/chosen": 17.135530471801758, "logits/rejected": 19.162967681884766, "logps/chosen": -284.18438720703125, "logps/rejected": -233.11984252929688, "loss": 0.5102, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -7.429366111755371, "rewards/margins": 7.891510009765625, "rewards/rejected": -15.320878028869629, "sft_loss": 1.079641580581665, "step": 455 }, { "epoch": 0.755750273822563, "grad_norm": 22.260550575758643, "learning_rate": 3.6666683080641843e-07, "logits/chosen": 15.536272048950195, "logits/rejected": 16.60968780517578, "logps/chosen": -310.630859375, "logps/rejected": -241.0422821044922, "loss": 0.4597, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.208867073059082, "rewards/margins": 7.248252868652344, "rewards/rejected": -15.457121849060059, "sft_loss": 1.0596128702163696, "step": 460 }, { "epoch": 0.7639649507119387, "grad_norm": 12.19610863222244, "learning_rate": 3.636490694375937e-07, "logits/chosen": 17.03879165649414, "logits/rejected": 17.748197555541992, "logps/chosen": -308.9512023925781, "logps/rejected": -236.08970642089844, "loss": 0.4273, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.292715549468994, "rewards/margins": 8.177362442016602, "rewards/rejected": -15.470076560974121, "sft_loss": 1.0068012475967407, "step": 465 }, { "epoch": 0.7721796276013143, "grad_norm": 13.22565269945024, "learning_rate": 3.6061028795918734e-07, "logits/chosen": 17.87092399597168, "logits/rejected": 18.572694778442383, "logps/chosen": -314.8690490722656, "logps/rejected": -240.42343139648438, "loss": 0.5971, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.011045455932617, "rewards/margins": 7.702009677886963, "rewards/rejected": -15.713056564331055, "sft_loss": 1.0346639156341553, "step": 470 }, { "epoch": 0.78039430449069, "grad_norm": 23.36877627131626, "learning_rate": 3.5755104841292974e-07, "logits/chosen": 16.52726936340332, "logits/rejected": 18.124269485473633, "logps/chosen": -261.4451599121094, "logps/rejected": -216.3064727783203, "loss": 0.5188, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.079422950744629, "rewards/margins": 6.967349052429199, "rewards/rejected": -14.046771049499512, "sft_loss": 1.0945566892623901, "step": 475 }, { "epoch": 0.7886089813800657, "grad_norm": 12.346922738371601, "learning_rate": 3.544719166243998e-07, "logits/chosen": 17.161659240722656, "logits/rejected": 18.612253189086914, "logps/chosen": -295.6679992675781, "logps/rejected": -228.33984375, "loss": 0.4422, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -6.870236873626709, "rewards/margins": 7.495952129364014, "rewards/rejected": -14.36618709564209, "sft_loss": 0.9808112382888794, "step": 480 }, { "epoch": 0.7968236582694413, "grad_norm": 14.120403338012792, "learning_rate": 3.513734620983716e-07, "logits/chosen": 17.235340118408203, "logits/rejected": 18.787269592285156, "logps/chosen": -289.2434997558594, "logps/rejected": -240.0524444580078, "loss": 0.4205, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.261631965637207, "rewards/margins": 8.297459602355957, "rewards/rejected": -15.55909252166748, "sft_loss": 0.9492250084877014, "step": 485 }, { "epoch": 0.8050383351588171, "grad_norm": 14.978501832234636, "learning_rate": 3.482562579134809e-07, "logits/chosen": 15.85843276977539, "logits/rejected": 17.14594268798828, "logps/chosen": -256.8265380859375, "logps/rejected": -214.51412963867188, "loss": 0.466, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -7.612859725952148, "rewards/margins": 6.866227626800537, "rewards/rejected": -14.479085922241211, "sft_loss": 1.0439454317092896, "step": 490 }, { "epoch": 0.8132530120481928, "grad_norm": 13.645938681155632, "learning_rate": 3.4512088061623073e-07, "logits/chosen": 17.91840171813965, "logits/rejected": 18.105796813964844, "logps/chosen": -344.9450378417969, "logps/rejected": -257.0929870605469, "loss": 0.434, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.392577171325684, "rewards/margins": 8.40063762664795, "rewards/rejected": -16.793216705322266, "sft_loss": 1.052524447441101, "step": 495 }, { "epoch": 0.8214676889375685, "grad_norm": 11.793731298003246, "learning_rate": 3.419679101143555e-07, "logits/chosen": 16.95572280883789, "logits/rejected": 18.109580993652344, "logps/chosen": -257.8283996582031, "logps/rejected": -217.70062255859375, "loss": 0.4059, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.232075214385986, "rewards/margins": 7.084912300109863, "rewards/rejected": -14.316986083984375, "sft_loss": 1.070483684539795, "step": 500 }, { "epoch": 0.8296823658269441, "grad_norm": 18.160358009772516, "learning_rate": 3.387979295695632e-07, "logits/chosen": 17.402151107788086, "logits/rejected": 17.819072723388672, "logps/chosen": -284.08599853515625, "logps/rejected": -228.4375, "loss": 0.4832, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -7.799540042877197, "rewards/margins": 7.30112886428833, "rewards/rejected": -15.100667953491211, "sft_loss": 1.0201059579849243, "step": 505 }, { "epoch": 0.8378970427163198, "grad_norm": 24.681797914609845, "learning_rate": 3.356115252896764e-07, "logits/chosen": 16.481372833251953, "logits/rejected": 17.393707275390625, "logps/chosen": -318.48956298828125, "logps/rejected": -238.67076110839844, "loss": 0.4569, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.8467302322387695, "rewards/margins": 7.6555867195129395, "rewards/rejected": -15.502315521240234, "sft_loss": 1.1412904262542725, "step": 510 }, { "epoch": 0.8461117196056955, "grad_norm": 11.780066809990752, "learning_rate": 3.3240928662019043e-07, "logits/chosen": 14.776932716369629, "logits/rejected": 16.346778869628906, "logps/chosen": -313.47589111328125, "logps/rejected": -242.91506958007812, "loss": 0.4196, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -7.860676288604736, "rewards/margins": 8.015517234802246, "rewards/rejected": -15.876194953918457, "sft_loss": 1.059720516204834, "step": 515 }, { "epoch": 0.8543263964950711, "grad_norm": 14.114725277489077, "learning_rate": 3.291918058352706e-07, "logits/chosen": 16.27129554748535, "logits/rejected": 17.153289794921875, "logps/chosen": -306.25506591796875, "logps/rejected": -249.3704071044922, "loss": 0.5092, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.551309585571289, "rewards/margins": 7.208839416503906, "rewards/rejected": -16.760149002075195, "sft_loss": 1.1138993501663208, "step": 520 }, { "epoch": 0.8625410733844469, "grad_norm": 27.760604608400726, "learning_rate": 3.259596780282074e-07, "logits/chosen": 18.246183395385742, "logits/rejected": 18.89859390258789, "logps/chosen": -346.7146301269531, "logps/rejected": -260.1651916503906, "loss": 0.4395, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.043220520019531, "rewards/margins": 8.856348991394043, "rewards/rejected": -16.899568557739258, "sft_loss": 1.1765520572662354, "step": 525 }, { "epoch": 0.8707557502738226, "grad_norm": 15.402410015157315, "learning_rate": 3.2271350100134975e-07, "logits/chosen": 17.567943572998047, "logits/rejected": 17.768869400024414, "logps/chosen": -298.6788024902344, "logps/rejected": -236.3932647705078, "loss": 0.4193, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.671787738800049, "rewards/margins": 7.834874629974365, "rewards/rejected": -15.506662368774414, "sft_loss": 1.071178913116455, "step": 530 }, { "epoch": 0.8789704271631983, "grad_norm": 18.947114003342495, "learning_rate": 3.1945387515553843e-07, "logits/chosen": 17.647369384765625, "logits/rejected": 18.73533821105957, "logps/chosen": -310.0240478515625, "logps/rejected": -251.67193603515625, "loss": 0.441, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.7946672439575195, "rewards/margins": 9.0897798538208, "rewards/rejected": -16.88444709777832, "sft_loss": 1.0311574935913086, "step": 535 }, { "epoch": 0.8871851040525739, "grad_norm": 11.041389503496823, "learning_rate": 3.1618140337905764e-07, "logits/chosen": 17.451311111450195, "logits/rejected": 18.353700637817383, "logps/chosen": -297.8014831542969, "logps/rejected": -240.24606323242188, "loss": 0.4126, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.106889724731445, "rewards/margins": 7.919802188873291, "rewards/rejected": -16.02669334411621, "sft_loss": 1.1384520530700684, "step": 540 }, { "epoch": 0.8953997809419496, "grad_norm": 9.858801498232785, "learning_rate": 3.128966909361271e-07, "logits/chosen": 16.695926666259766, "logits/rejected": 18.67499351501465, "logps/chosen": -320.1283874511719, "logps/rejected": -254.82162475585938, "loss": 0.3699, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -8.027070999145508, "rewards/margins": 8.428789138793945, "rewards/rejected": -16.455860137939453, "sft_loss": 1.0505129098892212, "step": 545 }, { "epoch": 0.9036144578313253, "grad_norm": 16.103503361054337, "learning_rate": 3.096003453549549e-07, "logits/chosen": 17.31558609008789, "logits/rejected": 17.725223541259766, "logps/chosen": -345.3844299316406, "logps/rejected": -261.2863464355469, "loss": 0.4497, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.239363670349121, "rewards/margins": 9.474674224853516, "rewards/rejected": -17.714040756225586, "sft_loss": 1.020671010017395, "step": 550 }, { "epoch": 0.911829134720701, "grad_norm": 12.01821136380653, "learning_rate": 3.06292976315371e-07, "logits/chosen": 16.277523040771484, "logits/rejected": 17.34755516052246, "logps/chosen": -304.7778625488281, "logps/rejected": -241.48277282714844, "loss": 0.4126, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.762810230255127, "rewards/margins": 8.414238929748535, "rewards/rejected": -16.17704963684082, "sft_loss": 1.1222290992736816, "step": 555 }, { "epoch": 0.9200438116100766, "grad_norm": 11.41112495788056, "learning_rate": 3.0297519553606324e-07, "logits/chosen": 17.731529235839844, "logits/rejected": 18.088359832763672, "logps/chosen": -305.7876281738281, "logps/rejected": -246.57879638671875, "loss": 0.4401, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -7.948428630828857, "rewards/margins": 8.658875465393066, "rewards/rejected": -16.60730743408203, "sft_loss": 1.067797064781189, "step": 560 }, { "epoch": 0.9282584884994524, "grad_norm": 21.985722962679343, "learning_rate": 2.996476166614363e-07, "logits/chosen": 15.972024917602539, "logits/rejected": 16.38096809387207, "logps/chosen": -330.54388427734375, "logps/rejected": -267.4414367675781, "loss": 0.5027, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.933554649353027, "rewards/margins": 9.095858573913574, "rewards/rejected": -18.0294132232666, "sft_loss": 1.1063634157180786, "step": 565 }, { "epoch": 0.9364731653888281, "grad_norm": 10.308382930028264, "learning_rate": 2.963108551481142e-07, "logits/chosen": 17.77937889099121, "logits/rejected": 18.134130477905273, "logps/chosen": -339.63079833984375, "logps/rejected": -260.2466735839844, "loss": 0.4519, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.120518684387207, "rewards/margins": 9.045032501220703, "rewards/rejected": -17.165552139282227, "sft_loss": 1.072819471359253, "step": 570 }, { "epoch": 0.9446878422782037, "grad_norm": 15.634526225587425, "learning_rate": 2.929655281511075e-07, "logits/chosen": 16.544097900390625, "logits/rejected": 17.375316619873047, "logps/chosen": -319.2738037109375, "logps/rejected": -257.0357971191406, "loss": 0.4126, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.096101760864258, "rewards/margins": 8.786704063415527, "rewards/rejected": -16.8828067779541, "sft_loss": 1.0927081108093262, "step": 575 }, { "epoch": 0.9529025191675794, "grad_norm": 8.788361215925173, "learning_rate": 2.896122544096667e-07, "logits/chosen": 16.77577018737793, "logits/rejected": 17.813331604003906, "logps/chosen": -297.43548583984375, "logps/rejected": -240.00099182128906, "loss": 0.4592, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.802213191986084, "rewards/margins": 8.326114654541016, "rewards/rejected": -16.12833023071289, "sft_loss": 1.088619589805603, "step": 580 }, { "epoch": 0.9611171960569551, "grad_norm": 20.34248392425272, "learning_rate": 2.8625165413284307e-07, "logits/chosen": 16.004566192626953, "logits/rejected": 17.70891761779785, "logps/chosen": -328.6180725097656, "logps/rejected": -263.9577941894531, "loss": 0.5055, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.101932525634766, "rewards/margins": 9.129469871520996, "rewards/rejected": -17.23140525817871, "sft_loss": 1.0326135158538818, "step": 585 }, { "epoch": 0.9693318729463308, "grad_norm": 13.09030046415886, "learning_rate": 2.8288434888477626e-07, "logits/chosen": 18.028348922729492, "logits/rejected": 17.76748275756836, "logps/chosen": -287.28692626953125, "logps/rejected": -231.44729614257812, "loss": 0.3908, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.940645217895508, "rewards/margins": 7.91243839263916, "rewards/rejected": -15.853084564208984, "sft_loss": 1.0779129266738892, "step": 590 }, { "epoch": 0.9775465498357064, "grad_norm": 20.95262748964158, "learning_rate": 2.795109614697326e-07, "logits/chosen": 17.00741195678711, "logits/rejected": 18.209590911865234, "logps/chosen": -275.52880859375, "logps/rejected": -232.07052612304688, "loss": 0.4225, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -7.112081050872803, "rewards/margins": 8.281967163085938, "rewards/rejected": -15.394047737121582, "sft_loss": 1.0076452493667603, "step": 595 }, { "epoch": 0.9857612267250822, "grad_norm": 13.158949539443237, "learning_rate": 2.761321158169134e-07, "logits/chosen": 18.07162094116211, "logits/rejected": 19.637807846069336, "logps/chosen": -307.5865478515625, "logps/rejected": -249.9253387451172, "loss": 0.4339, "rewards/accuracies": 1.0, "rewards/chosen": -7.806564807891846, "rewards/margins": 8.727023124694824, "rewards/rejected": -16.53359031677246, "sft_loss": 1.06932532787323, "step": 600 }, { "epoch": 0.9939759036144579, "grad_norm": 13.610109275739992, "learning_rate": 2.727484368650553e-07, "logits/chosen": 15.262972831726074, "logits/rejected": 16.486412048339844, "logps/chosen": -305.6347351074219, "logps/rejected": -252.50546264648438, "loss": 0.4625, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.769743919372559, "rewards/margins": 8.321878433227539, "rewards/rejected": -17.091623306274414, "sft_loss": 1.1903793811798096, "step": 605 }, { "epoch": 1.0021905805038336, "grad_norm": 9.988555947945434, "learning_rate": 2.6936055044684425e-07, "logits/chosen": 17.130857467651367, "logits/rejected": 17.868497848510742, "logps/chosen": -278.2147216796875, "logps/rejected": -229.0367889404297, "loss": 0.4205, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.578054428100586, "rewards/margins": 7.36480188369751, "rewards/rejected": -15.942855834960938, "sft_loss": 1.0933631658554077, "step": 610 }, { "epoch": 1.0104052573932092, "grad_norm": 11.824094414048218, "learning_rate": 2.659690831731631e-07, "logits/chosen": 17.553348541259766, "logits/rejected": 18.92648696899414, "logps/chosen": -317.8105163574219, "logps/rejected": -263.2023620605469, "loss": 0.3385, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.87080192565918, "rewards/margins": 9.268915176391602, "rewards/rejected": -18.13971710205078, "sft_loss": 1.0447877645492554, "step": 615 }, { "epoch": 1.0186199342825848, "grad_norm": 15.737059861781074, "learning_rate": 2.6257466231719676e-07, "logits/chosen": 15.165780067443848, "logits/rejected": 16.453243255615234, "logps/chosen": -338.23773193359375, "logps/rejected": -283.7428283691406, "loss": 0.3123, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.231574058532715, "rewards/margins": 10.158638954162598, "rewards/rejected": -19.390214920043945, "sft_loss": 1.2299811840057373, "step": 620 }, { "epoch": 1.0268346111719606, "grad_norm": 11.900623330243908, "learning_rate": 2.591779156984137e-07, "logits/chosen": 16.764328002929688, "logits/rejected": 16.837923049926758, "logps/chosen": -322.6804504394531, "logps/rejected": -269.0111999511719, "loss": 0.3671, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -9.284835815429688, "rewards/margins": 9.551142692565918, "rewards/rejected": -18.83597755432129, "sft_loss": 1.0855733156204224, "step": 625 }, { "epoch": 1.0350492880613362, "grad_norm": 18.88025576733879, "learning_rate": 2.557794715664465e-07, "logits/chosen": 15.582106590270996, "logits/rejected": 16.574077606201172, "logps/chosen": -330.9181213378906, "logps/rejected": -281.83709716796875, "loss": 0.4083, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.849162101745605, "rewards/margins": 10.020377159118652, "rewards/rejected": -19.86954116821289, "sft_loss": 1.11058509349823, "step": 630 }, { "epoch": 1.0432639649507118, "grad_norm": 22.56812145625195, "learning_rate": 2.5237995848489417e-07, "logits/chosen": 16.257413864135742, "logits/rejected": 16.71412467956543, "logps/chosen": -332.62506103515625, "logps/rejected": -271.0566101074219, "loss": 0.4569, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.809398651123047, "rewards/margins": 10.076090812683105, "rewards/rejected": -18.88549041748047, "sft_loss": 1.1897673606872559, "step": 635 }, { "epoch": 1.0514786418400877, "grad_norm": 10.647617140402389, "learning_rate": 2.48980005215064e-07, "logits/chosen": 16.611183166503906, "logits/rejected": 17.89920425415039, "logps/chosen": -271.6616516113281, "logps/rejected": -231.13978576660156, "loss": 0.4444, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.08376407623291, "rewards/margins": 8.19190502166748, "rewards/rejected": -16.27566909790039, "sft_loss": 1.3704915046691895, "step": 640 }, { "epoch": 1.0596933187294633, "grad_norm": 19.247491047471033, "learning_rate": 2.45580240599679e-07, "logits/chosen": 16.49073028564453, "logits/rejected": 17.990306854248047, "logps/chosen": -358.3551025390625, "logps/rejected": -288.8968505859375, "loss": 0.3691, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.500346183776855, "rewards/margins": 10.489169120788574, "rewards/rejected": -18.98951530456543, "sft_loss": 1.2408881187438965, "step": 645 }, { "epoch": 1.067907995618839, "grad_norm": 13.44526599292449, "learning_rate": 2.421812934465696e-07, "logits/chosen": 17.065837860107422, "logits/rejected": 17.75263214111328, "logps/chosen": -308.9762878417969, "logps/rejected": -256.1690979003906, "loss": 0.3945, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.130703926086426, "rewards/margins": 9.368220329284668, "rewards/rejected": -17.49892234802246, "sft_loss": 1.1205145120620728, "step": 650 }, { "epoch": 1.0761226725082147, "grad_norm": 10.753673167776007, "learning_rate": 2.3878379241237134e-07, "logits/chosen": 16.457183837890625, "logits/rejected": 17.42021942138672, "logps/chosen": -312.5380554199219, "logps/rejected": -251.23977661132812, "loss": 0.3696, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.389382362365723, "rewards/margins": 8.924761772155762, "rewards/rejected": -17.314144134521484, "sft_loss": 1.2173506021499634, "step": 655 }, { "epoch": 1.0843373493975903, "grad_norm": 23.82423804722956, "learning_rate": 2.3538836588625077e-07, "logits/chosen": 15.20209789276123, "logits/rejected": 15.774395942687988, "logps/chosen": -297.73260498046875, "logps/rejected": -246.4073944091797, "loss": 0.4032, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -8.610387802124023, "rewards/margins": 8.70635986328125, "rewards/rejected": -17.31674575805664, "sft_loss": 1.3788336515426636, "step": 660 }, { "epoch": 1.0925520262869661, "grad_norm": 7.166962184073318, "learning_rate": 2.3199564187368153e-07, "logits/chosen": 15.194981575012207, "logits/rejected": 17.136018753051758, "logps/chosen": -328.6063537597656, "logps/rejected": -288.6786804199219, "loss": 0.366, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.775790214538574, "rewards/margins": 10.185883522033691, "rewards/rejected": -19.9616756439209, "sft_loss": 1.1107780933380127, "step": 665 }, { "epoch": 1.1007667031763417, "grad_norm": 13.216204703949911, "learning_rate": 2.2860624788029013e-07, "logits/chosen": 16.70530891418457, "logits/rejected": 17.76304817199707, "logps/chosen": -289.44476318359375, "logps/rejected": -245.6142120361328, "loss": 0.4321, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -8.357013702392578, "rewards/margins": 8.433321952819824, "rewards/rejected": -16.790334701538086, "sft_loss": 1.1908717155456543, "step": 670 }, { "epoch": 1.1089813800657173, "grad_norm": 26.032896310058877, "learning_rate": 2.2522081079579497e-07, "logits/chosen": 15.079482078552246, "logits/rejected": 16.43825340270996, "logps/chosen": -327.8377380371094, "logps/rejected": -283.44158935546875, "loss": 0.389, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.565984725952148, "rewards/margins": 10.288603782653809, "rewards/rejected": -19.854589462280273, "sft_loss": 1.4105526208877563, "step": 675 }, { "epoch": 1.1171960569550932, "grad_norm": 7.35341298145847, "learning_rate": 2.2183995677805967e-07, "logits/chosen": 15.347798347473145, "logits/rejected": 16.887144088745117, "logps/chosen": -343.8727722167969, "logps/rejected": -289.7627258300781, "loss": 0.3343, "rewards/accuracies": 1.0, "rewards/chosen": -9.85545539855957, "rewards/margins": 10.418365478515625, "rewards/rejected": -20.273822784423828, "sft_loss": 1.2016042470932007, "step": 680 }, { "epoch": 1.1254107338444688, "grad_norm": 13.095979555911432, "learning_rate": 2.1846431113728062e-07, "logits/chosen": 15.633400917053223, "logits/rejected": 17.45536994934082, "logps/chosen": -328.1496887207031, "logps/rejected": -281.7301025390625, "loss": 0.3718, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.103165626525879, "rewards/margins": 10.700647354125977, "rewards/rejected": -19.80381202697754, "sft_loss": 1.198488473892212, "step": 685 }, { "epoch": 1.1336254107338444, "grad_norm": 17.038758672339643, "learning_rate": 2.1509449822033205e-07, "logits/chosen": 16.633058547973633, "logits/rejected": 17.105684280395508, "logps/chosen": -340.9743957519531, "logps/rejected": -273.4366455078125, "loss": 0.3328, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.657751083374023, "rewards/margins": 9.9077787399292, "rewards/rejected": -18.565532684326172, "sft_loss": 1.1622406244277954, "step": 690 }, { "epoch": 1.1418400876232202, "grad_norm": 13.181289081121232, "learning_rate": 2.1173114129528957e-07, "logits/chosen": 16.235170364379883, "logits/rejected": 17.971439361572266, "logps/chosen": -289.8466491699219, "logps/rejected": -249.1376495361328, "loss": 0.3625, "rewards/accuracies": 0.9333333373069763, "rewards/chosen": -8.312536239624023, "rewards/margins": 9.367281913757324, "rewards/rejected": -17.679819107055664, "sft_loss": 1.2810382843017578, "step": 695 }, { "epoch": 1.1500547645125958, "grad_norm": 13.226133090678903, "learning_rate": 2.0837486243615226e-07, "logits/chosen": 16.742103576660156, "logits/rejected": 17.46257781982422, "logps/chosen": -364.11041259765625, "logps/rejected": -300.90618896484375, "loss": 0.3981, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.691442489624023, "rewards/margins": 11.124072074890137, "rewards/rejected": -20.81551742553711, "sft_loss": 1.0426690578460693, "step": 700 }, { "epoch": 1.1582694414019716, "grad_norm": 16.747134822775763, "learning_rate": 2.0502628240778653e-07, "logits/chosen": 17.3011474609375, "logits/rejected": 19.28099822998047, "logps/chosen": -329.4310607910156, "logps/rejected": -291.73443603515625, "loss": 0.3664, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.230022430419922, "rewards/margins": 11.087077140808105, "rewards/rejected": -20.31709861755371, "sft_loss": 1.0452929735183716, "step": 705 }, { "epoch": 1.1664841182913472, "grad_norm": 11.712195080946406, "learning_rate": 2.0168602055111173e-07, "logits/chosen": 16.063915252685547, "logits/rejected": 17.033220291137695, "logps/chosen": -324.21099853515625, "logps/rejected": -281.9880065917969, "loss": 0.3326, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.243894577026367, "rewards/margins": 10.908008575439453, "rewards/rejected": -20.15190315246582, "sft_loss": 1.1959102153778076, "step": 710 }, { "epoch": 1.1746987951807228, "grad_norm": 19.52291295321317, "learning_rate": 1.9835469466854887e-07, "logits/chosen": 14.572199821472168, "logits/rejected": 16.15847396850586, "logps/chosen": -322.0695495605469, "logps/rejected": -283.8585205078125, "loss": 0.3275, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.329268455505371, "rewards/margins": 10.466280937194824, "rewards/rejected": -19.795551300048828, "sft_loss": 1.1618155241012573, "step": 715 }, { "epoch": 1.1829134720700987, "grad_norm": 14.04137372253548, "learning_rate": 1.9503292090975454e-07, "logits/chosen": 16.88302993774414, "logits/rejected": 17.57504653930664, "logps/chosen": -292.8112487792969, "logps/rejected": -249.99221801757812, "loss": 0.3841, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.16586685180664, "rewards/margins": 9.12321662902832, "rewards/rejected": -18.28908348083496, "sft_loss": 1.2042182683944702, "step": 720 }, { "epoch": 1.1911281489594743, "grad_norm": 12.34681171872866, "learning_rate": 1.917213136576602e-07, "logits/chosen": 16.656551361083984, "logits/rejected": 17.51203155517578, "logps/chosen": -327.6507568359375, "logps/rejected": -284.38262939453125, "loss": 0.3207, "rewards/accuracies": 1.0, "rewards/chosen": -9.919057846069336, "rewards/margins": 10.376687049865723, "rewards/rejected": -20.295743942260742, "sft_loss": 1.18035089969635, "step": 725 }, { "epoch": 1.1993428258488499, "grad_norm": 10.050794300712155, "learning_rate": 1.8842048541483756e-07, "logits/chosen": 18.090221405029297, "logits/rejected": 18.187620162963867, "logps/chosen": -322.1310119628906, "logps/rejected": -253.3239288330078, "loss": 0.3945, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.852034568786621, "rewards/margins": 9.027352333068848, "rewards/rejected": -17.879384994506836, "sft_loss": 1.199164628982544, "step": 730 }, { "epoch": 1.2075575027382257, "grad_norm": 11.698463225887238, "learning_rate": 1.8513104669021314e-07, "logits/chosen": 15.768450736999512, "logits/rejected": 17.4649715423584, "logps/chosen": -315.5854797363281, "logps/rejected": -270.3199462890625, "loss": 0.3727, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.129364967346191, "rewards/margins": 9.496920585632324, "rewards/rejected": -18.626283645629883, "sft_loss": 1.1171187162399292, "step": 735 }, { "epoch": 1.2157721796276013, "grad_norm": 15.670433550127342, "learning_rate": 1.8185360588615057e-07, "logits/chosen": 17.373594284057617, "logits/rejected": 18.17388916015625, "logps/chosen": -349.6602478027344, "logps/rejected": -286.2644958496094, "loss": 0.3583, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.793112754821777, "rewards/margins": 10.82769775390625, "rewards/rejected": -19.620811462402344, "sft_loss": 1.1327273845672607, "step": 740 }, { "epoch": 1.223986856516977, "grad_norm": 17.513419996090132, "learning_rate": 1.7858876918592232e-07, "logits/chosen": 15.862748146057129, "logits/rejected": 17.21187400817871, "logps/chosen": -301.255859375, "logps/rejected": -256.63555908203125, "loss": 0.3533, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.769881248474121, "rewards/margins": 9.70648193359375, "rewards/rejected": -18.476362228393555, "sft_loss": 1.1204417943954468, "step": 745 }, { "epoch": 1.2322015334063527, "grad_norm": 19.125724690968823, "learning_rate": 1.7533714044159299e-07, "logits/chosen": 15.58492374420166, "logits/rejected": 16.52800941467285, "logps/chosen": -298.8733215332031, "logps/rejected": -268.4566650390625, "loss": 0.4265, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.785615921020508, "rewards/margins": 9.117348670959473, "rewards/rejected": -18.902963638305664, "sft_loss": 1.6064594984054565, "step": 750 }, { "epoch": 1.2404162102957283, "grad_norm": 17.968784609019274, "learning_rate": 1.7209932106233264e-07, "logits/chosen": 15.145374298095703, "logits/rejected": 17.433292388916016, "logps/chosen": -342.9417724609375, "logps/rejected": -296.39654541015625, "loss": 0.3766, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.673720359802246, "rewards/margins": 10.815841674804688, "rewards/rejected": -20.489561080932617, "sft_loss": 1.145885944366455, "step": 755 }, { "epoch": 1.248630887185104, "grad_norm": 13.684786311914898, "learning_rate": 1.688759099031824e-07, "logits/chosen": 15.70371150970459, "logits/rejected": 16.69938087463379, "logps/chosen": -361.2178955078125, "logps/rejected": -309.79150390625, "loss": 0.3508, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -10.506484031677246, "rewards/margins": 11.59350872039795, "rewards/rejected": -22.099994659423828, "sft_loss": 1.1850322484970093, "step": 760 }, { "epoch": 1.2568455640744798, "grad_norm": 14.244960468313039, "learning_rate": 1.656675031542925e-07, "logits/chosen": 17.195899963378906, "logits/rejected": 18.426219940185547, "logps/chosen": -363.3425598144531, "logps/rejected": -301.96063232421875, "loss": 0.3397, "rewards/accuracies": 1.0, "rewards/chosen": -9.729473114013672, "rewards/margins": 11.475974082946777, "rewards/rejected": -21.205448150634766, "sft_loss": 1.1794105768203735, "step": 765 }, { "epoch": 1.2650602409638554, "grad_norm": 8.622276211404598, "learning_rate": 1.6247469423065343e-07, "logits/chosen": 16.508113861083984, "logits/rejected": 17.097890853881836, "logps/chosen": -305.1572570800781, "logps/rejected": -249.677001953125, "loss": 0.3759, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.539449691772461, "rewards/margins": 8.976499557495117, "rewards/rejected": -17.515949249267578, "sft_loss": 1.196576714515686, "step": 770 }, { "epoch": 1.273274917853231, "grad_norm": 12.358403358119775, "learning_rate": 1.5929807366233977e-07, "logits/chosen": 16.241657257080078, "logits/rejected": 17.03815269470215, "logps/chosen": -369.39556884765625, "logps/rejected": -303.64337158203125, "loss": 0.3163, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.143651008605957, "rewards/margins": 11.741597175598145, "rewards/rejected": -20.885250091552734, "sft_loss": 1.1366469860076904, "step": 775 }, { "epoch": 1.2814895947426068, "grad_norm": 16.14061914284979, "learning_rate": 1.5613822898528794e-07, "logits/chosen": 16.795856475830078, "logits/rejected": 17.53175163269043, "logps/chosen": -345.46929931640625, "logps/rejected": -292.4604187011719, "loss": 0.3369, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.835062026977539, "rewards/margins": 11.170233726501465, "rewards/rejected": -21.005298614501953, "sft_loss": 1.3101640939712524, "step": 780 }, { "epoch": 1.2897042716319824, "grad_norm": 12.538981244658086, "learning_rate": 1.5299574463262794e-07, "logits/chosen": 15.523879051208496, "logits/rejected": 16.796798706054688, "logps/chosen": -377.0471496582031, "logps/rejected": -319.5939025878906, "loss": 0.4028, "rewards/accuracies": 1.0, "rewards/chosen": -10.438889503479004, "rewards/margins": 12.12649917602539, "rewards/rejected": -22.565387725830078, "sft_loss": 1.1697484254837036, "step": 785 }, { "epoch": 1.297918948521358, "grad_norm": 13.959611183566771, "learning_rate": 1.4987120182658877e-07, "logits/chosen": 15.972567558288574, "logits/rejected": 18.35633659362793, "logps/chosen": -330.76104736328125, "logps/rejected": -282.9498291015625, "loss": 0.3757, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.672747611999512, "rewards/margins": 10.330121994018555, "rewards/rejected": -20.002866744995117, "sft_loss": 1.1246702671051025, "step": 790 }, { "epoch": 1.3061336254107339, "grad_norm": 12.65020419545928, "learning_rate": 1.4676517847099745e-07, "logits/chosen": 16.62309455871582, "logits/rejected": 17.682994842529297, "logps/chosen": -309.1587829589844, "logps/rejected": -255.12290954589844, "loss": 0.3603, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.413141250610352, "rewards/margins": 9.33283805847168, "rewards/rejected": -17.7459774017334, "sft_loss": 1.1139575242996216, "step": 795 }, { "epoch": 1.3143483023001095, "grad_norm": 11.10720994563204, "learning_rate": 1.4367824904439242e-07, "logits/chosen": 17.087141036987305, "logits/rejected": 17.25540542602539, "logps/chosen": -336.4616394042969, "logps/rejected": -273.6061096191406, "loss": 0.371, "rewards/accuracies": 1.0, "rewards/chosen": -8.36069393157959, "rewards/margins": 10.413783073425293, "rewards/rejected": -18.774477005004883, "sft_loss": 1.0689451694488525, "step": 800 }, { "epoch": 1.3225629791894853, "grad_norm": 12.783247596774917, "learning_rate": 1.4061098449376985e-07, "logits/chosen": 15.60853099822998, "logits/rejected": 17.57704734802246, "logps/chosen": -362.2177734375, "logps/rejected": -308.759765625, "loss": 0.3288, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.05817699432373, "rewards/margins": 11.833967208862305, "rewards/rejected": -20.89214324951172, "sft_loss": 1.2039010524749756, "step": 805 }, { "epoch": 1.330777656078861, "grad_norm": 8.359077848319595, "learning_rate": 1.375639521289836e-07, "logits/chosen": 15.683825492858887, "logits/rejected": 16.602642059326172, "logps/chosen": -332.6221008300781, "logps/rejected": -278.2598571777344, "loss": 0.3387, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.19861888885498, "rewards/margins": 10.37482738494873, "rewards/rejected": -19.57344627380371, "sft_loss": 1.17559015750885, "step": 810 }, { "epoch": 1.3389923329682367, "grad_norm": 13.496245877040751, "learning_rate": 1.3453771551781756e-07, "logits/chosen": 16.44358253479004, "logits/rejected": 17.437644958496094, "logps/chosen": -307.6462707519531, "logps/rejected": -271.81683349609375, "loss": 0.3318, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.48726749420166, "rewards/margins": 10.40507698059082, "rewards/rejected": -18.892345428466797, "sft_loss": 1.1855844259262085, "step": 815 }, { "epoch": 1.3472070098576123, "grad_norm": 14.433148985359804, "learning_rate": 1.3153283438175034e-07, "logits/chosen": 15.872283935546875, "logits/rejected": 16.650604248046875, "logps/chosen": -324.4306945800781, "logps/rejected": -276.83929443359375, "loss": 0.3743, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.549914360046387, "rewards/margins": 10.544774055480957, "rewards/rejected": -20.094688415527344, "sft_loss": 1.1671338081359863, "step": 820 }, { "epoch": 1.355421686746988, "grad_norm": 22.22401225520154, "learning_rate": 1.2854986449243124e-07, "logits/chosen": 16.34712028503418, "logits/rejected": 16.94756317138672, "logps/chosen": -331.7503662109375, "logps/rejected": -286.41705322265625, "loss": 0.3285, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.810836791992188, "rewards/margins": 10.913623809814453, "rewards/rejected": -20.724462509155273, "sft_loss": 1.0781916379928589, "step": 825 }, { "epoch": 1.3636363636363638, "grad_norm": 11.973621147714551, "learning_rate": 1.2558935756888675e-07, "logits/chosen": 15.828746795654297, "logits/rejected": 16.91975212097168, "logps/chosen": -322.3880310058594, "logps/rejected": -279.2362365722656, "loss": 0.3542, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.630596160888672, "rewards/margins": 10.590496063232422, "rewards/rejected": -20.221094131469727, "sft_loss": 1.1420843601226807, "step": 830 }, { "epoch": 1.3718510405257394, "grad_norm": 12.897955971332296, "learning_rate": 1.226518611754767e-07, "logits/chosen": 17.223234176635742, "logits/rejected": 18.44441795349121, "logps/chosen": -314.6831970214844, "logps/rejected": -273.42083740234375, "loss": 0.3494, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.791934967041016, "rewards/margins": 10.404925346374512, "rewards/rejected": -19.196863174438477, "sft_loss": 1.11257803440094, "step": 835 }, { "epoch": 1.380065717415115, "grad_norm": 14.822041663775748, "learning_rate": 1.1973791862061871e-07, "logits/chosen": 15.981986045837402, "logits/rejected": 16.508832931518555, "logps/chosen": -357.4217529296875, "logps/rejected": -279.4723815917969, "loss": 0.4071, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -8.533044815063477, "rewards/margins": 10.845856666564941, "rewards/rejected": -19.378902435302734, "sft_loss": 1.071024775505066, "step": 840 }, { "epoch": 1.3882803943044908, "grad_norm": 12.166531109745293, "learning_rate": 1.1684806885630003e-07, "logits/chosen": 17.19085693359375, "logits/rejected": 18.22423553466797, "logps/chosen": -336.6310729980469, "logps/rejected": -288.2579040527344, "loss": 0.3543, "rewards/accuracies": 1.0, "rewards/chosen": -8.727254867553711, "rewards/margins": 11.067011833190918, "rewards/rejected": -19.794267654418945, "sft_loss": 1.0941708087921143, "step": 845 }, { "epoch": 1.3964950711938664, "grad_norm": 19.61480809183216, "learning_rate": 1.1398284637839486e-07, "logits/chosen": 17.393543243408203, "logits/rejected": 17.97818946838379, "logps/chosen": -290.88043212890625, "logps/rejected": -248.78334045410156, "loss": 0.3532, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.74341869354248, "rewards/margins": 8.96402359008789, "rewards/rejected": -17.707439422607422, "sft_loss": 1.3463881015777588, "step": 850 }, { "epoch": 1.404709748083242, "grad_norm": 13.04687226615894, "learning_rate": 1.1114278112780601e-07, "logits/chosen": 16.697458267211914, "logits/rejected": 17.817760467529297, "logps/chosen": -376.94256591796875, "logps/rejected": -319.7321472167969, "loss": 0.308, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -10.05405044555664, "rewards/margins": 12.681156158447266, "rewards/rejected": -22.735204696655273, "sft_loss": 1.1224801540374756, "step": 855 }, { "epoch": 1.4129244249726178, "grad_norm": 13.443707852848624, "learning_rate": 1.08328398392449e-07, "logits/chosen": 17.408639907836914, "logits/rejected": 17.620332717895508, "logps/chosen": -365.28131103515625, "logps/rejected": -308.3528137207031, "loss": 0.3755, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -10.822293281555176, "rewards/margins": 11.438949584960938, "rewards/rejected": -22.261241912841797, "sft_loss": 1.178871750831604, "step": 860 }, { "epoch": 1.4211391018619934, "grad_norm": 21.58859732751979, "learning_rate": 1.0554021871009677e-07, "logits/chosen": 16.947927474975586, "logits/rejected": 17.420812606811523, "logps/chosen": -340.0753479003906, "logps/rejected": -297.9937438964844, "loss": 0.3588, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.554227828979492, "rewards/margins": 12.149679183959961, "rewards/rejected": -21.70391082763672, "sft_loss": 1.3246734142303467, "step": 865 }, { "epoch": 1.429353778751369, "grad_norm": 13.8734601142875, "learning_rate": 1.0277875777210299e-07, "logits/chosen": 14.887709617614746, "logits/rejected": 15.843902587890625, "logps/chosen": -324.3350830078125, "logps/rejected": -275.6741943359375, "loss": 0.3712, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.113089561462402, "rewards/margins": 10.9337158203125, "rewards/rejected": -20.046804428100586, "sft_loss": 1.2739402055740356, "step": 870 }, { "epoch": 1.4375684556407449, "grad_norm": 13.714339626163223, "learning_rate": 1.0004452632802158e-07, "logits/chosen": 17.476552963256836, "logits/rejected": 17.923315048217773, "logps/chosen": -338.1813049316406, "logps/rejected": -277.5501403808594, "loss": 0.3129, "rewards/accuracies": 1.0, "rewards/chosen": -8.576931953430176, "rewards/margins": 10.827016830444336, "rewards/rejected": -19.403947830200195, "sft_loss": 1.1658498048782349, "step": 875 }, { "epoch": 1.4457831325301205, "grad_norm": 13.805309365020234, "learning_rate": 9.733803009114044e-08, "logits/chosen": 16.891300201416016, "logits/rejected": 17.32049560546875, "logps/chosen": -322.0257263183594, "logps/rejected": -274.27691650390625, "loss": 0.316, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.144042015075684, "rewards/margins": 10.516057014465332, "rewards/rejected": -18.660099029541016, "sft_loss": 1.110759973526001, "step": 880 }, { "epoch": 1.453997809419496, "grad_norm": 29.77032111690104, "learning_rate": 9.465976964494682e-08, "logits/chosen": 16.620283126831055, "logits/rejected": 17.72939682006836, "logps/chosen": -300.1767578125, "logps/rejected": -261.1438903808594, "loss": 0.361, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.963627815246582, "rewards/margins": 9.891008377075195, "rewards/rejected": -18.854639053344727, "sft_loss": 1.2920080423355103, "step": 885 }, { "epoch": 1.462212486308872, "grad_norm": 9.782780560332286, "learning_rate": 9.201024035054053e-08, "logits/chosen": 17.15985107421875, "logits/rejected": 17.535512924194336, "logps/chosen": -286.6101379394531, "logps/rejected": -247.57127380371094, "loss": 0.3835, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.331561088562012, "rewards/margins": 9.115301132202148, "rewards/rejected": -18.446863174438477, "sft_loss": 1.3567354679107666, "step": 890 }, { "epoch": 1.4704271631982475, "grad_norm": 8.768405187815805, "learning_rate": 8.938993225501495e-08, "logits/chosen": 17.89764976501465, "logits/rejected": 18.452497482299805, "logps/chosen": -351.6549987792969, "logps/rejected": -302.9189453125, "loss": 0.3592, "rewards/accuracies": 1.0, "rewards/chosen": -9.618634223937988, "rewards/margins": 11.819962501525879, "rewards/rejected": -21.4385986328125, "sft_loss": 1.0768134593963623, "step": 895 }, { "epoch": 1.4786418400876231, "grad_norm": 21.82788195022886, "learning_rate": 8.679933000081879e-08, "logits/chosen": 15.745450019836426, "logits/rejected": 17.15949249267578, "logps/chosen": -307.5598449707031, "logps/rejected": -271.531494140625, "loss": 0.3801, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.928607940673828, "rewards/margins": 10.275431632995605, "rewards/rejected": -19.204038619995117, "sft_loss": 1.1987248659133911, "step": 900 }, { "epoch": 1.486856516976999, "grad_norm": 12.077209434939249, "learning_rate": 8.423891273611855e-08, "logits/chosen": 16.016569137573242, "logits/rejected": 16.249284744262695, "logps/chosen": -311.76934814453125, "logps/rejected": -261.8121643066406, "loss": 0.3799, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.412449836730957, "rewards/margins": 10.322086334228516, "rewards/rejected": -18.734539031982422, "sft_loss": 1.2180228233337402, "step": 905 }, { "epoch": 1.4950711938663745, "grad_norm": 20.15671717033895, "learning_rate": 8.170915402617739e-08, "logits/chosen": 15.889266014099121, "logits/rejected": 17.218164443969727, "logps/chosen": -335.0419921875, "logps/rejected": -293.2705078125, "loss": 0.4051, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.390657424926758, "rewards/margins": 11.274243354797363, "rewards/rejected": -20.664899826049805, "sft_loss": 1.1832726001739502, "step": 910 }, { "epoch": 1.5032858707557502, "grad_norm": 11.069682914043863, "learning_rate": 7.921052176576643e-08, "logits/chosen": 17.052453994750977, "logits/rejected": 17.67256736755371, "logps/chosen": -305.6400146484375, "logps/rejected": -266.4335632324219, "loss": 0.3165, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.719382286071777, "rewards/margins": 10.110651016235352, "rewards/rejected": -18.830034255981445, "sft_loss": 1.0706188678741455, "step": 915 }, { "epoch": 1.511500547645126, "grad_norm": 27.258481926608287, "learning_rate": 7.674347809262377e-08, "logits/chosen": 16.615238189697266, "logits/rejected": 17.932260513305664, "logps/chosen": -288.8174743652344, "logps/rejected": -250.63177490234375, "loss": 0.3758, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -7.821852684020996, "rewards/margins": 9.426899909973145, "rewards/rejected": -17.24875259399414, "sft_loss": 1.12588369846344, "step": 920 }, { "epoch": 1.5197152245345018, "grad_norm": 8.415837096456798, "learning_rate": 7.430847930198009e-08, "logits/chosen": 16.921852111816406, "logits/rejected": 17.39198875427246, "logps/chosen": -329.8725891113281, "logps/rejected": -274.1763000488281, "loss": 0.3708, "rewards/accuracies": 1.0, "rewards/chosen": -7.946272850036621, "rewards/margins": 11.179486274719238, "rewards/rejected": -19.12575912475586, "sft_loss": 1.286713719367981, "step": 925 }, { "epoch": 1.5279299014238772, "grad_norm": 7.8387923698583295, "learning_rate": 7.190597576216384e-08, "logits/chosen": 15.69840145111084, "logits/rejected": 17.983213424682617, "logps/chosen": -329.1253967285156, "logps/rejected": -290.71051025390625, "loss": 0.3144, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.661620140075684, "rewards/margins": 11.160269737243652, "rewards/rejected": -19.821889877319336, "sft_loss": 1.1312789916992188, "step": 930 }, { "epoch": 1.536144578313253, "grad_norm": 14.005325625629936, "learning_rate": 6.953641183130224e-08, "logits/chosen": 16.529827117919922, "logits/rejected": 16.534809112548828, "logps/chosen": -333.02813720703125, "logps/rejected": -275.6182556152344, "loss": 0.3675, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.715841293334961, "rewards/margins": 9.975454330444336, "rewards/rejected": -19.691295623779297, "sft_loss": 1.2341707944869995, "step": 935 }, { "epoch": 1.5443592552026288, "grad_norm": 11.238181780972436, "learning_rate": 6.720022577513507e-08, "logits/chosen": 15.408208847045898, "logits/rejected": 16.01373291015625, "logps/chosen": -350.6366882324219, "logps/rejected": -291.2669677734375, "loss": 0.3381, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -10.044300079345703, "rewards/margins": 10.780247688293457, "rewards/rejected": -20.82455062866211, "sft_loss": 1.26254141330719, "step": 940 }, { "epoch": 1.5525739320920042, "grad_norm": 11.413642178268471, "learning_rate": 6.489784968595444e-08, "logits/chosen": 15.467609405517578, "logits/rejected": 16.952180862426758, "logps/chosen": -346.5306091308594, "logps/rejected": -312.6312561035156, "loss": 0.3402, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.993128776550293, "rewards/margins": 12.808844566345215, "rewards/rejected": -22.801973342895508, "sft_loss": 1.1826088428497314, "step": 945 }, { "epoch": 1.56078860898138, "grad_norm": 22.79199458890795, "learning_rate": 6.262970940268652e-08, "logits/chosen": 16.051044464111328, "logits/rejected": 17.10271453857422, "logps/chosen": -313.6996765136719, "logps/rejected": -278.2881774902344, "loss": 0.333, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.396943092346191, "rewards/margins": 10.23829460144043, "rewards/rejected": -19.635236740112305, "sft_loss": 1.1279245615005493, "step": 950 }, { "epoch": 1.5690032858707559, "grad_norm": 11.668850401054987, "learning_rate": 6.039622443213008e-08, "logits/chosen": 16.13634490966797, "logits/rejected": 17.919300079345703, "logps/chosen": -325.7288513183594, "logps/rejected": -289.1236267089844, "loss": 0.3346, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.695369720458984, "rewards/margins": 11.087947845458984, "rewards/rejected": -20.78331756591797, "sft_loss": 1.1951278448104858, "step": 955 }, { "epoch": 1.5772179627601315, "grad_norm": 13.415709297062323, "learning_rate": 5.8197807871366e-08, "logits/chosen": 15.244779586791992, "logits/rejected": 16.526262283325195, "logps/chosen": -370.6669616699219, "logps/rejected": -322.87847900390625, "loss": 0.3428, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.789223670959473, "rewards/margins": 12.587509155273438, "rewards/rejected": -22.376733779907227, "sft_loss": 2.1045873165130615, "step": 960 }, { "epoch": 1.585432639649507, "grad_norm": 13.58873079620651, "learning_rate": 5.6034866331352376e-08, "logits/chosen": 15.409506797790527, "logits/rejected": 16.128753662109375, "logps/chosen": -322.9807434082031, "logps/rejected": -271.06378173828125, "loss": 0.347, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.693291664123535, "rewards/margins": 10.159637451171875, "rewards/rejected": -19.852930068969727, "sft_loss": 1.1238617897033691, "step": 965 }, { "epoch": 1.593647316538883, "grad_norm": 16.504268173121613, "learning_rate": 5.390779986171934e-08, "logits/chosen": 15.72015380859375, "logits/rejected": 17.518657684326172, "logps/chosen": -337.39349365234375, "logps/rejected": -302.06109619140625, "loss": 0.3214, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.937000274658203, "rewards/margins": 11.78524112701416, "rewards/rejected": -20.72224235534668, "sft_loss": 1.129492998123169, "step": 970 }, { "epoch": 1.6018619934282585, "grad_norm": 14.941336561605484, "learning_rate": 5.1817001876777314e-08, "logits/chosen": 15.710195541381836, "logits/rejected": 16.9680233001709, "logps/chosen": -324.51251220703125, "logps/rejected": -286.7372741699219, "loss": 0.3363, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.67973518371582, "rewards/margins": 11.069150924682617, "rewards/rejected": -19.74888801574707, "sft_loss": 1.168811559677124, "step": 975 }, { "epoch": 1.6100766703176341, "grad_norm": 11.368129107493246, "learning_rate": 4.9762859082752464e-08, "logits/chosen": 17.196496963500977, "logits/rejected": 18.05078125, "logps/chosen": -340.8441162109375, "logps/rejected": -291.5513610839844, "loss": 0.332, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.99682903289795, "rewards/margins": 11.247660636901855, "rewards/rejected": -20.244489669799805, "sft_loss": 1.040310025215149, "step": 980 }, { "epoch": 1.61829134720701, "grad_norm": 17.375398637805176, "learning_rate": 4.774575140626316e-08, "logits/chosen": 15.612386703491211, "logits/rejected": 17.049909591674805, "logps/chosen": -315.4412841796875, "logps/rejected": -273.022216796875, "loss": 0.2981, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.78695011138916, "rewards/margins": 10.45934772491455, "rewards/rejected": -19.24629783630371, "sft_loss": 1.122090458869934, "step": 985 }, { "epoch": 1.6265060240963856, "grad_norm": 18.391059447329464, "learning_rate": 4.5766051924049975e-08, "logits/chosen": 19.033084869384766, "logits/rejected": 19.09506607055664, "logps/chosen": -344.99224853515625, "logps/rejected": -281.4374084472656, "loss": 0.4023, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.408563613891602, "rewards/margins": 11.344144821166992, "rewards/rejected": -19.752708435058594, "sft_loss": 1.2188175916671753, "step": 990 }, { "epoch": 1.6347207009857612, "grad_norm": 4.7857387318547255, "learning_rate": 4.3824126793972934e-08, "logits/chosen": 15.44153118133545, "logits/rejected": 16.74248504638672, "logps/chosen": -348.91326904296875, "logps/rejected": -291.33905029296875, "loss": 0.3604, "rewards/accuracies": 1.0, "rewards/chosen": -8.16443157196045, "rewards/margins": 12.226564407348633, "rewards/rejected": -20.390995025634766, "sft_loss": 1.1215661764144897, "step": 995 }, { "epoch": 1.642935377875137, "grad_norm": 8.425137005894317, "learning_rate": 4.192033518728819e-08, "logits/chosen": 16.596193313598633, "logits/rejected": 16.706600189208984, "logps/chosen": -337.87109375, "logps/rejected": -279.28277587890625, "loss": 0.3546, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.45007038116455, "rewards/margins": 11.166516304016113, "rewards/rejected": -19.616586685180664, "sft_loss": 1.3097057342529297, "step": 1000 }, { "epoch": 1.6511500547645126, "grad_norm": 10.216638281397124, "learning_rate": 4.0055029222217125e-08, "logits/chosen": 16.447404861450195, "logits/rejected": 16.960412979125977, "logps/chosen": -313.47698974609375, "logps/rejected": -269.1077880859375, "loss": 0.3193, "rewards/accuracies": 1.0, "rewards/chosen": -9.434925079345703, "rewards/margins": 10.487255096435547, "rewards/rejected": -19.92218017578125, "sft_loss": 1.099938988685608, "step": 1005 }, { "epoch": 1.6593647316538882, "grad_norm": 10.435302161258754, "learning_rate": 3.8228553898819904e-08, "logits/chosen": 17.95560073852539, "logits/rejected": 19.009355545043945, "logps/chosen": -340.97222900390625, "logps/rejected": -298.7887268066406, "loss": 0.3949, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.733698844909668, "rewards/margins": 11.381470680236816, "rewards/rejected": -21.115171432495117, "sft_loss": 1.1103211641311646, "step": 1010 }, { "epoch": 1.667579408543264, "grad_norm": 11.907900026431262, "learning_rate": 3.6441247035185416e-08, "logits/chosen": 16.81635284423828, "logits/rejected": 17.959022521972656, "logps/chosen": -361.63812255859375, "logps/rejected": -303.6453552246094, "loss": 0.3353, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.310194969177246, "rewards/margins": 11.503978729248047, "rewards/rejected": -20.814172744750977, "sft_loss": 1.1227651834487915, "step": 1015 }, { "epoch": 1.6757940854326396, "grad_norm": 10.027831680647658, "learning_rate": 3.4693439204949855e-08, "logits/chosen": 15.768338203430176, "logits/rejected": 17.33998680114746, "logps/chosen": -292.4506530761719, "logps/rejected": -263.465087890625, "loss": 0.3701, "rewards/accuracies": 1.0, "rewards/chosen": -9.007841110229492, "rewards/margins": 10.145407676696777, "rewards/rejected": -19.153249740600586, "sft_loss": 1.1951355934143066, "step": 1020 }, { "epoch": 1.6840087623220152, "grad_norm": 19.083513373797096, "learning_rate": 3.298545367615493e-08, "logits/chosen": 17.174707412719727, "logits/rejected": 17.86057472229004, "logps/chosen": -288.18280029296875, "logps/rejected": -254.59439086914062, "loss": 0.4406, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -8.990920066833496, "rewards/margins": 9.506522178649902, "rewards/rejected": -18.4974422454834, "sft_loss": 1.2072545289993286, "step": 1025 }, { "epoch": 1.692223439211391, "grad_norm": 12.02229671131509, "learning_rate": 3.13176063514575e-08, "logits/chosen": 17.051944732666016, "logits/rejected": 17.904996871948242, "logps/chosen": -359.4859619140625, "logps/rejected": -295.76361083984375, "loss": 0.3592, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.758131980895996, "rewards/margins": 11.989044189453125, "rewards/rejected": -20.747175216674805, "sft_loss": 1.2417008876800537, "step": 1030 }, { "epoch": 1.7004381161007667, "grad_norm": 14.595666831687033, "learning_rate": 2.96902057097011e-08, "logits/chosen": 16.427305221557617, "logits/rejected": 17.641498565673828, "logps/chosen": -320.2253723144531, "logps/rejected": -269.6889953613281, "loss": 0.3571, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.658390998840332, "rewards/margins": 10.258994102478027, "rewards/rejected": -18.91738510131836, "sft_loss": 1.332204818725586, "step": 1035 }, { "epoch": 1.7086527929901423, "grad_norm": 13.068829943035729, "learning_rate": 2.8103552748861475e-08, "logits/chosen": 15.954511642456055, "logits/rejected": 16.74055290222168, "logps/chosen": -331.81707763671875, "logps/rejected": -280.3811950683594, "loss": 0.335, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.056927680969238, "rewards/margins": 10.4561767578125, "rewards/rejected": -19.51310157775879, "sft_loss": 1.1305441856384277, "step": 1040 }, { "epoch": 1.716867469879518, "grad_norm": 14.364271003384296, "learning_rate": 2.65579409303745e-08, "logits/chosen": 17.06740951538086, "logits/rejected": 17.10344886779785, "logps/chosen": -364.3813171386719, "logps/rejected": -293.8392333984375, "loss": 0.3632, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.905304908752441, "rewards/margins": 11.902792930603027, "rewards/rejected": -20.808101654052734, "sft_loss": 1.149087905883789, "step": 1045 }, { "epoch": 1.7250821467688937, "grad_norm": 21.44861485257077, "learning_rate": 2.505365612485874e-08, "logits/chosen": 14.690909385681152, "logits/rejected": 15.39016056060791, "logps/chosen": -310.1071472167969, "logps/rejected": -257.1431884765625, "loss": 0.3935, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -9.03943157196045, "rewards/margins": 9.36133098602295, "rewards/rejected": -18.4007625579834, "sft_loss": 1.492135763168335, "step": 1050 }, { "epoch": 1.7332968236582693, "grad_norm": 10.165639822250112, "learning_rate": 2.3590976559242275e-08, "logits/chosen": 16.5327091217041, "logits/rejected": 17.50569725036621, "logps/chosen": -327.5498962402344, "logps/rejected": -288.2828674316406, "loss": 0.3287, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.587510108947754, "rewards/margins": 10.448949813842773, "rewards/rejected": -20.036460876464844, "sft_loss": 1.2338570356369019, "step": 1055 }, { "epoch": 1.7415115005476451, "grad_norm": 9.87040734328389, "learning_rate": 2.21701727653025e-08, "logits/chosen": 15.633200645446777, "logits/rejected": 16.086591720581055, "logps/chosen": -352.7239990234375, "logps/rejected": -294.7661437988281, "loss": 0.3506, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.915904998779297, "rewards/margins": 11.233012199401855, "rewards/rejected": -21.14891815185547, "sft_loss": 1.2049648761749268, "step": 1060 }, { "epoch": 1.749726177437021, "grad_norm": 12.834737803326664, "learning_rate": 2.0791507529629522e-08, "logits/chosen": 16.351898193359375, "logits/rejected": 17.47950553894043, "logps/chosen": -281.7489318847656, "logps/rejected": -243.97483825683594, "loss": 0.3882, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.137645721435547, "rewards/margins": 8.15616226196289, "rewards/rejected": -17.29380989074707, "sft_loss": 1.157172679901123, "step": 1065 }, { "epoch": 1.7579408543263964, "grad_norm": 17.205116768747743, "learning_rate": 1.945523584502262e-08, "logits/chosen": 17.508634567260742, "logits/rejected": 17.94008445739746, "logps/chosen": -381.6427917480469, "logps/rejected": -311.2584228515625, "loss": 0.277, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.13695240020752, "rewards/margins": 12.883686065673828, "rewards/rejected": -22.02063751220703, "sft_loss": 1.055487036705017, "step": 1070 }, { "epoch": 1.7661555312157722, "grad_norm": 20.851515512896743, "learning_rate": 1.8161604863327072e-08, "logits/chosen": 15.488776206970215, "logits/rejected": 16.223703384399414, "logps/chosen": -325.0180358886719, "logps/rejected": -262.5523376464844, "loss": 0.3441, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.707998275756836, "rewards/margins": 9.961923599243164, "rewards/rejected": -18.669921875, "sft_loss": 1.1589832305908203, "step": 1075 }, { "epoch": 1.774370208105148, "grad_norm": 17.972861201786518, "learning_rate": 1.691085384972235e-08, "logits/chosen": 14.909817695617676, "logits/rejected": 15.637177467346191, "logps/chosen": -278.62322998046875, "logps/rejected": -248.10516357421875, "loss": 0.3273, "rewards/accuracies": 1.0, "rewards/chosen": -8.496481895446777, "rewards/margins": 9.37110424041748, "rewards/rejected": -17.867582321166992, "sft_loss": 1.2477223873138428, "step": 1080 }, { "epoch": 1.7825848849945234, "grad_norm": 14.287110123465489, "learning_rate": 1.570321413846845e-08, "logits/chosen": 15.394953727722168, "logits/rejected": 17.261220932006836, "logps/chosen": -303.1915588378906, "logps/rejected": -277.51458740234375, "loss": 0.2832, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.813228607177734, "rewards/margins": 10.778543472290039, "rewards/rejected": -19.59177017211914, "sft_loss": 1.2371479272842407, "step": 1085 }, { "epoch": 1.7907995618838992, "grad_norm": 25.07441398024989, "learning_rate": 1.4538909090118846e-08, "logits/chosen": 16.854040145874023, "logits/rejected": 16.584880828857422, "logps/chosen": -322.2169494628906, "logps/rejected": -270.48895263671875, "loss": 0.3503, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.387935638427734, "rewards/margins": 10.15947437286377, "rewards/rejected": -19.547407150268555, "sft_loss": 1.2250884771347046, "step": 1090 }, { "epoch": 1.799014238773275, "grad_norm": 12.397083886048673, "learning_rate": 1.3418154050208936e-08, "logits/chosen": 15.345029830932617, "logits/rejected": 16.834665298461914, "logps/chosen": -297.9521484375, "logps/rejected": -269.69659423828125, "loss": 0.3526, "rewards/accuracies": 0.9066667556762695, "rewards/chosen": -8.90621280670166, "rewards/margins": 10.480603218078613, "rewards/rejected": -19.386816024780273, "sft_loss": 1.1300204992294312, "step": 1095 }, { "epoch": 1.8072289156626506, "grad_norm": 11.032455088728524, "learning_rate": 1.2341156309426447e-08, "logits/chosen": 14.950087547302246, "logits/rejected": 16.54684829711914, "logps/chosen": -332.92596435546875, "logps/rejected": -291.2406005859375, "loss": 0.289, "rewards/accuracies": 1.0, "rewards/chosen": -9.32970905303955, "rewards/margins": 11.054911613464355, "rewards/rejected": -20.384618759155273, "sft_loss": 1.0616583824157715, "step": 1100 }, { "epoch": 1.8154435925520263, "grad_norm": 12.222936203639813, "learning_rate": 1.130811506527149e-08, "logits/chosen": 16.257431030273438, "logits/rejected": 17.80784034729004, "logps/chosen": -374.90716552734375, "logps/rejected": -309.1212158203125, "loss": 0.2761, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.983473777770996, "rewards/margins": 11.696609497070312, "rewards/rejected": -21.680082321166992, "sft_loss": 1.1142687797546387, "step": 1105 }, { "epoch": 1.823658269441402, "grad_norm": 8.603199609340459, "learning_rate": 1.0319221385213934e-08, "logits/chosen": 15.376051902770996, "logits/rejected": 16.714609146118164, "logps/chosen": -314.27996826171875, "logps/rejected": -280.79901123046875, "loss": 0.3201, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.61314868927002, "rewards/margins": 9.986039161682129, "rewards/rejected": -19.59918785095215, "sft_loss": 1.2583483457565308, "step": 1110 }, { "epoch": 1.8318729463307777, "grad_norm": 19.393082549696974, "learning_rate": 9.374658171354411e-09, "logits/chosen": 16.10991859436035, "logits/rejected": 17.19182586669922, "logps/chosen": -335.8138122558594, "logps/rejected": -285.86859130859375, "loss": 0.3573, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -9.598699569702148, "rewards/margins": 11.110600471496582, "rewards/rejected": -20.709299087524414, "sft_loss": 1.2626595497131348, "step": 1115 }, { "epoch": 1.8400876232201533, "grad_norm": 11.87556668069316, "learning_rate": 8.474600126594983e-09, "logits/chosen": 16.182172775268555, "logits/rejected": 17.73249053955078, "logps/chosen": -327.0877685546875, "logps/rejected": -281.38848876953125, "loss": 0.3247, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.166089057922363, "rewards/margins": 11.149109840393066, "rewards/rejected": -20.315196990966797, "sft_loss": 1.3075504302978516, "step": 1120 }, { "epoch": 1.8483023001095291, "grad_norm": 14.892979384936938, "learning_rate": 7.619213722327184e-09, "logits/chosen": 16.07329750061035, "logits/rejected": 16.353158950805664, "logps/chosen": -328.3527526855469, "logps/rejected": -281.48565673828125, "loss": 0.3187, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.420902252197266, "rewards/margins": 10.730939865112305, "rewards/rejected": -20.151844024658203, "sft_loss": 1.2091686725616455, "step": 1125 }, { "epoch": 1.8565169769989047, "grad_norm": 12.380319924456133, "learning_rate": 6.808657167641896e-09, "logits/chosen": 15.801959037780762, "logits/rejected": 16.7104434967041, "logps/chosen": -357.0127258300781, "logps/rejected": -303.44989013671875, "loss": 0.3863, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.63782024383545, "rewards/margins": 12.076054573059082, "rewards/rejected": -21.71387481689453, "sft_loss": 1.1681187152862549, "step": 1130 }, { "epoch": 1.8647316538882803, "grad_norm": 15.785691804360567, "learning_rate": 6.043080380067539e-09, "logits/chosen": 15.678844451904297, "logits/rejected": 16.41909408569336, "logps/chosen": -383.7453918457031, "logps/rejected": -308.8125915527344, "loss": 0.3156, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.041726112365723, "rewards/margins": 12.560318946838379, "rewards/rejected": -21.602046966552734, "sft_loss": 1.186676263809204, "step": 1135 }, { "epoch": 1.8729463307776562, "grad_norm": 19.758267623181617, "learning_rate": 5.322624957841998e-09, "logits/chosen": 16.686138153076172, "logits/rejected": 17.78066062927246, "logps/chosen": -342.8313293457031, "logps/rejected": -297.6686096191406, "loss": 0.38, "rewards/accuracies": 1.0, "rewards/chosen": -9.914877891540527, "rewards/margins": 11.297541618347168, "rewards/rejected": -21.21242332458496, "sft_loss": 1.1149108409881592, "step": 1140 }, { "epoch": 1.8811610076670318, "grad_norm": 14.356418798551582, "learning_rate": 4.647424153723101e-09, "logits/chosen": 16.441852569580078, "logits/rejected": 16.586217880249023, "logps/chosen": -318.8826599121094, "logps/rejected": -271.4314880371094, "loss": 0.367, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.366458892822266, "rewards/margins": 10.299457550048828, "rewards/rejected": -19.665918350219727, "sft_loss": 1.2187005281448364, "step": 1145 }, { "epoch": 1.8893756845564074, "grad_norm": 18.826459574147577, "learning_rate": 4.0176028503425826e-09, "logits/chosen": 15.749044418334961, "logits/rejected": 16.83735466003418, "logps/chosen": -308.5406188964844, "logps/rejected": -271.7100830078125, "loss": 0.3801, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.177282333374023, "rewards/margins": 10.317461967468262, "rewards/rejected": -19.4947452545166, "sft_loss": 1.252463698387146, "step": 1150 }, { "epoch": 1.8975903614457832, "grad_norm": 16.226959543929514, "learning_rate": 3.433277537108481e-09, "logits/chosen": 15.832767486572266, "logits/rejected": 17.746004104614258, "logps/chosen": -343.33447265625, "logps/rejected": -305.2869873046875, "loss": 0.335, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.902791976928711, "rewards/margins": 11.3204345703125, "rewards/rejected": -21.223228454589844, "sft_loss": 1.2560192346572876, "step": 1155 }, { "epoch": 1.9058050383351588, "grad_norm": 14.860604119401957, "learning_rate": 2.8945562886593944e-09, "logits/chosen": 14.95615005493164, "logits/rejected": 16.35462760925293, "logps/chosen": -287.0328369140625, "logps/rejected": -257.26080322265625, "loss": 0.3677, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.02625560760498, "rewards/margins": 9.515448570251465, "rewards/rejected": -18.541706085205078, "sft_loss": 1.1147348880767822, "step": 1160 }, { "epoch": 1.9140197152245344, "grad_norm": 12.84326688048793, "learning_rate": 2.4015387448756976e-09, "logits/chosen": 15.258326530456543, "logits/rejected": 16.413604736328125, "logps/chosen": -337.6728820800781, "logps/rejected": -276.5948181152344, "loss": 0.333, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.658801078796387, "rewards/margins": 11.077197074890137, "rewards/rejected": -19.736000061035156, "sft_loss": 1.313868761062622, "step": 1165 }, { "epoch": 1.9222343921139102, "grad_norm": 9.16948186267524, "learning_rate": 1.954316092450281e-09, "logits/chosen": 16.7126522064209, "logits/rejected": 16.963319778442383, "logps/chosen": -349.0697326660156, "logps/rejected": -294.1761169433594, "loss": 0.299, "rewards/accuracies": 1.0, "rewards/chosen": -9.689640998840332, "rewards/margins": 11.0868558883667, "rewards/rejected": -20.77649688720703, "sft_loss": 1.2454497814178467, "step": 1170 }, { "epoch": 1.9304490690032858, "grad_norm": 19.811301652971856, "learning_rate": 1.5529710480231272e-09, "logits/chosen": 17.24116325378418, "logits/rejected": 16.968626022338867, "logps/chosen": -310.8689270019531, "logps/rejected": -274.0096740722656, "loss": 0.3, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.42411994934082, "rewards/margins": 10.301589012145996, "rewards/rejected": -19.725709915161133, "sft_loss": 1.0850669145584106, "step": 1175 }, { "epoch": 1.9386637458926614, "grad_norm": 12.455631194759059, "learning_rate": 1.1975778428823524e-09, "logits/chosen": 15.130066871643066, "logits/rejected": 16.740190505981445, "logps/chosen": -351.4178466796875, "logps/rejected": -299.97235107421875, "loss": 0.3093, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.884756088256836, "rewards/margins": 11.209836959838867, "rewards/rejected": -21.094594955444336, "sft_loss": 1.0997297763824463, "step": 1180 }, { "epoch": 1.9468784227820373, "grad_norm": 12.90904121827512, "learning_rate": 8.882022092346064e-10, "logits/chosen": 16.643354415893555, "logits/rejected": 16.99618148803711, "logps/chosen": -355.08087158203125, "logps/rejected": -291.8462219238281, "loss": 0.3245, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.204696655273438, "rewards/margins": 11.567381858825684, "rewards/rejected": -20.77208137512207, "sft_loss": 1.2387458086013794, "step": 1185 }, { "epoch": 1.9550930996714129, "grad_norm": 10.02772673186922, "learning_rate": 6.249013680474368e-10, "logits/chosen": 16.724010467529297, "logits/rejected": 16.2373104095459, "logps/chosen": -319.0643310546875, "logps/rejected": -268.2914123535156, "loss": 0.3367, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.252176284790039, "rewards/margins": 9.910921096801758, "rewards/rejected": -19.163097381591797, "sft_loss": 1.1700729131698608, "step": 1190 }, { "epoch": 1.9633077765607885, "grad_norm": 16.734218614778506, "learning_rate": 4.0772401846608794e-10, "logits/chosen": 17.680179595947266, "logits/rejected": 17.80653190612793, "logps/chosen": -305.4862060546875, "logps/rejected": -267.6892395019531, "loss": 0.4133, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.576127052307129, "rewards/margins": 9.75358772277832, "rewards/rejected": -19.329715728759766, "sft_loss": 1.1736282110214233, "step": 1195 }, { "epoch": 1.9715224534501643, "grad_norm": 19.267511912968715, "learning_rate": 2.367103288061223e-10, "logits/chosen": 16.904399871826172, "logits/rejected": 16.482337951660156, "logps/chosen": -316.0256652832031, "logps/rejected": -265.80157470703125, "loss": 0.3574, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.74451732635498, "rewards/margins": 9.55905532836914, "rewards/rejected": -19.303569793701172, "sft_loss": 1.2237191200256348, "step": 1200 }, { "epoch": 1.9797371303395401, "grad_norm": 11.51131736701793, "learning_rate": 1.1189192912416933e-10, "logits/chosen": 15.607586860656738, "logits/rejected": 16.690214157104492, "logps/chosen": -370.86328125, "logps/rejected": -313.1533508300781, "loss": 0.2989, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.272278785705566, "rewards/margins": 12.769195556640625, "rewards/rejected": -22.041475296020508, "sft_loss": 1.1835730075836182, "step": 1205 }, { "epoch": 1.9879518072289155, "grad_norm": 11.903861482033115, "learning_rate": 3.329190536757731e-11, "logits/chosen": 17.456689834594727, "logits/rejected": 18.812978744506836, "logps/chosen": -314.75823974609375, "logps/rejected": -271.9325256347656, "loss": 0.3344, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.277753829956055, "rewards/margins": 10.492895126342773, "rewards/rejected": -19.770648956298828, "sft_loss": 1.1376186609268188, "step": 1210 }, { "epoch": 1.9961664841182913, "grad_norm": 14.744908012876596, "learning_rate": 9.247951046897906e-13, "logits/chosen": 16.54582977294922, "logits/rejected": 18.33929443359375, "logps/chosen": -319.89813232421875, "logps/rejected": -279.7975769042969, "loss": 0.352, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.205850601196289, "rewards/margins": 10.397418975830078, "rewards/rejected": -19.603271484375, "sft_loss": 1.1400221586227417, "step": 1215 }, { "epoch": 1.9978094194961664, "step": 1216, "total_flos": 200111899688960.0, "train_loss": 0.4716386401069988, "train_runtime": 41653.1021, "train_samples_per_second": 1.753, "train_steps_per_second": 0.029 } ], "logging_steps": 5, "max_steps": 1216, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 200111899688960.0, "train_batch_size": 5, "trial_name": null, "trial_params": null }