diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,21 +1,21 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, + "epoch": 0.999972591475949, "eval_steps": 100, - "global_step": 877, + "global_step": 9121, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "grad_norm": 1.4296875, - "learning_rate": 5.681818181818181e-09, - "logits/chosen": -2.9108104705810547, - "logits/rejected": -2.4286632537841797, - "logps/chosen": -178.6770782470703, - "logps/rejected": -204.91192626953125, + "grad_norm": 1.71875, + "learning_rate": 5.47645125958379e-09, + "logits/chosen": -2.5463764667510986, + "logits/rejected": -2.7253260612487793, + "logps/chosen": -119.17886352539062, + "logps/rejected": -170.38873291015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -24,1343 +24,13718 @@ "step": 1 }, { - "epoch": 0.01, - "grad_norm": 1.3203125, - "learning_rate": 5.6818181818181815e-08, - "logits/chosen": -2.882168769836426, - "logits/rejected": -2.433367967605591, - "logps/chosen": -161.60011291503906, - "logps/rejected": -154.95980834960938, - "loss": 0.693, - "rewards/accuracies": 0.5069444179534912, - "rewards/chosen": 9.658561612013727e-05, - "rewards/margins": 0.0003618511836975813, - "rewards/rejected": -0.00026526558212935925, + "epoch": 0.0, + "grad_norm": 1.7265625, + "learning_rate": 5.47645125958379e-08, + "logits/chosen": -2.8245480060577393, + "logits/rejected": -2.6224019527435303, + "logps/chosen": -258.6461486816406, + "logps/rejected": -221.91812133789062, + "loss": 0.6931, + "rewards/accuracies": 0.4027777910232544, + "rewards/chosen": 0.00018174077558796853, + "rewards/margins": 1.8675553292268887e-05, + "rewards/rejected": 0.00016306524048559368, "step": 10 }, { - "epoch": 0.02, - "grad_norm": 1.3671875, - "learning_rate": 1.1363636363636363e-07, - "logits/chosen": -2.804980754852295, - "logits/rejected": -2.529651641845703, - "logps/chosen": -173.15240478515625, - "logps/rejected": -193.84674072265625, + "epoch": 0.0, + "grad_norm": 2.265625, + "learning_rate": 1.095290251916758e-07, + "logits/chosen": -2.854735851287842, + "logits/rejected": -2.782925605773926, + "logps/chosen": -281.14056396484375, + "logps/rejected": -228.5689697265625, "loss": 0.6929, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.0004781131574418396, - "rewards/margins": 0.0005443316185846925, - "rewards/rejected": -0.0010224448051303625, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.0003215556498616934, + "rewards/margins": -0.00038876369944773614, + "rewards/rejected": 6.720804231008515e-05, "step": 20 }, { - "epoch": 0.03, - "grad_norm": 1.375, - "learning_rate": 1.7045454545454543e-07, - "logits/chosen": -2.855222225189209, - "logits/rejected": -2.6262366771698, - "logps/chosen": -182.0742950439453, - "logps/rejected": -180.81005859375, - "loss": 0.6923, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.000352335482602939, - "rewards/margins": 0.0016974856844171882, - "rewards/rejected": -0.002049820963293314, + "epoch": 0.0, + "grad_norm": 1.7890625, + "learning_rate": 1.642935377875137e-07, + "logits/chosen": -2.8477203845977783, + "logits/rejected": -2.6674091815948486, + "logps/chosen": -274.7274169921875, + "logps/rejected": -218.6525115966797, + "loss": 0.6928, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.000438558345194906, + "rewards/margins": 0.0004634124634321779, + "rewards/rejected": -2.4854158255038783e-05, "step": 30 }, { - "epoch": 0.05, - "grad_norm": 1.4375, - "learning_rate": 2.2727272727272726e-07, - "logits/chosen": -2.8790335655212402, - "logits/rejected": -2.4088683128356934, - "logps/chosen": -196.99288940429688, - "logps/rejected": -187.37911987304688, - "loss": 0.6909, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -0.0007218262180685997, - "rewards/margins": 0.004465717822313309, - "rewards/rejected": -0.005187544040381908, + "epoch": 0.0, + "grad_norm": 1.7265625, + "learning_rate": 2.190580503833516e-07, + "logits/chosen": -2.7696642875671387, + "logits/rejected": -2.658219814300537, + "logps/chosen": -249.36056518554688, + "logps/rejected": -237.4716033935547, + "loss": 0.6927, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0007732320809736848, + "rewards/margins": 0.0005145312752574682, + "rewards/rejected": 0.00025870074750855565, "step": 40 }, { - "epoch": 0.06, - "grad_norm": 1.34375, - "learning_rate": 2.840909090909091e-07, - "logits/chosen": -2.842094898223877, - "logits/rejected": -2.60573148727417, - "logps/chosen": -189.9934539794922, - "logps/rejected": -188.2660675048828, - "loss": 0.6894, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -0.0015374603681266308, - "rewards/margins": 0.007600747048854828, - "rewards/rejected": -0.009138207882642746, + "epoch": 0.01, + "grad_norm": 1.75, + "learning_rate": 2.738225629791895e-07, + "logits/chosen": -2.7960524559020996, + "logits/rejected": -2.7738702297210693, + "logps/chosen": -263.43096923828125, + "logps/rejected": -245.37545776367188, + "loss": 0.6925, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0021389503963291645, + "rewards/margins": 0.0015721675008535385, + "rewards/rejected": 0.0005667829536832869, "step": 50 }, { - "epoch": 0.07, - "grad_norm": 1.359375, - "learning_rate": 3.4090909090909085e-07, - "logits/chosen": -2.875406265258789, - "logits/rejected": -2.4904425144195557, - "logps/chosen": -176.8124237060547, - "logps/rejected": -174.96426391601562, - "loss": 0.6876, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -0.0027205829974263906, - "rewards/margins": 0.011133117601275444, - "rewards/rejected": -0.013853700831532478, + "epoch": 0.01, + "grad_norm": 1.7421875, + "learning_rate": 3.285870755750274e-07, + "logits/chosen": -2.7675509452819824, + "logits/rejected": -2.7465765476226807, + "logps/chosen": -233.56893920898438, + "logps/rejected": -223.9210205078125, + "loss": 0.6923, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.002511745085939765, + "rewards/margins": 0.001435694401152432, + "rewards/rejected": 0.0010760504519566894, "step": 60 }, { - "epoch": 0.08, - "grad_norm": 1.515625, - "learning_rate": 3.977272727272727e-07, - "logits/chosen": -2.847299098968506, - "logits/rejected": -2.5116541385650635, - "logps/chosen": -181.4229278564453, - "logps/rejected": -183.06118774414062, - "loss": 0.685, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -0.0031291332561522722, - "rewards/margins": 0.016442101448774338, - "rewards/rejected": -0.01957123540341854, + "epoch": 0.01, + "grad_norm": 1.6953125, + "learning_rate": 3.833515881708653e-07, + "logits/chosen": -2.884610176086426, + "logits/rejected": -2.672100782394409, + "logps/chosen": -241.30599975585938, + "logps/rejected": -238.361328125, + "loss": 0.6921, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0032493043690919876, + "rewards/margins": 0.0012350049801170826, + "rewards/rejected": 0.0020142991561442614, "step": 70 }, { - "epoch": 0.09, - "grad_norm": 1.5546875, - "learning_rate": 4.545454545454545e-07, - "logits/chosen": -2.8827993869781494, - "logits/rejected": -2.432978868484497, - "logps/chosen": -171.40695190429688, - "logps/rejected": -159.97158813476562, - "loss": 0.6804, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.0013542206725105643, - "rewards/margins": 0.025738287717103958, - "rewards/rejected": -0.027092507109045982, + "epoch": 0.01, + "grad_norm": 1.765625, + "learning_rate": 4.381161007667032e-07, + "logits/chosen": -2.9205610752105713, + "logits/rejected": -2.763939380645752, + "logps/chosen": -256.89056396484375, + "logps/rejected": -225.00540161132812, + "loss": 0.6917, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.004547181539237499, + "rewards/margins": 0.00310302572324872, + "rewards/rejected": 0.0014441555831581354, "step": 80 }, { - "epoch": 0.1, - "grad_norm": 1.53125, - "learning_rate": 4.999920729162206e-07, - "logits/chosen": -2.8681678771972656, - "logits/rejected": -2.436992883682251, - "logps/chosen": -180.45338439941406, - "logps/rejected": -173.6167755126953, - "loss": 0.6777, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.0007069273851811886, - "rewards/margins": 0.03129100054502487, - "rewards/rejected": -0.030584078282117844, + "epoch": 0.01, + "grad_norm": 1.8828125, + "learning_rate": 4.928806133625412e-07, + "logits/chosen": -2.785702705383301, + "logits/rejected": -2.61993408203125, + "logps/chosen": -218.4326934814453, + "logps/rejected": -202.32655334472656, + "loss": 0.6909, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0056652226485311985, + "rewards/margins": 0.004679783247411251, + "rewards/rejected": 0.000985439051873982, "step": 90 }, { - "epoch": 0.11, - "grad_norm": 1.4296875, - "learning_rate": 4.997146777648452e-07, - "logits/chosen": -2.834399700164795, - "logits/rejected": -2.428107738494873, - "logps/chosen": -169.1546173095703, - "logps/rejected": -179.9269256591797, - "loss": 0.6723, - "rewards/accuracies": 0.90625, - "rewards/chosen": 0.009452215395867825, - "rewards/margins": 0.042565904557704926, - "rewards/rejected": -0.033113688230514526, + "epoch": 0.01, + "grad_norm": 1.8203125, + "learning_rate": 5.47645125958379e-07, + "logits/chosen": -2.837836503982544, + "logits/rejected": -2.6859841346740723, + "logps/chosen": -231.037353515625, + "logps/rejected": -209.5705108642578, + "loss": 0.6907, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.006748904474079609, + "rewards/margins": 0.004814919084310532, + "rewards/rejected": 0.001933985622599721, "step": 100 }, { - "epoch": 0.13, - "grad_norm": 1.484375, - "learning_rate": 4.99041430990478e-07, - "logits/chosen": -2.848217725753784, - "logits/rejected": -2.517399549484253, - "logps/chosen": -165.34046936035156, - "logps/rejected": -170.87728881835938, - "loss": 0.6734, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": 0.008225902915000916, - "rewards/margins": 0.04048804193735123, - "rewards/rejected": -0.03226213902235031, + "epoch": 0.01, + "grad_norm": 1.8046875, + "learning_rate": 6.024096385542169e-07, + "logits/chosen": -2.9445126056671143, + "logits/rejected": -2.715280532836914, + "logps/chosen": -266.7657775878906, + "logps/rejected": -232.91357421875, + "loss": 0.6905, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.00909162312746048, + "rewards/margins": 0.005524896085262299, + "rewards/rejected": 0.003566727042198181, "step": 110 }, { - "epoch": 0.14, - "grad_norm": 1.3125, - "learning_rate": 4.979733998344632e-07, - "logits/chosen": -2.8693971633911133, - "logits/rejected": -2.4118335247039795, - "logps/chosen": -172.3426971435547, - "logps/rejected": -174.4741973876953, - "loss": 0.6651, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": 0.019977785646915436, - "rewards/margins": 0.05767567828297615, - "rewards/rejected": -0.037697892636060715, + "epoch": 0.01, + "grad_norm": 1.78125, + "learning_rate": 6.571741511500548e-07, + "logits/chosen": -2.8411850929260254, + "logits/rejected": -2.661163806915283, + "logps/chosen": -265.49334716796875, + "logps/rejected": -249.6112518310547, + "loss": 0.6899, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.008854064159095287, + "rewards/margins": 0.0061075459234416485, + "rewards/rejected": 0.00274651893414557, "step": 120 }, { - "epoch": 0.15, - "grad_norm": 1.421875, - "learning_rate": 4.965122773565369e-07, - "logits/chosen": -2.8834993839263916, - "logits/rejected": -2.61167311668396, - "logps/chosen": -173.71585083007812, - "logps/rejected": -184.6459503173828, - "loss": 0.6633, - "rewards/accuracies": 0.90625, - "rewards/chosen": 0.019093584269285202, - "rewards/margins": 0.06135554984211922, - "rewards/rejected": -0.04226196929812431, + "epoch": 0.01, + "grad_norm": 1.9609375, + "learning_rate": 7.119386637458927e-07, + "logits/chosen": -2.8140649795532227, + "logits/rejected": -2.759573459625244, + "logps/chosen": -244.5257110595703, + "logps/rejected": -239.27978515625, + "loss": 0.6883, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.013553033582866192, + "rewards/margins": 0.008457383140921593, + "rewards/rejected": 0.005095650441944599, "step": 130 }, { - "epoch": 0.16, - "grad_norm": 1.4375, - "learning_rate": 4.946603797509634e-07, - "logits/chosen": -2.84814453125, - "logits/rejected": -2.4157638549804688, - "logps/chosen": -174.6752471923828, - "logps/rejected": -169.3231658935547, - "loss": 0.6584, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.016533980146050453, - "rewards/margins": 0.0715695172548294, - "rewards/rejected": -0.0550355389714241, + "epoch": 0.02, + "grad_norm": 1.9453125, + "learning_rate": 7.667031763417306e-07, + "logits/chosen": -2.8219151496887207, + "logits/rejected": -2.768402338027954, + "logps/chosen": -225.80712890625, + "logps/rejected": -218.02938842773438, + "loss": 0.6871, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.01494568306952715, + "rewards/margins": 0.011659353971481323, + "rewards/rejected": 0.0032863281667232513, "step": 140 }, { - "epoch": 0.17, - "grad_norm": 1.6015625, - "learning_rate": 4.924206426748668e-07, - "logits/chosen": -2.851029634475708, - "logits/rejected": -2.491598606109619, - "logps/chosen": -168.1584014892578, - "logps/rejected": -173.1723175048828, - "loss": 0.6518, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.026840489357709885, - "rewards/margins": 0.08569900691509247, - "rewards/rejected": -0.05885852128267288, + "epoch": 0.02, + "grad_norm": 1.9609375, + "learning_rate": 8.214676889375685e-07, + "logits/chosen": -2.8939976692199707, + "logits/rejected": -2.781658172607422, + "logps/chosen": -242.2596893310547, + "logps/rejected": -209.86294555664062, + "loss": 0.6871, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.014889853075146675, + "rewards/margins": 0.011720051988959312, + "rewards/rejected": 0.0031697992235422134, "step": 150 }, { - "epoch": 0.18, - "grad_norm": 1.4375, - "learning_rate": 4.897966165945815e-07, - "logits/chosen": -2.9286844730377197, - "logits/rejected": -2.573695659637451, - "logps/chosen": -185.7178955078125, - "logps/rejected": -196.64303588867188, - "loss": 0.6485, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.04040909558534622, - "rewards/margins": 0.09287194907665253, - "rewards/rejected": -0.052462853491306305, + "epoch": 0.02, + "grad_norm": 1.8203125, + "learning_rate": 8.762322015334064e-07, + "logits/chosen": -2.852910280227661, + "logits/rejected": -2.6357014179229736, + "logps/chosen": -236.601806640625, + "logps/rejected": -235.04736328125, + "loss": 0.6852, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.020399365574121475, + "rewards/margins": 0.016088809818029404, + "rewards/rejected": 0.004310556687414646, "step": 160 }, { - "epoch": 0.19, - "grad_norm": 1.625, - "learning_rate": 4.867924611573976e-07, - "logits/chosen": -2.885529041290283, - "logits/rejected": -2.495492935180664, - "logps/chosen": -163.14358520507812, - "logps/rejected": -171.71200561523438, - "loss": 0.6395, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.042703740298748016, - "rewards/margins": 0.1115904226899147, - "rewards/rejected": -0.06888668239116669, + "epoch": 0.02, + "grad_norm": 1.828125, + "learning_rate": 9.309967141292443e-07, + "logits/chosen": -2.844787836074829, + "logits/rejected": -2.6956756114959717, + "logps/chosen": -221.9651641845703, + "logps/rejected": -221.60665893554688, + "loss": 0.6846, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.02167845331132412, + "rewards/margins": 0.018461132422089577, + "rewards/rejected": 0.0032173222862184048, "step": 170 }, { - "epoch": 0.21, - "grad_norm": 1.578125, - "learning_rate": 4.834129385976226e-07, - "logits/chosen": -2.885967254638672, - "logits/rejected": -2.491732120513916, - "logps/chosen": -182.1871795654297, - "logps/rejected": -185.06875610351562, - "loss": 0.6385, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.03652585297822952, - "rewards/margins": 0.11418838798999786, - "rewards/rejected": -0.07766252011060715, + "epoch": 0.02, + "grad_norm": 1.875, + "learning_rate": 9.857612267250823e-07, + "logits/chosen": -2.838764190673828, + "logits/rejected": -2.676421880722046, + "logps/chosen": -249.272216796875, + "logps/rejected": -215.9721221923828, + "loss": 0.6834, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.02556927502155304, + "rewards/margins": 0.0214368999004364, + "rewards/rejected": 0.004132373258471489, "step": 180 }, { - "epoch": 0.22, - "grad_norm": 1.46875, - "learning_rate": 4.796634061874129e-07, - "logits/chosen": -2.8707358837127686, - "logits/rejected": -2.4426345825195312, - "logps/chosen": -186.44851684570312, - "logps/rejected": -184.1309051513672, - "loss": 0.63, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.04190249368548393, - "rewards/margins": 0.13247118890285492, - "rewards/rejected": -0.09056870639324188, + "epoch": 0.02, + "grad_norm": 1.71875, + "learning_rate": 1.0405257393209202e-06, + "logits/chosen": -2.746431827545166, + "logits/rejected": -2.6380066871643066, + "logps/chosen": -258.790771484375, + "logps/rejected": -221.072265625, + "loss": 0.6816, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.023285698145627975, + "rewards/margins": 0.017937498167157173, + "rewards/rejected": 0.005348199047148228, "step": 190 }, { - "epoch": 0.23, - "grad_norm": 1.546875, - "learning_rate": 4.755498077443419e-07, - "logits/chosen": -2.9424500465393066, - "logits/rejected": -2.5191268920898438, - "logps/chosen": -186.60379028320312, - "logps/rejected": -198.8110809326172, - "loss": 0.6206, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.050092898309230804, - "rewards/margins": 0.15314312279224396, - "rewards/rejected": -0.10305019468069077, + "epoch": 0.02, + "grad_norm": 1.734375, + "learning_rate": 1.095290251916758e-06, + "logits/chosen": -2.772921562194824, + "logits/rejected": -2.7706680297851562, + "logps/chosen": -283.05999755859375, + "logps/rejected": -252.75210571289062, + "loss": 0.6811, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.028658848255872726, + "rewards/margins": 0.024413492530584335, + "rewards/rejected": 0.004245352931320667, "step": 200 }, { - "epoch": 0.24, - "grad_norm": 1.7265625, - "learning_rate": 4.710786642091672e-07, - "logits/chosen": -2.837841749191284, - "logits/rejected": -2.421677827835083, - "logps/chosen": -172.65603637695312, - "logps/rejected": -173.9297332763672, - "loss": 0.6195, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.030706852674484253, - "rewards/margins": 0.15557673573493958, - "rewards/rejected": -0.12486988306045532, + "epoch": 0.02, + "grad_norm": 1.828125, + "learning_rate": 1.150054764512596e-06, + "logits/chosen": -2.8384251594543457, + "logits/rejected": -2.713503360748291, + "logps/chosen": -261.55718994140625, + "logps/rejected": -217.0393524169922, + "loss": 0.6806, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.03805224969983101, + "rewards/margins": 0.028020748868584633, + "rewards/rejected": 0.010031499899923801, "step": 210 }, { - "epoch": 0.25, - "grad_norm": 1.484375, - "learning_rate": 4.6625706330873327e-07, - "logits/chosen": -2.894131898880005, - "logits/rejected": -2.390162706375122, - "logps/chosen": -166.30995178222656, - "logps/rejected": -163.71377563476562, - "loss": 0.6108, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.056603990495204926, - "rewards/margins": 0.1745707094669342, - "rewards/rejected": -0.11796671152114868, + "epoch": 0.02, + "grad_norm": 1.984375, + "learning_rate": 1.2048192771084338e-06, + "logits/chosen": -2.669668674468994, + "logits/rejected": -2.6532857418060303, + "logps/chosen": -304.7740783691406, + "logps/rejected": -250.16488647460938, + "loss": 0.676, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03027193807065487, + "rewards/margins": 0.030051440000534058, + "rewards/rejected": 0.00022049779363442212, "step": 220 }, { - "epoch": 0.26, - "grad_norm": 1.65625, - "learning_rate": 4.610926483203954e-07, - "logits/chosen": -2.9105746746063232, - "logits/rejected": -2.4803171157836914, - "logps/chosen": -162.8751983642578, - "logps/rejected": -186.7078094482422, - "loss": 0.6028, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.046747613698244095, - "rewards/margins": 0.19275279343128204, - "rewards/rejected": -0.14600518345832825, + "epoch": 0.03, + "grad_norm": 2.203125, + "learning_rate": 1.2595837897042718e-06, + "logits/chosen": -2.7831225395202637, + "logits/rejected": -2.591188430786133, + "logps/chosen": -248.563720703125, + "logps/rejected": -212.5738067626953, + "loss": 0.6785, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.04051009565591812, + "rewards/margins": 0.04064738005399704, + "rewards/rejected": -0.0001372937549604103, "step": 230 }, { - "epoch": 0.27, - "grad_norm": 1.8671875, - "learning_rate": 4.555936059557768e-07, - "logits/chosen": -2.8267128467559814, - "logits/rejected": -2.4609925746917725, - "logps/chosen": -186.00393676757812, - "logps/rejected": -205.76620483398438, - "loss": 0.5998, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.05246806889772415, - "rewards/margins": 0.19960594177246094, - "rewards/rejected": -0.1471378654241562, + "epoch": 0.03, + "grad_norm": 2.328125, + "learning_rate": 1.3143483023001096e-06, + "logits/chosen": -2.8160877227783203, + "logits/rejected": -2.7005228996276855, + "logps/chosen": -263.27325439453125, + "logps/rejected": -277.45355224609375, + "loss": 0.6763, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04233872890472412, + "rewards/margins": 0.03695423901081085, + "rewards/rejected": 0.005384491756558418, "step": 240 }, { - "epoch": 0.29, - "grad_norm": 1.6875, - "learning_rate": 4.497686533830648e-07, - "logits/chosen": -2.842864990234375, - "logits/rejected": -2.4678244590759277, - "logps/chosen": -180.1123504638672, - "logps/rejected": -192.2842254638672, - "loss": 0.5956, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.05008302256464958, - "rewards/margins": 0.2085999995470047, - "rewards/rejected": -0.158516988158226, + "epoch": 0.03, + "grad_norm": 1.890625, + "learning_rate": 1.3691128148959477e-06, + "logits/chosen": -2.806075096130371, + "logits/rejected": -2.5909342765808105, + "logps/chosen": -207.13577270507812, + "logps/rejected": -195.98587036132812, + "loss": 0.6769, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.031694140285253525, + "rewards/margins": 0.03491998463869095, + "rewards/rejected": -0.0032258417923003435, "step": 250 }, { - "epoch": 0.3, - "grad_norm": 2.1875, - "learning_rate": 4.436270244084194e-07, - "logits/chosen": -2.85605788230896, - "logits/rejected": -2.4859187602996826, - "logps/chosen": -178.90333557128906, - "logps/rejected": -193.69593811035156, - "loss": 0.5795, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.06047965958714485, - "rewards/margins": 0.2455468624830246, - "rewards/rejected": -0.18506722152233124, + "epoch": 0.03, + "grad_norm": 1.96875, + "learning_rate": 1.4238773274917855e-06, + "logits/chosen": -2.7374420166015625, + "logits/rejected": -2.6870639324188232, + "logps/chosen": -237.32568359375, + "logps/rejected": -201.85385131835938, + "loss": 0.6685, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.04964232072234154, + "rewards/margins": 0.058151841163635254, + "rewards/rejected": -0.008509524166584015, "step": 260 }, { - "epoch": 0.31, - "grad_norm": 1.6953125, - "learning_rate": 4.3717845483839846e-07, - "logits/chosen": -2.883763551712036, - "logits/rejected": -2.5161526203155518, - "logps/chosen": -186.10284423828125, - "logps/rejected": -189.4357147216797, - "loss": 0.5716, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.07114340364933014, - "rewards/margins": 0.2645713686943054, - "rewards/rejected": -0.19342796504497528, + "epoch": 0.03, + "grad_norm": 1.703125, + "learning_rate": 1.4786418400876235e-06, + "logits/chosen": -2.8123316764831543, + "logits/rejected": -2.5519070625305176, + "logps/chosen": -250.7762451171875, + "logps/rejected": -213.5297088623047, + "loss": 0.6684, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.04026147723197937, + "rewards/margins": 0.05658264085650444, + "rewards/rejected": -0.01632116362452507, "step": 270 }, { - "epoch": 0.32, - "grad_norm": 2.203125, - "learning_rate": 4.3043316704660515e-07, - "logits/chosen": -2.8337454795837402, - "logits/rejected": -2.464289903640747, - "logps/chosen": -168.27120971679688, - "logps/rejected": -192.19229125976562, - "loss": 0.5699, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.06063656881451607, - "rewards/margins": 0.2684558033943176, - "rewards/rejected": -0.20781925320625305, + "epoch": 0.03, + "grad_norm": 1.875, + "learning_rate": 1.5334063526834611e-06, + "logits/chosen": -2.829249143600464, + "logits/rejected": -2.7455642223358154, + "logps/chosen": -260.5328674316406, + "logps/rejected": -229.86483764648438, + "loss": 0.6651, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.035987488925457, + "rewards/margins": 0.03967081382870674, + "rewards/rejected": -0.003683327231556177, "step": 280 }, { - "epoch": 0.33, - "grad_norm": 1.703125, - "learning_rate": 4.2340185376902036e-07, - "logits/chosen": -2.868380308151245, - "logits/rejected": -2.575737714767456, - "logps/chosen": -161.83665466308594, - "logps/rejected": -204.4718475341797, - "loss": 0.5572, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.06080051511526108, - "rewards/margins": 0.2986100912094116, - "rewards/rejected": -0.23780956864356995, + "epoch": 0.03, + "grad_norm": 1.671875, + "learning_rate": 1.5881708652792991e-06, + "logits/chosen": -2.7662546634674072, + "logits/rejected": -2.6975953578948975, + "logps/chosen": -216.63467407226562, + "logps/rejected": -221.8691864013672, + "loss": 0.6683, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.035637084394693375, + "rewards/margins": 0.052807390689849854, + "rewards/rejected": -0.017170313745737076, "step": 290 }, { - "epoch": 0.34, - "grad_norm": 3.015625, - "learning_rate": 4.160956611537106e-07, - "logits/chosen": -2.8991565704345703, - "logits/rejected": -2.4560790061950684, - "logps/chosen": -179.05690002441406, - "logps/rejected": -192.29713439941406, - "loss": 0.5569, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.07949185371398926, - "rewards/margins": 0.2993330955505371, - "rewards/rejected": -0.21984124183654785, + "epoch": 0.03, + "grad_norm": 1.7734375, + "learning_rate": 1.642935377875137e-06, + "logits/chosen": -2.785132646560669, + "logits/rejected": -2.71547794342041, + "logps/chosen": -256.48065185546875, + "logps/rejected": -243.45333862304688, + "loss": 0.6639, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0422230139374733, + "rewards/margins": 0.06598812341690063, + "rewards/rejected": -0.023765115067362785, "step": 300 }, { - "epoch": 0.35, - "grad_norm": 2.0, - "learning_rate": 4.0852617109177856e-07, - "logits/chosen": -2.841251850128174, - "logits/rejected": -2.5635604858398438, - "logps/chosen": -167.36886596679688, - "logps/rejected": -198.37551879882812, - "loss": 0.5586, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.0692732185125351, - "rewards/margins": 0.2971513867378235, - "rewards/rejected": -0.227878138422966, + "epoch": 0.03, + "grad_norm": 1.734375, + "learning_rate": 1.697699890470975e-06, + "logits/chosen": -2.805774211883545, + "logits/rejected": -2.714113473892212, + "logps/chosen": -290.1482849121094, + "logps/rejected": -239.5406951904297, + "loss": 0.6648, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.05262279510498047, + "rewards/margins": 0.05965893343091011, + "rewards/rejected": -0.007036137394607067, "step": 310 }, { - "epoch": 0.36, - "grad_norm": 2.109375, - "learning_rate": 4.0070538285756837e-07, - "logits/chosen": -2.863354206085205, - "logits/rejected": -2.530529499053955, - "logps/chosen": -150.37063598632812, - "logps/rejected": -184.52720642089844, - "loss": 0.5459, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.06634830683469772, - "rewards/margins": 0.32723596692085266, - "rewards/rejected": -0.26088768243789673, + "epoch": 0.04, + "grad_norm": 1.8203125, + "learning_rate": 1.7524644030668128e-06, + "logits/chosen": -2.8585636615753174, + "logits/rejected": -2.671229362487793, + "logps/chosen": -236.3716278076172, + "logps/rejected": -214.996337890625, + "loss": 0.6594, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.04011351615190506, + "rewards/margins": 0.0729607418179512, + "rewards/rejected": -0.03284722566604614, "step": 320 }, { - "epoch": 0.38, - "grad_norm": 1.5703125, - "learning_rate": 3.9264569408722736e-07, - "logits/chosen": -2.77630352973938, - "logits/rejected": -2.4585368633270264, - "logps/chosen": -160.86729431152344, - "logps/rejected": -196.7068634033203, - "loss": 0.5395, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.06719180196523666, - "rewards/margins": 0.34240782260894775, - "rewards/rejected": -0.2752160429954529, + "epoch": 0.04, + "grad_norm": 1.6640625, + "learning_rate": 1.8072289156626508e-06, + "logits/chosen": -2.890075206756592, + "logits/rejected": -2.561285972595215, + "logps/chosen": -272.7789001464844, + "logps/rejected": -226.96517944335938, + "loss": 0.6478, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.044897016137838364, + "rewards/margins": 0.092148557305336, + "rewards/rejected": -0.04725153371691704, "step": 330 }, { - "epoch": 0.39, - "grad_norm": 2.0, - "learning_rate": 3.843598811257789e-07, - "logits/chosen": -2.85556960105896, - "logits/rejected": -2.4548332691192627, - "logps/chosen": -178.36248779296875, - "logps/rejected": -205.183349609375, - "loss": 0.5291, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.08187603950500488, - "rewards/margins": 0.3680132329463959, - "rewards/rejected": -0.286137193441391, + "epoch": 0.04, + "grad_norm": 2.078125, + "learning_rate": 1.8619934282584886e-06, + "logits/chosen": -2.8873629570007324, + "logits/rejected": -2.5671157836914062, + "logps/chosen": -287.2441711425781, + "logps/rejected": -257.72027587890625, + "loss": 0.648, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.044538464397192, + "rewards/margins": 0.09786990284919739, + "rewards/rejected": -0.05333143472671509, "step": 340 }, { - "epoch": 0.4, - "grad_norm": 1.84375, - "learning_rate": 3.7586107877386034e-07, - "logits/chosen": -2.870631456375122, - "logits/rejected": -2.450942039489746, - "logps/chosen": -181.5648956298828, - "logps/rejected": -211.733154296875, - "loss": 0.5304, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.07045459002256393, - "rewards/margins": 0.3658156991004944, - "rewards/rejected": -0.29536113142967224, + "epoch": 0.04, + "grad_norm": 1.75, + "learning_rate": 1.9167579408543267e-06, + "logits/chosen": -2.749216079711914, + "logits/rejected": -2.748255491256714, + "logps/chosen": -237.42495727539062, + "logps/rejected": -235.8989715576172, + "loss": 0.6427, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.031294722110033035, + "rewards/margins": 0.09009484201669693, + "rewards/rejected": -0.0588001124560833, "step": 350 }, { - "epoch": 0.41, - "grad_norm": 2.5625, - "learning_rate": 3.6716275946623024e-07, - "logits/chosen": -2.839635133743286, - "logits/rejected": -2.456277847290039, - "logps/chosen": -174.53482055664062, - "logps/rejected": -209.671630859375, - "loss": 0.5212, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.08681714534759521, - "rewards/margins": 0.38686448335647583, - "rewards/rejected": -0.3000473380088806, + "epoch": 0.04, + "grad_norm": 1.78125, + "learning_rate": 1.9715224534501647e-06, + "logits/chosen": -2.8299384117126465, + "logits/rejected": -2.712571382522583, + "logps/chosen": -274.341064453125, + "logps/rejected": -209.17556762695312, + "loss": 0.6318, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.045265499502420425, + "rewards/margins": 0.11940612643957138, + "rewards/rejected": -0.07414063066244125, "step": 360 }, { - "epoch": 0.42, - "grad_norm": 3.03125, - "learning_rate": 3.5827871191505423e-07, - "logits/chosen": -2.8490676879882812, - "logits/rejected": -2.543060541152954, - "logps/chosen": -176.3990478515625, - "logps/rejected": -199.08612060546875, - "loss": 0.5152, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.08045300096273422, - "rewards/margins": 0.40350237488746643, - "rewards/rejected": -0.3230493664741516, + "epoch": 0.04, + "grad_norm": 1.9375, + "learning_rate": 2.0262869660460023e-06, + "logits/chosen": -2.749109983444214, + "logits/rejected": -2.6670565605163574, + "logps/chosen": -232.65957641601562, + "logps/rejected": -239.2484893798828, + "loss": 0.631, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.029074231162667274, + "rewards/margins": 0.15582485496997833, + "rewards/rejected": -0.12675060331821442, "step": 370 }, { - "epoch": 0.43, - "grad_norm": 2.328125, - "learning_rate": 3.492230192518221e-07, - "logits/chosen": -2.895632028579712, - "logits/rejected": -2.409252166748047, - "logps/chosen": -171.3096923828125, - "logps/rejected": -209.94967651367188, - "loss": 0.5111, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.09112190455198288, - "rewards/margins": 0.4146246314048767, - "rewards/rejected": -0.32350271940231323, + "epoch": 0.04, + "grad_norm": 2.0625, + "learning_rate": 2.0810514786418403e-06, + "logits/chosen": -2.8259024620056152, + "logits/rejected": -2.6365818977355957, + "logps/chosen": -281.94342041015625, + "logps/rejected": -231.7667236328125, + "loss": 0.6284, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.060919325798749924, + "rewards/margins": 0.12978330254554749, + "rewards/rejected": -0.06886397302150726, "step": 380 }, { - "epoch": 0.44, - "grad_norm": 2.265625, - "learning_rate": 3.400100367025465e-07, - "logits/chosen": -2.8731470108032227, - "logits/rejected": -2.503204822540283, - "logps/chosen": -173.32028198242188, - "logps/rejected": -196.65586853027344, - "loss": 0.5192, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.0665183961391449, - "rewards/margins": 0.3936524987220764, - "rewards/rejected": -0.32713407278060913, + "epoch": 0.04, + "grad_norm": 2.234375, + "learning_rate": 2.135815991237678e-06, + "logits/chosen": -2.7310776710510254, + "logits/rejected": -2.6332387924194336, + "logps/chosen": -245.1631317138672, + "logps/rejected": -235.0743408203125, + "loss": 0.6379, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.025382841005921364, + "rewards/margins": 0.12476933002471924, + "rewards/rejected": -0.09938649833202362, "step": 390 }, { - "epoch": 0.46, - "grad_norm": 2.8125, - "learning_rate": 3.306543688316345e-07, - "logits/chosen": -2.888214588165283, - "logits/rejected": -2.442842960357666, - "logps/chosen": -170.76528930664062, - "logps/rejected": -205.6727752685547, - "loss": 0.5168, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.08383838832378387, - "rewards/margins": 0.399533212184906, - "rewards/rejected": -0.31569480895996094, + "epoch": 0.04, + "grad_norm": 1.921875, + "learning_rate": 2.190580503833516e-06, + "logits/chosen": -2.8063597679138184, + "logits/rejected": -2.6112935543060303, + "logps/chosen": -272.6799011230469, + "logps/rejected": -218.8790740966797, + "loss": 0.6146, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04419289529323578, + "rewards/margins": 0.1746334731578827, + "rewards/rejected": -0.1304405778646469, "step": 400 }, { - "epoch": 0.47, - "grad_norm": 2.125, - "learning_rate": 3.21170846390502e-07, - "logits/chosen": -2.851073980331421, - "logits/rejected": -2.49242901802063, - "logps/chosen": -168.2925567626953, - "logps/rejected": -201.2331085205078, - "loss": 0.5028, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.08710882067680359, - "rewards/margins": 0.4355054795742035, - "rewards/rejected": -0.3483967185020447, + "epoch": 0.04, + "grad_norm": 2.203125, + "learning_rate": 2.245345016429354e-06, + "logits/chosen": -2.780463695526123, + "logits/rejected": -2.659960985183716, + "logps/chosen": -291.32049560546875, + "logps/rejected": -300.0713806152344, + "loss": 0.6114, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.034593094140291214, + "rewards/margins": 0.1796794831752777, + "rewards/rejected": -0.1450863927602768, "step": 410 }, { - "epoch": 0.48, - "grad_norm": 1.9375, - "learning_rate": 3.115745028076346e-07, - "logits/chosen": -2.8840301036834717, - "logits/rejected": -2.433309316635132, - "logps/chosen": -184.21217346191406, - "logps/rejected": -212.9416046142578, - "loss": 0.5007, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.06778125464916229, - "rewards/margins": 0.4425056576728821, - "rewards/rejected": -0.3747243583202362, + "epoch": 0.05, + "grad_norm": 2.078125, + "learning_rate": 2.300109529025192e-06, + "logits/chosen": -2.8896381855010986, + "logits/rejected": -2.830109119415283, + "logps/chosen": -274.0310974121094, + "logps/rejected": -263.286865234375, + "loss": 0.5948, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.042153507471084595, + "rewards/margins": 0.2208167314529419, + "rewards/rejected": -0.1786632239818573, "step": 420 }, { - "epoch": 0.49, - "grad_norm": 2.640625, - "learning_rate": 3.0188055035736117e-07, - "logits/chosen": -2.8709633350372314, - "logits/rejected": -2.5184578895568848, - "logps/chosen": -165.3046417236328, - "logps/rejected": -204.6185302734375, - "loss": 0.4985, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.08469386398792267, - "rewards/margins": 0.4470769762992859, - "rewards/rejected": -0.3623831272125244, + "epoch": 0.05, + "grad_norm": 2.5625, + "learning_rate": 2.3548740416210296e-06, + "logits/chosen": -2.6790504455566406, + "logits/rejected": -2.634852170944214, + "logps/chosen": -250.50045776367188, + "logps/rejected": -239.4146270751953, + "loss": 0.5947, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.016501255333423615, + "rewards/margins": 0.22882139682769775, + "rewards/rejected": -0.21232013404369354, "step": 430 }, { - "epoch": 0.5, - "grad_norm": 3.03125, - "learning_rate": 2.9210435604511753e-07, - "logits/chosen": -2.857630491256714, - "logits/rejected": -2.430194854736328, - "logps/chosen": -174.75064086914062, - "logps/rejected": -220.9373016357422, - "loss": 0.4981, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.06860364973545074, - "rewards/margins": 0.4483613967895508, - "rewards/rejected": -0.37975770235061646, + "epoch": 0.05, + "grad_norm": 2.84375, + "learning_rate": 2.4096385542168676e-06, + "logits/chosen": -2.8568007946014404, + "logits/rejected": -2.575193405151367, + "logps/chosen": -285.1177673339844, + "logps/rejected": -249.79638671875, + "loss": 0.5765, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.016978111118078232, + "rewards/margins": 0.27164173126220703, + "rewards/rejected": -0.2546636164188385, "step": 440 }, { - "epoch": 0.51, - "grad_norm": 2.71875, - "learning_rate": 2.8226141724742885e-07, - "logits/chosen": -2.858372449874878, - "logits/rejected": -2.530306100845337, - "logps/chosen": -157.61270141601562, - "logps/rejected": -204.2855224609375, - "loss": 0.497, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.08262623846530914, - "rewards/margins": 0.4500602185726166, - "rewards/rejected": -0.36743396520614624, + "epoch": 0.05, + "grad_norm": 2.96875, + "learning_rate": 2.4644030668127056e-06, + "logits/chosen": -2.8484041690826416, + "logits/rejected": -2.5546345710754395, + "logps/chosen": -311.09759521484375, + "logps/rejected": -252.48458862304688, + "loss": 0.6098, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.07847253978252411, + "rewards/margins": 0.21573105454444885, + "rewards/rejected": -0.29420357942581177, "step": 450 }, { - "epoch": 0.52, - "grad_norm": 2.203125, - "learning_rate": 2.7236733714522536e-07, - "logits/chosen": -2.83618426322937, - "logits/rejected": -2.454885244369507, - "logps/chosen": -172.3725128173828, - "logps/rejected": -214.83633422851562, - "loss": 0.4892, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.07927767187356949, - "rewards/margins": 0.46998995542526245, - "rewards/rejected": -0.39071232080459595, + "epoch": 0.05, + "grad_norm": 2.53125, + "learning_rate": 2.5191675794085437e-06, + "logits/chosen": -2.702130079269409, + "logits/rejected": -2.7694859504699707, + "logps/chosen": -261.28387451171875, + "logps/rejected": -263.8294372558594, + "loss": 0.6259, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.061372626572847366, + "rewards/margins": 0.1949286311864853, + "rewards/rejected": -0.25630128383636475, "step": 460 }, { - "epoch": 0.54, - "grad_norm": 1.4921875, - "learning_rate": 2.6243779998943493e-07, - "logits/chosen": -2.90132999420166, - "logits/rejected": -2.533686876296997, - "logps/chosen": -170.46307373046875, - "logps/rejected": -215.61697387695312, - "loss": 0.4886, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.08604458719491959, - "rewards/margins": 0.4726595878601074, - "rewards/rejected": -0.3866150379180908, + "epoch": 0.05, + "grad_norm": 2.65625, + "learning_rate": 2.5739320920043813e-06, + "logits/chosen": -2.7930312156677246, + "logits/rejected": -2.5885863304138184, + "logps/chosen": -257.1583557128906, + "logps/rejected": -248.2337646484375, + "loss": 0.5675, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.01281928550451994, + "rewards/margins": 0.31158119440078735, + "rewards/rejected": -0.29876190423965454, "step": 470 }, { - "epoch": 0.55, - "grad_norm": 2.234375, - "learning_rate": 2.524885462380629e-07, - "logits/chosen": -2.892915964126587, - "logits/rejected": -2.521907329559326, - "logps/chosen": -163.7088165283203, - "logps/rejected": -215.0779266357422, - "loss": 0.4815, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.08504535257816315, - "rewards/margins": 0.4920417368412018, - "rewards/rejected": -0.4069964289665222, + "epoch": 0.05, + "grad_norm": 3.3125, + "learning_rate": 2.6286966046002193e-06, + "logits/chosen": -2.8419480323791504, + "logits/rejected": -2.593590021133423, + "logps/chosen": -272.9858093261719, + "logps/rejected": -242.7591094970703, + "loss": 0.5794, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.10869578272104263, + "rewards/margins": 0.32147008180618286, + "rewards/rejected": -0.4301658272743225, "step": 480 }, { - "epoch": 0.56, - "grad_norm": 1.5859375, - "learning_rate": 2.4253534760417125e-07, - "logits/chosen": -2.862372398376465, - "logits/rejected": -2.471256971359253, - "logps/chosen": -176.91098022460938, - "logps/rejected": -223.06460571289062, - "loss": 0.4866, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.08416497707366943, - "rewards/margins": 0.4780053198337555, - "rewards/rejected": -0.39384034276008606, + "epoch": 0.05, + "grad_norm": 4.34375, + "learning_rate": 2.6834611171960573e-06, + "logits/chosen": -2.8444559574127197, + "logits/rejected": -2.623065233230591, + "logps/chosen": -247.183837890625, + "logps/rejected": -231.38815307617188, + "loss": 0.5691, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.024225344881415367, + "rewards/margins": 0.3494858145713806, + "rewards/rejected": -0.37371116876602173, "step": 490 }, { - "epoch": 0.57, - "grad_norm": 2.9375, - "learning_rate": 2.3259398205431142e-07, - "logits/chosen": -2.8392093181610107, - "logits/rejected": -2.4134931564331055, - "logps/chosen": -174.56719970703125, - "logps/rejected": -214.64663696289062, - "loss": 0.4838, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.07892782241106033, - "rewards/margins": 0.48496612906455994, - "rewards/rejected": -0.4060383439064026, + "epoch": 0.05, + "grad_norm": 4.1875, + "learning_rate": 2.7382256297918953e-06, + "logits/chosen": -2.8263964653015137, + "logits/rejected": -2.557645797729492, + "logps/chosen": -246.4414825439453, + "logps/rejected": -258.9903869628906, + "loss": 0.5767, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07571586966514587, + "rewards/margins": 0.3399045467376709, + "rewards/rejected": -0.4156204164028168, "step": 500 }, { - "epoch": 0.58, - "grad_norm": 2.328125, - "learning_rate": 2.226802087970444e-07, - "logits/chosen": -2.8758647441864014, - "logits/rejected": -2.5020365715026855, - "logps/chosen": -169.37881469726562, - "logps/rejected": -202.48895263671875, - "loss": 0.4897, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.07302182167768478, - "rewards/margins": 0.47209176421165466, - "rewards/rejected": -0.3990699350833893, + "epoch": 0.06, + "grad_norm": 3.5, + "learning_rate": 2.792990142387733e-06, + "logits/chosen": -2.7393596172332764, + "logits/rejected": -2.6252517700195312, + "logps/chosen": -286.2635803222656, + "logps/rejected": -284.5791320800781, + "loss": 0.5585, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.21136817336082458, + "rewards/margins": 0.34657496213912964, + "rewards/rejected": -0.5579430460929871, "step": 510 }, { - "epoch": 0.59, - "grad_norm": 4.0, - "learning_rate": 2.1280974330119645e-07, - "logits/chosen": -2.8340697288513184, - "logits/rejected": -2.515873670578003, - "logps/chosen": -172.9220733642578, - "logps/rejected": -220.0417938232422, - "loss": 0.4925, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.06756650656461716, - "rewards/margins": 0.466563880443573, - "rewards/rejected": -0.39899739623069763, + "epoch": 0.06, + "grad_norm": 5.28125, + "learning_rate": 2.847754654983571e-06, + "logits/chosen": -2.7105748653411865, + "logits/rejected": -2.6639561653137207, + "logps/chosen": -273.41949462890625, + "logps/rejected": -260.33502197265625, + "loss": 0.5835, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04918425530195236, + "rewards/margins": 0.35303714871406555, + "rewards/rejected": -0.4022213816642761, "step": 520 }, { - "epoch": 0.6, - "grad_norm": 2.046875, - "learning_rate": 2.0299823238345123e-07, - "logits/chosen": -2.9098801612854004, - "logits/rejected": -2.4559361934661865, - "logps/chosen": -172.07351684570312, - "logps/rejected": -210.4265594482422, - "loss": 0.4786, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.07923649251461029, - "rewards/margins": 0.500261664390564, - "rewards/rejected": -0.42102521657943726, + "epoch": 0.06, + "grad_norm": 4.40625, + "learning_rate": 2.902519167579409e-06, + "logits/chosen": -2.719219446182251, + "logits/rejected": -2.664323568344116, + "logps/chosen": -208.7344207763672, + "logps/rejected": -253.9524383544922, + "loss": 0.5753, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05337945371866226, + "rewards/margins": 0.324246346950531, + "rewards/rejected": -0.37762585282325745, "step": 530 }, { - "epoch": 0.62, - "grad_norm": 2.859375, - "learning_rate": 1.9326122940477098e-07, - "logits/chosen": -2.835815906524658, - "logits/rejected": -2.534432888031006, - "logps/chosen": -158.05502319335938, - "logps/rejected": -210.82467651367188, - "loss": 0.4777, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.09574057161808014, - "rewards/margins": 0.5016153454780579, - "rewards/rejected": -0.40587472915649414, + "epoch": 0.06, + "grad_norm": 3.65625, + "learning_rate": 2.957283680175247e-06, + "logits/chosen": -2.7607245445251465, + "logits/rejected": -2.6070408821105957, + "logps/chosen": -268.8612365722656, + "logps/rejected": -245.3338623046875, + "loss": 0.5964, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.09268485754728317, + "rewards/margins": 0.32926663756370544, + "rewards/rejected": -0.4219515323638916, "step": 540 }, { - "epoch": 0.63, - "grad_norm": 1.953125, - "learning_rate": 1.836141696149641e-07, - "logits/chosen": -2.873433828353882, - "logits/rejected": -2.4963955879211426, - "logps/chosen": -175.86912536621094, - "logps/rejected": -213.4418487548828, - "loss": 0.4757, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.07796572893857956, - "rewards/margins": 0.5069178938865662, - "rewards/rejected": -0.428952157497406, + "epoch": 0.06, + "grad_norm": 3.59375, + "learning_rate": 3.012048192771085e-06, + "logits/chosen": -2.627256393432617, + "logits/rejected": -2.6897642612457275, + "logps/chosen": -238.22653198242188, + "logps/rejected": -257.1009826660156, + "loss": 0.5663, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1254665106534958, + "rewards/margins": 0.34596338868141174, + "rewards/rejected": -0.4714299142360687, "step": 550 }, { - "epoch": 0.64, - "grad_norm": 2.59375, - "learning_rate": 1.7407234568448582e-07, - "logits/chosen": -2.866560220718384, - "logits/rejected": -2.452404260635376, - "logps/chosen": -174.77688598632812, - "logps/rejected": -208.7044219970703, - "loss": 0.4859, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.06438983976840973, - "rewards/margins": 0.4824218153953552, - "rewards/rejected": -0.4180319905281067, + "epoch": 0.06, + "grad_norm": 3.828125, + "learning_rate": 3.0668127053669222e-06, + "logits/chosen": -2.76472806930542, + "logits/rejected": -2.44730806350708, + "logps/chosen": -276.2402038574219, + "logps/rejected": -265.4513244628906, + "loss": 0.5514, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.18088696897029877, + "rewards/margins": 0.3992847204208374, + "rewards/rejected": -0.5801717042922974, "step": 560 }, { - "epoch": 0.65, - "grad_norm": 3.671875, - "learning_rate": 1.6465088346225718e-07, - "logits/chosen": -2.846304178237915, - "logits/rejected": -2.5482144355773926, - "logps/chosen": -169.07928466796875, - "logps/rejected": -215.85073852539062, - "loss": 0.4794, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.08944657444953918, - "rewards/margins": 0.49800539016723633, - "rewards/rejected": -0.40855884552001953, + "epoch": 0.06, + "grad_norm": 3.359375, + "learning_rate": 3.1215772179627602e-06, + "logits/chosen": -2.70143461227417, + "logits/rejected": -2.5528149604797363, + "logps/chosen": -251.514404296875, + "logps/rejected": -253.4136505126953, + "loss": 0.5453, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11225078254938126, + "rewards/margins": 0.3681809604167938, + "rewards/rejected": -0.4804317355155945, "step": 570 }, { - "epoch": 0.66, - "grad_norm": 2.390625, - "learning_rate": 1.5536471799793138e-07, - "logits/chosen": -2.8919830322265625, - "logits/rejected": -2.521151065826416, - "logps/chosen": -170.53781127929688, - "logps/rejected": -232.29037475585938, - "loss": 0.4928, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.05806390568614006, - "rewards/margins": 0.46449360251426697, - "rewards/rejected": -0.4064297080039978, + "epoch": 0.06, + "grad_norm": 6.5, + "learning_rate": 3.1763417305585983e-06, + "logits/chosen": -2.6715104579925537, + "logits/rejected": -2.6148133277893066, + "logps/chosen": -249.9071807861328, + "logps/rejected": -284.11407470703125, + "loss": 0.5215, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18943080306053162, + "rewards/margins": 0.5768919587135315, + "rewards/rejected": -0.7663227319717407, "step": 580 }, { - "epoch": 0.67, - "grad_norm": 3.09375, - "learning_rate": 1.4622856986661987e-07, - "logits/chosen": -2.8473172187805176, - "logits/rejected": -2.546738624572754, - "logps/chosen": -176.05282592773438, - "logps/rejected": -217.82015991210938, - "loss": 0.4902, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.058601103723049164, - "rewards/margins": 0.4699791371822357, - "rewards/rejected": -0.41137805581092834, + "epoch": 0.06, + "grad_norm": 5.3125, + "learning_rate": 3.231106243154436e-06, + "logits/chosen": -2.7282440662384033, + "logits/rejected": -2.653993844985962, + "logps/chosen": -280.6946716308594, + "logps/rejected": -313.78082275390625, + "loss": 0.5539, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.32655638456344604, + "rewards/margins": 0.34934642910957336, + "rewards/rejected": -0.6759028434753418, "step": 590 }, { - "epoch": 0.68, - "grad_norm": 2.09375, - "learning_rate": 1.3725692183360528e-07, - "logits/chosen": -2.8582708835601807, - "logits/rejected": -2.4678211212158203, - "logps/chosen": -153.3969268798828, - "logps/rejected": -212.5640869140625, - "loss": 0.4709, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.08193036168813705, - "rewards/margins": 0.5185868740081787, - "rewards/rejected": -0.43665653467178345, + "epoch": 0.07, + "grad_norm": 8.5625, + "learning_rate": 3.285870755750274e-06, + "logits/chosen": -2.6690428256988525, + "logits/rejected": -2.477430820465088, + "logps/chosen": -245.534912109375, + "logps/rejected": -280.96728515625, + "loss": 0.5835, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.2352246791124344, + "rewards/margins": 0.35390979051589966, + "rewards/rejected": -0.5891345143318176, "step": 600 }, { - "epoch": 0.7, - "grad_norm": 3.671875, - "learning_rate": 1.284639958960345e-07, - "logits/chosen": -2.8366339206695557, - "logits/rejected": -2.518401861190796, - "logps/chosen": -163.55068969726562, - "logps/rejected": -202.49319458007812, - "loss": 0.4748, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.07512009888887405, - "rewards/margins": 0.5090826749801636, - "rewards/rejected": -0.4339626431465149, + "epoch": 0.07, + "grad_norm": 4.40625, + "learning_rate": 3.340635268346112e-06, + "logits/chosen": -2.716850757598877, + "logits/rejected": -2.547917127609253, + "logps/chosen": -231.4207305908203, + "logps/rejected": -293.59246826171875, + "loss": 0.495, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0661562904715538, + "rewards/margins": 0.8044396638870239, + "rewards/rejected": -0.8705958127975464, "step": 610 }, { - "epoch": 0.71, - "grad_norm": 2.90625, - "learning_rate": 1.1986373073798666e-07, - "logits/chosen": -2.8479573726654053, - "logits/rejected": -2.573652982711792, - "logps/chosen": -180.49862670898438, - "logps/rejected": -229.23193359375, - "loss": 0.4973, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.07855061441659927, - "rewards/margins": 0.45434314012527466, - "rewards/rejected": -0.3757924735546112, + "epoch": 0.07, + "grad_norm": 5.15625, + "learning_rate": 3.39539978094195e-06, + "logits/chosen": -2.6627285480499268, + "logits/rejected": -2.6417980194091797, + "logps/chosen": -286.7968444824219, + "logps/rejected": -309.31048583984375, + "loss": 0.5694, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31544560194015503, + "rewards/margins": 0.37990233302116394, + "rewards/rejected": -0.6953479647636414, "step": 620 }, { - "epoch": 0.72, - "grad_norm": 2.765625, - "learning_rate": 1.1146975963465177e-07, - "logits/chosen": -2.861266613006592, - "logits/rejected": -2.4928736686706543, - "logps/chosen": -181.80465698242188, - "logps/rejected": -228.2000274658203, - "loss": 0.4826, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.0723063200712204, - "rewards/margins": 0.4900081753730774, - "rewards/rejected": -0.4177018105983734, + "epoch": 0.07, + "grad_norm": 6.0625, + "learning_rate": 3.4501642935377876e-06, + "logits/chosen": -2.665808916091919, + "logits/rejected": -2.5588669776916504, + "logps/chosen": -268.2917175292969, + "logps/rejected": -325.01263427734375, + "loss": 0.4968, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.22434011101722717, + "rewards/margins": 0.664271891117096, + "rewards/rejected": -0.8886119723320007, "step": 630 }, { - "epoch": 0.73, - "grad_norm": 3.28125, - "learning_rate": 1.0329538884064947e-07, - "logits/chosen": -2.8532233238220215, - "logits/rejected": -2.4556894302368164, - "logps/chosen": -175.3080596923828, - "logps/rejected": -223.6966094970703, - "loss": 0.4813, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.07171030342578888, - "rewards/margins": 0.4941326081752777, - "rewards/rejected": -0.42242231965065, + "epoch": 0.07, + "grad_norm": 4.875, + "learning_rate": 3.5049288061336256e-06, + "logits/chosen": -2.7550580501556396, + "logits/rejected": -2.5946969985961914, + "logps/chosen": -297.67041015625, + "logps/rejected": -325.63446044921875, + "loss": 0.5099, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.07306022942066193, + "rewards/margins": 0.6391320824623108, + "rewards/rejected": -0.7121923565864563, "step": 640 }, { - "epoch": 0.74, - "grad_norm": 2.171875, - "learning_rate": 9.535357649674552e-08, - "logits/chosen": -2.870288372039795, - "logits/rejected": -2.5269229412078857, - "logps/chosen": -174.1137237548828, - "logps/rejected": -219.01199340820312, - "loss": 0.4729, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.07941259443759918, - "rewards/margins": 0.5140343904495239, - "rewards/rejected": -0.43462175130844116, + "epoch": 0.07, + "grad_norm": 6.0625, + "learning_rate": 3.5596933187294636e-06, + "logits/chosen": -2.663618564605713, + "logits/rejected": -2.622084140777588, + "logps/chosen": -258.5335693359375, + "logps/rejected": -281.60498046875, + "loss": 0.5017, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.23784513771533966, + "rewards/margins": 0.5352921485900879, + "rewards/rejected": -0.7731372714042664, "step": 650 }, { - "epoch": 0.75, - "grad_norm": 2.375, - "learning_rate": 8.765691208840373e-08, - "logits/chosen": -2.8760666847229004, - "logits/rejected": -2.5234336853027344, - "logps/chosen": -166.46878051757812, - "logps/rejected": -212.5137939453125, - "loss": 0.4921, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.04601392149925232, - "rewards/margins": 0.466000497341156, - "rewards/rejected": -0.41998663544654846, + "epoch": 0.07, + "grad_norm": 7.09375, + "learning_rate": 3.6144578313253016e-06, + "logits/chosen": -2.6709766387939453, + "logits/rejected": -2.5652520656585693, + "logps/chosen": -276.91802978515625, + "logps/rejected": -282.68133544921875, + "loss": 0.5117, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.27552682161331177, + "rewards/margins": 0.5321172475814819, + "rewards/rejected": -0.8076440095901489, "step": 660 }, { - "epoch": 0.76, - "grad_norm": 2.0625, - "learning_rate": 8.021759648873641e-08, - "logits/chosen": -2.8481667041778564, - "logits/rejected": -2.4773175716400146, - "logps/chosen": -155.8638916015625, - "logps/rejected": -201.8092041015625, - "loss": 0.4883, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.07519511878490448, - "rewards/margins": 0.47641342878341675, - "rewards/rejected": -0.40121832489967346, + "epoch": 0.07, + "grad_norm": 7.4375, + "learning_rate": 3.6692223439211392e-06, + "logits/chosen": -2.6343860626220703, + "logits/rejected": -2.4500298500061035, + "logps/chosen": -283.170166015625, + "logps/rejected": -309.2031555175781, + "loss": 0.4935, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.3035723865032196, + "rewards/margins": 0.6837118268013, + "rewards/rejected": -0.9872843027114868, "step": 670 }, { - "epoch": 0.78, - "grad_norm": 2.015625, - "learning_rate": 7.304742261748848e-08, - "logits/chosen": -2.8874032497406006, - "logits/rejected": -2.5201120376586914, - "logps/chosen": -169.80618286132812, - "logps/rejected": -226.75784301757812, - "loss": 0.4751, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.07257901132106781, - "rewards/margins": 0.509274959564209, - "rewards/rejected": -0.43669596314430237, + "epoch": 0.07, + "grad_norm": 8.9375, + "learning_rate": 3.7239868565169773e-06, + "logits/chosen": -2.7095823287963867, + "logits/rejected": -2.3579328060150146, + "logps/chosen": -315.2501220703125, + "logps/rejected": -323.98712158203125, + "loss": 0.5121, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3494178354740143, + "rewards/margins": 0.6535326242446899, + "rewards/rejected": -1.0029505491256714, "step": 680 }, { - "epoch": 0.79, - "grad_norm": 2.21875, - "learning_rate": 6.615775674671705e-08, - "logits/chosen": -2.855459213256836, - "logits/rejected": -2.5406861305236816, - "logps/chosen": -176.09823608398438, - "logps/rejected": -217.3997802734375, - "loss": 0.483, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.07467280328273773, - "rewards/margins": 0.4878435730934143, - "rewards/rejected": -0.4131707549095154, + "epoch": 0.08, + "grad_norm": 9.75, + "learning_rate": 3.7787513691128153e-06, + "logits/chosen": -2.681112289428711, + "logits/rejected": -2.602525472640991, + "logps/chosen": -294.92193603515625, + "logps/rejected": -324.9920959472656, + "loss": 0.5031, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3271883428096771, + "rewards/margins": 0.5680071711540222, + "rewards/rejected": -0.8951956033706665, "step": 690 }, { - "epoch": 0.8, - "grad_norm": 1.8359375, - "learning_rate": 5.9559520482797945e-08, - "logits/chosen": -2.887681245803833, - "logits/rejected": -2.4717302322387695, - "logps/chosen": -170.18370056152344, - "logps/rejected": -214.17800903320312, - "loss": 0.4695, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.08642037957906723, - "rewards/margins": 0.5243626236915588, - "rewards/rejected": -0.4379422068595886, + "epoch": 0.08, + "grad_norm": 5.96875, + "learning_rate": 3.833515881708653e-06, + "logits/chosen": -2.5626072883605957, + "logits/rejected": -2.456106424331665, + "logps/chosen": -292.56036376953125, + "logps/rejected": -303.7664489746094, + "loss": 0.5005, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.30799394845962524, + "rewards/margins": 0.5858756899833679, + "rewards/rejected": -0.8938696980476379, "step": 700 }, { - "epoch": 0.81, - "grad_norm": 2.296875, - "learning_rate": 5.326317345332415e-08, - "logits/chosen": -2.8460443019866943, - "logits/rejected": -2.539069414138794, - "logps/chosen": -178.67799377441406, - "logps/rejected": -226.777587890625, - "loss": 0.4742, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.08472954481840134, - "rewards/margins": 0.5101224780082703, - "rewards/rejected": -0.4253929555416107, + "epoch": 0.08, + "grad_norm": 5.84375, + "learning_rate": 3.888280394304491e-06, + "logits/chosen": -2.6345629692077637, + "logits/rejected": -2.486281156539917, + "logps/chosen": -292.9887390136719, + "logps/rejected": -362.00103759765625, + "loss": 0.4687, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4237740635871887, + "rewards/margins": 0.9254468083381653, + "rewards/rejected": -1.349220871925354, "step": 710 }, { - "epoch": 0.82, - "grad_norm": 2.109375, - "learning_rate": 4.727869672634044e-08, - "logits/chosen": -2.867482900619507, - "logits/rejected": -2.413532018661499, - "logps/chosen": -182.20126342773438, - "logps/rejected": -220.12564086914062, - "loss": 0.4739, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.07562164962291718, - "rewards/margins": 0.5126403570175171, - "rewards/rejected": -0.43701863288879395, + "epoch": 0.08, + "grad_norm": 4.96875, + "learning_rate": 3.943044906900329e-06, + "logits/chosen": -2.5242300033569336, + "logits/rejected": -2.475724697113037, + "logps/chosen": -282.6354064941406, + "logps/rejected": -336.75152587890625, + "loss": 0.5243, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5285394191741943, + "rewards/margins": 0.6452410221099854, + "rewards/rejected": -1.1737804412841797, "step": 720 }, { - "epoch": 0.83, - "grad_norm": 2.625, - "learning_rate": 4.161557698819756e-08, - "logits/chosen": -2.912193775177002, - "logits/rejected": -2.421642303466797, - "logps/chosen": -166.98995971679688, - "logps/rejected": -213.90029907226562, - "loss": 0.4764, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.08546830713748932, - "rewards/margins": 0.5058776140213013, - "rewards/rejected": -0.42040929198265076, + "epoch": 0.08, + "grad_norm": 6.40625, + "learning_rate": 3.997809419496167e-06, + "logits/chosen": -2.579012155532837, + "logits/rejected": -2.5811235904693604, + "logps/chosen": -332.92205810546875, + "logps/rejected": -357.8192443847656, + "loss": 0.483, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.634338915348053, + "rewards/margins": 0.6850191950798035, + "rewards/rejected": -1.3193581104278564, "step": 730 }, { - "epoch": 0.84, - "grad_norm": 2.03125, - "learning_rate": 3.628279150510832e-08, - "logits/chosen": -2.8754191398620605, - "logits/rejected": -2.4535117149353027, - "logps/chosen": -177.17019653320312, - "logps/rejected": -215.27096557617188, - "loss": 0.4792, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.07282783091068268, - "rewards/margins": 0.499560683965683, - "rewards/rejected": -0.4267328679561615, + "epoch": 0.08, + "grad_norm": 3.828125, + "learning_rate": 4.0525739320920046e-06, + "logits/chosen": -2.4780213832855225, + "logits/rejected": -2.362420082092285, + "logps/chosen": -308.8953857421875, + "logps/rejected": -385.4682312011719, + "loss": 0.4825, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6396969556808472, + "rewards/margins": 0.9579526782035828, + "rewards/rejected": -1.5976496934890747, "step": 740 }, { - "epoch": 0.86, - "grad_norm": 3.3125, - "learning_rate": 3.128879389224442e-08, - "logits/chosen": -2.836944341659546, - "logits/rejected": -2.520453453063965, - "logps/chosen": -172.06668090820312, - "logps/rejected": -213.32241821289062, - "loss": 0.4839, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.08796529471874237, - "rewards/margins": 0.48748818039894104, - "rewards/rejected": -0.3995228707790375, + "epoch": 0.08, + "grad_norm": 6.75, + "learning_rate": 4.107338444687843e-06, + "logits/chosen": -2.3642187118530273, + "logits/rejected": -2.164095401763916, + "logps/chosen": -329.96270751953125, + "logps/rejected": -368.11114501953125, + "loss": 0.4501, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.7073386311531067, + "rewards/margins": 0.9214792251586914, + "rewards/rejected": -1.6288179159164429, "step": 750 }, { - "epoch": 0.87, - "grad_norm": 2.296875, - "learning_rate": 2.664150071293314e-08, - "logits/chosen": -2.9090170860290527, - "logits/rejected": -2.5711700916290283, - "logps/chosen": -167.62237548828125, - "logps/rejected": -214.95864868164062, - "loss": 0.4775, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.07353739440441132, - "rewards/margins": 0.5037845373153687, - "rewards/rejected": -0.43024712800979614, + "epoch": 0.08, + "grad_norm": 5.375, + "learning_rate": 4.162102957283681e-06, + "logits/chosen": -2.4715116024017334, + "logits/rejected": -2.364187717437744, + "logps/chosen": -288.19873046875, + "logps/rejected": -359.4644470214844, + "loss": 0.4932, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.607178807258606, + "rewards/margins": 0.8696765899658203, + "rewards/rejected": -1.4768553972244263, "step": 760 }, { - "epoch": 0.88, - "grad_norm": 2.265625, - "learning_rate": 2.2348278929196885e-08, - "logits/chosen": -2.863905191421509, - "logits/rejected": -2.483037233352661, - "logps/chosen": -174.7396240234375, - "logps/rejected": -221.8039093017578, - "loss": 0.477, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.06731019914150238, - "rewards/margins": 0.5031149387359619, - "rewards/rejected": -0.43580469489097595, + "epoch": 0.08, + "grad_norm": 6.5625, + "learning_rate": 4.216867469879519e-06, + "logits/chosen": -2.487168788909912, + "logits/rejected": -2.2629923820495605, + "logps/chosen": -322.0716552734375, + "logps/rejected": -372.9342956542969, + "loss": 0.4592, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8693659901618958, + "rewards/margins": 0.8600971102714539, + "rewards/rejected": -1.72946298122406, "step": 770 }, { - "epoch": 0.89, - "grad_norm": 3.4375, - "learning_rate": 1.8415934223529662e-08, - "logits/chosen": -2.838940382003784, - "logits/rejected": -2.4448282718658447, - "logps/chosen": -178.02725219726562, - "logps/rejected": -222.06912231445312, - "loss": 0.4793, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.08046524226665497, - "rewards/margins": 0.49785953760147095, - "rewards/rejected": -0.4173942506313324, + "epoch": 0.09, + "grad_norm": 8.125, + "learning_rate": 4.271631982475356e-06, + "logits/chosen": -2.342072010040283, + "logits/rejected": -2.256777286529541, + "logps/chosen": -326.3128662109375, + "logps/rejected": -389.41168212890625, + "loss": 0.4854, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8249146342277527, + "rewards/margins": 0.8689907193183899, + "rewards/rejected": -1.693905234336853, "step": 780 }, { - "epoch": 0.9, - "grad_norm": 2.390625, - "learning_rate": 1.4850700210422367e-08, - "logits/chosen": -2.846452236175537, - "logits/rejected": -2.409905433654785, - "logps/chosen": -168.58853149414062, - "logps/rejected": -220.01123046875, - "loss": 0.481, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.08234288543462753, - "rewards/margins": 0.49280768632888794, - "rewards/rejected": -0.4104648232460022, + "epoch": 0.09, + "grad_norm": 6.65625, + "learning_rate": 4.326396495071194e-06, + "logits/chosen": -2.3683266639709473, + "logits/rejected": -2.3185696601867676, + "logps/chosen": -329.46038818359375, + "logps/rejected": -412.4815979003906, + "loss": 0.4752, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6800652742385864, + "rewards/margins": 0.9414347410202026, + "rewards/rejected": -1.621500015258789, "step": 790 }, { - "epoch": 0.91, - "grad_norm": 3.21875, - "learning_rate": 1.1658228554739358e-08, - "logits/chosen": -2.848785877227783, - "logits/rejected": -2.557270050048828, - "logps/chosen": -174.9704132080078, - "logps/rejected": -213.8970947265625, - "loss": 0.4835, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.0747850090265274, - "rewards/margins": 0.48944082856178284, - "rewards/rejected": -0.4146558344364166, + "epoch": 0.09, + "grad_norm": 8.0, + "learning_rate": 4.381161007667032e-06, + "logits/chosen": -2.401245594024658, + "logits/rejected": -2.251966953277588, + "logps/chosen": -381.958984375, + "logps/rejected": -436.2625427246094, + "loss": 0.4993, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.013897180557251, + "rewards/margins": 0.9869539141654968, + "rewards/rejected": -2.0008511543273926, "step": 800 }, { - "epoch": 0.92, - "grad_norm": 3.0, - "learning_rate": 8.843580012610625e-09, - "logits/chosen": -2.8714425563812256, - "logits/rejected": -2.456696033477783, - "logps/chosen": -166.1836700439453, - "logps/rejected": -213.8503875732422, - "loss": 0.4802, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.0698079839348793, - "rewards/margins": 0.49577993154525757, - "rewards/rejected": -0.42597198486328125, + "epoch": 0.09, + "grad_norm": 8.3125, + "learning_rate": 4.4359255202628695e-06, + "logits/chosen": -2.253350019454956, + "logits/rejected": -2.158560276031494, + "logps/chosen": -362.97845458984375, + "logps/rejected": -460.44427490234375, + "loss": 0.5004, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1743993759155273, + "rewards/margins": 1.1072254180908203, + "rewards/rejected": -2.2816247940063477, "step": 810 }, { - "epoch": 0.94, - "grad_norm": 2.046875, - "learning_rate": 6.411216409041964e-09, - "logits/chosen": -2.9290549755096436, - "logits/rejected": -2.4687247276306152, - "logps/chosen": -186.51126098632812, - "logps/rejected": -221.556640625, - "loss": 0.472, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.07280518114566803, - "rewards/margins": 0.5188139081001282, - "rewards/rejected": -0.44600874185562134, + "epoch": 0.09, + "grad_norm": 8.3125, + "learning_rate": 4.490690032858708e-06, + "logits/chosen": -2.231471538543701, + "logits/rejected": -2.0662503242492676, + "logps/chosen": -394.73712158203125, + "logps/rejected": -462.4637756347656, + "loss": 0.4406, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3580061197280884, + "rewards/margins": 1.1027196645736694, + "rewards/rejected": -2.460726261138916, "step": 820 }, { - "epoch": 0.95, - "grad_norm": 2.109375, - "learning_rate": 4.36499356495984e-09, - "logits/chosen": -2.8895645141601562, - "logits/rejected": -2.498453378677368, - "logps/chosen": -156.81045532226562, - "logps/rejected": -215.7611083984375, - "loss": 0.482, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.06585963815450668, - "rewards/margins": 0.4925321936607361, - "rewards/rejected": -0.4266725480556488, + "epoch": 0.09, + "grad_norm": 9.4375, + "learning_rate": 4.5454545454545455e-06, + "logits/chosen": -2.252098560333252, + "logits/rejected": -2.1463170051574707, + "logps/chosen": -358.0401306152344, + "logps/rejected": -438.9056701660156, + "loss": 0.4271, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0561422109603882, + "rewards/margins": 1.161704421043396, + "rewards/rejected": -2.2178468704223633, "step": 830 }, { - "epoch": 0.96, - "grad_norm": 2.046875, - "learning_rate": 2.7081551849036656e-09, - "logits/chosen": -2.8575549125671387, - "logits/rejected": -2.473635196685791, - "logps/chosen": -178.793701171875, - "logps/rejected": -232.97518920898438, - "loss": 0.4741, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.09117928892374039, - "rewards/margins": 0.5147069096565247, - "rewards/rejected": -0.42352762818336487, + "epoch": 0.09, + "grad_norm": 12.75, + "learning_rate": 4.600219058050384e-06, + "logits/chosen": -2.292865037918091, + "logits/rejected": -2.2712926864624023, + "logps/chosen": -373.51861572265625, + "logps/rejected": -444.30291748046875, + "loss": 0.502, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.145864725112915, + "rewards/margins": 0.9754177331924438, + "rewards/rejected": -2.1212821006774902, "step": 840 }, { - "epoch": 0.97, - "grad_norm": 2.21875, - "learning_rate": 1.443327715054593e-09, - "logits/chosen": -2.857140064239502, - "logits/rejected": -2.527884006500244, - "logps/chosen": -158.11717224121094, - "logps/rejected": -219.9044189453125, - "loss": 0.4812, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.07437891513109207, - "rewards/margins": 0.49480295181274414, - "rewards/rejected": -0.42042404413223267, + "epoch": 0.09, + "grad_norm": 6.15625, + "learning_rate": 4.6549835706462216e-06, + "logits/chosen": -2.2711682319641113, + "logits/rejected": -2.269557476043701, + "logps/chosen": -353.79595947265625, + "logps/rejected": -465.54473876953125, + "loss": 0.4536, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.955910325050354, + "rewards/margins": 1.2166513204574585, + "rewards/rejected": -2.1725614070892334, "step": 850 }, { - "epoch": 0.98, - "grad_norm": 2.6875, - "learning_rate": 5.725161797517087e-10, - "logits/chosen": -2.8359732627868652, - "logits/rejected": -2.5596938133239746, - "logps/chosen": -163.6195831298828, - "logps/rejected": -211.8534698486328, - "loss": 0.4877, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.08172888308763504, - "rewards/margins": 0.4786837697029114, - "rewards/rejected": -0.39695486426353455, + "epoch": 0.09, + "grad_norm": 4.8125, + "learning_rate": 4.709748083242059e-06, + "logits/chosen": -2.3038644790649414, + "logits/rejected": -2.1753878593444824, + "logps/chosen": -419.98175048828125, + "logps/rejected": -491.05548095703125, + "loss": 0.4188, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2254718542099, + "rewards/margins": 1.1390998363494873, + "rewards/rejected": -2.3645718097686768, "step": 860 }, { - "epoch": 0.99, - "grad_norm": 2.328125, - "learning_rate": 9.710100309603953e-11, - "logits/chosen": -2.8185763359069824, - "logits/rejected": -2.526491641998291, - "logps/chosen": -158.81956481933594, - "logps/rejected": -223.47232055664062, - "loss": 0.4786, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.07483495026826859, - "rewards/margins": 0.5013977885246277, - "rewards/rejected": -0.4265628755092621, + "epoch": 0.1, + "grad_norm": 8.3125, + "learning_rate": 4.764512595837898e-06, + "logits/chosen": -2.267216682434082, + "logits/rejected": -2.234022617340088, + "logps/chosen": -373.25836181640625, + "logps/rejected": -496.11474609375, + "loss": 0.4299, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3257873058319092, + "rewards/margins": 1.1447004079818726, + "rewards/rejected": -2.4704878330230713, "step": 870 }, + { + "epoch": 0.1, + "grad_norm": 6.78125, + "learning_rate": 4.819277108433735e-06, + "logits/chosen": -2.294813632965088, + "logits/rejected": -2.0653631687164307, + "logps/chosen": -341.44232177734375, + "logps/rejected": -385.06817626953125, + "loss": 0.4802, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8620034456253052, + "rewards/margins": 0.8676112294197083, + "rewards/rejected": -1.7296146154403687, + "step": 880 + }, + { + "epoch": 0.1, + "grad_norm": 6.71875, + "learning_rate": 4.874041621029573e-06, + "logits/chosen": -2.2022156715393066, + "logits/rejected": -2.1993393898010254, + "logps/chosen": -364.7856140136719, + "logps/rejected": -428.79461669921875, + "loss": 0.521, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1121677160263062, + "rewards/margins": 1.0526173114776611, + "rewards/rejected": -2.1647849082946777, + "step": 890 + }, + { + "epoch": 0.1, + "grad_norm": 5.625, + "learning_rate": 4.928806133625411e-06, + "logits/chosen": -2.2716591358184814, + "logits/rejected": -2.1455495357513428, + "logps/chosen": -381.13714599609375, + "logps/rejected": -432.849853515625, + "loss": 0.4345, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9938165545463562, + "rewards/margins": 1.0592728853225708, + "rewards/rejected": -2.053089141845703, + "step": 900 + }, + { + "epoch": 0.1, + "grad_norm": 19.0, + "learning_rate": 4.983570646221249e-06, + "logits/chosen": -2.23356294631958, + "logits/rejected": -2.20814847946167, + "logps/chosen": -351.2264709472656, + "logps/rejected": -456.77667236328125, + "loss": 0.4907, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1159381866455078, + "rewards/margins": 1.285226583480835, + "rewards/rejected": -2.4011645317077637, + "step": 910 + }, + { + "epoch": 0.1, + "grad_norm": 18.625, + "learning_rate": 4.999991027140177e-06, + "logits/chosen": -2.2818655967712402, + "logits/rejected": -2.141960859298706, + "logps/chosen": -329.9190673828125, + "logps/rejected": -425.6123046875, + "loss": 0.4396, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.751863956451416, + "rewards/margins": 1.0393619537353516, + "rewards/rejected": -1.791225790977478, + "step": 920 + }, + { + "epoch": 0.1, + "grad_norm": 14.8125, + "learning_rate": 4.999947078594061e-06, + "logits/chosen": -2.2424190044403076, + "logits/rejected": -2.1010446548461914, + "logps/chosen": -322.34930419921875, + "logps/rejected": -452.09527587890625, + "loss": 0.465, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0421596765518188, + "rewards/margins": 1.3879188299179077, + "rewards/rejected": -2.4300787448883057, + "step": 930 + }, + { + "epoch": 0.1, + "grad_norm": 6.0625, + "learning_rate": 4.999866506928377e-06, + "logits/chosen": -2.2041916847229004, + "logits/rejected": -2.0878233909606934, + "logps/chosen": -382.32427978515625, + "logps/rejected": -456.57647705078125, + "loss": 0.4144, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2141783237457275, + "rewards/margins": 1.1722166538238525, + "rewards/rejected": -2.38639497756958, + "step": 940 + }, + { + "epoch": 0.1, + "grad_norm": 12.375, + "learning_rate": 4.999749313323467e-06, + "logits/chosen": -2.1606268882751465, + "logits/rejected": -1.9255397319793701, + "logps/chosen": -394.6891174316406, + "logps/rejected": -462.6358947753906, + "loss": 0.4377, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3240916728973389, + "rewards/margins": 1.0859105587005615, + "rewards/rejected": -2.4100022315979004, + "step": 950 + }, + { + "epoch": 0.11, + "grad_norm": 8.9375, + "learning_rate": 4.999595499496163e-06, + "logits/chosen": -2.0902233123779297, + "logits/rejected": -2.0232083797454834, + "logps/chosen": -429.0926818847656, + "logps/rejected": -512.0153198242188, + "loss": 0.4304, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.627703070640564, + "rewards/margins": 1.2354023456573486, + "rewards/rejected": -2.863105297088623, + "step": 960 + }, + { + "epoch": 0.11, + "grad_norm": 4.96875, + "learning_rate": 4.999405067699773e-06, + "logits/chosen": -2.136571168899536, + "logits/rejected": -2.0519003868103027, + "logps/chosen": -407.35528564453125, + "logps/rejected": -479.8490295410156, + "loss": 0.4268, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.4073892831802368, + "rewards/margins": 1.2354086637496948, + "rewards/rejected": -2.6427979469299316, + "step": 970 + }, + { + "epoch": 0.11, + "grad_norm": 9.125, + "learning_rate": 4.999178020724036e-06, + "logits/chosen": -2.079451084136963, + "logits/rejected": -2.0004889965057373, + "logps/chosen": -400.48455810546875, + "logps/rejected": -515.6251831054688, + "loss": 0.3717, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3892953395843506, + "rewards/margins": 1.5400440692901611, + "rewards/rejected": -2.929339647293091, + "step": 980 + }, + { + "epoch": 0.11, + "grad_norm": 5.59375, + "learning_rate": 4.998914361895091e-06, + "logits/chosen": -1.999040961265564, + "logits/rejected": -1.9203087091445923, + "logps/chosen": -379.88165283203125, + "logps/rejected": -526.2828369140625, + "loss": 0.3611, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.4323089122772217, + "rewards/margins": 1.6874799728393555, + "rewards/rejected": -3.1197891235351562, + "step": 990 + }, + { + "epoch": 0.11, + "grad_norm": 6.9375, + "learning_rate": 4.998614095075421e-06, + "logits/chosen": -2.1877896785736084, + "logits/rejected": -1.9169838428497314, + "logps/chosen": -380.15899658203125, + "logps/rejected": -524.8260498046875, + "loss": 0.4187, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2418038845062256, + "rewards/margins": 1.701460599899292, + "rewards/rejected": -2.9432644844055176, + "step": 1000 + }, + { + "epoch": 0.11, + "grad_norm": 8.875, + "learning_rate": 4.998277224663806e-06, + "logits/chosen": -2.157895565032959, + "logits/rejected": -1.9858131408691406, + "logps/chosen": -361.245361328125, + "logps/rejected": -458.28753662109375, + "loss": 0.4594, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9365634918212891, + "rewards/margins": 1.2251060009002686, + "rewards/rejected": -2.1616694927215576, + "step": 1010 + }, + { + "epoch": 0.11, + "grad_norm": 9.3125, + "learning_rate": 4.997903755595245e-06, + "logits/chosen": -2.1277756690979004, + "logits/rejected": -1.9612410068511963, + "logps/chosen": -337.10546875, + "logps/rejected": -464.539306640625, + "loss": 0.4232, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.894736111164093, + "rewards/margins": 1.5047707557678223, + "rewards/rejected": -2.3995068073272705, + "step": 1020 + }, + { + "epoch": 0.11, + "grad_norm": 6.5625, + "learning_rate": 4.997493693340898e-06, + "logits/chosen": -2.089784860610962, + "logits/rejected": -1.9085884094238281, + "logps/chosen": -362.52142333984375, + "logps/rejected": -454.03118896484375, + "loss": 0.3982, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1274363994598389, + "rewards/margins": 1.304640531539917, + "rewards/rejected": -2.432076930999756, + "step": 1030 + }, + { + "epoch": 0.11, + "grad_norm": 7.96875, + "learning_rate": 4.9970470439079925e-06, + "logits/chosen": -2.074920177459717, + "logits/rejected": -1.8465259075164795, + "logps/chosen": -416.3273010253906, + "logps/rejected": -483.27783203125, + "loss": 0.446, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4994056224822998, + "rewards/margins": 1.3981702327728271, + "rewards/rejected": -2.897575616836548, + "step": 1040 + }, + { + "epoch": 0.12, + "grad_norm": 11.125, + "learning_rate": 4.996563813839747e-06, + "logits/chosen": -2.0008907318115234, + "logits/rejected": -1.744895577430725, + "logps/chosen": -459.7574157714844, + "logps/rejected": -559.5464477539062, + "loss": 0.409, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.1601481437683105, + "rewards/margins": 1.3848545551300049, + "rewards/rejected": -3.5450026988983154, + "step": 1050 + }, + { + "epoch": 0.12, + "grad_norm": 6.84375, + "learning_rate": 4.9960440102152695e-06, + "logits/chosen": -1.9616371393203735, + "logits/rejected": -1.837031364440918, + "logps/chosen": -400.9773864746094, + "logps/rejected": -552.2626342773438, + "loss": 0.4114, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6529262065887451, + "rewards/margins": 1.4914474487304688, + "rewards/rejected": -3.144373655319214, + "step": 1060 + }, + { + "epoch": 0.12, + "grad_norm": 8.3125, + "learning_rate": 4.995487640649451e-06, + "logits/chosen": -2.1133296489715576, + "logits/rejected": -1.9116332530975342, + "logps/chosen": -439.38604736328125, + "logps/rejected": -533.5519409179688, + "loss": 0.4041, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.541914701461792, + "rewards/margins": 1.3880865573883057, + "rewards/rejected": -2.9300010204315186, + "step": 1070 + }, + { + "epoch": 0.12, + "grad_norm": 9.875, + "learning_rate": 4.994894713292862e-06, + "logits/chosen": -1.9837061166763306, + "logits/rejected": -1.8434041738510132, + "logps/chosen": -408.0911560058594, + "logps/rejected": -529.5433349609375, + "loss": 0.3985, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6117709875106812, + "rewards/margins": 1.4262521266937256, + "rewards/rejected": -3.0380234718322754, + "step": 1080 + }, + { + "epoch": 0.12, + "grad_norm": 6.03125, + "learning_rate": 4.994265236831623e-06, + "logits/chosen": -1.995814323425293, + "logits/rejected": -1.848125696182251, + "logps/chosen": -386.9561462402344, + "logps/rejected": -475.1553649902344, + "loss": 0.4286, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5808651447296143, + "rewards/margins": 1.2475537061691284, + "rewards/rejected": -2.828418493270874, + "step": 1090 + }, + { + "epoch": 0.12, + "grad_norm": 17.5, + "learning_rate": 4.993599220487289e-06, + "logits/chosen": -2.0678231716156006, + "logits/rejected": -1.9785680770874023, + "logps/chosen": -387.25933837890625, + "logps/rejected": -540.8172607421875, + "loss": 0.3951, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.318528652191162, + "rewards/margins": 1.6313440799713135, + "rewards/rejected": -2.9498727321624756, + "step": 1100 + }, + { + "epoch": 0.12, + "grad_norm": 6.625, + "learning_rate": 4.992896674016703e-06, + "logits/chosen": -2.0437819957733154, + "logits/rejected": -1.9170739650726318, + "logps/chosen": -397.6013488769531, + "logps/rejected": -516.1052856445312, + "loss": 0.4134, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.327467441558838, + "rewards/margins": 1.743734359741211, + "rewards/rejected": -3.0712015628814697, + "step": 1110 + }, + { + "epoch": 0.12, + "grad_norm": 6.71875, + "learning_rate": 4.992157607711862e-06, + "logits/chosen": -2.171450614929199, + "logits/rejected": -1.8918256759643555, + "logps/chosen": -389.7965087890625, + "logps/rejected": -540.8098754882812, + "loss": 0.4019, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.2510523796081543, + "rewards/margins": 1.8322986364364624, + "rewards/rejected": -3.083350896835327, + "step": 1120 + }, + { + "epoch": 0.12, + "grad_norm": 13.5625, + "learning_rate": 4.991382032399759e-06, + "logits/chosen": -2.013124942779541, + "logits/rejected": -1.855377197265625, + "logps/chosen": -411.5616760253906, + "logps/rejected": -569.2857666015625, + "loss": 0.4031, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.634118676185608, + "rewards/margins": 1.8356876373291016, + "rewards/rejected": -3.46980619430542, + "step": 1130 + }, + { + "epoch": 0.12, + "grad_norm": 6.46875, + "learning_rate": 4.990569959442231e-06, + "logits/chosen": -1.971697449684143, + "logits/rejected": -1.7529220581054688, + "logps/chosen": -445.6668395996094, + "logps/rejected": -566.9530029296875, + "loss": 0.3871, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.822587251663208, + "rewards/margins": 1.8279403448104858, + "rewards/rejected": -3.650527238845825, + "step": 1140 + }, + { + "epoch": 0.13, + "grad_norm": 10.625, + "learning_rate": 4.989721400735786e-06, + "logits/chosen": -1.8915526866912842, + "logits/rejected": -1.7562611103057861, + "logps/chosen": -475.46673583984375, + "logps/rejected": -578.044677734375, + "loss": 0.4826, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3858120441436768, + "rewards/margins": 1.2627156972885132, + "rewards/rejected": -3.6485278606414795, + "step": 1150 + }, + { + "epoch": 0.13, + "grad_norm": 8.375, + "learning_rate": 4.988836368711435e-06, + "logits/chosen": -1.9814294576644897, + "logits/rejected": -1.8079755306243896, + "logps/chosen": -442.216064453125, + "logps/rejected": -577.0377807617188, + "loss": 0.4268, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7222038507461548, + "rewards/margins": 1.7043015956878662, + "rewards/rejected": -3.4265055656433105, + "step": 1160 + }, + { + "epoch": 0.13, + "grad_norm": 22.875, + "learning_rate": 4.987914876334506e-06, + "logits/chosen": -1.8097915649414062, + "logits/rejected": -1.792376160621643, + "logps/chosen": -458.9969787597656, + "logps/rejected": -616.7918701171875, + "loss": 0.4292, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.132195234298706, + "rewards/margins": 1.814105749130249, + "rewards/rejected": -3.946300983428955, + "step": 1170 + }, + { + "epoch": 0.13, + "grad_norm": 14.5, + "learning_rate": 4.986956937104455e-06, + "logits/chosen": -1.8919658660888672, + "logits/rejected": -1.9282894134521484, + "logps/chosen": -405.42669677734375, + "logps/rejected": -530.1327514648438, + "loss": 0.4355, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7110258340835571, + "rewards/margins": 1.1865997314453125, + "rewards/rejected": -2.89762544631958, + "step": 1180 + }, + { + "epoch": 0.13, + "grad_norm": 8.9375, + "learning_rate": 4.985962565054668e-06, + "logits/chosen": -1.9831883907318115, + "logits/rejected": -1.8783514499664307, + "logps/chosen": -408.9219665527344, + "logps/rejected": -523.4224243164062, + "loss": 0.4497, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.4924160242080688, + "rewards/margins": 1.3742684125900269, + "rewards/rejected": -2.8666844367980957, + "step": 1190 + }, + { + "epoch": 0.13, + "grad_norm": 10.1875, + "learning_rate": 4.9849317747522565e-06, + "logits/chosen": -2.016627311706543, + "logits/rejected": -1.7545188665390015, + "logps/chosen": -445.33807373046875, + "logps/rejected": -580.2514038085938, + "loss": 0.4233, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5981049537658691, + "rewards/margins": 1.7466585636138916, + "rewards/rejected": -3.3447635173797607, + "step": 1200 + }, + { + "epoch": 0.13, + "grad_norm": 8.6875, + "learning_rate": 4.983864581297841e-06, + "logits/chosen": -1.9554979801177979, + "logits/rejected": -1.8426473140716553, + "logps/chosen": -451.56243896484375, + "logps/rejected": -569.8328857421875, + "loss": 0.4047, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0063488483428955, + "rewards/margins": 1.292351484298706, + "rewards/rejected": -3.2987003326416016, + "step": 1210 + }, + { + "epoch": 0.13, + "grad_norm": 10.6875, + "learning_rate": 4.982761000325336e-06, + "logits/chosen": -1.9751933813095093, + "logits/rejected": -1.7088596820831299, + "logps/chosen": -400.3304443359375, + "logps/rejected": -548.0325927734375, + "loss": 0.408, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.64980947971344, + "rewards/margins": 1.6963685750961304, + "rewards/rejected": -3.3461780548095703, + "step": 1220 + }, + { + "epoch": 0.13, + "grad_norm": 16.0, + "learning_rate": 4.9816210480017135e-06, + "logits/chosen": -1.8304758071899414, + "logits/rejected": -1.6517435312271118, + "logps/chosen": -428.72918701171875, + "logps/rejected": -578.1611328125, + "loss": 0.4003, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7572234869003296, + "rewards/margins": 1.845768928527832, + "rewards/rejected": -3.602992296218872, + "step": 1230 + }, + { + "epoch": 0.14, + "grad_norm": 8.8125, + "learning_rate": 4.980444741026772e-06, + "logits/chosen": -1.9374967813491821, + "logits/rejected": -1.8622459173202515, + "logps/chosen": -387.9987487792969, + "logps/rejected": -497.81964111328125, + "loss": 0.4411, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4748280048370361, + "rewards/margins": 1.2174007892608643, + "rewards/rejected": -2.6922287940979004, + "step": 1240 + }, + { + "epoch": 0.14, + "grad_norm": 6.1875, + "learning_rate": 4.979232096632889e-06, + "logits/chosen": -1.923143744468689, + "logits/rejected": -1.8795055150985718, + "logps/chosen": -433.1803283691406, + "logps/rejected": -532.388427734375, + "loss": 0.4082, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7599979639053345, + "rewards/margins": 1.4135503768920898, + "rewards/rejected": -3.173548460006714, + "step": 1250 + }, + { + "epoch": 0.14, + "grad_norm": 3.875, + "learning_rate": 4.977983132584767e-06, + "logits/chosen": -1.8299497365951538, + "logits/rejected": -1.6137624979019165, + "logps/chosen": -494.11767578125, + "logps/rejected": -596.8160400390625, + "loss": 0.3545, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9803826808929443, + "rewards/margins": 1.60739004611969, + "rewards/rejected": -3.5877723693847656, + "step": 1260 + }, + { + "epoch": 0.14, + "grad_norm": 9.25, + "learning_rate": 4.976697867179179e-06, + "logits/chosen": -1.8574409484863281, + "logits/rejected": -1.720315933227539, + "logps/chosen": -387.02056884765625, + "logps/rejected": -507.2344665527344, + "loss": 0.4025, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4940776824951172, + "rewards/margins": 1.4518120288848877, + "rewards/rejected": -2.945889472961426, + "step": 1270 + }, + { + "epoch": 0.14, + "grad_norm": 9.25, + "learning_rate": 4.9753763192446934e-06, + "logits/chosen": -1.8598644733428955, + "logits/rejected": -1.7275965213775635, + "logps/chosen": -376.22637939453125, + "logps/rejected": -540.2301635742188, + "loss": 0.3467, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.3400099277496338, + "rewards/margins": 1.7799888849258423, + "rewards/rejected": -3.1199991703033447, + "step": 1280 + }, + { + "epoch": 0.14, + "grad_norm": 12.1875, + "learning_rate": 4.9740185081414045e-06, + "logits/chosen": -1.7983766794204712, + "logits/rejected": -1.640138030052185, + "logps/chosen": -423.83740234375, + "logps/rejected": -619.3489379882812, + "loss": 0.4084, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0708699226379395, + "rewards/margins": 2.0533876419067383, + "rewards/rejected": -4.1242570877075195, + "step": 1290 + }, + { + "epoch": 0.14, + "grad_norm": 9.375, + "learning_rate": 4.972624453760644e-06, + "logits/chosen": -1.801692247390747, + "logits/rejected": -1.6416904926300049, + "logps/chosen": -451.0541076660156, + "logps/rejected": -605.9425048828125, + "loss": 0.4457, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2591934204101562, + "rewards/margins": 1.7616710662841797, + "rewards/rejected": -4.020864486694336, + "step": 1300 + }, + { + "epoch": 0.14, + "grad_norm": 13.375, + "learning_rate": 4.971194176524694e-06, + "logits/chosen": -1.7507768869400024, + "logits/rejected": -1.671445608139038, + "logps/chosen": -464.3245544433594, + "logps/rejected": -683.9586181640625, + "loss": 0.4168, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.3371691703796387, + "rewards/margins": 2.2439422607421875, + "rewards/rejected": -4.581111431121826, + "step": 1310 + }, + { + "epoch": 0.14, + "grad_norm": 3.8125, + "learning_rate": 4.969727697386481e-06, + "logits/chosen": -1.8506873846054077, + "logits/rejected": -1.6854591369628906, + "logps/chosen": -395.36798095703125, + "logps/rejected": -576.8516235351562, + "loss": 0.3666, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4899570941925049, + "rewards/margins": 2.163172960281372, + "rewards/rejected": -3.653130292892456, + "step": 1320 + }, + { + "epoch": 0.15, + "grad_norm": 5.375, + "learning_rate": 4.968225037829278e-06, + "logits/chosen": -1.7551017999649048, + "logits/rejected": -1.6515169143676758, + "logps/chosen": -359.97686767578125, + "logps/rejected": -579.1767578125, + "loss": 0.3398, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.489561676979065, + "rewards/margins": 2.073533058166504, + "rewards/rejected": -3.5630946159362793, + "step": 1330 + }, + { + "epoch": 0.15, + "grad_norm": 13.3125, + "learning_rate": 4.966686219866383e-06, + "logits/chosen": -1.611549973487854, + "logits/rejected": -1.4388806819915771, + "logps/chosen": -481.92352294921875, + "logps/rejected": -676.6087646484375, + "loss": 0.3615, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.57257342338562, + "rewards/margins": 2.1287455558776855, + "rewards/rejected": -4.701319217681885, + "step": 1340 + }, + { + "epoch": 0.15, + "grad_norm": 14.625, + "learning_rate": 4.965111266040798e-06, + "logits/chosen": -1.5021296739578247, + "logits/rejected": -1.352057695388794, + "logps/chosen": -703.9445190429688, + "logps/rejected": -870.5361328125, + "loss": 0.4178, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.507909297943115, + "rewards/margins": 2.0341744422912598, + "rewards/rejected": -6.542083740234375, + "step": 1350 + }, + { + "epoch": 0.15, + "grad_norm": 16.125, + "learning_rate": 4.963500199424902e-06, + "logits/chosen": -1.537723183631897, + "logits/rejected": -1.462158203125, + "logps/chosen": -558.639404296875, + "logps/rejected": -762.3148803710938, + "loss": 0.3576, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.149137258529663, + "rewards/margins": 2.3914108276367188, + "rewards/rejected": -5.540547847747803, + "step": 1360 + }, + { + "epoch": 0.15, + "grad_norm": 9.625, + "learning_rate": 4.961853043620106e-06, + "logits/chosen": -1.5368038415908813, + "logits/rejected": -1.3882620334625244, + "logps/chosen": -605.232421875, + "logps/rejected": -822.916015625, + "loss": 0.4187, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.424246311187744, + "rewards/margins": 2.386579990386963, + "rewards/rejected": -5.810825824737549, + "step": 1370 + }, + { + "epoch": 0.15, + "grad_norm": 7.28125, + "learning_rate": 4.960169822756518e-06, + "logits/chosen": -1.603130578994751, + "logits/rejected": -1.571542501449585, + "logps/chosen": -448.19970703125, + "logps/rejected": -630.0074462890625, + "loss": 0.3851, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.305044174194336, + "rewards/margins": 1.898432970046997, + "rewards/rejected": -4.203476905822754, + "step": 1380 + }, + { + "epoch": 0.15, + "grad_norm": 19.375, + "learning_rate": 4.9584505614925765e-06, + "logits/chosen": -1.5551097393035889, + "logits/rejected": -1.5678659677505493, + "logps/chosen": -576.1119384765625, + "logps/rejected": -855.1985473632812, + "loss": 0.3961, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.1297695636749268, + "rewards/margins": 2.3528432846069336, + "rewards/rejected": -5.4826130867004395, + "step": 1390 + }, + { + "epoch": 0.15, + "grad_norm": 15.6875, + "learning_rate": 4.9566952850147e-06, + "logits/chosen": -1.5597752332687378, + "logits/rejected": -1.2877302169799805, + "logps/chosen": -732.5062866210938, + "logps/rejected": -891.2120971679688, + "loss": 0.4319, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.452132225036621, + "rewards/margins": 2.127747058868408, + "rewards/rejected": -6.579878330230713, + "step": 1400 + }, + { + "epoch": 0.15, + "grad_norm": 16.25, + "learning_rate": 4.954904019036914e-06, + "logits/chosen": -1.4755786657333374, + "logits/rejected": -1.3513275384902954, + "logps/chosen": -710.7896728515625, + "logps/rejected": -850.7200927734375, + "loss": 0.4551, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.545330047607422, + "rewards/margins": 1.7735296487808228, + "rewards/rejected": -6.318860054016113, + "step": 1410 + }, + { + "epoch": 0.16, + "grad_norm": 7.46875, + "learning_rate": 4.953076789800472e-06, + "logits/chosen": -1.6501293182373047, + "logits/rejected": -1.5329220294952393, + "logps/chosen": -646.9166259765625, + "logps/rejected": -773.8204956054688, + "loss": 0.3812, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.004889011383057, + "rewards/margins": 1.6534801721572876, + "rewards/rejected": -5.658369541168213, + "step": 1420 + }, + { + "epoch": 0.16, + "grad_norm": 13.75, + "learning_rate": 4.951213624073475e-06, + "logits/chosen": -1.6762621402740479, + "logits/rejected": -1.4565012454986572, + "logps/chosen": -643.4212646484375, + "logps/rejected": -826.5513916015625, + "loss": 0.3944, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.824911117553711, + "rewards/margins": 2.2226672172546387, + "rewards/rejected": -6.04757833480835, + "step": 1430 + }, + { + "epoch": 0.16, + "grad_norm": 10.9375, + "learning_rate": 4.949314549150477e-06, + "logits/chosen": -1.5981062650680542, + "logits/rejected": -1.553539514541626, + "logps/chosen": -531.7981567382812, + "logps/rejected": -671.7160034179688, + "loss": 0.3675, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.0833277702331543, + "rewards/margins": 1.7200475931167603, + "rewards/rejected": -4.803375720977783, + "step": 1440 + }, + { + "epoch": 0.16, + "grad_norm": 9.0625, + "learning_rate": 4.9473795928520854e-06, + "logits/chosen": -1.6070849895477295, + "logits/rejected": -1.542314887046814, + "logps/chosen": -570.6192016601562, + "logps/rejected": -741.8380737304688, + "loss": 0.3616, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0219061374664307, + "rewards/margins": 1.9794813394546509, + "rewards/rejected": -5.001387596130371, + "step": 1450 + }, + { + "epoch": 0.16, + "grad_norm": 11.5625, + "learning_rate": 4.945408783524556e-06, + "logits/chosen": -1.562904715538025, + "logits/rejected": -1.5386149883270264, + "logps/chosen": -600.2249145507812, + "logps/rejected": -726.2491455078125, + "loss": 0.4721, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.3292298316955566, + "rewards/margins": 1.6055278778076172, + "rewards/rejected": -4.934757232666016, + "step": 1460 + }, + { + "epoch": 0.16, + "grad_norm": 6.0625, + "learning_rate": 4.943402150039374e-06, + "logits/chosen": -1.7516462802886963, + "logits/rejected": -1.5450327396392822, + "logps/chosen": -492.21826171875, + "logps/rejected": -702.0546875, + "loss": 0.337, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.622464418411255, + "rewards/margins": 2.2660393714904785, + "rewards/rejected": -4.888503074645996, + "step": 1470 + }, + { + "epoch": 0.16, + "grad_norm": 20.75, + "learning_rate": 4.941359721792832e-06, + "logits/chosen": -1.586670994758606, + "logits/rejected": -1.5777990818023682, + "logps/chosen": -584.0569458007812, + "logps/rejected": -739.6729736328125, + "loss": 0.4586, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.6003963947296143, + "rewards/margins": 1.5527204275131226, + "rewards/rejected": -5.153116226196289, + "step": 1480 + }, + { + "epoch": 0.16, + "grad_norm": 6.125, + "learning_rate": 4.939281528705603e-06, + "logits/chosen": -1.6687746047973633, + "logits/rejected": -1.7045615911483765, + "logps/chosen": -590.5048217773438, + "logps/rejected": -745.39404296875, + "loss": 0.4248, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.331634521484375, + "rewards/margins": 1.7136656045913696, + "rewards/rejected": -5.045300483703613, + "step": 1490 + }, + { + "epoch": 0.16, + "grad_norm": 6.3125, + "learning_rate": 4.937167601222293e-06, + "logits/chosen": -1.6695022583007812, + "logits/rejected": -1.5278782844543457, + "logps/chosen": -585.8997802734375, + "logps/rejected": -733.9624633789062, + "loss": 0.3783, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4611308574676514, + "rewards/margins": 1.740176796913147, + "rewards/rejected": -5.201308250427246, + "step": 1500 + }, + { + "epoch": 0.17, + "grad_norm": 14.0625, + "learning_rate": 4.93501797031101e-06, + "logits/chosen": -1.7035127878189087, + "logits/rejected": -1.6354587078094482, + "logps/chosen": -619.9177856445312, + "logps/rejected": -704.5440673828125, + "loss": 0.4471, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.5777313709259033, + "rewards/margins": 1.317919373512268, + "rewards/rejected": -4.8956499099731445, + "step": 1510 + }, + { + "epoch": 0.17, + "grad_norm": 9.8125, + "learning_rate": 4.932832667462894e-06, + "logits/chosen": -1.7624809741973877, + "logits/rejected": -1.754815697669983, + "logps/chosen": -472.76611328125, + "logps/rejected": -598.0167846679688, + "loss": 0.3985, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.121837615966797, + "rewards/margins": 1.5273975133895874, + "rewards/rejected": -3.6492347717285156, + "step": 1520 + }, + { + "epoch": 0.17, + "grad_norm": 10.375, + "learning_rate": 4.9306117246916655e-06, + "logits/chosen": -1.7742128372192383, + "logits/rejected": -1.7258265018463135, + "logps/chosen": -435.621826171875, + "logps/rejected": -593.1024169921875, + "loss": 0.4035, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1559865474700928, + "rewards/margins": 1.4799325466156006, + "rewards/rejected": -3.6359190940856934, + "step": 1530 + }, + { + "epoch": 0.17, + "grad_norm": 9.375, + "learning_rate": 4.928355174533153e-06, + "logits/chosen": -1.8224060535430908, + "logits/rejected": -1.716186761856079, + "logps/chosen": -365.55206298828125, + "logps/rejected": -502.70819091796875, + "loss": 0.3831, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5456788539886475, + "rewards/margins": 1.562146544456482, + "rewards/rejected": -3.107825517654419, + "step": 1540 + }, + { + "epoch": 0.17, + "grad_norm": 11.1875, + "learning_rate": 4.926063050044823e-06, + "logits/chosen": -1.848811149597168, + "logits/rejected": -1.7806282043457031, + "logps/chosen": -437.16552734375, + "logps/rejected": -645.4912109375, + "loss": 0.335, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9283971786499023, + "rewards/margins": 2.0607380867004395, + "rewards/rejected": -3.9891350269317627, + "step": 1550 + }, + { + "epoch": 0.17, + "grad_norm": 14.5, + "learning_rate": 4.923735384805282e-06, + "logits/chosen": -1.7096211910247803, + "logits/rejected": -1.6008384227752686, + "logps/chosen": -564.7811279296875, + "logps/rejected": -746.1441650390625, + "loss": 0.4176, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1216702461242676, + "rewards/margins": 2.0287418365478516, + "rewards/rejected": -5.150412082672119, + "step": 1560 + }, + { + "epoch": 0.17, + "grad_norm": 13.75, + "learning_rate": 4.9213722129138e-06, + "logits/chosen": -1.576120376586914, + "logits/rejected": -1.5141403675079346, + "logps/chosen": -638.0278930664062, + "logps/rejected": -855.17138671875, + "loss": 0.4371, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.029786109924316, + "rewards/margins": 2.417710781097412, + "rewards/rejected": -6.4474968910217285, + "step": 1570 + }, + { + "epoch": 0.17, + "grad_norm": 5.21875, + "learning_rate": 4.9189735689897975e-06, + "logits/chosen": -1.4834842681884766, + "logits/rejected": -1.4277558326721191, + "logps/chosen": -727.564697265625, + "logps/rejected": -880.00244140625, + "loss": 0.4386, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.80576229095459, + "rewards/margins": 1.8440172672271729, + "rewards/rejected": -6.649779319763184, + "step": 1580 + }, + { + "epoch": 0.17, + "grad_norm": 8.5625, + "learning_rate": 4.9165394881723496e-06, + "logits/chosen": -1.544121503829956, + "logits/rejected": -1.4586372375488281, + "logps/chosen": -663.8779296875, + "logps/rejected": -801.9801635742188, + "loss": 0.4244, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.109539031982422, + "rewards/margins": 1.7661683559417725, + "rewards/rejected": -5.875707626342773, + "step": 1590 + }, + { + "epoch": 0.18, + "grad_norm": 12.1875, + "learning_rate": 4.914070006119663e-06, + "logits/chosen": -1.6799728870391846, + "logits/rejected": -1.5815536975860596, + "logps/chosen": -521.51025390625, + "logps/rejected": -723.5499877929688, + "loss": 0.3701, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.940391778945923, + "rewards/margins": 2.182253360748291, + "rewards/rejected": -5.122644901275635, + "step": 1600 + }, + { + "epoch": 0.18, + "grad_norm": 9.125, + "learning_rate": 4.911565159008559e-06, + "logits/chosen": -1.6074203252792358, + "logits/rejected": -1.4795572757720947, + "logps/chosen": -509.55059814453125, + "logps/rejected": -720.853759765625, + "loss": 0.3817, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7229371070861816, + "rewards/margins": 2.353829860687256, + "rewards/rejected": -5.076766490936279, + "step": 1610 + }, + { + "epoch": 0.18, + "grad_norm": 13.375, + "learning_rate": 4.9090249835339395e-06, + "logits/chosen": -1.7791054248809814, + "logits/rejected": -1.6644093990325928, + "logps/chosen": -538.9613037109375, + "logps/rejected": -649.7662353515625, + "loss": 0.4153, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.080209493637085, + "rewards/margins": 1.4292373657226562, + "rewards/rejected": -4.50944709777832, + "step": 1620 + }, + { + "epoch": 0.18, + "grad_norm": 6.53125, + "learning_rate": 4.9064495169082546e-06, + "logits/chosen": -1.7422281503677368, + "logits/rejected": -1.690978765487671, + "logps/chosen": -569.5952758789062, + "logps/rejected": -812.381591796875, + "loss": 0.3203, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.2451515197753906, + "rewards/margins": 2.6106667518615723, + "rewards/rejected": -5.855818271636963, + "step": 1630 + }, + { + "epoch": 0.18, + "grad_norm": 18.0, + "learning_rate": 4.903838796860949e-06, + "logits/chosen": -1.8155062198638916, + "logits/rejected": -1.6272052526474, + "logps/chosen": -543.8080444335938, + "logps/rejected": -680.8631591796875, + "loss": 0.3484, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.746061325073242, + "rewards/margins": 1.9182088375091553, + "rewards/rejected": -4.664269924163818, + "step": 1640 + }, + { + "epoch": 0.18, + "grad_norm": 11.0, + "learning_rate": 4.901192861637919e-06, + "logits/chosen": -1.7754013538360596, + "logits/rejected": -1.6697216033935547, + "logps/chosen": -569.3841552734375, + "logps/rejected": -768.0147705078125, + "loss": 0.3654, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1462929248809814, + "rewards/margins": 2.184648275375366, + "rewards/rejected": -5.330941200256348, + "step": 1650 + }, + { + "epoch": 0.18, + "grad_norm": 17.375, + "learning_rate": 4.898511750000945e-06, + "logits/chosen": -1.6868972778320312, + "logits/rejected": -1.7233774662017822, + "logps/chosen": -575.5565185546875, + "logps/rejected": -778.8382568359375, + "loss": 0.4352, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.5181102752685547, + "rewards/margins": 1.8371772766113281, + "rewards/rejected": -5.355288028717041, + "step": 1660 + }, + { + "epoch": 0.18, + "grad_norm": 7.3125, + "learning_rate": 4.895795501227129e-06, + "logits/chosen": -1.7727956771850586, + "logits/rejected": -1.703072190284729, + "logps/chosen": -502.931884765625, + "logps/rejected": -759.9163818359375, + "loss": 0.3062, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.904383659362793, + "rewards/margins": 2.637402057647705, + "rewards/rejected": -5.54178524017334, + "step": 1670 + }, + { + "epoch": 0.18, + "grad_norm": 12.9375, + "learning_rate": 4.8930441551083116e-06, + "logits/chosen": -1.757830023765564, + "logits/rejected": -1.6589701175689697, + "logps/chosen": -619.2037963867188, + "logps/rejected": -841.9880981445312, + "loss": 0.275, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.5335075855255127, + "rewards/margins": 2.527801990509033, + "rewards/rejected": -6.061309337615967, + "step": 1680 + }, + { + "epoch": 0.19, + "grad_norm": 16.375, + "learning_rate": 4.8902577519505e-06, + "logits/chosen": -1.6742019653320312, + "logits/rejected": -1.5545579195022583, + "logps/chosen": -744.1402587890625, + "logps/rejected": -958.6746826171875, + "loss": 0.3465, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.717513084411621, + "rewards/margins": 2.5799853801727295, + "rewards/rejected": -7.2974982261657715, + "step": 1690 + }, + { + "epoch": 0.19, + "grad_norm": 14.5, + "learning_rate": 4.887436332573269e-06, + "logits/chosen": -1.6981080770492554, + "logits/rejected": -1.5422903299331665, + "logps/chosen": -667.7444458007812, + "logps/rejected": -903.4476318359375, + "loss": 0.3323, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.170659065246582, + "rewards/margins": 2.6540169715881348, + "rewards/rejected": -6.824675559997559, + "step": 1700 + }, + { + "epoch": 0.19, + "grad_norm": 5.09375, + "learning_rate": 4.8845799383091624e-06, + "logits/chosen": -1.5376986265182495, + "logits/rejected": -1.3050415515899658, + "logps/chosen": -684.7621459960938, + "logps/rejected": -937.4235229492188, + "loss": 0.3492, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.572120666503906, + "rewards/margins": 2.85258150100708, + "rewards/rejected": -7.4247026443481445, + "step": 1710 + }, + { + "epoch": 0.19, + "grad_norm": 5.5, + "learning_rate": 4.8816886110030956e-06, + "logits/chosen": -1.50538170337677, + "logits/rejected": -1.3387202024459839, + "logps/chosen": -707.296875, + "logps/rejected": -952.4454956054688, + "loss": 0.3487, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.720381259918213, + "rewards/margins": 2.521765947341919, + "rewards/rejected": -7.2421464920043945, + "step": 1720 + }, + { + "epoch": 0.19, + "grad_norm": 15.0625, + "learning_rate": 4.878762393011735e-06, + "logits/chosen": -1.5213243961334229, + "logits/rejected": -1.3421592712402344, + "logps/chosen": -852.3511962890625, + "logps/rejected": -1091.234619140625, + "loss": 0.3853, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.962169170379639, + "rewards/margins": 2.756431818008423, + "rewards/rejected": -8.71860122680664, + "step": 1730 + }, + { + "epoch": 0.19, + "grad_norm": 9.0, + "learning_rate": 4.875801327202881e-06, + "logits/chosen": -1.6128612756729126, + "logits/rejected": -1.5079094171524048, + "logps/chosen": -646.0087890625, + "logps/rejected": -901.9317626953125, + "loss": 0.3273, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9934964179992676, + "rewards/margins": 2.6840078830718994, + "rewards/rejected": -6.677504062652588, + "step": 1740 + }, + { + "epoch": 0.19, + "grad_norm": 7.96875, + "learning_rate": 4.872805456954837e-06, + "logits/chosen": -1.6950695514678955, + "logits/rejected": -1.4976861476898193, + "logps/chosen": -701.8513793945312, + "logps/rejected": -902.2637939453125, + "loss": 0.441, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.194971084594727, + "rewards/margins": 2.463911533355713, + "rewards/rejected": -6.658883094787598, + "step": 1750 + }, + { + "epoch": 0.19, + "grad_norm": 12.0625, + "learning_rate": 4.86977482615578e-06, + "logits/chosen": -1.7622724771499634, + "logits/rejected": -1.6467702388763428, + "logps/chosen": -596.7037353515625, + "logps/rejected": -848.4417114257812, + "loss": 0.3332, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.473130464553833, + "rewards/margins": 2.713343381881714, + "rewards/rejected": -6.1864728927612305, + "step": 1760 + }, + { + "epoch": 0.19, + "grad_norm": 9.9375, + "learning_rate": 4.86670947920311e-06, + "logits/chosen": -1.6568279266357422, + "logits/rejected": -1.4951436519622803, + "logps/chosen": -596.8392944335938, + "logps/rejected": -764.641845703125, + "loss": 0.4159, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.4699103832244873, + "rewards/margins": 2.223597288131714, + "rewards/rejected": -5.693507671356201, + "step": 1770 + }, + { + "epoch": 0.2, + "grad_norm": 11.75, + "learning_rate": 4.863609461002806e-06, + "logits/chosen": -1.9406402111053467, + "logits/rejected": -1.8285242319107056, + "logps/chosen": -511.17999267578125, + "logps/rejected": -689.396484375, + "loss": 0.3648, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.585395097732544, + "rewards/margins": 1.9654815196990967, + "rewards/rejected": -4.550877571105957, + "step": 1780 + }, + { + "epoch": 0.2, + "grad_norm": 6.3125, + "learning_rate": 4.860474816968763e-06, + "logits/chosen": -1.9118030071258545, + "logits/rejected": -1.7153613567352295, + "logps/chosen": -525.9435424804688, + "logps/rejected": -674.3565673828125, + "loss": 0.3374, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.4560446739196777, + "rewards/margins": 2.0078539848327637, + "rewards/rejected": -4.463898658752441, + "step": 1790 + }, + { + "epoch": 0.2, + "grad_norm": 9.0625, + "learning_rate": 4.857305593022132e-06, + "logits/chosen": -1.8036915063858032, + "logits/rejected": -1.6467492580413818, + "logps/chosen": -536.428955078125, + "logps/rejected": -724.3878784179688, + "loss": 0.3819, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.147836685180664, + "rewards/margins": 2.1298842430114746, + "rewards/rejected": -5.2777204513549805, + "step": 1800 + }, + { + "epoch": 0.2, + "grad_norm": 18.5, + "learning_rate": 4.85410183559064e-06, + "logits/chosen": -1.7944046258926392, + "logits/rejected": -1.7419912815093994, + "logps/chosen": -580.3853149414062, + "logps/rejected": -805.3980712890625, + "loss": 0.405, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1071701049804688, + "rewards/margins": 2.100389242172241, + "rewards/rejected": -5.207559585571289, + "step": 1810 + }, + { + "epoch": 0.2, + "grad_norm": 9.1875, + "learning_rate": 4.85086359160792e-06, + "logits/chosen": -1.715841293334961, + "logits/rejected": -1.58900785446167, + "logps/chosen": -550.0962524414062, + "logps/rejected": -736.8685302734375, + "loss": 0.4055, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.0267930030822754, + "rewards/margins": 2.080324649810791, + "rewards/rejected": -5.107117652893066, + "step": 1820 + }, + { + "epoch": 0.2, + "grad_norm": 7.75, + "learning_rate": 4.847590908512814e-06, + "logits/chosen": -1.7633041143417358, + "logits/rejected": -1.6234302520751953, + "logps/chosen": -579.5623168945312, + "logps/rejected": -834.1442260742188, + "loss": 0.3518, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.146751880645752, + "rewards/margins": 2.793820858001709, + "rewards/rejected": -5.940572261810303, + "step": 1830 + }, + { + "epoch": 0.2, + "grad_norm": 8.125, + "learning_rate": 4.844283834248681e-06, + "logits/chosen": -1.72641921043396, + "logits/rejected": -1.5783042907714844, + "logps/chosen": -602.4619750976562, + "logps/rejected": -790.2619018554688, + "loss": 0.4045, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.565643310546875, + "rewards/margins": 2.098188638687134, + "rewards/rejected": -5.66383171081543, + "step": 1840 + }, + { + "epoch": 0.2, + "grad_norm": 4.46875, + "learning_rate": 4.840942417262699e-06, + "logits/chosen": -1.679517388343811, + "logits/rejected": -1.547668218612671, + "logps/chosen": -662.6768188476562, + "logps/rejected": -823.7969970703125, + "loss": 0.3232, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.702730655670166, + "rewards/margins": 2.315844774246216, + "rewards/rejected": -6.0185747146606445, + "step": 1850 + }, + { + "epoch": 0.2, + "grad_norm": 6.84375, + "learning_rate": 4.837566706505149e-06, + "logits/chosen": -1.599431037902832, + "logits/rejected": -1.4237494468688965, + "logps/chosen": -699.7108764648438, + "logps/rejected": -896.8258056640625, + "loss": 0.4644, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.556941032409668, + "rewards/margins": 2.101416826248169, + "rewards/rejected": -6.6583571434021, + "step": 1860 + }, + { + "epoch": 0.21, + "grad_norm": 5.6875, + "learning_rate": 4.834156751428702e-06, + "logits/chosen": -1.609161376953125, + "logits/rejected": -1.3990987539291382, + "logps/chosen": -697.2935791015625, + "logps/rejected": -889.5928955078125, + "loss": 0.3877, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.595686435699463, + "rewards/margins": 2.2219977378845215, + "rewards/rejected": -6.817684173583984, + "step": 1870 + }, + { + "epoch": 0.21, + "grad_norm": 9.4375, + "learning_rate": 4.830712601987691e-06, + "logits/chosen": -1.5358078479766846, + "logits/rejected": -1.4598655700683594, + "logps/chosen": -674.640380859375, + "logps/rejected": -925.7941284179688, + "loss": 0.3267, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.129373073577881, + "rewards/margins": 2.953611373901367, + "rewards/rejected": -7.08298397064209, + "step": 1880 + }, + { + "epoch": 0.21, + "grad_norm": 11.5, + "learning_rate": 4.827234308637384e-06, + "logits/chosen": -1.7646366357803345, + "logits/rejected": -1.5783905982971191, + "logps/chosen": -635.8955688476562, + "logps/rejected": -823.4091796875, + "loss": 0.3211, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.4353556632995605, + "rewards/margins": 2.4891724586486816, + "rewards/rejected": -5.924528121948242, + "step": 1890 + }, + { + "epoch": 0.21, + "grad_norm": 9.5, + "learning_rate": 4.82372192233324e-06, + "logits/chosen": -1.7017968893051147, + "logits/rejected": -1.6013975143432617, + "logps/chosen": -640.56787109375, + "logps/rejected": -839.2247314453125, + "loss": 0.3439, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.049724102020264, + "rewards/margins": 2.2435050010681152, + "rewards/rejected": -6.293229579925537, + "step": 1900 + }, + { + "epoch": 0.21, + "grad_norm": 13.5625, + "learning_rate": 4.820175494530167e-06, + "logits/chosen": -1.7699428796768188, + "logits/rejected": -1.656707763671875, + "logps/chosen": -696.4724731445312, + "logps/rejected": -936.5437622070312, + "loss": 0.4768, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.422625541687012, + "rewards/margins": 2.5353317260742188, + "rewards/rejected": -6.9579572677612305, + "step": 1910 + }, + { + "epoch": 0.21, + "grad_norm": 15.4375, + "learning_rate": 4.8165950771817635e-06, + "logits/chosen": -1.7166239023208618, + "logits/rejected": -1.6046946048736572, + "logps/chosen": -628.2581176757812, + "logps/rejected": -835.54345703125, + "loss": 0.3241, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.9973514080047607, + "rewards/margins": 2.387856960296631, + "rewards/rejected": -6.385209083557129, + "step": 1920 + }, + { + "epoch": 0.21, + "grad_norm": 10.125, + "learning_rate": 4.812980722739561e-06, + "logits/chosen": -1.774428367614746, + "logits/rejected": -1.5067561864852905, + "logps/chosen": -595.8507080078125, + "logps/rejected": -791.5101928710938, + "loss": 0.3807, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.4049582481384277, + "rewards/margins": 2.3243491649627686, + "rewards/rejected": -5.729308128356934, + "step": 1930 + }, + { + "epoch": 0.21, + "grad_norm": 11.1875, + "learning_rate": 4.8093324841522545e-06, + "logits/chosen": -1.7582162618637085, + "logits/rejected": -1.5303871631622314, + "logps/chosen": -502.6371154785156, + "logps/rejected": -653.8784790039062, + "loss": 0.3797, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6202964782714844, + "rewards/margins": 2.0217912197113037, + "rewards/rejected": -4.642087459564209, + "step": 1940 + }, + { + "epoch": 0.21, + "grad_norm": 4.84375, + "learning_rate": 4.805650414864928e-06, + "logits/chosen": -1.7339603900909424, + "logits/rejected": -1.61464524269104, + "logps/chosen": -549.4959716796875, + "logps/rejected": -771.3903198242188, + "loss": 0.3365, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.108638286590576, + "rewards/margins": 2.5363717079162598, + "rewards/rejected": -5.645009994506836, + "step": 1950 + }, + { + "epoch": 0.21, + "grad_norm": 7.59375, + "learning_rate": 4.801934568818265e-06, + "logits/chosen": -1.7844552993774414, + "logits/rejected": -1.6534397602081299, + "logps/chosen": -544.9444580078125, + "logps/rejected": -742.1279296875, + "loss": 0.3024, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.9675326347351074, + "rewards/margins": 2.412907361984253, + "rewards/rejected": -5.3804402351379395, + "step": 1960 + }, + { + "epoch": 0.22, + "grad_norm": 8.0, + "learning_rate": 4.798185000447771e-06, + "logits/chosen": -1.6767135858535767, + "logits/rejected": -1.6212692260742188, + "logps/chosen": -616.3997192382812, + "logps/rejected": -812.9508056640625, + "loss": 0.3364, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.690504789352417, + "rewards/margins": 2.3401267528533936, + "rewards/rejected": -6.0306315422058105, + "step": 1970 + }, + { + "epoch": 0.22, + "grad_norm": 10.9375, + "learning_rate": 4.794401764682964e-06, + "logits/chosen": -1.7837855815887451, + "logits/rejected": -1.6498676538467407, + "logps/chosen": -546.6578369140625, + "logps/rejected": -729.5473022460938, + "loss": 0.3942, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1827778816223145, + "rewards/margins": 2.0691211223602295, + "rewards/rejected": -5.251899242401123, + "step": 1980 + }, + { + "epoch": 0.22, + "grad_norm": 11.5625, + "learning_rate": 4.790584916946575e-06, + "logits/chosen": -1.7239339351654053, + "logits/rejected": -1.6220976114273071, + "logps/chosen": -630.3147583007812, + "logps/rejected": -814.8788452148438, + "loss": 0.3617, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7947936058044434, + "rewards/margins": 2.1071815490722656, + "rewards/rejected": -5.901975154876709, + "step": 1990 + }, + { + "epoch": 0.22, + "grad_norm": 10.3125, + "learning_rate": 4.786734513153737e-06, + "logits/chosen": -1.508429765701294, + "logits/rejected": -1.5528059005737305, + "logps/chosen": -683.3448486328125, + "logps/rejected": -926.5250244140625, + "loss": 0.4309, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.474085807800293, + "rewards/margins": 2.2944424152374268, + "rewards/rejected": -6.768527984619141, + "step": 2000 + }, + { + "epoch": 0.22, + "grad_norm": 20.25, + "learning_rate": 4.782850609711162e-06, + "logits/chosen": -1.8134782314300537, + "logits/rejected": -1.702701210975647, + "logps/chosen": -614.7468872070312, + "logps/rejected": -814.0368041992188, + "loss": 0.4204, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.45196533203125, + "rewards/margins": 2.598999500274658, + "rewards/rejected": -6.05096435546875, + "step": 2010 + }, + { + "epoch": 0.22, + "grad_norm": 3.4375, + "learning_rate": 4.77893326351632e-06, + "logits/chosen": -1.7865079641342163, + "logits/rejected": -1.662840485572815, + "logps/chosen": -584.34423828125, + "logps/rejected": -799.9741821289062, + "loss": 0.312, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.1864304542541504, + "rewards/margins": 2.596405506134033, + "rewards/rejected": -5.782835960388184, + "step": 2020 + }, + { + "epoch": 0.22, + "grad_norm": 4.75, + "learning_rate": 4.774982531956601e-06, + "logits/chosen": -1.8261836767196655, + "logits/rejected": -1.7252075672149658, + "logps/chosen": -577.5345458984375, + "logps/rejected": -810.6778564453125, + "loss": 0.3252, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1356899738311768, + "rewards/margins": 2.2451186180114746, + "rewards/rejected": -5.3808088302612305, + "step": 2030 + }, + { + "epoch": 0.22, + "grad_norm": 7.0, + "learning_rate": 4.770998472908474e-06, + "logits/chosen": -1.7403227090835571, + "logits/rejected": -1.5963876247406006, + "logps/chosen": -650.9176025390625, + "logps/rejected": -910.06591796875, + "loss": 0.3273, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.535475254058838, + "rewards/margins": 3.0692672729492188, + "rewards/rejected": -6.604742527008057, + "step": 2040 + }, + { + "epoch": 0.22, + "grad_norm": 7.71875, + "learning_rate": 4.7669811447366456e-06, + "logits/chosen": -1.8360906839370728, + "logits/rejected": -1.6912624835968018, + "logps/chosen": -613.3699340820312, + "logps/rejected": -780.2928466796875, + "loss": 0.4429, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.4288430213928223, + "rewards/margins": 2.1679155826568604, + "rewards/rejected": -5.5967583656311035, + "step": 2050 + }, + { + "epoch": 0.23, + "grad_norm": 17.125, + "learning_rate": 4.762930606293195e-06, + "logits/chosen": -1.943179726600647, + "logits/rejected": -1.7242149114608765, + "logps/chosen": -567.2021484375, + "logps/rejected": -694.3384399414062, + "loss": 0.4428, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.6646947860717773, + "rewards/margins": 1.765342116355896, + "rewards/rejected": -4.430037021636963, + "step": 2060 + }, + { + "epoch": 0.23, + "grad_norm": 11.3125, + "learning_rate": 4.758846916916723e-06, + "logits/chosen": -1.9121663570404053, + "logits/rejected": -1.7436062097549438, + "logps/chosen": -513.0670166015625, + "logps/rejected": -703.3099365234375, + "loss": 0.3246, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.622066020965576, + "rewards/margins": 2.5250179767608643, + "rewards/rejected": -5.1470842361450195, + "step": 2070 + }, + { + "epoch": 0.23, + "grad_norm": 6.09375, + "learning_rate": 4.7547301364314706e-06, + "logits/chosen": -1.7420368194580078, + "logits/rejected": -1.6554521322250366, + "logps/chosen": -489.30462646484375, + "logps/rejected": -714.0049438476562, + "loss": 0.4398, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.5048279762268066, + "rewards/margins": 2.1611030101776123, + "rewards/rejected": -4.66593074798584, + "step": 2080 + }, + { + "epoch": 0.23, + "grad_norm": 5.53125, + "learning_rate": 4.750580325146452e-06, + "logits/chosen": -1.751765489578247, + "logits/rejected": -1.612554907798767, + "logps/chosen": -607.8345336914062, + "logps/rejected": -800.8319091796875, + "loss": 0.317, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.540098190307617, + "rewards/margins": 2.2895872592926025, + "rewards/rejected": -5.829685688018799, + "step": 2090 + }, + { + "epoch": 0.23, + "grad_norm": 8.6875, + "learning_rate": 4.746397543854571e-06, + "logits/chosen": -1.5960346460342407, + "logits/rejected": -1.6565916538238525, + "logps/chosen": -666.1968383789062, + "logps/rejected": -846.1529541015625, + "loss": 0.4137, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.241049289703369, + "rewards/margins": 2.0566153526306152, + "rewards/rejected": -6.297663688659668, + "step": 2100 + }, + { + "epoch": 0.23, + "grad_norm": 8.8125, + "learning_rate": 4.742181853831721e-06, + "logits/chosen": -1.8421558141708374, + "logits/rejected": -1.7174789905548096, + "logps/chosen": -711.9818115234375, + "logps/rejected": -979.9613037109375, + "loss": 0.3737, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.880516052246094, + "rewards/margins": 2.587658643722534, + "rewards/rejected": -7.468174934387207, + "step": 2110 + }, + { + "epoch": 0.23, + "grad_norm": 14.125, + "learning_rate": 4.7379333168359e-06, + "logits/chosen": -1.7798649072647095, + "logits/rejected": -1.5828914642333984, + "logps/chosen": -639.2733154296875, + "logps/rejected": -831.1605224609375, + "loss": 0.4617, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8239293098449707, + "rewards/margins": 2.3531832695007324, + "rewards/rejected": -6.177112579345703, + "step": 2120 + }, + { + "epoch": 0.23, + "grad_norm": 7.28125, + "learning_rate": 4.7336519951062956e-06, + "logits/chosen": -1.9025436639785767, + "logits/rejected": -1.831382393836975, + "logps/chosen": -498.43731689453125, + "logps/rejected": -681.0035400390625, + "loss": 0.3754, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3599936962127686, + "rewards/margins": 1.8823719024658203, + "rewards/rejected": -4.24236536026001, + "step": 2130 + }, + { + "epoch": 0.23, + "grad_norm": 9.1875, + "learning_rate": 4.729337951362378e-06, + "logits/chosen": -1.844146490097046, + "logits/rejected": -1.6676963567733765, + "logps/chosen": -622.884765625, + "logps/rejected": -781.4063720703125, + "loss": 0.3561, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.4065089225769043, + "rewards/margins": 2.0723495483398438, + "rewards/rejected": -5.47885799407959, + "step": 2140 + }, + { + "epoch": 0.24, + "grad_norm": 6.21875, + "learning_rate": 4.724991248802982e-06, + "logits/chosen": -1.7362396717071533, + "logits/rejected": -1.6606781482696533, + "logps/chosen": -687.84423828125, + "logps/rejected": -896.1067504882812, + "loss": 0.3267, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.33128547668457, + "rewards/margins": 2.4070208072662354, + "rewards/rejected": -6.738305568695068, + "step": 2150 + }, + { + "epoch": 0.24, + "grad_norm": 5.71875, + "learning_rate": 4.7206119511053785e-06, + "logits/chosen": -1.9050432443618774, + "logits/rejected": -1.7029850482940674, + "logps/chosen": -673.17822265625, + "logps/rejected": -864.0721435546875, + "loss": 0.3621, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.113177299499512, + "rewards/margins": 2.2918124198913574, + "rewards/rejected": -6.404989719390869, + "step": 2160 + }, + { + "epoch": 0.24, + "grad_norm": 9.25, + "learning_rate": 4.716200122424342e-06, + "logits/chosen": -1.7786645889282227, + "logits/rejected": -1.6034196615219116, + "logps/chosen": -647.9398803710938, + "logps/rejected": -924.6145629882812, + "loss": 0.3531, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.923243761062622, + "rewards/margins": 2.890488862991333, + "rewards/rejected": -6.813732147216797, + "step": 2170 + }, + { + "epoch": 0.24, + "grad_norm": 17.375, + "learning_rate": 4.7117558273912135e-06, + "logits/chosen": -1.8197492361068726, + "logits/rejected": -1.591330885887146, + "logps/chosen": -645.5103759765625, + "logps/rejected": -934.8626708984375, + "loss": 0.3161, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.1829514503479, + "rewards/margins": 3.0477194786071777, + "rewards/rejected": -7.230670928955078, + "step": 2180 + }, + { + "epoch": 0.24, + "grad_norm": 22.125, + "learning_rate": 4.70727913111295e-06, + "logits/chosen": -1.6078674793243408, + "logits/rejected": -1.624535322189331, + "logps/chosen": -672.8421020507812, + "logps/rejected": -986.5765380859375, + "loss": 0.3542, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.4793381690979, + "rewards/margins": 3.052140235900879, + "rewards/rejected": -7.5314788818359375, + "step": 2190 + }, + { + "epoch": 0.24, + "grad_norm": 6.03125, + "learning_rate": 4.702770099171174e-06, + "logits/chosen": -1.7934156656265259, + "logits/rejected": -1.7412006855010986, + "logps/chosen": -752.6240234375, + "logps/rejected": -967.0823364257812, + "loss": 0.3425, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.715743541717529, + "rewards/margins": 2.2941176891326904, + "rewards/rejected": -7.009861946105957, + "step": 2200 + }, + { + "epoch": 0.24, + "grad_norm": 8.4375, + "learning_rate": 4.698228797621207e-06, + "logits/chosen": -1.749159574508667, + "logits/rejected": -1.54318368434906, + "logps/chosen": -701.2145385742188, + "logps/rejected": -879.3968505859375, + "loss": 0.4538, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.7158203125, + "rewards/margins": 1.8668746948242188, + "rewards/rejected": -6.582695007324219, + "step": 2210 + }, + { + "epoch": 0.24, + "grad_norm": 4.84375, + "learning_rate": 4.693655292991113e-06, + "logits/chosen": -1.8090322017669678, + "logits/rejected": -1.674194574356079, + "logps/chosen": -615.958251953125, + "logps/rejected": -852.201171875, + "loss": 0.3202, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.6867663860321045, + "rewards/margins": 2.5732481479644775, + "rewards/rejected": -6.260014533996582, + "step": 2220 + }, + { + "epoch": 0.24, + "grad_norm": 14.625, + "learning_rate": 4.6890496522807106e-06, + "logits/chosen": -1.7101945877075195, + "logits/rejected": -1.5192331075668335, + "logps/chosen": -664.71484375, + "logps/rejected": -908.4508056640625, + "loss": 0.3465, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.288886070251465, + "rewards/margins": 2.8523240089416504, + "rewards/rejected": -7.141211032867432, + "step": 2230 + }, + { + "epoch": 0.25, + "grad_norm": 6.15625, + "learning_rate": 4.6844119429606005e-06, + "logits/chosen": -1.5674327611923218, + "logits/rejected": -1.4923779964447021, + "logps/chosen": -700.1953735351562, + "logps/rejected": -1012.2203979492188, + "loss": 0.4172, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.7158403396606445, + "rewards/margins": 3.152606725692749, + "rewards/rejected": -7.868446350097656, + "step": 2240 + }, + { + "epoch": 0.25, + "grad_norm": 7.1875, + "learning_rate": 4.679742232971176e-06, + "logits/chosen": -1.7056481838226318, + "logits/rejected": -1.4993385076522827, + "logps/chosen": -657.4998779296875, + "logps/rejected": -836.1708984375, + "loss": 0.39, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.398617267608643, + "rewards/margins": 1.9246914386749268, + "rewards/rejected": -6.32330846786499, + "step": 2250 + }, + { + "epoch": 0.25, + "grad_norm": 5.125, + "learning_rate": 4.6750405907216214e-06, + "logits/chosen": -1.846686601638794, + "logits/rejected": -1.6531479358673096, + "logps/chosen": -647.4444580078125, + "logps/rejected": -857.6990966796875, + "loss": 0.3374, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.6424338817596436, + "rewards/margins": 2.567347288131714, + "rewards/rejected": -6.209781169891357, + "step": 2260 + }, + { + "epoch": 0.25, + "grad_norm": 9.8125, + "learning_rate": 4.670307085088919e-06, + "logits/chosen": -1.7421979904174805, + "logits/rejected": -1.559637188911438, + "logps/chosen": -687.2337646484375, + "logps/rejected": -925.9066162109375, + "loss": 0.3341, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.3395819664001465, + "rewards/margins": 2.659339189529419, + "rewards/rejected": -6.998921871185303, + "step": 2270 + }, + { + "epoch": 0.25, + "grad_norm": 10.125, + "learning_rate": 4.665541785416834e-06, + "logits/chosen": -1.6589972972869873, + "logits/rejected": -1.5069977045059204, + "logps/chosen": -652.8977661132812, + "logps/rejected": -922.2073974609375, + "loss": 0.3592, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8562488555908203, + "rewards/margins": 2.938584804534912, + "rewards/rejected": -6.794833183288574, + "step": 2280 + }, + { + "epoch": 0.25, + "grad_norm": 10.4375, + "learning_rate": 4.660744761514899e-06, + "logits/chosen": -1.603572130203247, + "logits/rejected": -1.5159236192703247, + "logps/chosen": -594.9527587890625, + "logps/rejected": -843.875, + "loss": 0.3217, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.7605960369110107, + "rewards/margins": 2.3999485969543457, + "rewards/rejected": -6.1605448722839355, + "step": 2290 + }, + { + "epoch": 0.25, + "grad_norm": 6.9375, + "learning_rate": 4.655916083657394e-06, + "logits/chosen": -1.6065428256988525, + "logits/rejected": -1.600203514099121, + "logps/chosen": -669.9727783203125, + "logps/rejected": -909.6575927734375, + "loss": 0.3634, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.289010524749756, + "rewards/margins": 2.5129246711730957, + "rewards/rejected": -6.80193567276001, + "step": 2300 + }, + { + "epoch": 0.25, + "grad_norm": 9.375, + "learning_rate": 4.651055822582314e-06, + "logits/chosen": -1.533845067024231, + "logits/rejected": -1.4352896213531494, + "logps/chosen": -700.1546630859375, + "logps/rejected": -957.0443115234375, + "loss": 0.26, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.702630043029785, + "rewards/margins": 2.8981518745422363, + "rewards/rejected": -7.6007819175720215, + "step": 2310 + }, + { + "epoch": 0.25, + "grad_norm": 22.25, + "learning_rate": 4.646164049490337e-06, + "logits/chosen": -1.4168152809143066, + "logits/rejected": -1.3744142055511475, + "logps/chosen": -914.4959716796875, + "logps/rejected": -1145.4097900390625, + "loss": 0.3703, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.4313859939575195, + "rewards/margins": 2.5605456829071045, + "rewards/rejected": -8.991931915283203, + "step": 2320 + }, + { + "epoch": 0.26, + "grad_norm": 21.125, + "learning_rate": 4.641240836043776e-06, + "logits/chosen": -1.5654710531234741, + "logits/rejected": -1.3139206171035767, + "logps/chosen": -1029.357177734375, + "logps/rejected": -1266.2813720703125, + "loss": 0.3432, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -7.560727119445801, + "rewards/margins": 2.830021381378174, + "rewards/rejected": -10.390748977661133, + "step": 2330 + }, + { + "epoch": 0.26, + "grad_norm": 18.125, + "learning_rate": 4.6362862543655314e-06, + "logits/chosen": -1.5531178712844849, + "logits/rejected": -1.3913999795913696, + "logps/chosen": -937.0853271484375, + "logps/rejected": -1237.7294921875, + "loss": 0.3952, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.953960418701172, + "rewards/margins": 3.1741385459899902, + "rewards/rejected": -10.12809944152832, + "step": 2340 + }, + { + "epoch": 0.26, + "grad_norm": 11.125, + "learning_rate": 4.6313003770380335e-06, + "logits/chosen": -1.6151460409164429, + "logits/rejected": -1.608443021774292, + "logps/chosen": -787.9945068359375, + "logps/rejected": -1090.0296630859375, + "loss": 0.4087, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.520703315734863, + "rewards/margins": 2.7734127044677734, + "rewards/rejected": -8.29411506652832, + "step": 2350 + }, + { + "epoch": 0.26, + "grad_norm": 7.90625, + "learning_rate": 4.626283277102182e-06, + "logits/chosen": -1.7765910625457764, + "logits/rejected": -1.4998692274093628, + "logps/chosen": -660.5685424804688, + "logps/rejected": -928.3544921875, + "loss": 0.2884, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.098455429077148, + "rewards/margins": 2.9521777629852295, + "rewards/rejected": -7.050633430480957, + "step": 2360 + }, + { + "epoch": 0.26, + "grad_norm": 18.125, + "learning_rate": 4.621235028056274e-06, + "logits/chosen": -1.8305524587631226, + "logits/rejected": -1.6008224487304688, + "logps/chosen": -634.6265869140625, + "logps/rejected": -855.9949951171875, + "loss": 0.329, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.020810127258301, + "rewards/margins": 2.545905828475952, + "rewards/rejected": -6.56671667098999, + "step": 2370 + }, + { + "epoch": 0.26, + "grad_norm": 14.1875, + "learning_rate": 4.616155703854923e-06, + "logits/chosen": -1.8088457584381104, + "logits/rejected": -1.6297317743301392, + "logps/chosen": -691.1348876953125, + "logps/rejected": -918.11865234375, + "loss": 0.3006, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.095786094665527, + "rewards/margins": 2.6793673038482666, + "rewards/rejected": -6.775153160095215, + "step": 2380 + }, + { + "epoch": 0.26, + "grad_norm": 13.9375, + "learning_rate": 4.611045378907988e-06, + "logits/chosen": -1.7895567417144775, + "logits/rejected": -1.6470712423324585, + "logps/chosen": -713.7218627929688, + "logps/rejected": -987.1383056640625, + "loss": 0.3267, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.424603462219238, + "rewards/margins": 3.1563782691955566, + "rewards/rejected": -7.580981254577637, + "step": 2390 + }, + { + "epoch": 0.26, + "grad_norm": 15.3125, + "learning_rate": 4.605904128079466e-06, + "logits/chosen": -1.8198306560516357, + "logits/rejected": -1.5846660137176514, + "logps/chosen": -802.4262084960938, + "logps/rejected": -1083.5089111328125, + "loss": 0.3287, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.456070899963379, + "rewards/margins": 3.1990363597869873, + "rewards/rejected": -8.655107498168945, + "step": 2400 + }, + { + "epoch": 0.26, + "grad_norm": 28.375, + "learning_rate": 4.6007320266864115e-06, + "logits/chosen": -1.6783891916275024, + "logits/rejected": -1.4447085857391357, + "logps/chosen": -804.8671264648438, + "logps/rejected": -1153.2021484375, + "loss": 0.3032, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.6516947746276855, + "rewards/margins": 3.853173017501831, + "rewards/rejected": -9.504868507385254, + "step": 2410 + }, + { + "epoch": 0.27, + "grad_norm": 18.25, + "learning_rate": 4.595529150497823e-06, + "logits/chosen": -1.786781907081604, + "logits/rejected": -1.6980905532836914, + "logps/chosen": -803.776611328125, + "logps/rejected": -1066.002685546875, + "loss": 0.3907, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.371316909790039, + "rewards/margins": 2.9260621070861816, + "rewards/rejected": -8.297379493713379, + "step": 2420 + }, + { + "epoch": 0.27, + "grad_norm": 8.5, + "learning_rate": 4.590295575733537e-06, + "logits/chosen": -1.8521955013275146, + "logits/rejected": -1.6868292093276978, + "logps/chosen": -649.4505615234375, + "logps/rejected": -958.0223388671875, + "loss": 0.3521, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.102126121520996, + "rewards/margins": 3.1137630939483643, + "rewards/rejected": -7.215888977050781, + "step": 2430 + }, + { + "epoch": 0.27, + "grad_norm": 9.25, + "learning_rate": 4.585031379063109e-06, + "logits/chosen": -1.7045252323150635, + "logits/rejected": -1.4645246267318726, + "logps/chosen": -720.8236694335938, + "logps/rejected": -1014.7720947265625, + "loss": 0.371, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.843767166137695, + "rewards/margins": 2.9885916709899902, + "rewards/rejected": -7.83236026763916, + "step": 2440 + }, + { + "epoch": 0.27, + "grad_norm": 13.75, + "learning_rate": 4.579736637604693e-06, + "logits/chosen": -1.8547112941741943, + "logits/rejected": -1.6168155670166016, + "logps/chosen": -736.5202026367188, + "logps/rejected": -1012.2490234375, + "loss": 0.3478, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.784785747528076, + "rewards/margins": 3.218639373779297, + "rewards/rejected": -8.003424644470215, + "step": 2450 + }, + { + "epoch": 0.27, + "grad_norm": 7.25, + "learning_rate": 4.574411428923912e-06, + "logits/chosen": -1.793267846107483, + "logits/rejected": -1.5930099487304688, + "logps/chosen": -669.6464233398438, + "logps/rejected": -879.7228393554688, + "loss": 0.3507, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.299405574798584, + "rewards/margins": 2.3168437480926514, + "rewards/rejected": -6.616249084472656, + "step": 2460 + }, + { + "epoch": 0.27, + "grad_norm": 5.40625, + "learning_rate": 4.56905583103272e-06, + "logits/chosen": -1.8413889408111572, + "logits/rejected": -1.694618582725525, + "logps/chosen": -623.8663330078125, + "logps/rejected": -857.3548583984375, + "loss": 0.3944, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.761960506439209, + "rewards/margins": 2.7422902584075928, + "rewards/rejected": -6.504251003265381, + "step": 2470 + }, + { + "epoch": 0.27, + "grad_norm": 10.5, + "learning_rate": 4.563669922388255e-06, + "logits/chosen": -1.7537349462509155, + "logits/rejected": -1.6799370050430298, + "logps/chosen": -593.227294921875, + "logps/rejected": -850.1495971679688, + "loss": 0.3166, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.8012282848358154, + "rewards/margins": 2.4183883666992188, + "rewards/rejected": -6.219616889953613, + "step": 2480 + }, + { + "epoch": 0.27, + "grad_norm": 9.125, + "learning_rate": 4.558253781891701e-06, + "logits/chosen": -1.7070331573486328, + "logits/rejected": -1.6442807912826538, + "logps/chosen": -621.2658081054688, + "logps/rejected": -907.6800537109375, + "loss": 0.3222, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.8588707447052, + "rewards/margins": 3.021385908126831, + "rewards/rejected": -6.880256652832031, + "step": 2490 + }, + { + "epoch": 0.27, + "grad_norm": 9.75, + "learning_rate": 4.552807488887121e-06, + "logits/chosen": -1.6518423557281494, + "logits/rejected": -1.642883539199829, + "logps/chosen": -643.2070922851562, + "logps/rejected": -931.0227661132812, + "loss": 0.3931, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.137774467468262, + "rewards/margins": 2.761242389678955, + "rewards/rejected": -6.899017333984375, + "step": 2500 + }, + { + "epoch": 0.28, + "grad_norm": 9.6875, + "learning_rate": 4.5473311231603e-06, + "logits/chosen": -1.7930376529693604, + "logits/rejected": -1.663177728652954, + "logps/chosen": -630.5863037109375, + "logps/rejected": -852.806640625, + "loss": 0.3411, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.6254143714904785, + "rewards/margins": 2.5663650035858154, + "rewards/rejected": -6.191779136657715, + "step": 2510 + }, + { + "epoch": 0.28, + "grad_norm": 5.875, + "learning_rate": 4.541824764937575e-06, + "logits/chosen": -1.8731021881103516, + "logits/rejected": -1.5891236066818237, + "logps/chosen": -568.6345825195312, + "logps/rejected": -787.8675537109375, + "loss": 0.3054, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0964901447296143, + "rewards/margins": 2.7063517570495605, + "rewards/rejected": -5.802841663360596, + "step": 2520 + }, + { + "epoch": 0.28, + "grad_norm": 6.65625, + "learning_rate": 4.536288494884659e-06, + "logits/chosen": -1.7358343601226807, + "logits/rejected": -1.742810845375061, + "logps/chosen": -643.03662109375, + "logps/rejected": -880.1419067382812, + "loss": 0.4214, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.7804508209228516, + "rewards/margins": 2.4696078300476074, + "rewards/rejected": -6.250058650970459, + "step": 2530 + }, + { + "epoch": 0.28, + "grad_norm": 6.53125, + "learning_rate": 4.530722394105463e-06, + "logits/chosen": -1.8084142208099365, + "logits/rejected": -1.6399023532867432, + "logps/chosen": -602.0330810546875, + "logps/rejected": -821.3145751953125, + "loss": 0.3328, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.485562801361084, + "rewards/margins": 2.505746364593506, + "rewards/rejected": -5.99130916595459, + "step": 2540 + }, + { + "epoch": 0.28, + "grad_norm": 16.25, + "learning_rate": 4.5251265441408995e-06, + "logits/chosen": -1.9206883907318115, + "logits/rejected": -1.6699693202972412, + "logps/chosen": -660.8924560546875, + "logps/rejected": -896.7345581054688, + "loss": 0.3125, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.8069050312042236, + "rewards/margins": 2.955810785293579, + "rewards/rejected": -6.762715816497803, + "step": 2550 + }, + { + "epoch": 0.28, + "grad_norm": 5.1875, + "learning_rate": 4.519501026967699e-06, + "logits/chosen": -1.88157057762146, + "logits/rejected": -1.6978178024291992, + "logps/chosen": -637.7794189453125, + "logps/rejected": -960.7884521484375, + "loss": 0.2713, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.7565255165100098, + "rewards/margins": 3.5921425819396973, + "rewards/rejected": -7.348668098449707, + "step": 2560 + }, + { + "epoch": 0.28, + "grad_norm": 13.0, + "learning_rate": 4.513845924997202e-06, + "logits/chosen": -1.7731971740722656, + "logits/rejected": -1.6160789728164673, + "logps/chosen": -666.6055908203125, + "logps/rejected": -972.20166015625, + "loss": 0.3012, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.320812702178955, + "rewards/margins": 2.965691328048706, + "rewards/rejected": -7.286503791809082, + "step": 2570 + }, + { + "epoch": 0.28, + "grad_norm": 9.5, + "learning_rate": 4.50816132107415e-06, + "logits/chosen": -1.7413545846939087, + "logits/rejected": -1.5816386938095093, + "logps/chosen": -736.944091796875, + "logps/rejected": -1049.9334716796875, + "loss": 0.3336, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.026797294616699, + "rewards/margins": 3.10628604888916, + "rewards/rejected": -8.133084297180176, + "step": 2580 + }, + { + "epoch": 0.28, + "grad_norm": 33.5, + "learning_rate": 4.50244729847548e-06, + "logits/chosen": -1.7032111883163452, + "logits/rejected": -1.5218799114227295, + "logps/chosen": -768.3067626953125, + "logps/rejected": -1104.8232421875, + "loss": 0.3317, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.417250156402588, + "rewards/margins": 3.513458728790283, + "rewards/rejected": -8.930707931518555, + "step": 2590 + }, + { + "epoch": 0.29, + "grad_norm": 9.25, + "learning_rate": 4.496703940909095e-06, + "logits/chosen": -1.6479904651641846, + "logits/rejected": -1.516813039779663, + "logps/chosen": -765.780029296875, + "logps/rejected": -1066.3165283203125, + "loss": 0.4186, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.460672378540039, + "rewards/margins": 3.1719117164611816, + "rewards/rejected": -8.632583618164062, + "step": 2600 + }, + { + "epoch": 0.29, + "grad_norm": 16.75, + "learning_rate": 4.490931332512647e-06, + "logits/chosen": -1.8246721029281616, + "logits/rejected": -1.5898334980010986, + "logps/chosen": -717.524658203125, + "logps/rejected": -958.0949096679688, + "loss": 0.3745, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.360651016235352, + "rewards/margins": 2.8058013916015625, + "rewards/rejected": -7.166452884674072, + "step": 2610 + }, + { + "epoch": 0.29, + "grad_norm": 6.0625, + "learning_rate": 4.485129557852294e-06, + "logits/chosen": -1.8659942150115967, + "logits/rejected": -1.765414834022522, + "logps/chosen": -643.419677734375, + "logps/rejected": -885.83740234375, + "loss": 0.3561, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.94073486328125, + "rewards/margins": 2.4912197589874268, + "rewards/rejected": -6.431954860687256, + "step": 2620 + }, + { + "epoch": 0.29, + "grad_norm": 16.875, + "learning_rate": 4.479298701921472e-06, + "logits/chosen": -1.8935619592666626, + "logits/rejected": -1.739542007446289, + "logps/chosen": -654.87744140625, + "logps/rejected": -835.7071533203125, + "loss": 0.4105, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.887012481689453, + "rewards/margins": 2.0911285877227783, + "rewards/rejected": -5.978140830993652, + "step": 2630 + }, + { + "epoch": 0.29, + "grad_norm": 7.6875, + "learning_rate": 4.473438850139642e-06, + "logits/chosen": -1.8068138360977173, + "logits/rejected": -1.771322250366211, + "logps/chosen": -563.6519775390625, + "logps/rejected": -799.8988037109375, + "loss": 0.3648, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.2894883155822754, + "rewards/margins": 2.5785439014434814, + "rewards/rejected": -5.868031978607178, + "step": 2640 + }, + { + "epoch": 0.29, + "grad_norm": 3.140625, + "learning_rate": 4.467550088351044e-06, + "logits/chosen": -2.001492500305176, + "logits/rejected": -1.8911540508270264, + "logps/chosen": -516.89013671875, + "logps/rejected": -631.4158325195312, + "loss": 0.3663, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2585456371307373, + "rewards/margins": 1.9317734241485596, + "rewards/rejected": -4.190319061279297, + "step": 2650 + }, + { + "epoch": 0.29, + "grad_norm": 8.5, + "learning_rate": 4.461632502823432e-06, + "logits/chosen": -1.8647596836090088, + "logits/rejected": -1.8298171758651733, + "logps/chosen": -564.4725341796875, + "logps/rejected": -827.29296875, + "loss": 0.3478, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.5231659412384033, + "rewards/margins": 2.4797768592834473, + "rewards/rejected": -6.00294303894043, + "step": 2660 + }, + { + "epoch": 0.29, + "grad_norm": 9.3125, + "learning_rate": 4.45568618024682e-06, + "logits/chosen": -1.8346755504608154, + "logits/rejected": -1.7226743698120117, + "logps/chosen": -638.9258422851562, + "logps/rejected": -870.7545166015625, + "loss": 0.3205, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.05217170715332, + "rewards/margins": 2.6186983585357666, + "rewards/rejected": -6.67086935043335, + "step": 2670 + }, + { + "epoch": 0.29, + "grad_norm": 9.4375, + "learning_rate": 4.4497112077322045e-06, + "logits/chosen": -1.8311612606048584, + "logits/rejected": -1.6320518255233765, + "logps/chosen": -714.7132568359375, + "logps/rejected": -952.9669189453125, + "loss": 0.3266, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.814743995666504, + "rewards/margins": 2.5750961303710938, + "rewards/rejected": -7.389841556549072, + "step": 2680 + }, + { + "epoch": 0.29, + "grad_norm": 6.625, + "learning_rate": 4.443707672810292e-06, + "logits/chosen": -1.8469340801239014, + "logits/rejected": -1.6672306060791016, + "logps/chosen": -680.5579833984375, + "logps/rejected": -974.1163940429688, + "loss": 0.3468, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.323453426361084, + "rewards/margins": 3.323153018951416, + "rewards/rejected": -7.6466064453125, + "step": 2690 + }, + { + "epoch": 0.3, + "grad_norm": 14.375, + "learning_rate": 4.437675663430215e-06, + "logits/chosen": -1.8789142370224, + "logits/rejected": -1.7172365188598633, + "logps/chosen": -661.81298828125, + "logps/rejected": -863.11669921875, + "loss": 0.3752, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.031962871551514, + "rewards/margins": 2.3848776817321777, + "rewards/rejected": -6.416840553283691, + "step": 2700 + }, + { + "epoch": 0.3, + "grad_norm": 15.3125, + "learning_rate": 4.431615267958244e-06, + "logits/chosen": -1.8058879375457764, + "logits/rejected": -1.6935449838638306, + "logps/chosen": -701.770263671875, + "logps/rejected": -941.5208129882812, + "loss": 0.3634, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.30440616607666, + "rewards/margins": 2.668226718902588, + "rewards/rejected": -6.97263240814209, + "step": 2710 + }, + { + "epoch": 0.3, + "grad_norm": 22.875, + "learning_rate": 4.425526575176494e-06, + "logits/chosen": -1.7980619668960571, + "logits/rejected": -1.6272914409637451, + "logps/chosen": -686.6034545898438, + "logps/rejected": -1029.8941650390625, + "loss": 0.35, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.6202287673950195, + "rewards/margins": 3.343231201171875, + "rewards/rejected": -7.9634599685668945, + "step": 2720 + }, + { + "epoch": 0.3, + "grad_norm": 10.875, + "learning_rate": 4.419409674281622e-06, + "logits/chosen": -1.7512645721435547, + "logits/rejected": -1.7330853939056396, + "logps/chosen": -690.3822021484375, + "logps/rejected": -970.2443237304688, + "loss": 0.3537, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.381566047668457, + "rewards/margins": 2.8806538581848145, + "rewards/rejected": -7.2622199058532715, + "step": 2730 + }, + { + "epoch": 0.3, + "grad_norm": 20.75, + "learning_rate": 4.413264654883524e-06, + "logits/chosen": -1.7192140817642212, + "logits/rejected": -1.5540837049484253, + "logps/chosen": -640.34228515625, + "logps/rejected": -903.4295654296875, + "loss": 0.378, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.083595275878906, + "rewards/margins": 2.8579540252685547, + "rewards/rejected": -6.941549777984619, + "step": 2740 + }, + { + "epoch": 0.3, + "grad_norm": 10.625, + "learning_rate": 4.407091607004014e-06, + "logits/chosen": -1.860775351524353, + "logits/rejected": -1.7315584421157837, + "logps/chosen": -637.8150634765625, + "logps/rejected": -933.8444213867188, + "loss": 0.3118, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.6638870239257812, + "rewards/margins": 3.033277750015259, + "rewards/rejected": -6.697165012359619, + "step": 2750 + }, + { + "epoch": 0.3, + "grad_norm": 7.78125, + "learning_rate": 4.400890621075518e-06, + "logits/chosen": -1.8842658996582031, + "logits/rejected": -1.7074472904205322, + "logps/chosen": -665.3984985351562, + "logps/rejected": -953.5999145507812, + "loss": 0.3474, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.0106096267700195, + "rewards/margins": 3.2054710388183594, + "rewards/rejected": -7.216081142425537, + "step": 2760 + }, + { + "epoch": 0.3, + "grad_norm": 5.375, + "learning_rate": 4.394661787939737e-06, + "logits/chosen": -1.8050187826156616, + "logits/rejected": -1.5999888181686401, + "logps/chosen": -684.3742065429688, + "logps/rejected": -915.3817138671875, + "loss": 0.3805, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.206297874450684, + "rewards/margins": 2.891274929046631, + "rewards/rejected": -7.097573280334473, + "step": 2770 + }, + { + "epoch": 0.3, + "grad_norm": 6.0625, + "learning_rate": 4.388405198846324e-06, + "logits/chosen": -1.8219858407974243, + "logits/rejected": -1.5969222784042358, + "logps/chosen": -615.5216674804688, + "logps/rejected": -829.7557373046875, + "loss": 0.3464, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.6517574787139893, + "rewards/margins": 2.367446184158325, + "rewards/rejected": -6.019203186035156, + "step": 2780 + }, + { + "epoch": 0.31, + "grad_norm": 10.375, + "learning_rate": 4.382120945451542e-06, + "logits/chosen": -1.877050757408142, + "logits/rejected": -1.7803242206573486, + "logps/chosen": -564.2947998046875, + "logps/rejected": -788.2153930664062, + "loss": 0.4411, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3386852741241455, + "rewards/margins": 2.3518283367156982, + "rewards/rejected": -5.690512657165527, + "step": 2790 + }, + { + "epoch": 0.31, + "grad_norm": 5.15625, + "learning_rate": 4.375809119816926e-06, + "logits/chosen": -1.9508346319198608, + "logits/rejected": -1.7179477214813232, + "logps/chosen": -507.37158203125, + "logps/rejected": -676.9286499023438, + "loss": 0.3594, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.7584104537963867, + "rewards/margins": 2.1321988105773926, + "rewards/rejected": -4.890608787536621, + "step": 2800 + }, + { + "epoch": 0.31, + "grad_norm": 15.625, + "learning_rate": 4.369469814407931e-06, + "logits/chosen": -1.914142370223999, + "logits/rejected": -1.8020412921905518, + "logps/chosen": -549.481689453125, + "logps/rejected": -768.0693359375, + "loss": 0.4121, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.856779098510742, + "rewards/margins": 2.5775933265686035, + "rewards/rejected": -5.434372425079346, + "step": 2810 + }, + { + "epoch": 0.31, + "grad_norm": 17.875, + "learning_rate": 4.36310312209258e-06, + "logits/chosen": -1.8019615411758423, + "logits/rejected": -1.6631901264190674, + "logps/chosen": -619.0120849609375, + "logps/rejected": -832.4976806640625, + "loss": 0.3878, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.7315170764923096, + "rewards/margins": 2.5570669174194336, + "rewards/rejected": -6.288584232330322, + "step": 2820 + }, + { + "epoch": 0.31, + "grad_norm": 8.25, + "learning_rate": 4.3567091361401e-06, + "logits/chosen": -1.7397171258926392, + "logits/rejected": -1.6148868799209595, + "logps/chosen": -658.5980224609375, + "logps/rejected": -961.6345825195312, + "loss": 0.3243, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.154839515686035, + "rewards/margins": 3.2377586364746094, + "rewards/rejected": -7.392598628997803, + "step": 2830 + }, + { + "epoch": 0.31, + "grad_norm": 10.0625, + "learning_rate": 4.350287950219558e-06, + "logits/chosen": -1.7648308277130127, + "logits/rejected": -1.686745047569275, + "logps/chosen": -631.946533203125, + "logps/rejected": -879.3807373046875, + "loss": 0.3101, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.915051221847534, + "rewards/margins": 2.592646598815918, + "rewards/rejected": -6.507698059082031, + "step": 2840 + }, + { + "epoch": 0.31, + "grad_norm": 5.125, + "learning_rate": 4.343839658398491e-06, + "logits/chosen": -1.8027474880218506, + "logits/rejected": -1.6782081127166748, + "logps/chosen": -785.123046875, + "logps/rejected": -1088.863037109375, + "loss": 0.3222, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.03970193862915, + "rewards/margins": 3.144005537033081, + "rewards/rejected": -8.183708190917969, + "step": 2850 + }, + { + "epoch": 0.31, + "grad_norm": 8.125, + "learning_rate": 4.337364355141521e-06, + "logits/chosen": -1.6401450634002686, + "logits/rejected": -1.5747284889221191, + "logps/chosen": -770.7257080078125, + "logps/rejected": -1088.777099609375, + "loss": 0.2874, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.524303436279297, + "rewards/margins": 3.1013340950012207, + "rewards/rejected": -8.62563705444336, + "step": 2860 + }, + { + "epoch": 0.31, + "grad_norm": 10.5, + "learning_rate": 4.3308621353089806e-06, + "logits/chosen": -1.7272131443023682, + "logits/rejected": -1.4576951265335083, + "logps/chosen": -759.34033203125, + "logps/rejected": -973.9849853515625, + "loss": 0.3478, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.959906578063965, + "rewards/margins": 2.663212299346924, + "rewards/rejected": -7.6231184005737305, + "step": 2870 + }, + { + "epoch": 0.32, + "grad_norm": 9.5, + "learning_rate": 4.324333094155515e-06, + "logits/chosen": -1.7019875049591064, + "logits/rejected": -1.462611436843872, + "logps/chosen": -773.68359375, + "logps/rejected": -1054.2923583984375, + "loss": 0.3277, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.21914005279541, + "rewards/margins": 3.045546770095825, + "rewards/rejected": -8.264686584472656, + "step": 2880 + }, + { + "epoch": 0.32, + "grad_norm": 10.9375, + "learning_rate": 4.31777732732869e-06, + "logits/chosen": -1.6625709533691406, + "logits/rejected": -1.5200001001358032, + "logps/chosen": -793.2101440429688, + "logps/rejected": -1097.2315673828125, + "loss": 0.3478, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.424624443054199, + "rewards/margins": 3.308104991912842, + "rewards/rejected": -8.7327299118042, + "step": 2890 + }, + { + "epoch": 0.32, + "grad_norm": 16.125, + "learning_rate": 4.311194930867594e-06, + "logits/chosen": -1.6474769115447998, + "logits/rejected": -1.4816436767578125, + "logps/chosen": -964.2386474609375, + "logps/rejected": -1355.0809326171875, + "loss": 0.3372, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.856487274169922, + "rewards/margins": 4.235562324523926, + "rewards/rejected": -11.092049598693848, + "step": 2900 + }, + { + "epoch": 0.32, + "grad_norm": 6.46875, + "learning_rate": 4.3045860012014225e-06, + "logits/chosen": -1.540679931640625, + "logits/rejected": -1.3765604496002197, + "logps/chosen": -842.7080078125, + "logps/rejected": -1134.4898681640625, + "loss": 0.3775, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.936496734619141, + "rewards/margins": 3.270134687423706, + "rewards/rejected": -9.20663070678711, + "step": 2910 + }, + { + "epoch": 0.32, + "grad_norm": 14.8125, + "learning_rate": 4.297950635148075e-06, + "logits/chosen": -1.6093122959136963, + "logits/rejected": -1.4189211130142212, + "logps/chosen": -861.2029418945312, + "logps/rejected": -1175.083251953125, + "loss": 0.3492, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.912752151489258, + "rewards/margins": 3.653116226196289, + "rewards/rejected": -9.565869331359863, + "step": 2920 + }, + { + "epoch": 0.32, + "grad_norm": 6.15625, + "learning_rate": 4.291288929912731e-06, + "logits/chosen": -1.7269961833953857, + "logits/rejected": -1.4260787963867188, + "logps/chosen": -744.4271240234375, + "logps/rejected": -1044.799072265625, + "loss": 0.3848, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.722255229949951, + "rewards/margins": 3.557300567626953, + "rewards/rejected": -8.279556274414062, + "step": 2930 + }, + { + "epoch": 0.32, + "grad_norm": 7.03125, + "learning_rate": 4.284600983086427e-06, + "logits/chosen": -1.767612099647522, + "logits/rejected": -1.5415174961090088, + "logps/chosen": -675.13232421875, + "logps/rejected": -946.1424560546875, + "loss": 0.3039, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.355227470397949, + "rewards/margins": 3.0903382301330566, + "rewards/rejected": -7.445565700531006, + "step": 2940 + }, + { + "epoch": 0.32, + "grad_norm": 13.4375, + "learning_rate": 4.277886892644628e-06, + "logits/chosen": -1.7471729516983032, + "logits/rejected": -1.6363605260849, + "logps/chosen": -743.9191284179688, + "logps/rejected": -1031.248779296875, + "loss": 0.3606, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.678009986877441, + "rewards/margins": 3.1960835456848145, + "rewards/rejected": -7.874093532562256, + "step": 2950 + }, + { + "epoch": 0.32, + "grad_norm": 9.875, + "learning_rate": 4.27114675694579e-06, + "logits/chosen": -1.6881649494171143, + "logits/rejected": -1.5661311149597168, + "logps/chosen": -676.97216796875, + "logps/rejected": -919.08251953125, + "loss": 0.4585, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.500565052032471, + "rewards/margins": 2.536072015762329, + "rewards/rejected": -7.036637306213379, + "step": 2960 + }, + { + "epoch": 0.33, + "grad_norm": 5.6875, + "learning_rate": 4.2643806747299215e-06, + "logits/chosen": -1.546457052230835, + "logits/rejected": -1.4857088327407837, + "logps/chosen": -689.1171875, + "logps/rejected": -897.0452880859375, + "loss": 0.3766, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.533634185791016, + "rewards/margins": 2.1539206504821777, + "rewards/rejected": -6.687554836273193, + "step": 2970 + }, + { + "epoch": 0.33, + "grad_norm": 11.4375, + "learning_rate": 4.257588745117137e-06, + "logits/chosen": -1.8319320678710938, + "logits/rejected": -1.6175273656845093, + "logps/chosen": -698.0921630859375, + "logps/rejected": -847.1058349609375, + "loss": 0.3708, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.007431983947754, + "rewards/margins": 2.373777389526367, + "rewards/rejected": -6.381209373474121, + "step": 2980 + }, + { + "epoch": 0.33, + "grad_norm": 12.8125, + "learning_rate": 4.250771067606202e-06, + "logits/chosen": -1.7856239080429077, + "logits/rejected": -1.638135552406311, + "logps/chosen": -654.0118408203125, + "logps/rejected": -903.6787109375, + "loss": 0.3679, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.074786186218262, + "rewards/margins": 2.626342535018921, + "rewards/rejected": -6.701128959655762, + "step": 2990 + }, + { + "epoch": 0.33, + "grad_norm": 6.5625, + "learning_rate": 4.243927742073079e-06, + "logits/chosen": -1.8184303045272827, + "logits/rejected": -1.7012269496917725, + "logps/chosen": -648.0172119140625, + "logps/rejected": -786.4598388671875, + "loss": 0.4536, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.711496353149414, + "rewards/margins": 1.8608808517456055, + "rewards/rejected": -5.5723772048950195, + "step": 3000 + }, + { + "epoch": 0.33, + "grad_norm": 6.03125, + "learning_rate": 4.23705886876946e-06, + "logits/chosen": -1.8321473598480225, + "logits/rejected": -1.6904895305633545, + "logps/chosen": -633.664306640625, + "logps/rejected": -920.38330078125, + "loss": 0.2791, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8917510509490967, + "rewards/margins": 2.9860751628875732, + "rewards/rejected": -6.877825736999512, + "step": 3010 + }, + { + "epoch": 0.33, + "grad_norm": 7.25, + "learning_rate": 4.230164548321304e-06, + "logits/chosen": -1.6236194372177124, + "logits/rejected": -1.5045777559280396, + "logps/chosen": -771.1322021484375, + "logps/rejected": -1063.0523681640625, + "loss": 0.3098, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.411649227142334, + "rewards/margins": 2.9278597831726074, + "rewards/rejected": -8.339509010314941, + "step": 3020 + }, + { + "epoch": 0.33, + "grad_norm": 15.9375, + "learning_rate": 4.223244881727358e-06, + "logits/chosen": -1.7324135303497314, + "logits/rejected": -1.6419090032577515, + "logps/chosen": -747.5384521484375, + "logps/rejected": -984.9309692382812, + "loss": 0.3754, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.048555850982666, + "rewards/margins": 2.6506364345550537, + "rewards/rejected": -7.699190616607666, + "step": 3030 + }, + { + "epoch": 0.33, + "grad_norm": 15.5, + "learning_rate": 4.216299970357678e-06, + "logits/chosen": -1.7320209741592407, + "logits/rejected": -1.5307482481002808, + "logps/chosen": -606.3222045898438, + "logps/rejected": -850.8665771484375, + "loss": 0.3579, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.924663543701172, + "rewards/margins": 2.7316477298736572, + "rewards/rejected": -6.65631103515625, + "step": 3040 + }, + { + "epoch": 0.33, + "grad_norm": 11.125, + "learning_rate": 4.209329915952145e-06, + "logits/chosen": -1.8027076721191406, + "logits/rejected": -1.6230281591415405, + "logps/chosen": -714.841552734375, + "logps/rejected": -966.0123901367188, + "loss": 0.4092, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.130362033843994, + "rewards/margins": 2.7953155040740967, + "rewards/rejected": -6.925677299499512, + "step": 3050 + }, + { + "epoch": 0.34, + "grad_norm": 7.0, + "learning_rate": 4.202334820618976e-06, + "logits/chosen": -1.7829900979995728, + "logits/rejected": -1.6366180181503296, + "logps/chosen": -673.6806640625, + "logps/rejected": -902.1439208984375, + "loss": 0.3396, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.039895534515381, + "rewards/margins": 2.9028658866882324, + "rewards/rejected": -6.942761421203613, + "step": 3060 + }, + { + "epoch": 0.34, + "grad_norm": 20.0, + "learning_rate": 4.195314786833226e-06, + "logits/chosen": -1.7807728052139282, + "logits/rejected": -1.6683670282363892, + "logps/chosen": -691.9683227539062, + "logps/rejected": -920.7662963867188, + "loss": 0.3201, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.230267524719238, + "rewards/margins": 2.434760570526123, + "rewards/rejected": -6.665027618408203, + "step": 3070 + }, + { + "epoch": 0.34, + "grad_norm": 6.03125, + "learning_rate": 4.188269917435284e-06, + "logits/chosen": -1.8601696491241455, + "logits/rejected": -1.691224455833435, + "logps/chosen": -767.4114990234375, + "logps/rejected": -1053.155517578125, + "loss": 0.3316, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.046138763427734, + "rewards/margins": 3.024237871170044, + "rewards/rejected": -8.0703763961792, + "step": 3080 + }, + { + "epoch": 0.34, + "grad_norm": 6.46875, + "learning_rate": 4.1812003156293746e-06, + "logits/chosen": -1.6772750616073608, + "logits/rejected": -1.413812518119812, + "logps/chosen": -798.2786254882812, + "logps/rejected": -1134.8336181640625, + "loss": 0.289, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.309187889099121, + "rewards/margins": 3.91924786567688, + "rewards/rejected": -9.228434562683105, + "step": 3090 + }, + { + "epoch": 0.34, + "grad_norm": 5.53125, + "learning_rate": 4.174106084982038e-06, + "logits/chosen": -1.6264317035675049, + "logits/rejected": -1.5525383949279785, + "logps/chosen": -769.614013671875, + "logps/rejected": -1143.1754150390625, + "loss": 0.3892, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.245709419250488, + "rewards/margins": 3.952500820159912, + "rewards/rejected": -9.198210716247559, + "step": 3100 + }, + { + "epoch": 0.34, + "grad_norm": 19.625, + "learning_rate": 4.166987329420617e-06, + "logits/chosen": -1.819851279258728, + "logits/rejected": -1.6190983057022095, + "logps/chosen": -662.8875732421875, + "logps/rejected": -1021.9993286132812, + "loss": 0.353, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.123751163482666, + "rewards/margins": 3.588541030883789, + "rewards/rejected": -7.712292671203613, + "step": 3110 + }, + { + "epoch": 0.34, + "grad_norm": 4.4375, + "learning_rate": 4.1598441532317354e-06, + "logits/chosen": -1.815704345703125, + "logits/rejected": -1.5594969987869263, + "logps/chosen": -643.9127807617188, + "logps/rejected": -1007.67138671875, + "loss": 0.3267, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.976807117462158, + "rewards/margins": 4.193514347076416, + "rewards/rejected": -8.17032241821289, + "step": 3120 + }, + { + "epoch": 0.34, + "grad_norm": 7.5, + "learning_rate": 4.152676661059763e-06, + "logits/chosen": -1.814147710800171, + "logits/rejected": -1.6966230869293213, + "logps/chosen": -625.9550170898438, + "logps/rejected": -870.1177978515625, + "loss": 0.3574, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.638632297515869, + "rewards/margins": 2.8637073040008545, + "rewards/rejected": -6.5023393630981445, + "step": 3130 + }, + { + "epoch": 0.34, + "grad_norm": 5.53125, + "learning_rate": 4.145484957905296e-06, + "logits/chosen": -1.8992061614990234, + "logits/rejected": -1.6906745433807373, + "logps/chosen": -716.0441284179688, + "logps/rejected": -967.3855590820312, + "loss": 0.2994, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.009130477905273, + "rewards/margins": 3.1680514812469482, + "rewards/rejected": -7.177181243896484, + "step": 3140 + }, + { + "epoch": 0.35, + "grad_norm": 5.875, + "learning_rate": 4.138269149123605e-06, + "logits/chosen": -1.74996817111969, + "logits/rejected": -1.6120145320892334, + "logps/chosen": -578.5724487304688, + "logps/rejected": -882.8814697265625, + "loss": 0.3658, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.5675582885742188, + "rewards/margins": 3.063821315765381, + "rewards/rejected": -6.631380558013916, + "step": 3150 + }, + { + "epoch": 0.35, + "grad_norm": 5.8125, + "learning_rate": 4.131029340423096e-06, + "logits/chosen": -1.7814871072769165, + "logits/rejected": -1.6990219354629517, + "logps/chosen": -614.2649536132812, + "logps/rejected": -885.3175659179688, + "loss": 0.3921, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.5258917808532715, + "rewards/margins": 3.0348682403564453, + "rewards/rejected": -6.560759544372559, + "step": 3160 + }, + { + "epoch": 0.35, + "grad_norm": 6.5, + "learning_rate": 4.1237656378637695e-06, + "logits/chosen": -1.8630361557006836, + "logits/rejected": -1.6976873874664307, + "logps/chosen": -599.9679565429688, + "logps/rejected": -792.5631713867188, + "loss": 0.3736, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.309110641479492, + "rewards/margins": 2.313337564468384, + "rewards/rejected": -5.622448444366455, + "step": 3170 + }, + { + "epoch": 0.35, + "grad_norm": 7.65625, + "learning_rate": 4.116478147855655e-06, + "logits/chosen": -1.8541367053985596, + "logits/rejected": -1.5817817449569702, + "logps/chosen": -720.530517578125, + "logps/rejected": -1013.4244995117188, + "loss": 0.2959, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.644924163818359, + "rewards/margins": 3.345923662185669, + "rewards/rejected": -7.990847587585449, + "step": 3180 + }, + { + "epoch": 0.35, + "grad_norm": 14.5625, + "learning_rate": 4.1091669771572605e-06, + "logits/chosen": -1.5258910655975342, + "logits/rejected": -1.320874810218811, + "logps/chosen": -900.20361328125, + "logps/rejected": -1216.658203125, + "loss": 0.4195, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.907217502593994, + "rewards/margins": 3.4350783824920654, + "rewards/rejected": -10.342296600341797, + "step": 3190 + }, + { + "epoch": 0.35, + "grad_norm": 7.625, + "learning_rate": 4.1018322328740045e-06, + "logits/chosen": -1.6131141185760498, + "logits/rejected": -1.4722521305084229, + "logps/chosen": -923.4912109375, + "logps/rejected": -1197.76171875, + "loss": 0.4053, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.626331329345703, + "rewards/margins": 2.8401241302490234, + "rewards/rejected": -9.466455459594727, + "step": 3200 + }, + { + "epoch": 0.35, + "grad_norm": 7.8125, + "learning_rate": 4.0944740224566485e-06, + "logits/chosen": -1.666741132736206, + "logits/rejected": -1.6140238046646118, + "logps/chosen": -729.1771850585938, + "logps/rejected": -986.33984375, + "loss": 0.3472, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.623842239379883, + "rewards/margins": 2.821453809738159, + "rewards/rejected": -7.445296287536621, + "step": 3210 + }, + { + "epoch": 0.35, + "grad_norm": 8.5625, + "learning_rate": 4.087092453699722e-06, + "logits/chosen": -1.8436431884765625, + "logits/rejected": -1.701120138168335, + "logps/chosen": -637.4710693359375, + "logps/rejected": -887.5546875, + "loss": 0.2911, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.062954902648926, + "rewards/margins": 2.95530104637146, + "rewards/rejected": -7.018255710601807, + "step": 3220 + }, + { + "epoch": 0.35, + "grad_norm": 9.75, + "learning_rate": 4.079687634739944e-06, + "logits/chosen": -1.783953070640564, + "logits/rejected": -1.6719030141830444, + "logps/chosen": -704.0686645507812, + "logps/rejected": -1015.9108276367188, + "loss": 0.3041, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.340396404266357, + "rewards/margins": 3.3204360008239746, + "rewards/rejected": -7.660832405090332, + "step": 3230 + }, + { + "epoch": 0.36, + "grad_norm": 2.984375, + "learning_rate": 4.0722596740546396e-06, + "logits/chosen": -1.6815096139907837, + "logits/rejected": -1.6264407634735107, + "logps/chosen": -778.5204467773438, + "logps/rejected": -1107.3726806640625, + "loss": 0.3773, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.322343349456787, + "rewards/margins": 3.4361038208007812, + "rewards/rejected": -8.758447647094727, + "step": 3240 + }, + { + "epoch": 0.36, + "grad_norm": 6.5625, + "learning_rate": 4.064808680460149e-06, + "logits/chosen": -1.7088276147842407, + "logits/rejected": -1.6062335968017578, + "logps/chosen": -723.3032836914062, + "logps/rejected": -982.4822387695312, + "loss": 0.3957, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.938814163208008, + "rewards/margins": 2.5391156673431396, + "rewards/rejected": -7.477929592132568, + "step": 3250 + }, + { + "epoch": 0.36, + "grad_norm": 13.625, + "learning_rate": 4.057334763110233e-06, + "logits/chosen": -1.5773365497589111, + "logits/rejected": -1.4870086908340454, + "logps/chosen": -751.6768188476562, + "logps/rejected": -1021.1192626953125, + "loss": 0.3552, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.245368003845215, + "rewards/margins": 2.644904613494873, + "rewards/rejected": -7.890272617340088, + "step": 3260 + }, + { + "epoch": 0.36, + "grad_norm": 15.125, + "learning_rate": 4.0498380314944785e-06, + "logits/chosen": -1.7176663875579834, + "logits/rejected": -1.6359691619873047, + "logps/chosen": -692.2754516601562, + "logps/rejected": -952.640625, + "loss": 0.3959, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.529394149780273, + "rewards/margins": 2.5767228603363037, + "rewards/rejected": -7.10611629486084, + "step": 3270 + }, + { + "epoch": 0.36, + "grad_norm": 13.9375, + "learning_rate": 4.04231859543669e-06, + "logits/chosen": -1.8002666234970093, + "logits/rejected": -1.6360117197036743, + "logps/chosen": -744.1111450195312, + "logps/rejected": -991.4514770507812, + "loss": 0.3912, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.721501350402832, + "rewards/margins": 3.0893428325653076, + "rewards/rejected": -7.810844421386719, + "step": 3280 + }, + { + "epoch": 0.36, + "grad_norm": 15.0, + "learning_rate": 4.03477656509328e-06, + "logits/chosen": -1.7730823755264282, + "logits/rejected": -1.6022087335586548, + "logps/chosen": -699.7794189453125, + "logps/rejected": -953.2333984375, + "loss": 0.3205, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.060299873352051, + "rewards/margins": 2.962904453277588, + "rewards/rejected": -7.0232038497924805, + "step": 3290 + }, + { + "epoch": 0.36, + "grad_norm": 13.6875, + "learning_rate": 4.027212050951661e-06, + "logits/chosen": -1.8707292079925537, + "logits/rejected": -1.684025764465332, + "logps/chosen": -693.9949340820312, + "logps/rejected": -935.79931640625, + "loss": 0.3364, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.253175735473633, + "rewards/margins": 2.7378435134887695, + "rewards/rejected": -6.991018772125244, + "step": 3300 + }, + { + "epoch": 0.36, + "grad_norm": 8.0, + "learning_rate": 4.01962516382862e-06, + "logits/chosen": -1.7355482578277588, + "logits/rejected": -1.6192363500595093, + "logps/chosen": -778.0126953125, + "logps/rejected": -1100.327392578125, + "loss": 0.3037, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.34079647064209, + "rewards/margins": 3.6528754234313965, + "rewards/rejected": -8.993672370910645, + "step": 3310 + }, + { + "epoch": 0.36, + "grad_norm": 10.0, + "learning_rate": 4.012016014868699e-06, + "logits/chosen": -1.748742699623108, + "logits/rejected": -1.5708705186843872, + "logps/chosen": -751.8130493164062, + "logps/rejected": -1023.572265625, + "loss": 0.3546, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.248095512390137, + "rewards/margins": 2.8984551429748535, + "rewards/rejected": -8.146551132202148, + "step": 3320 + }, + { + "epoch": 0.37, + "grad_norm": 8.8125, + "learning_rate": 4.004384715542568e-06, + "logits/chosen": -1.658837080001831, + "logits/rejected": -1.5111137628555298, + "logps/chosen": -789.9134521484375, + "logps/rejected": -1057.479248046875, + "loss": 0.3551, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.232223987579346, + "rewards/margins": 2.74873685836792, + "rewards/rejected": -7.980960845947266, + "step": 3330 + }, + { + "epoch": 0.37, + "grad_norm": 9.3125, + "learning_rate": 3.996731377645387e-06, + "logits/chosen": -1.862450361251831, + "logits/rejected": -1.6684503555297852, + "logps/chosen": -824.6218872070312, + "logps/rejected": -1113.2679443359375, + "loss": 0.2476, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.709708213806152, + "rewards/margins": 3.1390631198883057, + "rewards/rejected": -8.848771095275879, + "step": 3340 + }, + { + "epoch": 0.37, + "grad_norm": 4.78125, + "learning_rate": 3.989056113295172e-06, + "logits/chosen": -1.6206367015838623, + "logits/rejected": -1.5377117395401, + "logps/chosen": -919.9542846679688, + "logps/rejected": -1284.454345703125, + "loss": 0.3032, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.776396751403809, + "rewards/margins": 3.8057003021240234, + "rewards/rejected": -10.582097053527832, + "step": 3350 + }, + { + "epoch": 0.37, + "grad_norm": 26.0, + "learning_rate": 3.981359034931157e-06, + "logits/chosen": -1.623618721961975, + "logits/rejected": -1.6006911993026733, + "logps/chosen": -860.43408203125, + "logps/rejected": -1141.6221923828125, + "loss": 0.4126, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -6.220419883728027, + "rewards/margins": 2.7853264808654785, + "rewards/rejected": -9.005746841430664, + "step": 3360 + }, + { + "epoch": 0.37, + "grad_norm": 12.8125, + "learning_rate": 3.973640255312134e-06, + "logits/chosen": -1.8134819269180298, + "logits/rejected": -1.600759744644165, + "logps/chosen": -854.4776611328125, + "logps/rejected": -1175.90771484375, + "loss": 0.3317, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.388555526733398, + "rewards/margins": 3.4591269493103027, + "rewards/rejected": -9.847681999206543, + "step": 3370 + }, + { + "epoch": 0.37, + "grad_norm": 15.0625, + "learning_rate": 3.965899887514813e-06, + "logits/chosen": -1.7391765117645264, + "logits/rejected": -1.553661584854126, + "logps/chosen": -904.6530151367188, + "logps/rejected": -1177.836669921875, + "loss": 0.373, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.360637187957764, + "rewards/margins": 2.99670147895813, + "rewards/rejected": -9.357338905334473, + "step": 3380 + }, + { + "epoch": 0.37, + "grad_norm": 21.0, + "learning_rate": 3.95813804493216e-06, + "logits/chosen": -1.7204653024673462, + "logits/rejected": -1.5455065965652466, + "logps/chosen": -832.7272338867188, + "logps/rejected": -1085.985107421875, + "loss": 0.3597, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.792819023132324, + "rewards/margins": 3.0068650245666504, + "rewards/rejected": -8.799684524536133, + "step": 3390 + }, + { + "epoch": 0.37, + "grad_norm": 13.9375, + "learning_rate": 3.9503548412717395e-06, + "logits/chosen": -1.7215102910995483, + "logits/rejected": -1.56084406375885, + "logps/chosen": -826.5172119140625, + "logps/rejected": -1092.782958984375, + "loss": 0.3104, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.6354289054870605, + "rewards/margins": 3.2176215648651123, + "rewards/rejected": -8.85305118560791, + "step": 3400 + }, + { + "epoch": 0.37, + "grad_norm": 8.6875, + "learning_rate": 3.9425503905540425e-06, + "logits/chosen": -1.658698320388794, + "logits/rejected": -1.5339425802230835, + "logps/chosen": -865.9066162109375, + "logps/rejected": -1228.95166015625, + "loss": 0.3078, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.399567604064941, + "rewards/margins": 3.5442421436309814, + "rewards/rejected": -9.943809509277344, + "step": 3410 + }, + { + "epoch": 0.37, + "grad_norm": 10.875, + "learning_rate": 3.934724807110822e-06, + "logits/chosen": -1.801999807357788, + "logits/rejected": -1.7076435089111328, + "logps/chosen": -879.8175048828125, + "logps/rejected": -1125.389892578125, + "loss": 0.3492, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.065579414367676, + "rewards/margins": 2.740830659866333, + "rewards/rejected": -8.80640983581543, + "step": 3420 + }, + { + "epoch": 0.38, + "grad_norm": 5.78125, + "learning_rate": 3.926878205583418e-06, + "logits/chosen": -1.835727334022522, + "logits/rejected": -1.6412999629974365, + "logps/chosen": -796.3560791015625, + "logps/rejected": -1036.200439453125, + "loss": 0.2825, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.206488132476807, + "rewards/margins": 3.0628457069396973, + "rewards/rejected": -8.269333839416504, + "step": 3430 + }, + { + "epoch": 0.38, + "grad_norm": 5.875, + "learning_rate": 3.9190107009210725e-06, + "logits/chosen": -1.5755434036254883, + "logits/rejected": -1.4851596355438232, + "logps/chosen": -879.2232666015625, + "logps/rejected": -1259.41552734375, + "loss": 0.2884, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.654427528381348, + "rewards/margins": 3.8516311645507812, + "rewards/rejected": -10.506058692932129, + "step": 3440 + }, + { + "epoch": 0.38, + "grad_norm": 12.125, + "learning_rate": 3.91112240837925e-06, + "logits/chosen": -1.5965105295181274, + "logits/rejected": -1.5096617937088013, + "logps/chosen": -980.5611572265625, + "logps/rejected": -1194.956787109375, + "loss": 0.3113, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -7.223876953125, + "rewards/margins": 2.4613471031188965, + "rewards/rejected": -9.685223579406738, + "step": 3450 + }, + { + "epoch": 0.38, + "grad_norm": 14.625, + "learning_rate": 3.903213443517951e-06, + "logits/chosen": -1.6950514316558838, + "logits/rejected": -1.6710115671157837, + "logps/chosen": -893.9850463867188, + "logps/rejected": -1198.269287109375, + "loss": 0.3529, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.156774997711182, + "rewards/margins": 3.416081666946411, + "rewards/rejected": -9.572855949401855, + "step": 3460 + }, + { + "epoch": 0.38, + "grad_norm": 12.125, + "learning_rate": 3.895283922200015e-06, + "logits/chosen": -1.7677247524261475, + "logits/rejected": -1.5214824676513672, + "logps/chosen": -830.6671752929688, + "logps/rejected": -1145.7469482421875, + "loss": 0.3314, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.837531089782715, + "rewards/margins": 3.347288131713867, + "rewards/rejected": -9.184819221496582, + "step": 3470 + }, + { + "epoch": 0.38, + "grad_norm": 9.6875, + "learning_rate": 3.887333960589421e-06, + "logits/chosen": -1.7601115703582764, + "logits/rejected": -1.7030465602874756, + "logps/chosen": -754.3997192382812, + "logps/rejected": -1062.421875, + "loss": 0.3472, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.149960517883301, + "rewards/margins": 3.0467920303344727, + "rewards/rejected": -8.196752548217773, + "step": 3480 + }, + { + "epoch": 0.38, + "grad_norm": 6.5625, + "learning_rate": 3.879363675149595e-06, + "logits/chosen": -1.8171226978302002, + "logits/rejected": -1.6923831701278687, + "logps/chosen": -657.47265625, + "logps/rejected": -931.1018676757812, + "loss": 0.3312, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.01503849029541, + "rewards/margins": 3.0791122913360596, + "rewards/rejected": -7.094151496887207, + "step": 3490 + }, + { + "epoch": 0.38, + "grad_norm": 6.90625, + "learning_rate": 3.871373182641694e-06, + "logits/chosen": -1.948322057723999, + "logits/rejected": -1.726316213607788, + "logps/chosen": -683.44140625, + "logps/rejected": -992.7235107421875, + "loss": 0.3074, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.266858100891113, + "rewards/margins": 3.536748170852661, + "rewards/rejected": -7.803605556488037, + "step": 3500 + }, + { + "epoch": 0.38, + "grad_norm": 14.4375, + "learning_rate": 3.8633626001229e-06, + "logits/chosen": -1.8469184637069702, + "logits/rejected": -1.6542524099349976, + "logps/chosen": -673.9595336914062, + "logps/rejected": -924.1552734375, + "loss": 0.3306, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.249420642852783, + "rewards/margins": 2.887775421142578, + "rewards/rejected": -7.1371965408325195, + "step": 3510 + }, + { + "epoch": 0.39, + "grad_norm": 7.8125, + "learning_rate": 3.8553320449447085e-06, + "logits/chosen": -1.8664662837982178, + "logits/rejected": -1.7244393825531006, + "logps/chosen": -709.9466552734375, + "logps/rejected": -902.0543823242188, + "loss": 0.2878, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.276833534240723, + "rewards/margins": 2.484013795852661, + "rewards/rejected": -6.760847568511963, + "step": 3520 + }, + { + "epoch": 0.39, + "grad_norm": 7.46875, + "learning_rate": 3.8472816347512e-06, + "logits/chosen": -1.814070463180542, + "logits/rejected": -1.5864160060882568, + "logps/chosen": -741.174072265625, + "logps/rejected": -942.9097900390625, + "loss": 0.366, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.6682047843933105, + "rewards/margins": 2.5133891105651855, + "rewards/rejected": -7.181593894958496, + "step": 3530 + }, + { + "epoch": 0.39, + "grad_norm": 7.375, + "learning_rate": 3.839211487477327e-06, + "logits/chosen": -1.8798805475234985, + "logits/rejected": -1.6832101345062256, + "logps/chosen": -773.2493286132812, + "logps/rejected": -946.4519653320312, + "loss": 0.3203, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.654687404632568, + "rewards/margins": 2.5848886966705322, + "rewards/rejected": -7.239575386047363, + "step": 3540 + }, + { + "epoch": 0.39, + "grad_norm": 16.375, + "learning_rate": 3.8311217213471784e-06, + "logits/chosen": -1.8097636699676514, + "logits/rejected": -1.6332147121429443, + "logps/chosen": -781.9320068359375, + "logps/rejected": -1008.4404296875, + "loss": 0.414, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -5.136870384216309, + "rewards/margins": 2.3919105529785156, + "rewards/rejected": -7.528780937194824, + "step": 3550 + }, + { + "epoch": 0.39, + "grad_norm": 7.03125, + "learning_rate": 3.823012454872253e-06, + "logits/chosen": -1.8550316095352173, + "logits/rejected": -1.681195855140686, + "logps/chosen": -720.2999267578125, + "logps/rejected": -959.1920166015625, + "loss": 0.3316, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.353830814361572, + "rewards/margins": 2.711290121078491, + "rewards/rejected": -7.065121650695801, + "step": 3560 + }, + { + "epoch": 0.39, + "grad_norm": 11.625, + "learning_rate": 3.8148838068497185e-06, + "logits/chosen": -1.7598867416381836, + "logits/rejected": -1.71384596824646, + "logps/chosen": -688.1556396484375, + "logps/rejected": -909.5509643554688, + "loss": 0.3929, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.356272220611572, + "rewards/margins": 2.4079337120056152, + "rewards/rejected": -6.7642059326171875, + "step": 3570 + }, + { + "epoch": 0.39, + "grad_norm": 9.5, + "learning_rate": 3.806735896360676e-06, + "logits/chosen": -1.8148400783538818, + "logits/rejected": -1.7857723236083984, + "logps/chosen": -707.8656005859375, + "logps/rejected": -894.2692260742188, + "loss": 0.3739, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.625170707702637, + "rewards/margins": 2.145871162414551, + "rewards/rejected": -6.771040916442871, + "step": 3580 + }, + { + "epoch": 0.39, + "grad_norm": 7.4375, + "learning_rate": 3.798568842768412e-06, + "logits/chosen": -1.6439335346221924, + "logits/rejected": -1.43841552734375, + "logps/chosen": -703.6756591796875, + "logps/rejected": -981.4595947265625, + "loss": 0.3405, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.793991565704346, + "rewards/margins": 3.056474208831787, + "rewards/rejected": -7.850466251373291, + "step": 3590 + }, + { + "epoch": 0.39, + "grad_norm": 13.1875, + "learning_rate": 3.7903827657166502e-06, + "logits/chosen": -1.6609017848968506, + "logits/rejected": -1.5292900800704956, + "logps/chosen": -723.912841796875, + "logps/rejected": -1038.378662109375, + "loss": 0.2499, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.166630744934082, + "rewards/margins": 3.452667713165283, + "rewards/rejected": -8.619297981262207, + "step": 3600 + }, + { + "epoch": 0.4, + "grad_norm": 16.875, + "learning_rate": 3.7821777851278026e-06, + "logits/chosen": -1.801356315612793, + "logits/rejected": -1.6508514881134033, + "logps/chosen": -785.6228637695312, + "logps/rejected": -1081.4769287109375, + "loss": 0.3305, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.240972518920898, + "rewards/margins": 3.141313076019287, + "rewards/rejected": -8.382286071777344, + "step": 3610 + }, + { + "epoch": 0.4, + "grad_norm": 7.84375, + "learning_rate": 3.7739540212012065e-06, + "logits/chosen": -1.6620194911956787, + "logits/rejected": -1.5934765338897705, + "logps/chosen": -803.1600952148438, + "logps/rejected": -1116.5655517578125, + "loss": 0.361, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.819075107574463, + "rewards/margins": 3.1100993156433105, + "rewards/rejected": -8.929174423217773, + "step": 3620 + }, + { + "epoch": 0.4, + "grad_norm": 6.5, + "learning_rate": 3.765711594411369e-06, + "logits/chosen": -1.759314775466919, + "logits/rejected": -1.467779517173767, + "logps/chosen": -800.6527099609375, + "logps/rejected": -1049.361572265625, + "loss": 0.3722, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.49520206451416, + "rewards/margins": 2.9657607078552246, + "rewards/rejected": -8.460963249206543, + "step": 3630 + }, + { + "epoch": 0.4, + "grad_norm": 11.25, + "learning_rate": 3.757450625506197e-06, + "logits/chosen": -1.7700992822647095, + "logits/rejected": -1.6644207239151, + "logps/chosen": -815.521240234375, + "logps/rejected": -1115.4090576171875, + "loss": 0.3176, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.371822834014893, + "rewards/margins": 3.1899073123931885, + "rewards/rejected": -8.56173038482666, + "step": 3640 + }, + { + "epoch": 0.4, + "grad_norm": 12.125, + "learning_rate": 3.7491712355052344e-06, + "logits/chosen": -1.7033885717391968, + "logits/rejected": -1.589464545249939, + "logps/chosen": -815.9098510742188, + "logps/rejected": -1124.0107421875, + "loss": 0.2706, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.681308746337891, + "rewards/margins": 3.3855297565460205, + "rewards/rejected": -9.066838264465332, + "step": 3650 + }, + { + "epoch": 0.4, + "grad_norm": 20.125, + "learning_rate": 3.740873545697885e-06, + "logits/chosen": -1.6968969106674194, + "logits/rejected": -1.5185701847076416, + "logps/chosen": -871.4266357421875, + "logps/rejected": -1073.5, + "loss": 0.3776, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.137188911437988, + "rewards/margins": 2.2045655250549316, + "rewards/rejected": -8.341753959655762, + "step": 3660 + }, + { + "epoch": 0.4, + "grad_norm": 5.0, + "learning_rate": 3.7325576776416357e-06, + "logits/chosen": -1.6873281002044678, + "logits/rejected": -1.5950572490692139, + "logps/chosen": -777.6759643554688, + "logps/rejected": -1021.8255615234375, + "loss": 0.3747, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.265838623046875, + "rewards/margins": 2.9609577655792236, + "rewards/rejected": -8.226797103881836, + "step": 3670 + }, + { + "epoch": 0.4, + "grad_norm": 9.0, + "learning_rate": 3.7242237531602776e-06, + "logits/chosen": -1.7522151470184326, + "logits/rejected": -1.4457781314849854, + "logps/chosen": -844.4553833007812, + "logps/rejected": -1089.444580078125, + "loss": 0.3274, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.8134002685546875, + "rewards/margins": 2.918278932571411, + "rewards/rejected": -8.73167896270752, + "step": 3680 + }, + { + "epoch": 0.4, + "grad_norm": 9.5, + "learning_rate": 3.71587189434212e-06, + "logits/chosen": -1.661102294921875, + "logits/rejected": -1.4757112264633179, + "logps/chosen": -811.2490234375, + "logps/rejected": -1072.3931884765625, + "loss": 0.27, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.557997226715088, + "rewards/margins": 3.007561445236206, + "rewards/rejected": -8.565557479858398, + "step": 3690 + }, + { + "epoch": 0.41, + "grad_norm": 7.84375, + "learning_rate": 3.707502223538203e-06, + "logits/chosen": -1.7072679996490479, + "logits/rejected": -1.7122284173965454, + "logps/chosen": -812.7813720703125, + "logps/rejected": -998.013671875, + "loss": 0.416, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.577877044677734, + "rewards/margins": 2.0804283618927, + "rewards/rejected": -7.658304691314697, + "step": 3700 + }, + { + "epoch": 0.41, + "grad_norm": 5.9375, + "learning_rate": 3.6991148633605033e-06, + "logits/chosen": -1.7110984325408936, + "logits/rejected": -1.565523386001587, + "logps/chosen": -866.1978759765625, + "logps/rejected": -1143.2158203125, + "loss": 0.2944, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.027534008026123, + "rewards/margins": 3.099961042404175, + "rewards/rejected": -9.127494812011719, + "step": 3710 + }, + { + "epoch": 0.41, + "grad_norm": 2.671875, + "learning_rate": 3.690709936680141e-06, + "logits/chosen": -1.7361294031143188, + "logits/rejected": -1.5887517929077148, + "logps/chosen": -840.6126708984375, + "logps/rejected": -1183.771240234375, + "loss": 0.3003, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.820080757141113, + "rewards/margins": 3.5648980140686035, + "rewards/rejected": -9.384979248046875, + "step": 3720 + }, + { + "epoch": 0.41, + "grad_norm": 12.5625, + "learning_rate": 3.6822875666255752e-06, + "logits/chosen": -1.544201135635376, + "logits/rejected": -1.417991042137146, + "logps/chosen": -864.2254028320312, + "logps/rejected": -1155.492919921875, + "loss": 0.3474, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.094112396240234, + "rewards/margins": 3.2589404582977295, + "rewards/rejected": -9.35305404663086, + "step": 3730 + }, + { + "epoch": 0.41, + "grad_norm": 10.25, + "learning_rate": 3.673847876580804e-06, + "logits/chosen": -1.6829497814178467, + "logits/rejected": -1.4793031215667725, + "logps/chosen": -957.1594848632812, + "logps/rejected": -1237.01318359375, + "loss": 0.284, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.7301836013793945, + "rewards/margins": 3.329521894454956, + "rewards/rejected": -10.05970573425293, + "step": 3740 + }, + { + "epoch": 0.41, + "grad_norm": 10.6875, + "learning_rate": 3.665390990183556e-06, + "logits/chosen": -1.5240252017974854, + "logits/rejected": -1.4051783084869385, + "logps/chosen": -977.9109497070312, + "logps/rejected": -1271.693603515625, + "loss": 0.3086, + "rewards/accuracies": 0.8125, + "rewards/chosen": -7.398751735687256, + "rewards/margins": 3.284559726715088, + "rewards/rejected": -10.683311462402344, + "step": 3750 + }, + { + "epoch": 0.41, + "grad_norm": 9.0625, + "learning_rate": 3.656917031323479e-06, + "logits/chosen": -1.519801378250122, + "logits/rejected": -1.4436184167861938, + "logps/chosen": -883.30908203125, + "logps/rejected": -1234.6417236328125, + "loss": 0.297, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.577837944030762, + "rewards/margins": 3.6773438453674316, + "rewards/rejected": -10.255182266235352, + "step": 3760 + }, + { + "epoch": 0.41, + "grad_norm": 13.875, + "learning_rate": 3.6484261241403237e-06, + "logits/chosen": -1.566115140914917, + "logits/rejected": -1.4563671350479126, + "logps/chosen": -835.1639404296875, + "logps/rejected": -1195.8740234375, + "loss": 0.3187, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.0843024253845215, + "rewards/margins": 3.806533098220825, + "rewards/rejected": -9.890835762023926, + "step": 3770 + }, + { + "epoch": 0.41, + "grad_norm": 15.0, + "learning_rate": 3.6399183930221272e-06, + "logits/chosen": -1.6799434423446655, + "logits/rejected": -1.530663251876831, + "logps/chosen": -785.623291015625, + "logps/rejected": -1073.7479248046875, + "loss": 0.3717, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.4268798828125, + "rewards/margins": 3.1673882007598877, + "rewards/rejected": -8.594268798828125, + "step": 3780 + }, + { + "epoch": 0.42, + "grad_norm": 4.90625, + "learning_rate": 3.6313939626033886e-06, + "logits/chosen": -1.8415935039520264, + "logits/rejected": -1.6387983560562134, + "logps/chosen": -772.8441772460938, + "logps/rejected": -1062.6365966796875, + "loss": 0.2554, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.709065914154053, + "rewards/margins": 3.452897310256958, + "rewards/rejected": -8.16196346282959, + "step": 3790 + }, + { + "epoch": 0.42, + "grad_norm": 26.375, + "learning_rate": 3.622852957763246e-06, + "logits/chosen": -1.7335326671600342, + "logits/rejected": -1.610952377319336, + "logps/chosen": -784.820556640625, + "logps/rejected": -992.4671630859375, + "loss": 0.3893, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.1696672439575195, + "rewards/margins": 2.561317205429077, + "rewards/rejected": -7.730984687805176, + "step": 3800 + }, + { + "epoch": 0.42, + "grad_norm": 8.875, + "learning_rate": 3.6142955036236443e-06, + "logits/chosen": -1.759277105331421, + "logits/rejected": -1.6954658031463623, + "logps/chosen": -815.3748168945312, + "logps/rejected": -1064.999267578125, + "loss": 0.3638, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.647228240966797, + "rewards/margins": 2.809572696685791, + "rewards/rejected": -8.45680046081543, + "step": 3810 + }, + { + "epoch": 0.42, + "grad_norm": 8.3125, + "learning_rate": 3.6057217255475034e-06, + "logits/chosen": -1.6846386194229126, + "logits/rejected": -1.6692529916763306, + "logps/chosen": -817.9186401367188, + "logps/rejected": -1061.077880859375, + "loss": 0.3367, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.512367248535156, + "rewards/margins": 2.7981624603271484, + "rewards/rejected": -8.310529708862305, + "step": 3820 + }, + { + "epoch": 0.42, + "grad_norm": 16.25, + "learning_rate": 3.5971317491368828e-06, + "logits/chosen": -1.6079378128051758, + "logits/rejected": -1.4816101789474487, + "logps/chosen": -848.1488037109375, + "logps/rejected": -1172.9901123046875, + "loss": 0.3363, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.177741050720215, + "rewards/margins": 3.6861069202423096, + "rewards/rejected": -9.863848686218262, + "step": 3830 + }, + { + "epoch": 0.42, + "grad_norm": 13.0, + "learning_rate": 3.5885257002311393e-06, + "logits/chosen": -1.6359446048736572, + "logits/rejected": -1.4892915487289429, + "logps/chosen": -886.916015625, + "logps/rejected": -1150.810546875, + "loss": 0.3809, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.581486701965332, + "rewards/margins": 2.914276599884033, + "rewards/rejected": -9.495763778686523, + "step": 3840 + }, + { + "epoch": 0.42, + "grad_norm": 14.9375, + "learning_rate": 3.579903704905084e-06, + "logits/chosen": -1.6376152038574219, + "logits/rejected": -1.4847644567489624, + "logps/chosen": -839.4830932617188, + "logps/rejected": -1169.574951171875, + "loss": 0.3179, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.823979377746582, + "rewards/margins": 3.583953857421875, + "rewards/rejected": -9.407933235168457, + "step": 3850 + }, + { + "epoch": 0.42, + "grad_norm": 13.5625, + "learning_rate": 3.571265889467138e-06, + "logits/chosen": -1.6352030038833618, + "logits/rejected": -1.4837052822113037, + "logps/chosen": -859.36865234375, + "logps/rejected": -1179.4688720703125, + "loss": 0.3233, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.078195095062256, + "rewards/margins": 3.4678738117218018, + "rewards/rejected": -9.546069145202637, + "step": 3860 + }, + { + "epoch": 0.42, + "grad_norm": 13.125, + "learning_rate": 3.562612380457479e-06, + "logits/chosen": -1.714168906211853, + "logits/rejected": -1.5869059562683105, + "logps/chosen": -736.8973999023438, + "logps/rejected": -1032.638916015625, + "loss": 0.3912, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.122539043426514, + "rewards/margins": 3.153698444366455, + "rewards/rejected": -8.276237487792969, + "step": 3870 + }, + { + "epoch": 0.43, + "grad_norm": 6.9375, + "learning_rate": 3.5539433046461887e-06, + "logits/chosen": -1.7515767812728882, + "logits/rejected": -1.5186426639556885, + "logps/chosen": -830.2685546875, + "logps/rejected": -1052.764892578125, + "loss": 0.3197, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.4353742599487305, + "rewards/margins": 2.7352347373962402, + "rewards/rejected": -8.170609474182129, + "step": 3880 + }, + { + "epoch": 0.43, + "grad_norm": 8.9375, + "learning_rate": 3.545258789031395e-06, + "logits/chosen": -1.6532068252563477, + "logits/rejected": -1.543492317199707, + "logps/chosen": -838.9913940429688, + "logps/rejected": -1127.5892333984375, + "loss": 0.3464, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.892169952392578, + "rewards/margins": 3.052790403366089, + "rewards/rejected": -8.94495964050293, + "step": 3890 + }, + { + "epoch": 0.43, + "grad_norm": 17.875, + "learning_rate": 3.536558960837412e-06, + "logits/chosen": -1.7087767124176025, + "logits/rejected": -1.5840725898742676, + "logps/chosen": -854.00927734375, + "logps/rejected": -1102.651123046875, + "loss": 0.4302, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -5.754866600036621, + "rewards/margins": 2.779989242553711, + "rewards/rejected": -8.534856796264648, + "step": 3900 + }, + { + "epoch": 0.43, + "grad_norm": 5.375, + "learning_rate": 3.527843947512878e-06, + "logits/chosen": -1.6563613414764404, + "logits/rejected": -1.5425432920455933, + "logps/chosen": -896.7854614257812, + "logps/rejected": -1136.749755859375, + "loss": 0.3326, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -6.656495571136475, + "rewards/margins": 2.413738250732422, + "rewards/rejected": -9.070234298706055, + "step": 3910 + }, + { + "epoch": 0.43, + "grad_norm": 29.125, + "learning_rate": 3.5191138767288844e-06, + "logits/chosen": -1.7289520502090454, + "logits/rejected": -1.4805307388305664, + "logps/chosen": -976.8359375, + "logps/rejected": -1252.5855712890625, + "loss": 0.3946, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.875887393951416, + "rewards/margins": 3.222200393676758, + "rewards/rejected": -10.098088264465332, + "step": 3920 + }, + { + "epoch": 0.43, + "grad_norm": 5.875, + "learning_rate": 3.5103688763771106e-06, + "logits/chosen": -1.7309353351593018, + "logits/rejected": -1.5070985555648804, + "logps/chosen": -863.6087036132812, + "logps/rejected": -1097.306396484375, + "loss": 0.2986, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.180785179138184, + "rewards/margins": 2.7020928859710693, + "rewards/rejected": -8.882879257202148, + "step": 3930 + }, + { + "epoch": 0.43, + "grad_norm": 13.125, + "learning_rate": 3.5016090745679446e-06, + "logits/chosen": -1.711968183517456, + "logits/rejected": -1.5483481884002686, + "logps/chosen": -947.0003051757812, + "logps/rejected": -1256.926025390625, + "loss": 0.3394, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.693899631500244, + "rewards/margins": 3.441843032836914, + "rewards/rejected": -10.135743141174316, + "step": 3940 + }, + { + "epoch": 0.43, + "grad_norm": 12.0, + "learning_rate": 3.4928345996286108e-06, + "logits/chosen": -1.7167565822601318, + "logits/rejected": -1.42672598361969, + "logps/chosen": -946.0501098632812, + "logps/rejected": -1168.4315185546875, + "loss": 0.3132, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.709987640380859, + "rewards/margins": 2.842014789581299, + "rewards/rejected": -9.552001953125, + "step": 3950 + }, + { + "epoch": 0.43, + "grad_norm": 16.625, + "learning_rate": 3.4840455801012884e-06, + "logits/chosen": -1.5150548219680786, + "logits/rejected": -1.3856568336486816, + "logps/chosen": -971.4849853515625, + "logps/rejected": -1367.3092041015625, + "loss": 0.336, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.312326908111572, + "rewards/margins": 4.269160270690918, + "rewards/rejected": -11.581486701965332, + "step": 3960 + }, + { + "epoch": 0.44, + "grad_norm": 7.875, + "learning_rate": 3.475242144741228e-06, + "logits/chosen": -1.5725386142730713, + "logits/rejected": -1.4214681386947632, + "logps/chosen": -1021.2891845703125, + "logps/rejected": -1320.8013916015625, + "loss": 0.2838, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -7.6074419021606445, + "rewards/margins": 3.6123204231262207, + "rewards/rejected": -11.219761848449707, + "step": 3970 + }, + { + "epoch": 0.44, + "grad_norm": 11.25, + "learning_rate": 3.466424422514866e-06, + "logits/chosen": -1.641242265701294, + "logits/rejected": -1.4053254127502441, + "logps/chosen": -920.1550903320312, + "logps/rejected": -1280.826416015625, + "loss": 0.3221, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.688957214355469, + "rewards/margins": 3.827831983566284, + "rewards/rejected": -10.516789436340332, + "step": 3980 + }, + { + "epoch": 0.44, + "grad_norm": 5.25, + "learning_rate": 3.457592542597935e-06, + "logits/chosen": -1.766448736190796, + "logits/rejected": -1.5231091976165771, + "logps/chosen": -848.12646484375, + "logps/rejected": -1088.4576416015625, + "loss": 0.371, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.89625358581543, + "rewards/margins": 2.9001736640930176, + "rewards/rejected": -8.796426773071289, + "step": 3990 + }, + { + "epoch": 0.44, + "grad_norm": 8.5, + "learning_rate": 3.4487466343735717e-06, + "logits/chosen": -1.8337370157241821, + "logits/rejected": -1.7262868881225586, + "logps/chosen": -763.9608154296875, + "logps/rejected": -1027.250244140625, + "loss": 0.3181, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.042724609375, + "rewards/margins": 3.0173439979553223, + "rewards/rejected": -8.060068130493164, + "step": 4000 + }, + { + "epoch": 0.44, + "grad_norm": 11.8125, + "learning_rate": 3.4398868274304203e-06, + "logits/chosen": -1.8141720294952393, + "logits/rejected": -1.7070974111557007, + "logps/chosen": -804.0147705078125, + "logps/rejected": -1095.2791748046875, + "loss": 0.2976, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.2392988204956055, + "rewards/margins": 2.8451783657073975, + "rewards/rejected": -8.084477424621582, + "step": 4010 + }, + { + "epoch": 0.44, + "grad_norm": 4.90625, + "learning_rate": 3.4310132515607377e-06, + "logits/chosen": -1.7127498388290405, + "logits/rejected": -1.5779279470443726, + "logps/chosen": -902.0662841796875, + "logps/rejected": -1199.7047119140625, + "loss": 0.3474, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.672659397125244, + "rewards/margins": 3.2791314125061035, + "rewards/rejected": -9.951791763305664, + "step": 4020 + }, + { + "epoch": 0.44, + "grad_norm": 8.4375, + "learning_rate": 3.4221260367584856e-06, + "logits/chosen": -1.6787185668945312, + "logits/rejected": -1.5419065952301025, + "logps/chosen": -905.1513671875, + "logps/rejected": -1287.8984375, + "loss": 0.2746, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.539885520935059, + "rewards/margins": 4.068717002868652, + "rewards/rejected": -10.608602523803711, + "step": 4030 + }, + { + "epoch": 0.44, + "grad_norm": 13.875, + "learning_rate": 3.4132253132174342e-06, + "logits/chosen": -1.7214152812957764, + "logits/rejected": -1.5914138555526733, + "logps/chosen": -891.1428833007812, + "logps/rejected": -1152.3057861328125, + "loss": 0.3818, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.4611358642578125, + "rewards/margins": 2.9782230854034424, + "rewards/rejected": -9.439358711242676, + "step": 4040 + }, + { + "epoch": 0.44, + "grad_norm": 9.0625, + "learning_rate": 3.404311211329247e-06, + "logits/chosen": -1.7428048849105835, + "logits/rejected": -1.622994065284729, + "logps/chosen": -930.5426025390625, + "logps/rejected": -1190.580810546875, + "loss": 0.3824, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.604745388031006, + "rewards/margins": 2.9242148399353027, + "rewards/rejected": -9.528961181640625, + "step": 4050 + }, + { + "epoch": 0.45, + "grad_norm": 10.3125, + "learning_rate": 3.3953838616815764e-06, + "logits/chosen": -1.7631595134735107, + "logits/rejected": -1.7142387628555298, + "logps/chosen": -791.2510375976562, + "logps/rejected": -1087.9287109375, + "loss": 0.3363, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.338350296020508, + "rewards/margins": 3.158299207687378, + "rewards/rejected": -8.496649742126465, + "step": 4060 + }, + { + "epoch": 0.45, + "grad_norm": 8.5, + "learning_rate": 3.3864433950561492e-06, + "logits/chosen": -1.7354265451431274, + "logits/rejected": -1.5557833909988403, + "logps/chosen": -776.976806640625, + "logps/rejected": -1030.87158203125, + "loss": 0.3498, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.204459190368652, + "rewards/margins": 2.962646007537842, + "rewards/rejected": -8.167105674743652, + "step": 4070 + }, + { + "epoch": 0.45, + "grad_norm": 12.375, + "learning_rate": 3.3774899424268483e-06, + "logits/chosen": -1.832986831665039, + "logits/rejected": -1.6982206106185913, + "logps/chosen": -717.0919799804688, + "logps/rejected": -967.1322021484375, + "loss": 0.3504, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.7921142578125, + "rewards/margins": 2.705533981323242, + "rewards/rejected": -7.497647762298584, + "step": 4080 + }, + { + "epoch": 0.45, + "grad_norm": 4.90625, + "learning_rate": 3.3685236349577977e-06, + "logits/chosen": -1.857508659362793, + "logits/rejected": -1.8278354406356812, + "logps/chosen": -736.6697387695312, + "logps/rejected": -995.06396484375, + "loss": 0.3674, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.863997459411621, + "rewards/margins": 2.7255420684814453, + "rewards/rejected": -7.589540004730225, + "step": 4090 + }, + { + "epoch": 0.45, + "grad_norm": 6.375, + "learning_rate": 3.3595446040014368e-06, + "logits/chosen": -1.7319189310073853, + "logits/rejected": -1.5259469747543335, + "logps/chosen": -758.9457397460938, + "logps/rejected": -1097.6107177734375, + "loss": 0.2216, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.303053855895996, + "rewards/margins": 3.954939603805542, + "rewards/rejected": -9.257993698120117, + "step": 4100 + }, + { + "epoch": 0.45, + "grad_norm": 3.03125, + "learning_rate": 3.3505529810965996e-06, + "logits/chosen": -1.6667169332504272, + "logits/rejected": -1.5365195274353027, + "logps/chosen": -779.6480712890625, + "logps/rejected": -1110.586181640625, + "loss": 0.3238, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.53013277053833, + "rewards/margins": 3.356299877166748, + "rewards/rejected": -8.886432647705078, + "step": 4110 + }, + { + "epoch": 0.45, + "grad_norm": 10.875, + "learning_rate": 3.3415488979665895e-06, + "logits/chosen": -1.7522022724151611, + "logits/rejected": -1.6286662817001343, + "logps/chosen": -767.9406127929688, + "logps/rejected": -1019.7306518554688, + "loss": 0.3883, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.965468406677246, + "rewards/margins": 2.892747402191162, + "rewards/rejected": -7.858216285705566, + "step": 4120 + }, + { + "epoch": 0.45, + "grad_norm": 11.75, + "learning_rate": 3.3325324865172417e-06, + "logits/chosen": -1.7419570684432983, + "logits/rejected": -1.5770286321640015, + "logps/chosen": -784.3014526367188, + "logps/rejected": -983.5319213867188, + "loss": 0.3696, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.313802719116211, + "rewards/margins": 2.4930548667907715, + "rewards/rejected": -7.806857109069824, + "step": 4130 + }, + { + "epoch": 0.45, + "grad_norm": 10.5625, + "learning_rate": 3.323503878834997e-06, + "logits/chosen": -1.7354421615600586, + "logits/rejected": -1.5942169427871704, + "logps/chosen": -843.3011474609375, + "logps/rejected": -1183.236572265625, + "loss": 0.3809, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -6.083972930908203, + "rewards/margins": 3.278273820877075, + "rewards/rejected": -9.3622465133667, + "step": 4140 + }, + { + "epoch": 0.45, + "grad_norm": 14.25, + "learning_rate": 3.31446320718497e-06, + "logits/chosen": -1.7270911931991577, + "logits/rejected": -1.5912507772445679, + "logps/chosen": -869.9052734375, + "logps/rejected": -1216.485595703125, + "loss": 0.3167, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.085524559020996, + "rewards/margins": 3.927401304244995, + "rewards/rejected": -10.01292610168457, + "step": 4150 + }, + { + "epoch": 0.46, + "grad_norm": 10.5, + "learning_rate": 3.305410604009002e-06, + "logits/chosen": -1.6486060619354248, + "logits/rejected": -1.4116010665893555, + "logps/chosen": -823.9255981445312, + "logps/rejected": -1169.0379638671875, + "loss": 0.2726, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.0159525871276855, + "rewards/margins": 3.7058632373809814, + "rewards/rejected": -9.721817016601562, + "step": 4160 + }, + { + "epoch": 0.46, + "grad_norm": 8.3125, + "learning_rate": 3.296346201923727e-06, + "logits/chosen": -1.7354011535644531, + "logits/rejected": -1.5042176246643066, + "logps/chosen": -879.6804809570312, + "logps/rejected": -1166.0205078125, + "loss": 0.3063, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.214269161224365, + "rewards/margins": 3.2446606159210205, + "rewards/rejected": -9.458930015563965, + "step": 4170 + }, + { + "epoch": 0.46, + "grad_norm": 13.0, + "learning_rate": 3.2872701337186298e-06, + "logits/chosen": -1.7529525756835938, + "logits/rejected": -1.5280386209487915, + "logps/chosen": -896.4444580078125, + "logps/rejected": -1237.9212646484375, + "loss": 0.366, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.32108736038208, + "rewards/margins": 3.884061098098755, + "rewards/rejected": -10.205148696899414, + "step": 4180 + }, + { + "epoch": 0.46, + "grad_norm": 21.125, + "learning_rate": 3.2781825323540966e-06, + "logits/chosen": -1.7329429388046265, + "logits/rejected": -1.5486929416656494, + "logps/chosen": -925.0105590820312, + "logps/rejected": -1214.88525390625, + "loss": 0.3979, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.715539455413818, + "rewards/margins": 3.1681172847747803, + "rewards/rejected": -9.88365650177002, + "step": 4190 + }, + { + "epoch": 0.46, + "grad_norm": 6.90625, + "learning_rate": 3.269083530959471e-06, + "logits/chosen": -1.6382604837417603, + "logits/rejected": -1.6061570644378662, + "logps/chosen": -898.3675537109375, + "logps/rejected": -1170.880859375, + "loss": 0.2923, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.614271640777588, + "rewards/margins": 2.939804792404175, + "rewards/rejected": -9.554075241088867, + "step": 4200 + }, + { + "epoch": 0.46, + "grad_norm": 25.625, + "learning_rate": 3.2599732628310997e-06, + "logits/chosen": -1.558943271636963, + "logits/rejected": -1.5377624034881592, + "logps/chosen": -945.9231567382812, + "logps/rejected": -1242.856201171875, + "loss": 0.2978, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.835444450378418, + "rewards/margins": 3.2567591667175293, + "rewards/rejected": -10.092204093933105, + "step": 4210 + }, + { + "epoch": 0.46, + "grad_norm": 6.71875, + "learning_rate": 3.2508518614303845e-06, + "logits/chosen": -1.5757478475570679, + "logits/rejected": -1.5064926147460938, + "logps/chosen": -899.0611572265625, + "logps/rejected": -1227.7999267578125, + "loss": 0.2801, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.633514404296875, + "rewards/margins": 3.5772900581359863, + "rewards/rejected": -10.21080493927002, + "step": 4220 + }, + { + "epoch": 0.46, + "grad_norm": 6.71875, + "learning_rate": 3.2417194603818243e-06, + "logits/chosen": -1.5554733276367188, + "logits/rejected": -1.3921247720718384, + "logps/chosen": -960.9365234375, + "logps/rejected": -1300.28369140625, + "loss": 0.2902, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -7.113761901855469, + "rewards/margins": 3.9551892280578613, + "rewards/rejected": -11.068951606750488, + "step": 4230 + }, + { + "epoch": 0.46, + "grad_norm": 11.9375, + "learning_rate": 3.232576193471056e-06, + "logits/chosen": -1.6244421005249023, + "logits/rejected": -1.5446125268936157, + "logps/chosen": -920.2755126953125, + "logps/rejected": -1287.5836181640625, + "loss": 0.3462, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.749262809753418, + "rewards/margins": 3.8735268115997314, + "rewards/rejected": -10.62278938293457, + "step": 4240 + }, + { + "epoch": 0.47, + "grad_norm": 5.6875, + "learning_rate": 3.2234221946428986e-06, + "logits/chosen": -1.5003159046173096, + "logits/rejected": -1.3111991882324219, + "logps/chosen": -1052.294189453125, + "logps/rejected": -1449.4342041015625, + "loss": 0.3119, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -7.947057247161865, + "rewards/margins": 4.450392246246338, + "rewards/rejected": -12.397449493408203, + "step": 4250 + }, + { + "epoch": 0.47, + "grad_norm": 8.4375, + "learning_rate": 3.2142575979993884e-06, + "logits/chosen": -1.5765480995178223, + "logits/rejected": -1.6133105754852295, + "logps/chosen": -1062.2218017578125, + "logps/rejected": -1340.002197265625, + "loss": 0.3231, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -7.858480930328369, + "rewards/margins": 2.8712422847747803, + "rewards/rejected": -10.729722023010254, + "step": 4260 + }, + { + "epoch": 0.47, + "grad_norm": 12.6875, + "learning_rate": 3.2050825377978132e-06, + "logits/chosen": -1.5111725330352783, + "logits/rejected": -1.3289752006530762, + "logps/chosen": -1059.257080078125, + "logps/rejected": -1523.3133544921875, + "loss": 0.275, + "rewards/accuracies": 0.9375, + "rewards/chosen": -7.937346458435059, + "rewards/margins": 4.893280982971191, + "rewards/rejected": -12.83062744140625, + "step": 4270 + }, + { + "epoch": 0.47, + "grad_norm": 6.59375, + "learning_rate": 3.195897148448752e-06, + "logits/chosen": -1.5582547187805176, + "logits/rejected": -1.4566032886505127, + "logps/chosen": -1015.9686279296875, + "logps/rejected": -1487.0274658203125, + "loss": 0.2919, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.858682155609131, + "rewards/margins": 4.774253845214844, + "rewards/rejected": -12.632936477661133, + "step": 4280 + }, + { + "epoch": 0.47, + "grad_norm": 14.6875, + "learning_rate": 3.1867015645140966e-06, + "logits/chosen": -1.627729058265686, + "logits/rejected": -1.4988969564437866, + "logps/chosen": -930.3201904296875, + "logps/rejected": -1309.8641357421875, + "loss": 0.2643, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.838916778564453, + "rewards/margins": 4.006606101989746, + "rewards/rejected": -10.845524787902832, + "step": 4290 + }, + { + "epoch": 0.47, + "grad_norm": 14.6875, + "learning_rate": 3.1774959207050856e-06, + "logits/chosen": -1.6902227401733398, + "logits/rejected": -1.3787007331848145, + "logps/chosen": -963.9844970703125, + "logps/rejected": -1280.5059814453125, + "loss": 0.4064, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -7.143684387207031, + "rewards/margins": 3.2914936542510986, + "rewards/rejected": -10.435178756713867, + "step": 4300 + }, + { + "epoch": 0.47, + "grad_norm": 9.875, + "learning_rate": 3.168280351880333e-06, + "logits/chosen": -1.6307446956634521, + "logits/rejected": -1.5734670162200928, + "logps/chosen": -815.3425903320312, + "logps/rejected": -1124.6192626953125, + "loss": 0.3229, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.559720039367676, + "rewards/margins": 2.9877076148986816, + "rewards/rejected": -8.547428131103516, + "step": 4310 + }, + { + "epoch": 0.47, + "grad_norm": 5.90625, + "learning_rate": 3.159054993043848e-06, + "logits/chosen": -1.6573463678359985, + "logits/rejected": -1.6634113788604736, + "logps/chosen": -762.5718994140625, + "logps/rejected": -1045.3062744140625, + "loss": 0.3572, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.033192157745361, + "rewards/margins": 2.958360195159912, + "rewards/rejected": -7.991551876068115, + "step": 4320 + }, + { + "epoch": 0.47, + "grad_norm": 2.53125, + "learning_rate": 3.149819979343059e-06, + "logits/chosen": -1.827853798866272, + "logits/rejected": -1.505097508430481, + "logps/chosen": -790.1149291992188, + "logps/rejected": -1086.11474609375, + "loss": 0.2791, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.076987266540527, + "rewards/margins": 3.5544357299804688, + "rewards/rejected": -8.63142204284668, + "step": 4330 + }, + { + "epoch": 0.48, + "grad_norm": 7.9375, + "learning_rate": 3.140575446066834e-06, + "logits/chosen": -1.8484938144683838, + "logits/rejected": -1.6618080139160156, + "logps/chosen": -783.72607421875, + "logps/rejected": -1043.5955810546875, + "loss": 0.3489, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.0774126052856445, + "rewards/margins": 3.091043710708618, + "rewards/rejected": -8.16845703125, + "step": 4340 + }, + { + "epoch": 0.48, + "grad_norm": 14.1875, + "learning_rate": 3.1313215286434983e-06, + "logits/chosen": -1.6827160120010376, + "logits/rejected": -1.5614336729049683, + "logps/chosen": -809.9287109375, + "logps/rejected": -1161.54443359375, + "loss": 0.3202, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.700687408447266, + "rewards/margins": 3.7789082527160645, + "rewards/rejected": -9.479597091674805, + "step": 4350 + }, + { + "epoch": 0.48, + "grad_norm": 12.8125, + "learning_rate": 3.1220583626388535e-06, + "logits/chosen": -1.7776035070419312, + "logits/rejected": -1.6708093881607056, + "logps/chosen": -702.1504516601562, + "logps/rejected": -952.6173706054688, + "loss": 0.3111, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.633187770843506, + "rewards/margins": 2.6518521308898926, + "rewards/rejected": -7.285040378570557, + "step": 4360 + }, + { + "epoch": 0.48, + "grad_norm": 6.90625, + "learning_rate": 3.1127860837541847e-06, + "logits/chosen": -1.8694965839385986, + "logits/rejected": -1.7426789999008179, + "logps/chosen": -761.8948974609375, + "logps/rejected": -990.4112548828125, + "loss": 0.4573, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.806753635406494, + "rewards/margins": 2.517787218093872, + "rewards/rejected": -7.324540615081787, + "step": 4370 + }, + { + "epoch": 0.48, + "grad_norm": 9.375, + "learning_rate": 3.1035048278242785e-06, + "logits/chosen": -1.766170859336853, + "logits/rejected": -1.7188829183578491, + "logps/chosen": -691.8225708007812, + "logps/rejected": -999.8314208984375, + "loss": 0.3621, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.453577995300293, + "rewards/margins": 3.1541543006896973, + "rewards/rejected": -7.607731819152832, + "step": 4380 + }, + { + "epoch": 0.48, + "grad_norm": 5.03125, + "learning_rate": 3.094214730815433e-06, + "logits/chosen": -1.8985979557037354, + "logits/rejected": -1.7381813526153564, + "logps/chosen": -691.7315673828125, + "logps/rejected": -909.2740478515625, + "loss": 0.3306, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.886500597000122, + "rewards/margins": 2.603572368621826, + "rewards/rejected": -6.490072727203369, + "step": 4390 + }, + { + "epoch": 0.48, + "grad_norm": 6.53125, + "learning_rate": 3.0849159288234614e-06, + "logits/chosen": -1.950784683227539, + "logits/rejected": -1.8165565729141235, + "logps/chosen": -649.2279052734375, + "logps/rejected": -930.6763916015625, + "loss": 0.3537, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.162286281585693, + "rewards/margins": 2.854403018951416, + "rewards/rejected": -7.016690254211426, + "step": 4400 + }, + { + "epoch": 0.48, + "grad_norm": 5.0, + "learning_rate": 3.0756085580717028e-06, + "logits/chosen": -2.0243048667907715, + "logits/rejected": -1.7273666858673096, + "logps/chosen": -667.6845703125, + "logps/rejected": -881.2068481445312, + "loss": 0.3434, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.9375414848327637, + "rewards/margins": 2.70707631111145, + "rewards/rejected": -6.644617557525635, + "step": 4410 + }, + { + "epoch": 0.48, + "grad_norm": 10.3125, + "learning_rate": 3.0662927549090234e-06, + "logits/chosen": -1.9025144577026367, + "logits/rejected": -1.8328468799591064, + "logps/chosen": -688.2364501953125, + "logps/rejected": -920.9490966796875, + "loss": 0.3421, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.42232608795166, + "rewards/margins": 2.3738596439361572, + "rewards/rejected": -6.796185493469238, + "step": 4420 + }, + { + "epoch": 0.49, + "grad_norm": 7.03125, + "learning_rate": 3.0569686558078217e-06, + "logits/chosen": -1.776731252670288, + "logits/rejected": -1.619316816329956, + "logps/chosen": -678.84326171875, + "logps/rejected": -1008.5579833984375, + "loss": 0.3584, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.808873176574707, + "rewards/margins": 3.1528220176696777, + "rewards/rejected": -7.961695671081543, + "step": 4430 + }, + { + "epoch": 0.49, + "grad_norm": 16.375, + "learning_rate": 3.0476363973620283e-06, + "logits/chosen": -1.7848304510116577, + "logits/rejected": -1.69439697265625, + "logps/chosen": -793.9459228515625, + "logps/rejected": -1062.8822021484375, + "loss": 0.3091, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.048398971557617, + "rewards/margins": 3.055074453353882, + "rewards/rejected": -8.103472709655762, + "step": 4440 + }, + { + "epoch": 0.49, + "grad_norm": 6.71875, + "learning_rate": 3.0382961162851033e-06, + "logits/chosen": -1.8261514902114868, + "logits/rejected": -1.5759637355804443, + "logps/chosen": -738.7652587890625, + "logps/rejected": -965.4041748046875, + "loss": 0.3168, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.835686206817627, + "rewards/margins": 2.8481087684631348, + "rewards/rejected": -7.683794975280762, + "step": 4450 + }, + { + "epoch": 0.49, + "grad_norm": 7.15625, + "learning_rate": 3.0289479494080354e-06, + "logits/chosen": -1.7834552526474, + "logits/rejected": -1.7425804138183594, + "logps/chosen": -708.6024169921875, + "logps/rejected": -1020.5144653320312, + "loss": 0.305, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.569744110107422, + "rewards/margins": 2.9914474487304688, + "rewards/rejected": -7.561191558837891, + "step": 4460 + }, + { + "epoch": 0.49, + "grad_norm": 4.84375, + "learning_rate": 3.019592033677338e-06, + "logits/chosen": -1.8455852270126343, + "logits/rejected": -1.664345383644104, + "logps/chosen": -649.6399536132812, + "logps/rejected": -923.3863525390625, + "loss": 0.2518, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.25808048248291, + "rewards/margins": 3.0072407722473145, + "rewards/rejected": -7.265321254730225, + "step": 4470 + }, + { + "epoch": 0.49, + "grad_norm": 12.1875, + "learning_rate": 3.0102285061530396e-06, + "logits/chosen": -1.8630342483520508, + "logits/rejected": -1.6471426486968994, + "logps/chosen": -738.3119506835938, + "logps/rejected": -960.4339599609375, + "loss": 0.3674, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.971234321594238, + "rewards/margins": 2.5135326385498047, + "rewards/rejected": -7.484766960144043, + "step": 4480 + }, + { + "epoch": 0.49, + "grad_norm": 12.125, + "learning_rate": 3.0008575040066796e-06, + "logits/chosen": -1.8722641468048096, + "logits/rejected": -1.6389853954315186, + "logps/chosen": -667.253173828125, + "logps/rejected": -1001.1974487304688, + "loss": 0.303, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.240373134613037, + "rewards/margins": 3.569399356842041, + "rewards/rejected": -7.8097734451293945, + "step": 4490 + }, + { + "epoch": 0.49, + "grad_norm": 4.03125, + "learning_rate": 2.9914791645192963e-06, + "logits/chosen": -1.8622334003448486, + "logits/rejected": -1.816104531288147, + "logps/chosen": -669.1975708007812, + "logps/rejected": -1032.929443359375, + "loss": 0.2611, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.387300968170166, + "rewards/margins": 3.554762601852417, + "rewards/rejected": -7.942063331604004, + "step": 4500 + }, + { + "epoch": 0.49, + "grad_norm": 4.78125, + "learning_rate": 2.9820936250794177e-06, + "logits/chosen": -1.907970666885376, + "logits/rejected": -1.7195755243301392, + "logps/chosen": -717.5592041015625, + "logps/rejected": -961.0452270507812, + "loss": 0.3511, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.61825704574585, + "rewards/margins": 2.5773959159851074, + "rewards/rejected": -7.195652961730957, + "step": 4510 + }, + { + "epoch": 0.5, + "grad_norm": 8.3125, + "learning_rate": 2.9727010231810477e-06, + "logits/chosen": -1.7633917331695557, + "logits/rejected": -1.6857026815414429, + "logps/chosen": -771.6585693359375, + "logps/rejected": -1031.649658203125, + "loss": 0.316, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.288061618804932, + "rewards/margins": 2.7907071113586426, + "rewards/rejected": -8.078767776489258, + "step": 4520 + }, + { + "epoch": 0.5, + "grad_norm": 10.5, + "learning_rate": 2.963301496421652e-06, + "logits/chosen": -1.7977771759033203, + "logits/rejected": -1.7072093486785889, + "logps/chosen": -712.3453979492188, + "logps/rejected": -1032.0924072265625, + "loss": 0.354, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.614495754241943, + "rewards/margins": 3.3302791118621826, + "rewards/rejected": -7.944774627685547, + "step": 4530 + }, + { + "epoch": 0.5, + "grad_norm": 6.25, + "learning_rate": 2.9538951825001423e-06, + "logits/chosen": -1.8113962411880493, + "logits/rejected": -1.5610144138336182, + "logps/chosen": -701.635986328125, + "logps/rejected": -997.3121948242188, + "loss": 0.3228, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.848748683929443, + "rewards/margins": 3.268231153488159, + "rewards/rejected": -8.116979598999023, + "step": 4540 + }, + { + "epoch": 0.5, + "grad_norm": 5.75, + "learning_rate": 2.944482219214859e-06, + "logits/chosen": -1.6049013137817383, + "logits/rejected": -1.4833654165267944, + "logps/chosen": -823.8816528320312, + "logps/rejected": -1248.07275390625, + "loss": 0.2261, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.065056324005127, + "rewards/margins": 4.461723327636719, + "rewards/rejected": -10.526779174804688, + "step": 4550 + }, + { + "epoch": 0.5, + "grad_norm": 3.28125, + "learning_rate": 2.935062744461554e-06, + "logits/chosen": -1.6843500137329102, + "logits/rejected": -1.6731818914413452, + "logps/chosen": -918.1310424804688, + "logps/rejected": -1335.868408203125, + "loss": 0.32, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.9652299880981445, + "rewards/margins": 4.029031276702881, + "rewards/rejected": -10.994260787963867, + "step": 4560 + }, + { + "epoch": 0.5, + "grad_norm": 12.0625, + "learning_rate": 2.9256368962313687e-06, + "logits/chosen": -1.8122971057891846, + "logits/rejected": -1.6537513732910156, + "logps/chosen": -918.64111328125, + "logps/rejected": -1267.21435546875, + "loss": 0.3419, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.513071537017822, + "rewards/margins": 3.7809951305389404, + "rewards/rejected": -10.294066429138184, + "step": 4570 + }, + { + "epoch": 0.5, + "grad_norm": 8.375, + "learning_rate": 2.9162048126088115e-06, + "logits/chosen": -1.8140627145767212, + "logits/rejected": -1.6087058782577515, + "logps/chosen": -912.8229370117188, + "logps/rejected": -1274.7252197265625, + "loss": 0.3664, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -6.4958295822143555, + "rewards/margins": 3.5449740886688232, + "rewards/rejected": -10.040803909301758, + "step": 4580 + }, + { + "epoch": 0.5, + "grad_norm": 6.40625, + "learning_rate": 2.906766631769738e-06, + "logits/chosen": -1.7398548126220703, + "logits/rejected": -1.6397594213485718, + "logps/chosen": -965.7960815429688, + "logps/rejected": -1230.6099853515625, + "loss": 0.3257, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.0341291427612305, + "rewards/margins": 2.845816135406494, + "rewards/rejected": -9.879945755004883, + "step": 4590 + }, + { + "epoch": 0.5, + "grad_norm": 23.375, + "learning_rate": 2.8973224919793257e-06, + "logits/chosen": -1.6098241806030273, + "logits/rejected": -1.433070182800293, + "logps/chosen": -982.9953002929688, + "logps/rejected": -1209.6009521484375, + "loss": 0.3257, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.4152512550354, + "rewards/margins": 2.706810474395752, + "rewards/rejected": -10.122062683105469, + "step": 4600 + }, + { + "epoch": 0.51, + "grad_norm": 8.8125, + "learning_rate": 2.887872531590048e-06, + "logits/chosen": -1.6789734363555908, + "logits/rejected": -1.5520908832550049, + "logps/chosen": -926.7234497070312, + "logps/rejected": -1242.9869384765625, + "loss": 0.3023, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.823643684387207, + "rewards/margins": 3.4915969371795654, + "rewards/rejected": -10.315240859985352, + "step": 4610 + }, + { + "epoch": 0.51, + "grad_norm": 19.625, + "learning_rate": 2.878416889039647e-06, + "logits/chosen": -1.7004142999649048, + "logits/rejected": -1.5697875022888184, + "logps/chosen": -985.3995971679688, + "logps/rejected": -1286.0374755859375, + "loss": 0.4768, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -7.433926582336426, + "rewards/margins": 3.0062403678894043, + "rewards/rejected": -10.440167427062988, + "step": 4620 + }, + { + "epoch": 0.51, + "grad_norm": 11.9375, + "learning_rate": 2.8689557028491056e-06, + "logits/chosen": -1.7655107975006104, + "logits/rejected": -1.6117546558380127, + "logps/chosen": -912.0537109375, + "logps/rejected": -1163.073486328125, + "loss": 0.3361, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.284945011138916, + "rewards/margins": 2.837637424468994, + "rewards/rejected": -9.122583389282227, + "step": 4630 + }, + { + "epoch": 0.51, + "grad_norm": 6.09375, + "learning_rate": 2.8594891116206192e-06, + "logits/chosen": -1.6980937719345093, + "logits/rejected": -1.50301194190979, + "logps/chosen": -893.98828125, + "logps/rejected": -1251.1690673828125, + "loss": 0.249, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.560011863708496, + "rewards/margins": 3.8832569122314453, + "rewards/rejected": -10.443269729614258, + "step": 4640 + }, + { + "epoch": 0.51, + "grad_norm": 10.5625, + "learning_rate": 2.8500172540355647e-06, + "logits/chosen": -1.7568638324737549, + "logits/rejected": -1.6024482250213623, + "logps/chosen": -976.40625, + "logps/rejected": -1351.19287109375, + "loss": 0.2348, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -7.202592372894287, + "rewards/margins": 4.044913291931152, + "rewards/rejected": -11.247505187988281, + "step": 4650 + }, + { + "epoch": 0.51, + "grad_norm": 6.875, + "learning_rate": 2.840540268852468e-06, + "logits/chosen": -1.6918563842773438, + "logits/rejected": -1.5516451597213745, + "logps/chosen": -1009.6102294921875, + "logps/rejected": -1396.6876220703125, + "loss": 0.3243, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.661390781402588, + "rewards/margins": 3.9626758098602295, + "rewards/rejected": -11.624066352844238, + "step": 4660 + }, + { + "epoch": 0.51, + "grad_norm": 8.0625, + "learning_rate": 2.831058294904973e-06, + "logits/chosen": -1.6180967092514038, + "logits/rejected": -1.5064449310302734, + "logps/chosen": -1111.2506103515625, + "logps/rejected": -1462.503173828125, + "loss": 0.2775, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -8.54361629486084, + "rewards/margins": 3.5878500938415527, + "rewards/rejected": -12.131465911865234, + "step": 4670 + }, + { + "epoch": 0.51, + "grad_norm": 5.3125, + "learning_rate": 2.8215714710998065e-06, + "logits/chosen": -1.666666030883789, + "logits/rejected": -1.5591226816177368, + "logps/chosen": -999.6051025390625, + "logps/rejected": -1298.0440673828125, + "loss": 0.3982, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -7.6425676345825195, + "rewards/margins": 3.1262950897216797, + "rewards/rejected": -10.768861770629883, + "step": 4680 + }, + { + "epoch": 0.51, + "grad_norm": 11.625, + "learning_rate": 2.812079936414744e-06, + "logits/chosen": -1.82754385471344, + "logits/rejected": -1.462524175643921, + "logps/chosen": -974.0478515625, + "logps/rejected": -1280.0806884765625, + "loss": 0.3418, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.4038896560668945, + "rewards/margins": 3.449916124343872, + "rewards/rejected": -10.853803634643555, + "step": 4690 + }, + { + "epoch": 0.52, + "grad_norm": 4.21875, + "learning_rate": 2.8025838298965714e-06, + "logits/chosen": -1.7871055603027344, + "logits/rejected": -1.6704909801483154, + "logps/chosen": -882.3492431640625, + "logps/rejected": -1246.852783203125, + "loss": 0.3639, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.362827777862549, + "rewards/margins": 3.7324092388153076, + "rewards/rejected": -10.095235824584961, + "step": 4700 + }, + { + "epoch": 0.52, + "grad_norm": 4.3125, + "learning_rate": 2.7930832906590523e-06, + "logits/chosen": -1.8106727600097656, + "logits/rejected": -1.8662102222442627, + "logps/chosen": -829.2774658203125, + "logps/rejected": -1101.789794921875, + "loss": 0.2858, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.783268928527832, + "rewards/margins": 2.7687697410583496, + "rewards/rejected": -8.55203914642334, + "step": 4710 + }, + { + "epoch": 0.52, + "grad_norm": 4.8125, + "learning_rate": 2.7835784578808867e-06, + "logits/chosen": -1.905045747756958, + "logits/rejected": -1.7801287174224854, + "logps/chosen": -790.1764526367188, + "logps/rejected": -1043.71484375, + "loss": 0.3723, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.093790054321289, + "rewards/margins": 2.965834379196167, + "rewards/rejected": -8.059624671936035, + "step": 4720 + }, + { + "epoch": 0.52, + "grad_norm": 8.1875, + "learning_rate": 2.7740694708036723e-06, + "logits/chosen": -1.8596729040145874, + "logits/rejected": -1.697564721107483, + "logps/chosen": -854.0379638671875, + "logps/rejected": -1200.6275634765625, + "loss": 0.3728, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.81211519241333, + "rewards/margins": 3.747648239135742, + "rewards/rejected": -9.55976390838623, + "step": 4730 + }, + { + "epoch": 0.52, + "grad_norm": 11.75, + "learning_rate": 2.764556468729867e-06, + "logits/chosen": -1.7627031803131104, + "logits/rejected": -1.572540521621704, + "logps/chosen": -791.794677734375, + "logps/rejected": -1094.9952392578125, + "loss": 0.3187, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.64096212387085, + "rewards/margins": 3.312873363494873, + "rewards/rejected": -8.953835487365723, + "step": 4740 + }, + { + "epoch": 0.52, + "grad_norm": 5.34375, + "learning_rate": 2.755039591020745e-06, + "logits/chosen": -1.794011116027832, + "logits/rejected": -1.6732063293457031, + "logps/chosen": -777.0990600585938, + "logps/rejected": -1066.388916015625, + "loss": 0.4103, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.290256023406982, + "rewards/margins": 3.013242483139038, + "rewards/rejected": -8.303497314453125, + "step": 4750 + }, + { + "epoch": 0.52, + "grad_norm": 11.5625, + "learning_rate": 2.745518977094359e-06, + "logits/chosen": -1.897469162940979, + "logits/rejected": -1.829250693321228, + "logps/chosen": -719.7210693359375, + "logps/rejected": -1133.2857666015625, + "loss": 0.2703, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.082152366638184, + "rewards/margins": 4.014290809631348, + "rewards/rejected": -9.096443176269531, + "step": 4760 + }, + { + "epoch": 0.52, + "grad_norm": 20.375, + "learning_rate": 2.7359947664234936e-06, + "logits/chosen": -1.835961937904358, + "logits/rejected": -1.6534532308578491, + "logps/chosen": -825.0054931640625, + "logps/rejected": -1104.5386962890625, + "loss": 0.3515, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.615521430969238, + "rewards/margins": 3.4076926708221436, + "rewards/rejected": -9.023214340209961, + "step": 4770 + }, + { + "epoch": 0.52, + "grad_norm": 6.6875, + "learning_rate": 2.726467098533624e-06, + "logits/chosen": -1.7411930561065674, + "logits/rejected": -1.5590285062789917, + "logps/chosen": -884.9173583984375, + "logps/rejected": -1189.7298583984375, + "loss": 0.2674, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.529805660247803, + "rewards/margins": 3.3226776123046875, + "rewards/rejected": -9.852482795715332, + "step": 4780 + }, + { + "epoch": 0.53, + "grad_norm": 15.1875, + "learning_rate": 2.7169361130008744e-06, + "logits/chosen": -1.789324402809143, + "logits/rejected": -1.582918405532837, + "logps/chosen": -897.6589965820312, + "logps/rejected": -1196.581787109375, + "loss": 0.3223, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.584109306335449, + "rewards/margins": 3.358776092529297, + "rewards/rejected": -9.942886352539062, + "step": 4790 + }, + { + "epoch": 0.53, + "grad_norm": 5.90625, + "learning_rate": 2.7074019494499683e-06, + "logits/chosen": -1.611797571182251, + "logits/rejected": -1.6217677593231201, + "logps/chosen": -838.6533203125, + "logps/rejected": -1168.1236572265625, + "loss": 0.3267, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.868292808532715, + "rewards/margins": 3.414698362350464, + "rewards/rejected": -9.282991409301758, + "step": 4800 + }, + { + "epoch": 0.53, + "grad_norm": 9.6875, + "learning_rate": 2.697864747552188e-06, + "logits/chosen": -1.7624536752700806, + "logits/rejected": -1.5906658172607422, + "logps/chosen": -847.8079223632812, + "logps/rejected": -1114.502197265625, + "loss": 0.3557, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.0780029296875, + "rewards/margins": 2.859889268875122, + "rewards/rejected": -8.937891960144043, + "step": 4810 + }, + { + "epoch": 0.53, + "grad_norm": 6.3125, + "learning_rate": 2.688324647023325e-06, + "logits/chosen": -1.867327094078064, + "logits/rejected": -1.6651290655136108, + "logps/chosen": -759.8614501953125, + "logps/rejected": -1067.374267578125, + "loss": 0.3364, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.199161529541016, + "rewards/margins": 3.3958678245544434, + "rewards/rejected": -8.595029830932617, + "step": 4820 + }, + { + "epoch": 0.53, + "grad_norm": 10.125, + "learning_rate": 2.678781787621633e-06, + "logits/chosen": -1.8439241647720337, + "logits/rejected": -1.7810585498809814, + "logps/chosen": -763.2825927734375, + "logps/rejected": -1023.9285888671875, + "loss": 0.3489, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.9004435539245605, + "rewards/margins": 2.8901400566101074, + "rewards/rejected": -7.790583610534668, + "step": 4830 + }, + { + "epoch": 0.53, + "grad_norm": 3.703125, + "learning_rate": 2.6692363091457868e-06, + "logits/chosen": -1.9333171844482422, + "logits/rejected": -1.6581255197525024, + "logps/chosen": -823.2359619140625, + "logps/rejected": -1097.895751953125, + "loss": 0.3257, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.133155345916748, + "rewards/margins": 3.2096519470214844, + "rewards/rejected": -8.34280776977539, + "step": 4840 + }, + { + "epoch": 0.53, + "grad_norm": 4.65625, + "learning_rate": 2.659688351432825e-06, + "logits/chosen": -1.8767311573028564, + "logits/rejected": -1.6026397943496704, + "logps/chosen": -801.2988891601562, + "logps/rejected": -1094.2728271484375, + "loss": 0.29, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.360578536987305, + "rewards/margins": 3.2975661754608154, + "rewards/rejected": -8.6581449508667, + "step": 4850 + }, + { + "epoch": 0.53, + "grad_norm": 15.3125, + "learning_rate": 2.650138054356106e-06, + "logits/chosen": -1.766493797302246, + "logits/rejected": -1.7181847095489502, + "logps/chosen": -743.2913818359375, + "logps/rejected": -997.1356201171875, + "loss": 0.3308, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.975522518157959, + "rewards/margins": 2.792332410812378, + "rewards/rejected": -7.767855167388916, + "step": 4860 + }, + { + "epoch": 0.53, + "grad_norm": 13.375, + "learning_rate": 2.6405855578232616e-06, + "logits/chosen": -1.6527888774871826, + "logits/rejected": -1.5627895593643188, + "logps/chosen": -860.6240234375, + "logps/rejected": -1148.0487060546875, + "loss": 0.3901, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.155169486999512, + "rewards/margins": 3.519181728363037, + "rewards/rejected": -9.674351692199707, + "step": 4870 + }, + { + "epoch": 0.54, + "grad_norm": 5.53125, + "learning_rate": 2.6310310017741432e-06, + "logits/chosen": -1.8419584035873413, + "logits/rejected": -1.6139633655548096, + "logps/chosen": -881.3860473632812, + "logps/rejected": -1219.8428955078125, + "loss": 0.2868, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.187490463256836, + "rewards/margins": 3.8332645893096924, + "rewards/rejected": -10.02075481414795, + "step": 4880 + }, + { + "epoch": 0.54, + "grad_norm": 5.75, + "learning_rate": 2.621474526178772e-06, + "logits/chosen": -1.8520715236663818, + "logits/rejected": -1.5670760869979858, + "logps/chosen": -829.3567504882812, + "logps/rejected": -1177.9678955078125, + "loss": 0.3032, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.758260726928711, + "rewards/margins": 3.860233783721924, + "rewards/rejected": -9.618494033813477, + "step": 4890 + }, + { + "epoch": 0.54, + "grad_norm": 11.6875, + "learning_rate": 2.6119162710352922e-06, + "logits/chosen": -1.8403129577636719, + "logits/rejected": -1.6947139501571655, + "logps/chosen": -751.439208984375, + "logps/rejected": -1028.140380859375, + "loss": 0.3662, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.088431358337402, + "rewards/margins": 3.0783374309539795, + "rewards/rejected": -8.166769027709961, + "step": 4900 + }, + { + "epoch": 0.54, + "grad_norm": 6.15625, + "learning_rate": 2.602356376367916e-06, + "logits/chosen": -1.8566335439682007, + "logits/rejected": -1.7237964868545532, + "logps/chosen": -757.4112548828125, + "logps/rejected": -1170.2598876953125, + "loss": 0.286, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.3142194747924805, + "rewards/margins": 4.182933807373047, + "rewards/rejected": -9.497153282165527, + "step": 4910 + }, + { + "epoch": 0.54, + "grad_norm": 14.0625, + "learning_rate": 2.592794982224874e-06, + "logits/chosen": -1.9052515029907227, + "logits/rejected": -1.6820745468139648, + "logps/chosen": -728.7723388671875, + "logps/rejected": -1039.0322265625, + "loss": 0.3447, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.784475803375244, + "rewards/margins": 3.3095829486846924, + "rewards/rejected": -8.0940580368042, + "step": 4920 + }, + { + "epoch": 0.54, + "grad_norm": 14.5625, + "learning_rate": 2.5832322286763636e-06, + "logits/chosen": -1.95851731300354, + "logits/rejected": -1.7905120849609375, + "logps/chosen": -764.6600952148438, + "logps/rejected": -1016.0764770507812, + "loss": 0.3397, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.774062156677246, + "rewards/margins": 3.1858105659484863, + "rewards/rejected": -7.959872245788574, + "step": 4930 + }, + { + "epoch": 0.54, + "grad_norm": 15.6875, + "learning_rate": 2.5736682558124966e-06, + "logits/chosen": -1.7527990341186523, + "logits/rejected": -1.6510803699493408, + "logps/chosen": -715.636962890625, + "logps/rejected": -988.65185546875, + "loss": 0.3906, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.075923442840576, + "rewards/margins": 2.9820237159729004, + "rewards/rejected": -8.057947158813477, + "step": 4940 + }, + { + "epoch": 0.54, + "grad_norm": 9.0625, + "learning_rate": 2.5641032037412483e-06, + "logits/chosen": -1.7088632583618164, + "logits/rejected": -1.740706443786621, + "logps/chosen": -716.7030639648438, + "logps/rejected": -1014.44384765625, + "loss": 0.3424, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.85519552230835, + "rewards/margins": 3.064009428024292, + "rewards/rejected": -7.9192047119140625, + "step": 4950 + }, + { + "epoch": 0.54, + "grad_norm": 6.34375, + "learning_rate": 2.554537212586403e-06, + "logits/chosen": -1.8739116191864014, + "logits/rejected": -1.7162269353866577, + "logps/chosen": -694.1937255859375, + "logps/rejected": -944.1072387695312, + "loss": 0.3579, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.407443046569824, + "rewards/margins": 2.8134512901306152, + "rewards/rejected": -7.220894813537598, + "step": 4960 + }, + { + "epoch": 0.54, + "grad_norm": 11.5, + "learning_rate": 2.5449704224855026e-06, + "logits/chosen": -1.920742392539978, + "logits/rejected": -1.637102484703064, + "logps/chosen": -697.308349609375, + "logps/rejected": -983.8723754882812, + "loss": 0.317, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.615049839019775, + "rewards/margins": 3.2308669090270996, + "rewards/rejected": -7.845916748046875, + "step": 4970 + }, + { + "epoch": 0.55, + "grad_norm": 13.0, + "learning_rate": 2.535402973587792e-06, + "logits/chosen": -1.7808973789215088, + "logits/rejected": -1.7882381677627563, + "logps/chosen": -714.9547119140625, + "logps/rejected": -947.0159301757812, + "loss": 0.4228, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.695001602172852, + "rewards/margins": 2.332240343093872, + "rewards/rejected": -7.0272417068481445, + "step": 4980 + }, + { + "epoch": 0.55, + "grad_norm": 5.625, + "learning_rate": 2.5258350060521685e-06, + "logits/chosen": -1.8723738193511963, + "logits/rejected": -1.8183752298355103, + "logps/chosen": -715.4143676757812, + "logps/rejected": -994.7745971679688, + "loss": 0.302, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.7178521156311035, + "rewards/margins": 2.984717845916748, + "rewards/rejected": -7.70257043838501, + "step": 4990 + }, + { + "epoch": 0.55, + "grad_norm": 13.625, + "learning_rate": 2.5162666600451275e-06, + "logits/chosen": -1.8568050861358643, + "logits/rejected": -1.5623481273651123, + "logps/chosen": -709.01708984375, + "logps/rejected": -985.3059692382812, + "loss": 0.2696, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.896820068359375, + "rewards/margins": 3.1123669147491455, + "rewards/rejected": -8.009186744689941, + "step": 5000 + }, + { + "epoch": 0.55, + "grad_norm": 11.0625, + "learning_rate": 2.5066980757387076e-06, + "logits/chosen": -1.832991600036621, + "logits/rejected": -1.7874338626861572, + "logps/chosen": -742.8668212890625, + "logps/rejected": -992.8267822265625, + "loss": 0.366, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.801213264465332, + "rewards/margins": 2.804513931274414, + "rewards/rejected": -7.605727195739746, + "step": 5010 + }, + { + "epoch": 0.55, + "grad_norm": 15.3125, + "learning_rate": 2.4971293933084386e-06, + "logits/chosen": -1.777300238609314, + "logits/rejected": -1.6475311517715454, + "logps/chosen": -799.867431640625, + "logps/rejected": -1122.72998046875, + "loss": 0.3672, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -5.682173728942871, + "rewards/margins": 3.4197421073913574, + "rewards/rejected": -9.101916313171387, + "step": 5020 + }, + { + "epoch": 0.55, + "grad_norm": 8.0, + "learning_rate": 2.487560752931289e-06, + "logits/chosen": -1.7989394664764404, + "logits/rejected": -1.619760513305664, + "logps/chosen": -769.2176513671875, + "logps/rejected": -1161.066650390625, + "loss": 0.3123, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.510601997375488, + "rewards/margins": 4.077925682067871, + "rewards/rejected": -9.588526725769043, + "step": 5030 + }, + { + "epoch": 0.55, + "grad_norm": 13.25, + "learning_rate": 2.477992294783611e-06, + "logits/chosen": -1.8363641500473022, + "logits/rejected": -1.7375316619873047, + "logps/chosen": -675.3231201171875, + "logps/rejected": -1037.930908203125, + "loss": 0.2675, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.814789772033691, + "rewards/margins": 3.380969524383545, + "rewards/rejected": -8.195757865905762, + "step": 5040 + }, + { + "epoch": 0.55, + "grad_norm": 8.625, + "learning_rate": 2.4684241590390847e-06, + "logits/chosen": -1.7425663471221924, + "logits/rejected": -1.7237985134124756, + "logps/chosen": -771.497314453125, + "logps/rejected": -1087.204833984375, + "loss": 0.3232, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.486248016357422, + "rewards/margins": 2.961683750152588, + "rewards/rejected": -8.447932243347168, + "step": 5050 + }, + { + "epoch": 0.55, + "grad_norm": 11.6875, + "learning_rate": 2.458856485866669e-06, + "logits/chosen": -1.9245576858520508, + "logits/rejected": -1.7072875499725342, + "logps/chosen": -781.1033935546875, + "logps/rejected": -1056.111083984375, + "loss": 0.3141, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.307318210601807, + "rewards/margins": 3.1197099685668945, + "rewards/rejected": -8.427027702331543, + "step": 5060 + }, + { + "epoch": 0.56, + "grad_norm": 10.9375, + "learning_rate": 2.4492894154285494e-06, + "logits/chosen": -1.909092664718628, + "logits/rejected": -1.63583242893219, + "logps/chosen": -682.8969116210938, + "logps/rejected": -951.3707275390625, + "loss": 0.4042, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.411766529083252, + "rewards/margins": 3.0700290203094482, + "rewards/rejected": -7.481795310974121, + "step": 5070 + }, + { + "epoch": 0.56, + "grad_norm": 9.3125, + "learning_rate": 2.4397230878780747e-06, + "logits/chosen": -1.9019575119018555, + "logits/rejected": -1.692522406578064, + "logps/chosen": -795.1134033203125, + "logps/rejected": -1069.1993408203125, + "loss": 0.3036, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.416313648223877, + "rewards/margins": 3.246833086013794, + "rewards/rejected": -8.663146018981934, + "step": 5080 + }, + { + "epoch": 0.56, + "grad_norm": 5.125, + "learning_rate": 2.4301576433577166e-06, + "logits/chosen": -1.8777668476104736, + "logits/rejected": -1.750526785850525, + "logps/chosen": -735.9993896484375, + "logps/rejected": -1076.8115234375, + "loss": 0.3293, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.7474260330200195, + "rewards/margins": 3.803342819213867, + "rewards/rejected": -8.550768852233887, + "step": 5090 + }, + { + "epoch": 0.56, + "grad_norm": 11.4375, + "learning_rate": 2.42059322199701e-06, + "logits/chosen": -1.9741106033325195, + "logits/rejected": -1.6629884243011475, + "logps/chosen": -788.11669921875, + "logps/rejected": -1081.1214599609375, + "loss": 0.2948, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.0293707847595215, + "rewards/margins": 3.4977710247039795, + "rewards/rejected": -8.527141571044922, + "step": 5100 + }, + { + "epoch": 0.56, + "grad_norm": 7.75, + "learning_rate": 2.411029963910497e-06, + "logits/chosen": -1.8040987253189087, + "logits/rejected": -1.7162173986434937, + "logps/chosen": -796.7048950195312, + "logps/rejected": -1114.1865234375, + "loss": 0.3679, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.643942832946777, + "rewards/margins": 3.2460663318634033, + "rewards/rejected": -8.890008926391602, + "step": 5110 + }, + { + "epoch": 0.56, + "grad_norm": 2.234375, + "learning_rate": 2.4014680091956825e-06, + "logits/chosen": -1.848706841468811, + "logits/rejected": -1.6445939540863037, + "logps/chosen": -751.1235961914062, + "logps/rejected": -1056.409912109375, + "loss": 0.3067, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.992753982543945, + "rewards/margins": 3.262176036834717, + "rewards/rejected": -8.25493049621582, + "step": 5120 + }, + { + "epoch": 0.56, + "grad_norm": 11.875, + "learning_rate": 2.3919074979309768e-06, + "logits/chosen": -1.8263747692108154, + "logits/rejected": -1.5735846757888794, + "logps/chosen": -822.7265625, + "logps/rejected": -1180.38134765625, + "loss": 0.3145, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.626978874206543, + "rewards/margins": 4.117857933044434, + "rewards/rejected": -9.744836807250977, + "step": 5130 + }, + { + "epoch": 0.56, + "grad_norm": 22.375, + "learning_rate": 2.3823485701736416e-06, + "logits/chosen": -1.8495702743530273, + "logits/rejected": -1.7161544561386108, + "logps/chosen": -750.7886352539062, + "logps/rejected": -1055.391845703125, + "loss": 0.4135, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.105189323425293, + "rewards/margins": 3.4504802227020264, + "rewards/rejected": -8.555668830871582, + "step": 5140 + }, + { + "epoch": 0.56, + "grad_norm": 6.96875, + "learning_rate": 2.372791365957744e-06, + "logits/chosen": -2.0103096961975098, + "logits/rejected": -1.7737452983856201, + "logps/chosen": -767.8741455078125, + "logps/rejected": -1023.9263916015625, + "loss": 0.2834, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.861449241638184, + "rewards/margins": 3.2771213054656982, + "rewards/rejected": -8.138570785522461, + "step": 5150 + }, + { + "epoch": 0.57, + "grad_norm": 8.3125, + "learning_rate": 2.3632360252921004e-06, + "logits/chosen": -1.8676027059555054, + "logits/rejected": -1.8206102848052979, + "logps/chosen": -858.2623901367188, + "logps/rejected": -1226.688720703125, + "loss": 0.3868, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.874178886413574, + "rewards/margins": 3.770275831222534, + "rewards/rejected": -9.644455909729004, + "step": 5160 + }, + { + "epoch": 0.57, + "grad_norm": 10.5625, + "learning_rate": 2.3536826881582295e-06, + "logits/chosen": -1.8250720500946045, + "logits/rejected": -1.7611805200576782, + "logps/chosen": -833.5970458984375, + "logps/rejected": -1175.5667724609375, + "loss": 0.3954, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.6825151443481445, + "rewards/margins": 3.4448533058166504, + "rewards/rejected": -9.127367973327637, + "step": 5170 + }, + { + "epoch": 0.57, + "grad_norm": 4.1875, + "learning_rate": 2.344131494508295e-06, + "logits/chosen": -1.819046974182129, + "logits/rejected": -1.6965080499649048, + "logps/chosen": -797.1341552734375, + "logps/rejected": -1017.3854370117188, + "loss": 0.3719, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -5.471802711486816, + "rewards/margins": 2.4914708137512207, + "rewards/rejected": -7.963273525238037, + "step": 5180 + }, + { + "epoch": 0.57, + "grad_norm": 19.375, + "learning_rate": 2.3345825842630613e-06, + "logits/chosen": -1.9554027318954468, + "logits/rejected": -1.7067487239837646, + "logps/chosen": -814.7120971679688, + "logps/rejected": -1028.6104736328125, + "loss": 0.3764, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.460103511810303, + "rewards/margins": 2.3483777046203613, + "rewards/rejected": -7.808480262756348, + "step": 5190 + }, + { + "epoch": 0.57, + "grad_norm": 4.21875, + "learning_rate": 2.3250360973098444e-06, + "logits/chosen": -1.8486143350601196, + "logits/rejected": -1.8309625387191772, + "logps/chosen": -755.470703125, + "logps/rejected": -1079.958740234375, + "loss": 0.2969, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.042849540710449, + "rewards/margins": 3.1690776348114014, + "rewards/rejected": -8.21192741394043, + "step": 5200 + }, + { + "epoch": 0.57, + "grad_norm": 21.125, + "learning_rate": 2.315492173500456e-06, + "logits/chosen": -1.8065963983535767, + "logits/rejected": -1.7205009460449219, + "logps/chosen": -793.24267578125, + "logps/rejected": -1061.001220703125, + "loss": 0.3208, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.3179497718811035, + "rewards/margins": 3.1200079917907715, + "rewards/rejected": -8.437957763671875, + "step": 5210 + }, + { + "epoch": 0.57, + "grad_norm": 9.0625, + "learning_rate": 2.305950952649161e-06, + "logits/chosen": -1.84344482421875, + "logits/rejected": -1.7116506099700928, + "logps/chosen": -725.4070434570312, + "logps/rejected": -962.2274169921875, + "loss": 0.2945, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.08535099029541, + "rewards/margins": 2.6185996532440186, + "rewards/rejected": -7.70395040512085, + "step": 5220 + }, + { + "epoch": 0.57, + "grad_norm": 11.75, + "learning_rate": 2.296412574530629e-06, + "logits/chosen": -1.9053932428359985, + "logits/rejected": -1.8481519222259521, + "logps/chosen": -766.845703125, + "logps/rejected": -1005.0877685546875, + "loss": 0.463, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.859908103942871, + "rewards/margins": 2.4817686080932617, + "rewards/rejected": -7.341676235198975, + "step": 5230 + }, + { + "epoch": 0.57, + "grad_norm": 11.375, + "learning_rate": 2.286877178877881e-06, + "logits/chosen": -1.9200611114501953, + "logits/rejected": -1.8272422552108765, + "logps/chosen": -762.1739501953125, + "logps/rejected": -991.4666137695312, + "loss": 0.3337, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.833366870880127, + "rewards/margins": 2.7550368309020996, + "rewards/rejected": -7.588404178619385, + "step": 5240 + }, + { + "epoch": 0.58, + "grad_norm": 5.09375, + "learning_rate": 2.2773449053802487e-06, + "logits/chosen": -1.9553371667861938, + "logits/rejected": -1.7168327569961548, + "logps/chosen": -686.4542236328125, + "logps/rejected": -1001.7738037109375, + "loss": 0.2978, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.594350814819336, + "rewards/margins": 3.4331741333007812, + "rewards/rejected": -8.027524948120117, + "step": 5250 + }, + { + "epoch": 0.58, + "grad_norm": 5.75, + "learning_rate": 2.267815893681325e-06, + "logits/chosen": -1.8839950561523438, + "logits/rejected": -1.805219292640686, + "logps/chosen": -730.7767944335938, + "logps/rejected": -1046.740234375, + "loss": 0.3618, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.902741432189941, + "rewards/margins": 3.4987361431121826, + "rewards/rejected": -8.401477813720703, + "step": 5260 + }, + { + "epoch": 0.58, + "grad_norm": 7.59375, + "learning_rate": 2.258290283376919e-06, + "logits/chosen": -1.8422069549560547, + "logits/rejected": -1.8269844055175781, + "logps/chosen": -665.9560546875, + "logps/rejected": -949.4150390625, + "loss": 0.343, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.380220890045166, + "rewards/margins": 2.8068947792053223, + "rewards/rejected": -7.187114715576172, + "step": 5270 + }, + { + "epoch": 0.58, + "grad_norm": 7.71875, + "learning_rate": 2.24876821401301e-06, + "logits/chosen": -1.9029918909072876, + "logits/rejected": -1.6550133228302002, + "logps/chosen": -740.3123168945312, + "logps/rejected": -1030.387939453125, + "loss": 0.2803, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.780288219451904, + "rewards/margins": 3.4410834312438965, + "rewards/rejected": -8.2213716506958, + "step": 5280 + }, + { + "epoch": 0.58, + "grad_norm": 11.8125, + "learning_rate": 2.2392498250837062e-06, + "logits/chosen": -1.8962256908416748, + "logits/rejected": -1.7280795574188232, + "logps/chosen": -642.1261596679688, + "logps/rejected": -942.3056640625, + "loss": 0.3029, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.261740684509277, + "rewards/margins": 3.016481399536133, + "rewards/rejected": -7.278221130371094, + "step": 5290 + }, + { + "epoch": 0.58, + "grad_norm": 13.0, + "learning_rate": 2.2297352560291955e-06, + "logits/chosen": -1.9412901401519775, + "logits/rejected": -1.6703341007232666, + "logps/chosen": -727.5746459960938, + "logps/rejected": -1046.568603515625, + "loss": 0.2872, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.7617340087890625, + "rewards/margins": 3.6345114707946777, + "rewards/rejected": -8.396245956420898, + "step": 5300 + }, + { + "epoch": 0.58, + "grad_norm": 13.5625, + "learning_rate": 2.22022464623371e-06, + "logits/chosen": -1.8608739376068115, + "logits/rejected": -1.5924700498580933, + "logps/chosen": -725.82177734375, + "logps/rejected": -931.2943115234375, + "loss": 0.3606, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.6100969314575195, + "rewards/margins": 2.606773853302002, + "rewards/rejected": -7.2168707847595215, + "step": 5310 + }, + { + "epoch": 0.58, + "grad_norm": 14.8125, + "learning_rate": 2.210718135023479e-06, + "logits/chosen": -1.7885816097259521, + "logits/rejected": -1.7658262252807617, + "logps/chosen": -763.822265625, + "logps/rejected": -1056.6031494140625, + "loss": 0.3216, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.182581424713135, + "rewards/margins": 3.114133834838867, + "rewards/rejected": -8.296714782714844, + "step": 5320 + }, + { + "epoch": 0.58, + "grad_norm": 9.875, + "learning_rate": 2.2012158616646897e-06, + "logits/chosen": -1.8850820064544678, + "logits/rejected": -1.512588620185852, + "logps/chosen": -774.5667724609375, + "logps/rejected": -1086.21240234375, + "loss": 0.3135, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.835562229156494, + "rewards/margins": 3.8193767070770264, + "rewards/rejected": -8.654937744140625, + "step": 5330 + }, + { + "epoch": 0.59, + "grad_norm": 11.5, + "learning_rate": 2.191717965361445e-06, + "logits/chosen": -1.756353735923767, + "logits/rejected": -1.684138298034668, + "logps/chosen": -799.0158081054688, + "logps/rejected": -1205.3870849609375, + "loss": 0.3193, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.717110633850098, + "rewards/margins": 3.896951675415039, + "rewards/rejected": -9.614062309265137, + "step": 5340 + }, + { + "epoch": 0.59, + "grad_norm": 7.25, + "learning_rate": 2.1822245852537276e-06, + "logits/chosen": -1.8540637493133545, + "logits/rejected": -1.6626100540161133, + "logps/chosen": -800.69580078125, + "logps/rejected": -1162.5860595703125, + "loss": 0.2365, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.62246561050415, + "rewards/margins": 3.6908676624298096, + "rewards/rejected": -9.313332557678223, + "step": 5350 + }, + { + "epoch": 0.59, + "grad_norm": 22.375, + "learning_rate": 2.1727358604153596e-06, + "logits/chosen": -1.8244962692260742, + "logits/rejected": -1.6953014135360718, + "logps/chosen": -936.8606567382812, + "logps/rejected": -1320.625732421875, + "loss": 0.3374, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.811500549316406, + "rewards/margins": 3.8612866401672363, + "rewards/rejected": -10.672786712646484, + "step": 5360 + }, + { + "epoch": 0.59, + "grad_norm": 15.1875, + "learning_rate": 2.1632519298519645e-06, + "logits/chosen": -1.8253173828125, + "logits/rejected": -1.7334740161895752, + "logps/chosen": -832.3849487304688, + "logps/rejected": -1138.943115234375, + "loss": 0.3319, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.906573295593262, + "rewards/margins": 3.133772611618042, + "rewards/rejected": -9.040346145629883, + "step": 5370 + }, + { + "epoch": 0.59, + "grad_norm": 16.0, + "learning_rate": 2.153772932498933e-06, + "logits/chosen": -1.8624969720840454, + "logits/rejected": -1.6420419216156006, + "logps/chosen": -844.8991088867188, + "logps/rejected": -1129.8504638671875, + "loss": 0.3175, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.589677333831787, + "rewards/margins": 3.2225849628448486, + "rewards/rejected": -8.812261581420898, + "step": 5380 + }, + { + "epoch": 0.59, + "grad_norm": 7.0, + "learning_rate": 2.144299007219387e-06, + "logits/chosen": -1.7543365955352783, + "logits/rejected": -1.507838487625122, + "logps/chosen": -842.1004638671875, + "logps/rejected": -1153.1676025390625, + "loss": 0.377, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.9858078956604, + "rewards/margins": 3.7289624214172363, + "rewards/rejected": -9.714770317077637, + "step": 5390 + }, + { + "epoch": 0.59, + "grad_norm": 15.625, + "learning_rate": 2.1348302928021415e-06, + "logits/chosen": -1.6280733346939087, + "logits/rejected": -1.6308244466781616, + "logps/chosen": -905.1400146484375, + "logps/rejected": -1294.8294677734375, + "loss": 0.3272, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.647294521331787, + "rewards/margins": 3.816831111907959, + "rewards/rejected": -10.464126586914062, + "step": 5400 + }, + { + "epoch": 0.59, + "grad_norm": 16.125, + "learning_rate": 2.125366927959679e-06, + "logits/chosen": -1.7895599603652954, + "logits/rejected": -1.5008745193481445, + "logps/chosen": -873.99365234375, + "logps/rejected": -1178.2862548828125, + "loss": 0.3002, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.928724765777588, + "rewards/margins": 3.731779098510742, + "rewards/rejected": -9.660503387451172, + "step": 5410 + }, + { + "epoch": 0.59, + "grad_norm": 17.25, + "learning_rate": 2.115909051326111e-06, + "logits/chosen": -1.9102405309677124, + "logits/rejected": -1.7207285165786743, + "logps/chosen": -843.8084106445312, + "logps/rejected": -1179.2857666015625, + "loss": 0.3686, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.7015275955200195, + "rewards/margins": 3.4867584705352783, + "rewards/rejected": -9.188285827636719, + "step": 5420 + }, + { + "epoch": 0.6, + "grad_norm": 8.6875, + "learning_rate": 2.1064568014551476e-06, + "logits/chosen": -1.8128067255020142, + "logits/rejected": -1.5954325199127197, + "logps/chosen": -788.5177001953125, + "logps/rejected": -1115.3687744140625, + "loss": 0.3045, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.471436500549316, + "rewards/margins": 3.7813453674316406, + "rewards/rejected": -9.252781867980957, + "step": 5430 + }, + { + "epoch": 0.6, + "grad_norm": 29.5, + "learning_rate": 2.0970103168180718e-06, + "logits/chosen": -1.7206628322601318, + "logits/rejected": -1.642055869102478, + "logps/chosen": -955.9063720703125, + "logps/rejected": -1252.1689453125, + "loss": 0.313, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.901654243469238, + "rewards/margins": 3.1550278663635254, + "rewards/rejected": -10.056680679321289, + "step": 5440 + }, + { + "epoch": 0.6, + "grad_norm": 11.125, + "learning_rate": 2.0875697358017084e-06, + "logits/chosen": -1.7763115167617798, + "logits/rejected": -1.6835920810699463, + "logps/chosen": -892.0051879882812, + "logps/rejected": -1175.192138671875, + "loss": 0.3524, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.515921592712402, + "rewards/margins": 2.981050729751587, + "rewards/rejected": -9.496973037719727, + "step": 5450 + }, + { + "epoch": 0.6, + "grad_norm": 18.375, + "learning_rate": 2.0781351967063944e-06, + "logits/chosen": -1.8522727489471436, + "logits/rejected": -1.6612794399261475, + "logps/chosen": -864.7391357421875, + "logps/rejected": -1150.5281982421875, + "loss": 0.344, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.121758460998535, + "rewards/margins": 3.126701593399048, + "rewards/rejected": -9.24846076965332, + "step": 5460 + }, + { + "epoch": 0.6, + "grad_norm": 8.9375, + "learning_rate": 2.0687068377439574e-06, + "logits/chosen": -1.8284565210342407, + "logits/rejected": -1.66012704372406, + "logps/chosen": -803.3771362304688, + "logps/rejected": -1118.2249755859375, + "loss": 0.3186, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.604450225830078, + "rewards/margins": 3.39811372756958, + "rewards/rejected": -9.0025634765625, + "step": 5470 + }, + { + "epoch": 0.6, + "grad_norm": 6.0, + "learning_rate": 2.0592847970356895e-06, + "logits/chosen": -1.7944530248641968, + "logits/rejected": -1.6097066402435303, + "logps/chosen": -831.7261962890625, + "logps/rejected": -1147.4573974609375, + "loss": 0.3072, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.842739582061768, + "rewards/margins": 3.563870906829834, + "rewards/rejected": -9.406610488891602, + "step": 5480 + }, + { + "epoch": 0.6, + "grad_norm": 9.0, + "learning_rate": 2.0498692126103205e-06, + "logits/chosen": -1.8073241710662842, + "logits/rejected": -1.6143966913223267, + "logps/chosen": -776.9193115234375, + "logps/rejected": -1022.8428955078125, + "loss": 0.2578, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.540380001068115, + "rewards/margins": 2.5791163444519043, + "rewards/rejected": -8.119495391845703, + "step": 5490 + }, + { + "epoch": 0.6, + "grad_norm": 10.75, + "learning_rate": 2.0404602224020007e-06, + "logits/chosen": -1.922685980796814, + "logits/rejected": -1.8148208856582642, + "logps/chosen": -888.9217529296875, + "logps/rejected": -1137.7242431640625, + "loss": 0.3301, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -6.342520713806152, + "rewards/margins": 2.593397378921509, + "rewards/rejected": -8.935918807983398, + "step": 5500 + }, + { + "epoch": 0.6, + "grad_norm": 7.0625, + "learning_rate": 2.0310579642482763e-06, + "logits/chosen": -1.8253530263900757, + "logits/rejected": -1.686448097229004, + "logps/chosen": -849.5123901367188, + "logps/rejected": -1186.3817138671875, + "loss": 0.3266, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.07211446762085, + "rewards/margins": 3.6559500694274902, + "rewards/rejected": -9.72806453704834, + "step": 5510 + }, + { + "epoch": 0.61, + "grad_norm": 8.9375, + "learning_rate": 2.0216625758880746e-06, + "logits/chosen": -1.8174808025360107, + "logits/rejected": -1.629827857017517, + "logps/chosen": -936.3171997070312, + "logps/rejected": -1207.284912109375, + "loss": 0.2923, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.667569160461426, + "rewards/margins": 3.391627788543701, + "rewards/rejected": -10.059196472167969, + "step": 5520 + }, + { + "epoch": 0.61, + "grad_norm": 2.734375, + "learning_rate": 2.01227419495968e-06, + "logits/chosen": -1.8035520315170288, + "logits/rejected": -1.6917310953140259, + "logps/chosen": -869.4240112304688, + "logps/rejected": -1205.707763671875, + "loss": 0.3019, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.383000373840332, + "rewards/margins": 3.5399258136749268, + "rewards/rejected": -9.92292594909668, + "step": 5530 + }, + { + "epoch": 0.61, + "grad_norm": 11.25, + "learning_rate": 2.002892958998723e-06, + "logits/chosen": -1.8921566009521484, + "logits/rejected": -1.5785813331604004, + "logps/chosen": -874.6940307617188, + "logps/rejected": -1185.021728515625, + "loss": 0.3211, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.966357707977295, + "rewards/margins": 3.6595263481140137, + "rewards/rejected": -9.625885009765625, + "step": 5540 + }, + { + "epoch": 0.61, + "grad_norm": 13.9375, + "learning_rate": 1.993519005436165e-06, + "logits/chosen": -1.727246642112732, + "logits/rejected": -1.5885595083236694, + "logps/chosen": -923.2052001953125, + "logps/rejected": -1306.3717041015625, + "loss": 0.3263, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.839955806732178, + "rewards/margins": 3.9164340496063232, + "rewards/rejected": -10.756390571594238, + "step": 5550 + }, + { + "epoch": 0.61, + "grad_norm": 8.5625, + "learning_rate": 1.9841524715962795e-06, + "logits/chosen": -1.8413150310516357, + "logits/rejected": -1.4534183740615845, + "logps/chosen": -804.7468872070312, + "logps/rejected": -1101.3648681640625, + "loss": 0.2952, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.537604808807373, + "rewards/margins": 3.5032219886779785, + "rewards/rejected": -9.040825843811035, + "step": 5560 + }, + { + "epoch": 0.61, + "grad_norm": 10.75, + "learning_rate": 1.974793494694649e-06, + "logits/chosen": -1.7739450931549072, + "logits/rejected": -1.6275430917739868, + "logps/chosen": -879.09228515625, + "logps/rejected": -1178.486328125, + "loss": 0.3066, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.293656349182129, + "rewards/margins": 3.310952663421631, + "rewards/rejected": -9.604609489440918, + "step": 5570 + }, + { + "epoch": 0.61, + "grad_norm": 19.875, + "learning_rate": 1.965442211836146e-06, + "logits/chosen": -1.6935495138168335, + "logits/rejected": -1.5774030685424805, + "logps/chosen": -865.4642333984375, + "logps/rejected": -1209.8426513671875, + "loss": 0.3768, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.319661617279053, + "rewards/margins": 3.723717212677002, + "rewards/rejected": -10.043378829956055, + "step": 5580 + }, + { + "epoch": 0.61, + "grad_norm": 14.0, + "learning_rate": 1.956098760012931e-06, + "logits/chosen": -1.8487932682037354, + "logits/rejected": -1.7617868185043335, + "logps/chosen": -823.6121215820312, + "logps/rejected": -1098.447509765625, + "loss": 0.3821, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.626644134521484, + "rewards/margins": 2.8804688453674316, + "rewards/rejected": -8.507113456726074, + "step": 5590 + }, + { + "epoch": 0.61, + "grad_norm": 7.53125, + "learning_rate": 1.946763276102443e-06, + "logits/chosen": -1.7802562713623047, + "logits/rejected": -1.5826749801635742, + "logps/chosen": -836.3021240234375, + "logps/rejected": -1226.001708984375, + "loss": 0.2374, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.714804649353027, + "rewards/margins": 4.182717800140381, + "rewards/rejected": -9.89752197265625, + "step": 5600 + }, + { + "epoch": 0.62, + "grad_norm": 8.125, + "learning_rate": 1.937435896865394e-06, + "logits/chosen": -1.908953070640564, + "logits/rejected": -1.6755850315093994, + "logps/chosen": -764.5101318359375, + "logps/rejected": -1075.877685546875, + "loss": 0.3491, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.964207649230957, + "rewards/margins": 3.4992454051971436, + "rewards/rejected": -8.46345329284668, + "step": 5610 + }, + { + "epoch": 0.62, + "grad_norm": 16.25, + "learning_rate": 1.928116758943768e-06, + "logits/chosen": -1.7479801177978516, + "logits/rejected": -1.5826520919799805, + "logps/chosen": -851.9039916992188, + "logps/rejected": -1239.47021484375, + "loss": 0.2824, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.055527687072754, + "rewards/margins": 4.3375067710876465, + "rewards/rejected": -10.393033981323242, + "step": 5620 + }, + { + "epoch": 0.62, + "grad_norm": 16.5, + "learning_rate": 1.9188059988588144e-06, + "logits/chosen": -1.8514654636383057, + "logits/rejected": -1.6861293315887451, + "logps/chosen": -784.0506591796875, + "logps/rejected": -1048.3255615234375, + "loss": 0.3324, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.314345359802246, + "rewards/margins": 2.872969627380371, + "rewards/rejected": -8.187314987182617, + "step": 5630 + }, + { + "epoch": 0.62, + "grad_norm": 5.75, + "learning_rate": 1.909503753009053e-06, + "logits/chosen": -1.7940887212753296, + "logits/rejected": -1.5948840379714966, + "logps/chosen": -729.5975341796875, + "logps/rejected": -1004.31298828125, + "loss": 0.3492, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.785807132720947, + "rewards/margins": 3.057375907897949, + "rewards/rejected": -7.8431830406188965, + "step": 5640 + }, + { + "epoch": 0.62, + "grad_norm": 19.875, + "learning_rate": 1.900210157668273e-06, + "logits/chosen": -1.8168922662734985, + "logits/rejected": -1.783815622329712, + "logps/chosen": -783.7774047851562, + "logps/rejected": -1062.7139892578125, + "loss": 0.4096, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.354575157165527, + "rewards/margins": 2.8967928886413574, + "rewards/rejected": -8.251367568969727, + "step": 5650 + }, + { + "epoch": 0.62, + "grad_norm": 10.4375, + "learning_rate": 1.8909253489835383e-06, + "logits/chosen": -1.8400566577911377, + "logits/rejected": -1.6582765579223633, + "logps/chosen": -645.9819946289062, + "logps/rejected": -904.8903198242188, + "loss": 0.3586, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.380860328674316, + "rewards/margins": 2.798842668533325, + "rewards/rejected": -7.1797027587890625, + "step": 5660 + }, + { + "epoch": 0.62, + "grad_norm": 8.5625, + "learning_rate": 1.8816494629731906e-06, + "logits/chosen": -1.8205207586288452, + "logits/rejected": -1.6158344745635986, + "logps/chosen": -768.203125, + "logps/rejected": -1033.510498046875, + "loss": 0.3192, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.650243282318115, + "rewards/margins": 3.1681137084960938, + "rewards/rejected": -7.818356990814209, + "step": 5670 + }, + { + "epoch": 0.62, + "grad_norm": 6.4375, + "learning_rate": 1.8723826355248617e-06, + "logits/chosen": -1.770133376121521, + "logits/rejected": -1.7739508152008057, + "logps/chosen": -704.6603393554688, + "logps/rejected": -998.0908203125, + "loss": 0.2889, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.7477216720581055, + "rewards/margins": 3.2017383575439453, + "rewards/rejected": -7.949460029602051, + "step": 5680 + }, + { + "epoch": 0.62, + "grad_norm": 17.375, + "learning_rate": 1.8631250023934744e-06, + "logits/chosen": -1.8060996532440186, + "logits/rejected": -1.720334768295288, + "logps/chosen": -767.8399658203125, + "logps/rejected": -1117.341552734375, + "loss": 0.3223, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.236262321472168, + "rewards/margins": 3.533201217651367, + "rewards/rejected": -8.769462585449219, + "step": 5690 + }, + { + "epoch": 0.62, + "grad_norm": 7.4375, + "learning_rate": 1.853876699199263e-06, + "logits/chosen": -1.9054569005966187, + "logits/rejected": -1.7849172353744507, + "logps/chosen": -690.4293212890625, + "logps/rejected": -907.9588623046875, + "loss": 0.4254, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.491607666015625, + "rewards/margins": 2.468785047531128, + "rewards/rejected": -6.96039342880249, + "step": 5700 + }, + { + "epoch": 0.63, + "grad_norm": 7.375, + "learning_rate": 1.8446378614257818e-06, + "logits/chosen": -1.7242883443832397, + "logits/rejected": -1.5490120649337769, + "logps/chosen": -762.9089965820312, + "logps/rejected": -1174.6580810546875, + "loss": 0.2789, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.443944454193115, + "rewards/margins": 4.231475353240967, + "rewards/rejected": -9.675419807434082, + "step": 5710 + }, + { + "epoch": 0.63, + "grad_norm": 7.34375, + "learning_rate": 1.8354086244179182e-06, + "logits/chosen": -1.7836189270019531, + "logits/rejected": -1.6674197912216187, + "logps/chosen": -753.1982421875, + "logps/rejected": -1042.6341552734375, + "loss": 0.3482, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.81417179107666, + "rewards/margins": 3.213437557220459, + "rewards/rejected": -8.027608871459961, + "step": 5720 + }, + { + "epoch": 0.63, + "grad_norm": 11.0, + "learning_rate": 1.8261891233799157e-06, + "logits/chosen": -1.905552864074707, + "logits/rejected": -1.6187970638275146, + "logps/chosen": -720.9931030273438, + "logps/rejected": -1007.8626708984375, + "loss": 0.391, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.785830497741699, + "rewards/margins": 3.266265869140625, + "rewards/rejected": -8.052096366882324, + "step": 5730 + }, + { + "epoch": 0.63, + "grad_norm": 9.9375, + "learning_rate": 1.8169794933733892e-06, + "logits/chosen": -1.8743784427642822, + "logits/rejected": -1.7526649236679077, + "logps/chosen": -723.316650390625, + "logps/rejected": -964.9153442382812, + "loss": 0.3582, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.770673751831055, + "rewards/margins": 2.9115653038024902, + "rewards/rejected": -7.682238578796387, + "step": 5740 + }, + { + "epoch": 0.63, + "grad_norm": 17.125, + "learning_rate": 1.8077798693153453e-06, + "logits/chosen": -1.8923301696777344, + "logits/rejected": -1.670064926147461, + "logps/chosen": -760.1171875, + "logps/rejected": -967.6064453125, + "loss": 0.2822, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.848750114440918, + "rewards/margins": 2.742661237716675, + "rewards/rejected": -7.591412544250488, + "step": 5750 + }, + { + "epoch": 0.63, + "grad_norm": 6.125, + "learning_rate": 1.7985903859762107e-06, + "logits/chosen": -1.8723474740982056, + "logits/rejected": -1.7387202978134155, + "logps/chosen": -773.7932739257812, + "logps/rejected": -1028.6981201171875, + "loss": 0.2929, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.2045488357543945, + "rewards/margins": 2.884753465652466, + "rewards/rejected": -8.089302062988281, + "step": 5760 + }, + { + "epoch": 0.63, + "grad_norm": 14.3125, + "learning_rate": 1.7894111779778542e-06, + "logits/chosen": -1.7293593883514404, + "logits/rejected": -1.5826575756072998, + "logps/chosen": -844.86083984375, + "logps/rejected": -1127.7862548828125, + "loss": 0.3565, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -6.109644412994385, + "rewards/margins": 3.078591823577881, + "rewards/rejected": -9.188236236572266, + "step": 5770 + }, + { + "epoch": 0.63, + "grad_norm": 12.1875, + "learning_rate": 1.7802423797916158e-06, + "logits/chosen": -1.7901630401611328, + "logits/rejected": -1.5765742063522339, + "logps/chosen": -837.0283203125, + "logps/rejected": -1186.0885009765625, + "loss": 0.2181, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.779475212097168, + "rewards/margins": 3.7795677185058594, + "rewards/rejected": -9.559041976928711, + "step": 5780 + }, + { + "epoch": 0.63, + "grad_norm": 16.5, + "learning_rate": 1.7710841257363342e-06, + "logits/chosen": -1.787597894668579, + "logits/rejected": -1.58553946018219, + "logps/chosen": -784.6412963867188, + "logps/rejected": -1163.6343994140625, + "loss": 0.2605, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.457271099090576, + "rewards/margins": 4.151125907897949, + "rewards/rejected": -9.608396530151367, + "step": 5790 + }, + { + "epoch": 0.64, + "grad_norm": 16.75, + "learning_rate": 1.7619365499763841e-06, + "logits/chosen": -1.7975685596466064, + "logits/rejected": -1.7442238330841064, + "logps/chosen": -840.3092041015625, + "logps/rejected": -1181.0552978515625, + "loss": 0.2492, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.679610252380371, + "rewards/margins": 3.856942653656006, + "rewards/rejected": -9.536553382873535, + "step": 5800 + }, + { + "epoch": 0.64, + "grad_norm": 16.0, + "learning_rate": 1.7527997865197056e-06, + "logits/chosen": -1.8668601512908936, + "logits/rejected": -1.762565016746521, + "logps/chosen": -855.630859375, + "logps/rejected": -1103.0985107421875, + "loss": 0.3395, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.921446800231934, + "rewards/margins": 2.914483070373535, + "rewards/rejected": -8.835928916931152, + "step": 5810 + }, + { + "epoch": 0.64, + "grad_norm": 8.6875, + "learning_rate": 1.743673969215845e-06, + "logits/chosen": -1.8538572788238525, + "logits/rejected": -1.6862990856170654, + "logps/chosen": -813.991455078125, + "logps/rejected": -1137.831787109375, + "loss": 0.2692, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.6803154945373535, + "rewards/margins": 3.458456039428711, + "rewards/rejected": -9.138771057128906, + "step": 5820 + }, + { + "epoch": 0.64, + "grad_norm": 11.1875, + "learning_rate": 1.7345592317539907e-06, + "logits/chosen": -1.7850124835968018, + "logits/rejected": -1.5441216230392456, + "logps/chosen": -875.34326171875, + "logps/rejected": -1240.687744140625, + "loss": 0.303, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.1803693771362305, + "rewards/margins": 4.338943958282471, + "rewards/rejected": -10.519311904907227, + "step": 5830 + }, + { + "epoch": 0.64, + "grad_norm": 8.4375, + "learning_rate": 1.725455707661019e-06, + "logits/chosen": -1.8240129947662354, + "logits/rejected": -1.52000093460083, + "logps/chosen": -930.5938720703125, + "logps/rejected": -1293.7413330078125, + "loss": 0.3072, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.682262420654297, + "rewards/margins": 3.9236347675323486, + "rewards/rejected": -10.605896949768066, + "step": 5840 + }, + { + "epoch": 0.64, + "grad_norm": 19.125, + "learning_rate": 1.7163635302995312e-06, + "logits/chosen": -1.6874158382415771, + "logits/rejected": -1.4417365789413452, + "logps/chosen": -891.6917724609375, + "logps/rejected": -1229.5272216796875, + "loss": 0.2347, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.707302093505859, + "rewards/margins": 3.610724687576294, + "rewards/rejected": -10.318026542663574, + "step": 5850 + }, + { + "epoch": 0.64, + "grad_norm": 10.0625, + "learning_rate": 1.707282832865908e-06, + "logits/chosen": -1.7582849264144897, + "logits/rejected": -1.5748875141143799, + "logps/chosen": -890.2194213867188, + "logps/rejected": -1267.2203369140625, + "loss": 0.2739, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.514025688171387, + "rewards/margins": 4.0218329429626465, + "rewards/rejected": -10.535859107971191, + "step": 5860 + }, + { + "epoch": 0.64, + "grad_norm": 4.5625, + "learning_rate": 1.6982137483883525e-06, + "logits/chosen": -1.780111312866211, + "logits/rejected": -1.5787776708602905, + "logps/chosen": -926.9237060546875, + "logps/rejected": -1376.11962890625, + "loss": 0.3595, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.871534824371338, + "rewards/margins": 4.550869941711426, + "rewards/rejected": -11.422405242919922, + "step": 5870 + }, + { + "epoch": 0.64, + "grad_norm": 10.5625, + "learning_rate": 1.689156409724942e-06, + "logits/chosen": -1.8624169826507568, + "logits/rejected": -1.7056688070297241, + "logps/chosen": -849.5943603515625, + "logps/rejected": -1107.88916015625, + "loss": 0.3925, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -5.929068088531494, + "rewards/margins": 2.858696460723877, + "rewards/rejected": -8.787763595581055, + "step": 5880 + }, + { + "epoch": 0.65, + "grad_norm": 16.375, + "learning_rate": 1.6801109495616852e-06, + "logits/chosen": -1.7686786651611328, + "logits/rejected": -1.5813677310943604, + "logps/chosen": -906.7224731445312, + "logps/rejected": -1270.3275146484375, + "loss": 0.3895, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.698974609375, + "rewards/margins": 3.743776321411133, + "rewards/rejected": -10.442750930786133, + "step": 5890 + }, + { + "epoch": 0.65, + "grad_norm": 6.375, + "learning_rate": 1.671077500410575e-06, + "logits/chosen": -1.7392158508300781, + "logits/rejected": -1.6476109027862549, + "logps/chosen": -797.5294799804688, + "logps/rejected": -1132.521240234375, + "loss": 0.3159, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.61722993850708, + "rewards/margins": 3.430088520050049, + "rewards/rejected": -9.047319412231445, + "step": 5900 + }, + { + "epoch": 0.65, + "grad_norm": 7.59375, + "learning_rate": 1.6620561946076462e-06, + "logits/chosen": -1.8596200942993164, + "logits/rejected": -1.623265027999878, + "logps/chosen": -852.513671875, + "logps/rejected": -1169.1783447265625, + "loss": 0.3289, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.718398094177246, + "rewards/margins": 3.5520122051239014, + "rewards/rejected": -9.270410537719727, + "step": 5910 + }, + { + "epoch": 0.65, + "grad_norm": 4.65625, + "learning_rate": 1.6530471643110427e-06, + "logits/chosen": -1.7657800912857056, + "logits/rejected": -1.6776021718978882, + "logps/chosen": -816.8603515625, + "logps/rejected": -1190.194091796875, + "loss": 0.2589, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.9652628898620605, + "rewards/margins": 4.06595516204834, + "rewards/rejected": -10.031218528747559, + "step": 5920 + }, + { + "epoch": 0.65, + "grad_norm": 29.5, + "learning_rate": 1.644050541499075e-06, + "logits/chosen": -1.8302139043807983, + "logits/rejected": -1.645371675491333, + "logps/chosen": -842.4248046875, + "logps/rejected": -1106.85302734375, + "loss": 0.3638, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.834165096282959, + "rewards/margins": 3.2980194091796875, + "rewards/rejected": -9.132184028625488, + "step": 5930 + }, + { + "epoch": 0.65, + "grad_norm": 6.84375, + "learning_rate": 1.635066457968291e-06, + "logits/chosen": -1.7842804193496704, + "logits/rejected": -1.6632730960845947, + "logps/chosen": -805.5394897460938, + "logps/rejected": -1163.0198974609375, + "loss": 0.2767, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.835457801818848, + "rewards/margins": 3.9816908836364746, + "rewards/rejected": -9.817148208618164, + "step": 5940 + }, + { + "epoch": 0.65, + "grad_norm": 10.0625, + "learning_rate": 1.6260950453315415e-06, + "logits/chosen": -1.7197744846343994, + "logits/rejected": -1.5342010259628296, + "logps/chosen": -829.5427856445312, + "logps/rejected": -1178.2186279296875, + "loss": 0.3645, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.188284873962402, + "rewards/margins": 3.6141514778137207, + "rewards/rejected": -9.802434921264648, + "step": 5950 + }, + { + "epoch": 0.65, + "grad_norm": 10.6875, + "learning_rate": 1.617136435016057e-06, + "logits/chosen": -1.7495660781860352, + "logits/rejected": -1.5230835676193237, + "logps/chosen": -848.271484375, + "logps/rejected": -1184.021728515625, + "loss": 0.3131, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.939058303833008, + "rewards/margins": 3.7495408058166504, + "rewards/rejected": -9.6885986328125, + "step": 5960 + }, + { + "epoch": 0.65, + "grad_norm": 9.4375, + "learning_rate": 1.6081907582615185e-06, + "logits/chosen": -1.8010448217391968, + "logits/rejected": -1.5700279474258423, + "logps/chosen": -907.3184814453125, + "logps/rejected": -1256.716552734375, + "loss": 0.3224, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.7544379234313965, + "rewards/margins": 3.8060402870178223, + "rewards/rejected": -10.560476303100586, + "step": 5970 + }, + { + "epoch": 0.66, + "grad_norm": 10.5, + "learning_rate": 1.5992581461181339e-06, + "logits/chosen": -1.6741087436676025, + "logits/rejected": -1.587148904800415, + "logps/chosen": -852.9539794921875, + "logps/rejected": -1276.917724609375, + "loss": 0.3426, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.123904228210449, + "rewards/margins": 4.117240905761719, + "rewards/rejected": -10.241144180297852, + "step": 5980 + }, + { + "epoch": 0.66, + "grad_norm": 15.4375, + "learning_rate": 1.5903387294447243e-06, + "logits/chosen": -1.6439861059188843, + "logits/rejected": -1.5073421001434326, + "logps/chosen": -842.2957763671875, + "logps/rejected": -1137.3599853515625, + "loss": 0.3824, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.288571357727051, + "rewards/margins": 3.313798427581787, + "rewards/rejected": -9.60236930847168, + "step": 5990 + }, + { + "epoch": 0.66, + "grad_norm": 9.25, + "learning_rate": 1.5814326389068007e-06, + "logits/chosen": -1.560915231704712, + "logits/rejected": -1.513789176940918, + "logps/chosen": -813.4639892578125, + "logps/rejected": -1126.408935546875, + "loss": 0.3177, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.865071773529053, + "rewards/margins": 3.2660088539123535, + "rewards/rejected": -9.131080627441406, + "step": 6000 + }, + { + "epoch": 0.66, + "grad_norm": 19.0, + "learning_rate": 1.5725400049746514e-06, + "logits/chosen": -1.7389227151870728, + "logits/rejected": -1.661863923072815, + "logps/chosen": -814.8272094726562, + "logps/rejected": -1148.040771484375, + "loss": 0.2769, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.657896995544434, + "rewards/margins": 3.643965482711792, + "rewards/rejected": -9.301862716674805, + "step": 6010 + }, + { + "epoch": 0.66, + "grad_norm": 10.5625, + "learning_rate": 1.5636609579214332e-06, + "logits/chosen": -1.8740335702896118, + "logits/rejected": -1.6608235836029053, + "logps/chosen": -763.8502197265625, + "logps/rejected": -1066.5037841796875, + "loss": 0.3303, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.875387668609619, + "rewards/margins": 3.483891725540161, + "rewards/rejected": -8.359278678894043, + "step": 6020 + }, + { + "epoch": 0.66, + "grad_norm": 3.5, + "learning_rate": 1.5547956278212612e-06, + "logits/chosen": -1.7967027425765991, + "logits/rejected": -1.7144142389297485, + "logps/chosen": -852.5750122070312, + "logps/rejected": -1168.55517578125, + "loss": 0.3859, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.0317301750183105, + "rewards/margins": 3.273757219314575, + "rewards/rejected": -9.305487632751465, + "step": 6030 + }, + { + "epoch": 0.66, + "grad_norm": 12.9375, + "learning_rate": 1.5459441445473005e-06, + "logits/chosen": -1.7630300521850586, + "logits/rejected": -1.6553971767425537, + "logps/chosen": -817.3908081054688, + "logps/rejected": -1146.626708984375, + "loss": 0.3637, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.671682834625244, + "rewards/margins": 3.2606587409973145, + "rewards/rejected": -8.932340621948242, + "step": 6040 + }, + { + "epoch": 0.66, + "grad_norm": 12.9375, + "learning_rate": 1.537106637769869e-06, + "logits/chosen": -1.8469219207763672, + "logits/rejected": -1.6494554281234741, + "logps/chosen": -836.8327026367188, + "logps/rejected": -1137.49169921875, + "loss": 0.2877, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.7754011154174805, + "rewards/margins": 3.3301830291748047, + "rewards/rejected": -9.105584144592285, + "step": 6050 + }, + { + "epoch": 0.66, + "grad_norm": 21.875, + "learning_rate": 1.5282832369545352e-06, + "logits/chosen": -1.7016947269439697, + "logits/rejected": -1.4927685260772705, + "logps/chosen": -869.5623779296875, + "logps/rejected": -1243.234130859375, + "loss": 0.3686, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.531283378601074, + "rewards/margins": 3.7193942070007324, + "rewards/rejected": -10.250677108764648, + "step": 6060 + }, + { + "epoch": 0.67, + "grad_norm": 21.125, + "learning_rate": 1.5194740713602184e-06, + "logits/chosen": -1.750287652015686, + "logits/rejected": -1.5996938943862915, + "logps/chosen": -848.9260864257812, + "logps/rejected": -1182.40283203125, + "loss": 0.3908, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.015078544616699, + "rewards/margins": 3.463412046432495, + "rewards/rejected": -9.478490829467773, + "step": 6070 + }, + { + "epoch": 0.67, + "grad_norm": 15.25, + "learning_rate": 1.5106792700373016e-06, + "logits/chosen": -1.7111488580703735, + "logits/rejected": -1.4944158792495728, + "logps/chosen": -918.2913208007812, + "logps/rejected": -1329.18603515625, + "loss": 0.2768, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.294156551361084, + "rewards/margins": 4.574944019317627, + "rewards/rejected": -10.869100570678711, + "step": 6080 + }, + { + "epoch": 0.67, + "grad_norm": 7.9375, + "learning_rate": 1.5018989618257363e-06, + "logits/chosen": -1.6870285272598267, + "logits/rejected": -1.6207847595214844, + "logps/chosen": -851.7330322265625, + "logps/rejected": -1216.6829833984375, + "loss": 0.2726, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.913125038146973, + "rewards/margins": 3.8008270263671875, + "rewards/rejected": -9.713953018188477, + "step": 6090 + }, + { + "epoch": 0.67, + "grad_norm": 16.375, + "learning_rate": 1.4931332753531575e-06, + "logits/chosen": -1.751688003540039, + "logits/rejected": -1.6195459365844727, + "logps/chosen": -831.5275268554688, + "logps/rejected": -1253.249267578125, + "loss": 0.2504, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.229335784912109, + "rewards/margins": 4.20377254486084, + "rewards/rejected": -10.433107376098633, + "step": 6100 + }, + { + "epoch": 0.67, + "grad_norm": 9.0, + "learning_rate": 1.4843823390329948e-06, + "logits/chosen": -1.7316057682037354, + "logits/rejected": -1.480668067932129, + "logps/chosen": -933.5306396484375, + "logps/rejected": -1298.552490234375, + "loss": 0.2976, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.674904823303223, + "rewards/margins": 3.9925849437713623, + "rewards/rejected": -10.667489051818848, + "step": 6110 + }, + { + "epoch": 0.67, + "grad_norm": 13.5625, + "learning_rate": 1.4756462810625976e-06, + "logits/chosen": -1.8041664361953735, + "logits/rejected": -1.5091979503631592, + "logps/chosen": -894.63037109375, + "logps/rejected": -1198.416259765625, + "loss": 0.2388, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.1819963455200195, + "rewards/margins": 3.534543514251709, + "rewards/rejected": -9.716540336608887, + "step": 6120 + }, + { + "epoch": 0.67, + "grad_norm": 10.5, + "learning_rate": 1.4669252294213549e-06, + "logits/chosen": -1.6366676092147827, + "logits/rejected": -1.492136001586914, + "logps/chosen": -933.29833984375, + "logps/rejected": -1315.9599609375, + "loss": 0.2736, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.071603298187256, + "rewards/margins": 3.833287477493286, + "rewards/rejected": -10.904890060424805, + "step": 6130 + }, + { + "epoch": 0.67, + "grad_norm": 14.75, + "learning_rate": 1.4582193118688147e-06, + "logits/chosen": -1.648318886756897, + "logits/rejected": -1.6478042602539062, + "logps/chosen": -905.2322998046875, + "logps/rejected": -1232.490966796875, + "loss": 0.3396, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.7228102684021, + "rewards/margins": 3.1533992290496826, + "rewards/rejected": -9.876211166381836, + "step": 6140 + }, + { + "epoch": 0.67, + "grad_norm": 9.125, + "learning_rate": 1.4495286559428245e-06, + "logits/chosen": -1.8594694137573242, + "logits/rejected": -1.6731624603271484, + "logps/chosen": -878.3630981445312, + "logps/rejected": -1230.520263671875, + "loss": 0.2608, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.03685998916626, + "rewards/margins": 3.7346909046173096, + "rewards/rejected": -9.771550178527832, + "step": 6150 + }, + { + "epoch": 0.68, + "grad_norm": 16.625, + "learning_rate": 1.4408533889576486e-06, + "logits/chosen": -1.7269805669784546, + "logits/rejected": -1.5672860145568848, + "logps/chosen": -954.9598388671875, + "logps/rejected": -1284.303466796875, + "loss": 0.3716, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.064768314361572, + "rewards/margins": 3.7832260131835938, + "rewards/rejected": -10.847993850708008, + "step": 6160 + }, + { + "epoch": 0.68, + "grad_norm": 28.375, + "learning_rate": 1.4321936380021153e-06, + "logits/chosen": -1.6238672733306885, + "logits/rejected": -1.4094452857971191, + "logps/chosen": -914.1920166015625, + "logps/rejected": -1240.7899169921875, + "loss": 0.3631, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.617919921875, + "rewards/margins": 3.567502975463867, + "rewards/rejected": -10.185422897338867, + "step": 6170 + }, + { + "epoch": 0.68, + "grad_norm": 9.1875, + "learning_rate": 1.4235495299377464e-06, + "logits/chosen": -1.7354987859725952, + "logits/rejected": -1.6134350299835205, + "logps/chosen": -906.9595947265625, + "logps/rejected": -1206.542724609375, + "loss": 0.2608, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.341557502746582, + "rewards/margins": 3.5254769325256348, + "rewards/rejected": -9.867034912109375, + "step": 6180 + }, + { + "epoch": 0.68, + "grad_norm": 19.625, + "learning_rate": 1.4149211913969056e-06, + "logits/chosen": -1.7079238891601562, + "logits/rejected": -1.449621319770813, + "logps/chosen": -892.2933349609375, + "logps/rejected": -1241.3409423828125, + "loss": 0.3409, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.547828674316406, + "rewards/margins": 3.920067548751831, + "rewards/rejected": -10.467897415161133, + "step": 6190 + }, + { + "epoch": 0.68, + "grad_norm": 5.28125, + "learning_rate": 1.406308748780936e-06, + "logits/chosen": -1.6679325103759766, + "logits/rejected": -1.4103087186813354, + "logps/chosen": -886.9486083984375, + "logps/rejected": -1227.058349609375, + "loss": 0.2853, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.423262119293213, + "rewards/margins": 3.8474388122558594, + "rewards/rejected": -10.270700454711914, + "step": 6200 + }, + { + "epoch": 0.68, + "grad_norm": 7.4375, + "learning_rate": 1.397712328258316e-06, + "logits/chosen": -1.6770254373550415, + "logits/rejected": -1.6445395946502686, + "logps/chosen": -938.4742431640625, + "logps/rejected": -1259.4876708984375, + "loss": 0.3645, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.811001777648926, + "rewards/margins": 3.5583834648132324, + "rewards/rejected": -10.369386672973633, + "step": 6210 + }, + { + "epoch": 0.68, + "grad_norm": 8.75, + "learning_rate": 1.3891320557628068e-06, + "logits/chosen": -1.569022536277771, + "logits/rejected": -1.5453184843063354, + "logps/chosen": -1007.5875854492188, + "logps/rejected": -1446.0450439453125, + "loss": 0.2446, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -7.541706085205078, + "rewards/margins": 4.5069169998168945, + "rewards/rejected": -12.048623085021973, + "step": 6220 + }, + { + "epoch": 0.68, + "grad_norm": 12.6875, + "learning_rate": 1.3805680569916057e-06, + "logits/chosen": -1.7015600204467773, + "logits/rejected": -1.4529300928115845, + "logps/chosen": -948.0101318359375, + "logps/rejected": -1326.308349609375, + "loss": 0.2714, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.830543518066406, + "rewards/margins": 4.251474857330322, + "rewards/rejected": -11.082018852233887, + "step": 6230 + }, + { + "epoch": 0.68, + "grad_norm": 15.375, + "learning_rate": 1.3720204574035101e-06, + "logits/chosen": -1.658085823059082, + "logits/rejected": -1.3701894283294678, + "logps/chosen": -982.7488403320312, + "logps/rejected": -1394.266845703125, + "loss": 0.3439, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -7.510354518890381, + "rewards/margins": 4.444123268127441, + "rewards/rejected": -11.95447826385498, + "step": 6240 + }, + { + "epoch": 0.69, + "grad_norm": 12.6875, + "learning_rate": 1.3634893822170748e-06, + "logits/chosen": -1.6761715412139893, + "logits/rejected": -1.4190032482147217, + "logps/chosen": -939.4337768554688, + "logps/rejected": -1383.1217041015625, + "loss": 0.256, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.759289741516113, + "rewards/margins": 4.878803730010986, + "rewards/rejected": -11.638092994689941, + "step": 6250 + }, + { + "epoch": 0.69, + "grad_norm": 10.125, + "learning_rate": 1.3549749564087814e-06, + "logits/chosen": -1.7849088907241821, + "logits/rejected": -1.5404027700424194, + "logps/chosen": -968.2351684570312, + "logps/rejected": -1316.9656982421875, + "loss": 0.31, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -7.001092433929443, + "rewards/margins": 3.9078311920166016, + "rewards/rejected": -10.908923149108887, + "step": 6260 + }, + { + "epoch": 0.69, + "grad_norm": 13.9375, + "learning_rate": 1.3464773047112017e-06, + "logits/chosen": -1.6646318435668945, + "logits/rejected": -1.5404531955718994, + "logps/chosen": -889.8689575195312, + "logps/rejected": -1183.7486572265625, + "loss": 0.2965, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.504197597503662, + "rewards/margins": 3.3462109565734863, + "rewards/rejected": -9.850407600402832, + "step": 6270 + }, + { + "epoch": 0.69, + "grad_norm": 4.46875, + "learning_rate": 1.3379965516111781e-06, + "logits/chosen": -1.678044080734253, + "logits/rejected": -1.6487442255020142, + "logps/chosen": -893.35205078125, + "logps/rejected": -1321.9339599609375, + "loss": 0.32, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.475593566894531, + "rewards/margins": 4.448634624481201, + "rewards/rejected": -10.924229621887207, + "step": 6280 + }, + { + "epoch": 0.69, + "grad_norm": 19.5, + "learning_rate": 1.3295328213479953e-06, + "logits/chosen": -1.8396753072738647, + "logits/rejected": -1.6186325550079346, + "logps/chosen": -886.9278564453125, + "logps/rejected": -1180.5267333984375, + "loss": 0.3725, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -6.176216125488281, + "rewards/margins": 3.411938428878784, + "rewards/rejected": -9.588154792785645, + "step": 6290 + }, + { + "epoch": 0.69, + "grad_norm": 11.875, + "learning_rate": 1.3210862379115577e-06, + "logits/chosen": -1.709861397743225, + "logits/rejected": -1.589601755142212, + "logps/chosen": -883.3557739257812, + "logps/rejected": -1270.0673828125, + "loss": 0.2544, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.184123992919922, + "rewards/margins": 4.429729461669922, + "rewards/rejected": -10.61385440826416, + "step": 6300 + }, + { + "epoch": 0.69, + "grad_norm": 23.375, + "learning_rate": 1.3126569250405812e-06, + "logits/chosen": -1.8077514171600342, + "logits/rejected": -1.5814578533172607, + "logps/chosen": -850.3025512695312, + "logps/rejected": -1178.780517578125, + "loss": 0.2828, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.933908462524414, + "rewards/margins": 3.5799102783203125, + "rewards/rejected": -9.513818740844727, + "step": 6310 + }, + { + "epoch": 0.69, + "grad_norm": 13.5625, + "learning_rate": 1.304245006220772e-06, + "logits/chosen": -1.6199226379394531, + "logits/rejected": -1.4725515842437744, + "logps/chosen": -911.97216796875, + "logps/rejected": -1304.743408203125, + "loss": 0.2637, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.5924882888793945, + "rewards/margins": 4.06698751449585, + "rewards/rejected": -10.659475326538086, + "step": 6320 + }, + { + "epoch": 0.69, + "grad_norm": 30.625, + "learning_rate": 1.295850604683021e-06, + "logits/chosen": -1.763615369796753, + "logits/rejected": -1.5394920110702515, + "logps/chosen": -776.0261840820312, + "logps/rejected": -1117.340087890625, + "loss": 0.291, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.098698616027832, + "rewards/margins": 3.993137836456299, + "rewards/rejected": -9.091835975646973, + "step": 6330 + }, + { + "epoch": 0.7, + "grad_norm": 6.625, + "learning_rate": 1.2874738434016012e-06, + "logits/chosen": -1.6925674676895142, + "logits/rejected": -1.434727668762207, + "logps/chosen": -919.2091064453125, + "logps/rejected": -1166.97705078125, + "loss": 0.2981, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.609585762023926, + "rewards/margins": 3.0651919841766357, + "rewards/rejected": -9.67477798461914, + "step": 6340 + }, + { + "epoch": 0.7, + "grad_norm": 14.6875, + "learning_rate": 1.279114845092363e-06, + "logits/chosen": -1.6643526554107666, + "logits/rejected": -1.5005366802215576, + "logps/chosen": -948.8317260742188, + "logps/rejected": -1280.733642578125, + "loss": 0.3547, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.925854682922363, + "rewards/margins": 3.682194471359253, + "rewards/rejected": -10.608049392700195, + "step": 6350 + }, + { + "epoch": 0.7, + "grad_norm": 16.25, + "learning_rate": 1.270773732210935e-06, + "logits/chosen": -1.7767980098724365, + "logits/rejected": -1.5632175207138062, + "logps/chosen": -810.5305786132812, + "logps/rejected": -1178.248291015625, + "loss": 0.304, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.36056661605835, + "rewards/margins": 4.046128273010254, + "rewards/rejected": -9.406694412231445, + "step": 6360 + }, + { + "epoch": 0.7, + "grad_norm": 13.3125, + "learning_rate": 1.2624506269509334e-06, + "logits/chosen": -1.6541216373443604, + "logits/rejected": -1.6172449588775635, + "logps/chosen": -880.1060791015625, + "logps/rejected": -1272.1063232421875, + "loss": 0.351, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.718127250671387, + "rewards/margins": 3.934053897857666, + "rewards/rejected": -10.652181625366211, + "step": 6370 + }, + { + "epoch": 0.7, + "grad_norm": 6.4375, + "learning_rate": 1.2541456512421734e-06, + "logits/chosen": -1.787191390991211, + "logits/rejected": -1.59225594997406, + "logps/chosen": -831.5227661132812, + "logps/rejected": -1213.252685546875, + "loss": 0.3012, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.886507511138916, + "rewards/margins": 4.129021644592285, + "rewards/rejected": -10.015528678894043, + "step": 6380 + }, + { + "epoch": 0.7, + "grad_norm": 6.96875, + "learning_rate": 1.2458589267488746e-06, + "logits/chosen": -1.8449147939682007, + "logits/rejected": -1.653042197227478, + "logps/chosen": -854.3810424804688, + "logps/rejected": -1163.3541259765625, + "loss": 0.3506, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.0314483642578125, + "rewards/margins": 3.4089901447296143, + "rewards/rejected": -9.440439224243164, + "step": 6390 + }, + { + "epoch": 0.7, + "grad_norm": 8.6875, + "learning_rate": 1.2375905748678894e-06, + "logits/chosen": -1.789484977722168, + "logits/rejected": -1.6050348281860352, + "logps/chosen": -899.3917236328125, + "logps/rejected": -1237.197998046875, + "loss": 0.2631, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.938449382781982, + "rewards/margins": 3.880220890045166, + "rewards/rejected": -9.818670272827148, + "step": 6400 + }, + { + "epoch": 0.7, + "grad_norm": 8.0, + "learning_rate": 1.2293407167269172e-06, + "logits/chosen": -1.7475976943969727, + "logits/rejected": -1.5245461463928223, + "logps/chosen": -821.0612182617188, + "logps/rejected": -1156.541748046875, + "loss": 0.3617, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.830685615539551, + "rewards/margins": 3.7571768760681152, + "rewards/rejected": -9.587862014770508, + "step": 6410 + }, + { + "epoch": 0.7, + "grad_norm": 5.5, + "learning_rate": 1.2211094731827342e-06, + "logits/chosen": -1.6776186227798462, + "logits/rejected": -1.7348531484603882, + "logps/chosen": -881.3181762695312, + "logps/rejected": -1200.8673095703125, + "loss": 0.3442, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.884750843048096, + "rewards/margins": 3.171901226043701, + "rewards/rejected": -9.05665111541748, + "step": 6420 + }, + { + "epoch": 0.7, + "grad_norm": 8.875, + "learning_rate": 1.2128969648194172e-06, + "logits/chosen": -1.8538802862167358, + "logits/rejected": -1.6758854389190674, + "logps/chosen": -804.3789672851562, + "logps/rejected": -1110.3843994140625, + "loss": 0.2905, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.430224895477295, + "rewards/margins": 3.13891863822937, + "rewards/rejected": -8.569143295288086, + "step": 6430 + }, + { + "epoch": 0.71, + "grad_norm": 13.875, + "learning_rate": 1.2047033119465844e-06, + "logits/chosen": -1.7386085987091064, + "logits/rejected": -1.4384286403656006, + "logps/chosen": -862.9212646484375, + "logps/rejected": -1223.8856201171875, + "loss": 0.2842, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.083265781402588, + "rewards/margins": 3.91027569770813, + "rewards/rejected": -9.99354076385498, + "step": 6440 + }, + { + "epoch": 0.71, + "grad_norm": 16.375, + "learning_rate": 1.1965286345976294e-06, + "logits/chosen": -1.758026123046875, + "logits/rejected": -1.3832170963287354, + "logps/chosen": -845.3821411132812, + "logps/rejected": -1195.1385498046875, + "loss": 0.2784, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.997134685516357, + "rewards/margins": 3.7753360271453857, + "rewards/rejected": -9.772470474243164, + "step": 6450 + }, + { + "epoch": 0.71, + "grad_norm": 20.125, + "learning_rate": 1.1883730525279597e-06, + "logits/chosen": -1.729758620262146, + "logits/rejected": -1.540727138519287, + "logps/chosen": -923.4788208007812, + "logps/rejected": -1255.874755859375, + "loss": 0.3046, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.1862077713012695, + "rewards/margins": 3.9544425010681152, + "rewards/rejected": -10.140649795532227, + "step": 6460 + }, + { + "epoch": 0.71, + "grad_norm": 9.75, + "learning_rate": 1.1802366852132487e-06, + "logits/chosen": -1.781943917274475, + "logits/rejected": -1.5474400520324707, + "logps/chosen": -918.9052734375, + "logps/rejected": -1242.8492431640625, + "loss": 0.3093, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.6265997886657715, + "rewards/margins": 3.531494617462158, + "rewards/rejected": -10.15809440612793, + "step": 6470 + }, + { + "epoch": 0.71, + "grad_norm": 10.375, + "learning_rate": 1.1721196518476807e-06, + "logits/chosen": -1.748970627784729, + "logits/rejected": -1.5054429769515991, + "logps/chosen": -875.9109497070312, + "logps/rejected": -1274.7432861328125, + "loss": 0.2407, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.267611026763916, + "rewards/margins": 4.428039073944092, + "rewards/rejected": -10.695650100708008, + "step": 6480 + }, + { + "epoch": 0.71, + "grad_norm": 20.125, + "learning_rate": 1.1640220713422048e-06, + "logits/chosen": -1.7895793914794922, + "logits/rejected": -1.57892644405365, + "logps/chosen": -930.29443359375, + "logps/rejected": -1254.5989990234375, + "loss": 0.3237, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.561395168304443, + "rewards/margins": 3.753086805343628, + "rewards/rejected": -10.314481735229492, + "step": 6490 + }, + { + "epoch": 0.71, + "grad_norm": 8.125, + "learning_rate": 1.1559440623227968e-06, + "logits/chosen": -1.768509864807129, + "logits/rejected": -1.6114537715911865, + "logps/chosen": -855.0602416992188, + "logps/rejected": -1181.176025390625, + "loss": 0.3039, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.91488790512085, + "rewards/margins": 3.6292884349823, + "rewards/rejected": -9.54417610168457, + "step": 6500 + }, + { + "epoch": 0.71, + "grad_norm": 13.5, + "learning_rate": 1.1478857431287188e-06, + "logits/chosen": -1.821258306503296, + "logits/rejected": -1.6138159036636353, + "logps/chosen": -876.9791870117188, + "logps/rejected": -1180.69189453125, + "loss": 0.3352, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.100554466247559, + "rewards/margins": 3.3296401500701904, + "rewards/rejected": -9.430193901062012, + "step": 6510 + }, + { + "epoch": 0.71, + "grad_norm": 19.75, + "learning_rate": 1.139847231810783e-06, + "logits/chosen": -1.716790795326233, + "logits/rejected": -1.5229729413986206, + "logps/chosen": -904.1443481445312, + "logps/rejected": -1273.534912109375, + "loss": 0.2664, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.780157566070557, + "rewards/margins": 4.107874870300293, + "rewards/rejected": -10.888032913208008, + "step": 6520 + }, + { + "epoch": 0.72, + "grad_norm": 12.1875, + "learning_rate": 1.131828646129627e-06, + "logits/chosen": -1.7101356983184814, + "logits/rejected": -1.612854242324829, + "logps/chosen": -931.1962890625, + "logps/rejected": -1336.392578125, + "loss": 0.2998, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.89572286605835, + "rewards/margins": 4.209144592285156, + "rewards/rejected": -11.104867935180664, + "step": 6530 + }, + { + "epoch": 0.72, + "grad_norm": 9.1875, + "learning_rate": 1.1238301035539881e-06, + "logits/chosen": -1.7779966592788696, + "logits/rejected": -1.4742963314056396, + "logps/chosen": -902.3121337890625, + "logps/rejected": -1290.1854248046875, + "loss": 0.3137, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.5398430824279785, + "rewards/margins": 4.179739952087402, + "rewards/rejected": -10.719582557678223, + "step": 6540 + }, + { + "epoch": 0.72, + "grad_norm": 7.40625, + "learning_rate": 1.1158517212589764e-06, + "logits/chosen": -1.7396529912948608, + "logits/rejected": -1.6895620822906494, + "logps/chosen": -896.58984375, + "logps/rejected": -1212.4940185546875, + "loss": 0.2906, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.47376012802124, + "rewards/margins": 3.383216381072998, + "rewards/rejected": -9.856977462768555, + "step": 6550 + }, + { + "epoch": 0.72, + "grad_norm": 7.84375, + "learning_rate": 1.1078936161243672e-06, + "logits/chosen": -1.7748429775238037, + "logits/rejected": -1.6418899297714233, + "logps/chosen": -908.0927734375, + "logps/rejected": -1194.327880859375, + "loss": 0.317, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.318285942077637, + "rewards/margins": 3.234949827194214, + "rewards/rejected": -9.55323600769043, + "step": 6560 + }, + { + "epoch": 0.72, + "grad_norm": 5.625, + "learning_rate": 1.0999559047328825e-06, + "logits/chosen": -1.6568149328231812, + "logits/rejected": -1.429935336112976, + "logps/chosen": -948.4285278320312, + "logps/rejected": -1380.120361328125, + "loss": 0.2754, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -7.366261959075928, + "rewards/margins": 4.303637504577637, + "rewards/rejected": -11.669899940490723, + "step": 6570 + }, + { + "epoch": 0.72, + "grad_norm": 6.03125, + "learning_rate": 1.0920387033684862e-06, + "logits/chosen": -1.713789701461792, + "logits/rejected": -1.6896559000015259, + "logps/chosen": -827.7850341796875, + "logps/rejected": -1260.190185546875, + "loss": 0.2486, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.949117183685303, + "rewards/margins": 4.567793846130371, + "rewards/rejected": -10.516910552978516, + "step": 6580 + }, + { + "epoch": 0.72, + "grad_norm": 13.625, + "learning_rate": 1.0841421280146764e-06, + "logits/chosen": -1.721596360206604, + "logits/rejected": -1.5261449813842773, + "logps/chosen": -948.0220947265625, + "logps/rejected": -1222.8203125, + "loss": 0.3435, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.662171363830566, + "rewards/margins": 3.3473353385925293, + "rewards/rejected": -10.00950813293457, + "step": 6590 + }, + { + "epoch": 0.72, + "grad_norm": 10.5625, + "learning_rate": 1.0762662943527925e-06, + "logits/chosen": -1.7362477779388428, + "logits/rejected": -1.5353807210922241, + "logps/chosen": -964.3173828125, + "logps/rejected": -1349.978271484375, + "loss": 0.2973, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.134352207183838, + "rewards/margins": 4.122511863708496, + "rewards/rejected": -11.256864547729492, + "step": 6600 + }, + { + "epoch": 0.72, + "grad_norm": 8.6875, + "learning_rate": 1.0684113177603161e-06, + "logits/chosen": -1.7353332042694092, + "logits/rejected": -1.5633857250213623, + "logps/chosen": -868.8815307617188, + "logps/rejected": -1287.3275146484375, + "loss": 0.2995, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.295851707458496, + "rewards/margins": 4.255226135253906, + "rewards/rejected": -10.551077842712402, + "step": 6610 + }, + { + "epoch": 0.73, + "grad_norm": 16.375, + "learning_rate": 1.060577313309182e-06, + "logits/chosen": -1.7985957860946655, + "logits/rejected": -1.5454607009887695, + "logps/chosen": -813.262939453125, + "logps/rejected": -1199.162109375, + "loss": 0.2947, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.009547233581543, + "rewards/margins": 3.735764265060425, + "rewards/rejected": -9.745311737060547, + "step": 6620 + }, + { + "epoch": 0.73, + "grad_norm": 18.5, + "learning_rate": 1.0527643957640909e-06, + "logits/chosen": -1.807539939880371, + "logits/rejected": -1.7820017337799072, + "logps/chosen": -795.8912353515625, + "logps/rejected": -1172.1004638671875, + "loss": 0.3187, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.615099906921387, + "rewards/margins": 4.021520137786865, + "rewards/rejected": -9.63662052154541, + "step": 6630 + }, + { + "epoch": 0.73, + "grad_norm": 13.8125, + "learning_rate": 1.044972679580833e-06, + "logits/chosen": -1.858746886253357, + "logits/rejected": -1.6967544555664062, + "logps/chosen": -893.7203979492188, + "logps/rejected": -1219.598388671875, + "loss": 0.3288, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.252263069152832, + "rewards/margins": 3.6001133918762207, + "rewards/rejected": -9.852375984191895, + "step": 6640 + }, + { + "epoch": 0.73, + "grad_norm": 11.875, + "learning_rate": 1.0372022789046034e-06, + "logits/chosen": -1.779028296470642, + "logits/rejected": -1.5356533527374268, + "logps/chosen": -901.763671875, + "logps/rejected": -1271.1663818359375, + "loss": 0.3278, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.742913722991943, + "rewards/margins": 4.039364814758301, + "rewards/rejected": -10.782279014587402, + "step": 6650 + }, + { + "epoch": 0.73, + "grad_norm": 14.25, + "learning_rate": 1.0294533075683392e-06, + "logits/chosen": -1.8552062511444092, + "logits/rejected": -1.6880321502685547, + "logps/chosen": -872.4426879882812, + "logps/rejected": -1239.5291748046875, + "loss": 0.2894, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.895809173583984, + "rewards/margins": 4.074398040771484, + "rewards/rejected": -9.970207214355469, + "step": 6660 + }, + { + "epoch": 0.73, + "grad_norm": 22.375, + "learning_rate": 1.0217258790910447e-06, + "logits/chosen": -1.7303069829940796, + "logits/rejected": -1.6344144344329834, + "logps/chosen": -881.7399291992188, + "logps/rejected": -1185.889404296875, + "loss": 0.4592, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.5407514572143555, + "rewards/margins": 3.0542140007019043, + "rewards/rejected": -9.594964981079102, + "step": 6670 + }, + { + "epoch": 0.73, + "grad_norm": 7.6875, + "learning_rate": 1.0140201066761301e-06, + "logits/chosen": -1.7924892902374268, + "logits/rejected": -1.6166517734527588, + "logps/chosen": -887.1427612304688, + "logps/rejected": -1136.970947265625, + "loss": 0.3912, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.021181583404541, + "rewards/margins": 2.903892755508423, + "rewards/rejected": -8.925074577331543, + "step": 6680 + }, + { + "epoch": 0.73, + "grad_norm": 8.875, + "learning_rate": 1.0063361032097552e-06, + "logits/chosen": -1.7463325262069702, + "logits/rejected": -1.7109959125518799, + "logps/chosen": -849.6920776367188, + "logps/rejected": -1114.09814453125, + "loss": 0.3581, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.863959312438965, + "rewards/margins": 2.9045028686523438, + "rewards/rejected": -8.768461227416992, + "step": 6690 + }, + { + "epoch": 0.73, + "grad_norm": 4.78125, + "learning_rate": 9.98673981259174e-07, + "logits/chosen": -1.8467506170272827, + "logits/rejected": -1.600547432899475, + "logps/chosen": -827.7647705078125, + "logps/rejected": -1134.1593017578125, + "loss": 0.3359, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.684759140014648, + "rewards/margins": 3.592005968093872, + "rewards/rejected": -9.276765823364258, + "step": 6700 + }, + { + "epoch": 0.74, + "grad_norm": 7.0625, + "learning_rate": 9.910338530710872e-07, + "logits/chosen": -1.845618486404419, + "logits/rejected": -1.7022597789764404, + "logps/chosen": -815.3365478515625, + "logps/rejected": -1129.656494140625, + "loss": 0.3283, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.407408237457275, + "rewards/margins": 3.6242358684539795, + "rewards/rejected": -9.031643867492676, + "step": 6710 + }, + { + "epoch": 0.74, + "grad_norm": 8.0, + "learning_rate": 9.834158305699935e-07, + "logits/chosen": -1.820191740989685, + "logits/rejected": -1.6064832210540771, + "logps/chosen": -816.0989990234375, + "logps/rejected": -1112.416259765625, + "loss": 0.3147, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.489621162414551, + "rewards/margins": 3.400510311126709, + "rewards/rejected": -8.890131950378418, + "step": 6720 + }, + { + "epoch": 0.74, + "grad_norm": 14.5, + "learning_rate": 9.758200253565553e-07, + "logits/chosen": -1.9184739589691162, + "logits/rejected": -1.5242900848388672, + "logps/chosen": -823.4591674804688, + "logps/rejected": -1122.5181884765625, + "loss": 0.2952, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.468290328979492, + "rewards/margins": 3.606449604034424, + "rewards/rejected": -9.074739456176758, + "step": 6730 + }, + { + "epoch": 0.74, + "grad_norm": 6.6875, + "learning_rate": 9.682465487059623e-07, + "logits/chosen": -1.8069225549697876, + "logits/rejected": -1.6427466869354248, + "logps/chosen": -832.9793090820312, + "logps/rejected": -1096.363525390625, + "loss": 0.2997, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.609135627746582, + "rewards/margins": 3.198622941970825, + "rewards/rejected": -8.807758331298828, + "step": 6740 + }, + { + "epoch": 0.74, + "grad_norm": 18.0, + "learning_rate": 9.606955115662977e-07, + "logits/chosen": -1.7683712244033813, + "logits/rejected": -1.6010938882827759, + "logps/chosen": -828.6780395507812, + "logps/rejected": -1225.0675048828125, + "loss": 0.3082, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.857048988342285, + "rewards/margins": 4.212182998657227, + "rewards/rejected": -10.069231986999512, + "step": 6750 + }, + { + "epoch": 0.74, + "grad_norm": 10.1875, + "learning_rate": 9.531670245569188e-07, + "logits/chosen": -1.698781967163086, + "logits/rejected": -1.7032835483551025, + "logps/chosen": -853.0354614257812, + "logps/rejected": -1186.471435546875, + "loss": 0.2796, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.772950172424316, + "rewards/margins": 3.6296393871307373, + "rewards/rejected": -9.402589797973633, + "step": 6760 + }, + { + "epoch": 0.74, + "grad_norm": 13.375, + "learning_rate": 9.456611979668326e-07, + "logits/chosen": -1.6784782409667969, + "logits/rejected": -1.6231733560562134, + "logps/chosen": -852.38916015625, + "logps/rejected": -1165.4674072265625, + "loss": 0.3341, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -5.905057907104492, + "rewards/margins": 3.3571553230285645, + "rewards/rejected": -9.262212753295898, + "step": 6770 + }, + { + "epoch": 0.74, + "grad_norm": 6.1875, + "learning_rate": 9.381781417530797e-07, + "logits/chosen": -1.8013633489608765, + "logits/rejected": -1.6034777164459229, + "logps/chosen": -815.9773559570312, + "logps/rejected": -1128.221435546875, + "loss": 0.2647, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.480761528015137, + "rewards/margins": 3.7314491271972656, + "rewards/rejected": -9.212210655212402, + "step": 6780 + }, + { + "epoch": 0.74, + "grad_norm": 22.25, + "learning_rate": 9.307179655391252e-07, + "logits/chosen": -1.8249568939208984, + "logits/rejected": -1.6295055150985718, + "logps/chosen": -898.42529296875, + "logps/rejected": -1220.7489013671875, + "loss": 0.2893, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.2368855476379395, + "rewards/margins": 3.4644157886505127, + "rewards/rejected": -9.701300621032715, + "step": 6790 + }, + { + "epoch": 0.75, + "grad_norm": 7.625, + "learning_rate": 9.232807786132541e-07, + "logits/chosen": -1.8028749227523804, + "logits/rejected": -1.4783848524093628, + "logps/chosen": -882.7718505859375, + "logps/rejected": -1178.804931640625, + "loss": 0.3164, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.196017265319824, + "rewards/margins": 3.4154820442199707, + "rewards/rejected": -9.61149787902832, + "step": 6800 + }, + { + "epoch": 0.75, + "grad_norm": 20.125, + "learning_rate": 9.158666899269658e-07, + "logits/chosen": -1.6653850078582764, + "logits/rejected": -1.515275239944458, + "logps/chosen": -829.1525268554688, + "logps/rejected": -1204.567626953125, + "loss": 0.3592, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.954301834106445, + "rewards/margins": 4.0168352127075195, + "rewards/rejected": -9.971138000488281, + "step": 6810 + }, + { + "epoch": 0.75, + "grad_norm": 16.875, + "learning_rate": 9.084758080933828e-07, + "logits/chosen": -1.6684086322784424, + "logits/rejected": -1.5503395795822144, + "logps/chosen": -855.4036254882812, + "logps/rejected": -1171.8280029296875, + "loss": 0.257, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.031086444854736, + "rewards/margins": 3.4870121479034424, + "rewards/rejected": -9.518098831176758, + "step": 6820 + }, + { + "epoch": 0.75, + "grad_norm": 14.8125, + "learning_rate": 9.011082413856572e-07, + "logits/chosen": -1.8281484842300415, + "logits/rejected": -1.5216374397277832, + "logps/chosen": -883.1175537109375, + "logps/rejected": -1220.309814453125, + "loss": 0.2412, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.369784832000732, + "rewards/margins": 3.6747524738311768, + "rewards/rejected": -10.044537544250488, + "step": 6830 + }, + { + "epoch": 0.75, + "grad_norm": 11.3125, + "learning_rate": 8.937640977353831e-07, + "logits/chosen": -1.8083150386810303, + "logits/rejected": -1.6462488174438477, + "logps/chosen": -847.7652587890625, + "logps/rejected": -1172.885498046875, + "loss": 0.3277, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.887875556945801, + "rewards/margins": 3.660693407058716, + "rewards/rejected": -9.548568725585938, + "step": 6840 + }, + { + "epoch": 0.75, + "grad_norm": 5.375, + "learning_rate": 8.864434847310191e-07, + "logits/chosen": -1.7279911041259766, + "logits/rejected": -1.5335490703582764, + "logps/chosen": -790.4332275390625, + "logps/rejected": -1030.9033203125, + "loss": 0.355, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.834652900695801, + "rewards/margins": 2.645068407058716, + "rewards/rejected": -8.479721069335938, + "step": 6850 + }, + { + "epoch": 0.75, + "grad_norm": 16.25, + "learning_rate": 8.791465096163093e-07, + "logits/chosen": -1.7005350589752197, + "logits/rejected": -1.6139520406723022, + "logps/chosen": -896.1902465820312, + "logps/rejected": -1218.143798828125, + "loss": 0.3511, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.393213748931885, + "rewards/margins": 3.6145317554473877, + "rewards/rejected": -10.007745742797852, + "step": 6860 + }, + { + "epoch": 0.75, + "grad_norm": 12.8125, + "learning_rate": 8.718732792887147e-07, + "logits/chosen": -1.828971266746521, + "logits/rejected": -1.682080626487732, + "logps/chosen": -883.0609130859375, + "logps/rejected": -1277.6956787109375, + "loss": 0.341, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.466531276702881, + "rewards/margins": 4.012367248535156, + "rewards/rejected": -10.478899002075195, + "step": 6870 + }, + { + "epoch": 0.75, + "grad_norm": 10.0625, + "learning_rate": 8.646239002978423e-07, + "logits/chosen": -1.8473354578018188, + "logits/rejected": -1.642524003982544, + "logps/chosen": -863.5262451171875, + "logps/rejected": -1155.427490234375, + "loss": 0.3389, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.900906562805176, + "rewards/margins": 3.4661476612091064, + "rewards/rejected": -9.367053985595703, + "step": 6880 + }, + { + "epoch": 0.76, + "grad_norm": 9.9375, + "learning_rate": 8.573984788438908e-07, + "logits/chosen": -1.8394482135772705, + "logits/rejected": -1.6173152923583984, + "logps/chosen": -813.5635986328125, + "logps/rejected": -1160.148193359375, + "loss": 0.3168, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.568140983581543, + "rewards/margins": 3.606654644012451, + "rewards/rejected": -9.174795150756836, + "step": 6890 + }, + { + "epoch": 0.76, + "grad_norm": 9.625, + "learning_rate": 8.501971207760909e-07, + "logits/chosen": -1.8376007080078125, + "logits/rejected": -1.6657264232635498, + "logps/chosen": -846.84423828125, + "logps/rejected": -1178.4444580078125, + "loss": 0.2576, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.554091930389404, + "rewards/margins": 3.8931515216827393, + "rewards/rejected": -9.447242736816406, + "step": 6900 + }, + { + "epoch": 0.76, + "grad_norm": 12.25, + "learning_rate": 8.430199315911536e-07, + "logits/chosen": -1.7550468444824219, + "logits/rejected": -1.61020028591156, + "logps/chosen": -828.6867065429688, + "logps/rejected": -1184.166015625, + "loss": 0.2841, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.820553779602051, + "rewards/margins": 3.9148383140563965, + "rewards/rejected": -9.735391616821289, + "step": 6910 + }, + { + "epoch": 0.76, + "grad_norm": 9.25, + "learning_rate": 8.358670164317287e-07, + "logits/chosen": -1.7112252712249756, + "logits/rejected": -1.5311949253082275, + "logps/chosen": -883.9182739257812, + "logps/rejected": -1330.2860107421875, + "loss": 0.2785, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.623790740966797, + "rewards/margins": 4.71503210067749, + "rewards/rejected": -11.338823318481445, + "step": 6920 + }, + { + "epoch": 0.76, + "grad_norm": 11.3125, + "learning_rate": 8.287384800848602e-07, + "logits/chosen": -1.8638818264007568, + "logits/rejected": -1.6914039850234985, + "logps/chosen": -862.38671875, + "logps/rejected": -1159.9150390625, + "loss": 0.3364, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.71813440322876, + "rewards/margins": 3.4209227561950684, + "rewards/rejected": -9.139058113098145, + "step": 6930 + }, + { + "epoch": 0.76, + "grad_norm": 8.1875, + "learning_rate": 8.216344269804557e-07, + "logits/chosen": -1.920515775680542, + "logits/rejected": -1.6044695377349854, + "logps/chosen": -894.814453125, + "logps/rejected": -1157.91357421875, + "loss": 0.2659, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.165295124053955, + "rewards/margins": 3.2159457206726074, + "rewards/rejected": -9.381241798400879, + "step": 6940 + }, + { + "epoch": 0.76, + "grad_norm": 8.8125, + "learning_rate": 8.14554961189751e-07, + "logits/chosen": -1.799884557723999, + "logits/rejected": -1.738631010055542, + "logps/chosen": -900.06494140625, + "logps/rejected": -1267.641845703125, + "loss": 0.3535, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.605263710021973, + "rewards/margins": 3.498300552368164, + "rewards/rejected": -10.10356330871582, + "step": 6950 + }, + { + "epoch": 0.76, + "grad_norm": 6.03125, + "learning_rate": 8.075001864237922e-07, + "logits/chosen": -1.831827163696289, + "logits/rejected": -1.6931339502334595, + "logps/chosen": -821.5388793945312, + "logps/rejected": -1178.9677734375, + "loss": 0.3359, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.829926490783691, + "rewards/margins": 3.8328564167022705, + "rewards/rejected": -9.662782669067383, + "step": 6960 + }, + { + "epoch": 0.76, + "grad_norm": 12.25, + "learning_rate": 8.004702060319095e-07, + "logits/chosen": -1.7438271045684814, + "logits/rejected": -1.545538306236267, + "logps/chosen": -891.6124267578125, + "logps/rejected": -1276.505859375, + "loss": 0.2763, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.604761600494385, + "rewards/margins": 3.9493439197540283, + "rewards/rejected": -10.554105758666992, + "step": 6970 + }, + { + "epoch": 0.77, + "grad_norm": 10.875, + "learning_rate": 7.934651230002083e-07, + "logits/chosen": -1.7993844747543335, + "logits/rejected": -1.5770235061645508, + "logps/chosen": -866.4713745117188, + "logps/rejected": -1236.7886962890625, + "loss": 0.2717, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.054971218109131, + "rewards/margins": 4.1365766525268555, + "rewards/rejected": -10.191548347473145, + "step": 6980 + }, + { + "epoch": 0.77, + "grad_norm": 8.0, + "learning_rate": 7.864850399500593e-07, + "logits/chosen": -1.7757495641708374, + "logits/rejected": -1.6646168231964111, + "logps/chosen": -877.4182739257812, + "logps/rejected": -1228.856689453125, + "loss": 0.381, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.315406322479248, + "rewards/margins": 3.7438035011291504, + "rewards/rejected": -10.059210777282715, + "step": 6990 + }, + { + "epoch": 0.77, + "grad_norm": 11.4375, + "learning_rate": 7.79530059136592e-07, + "logits/chosen": -1.6602022647857666, + "logits/rejected": -1.4764797687530518, + "logps/chosen": -867.9508056640625, + "logps/rejected": -1190.60302734375, + "loss": 0.3367, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.469017028808594, + "rewards/margins": 3.5658771991729736, + "rewards/rejected": -10.034894943237305, + "step": 7000 + }, + { + "epoch": 0.77, + "grad_norm": 18.75, + "learning_rate": 7.726002824472017e-07, + "logits/chosen": -1.7378963232040405, + "logits/rejected": -1.6374704837799072, + "logps/chosen": -822.0137939453125, + "logps/rejected": -1171.52294921875, + "loss": 0.3009, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.60142183303833, + "rewards/margins": 3.8019073009490967, + "rewards/rejected": -9.403329849243164, + "step": 7010 + }, + { + "epoch": 0.77, + "grad_norm": 15.4375, + "learning_rate": 7.656958114000532e-07, + "logits/chosen": -1.707689881324768, + "logits/rejected": -1.6087570190429688, + "logps/chosen": -807.1795654296875, + "logps/rejected": -1149.58447265625, + "loss": 0.3076, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.597757339477539, + "rewards/margins": 3.582385301589966, + "rewards/rejected": -9.180142402648926, + "step": 7020 + }, + { + "epoch": 0.77, + "grad_norm": 5.40625, + "learning_rate": 7.588167471425961e-07, + "logits/chosen": -1.6894845962524414, + "logits/rejected": -1.5767422914505005, + "logps/chosen": -927.3870849609375, + "logps/rejected": -1268.9322509765625, + "loss": 0.3347, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.907823085784912, + "rewards/margins": 3.7478816509246826, + "rewards/rejected": -10.655705451965332, + "step": 7030 + }, + { + "epoch": 0.77, + "grad_norm": 17.5, + "learning_rate": 7.519631904500793e-07, + "logits/chosen": -1.6538364887237549, + "logits/rejected": -1.5163317918777466, + "logps/chosen": -851.0865478515625, + "logps/rejected": -1211.658203125, + "loss": 0.3123, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.350710868835449, + "rewards/margins": 3.6207275390625, + "rewards/rejected": -9.971439361572266, + "step": 7040 + }, + { + "epoch": 0.77, + "grad_norm": 5.09375, + "learning_rate": 7.451352417240792e-07, + "logits/chosen": -1.7234725952148438, + "logits/rejected": -1.5533872842788696, + "logps/chosen": -859.8151245117188, + "logps/rejected": -1228.08740234375, + "loss": 0.2857, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.268445014953613, + "rewards/margins": 3.5081348419189453, + "rewards/rejected": -9.776580810546875, + "step": 7050 + }, + { + "epoch": 0.77, + "grad_norm": 9.0625, + "learning_rate": 7.383330009910275e-07, + "logits/chosen": -1.74725341796875, + "logits/rejected": -1.556362271308899, + "logps/chosen": -838.7161865234375, + "logps/rejected": -1240.0087890625, + "loss": 0.274, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.789851188659668, + "rewards/margins": 4.1738715171813965, + "rewards/rejected": -9.963723182678223, + "step": 7060 + }, + { + "epoch": 0.78, + "grad_norm": 20.125, + "learning_rate": 7.315565679007416e-07, + "logits/chosen": -1.676500916481018, + "logits/rejected": -1.690222144126892, + "logps/chosen": -833.3739013671875, + "logps/rejected": -1239.5394287109375, + "loss": 0.2855, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.904994964599609, + "rewards/margins": 3.8461384773254395, + "rewards/rejected": -9.751134872436523, + "step": 7070 + }, + { + "epoch": 0.78, + "grad_norm": 6.3125, + "learning_rate": 7.248060417249728e-07, + "logits/chosen": -1.7067276239395142, + "logits/rejected": -1.5767923593521118, + "logps/chosen": -891.3411254882812, + "logps/rejected": -1246.4296875, + "loss": 0.3126, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.274333953857422, + "rewards/margins": 3.9644057750701904, + "rewards/rejected": -10.238740921020508, + "step": 7080 + }, + { + "epoch": 0.78, + "grad_norm": 18.75, + "learning_rate": 7.180815213559436e-07, + "logits/chosen": -1.7147737741470337, + "logits/rejected": -1.4879252910614014, + "logps/chosen": -881.2738037109375, + "logps/rejected": -1279.04638671875, + "loss": 0.3535, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.539353370666504, + "rewards/margins": 4.110602378845215, + "rewards/rejected": -10.649957656860352, + "step": 7090 + }, + { + "epoch": 0.78, + "grad_norm": 15.0625, + "learning_rate": 7.113831053049064e-07, + "logits/chosen": -1.7741687297821045, + "logits/rejected": -1.5063207149505615, + "logps/chosen": -861.1339721679688, + "logps/rejected": -1189.671630859375, + "loss": 0.3221, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.920536041259766, + "rewards/margins": 3.793320417404175, + "rewards/rejected": -9.713855743408203, + "step": 7100 + }, + { + "epoch": 0.78, + "grad_norm": 4.8125, + "learning_rate": 7.047108917006937e-07, + "logits/chosen": -1.792576551437378, + "logits/rejected": -1.594225287437439, + "logps/chosen": -840.9654541015625, + "logps/rejected": -1065.2393798828125, + "loss": 0.2601, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.833611011505127, + "rewards/margins": 2.7121076583862305, + "rewards/rejected": -8.545719146728516, + "step": 7110 + }, + { + "epoch": 0.78, + "grad_norm": 4.375, + "learning_rate": 6.980649782882868e-07, + "logits/chosen": -1.7369096279144287, + "logits/rejected": -1.504765272140503, + "logps/chosen": -882.0428466796875, + "logps/rejected": -1222.780517578125, + "loss": 0.2693, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.466701507568359, + "rewards/margins": 3.7631821632385254, + "rewards/rejected": -10.229884147644043, + "step": 7120 + }, + { + "epoch": 0.78, + "grad_norm": 9.4375, + "learning_rate": 6.914454624273776e-07, + "logits/chosen": -1.7207447290420532, + "logits/rejected": -1.4872944355010986, + "logps/chosen": -889.8289184570312, + "logps/rejected": -1204.4718017578125, + "loss": 0.3492, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.271018028259277, + "rewards/margins": 3.8208961486816406, + "rewards/rejected": -10.091914176940918, + "step": 7130 + }, + { + "epoch": 0.78, + "grad_norm": 10.6875, + "learning_rate": 6.84852441090948e-07, + "logits/chosen": -1.7410838603973389, + "logits/rejected": -1.5534770488739014, + "logps/chosen": -859.7381591796875, + "logps/rejected": -1203.47900390625, + "loss": 0.3647, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.149327278137207, + "rewards/margins": 3.8373286724090576, + "rewards/rejected": -9.986655235290527, + "step": 7140 + }, + { + "epoch": 0.78, + "grad_norm": 5.25, + "learning_rate": 6.78286010863847e-07, + "logits/chosen": -1.8207283020019531, + "logits/rejected": -1.6441676616668701, + "logps/chosen": -854.4300537109375, + "logps/rejected": -1163.5126953125, + "loss": 0.3115, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.045530796051025, + "rewards/margins": 3.4978995323181152, + "rewards/rejected": -9.543431282043457, + "step": 7150 + }, + { + "epoch": 0.78, + "grad_norm": 17.25, + "learning_rate": 6.71746267941373e-07, + "logits/chosen": -1.7627538442611694, + "logits/rejected": -1.6272588968276978, + "logps/chosen": -848.9931640625, + "logps/rejected": -1124.865478515625, + "loss": 0.3555, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.84621000289917, + "rewards/margins": 3.135558843612671, + "rewards/rejected": -8.981768608093262, + "step": 7160 + }, + { + "epoch": 0.79, + "grad_norm": 15.5625, + "learning_rate": 6.652333081278695e-07, + "logits/chosen": -1.7236335277557373, + "logits/rejected": -1.6223046779632568, + "logps/chosen": -852.3353271484375, + "logps/rejected": -1292.3487548828125, + "loss": 0.2388, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.423079490661621, + "rewards/margins": 4.434754371643066, + "rewards/rejected": -10.857833862304688, + "step": 7170 + }, + { + "epoch": 0.79, + "grad_norm": 5.875, + "learning_rate": 6.587472268353187e-07, + "logits/chosen": -1.7483717203140259, + "logits/rejected": -1.642332673072815, + "logps/chosen": -902.3590698242188, + "logps/rejected": -1259.443603515625, + "loss": 0.3225, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.439301490783691, + "rewards/margins": 3.6546664237976074, + "rewards/rejected": -10.09396743774414, + "step": 7180 + }, + { + "epoch": 0.79, + "grad_norm": 4.3125, + "learning_rate": 6.522881190819452e-07, + "logits/chosen": -1.7664976119995117, + "logits/rejected": -1.6344228982925415, + "logps/chosen": -891.5355224609375, + "logps/rejected": -1263.79931640625, + "loss": 0.2637, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.201501369476318, + "rewards/margins": 4.120504379272461, + "rewards/rejected": -10.322006225585938, + "step": 7190 + }, + { + "epoch": 0.79, + "grad_norm": 6.1875, + "learning_rate": 6.458560794908206e-07, + "logits/chosen": -1.7002456188201904, + "logits/rejected": -1.6243194341659546, + "logps/chosen": -894.4249877929688, + "logps/rejected": -1269.411376953125, + "loss": 0.2255, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.3139262199401855, + "rewards/margins": 4.004617691040039, + "rewards/rejected": -10.318544387817383, + "step": 7200 + }, + { + "epoch": 0.79, + "grad_norm": 21.0, + "learning_rate": 6.394512022884825e-07, + "logits/chosen": -1.660723090171814, + "logits/rejected": -1.5254361629486084, + "logps/chosen": -881.8076171875, + "logps/rejected": -1254.076904296875, + "loss": 0.3214, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.587075710296631, + "rewards/margins": 3.848062515258789, + "rewards/rejected": -10.435137748718262, + "step": 7210 + }, + { + "epoch": 0.79, + "grad_norm": 4.59375, + "learning_rate": 6.330735813035508e-07, + "logits/chosen": -1.7457917928695679, + "logits/rejected": -1.542885422706604, + "logps/chosen": -856.2156372070312, + "logps/rejected": -1260.879638671875, + "loss": 0.2671, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.061094760894775, + "rewards/margins": 4.405296802520752, + "rewards/rejected": -10.466390609741211, + "step": 7220 + }, + { + "epoch": 0.79, + "grad_norm": 25.625, + "learning_rate": 6.267233099653525e-07, + "logits/chosen": -1.8128639459609985, + "logits/rejected": -1.6141884326934814, + "logps/chosen": -845.5862426757812, + "logps/rejected": -1090.631103515625, + "loss": 0.3208, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.950997829437256, + "rewards/margins": 2.8874282836914062, + "rewards/rejected": -8.83842658996582, + "step": 7230 + }, + { + "epoch": 0.79, + "grad_norm": 18.5, + "learning_rate": 6.204004813025569e-07, + "logits/chosen": -1.7146689891815186, + "logits/rejected": -1.5228564739227295, + "logps/chosen": -879.8238525390625, + "logps/rejected": -1134.5081787109375, + "loss": 0.3585, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -6.110958099365234, + "rewards/margins": 2.9403300285339355, + "rewards/rejected": -9.051287651062012, + "step": 7240 + }, + { + "epoch": 0.79, + "grad_norm": 5.25, + "learning_rate": 6.141051879418072e-07, + "logits/chosen": -1.6125646829605103, + "logits/rejected": -1.541886568069458, + "logps/chosen": -862.8126831054688, + "logps/rejected": -1255.688720703125, + "loss": 0.28, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.609311580657959, + "rewards/margins": 3.898472309112549, + "rewards/rejected": -10.507783889770508, + "step": 7250 + }, + { + "epoch": 0.8, + "grad_norm": 18.25, + "learning_rate": 6.078375221063701e-07, + "logits/chosen": -1.7883628606796265, + "logits/rejected": -1.5690773725509644, + "logps/chosen": -860.203125, + "logps/rejected": -1254.3834228515625, + "loss": 0.2509, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.284285545349121, + "rewards/margins": 4.019959926605225, + "rewards/rejected": -10.304245948791504, + "step": 7260 + }, + { + "epoch": 0.8, + "grad_norm": 10.1875, + "learning_rate": 6.015975756147773e-07, + "logits/chosen": -1.7416576147079468, + "logits/rejected": -1.4930078983306885, + "logps/chosen": -864.5078125, + "logps/rejected": -1202.0550537109375, + "loss": 0.2799, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.963892936706543, + "rewards/margins": 3.8358120918273926, + "rewards/rejected": -9.799704551696777, + "step": 7270 + }, + { + "epoch": 0.8, + "grad_norm": 14.5625, + "learning_rate": 5.953854398794887e-07, + "logits/chosen": -1.7058597803115845, + "logits/rejected": -1.4826360940933228, + "logps/chosen": -930.9503784179688, + "logps/rejected": -1231.099365234375, + "loss": 0.3303, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.8695526123046875, + "rewards/margins": 3.4407119750976562, + "rewards/rejected": -10.31026554107666, + "step": 7280 + }, + { + "epoch": 0.8, + "grad_norm": 13.4375, + "learning_rate": 5.892012059055443e-07, + "logits/chosen": -1.722477674484253, + "logits/rejected": -1.584524393081665, + "logps/chosen": -928.5767822265625, + "logps/rejected": -1305.730224609375, + "loss": 0.3057, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.670234680175781, + "rewards/margins": 4.110318183898926, + "rewards/rejected": -10.780552864074707, + "step": 7290 + }, + { + "epoch": 0.8, + "grad_norm": 13.0, + "learning_rate": 5.830449642892394e-07, + "logits/chosen": -1.6535228490829468, + "logits/rejected": -1.4783786535263062, + "logps/chosen": -881.0309448242188, + "logps/rejected": -1291.6666259765625, + "loss": 0.3369, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.464174747467041, + "rewards/margins": 4.342896461486816, + "rewards/rejected": -10.8070707321167, + "step": 7300 + }, + { + "epoch": 0.8, + "grad_norm": 9.75, + "learning_rate": 5.769168052167928e-07, + "logits/chosen": -1.7944562435150146, + "logits/rejected": -1.6461677551269531, + "logps/chosen": -864.3699340820312, + "logps/rejected": -1192.555908203125, + "loss": 0.3409, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.018045902252197, + "rewards/margins": 3.5270333290100098, + "rewards/rejected": -9.545079231262207, + "step": 7310 + }, + { + "epoch": 0.8, + "grad_norm": 10.1875, + "learning_rate": 5.708168184630241e-07, + "logits/chosen": -1.7635242938995361, + "logits/rejected": -1.6745281219482422, + "logps/chosen": -921.7453002929688, + "logps/rejected": -1249.630126953125, + "loss": 0.3346, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.4944353103637695, + "rewards/margins": 3.501138687133789, + "rewards/rejected": -9.995573997497559, + "step": 7320 + }, + { + "epoch": 0.8, + "grad_norm": 18.875, + "learning_rate": 5.647450933900439e-07, + "logits/chosen": -1.783334732055664, + "logits/rejected": -1.5911145210266113, + "logps/chosen": -833.4718017578125, + "logps/rejected": -1209.2564697265625, + "loss": 0.3042, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.029026985168457, + "rewards/margins": 3.8303840160369873, + "rewards/rejected": -9.859411239624023, + "step": 7330 + }, + { + "epoch": 0.8, + "grad_norm": 5.71875, + "learning_rate": 5.587017189459401e-07, + "logits/chosen": -1.626389503479004, + "logits/rejected": -1.4770653247833252, + "logps/chosen": -848.9024658203125, + "logps/rejected": -1214.1951904296875, + "loss": 0.349, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.202887058258057, + "rewards/margins": 3.8618569374084473, + "rewards/rejected": -10.064742088317871, + "step": 7340 + }, + { + "epoch": 0.81, + "grad_norm": 11.3125, + "learning_rate": 5.526867836634775e-07, + "logits/chosen": -1.7388553619384766, + "logits/rejected": -1.4613442420959473, + "logps/chosen": -900.7569580078125, + "logps/rejected": -1297.839599609375, + "loss": 0.2209, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.501623630523682, + "rewards/margins": 4.509965896606445, + "rewards/rejected": -11.011590003967285, + "step": 7350 + }, + { + "epoch": 0.81, + "grad_norm": 6.59375, + "learning_rate": 5.467003756587976e-07, + "logits/chosen": -1.7643015384674072, + "logits/rejected": -1.6337299346923828, + "logps/chosen": -877.3990478515625, + "logps/rejected": -1230.098876953125, + "loss": 0.3009, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.12090539932251, + "rewards/margins": 3.873993396759033, + "rewards/rejected": -9.994897842407227, + "step": 7360 + }, + { + "epoch": 0.81, + "grad_norm": 18.5, + "learning_rate": 5.407425826301321e-07, + "logits/chosen": -1.785988211631775, + "logits/rejected": -1.470233678817749, + "logps/chosen": -906.3011474609375, + "logps/rejected": -1151.123291015625, + "loss": 0.3568, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.426381587982178, + "rewards/margins": 3.073103189468384, + "rewards/rejected": -9.49948501586914, + "step": 7370 + }, + { + "epoch": 0.81, + "grad_norm": 5.6875, + "learning_rate": 5.34813491856516e-07, + "logits/chosen": -1.876861333847046, + "logits/rejected": -1.6604998111724854, + "logps/chosen": -833.1232299804688, + "logps/rejected": -1068.74951171875, + "loss": 0.3481, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.484971523284912, + "rewards/margins": 3.0646376609802246, + "rewards/rejected": -8.549609184265137, + "step": 7380 + }, + { + "epoch": 0.81, + "grad_norm": 9.3125, + "learning_rate": 5.289131901965062e-07, + "logits/chosen": -1.7233154773712158, + "logits/rejected": -1.580396294593811, + "logps/chosen": -828.4129638671875, + "logps/rejected": -1192.831787109375, + "loss": 0.2694, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.812646389007568, + "rewards/margins": 3.988722324371338, + "rewards/rejected": -9.801368713378906, + "step": 7390 + }, + { + "epoch": 0.81, + "grad_norm": 18.25, + "learning_rate": 5.230417640869154e-07, + "logits/chosen": -1.5984258651733398, + "logits/rejected": -1.4450984001159668, + "logps/chosen": -854.2269287109375, + "logps/rejected": -1259.1380615234375, + "loss": 0.2552, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.263187408447266, + "rewards/margins": 4.227612018585205, + "rewards/rejected": -10.490798950195312, + "step": 7400 + }, + { + "epoch": 0.81, + "grad_norm": 19.125, + "learning_rate": 5.171992995415398e-07, + "logits/chosen": -1.7642548084259033, + "logits/rejected": -1.6240993738174438, + "logps/chosen": -890.0231323242188, + "logps/rejected": -1244.30810546875, + "loss": 0.3886, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.3501057624816895, + "rewards/margins": 3.7915267944335938, + "rewards/rejected": -10.141632080078125, + "step": 7410 + }, + { + "epoch": 0.81, + "grad_norm": 10.625, + "learning_rate": 5.113858821499018e-07, + "logits/chosen": -1.6966578960418701, + "logits/rejected": -1.5958200693130493, + "logps/chosen": -865.1398315429688, + "logps/rejected": -1239.331787109375, + "loss": 0.251, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.2037811279296875, + "rewards/margins": 4.0393853187561035, + "rewards/rejected": -10.243165969848633, + "step": 7420 + }, + { + "epoch": 0.81, + "grad_norm": 11.6875, + "learning_rate": 5.056015970759967e-07, + "logits/chosen": -1.6901695728302002, + "logits/rejected": -1.566306471824646, + "logps/chosen": -898.8585205078125, + "logps/rejected": -1279.3475341796875, + "loss": 0.2495, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.751455783843994, + "rewards/margins": 4.020721912384033, + "rewards/rejected": -10.772176742553711, + "step": 7430 + }, + { + "epoch": 0.82, + "grad_norm": 5.28125, + "learning_rate": 4.998465290570445e-07, + "logits/chosen": -1.7205312252044678, + "logits/rejected": -1.5678558349609375, + "logps/chosen": -919.13427734375, + "logps/rejected": -1401.692138671875, + "loss": 0.2407, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.721285343170166, + "rewards/margins": 4.9993672370910645, + "rewards/rejected": -11.720651626586914, + "step": 7440 + }, + { + "epoch": 0.82, + "grad_norm": 6.96875, + "learning_rate": 4.941207624022465e-07, + "logits/chosen": -1.7078001499176025, + "logits/rejected": -1.5286405086517334, + "logps/chosen": -843.447265625, + "logps/rejected": -1174.151123046875, + "loss": 0.4277, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.121189594268799, + "rewards/margins": 3.7856547832489014, + "rewards/rejected": -9.906843185424805, + "step": 7450 + }, + { + "epoch": 0.82, + "grad_norm": 4.0, + "learning_rate": 4.884243809915535e-07, + "logits/chosen": -1.8341057300567627, + "logits/rejected": -1.6554839611053467, + "logps/chosen": -910.3515625, + "logps/rejected": -1216.1929931640625, + "loss": 0.415, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.3772687911987305, + "rewards/margins": 3.601078510284424, + "rewards/rejected": -9.978346824645996, + "step": 7460 + }, + { + "epoch": 0.82, + "grad_norm": 11.9375, + "learning_rate": 4.827574682744354e-07, + "logits/chosen": -1.7621341943740845, + "logits/rejected": -1.5860872268676758, + "logps/chosen": -849.1729736328125, + "logps/rejected": -1212.963623046875, + "loss": 0.3066, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.894879341125488, + "rewards/margins": 3.7651658058166504, + "rewards/rejected": -9.66004467010498, + "step": 7470 + }, + { + "epoch": 0.82, + "grad_norm": 6.53125, + "learning_rate": 4.771201072686576e-07, + "logits/chosen": -1.6475419998168945, + "logits/rejected": -1.628507375717163, + "logps/chosen": -896.3299560546875, + "logps/rejected": -1262.529052734375, + "loss": 0.2798, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.091849327087402, + "rewards/margins": 3.8673205375671387, + "rewards/rejected": -9.9591703414917, + "step": 7480 + }, + { + "epoch": 0.82, + "grad_norm": 11.6875, + "learning_rate": 4.715123805590671e-07, + "logits/chosen": -1.7955982685089111, + "logits/rejected": -1.6593544483184814, + "logps/chosen": -930.4306640625, + "logps/rejected": -1221.9071044921875, + "loss": 0.3502, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.474607944488525, + "rewards/margins": 3.2586116790771484, + "rewards/rejected": -9.733220100402832, + "step": 7490 + }, + { + "epoch": 0.82, + "grad_norm": 9.125, + "learning_rate": 4.6593437029638145e-07, + "logits/chosen": -1.7222249507904053, + "logits/rejected": -1.6309341192245483, + "logps/chosen": -897.2501831054688, + "logps/rejected": -1291.535888671875, + "loss": 0.2653, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.353585720062256, + "rewards/margins": 3.8765571117401123, + "rewards/rejected": -10.230142593383789, + "step": 7500 + }, + { + "epoch": 0.82, + "grad_norm": 18.125, + "learning_rate": 4.6038615819598537e-07, + "logits/chosen": -1.69767165184021, + "logits/rejected": -1.5328662395477295, + "logps/chosen": -877.96875, + "logps/rejected": -1169.4010009765625, + "loss": 0.3024, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.244325160980225, + "rewards/margins": 3.464245557785034, + "rewards/rejected": -9.70857048034668, + "step": 7510 + }, + { + "epoch": 0.82, + "grad_norm": 9.0, + "learning_rate": 4.5486782553673253e-07, + "logits/chosen": -1.707326889038086, + "logits/rejected": -1.5242290496826172, + "logps/chosen": -839.7559814453125, + "logps/rejected": -1154.3704833984375, + "loss": 0.2966, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.840934753417969, + "rewards/margins": 3.6412436962127686, + "rewards/rejected": -9.482179641723633, + "step": 7520 + }, + { + "epoch": 0.83, + "grad_norm": 10.75, + "learning_rate": 4.4937945315975755e-07, + "logits/chosen": -1.786686897277832, + "logits/rejected": -1.6044752597808838, + "logps/chosen": -882.7330932617188, + "logps/rejected": -1253.3262939453125, + "loss": 0.3575, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.368939399719238, + "rewards/margins": 3.842020034790039, + "rewards/rejected": -10.210960388183594, + "step": 7530 + }, + { + "epoch": 0.83, + "grad_norm": 7.78125, + "learning_rate": 4.4392112146729004e-07, + "logits/chosen": -1.8403841257095337, + "logits/rejected": -1.5574525594711304, + "logps/chosen": -867.54931640625, + "logps/rejected": -1171.932861328125, + "loss": 0.2632, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.748641014099121, + "rewards/margins": 3.634506940841675, + "rewards/rejected": -9.383148193359375, + "step": 7540 + }, + { + "epoch": 0.83, + "grad_norm": 9.5, + "learning_rate": 4.3849291042147526e-07, + "logits/chosen": -1.7866106033325195, + "logits/rejected": -1.7302137613296509, + "logps/chosen": -797.72802734375, + "logps/rejected": -1120.34765625, + "loss": 0.3801, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.607116222381592, + "rewards/margins": 3.372204542160034, + "rewards/rejected": -8.979320526123047, + "step": 7550 + }, + { + "epoch": 0.83, + "grad_norm": 4.84375, + "learning_rate": 4.3309489954320663e-07, + "logits/chosen": -1.7582762241363525, + "logits/rejected": -1.6715036630630493, + "logps/chosen": -836.2733154296875, + "logps/rejected": -1193.603759765625, + "loss": 0.2599, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.045174598693848, + "rewards/margins": 3.8334827423095703, + "rewards/rejected": -9.878657341003418, + "step": 7560 + }, + { + "epoch": 0.83, + "grad_norm": 21.0, + "learning_rate": 4.2772716791095654e-07, + "logits/chosen": -1.801546335220337, + "logits/rejected": -1.5964499711990356, + "logps/chosen": -897.0732421875, + "logps/rejected": -1234.9854736328125, + "loss": 0.3324, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.4745001792907715, + "rewards/margins": 3.7684922218322754, + "rewards/rejected": -10.242993354797363, + "step": 7570 + }, + { + "epoch": 0.83, + "grad_norm": 4.875, + "learning_rate": 4.2238979415962003e-07, + "logits/chosen": -1.7250404357910156, + "logits/rejected": -1.599498987197876, + "logps/chosen": -852.4415893554688, + "logps/rejected": -1294.011474609375, + "loss": 0.2536, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.324819564819336, + "rewards/margins": 4.217616081237793, + "rewards/rejected": -10.542435646057129, + "step": 7580 + }, + { + "epoch": 0.83, + "grad_norm": 16.0, + "learning_rate": 4.170828564793636e-07, + "logits/chosen": -1.752338171005249, + "logits/rejected": -1.6183712482452393, + "logps/chosen": -912.9306640625, + "logps/rejected": -1252.3656005859375, + "loss": 0.3023, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.85143518447876, + "rewards/margins": 3.432034969329834, + "rewards/rejected": -10.283470153808594, + "step": 7590 + }, + { + "epoch": 0.83, + "grad_norm": 24.625, + "learning_rate": 4.118064326144791e-07, + "logits/chosen": -1.705780029296875, + "logits/rejected": -1.5567110776901245, + "logps/chosen": -860.8953857421875, + "logps/rejected": -1174.3311767578125, + "loss": 0.3728, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.2867817878723145, + "rewards/margins": 3.4596657752990723, + "rewards/rejected": -9.746447563171387, + "step": 7600 + }, + { + "epoch": 0.83, + "grad_norm": 5.46875, + "learning_rate": 4.065605998622421e-07, + "logits/chosen": -1.7140576839447021, + "logits/rejected": -1.6214179992675781, + "logps/chosen": -878.6866455078125, + "logps/rejected": -1217.408447265625, + "loss": 0.2556, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.069504737854004, + "rewards/margins": 3.8884830474853516, + "rewards/rejected": -9.957986831665039, + "step": 7610 + }, + { + "epoch": 0.84, + "grad_norm": 9.875, + "learning_rate": 4.0134543507178407e-07, + "logits/chosen": -1.7090330123901367, + "logits/rejected": -1.5843477249145508, + "logps/chosen": -864.51708984375, + "logps/rejected": -1293.622314453125, + "loss": 0.271, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.462640285491943, + "rewards/margins": 4.236240863800049, + "rewards/rejected": -10.698881149291992, + "step": 7620 + }, + { + "epoch": 0.84, + "grad_norm": 25.625, + "learning_rate": 3.9616101464296423e-07, + "logits/chosen": -1.6794599294662476, + "logits/rejected": -1.5829523801803589, + "logps/chosen": -892.05810546875, + "logps/rejected": -1241.007568359375, + "loss": 0.2633, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.504463195800781, + "rewards/margins": 3.840977430343628, + "rewards/rejected": -10.345439910888672, + "step": 7630 + }, + { + "epoch": 0.84, + "grad_norm": 15.9375, + "learning_rate": 3.910074145252499e-07, + "logits/chosen": -1.6919753551483154, + "logits/rejected": -1.5030763149261475, + "logps/chosen": -917.7779541015625, + "logps/rejected": -1318.099853515625, + "loss": 0.2948, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.6831817626953125, + "rewards/margins": 4.50520658493042, + "rewards/rejected": -11.188387870788574, + "step": 7640 + }, + { + "epoch": 0.84, + "grad_norm": 11.6875, + "learning_rate": 3.8588471021660385e-07, + "logits/chosen": -1.749140739440918, + "logits/rejected": -1.6207001209259033, + "logps/chosen": -889.9952392578125, + "logps/rejected": -1233.945556640625, + "loss": 0.2901, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.423489570617676, + "rewards/margins": 3.7349534034729004, + "rewards/rejected": -10.158441543579102, + "step": 7650 + }, + { + "epoch": 0.84, + "grad_norm": 20.125, + "learning_rate": 3.8079297676238027e-07, + "logits/chosen": -1.8519413471221924, + "logits/rejected": -1.709180235862732, + "logps/chosen": -889.0953979492188, + "logps/rejected": -1154.34814453125, + "loss": 0.4062, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.117581844329834, + "rewards/margins": 3.006014585494995, + "rewards/rejected": -9.12359619140625, + "step": 7660 + }, + { + "epoch": 0.84, + "grad_norm": 12.9375, + "learning_rate": 3.7573228875422363e-07, + "logits/chosen": -1.8086681365966797, + "logits/rejected": -1.6694767475128174, + "logps/chosen": -890.94482421875, + "logps/rejected": -1278.070556640625, + "loss": 0.2971, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.203786849975586, + "rewards/margins": 4.456539630889893, + "rewards/rejected": -10.66032600402832, + "step": 7670 + }, + { + "epoch": 0.84, + "grad_norm": 11.375, + "learning_rate": 3.7070272032897534e-07, + "logits/chosen": -1.6744012832641602, + "logits/rejected": -1.5800849199295044, + "logps/chosen": -885.0576171875, + "logps/rejected": -1221.6343994140625, + "loss": 0.3048, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.356607437133789, + "rewards/margins": 3.651845932006836, + "rewards/rejected": -10.008453369140625, + "step": 7680 + }, + { + "epoch": 0.84, + "grad_norm": 12.0625, + "learning_rate": 3.6570434516758985e-07, + "logits/chosen": -1.6506437063217163, + "logits/rejected": -1.517558217048645, + "logps/chosen": -827.3564453125, + "logps/rejected": -1216.013427734375, + "loss": 0.2536, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.802563190460205, + "rewards/margins": 4.133772373199463, + "rewards/rejected": -9.936335563659668, + "step": 7690 + }, + { + "epoch": 0.84, + "grad_norm": 8.5625, + "learning_rate": 3.607372364940542e-07, + "logits/chosen": -1.7288329601287842, + "logits/rejected": -1.6743700504302979, + "logps/chosen": -897.0811767578125, + "logps/rejected": -1315.781005859375, + "loss": 0.3454, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.499758720397949, + "rewards/margins": 4.186190128326416, + "rewards/rejected": -10.685949325561523, + "step": 7700 + }, + { + "epoch": 0.85, + "grad_norm": 23.75, + "learning_rate": 3.558014670743143e-07, + "logits/chosen": -1.6753463745117188, + "logits/rejected": -1.5366500616073608, + "logps/chosen": -927.1793212890625, + "logps/rejected": -1327.123046875, + "loss": 0.3906, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.827184200286865, + "rewards/margins": 4.379610061645508, + "rewards/rejected": -11.206794738769531, + "step": 7710 + }, + { + "epoch": 0.85, + "grad_norm": 20.5, + "learning_rate": 3.5089710921520915e-07, + "logits/chosen": -1.809252142906189, + "logits/rejected": -1.6530039310455322, + "logps/chosen": -849.5697021484375, + "logps/rejected": -1175.16015625, + "loss": 0.3878, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.797966957092285, + "rewards/margins": 3.77616810798645, + "rewards/rejected": -9.57413387298584, + "step": 7720 + }, + { + "epoch": 0.85, + "grad_norm": 5.46875, + "learning_rate": 3.4602423476341516e-07, + "logits/chosen": -1.6660228967666626, + "logits/rejected": -1.596799612045288, + "logps/chosen": -885.8453369140625, + "logps/rejected": -1264.83740234375, + "loss": 0.2998, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.3856353759765625, + "rewards/margins": 4.11879301071167, + "rewards/rejected": -10.504427909851074, + "step": 7730 + }, + { + "epoch": 0.85, + "grad_norm": 6.1875, + "learning_rate": 3.4118291510438715e-07, + "logits/chosen": -1.8259109258651733, + "logits/rejected": -1.6981360912322998, + "logps/chosen": -878.1605224609375, + "logps/rejected": -1187.2197265625, + "loss": 0.3091, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.885763645172119, + "rewards/margins": 3.427297592163086, + "rewards/rejected": -9.31306266784668, + "step": 7740 + }, + { + "epoch": 0.85, + "grad_norm": 12.8125, + "learning_rate": 3.363732211613191e-07, + "logits/chosen": -1.675179123878479, + "logits/rejected": -1.4914882183074951, + "logps/chosen": -985.6388549804688, + "logps/rejected": -1389.0338134765625, + "loss": 0.3154, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.28152322769165, + "rewards/margins": 4.4234724044799805, + "rewards/rejected": -11.704996109008789, + "step": 7750 + }, + { + "epoch": 0.85, + "grad_norm": 6.3125, + "learning_rate": 3.3159522339410136e-07, + "logits/chosen": -1.764434814453125, + "logits/rejected": -1.6614373922348022, + "logps/chosen": -865.7063598632812, + "logps/rejected": -1231.4600830078125, + "loss": 0.2615, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.150195121765137, + "rewards/margins": 3.785039186477661, + "rewards/rejected": -9.935232162475586, + "step": 7760 + }, + { + "epoch": 0.85, + "grad_norm": 9.625, + "learning_rate": 3.268489917982881e-07, + "logits/chosen": -1.746270775794983, + "logits/rejected": -1.548775315284729, + "logps/chosen": -912.9557495117188, + "logps/rejected": -1234.2415771484375, + "loss": 0.2973, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.389470100402832, + "rewards/margins": 3.7607929706573486, + "rewards/rejected": -10.150262832641602, + "step": 7770 + }, + { + "epoch": 0.85, + "grad_norm": 13.3125, + "learning_rate": 3.221345959040742e-07, + "logits/chosen": -1.7302089929580688, + "logits/rejected": -1.55826735496521, + "logps/chosen": -872.9494018554688, + "logps/rejected": -1229.177734375, + "loss": 0.3363, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.178586006164551, + "rewards/margins": 3.7981743812561035, + "rewards/rejected": -9.976759910583496, + "step": 7780 + }, + { + "epoch": 0.85, + "grad_norm": 8.8125, + "learning_rate": 3.17452104775276e-07, + "logits/chosen": -1.7663154602050781, + "logits/rejected": -1.7209417819976807, + "logps/chosen": -844.4359130859375, + "logps/rejected": -1182.0413818359375, + "loss": 0.3074, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.759959697723389, + "rewards/margins": 3.583317518234253, + "rewards/rejected": -9.343277931213379, + "step": 7790 + }, + { + "epoch": 0.86, + "grad_norm": 7.09375, + "learning_rate": 3.128015870083187e-07, + "logits/chosen": -1.7534910440444946, + "logits/rejected": -1.5289908647537231, + "logps/chosen": -854.9951171875, + "logps/rejected": -1184.4505615234375, + "loss": 0.3368, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.345522880554199, + "rewards/margins": 3.5583224296569824, + "rewards/rejected": -9.903844833374023, + "step": 7800 + }, + { + "epoch": 0.86, + "grad_norm": 31.25, + "learning_rate": 3.081831107312308e-07, + "logits/chosen": -1.6859992742538452, + "logits/rejected": -1.5666173696517944, + "logps/chosen": -873.9437255859375, + "logps/rejected": -1248.6187744140625, + "loss": 0.2809, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.370795726776123, + "rewards/margins": 4.2174787521362305, + "rewards/rejected": -10.588274955749512, + "step": 7810 + }, + { + "epoch": 0.86, + "grad_norm": 7.65625, + "learning_rate": 3.035967436026485e-07, + "logits/chosen": -1.7774591445922852, + "logits/rejected": -1.649836778640747, + "logps/chosen": -786.0259399414062, + "logps/rejected": -1140.343017578125, + "loss": 0.2813, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.513943672180176, + "rewards/margins": 3.829460620880127, + "rewards/rejected": -9.343404769897461, + "step": 7820 + }, + { + "epoch": 0.86, + "grad_norm": 6.0, + "learning_rate": 2.990425528108237e-07, + "logits/chosen": -1.7569217681884766, + "logits/rejected": -1.6845314502716064, + "logps/chosen": -843.4617309570312, + "logps/rejected": -1145.234130859375, + "loss": 0.2431, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.905159950256348, + "rewards/margins": 3.378962993621826, + "rewards/rejected": -9.284123420715332, + "step": 7830 + }, + { + "epoch": 0.86, + "grad_norm": 11.875, + "learning_rate": 2.945206050726371e-07, + "logits/chosen": -1.888319730758667, + "logits/rejected": -1.6877343654632568, + "logps/chosen": -891.9373168945312, + "logps/rejected": -1159.262939453125, + "loss": 0.4229, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -6.2926435470581055, + "rewards/margins": 3.153834581375122, + "rewards/rejected": -9.446477890014648, + "step": 7840 + }, + { + "epoch": 0.86, + "grad_norm": 8.75, + "learning_rate": 2.900309666326248e-07, + "logits/chosen": -1.790503740310669, + "logits/rejected": -1.5645792484283447, + "logps/chosen": -858.7918701171875, + "logps/rejected": -1215.578857421875, + "loss": 0.2891, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.0547614097595215, + "rewards/margins": 3.7089340686798096, + "rewards/rejected": -9.763696670532227, + "step": 7850 + }, + { + "epoch": 0.86, + "grad_norm": 3.421875, + "learning_rate": 2.855737032620043e-07, + "logits/chosen": -1.7688602209091187, + "logits/rejected": -1.613595724105835, + "logps/chosen": -815.7376098632812, + "logps/rejected": -1069.392333984375, + "loss": 0.3175, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.67942476272583, + "rewards/margins": 3.0152783393859863, + "rewards/rejected": -8.694704055786133, + "step": 7860 + }, + { + "epoch": 0.86, + "grad_norm": 16.0, + "learning_rate": 2.8114888025771503e-07, + "logits/chosen": -1.7481235265731812, + "logits/rejected": -1.5339552164077759, + "logps/chosen": -858.4660034179688, + "logps/rejected": -1279.783203125, + "loss": 0.2798, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.02412223815918, + "rewards/margins": 4.391145706176758, + "rewards/rejected": -10.415266990661621, + "step": 7870 + }, + { + "epoch": 0.86, + "grad_norm": 10.5625, + "learning_rate": 2.7675656244145644e-07, + "logits/chosen": -1.7800451517105103, + "logits/rejected": -1.619799256324768, + "logps/chosen": -884.7041015625, + "logps/rejected": -1207.646484375, + "loss": 0.2655, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.253584861755371, + "rewards/margins": 3.450160264968872, + "rewards/rejected": -9.70374584197998, + "step": 7880 + }, + { + "epoch": 0.87, + "grad_norm": 7.21875, + "learning_rate": 2.723968141587438e-07, + "logits/chosen": -1.6956205368041992, + "logits/rejected": -1.474829077720642, + "logps/chosen": -892.4122314453125, + "logps/rejected": -1240.1959228515625, + "loss": 0.2689, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.371469974517822, + "rewards/margins": 3.8045947551727295, + "rewards/rejected": -10.176065444946289, + "step": 7890 + }, + { + "epoch": 0.87, + "grad_norm": 11.75, + "learning_rate": 2.680696992779616e-07, + "logits/chosen": -1.8051077127456665, + "logits/rejected": -1.6118837594985962, + "logps/chosen": -855.6483154296875, + "logps/rejected": -1234.9158935546875, + "loss": 0.3719, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.082594394683838, + "rewards/margins": 3.94830322265625, + "rewards/rejected": -10.03089714050293, + "step": 7900 + }, + { + "epoch": 0.87, + "grad_norm": 16.125, + "learning_rate": 2.637752811894304e-07, + "logits/chosen": -1.770971655845642, + "logits/rejected": -1.5673869848251343, + "logps/chosen": -787.9400634765625, + "logps/rejected": -1066.273193359375, + "loss": 0.2567, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.11167049407959, + "rewards/margins": 3.5057969093322754, + "rewards/rejected": -8.617467880249023, + "step": 7910 + }, + { + "epoch": 0.87, + "grad_norm": 8.625, + "learning_rate": 2.595136228044773e-07, + "logits/chosen": -1.8858044147491455, + "logits/rejected": -1.6247947216033936, + "logps/chosen": -874.8401489257812, + "logps/rejected": -1120.9029541015625, + "loss": 0.2496, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.799472332000732, + "rewards/margins": 3.1634693145751953, + "rewards/rejected": -8.962942123413086, + "step": 7920 + }, + { + "epoch": 0.87, + "grad_norm": 5.375, + "learning_rate": 2.552847865545122e-07, + "logits/chosen": -1.7997881174087524, + "logits/rejected": -1.6337566375732422, + "logps/chosen": -899.0103759765625, + "logps/rejected": -1264.661865234375, + "loss": 0.3405, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.257597923278809, + "rewards/margins": 3.764179229736328, + "rewards/rejected": -10.021777153015137, + "step": 7930 + }, + { + "epoch": 0.87, + "grad_norm": 19.0, + "learning_rate": 2.51088834390118e-07, + "logits/chosen": -1.7815898656845093, + "logits/rejected": -1.4949190616607666, + "logps/chosen": -964.5501098632812, + "logps/rejected": -1292.2457275390625, + "loss": 0.3465, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.983921051025391, + "rewards/margins": 3.533292770385742, + "rewards/rejected": -10.517213821411133, + "step": 7940 + }, + { + "epoch": 0.87, + "grad_norm": 20.0, + "learning_rate": 2.469258277801384e-07, + "logits/chosen": -1.6875766515731812, + "logits/rejected": -1.4591151475906372, + "logps/chosen": -972.9832763671875, + "logps/rejected": -1370.740478515625, + "loss": 0.3216, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -7.336121559143066, + "rewards/margins": 3.924180269241333, + "rewards/rejected": -11.260300636291504, + "step": 7950 + }, + { + "epoch": 0.87, + "grad_norm": 19.75, + "learning_rate": 2.4279582771078085e-07, + "logits/chosen": -1.7713184356689453, + "logits/rejected": -1.592571496963501, + "logps/chosen": -853.7166137695312, + "logps/rejected": -1269.4556884765625, + "loss": 0.3175, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.178139686584473, + "rewards/margins": 4.15941047668457, + "rewards/rejected": -10.337550163269043, + "step": 7960 + }, + { + "epoch": 0.87, + "grad_norm": 10.6875, + "learning_rate": 2.3869889468471936e-07, + "logits/chosen": -1.6376993656158447, + "logits/rejected": -1.4584661722183228, + "logps/chosen": -941.0533447265625, + "logps/rejected": -1372.649169921875, + "loss": 0.3151, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -7.0856475830078125, + "rewards/margins": 4.373940467834473, + "rewards/rejected": -11.459589004516602, + "step": 7970 + }, + { + "epoch": 0.87, + "grad_norm": 6.90625, + "learning_rate": 2.3463508872021257e-07, + "logits/chosen": -1.7968809604644775, + "logits/rejected": -1.6161655187606812, + "logps/chosen": -831.0450439453125, + "logps/rejected": -1068.81005859375, + "loss": 0.3814, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.642725944519043, + "rewards/margins": 2.8636467456817627, + "rewards/rejected": -8.506372451782227, + "step": 7980 + }, + { + "epoch": 0.88, + "grad_norm": 10.8125, + "learning_rate": 2.3060446935022128e-07, + "logits/chosen": -1.6785504817962646, + "logits/rejected": -1.6747753620147705, + "logps/chosen": -870.033203125, + "logps/rejected": -1159.7274169921875, + "loss": 0.3636, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.041116237640381, + "rewards/margins": 3.014256000518799, + "rewards/rejected": -9.05537223815918, + "step": 7990 + }, + { + "epoch": 0.88, + "grad_norm": 10.375, + "learning_rate": 2.2660709562153715e-07, + "logits/chosen": -1.7694013118743896, + "logits/rejected": -1.6744987964630127, + "logps/chosen": -848.1533203125, + "logps/rejected": -1249.2984619140625, + "loss": 0.2911, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.074583053588867, + "rewards/margins": 4.102524280548096, + "rewards/rejected": -10.177106857299805, + "step": 8000 + }, + { + "epoch": 0.88, + "grad_norm": 7.25, + "learning_rate": 2.2264302609391857e-07, + "logits/chosen": -1.8036963939666748, + "logits/rejected": -1.6646007299423218, + "logps/chosen": -835.9616088867188, + "logps/rejected": -1177.264892578125, + "loss": 0.3362, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.917352199554443, + "rewards/margins": 3.543867588043213, + "rewards/rejected": -9.46121883392334, + "step": 8010 + }, + { + "epoch": 0.88, + "grad_norm": 20.0, + "learning_rate": 2.1871231883923121e-07, + "logits/chosen": -1.751940131187439, + "logits/rejected": -1.6008822917938232, + "logps/chosen": -882.1159057617188, + "logps/rejected": -1178.8541259765625, + "loss": 0.314, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.385968208312988, + "rewards/margins": 3.3863720893859863, + "rewards/rejected": -9.772339820861816, + "step": 8020 + }, + { + "epoch": 0.88, + "grad_norm": 11.8125, + "learning_rate": 2.1481503144059945e-07, + "logits/chosen": -1.669878602027893, + "logits/rejected": -1.5334699153900146, + "logps/chosen": -898.9529418945312, + "logps/rejected": -1219.326416015625, + "loss": 0.3, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.713472843170166, + "rewards/margins": 3.4800522327423096, + "rewards/rejected": -10.193525314331055, + "step": 8030 + }, + { + "epoch": 0.88, + "grad_norm": 12.0, + "learning_rate": 2.109512209915601e-07, + "logits/chosen": -1.7940858602523804, + "logits/rejected": -1.599538803100586, + "logps/chosen": -814.0867919921875, + "logps/rejected": -1151.7001953125, + "loss": 0.4027, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.870598316192627, + "rewards/margins": 3.350947856903076, + "rewards/rejected": -9.221546173095703, + "step": 8040 + }, + { + "epoch": 0.88, + "grad_norm": 23.875, + "learning_rate": 2.0712094409522903e-07, + "logits/chosen": -1.726629614830017, + "logits/rejected": -1.5041388273239136, + "logps/chosen": -835.77587890625, + "logps/rejected": -1150.446533203125, + "loss": 0.3215, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.7710418701171875, + "rewards/margins": 3.6039910316467285, + "rewards/rejected": -9.375032424926758, + "step": 8050 + }, + { + "epoch": 0.88, + "grad_norm": 8.875, + "learning_rate": 2.0332425686346857e-07, + "logits/chosen": -1.7781845331192017, + "logits/rejected": -1.70371413230896, + "logps/chosen": -920.7423706054688, + "logps/rejected": -1301.527099609375, + "loss": 0.2929, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.6630353927612305, + "rewards/margins": 3.8167953491210938, + "rewards/rejected": -10.479830741882324, + "step": 8060 + }, + { + "epoch": 0.88, + "grad_norm": 7.46875, + "learning_rate": 1.995612149160689e-07, + "logits/chosen": -1.6686102151870728, + "logits/rejected": -1.3945950269699097, + "logps/chosen": -840.4537963867188, + "logps/rejected": -1196.80078125, + "loss": 0.2298, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.082284450531006, + "rewards/margins": 4.004199981689453, + "rewards/rejected": -10.0864839553833, + "step": 8070 + }, + { + "epoch": 0.89, + "grad_norm": 16.375, + "learning_rate": 1.958318733799311e-07, + "logits/chosen": -1.6705032587051392, + "logits/rejected": -1.520719289779663, + "logps/chosen": -977.9481201171875, + "logps/rejected": -1328.1171875, + "loss": 0.324, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.053180694580078, + "rewards/margins": 3.8459064960479736, + "rewards/rejected": -10.899085998535156, + "step": 8080 + }, + { + "epoch": 0.89, + "grad_norm": 8.375, + "learning_rate": 1.9213628688825952e-07, + "logits/chosen": -1.5538527965545654, + "logits/rejected": -1.555267095565796, + "logps/chosen": -908.5615234375, + "logps/rejected": -1275.070556640625, + "loss": 0.2884, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.881302833557129, + "rewards/margins": 3.827199935913086, + "rewards/rejected": -10.708501815795898, + "step": 8090 + }, + { + "epoch": 0.89, + "grad_norm": 8.3125, + "learning_rate": 1.8847450957976326e-07, + "logits/chosen": -1.7549211978912354, + "logits/rejected": -1.711284875869751, + "logps/chosen": -824.0286254882812, + "logps/rejected": -1155.532958984375, + "loss": 0.2896, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.689845085144043, + "rewards/margins": 3.4961960315704346, + "rewards/rejected": -9.186039924621582, + "step": 8100 + }, + { + "epoch": 0.89, + "grad_norm": 5.71875, + "learning_rate": 1.8484659509786063e-07, + "logits/chosen": -1.7344443798065186, + "logits/rejected": -1.6067701578140259, + "logps/chosen": -853.2337646484375, + "logps/rejected": -1220.155029296875, + "loss": 0.2736, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.966510772705078, + "rewards/margins": 3.8081135749816895, + "rewards/rejected": -9.774622917175293, + "step": 8110 + }, + { + "epoch": 0.89, + "grad_norm": 5.15625, + "learning_rate": 1.8125259658989564e-07, + "logits/chosen": -1.7662878036499023, + "logits/rejected": -1.5873587131500244, + "logps/chosen": -840.8899536132812, + "logps/rejected": -1244.8267822265625, + "loss": 0.2856, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.167403221130371, + "rewards/margins": 4.161871433258057, + "rewards/rejected": -10.329275131225586, + "step": 8120 + }, + { + "epoch": 0.89, + "grad_norm": 11.125, + "learning_rate": 1.776925667063567e-07, + "logits/chosen": -1.6066818237304688, + "logits/rejected": -1.55959951877594, + "logps/chosen": -913.4326171875, + "logps/rejected": -1377.5582275390625, + "loss": 0.3393, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.9011945724487305, + "rewards/margins": 4.665724277496338, + "rewards/rejected": -11.56691837310791, + "step": 8130 + }, + { + "epoch": 0.89, + "grad_norm": 16.875, + "learning_rate": 1.7416655760010864e-07, + "logits/chosen": -1.7098257541656494, + "logits/rejected": -1.6590735912322998, + "logps/chosen": -832.9755859375, + "logps/rejected": -1148.1285400390625, + "loss": 0.3379, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.869045734405518, + "rewards/margins": 3.2598578929901123, + "rewards/rejected": -9.12890338897705, + "step": 8140 + }, + { + "epoch": 0.89, + "grad_norm": 4.6875, + "learning_rate": 1.7067462092562602e-07, + "logits/chosen": -1.7314354181289673, + "logits/rejected": -1.5623210668563843, + "logps/chosen": -870.1740112304688, + "logps/rejected": -1205.736083984375, + "loss": 0.3203, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.256227016448975, + "rewards/margins": 3.703942060470581, + "rewards/rejected": -9.960168838500977, + "step": 8150 + }, + { + "epoch": 0.89, + "grad_norm": 2.703125, + "learning_rate": 1.672168078382372e-07, + "logits/chosen": -1.7182223796844482, + "logits/rejected": -1.6501004695892334, + "logps/chosen": -840.8328857421875, + "logps/rejected": -1232.8226318359375, + "loss": 0.2812, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.062809944152832, + "rewards/margins": 4.007100582122803, + "rewards/rejected": -10.069910049438477, + "step": 8160 + }, + { + "epoch": 0.9, + "grad_norm": 11.25, + "learning_rate": 1.6379316899337583e-07, + "logits/chosen": -1.799210548400879, + "logits/rejected": -1.5335028171539307, + "logps/chosen": -800.4320068359375, + "logps/rejected": -1156.6783447265625, + "loss": 0.3116, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.830155372619629, + "rewards/margins": 3.653149127960205, + "rewards/rejected": -9.483304977416992, + "step": 8170 + }, + { + "epoch": 0.9, + "grad_norm": 10.1875, + "learning_rate": 1.60403754545837e-07, + "logits/chosen": -1.8233129978179932, + "logits/rejected": -1.454587459564209, + "logps/chosen": -934.2824096679688, + "logps/rejected": -1299.5472412109375, + "loss": 0.2505, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.649183750152588, + "rewards/margins": 4.442193031311035, + "rewards/rejected": -11.091377258300781, + "step": 8180 + }, + { + "epoch": 0.9, + "grad_norm": 10.625, + "learning_rate": 1.570486141490443e-07, + "logits/chosen": -1.8078670501708984, + "logits/rejected": -1.6148269176483154, + "logps/chosen": -835.0123291015625, + "logps/rejected": -1198.2021484375, + "loss": 0.3394, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.811249256134033, + "rewards/margins": 3.9912467002868652, + "rewards/rejected": -9.802495002746582, + "step": 8190 + }, + { + "epoch": 0.9, + "grad_norm": 20.125, + "learning_rate": 1.5372779695432104e-07, + "logits/chosen": -1.7451341152191162, + "logits/rejected": -1.5011942386627197, + "logps/chosen": -902.0758666992188, + "logps/rejected": -1163.143798828125, + "loss": 0.3428, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.007842540740967, + "rewards/margins": 3.027606248855591, + "rewards/rejected": -9.035449028015137, + "step": 8200 + }, + { + "epoch": 0.9, + "grad_norm": 10.625, + "learning_rate": 1.5044135161017159e-07, + "logits/chosen": -1.778619408607483, + "logits/rejected": -1.615190863609314, + "logps/chosen": -919.9606323242188, + "logps/rejected": -1274.1259765625, + "loss": 0.264, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.442839622497559, + "rewards/margins": 3.9409873485565186, + "rewards/rejected": -10.38382625579834, + "step": 8210 + }, + { + "epoch": 0.9, + "grad_norm": 11.0, + "learning_rate": 1.4718932626156674e-07, + "logits/chosen": -1.6741091012954712, + "logits/rejected": -1.6174064874649048, + "logps/chosen": -898.1390380859375, + "logps/rejected": -1307.5364990234375, + "loss": 0.275, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.5224409103393555, + "rewards/margins": 4.26509428024292, + "rewards/rejected": -10.78753662109375, + "step": 8220 + }, + { + "epoch": 0.9, + "grad_norm": 9.0625, + "learning_rate": 1.4397176854924088e-07, + "logits/chosen": -1.7280805110931396, + "logits/rejected": -1.400599718093872, + "logps/chosen": -824.90283203125, + "logps/rejected": -1194.1768798828125, + "loss": 0.2792, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.942082405090332, + "rewards/margins": 4.179556846618652, + "rewards/rejected": -10.121639251708984, + "step": 8230 + }, + { + "epoch": 0.9, + "grad_norm": 10.875, + "learning_rate": 1.4078872560899238e-07, + "logits/chosen": -1.7294423580169678, + "logits/rejected": -1.6640352010726929, + "logps/chosen": -874.5509643554688, + "logps/rejected": -1249.1871337890625, + "loss": 0.3401, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.3454694747924805, + "rewards/margins": 3.9045634269714355, + "rewards/rejected": -10.250032424926758, + "step": 8240 + }, + { + "epoch": 0.9, + "grad_norm": 18.625, + "learning_rate": 1.3764024407099202e-07, + "logits/chosen": -1.7650864124298096, + "logits/rejected": -1.6377605199813843, + "logps/chosen": -853.1730346679688, + "logps/rejected": -1142.5782470703125, + "loss": 0.3546, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -6.12570333480835, + "rewards/margins": 2.988436222076416, + "rewards/rejected": -9.114139556884766, + "step": 8250 + }, + { + "epoch": 0.91, + "grad_norm": 14.125, + "learning_rate": 1.3452637005910346e-07, + "logits/chosen": -1.7427947521209717, + "logits/rejected": -1.5696041584014893, + "logps/chosen": -869.3728637695312, + "logps/rejected": -1187.717529296875, + "loss": 0.3838, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.139901638031006, + "rewards/margins": 3.552396297454834, + "rewards/rejected": -9.692298889160156, + "step": 8260 + }, + { + "epoch": 0.91, + "grad_norm": 15.4375, + "learning_rate": 1.3144714919020446e-07, + "logits/chosen": -1.7864854335784912, + "logits/rejected": -1.6193021535873413, + "logps/chosen": -933.5149536132812, + "logps/rejected": -1164.5860595703125, + "loss": 0.4327, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.707808017730713, + "rewards/margins": 2.7715964317321777, + "rewards/rejected": -9.47940444946289, + "step": 8270 + }, + { + "epoch": 0.91, + "grad_norm": 7.3125, + "learning_rate": 1.2840262657351943e-07, + "logits/chosen": -1.7463548183441162, + "logits/rejected": -1.5344195365905762, + "logps/chosen": -900.3997802734375, + "logps/rejected": -1360.7066650390625, + "loss": 0.321, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.662268161773682, + "rewards/margins": 4.563970565795898, + "rewards/rejected": -11.226239204406738, + "step": 8280 + }, + { + "epoch": 0.91, + "grad_norm": 16.0, + "learning_rate": 1.2539284680995828e-07, + "logits/chosen": -1.7757755517959595, + "logits/rejected": -1.5259753465652466, + "logps/chosen": -890.5540161132812, + "logps/rejected": -1277.96826171875, + "loss": 0.3466, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.300421237945557, + "rewards/margins": 4.339416980743408, + "rewards/rejected": -10.639837265014648, + "step": 8290 + }, + { + "epoch": 0.91, + "grad_norm": 6.96875, + "learning_rate": 1.2241785399146445e-07, + "logits/chosen": -1.8160043954849243, + "logits/rejected": -1.560031533241272, + "logps/chosen": -835.5760498046875, + "logps/rejected": -1217.8272705078125, + "loss": 0.2919, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.840697765350342, + "rewards/margins": 4.279562473297119, + "rewards/rejected": -10.120260238647461, + "step": 8300 + }, + { + "epoch": 0.91, + "grad_norm": 11.25, + "learning_rate": 1.1947769170036816e-07, + "logits/chosen": -1.7607462406158447, + "logits/rejected": -1.6372970342636108, + "logps/chosen": -857.7052612304688, + "logps/rejected": -1227.233154296875, + "loss": 0.3601, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.307439804077148, + "rewards/margins": 4.00991153717041, + "rewards/rejected": -10.317351341247559, + "step": 8310 + }, + { + "epoch": 0.91, + "grad_norm": 13.125, + "learning_rate": 1.1657240300874617e-07, + "logits/chosen": -1.661383032798767, + "logits/rejected": -1.5546385049819946, + "logps/chosen": -797.6553955078125, + "logps/rejected": -1129.082275390625, + "loss": 0.2895, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.5190863609313965, + "rewards/margins": 3.2949841022491455, + "rewards/rejected": -8.814070701599121, + "step": 8320 + }, + { + "epoch": 0.91, + "grad_norm": 26.625, + "learning_rate": 1.1370203047779444e-07, + "logits/chosen": -1.6538612842559814, + "logits/rejected": -1.5534578561782837, + "logps/chosen": -945.4832763671875, + "logps/rejected": -1345.5697021484375, + "loss": 0.3418, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.983366966247559, + "rewards/margins": 4.081393241882324, + "rewards/rejected": -11.064760208129883, + "step": 8330 + }, + { + "epoch": 0.91, + "grad_norm": 10.625, + "learning_rate": 1.1086661615720085e-07, + "logits/chosen": -1.743573546409607, + "logits/rejected": -1.5641024112701416, + "logps/chosen": -922.2161865234375, + "logps/rejected": -1315.272705078125, + "loss": 0.3336, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.565232753753662, + "rewards/margins": 4.487412452697754, + "rewards/rejected": -11.052644729614258, + "step": 8340 + }, + { + "epoch": 0.92, + "grad_norm": 5.5, + "learning_rate": 1.080662015845324e-07, + "logits/chosen": -1.8400115966796875, + "logits/rejected": -1.6913303136825562, + "logps/chosen": -840.02880859375, + "logps/rejected": -1065.344970703125, + "loss": 0.3381, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.888175010681152, + "rewards/margins": 2.558291435241699, + "rewards/rejected": -8.446466445922852, + "step": 8350 + }, + { + "epoch": 0.92, + "grad_norm": 23.625, + "learning_rate": 1.0530082778462403e-07, + "logits/chosen": -1.7687995433807373, + "logits/rejected": -1.600886344909668, + "logps/chosen": -854.3064575195312, + "logps/rejected": -1164.97216796875, + "loss": 0.2667, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.020127296447754, + "rewards/margins": 3.382431745529175, + "rewards/rejected": -9.402558326721191, + "step": 8360 + }, + { + "epoch": 0.92, + "grad_norm": 18.625, + "learning_rate": 1.0257053526898014e-07, + "logits/chosen": -1.7117531299591064, + "logits/rejected": -1.5225669145584106, + "logps/chosen": -922.4918823242188, + "logps/rejected": -1265.3609619140625, + "loss": 0.3194, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.966604709625244, + "rewards/margins": 3.5312399864196777, + "rewards/rejected": -10.497845649719238, + "step": 8370 + }, + { + "epoch": 0.92, + "grad_norm": 10.25, + "learning_rate": 9.98753640351785e-08, + "logits/chosen": -1.7740246057510376, + "logits/rejected": -1.613180160522461, + "logps/chosen": -934.2409057617188, + "logps/rejected": -1386.287353515625, + "loss": 0.3292, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.729189872741699, + "rewards/margins": 4.512759208679199, + "rewards/rejected": -11.241950988769531, + "step": 8380 + }, + { + "epoch": 0.92, + "grad_norm": 22.875, + "learning_rate": 9.721535356628647e-08, + "logits/chosen": -1.696153998374939, + "logits/rejected": -1.4395227432250977, + "logps/chosen": -943.3603515625, + "logps/rejected": -1311.7186279296875, + "loss": 0.2247, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.838650703430176, + "rewards/margins": 4.299423694610596, + "rewards/rejected": -11.13807487487793, + "step": 8390 + }, + { + "epoch": 0.92, + "grad_norm": 10.875, + "learning_rate": 9.45905428302818e-08, + "logits/chosen": -1.6262012720108032, + "logits/rejected": -1.5233070850372314, + "logps/chosen": -875.2687377929688, + "logps/rejected": -1282.696044921875, + "loss": 0.2979, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.545266151428223, + "rewards/margins": 4.346266269683838, + "rewards/rejected": -10.891532897949219, + "step": 8400 + }, + { + "epoch": 0.92, + "grad_norm": 7.34375, + "learning_rate": 9.20009702794808e-08, + "logits/chosen": -1.7878459692001343, + "logits/rejected": -1.5736232995986938, + "logps/chosen": -872.7595825195312, + "logps/rejected": -1151.0841064453125, + "loss": 0.4009, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.106377601623535, + "rewards/margins": 3.363679885864258, + "rewards/rejected": -9.470057487487793, + "step": 8410 + }, + { + "epoch": 0.92, + "grad_norm": 8.125, + "learning_rate": 8.944667384997718e-08, + "logits/chosen": -1.7182190418243408, + "logits/rejected": -1.5366103649139404, + "logps/chosen": -851.119140625, + "logps/rejected": -1210.4205322265625, + "loss": 0.3275, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.222013473510742, + "rewards/margins": 3.645902633666992, + "rewards/rejected": -9.867916107177734, + "step": 8420 + }, + { + "epoch": 0.92, + "grad_norm": 5.78125, + "learning_rate": 8.692769096108417e-08, + "logits/chosen": -1.7388851642608643, + "logits/rejected": -1.6992442607879639, + "logps/chosen": -869.1339111328125, + "logps/rejected": -1209.418212890625, + "loss": 0.3368, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.214917182922363, + "rewards/margins": 3.501326084136963, + "rewards/rejected": -9.716242790222168, + "step": 8430 + }, + { + "epoch": 0.93, + "grad_norm": 13.6875, + "learning_rate": 8.444405851478792e-08, + "logits/chosen": -1.7211973667144775, + "logits/rejected": -1.6156280040740967, + "logps/chosen": -846.5421752929688, + "logps/rejected": -1145.1234130859375, + "loss": 0.3658, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.011951446533203, + "rewards/margins": 3.24515962600708, + "rewards/rejected": -9.257111549377441, + "step": 8440 + }, + { + "epoch": 0.93, + "grad_norm": 9.75, + "learning_rate": 8.199581289520531e-08, + "logits/chosen": -1.7406879663467407, + "logits/rejected": -1.6221586465835571, + "logps/chosen": -910.1179809570312, + "logps/rejected": -1326.797607421875, + "loss": 0.2453, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.542128562927246, + "rewards/margins": 4.120578289031982, + "rewards/rejected": -10.662707328796387, + "step": 8450 + }, + { + "epoch": 0.93, + "grad_norm": 16.625, + "learning_rate": 7.958298996805252e-08, + "logits/chosen": -1.630231261253357, + "logits/rejected": -1.5238778591156006, + "logps/chosen": -874.5634765625, + "logps/rejected": -1285.994384765625, + "loss": 0.3154, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.423235893249512, + "rewards/margins": 4.102036952972412, + "rewards/rejected": -10.525274276733398, + "step": 8460 + }, + { + "epoch": 0.93, + "grad_norm": 5.9375, + "learning_rate": 7.720562508011952e-08, + "logits/chosen": -1.7538082599639893, + "logits/rejected": -1.6801809072494507, + "logps/chosen": -833.3396606445312, + "logps/rejected": -1164.5806884765625, + "loss": 0.2693, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.955268859863281, + "rewards/margins": 3.645991086959839, + "rewards/rejected": -9.601259231567383, + "step": 8470 + }, + { + "epoch": 0.93, + "grad_norm": 5.03125, + "learning_rate": 7.486375305874983e-08, + "logits/chosen": -1.7145363092422485, + "logits/rejected": -1.5746572017669678, + "logps/chosen": -838.48486328125, + "logps/rejected": -1151.9617919921875, + "loss": 0.2852, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.829164981842041, + "rewards/margins": 3.462231397628784, + "rewards/rejected": -9.29139518737793, + "step": 8480 + }, + { + "epoch": 0.93, + "grad_norm": 4.40625, + "learning_rate": 7.255740821133422e-08, + "logits/chosen": -1.7553069591522217, + "logits/rejected": -1.5316652059555054, + "logps/chosen": -928.4365234375, + "logps/rejected": -1251.0413818359375, + "loss": 0.2833, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.6341552734375, + "rewards/margins": 3.6069247722625732, + "rewards/rejected": -10.241080284118652, + "step": 8490 + }, + { + "epoch": 0.93, + "grad_norm": 12.125, + "learning_rate": 7.028662432480488e-08, + "logits/chosen": -1.744205117225647, + "logits/rejected": -1.5588257312774658, + "logps/chosen": -976.7257080078125, + "logps/rejected": -1328.8199462890625, + "loss": 0.3202, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.0989274978637695, + "rewards/margins": 3.9402706623077393, + "rewards/rejected": -11.039196968078613, + "step": 8500 + }, + { + "epoch": 0.93, + "grad_norm": 21.375, + "learning_rate": 6.805143466514147e-08, + "logits/chosen": -1.6923844814300537, + "logits/rejected": -1.5863544940948486, + "logps/chosen": -901.4617309570312, + "logps/rejected": -1182.4007568359375, + "loss": 0.3568, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.320597171783447, + "rewards/margins": 3.4521725177764893, + "rewards/rejected": -9.772769927978516, + "step": 8510 + }, + { + "epoch": 0.93, + "grad_norm": 12.4375, + "learning_rate": 6.58518719768847e-08, + "logits/chosen": -1.8096717596054077, + "logits/rejected": -1.5730687379837036, + "logps/chosen": -877.3757934570312, + "logps/rejected": -1273.2822265625, + "loss": 0.2494, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.122791767120361, + "rewards/margins": 4.229358196258545, + "rewards/rejected": -10.352149963378906, + "step": 8520 + }, + { + "epoch": 0.94, + "grad_norm": 19.125, + "learning_rate": 6.368796848265607e-08, + "logits/chosen": -1.7305965423583984, + "logits/rejected": -1.5812265872955322, + "logps/chosen": -859.87060546875, + "logps/rejected": -1240.608642578125, + "loss": 0.2324, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.233240604400635, + "rewards/margins": 4.026539325714111, + "rewards/rejected": -10.25977897644043, + "step": 8530 + }, + { + "epoch": 0.94, + "grad_norm": 3.171875, + "learning_rate": 6.155975588268442e-08, + "logits/chosen": -1.7094066143035889, + "logits/rejected": -1.5719234943389893, + "logps/chosen": -909.7531127929688, + "logps/rejected": -1278.919189453125, + "loss": 0.3085, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.607012748718262, + "rewards/margins": 3.689466953277588, + "rewards/rejected": -10.296480178833008, + "step": 8540 + }, + { + "epoch": 0.94, + "grad_norm": 5.0625, + "learning_rate": 5.946726535434405e-08, + "logits/chosen": -1.7609121799468994, + "logits/rejected": -1.614426851272583, + "logps/chosen": -881.8375244140625, + "logps/rejected": -1283.181396484375, + "loss": 0.2425, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.505706787109375, + "rewards/margins": 4.399263381958008, + "rewards/rejected": -10.904970169067383, + "step": 8550 + }, + { + "epoch": 0.94, + "grad_norm": 11.375, + "learning_rate": 5.7410527551696185e-08, + "logits/chosen": -1.6927732229232788, + "logits/rejected": -1.5905221700668335, + "logps/chosen": -889.5582885742188, + "logps/rejected": -1238.0943603515625, + "loss": 0.309, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.32875919342041, + "rewards/margins": 3.465683698654175, + "rewards/rejected": -9.794443130493164, + "step": 8560 + }, + { + "epoch": 0.94, + "grad_norm": 10.875, + "learning_rate": 5.5389572605039907e-08, + "logits/chosen": -1.8454198837280273, + "logits/rejected": -1.5477443933486938, + "logps/chosen": -898.9234619140625, + "logps/rejected": -1335.352294921875, + "loss": 0.2536, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.292233467102051, + "rewards/margins": 4.961319923400879, + "rewards/rejected": -11.25355339050293, + "step": 8570 + }, + { + "epoch": 0.94, + "grad_norm": 11.375, + "learning_rate": 5.340443012047253e-08, + "logits/chosen": -1.7861913442611694, + "logits/rejected": -1.5166192054748535, + "logps/chosen": -956.1566162109375, + "logps/rejected": -1269.5057373046875, + "loss": 0.375, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.935385704040527, + "rewards/margins": 3.6592369079589844, + "rewards/rejected": -10.594622611999512, + "step": 8580 + }, + { + "epoch": 0.94, + "grad_norm": 5.375, + "learning_rate": 5.1455129179453225e-08, + "logits/chosen": -1.747261643409729, + "logits/rejected": -1.5503513813018799, + "logps/chosen": -869.4830322265625, + "logps/rejected": -1201.221923828125, + "loss": 0.2257, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.184055805206299, + "rewards/margins": 3.931041717529297, + "rewards/rejected": -10.11509895324707, + "step": 8590 + }, + { + "epoch": 0.94, + "grad_norm": 6.6875, + "learning_rate": 4.9541698338379793e-08, + "logits/chosen": -1.696441411972046, + "logits/rejected": -1.5529413223266602, + "logps/chosen": -859.9136962890625, + "logps/rejected": -1285.3963623046875, + "loss": 0.2125, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.4006757736206055, + "rewards/margins": 4.483840465545654, + "rewards/rejected": -10.884517669677734, + "step": 8600 + }, + { + "epoch": 0.94, + "grad_norm": 6.6875, + "learning_rate": 4.766416562816761e-08, + "logits/chosen": -1.6991993188858032, + "logits/rejected": -1.5058152675628662, + "logps/chosen": -884.54443359375, + "logps/rejected": -1273.7708740234375, + "loss": 0.2337, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.4322662353515625, + "rewards/margins": 4.321031093597412, + "rewards/rejected": -10.753296852111816, + "step": 8610 + }, + { + "epoch": 0.95, + "grad_norm": 6.6875, + "learning_rate": 4.5822558553841046e-08, + "logits/chosen": -1.7637078762054443, + "logits/rejected": -1.717961072921753, + "logps/chosen": -821.3655395507812, + "logps/rejected": -1141.134033203125, + "loss": 0.3064, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.945315837860107, + "rewards/margins": 3.126410961151123, + "rewards/rejected": -9.071725845336914, + "step": 8620 + }, + { + "epoch": 0.95, + "grad_norm": 10.0625, + "learning_rate": 4.401690409412962e-08, + "logits/chosen": -1.7791696786880493, + "logits/rejected": -1.4950644969940186, + "logps/chosen": -859.9221801757812, + "logps/rejected": -1173.716552734375, + "loss": 0.2718, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.231206893920898, + "rewards/margins": 3.5634307861328125, + "rewards/rejected": -9.794637680053711, + "step": 8630 + }, + { + "epoch": 0.95, + "grad_norm": 12.9375, + "learning_rate": 4.224722870107334e-08, + "logits/chosen": -1.7482883930206299, + "logits/rejected": -1.5676014423370361, + "logps/chosen": -884.0187377929688, + "logps/rejected": -1209.926513671875, + "loss": 0.2652, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.280738830566406, + "rewards/margins": 3.5966503620147705, + "rewards/rejected": -9.877388954162598, + "step": 8640 + }, + { + "epoch": 0.95, + "grad_norm": 21.5, + "learning_rate": 4.051355829963355e-08, + "logits/chosen": -1.6613428592681885, + "logits/rejected": -1.4597740173339844, + "logps/chosen": -868.7833862304688, + "logps/rejected": -1239.80859375, + "loss": 0.2912, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.3964056968688965, + "rewards/margins": 4.130435466766357, + "rewards/rejected": -10.526841163635254, + "step": 8650 + }, + { + "epoch": 0.95, + "grad_norm": 6.84375, + "learning_rate": 3.881591828731601e-08, + "logits/chosen": -1.750144600868225, + "logits/rejected": -1.577070951461792, + "logps/chosen": -843.1873779296875, + "logps/rejected": -1152.3223876953125, + "loss": 0.3013, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.964517593383789, + "rewards/margins": 3.456860065460205, + "rewards/rejected": -9.421377182006836, + "step": 8660 + }, + { + "epoch": 0.95, + "grad_norm": 8.4375, + "learning_rate": 3.715433353379622e-08, + "logits/chosen": -1.8859994411468506, + "logits/rejected": -1.5886578559875488, + "logps/chosen": -836.3206176757812, + "logps/rejected": -1184.9434814453125, + "loss": 0.2582, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.761916160583496, + "rewards/margins": 3.6475181579589844, + "rewards/rejected": -9.409435272216797, + "step": 8670 + }, + { + "epoch": 0.95, + "grad_norm": 6.96875, + "learning_rate": 3.5528828380556046e-08, + "logits/chosen": -1.7788301706314087, + "logits/rejected": -1.5392310619354248, + "logps/chosen": -853.8429565429688, + "logps/rejected": -1238.690673828125, + "loss": 0.2833, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.0929741859436035, + "rewards/margins": 4.391332626342773, + "rewards/rejected": -10.484306335449219, + "step": 8680 + }, + { + "epoch": 0.95, + "grad_norm": 7.25, + "learning_rate": 3.3939426640527685e-08, + "logits/chosen": -1.7998135089874268, + "logits/rejected": -1.700740098953247, + "logps/chosen": -871.15576171875, + "logps/rejected": -1199.4805908203125, + "loss": 0.3191, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.332724094390869, + "rewards/margins": 3.5880367755889893, + "rewards/rejected": -9.920762062072754, + "step": 8690 + }, + { + "epoch": 0.95, + "grad_norm": 13.75, + "learning_rate": 3.2386151597743886e-08, + "logits/chosen": -1.6843607425689697, + "logits/rejected": -1.5651808977127075, + "logps/chosen": -825.6369018554688, + "logps/rejected": -1188.4197998046875, + "loss": 0.2704, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.117964744567871, + "rewards/margins": 3.754467010498047, + "rewards/rejected": -9.872430801391602, + "step": 8700 + }, + { + "epoch": 0.95, + "grad_norm": 8.375, + "learning_rate": 3.0869026006997424e-08, + "logits/chosen": -1.7384942770004272, + "logits/rejected": -1.5310828685760498, + "logps/chosen": -897.5598754882812, + "logps/rejected": -1171.763671875, + "loss": 0.327, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.161147117614746, + "rewards/margins": 3.2217929363250732, + "rewards/rejected": -9.382940292358398, + "step": 8710 + }, + { + "epoch": 0.96, + "grad_norm": 7.65625, + "learning_rate": 2.938807209350747e-08, + "logits/chosen": -1.6134307384490967, + "logits/rejected": -1.6143958568572998, + "logps/chosen": -846.0046997070312, + "logps/rejected": -1244.562255859375, + "loss": 0.3512, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.1300764083862305, + "rewards/margins": 4.035763740539551, + "rewards/rejected": -10.165840148925781, + "step": 8720 + }, + { + "epoch": 0.96, + "grad_norm": 5.9375, + "learning_rate": 2.7943311552594277e-08, + "logits/chosen": -1.7048689126968384, + "logits/rejected": -1.5456026792526245, + "logps/chosen": -862.3226318359375, + "logps/rejected": -1150.2349853515625, + "loss": 0.2979, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.1456732749938965, + "rewards/margins": 3.3211357593536377, + "rewards/rejected": -9.466809272766113, + "step": 8730 + }, + { + "epoch": 0.96, + "grad_norm": 2.734375, + "learning_rate": 2.653476554936113e-08, + "logits/chosen": -1.807687759399414, + "logits/rejected": -1.5694173574447632, + "logps/chosen": -845.2843627929688, + "logps/rejected": -1142.07080078125, + "loss": 0.3356, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.715015888214111, + "rewards/margins": 3.636295795440674, + "rewards/rejected": -9.351311683654785, + "step": 8740 + }, + { + "epoch": 0.96, + "grad_norm": 12.4375, + "learning_rate": 2.516245471838402e-08, + "logits/chosen": -1.714991807937622, + "logits/rejected": -1.5783560276031494, + "logps/chosen": -910.5891723632812, + "logps/rejected": -1301.325439453125, + "loss": 0.234, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.659640312194824, + "rewards/margins": 4.164636611938477, + "rewards/rejected": -10.824277877807617, + "step": 8750 + }, + { + "epoch": 0.96, + "grad_norm": 18.75, + "learning_rate": 2.382639916341051e-08, + "logits/chosen": -1.7989475727081299, + "logits/rejected": -1.5774462223052979, + "logps/chosen": -859.6707153320312, + "logps/rejected": -1245.830322265625, + "loss": 0.3096, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.176312446594238, + "rewards/margins": 4.091503143310547, + "rewards/rejected": -10.267816543579102, + "step": 8760 + }, + { + "epoch": 0.96, + "grad_norm": 11.8125, + "learning_rate": 2.2526618457063552e-08, + "logits/chosen": -1.739678978919983, + "logits/rejected": -1.621240258216858, + "logps/chosen": -853.2716674804688, + "logps/rejected": -1183.2579345703125, + "loss": 0.3345, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.820164680480957, + "rewards/margins": 3.5776379108428955, + "rewards/rejected": -9.397802352905273, + "step": 8770 + }, + { + "epoch": 0.96, + "grad_norm": 13.5625, + "learning_rate": 2.1263131640555655e-08, + "logits/chosen": -1.7273060083389282, + "logits/rejected": -1.734060287475586, + "logps/chosen": -830.8797607421875, + "logps/rejected": -1117.0892333984375, + "loss": 0.3449, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.878612041473389, + "rewards/margins": 2.9985110759735107, + "rewards/rejected": -8.877123832702637, + "step": 8780 + }, + { + "epoch": 0.96, + "grad_norm": 8.125, + "learning_rate": 2.0035957223410728e-08, + "logits/chosen": -1.6437499523162842, + "logits/rejected": -1.4688756465911865, + "logps/chosen": -908.91162109375, + "logps/rejected": -1246.2274169921875, + "loss": 0.3075, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.609283447265625, + "rewards/margins": 3.7445359230041504, + "rewards/rejected": -10.353818893432617, + "step": 8790 + }, + { + "epoch": 0.96, + "grad_norm": 20.25, + "learning_rate": 1.884511318319099e-08, + "logits/chosen": -1.713685393333435, + "logits/rejected": -1.5836366415023804, + "logps/chosen": -855.9851684570312, + "logps/rejected": -1244.0166015625, + "loss": 0.3068, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.028940200805664, + "rewards/margins": 3.9285213947296143, + "rewards/rejected": -9.9574613571167, + "step": 8800 + }, + { + "epoch": 0.97, + "grad_norm": 10.4375, + "learning_rate": 1.769061696523522e-08, + "logits/chosen": -1.8203010559082031, + "logits/rejected": -1.6822617053985596, + "logps/chosen": -844.7947387695312, + "logps/rejected": -1132.2039794921875, + "loss": 0.3348, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.922959327697754, + "rewards/margins": 3.06416654586792, + "rewards/rejected": -8.987126350402832, + "step": 8810 + }, + { + "epoch": 0.97, + "grad_norm": 4.75, + "learning_rate": 1.6572485482402866e-08, + "logits/chosen": -1.7305076122283936, + "logits/rejected": -1.5352022647857666, + "logps/chosen": -925.4586791992188, + "logps/rejected": -1248.900634765625, + "loss": 0.2435, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.847769737243652, + "rewards/margins": 3.4829070568084717, + "rewards/rejected": -10.330677032470703, + "step": 8820 + }, + { + "epoch": 0.97, + "grad_norm": 8.1875, + "learning_rate": 1.5490735114825905e-08, + "logits/chosen": -1.7927026748657227, + "logits/rejected": -1.6285203695297241, + "logps/chosen": -810.3460083007812, + "logps/rejected": -1135.664306640625, + "loss": 0.2852, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.547456741333008, + "rewards/margins": 3.5589656829833984, + "rewards/rejected": -9.106423377990723, + "step": 8830 + }, + { + "epoch": 0.97, + "grad_norm": 5.375, + "learning_rate": 1.444538170966875e-08, + "logits/chosen": -1.8000272512435913, + "logits/rejected": -1.630263090133667, + "logps/chosen": -860.5777587890625, + "logps/rejected": -1172.9053955078125, + "loss": 0.3322, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.924122333526611, + "rewards/margins": 3.4490065574645996, + "rewards/rejected": -9.373128890991211, + "step": 8840 + }, + { + "epoch": 0.97, + "grad_norm": 19.75, + "learning_rate": 1.343644058089677e-08, + "logits/chosen": -1.7713226079940796, + "logits/rejected": -1.5916401147842407, + "logps/chosen": -895.43359375, + "logps/rejected": -1194.712890625, + "loss": 0.324, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.143908500671387, + "rewards/margins": 3.621004581451416, + "rewards/rejected": -9.764913558959961, + "step": 8850 + }, + { + "epoch": 0.97, + "grad_norm": 9.5, + "learning_rate": 1.2463926509051761e-08, + "logits/chosen": -1.7831714153289795, + "logits/rejected": -1.606441855430603, + "logps/chosen": -836.9681396484375, + "logps/rejected": -1183.5531005859375, + "loss": 0.3243, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.789910793304443, + "rewards/margins": 3.6431357860565186, + "rewards/rejected": -9.433046340942383, + "step": 8860 + }, + { + "epoch": 0.97, + "grad_norm": 17.75, + "learning_rate": 1.1527853741034601e-08, + "logits/chosen": -1.8333390951156616, + "logits/rejected": -1.7460883855819702, + "logps/chosen": -856.7185668945312, + "logps/rejected": -1247.995849609375, + "loss": 0.3394, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.000288963317871, + "rewards/margins": 4.036570072174072, + "rewards/rejected": -10.036860466003418, + "step": 8870 + }, + { + "epoch": 0.97, + "grad_norm": 3.921875, + "learning_rate": 1.062823598989765e-08, + "logits/chosen": -1.7239465713500977, + "logits/rejected": -1.6083152294158936, + "logps/chosen": -856.9772338867188, + "logps/rejected": -1170.557373046875, + "loss": 0.3311, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.047800064086914, + "rewards/margins": 3.267475128173828, + "rewards/rejected": -9.315275192260742, + "step": 8880 + }, + { + "epoch": 0.97, + "grad_norm": 14.0, + "learning_rate": 9.765086434643523e-09, + "logits/chosen": -1.8090740442276, + "logits/rejected": -1.6163864135742188, + "logps/chosen": -847.0315551757812, + "logps/rejected": -1137.6201171875, + "loss": 0.3615, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -5.919991970062256, + "rewards/margins": 3.3702468872070312, + "rewards/rejected": -9.290239334106445, + "step": 8890 + }, + { + "epoch": 0.98, + "grad_norm": 6.21875, + "learning_rate": 8.938417720031344e-09, + "logits/chosen": -1.8706165552139282, + "logits/rejected": -1.558516263961792, + "logps/chosen": -884.3927001953125, + "logps/rejected": -1200.9991455078125, + "loss": 0.3602, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.243582248687744, + "rewards/margins": 3.560075283050537, + "rewards/rejected": -9.803657531738281, + "step": 8900 + }, + { + "epoch": 0.98, + "grad_norm": 14.4375, + "learning_rate": 8.148241956392189e-09, + "logits/chosen": -1.6413205862045288, + "logits/rejected": -1.5120948553085327, + "logps/chosen": -857.3433837890625, + "logps/rejected": -1257.877685546875, + "loss": 0.3154, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.217789649963379, + "rewards/margins": 4.340211868286133, + "rewards/rejected": -10.558000564575195, + "step": 8910 + }, + { + "epoch": 0.98, + "grad_norm": 13.5, + "learning_rate": 7.394570719452265e-09, + "logits/chosen": -1.7332340478897095, + "logits/rejected": -1.4671766757965088, + "logps/chosen": -909.5938720703125, + "logps/rejected": -1232.474853515625, + "loss": 0.3409, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.609376430511475, + "rewards/margins": 3.6301121711730957, + "rewards/rejected": -10.23948860168457, + "step": 8920 + }, + { + "epoch": 0.98, + "grad_norm": 6.625, + "learning_rate": 6.677415050161395e-09, + "logits/chosen": -1.6812623739242554, + "logits/rejected": -1.535421371459961, + "logps/chosen": -833.1346435546875, + "logps/rejected": -1143.276123046875, + "loss": 0.2799, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.277042865753174, + "rewards/margins": 3.3461127281188965, + "rewards/rejected": -9.623156547546387, + "step": 8930 + }, + { + "epoch": 0.98, + "grad_norm": 16.0, + "learning_rate": 5.9967854545336935e-09, + "logits/chosen": -1.6680654287338257, + "logits/rejected": -1.6262805461883545, + "logps/chosen": -811.8204345703125, + "logps/rejected": -1223.6153564453125, + "loss": 0.2654, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.6745452880859375, + "rewards/margins": 4.048461437225342, + "rewards/rejected": -9.723006248474121, + "step": 8940 + }, + { + "epoch": 0.98, + "grad_norm": 14.625, + "learning_rate": 5.352691903491303e-09, + "logits/chosen": -1.677345633506775, + "logits/rejected": -1.5468947887420654, + "logps/chosen": -860.4957885742188, + "logps/rejected": -1259.432373046875, + "loss": 0.2673, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.164395332336426, + "rewards/margins": 4.450338840484619, + "rewards/rejected": -10.614734649658203, + "step": 8950 + }, + { + "epoch": 0.98, + "grad_norm": 13.0625, + "learning_rate": 4.745143832720345e-09, + "logits/chosen": -1.8141733407974243, + "logits/rejected": -1.655142068862915, + "logps/chosen": -865.24072265625, + "logps/rejected": -1143.602783203125, + "loss": 0.3475, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.9203715324401855, + "rewards/margins": 3.2836527824401855, + "rewards/rejected": -9.204024314880371, + "step": 8960 + }, + { + "epoch": 0.98, + "grad_norm": 6.59375, + "learning_rate": 4.174150142531308e-09, + "logits/chosen": -1.8261626958847046, + "logits/rejected": -1.5636265277862549, + "logps/chosen": -922.0318603515625, + "logps/rejected": -1283.6092529296875, + "loss": 0.2254, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.292181491851807, + "rewards/margins": 3.960326671600342, + "rewards/rejected": -10.252508163452148, + "step": 8970 + }, + { + "epoch": 0.98, + "grad_norm": 9.0625, + "learning_rate": 3.6397191977297073e-09, + "logits/chosen": -1.815691590309143, + "logits/rejected": -1.6554666757583618, + "logps/chosen": -899.6243896484375, + "logps/rejected": -1250.398193359375, + "loss": 0.2859, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.550722599029541, + "rewards/margins": 3.5904955863952637, + "rewards/rejected": -10.141218185424805, + "step": 8980 + }, + { + "epoch": 0.99, + "grad_norm": 5.25, + "learning_rate": 3.141858827492017e-09, + "logits/chosen": -1.7077680826187134, + "logits/rejected": -1.6172462701797485, + "logps/chosen": -849.1980590820312, + "logps/rejected": -1161.189208984375, + "loss": 0.3531, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.898131370544434, + "rewards/margins": 3.3684685230255127, + "rewards/rejected": -9.266599655151367, + "step": 8990 + }, + { + "epoch": 0.99, + "grad_norm": 11.3125, + "learning_rate": 2.6805763252529815e-09, + "logits/chosen": -1.702214002609253, + "logits/rejected": -1.5313847064971924, + "logps/chosen": -901.3880615234375, + "logps/rejected": -1197.772216796875, + "loss": 0.3448, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.388072967529297, + "rewards/margins": 3.217925548553467, + "rewards/rejected": -9.605999946594238, + "step": 9000 + }, + { + "epoch": 0.99, + "grad_norm": 5.75, + "learning_rate": 2.2558784485973707e-09, + "logits/chosen": -1.7994321584701538, + "logits/rejected": -1.6961534023284912, + "logps/chosen": -870.2404174804688, + "logps/rejected": -1212.2100830078125, + "loss": 0.3341, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.827722072601318, + "rewards/margins": 3.980966567993164, + "rewards/rejected": -9.808688163757324, + "step": 9010 + }, + { + "epoch": 0.99, + "grad_norm": 9.0, + "learning_rate": 1.8677714191611683e-09, + "logits/chosen": -1.663535714149475, + "logits/rejected": -1.675575613975525, + "logps/chosen": -892.9075927734375, + "logps/rejected": -1231.8533935546875, + "loss": 0.3238, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.452673435211182, + "rewards/margins": 3.5098235607147217, + "rewards/rejected": -9.962496757507324, + "step": 9020 + }, + { + "epoch": 0.99, + "grad_norm": 5.84375, + "learning_rate": 1.5162609225410906e-09, + "logits/chosen": -1.7724593877792358, + "logits/rejected": -1.6986019611358643, + "logps/chosen": -870.46142578125, + "logps/rejected": -1143.853759765625, + "loss": 0.3162, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.889965534210205, + "rewards/margins": 3.3081562519073486, + "rewards/rejected": -9.198122024536133, + "step": 9030 + }, + { + "epoch": 0.99, + "grad_norm": 22.0, + "learning_rate": 1.20135210820993e-09, + "logits/chosen": -1.693870186805725, + "logits/rejected": -1.6122093200683594, + "logps/chosen": -910.7996826171875, + "logps/rejected": -1292.490478515625, + "loss": 0.2653, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.9459733963012695, + "rewards/margins": 3.8942108154296875, + "rewards/rejected": -10.840185165405273, + "step": 9040 + }, + { + "epoch": 0.99, + "grad_norm": 16.5, + "learning_rate": 9.230495894432811e-10, + "logits/chosen": -1.6869347095489502, + "logits/rejected": -1.5478661060333252, + "logps/chosen": -844.4357299804688, + "logps/rejected": -1264.430419921875, + "loss": 0.2303, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.327622890472412, + "rewards/margins": 4.380006313323975, + "rewards/rejected": -10.707629203796387, + "step": 9050 + }, + { + "epoch": 0.99, + "grad_norm": 15.3125, + "learning_rate": 6.813574432495973e-10, + "logits/chosen": -1.727709174156189, + "logits/rejected": -1.5712103843688965, + "logps/chosen": -843.1636962890625, + "logps/rejected": -1151.973388671875, + "loss": 0.2596, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.890585422515869, + "rewards/margins": 3.4982428550720215, + "rewards/rejected": -9.38882827758789, + "step": 9060 + }, + { + "epoch": 0.99, + "grad_norm": 6.5, + "learning_rate": 4.76279210311903e-10, + "logits/chosen": -1.7272812128067017, + "logits/rejected": -1.5622541904449463, + "logps/chosen": -919.0368041992188, + "logps/rejected": -1366.074951171875, + "loss": 0.238, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.605795860290527, + "rewards/margins": 4.80692195892334, + "rewards/rejected": -11.412717819213867, + "step": 9070 + }, + { + "epoch": 1.0, + "grad_norm": 8.375, + "learning_rate": 3.0781789493616876e-10, + "logits/chosen": -1.7719471454620361, + "logits/rejected": -1.5814306735992432, + "logps/chosen": -817.85986328125, + "logps/rejected": -1173.8828125, + "loss": 0.2836, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.826724052429199, + "rewards/margins": 3.6778602600097656, + "rewards/rejected": -9.504584312438965, + "step": 9080 + }, + { + "epoch": 1.0, + "grad_norm": 11.3125, + "learning_rate": 1.7597596500551483e-10, + "logits/chosen": -1.7802696228027344, + "logits/rejected": -1.7031152248382568, + "logps/chosen": -881.9097900390625, + "logps/rejected": -1236.870849609375, + "loss": 0.2663, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.290283679962158, + "rewards/margins": 3.718712329864502, + "rewards/rejected": -10.008995056152344, + "step": 9090 + }, + { + "epoch": 1.0, + "grad_norm": 27.625, + "learning_rate": 8.075535194634886e-11, + "logits/chosen": -1.8207433223724365, + "logits/rejected": -1.6089866161346436, + "logps/chosen": -851.9786987304688, + "logps/rejected": -1123.9854736328125, + "loss": 0.3983, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.776336669921875, + "rewards/margins": 3.294705629348755, + "rewards/rejected": -9.071043014526367, + "step": 9100 + }, + { + "epoch": 1.0, + "grad_norm": 6.71875, + "learning_rate": 2.2157450698667706e-11, + "logits/chosen": -1.7677398920059204, + "logits/rejected": -1.6829955577850342, + "logps/chosen": -809.0109252929688, + "logps/rejected": -1118.8795166015625, + "loss": 0.2936, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.768967151641846, + "rewards/margins": 3.1985678672790527, + "rewards/rejected": -8.967535018920898, + "step": 9110 + }, + { + "epoch": 1.0, + "grad_norm": 10.4375, + "learning_rate": 1.8311969551820797e-13, + "logits/chosen": -1.6750282049179077, + "logits/rejected": -1.4944937229156494, + "logps/chosen": -890.4017333984375, + "logps/rejected": -1208.764404296875, + "loss": 0.3948, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -6.475982666015625, + "rewards/margins": 3.3494327068328857, + "rewards/rejected": -9.82541561126709, + "step": 9120 + }, { "epoch": 1.0, - "eval_logits/chosen": -2.704888105392456, - "eval_logits/rejected": -2.5702741146087646, - "eval_logps/chosen": -283.2044677734375, - "eval_logps/rejected": -266.81488037109375, - "eval_loss": 0.6710447072982788, - "eval_rewards/accuracies": 0.5899999737739563, - "eval_rewards/chosen": 0.019298046827316284, - "eval_rewards/margins": 0.054997824132442474, - "eval_rewards/rejected": -0.03569978475570679, - "eval_runtime": 792.688, - "eval_samples_per_second": 2.523, - "eval_steps_per_second": 0.315, - "step": 877 + "eval_logits/chosen": -2.0556910037994385, + "eval_logits/rejected": -2.008650302886963, + "eval_logps/chosen": -1017.7702026367188, + "eval_logps/rejected": -1238.5491943359375, + "eval_loss": 0.3553338348865509, + "eval_rewards/accuracies": 0.8223314881324768, + "eval_rewards/chosen": -6.314454555511475, + "eval_rewards/margins": 2.7745566368103027, + "eval_rewards/rejected": -9.089012145996094, + "eval_runtime": 2606.4426, + "eval_samples_per_second": 2.185, + "eval_steps_per_second": 0.137, + "step": 9121 }, { "epoch": 1.0, - "step": 877, + "step": 9121, "total_flos": 0.0, - "train_loss": 0.542221034538243, - "train_runtime": 10885.06, - "train_samples_per_second": 1.289, - "train_steps_per_second": 0.081 + "train_loss": 0.35941912531329934, + "train_runtime": 141935.9761, + "train_samples_per_second": 1.028, + "train_steps_per_second": 0.064 } ], "logging_steps": 10, - "max_steps": 877, + "max_steps": 9121, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, - "train_batch_size": 4, + "train_batch_size": 2, "trial_name": null, "trial_params": null }