{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 1065, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.601457471427555, "learning_rate": 4.672897196261682e-08, "logits/chosen": -2.861618995666504, "logits/rejected": -2.8205904960632324, "logps/chosen": -271.06011962890625, "logps/rejected": -211.1704559326172, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 9.383478018784075, "learning_rate": 4.6728971962616824e-07, "logits/chosen": -2.834562063217163, "logits/rejected": -2.7922489643096924, "logps/chosen": -325.0357360839844, "logps/rejected": -274.966796875, "loss": 0.6931, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.00014581691357307136, "rewards/margins": 0.0001575500500621274, "rewards/margins_max": 0.0024408893659710884, "rewards/margins_min": -0.002742145210504532, "rewards/margins_std": 0.0023130779154598713, "rewards/rejected": -1.173312557511963e-05, "step": 10 }, { "epoch": 0.06, "grad_norm": 1.8412658637892019, "learning_rate": 9.345794392523365e-07, "logits/chosen": -2.7256200313568115, "logits/rejected": -2.707315444946289, "logps/chosen": -293.6407775878906, "logps/rejected": -215.7820281982422, "loss": 0.6922, "rewards/accuracies": 0.75, "rewards/chosen": 0.0018517475109547377, "rewards/margins": 0.0018822858110070229, "rewards/margins_max": 0.005471331533044577, "rewards/margins_min": -0.0010383042972534895, "rewards/margins_std": 0.002963448641821742, "rewards/rejected": -3.053832188015804e-05, "step": 20 }, { "epoch": 0.08, "grad_norm": 2.174968684179302, "learning_rate": 1.4018691588785047e-06, "logits/chosen": -2.8197181224823, "logits/rejected": -2.7506394386291504, "logps/chosen": -302.8995666503906, "logps/rejected": -232.47256469726562, "loss": 0.6888, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.008063090965151787, "rewards/margins": 0.007646501995623112, "rewards/margins_max": 0.015395646914839745, "rewards/margins_min": 0.0007923411321826279, "rewards/margins_std": 0.006716990377753973, "rewards/rejected": 0.0004165889695286751, "step": 30 }, { "epoch": 0.11, "grad_norm": 1.7099389772513702, "learning_rate": 1.869158878504673e-06, "logits/chosen": -2.8403024673461914, "logits/rejected": -2.759880781173706, "logps/chosen": -275.9002380371094, "logps/rejected": -225.5954132080078, "loss": 0.6849, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.01584392786026001, "rewards/margins": 0.014450883492827415, "rewards/margins_max": 0.03173653036355972, "rewards/margins_min": -2.6600435376167297e-05, "rewards/margins_std": 0.014551711268723011, "rewards/rejected": 0.0013930455315858126, "step": 40 }, { "epoch": 0.14, "grad_norm": 2.1338277224043574, "learning_rate": 2.3364485981308413e-06, "logits/chosen": -2.8058629035949707, "logits/rejected": -2.734032154083252, "logps/chosen": -271.67120361328125, "logps/rejected": -233.6707305908203, "loss": 0.6753, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.038989000022411346, "rewards/margins": 0.036923374980688095, "rewards/margins_max": 0.08067025989294052, "rewards/margins_min": 0.006618264131247997, "rewards/margins_std": 0.03399632126092911, "rewards/rejected": 0.0020656271371990442, "step": 50 }, { "epoch": 0.17, "grad_norm": 2.3538977095192313, "learning_rate": 2.8037383177570094e-06, "logits/chosen": -2.739483594894409, "logits/rejected": -2.7014524936676025, "logps/chosen": -306.43206787109375, "logps/rejected": -262.4384460449219, "loss": 0.6619, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.0720754936337471, "rewards/margins": 0.06874484568834305, "rewards/margins_max": 0.12744362652301788, "rewards/margins_min": 0.017528068274259567, "rewards/margins_std": 0.04889371618628502, "rewards/rejected": 0.0033306567929685116, "step": 60 }, { "epoch": 0.2, "grad_norm": 1.673361144474326, "learning_rate": 3.2710280373831774e-06, "logits/chosen": -2.761547565460205, "logits/rejected": -2.701035976409912, "logps/chosen": -312.3368225097656, "logps/rejected": -234.6005401611328, "loss": 0.6461, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.1052437424659729, "rewards/margins": 0.09483315050601959, "rewards/margins_max": 0.19849452376365662, "rewards/margins_min": 0.015507131814956665, "rewards/margins_std": 0.08316393196582794, "rewards/rejected": 0.010410590097308159, "step": 70 }, { "epoch": 0.23, "grad_norm": 1.8350886553726478, "learning_rate": 3.738317757009346e-06, "logits/chosen": -2.7897353172302246, "logits/rejected": -2.7348127365112305, "logps/chosen": -310.0438537597656, "logps/rejected": -290.1259765625, "loss": 0.6264, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11226633936166763, "rewards/margins": 0.14973895251750946, "rewards/margins_max": 0.30203038454055786, "rewards/margins_min": 0.01934988982975483, "rewards/margins_std": 0.13135038316249847, "rewards/rejected": -0.03747261315584183, "step": 80 }, { "epoch": 0.25, "grad_norm": 2.370057132370328, "learning_rate": 4.205607476635514e-06, "logits/chosen": -2.6879115104675293, "logits/rejected": -2.650247812271118, "logps/chosen": -264.0439453125, "logps/rejected": -208.5765380859375, "loss": 0.5913, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11950834840536118, "rewards/margins": 0.21540161967277527, "rewards/margins_max": 0.40502986311912537, "rewards/margins_min": 0.061323970556259155, "rewards/margins_std": 0.15978315472602844, "rewards/rejected": -0.09589327871799469, "step": 90 }, { "epoch": 0.28, "grad_norm": 2.3715260848384814, "learning_rate": 4.6728971962616825e-06, "logits/chosen": -2.6909117698669434, "logits/rejected": -2.6588971614837646, "logps/chosen": -273.89483642578125, "logps/rejected": -280.07440185546875, "loss": 0.5649, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.07516907155513763, "rewards/margins": 0.25403863191604614, "rewards/margins_max": 0.5021854639053345, "rewards/margins_min": 0.0338195376098156, "rewards/margins_std": 0.20746219158172607, "rewards/rejected": -0.1788695752620697, "step": 100 }, { "epoch": 0.28, "eval_logits/chosen": -2.6587636470794678, "eval_logits/rejected": -2.624938726425171, "eval_logps/chosen": -294.36553955078125, "eval_logps/rejected": -276.0350341796875, "eval_loss": 0.6725258231163025, "eval_rewards/accuracies": 0.6029999852180481, "eval_rewards/chosen": -0.09772102534770966, "eval_rewards/margins": 0.07684005051851273, "eval_rewards/margins_max": 0.4634929597377777, "eval_rewards/margins_min": -0.27960655093193054, "eval_rewards/margins_std": 0.25082939863204956, "eval_rewards/rejected": -0.17456106841564178, "eval_runtime": 429.6888, "eval_samples_per_second": 4.655, "eval_steps_per_second": 0.291, "step": 100 }, { "epoch": 0.31, "grad_norm": 4.663865383973278, "learning_rate": 4.999879018839288e-06, "logits/chosen": -2.696274518966675, "logits/rejected": -2.6191954612731934, "logps/chosen": -361.00341796875, "logps/rejected": -324.7152404785156, "loss": 0.4866, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1360231339931488, "rewards/margins": 0.5188378095626831, "rewards/margins_max": 0.8877674341201782, "rewards/margins_min": 0.15628832578659058, "rewards/margins_std": 0.33103400468826294, "rewards/rejected": -0.3828147053718567, "step": 110 }, { "epoch": 0.34, "grad_norm": 2.659078012596696, "learning_rate": 4.99772856836941e-06, "logits/chosen": -2.6332004070281982, "logits/rejected": -2.58402681350708, "logps/chosen": -338.8200988769531, "logps/rejected": -314.74078369140625, "loss": 0.4569, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09566140174865723, "rewards/margins": 0.5811273455619812, "rewards/margins_max": 1.0773193836212158, "rewards/margins_min": 0.19689173996448517, "rewards/margins_std": 0.4066368043422699, "rewards/rejected": -0.4854659140110016, "step": 120 }, { "epoch": 0.37, "grad_norm": 5.638039796957378, "learning_rate": 4.992892309373227e-06, "logits/chosen": -2.5800509452819824, "logits/rejected": -2.5182909965515137, "logps/chosen": -377.07415771484375, "logps/rejected": -370.76007080078125, "loss": 0.4111, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.05290098860859871, "rewards/margins": 0.7561925649642944, "rewards/margins_max": 1.318340539932251, "rewards/margins_min": 0.10839029401540756, "rewards/margins_std": 0.5403656363487244, "rewards/rejected": -0.7032915949821472, "step": 130 }, { "epoch": 0.39, "grad_norm": 3.185506159687688, "learning_rate": 4.985375442281969e-06, "logits/chosen": -2.529670476913452, "logits/rejected": -2.505495548248291, "logps/chosen": -311.046875, "logps/rejected": -341.42388916015625, "loss": 0.4278, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.07367168366909027, "rewards/margins": 0.7894155383110046, "rewards/margins_max": 1.4857099056243896, "rewards/margins_min": 0.17245283722877502, "rewards/margins_std": 0.6018984317779541, "rewards/rejected": -0.715743899345398, "step": 140 }, { "epoch": 0.42, "grad_norm": 6.522701528001161, "learning_rate": 4.9751860499858175e-06, "logits/chosen": -2.501380443572998, "logits/rejected": -2.4765429496765137, "logps/chosen": -295.21844482421875, "logps/rejected": -294.5282897949219, "loss": 0.4, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0653342604637146, "rewards/margins": 0.7491210699081421, "rewards/margins_max": 1.2866442203521729, "rewards/margins_min": 0.1819653958082199, "rewards/margins_std": 0.5079216957092285, "rewards/rejected": -0.8144553303718567, "step": 150 }, { "epoch": 0.45, "grad_norm": 7.099952708342032, "learning_rate": 4.962335089142376e-06, "logits/chosen": -2.4243741035461426, "logits/rejected": -2.382873058319092, "logps/chosen": -311.75506591796875, "logps/rejected": -337.52227783203125, "loss": 0.357, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.11208178848028183, "rewards/margins": 0.919207751750946, "rewards/margins_max": 1.5249192714691162, "rewards/margins_min": 0.28068000078201294, "rewards/margins_std": 0.5570467710494995, "rewards/rejected": -1.0312894582748413, "step": 160 }, { "epoch": 0.48, "grad_norm": 15.17640060673072, "learning_rate": 4.946836378394967e-06, "logits/chosen": -2.3504722118377686, "logits/rejected": -2.3078646659851074, "logps/chosen": -345.75726318359375, "logps/rejected": -430.4729919433594, "loss": 0.3207, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.16135653853416443, "rewards/margins": 1.1807162761688232, "rewards/margins_max": 1.7726972103118896, "rewards/margins_min": 0.30320629477500916, "rewards/margins_std": 0.6691843867301941, "rewards/rejected": -1.34207284450531, "step": 170 }, { "epoch": 0.51, "grad_norm": 8.646835771533034, "learning_rate": 4.928706583513441e-06, "logits/chosen": -2.1459343433380127, "logits/rejected": -2.055025577545166, "logps/chosen": -378.0511779785156, "logps/rejected": -468.014404296875, "loss": 0.3002, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.46150344610214233, "rewards/margins": 1.2767913341522217, "rewards/margins_max": 2.0464911460876465, "rewards/margins_min": 0.511903703212738, "rewards/margins_std": 0.6761992573738098, "rewards/rejected": -1.7382948398590088, "step": 180 }, { "epoch": 0.54, "grad_norm": 4.978015250452758, "learning_rate": 4.907965199473471e-06, "logits/chosen": -1.873817801475525, "logits/rejected": -1.7417463064193726, "logps/chosen": -362.2750549316406, "logps/rejected": -456.6219787597656, "loss": 0.2276, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.40314167737960815, "rewards/margins": 1.7882015705108643, "rewards/margins_max": 2.7738163471221924, "rewards/margins_min": 0.8575057983398438, "rewards/margins_std": 0.8512203097343445, "rewards/rejected": -2.191343069076538, "step": 190 }, { "epoch": 0.56, "grad_norm": 9.452666973020474, "learning_rate": 4.884634529493591e-06, "logits/chosen": -1.8183701038360596, "logits/rejected": -1.7065311670303345, "logps/chosen": -416.6236877441406, "logps/rejected": -549.5675048828125, "loss": 0.2267, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7709085941314697, "rewards/margins": 2.068791627883911, "rewards/margins_max": 3.4109108448028564, "rewards/margins_min": 0.585421621799469, "rewards/margins_std": 1.2988938093185425, "rewards/rejected": -2.839700222015381, "step": 200 }, { "epoch": 0.56, "eval_logits/chosen": -1.6714030504226685, "eval_logits/rejected": -1.6187551021575928, "eval_logps/chosen": -474.511962890625, "eval_logps/rejected": -497.81463623046875, "eval_loss": 0.7397594451904297, "eval_rewards/accuracies": 0.6439999938011169, "eval_rewards/chosen": -1.899185299873352, "eval_rewards/margins": 0.49317169189453125, "eval_rewards/margins_max": 2.671410083770752, "eval_rewards/margins_min": -1.8999947309494019, "eval_rewards/margins_std": 1.5475962162017822, "eval_rewards/rejected": -2.392357110977173, "eval_runtime": 429.7827, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 200 }, { "epoch": 0.59, "grad_norm": 8.87270228770415, "learning_rate": 4.858739661052539e-06, "logits/chosen": -1.511608600616455, "logits/rejected": -1.4413245916366577, "logps/chosen": -427.55413818359375, "logps/rejected": -620.9583740234375, "loss": 0.1779, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0736979246139526, "rewards/margins": 2.571638822555542, "rewards/margins_max": 4.121321678161621, "rewards/margins_min": 0.6724111437797546, "rewards/margins_std": 1.547525405883789, "rewards/rejected": -3.645336866378784, "step": 210 }, { "epoch": 0.62, "grad_norm": 6.5430570772956, "learning_rate": 4.830308438912687e-06, "logits/chosen": -1.3631094694137573, "logits/rejected": -1.1896626949310303, "logps/chosen": -610.7598876953125, "logps/rejected": -881.2283325195312, "loss": 0.1472, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5730366706848145, "rewards/margins": 3.4959709644317627, "rewards/margins_max": 5.009349822998047, "rewards/margins_min": 1.5561037063598633, "rewards/margins_std": 1.543906331062317, "rewards/rejected": -6.069007396697998, "step": 220 }, { "epoch": 0.65, "grad_norm": 11.567738598963295, "learning_rate": 4.799371435178544e-06, "logits/chosen": -1.2935478687286377, "logits/rejected": -1.1057153940200806, "logps/chosen": -756.6351318359375, "logps/rejected": -983.3760986328125, "loss": 0.2065, "rewards/accuracies": 0.9375, "rewards/chosen": -3.8495945930480957, "rewards/margins": 3.363232135772705, "rewards/margins_max": 5.4596266746521, "rewards/margins_min": 0.4015835225582123, "rewards/margins_std": 2.3402669429779053, "rewards/rejected": -7.212827205657959, "step": 230 }, { "epoch": 0.68, "grad_norm": 14.908052027638925, "learning_rate": 4.765961916422575e-06, "logits/chosen": -1.3409693241119385, "logits/rejected": -1.2054760456085205, "logps/chosen": -675.9885864257812, "logps/rejected": -992.09375, "loss": 0.1872, "rewards/accuracies": 0.9375, "rewards/chosen": -3.511915683746338, "rewards/margins": 3.5772738456726074, "rewards/margins_max": 5.6575751304626465, "rewards/margins_min": 1.1669104099273682, "rewards/margins_std": 2.040917158126831, "rewards/rejected": -7.089189052581787, "step": 240 }, { "epoch": 0.7, "grad_norm": 9.045837659115827, "learning_rate": 4.730115807913627e-06, "logits/chosen": -1.4189417362213135, "logits/rejected": -1.2720701694488525, "logps/chosen": -674.1248779296875, "logps/rejected": -974.5089721679688, "loss": 0.1161, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.62843656539917, "rewards/margins": 3.6062092781066895, "rewards/margins_max": 5.835866451263428, "rewards/margins_min": 1.5058424472808838, "rewards/margins_std": 1.905207633972168, "rewards/rejected": -7.234647274017334, "step": 250 }, { "epoch": 0.73, "grad_norm": 9.416665631409534, "learning_rate": 4.691871654986485e-06, "logits/chosen": -1.5399147272109985, "logits/rejected": -1.3777363300323486, "logps/chosen": -710.0699462890625, "logps/rejected": -1064.373779296875, "loss": 0.1185, "rewards/accuracies": 0.9375, "rewards/chosen": -4.1797637939453125, "rewards/margins": 3.8746650218963623, "rewards/margins_max": 5.889615058898926, "rewards/margins_min": 1.7422330379486084, "rewards/margins_std": 1.8929340839385986, "rewards/rejected": -8.054429054260254, "step": 260 }, { "epoch": 0.76, "grad_norm": 56.620770226956026, "learning_rate": 4.651270581594054e-06, "logits/chosen": -1.5505702495574951, "logits/rejected": -1.439883828163147, "logps/chosen": -655.2439575195312, "logps/rejected": -985.9658203125, "loss": 0.2278, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.4612839221954346, "rewards/margins": 3.773378372192383, "rewards/margins_max": 5.983767509460449, "rewards/margins_min": 1.2523890733718872, "rewards/margins_std": 2.149752378463745, "rewards/rejected": -7.234662055969238, "step": 270 }, { "epoch": 0.79, "grad_norm": 9.941259668614844, "learning_rate": 4.6083562460867545e-06, "logits/chosen": -1.4796500205993652, "logits/rejected": -1.3813179731369019, "logps/chosen": -780.708984375, "logps/rejected": -1187.9755859375, "loss": 0.1019, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.772520542144775, "rewards/margins": 4.3812642097473145, "rewards/margins_max": 6.6738691329956055, "rewards/margins_min": 1.5353296995162964, "rewards/margins_std": 2.349224805831909, "rewards/rejected": -9.153783798217773, "step": 280 }, { "epoch": 0.82, "grad_norm": 15.917323127244398, "learning_rate": 4.563174794266684e-06, "logits/chosen": -1.5392366647720337, "logits/rejected": -1.4464019536972046, "logps/chosen": -692.5883178710938, "logps/rejected": -963.4357299804688, "loss": 0.2109, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.056720733642578, "rewards/margins": 2.9374544620513916, "rewards/margins_max": 5.395993232727051, "rewards/margins_min": 0.5851330161094666, "rewards/margins_std": 2.2416446208953857, "rewards/rejected": -6.994175910949707, "step": 290 }, { "epoch": 0.85, "grad_norm": 11.476540562223757, "learning_rate": 4.5157748097670125e-06, "logits/chosen": -1.5950560569763184, "logits/rejected": -1.4536263942718506, "logps/chosen": -938.9279174804688, "logps/rejected": -1296.3175048828125, "loss": 0.1011, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.9241485595703125, "rewards/margins": 4.1446919441223145, "rewards/margins_max": 6.245351314544678, "rewards/margins_min": 1.5497524738311768, "rewards/margins_std": 2.1239330768585205, "rewards/rejected": -10.068840026855469, "step": 300 }, { "epoch": 0.85, "eval_logits/chosen": -1.527121663093567, "eval_logits/rejected": -1.4628735780715942, "eval_logps/chosen": -1076.8594970703125, "eval_logps/rejected": -1150.1253662109375, "eval_loss": 0.9229267835617065, "eval_rewards/accuracies": 0.6470000147819519, "eval_rewards/chosen": -7.9226603507995605, "eval_rewards/margins": 0.992804765701294, "eval_rewards/margins_max": 5.051580905914307, "eval_rewards/margins_min": -3.0808050632476807, "eval_rewards/margins_std": 2.7076425552368164, "eval_rewards/rejected": -8.915464401245117, "eval_runtime": 428.5869, "eval_samples_per_second": 4.666, "eval_steps_per_second": 0.292, "step": 300 }, { "epoch": 0.87, "grad_norm": 5.622465452747041, "learning_rate": 4.466207261809989e-06, "logits/chosen": -1.625128149986267, "logits/rejected": -1.4389641284942627, "logps/chosen": -856.7615356445312, "logps/rejected": -1196.298583984375, "loss": 0.1046, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.589455604553223, "rewards/margins": 4.231289863586426, "rewards/margins_max": 6.631104946136475, "rewards/margins_min": 1.5690397024154663, "rewards/margins_std": 2.2617735862731934, "rewards/rejected": -9.820745468139648, "step": 310 }, { "epoch": 0.9, "grad_norm": 37.31728926549998, "learning_rate": 4.414525450399713e-06, "logits/chosen": -1.6272573471069336, "logits/rejected": -1.5049296617507935, "logps/chosen": -816.5538330078125, "logps/rejected": -1220.7586669921875, "loss": 0.1477, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.082805633544922, "rewards/margins": 4.518080711364746, "rewards/margins_max": 6.932036399841309, "rewards/margins_min": 1.335532546043396, "rewards/margins_std": 2.588527202606201, "rewards/rejected": -9.600885391235352, "step": 320 }, { "epoch": 0.93, "grad_norm": 5.37421997088044, "learning_rate": 4.360784949008615e-06, "logits/chosen": -1.8167043924331665, "logits/rejected": -1.645042061805725, "logps/chosen": -831.2081298828125, "logps/rejected": -1208.270263671875, "loss": 0.111, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.768882751464844, "rewards/margins": 4.644423007965088, "rewards/margins_max": 7.169321537017822, "rewards/margins_min": 1.9509897232055664, "rewards/margins_std": 2.3954663276672363, "rewards/rejected": -9.413305282592773, "step": 330 }, { "epoch": 0.96, "grad_norm": 6.115341044262903, "learning_rate": 4.30504354481929e-06, "logits/chosen": -1.7410516738891602, "logits/rejected": -1.6124862432479858, "logps/chosen": -741.0687866210938, "logps/rejected": -1153.75390625, "loss": 0.1044, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.412232875823975, "rewards/margins": 4.613609790802002, "rewards/margins_max": 6.75095272064209, "rewards/margins_min": 1.8239591121673584, "rewards/margins_std": 2.2112793922424316, "rewards/rejected": -9.025842666625977, "step": 340 }, { "epoch": 0.99, "grad_norm": 8.804373815951685, "learning_rate": 4.247361176585904e-06, "logits/chosen": -1.6892824172973633, "logits/rejected": -1.567959189414978, "logps/chosen": -782.8369140625, "logps/rejected": -1259.287353515625, "loss": 0.0817, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.678778648376465, "rewards/margins": 5.049575328826904, "rewards/margins_max": 6.799111366271973, "rewards/margins_min": 2.811235189437866, "rewards/margins_std": 1.8613466024398804, "rewards/rejected": -9.728352546691895, "step": 350 }, { "epoch": 1.01, "grad_norm": 22.068799726915795, "learning_rate": 4.187799870182038e-06, "logits/chosen": -1.7105668783187866, "logits/rejected": -1.5694526433944702, "logps/chosen": -762.7816162109375, "logps/rejected": -1217.321044921875, "loss": 0.1032, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.626708507537842, "rewards/margins": 5.017427444458008, "rewards/margins_max": 7.252201080322266, "rewards/margins_min": 2.1052348613739014, "rewards/margins_std": 2.383836507797241, "rewards/rejected": -9.644137382507324, "step": 360 }, { "epoch": 1.04, "grad_norm": 10.290993940063032, "learning_rate": 4.1264236719042365e-06, "logits/chosen": -1.7839868068695068, "logits/rejected": -1.6120306253433228, "logps/chosen": -801.9637451171875, "logps/rejected": -1164.2841796875, "loss": 0.1588, "rewards/accuracies": 0.9375, "rewards/chosen": -4.478141784667969, "rewards/margins": 4.724917411804199, "rewards/margins_max": 7.042010307312012, "rewards/margins_min": 1.6127008199691772, "rewards/margins_std": 2.5478250980377197, "rewards/rejected": -9.203059196472168, "step": 370 }, { "epoch": 1.07, "grad_norm": 0.5210034728309734, "learning_rate": 4.063298579603001e-06, "logits/chosen": -1.6867786645889282, "logits/rejected": -1.4948246479034424, "logps/chosen": -782.1204223632812, "logps/rejected": -1320.6646728515625, "loss": 0.0414, "rewards/accuracies": 1.0, "rewards/chosen": -4.968000888824463, "rewards/margins": 5.882228851318359, "rewards/margins_max": 7.5977654457092285, "rewards/margins_min": 3.866016387939453, "rewards/margins_std": 1.67121160030365, "rewards/rejected": -10.850229263305664, "step": 380 }, { "epoch": 1.1, "grad_norm": 13.427534231462952, "learning_rate": 3.998492471715272e-06, "logits/chosen": -1.6988388299942017, "logits/rejected": -1.5951545238494873, "logps/chosen": -877.1390380859375, "logps/rejected": -1402.83203125, "loss": 0.0685, "rewards/accuracies": 1.0, "rewards/chosen": -5.7572479248046875, "rewards/margins": 5.6500396728515625, "rewards/margins_max": 7.7508039474487305, "rewards/margins_min": 3.0813615322113037, "rewards/margins_std": 2.1727612018585205, "rewards/rejected": -11.407288551330566, "step": 390 }, { "epoch": 1.13, "grad_norm": 2.4923200900882536, "learning_rate": 3.932075034274723e-06, "logits/chosen": -1.695990800857544, "logits/rejected": -1.5507137775421143, "logps/chosen": -851.5281372070312, "logps/rejected": -1309.4801025390625, "loss": 0.1396, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.59298038482666, "rewards/margins": 5.056563854217529, "rewards/margins_max": 7.312686920166016, "rewards/margins_min": 1.9958137273788452, "rewards/margins_std": 2.379727840423584, "rewards/rejected": -10.649542808532715, "step": 400 }, { "epoch": 1.13, "eval_logits/chosen": -1.667060375213623, "eval_logits/rejected": -1.5979340076446533, "eval_logps/chosen": -1116.99462890625, "eval_logps/rejected": -1209.6519775390625, "eval_loss": 0.9696508646011353, "eval_rewards/accuracies": 0.6779999732971191, "eval_rewards/chosen": -8.324010848999023, "eval_rewards/margins": 1.1867200136184692, "eval_rewards/margins_max": 5.737547397613525, "eval_rewards/margins_min": -3.3923180103302, "eval_rewards/margins_std": 3.034074544906616, "eval_rewards/rejected": -9.510730743408203, "eval_runtime": 428.9385, "eval_samples_per_second": 4.663, "eval_steps_per_second": 0.291, "step": 400 }, { "epoch": 1.15, "grad_norm": 6.537657300759786, "learning_rate": 3.864117685978339e-06, "logits/chosen": -1.705518126487732, "logits/rejected": -1.5725294351577759, "logps/chosen": -897.5511474609375, "logps/rejected": -1346.69091796875, "loss": 0.0939, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.0087456703186035, "rewards/margins": 4.9630446434021, "rewards/margins_max": 7.469670295715332, "rewards/margins_min": 1.8066009283065796, "rewards/margins_std": 2.5759172439575195, "rewards/rejected": -10.971790313720703, "step": 410 }, { "epoch": 1.18, "grad_norm": 8.302069752936143, "learning_rate": 3.794693501389861e-06, "logits/chosen": -1.6544630527496338, "logits/rejected": -1.5131093263626099, "logps/chosen": -929.0003051757812, "logps/rejected": -1400.305419921875, "loss": 0.0548, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.110236644744873, "rewards/margins": 5.219418525695801, "rewards/margins_max": 7.189891815185547, "rewards/margins_min": 2.5033233165740967, "rewards/margins_std": 2.120957374572754, "rewards/rejected": -11.329654693603516, "step": 420 }, { "epoch": 1.21, "grad_norm": 1.8619960615196327, "learning_rate": 3.7238771323626822e-06, "logits/chosen": -1.677835464477539, "logits/rejected": -1.5019906759262085, "logps/chosen": -999.4791259765625, "logps/rejected": -1461.9598388671875, "loss": 0.0742, "rewards/accuracies": 0.9375, "rewards/chosen": -6.562595367431641, "rewards/margins": 5.4552412033081055, "rewards/margins_max": 7.70766544342041, "rewards/margins_min": 2.421809434890747, "rewards/margins_std": 2.3860526084899902, "rewards/rejected": -12.01783561706543, "step": 430 }, { "epoch": 1.24, "grad_norm": 5.868124977117504, "learning_rate": 3.651744727766676e-06, "logits/chosen": -1.6518735885620117, "logits/rejected": -1.497201681137085, "logps/chosen": -996.3165893554688, "logps/rejected": -1532.0673828125, "loss": 0.0519, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.051259517669678, "rewards/margins": 5.819365501403809, "rewards/margins_max": 7.876378536224365, "rewards/margins_min": 2.8851966857910156, "rewards/margins_std": 2.268291473388672, "rewards/rejected": -12.870625495910645, "step": 440 }, { "epoch": 1.27, "grad_norm": 10.805483266746087, "learning_rate": 3.57837385160529e-06, "logits/chosen": -1.621983289718628, "logits/rejected": -1.479236364364624, "logps/chosen": -850.7548828125, "logps/rejected": -1321.7237548828125, "loss": 0.0641, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.389442443847656, "rewards/margins": 5.238432884216309, "rewards/margins_max": 7.7712531089782715, "rewards/margins_min": 2.671607255935669, "rewards/margins_std": 2.3198580741882324, "rewards/rejected": -10.627875328063965, "step": 450 }, { "epoch": 1.3, "grad_norm": 2.3174254055425183, "learning_rate": 3.503843399610941e-06, "logits/chosen": -1.6503874063491821, "logits/rejected": -1.4967344999313354, "logps/chosen": -1084.4403076171875, "logps/rejected": -1629.10693359375, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": -7.2910637855529785, "rewards/margins": 6.02940034866333, "rewards/margins_max": 8.215094566345215, "rewards/margins_min": 3.005946636199951, "rewards/margins_std": 2.3927676677703857, "rewards/rejected": -13.320462226867676, "step": 460 }, { "epoch": 1.32, "grad_norm": 2.3666379183603676, "learning_rate": 3.4282335144083985e-06, "logits/chosen": -1.6708223819732666, "logits/rejected": -1.5695239305496216, "logps/chosen": -911.2108154296875, "logps/rejected": -1447.9605712890625, "loss": 0.046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.949938774108887, "rewards/margins": 6.058177471160889, "rewards/margins_max": 8.24023723602295, "rewards/margins_min": 3.495572566986084, "rewards/margins_std": 2.158477783203125, "rewards/rejected": -12.008115768432617, "step": 470 }, { "epoch": 1.35, "grad_norm": 5.998521622676278, "learning_rate": 3.351625499337395e-06, "logits/chosen": -1.7066646814346313, "logits/rejected": -1.5283164978027344, "logps/chosen": -988.6871337890625, "logps/rejected": -1536.398681640625, "loss": 0.0589, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.574495792388916, "rewards/margins": 6.3713812828063965, "rewards/margins_max": 8.416463851928711, "rewards/margins_min": 3.5055854320526123, "rewards/margins_std": 2.2722041606903076, "rewards/rejected": -12.945878982543945, "step": 480 }, { "epoch": 1.38, "grad_norm": 2.3121304603384734, "learning_rate": 3.2741017310271056e-06, "logits/chosen": -1.6702913045883179, "logits/rejected": -1.5516611337661743, "logps/chosen": -985.5250244140625, "logps/rejected": -1516.626708984375, "loss": 0.0956, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.465473175048828, "rewards/margins": 5.861384868621826, "rewards/margins_max": 8.299718856811523, "rewards/margins_min": 2.847576141357422, "rewards/margins_std": 2.479989767074585, "rewards/rejected": -12.326857566833496, "step": 490 }, { "epoch": 1.41, "grad_norm": 7.590092284353976, "learning_rate": 3.195745570816532e-06, "logits/chosen": -1.580214500427246, "logits/rejected": -1.4903004169464111, "logps/chosen": -1054.06103515625, "logps/rejected": -1565.05810546875, "loss": 0.078, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.562958717346191, "rewards/margins": 5.623807430267334, "rewards/margins_max": 7.8602399826049805, "rewards/margins_min": 2.7611820697784424, "rewards/margins_std": 2.2531216144561768, "rewards/rejected": -13.186765670776367, "step": 500 }, { "epoch": 1.41, "eval_logits/chosen": -1.625468134880066, "eval_logits/rejected": -1.5552992820739746, "eval_logps/chosen": -1304.2783203125, "eval_logps/rejected": -1404.43701171875, "eval_loss": 1.0425163507461548, "eval_rewards/accuracies": 0.6539999842643738, "eval_rewards/chosen": -10.196849822998047, "eval_rewards/margins": 1.2617301940917969, "eval_rewards/margins_max": 6.198861598968506, "eval_rewards/margins_min": -3.7952890396118164, "eval_rewards/margins_std": 3.3487019538879395, "eval_rewards/rejected": -11.45858097076416, "eval_runtime": 428.5936, "eval_samples_per_second": 4.666, "eval_steps_per_second": 0.292, "step": 500 }, { "epoch": 1.44, "grad_norm": 6.506789256384592, "learning_rate": 3.116641275116018e-06, "logits/chosen": -1.6757932901382446, "logits/rejected": -1.4905316829681396, "logps/chosen": -1033.5491943359375, "logps/rejected": -1559.284912109375, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": -6.929083824157715, "rewards/margins": 6.069881916046143, "rewards/margins_max": 8.297313690185547, "rewards/margins_min": 3.3508517742156982, "rewards/margins_std": 2.215510606765747, "rewards/rejected": -12.998964309692383, "step": 510 }, { "epoch": 1.46, "grad_norm": 2.799331098085792, "learning_rate": 3.0368739048062956e-06, "logits/chosen": -1.759708046913147, "logits/rejected": -1.5871171951293945, "logps/chosen": -981.7990112304688, "logps/rejected": -1526.3701171875, "loss": 0.0613, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.813815116882324, "rewards/margins": 6.061458587646484, "rewards/margins_max": 8.386785507202148, "rewards/margins_min": 3.3189563751220703, "rewards/margins_std": 2.240609884262085, "rewards/rejected": -12.875274658203125, "step": 520 }, { "epoch": 1.49, "grad_norm": 5.0163934897293325, "learning_rate": 2.956529233772492e-06, "logits/chosen": -1.8143419027328491, "logits/rejected": -1.6911777257919312, "logps/chosen": -1105.9581298828125, "logps/rejected": -1680.5181884765625, "loss": 0.0611, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.72055721282959, "rewards/margins": 6.132667064666748, "rewards/margins_max": 8.71304702758789, "rewards/margins_min": 2.979393243789673, "rewards/margins_std": 2.5647242069244385, "rewards/rejected": -13.85322380065918, "step": 530 }, { "epoch": 1.52, "grad_norm": 8.7260672105137, "learning_rate": 2.8756936566714317e-06, "logits/chosen": -1.8574295043945312, "logits/rejected": -1.6885216236114502, "logps/chosen": -1066.135009765625, "logps/rejected": -1536.2845458984375, "loss": 0.0701, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.2507524490356445, "rewards/margins": 5.671202659606934, "rewards/margins_max": 8.193965911865234, "rewards/margins_min": 2.9109997749328613, "rewards/margins_std": 2.3909668922424316, "rewards/rejected": -12.921956062316895, "step": 540 }, { "epoch": 1.55, "grad_norm": 1.3009208627187219, "learning_rate": 2.794454096031429e-06, "logits/chosen": -1.9122663736343384, "logits/rejected": -1.7744579315185547, "logps/chosen": -971.1412963867188, "logps/rejected": -1555.514404296875, "loss": 0.0719, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.982313632965088, "rewards/margins": 6.4228410720825195, "rewards/margins_max": 8.959406852722168, "rewards/margins_min": 3.101313352584839, "rewards/margins_std": 2.621415615081787, "rewards/rejected": -12.405153274536133, "step": 550 }, { "epoch": 1.58, "grad_norm": 1.9328399730262527, "learning_rate": 2.71289790878446e-06, "logits/chosen": -1.8311843872070312, "logits/rejected": -1.6815801858901978, "logps/chosen": -1012.6105346679688, "logps/rejected": -1622.107666015625, "loss": 0.0697, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.073877811431885, "rewards/margins": 6.487514495849609, "rewards/margins_max": 8.834905624389648, "rewards/margins_min": 3.4811978340148926, "rewards/margins_std": 2.3898167610168457, "rewards/rejected": -13.561391830444336, "step": 560 }, { "epoch": 1.61, "grad_norm": 0.17510978882217287, "learning_rate": 2.6311127923312156e-06, "logits/chosen": -1.8733352422714233, "logits/rejected": -1.731903314590454, "logps/chosen": -1004.5771484375, "logps/rejected": -1612.7529296875, "loss": 0.042, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.689506530761719, "rewards/margins": 6.522040367126465, "rewards/margins_max": 8.876073837280273, "rewards/margins_min": 3.256171464920044, "rewards/margins_std": 2.6161324977874756, "rewards/rejected": -13.211545944213867, "step": 570 }, { "epoch": 1.63, "grad_norm": 12.817311644147658, "learning_rate": 2.549186690240057e-06, "logits/chosen": -1.7239491939544678, "logits/rejected": -1.6188468933105469, "logps/chosen": -1058.948486328125, "logps/rejected": -1677.268310546875, "loss": 0.0444, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.725058078765869, "rewards/margins": 6.387824058532715, "rewards/margins_max": 8.449275016784668, "rewards/margins_min": 3.585833787918091, "rewards/margins_std": 2.1896438598632812, "rewards/rejected": -14.112882614135742, "step": 580 }, { "epoch": 1.66, "grad_norm": 2.200716474214234, "learning_rate": 2.4672076976812548e-06, "logits/chosen": -1.7416937351226807, "logits/rejected": -1.5824648141860962, "logps/chosen": -1067.9490966796875, "logps/rejected": -1658.8199462890625, "loss": 0.0499, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.612107753753662, "rewards/margins": 6.36210823059082, "rewards/margins_max": 8.805683135986328, "rewards/margins_min": 3.530320405960083, "rewards/margins_std": 2.4696502685546875, "rewards/rejected": -13.974217414855957, "step": 590 }, { "epoch": 1.69, "grad_norm": 5.8486806702260115, "learning_rate": 2.3852639666982218e-06, "logits/chosen": -1.729406714439392, "logits/rejected": -1.5859451293945312, "logps/chosen": -1029.7244873046875, "logps/rejected": -1697.3372802734375, "loss": 0.0765, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.36349630355835, "rewards/margins": 6.701470851898193, "rewards/margins_max": 9.139188766479492, "rewards/margins_min": 3.311300754547119, "rewards/margins_std": 2.6797633171081543, "rewards/rejected": -14.064967155456543, "step": 600 }, { "epoch": 1.69, "eval_logits/chosen": -1.7166643142700195, "eval_logits/rejected": -1.6462373733520508, "eval_logps/chosen": -1312.563232421875, "eval_logps/rejected": -1434.9708251953125, "eval_loss": 1.171522855758667, "eval_rewards/accuracies": 0.6610000133514404, "eval_rewards/chosen": -10.279698371887207, "eval_rewards/margins": 1.4842207431793213, "eval_rewards/margins_max": 7.0606184005737305, "eval_rewards/margins_min": -4.507997989654541, "eval_rewards/margins_std": 3.902109384536743, "eval_rewards/rejected": -11.76391887664795, "eval_runtime": 428.7286, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 600 }, { "epoch": 1.72, "grad_norm": 2.9887908700456385, "learning_rate": 2.303443611417584e-06, "logits/chosen": -1.7610228061676025, "logits/rejected": -1.5708558559417725, "logps/chosen": -1019.3812255859375, "logps/rejected": -1596.500244140625, "loss": 0.0749, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.916808128356934, "rewards/margins": 6.629319190979004, "rewards/margins_max": 9.27853775024414, "rewards/margins_min": 3.639543056488037, "rewards/margins_std": 2.523704767227173, "rewards/rejected": -13.546127319335938, "step": 610 }, { "epoch": 1.75, "grad_norm": 0.09345851725609673, "learning_rate": 2.2218346133000264e-06, "logits/chosen": -1.8310705423355103, "logits/rejected": -1.6571632623672485, "logps/chosen": -1089.475341796875, "logps/rejected": -1714.6595458984375, "loss": 0.0874, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.478503227233887, "rewards/margins": 6.835662841796875, "rewards/margins_max": 9.080436706542969, "rewards/margins_min": 3.885005235671997, "rewards/margins_std": 2.379390239715576, "rewards/rejected": -14.314167976379395, "step": 620 }, { "epoch": 1.77, "grad_norm": 13.20707399800831, "learning_rate": 2.140524726533792e-06, "logits/chosen": -1.787641167640686, "logits/rejected": -1.661877989768982, "logps/chosen": -947.0399169921875, "logps/rejected": -1524.828369140625, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": -5.729840278625488, "rewards/margins": 6.693819999694824, "rewards/margins_max": 9.224821090698242, "rewards/margins_min": 3.935499906539917, "rewards/margins_std": 2.420135021209717, "rewards/rejected": -12.423660278320312, "step": 630 }, { "epoch": 1.8, "grad_norm": 3.4772116065816014, "learning_rate": 2.059601383672566e-06, "logits/chosen": -1.8164135217666626, "logits/rejected": -1.6359403133392334, "logps/chosen": -1021.05322265625, "logps/rejected": -1599.884033203125, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": -7.091916561126709, "rewards/margins": 6.576811790466309, "rewards/margins_max": 8.615550994873047, "rewards/margins_min": 4.3320631980896, "rewards/margins_std": 2.022761821746826, "rewards/rejected": -13.668729782104492, "step": 640 }, { "epoch": 1.83, "grad_norm": 1.910640538145904, "learning_rate": 1.9791516016192214e-06, "logits/chosen": -1.7743873596191406, "logits/rejected": -1.6393556594848633, "logps/chosen": -1051.207763671875, "logps/rejected": -1660.7542724609375, "loss": 0.0612, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.210175514221191, "rewards/margins": 6.484718322753906, "rewards/margins_max": 9.02783489227295, "rewards/margins_min": 3.8449549674987793, "rewards/margins_std": 2.3319091796875, "rewards/rejected": -13.694894790649414, "step": 650 }, { "epoch": 1.86, "grad_norm": 0.4156394296306771, "learning_rate": 1.8992618880565039e-06, "logits/chosen": -1.6157350540161133, "logits/rejected": -1.5133240222930908, "logps/chosen": -1027.439453125, "logps/rejected": -1595.850830078125, "loss": 0.0679, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.812713623046875, "rewards/margins": 5.9946794509887695, "rewards/margins_max": 8.839725494384766, "rewards/margins_min": 2.7734172344207764, "rewards/margins_std": 2.6815245151519775, "rewards/rejected": -13.807393074035645, "step": 660 }, { "epoch": 1.89, "grad_norm": 1.3243616077705502, "learning_rate": 1.8200181484252888e-06, "logits/chosen": -1.809934377670288, "logits/rejected": -1.6905943155288696, "logps/chosen": -1084.2518310546875, "logps/rejected": -1680.405029296875, "loss": 0.0558, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.783478736877441, "rewards/margins": 6.605474948883057, "rewards/margins_max": 9.080102920532227, "rewards/margins_min": 3.5593819618225098, "rewards/margins_std": 2.538597822189331, "rewards/rejected": -14.388954162597656, "step": 670 }, { "epoch": 1.92, "grad_norm": 4.935603103347596, "learning_rate": 1.7415055935504234e-06, "logits/chosen": -1.845766305923462, "logits/rejected": -1.6762946844100952, "logps/chosen": -1092.99609375, "logps/rejected": -1732.690185546875, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": -7.822856903076172, "rewards/margins": 6.983065605163574, "rewards/margins_max": 9.305206298828125, "rewards/margins_min": 4.250351428985596, "rewards/margins_std": 2.260586738586426, "rewards/rejected": -14.80592155456543, "step": 680 }, { "epoch": 1.94, "grad_norm": 7.946766648058278, "learning_rate": 1.6638086480134954e-06, "logits/chosen": -1.7061771154403687, "logits/rejected": -1.5929887294769287, "logps/chosen": -1015.9044189453125, "logps/rejected": -1602.688232421875, "loss": 0.0565, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.775514125823975, "rewards/margins": 6.233893394470215, "rewards/margins_max": 9.065168380737305, "rewards/margins_min": 2.735471725463867, "rewards/margins_std": 2.843477725982666, "rewards/rejected": -14.009408950805664, "step": 690 }, { "epoch": 1.97, "grad_norm": 14.357423867713438, "learning_rate": 1.5870108593710473e-06, "logits/chosen": -1.6323438882827759, "logits/rejected": -1.4323724508285522, "logps/chosen": -1116.0875244140625, "logps/rejected": -1646.796875, "loss": 0.0521, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.165563583374023, "rewards/margins": 6.203519821166992, "rewards/margins_max": 8.34221363067627, "rewards/margins_min": 3.4348888397216797, "rewards/margins_std": 2.265625476837158, "rewards/rejected": -14.369084358215332, "step": 700 }, { "epoch": 1.97, "eval_logits/chosen": -1.7082782983779907, "eval_logits/rejected": -1.6383651494979858, "eval_logps/chosen": -1494.51513671875, "eval_logps/rejected": -1592.3466796875, "eval_loss": 1.10393488407135, "eval_rewards/accuracies": 0.6510000228881836, "eval_rewards/chosen": -12.099217414855957, "eval_rewards/margins": 1.2384591102600098, "eval_rewards/margins_max": 6.618937015533447, "eval_rewards/margins_min": -4.080103874206543, "eval_rewards/margins_std": 3.540152072906494, "eval_rewards/rejected": -13.337677001953125, "eval_runtime": 428.89, "eval_samples_per_second": 4.663, "eval_steps_per_second": 0.291, "step": 700 }, { "epoch": 2.0, "grad_norm": 1.9864414899165639, "learning_rate": 1.511194808315853e-06, "logits/chosen": -1.6388124227523804, "logits/rejected": -1.5256621837615967, "logps/chosen": -1023.98486328125, "logps/rejected": -1670.1683349609375, "loss": 0.0281, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.728632926940918, "rewards/margins": 6.720816135406494, "rewards/margins_max": 8.741449356079102, "rewards/margins_min": 3.9726672172546387, "rewards/margins_std": 2.161562204360962, "rewards/rejected": -14.44944953918457, "step": 710 }, { "epoch": 2.03, "grad_norm": 2.4447392288346776, "learning_rate": 1.4364420198778662e-06, "logits/chosen": -1.9084421396255493, "logits/rejected": -1.7372974157333374, "logps/chosen": -1069.986572265625, "logps/rejected": -1748.271484375, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": -7.059340000152588, "rewards/margins": 7.3868408203125, "rewards/margins_max": 9.624174118041992, "rewards/margins_min": 5.218744277954102, "rewards/margins_std": 2.0435428619384766, "rewards/rejected": -14.44618034362793, "step": 720 }, { "epoch": 2.06, "grad_norm": 0.3283356036109342, "learning_rate": 1.3628328757603243e-06, "logits/chosen": -1.7824742794036865, "logits/rejected": -1.607553243637085, "logps/chosen": -1106.8240966796875, "logps/rejected": -1757.1396484375, "loss": 0.0279, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.68569278717041, "rewards/margins": 7.141098976135254, "rewards/margins_max": 9.206721305847168, "rewards/margins_min": 4.535717010498047, "rewards/margins_std": 2.135599374771118, "rewards/rejected": -14.826791763305664, "step": 730 }, { "epoch": 2.08, "grad_norm": 0.466472720676363, "learning_rate": 1.2904465279052725e-06, "logits/chosen": -1.7631629705429077, "logits/rejected": -1.602264165878296, "logps/chosen": -1061.498291015625, "logps/rejected": -1701.393798828125, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": -7.410794734954834, "rewards/margins": 6.905499458312988, "rewards/margins_max": 9.171496391296387, "rewards/margins_min": 3.907447099685669, "rewards/margins_std": 2.4243547916412354, "rewards/rejected": -14.316293716430664, "step": 740 }, { "epoch": 2.11, "grad_norm": 0.18287903072298267, "learning_rate": 1.219360813381446e-06, "logits/chosen": -1.707327127456665, "logits/rejected": -1.5934031009674072, "logps/chosen": -995.9183349609375, "logps/rejected": -1665.1839599609375, "loss": 0.0293, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.091825008392334, "rewards/margins": 7.211878776550293, "rewards/margins_max": 9.057371139526367, "rewards/margins_min": 5.072964668273926, "rewards/margins_std": 1.7897049188613892, "rewards/rejected": -14.303705215454102, "step": 750 }, { "epoch": 2.14, "grad_norm": 3.972318831886565, "learning_rate": 1.1496521706860392e-06, "logits/chosen": -1.6829960346221924, "logits/rejected": -1.5544617176055908, "logps/chosen": -1081.756103515625, "logps/rejected": -1768.875732421875, "loss": 0.0206, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.939679145812988, "rewards/margins": 7.1435041427612305, "rewards/margins_max": 9.630821228027344, "rewards/margins_min": 4.051230430603027, "rewards/margins_std": 2.557648181915283, "rewards/rejected": -15.083181381225586, "step": 760 }, { "epoch": 2.17, "grad_norm": 0.15453005325463406, "learning_rate": 1.0813955575503588e-06, "logits/chosen": -1.7566072940826416, "logits/rejected": -1.5845129489898682, "logps/chosen": -1044.108154296875, "logps/rejected": -1700.744140625, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": -7.392706871032715, "rewards/margins": 7.490866661071777, "rewards/margins_max": 9.396993637084961, "rewards/margins_min": 5.832265377044678, "rewards/margins_std": 1.631260871887207, "rewards/rejected": -14.883572578430176, "step": 770 }, { "epoch": 2.2, "grad_norm": 5.041769273622829, "learning_rate": 1.0146643703377488e-06, "logits/chosen": -1.817198395729065, "logits/rejected": -1.6213362216949463, "logps/chosen": -1110.951416015625, "logps/rejected": -1716.0474853515625, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": -7.933794975280762, "rewards/margins": 6.956693172454834, "rewards/margins_max": 9.445747375488281, "rewards/margins_min": 4.286118984222412, "rewards/margins_std": 2.26704740524292, "rewards/rejected": -14.89048957824707, "step": 780 }, { "epoch": 2.23, "grad_norm": 0.028319940482359873, "learning_rate": 9.495303651204496e-07, "logits/chosen": -1.7651485204696655, "logits/rejected": -1.5782719850540161, "logps/chosen": -1116.5997314453125, "logps/rejected": -1775.474853515625, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -7.942474365234375, "rewards/margins": 7.359992027282715, "rewards/margins_max": 9.237930297851562, "rewards/margins_min": 5.0483293533325195, "rewards/margins_std": 1.877681016921997, "rewards/rejected": -15.302465438842773, "step": 790 }, { "epoch": 2.25, "grad_norm": 5.560910630060733, "learning_rate": 8.860635805202616e-07, "logits/chosen": -1.7791054248809814, "logits/rejected": -1.6470226049423218, "logps/chosen": -1128.700439453125, "logps/rejected": -1887.7562255859375, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": -8.071495056152344, "rewards/margins": 7.690678596496582, "rewards/margins_max": 10.314142227172852, "rewards/margins_min": 4.382508754730225, "rewards/margins_std": 2.6400108337402344, "rewards/rejected": -15.762173652648926, "step": 800 }, { "epoch": 2.25, "eval_logits/chosen": -1.7630056142807007, "eval_logits/rejected": -1.6934845447540283, "eval_logps/chosen": -1308.7979736328125, "eval_logps/rejected": -1442.1707763671875, "eval_loss": 1.2213647365570068, "eval_rewards/accuracies": 0.6600000262260437, "eval_rewards/chosen": -10.242044448852539, "eval_rewards/margins": 1.5938735008239746, "eval_rewards/margins_max": 7.453612327575684, "eval_rewards/margins_min": -4.738708972930908, "eval_rewards/margins_std": 4.117012023925781, "eval_rewards/rejected": -11.835918426513672, "eval_runtime": 428.6302, "eval_samples_per_second": 4.666, "eval_steps_per_second": 0.292, "step": 800 }, { "epoch": 2.28, "grad_norm": 2.130920241454253, "learning_rate": 8.24332262395994e-07, "logits/chosen": -1.8262383937835693, "logits/rejected": -1.701570749282837, "logps/chosen": -990.9318237304688, "logps/rejected": -1709.775390625, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": -6.961573600769043, "rewards/margins": 7.437863826751709, "rewards/margins_max": 9.480931282043457, "rewards/margins_min": 4.916778087615967, "rewards/margins_std": 2.042966365814209, "rewards/rejected": -14.399436950683594, "step": 810 }, { "epoch": 2.31, "grad_norm": 2.7824509845813816, "learning_rate": 7.644027904586587e-07, "logits/chosen": -1.7199032306671143, "logits/rejected": -1.584393858909607, "logps/chosen": -1131.29541015625, "logps/rejected": -1871.33984375, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": -8.013903617858887, "rewards/margins": 7.894224643707275, "rewards/margins_max": 10.34322738647461, "rewards/margins_min": 5.228058338165283, "rewards/margins_std": 2.269243001937866, "rewards/rejected": -15.908126831054688, "step": 820 }, { "epoch": 2.34, "grad_norm": 2.942249921804053, "learning_rate": 7.06339606893347e-07, "logits/chosen": -1.7625993490219116, "logits/rejected": -1.552851915359497, "logps/chosen": -1175.3865966796875, "logps/rejected": -1861.589599609375, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -7.902280330657959, "rewards/margins": 8.019886016845703, "rewards/margins_max": 10.1281156539917, "rewards/margins_min": 6.0280256271362305, "rewards/margins_std": 1.8365955352783203, "rewards/rejected": -15.92216682434082, "step": 830 }, { "epoch": 2.37, "grad_norm": 0.9426802566028485, "learning_rate": 6.502051470645149e-07, "logits/chosen": -1.780339241027832, "logits/rejected": -1.6216917037963867, "logps/chosen": -1083.676513671875, "logps/rejected": -1733.9345703125, "loss": 0.0234, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.57614278793335, "rewards/margins": 7.293878078460693, "rewards/margins_max": 9.180914878845215, "rewards/margins_min": 4.970505714416504, "rewards/margins_std": 1.8781248331069946, "rewards/rejected": -14.870019912719727, "step": 840 }, { "epoch": 2.39, "grad_norm": 0.2273620604649508, "learning_rate": 5.960597723792194e-07, "logits/chosen": -1.7474027872085571, "logits/rejected": -1.575292944908142, "logps/chosen": -1081.188232421875, "logps/rejected": -1787.9605712890625, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -7.850655555725098, "rewards/margins": 7.63167667388916, "rewards/margins_max": 10.038192749023438, "rewards/margins_min": 5.033900737762451, "rewards/margins_std": 2.2446444034576416, "rewards/rejected": -15.482332229614258, "step": 850 }, { "epoch": 2.42, "grad_norm": 2.008660400899101, "learning_rate": 5.43961705380465e-07, "logits/chosen": -1.791469931602478, "logits/rejected": -1.6313838958740234, "logps/chosen": -1132.4666748046875, "logps/rejected": -1828.349609375, "loss": 0.0326, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.8505706787109375, "rewards/margins": 7.863633632659912, "rewards/margins_max": 10.446196556091309, "rewards/margins_min": 4.516094207763672, "rewards/margins_std": 2.653343915939331, "rewards/rejected": -15.714204788208008, "step": 860 }, { "epoch": 2.45, "grad_norm": 1.9443236752501327, "learning_rate": 4.939669671404871e-07, "logits/chosen": -1.708809494972229, "logits/rejected": -1.5626459121704102, "logps/chosen": -1073.6954345703125, "logps/rejected": -1811.253662109375, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -7.609139919281006, "rewards/margins": 7.719372749328613, "rewards/margins_max": 9.861176490783691, "rewards/margins_min": 5.364067077636719, "rewards/margins_std": 2.00132155418396, "rewards/rejected": -15.328509330749512, "step": 870 }, { "epoch": 2.48, "grad_norm": 2.06741221987676, "learning_rate": 4.461293170212644e-07, "logits/chosen": -1.8483781814575195, "logits/rejected": -1.6546274423599243, "logps/chosen": -1123.468017578125, "logps/rejected": -1798.621826171875, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": -7.950200080871582, "rewards/margins": 7.411231994628906, "rewards/margins_max": 10.043291091918945, "rewards/margins_min": 4.137426853179932, "rewards/margins_std": 2.554241418838501, "rewards/rejected": -15.361432075500488, "step": 880 }, { "epoch": 2.51, "grad_norm": 0.8360988782034983, "learning_rate": 4.005001948670606e-07, "logits/chosen": -1.813595175743103, "logits/rejected": -1.6409099102020264, "logps/chosen": -1167.838623046875, "logps/rejected": -1849.715576171875, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -8.237103462219238, "rewards/margins": 7.6171464920043945, "rewards/margins_max": 10.030054092407227, "rewards/margins_min": 5.162562847137451, "rewards/margins_std": 2.175448417663574, "rewards/rejected": -15.854248046875, "step": 890 }, { "epoch": 2.54, "grad_norm": 0.28012086124588453, "learning_rate": 3.571286656911377e-07, "logits/chosen": -1.765481948852539, "logits/rejected": -1.5610095262527466, "logps/chosen": -1176.97509765625, "logps/rejected": -1906.4827880859375, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": -8.486291885375977, "rewards/margins": 7.7398223876953125, "rewards/margins_max": 10.43345832824707, "rewards/margins_min": 4.932800769805908, "rewards/margins_std": 2.4372851848602295, "rewards/rejected": -16.226112365722656, "step": 900 }, { "epoch": 2.54, "eval_logits/chosen": -1.7013623714447021, "eval_logits/rejected": -1.6318581104278564, "eval_logps/chosen": -1451.88916015625, "eval_logps/rejected": -1581.395751953125, "eval_loss": 1.202013373374939, "eval_rewards/accuracies": 0.6620000004768372, "eval_rewards/chosen": -11.672956466674805, "eval_rewards/margins": 1.555212140083313, "eval_rewards/margins_max": 7.462009906768799, "eval_rewards/margins_min": -4.611362457275391, "eval_rewards/margins_std": 4.051472187042236, "eval_rewards/rejected": -13.228167533874512, "eval_runtime": 428.5009, "eval_samples_per_second": 4.667, "eval_steps_per_second": 0.292, "step": 900 }, { "epoch": 2.56, "grad_norm": 0.6107279357125659, "learning_rate": 3.1606136691612555e-07, "logits/chosen": -1.7235673666000366, "logits/rejected": -1.5583069324493408, "logps/chosen": -1131.2056884765625, "logps/rejected": -1782.4332275390625, "loss": 0.0174, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.073932647705078, "rewards/margins": 7.4360671043396, "rewards/margins_max": 9.632651329040527, "rewards/margins_min": 5.327752113342285, "rewards/margins_std": 1.8935825824737549, "rewards/rejected": -15.50999927520752, "step": 910 }, { "epoch": 2.59, "grad_norm": 0.00966975682935343, "learning_rate": 2.773424582247844e-07, "logits/chosen": -1.6917803287506104, "logits/rejected": -1.4805718660354614, "logps/chosen": -1141.4068603515625, "logps/rejected": -1758.5318603515625, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -8.279281616210938, "rewards/margins": 7.198742866516113, "rewards/margins_max": 9.452996253967285, "rewards/margins_min": 4.647868633270264, "rewards/margins_std": 2.1528563499450684, "rewards/rejected": -15.478025436401367, "step": 920 }, { "epoch": 2.62, "grad_norm": 3.0376153555107446, "learning_rate": 2.410135740750821e-07, "logits/chosen": -1.7053037881851196, "logits/rejected": -1.5509663820266724, "logps/chosen": -1090.0576171875, "logps/rejected": -1777.7945556640625, "loss": 0.043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.083145141601562, "rewards/margins": 7.410282135009766, "rewards/margins_max": 9.764742851257324, "rewards/margins_min": 5.139273643493652, "rewards/margins_std": 2.1024787425994873, "rewards/rejected": -15.493428230285645, "step": 930 }, { "epoch": 2.65, "grad_norm": 0.6859350599797326, "learning_rate": 2.0711377893064182e-07, "logits/chosen": -1.8094221353530884, "logits/rejected": -1.6414306163787842, "logps/chosen": -1164.137451171875, "logps/rejected": -1852.5625, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": -8.158674240112305, "rewards/margins": 7.432664394378662, "rewards/margins_max": 10.111716270446777, "rewards/margins_min": 4.017355442047119, "rewards/margins_std": 2.706058979034424, "rewards/rejected": -15.591337203979492, "step": 940 }, { "epoch": 2.68, "grad_norm": 6.314035361122387, "learning_rate": 1.756795252547111e-07, "logits/chosen": -1.665837287902832, "logits/rejected": -1.5277420282363892, "logps/chosen": -1078.7557373046875, "logps/rejected": -1684.4287109375, "loss": 0.0295, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.158330917358398, "rewards/margins": 6.608637809753418, "rewards/margins_max": 9.02342700958252, "rewards/margins_min": 3.459864854812622, "rewards/margins_std": 2.52087664604187, "rewards/rejected": -14.766969680786133, "step": 950 }, { "epoch": 2.7, "grad_norm": 7.031354165895073, "learning_rate": 1.4674461431281013e-07, "logits/chosen": -1.7678325176239014, "logits/rejected": -1.6092376708984375, "logps/chosen": -1103.3350830078125, "logps/rejected": -1758.6500244140625, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -8.200953483581543, "rewards/margins": 7.219841957092285, "rewards/margins_max": 9.616026878356934, "rewards/margins_min": 4.519529819488525, "rewards/margins_std": 2.263463258743286, "rewards/rejected": -15.420794486999512, "step": 960 }, { "epoch": 2.73, "grad_norm": 0.3134845483065753, "learning_rate": 1.2034015982622243e-07, "logits/chosen": -1.7572071552276611, "logits/rejected": -1.5487779378890991, "logps/chosen": -1225.2569580078125, "logps/rejected": -1896.434326171875, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": -8.728785514831543, "rewards/margins": 7.427072048187256, "rewards/margins_max": 9.873006820678711, "rewards/margins_min": 4.541165351867676, "rewards/margins_std": 2.364122152328491, "rewards/rejected": -16.155858993530273, "step": 970 }, { "epoch": 2.76, "grad_norm": 0.3690247126654468, "learning_rate": 9.649455451539419e-08, "logits/chosen": -1.6380853652954102, "logits/rejected": -1.4841035604476929, "logps/chosen": -1118.8951416015625, "logps/rejected": -1833.6126708984375, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": -8.304391860961914, "rewards/margins": 7.7766313552856445, "rewards/margins_max": 10.320856094360352, "rewards/margins_min": 5.046825885772705, "rewards/margins_std": 2.3247740268707275, "rewards/rejected": -16.081022262573242, "step": 980 }, { "epoch": 2.79, "grad_norm": 0.035471082675790036, "learning_rate": 7.523343956923196e-08, "logits/chosen": -1.7599372863769531, "logits/rejected": -1.5641086101531982, "logps/chosen": -1154.5972900390625, "logps/rejected": -1892.8466796875, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -8.36001205444336, "rewards/margins": 7.979167938232422, "rewards/margins_max": 10.33701229095459, "rewards/margins_min": 5.4104814529418945, "rewards/margins_std": 2.1920626163482666, "rewards/rejected": -16.33917999267578, "step": 990 }, { "epoch": 2.82, "grad_norm": 3.7962060660896455, "learning_rate": 5.657967707312195e-08, "logits/chosen": -1.6692126989364624, "logits/rejected": -1.5857051610946655, "logps/chosen": -1184.306884765625, "logps/rejected": -1848.2109375, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -8.908744812011719, "rewards/margins": 6.874230861663818, "rewards/margins_max": 9.302106857299805, "rewards/margins_min": 3.831719160079956, "rewards/margins_std": 2.495060443878174, "rewards/rejected": -15.782976150512695, "step": 1000 }, { "epoch": 2.82, "eval_logits/chosen": -1.6955701112747192, "eval_logits/rejected": -1.6262598037719727, "eval_logps/chosen": -1466.096923828125, "eval_logps/rejected": -1594.279541015625, "eval_loss": 1.2153818607330322, "eval_rewards/accuracies": 0.6570000052452087, "eval_rewards/chosen": -11.815034866333008, "eval_rewards/margins": 1.5419700145721436, "eval_rewards/margins_max": 7.536928653717041, "eval_rewards/margins_min": -4.68462610244751, "eval_rewards/margins_std": 4.09072208404541, "eval_rewards/rejected": -13.357006072998047, "eval_runtime": 428.679, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 1000 }, { "epoch": 2.85, "grad_norm": 0.6729976013886217, "learning_rate": 4.055332542531959e-08, "logits/chosen": -1.7815234661102295, "logits/rejected": -1.622179627418518, "logps/chosen": -1156.6016845703125, "logps/rejected": -1884.806884765625, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": -8.329094886779785, "rewards/margins": 7.462734222412109, "rewards/margins_max": 10.055309295654297, "rewards/margins_min": 4.672645568847656, "rewards/margins_std": 2.401289701461792, "rewards/rejected": -15.791829109191895, "step": 1010 }, { "epoch": 2.87, "grad_norm": 0.3931332359603542, "learning_rate": 2.7171617768147472e-08, "logits/chosen": -1.757817268371582, "logits/rejected": -1.6103594303131104, "logps/chosen": -1205.610107421875, "logps/rejected": -1883.1265869140625, "loss": 0.0152, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.91980266571045, "rewards/margins": 7.268828392028809, "rewards/margins_max": 9.548690795898438, "rewards/margins_min": 4.854549884796143, "rewards/margins_std": 2.135824203491211, "rewards/rejected": -16.18863296508789, "step": 1020 }, { "epoch": 2.9, "grad_norm": 0.22753287376533807, "learning_rate": 1.6448943457189616e-08, "logits/chosen": -1.680837869644165, "logits/rejected": -1.540766716003418, "logps/chosen": -1161.3184814453125, "logps/rejected": -1843.625, "loss": 0.0264, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.517123222351074, "rewards/margins": 7.349859714508057, "rewards/margins_max": 10.050976753234863, "rewards/margins_min": 4.578632354736328, "rewards/margins_std": 2.4442994594573975, "rewards/rejected": -15.866983413696289, "step": 1030 }, { "epoch": 2.93, "grad_norm": 0.6569270546900866, "learning_rate": 8.39683258841123e-09, "logits/chosen": -1.621664047241211, "logits/rejected": -1.4453307390213013, "logps/chosen": -1106.493896484375, "logps/rejected": -1756.8822021484375, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": -8.116470336914062, "rewards/margins": 7.085239410400391, "rewards/margins_max": 9.558416366577148, "rewards/margins_min": 4.0455121994018555, "rewards/margins_std": 2.4827523231506348, "rewards/rejected": -15.20171070098877, "step": 1040 }, { "epoch": 2.96, "grad_norm": 1.8056138868772267, "learning_rate": 3.0239435998430376e-09, "logits/chosen": -1.7272727489471436, "logits/rejected": -1.5463558435440063, "logps/chosen": -1105.938720703125, "logps/rejected": -1762.3658447265625, "loss": 0.0288, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -8.043752670288086, "rewards/margins": 7.3192243576049805, "rewards/margins_max": 9.873791694641113, "rewards/margins_min": 4.222177028656006, "rewards/margins_std": 2.4955527782440186, "rewards/rejected": -15.36297607421875, "step": 1050 }, { "epoch": 2.99, "grad_norm": 0.37053192172842564, "learning_rate": 3.3605396115826695e-10, "logits/chosen": -1.6333061456680298, "logits/rejected": -1.5385651588439941, "logps/chosen": -1083.2177734375, "logps/rejected": -1845.8958740234375, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -8.038263320922852, "rewards/margins": 7.885945796966553, "rewards/margins_max": 9.852472305297852, "rewards/margins_min": 5.348562717437744, "rewards/margins_std": 1.9733645915985107, "rewards/rejected": -15.924209594726562, "step": 1060 }, { "epoch": 3.0, "step": 1065, "total_flos": 0.0, "train_loss": 0.14573693349257882, "train_runtime": 13238.8899, "train_samples_per_second": 1.287, "train_steps_per_second": 0.08 } ], "logging_steps": 10, "max_steps": 1065, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }