zephyr-dpo-qlora-uf-ours-5e-6 / trainer_state.json
just1nseo's picture
Model save
3f6f664 verified
raw
history blame contribute delete
No virus
78.5 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100,
"global_step": 1065,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.601457471427555,
"learning_rate": 4.672897196261682e-08,
"logits/chosen": -2.861618995666504,
"logits/rejected": -2.8205904960632324,
"logps/chosen": -271.06011962890625,
"logps/rejected": -211.1704559326172,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/margins_max": 0.0,
"rewards/margins_min": 0.0,
"rewards/margins_std": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.03,
"grad_norm": 9.383478018784075,
"learning_rate": 4.6728971962616824e-07,
"logits/chosen": -2.834562063217163,
"logits/rejected": -2.7922489643096924,
"logps/chosen": -325.0357360839844,
"logps/rejected": -274.966796875,
"loss": 0.6931,
"rewards/accuracies": 0.5277777910232544,
"rewards/chosen": 0.00014581691357307136,
"rewards/margins": 0.0001575500500621274,
"rewards/margins_max": 0.0024408893659710884,
"rewards/margins_min": -0.002742145210504532,
"rewards/margins_std": 0.0023130779154598713,
"rewards/rejected": -1.173312557511963e-05,
"step": 10
},
{
"epoch": 0.06,
"grad_norm": 1.8412658637892019,
"learning_rate": 9.345794392523365e-07,
"logits/chosen": -2.7256200313568115,
"logits/rejected": -2.707315444946289,
"logps/chosen": -293.6407775878906,
"logps/rejected": -215.7820281982422,
"loss": 0.6922,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.0018517475109547377,
"rewards/margins": 0.0018822858110070229,
"rewards/margins_max": 0.005471331533044577,
"rewards/margins_min": -0.0010383042972534895,
"rewards/margins_std": 0.002963448641821742,
"rewards/rejected": -3.053832188015804e-05,
"step": 20
},
{
"epoch": 0.08,
"grad_norm": 2.174968684179302,
"learning_rate": 1.4018691588785047e-06,
"logits/chosen": -2.8197181224823,
"logits/rejected": -2.7506394386291504,
"logps/chosen": -302.8995666503906,
"logps/rejected": -232.47256469726562,
"loss": 0.6888,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.008063090965151787,
"rewards/margins": 0.007646501995623112,
"rewards/margins_max": 0.015395646914839745,
"rewards/margins_min": 0.0007923411321826279,
"rewards/margins_std": 0.006716990377753973,
"rewards/rejected": 0.0004165889695286751,
"step": 30
},
{
"epoch": 0.11,
"grad_norm": 1.7099389772513702,
"learning_rate": 1.869158878504673e-06,
"logits/chosen": -2.8403024673461914,
"logits/rejected": -2.759880781173706,
"logps/chosen": -275.9002380371094,
"logps/rejected": -225.5954132080078,
"loss": 0.6849,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.01584392786026001,
"rewards/margins": 0.014450883492827415,
"rewards/margins_max": 0.03173653036355972,
"rewards/margins_min": -2.6600435376167297e-05,
"rewards/margins_std": 0.014551711268723011,
"rewards/rejected": 0.0013930455315858126,
"step": 40
},
{
"epoch": 0.14,
"grad_norm": 2.1338277224043574,
"learning_rate": 2.3364485981308413e-06,
"logits/chosen": -2.8058629035949707,
"logits/rejected": -2.734032154083252,
"logps/chosen": -271.67120361328125,
"logps/rejected": -233.6707305908203,
"loss": 0.6753,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.038989000022411346,
"rewards/margins": 0.036923374980688095,
"rewards/margins_max": 0.08067025989294052,
"rewards/margins_min": 0.006618264131247997,
"rewards/margins_std": 0.03399632126092911,
"rewards/rejected": 0.0020656271371990442,
"step": 50
},
{
"epoch": 0.17,
"grad_norm": 2.3538977095192313,
"learning_rate": 2.8037383177570094e-06,
"logits/chosen": -2.739483594894409,
"logits/rejected": -2.7014524936676025,
"logps/chosen": -306.43206787109375,
"logps/rejected": -262.4384460449219,
"loss": 0.6619,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.0720754936337471,
"rewards/margins": 0.06874484568834305,
"rewards/margins_max": 0.12744362652301788,
"rewards/margins_min": 0.017528068274259567,
"rewards/margins_std": 0.04889371618628502,
"rewards/rejected": 0.0033306567929685116,
"step": 60
},
{
"epoch": 0.2,
"grad_norm": 1.673361144474326,
"learning_rate": 3.2710280373831774e-06,
"logits/chosen": -2.761547565460205,
"logits/rejected": -2.701035976409912,
"logps/chosen": -312.3368225097656,
"logps/rejected": -234.6005401611328,
"loss": 0.6461,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.1052437424659729,
"rewards/margins": 0.09483315050601959,
"rewards/margins_max": 0.19849452376365662,
"rewards/margins_min": 0.015507131814956665,
"rewards/margins_std": 0.08316393196582794,
"rewards/rejected": 0.010410590097308159,
"step": 70
},
{
"epoch": 0.23,
"grad_norm": 1.8350886553726478,
"learning_rate": 3.738317757009346e-06,
"logits/chosen": -2.7897353172302246,
"logits/rejected": -2.7348127365112305,
"logps/chosen": -310.0438537597656,
"logps/rejected": -290.1259765625,
"loss": 0.6264,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.11226633936166763,
"rewards/margins": 0.14973895251750946,
"rewards/margins_max": 0.30203038454055786,
"rewards/margins_min": 0.01934988982975483,
"rewards/margins_std": 0.13135038316249847,
"rewards/rejected": -0.03747261315584183,
"step": 80
},
{
"epoch": 0.25,
"grad_norm": 2.370057132370328,
"learning_rate": 4.205607476635514e-06,
"logits/chosen": -2.6879115104675293,
"logits/rejected": -2.650247812271118,
"logps/chosen": -264.0439453125,
"logps/rejected": -208.5765380859375,
"loss": 0.5913,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.11950834840536118,
"rewards/margins": 0.21540161967277527,
"rewards/margins_max": 0.40502986311912537,
"rewards/margins_min": 0.061323970556259155,
"rewards/margins_std": 0.15978315472602844,
"rewards/rejected": -0.09589327871799469,
"step": 90
},
{
"epoch": 0.28,
"grad_norm": 2.3715260848384814,
"learning_rate": 4.6728971962616825e-06,
"logits/chosen": -2.6909117698669434,
"logits/rejected": -2.6588971614837646,
"logps/chosen": -273.89483642578125,
"logps/rejected": -280.07440185546875,
"loss": 0.5649,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.07516907155513763,
"rewards/margins": 0.25403863191604614,
"rewards/margins_max": 0.5021854639053345,
"rewards/margins_min": 0.0338195376098156,
"rewards/margins_std": 0.20746219158172607,
"rewards/rejected": -0.1788695752620697,
"step": 100
},
{
"epoch": 0.28,
"eval_logits/chosen": -2.6587636470794678,
"eval_logits/rejected": -2.624938726425171,
"eval_logps/chosen": -294.36553955078125,
"eval_logps/rejected": -276.0350341796875,
"eval_loss": 0.6725258231163025,
"eval_rewards/accuracies": 0.6029999852180481,
"eval_rewards/chosen": -0.09772102534770966,
"eval_rewards/margins": 0.07684005051851273,
"eval_rewards/margins_max": 0.4634929597377777,
"eval_rewards/margins_min": -0.27960655093193054,
"eval_rewards/margins_std": 0.25082939863204956,
"eval_rewards/rejected": -0.17456106841564178,
"eval_runtime": 429.6888,
"eval_samples_per_second": 4.655,
"eval_steps_per_second": 0.291,
"step": 100
},
{
"epoch": 0.31,
"grad_norm": 4.663865383973278,
"learning_rate": 4.999879018839288e-06,
"logits/chosen": -2.696274518966675,
"logits/rejected": -2.6191954612731934,
"logps/chosen": -361.00341796875,
"logps/rejected": -324.7152404785156,
"loss": 0.4866,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.1360231339931488,
"rewards/margins": 0.5188378095626831,
"rewards/margins_max": 0.8877674341201782,
"rewards/margins_min": 0.15628832578659058,
"rewards/margins_std": 0.33103400468826294,
"rewards/rejected": -0.3828147053718567,
"step": 110
},
{
"epoch": 0.34,
"grad_norm": 2.659078012596696,
"learning_rate": 4.99772856836941e-06,
"logits/chosen": -2.6332004070281982,
"logits/rejected": -2.58402681350708,
"logps/chosen": -338.8200988769531,
"logps/rejected": -314.74078369140625,
"loss": 0.4569,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.09566140174865723,
"rewards/margins": 0.5811273455619812,
"rewards/margins_max": 1.0773193836212158,
"rewards/margins_min": 0.19689173996448517,
"rewards/margins_std": 0.4066368043422699,
"rewards/rejected": -0.4854659140110016,
"step": 120
},
{
"epoch": 0.37,
"grad_norm": 5.638039796957378,
"learning_rate": 4.992892309373227e-06,
"logits/chosen": -2.5800509452819824,
"logits/rejected": -2.5182909965515137,
"logps/chosen": -377.07415771484375,
"logps/rejected": -370.76007080078125,
"loss": 0.4111,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.05290098860859871,
"rewards/margins": 0.7561925649642944,
"rewards/margins_max": 1.318340539932251,
"rewards/margins_min": 0.10839029401540756,
"rewards/margins_std": 0.5403656363487244,
"rewards/rejected": -0.7032915949821472,
"step": 130
},
{
"epoch": 0.39,
"grad_norm": 3.185506159687688,
"learning_rate": 4.985375442281969e-06,
"logits/chosen": -2.529670476913452,
"logits/rejected": -2.505495548248291,
"logps/chosen": -311.046875,
"logps/rejected": -341.42388916015625,
"loss": 0.4278,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.07367168366909027,
"rewards/margins": 0.7894155383110046,
"rewards/margins_max": 1.4857099056243896,
"rewards/margins_min": 0.17245283722877502,
"rewards/margins_std": 0.6018984317779541,
"rewards/rejected": -0.715743899345398,
"step": 140
},
{
"epoch": 0.42,
"grad_norm": 6.522701528001161,
"learning_rate": 4.9751860499858175e-06,
"logits/chosen": -2.501380443572998,
"logits/rejected": -2.4765429496765137,
"logps/chosen": -295.21844482421875,
"logps/rejected": -294.5282897949219,
"loss": 0.4,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.0653342604637146,
"rewards/margins": 0.7491210699081421,
"rewards/margins_max": 1.2866442203521729,
"rewards/margins_min": 0.1819653958082199,
"rewards/margins_std": 0.5079216957092285,
"rewards/rejected": -0.8144553303718567,
"step": 150
},
{
"epoch": 0.45,
"grad_norm": 7.099952708342032,
"learning_rate": 4.962335089142376e-06,
"logits/chosen": -2.4243741035461426,
"logits/rejected": -2.382873058319092,
"logps/chosen": -311.75506591796875,
"logps/rejected": -337.52227783203125,
"loss": 0.357,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.11208178848028183,
"rewards/margins": 0.919207751750946,
"rewards/margins_max": 1.5249192714691162,
"rewards/margins_min": 0.28068000078201294,
"rewards/margins_std": 0.5570467710494995,
"rewards/rejected": -1.0312894582748413,
"step": 160
},
{
"epoch": 0.48,
"grad_norm": 15.17640060673072,
"learning_rate": 4.946836378394967e-06,
"logits/chosen": -2.3504722118377686,
"logits/rejected": -2.3078646659851074,
"logps/chosen": -345.75726318359375,
"logps/rejected": -430.4729919433594,
"loss": 0.3207,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.16135653853416443,
"rewards/margins": 1.1807162761688232,
"rewards/margins_max": 1.7726972103118896,
"rewards/margins_min": 0.30320629477500916,
"rewards/margins_std": 0.6691843867301941,
"rewards/rejected": -1.34207284450531,
"step": 170
},
{
"epoch": 0.51,
"grad_norm": 8.646835771533034,
"learning_rate": 4.928706583513441e-06,
"logits/chosen": -2.1459343433380127,
"logits/rejected": -2.055025577545166,
"logps/chosen": -378.0511779785156,
"logps/rejected": -468.014404296875,
"loss": 0.3002,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.46150344610214233,
"rewards/margins": 1.2767913341522217,
"rewards/margins_max": 2.0464911460876465,
"rewards/margins_min": 0.511903703212738,
"rewards/margins_std": 0.6761992573738098,
"rewards/rejected": -1.7382948398590088,
"step": 180
},
{
"epoch": 0.54,
"grad_norm": 4.978015250452758,
"learning_rate": 4.907965199473471e-06,
"logits/chosen": -1.873817801475525,
"logits/rejected": -1.7417463064193726,
"logps/chosen": -362.2750549316406,
"logps/rejected": -456.6219787597656,
"loss": 0.2276,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.40314167737960815,
"rewards/margins": 1.7882015705108643,
"rewards/margins_max": 2.7738163471221924,
"rewards/margins_min": 0.8575057983398438,
"rewards/margins_std": 0.8512203097343445,
"rewards/rejected": -2.191343069076538,
"step": 190
},
{
"epoch": 0.56,
"grad_norm": 9.452666973020474,
"learning_rate": 4.884634529493591e-06,
"logits/chosen": -1.8183701038360596,
"logits/rejected": -1.7065311670303345,
"logps/chosen": -416.6236877441406,
"logps/rejected": -549.5675048828125,
"loss": 0.2267,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7709085941314697,
"rewards/margins": 2.068791627883911,
"rewards/margins_max": 3.4109108448028564,
"rewards/margins_min": 0.585421621799469,
"rewards/margins_std": 1.2988938093185425,
"rewards/rejected": -2.839700222015381,
"step": 200
},
{
"epoch": 0.56,
"eval_logits/chosen": -1.6714030504226685,
"eval_logits/rejected": -1.6187551021575928,
"eval_logps/chosen": -474.511962890625,
"eval_logps/rejected": -497.81463623046875,
"eval_loss": 0.7397594451904297,
"eval_rewards/accuracies": 0.6439999938011169,
"eval_rewards/chosen": -1.899185299873352,
"eval_rewards/margins": 0.49317169189453125,
"eval_rewards/margins_max": 2.671410083770752,
"eval_rewards/margins_min": -1.8999947309494019,
"eval_rewards/margins_std": 1.5475962162017822,
"eval_rewards/rejected": -2.392357110977173,
"eval_runtime": 429.7827,
"eval_samples_per_second": 4.654,
"eval_steps_per_second": 0.291,
"step": 200
},
{
"epoch": 0.59,
"grad_norm": 8.87270228770415,
"learning_rate": 4.858739661052539e-06,
"logits/chosen": -1.511608600616455,
"logits/rejected": -1.4413245916366577,
"logps/chosen": -427.55413818359375,
"logps/rejected": -620.9583740234375,
"loss": 0.1779,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.0736979246139526,
"rewards/margins": 2.571638822555542,
"rewards/margins_max": 4.121321678161621,
"rewards/margins_min": 0.6724111437797546,
"rewards/margins_std": 1.547525405883789,
"rewards/rejected": -3.645336866378784,
"step": 210
},
{
"epoch": 0.62,
"grad_norm": 6.5430570772956,
"learning_rate": 4.830308438912687e-06,
"logits/chosen": -1.3631094694137573,
"logits/rejected": -1.1896626949310303,
"logps/chosen": -610.7598876953125,
"logps/rejected": -881.2283325195312,
"loss": 0.1472,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -2.5730366706848145,
"rewards/margins": 3.4959709644317627,
"rewards/margins_max": 5.009349822998047,
"rewards/margins_min": 1.5561037063598633,
"rewards/margins_std": 1.543906331062317,
"rewards/rejected": -6.069007396697998,
"step": 220
},
{
"epoch": 0.65,
"grad_norm": 11.567738598963295,
"learning_rate": 4.799371435178544e-06,
"logits/chosen": -1.2935478687286377,
"logits/rejected": -1.1057153940200806,
"logps/chosen": -756.6351318359375,
"logps/rejected": -983.3760986328125,
"loss": 0.2065,
"rewards/accuracies": 0.9375,
"rewards/chosen": -3.8495945930480957,
"rewards/margins": 3.363232135772705,
"rewards/margins_max": 5.4596266746521,
"rewards/margins_min": 0.4015835225582123,
"rewards/margins_std": 2.3402669429779053,
"rewards/rejected": -7.212827205657959,
"step": 230
},
{
"epoch": 0.68,
"grad_norm": 14.908052027638925,
"learning_rate": 4.765961916422575e-06,
"logits/chosen": -1.3409693241119385,
"logits/rejected": -1.2054760456085205,
"logps/chosen": -675.9885864257812,
"logps/rejected": -992.09375,
"loss": 0.1872,
"rewards/accuracies": 0.9375,
"rewards/chosen": -3.511915683746338,
"rewards/margins": 3.5772738456726074,
"rewards/margins_max": 5.6575751304626465,
"rewards/margins_min": 1.1669104099273682,
"rewards/margins_std": 2.040917158126831,
"rewards/rejected": -7.089189052581787,
"step": 240
},
{
"epoch": 0.7,
"grad_norm": 9.045837659115827,
"learning_rate": 4.730115807913627e-06,
"logits/chosen": -1.4189417362213135,
"logits/rejected": -1.2720701694488525,
"logps/chosen": -674.1248779296875,
"logps/rejected": -974.5089721679688,
"loss": 0.1161,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -3.62843656539917,
"rewards/margins": 3.6062092781066895,
"rewards/margins_max": 5.835866451263428,
"rewards/margins_min": 1.5058424472808838,
"rewards/margins_std": 1.905207633972168,
"rewards/rejected": -7.234647274017334,
"step": 250
},
{
"epoch": 0.73,
"grad_norm": 9.416665631409534,
"learning_rate": 4.691871654986485e-06,
"logits/chosen": -1.5399147272109985,
"logits/rejected": -1.3777363300323486,
"logps/chosen": -710.0699462890625,
"logps/rejected": -1064.373779296875,
"loss": 0.1185,
"rewards/accuracies": 0.9375,
"rewards/chosen": -4.1797637939453125,
"rewards/margins": 3.8746650218963623,
"rewards/margins_max": 5.889615058898926,
"rewards/margins_min": 1.7422330379486084,
"rewards/margins_std": 1.8929340839385986,
"rewards/rejected": -8.054429054260254,
"step": 260
},
{
"epoch": 0.76,
"grad_norm": 56.620770226956026,
"learning_rate": 4.651270581594054e-06,
"logits/chosen": -1.5505702495574951,
"logits/rejected": -1.439883828163147,
"logps/chosen": -655.2439575195312,
"logps/rejected": -985.9658203125,
"loss": 0.2278,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -3.4612839221954346,
"rewards/margins": 3.773378372192383,
"rewards/margins_max": 5.983767509460449,
"rewards/margins_min": 1.2523890733718872,
"rewards/margins_std": 2.149752378463745,
"rewards/rejected": -7.234662055969238,
"step": 270
},
{
"epoch": 0.79,
"grad_norm": 9.941259668614844,
"learning_rate": 4.6083562460867545e-06,
"logits/chosen": -1.4796500205993652,
"logits/rejected": -1.3813179731369019,
"logps/chosen": -780.708984375,
"logps/rejected": -1187.9755859375,
"loss": 0.1019,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -4.772520542144775,
"rewards/margins": 4.3812642097473145,
"rewards/margins_max": 6.6738691329956055,
"rewards/margins_min": 1.5353296995162964,
"rewards/margins_std": 2.349224805831909,
"rewards/rejected": -9.153783798217773,
"step": 280
},
{
"epoch": 0.82,
"grad_norm": 15.917323127244398,
"learning_rate": 4.563174794266684e-06,
"logits/chosen": -1.5392366647720337,
"logits/rejected": -1.4464019536972046,
"logps/chosen": -692.5883178710938,
"logps/rejected": -963.4357299804688,
"loss": 0.2109,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -4.056720733642578,
"rewards/margins": 2.9374544620513916,
"rewards/margins_max": 5.395993232727051,
"rewards/margins_min": 0.5851330161094666,
"rewards/margins_std": 2.2416446208953857,
"rewards/rejected": -6.994175910949707,
"step": 290
},
{
"epoch": 0.85,
"grad_norm": 11.476540562223757,
"learning_rate": 4.5157748097670125e-06,
"logits/chosen": -1.5950560569763184,
"logits/rejected": -1.4536263942718506,
"logps/chosen": -938.9279174804688,
"logps/rejected": -1296.3175048828125,
"loss": 0.1011,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -5.9241485595703125,
"rewards/margins": 4.1446919441223145,
"rewards/margins_max": 6.245351314544678,
"rewards/margins_min": 1.5497524738311768,
"rewards/margins_std": 2.1239330768585205,
"rewards/rejected": -10.068840026855469,
"step": 300
},
{
"epoch": 0.85,
"eval_logits/chosen": -1.527121663093567,
"eval_logits/rejected": -1.4628735780715942,
"eval_logps/chosen": -1076.8594970703125,
"eval_logps/rejected": -1150.1253662109375,
"eval_loss": 0.9229267835617065,
"eval_rewards/accuracies": 0.6470000147819519,
"eval_rewards/chosen": -7.9226603507995605,
"eval_rewards/margins": 0.992804765701294,
"eval_rewards/margins_max": 5.051580905914307,
"eval_rewards/margins_min": -3.0808050632476807,
"eval_rewards/margins_std": 2.7076425552368164,
"eval_rewards/rejected": -8.915464401245117,
"eval_runtime": 428.5869,
"eval_samples_per_second": 4.666,
"eval_steps_per_second": 0.292,
"step": 300
},
{
"epoch": 0.87,
"grad_norm": 5.622465452747041,
"learning_rate": 4.466207261809989e-06,
"logits/chosen": -1.625128149986267,
"logits/rejected": -1.4389641284942627,
"logps/chosen": -856.7615356445312,
"logps/rejected": -1196.298583984375,
"loss": 0.1046,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -5.589455604553223,
"rewards/margins": 4.231289863586426,
"rewards/margins_max": 6.631104946136475,
"rewards/margins_min": 1.5690397024154663,
"rewards/margins_std": 2.2617735862731934,
"rewards/rejected": -9.820745468139648,
"step": 310
},
{
"epoch": 0.9,
"grad_norm": 37.31728926549998,
"learning_rate": 4.414525450399713e-06,
"logits/chosen": -1.6272573471069336,
"logits/rejected": -1.5049296617507935,
"logps/chosen": -816.5538330078125,
"logps/rejected": -1220.7586669921875,
"loss": 0.1477,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.082805633544922,
"rewards/margins": 4.518080711364746,
"rewards/margins_max": 6.932036399841309,
"rewards/margins_min": 1.335532546043396,
"rewards/margins_std": 2.588527202606201,
"rewards/rejected": -9.600885391235352,
"step": 320
},
{
"epoch": 0.93,
"grad_norm": 5.37421997088044,
"learning_rate": 4.360784949008615e-06,
"logits/chosen": -1.8167043924331665,
"logits/rejected": -1.645042061805725,
"logps/chosen": -831.2081298828125,
"logps/rejected": -1208.270263671875,
"loss": 0.111,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -4.768882751464844,
"rewards/margins": 4.644423007965088,
"rewards/margins_max": 7.169321537017822,
"rewards/margins_min": 1.9509897232055664,
"rewards/margins_std": 2.3954663276672363,
"rewards/rejected": -9.413305282592773,
"step": 330
},
{
"epoch": 0.96,
"grad_norm": 6.115341044262903,
"learning_rate": 4.30504354481929e-06,
"logits/chosen": -1.7410516738891602,
"logits/rejected": -1.6124862432479858,
"logps/chosen": -741.0687866210938,
"logps/rejected": -1153.75390625,
"loss": 0.1044,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -4.412232875823975,
"rewards/margins": 4.613609790802002,
"rewards/margins_max": 6.75095272064209,
"rewards/margins_min": 1.8239591121673584,
"rewards/margins_std": 2.2112793922424316,
"rewards/rejected": -9.025842666625977,
"step": 340
},
{
"epoch": 0.99,
"grad_norm": 8.804373815951685,
"learning_rate": 4.247361176585904e-06,
"logits/chosen": -1.6892824172973633,
"logits/rejected": -1.567959189414978,
"logps/chosen": -782.8369140625,
"logps/rejected": -1259.287353515625,
"loss": 0.0817,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -4.678778648376465,
"rewards/margins": 5.049575328826904,
"rewards/margins_max": 6.799111366271973,
"rewards/margins_min": 2.811235189437866,
"rewards/margins_std": 1.8613466024398804,
"rewards/rejected": -9.728352546691895,
"step": 350
},
{
"epoch": 1.01,
"grad_norm": 22.068799726915795,
"learning_rate": 4.187799870182038e-06,
"logits/chosen": -1.7105668783187866,
"logits/rejected": -1.5694526433944702,
"logps/chosen": -762.7816162109375,
"logps/rejected": -1217.321044921875,
"loss": 0.1032,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -4.626708507537842,
"rewards/margins": 5.017427444458008,
"rewards/margins_max": 7.252201080322266,
"rewards/margins_min": 2.1052348613739014,
"rewards/margins_std": 2.383836507797241,
"rewards/rejected": -9.644137382507324,
"step": 360
},
{
"epoch": 1.04,
"grad_norm": 10.290993940063032,
"learning_rate": 4.1264236719042365e-06,
"logits/chosen": -1.7839868068695068,
"logits/rejected": -1.6120306253433228,
"logps/chosen": -801.9637451171875,
"logps/rejected": -1164.2841796875,
"loss": 0.1588,
"rewards/accuracies": 0.9375,
"rewards/chosen": -4.478141784667969,
"rewards/margins": 4.724917411804199,
"rewards/margins_max": 7.042010307312012,
"rewards/margins_min": 1.6127008199691772,
"rewards/margins_std": 2.5478250980377197,
"rewards/rejected": -9.203059196472168,
"step": 370
},
{
"epoch": 1.07,
"grad_norm": 0.5210034728309734,
"learning_rate": 4.063298579603001e-06,
"logits/chosen": -1.6867786645889282,
"logits/rejected": -1.4948246479034424,
"logps/chosen": -782.1204223632812,
"logps/rejected": -1320.6646728515625,
"loss": 0.0414,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.968000888824463,
"rewards/margins": 5.882228851318359,
"rewards/margins_max": 7.5977654457092285,
"rewards/margins_min": 3.866016387939453,
"rewards/margins_std": 1.67121160030365,
"rewards/rejected": -10.850229263305664,
"step": 380
},
{
"epoch": 1.1,
"grad_norm": 13.427534231462952,
"learning_rate": 3.998492471715272e-06,
"logits/chosen": -1.6988388299942017,
"logits/rejected": -1.5951545238494873,
"logps/chosen": -877.1390380859375,
"logps/rejected": -1402.83203125,
"loss": 0.0685,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.7572479248046875,
"rewards/margins": 5.6500396728515625,
"rewards/margins_max": 7.7508039474487305,
"rewards/margins_min": 3.0813615322113037,
"rewards/margins_std": 2.1727612018585205,
"rewards/rejected": -11.407288551330566,
"step": 390
},
{
"epoch": 1.13,
"grad_norm": 2.4923200900882536,
"learning_rate": 3.932075034274723e-06,
"logits/chosen": -1.695990800857544,
"logits/rejected": -1.5507137775421143,
"logps/chosen": -851.5281372070312,
"logps/rejected": -1309.4801025390625,
"loss": 0.1396,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -5.59298038482666,
"rewards/margins": 5.056563854217529,
"rewards/margins_max": 7.312686920166016,
"rewards/margins_min": 1.9958137273788452,
"rewards/margins_std": 2.379727840423584,
"rewards/rejected": -10.649542808532715,
"step": 400
},
{
"epoch": 1.13,
"eval_logits/chosen": -1.667060375213623,
"eval_logits/rejected": -1.5979340076446533,
"eval_logps/chosen": -1116.99462890625,
"eval_logps/rejected": -1209.6519775390625,
"eval_loss": 0.9696508646011353,
"eval_rewards/accuracies": 0.6779999732971191,
"eval_rewards/chosen": -8.324010848999023,
"eval_rewards/margins": 1.1867200136184692,
"eval_rewards/margins_max": 5.737547397613525,
"eval_rewards/margins_min": -3.3923180103302,
"eval_rewards/margins_std": 3.034074544906616,
"eval_rewards/rejected": -9.510730743408203,
"eval_runtime": 428.9385,
"eval_samples_per_second": 4.663,
"eval_steps_per_second": 0.291,
"step": 400
},
{
"epoch": 1.15,
"grad_norm": 6.537657300759786,
"learning_rate": 3.864117685978339e-06,
"logits/chosen": -1.705518126487732,
"logits/rejected": -1.5725294351577759,
"logps/chosen": -897.5511474609375,
"logps/rejected": -1346.69091796875,
"loss": 0.0939,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -6.0087456703186035,
"rewards/margins": 4.9630446434021,
"rewards/margins_max": 7.469670295715332,
"rewards/margins_min": 1.8066009283065796,
"rewards/margins_std": 2.5759172439575195,
"rewards/rejected": -10.971790313720703,
"step": 410
},
{
"epoch": 1.18,
"grad_norm": 8.302069752936143,
"learning_rate": 3.794693501389861e-06,
"logits/chosen": -1.6544630527496338,
"logits/rejected": -1.5131093263626099,
"logps/chosen": -929.0003051757812,
"logps/rejected": -1400.305419921875,
"loss": 0.0548,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -6.110236644744873,
"rewards/margins": 5.219418525695801,
"rewards/margins_max": 7.189891815185547,
"rewards/margins_min": 2.5033233165740967,
"rewards/margins_std": 2.120957374572754,
"rewards/rejected": -11.329654693603516,
"step": 420
},
{
"epoch": 1.21,
"grad_norm": 1.8619960615196327,
"learning_rate": 3.7238771323626822e-06,
"logits/chosen": -1.677835464477539,
"logits/rejected": -1.5019906759262085,
"logps/chosen": -999.4791259765625,
"logps/rejected": -1461.9598388671875,
"loss": 0.0742,
"rewards/accuracies": 0.9375,
"rewards/chosen": -6.562595367431641,
"rewards/margins": 5.4552412033081055,
"rewards/margins_max": 7.70766544342041,
"rewards/margins_min": 2.421809434890747,
"rewards/margins_std": 2.3860526084899902,
"rewards/rejected": -12.01783561706543,
"step": 430
},
{
"epoch": 1.24,
"grad_norm": 5.868124977117504,
"learning_rate": 3.651744727766676e-06,
"logits/chosen": -1.6518735885620117,
"logits/rejected": -1.497201681137085,
"logps/chosen": -996.3165893554688,
"logps/rejected": -1532.0673828125,
"loss": 0.0519,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -7.051259517669678,
"rewards/margins": 5.819365501403809,
"rewards/margins_max": 7.876378536224365,
"rewards/margins_min": 2.8851966857910156,
"rewards/margins_std": 2.268291473388672,
"rewards/rejected": -12.870625495910645,
"step": 440
},
{
"epoch": 1.27,
"grad_norm": 10.805483266746087,
"learning_rate": 3.57837385160529e-06,
"logits/chosen": -1.621983289718628,
"logits/rejected": -1.479236364364624,
"logps/chosen": -850.7548828125,
"logps/rejected": -1321.7237548828125,
"loss": 0.0641,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -5.389442443847656,
"rewards/margins": 5.238432884216309,
"rewards/margins_max": 7.7712531089782715,
"rewards/margins_min": 2.671607255935669,
"rewards/margins_std": 2.3198580741882324,
"rewards/rejected": -10.627875328063965,
"step": 450
},
{
"epoch": 1.3,
"grad_norm": 2.3174254055425183,
"learning_rate": 3.503843399610941e-06,
"logits/chosen": -1.6503874063491821,
"logits/rejected": -1.4967344999313354,
"logps/chosen": -1084.4403076171875,
"logps/rejected": -1629.10693359375,
"loss": 0.0463,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.2910637855529785,
"rewards/margins": 6.02940034866333,
"rewards/margins_max": 8.215094566345215,
"rewards/margins_min": 3.005946636199951,
"rewards/margins_std": 2.3927676677703857,
"rewards/rejected": -13.320462226867676,
"step": 460
},
{
"epoch": 1.32,
"grad_norm": 2.3666379183603676,
"learning_rate": 3.4282335144083985e-06,
"logits/chosen": -1.6708223819732666,
"logits/rejected": -1.5695239305496216,
"logps/chosen": -911.2108154296875,
"logps/rejected": -1447.9605712890625,
"loss": 0.046,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -5.949938774108887,
"rewards/margins": 6.058177471160889,
"rewards/margins_max": 8.24023723602295,
"rewards/margins_min": 3.495572566986084,
"rewards/margins_std": 2.158477783203125,
"rewards/rejected": -12.008115768432617,
"step": 470
},
{
"epoch": 1.35,
"grad_norm": 5.998521622676278,
"learning_rate": 3.351625499337395e-06,
"logits/chosen": -1.7066646814346313,
"logits/rejected": -1.5283164978027344,
"logps/chosen": -988.6871337890625,
"logps/rejected": -1536.398681640625,
"loss": 0.0589,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -6.574495792388916,
"rewards/margins": 6.3713812828063965,
"rewards/margins_max": 8.416463851928711,
"rewards/margins_min": 3.5055854320526123,
"rewards/margins_std": 2.2722041606903076,
"rewards/rejected": -12.945878982543945,
"step": 480
},
{
"epoch": 1.38,
"grad_norm": 2.3121304603384734,
"learning_rate": 3.2741017310271056e-06,
"logits/chosen": -1.6702913045883179,
"logits/rejected": -1.5516611337661743,
"logps/chosen": -985.5250244140625,
"logps/rejected": -1516.626708984375,
"loss": 0.0956,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.465473175048828,
"rewards/margins": 5.861384868621826,
"rewards/margins_max": 8.299718856811523,
"rewards/margins_min": 2.847576141357422,
"rewards/margins_std": 2.479989767074585,
"rewards/rejected": -12.326857566833496,
"step": 490
},
{
"epoch": 1.41,
"grad_norm": 7.590092284353976,
"learning_rate": 3.195745570816532e-06,
"logits/chosen": -1.580214500427246,
"logits/rejected": -1.4903004169464111,
"logps/chosen": -1054.06103515625,
"logps/rejected": -1565.05810546875,
"loss": 0.078,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.562958717346191,
"rewards/margins": 5.623807430267334,
"rewards/margins_max": 7.8602399826049805,
"rewards/margins_min": 2.7611820697784424,
"rewards/margins_std": 2.2531216144561768,
"rewards/rejected": -13.186765670776367,
"step": 500
},
{
"epoch": 1.41,
"eval_logits/chosen": -1.625468134880066,
"eval_logits/rejected": -1.5552992820739746,
"eval_logps/chosen": -1304.2783203125,
"eval_logps/rejected": -1404.43701171875,
"eval_loss": 1.0425163507461548,
"eval_rewards/accuracies": 0.6539999842643738,
"eval_rewards/chosen": -10.196849822998047,
"eval_rewards/margins": 1.2617301940917969,
"eval_rewards/margins_max": 6.198861598968506,
"eval_rewards/margins_min": -3.7952890396118164,
"eval_rewards/margins_std": 3.3487019538879395,
"eval_rewards/rejected": -11.45858097076416,
"eval_runtime": 428.5936,
"eval_samples_per_second": 4.666,
"eval_steps_per_second": 0.292,
"step": 500
},
{
"epoch": 1.44,
"grad_norm": 6.506789256384592,
"learning_rate": 3.116641275116018e-06,
"logits/chosen": -1.6757932901382446,
"logits/rejected": -1.4905316829681396,
"logps/chosen": -1033.5491943359375,
"logps/rejected": -1559.284912109375,
"loss": 0.0438,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.929083824157715,
"rewards/margins": 6.069881916046143,
"rewards/margins_max": 8.297313690185547,
"rewards/margins_min": 3.3508517742156982,
"rewards/margins_std": 2.215510606765747,
"rewards/rejected": -12.998964309692383,
"step": 510
},
{
"epoch": 1.46,
"grad_norm": 2.799331098085792,
"learning_rate": 3.0368739048062956e-06,
"logits/chosen": -1.759708046913147,
"logits/rejected": -1.5871171951293945,
"logps/chosen": -981.7990112304688,
"logps/rejected": -1526.3701171875,
"loss": 0.0613,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -6.813815116882324,
"rewards/margins": 6.061458587646484,
"rewards/margins_max": 8.386785507202148,
"rewards/margins_min": 3.3189563751220703,
"rewards/margins_std": 2.240609884262085,
"rewards/rejected": -12.875274658203125,
"step": 520
},
{
"epoch": 1.49,
"grad_norm": 5.0163934897293325,
"learning_rate": 2.956529233772492e-06,
"logits/chosen": -1.8143419027328491,
"logits/rejected": -1.6911777257919312,
"logps/chosen": -1105.9581298828125,
"logps/rejected": -1680.5181884765625,
"loss": 0.0611,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.72055721282959,
"rewards/margins": 6.132667064666748,
"rewards/margins_max": 8.71304702758789,
"rewards/margins_min": 2.979393243789673,
"rewards/margins_std": 2.5647242069244385,
"rewards/rejected": -13.85322380065918,
"step": 530
},
{
"epoch": 1.52,
"grad_norm": 8.7260672105137,
"learning_rate": 2.8756936566714317e-06,
"logits/chosen": -1.8574295043945312,
"logits/rejected": -1.6885216236114502,
"logps/chosen": -1066.135009765625,
"logps/rejected": -1536.2845458984375,
"loss": 0.0701,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.2507524490356445,
"rewards/margins": 5.671202659606934,
"rewards/margins_max": 8.193965911865234,
"rewards/margins_min": 2.9109997749328613,
"rewards/margins_std": 2.3909668922424316,
"rewards/rejected": -12.921956062316895,
"step": 540
},
{
"epoch": 1.55,
"grad_norm": 1.3009208627187219,
"learning_rate": 2.794454096031429e-06,
"logits/chosen": -1.9122663736343384,
"logits/rejected": -1.7744579315185547,
"logps/chosen": -971.1412963867188,
"logps/rejected": -1555.514404296875,
"loss": 0.0719,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -5.982313632965088,
"rewards/margins": 6.4228410720825195,
"rewards/margins_max": 8.959406852722168,
"rewards/margins_min": 3.101313352584839,
"rewards/margins_std": 2.621415615081787,
"rewards/rejected": -12.405153274536133,
"step": 550
},
{
"epoch": 1.58,
"grad_norm": 1.9328399730262527,
"learning_rate": 2.71289790878446e-06,
"logits/chosen": -1.8311843872070312,
"logits/rejected": -1.6815801858901978,
"logps/chosen": -1012.6105346679688,
"logps/rejected": -1622.107666015625,
"loss": 0.0697,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -7.073877811431885,
"rewards/margins": 6.487514495849609,
"rewards/margins_max": 8.834905624389648,
"rewards/margins_min": 3.4811978340148926,
"rewards/margins_std": 2.3898167610168457,
"rewards/rejected": -13.561391830444336,
"step": 560
},
{
"epoch": 1.61,
"grad_norm": 0.17510978882217287,
"learning_rate": 2.6311127923312156e-06,
"logits/chosen": -1.8733352422714233,
"logits/rejected": -1.731903314590454,
"logps/chosen": -1004.5771484375,
"logps/rejected": -1612.7529296875,
"loss": 0.042,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -6.689506530761719,
"rewards/margins": 6.522040367126465,
"rewards/margins_max": 8.876073837280273,
"rewards/margins_min": 3.256171464920044,
"rewards/margins_std": 2.6161324977874756,
"rewards/rejected": -13.211545944213867,
"step": 570
},
{
"epoch": 1.63,
"grad_norm": 12.817311644147658,
"learning_rate": 2.549186690240057e-06,
"logits/chosen": -1.7239491939544678,
"logits/rejected": -1.6188468933105469,
"logps/chosen": -1058.948486328125,
"logps/rejected": -1677.268310546875,
"loss": 0.0444,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -7.725058078765869,
"rewards/margins": 6.387824058532715,
"rewards/margins_max": 8.449275016784668,
"rewards/margins_min": 3.585833787918091,
"rewards/margins_std": 2.1896438598632812,
"rewards/rejected": -14.112882614135742,
"step": 580
},
{
"epoch": 1.66,
"grad_norm": 2.200716474214234,
"learning_rate": 2.4672076976812548e-06,
"logits/chosen": -1.7416937351226807,
"logits/rejected": -1.5824648141860962,
"logps/chosen": -1067.9490966796875,
"logps/rejected": -1658.8199462890625,
"loss": 0.0499,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.612107753753662,
"rewards/margins": 6.36210823059082,
"rewards/margins_max": 8.805683135986328,
"rewards/margins_min": 3.530320405960083,
"rewards/margins_std": 2.4696502685546875,
"rewards/rejected": -13.974217414855957,
"step": 590
},
{
"epoch": 1.69,
"grad_norm": 5.8486806702260115,
"learning_rate": 2.3852639666982218e-06,
"logits/chosen": -1.729406714439392,
"logits/rejected": -1.5859451293945312,
"logps/chosen": -1029.7244873046875,
"logps/rejected": -1697.3372802734375,
"loss": 0.0765,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -7.36349630355835,
"rewards/margins": 6.701470851898193,
"rewards/margins_max": 9.139188766479492,
"rewards/margins_min": 3.311300754547119,
"rewards/margins_std": 2.6797633171081543,
"rewards/rejected": -14.064967155456543,
"step": 600
},
{
"epoch": 1.69,
"eval_logits/chosen": -1.7166643142700195,
"eval_logits/rejected": -1.6462373733520508,
"eval_logps/chosen": -1312.563232421875,
"eval_logps/rejected": -1434.9708251953125,
"eval_loss": 1.171522855758667,
"eval_rewards/accuracies": 0.6610000133514404,
"eval_rewards/chosen": -10.279698371887207,
"eval_rewards/margins": 1.4842207431793213,
"eval_rewards/margins_max": 7.0606184005737305,
"eval_rewards/margins_min": -4.507997989654541,
"eval_rewards/margins_std": 3.902109384536743,
"eval_rewards/rejected": -11.76391887664795,
"eval_runtime": 428.7286,
"eval_samples_per_second": 4.665,
"eval_steps_per_second": 0.292,
"step": 600
},
{
"epoch": 1.72,
"grad_norm": 2.9887908700456385,
"learning_rate": 2.303443611417584e-06,
"logits/chosen": -1.7610228061676025,
"logits/rejected": -1.5708558559417725,
"logps/chosen": -1019.3812255859375,
"logps/rejected": -1596.500244140625,
"loss": 0.0749,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.916808128356934,
"rewards/margins": 6.629319190979004,
"rewards/margins_max": 9.27853775024414,
"rewards/margins_min": 3.639543056488037,
"rewards/margins_std": 2.523704767227173,
"rewards/rejected": -13.546127319335938,
"step": 610
},
{
"epoch": 1.75,
"grad_norm": 0.09345851725609673,
"learning_rate": 2.2218346133000264e-06,
"logits/chosen": -1.8310705423355103,
"logits/rejected": -1.6571632623672485,
"logps/chosen": -1089.475341796875,
"logps/rejected": -1714.6595458984375,
"loss": 0.0874,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.478503227233887,
"rewards/margins": 6.835662841796875,
"rewards/margins_max": 9.080436706542969,
"rewards/margins_min": 3.885005235671997,
"rewards/margins_std": 2.379390239715576,
"rewards/rejected": -14.314167976379395,
"step": 620
},
{
"epoch": 1.77,
"grad_norm": 13.20707399800831,
"learning_rate": 2.140524726533792e-06,
"logits/chosen": -1.787641167640686,
"logits/rejected": -1.661877989768982,
"logps/chosen": -947.0399169921875,
"logps/rejected": -1524.828369140625,
"loss": 0.0539,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.729840278625488,
"rewards/margins": 6.693819999694824,
"rewards/margins_max": 9.224821090698242,
"rewards/margins_min": 3.935499906539917,
"rewards/margins_std": 2.420135021209717,
"rewards/rejected": -12.423660278320312,
"step": 630
},
{
"epoch": 1.8,
"grad_norm": 3.4772116065816014,
"learning_rate": 2.059601383672566e-06,
"logits/chosen": -1.8164135217666626,
"logits/rejected": -1.6359403133392334,
"logps/chosen": -1021.05322265625,
"logps/rejected": -1599.884033203125,
"loss": 0.04,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.091916561126709,
"rewards/margins": 6.576811790466309,
"rewards/margins_max": 8.615550994873047,
"rewards/margins_min": 4.3320631980896,
"rewards/margins_std": 2.022761821746826,
"rewards/rejected": -13.668729782104492,
"step": 640
},
{
"epoch": 1.83,
"grad_norm": 1.910640538145904,
"learning_rate": 1.9791516016192214e-06,
"logits/chosen": -1.7743873596191406,
"logits/rejected": -1.6393556594848633,
"logps/chosen": -1051.207763671875,
"logps/rejected": -1660.7542724609375,
"loss": 0.0612,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -7.210175514221191,
"rewards/margins": 6.484718322753906,
"rewards/margins_max": 9.02783489227295,
"rewards/margins_min": 3.8449549674987793,
"rewards/margins_std": 2.3319091796875,
"rewards/rejected": -13.694894790649414,
"step": 650
},
{
"epoch": 1.86,
"grad_norm": 0.4156394296306771,
"learning_rate": 1.8992618880565039e-06,
"logits/chosen": -1.6157350540161133,
"logits/rejected": -1.5133240222930908,
"logps/chosen": -1027.439453125,
"logps/rejected": -1595.850830078125,
"loss": 0.0679,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -7.812713623046875,
"rewards/margins": 5.9946794509887695,
"rewards/margins_max": 8.839725494384766,
"rewards/margins_min": 2.7734172344207764,
"rewards/margins_std": 2.6815245151519775,
"rewards/rejected": -13.807393074035645,
"step": 660
},
{
"epoch": 1.89,
"grad_norm": 1.3243616077705502,
"learning_rate": 1.8200181484252888e-06,
"logits/chosen": -1.809934377670288,
"logits/rejected": -1.6905943155288696,
"logps/chosen": -1084.2518310546875,
"logps/rejected": -1680.405029296875,
"loss": 0.0558,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -7.783478736877441,
"rewards/margins": 6.605474948883057,
"rewards/margins_max": 9.080102920532227,
"rewards/margins_min": 3.5593819618225098,
"rewards/margins_std": 2.538597822189331,
"rewards/rejected": -14.388954162597656,
"step": 670
},
{
"epoch": 1.92,
"grad_norm": 4.935603103347596,
"learning_rate": 1.7415055935504234e-06,
"logits/chosen": -1.845766305923462,
"logits/rejected": -1.6762946844100952,
"logps/chosen": -1092.99609375,
"logps/rejected": -1732.690185546875,
"loss": 0.0317,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.822856903076172,
"rewards/margins": 6.983065605163574,
"rewards/margins_max": 9.305206298828125,
"rewards/margins_min": 4.250351428985596,
"rewards/margins_std": 2.260586738586426,
"rewards/rejected": -14.80592155456543,
"step": 680
},
{
"epoch": 1.94,
"grad_norm": 7.946766648058278,
"learning_rate": 1.6638086480134954e-06,
"logits/chosen": -1.7061771154403687,
"logits/rejected": -1.5929887294769287,
"logps/chosen": -1015.9044189453125,
"logps/rejected": -1602.688232421875,
"loss": 0.0565,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -7.775514125823975,
"rewards/margins": 6.233893394470215,
"rewards/margins_max": 9.065168380737305,
"rewards/margins_min": 2.735471725463867,
"rewards/margins_std": 2.843477725982666,
"rewards/rejected": -14.009408950805664,
"step": 690
},
{
"epoch": 1.97,
"grad_norm": 14.357423867713438,
"learning_rate": 1.5870108593710473e-06,
"logits/chosen": -1.6323438882827759,
"logits/rejected": -1.4323724508285522,
"logps/chosen": -1116.0875244140625,
"logps/rejected": -1646.796875,
"loss": 0.0521,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -8.165563583374023,
"rewards/margins": 6.203519821166992,
"rewards/margins_max": 8.34221363067627,
"rewards/margins_min": 3.4348888397216797,
"rewards/margins_std": 2.265625476837158,
"rewards/rejected": -14.369084358215332,
"step": 700
},
{
"epoch": 1.97,
"eval_logits/chosen": -1.7082782983779907,
"eval_logits/rejected": -1.6383651494979858,
"eval_logps/chosen": -1494.51513671875,
"eval_logps/rejected": -1592.3466796875,
"eval_loss": 1.10393488407135,
"eval_rewards/accuracies": 0.6510000228881836,
"eval_rewards/chosen": -12.099217414855957,
"eval_rewards/margins": 1.2384591102600098,
"eval_rewards/margins_max": 6.618937015533447,
"eval_rewards/margins_min": -4.080103874206543,
"eval_rewards/margins_std": 3.540152072906494,
"eval_rewards/rejected": -13.337677001953125,
"eval_runtime": 428.89,
"eval_samples_per_second": 4.663,
"eval_steps_per_second": 0.291,
"step": 700
},
{
"epoch": 2.0,
"grad_norm": 1.9864414899165639,
"learning_rate": 1.511194808315853e-06,
"logits/chosen": -1.6388124227523804,
"logits/rejected": -1.5256621837615967,
"logps/chosen": -1023.98486328125,
"logps/rejected": -1670.1683349609375,
"loss": 0.0281,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.728632926940918,
"rewards/margins": 6.720816135406494,
"rewards/margins_max": 8.741449356079102,
"rewards/margins_min": 3.9726672172546387,
"rewards/margins_std": 2.161562204360962,
"rewards/rejected": -14.44944953918457,
"step": 710
},
{
"epoch": 2.03,
"grad_norm": 2.4447392288346776,
"learning_rate": 1.4364420198778662e-06,
"logits/chosen": -1.9084421396255493,
"logits/rejected": -1.7372974157333374,
"logps/chosen": -1069.986572265625,
"logps/rejected": -1748.271484375,
"loss": 0.0222,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.059340000152588,
"rewards/margins": 7.3868408203125,
"rewards/margins_max": 9.624174118041992,
"rewards/margins_min": 5.218744277954102,
"rewards/margins_std": 2.0435428619384766,
"rewards/rejected": -14.44618034362793,
"step": 720
},
{
"epoch": 2.06,
"grad_norm": 0.3283356036109342,
"learning_rate": 1.3628328757603243e-06,
"logits/chosen": -1.7824742794036865,
"logits/rejected": -1.607553243637085,
"logps/chosen": -1106.8240966796875,
"logps/rejected": -1757.1396484375,
"loss": 0.0279,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.68569278717041,
"rewards/margins": 7.141098976135254,
"rewards/margins_max": 9.206721305847168,
"rewards/margins_min": 4.535717010498047,
"rewards/margins_std": 2.135599374771118,
"rewards/rejected": -14.826791763305664,
"step": 730
},
{
"epoch": 2.08,
"grad_norm": 0.466472720676363,
"learning_rate": 1.2904465279052725e-06,
"logits/chosen": -1.7631629705429077,
"logits/rejected": -1.602264165878296,
"logps/chosen": -1061.498291015625,
"logps/rejected": -1701.393798828125,
"loss": 0.0468,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.410794734954834,
"rewards/margins": 6.905499458312988,
"rewards/margins_max": 9.171496391296387,
"rewards/margins_min": 3.907447099685669,
"rewards/margins_std": 2.4243547916412354,
"rewards/rejected": -14.316293716430664,
"step": 740
},
{
"epoch": 2.11,
"grad_norm": 0.18287903072298267,
"learning_rate": 1.219360813381446e-06,
"logits/chosen": -1.707327127456665,
"logits/rejected": -1.5934031009674072,
"logps/chosen": -995.9183349609375,
"logps/rejected": -1665.1839599609375,
"loss": 0.0293,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.091825008392334,
"rewards/margins": 7.211878776550293,
"rewards/margins_max": 9.057371139526367,
"rewards/margins_min": 5.072964668273926,
"rewards/margins_std": 1.7897049188613892,
"rewards/rejected": -14.303705215454102,
"step": 750
},
{
"epoch": 2.14,
"grad_norm": 3.972318831886565,
"learning_rate": 1.1496521706860392e-06,
"logits/chosen": -1.6829960346221924,
"logits/rejected": -1.5544617176055908,
"logps/chosen": -1081.756103515625,
"logps/rejected": -1768.875732421875,
"loss": 0.0206,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.939679145812988,
"rewards/margins": 7.1435041427612305,
"rewards/margins_max": 9.630821228027344,
"rewards/margins_min": 4.051230430603027,
"rewards/margins_std": 2.557648181915283,
"rewards/rejected": -15.083181381225586,
"step": 760
},
{
"epoch": 2.17,
"grad_norm": 0.15453005325463406,
"learning_rate": 1.0813955575503588e-06,
"logits/chosen": -1.7566072940826416,
"logits/rejected": -1.5845129489898682,
"logps/chosen": -1044.108154296875,
"logps/rejected": -1700.744140625,
"loss": 0.0287,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.392706871032715,
"rewards/margins": 7.490866661071777,
"rewards/margins_max": 9.396993637084961,
"rewards/margins_min": 5.832265377044678,
"rewards/margins_std": 1.631260871887207,
"rewards/rejected": -14.883572578430176,
"step": 770
},
{
"epoch": 2.2,
"grad_norm": 5.041769273622829,
"learning_rate": 1.0146643703377488e-06,
"logits/chosen": -1.817198395729065,
"logits/rejected": -1.6213362216949463,
"logps/chosen": -1110.951416015625,
"logps/rejected": -1716.0474853515625,
"loss": 0.0267,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.933794975280762,
"rewards/margins": 6.956693172454834,
"rewards/margins_max": 9.445747375488281,
"rewards/margins_min": 4.286118984222412,
"rewards/margins_std": 2.26704740524292,
"rewards/rejected": -14.89048957824707,
"step": 780
},
{
"epoch": 2.23,
"grad_norm": 0.028319940482359873,
"learning_rate": 9.495303651204496e-07,
"logits/chosen": -1.7651485204696655,
"logits/rejected": -1.5782719850540161,
"logps/chosen": -1116.5997314453125,
"logps/rejected": -1775.474853515625,
"loss": 0.0151,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.942474365234375,
"rewards/margins": 7.359992027282715,
"rewards/margins_max": 9.237930297851562,
"rewards/margins_min": 5.0483293533325195,
"rewards/margins_std": 1.877681016921997,
"rewards/rejected": -15.302465438842773,
"step": 790
},
{
"epoch": 2.25,
"grad_norm": 5.560910630060733,
"learning_rate": 8.860635805202616e-07,
"logits/chosen": -1.7791054248809814,
"logits/rejected": -1.6470226049423218,
"logps/chosen": -1128.700439453125,
"logps/rejected": -1887.7562255859375,
"loss": 0.0325,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.071495056152344,
"rewards/margins": 7.690678596496582,
"rewards/margins_max": 10.314142227172852,
"rewards/margins_min": 4.382508754730225,
"rewards/margins_std": 2.6400108337402344,
"rewards/rejected": -15.762173652648926,
"step": 800
},
{
"epoch": 2.25,
"eval_logits/chosen": -1.7630056142807007,
"eval_logits/rejected": -1.6934845447540283,
"eval_logps/chosen": -1308.7979736328125,
"eval_logps/rejected": -1442.1707763671875,
"eval_loss": 1.2213647365570068,
"eval_rewards/accuracies": 0.6600000262260437,
"eval_rewards/chosen": -10.242044448852539,
"eval_rewards/margins": 1.5938735008239746,
"eval_rewards/margins_max": 7.453612327575684,
"eval_rewards/margins_min": -4.738708972930908,
"eval_rewards/margins_std": 4.117012023925781,
"eval_rewards/rejected": -11.835918426513672,
"eval_runtime": 428.6302,
"eval_samples_per_second": 4.666,
"eval_steps_per_second": 0.292,
"step": 800
},
{
"epoch": 2.28,
"grad_norm": 2.130920241454253,
"learning_rate": 8.24332262395994e-07,
"logits/chosen": -1.8262383937835693,
"logits/rejected": -1.701570749282837,
"logps/chosen": -990.9318237304688,
"logps/rejected": -1709.775390625,
"loss": 0.0187,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.961573600769043,
"rewards/margins": 7.437863826751709,
"rewards/margins_max": 9.480931282043457,
"rewards/margins_min": 4.916778087615967,
"rewards/margins_std": 2.042966365814209,
"rewards/rejected": -14.399436950683594,
"step": 810
},
{
"epoch": 2.31,
"grad_norm": 2.7824509845813816,
"learning_rate": 7.644027904586587e-07,
"logits/chosen": -1.7199032306671143,
"logits/rejected": -1.584393858909607,
"logps/chosen": -1131.29541015625,
"logps/rejected": -1871.33984375,
"loss": 0.0245,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.013903617858887,
"rewards/margins": 7.894224643707275,
"rewards/margins_max": 10.34322738647461,
"rewards/margins_min": 5.228058338165283,
"rewards/margins_std": 2.269243001937866,
"rewards/rejected": -15.908126831054688,
"step": 820
},
{
"epoch": 2.34,
"grad_norm": 2.942249921804053,
"learning_rate": 7.06339606893347e-07,
"logits/chosen": -1.7625993490219116,
"logits/rejected": -1.552851915359497,
"logps/chosen": -1175.3865966796875,
"logps/rejected": -1861.589599609375,
"loss": 0.0079,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.902280330657959,
"rewards/margins": 8.019886016845703,
"rewards/margins_max": 10.1281156539917,
"rewards/margins_min": 6.0280256271362305,
"rewards/margins_std": 1.8365955352783203,
"rewards/rejected": -15.92216682434082,
"step": 830
},
{
"epoch": 2.37,
"grad_norm": 0.9426802566028485,
"learning_rate": 6.502051470645149e-07,
"logits/chosen": -1.780339241027832,
"logits/rejected": -1.6216917037963867,
"logps/chosen": -1083.676513671875,
"logps/rejected": -1733.9345703125,
"loss": 0.0234,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.57614278793335,
"rewards/margins": 7.293878078460693,
"rewards/margins_max": 9.180914878845215,
"rewards/margins_min": 4.970505714416504,
"rewards/margins_std": 1.8781248331069946,
"rewards/rejected": -14.870019912719727,
"step": 840
},
{
"epoch": 2.39,
"grad_norm": 0.2273620604649508,
"learning_rate": 5.960597723792194e-07,
"logits/chosen": -1.7474027872085571,
"logits/rejected": -1.575292944908142,
"logps/chosen": -1081.188232421875,
"logps/rejected": -1787.9605712890625,
"loss": 0.0229,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.850655555725098,
"rewards/margins": 7.63167667388916,
"rewards/margins_max": 10.038192749023438,
"rewards/margins_min": 5.033900737762451,
"rewards/margins_std": 2.2446444034576416,
"rewards/rejected": -15.482332229614258,
"step": 850
},
{
"epoch": 2.42,
"grad_norm": 2.008660400899101,
"learning_rate": 5.43961705380465e-07,
"logits/chosen": -1.791469931602478,
"logits/rejected": -1.6313838958740234,
"logps/chosen": -1132.4666748046875,
"logps/rejected": -1828.349609375,
"loss": 0.0326,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.8505706787109375,
"rewards/margins": 7.863633632659912,
"rewards/margins_max": 10.446196556091309,
"rewards/margins_min": 4.516094207763672,
"rewards/margins_std": 2.653343915939331,
"rewards/rejected": -15.714204788208008,
"step": 860
},
{
"epoch": 2.45,
"grad_norm": 1.9443236752501327,
"learning_rate": 4.939669671404871e-07,
"logits/chosen": -1.708809494972229,
"logits/rejected": -1.5626459121704102,
"logps/chosen": -1073.6954345703125,
"logps/rejected": -1811.253662109375,
"loss": 0.0095,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.609139919281006,
"rewards/margins": 7.719372749328613,
"rewards/margins_max": 9.861176490783691,
"rewards/margins_min": 5.364067077636719,
"rewards/margins_std": 2.00132155418396,
"rewards/rejected": -15.328509330749512,
"step": 870
},
{
"epoch": 2.48,
"grad_norm": 2.06741221987676,
"learning_rate": 4.461293170212644e-07,
"logits/chosen": -1.8483781814575195,
"logits/rejected": -1.6546274423599243,
"logps/chosen": -1123.468017578125,
"logps/rejected": -1798.621826171875,
"loss": 0.0322,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.950200080871582,
"rewards/margins": 7.411231994628906,
"rewards/margins_max": 10.043291091918945,
"rewards/margins_min": 4.137426853179932,
"rewards/margins_std": 2.554241418838501,
"rewards/rejected": -15.361432075500488,
"step": 880
},
{
"epoch": 2.51,
"grad_norm": 0.8360988782034983,
"learning_rate": 4.005001948670606e-07,
"logits/chosen": -1.813595175743103,
"logits/rejected": -1.6409099102020264,
"logps/chosen": -1167.838623046875,
"logps/rejected": -1849.715576171875,
"loss": 0.0177,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.237103462219238,
"rewards/margins": 7.6171464920043945,
"rewards/margins_max": 10.030054092407227,
"rewards/margins_min": 5.162562847137451,
"rewards/margins_std": 2.175448417663574,
"rewards/rejected": -15.854248046875,
"step": 890
},
{
"epoch": 2.54,
"grad_norm": 0.28012086124588453,
"learning_rate": 3.571286656911377e-07,
"logits/chosen": -1.765481948852539,
"logits/rejected": -1.5610095262527466,
"logps/chosen": -1176.97509765625,
"logps/rejected": -1906.4827880859375,
"loss": 0.0256,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.486291885375977,
"rewards/margins": 7.7398223876953125,
"rewards/margins_max": 10.43345832824707,
"rewards/margins_min": 4.932800769805908,
"rewards/margins_std": 2.4372851848602295,
"rewards/rejected": -16.226112365722656,
"step": 900
},
{
"epoch": 2.54,
"eval_logits/chosen": -1.7013623714447021,
"eval_logits/rejected": -1.6318581104278564,
"eval_logps/chosen": -1451.88916015625,
"eval_logps/rejected": -1581.395751953125,
"eval_loss": 1.202013373374939,
"eval_rewards/accuracies": 0.6620000004768372,
"eval_rewards/chosen": -11.672956466674805,
"eval_rewards/margins": 1.555212140083313,
"eval_rewards/margins_max": 7.462009906768799,
"eval_rewards/margins_min": -4.611362457275391,
"eval_rewards/margins_std": 4.051472187042236,
"eval_rewards/rejected": -13.228167533874512,
"eval_runtime": 428.5009,
"eval_samples_per_second": 4.667,
"eval_steps_per_second": 0.292,
"step": 900
},
{
"epoch": 2.56,
"grad_norm": 0.6107279357125659,
"learning_rate": 3.1606136691612555e-07,
"logits/chosen": -1.7235673666000366,
"logits/rejected": -1.5583069324493408,
"logps/chosen": -1131.2056884765625,
"logps/rejected": -1782.4332275390625,
"loss": 0.0174,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -8.073932647705078,
"rewards/margins": 7.4360671043396,
"rewards/margins_max": 9.632651329040527,
"rewards/margins_min": 5.327752113342285,
"rewards/margins_std": 1.8935825824737549,
"rewards/rejected": -15.50999927520752,
"step": 910
},
{
"epoch": 2.59,
"grad_norm": 0.00966975682935343,
"learning_rate": 2.773424582247844e-07,
"logits/chosen": -1.6917803287506104,
"logits/rejected": -1.4805718660354614,
"logps/chosen": -1141.4068603515625,
"logps/rejected": -1758.5318603515625,
"loss": 0.0178,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.279281616210938,
"rewards/margins": 7.198742866516113,
"rewards/margins_max": 9.452996253967285,
"rewards/margins_min": 4.647868633270264,
"rewards/margins_std": 2.1528563499450684,
"rewards/rejected": -15.478025436401367,
"step": 920
},
{
"epoch": 2.62,
"grad_norm": 3.0376153555107446,
"learning_rate": 2.410135740750821e-07,
"logits/chosen": -1.7053037881851196,
"logits/rejected": -1.5509663820266724,
"logps/chosen": -1090.0576171875,
"logps/rejected": -1777.7945556640625,
"loss": 0.043,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -8.083145141601562,
"rewards/margins": 7.410282135009766,
"rewards/margins_max": 9.764742851257324,
"rewards/margins_min": 5.139273643493652,
"rewards/margins_std": 2.1024787425994873,
"rewards/rejected": -15.493428230285645,
"step": 930
},
{
"epoch": 2.65,
"grad_norm": 0.6859350599797326,
"learning_rate": 2.0711377893064182e-07,
"logits/chosen": -1.8094221353530884,
"logits/rejected": -1.6414306163787842,
"logps/chosen": -1164.137451171875,
"logps/rejected": -1852.5625,
"loss": 0.0298,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.158674240112305,
"rewards/margins": 7.432664394378662,
"rewards/margins_max": 10.111716270446777,
"rewards/margins_min": 4.017355442047119,
"rewards/margins_std": 2.706058979034424,
"rewards/rejected": -15.591337203979492,
"step": 940
},
{
"epoch": 2.68,
"grad_norm": 6.314035361122387,
"learning_rate": 1.756795252547111e-07,
"logits/chosen": -1.665837287902832,
"logits/rejected": -1.5277420282363892,
"logps/chosen": -1078.7557373046875,
"logps/rejected": -1684.4287109375,
"loss": 0.0295,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -8.158330917358398,
"rewards/margins": 6.608637809753418,
"rewards/margins_max": 9.02342700958252,
"rewards/margins_min": 3.459864854812622,
"rewards/margins_std": 2.52087664604187,
"rewards/rejected": -14.766969680786133,
"step": 950
},
{
"epoch": 2.7,
"grad_norm": 7.031354165895073,
"learning_rate": 1.4674461431281013e-07,
"logits/chosen": -1.7678325176239014,
"logits/rejected": -1.6092376708984375,
"logps/chosen": -1103.3350830078125,
"logps/rejected": -1758.6500244140625,
"loss": 0.0242,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.200953483581543,
"rewards/margins": 7.219841957092285,
"rewards/margins_max": 9.616026878356934,
"rewards/margins_min": 4.519529819488525,
"rewards/margins_std": 2.263463258743286,
"rewards/rejected": -15.420794486999512,
"step": 960
},
{
"epoch": 2.73,
"grad_norm": 0.3134845483065753,
"learning_rate": 1.2034015982622243e-07,
"logits/chosen": -1.7572071552276611,
"logits/rejected": -1.5487779378890991,
"logps/chosen": -1225.2569580078125,
"logps/rejected": -1896.434326171875,
"loss": 0.0271,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.728785514831543,
"rewards/margins": 7.427072048187256,
"rewards/margins_max": 9.873006820678711,
"rewards/margins_min": 4.541165351867676,
"rewards/margins_std": 2.364122152328491,
"rewards/rejected": -16.155858993530273,
"step": 970
},
{
"epoch": 2.76,
"grad_norm": 0.3690247126654468,
"learning_rate": 9.649455451539419e-08,
"logits/chosen": -1.6380853652954102,
"logits/rejected": -1.4841035604476929,
"logps/chosen": -1118.8951416015625,
"logps/rejected": -1833.6126708984375,
"loss": 0.0234,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.304391860961914,
"rewards/margins": 7.7766313552856445,
"rewards/margins_max": 10.320856094360352,
"rewards/margins_min": 5.046825885772705,
"rewards/margins_std": 2.3247740268707275,
"rewards/rejected": -16.081022262573242,
"step": 980
},
{
"epoch": 2.79,
"grad_norm": 0.035471082675790036,
"learning_rate": 7.523343956923196e-08,
"logits/chosen": -1.7599372863769531,
"logits/rejected": -1.5641086101531982,
"logps/chosen": -1154.5972900390625,
"logps/rejected": -1892.8466796875,
"loss": 0.0177,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.36001205444336,
"rewards/margins": 7.979167938232422,
"rewards/margins_max": 10.33701229095459,
"rewards/margins_min": 5.4104814529418945,
"rewards/margins_std": 2.1920626163482666,
"rewards/rejected": -16.33917999267578,
"step": 990
},
{
"epoch": 2.82,
"grad_norm": 3.7962060660896455,
"learning_rate": 5.657967707312195e-08,
"logits/chosen": -1.6692126989364624,
"logits/rejected": -1.5857051610946655,
"logps/chosen": -1184.306884765625,
"logps/rejected": -1848.2109375,
"loss": 0.0246,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.908744812011719,
"rewards/margins": 6.874230861663818,
"rewards/margins_max": 9.302106857299805,
"rewards/margins_min": 3.831719160079956,
"rewards/margins_std": 2.495060443878174,
"rewards/rejected": -15.782976150512695,
"step": 1000
},
{
"epoch": 2.82,
"eval_logits/chosen": -1.6955701112747192,
"eval_logits/rejected": -1.6262598037719727,
"eval_logps/chosen": -1466.096923828125,
"eval_logps/rejected": -1594.279541015625,
"eval_loss": 1.2153818607330322,
"eval_rewards/accuracies": 0.6570000052452087,
"eval_rewards/chosen": -11.815034866333008,
"eval_rewards/margins": 1.5419700145721436,
"eval_rewards/margins_max": 7.536928653717041,
"eval_rewards/margins_min": -4.68462610244751,
"eval_rewards/margins_std": 4.09072208404541,
"eval_rewards/rejected": -13.357006072998047,
"eval_runtime": 428.679,
"eval_samples_per_second": 4.665,
"eval_steps_per_second": 0.292,
"step": 1000
},
{
"epoch": 2.85,
"grad_norm": 0.6729976013886217,
"learning_rate": 4.055332542531959e-08,
"logits/chosen": -1.7815234661102295,
"logits/rejected": -1.622179627418518,
"logps/chosen": -1156.6016845703125,
"logps/rejected": -1884.806884765625,
"loss": 0.036,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.329094886779785,
"rewards/margins": 7.462734222412109,
"rewards/margins_max": 10.055309295654297,
"rewards/margins_min": 4.672645568847656,
"rewards/margins_std": 2.401289701461792,
"rewards/rejected": -15.791829109191895,
"step": 1010
},
{
"epoch": 2.87,
"grad_norm": 0.3931332359603542,
"learning_rate": 2.7171617768147472e-08,
"logits/chosen": -1.757817268371582,
"logits/rejected": -1.6103594303131104,
"logps/chosen": -1205.610107421875,
"logps/rejected": -1883.1265869140625,
"loss": 0.0152,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -8.91980266571045,
"rewards/margins": 7.268828392028809,
"rewards/margins_max": 9.548690795898438,
"rewards/margins_min": 4.854549884796143,
"rewards/margins_std": 2.135824203491211,
"rewards/rejected": -16.18863296508789,
"step": 1020
},
{
"epoch": 2.9,
"grad_norm": 0.22753287376533807,
"learning_rate": 1.6448943457189616e-08,
"logits/chosen": -1.680837869644165,
"logits/rejected": -1.540766716003418,
"logps/chosen": -1161.3184814453125,
"logps/rejected": -1843.625,
"loss": 0.0264,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -8.517123222351074,
"rewards/margins": 7.349859714508057,
"rewards/margins_max": 10.050976753234863,
"rewards/margins_min": 4.578632354736328,
"rewards/margins_std": 2.4442994594573975,
"rewards/rejected": -15.866983413696289,
"step": 1030
},
{
"epoch": 2.93,
"grad_norm": 0.6569270546900866,
"learning_rate": 8.39683258841123e-09,
"logits/chosen": -1.621664047241211,
"logits/rejected": -1.4453307390213013,
"logps/chosen": -1106.493896484375,
"logps/rejected": -1756.8822021484375,
"loss": 0.0222,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.116470336914062,
"rewards/margins": 7.085239410400391,
"rewards/margins_max": 9.558416366577148,
"rewards/margins_min": 4.0455121994018555,
"rewards/margins_std": 2.4827523231506348,
"rewards/rejected": -15.20171070098877,
"step": 1040
},
{
"epoch": 2.96,
"grad_norm": 1.8056138868772267,
"learning_rate": 3.0239435998430376e-09,
"logits/chosen": -1.7272727489471436,
"logits/rejected": -1.5463558435440063,
"logps/chosen": -1105.938720703125,
"logps/rejected": -1762.3658447265625,
"loss": 0.0288,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -8.043752670288086,
"rewards/margins": 7.3192243576049805,
"rewards/margins_max": 9.873791694641113,
"rewards/margins_min": 4.222177028656006,
"rewards/margins_std": 2.4955527782440186,
"rewards/rejected": -15.36297607421875,
"step": 1050
},
{
"epoch": 2.99,
"grad_norm": 0.37053192172842564,
"learning_rate": 3.3605396115826695e-10,
"logits/chosen": -1.6333061456680298,
"logits/rejected": -1.5385651588439941,
"logps/chosen": -1083.2177734375,
"logps/rejected": -1845.8958740234375,
"loss": 0.004,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.038263320922852,
"rewards/margins": 7.885945796966553,
"rewards/margins_max": 9.852472305297852,
"rewards/margins_min": 5.348562717437744,
"rewards/margins_std": 1.9733645915985107,
"rewards/rejected": -15.924209594726562,
"step": 1060
},
{
"epoch": 3.0,
"step": 1065,
"total_flos": 0.0,
"train_loss": 0.14573693349257882,
"train_runtime": 13238.8899,
"train_samples_per_second": 1.287,
"train_steps_per_second": 0.08
}
],
"logging_steps": 10,
"max_steps": 1065,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}