zephyr-7b-dpo-oursuf6k-qlora-5e-6 / trainer_state.json
just1nseo's picture
Model save
84deb4c verified
raw
history blame contribute delete
No virus
51 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 684,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 2.024759928283202,
"learning_rate": 7.246376811594204e-08,
"logits/chosen": -2.961127519607544,
"logits/rejected": -2.9461119174957275,
"logps/chosen": -261.90582275390625,
"logps/rejected": -270.03265380859375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/margins_max": 0.0,
"rewards/margins_min": 0.0,
"rewards/margins_std": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 1.8980625865574121,
"learning_rate": 7.246376811594204e-07,
"logits/chosen": -2.873429775238037,
"logits/rejected": -2.8538858890533447,
"logps/chosen": -217.4855194091797,
"logps/rejected": -222.1319580078125,
"loss": 0.6928,
"rewards/accuracies": 0.4444444477558136,
"rewards/chosen": 0.000618334801401943,
"rewards/margins": 0.0005746870301663876,
"rewards/margins_max": 0.0019774516113102436,
"rewards/margins_min": -0.0008280774345621467,
"rewards/margins_std": 0.001983808586373925,
"rewards/rejected": 4.364784399513155e-05,
"step": 10
},
{
"epoch": 0.03,
"grad_norm": 2.110946747867592,
"learning_rate": 1.4492753623188408e-06,
"logits/chosen": -2.856001377105713,
"logits/rejected": -2.873141050338745,
"logps/chosen": -228.9456787109375,
"logps/rejected": -176.6509246826172,
"loss": 0.6924,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.000968199223279953,
"rewards/margins": 0.0013832334661856294,
"rewards/margins_max": 0.002613522345200181,
"rewards/margins_min": 0.0001529444707557559,
"rewards/margins_std": 0.0017398912459611893,
"rewards/rejected": -0.00041503418469801545,
"step": 20
},
{
"epoch": 0.04,
"grad_norm": 2.0573110330048388,
"learning_rate": 2.173913043478261e-06,
"logits/chosen": -2.929853677749634,
"logits/rejected": -2.875521183013916,
"logps/chosen": -260.0462951660156,
"logps/rejected": -239.0731658935547,
"loss": 0.6904,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.004958462901413441,
"rewards/margins": 0.005339525174349546,
"rewards/margins_max": 0.011697771959006786,
"rewards/margins_min": -0.001018722541630268,
"rewards/margins_std": 0.00899192038923502,
"rewards/rejected": -0.00038106151623651385,
"step": 30
},
{
"epoch": 0.06,
"grad_norm": 1.9291759229840313,
"learning_rate": 2.8985507246376816e-06,
"logits/chosen": -2.8282618522644043,
"logits/rejected": -2.7805304527282715,
"logps/chosen": -326.31719970703125,
"logps/rejected": -365.41064453125,
"loss": 0.6861,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.01588786579668522,
"rewards/margins": 0.01591557264328003,
"rewards/margins_max": 0.029321899637579918,
"rewards/margins_min": 0.0025092470459640026,
"rewards/margins_std": 0.0189594067633152,
"rewards/rejected": -2.7706240871339105e-05,
"step": 40
},
{
"epoch": 0.07,
"grad_norm": 1.679205040063332,
"learning_rate": 3.6231884057971017e-06,
"logits/chosen": -2.890886068344116,
"logits/rejected": -2.8183233737945557,
"logps/chosen": -249.2432098388672,
"logps/rejected": -246.5466766357422,
"loss": 0.6811,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.013654066249728203,
"rewards/margins": 0.013491788879036903,
"rewards/margins_max": 0.030749738216400146,
"rewards/margins_min": -0.003766159061342478,
"rewards/margins_std": 0.024406425654888153,
"rewards/rejected": 0.00016227728337980807,
"step": 50
},
{
"epoch": 0.09,
"grad_norm": 1.7034528224314027,
"learning_rate": 4.347826086956522e-06,
"logits/chosen": -3.011662006378174,
"logits/rejected": -2.9398560523986816,
"logps/chosen": -304.8922119140625,
"logps/rejected": -246.6385498046875,
"loss": 0.6705,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.03926776722073555,
"rewards/margins": 0.04536719247698784,
"rewards/margins_max": 0.0852198451757431,
"rewards/margins_min": 0.005514549091458321,
"rewards/margins_std": 0.056360144168138504,
"rewards/rejected": -0.006099428050220013,
"step": 60
},
{
"epoch": 0.1,
"grad_norm": 1.9031177489294464,
"learning_rate": 4.999967381905813e-06,
"logits/chosen": -3.010206460952759,
"logits/rejected": -2.93892240524292,
"logps/chosen": -268.6554870605469,
"logps/rejected": -208.88131713867188,
"loss": 0.6535,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.034810397773981094,
"rewards/margins": 0.07472650706768036,
"rewards/margins_max": 0.11295346170663834,
"rewards/margins_min": 0.03649955615401268,
"rewards/margins_std": 0.054061077535152435,
"rewards/rejected": -0.039916109293699265,
"step": 70
},
{
"epoch": 0.12,
"grad_norm": 2.0564287009461806,
"learning_rate": 4.9960542403925095e-06,
"logits/chosen": -2.7611324787139893,
"logits/rejected": -2.683593273162842,
"logps/chosen": -256.10906982421875,
"logps/rejected": -241.2465057373047,
"loss": 0.6403,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.01417328417301178,
"rewards/margins": 0.08097021281719208,
"rewards/margins_max": 0.19939911365509033,
"rewards/margins_min": -0.03745868057012558,
"rewards/margins_std": 0.16748374700546265,
"rewards/rejected": -0.0667969286441803,
"step": 80
},
{
"epoch": 0.13,
"grad_norm": 1.9620079371246095,
"learning_rate": 4.98562917836165e-06,
"logits/chosen": -2.7862794399261475,
"logits/rejected": -2.755009651184082,
"logps/chosen": -260.46527099609375,
"logps/rejected": -209.8500213623047,
"loss": 0.6216,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.02841970883309841,
"rewards/margins": 0.1533060073852539,
"rewards/margins_max": 0.2169797718524933,
"rewards/margins_min": 0.08963226526975632,
"rewards/margins_std": 0.09004827588796616,
"rewards/rejected": -0.12488631159067154,
"step": 90
},
{
"epoch": 0.15,
"grad_norm": 2.682732545242212,
"learning_rate": 4.968719393609757e-06,
"logits/chosen": -2.8458776473999023,
"logits/rejected": -2.793788194656372,
"logps/chosen": -373.66241455078125,
"logps/rejected": -257.68841552734375,
"loss": 0.5821,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.04686256870627403,
"rewards/margins": 0.3232998847961426,
"rewards/margins_max": 0.5046892166137695,
"rewards/margins_min": 0.14191053807735443,
"rewards/margins_std": 0.2565232217311859,
"rewards/rejected": -0.27643734216690063,
"step": 100
},
{
"epoch": 0.15,
"eval_logits/chosen": -2.7354917526245117,
"eval_logits/rejected": -2.6965763568878174,
"eval_logps/chosen": -303.8941650390625,
"eval_logps/rejected": -286.7320861816406,
"eval_loss": 0.6622087359428406,
"eval_rewards/accuracies": 0.6150793433189392,
"eval_rewards/chosen": -0.186729297041893,
"eval_rewards/margins": 0.08876504004001617,
"eval_rewards/margins_max": 0.39532026648521423,
"eval_rewards/margins_min": -0.1795283555984497,
"eval_rewards/margins_std": 0.25878801941871643,
"eval_rewards/rejected": -0.2754943370819092,
"eval_runtime": 284.1012,
"eval_samples_per_second": 7.04,
"eval_steps_per_second": 0.222,
"step": 100
},
{
"epoch": 0.16,
"grad_norm": 3.178965793594782,
"learning_rate": 4.9453690018345144e-06,
"logits/chosen": -2.7822461128234863,
"logits/rejected": -2.7551050186157227,
"logps/chosen": -346.56549072265625,
"logps/rejected": -336.0809020996094,
"loss": 0.5771,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.02590801753103733,
"rewards/margins": 0.3409319818019867,
"rewards/margins_max": 0.4643324017524719,
"rewards/margins_min": 0.21753165125846863,
"rewards/margins_std": 0.17451441287994385,
"rewards/rejected": -0.3668400049209595,
"step": 110
},
{
"epoch": 0.18,
"grad_norm": 3.521510373240137,
"learning_rate": 4.915638921541952e-06,
"logits/chosen": -2.7236685752868652,
"logits/rejected": -2.7280526161193848,
"logps/chosen": -310.40826416015625,
"logps/rejected": -319.103515625,
"loss": 0.5407,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.22706270217895508,
"rewards/margins": 0.33027681708335876,
"rewards/margins_max": 0.5542212724685669,
"rewards/margins_min": 0.10633233934640884,
"rewards/margins_std": 0.3167053163051605,
"rewards/rejected": -0.5573395490646362,
"step": 120
},
{
"epoch": 0.19,
"grad_norm": 2.110244119813805,
"learning_rate": 4.879606715117019e-06,
"logits/chosen": -2.8192946910858154,
"logits/rejected": -2.731480836868286,
"logps/chosen": -343.879150390625,
"logps/rejected": -308.55780029296875,
"loss": 0.5251,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.37679314613342285,
"rewards/margins": 0.30480116605758667,
"rewards/margins_max": 0.5037060379981995,
"rewards/margins_min": 0.10589637607336044,
"rewards/margins_std": 0.2812938690185547,
"rewards/rejected": -0.6815943121910095,
"step": 130
},
{
"epoch": 0.2,
"grad_norm": 5.313237756103227,
"learning_rate": 4.837366386472175e-06,
"logits/chosen": -2.8442323207855225,
"logits/rejected": -2.757608413696289,
"logps/chosen": -338.12030029296875,
"logps/rejected": -321.89703369140625,
"loss": 0.5564,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.4649910032749176,
"rewards/margins": 0.3815266191959381,
"rewards/margins_max": 0.6749385595321655,
"rewards/margins_min": 0.08811453729867935,
"rewards/margins_std": 0.4149473309516907,
"rewards/rejected": -0.8465176820755005,
"step": 140
},
{
"epoch": 0.22,
"grad_norm": 3.739167908499058,
"learning_rate": 4.789028135801919e-06,
"logits/chosen": -2.8220436573028564,
"logits/rejected": -2.8031575679779053,
"logps/chosen": -304.0033874511719,
"logps/rejected": -348.67108154296875,
"loss": 0.5705,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.05871645733714104,
"rewards/margins": 0.4513682425022125,
"rewards/margins_max": 0.7341547012329102,
"rewards/margins_min": 0.16858164966106415,
"rewards/margins_std": 0.39992058277130127,
"rewards/rejected": -0.5100846290588379,
"step": 150
},
{
"epoch": 0.23,
"grad_norm": 4.005524634813479,
"learning_rate": 4.7347180720830635e-06,
"logits/chosen": -2.8278419971466064,
"logits/rejected": -2.6905112266540527,
"logps/chosen": -371.9062805175781,
"logps/rejected": -375.65728759765625,
"loss": 0.5226,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.3725179135799408,
"rewards/margins": 0.5365481972694397,
"rewards/margins_max": 0.8424277305603027,
"rewards/margins_min": 0.2306685745716095,
"rewards/margins_std": 0.4325791001319885,
"rewards/rejected": -0.9090660810470581,
"step": 160
},
{
"epoch": 0.25,
"grad_norm": 8.89880059550579,
"learning_rate": 4.674577884070811e-06,
"logits/chosen": -2.7482759952545166,
"logits/rejected": -2.7097363471984863,
"logps/chosen": -368.9304504394531,
"logps/rejected": -368.5115661621094,
"loss": 0.4421,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.42384210228919983,
"rewards/margins": 0.7086302638053894,
"rewards/margins_max": 1.0815365314483643,
"rewards/margins_min": 0.33572402596473694,
"rewards/margins_std": 0.5273691415786743,
"rewards/rejected": -1.1324723958969116,
"step": 170
},
{
"epoch": 0.26,
"grad_norm": 12.030137047752005,
"learning_rate": 4.608764470648971e-06,
"logits/chosen": -2.782106399536133,
"logits/rejected": -2.7207627296447754,
"logps/chosen": -344.0601501464844,
"logps/rejected": -446.43328857421875,
"loss": 0.4886,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.3604986369609833,
"rewards/margins": 0.7753941416740417,
"rewards/margins_max": 1.1650350093841553,
"rewards/margins_min": 0.38575348258018494,
"rewards/margins_std": 0.5510352253913879,
"rewards/rejected": -1.1358928680419922,
"step": 180
},
{
"epoch": 0.28,
"grad_norm": 4.3464674099690255,
"learning_rate": 4.5374495314986874e-06,
"logits/chosen": -2.5000321865081787,
"logits/rejected": -2.5499188899993896,
"logps/chosen": -381.9058532714844,
"logps/rejected": -390.3780212402344,
"loss": 0.4735,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.4734285771846771,
"rewards/margins": 1.050258994102478,
"rewards/margins_max": 1.5981611013412476,
"rewards/margins_min": 0.5023568868637085,
"rewards/margins_std": 0.7748504877090454,
"rewards/rejected": -1.523687481880188,
"step": 190
},
{
"epoch": 0.29,
"grad_norm": 9.11783684491386,
"learning_rate": 4.460819119153574e-06,
"logits/chosen": -2.5870699882507324,
"logits/rejected": -2.5736021995544434,
"logps/chosen": -338.88360595703125,
"logps/rejected": -435.57855224609375,
"loss": 0.481,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.7319141030311584,
"rewards/margins": 0.7730057835578918,
"rewards/margins_max": 1.262407660484314,
"rewards/margins_min": 0.2836039066314697,
"rewards/margins_std": 0.6921188235282898,
"rewards/rejected": -1.5049200057983398,
"step": 200
},
{
"epoch": 0.29,
"eval_logits/chosen": -2.5660696029663086,
"eval_logits/rejected": -2.540174722671509,
"eval_logps/chosen": -410.9715881347656,
"eval_logps/rejected": -423.9108581542969,
"eval_loss": 0.6257268190383911,
"eval_rewards/accuracies": 0.6746031641960144,
"eval_rewards/chosen": -1.2575041055679321,
"eval_rewards/margins": 0.389777809381485,
"eval_rewards/margins_max": 1.3411911725997925,
"eval_rewards/margins_min": -0.5259115695953369,
"eval_rewards/margins_std": 0.8282801508903503,
"eval_rewards/rejected": -1.6472818851470947,
"eval_runtime": 283.175,
"eval_samples_per_second": 7.063,
"eval_steps_per_second": 0.222,
"step": 200
},
{
"epoch": 0.31,
"grad_norm": 8.587152119215933,
"learning_rate": 4.379073153609896e-06,
"logits/chosen": -2.6241440773010254,
"logits/rejected": -2.5915939807891846,
"logps/chosen": -408.4817199707031,
"logps/rejected": -439.4585876464844,
"loss": 0.4772,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.8052468299865723,
"rewards/margins": 0.7498449087142944,
"rewards/margins_max": 1.3446953296661377,
"rewards/margins_min": 0.15499453246593475,
"rewards/margins_std": 0.8412453532218933,
"rewards/rejected": -1.5550918579101562,
"step": 210
},
{
"epoch": 0.32,
"grad_norm": 12.858094013695773,
"learning_rate": 4.292424900758129e-06,
"logits/chosen": -2.3552744388580322,
"logits/rejected": -2.266150712966919,
"logps/chosen": -326.7586669921875,
"logps/rejected": -419.92950439453125,
"loss": 0.4407,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.6805569529533386,
"rewards/margins": 0.8906445503234863,
"rewards/margins_max": 1.5019675493240356,
"rewards/margins_min": 0.27932122349739075,
"rewards/margins_std": 0.8645416498184204,
"rewards/rejected": -1.5712013244628906,
"step": 220
},
{
"epoch": 0.34,
"grad_norm": 10.547994008924384,
"learning_rate": 4.201100415996598e-06,
"logits/chosen": -2.2731940746307373,
"logits/rejected": -2.208517551422119,
"logps/chosen": -396.6240234375,
"logps/rejected": -439.95367431640625,
"loss": 0.4373,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.4822484254837036,
"rewards/margins": 0.3612428605556488,
"rewards/margins_max": 0.8377896547317505,
"rewards/margins_min": -0.11530391871929169,
"rewards/margins_std": 0.6739388704299927,
"rewards/rejected": -1.8434913158416748,
"step": 230
},
{
"epoch": 0.35,
"grad_norm": 7.682347998116211,
"learning_rate": 4.105337954478756e-06,
"logits/chosen": -2.2987964153289795,
"logits/rejected": -2.232954263687134,
"logps/chosen": -488.194580078125,
"logps/rejected": -482.121826171875,
"loss": 0.497,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9666223526000977,
"rewards/margins": 1.5014089345932007,
"rewards/margins_max": 2.3409581184387207,
"rewards/margins_min": 0.6618598103523254,
"rewards/margins_std": 1.1873016357421875,
"rewards/rejected": -2.468031167984009,
"step": 240
},
{
"epoch": 0.37,
"grad_norm": 6.522613509898599,
"learning_rate": 4.005387349532697e-06,
"logits/chosen": -2.3422703742980957,
"logits/rejected": -2.3057916164398193,
"logps/chosen": -449.12469482421875,
"logps/rejected": -528.1080322265625,
"loss": 0.3759,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.4108879566192627,
"rewards/margins": 1.1858874559402466,
"rewards/margins_max": 2.1756746768951416,
"rewards/margins_min": 0.19610002636909485,
"rewards/margins_std": 1.3997704982757568,
"rewards/rejected": -2.596775531768799,
"step": 250
},
{
"epoch": 0.38,
"grad_norm": 9.556919040050822,
"learning_rate": 3.901509360874515e-06,
"logits/chosen": -2.1438629627227783,
"logits/rejected": -2.12241792678833,
"logps/chosen": -336.4192199707031,
"logps/rejected": -396.7298889160156,
"loss": 0.3976,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.233567237854004,
"rewards/margins": 0.7809652090072632,
"rewards/margins_max": 1.5155531167984009,
"rewards/margins_min": 0.04637749865651131,
"rewards/margins_std": 1.0388638973236084,
"rewards/rejected": -2.0145325660705566,
"step": 260
},
{
"epoch": 0.39,
"grad_norm": 8.191966927640436,
"learning_rate": 3.793974994315991e-06,
"logits/chosen": -1.8757396936416626,
"logits/rejected": -1.9094167947769165,
"logps/chosen": -297.0498352050781,
"logps/rejected": -410.0951232910156,
"loss": 0.4022,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.1613843441009521,
"rewards/margins": 1.04617440700531,
"rewards/margins_max": 1.7157318592071533,
"rewards/margins_min": 0.3766169548034668,
"rewards/margins_std": 0.9468971490859985,
"rewards/rejected": -2.2075586318969727,
"step": 270
},
{
"epoch": 0.41,
"grad_norm": 9.284581469509805,
"learning_rate": 3.68306479474137e-06,
"logits/chosen": -2.3123269081115723,
"logits/rejected": -2.223254680633545,
"logps/chosen": -509.96392822265625,
"logps/rejected": -465.5381774902344,
"loss": 0.3547,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3590073585510254,
"rewards/margins": 1.2631444931030273,
"rewards/margins_max": 1.895696997642517,
"rewards/margins_min": 0.6305915713310242,
"rewards/margins_std": 0.8945645093917847,
"rewards/rejected": -2.6221518516540527,
"step": 280
},
{
"epoch": 0.42,
"grad_norm": 23.380435739964835,
"learning_rate": 3.569068114197784e-06,
"logits/chosen": -2.013559103012085,
"logits/rejected": -1.945844054222107,
"logps/chosen": -327.3985290527344,
"logps/rejected": -448.2659606933594,
"loss": 0.3626,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.0471247434616089,
"rewards/margins": 1.6471458673477173,
"rewards/margins_max": 2.4206135272979736,
"rewards/margins_min": 0.8736783266067505,
"rewards/margins_std": 1.0938485860824585,
"rewards/rejected": -2.694270610809326,
"step": 290
},
{
"epoch": 0.44,
"grad_norm": 14.305787223907084,
"learning_rate": 3.4522823570088073e-06,
"logits/chosen": -1.9044002294540405,
"logits/rejected": -1.8846553564071655,
"logps/chosen": -411.216552734375,
"logps/rejected": -525.3153686523438,
"loss": 0.4017,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.564225196838379,
"rewards/margins": 1.5319175720214844,
"rewards/margins_max": 2.567899703979492,
"rewards/margins_min": 0.49593567848205566,
"rewards/margins_std": 1.4650996923446655,
"rewards/rejected": -3.0961427688598633,
"step": 300
},
{
"epoch": 0.44,
"eval_logits/chosen": -2.0223634243011475,
"eval_logits/rejected": -1.9880024194717407,
"eval_logps/chosen": -462.021728515625,
"eval_logps/rejected": -509.34765625,
"eval_loss": 0.6111792922019958,
"eval_rewards/accuracies": 0.6944444179534912,
"eval_rewards/chosen": -1.7680050134658813,
"eval_rewards/margins": 0.7336447238922119,
"eval_rewards/margins_max": 2.3329057693481445,
"eval_rewards/margins_min": -0.8123146891593933,
"eval_rewards/margins_std": 1.4011034965515137,
"eval_rewards/rejected": -2.501649856567383,
"eval_runtime": 283.1994,
"eval_samples_per_second": 7.062,
"eval_steps_per_second": 0.222,
"step": 300
},
{
"epoch": 0.45,
"grad_norm": 13.373839711281382,
"learning_rate": 3.333012203880528e-06,
"logits/chosen": -2.034450054168701,
"logits/rejected": -1.9856176376342773,
"logps/chosen": -361.3979187011719,
"logps/rejected": -426.4706115722656,
"loss": 0.379,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.1962629556655884,
"rewards/margins": 1.4263014793395996,
"rewards/margins_max": 2.3072102069854736,
"rewards/margins_min": 0.5453929901123047,
"rewards/margins_std": 1.2457928657531738,
"rewards/rejected": -2.6225647926330566,
"step": 310
},
{
"epoch": 0.47,
"grad_norm": 12.791871374689537,
"learning_rate": 3.2115688170243735e-06,
"logits/chosen": -2.104572057723999,
"logits/rejected": -2.1095337867736816,
"logps/chosen": -430.00299072265625,
"logps/rejected": -597.9464721679688,
"loss": 0.357,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.14143705368042,
"rewards/margins": 1.844452142715454,
"rewards/margins_max": 2.6857852935791016,
"rewards/margins_min": 1.0031189918518066,
"rewards/margins_std": 1.1898245811462402,
"rewards/rejected": -2.985888957977295,
"step": 320
},
{
"epoch": 0.48,
"grad_norm": 7.114464717414236,
"learning_rate": 3.0882690283704355e-06,
"logits/chosen": -1.9746555089950562,
"logits/rejected": -1.915001630783081,
"logps/chosen": -350.51678466796875,
"logps/rejected": -462.4185485839844,
"loss": 0.3842,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.0409258604049683,
"rewards/margins": 1.465539813041687,
"rewards/margins_max": 2.2879223823547363,
"rewards/margins_min": 0.6431571841239929,
"rewards/margins_std": 1.1630247831344604,
"rewards/rejected": -2.506465435028076,
"step": 330
},
{
"epoch": 0.5,
"grad_norm": 11.717977839144085,
"learning_rate": 2.9634345129891296e-06,
"logits/chosen": -1.998160719871521,
"logits/rejected": -1.8835424184799194,
"logps/chosen": -419.52362060546875,
"logps/rejected": -527.9315185546875,
"loss": 0.3414,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.097505807876587,
"rewards/margins": 1.5337189435958862,
"rewards/margins_max": 2.7524514198303223,
"rewards/margins_min": 0.31498652696609497,
"rewards/margins_std": 1.723548173904419,
"rewards/rejected": -2.631225109100342,
"step": 340
},
{
"epoch": 0.51,
"grad_norm": 14.724171411488078,
"learning_rate": 2.8373909498776746e-06,
"logits/chosen": -2.16463565826416,
"logits/rejected": -2.167154550552368,
"logps/chosen": -366.3368225097656,
"logps/rejected": -522.6026611328125,
"loss": 0.4421,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.8319808840751648,
"rewards/margins": 1.7606074810028076,
"rewards/margins_max": 2.4449410438537598,
"rewards/margins_min": 1.076274037361145,
"rewards/margins_std": 0.9677937626838684,
"rewards/rejected": -2.5925886631011963,
"step": 350
},
{
"epoch": 0.53,
"grad_norm": 9.998195485379709,
"learning_rate": 2.710467172300768e-06,
"logits/chosen": -2.146489143371582,
"logits/rejected": -2.0911166667938232,
"logps/chosen": -431.58660888671875,
"logps/rejected": -554.4029541015625,
"loss": 0.3448,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.7858428955078125,
"rewards/margins": 1.576249361038208,
"rewards/margins_max": 2.419480562210083,
"rewards/margins_min": 0.7330182194709778,
"rewards/margins_std": 1.1925089359283447,
"rewards/rejected": -2.3620922565460205,
"step": 360
},
{
"epoch": 0.54,
"grad_norm": 16.428444808566788,
"learning_rate": 2.582994309902146e-06,
"logits/chosen": -2.021066665649414,
"logits/rejected": -1.8753414154052734,
"logps/chosen": -438.49176025390625,
"logps/rejected": -512.5082397460938,
"loss": 0.4111,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.347548007965088,
"rewards/margins": 1.3524014949798584,
"rewards/margins_max": 2.1463735103607178,
"rewards/margins_min": 0.5584291815757751,
"rewards/margins_std": 1.122846245765686,
"rewards/rejected": -2.699949264526367,
"step": 370
},
{
"epoch": 0.56,
"grad_norm": 9.34369488447896,
"learning_rate": 2.4553049248251512e-06,
"logits/chosen": -1.9435670375823975,
"logits/rejected": -1.9931520223617554,
"logps/chosen": -381.89532470703125,
"logps/rejected": -504.81341552734375,
"loss": 0.3435,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.481609582901001,
"rewards/margins": 1.2445435523986816,
"rewards/margins_max": 1.940123200416565,
"rewards/margins_min": 0.5489639043807983,
"rewards/margins_std": 0.9836981892585754,
"rewards/rejected": -2.7261533737182617,
"step": 380
},
{
"epoch": 0.57,
"grad_norm": 12.062558188816904,
"learning_rate": 2.3277321440960733e-06,
"logits/chosen": -2.1283860206604004,
"logits/rejected": -2.116414785385132,
"logps/chosen": -395.5448913574219,
"logps/rejected": -539.8890380859375,
"loss": 0.3457,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.2231833934783936,
"rewards/margins": 1.5952281951904297,
"rewards/margins_max": 2.423370838165283,
"rewards/margins_min": 0.7670857906341553,
"rewards/margins_std": 1.1711702346801758,
"rewards/rejected": -2.8184115886688232,
"step": 390
},
{
"epoch": 0.58,
"grad_norm": 10.060650957709997,
"learning_rate": 2.20060879053377e-06,
"logits/chosen": -1.8268073797225952,
"logits/rejected": -1.7773048877716064,
"logps/chosen": -360.2474670410156,
"logps/rejected": -529.291259765625,
"loss": 0.3427,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.5878477096557617,
"rewards/margins": 1.5394123792648315,
"rewards/margins_max": 2.434696674346924,
"rewards/margins_min": 0.6441282033920288,
"rewards/margins_std": 1.2661231756210327,
"rewards/rejected": -3.127260208129883,
"step": 400
},
{
"epoch": 0.58,
"eval_logits/chosen": -1.986289143562317,
"eval_logits/rejected": -1.9447433948516846,
"eval_logps/chosen": -476.6219177246094,
"eval_logps/rejected": -527.7764892578125,
"eval_loss": 0.5955031514167786,
"eval_rewards/accuracies": 0.7023809552192688,
"eval_rewards/chosen": -1.9140070676803589,
"eval_rewards/margins": 0.7719313502311707,
"eval_rewards/margins_max": 2.272120952606201,
"eval_rewards/margins_min": -0.7218120098114014,
"eval_rewards/margins_std": 1.3400975465774536,
"eval_rewards/rejected": -2.685938596725464,
"eval_runtime": 283.4614,
"eval_samples_per_second": 7.056,
"eval_steps_per_second": 0.222,
"step": 400
},
{
"epoch": 0.6,
"grad_norm": 11.99849119237726,
"learning_rate": 2.0742665144529374e-06,
"logits/chosen": -1.9736402034759521,
"logits/rejected": -1.901617407798767,
"logps/chosen": -446.02227783203125,
"logps/rejected": -546.3521728515625,
"loss": 0.3666,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.0405278205871582,
"rewards/margins": 1.8972772359848022,
"rewards/margins_max": 2.6515231132507324,
"rewards/margins_min": 1.1430312395095825,
"rewards/margins_std": 1.066664695739746,
"rewards/rejected": -2.937804937362671,
"step": 410
},
{
"epoch": 0.61,
"grad_norm": 6.458783566768089,
"learning_rate": 1.9490349284263036e-06,
"logits/chosen": -1.8606504201889038,
"logits/rejected": -1.7730525732040405,
"logps/chosen": -470.9712829589844,
"logps/rejected": -612.8939208984375,
"loss": 0.3331,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.6054725646972656,
"rewards/margins": 2.13732647895813,
"rewards/margins_max": 2.9402005672454834,
"rewards/margins_min": 1.334452509880066,
"rewards/margins_std": 1.1354353427886963,
"rewards/rejected": -3.7427992820739746,
"step": 420
},
{
"epoch": 0.63,
"grad_norm": 9.837849328941022,
"learning_rate": 1.8252407473630606e-06,
"logits/chosen": -1.983541488647461,
"logits/rejected": -1.9613971710205078,
"logps/chosen": -458.0785217285156,
"logps/rejected": -566.628662109375,
"loss": 0.3559,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.701336145401001,
"rewards/margins": 1.274548053741455,
"rewards/margins_max": 1.9377739429473877,
"rewards/margins_min": 0.6113225221633911,
"rewards/margins_std": 0.9379426836967468,
"rewards/rejected": -2.975884437561035,
"step": 430
},
{
"epoch": 0.64,
"grad_norm": 10.432841283470852,
"learning_rate": 1.7032069361469765e-06,
"logits/chosen": -1.858236312866211,
"logits/rejected": -1.8411200046539307,
"logps/chosen": -359.7177429199219,
"logps/rejected": -587.9736938476562,
"loss": 0.3632,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2637131214141846,
"rewards/margins": 1.9256131649017334,
"rewards/margins_max": 2.9136805534362793,
"rewards/margins_min": 0.9375454783439636,
"rewards/margins_std": 1.397338628768921,
"rewards/rejected": -3.189326047897339,
"step": 440
},
{
"epoch": 0.66,
"grad_norm": 18.395508823434238,
"learning_rate": 1.5832518670578802e-06,
"logits/chosen": -2.010892391204834,
"logits/rejected": -1.9377777576446533,
"logps/chosen": -429.5704040527344,
"logps/rejected": -617.7625732421875,
"loss": 0.4001,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.3690681457519531,
"rewards/margins": 1.9702503681182861,
"rewards/margins_max": 3.1194424629211426,
"rewards/margins_min": 0.8210585713386536,
"rewards/margins_std": 1.6252025365829468,
"rewards/rejected": -3.3393185138702393,
"step": 450
},
{
"epoch": 0.67,
"grad_norm": 11.402368506953254,
"learning_rate": 1.4656884891747398e-06,
"logits/chosen": -1.8819122314453125,
"logits/rejected": -1.9186322689056396,
"logps/chosen": -424.16986083984375,
"logps/rejected": -586.4650268554688,
"loss": 0.3423,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.3872971534729004,
"rewards/margins": 1.951498031616211,
"rewards/margins_max": 3.129544496536255,
"rewards/margins_min": 0.7734516263008118,
"rewards/margins_std": 1.6660093069076538,
"rewards/rejected": -3.3387951850891113,
"step": 460
},
{
"epoch": 0.69,
"grad_norm": 10.853275743311396,
"learning_rate": 1.3508235119272466e-06,
"logits/chosen": -1.8404920101165771,
"logits/rejected": -1.80814528465271,
"logps/chosen": -476.41473388671875,
"logps/rejected": -532.2838745117188,
"loss": 0.3542,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.109175205230713,
"rewards/margins": 0.6896736025810242,
"rewards/margins_max": 1.9764328002929688,
"rewards/margins_min": -0.5970857739448547,
"rewards/margins_std": 1.8197526931762695,
"rewards/rejected": -2.7988486289978027,
"step": 470
},
{
"epoch": 0.7,
"grad_norm": 13.491064434657412,
"learning_rate": 1.238956604925934e-06,
"logits/chosen": -1.7815355062484741,
"logits/rejected": -1.760663628578186,
"logps/chosen": -379.5592346191406,
"logps/rejected": -617.5568237304688,
"loss": 0.377,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.146844506263733,
"rewards/margins": 2.3650784492492676,
"rewards/margins_max": 3.120236396789551,
"rewards/margins_min": 1.6099202632904053,
"rewards/margins_std": 1.0679547786712646,
"rewards/rejected": -3.5119223594665527,
"step": 480
},
{
"epoch": 0.72,
"grad_norm": 5.424010985619327,
"learning_rate": 1.1303796161583763e-06,
"logits/chosen": -2.0357632637023926,
"logits/rejected": -2.0330684185028076,
"logps/chosen": -418.2298278808594,
"logps/rejected": -598.3395385742188,
"loss": 0.3425,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.2266708612442017,
"rewards/margins": 1.6023613214492798,
"rewards/margins_max": 2.5102345943450928,
"rewards/margins_min": 0.6944878697395325,
"rewards/margins_std": 1.2839267253875732,
"rewards/rejected": -2.8290319442749023,
"step": 490
},
{
"epoch": 0.73,
"grad_norm": 8.453582130806156,
"learning_rate": 1.0253758105911169e-06,
"logits/chosen": -2.0878758430480957,
"logits/rejected": -1.991579294204712,
"logps/chosen": -448.9693298339844,
"logps/rejected": -671.1817626953125,
"loss": 0.3246,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.2098839282989502,
"rewards/margins": 2.05537486076355,
"rewards/margins_max": 2.98439359664917,
"rewards/margins_min": 1.1263563632965088,
"rewards/margins_std": 1.313830852508545,
"rewards/rejected": -3.2652587890625,
"step": 500
},
{
"epoch": 0.73,
"eval_logits/chosen": -1.886383295059204,
"eval_logits/rejected": -1.84441077709198,
"eval_logps/chosen": -513.3748168945312,
"eval_logps/rejected": -561.1234130859375,
"eval_loss": 0.6025983691215515,
"eval_rewards/accuracies": 0.6626983880996704,
"eval_rewards/chosen": -2.281536102294922,
"eval_rewards/margins": 0.7378710508346558,
"eval_rewards/margins_max": 2.2879276275634766,
"eval_rewards/margins_min": -0.7821336388587952,
"eval_rewards/margins_std": 1.3716031312942505,
"eval_rewards/rejected": -3.019407033920288,
"eval_runtime": 283.5562,
"eval_samples_per_second": 7.053,
"eval_steps_per_second": 0.222,
"step": 500
},
{
"epoch": 0.75,
"grad_norm": 17.08831198492527,
"learning_rate": 9.24219131163705e-07,
"logits/chosen": -1.7034380435943604,
"logits/rejected": -1.677706003189087,
"logps/chosen": -501.74334716796875,
"logps/rejected": -674.4254150390625,
"loss": 0.3481,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.918776273727417,
"rewards/margins": 1.897600531578064,
"rewards/margins_max": 3.511080265045166,
"rewards/margins_min": 0.284121036529541,
"rewards/margins_std": 2.2818045616149902,
"rewards/rejected": -3.8163769245147705,
"step": 510
},
{
"epoch": 0.76,
"grad_norm": 3.7704796107061735,
"learning_rate": 8.271734841028553e-07,
"logits/chosen": -1.5852091312408447,
"logits/rejected": -1.471635103225708,
"logps/chosen": -462.9111328125,
"logps/rejected": -592.2215576171875,
"loss": 0.3213,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.484692096710205,
"rewards/margins": 2.1865782737731934,
"rewards/margins_max": 2.6932501792907715,
"rewards/margins_min": 1.6799061298370361,
"rewards/margins_std": 0.7165425419807434,
"rewards/rejected": -3.6712703704833984,
"step": 520
},
{
"epoch": 0.77,
"grad_norm": 5.607658272294071,
"learning_rate": 7.344920504212244e-07,
"logits/chosen": -1.838727593421936,
"logits/rejected": -1.8044246435165405,
"logps/chosen": -373.3072204589844,
"logps/rejected": -488.6890563964844,
"loss": 0.36,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.4621307849884033,
"rewards/margins": 1.4751158952713013,
"rewards/margins_max": 2.411809206008911,
"rewards/margins_min": 0.5384225845336914,
"rewards/margins_std": 1.3246843814849854,
"rewards/rejected": -2.937246799468994,
"step": 530
},
{
"epoch": 0.79,
"grad_norm": 10.180442565411226,
"learning_rate": 6.464166253970672e-07,
"logits/chosen": -1.888649344444275,
"logits/rejected": -1.9107511043548584,
"logps/chosen": -483.1812438964844,
"logps/rejected": -572.9668579101562,
"loss": 0.3483,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5701286792755127,
"rewards/margins": 1.3985799551010132,
"rewards/margins_max": 2.0270140171051025,
"rewards/margins_min": 0.7701458930969238,
"rewards/margins_std": 0.8887398838996887,
"rewards/rejected": -2.9687085151672363,
"step": 540
},
{
"epoch": 0.8,
"grad_norm": 7.067320132481652,
"learning_rate": 5.631769877579535e-07,
"logits/chosen": -1.9241430759429932,
"logits/rejected": -1.8503801822662354,
"logps/chosen": -391.8656311035156,
"logps/rejected": -515.9195556640625,
"loss": 0.3546,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.404034972190857,
"rewards/margins": 1.4414308071136475,
"rewards/margins_max": 2.243879556655884,
"rewards/margins_min": 0.6389821171760559,
"rewards/margins_std": 1.1348340511322021,
"rewards/rejected": -2.845465898513794,
"step": 550
},
{
"epoch": 0.82,
"grad_norm": 23.109650997760248,
"learning_rate": 4.849903002143114e-07,
"logits/chosen": -2.134927988052368,
"logits/rejected": -2.0764718055725098,
"logps/chosen": -540.7327880859375,
"logps/rejected": -669.7936401367188,
"loss": 0.3152,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.6844943761825562,
"rewards/margins": 1.754024863243103,
"rewards/margins_max": 2.7484121322631836,
"rewards/margins_min": 0.7596377730369568,
"rewards/margins_std": 1.4062758684158325,
"rewards/rejected": -3.438519239425659,
"step": 560
},
{
"epoch": 0.83,
"grad_norm": 14.65033008607176,
"learning_rate": 4.1206054290670537e-07,
"logits/chosen": -1.8871490955352783,
"logits/rejected": -1.8968530893325806,
"logps/chosen": -413.68682861328125,
"logps/rejected": -668.4096069335938,
"loss": 0.3224,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.614575743675232,
"rewards/margins": 2.1032443046569824,
"rewards/margins_max": 3.0623726844787598,
"rewards/margins_min": 1.144116759300232,
"rewards/margins_std": 1.3564116954803467,
"rewards/rejected": -3.717820405960083,
"step": 570
},
{
"epoch": 0.85,
"grad_norm": 7.593895176885149,
"learning_rate": 3.44577981244944e-07,
"logits/chosen": -1.9477647542953491,
"logits/rejected": -1.9660476446151733,
"logps/chosen": -438.8433532714844,
"logps/rejected": -546.8043212890625,
"loss": 0.4149,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9253628253936768,
"rewards/margins": 1.1086914539337158,
"rewards/margins_max": 2.1370372772216797,
"rewards/margins_min": 0.08034573495388031,
"rewards/margins_std": 1.4543002843856812,
"rewards/rejected": -3.0340542793273926,
"step": 580
},
{
"epoch": 0.86,
"grad_norm": 5.844449740859913,
"learning_rate": 2.827186695273482e-07,
"logits/chosen": -2.1650023460388184,
"logits/rejected": -2.0401604175567627,
"logps/chosen": -522.0162963867188,
"logps/rejected": -637.875732421875,
"loss": 0.3966,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.199137806892395,
"rewards/margins": 1.8260653018951416,
"rewards/margins_max": 3.141328811645508,
"rewards/margins_min": 0.5108016729354858,
"rewards/margins_std": 1.8600635528564453,
"rewards/rejected": -3.025203227996826,
"step": 590
},
{
"epoch": 0.88,
"grad_norm": 8.653551252508137,
"learning_rate": 2.2664399163518786e-07,
"logits/chosen": -1.9303522109985352,
"logits/rejected": -1.865822434425354,
"logps/chosen": -474.86810302734375,
"logps/rejected": -598.7958374023438,
"loss": 0.2747,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.5423686504364014,
"rewards/margins": 2.11772084236145,
"rewards/margins_max": 3.12973690032959,
"rewards/margins_min": 1.1057052612304688,
"rewards/margins_std": 1.431206464767456,
"rewards/rejected": -3.6600890159606934,
"step": 600
},
{
"epoch": 0.88,
"eval_logits/chosen": -1.8380076885223389,
"eval_logits/rejected": -1.7933586835861206,
"eval_logps/chosen": -502.5606994628906,
"eval_logps/rejected": -556.8073120117188,
"eval_loss": 0.5972898602485657,
"eval_rewards/accuracies": 0.6785714030265808,
"eval_rewards/chosen": -2.1733951568603516,
"eval_rewards/margins": 0.8028514385223389,
"eval_rewards/margins_max": 2.427276134490967,
"eval_rewards/margins_min": -0.751455545425415,
"eval_rewards/margins_std": 1.4232866764068604,
"eval_rewards/rejected": -2.9762465953826904,
"eval_runtime": 283.2826,
"eval_samples_per_second": 7.06,
"eval_steps_per_second": 0.222,
"step": 600
},
{
"epoch": 0.89,
"grad_norm": 9.369824417732737,
"learning_rate": 1.7650024000056415e-07,
"logits/chosen": -1.8018262386322021,
"logits/rejected": -1.7738994359970093,
"logps/chosen": -374.61505126953125,
"logps/rejected": -558.9880981445312,
"loss": 0.3286,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.5432480573654175,
"rewards/margins": 1.8530250787734985,
"rewards/margins_max": 2.7572684288024902,
"rewards/margins_min": 0.9487819671630859,
"rewards/margins_std": 1.2787930965423584,
"rewards/rejected": -3.396273374557495,
"step": 610
},
{
"epoch": 0.91,
"grad_norm": 6.630649676820421,
"learning_rate": 1.324182339461544e-07,
"logits/chosen": -1.8650553226470947,
"logits/rejected": -1.8343786001205444,
"logps/chosen": -429.2254333496094,
"logps/rejected": -518.1988525390625,
"loss": 0.3508,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.508697271347046,
"rewards/margins": 1.630495309829712,
"rewards/margins_max": 2.3903756141662598,
"rewards/margins_min": 0.8706151247024536,
"rewards/margins_std": 1.0746327638626099,
"rewards/rejected": -3.139192581176758,
"step": 620
},
{
"epoch": 0.92,
"grad_norm": 16.0557186788671,
"learning_rate": 9.451297839253915e-08,
"logits/chosen": -1.8724334239959717,
"logits/rejected": -1.7563972473144531,
"logps/chosen": -476.4231872558594,
"logps/rejected": -727.5858154296875,
"loss": 0.3078,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.4125255346298218,
"rewards/margins": 2.7260119915008545,
"rewards/margins_max": 3.8035919666290283,
"rewards/margins_min": 1.6484321355819702,
"rewards/margins_std": 1.5239282846450806,
"rewards/rejected": -4.138537406921387,
"step": 630
},
{
"epoch": 0.94,
"grad_norm": 10.546642069187087,
"learning_rate": 6.288336382349463e-08,
"logits/chosen": -1.8520616292953491,
"logits/rejected": -1.7307716608047485,
"logps/chosen": -558.7571411132812,
"logps/rejected": -656.2450561523438,
"loss": 0.2733,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.7490208148956299,
"rewards/margins": 2.0255560874938965,
"rewards/margins_max": 2.8944671154022217,
"rewards/margins_min": 1.1566449403762817,
"rewards/margins_std": 1.2288259267807007,
"rewards/rejected": -3.7745769023895264,
"step": 640
},
{
"epoch": 0.95,
"grad_norm": 14.096656651836478,
"learning_rate": 3.761190829201067e-08,
"logits/chosen": -1.8188579082489014,
"logits/rejected": -1.7514938116073608,
"logps/chosen": -534.6187133789062,
"logps/rejected": -567.9749755859375,
"loss": 0.3434,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.4054492712020874,
"rewards/margins": 1.610443115234375,
"rewards/margins_max": 2.8384835720062256,
"rewards/margins_min": 0.38240256905555725,
"rewards/margins_std": 1.7367115020751953,
"rewards/rejected": -3.015892505645752,
"step": 650
},
{
"epoch": 0.96,
"grad_norm": 8.340110871014865,
"learning_rate": 1.876454214011253e-08,
"logits/chosen": -1.8232501745224,
"logits/rejected": -1.779158592224121,
"logps/chosen": -416.4646911621094,
"logps/rejected": -530.9276123046875,
"loss": 0.3517,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5091989040374756,
"rewards/margins": 1.5967410802841187,
"rewards/margins_max": 2.6342692375183105,
"rewards/margins_min": 0.5592130422592163,
"rewards/margins_std": 1.4672863483428955,
"rewards/rejected": -3.105940103530884,
"step": 660
},
{
"epoch": 0.98,
"grad_norm": 11.832885590441766,
"learning_rate": 6.390435994127753e-09,
"logits/chosen": -1.7567815780639648,
"logits/rejected": -1.7844308614730835,
"logps/chosen": -505.89019775390625,
"logps/rejected": -757.8609619140625,
"loss": 0.4029,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9832277297973633,
"rewards/margins": 2.035264492034912,
"rewards/margins_max": 3.0966219902038574,
"rewards/margins_min": 0.9739071130752563,
"rewards/margins_std": 1.500985860824585,
"rewards/rejected": -4.018492221832275,
"step": 670
},
{
"epoch": 0.99,
"grad_norm": 7.732345900580283,
"learning_rate": 5.218724841346556e-10,
"logits/chosen": -1.5676209926605225,
"logits/rejected": -1.594948172569275,
"logps/chosen": -497.2765197753906,
"logps/rejected": -658.2857666015625,
"loss": 0.3008,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.0746806859970093,
"rewards/margins": 2.671840190887451,
"rewards/margins_max": 3.8470091819763184,
"rewards/margins_min": 1.4966704845428467,
"rewards/margins_std": 1.661940336227417,
"rewards/rejected": -3.74652099609375,
"step": 680
},
{
"epoch": 1.0,
"step": 684,
"total_flos": 0.0,
"train_loss": 0.42957196057888497,
"train_runtime": 6346.2002,
"train_samples_per_second": 1.724,
"train_steps_per_second": 0.108
}
],
"logging_steps": 10,
"max_steps": 684,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}