{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9982631930527722,
"eval_steps": 400,
"global_step": 467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01068804275217101,
"grad_norm": 47.923506570215594,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -1.0110366344451904,
"logits/rejected": -0.9818881750106812,
"logps/chosen": -0.27409863471984863,
"logps/rejected": -0.27151164412498474,
"loss": 3.0607,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -2.7409865856170654,
"rewards/margins": -0.025869915261864662,
"rewards/rejected": -2.715116500854492,
"step": 5
},
{
"epoch": 0.02137608550434202,
"grad_norm": 39.987585891736785,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -1.0418651103973389,
"logits/rejected": -0.9748126864433289,
"logps/chosen": -0.2945522964000702,
"logps/rejected": -0.29994362592697144,
"loss": 3.0104,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -2.9455230236053467,
"rewards/margins": 0.05391312763094902,
"rewards/rejected": -2.999436378479004,
"step": 10
},
{
"epoch": 0.03206412825651302,
"grad_norm": 52.07278122268582,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -0.963701069355011,
"logits/rejected": -0.9835487604141235,
"logps/chosen": -0.2644619345664978,
"logps/rejected": -0.3007102608680725,
"loss": 3.0162,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.6446194648742676,
"rewards/margins": 0.362483412027359,
"rewards/rejected": -3.0071024894714355,
"step": 15
},
{
"epoch": 0.04275217100868404,
"grad_norm": 93.33861075914483,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -0.9671205282211304,
"logits/rejected": -0.9406957626342773,
"logps/chosen": -0.27761051058769226,
"logps/rejected": -0.2907746732234955,
"loss": 2.9342,
"rewards/accuracies": 0.5,
"rewards/chosen": -2.7761049270629883,
"rewards/margins": 0.13164177536964417,
"rewards/rejected": -2.9077467918395996,
"step": 20
},
{
"epoch": 0.053440213760855046,
"grad_norm": 52.349708457694014,
"learning_rate": 5.319148936170212e-07,
"logits/chosen": -1.015834093093872,
"logits/rejected": -0.9864752888679504,
"logps/chosen": -0.2717323899269104,
"logps/rejected": -0.27839282155036926,
"loss": 3.1216,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.7173242568969727,
"rewards/margins": 0.06660404056310654,
"rewards/rejected": -2.783928394317627,
"step": 25
},
{
"epoch": 0.06412825651302605,
"grad_norm": 45.104515251326376,
"learning_rate": 6.382978723404255e-07,
"logits/chosen": -0.9981824159622192,
"logits/rejected": -0.9536676406860352,
"logps/chosen": -0.2733208239078522,
"logps/rejected": -0.2788906693458557,
"loss": 2.9453,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -2.733208179473877,
"rewards/margins": 0.055698495358228683,
"rewards/rejected": -2.7889065742492676,
"step": 30
},
{
"epoch": 0.07481629926519706,
"grad_norm": 61.54928932943931,
"learning_rate": 7.446808510638297e-07,
"logits/chosen": -1.051733136177063,
"logits/rejected": -0.9763606190681458,
"logps/chosen": -0.2938762605190277,
"logps/rejected": -0.3207188844680786,
"loss": 2.9156,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.938762664794922,
"rewards/margins": 0.26842620968818665,
"rewards/rejected": -3.2071890830993652,
"step": 35
},
{
"epoch": 0.08550434201736808,
"grad_norm": 55.913783341396325,
"learning_rate": 8.51063829787234e-07,
"logits/chosen": -1.0160491466522217,
"logits/rejected": -0.9717121124267578,
"logps/chosen": -0.27992749214172363,
"logps/rejected": -0.32374969124794006,
"loss": 2.9079,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.7992749214172363,
"rewards/margins": 0.43822187185287476,
"rewards/rejected": -3.237496852874756,
"step": 40
},
{
"epoch": 0.09619238476953908,
"grad_norm": 38.79733201252679,
"learning_rate": 9.574468085106384e-07,
"logits/chosen": -1.0506359338760376,
"logits/rejected": -1.0073621273040771,
"logps/chosen": -0.3326144218444824,
"logps/rejected": -0.38409319519996643,
"loss": 2.9658,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -3.3261444568634033,
"rewards/margins": 0.5147874355316162,
"rewards/rejected": -3.8409321308135986,
"step": 45
},
{
"epoch": 0.10688042752171009,
"grad_norm": 101.77454221179983,
"learning_rate": 9.998741174712533e-07,
"logits/chosen": -1.028257131576538,
"logits/rejected": -0.9783049821853638,
"logps/chosen": -0.3342127203941345,
"logps/rejected": -0.3756522536277771,
"loss": 2.9987,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -3.342127561569214,
"rewards/margins": 0.4143945574760437,
"rewards/rejected": -3.7565224170684814,
"step": 50
},
{
"epoch": 0.11756847027388109,
"grad_norm": 70.06029649060484,
"learning_rate": 9.991050648838675e-07,
"logits/chosen": -1.0614262819290161,
"logits/rejected": -1.025525689125061,
"logps/chosen": -0.2905944287776947,
"logps/rejected": -0.35211512446403503,
"loss": 2.7815,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.905944347381592,
"rewards/margins": 0.6152070164680481,
"rewards/rejected": -3.521151065826416,
"step": 55
},
{
"epoch": 0.1282565130260521,
"grad_norm": 49.123079394299815,
"learning_rate": 9.97637968732563e-07,
"logits/chosen": -1.0964637994766235,
"logits/rejected": -1.061679720878601,
"logps/chosen": -0.3209289014339447,
"logps/rejected": -0.3418692350387573,
"loss": 2.8596,
"rewards/accuracies": 0.5625,
"rewards/chosen": -3.209289073944092,
"rewards/margins": 0.2094031274318695,
"rewards/rejected": -3.4186923503875732,
"step": 60
},
{
"epoch": 0.13894455577822312,
"grad_norm": 53.59523574650431,
"learning_rate": 9.954748808839674e-07,
"logits/chosen": -1.0083563327789307,
"logits/rejected": -0.9795120358467102,
"logps/chosen": -0.3694208264350891,
"logps/rejected": -0.4273703694343567,
"loss": 2.7899,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -3.6942081451416016,
"rewards/margins": 0.5794947743415833,
"rewards/rejected": -4.273703098297119,
"step": 65
},
{
"epoch": 0.14963259853039412,
"grad_norm": 39.11989937521066,
"learning_rate": 9.926188266120295e-07,
"logits/chosen": -1.02675461769104,
"logits/rejected": -1.0018466711044312,
"logps/chosen": -0.35180264711380005,
"logps/rejected": -0.4284419119358063,
"loss": 2.8671,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -3.518026828765869,
"rewards/margins": 0.7663925290107727,
"rewards/rejected": -4.284419059753418,
"step": 70
},
{
"epoch": 0.16032064128256512,
"grad_norm": 51.11281867224414,
"learning_rate": 9.890738003669027e-07,
"logits/chosen": -0.9933602213859558,
"logits/rejected": -0.9224111437797546,
"logps/chosen": -0.3594875931739807,
"logps/rejected": -0.40996867418289185,
"loss": 2.7704,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -3.5948760509490967,
"rewards/margins": 0.5048106908798218,
"rewards/rejected": -4.099686622619629,
"step": 75
},
{
"epoch": 0.17100868403473615,
"grad_norm": 48.022103189017436,
"learning_rate": 9.848447601883433e-07,
"logits/chosen": -0.9609634280204773,
"logits/rejected": -0.9471040964126587,
"logps/chosen": -0.35821908712387085,
"logps/rejected": -0.45667845010757446,
"loss": 2.6966,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -3.582190990447998,
"rewards/margins": 0.984593391418457,
"rewards/rejected": -4.566784858703613,
"step": 80
},
{
"epoch": 0.18169672678690715,
"grad_norm": 54.03450562178558,
"learning_rate": 9.799376207714444e-07,
"logits/chosen": -0.9785356521606445,
"logits/rejected": -0.9566847085952759,
"logps/chosen": -0.3405897319316864,
"logps/rejected": -0.4017128050327301,
"loss": 2.6144,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -3.405897617340088,
"rewards/margins": 0.6112309098243713,
"rewards/rejected": -4.0171284675598145,
"step": 85
},
{
"epoch": 0.19238476953907815,
"grad_norm": 62.750052897303675,
"learning_rate": 9.743592451943998e-07,
"logits/chosen": -1.0191900730133057,
"logits/rejected": -0.9845901727676392,
"logps/chosen": -0.4232923090457916,
"logps/rejected": -0.5109944939613342,
"loss": 2.866,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -4.23292350769043,
"rewards/margins": 0.8770216107368469,
"rewards/rejected": -5.109944820404053,
"step": 90
},
{
"epoch": 0.20307281229124916,
"grad_norm": 55.61240306403997,
"learning_rate": 9.681174353198686e-07,
"logits/chosen": -1.1014890670776367,
"logits/rejected": -1.0177241563796997,
"logps/chosen": -0.4533822536468506,
"logps/rejected": -0.4995104670524597,
"loss": 2.7432,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -4.533822536468506,
"rewards/margins": 0.4612821042537689,
"rewards/rejected": -4.995104789733887,
"step": 95
},
{
"epoch": 0.21376085504342018,
"grad_norm": 80.5027346612393,
"learning_rate": 9.612209208833646e-07,
"logits/chosen": -0.9957372546195984,
"logits/rejected": -0.9701834917068481,
"logps/chosen": -0.43816161155700684,
"logps/rejected": -0.5128804445266724,
"loss": 2.7813,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -4.381616592407227,
"rewards/margins": 0.7471875548362732,
"rewards/rejected": -5.128803253173828,
"step": 100
},
{
"epoch": 0.22444889779559118,
"grad_norm": 66.31806821536476,
"learning_rate": 9.536793472839324e-07,
"logits/chosen": -0.9997787475585938,
"logits/rejected": -0.947482705116272,
"logps/chosen": -0.4254922866821289,
"logps/rejected": -0.5347083806991577,
"loss": 2.7046,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -4.254923343658447,
"rewards/margins": 1.092160701751709,
"rewards/rejected": -5.347084045410156,
"step": 105
},
{
"epoch": 0.23513694054776219,
"grad_norm": 61.1266120827584,
"learning_rate": 9.455032620941839e-07,
"logits/chosen": -0.9583929181098938,
"logits/rejected": -0.8993922472000122,
"logps/chosen": -0.4909549355506897,
"logps/rejected": -0.620493471622467,
"loss": 2.6559,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -4.909549713134766,
"rewards/margins": 1.2953848838806152,
"rewards/rejected": -6.204934120178223,
"step": 110
},
{
"epoch": 0.2458249832999332,
"grad_norm": 66.56145340935555,
"learning_rate": 9.367041003085648e-07,
"logits/chosen": -1.019431471824646,
"logits/rejected": -0.9595627784729004,
"logps/chosen": -0.5270282030105591,
"logps/rejected": -0.600238025188446,
"loss": 2.4928,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -5.27028226852417,
"rewards/margins": 0.7320979833602905,
"rewards/rejected": -6.00238037109375,
"step": 115
},
{
"epoch": 0.2565130260521042,
"grad_norm": 66.16205862286387,
"learning_rate": 9.272941683504808e-07,
"logits/chosen": -0.9745362997055054,
"logits/rejected": -0.8843653798103333,
"logps/chosen": -0.5472803115844727,
"logps/rejected": -0.7492850422859192,
"loss": 2.3982,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -5.472803115844727,
"rewards/margins": 2.020047187805176,
"rewards/rejected": -7.492850303649902,
"step": 120
},
{
"epoch": 0.26720106880427524,
"grad_norm": 70.88843943146098,
"learning_rate": 9.172866268606513e-07,
"logits/chosen": -1.0500959157943726,
"logits/rejected": -1.007611632347107,
"logps/chosen": -0.6212247610092163,
"logps/rejected": -0.7247714996337891,
"loss": 2.3233,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -6.212247371673584,
"rewards/margins": 1.0354671478271484,
"rewards/rejected": -7.247714042663574,
"step": 125
},
{
"epoch": 0.27788911155644624,
"grad_norm": 94.40161191780366,
"learning_rate": 9.066954722907638e-07,
"logits/chosen": -1.0675666332244873,
"logits/rejected": -1.0614221096038818,
"logps/chosen": -0.6142371892929077,
"logps/rejected": -0.8813148736953735,
"loss": 2.1102,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -6.1423726081848145,
"rewards/margins": 2.670776844024658,
"rewards/rejected": -8.813148498535156,
"step": 130
},
{
"epoch": 0.28857715430861725,
"grad_norm": 71.42739738901432,
"learning_rate": 8.955355173281707e-07,
"logits/chosen": -1.0529481172561646,
"logits/rejected": -1.0047996044158936,
"logps/chosen": -0.7235802412033081,
"logps/rejected": -0.8823626637458801,
"loss": 2.1377,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -7.235803127288818,
"rewards/margins": 1.5878244638442993,
"rewards/rejected": -8.823626518249512,
"step": 135
},
{
"epoch": 0.29926519706078825,
"grad_norm": 87.9759333714625,
"learning_rate": 8.838223701790055e-07,
"logits/chosen": -1.1124293804168701,
"logits/rejected": -1.0896517038345337,
"logps/chosen": -0.862978458404541,
"logps/rejected": -1.0037717819213867,
"loss": 2.1017,
"rewards/accuracies": 0.75,
"rewards/chosen": -8.62978458404541,
"rewards/margins": 1.4079326391220093,
"rewards/rejected": -10.037717819213867,
"step": 140
},
{
"epoch": 0.30995323981295925,
"grad_norm": 78.07225371686874,
"learning_rate": 8.71572412738697e-07,
"logits/chosen": -1.030829906463623,
"logits/rejected": -1.0042556524276733,
"logps/chosen": -0.8588500022888184,
"logps/rejected": -1.1039783954620361,
"loss": 2.0002,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -8.588499069213867,
"rewards/margins": 2.4512839317321777,
"rewards/rejected": -11.039785385131836,
"step": 145
},
{
"epoch": 0.32064128256513025,
"grad_norm": 70.30730129459549,
"learning_rate": 8.588027776804058e-07,
"logits/chosen": -1.060490369796753,
"logits/rejected": -1.0404036045074463,
"logps/chosen": -0.9423840641975403,
"logps/rejected": -1.1874125003814697,
"loss": 1.9455,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -9.42384147644043,
"rewards/margins": 2.4502837657928467,
"rewards/rejected": -11.874125480651855,
"step": 150
},
{
"epoch": 0.33132932531730125,
"grad_norm": 81.54625041986957,
"learning_rate": 8.455313244934324e-07,
"logits/chosen": -1.0910407304763794,
"logits/rejected": -1.0684020519256592,
"logps/chosen": -0.9991434812545776,
"logps/rejected": -1.3156726360321045,
"loss": 2.0451,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -9.991434097290039,
"rewards/margins": 3.165290355682373,
"rewards/rejected": -13.15672492980957,
"step": 155
},
{
"epoch": 0.3420173680694723,
"grad_norm": 78.5490421908409,
"learning_rate": 8.317766145051057e-07,
"logits/chosen": -1.109403371810913,
"logits/rejected": -1.090001106262207,
"logps/chosen": -1.1215949058532715,
"logps/rejected": -1.5121821165084839,
"loss": 1.9436,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -11.215949058532715,
"rewards/margins": 3.9058711528778076,
"rewards/rejected": -15.121821403503418,
"step": 160
},
{
"epoch": 0.3527054108216433,
"grad_norm": 59.08371857927558,
"learning_rate": 8.175578849210894e-07,
"logits/chosen": -1.1232795715332031,
"logits/rejected": -1.0980435609817505,
"logps/chosen": -1.0903780460357666,
"logps/rejected": -1.459205150604248,
"loss": 1.8384,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -10.903780937194824,
"rewards/margins": 3.688269853591919,
"rewards/rejected": -14.59205150604248,
"step": 165
},
{
"epoch": 0.3633934535738143,
"grad_norm": 85.71218468828272,
"learning_rate": 8.028950219204099e-07,
"logits/chosen": -1.1307361125946045,
"logits/rejected": -1.1074953079223633,
"logps/chosen": -1.0654685497283936,
"logps/rejected": -1.4472792148590088,
"loss": 1.7884,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -10.654685020446777,
"rewards/margins": 3.818106174468994,
"rewards/rejected": -14.47279167175293,
"step": 170
},
{
"epoch": 0.3740814963259853,
"grad_norm": 92.85957749639208,
"learning_rate": 7.878085328428368e-07,
"logits/chosen": -1.1518357992172241,
"logits/rejected": -1.102372407913208,
"logps/chosen": -1.1460392475128174,
"logps/rejected": -1.4155685901641846,
"loss": 1.6771,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -11.460393905639648,
"rewards/margins": 2.695291757583618,
"rewards/rejected": -14.155685424804688,
"step": 175
},
{
"epoch": 0.3847695390781563,
"grad_norm": 75.98858315922392,
"learning_rate": 7.723195175075135e-07,
"logits/chosen": -1.0996112823486328,
"logits/rejected": -1.0788969993591309,
"logps/chosen": -1.1098445653915405,
"logps/rejected": -1.476881504058838,
"loss": 1.6011,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -11.098443984985352,
"rewards/margins": 3.6703686714172363,
"rewards/rejected": -14.76881217956543,
"step": 180
},
{
"epoch": 0.3954575818303273,
"grad_norm": 88.81502196023631,
"learning_rate": 7.564496387029531e-07,
"logits/chosen": -1.1378796100616455,
"logits/rejected": -1.0828906297683716,
"logps/chosen": -1.1474685668945312,
"logps/rejected": -1.5796287059783936,
"loss": 1.6663,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -11.474684715270996,
"rewards/margins": 4.321602821350098,
"rewards/rejected": -15.796287536621094,
"step": 185
},
{
"epoch": 0.4061456245824983,
"grad_norm": 90.03714203036446,
"learning_rate": 7.402210918896689e-07,
"logits/chosen": -1.1278326511383057,
"logits/rejected": -1.1358839273452759,
"logps/chosen": -1.2729408740997314,
"logps/rejected": -1.7558482885360718,
"loss": 1.5442,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -12.729410171508789,
"rewards/margins": 4.829073905944824,
"rewards/rejected": -17.558483123779297,
"step": 190
},
{
"epoch": 0.4168336673346693,
"grad_norm": 72.89600233357321,
"learning_rate": 7.236565741578162e-07,
"logits/chosen": -1.0958189964294434,
"logits/rejected": -1.076554775238037,
"logps/chosen": -1.2896816730499268,
"logps/rejected": -1.6636635065078735,
"loss": 1.6021,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -12.896817207336426,
"rewards/margins": 3.7398200035095215,
"rewards/rejected": -16.636634826660156,
"step": 195
},
{
"epoch": 0.42752171008684037,
"grad_norm": 93.9340667463585,
"learning_rate": 7.067792524832603e-07,
"logits/chosen": -1.0816549062728882,
"logits/rejected": -1.0706536769866943,
"logps/chosen": -1.3197344541549683,
"logps/rejected": -1.7450058460235596,
"loss": 1.5092,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -13.197346687316895,
"rewards/margins": 4.252710819244385,
"rewards/rejected": -17.450056076049805,
"step": 200
},
{
"epoch": 0.43820975283901137,
"grad_norm": 91.15403821743105,
"learning_rate": 6.896127313264642e-07,
"logits/chosen": -1.1295057535171509,
"logits/rejected": -1.0786478519439697,
"logps/chosen": -1.3944091796875,
"logps/rejected": -1.8417927026748657,
"loss": 1.7223,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -13.944093704223633,
"rewards/margins": 4.473834037780762,
"rewards/rejected": -18.417926788330078,
"step": 205
},
{
"epoch": 0.44889779559118237,
"grad_norm": 99.88773415242756,
"learning_rate": 6.721810196195174e-07,
"logits/chosen": -1.1591789722442627,
"logits/rejected": -1.147062063217163,
"logps/chosen": -1.3990533351898193,
"logps/rejected": -1.8112404346466064,
"loss": 1.6082,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -13.990533828735352,
"rewards/margins": 4.121870040893555,
"rewards/rejected": -18.112403869628906,
"step": 210
},
{
"epoch": 0.45958583834335337,
"grad_norm": 108.24791172325133,
"learning_rate": 6.545084971874736e-07,
"logits/chosen": -1.1198530197143555,
"logits/rejected": -1.101109504699707,
"logps/chosen": -1.390649437904358,
"logps/rejected": -1.8630450963974,
"loss": 1.4791,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -13.906494140625,
"rewards/margins": 4.7239580154418945,
"rewards/rejected": -18.630451202392578,
"step": 215
},
{
"epoch": 0.47027388109552437,
"grad_norm": 95.83911690989143,
"learning_rate": 6.3661988065096e-07,
"logits/chosen": -1.1780140399932861,
"logits/rejected": -1.1579878330230713,
"logps/chosen": -1.4568861722946167,
"logps/rejected": -1.9470503330230713,
"loss": 1.4586,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -14.56886100769043,
"rewards/margins": 4.901640892028809,
"rewards/rejected": -19.470502853393555,
"step": 220
},
{
"epoch": 0.48096192384769537,
"grad_norm": 69.10471107204022,
"learning_rate": 6.185401888577487e-07,
"logits/chosen": -1.1474467515945435,
"logits/rejected": -1.1124647855758667,
"logps/chosen": -1.48002028465271,
"logps/rejected": -1.9400886297225952,
"loss": 1.4409,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -14.800203323364258,
"rewards/margins": 4.600685119628906,
"rewards/rejected": -19.400888442993164,
"step": 225
},
{
"epoch": 0.4916499665998664,
"grad_norm": 80.62484140193865,
"learning_rate": 6.002947078916364e-07,
"logits/chosen": -1.2124546766281128,
"logits/rejected": -1.161115050315857,
"logps/chosen": -1.4423153400421143,
"logps/rejected": -1.9036369323730469,
"loss": 1.3817,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -14.423154830932617,
"rewards/margins": 4.613214015960693,
"rewards/rejected": -19.03636932373047,
"step": 230
},
{
"epoch": 0.5023380093520374,
"grad_norm": 84.05912123531321,
"learning_rate": 5.819089557075688e-07,
"logits/chosen": -1.2311934232711792,
"logits/rejected": -1.2020883560180664,
"logps/chosen": -1.4844694137573242,
"logps/rejected": -1.9821853637695312,
"loss": 1.4172,
"rewards/accuracies": 0.78125,
"rewards/chosen": -14.844694137573242,
"rewards/margins": 4.97715950012207,
"rewards/rejected": -19.821855545043945,
"step": 235
},
{
"epoch": 0.5130260521042084,
"grad_norm": 84.00316536161533,
"learning_rate": 5.634086464424742e-07,
"logits/chosen": -1.198540449142456,
"logits/rejected": -1.1992590427398682,
"logps/chosen": -1.3957428932189941,
"logps/rejected": -1.8944737911224365,
"loss": 1.4343,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -13.957429885864258,
"rewards/margins": 4.987309455871582,
"rewards/rejected": -18.944738388061523,
"step": 240
},
{
"epoch": 0.5237140948563794,
"grad_norm": 137.49078119090206,
"learning_rate": 5.448196544517167e-07,
"logits/chosen": -1.2955886125564575,
"logits/rejected": -1.23685622215271,
"logps/chosen": -1.460442066192627,
"logps/rejected": -2.0612359046936035,
"loss": 1.3532,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -14.60442066192627,
"rewards/margins": 6.007939338684082,
"rewards/rejected": -20.612361907958984,
"step": 245
},
{
"epoch": 0.5344021376085505,
"grad_norm": 129.54289500612722,
"learning_rate": 5.26167978121472e-07,
"logits/chosen": -1.2238231897354126,
"logits/rejected": -1.2080833911895752,
"logps/chosen": -1.5243932008743286,
"logps/rejected": -2.1077561378479004,
"loss": 1.3459,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.243929862976074,
"rewards/margins": 5.833629608154297,
"rewards/rejected": -21.077558517456055,
"step": 250
},
{
"epoch": 0.5450901803607214,
"grad_norm": 102.89768684384153,
"learning_rate": 5.074797035076318e-07,
"logits/chosen": -1.267345905303955,
"logits/rejected": -1.2413192987442017,
"logps/chosen": -1.6330616474151611,
"logps/rejected": -2.104926824569702,
"loss": 1.4434,
"rewards/accuracies": 0.8125,
"rewards/chosen": -16.330615997314453,
"rewards/margins": 4.718654155731201,
"rewards/rejected": -21.049266815185547,
"step": 255
},
{
"epoch": 0.5557782231128925,
"grad_norm": 89.20095630673174,
"learning_rate": 4.887809678520975e-07,
"logits/chosen": -1.245793104171753,
"logits/rejected": -1.214970350265503,
"logps/chosen": -1.554158091545105,
"logps/rejected": -2.0427088737487793,
"loss": 1.4276,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -15.541582107543945,
"rewards/margins": 4.8855085372924805,
"rewards/rejected": -20.42708969116211,
"step": 260
},
{
"epoch": 0.5664662658650634,
"grad_norm": 76.055827552827,
"learning_rate": 4.700979230274829e-07,
"logits/chosen": -1.2102077007293701,
"logits/rejected": -1.1913068294525146,
"logps/chosen": -1.6448841094970703,
"logps/rejected": -2.155822277069092,
"loss": 1.3609,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -16.448841094970703,
"rewards/margins": 5.109385967254639,
"rewards/rejected": -21.5582275390625,
"step": 265
},
{
"epoch": 0.5771543086172345,
"grad_norm": 129.8909118017969,
"learning_rate": 4.514566989613559e-07,
"logits/chosen": -1.2157796621322632,
"logits/rejected": -1.186073899269104,
"logps/chosen": -1.4407769441604614,
"logps/rejected": -1.9774402379989624,
"loss": 1.2996,
"rewards/accuracies": 0.84375,
"rewards/chosen": -14.407770156860352,
"rewards/margins": 5.366633415222168,
"rewards/rejected": -19.774402618408203,
"step": 270
},
{
"epoch": 0.5878423513694054,
"grad_norm": 74.47995587471961,
"learning_rate": 4.328833670911724e-07,
"logits/chosen": -1.177504301071167,
"logits/rejected": -1.1408427953720093,
"logps/chosen": -1.4323005676269531,
"logps/rejected": -1.8869625329971313,
"loss": 1.4405,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -14.323003768920898,
"rewards/margins": 4.546619892120361,
"rewards/rejected": -18.869625091552734,
"step": 275
},
{
"epoch": 0.5985303941215765,
"grad_norm": 96.7891504750656,
"learning_rate": 4.144039039010124e-07,
"logits/chosen": -1.262804627418518,
"logits/rejected": -1.2378443479537964,
"logps/chosen": -1.4722181558609009,
"logps/rejected": -2.023758888244629,
"loss": 1.3202,
"rewards/accuracies": 0.8125,
"rewards/chosen": -14.72218132019043,
"rewards/margins": 5.515408515930176,
"rewards/rejected": -20.23758888244629,
"step": 280
},
{
"epoch": 0.6092184368737475,
"grad_norm": 98.51578082175142,
"learning_rate": 3.960441545911204e-07,
"logits/chosen": -1.2408018112182617,
"logits/rejected": -1.2075875997543335,
"logps/chosen": -1.5188751220703125,
"logps/rejected": -2.0878236293792725,
"loss": 1.0977,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -15.188751220703125,
"rewards/margins": 5.689483642578125,
"rewards/rejected": -20.878236770629883,
"step": 285
},
{
"epoch": 0.6199064796259185,
"grad_norm": 77.81254701258105,
"learning_rate": 3.778297969310529e-07,
"logits/chosen": -1.2707680463790894,
"logits/rejected": -1.2261282205581665,
"logps/chosen": -1.5314843654632568,
"logps/rejected": -2.0101191997528076,
"loss": 1.3577,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -15.314845085144043,
"rewards/margins": 4.786349296569824,
"rewards/rejected": -20.101192474365234,
"step": 290
},
{
"epoch": 0.6305945223780896,
"grad_norm": 87.64994632483507,
"learning_rate": 3.5978630534699865e-07,
"logits/chosen": -1.199864387512207,
"logits/rejected": -1.1842243671417236,
"logps/chosen": -1.545689344406128,
"logps/rejected": -2.0575714111328125,
"loss": 1.1919,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -15.456893920898438,
"rewards/margins": 5.118819713592529,
"rewards/rejected": -20.575714111328125,
"step": 295
},
{
"epoch": 0.6412825651302605,
"grad_norm": 84.61392252215398,
"learning_rate": 3.4193891529348795e-07,
"logits/chosen": -1.1328258514404297,
"logits/rejected": -1.1063092947006226,
"logps/chosen": -1.631317138671875,
"logps/rejected": -2.079132556915283,
"loss": 1.6288,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -16.31317138671875,
"rewards/margins": 4.478152275085449,
"rewards/rejected": -20.791322708129883,
"step": 300
},
{
"epoch": 0.6519706078824316,
"grad_norm": 84.46013666927763,
"learning_rate": 3.243125879593286e-07,
"logits/chosen": -1.2454413175582886,
"logits/rejected": -1.1997601985931396,
"logps/chosen": -1.6037687063217163,
"logps/rejected": -2.0645315647125244,
"loss": 1.3155,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -16.03768539428711,
"rewards/margins": 4.6076273918151855,
"rewards/rejected": -20.645313262939453,
"step": 305
},
{
"epoch": 0.6626586506346025,
"grad_norm": 91.59670184758677,
"learning_rate": 3.069319753571269e-07,
"logits/chosen": -1.2738150358200073,
"logits/rejected": -1.253278136253357,
"logps/chosen": -1.6317838430404663,
"logps/rejected": -2.138291835784912,
"loss": 1.372,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -16.31783676147461,
"rewards/margins": 5.065082550048828,
"rewards/rejected": -21.382923126220703,
"step": 310
},
{
"epoch": 0.6733466933867736,
"grad_norm": 87.02614481244046,
"learning_rate": 2.898213858452173e-07,
"logits/chosen": -1.284517526626587,
"logits/rejected": -1.2254732847213745,
"logps/chosen": -1.618208885192871,
"logps/rejected": -2.1373062133789062,
"loss": 1.3415,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.182092666625977,
"rewards/margins": 5.190975189208984,
"rewards/rejected": -21.373065948486328,
"step": 315
},
{
"epoch": 0.6840347361389446,
"grad_norm": 104.27219685818618,
"learning_rate": 2.730047501302266e-07,
"logits/chosen": -1.2704033851623535,
"logits/rejected": -1.2657862901687622,
"logps/chosen": -1.6442874670028687,
"logps/rejected": -2.2715744972229004,
"loss": 1.2614,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.4428768157959,
"rewards/margins": 6.2728681564331055,
"rewards/rejected": -22.715742111206055,
"step": 320
},
{
"epoch": 0.6947227788911156,
"grad_norm": 78.77445808060149,
"learning_rate": 2.5650558779781635e-07,
"logits/chosen": -1.2807691097259521,
"logits/rejected": -1.2303869724273682,
"logps/chosen": -1.6990268230438232,
"logps/rejected": -2.368220329284668,
"loss": 1.3078,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.99026870727539,
"rewards/margins": 6.6919355392456055,
"rewards/rejected": -23.682205200195312,
"step": 325
},
{
"epoch": 0.7054108216432866,
"grad_norm": 70.04351714156043,
"learning_rate": 2.403469744184154e-07,
"logits/chosen": -1.1783530712127686,
"logits/rejected": -1.136584758758545,
"logps/chosen": -1.6521613597869873,
"logps/rejected": -2.1305041313171387,
"loss": 1.3592,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -16.5216121673584,
"rewards/margins": 4.783430099487305,
"rewards/rejected": -21.305042266845703,
"step": 330
},
{
"epoch": 0.7160988643954576,
"grad_norm": 75.03379354143011,
"learning_rate": 2.2455150927394878e-07,
"logits/chosen": -1.2156535387039185,
"logits/rejected": -1.1975212097167969,
"logps/chosen": -1.6360639333724976,
"logps/rejected": -2.187391757965088,
"loss": 1.1952,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -16.360637664794922,
"rewards/margins": 5.513278484344482,
"rewards/rejected": -21.873918533325195,
"step": 335
},
{
"epoch": 0.7267869071476286,
"grad_norm": 103.30210442360509,
"learning_rate": 2.0914128375069722e-07,
"logits/chosen": -1.2379086017608643,
"logits/rejected": -1.2029554843902588,
"logps/chosen": -1.5814708471298218,
"logps/rejected": -2.1416497230529785,
"loss": 1.3219,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.814706802368164,
"rewards/margins": 5.601790428161621,
"rewards/rejected": -21.41649627685547,
"step": 340
},
{
"epoch": 0.7374749498997996,
"grad_norm": 87.41940209533863,
"learning_rate": 1.9413785044249676e-07,
"logits/chosen": -1.254070520401001,
"logits/rejected": -1.2306808233261108,
"logps/chosen": -1.665123701095581,
"logps/rejected": -2.303457021713257,
"loss": 1.3788,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -16.65123748779297,
"rewards/margins": 6.3833327293396,
"rewards/rejected": -23.034570693969727,
"step": 345
},
{
"epoch": 0.7481629926519706,
"grad_norm": 147.74903637059256,
"learning_rate": 1.7956219300748792e-07,
"logits/chosen": -1.2324841022491455,
"logits/rejected": -1.2356057167053223,
"logps/chosen": -1.5469470024108887,
"logps/rejected": -2.0821375846862793,
"loss": 1.2883,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -15.469470024108887,
"rewards/margins": 5.351906776428223,
"rewards/rejected": -20.82137680053711,
"step": 350
},
{
"epoch": 0.7588510354041417,
"grad_norm": 66.59688038674247,
"learning_rate": 1.6543469682057104e-07,
"logits/chosen": -1.1590429544448853,
"logits/rejected": -1.1739274263381958,
"logps/chosen": -1.5381479263305664,
"logps/rejected": -2.0882415771484375,
"loss": 1.181,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.381479263305664,
"rewards/margins": 5.500934600830078,
"rewards/rejected": -20.882413864135742,
"step": 355
},
{
"epoch": 0.7695390781563126,
"grad_norm": 79.30848988409956,
"learning_rate": 1.5177512046261666e-07,
"logits/chosen": -1.2186603546142578,
"logits/rejected": -1.2177612781524658,
"logps/chosen": -1.5483216047286987,
"logps/rejected": -2.190535306930542,
"loss": 1.3007,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -15.48321533203125,
"rewards/margins": 6.422137260437012,
"rewards/rejected": -21.905353546142578,
"step": 360
},
{
"epoch": 0.7802271209084837,
"grad_norm": 71.1995833686848,
"learning_rate": 1.3860256808630427e-07,
"logits/chosen": -1.2522964477539062,
"logits/rejected": -1.1880736351013184,
"logps/chosen": -1.599200963973999,
"logps/rejected": -2.2274394035339355,
"loss": 1.294,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -15.992010116577148,
"rewards/margins": 6.28238582611084,
"rewards/rejected": -22.274394989013672,
"step": 365
},
{
"epoch": 0.7909151636606546,
"grad_norm": 97.88071644925103,
"learning_rate": 1.2593546269723647e-07,
"logits/chosen": -1.1726973056793213,
"logits/rejected": -1.1615909337997437,
"logps/chosen": -1.5849040746688843,
"logps/rejected": -2.063690662384033,
"loss": 1.2653,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.849041938781738,
"rewards/margins": 4.787867546081543,
"rewards/rejected": -20.63690948486328,
"step": 370
},
{
"epoch": 0.8016032064128257,
"grad_norm": 93.22521656476633,
"learning_rate": 1.1379152038770029e-07,
"logits/chosen": -1.2195771932601929,
"logits/rejected": -1.223771095275879,
"logps/chosen": -1.7087455987930298,
"logps/rejected": -2.2848830223083496,
"loss": 1.2583,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -17.087453842163086,
"rewards/margins": 5.76137638092041,
"rewards/rejected": -22.848833084106445,
"step": 375
},
{
"epoch": 0.8122912491649966,
"grad_norm": 122.65282224004734,
"learning_rate": 1.0218772555910954e-07,
"logits/chosen": -1.2245140075683594,
"logits/rejected": -1.2064614295959473,
"logps/chosen": -1.5752016305923462,
"logps/rejected": -2.1021199226379395,
"loss": 1.4127,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.752016067504883,
"rewards/margins": 5.2691850662231445,
"rewards/rejected": -21.021198272705078,
"step": 380
},
{
"epoch": 0.8229792919171677,
"grad_norm": 79.05677313510314,
"learning_rate": 9.114030716778432e-08,
"logits/chosen": -1.2155396938323975,
"logits/rejected": -1.194136142730713,
"logps/chosen": -1.5979677438735962,
"logps/rejected": -2.291325330734253,
"loss": 1.0803,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -15.9796781539917,
"rewards/margins": 6.9335784912109375,
"rewards/rejected": -22.91325569152832,
"step": 385
},
{
"epoch": 0.8336673346693386,
"grad_norm": 83.81957692142457,
"learning_rate": 8.066471602728803e-08,
"logits/chosen": -1.223331093788147,
"logits/rejected": -1.209160327911377,
"logps/chosen": -1.668593406677246,
"logps/rejected": -2.259793519973755,
"loss": 1.2346,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -16.68593406677246,
"rewards/margins": 5.911999702453613,
"rewards/rejected": -22.59793472290039,
"step": 390
},
{
"epoch": 0.8443553774215097,
"grad_norm": 76.36054598990746,
"learning_rate": 7.077560319906694e-08,
"logits/chosen": -1.2313239574432373,
"logits/rejected": -1.211395502090454,
"logps/chosen": -1.5763094425201416,
"logps/rejected": -2.1317121982574463,
"loss": 1.2849,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -15.763093948364258,
"rewards/margins": 5.554028511047363,
"rewards/rejected": -21.317119598388672,
"step": 395
},
{
"epoch": 0.8550434201736807,
"grad_norm": 62.72495111163961,
"learning_rate": 6.148679950161672e-08,
"logits/chosen": -1.2403868436813354,
"logits/rejected": -1.2243949174880981,
"logps/chosen": -1.6198228597640991,
"logps/rejected": -2.1331706047058105,
"loss": 1.2445,
"rewards/accuracies": 0.8125,
"rewards/chosen": -16.19822883605957,
"rewards/margins": 5.133477687835693,
"rewards/rejected": -21.331707000732422,
"step": 400
},
{
"epoch": 0.8550434201736807,
"eval_logits/chosen": -1.4456316232681274,
"eval_logits/rejected": -1.4547516107559204,
"eval_logps/chosen": -1.623605728149414,
"eval_logps/rejected": -2.176786422729492,
"eval_loss": 1.3307912349700928,
"eval_rewards/accuracies": 0.8353658318519592,
"eval_rewards/chosen": -16.23605728149414,
"eval_rewards/margins": 5.531808376312256,
"eval_rewards/rejected": -21.767864227294922,
"eval_runtime": 94.8719,
"eval_samples_per_second": 20.67,
"eval_steps_per_second": 1.296,
"step": 400
},
{
"epoch": 0.8657314629258517,
"grad_norm": 99.36460974571031,
"learning_rate": 5.2811296166831666e-08,
"logits/chosen": -1.2008370161056519,
"logits/rejected": -1.2194417715072632,
"logps/chosen": -1.7075388431549072,
"logps/rejected": -2.2549824714660645,
"loss": 1.2048,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -17.075387954711914,
"rewards/margins": 5.474437713623047,
"rewards/rejected": -22.54982566833496,
"step": 405
},
{
"epoch": 0.8764195056780227,
"grad_norm": 138.34780783301264,
"learning_rate": 4.4761226670592066e-08,
"logits/chosen": -1.216778039932251,
"logits/rejected": -1.2035914659500122,
"logps/chosen": -1.6350791454315186,
"logps/rejected": -2.172778844833374,
"loss": 1.3538,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -16.350793838500977,
"rewards/margins": 5.376997947692871,
"rewards/rejected": -21.727787017822266,
"step": 410
},
{
"epoch": 0.8871075484301937,
"grad_norm": 72.98512679113071,
"learning_rate": 3.734784976300165e-08,
"logits/chosen": -1.2200865745544434,
"logits/rejected": -1.1671762466430664,
"logps/chosen": -1.5793800354003906,
"logps/rejected": -2.232057571411133,
"loss": 1.3478,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.793802261352539,
"rewards/margins": 6.5267744064331055,
"rewards/rejected": -22.32057762145996,
"step": 415
},
{
"epoch": 0.8977955911823647,
"grad_norm": 98.07615613251582,
"learning_rate": 3.058153372200695e-08,
"logits/chosen": -1.2454715967178345,
"logits/rejected": -1.195953607559204,
"logps/chosen": -1.5281785726547241,
"logps/rejected": -2.1256656646728516,
"loss": 1.2056,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.28178596496582,
"rewards/margins": 5.974873065948486,
"rewards/rejected": -21.25665855407715,
"step": 420
},
{
"epoch": 0.9084836339345357,
"grad_norm": 106.16711447135498,
"learning_rate": 2.4471741852423233e-08,
"logits/chosen": -1.2416163682937622,
"logits/rejected": -1.231783390045166,
"logps/chosen": -1.7100231647491455,
"logps/rejected": -2.2363858222961426,
"loss": 1.4487,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -17.100229263305664,
"rewards/margins": 5.263625621795654,
"rewards/rejected": -22.36385726928711,
"step": 425
},
{
"epoch": 0.9191716766867067,
"grad_norm": 104.40358802274892,
"learning_rate": 1.9027019250647036e-08,
"logits/chosen": -1.2276403903961182,
"logits/rejected": -1.211700439453125,
"logps/chosen": -1.6992714405059814,
"logps/rejected": -2.2929625511169434,
"loss": 1.2603,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.992717742919922,
"rewards/margins": 5.9369096755981445,
"rewards/rejected": -22.92962646484375,
"step": 430
},
{
"epoch": 0.9298597194388778,
"grad_norm": 76.71410236428167,
"learning_rate": 1.4254980853566246e-08,
"logits/chosen": -1.1829754114151,
"logits/rejected": -1.1444637775421143,
"logps/chosen": -1.5611233711242676,
"logps/rejected": -2.1490211486816406,
"loss": 1.2066,
"rewards/accuracies": 0.875,
"rewards/chosen": -15.611233711242676,
"rewards/margins": 5.8789777755737305,
"rewards/rejected": -21.490211486816406,
"step": 435
},
{
"epoch": 0.9405477621910487,
"grad_norm": 89.37849635838704,
"learning_rate": 1.016230078838226e-08,
"logits/chosen": -1.2062740325927734,
"logits/rejected": -1.148478627204895,
"logps/chosen": -1.6622101068496704,
"logps/rejected": -2.1926727294921875,
"loss": 1.253,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -16.622098922729492,
"rewards/margins": 5.304628849029541,
"rewards/rejected": -21.926727294921875,
"step": 440
},
{
"epoch": 0.9512358049432198,
"grad_norm": 78.38263983661439,
"learning_rate": 6.754703038239329e-09,
"logits/chosen": -1.1661369800567627,
"logits/rejected": -1.1493126153945923,
"logps/chosen": -1.6586837768554688,
"logps/rejected": -2.293992280960083,
"loss": 1.0815,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -16.58683967590332,
"rewards/margins": 6.353082180023193,
"rewards/rejected": -22.939918518066406,
"step": 445
},
{
"epoch": 0.9619238476953907,
"grad_norm": 85.39108439896182,
"learning_rate": 4.036953436716895e-09,
"logits/chosen": -1.2724522352218628,
"logits/rejected": -1.2523143291473389,
"logps/chosen": -1.607690453529358,
"logps/rejected": -2.168273448944092,
"loss": 1.3282,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -16.076906204223633,
"rewards/margins": 5.605828285217285,
"rewards/rejected": -21.682735443115234,
"step": 450
},
{
"epoch": 0.9726118904475618,
"grad_norm": 102.09796631455698,
"learning_rate": 2.0128530023804656e-09,
"logits/chosen": -1.2248659133911133,
"logits/rejected": -1.1913433074951172,
"logps/chosen": -1.6112314462661743,
"logps/rejected": -2.2650115489959717,
"loss": 1.0445,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -16.112314224243164,
"rewards/margins": 6.537802696228027,
"rewards/rejected": -22.650117874145508,
"step": 455
},
{
"epoch": 0.9832999331997327,
"grad_norm": 90.8560495778911,
"learning_rate": 6.852326227130833e-10,
"logits/chosen": -1.2340514659881592,
"logits/rejected": -1.2225282192230225,
"logps/chosen": -1.6787292957305908,
"logps/rejected": -2.2820496559143066,
"loss": 1.1894,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.787290573120117,
"rewards/margins": 6.033202648162842,
"rewards/rejected": -22.82049560546875,
"step": 460
},
{
"epoch": 0.9939879759519038,
"grad_norm": 86.47798441930765,
"learning_rate": 5.594909486328348e-11,
"logits/chosen": -1.2089763879776,
"logits/rejected": -1.2146103382110596,
"logps/chosen": -1.703181266784668,
"logps/rejected": -2.318962335586548,
"loss": 1.3219,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -17.03181266784668,
"rewards/margins": 6.157810688018799,
"rewards/rejected": -23.189624786376953,
"step": 465
},
{
"epoch": 0.9982631930527722,
"step": 467,
"total_flos": 0.0,
"train_loss": 1.8032665589636858,
"train_runtime": 11474.0462,
"train_samples_per_second": 5.218,
"train_steps_per_second": 0.041
}
],
"logging_steps": 5,
"max_steps": 467,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}