Transformers
PyTorch
Inference Endpoints
mamba-2.8b-zephyr / trainer_state.json
xiuyul's picture
Upload folder using huggingface_hub
92c52cf
raw
history blame
157 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.998451213216314,
"eval_steps": 100,
"global_step": 2904,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.7182130584192438e-09,
"logits/chosen": 22.749126434326172,
"logits/rejected": 22.455398559570312,
"logps/chosen": -415.7331848144531,
"logps/rejected": -294.51483154296875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0,
"eval_logits/chosen": 23.82334327697754,
"eval_logits/rejected": 23.573287963867188,
"eval_logps/chosen": -354.5701599121094,
"eval_logps/rejected": -274.08343505859375,
"eval_loss": 0.6931473612785339,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": 0.0,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": 0.0,
"eval_runtime": 208.2485,
"eval_samples_per_second": 9.604,
"eval_steps_per_second": 0.303,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 1.718213058419244e-08,
"logits/chosen": 23.493385314941406,
"logits/rejected": 23.479415893554688,
"logps/chosen": -359.0509948730469,
"logps/rejected": -263.7375793457031,
"loss": 0.692,
"rewards/accuracies": 0.5833333134651184,
"rewards/chosen": 0.016306404024362564,
"rewards/margins": 0.025918345898389816,
"rewards/rejected": -0.009611942805349827,
"step": 10
},
{
"epoch": 0.02,
"learning_rate": 3.436426116838488e-08,
"logits/chosen": 23.505186080932617,
"logits/rejected": 23.52346420288086,
"logps/chosen": -327.48468017578125,
"logps/rejected": -279.432861328125,
"loss": 0.6965,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.013154825195670128,
"rewards/margins": -0.014362807385623455,
"rewards/rejected": 0.0012079827720299363,
"step": 20
},
{
"epoch": 0.03,
"learning_rate": 5.154639175257731e-08,
"logits/chosen": 23.50873374938965,
"logits/rejected": 23.2880859375,
"logps/chosen": -340.9912109375,
"logps/rejected": -269.15045166015625,
"loss": 0.6955,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.0022484897635877132,
"rewards/margins": -0.017411604523658752,
"rewards/rejected": 0.0196601003408432,
"step": 30
},
{
"epoch": 0.04,
"learning_rate": 6.872852233676976e-08,
"logits/chosen": 23.961822509765625,
"logits/rejected": 23.730144500732422,
"logps/chosen": -414.52447509765625,
"logps/rejected": -300.4974670410156,
"loss": 0.6961,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.003453383222222328,
"rewards/margins": 0.017737122252583504,
"rewards/rejected": -0.014283737167716026,
"step": 40
},
{
"epoch": 0.05,
"learning_rate": 8.59106529209622e-08,
"logits/chosen": 23.999908447265625,
"logits/rejected": 23.47333335876465,
"logps/chosen": -313.49395751953125,
"logps/rejected": -216.2849578857422,
"loss": 0.691,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.021781612187623978,
"rewards/margins": 0.03288044035434723,
"rewards/rejected": -0.011098823510110378,
"step": 50
},
{
"epoch": 0.06,
"learning_rate": 1.0309278350515462e-07,
"logits/chosen": 23.825542449951172,
"logits/rejected": 23.716323852539062,
"logps/chosen": -306.31744384765625,
"logps/rejected": -260.7249755859375,
"loss": 0.6916,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.009675316512584686,
"rewards/margins": -0.021775808185338974,
"rewards/rejected": 0.03145112842321396,
"step": 60
},
{
"epoch": 0.07,
"learning_rate": 1.202749140893471e-07,
"logits/chosen": 23.89028549194336,
"logits/rejected": 23.66950798034668,
"logps/chosen": -364.57757568359375,
"logps/rejected": -250.9732208251953,
"loss": 0.6871,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0638527050614357,
"rewards/margins": 0.016006827354431152,
"rewards/rejected": 0.047845881432294846,
"step": 70
},
{
"epoch": 0.08,
"learning_rate": 1.3745704467353952e-07,
"logits/chosen": 23.972980499267578,
"logits/rejected": 23.702159881591797,
"logps/chosen": -360.4600524902344,
"logps/rejected": -277.17767333984375,
"loss": 0.6826,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.10171504318714142,
"rewards/margins": 0.051059722900390625,
"rewards/rejected": 0.05065532401204109,
"step": 80
},
{
"epoch": 0.09,
"learning_rate": 1.5463917525773197e-07,
"logits/chosen": 23.601802825927734,
"logits/rejected": 23.44902229309082,
"logps/chosen": -256.45306396484375,
"logps/rejected": -228.2622528076172,
"loss": 0.6742,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.13566820323467255,
"rewards/margins": 0.05287040024995804,
"rewards/rejected": 0.0827978178858757,
"step": 90
},
{
"epoch": 0.1,
"learning_rate": 1.718213058419244e-07,
"logits/chosen": 23.945114135742188,
"logits/rejected": 23.670852661132812,
"logps/chosen": -317.6385192871094,
"logps/rejected": -238.4324188232422,
"loss": 0.6639,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.14283855259418488,
"rewards/margins": 0.07535254955291748,
"rewards/rejected": 0.0674859955906868,
"step": 100
},
{
"epoch": 0.1,
"eval_logits/chosen": 23.83555793762207,
"eval_logits/rejected": 23.585235595703125,
"eval_logps/chosen": -352.80859375,
"eval_logps/rejected": -273.12677001953125,
"eval_loss": 0.6592543125152588,
"eval_rewards/accuracies": 0.6150793433189392,
"eval_rewards/chosen": 0.17615097761154175,
"eval_rewards/margins": 0.0804828330874443,
"eval_rewards/rejected": 0.09566814452409744,
"eval_runtime": 210.7096,
"eval_samples_per_second": 9.492,
"eval_steps_per_second": 0.299,
"step": 100
},
{
"epoch": 0.11,
"learning_rate": 1.8900343642611682e-07,
"logits/chosen": 23.709579467773438,
"logits/rejected": 23.512853622436523,
"logps/chosen": -349.40234375,
"logps/rejected": -243.11532592773438,
"loss": 0.6541,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.16010603308677673,
"rewards/margins": 0.09831614792346954,
"rewards/rejected": 0.06178988143801689,
"step": 110
},
{
"epoch": 0.12,
"learning_rate": 2.0618556701030925e-07,
"logits/chosen": 23.544376373291016,
"logits/rejected": 23.377239227294922,
"logps/chosen": -341.64080810546875,
"logps/rejected": -247.55844116210938,
"loss": 0.6539,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.1492854803800583,
"rewards/margins": 0.0526873879134655,
"rewards/rejected": 0.09659810364246368,
"step": 120
},
{
"epoch": 0.13,
"learning_rate": 2.2336769759450173e-07,
"logits/chosen": 24.006563186645508,
"logits/rejected": 23.8785457611084,
"logps/chosen": -321.85467529296875,
"logps/rejected": -281.0990905761719,
"loss": 0.6401,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.19155286252498627,
"rewards/margins": 0.1119670420885086,
"rewards/rejected": 0.07958582043647766,
"step": 130
},
{
"epoch": 0.14,
"learning_rate": 2.405498281786942e-07,
"logits/chosen": 23.71746826171875,
"logits/rejected": 23.616607666015625,
"logps/chosen": -346.86761474609375,
"logps/rejected": -257.8626708984375,
"loss": 0.6319,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.179647758603096,
"rewards/margins": 0.20804457366466522,
"rewards/rejected": -0.02839680388569832,
"step": 140
},
{
"epoch": 0.15,
"learning_rate": 2.5773195876288655e-07,
"logits/chosen": 23.601333618164062,
"logits/rejected": 23.368152618408203,
"logps/chosen": -342.10003662109375,
"logps/rejected": -261.25201416015625,
"loss": 0.6243,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.13529065251350403,
"rewards/margins": 0.20980004966259003,
"rewards/rejected": -0.0745093896985054,
"step": 150
},
{
"epoch": 0.17,
"learning_rate": 2.7491408934707903e-07,
"logits/chosen": 24.020530700683594,
"logits/rejected": 23.818883895874023,
"logps/chosen": -362.73968505859375,
"logps/rejected": -253.7847137451172,
"loss": 0.5915,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.13591055572032928,
"rewards/margins": 0.31857621669769287,
"rewards/rejected": -0.1826656460762024,
"step": 160
},
{
"epoch": 0.18,
"learning_rate": 2.9209621993127146e-07,
"logits/chosen": 23.72347640991211,
"logits/rejected": 23.625173568725586,
"logps/chosen": -337.2410583496094,
"logps/rejected": -265.833740234375,
"loss": 0.5966,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.06029454618692398,
"rewards/margins": 0.21368882060050964,
"rewards/rejected": -0.15339429676532745,
"step": 170
},
{
"epoch": 0.19,
"learning_rate": 3.0927835051546394e-07,
"logits/chosen": 24.024005889892578,
"logits/rejected": 23.694889068603516,
"logps/chosen": -303.23358154296875,
"logps/rejected": -259.80047607421875,
"loss": 0.5912,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.10565178096294403,
"rewards/margins": 0.33361369371414185,
"rewards/rejected": -0.2279619425535202,
"step": 180
},
{
"epoch": 0.2,
"learning_rate": 3.2646048109965636e-07,
"logits/chosen": 23.458202362060547,
"logits/rejected": 23.41326904296875,
"logps/chosen": -278.2962341308594,
"logps/rejected": -242.08627319335938,
"loss": 0.5826,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.05947988107800484,
"rewards/margins": 0.3678347170352936,
"rewards/rejected": -0.30835479497909546,
"step": 190
},
{
"epoch": 0.21,
"learning_rate": 3.436426116838488e-07,
"logits/chosen": 23.741714477539062,
"logits/rejected": 23.483057022094727,
"logps/chosen": -314.7781066894531,
"logps/rejected": -248.27880859375,
"loss": 0.5804,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.1307556927204132,
"rewards/margins": 0.3709767758846283,
"rewards/rejected": -0.2402210682630539,
"step": 200
},
{
"epoch": 0.21,
"eval_logits/chosen": 23.830230712890625,
"eval_logits/rejected": 23.587175369262695,
"eval_logps/chosen": -353.7904052734375,
"eval_logps/rejected": -277.4797668457031,
"eval_loss": 0.5836150646209717,
"eval_rewards/accuracies": 0.6507936716079712,
"eval_rewards/chosen": 0.07797454297542572,
"eval_rewards/margins": 0.41760751605033875,
"eval_rewards/rejected": -0.33963292837142944,
"eval_runtime": 208.5861,
"eval_samples_per_second": 9.588,
"eval_steps_per_second": 0.302,
"step": 200
},
{
"epoch": 0.22,
"learning_rate": 3.608247422680412e-07,
"logits/chosen": 23.76480484008789,
"logits/rejected": 23.56380271911621,
"logps/chosen": -377.50799560546875,
"logps/rejected": -279.08978271484375,
"loss": 0.5611,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0905425176024437,
"rewards/margins": 0.527503252029419,
"rewards/rejected": -0.43696069717407227,
"step": 210
},
{
"epoch": 0.23,
"learning_rate": 3.7800687285223364e-07,
"logits/chosen": 23.482959747314453,
"logits/rejected": 23.370895385742188,
"logps/chosen": -316.96038818359375,
"logps/rejected": -253.94686889648438,
"loss": 0.5691,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.16429784893989563,
"rewards/margins": 0.4349435865879059,
"rewards/rejected": -0.5992413759231567,
"step": 220
},
{
"epoch": 0.24,
"learning_rate": 3.9518900343642607e-07,
"logits/chosen": 23.473817825317383,
"logits/rejected": 23.369760513305664,
"logps/chosen": -334.98663330078125,
"logps/rejected": -293.44854736328125,
"loss": 0.5962,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1187012642621994,
"rewards/margins": 0.38999611139297485,
"rewards/rejected": -0.5086973905563354,
"step": 230
},
{
"epoch": 0.25,
"learning_rate": 4.123711340206185e-07,
"logits/chosen": 23.480493545532227,
"logits/rejected": 23.42662239074707,
"logps/chosen": -329.04595947265625,
"logps/rejected": -243.5697784423828,
"loss": 0.564,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.16112008690834045,
"rewards/margins": 0.41859644651412964,
"rewards/rejected": -0.5797165036201477,
"step": 240
},
{
"epoch": 0.26,
"learning_rate": 4.2955326460481097e-07,
"logits/chosen": 23.753459930419922,
"logits/rejected": 23.624629974365234,
"logps/chosen": -347.7720642089844,
"logps/rejected": -273.23162841796875,
"loss": 0.5833,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.06806284189224243,
"rewards/margins": 0.4797073304653168,
"rewards/rejected": -0.5477702021598816,
"step": 250
},
{
"epoch": 0.27,
"learning_rate": 4.4673539518900345e-07,
"logits/chosen": 23.70407485961914,
"logits/rejected": 23.5228328704834,
"logps/chosen": -310.2815856933594,
"logps/rejected": -250.3536376953125,
"loss": 0.5718,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.07945041358470917,
"rewards/margins": 0.49080556631088257,
"rewards/rejected": -0.5702559351921082,
"step": 260
},
{
"epoch": 0.28,
"learning_rate": 4.639175257731959e-07,
"logits/chosen": 23.76226234436035,
"logits/rejected": 23.472620010375977,
"logps/chosen": -301.5387268066406,
"logps/rejected": -240.7628631591797,
"loss": 0.601,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07512088119983673,
"rewards/margins": 0.4631730914115906,
"rewards/rejected": -0.5382939577102661,
"step": 270
},
{
"epoch": 0.29,
"learning_rate": 4.810996563573884e-07,
"logits/chosen": 23.984760284423828,
"logits/rejected": 23.863937377929688,
"logps/chosen": -373.2278137207031,
"logps/rejected": -285.9132995605469,
"loss": 0.5712,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.10135757923126221,
"rewards/margins": 0.6022639274597168,
"rewards/rejected": -0.5009063482284546,
"step": 280
},
{
"epoch": 0.3,
"learning_rate": 4.982817869415807e-07,
"logits/chosen": 23.733274459838867,
"logits/rejected": 23.508481979370117,
"logps/chosen": -356.46099853515625,
"logps/rejected": -259.97003173828125,
"loss": 0.5751,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.18595895171165466,
"rewards/margins": 0.5399104952812195,
"rewards/rejected": -0.7258695363998413,
"step": 290
},
{
"epoch": 0.31,
"learning_rate": 4.982778415614236e-07,
"logits/chosen": 23.513275146484375,
"logits/rejected": 23.471511840820312,
"logps/chosen": -293.0979919433594,
"logps/rejected": -249.67446899414062,
"loss": 0.5815,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.22943711280822754,
"rewards/margins": 0.6720036268234253,
"rewards/rejected": -0.9014407396316528,
"step": 300
},
{
"epoch": 0.31,
"eval_logits/chosen": 23.749773025512695,
"eval_logits/rejected": 23.52240753173828,
"eval_logps/chosen": -356.49285888671875,
"eval_logps/rejected": -281.9402770996094,
"eval_loss": 0.5510157942771912,
"eval_rewards/accuracies": 0.7420634627342224,
"eval_rewards/chosen": -0.19227494299411774,
"eval_rewards/margins": 0.5934095978736877,
"eval_rewards/rejected": -0.7856844663619995,
"eval_runtime": 210.4467,
"eval_samples_per_second": 9.504,
"eval_steps_per_second": 0.299,
"step": 300
},
{
"epoch": 0.32,
"learning_rate": 4.963643321852277e-07,
"logits/chosen": 23.758647918701172,
"logits/rejected": 23.599285125732422,
"logps/chosen": -387.0029296875,
"logps/rejected": -297.8297119140625,
"loss": 0.5858,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.22534582018852234,
"rewards/margins": 0.4947783946990967,
"rewards/rejected": -0.7201241254806519,
"step": 310
},
{
"epoch": 0.33,
"learning_rate": 4.944508228090318e-07,
"logits/chosen": 23.673627853393555,
"logits/rejected": 23.470468521118164,
"logps/chosen": -269.2679748535156,
"logps/rejected": -209.1413116455078,
"loss": 0.5428,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2326889932155609,
"rewards/margins": 0.49293145537376404,
"rewards/rejected": -0.7256205677986145,
"step": 320
},
{
"epoch": 0.34,
"learning_rate": 4.925373134328357e-07,
"logits/chosen": 23.728256225585938,
"logits/rejected": 23.57656478881836,
"logps/chosen": -341.84552001953125,
"logps/rejected": -279.85650634765625,
"loss": 0.5848,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2053179293870926,
"rewards/margins": 0.5053264498710632,
"rewards/rejected": -0.7106443643569946,
"step": 330
},
{
"epoch": 0.35,
"learning_rate": 4.906238040566398e-07,
"logits/chosen": 23.395517349243164,
"logits/rejected": 23.30283546447754,
"logps/chosen": -276.00958251953125,
"logps/rejected": -245.6515655517578,
"loss": 0.5731,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.4472281038761139,
"rewards/margins": 0.5112749338150024,
"rewards/rejected": -0.9585030674934387,
"step": 340
},
{
"epoch": 0.36,
"learning_rate": 4.887102946804438e-07,
"logits/chosen": 23.459369659423828,
"logits/rejected": 23.258296966552734,
"logps/chosen": -351.51153564453125,
"logps/rejected": -265.9107666015625,
"loss": 0.5436,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.248783141374588,
"rewards/margins": 0.7760157585144043,
"rewards/rejected": -1.02479887008667,
"step": 350
},
{
"epoch": 0.37,
"learning_rate": 4.867967853042479e-07,
"logits/chosen": 23.64513397216797,
"logits/rejected": 23.49908447265625,
"logps/chosen": -327.1449890136719,
"logps/rejected": -301.5306396484375,
"loss": 0.5287,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.29137879610061646,
"rewards/margins": 0.8306191563606262,
"rewards/rejected": -1.1219979524612427,
"step": 360
},
{
"epoch": 0.38,
"learning_rate": 4.84883275928052e-07,
"logits/chosen": 23.72499656677246,
"logits/rejected": 23.477428436279297,
"logps/chosen": -337.2041320800781,
"logps/rejected": -292.93463134765625,
"loss": 0.5549,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.20421965420246124,
"rewards/margins": 0.776501476764679,
"rewards/rejected": -0.980721116065979,
"step": 370
},
{
"epoch": 0.39,
"learning_rate": 4.82969766551856e-07,
"logits/chosen": 23.811683654785156,
"logits/rejected": 23.42571258544922,
"logps/chosen": -364.2945251464844,
"logps/rejected": -283.0462646484375,
"loss": 0.5698,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.06022878363728523,
"rewards/margins": 0.7983857989311218,
"rewards/rejected": -0.8586145639419556,
"step": 380
},
{
"epoch": 0.4,
"learning_rate": 4.810562571756601e-07,
"logits/chosen": 23.39688491821289,
"logits/rejected": 23.162023544311523,
"logps/chosen": -323.4096984863281,
"logps/rejected": -250.5354461669922,
"loss": 0.5771,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.1642606556415558,
"rewards/margins": 0.8033970594406128,
"rewards/rejected": -0.967657744884491,
"step": 390
},
{
"epoch": 0.41,
"learning_rate": 4.791427477994642e-07,
"logits/chosen": 23.463436126708984,
"logits/rejected": 23.304988861083984,
"logps/chosen": -290.4604797363281,
"logps/rejected": -257.20977783203125,
"loss": 0.5526,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2738291919231415,
"rewards/margins": 0.6287984848022461,
"rewards/rejected": -0.9026277661323547,
"step": 400
},
{
"epoch": 0.41,
"eval_logits/chosen": 23.72638702392578,
"eval_logits/rejected": 23.50330352783203,
"eval_logps/chosen": -356.5235290527344,
"eval_logps/rejected": -283.01190185546875,
"eval_loss": 0.5360822081565857,
"eval_rewards/accuracies": 0.7341269850730896,
"eval_rewards/chosen": -0.19533830881118774,
"eval_rewards/margins": 0.6975098848342896,
"eval_rewards/rejected": -0.8928481936454773,
"eval_runtime": 211.6561,
"eval_samples_per_second": 9.449,
"eval_steps_per_second": 0.298,
"step": 400
},
{
"epoch": 0.42,
"learning_rate": 4.772292384232682e-07,
"logits/chosen": 23.651836395263672,
"logits/rejected": 23.562541961669922,
"logps/chosen": -295.5309143066406,
"logps/rejected": -256.42864990234375,
"loss": 0.5646,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3951832056045532,
"rewards/margins": 0.48603707551956177,
"rewards/rejected": -0.8812202215194702,
"step": 410
},
{
"epoch": 0.43,
"learning_rate": 4.753157290470723e-07,
"logits/chosen": 23.555591583251953,
"logits/rejected": 23.477405548095703,
"logps/chosen": -291.4106140136719,
"logps/rejected": -254.1300048828125,
"loss": 0.5647,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.44952210783958435,
"rewards/margins": 0.4881154000759125,
"rewards/rejected": -0.937637448310852,
"step": 420
},
{
"epoch": 0.44,
"learning_rate": 4.7340221967087635e-07,
"logits/chosen": 23.740278244018555,
"logits/rejected": 23.43073844909668,
"logps/chosen": -283.1187438964844,
"logps/rejected": -268.111083984375,
"loss": 0.5611,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.4848947525024414,
"rewards/margins": 0.5371454954147339,
"rewards/rejected": -1.0220401287078857,
"step": 430
},
{
"epoch": 0.45,
"learning_rate": 4.714887102946804e-07,
"logits/chosen": 23.772052764892578,
"logits/rejected": 23.574148178100586,
"logps/chosen": -316.62310791015625,
"logps/rejected": -249.21389770507812,
"loss": 0.5237,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.24736304581165314,
"rewards/margins": 0.6857331395149231,
"rewards/rejected": -0.9330962300300598,
"step": 440
},
{
"epoch": 0.46,
"learning_rate": 4.6957520091848447e-07,
"logits/chosen": 23.818843841552734,
"logits/rejected": 23.663349151611328,
"logps/chosen": -301.2689208984375,
"logps/rejected": -274.0567932128906,
"loss": 0.5833,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.04559071734547615,
"rewards/margins": 0.5639020800590515,
"rewards/rejected": -0.6094927191734314,
"step": 450
},
{
"epoch": 0.47,
"learning_rate": 4.6766169154228853e-07,
"logits/chosen": 23.291194915771484,
"logits/rejected": 23.422870635986328,
"logps/chosen": -323.94488525390625,
"logps/rejected": -233.00833129882812,
"loss": 0.5194,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.03107648529112339,
"rewards/margins": 0.7044192552566528,
"rewards/rejected": -0.7354957461357117,
"step": 460
},
{
"epoch": 0.49,
"learning_rate": 4.657481821660926e-07,
"logits/chosen": 23.393449783325195,
"logits/rejected": 23.201961517333984,
"logps/chosen": -318.3015441894531,
"logps/rejected": -219.90170288085938,
"loss": 0.5072,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.05152938514947891,
"rewards/margins": 0.8564150929450989,
"rewards/rejected": -0.90794438123703,
"step": 470
},
{
"epoch": 0.5,
"learning_rate": 4.6383467278989666e-07,
"logits/chosen": 23.315677642822266,
"logits/rejected": 23.30160903930664,
"logps/chosen": -354.28302001953125,
"logps/rejected": -268.2124938964844,
"loss": 0.5397,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.004499013535678387,
"rewards/margins": 0.9241636395454407,
"rewards/rejected": -0.9196645617485046,
"step": 480
},
{
"epoch": 0.51,
"learning_rate": 4.6192116341370067e-07,
"logits/chosen": 23.65988540649414,
"logits/rejected": 23.262027740478516,
"logps/chosen": -363.83416748046875,
"logps/rejected": -269.9544677734375,
"loss": 0.5463,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.03792769834399223,
"rewards/margins": 0.8194522857666016,
"rewards/rejected": -0.8573800325393677,
"step": 490
},
{
"epoch": 0.52,
"learning_rate": 4.6000765403750473e-07,
"logits/chosen": 23.21782684326172,
"logits/rejected": 22.95124053955078,
"logps/chosen": -272.1101379394531,
"logps/rejected": -233.3650665283203,
"loss": 0.5225,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.18960613012313843,
"rewards/margins": 0.5189381837844849,
"rewards/rejected": -0.7085443139076233,
"step": 500
},
{
"epoch": 0.52,
"eval_logits/chosen": 23.67182731628418,
"eval_logits/rejected": 23.457815170288086,
"eval_logps/chosen": -355.6113586425781,
"eval_logps/rejected": -282.89288330078125,
"eval_loss": 0.5261635184288025,
"eval_rewards/accuracies": 0.7539682388305664,
"eval_rewards/chosen": -0.10412228107452393,
"eval_rewards/margins": 0.7768236994743347,
"eval_rewards/rejected": -0.8809459805488586,
"eval_runtime": 208.2947,
"eval_samples_per_second": 9.602,
"eval_steps_per_second": 0.302,
"step": 500
},
{
"epoch": 0.53,
"learning_rate": 4.580941446613088e-07,
"logits/chosen": 23.604022979736328,
"logits/rejected": 23.44409942626953,
"logps/chosen": -326.31378173828125,
"logps/rejected": -279.2933349609375,
"loss": 0.5379,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.2937595248222351,
"rewards/margins": 0.6278744339942932,
"rewards/rejected": -0.9216337203979492,
"step": 510
},
{
"epoch": 0.54,
"learning_rate": 4.5618063528511285e-07,
"logits/chosen": 23.713848114013672,
"logits/rejected": 23.53582000732422,
"logps/chosen": -304.9338684082031,
"logps/rejected": -268.5104064941406,
"loss": 0.5433,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.10584266483783722,
"rewards/margins": 0.8640462160110474,
"rewards/rejected": -0.9698888063430786,
"step": 520
},
{
"epoch": 0.55,
"learning_rate": 4.542671259089169e-07,
"logits/chosen": 23.50804328918457,
"logits/rejected": 23.286922454833984,
"logps/chosen": -291.4549560546875,
"logps/rejected": -222.6033935546875,
"loss": 0.553,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.3200764060020447,
"rewards/margins": 0.6575796008110046,
"rewards/rejected": -0.9776560068130493,
"step": 530
},
{
"epoch": 0.56,
"learning_rate": 4.52353616532721e-07,
"logits/chosen": 23.716320037841797,
"logits/rejected": 23.562469482421875,
"logps/chosen": -322.17047119140625,
"logps/rejected": -258.58917236328125,
"loss": 0.5491,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.1657361090183258,
"rewards/margins": 0.7579048871994019,
"rewards/rejected": -0.9236409068107605,
"step": 540
},
{
"epoch": 0.57,
"learning_rate": 4.5044010715652504e-07,
"logits/chosen": 23.58610725402832,
"logits/rejected": 23.374378204345703,
"logps/chosen": -303.7857971191406,
"logps/rejected": -266.2262878417969,
"loss": 0.5447,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.1886710226535797,
"rewards/margins": 0.5424461364746094,
"rewards/rejected": -0.7311171293258667,
"step": 550
},
{
"epoch": 0.58,
"learning_rate": 4.485265977803291e-07,
"logits/chosen": 23.34024429321289,
"logits/rejected": 23.08355140686035,
"logps/chosen": -346.318603515625,
"logps/rejected": -288.74432373046875,
"loss": 0.5315,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.1655091792345047,
"rewards/margins": 0.8151466250419617,
"rewards/rejected": -0.98065584897995,
"step": 560
},
{
"epoch": 0.59,
"learning_rate": 4.4661308840413316e-07,
"logits/chosen": 23.876493453979492,
"logits/rejected": 23.629627227783203,
"logps/chosen": -300.15509033203125,
"logps/rejected": -273.7672424316406,
"loss": 0.5197,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.30857834219932556,
"rewards/margins": 0.6672872304916382,
"rewards/rejected": -0.9758656620979309,
"step": 570
},
{
"epoch": 0.6,
"learning_rate": 4.446995790279372e-07,
"logits/chosen": 23.596471786499023,
"logits/rejected": 23.427684783935547,
"logps/chosen": -334.6555480957031,
"logps/rejected": -264.3316345214844,
"loss": 0.5231,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.2455914467573166,
"rewards/margins": 0.883080005645752,
"rewards/rejected": -1.128671407699585,
"step": 580
},
{
"epoch": 0.61,
"learning_rate": 4.4278606965174123e-07,
"logits/chosen": 23.314987182617188,
"logits/rejected": 23.11943817138672,
"logps/chosen": -298.7581787109375,
"logps/rejected": -246.3135223388672,
"loss": 0.5253,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3651345372200012,
"rewards/margins": 0.677712619304657,
"rewards/rejected": -1.0428470373153687,
"step": 590
},
{
"epoch": 0.62,
"learning_rate": 4.408725602755453e-07,
"logits/chosen": 23.37234115600586,
"logits/rejected": 23.340373992919922,
"logps/chosen": -286.86737060546875,
"logps/rejected": -220.9873504638672,
"loss": 0.5577,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4736716151237488,
"rewards/margins": 0.5933648347854614,
"rewards/rejected": -1.0670363903045654,
"step": 600
},
{
"epoch": 0.62,
"eval_logits/chosen": 23.661834716796875,
"eval_logits/rejected": 23.4466495513916,
"eval_logps/chosen": -356.5157775878906,
"eval_logps/rejected": -284.3682861328125,
"eval_loss": 0.5155950784683228,
"eval_rewards/accuracies": 0.7658730149269104,
"eval_rewards/chosen": -0.19456443190574646,
"eval_rewards/margins": 0.8339203000068665,
"eval_rewards/rejected": -1.02848482131958,
"eval_runtime": 211.7272,
"eval_samples_per_second": 9.446,
"eval_steps_per_second": 0.298,
"step": 600
},
{
"epoch": 0.63,
"learning_rate": 4.3895905089934936e-07,
"logits/chosen": 23.545116424560547,
"logits/rejected": 23.458499908447266,
"logps/chosen": -338.8705139160156,
"logps/rejected": -272.00714111328125,
"loss": 0.5331,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2067803144454956,
"rewards/margins": 0.8310182690620422,
"rewards/rejected": -1.037798523902893,
"step": 610
},
{
"epoch": 0.64,
"learning_rate": 4.370455415231534e-07,
"logits/chosen": 23.625316619873047,
"logits/rejected": 23.448591232299805,
"logps/chosen": -345.89208984375,
"logps/rejected": -306.5247497558594,
"loss": 0.5182,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.30394795536994934,
"rewards/margins": 0.5751500725746155,
"rewards/rejected": -0.8790979385375977,
"step": 620
},
{
"epoch": 0.65,
"learning_rate": 4.351320321469575e-07,
"logits/chosen": 22.87206268310547,
"logits/rejected": 22.757465362548828,
"logps/chosen": -309.4687805175781,
"logps/rejected": -291.12847900390625,
"loss": 0.5417,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.20515112578868866,
"rewards/margins": 0.6493626236915588,
"rewards/rejected": -0.8545138239860535,
"step": 630
},
{
"epoch": 0.66,
"learning_rate": 4.3321852277076154e-07,
"logits/chosen": 23.304141998291016,
"logits/rejected": 23.251794815063477,
"logps/chosen": -333.1667785644531,
"logps/rejected": -272.1311950683594,
"loss": 0.5271,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.41801586747169495,
"rewards/margins": 0.5841894149780273,
"rewards/rejected": -1.0022052526474,
"step": 640
},
{
"epoch": 0.67,
"learning_rate": 4.313050133945656e-07,
"logits/chosen": 23.594745635986328,
"logits/rejected": 23.500207901000977,
"logps/chosen": -357.5419616699219,
"logps/rejected": -274.1604309082031,
"loss": 0.5192,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.20376773178577423,
"rewards/margins": 0.8759373426437378,
"rewards/rejected": -1.0797051191329956,
"step": 650
},
{
"epoch": 0.68,
"learning_rate": 4.2939150401836967e-07,
"logits/chosen": 23.767133712768555,
"logits/rejected": 23.464405059814453,
"logps/chosen": -308.720458984375,
"logps/rejected": -290.14306640625,
"loss": 0.5137,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.28276339173316956,
"rewards/margins": 0.6749929189682007,
"rewards/rejected": -0.9577562212944031,
"step": 660
},
{
"epoch": 0.69,
"learning_rate": 4.2747799464217373e-07,
"logits/chosen": 23.170560836791992,
"logits/rejected": 23.10344886779785,
"logps/chosen": -350.4610595703125,
"logps/rejected": -267.5567932128906,
"loss": 0.5273,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.439180463552475,
"rewards/margins": 0.6723843216896057,
"rewards/rejected": -1.1115647554397583,
"step": 670
},
{
"epoch": 0.7,
"learning_rate": 4.255644852659778e-07,
"logits/chosen": 23.414445877075195,
"logits/rejected": 23.61502456665039,
"logps/chosen": -374.15692138671875,
"logps/rejected": -288.3631286621094,
"loss": 0.5761,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.08570393174886703,
"rewards/margins": 0.6574904918670654,
"rewards/rejected": -0.7431942820549011,
"step": 680
},
{
"epoch": 0.71,
"learning_rate": 4.236509758897818e-07,
"logits/chosen": 23.552722930908203,
"logits/rejected": 23.40909194946289,
"logps/chosen": -342.1145324707031,
"logps/rejected": -264.7322082519531,
"loss": 0.5549,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.16522836685180664,
"rewards/margins": 0.6978949904441833,
"rewards/rejected": -0.8631232976913452,
"step": 690
},
{
"epoch": 0.72,
"learning_rate": 4.2173746651358586e-07,
"logits/chosen": 23.72678565979004,
"logits/rejected": 23.425289154052734,
"logps/chosen": -331.14410400390625,
"logps/rejected": -285.4593200683594,
"loss": 0.5515,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.10031759738922119,
"rewards/margins": 0.6387730836868286,
"rewards/rejected": -0.739090621471405,
"step": 700
},
{
"epoch": 0.72,
"eval_logits/chosen": 23.634296417236328,
"eval_logits/rejected": 23.424331665039062,
"eval_logps/chosen": -353.9219665527344,
"eval_logps/rejected": -281.7333679199219,
"eval_loss": 0.5162664651870728,
"eval_rewards/accuracies": 0.7658730149269104,
"eval_rewards/chosen": 0.0648159608244896,
"eval_rewards/margins": 0.8298115730285645,
"eval_rewards/rejected": -0.7649956345558167,
"eval_runtime": 211.7482,
"eval_samples_per_second": 9.445,
"eval_steps_per_second": 0.298,
"step": 700
},
{
"epoch": 0.73,
"learning_rate": 4.198239571373899e-07,
"logits/chosen": 23.546445846557617,
"logits/rejected": 23.238723754882812,
"logps/chosen": -307.0013122558594,
"logps/rejected": -247.3063201904297,
"loss": 0.5341,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.08633746951818466,
"rewards/margins": 0.7773478031158447,
"rewards/rejected": -0.8636852502822876,
"step": 710
},
{
"epoch": 0.74,
"learning_rate": 4.17910447761194e-07,
"logits/chosen": 23.390674591064453,
"logits/rejected": 23.3981876373291,
"logps/chosen": -337.4337158203125,
"logps/rejected": -311.81414794921875,
"loss": 0.5774,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.21913309395313263,
"rewards/margins": 0.4323801100254059,
"rewards/rejected": -0.6515131592750549,
"step": 720
},
{
"epoch": 0.75,
"learning_rate": 4.1599693838499805e-07,
"logits/chosen": 23.517433166503906,
"logits/rejected": 23.37581443786621,
"logps/chosen": -291.6369323730469,
"logps/rejected": -265.741943359375,
"loss": 0.5377,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.25872206687927246,
"rewards/margins": 0.6349440813064575,
"rewards/rejected": -0.8936660885810852,
"step": 730
},
{
"epoch": 0.76,
"learning_rate": 4.140834290088021e-07,
"logits/chosen": 23.460777282714844,
"logits/rejected": 23.235326766967773,
"logps/chosen": -339.9285888671875,
"logps/rejected": -266.0412292480469,
"loss": 0.5197,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.11111575365066528,
"rewards/margins": 0.9949262738227844,
"rewards/rejected": -1.1060421466827393,
"step": 740
},
{
"epoch": 0.77,
"learning_rate": 4.121699196326062e-07,
"logits/chosen": 23.193099975585938,
"logits/rejected": 23.17205238342285,
"logps/chosen": -333.4886474609375,
"logps/rejected": -274.37274169921875,
"loss": 0.5358,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.505832850933075,
"rewards/margins": 0.6223492622375488,
"rewards/rejected": -1.1281821727752686,
"step": 750
},
{
"epoch": 0.78,
"learning_rate": 4.1025641025641024e-07,
"logits/chosen": 23.626953125,
"logits/rejected": 23.612979888916016,
"logps/chosen": -327.89111328125,
"logps/rejected": -297.7337341308594,
"loss": 0.5246,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.35025396943092346,
"rewards/margins": 0.8313090205192566,
"rewards/rejected": -1.1815630197525024,
"step": 760
},
{
"epoch": 0.8,
"learning_rate": 4.083429008802143e-07,
"logits/chosen": 23.58197593688965,
"logits/rejected": 23.45255470275879,
"logps/chosen": -272.42156982421875,
"logps/rejected": -270.8283386230469,
"loss": 0.5165,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.3132372796535492,
"rewards/margins": 0.5522381663322449,
"rewards/rejected": -0.8654754757881165,
"step": 770
},
{
"epoch": 0.81,
"learning_rate": 4.0642939150401836e-07,
"logits/chosen": 23.23889923095703,
"logits/rejected": 23.24991798400879,
"logps/chosen": -314.5959167480469,
"logps/rejected": -257.6392517089844,
"loss": 0.534,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.4284973740577698,
"rewards/margins": 0.6584367156028748,
"rewards/rejected": -1.086934208869934,
"step": 780
},
{
"epoch": 0.82,
"learning_rate": 4.0451588212782237e-07,
"logits/chosen": 23.414813995361328,
"logits/rejected": 23.380718231201172,
"logps/chosen": -291.32501220703125,
"logps/rejected": -256.01263427734375,
"loss": 0.4937,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.18692317605018616,
"rewards/margins": 0.8645000457763672,
"rewards/rejected": -1.051423192024231,
"step": 790
},
{
"epoch": 0.83,
"learning_rate": 4.0260237275162643e-07,
"logits/chosen": 23.40145492553711,
"logits/rejected": 23.43955421447754,
"logps/chosen": -313.07513427734375,
"logps/rejected": -277.38201904296875,
"loss": 0.5159,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.11286661773920059,
"rewards/margins": 0.8762611150741577,
"rewards/rejected": -0.9891278147697449,
"step": 800
},
{
"epoch": 0.83,
"eval_logits/chosen": 23.617877960205078,
"eval_logits/rejected": 23.40951156616211,
"eval_logps/chosen": -355.9697570800781,
"eval_logps/rejected": -284.6782531738281,
"eval_loss": 0.5112624764442444,
"eval_rewards/accuracies": 0.7777777910232544,
"eval_rewards/chosen": -0.1399604231119156,
"eval_rewards/margins": 0.9195234179496765,
"eval_rewards/rejected": -1.0594837665557861,
"eval_runtime": 211.1679,
"eval_samples_per_second": 9.471,
"eval_steps_per_second": 0.298,
"step": 800
},
{
"epoch": 0.84,
"learning_rate": 4.006888633754305e-07,
"logits/chosen": 23.594928741455078,
"logits/rejected": 23.54049301147461,
"logps/chosen": -319.88677978515625,
"logps/rejected": -260.43389892578125,
"loss": 0.4905,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.20980267226696014,
"rewards/margins": 0.9086447954177856,
"rewards/rejected": -1.1184475421905518,
"step": 810
},
{
"epoch": 0.85,
"learning_rate": 3.9877535399923456e-07,
"logits/chosen": 23.482894897460938,
"logits/rejected": 23.19647216796875,
"logps/chosen": -338.23223876953125,
"logps/rejected": -269.0614929199219,
"loss": 0.5256,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4344770014286041,
"rewards/margins": 0.710912823677063,
"rewards/rejected": -1.1453897953033447,
"step": 820
},
{
"epoch": 0.86,
"learning_rate": 3.968618446230386e-07,
"logits/chosen": 23.347646713256836,
"logits/rejected": 23.12314224243164,
"logps/chosen": -311.5711364746094,
"logps/rejected": -240.50125122070312,
"loss": 0.4916,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3820451498031616,
"rewards/margins": 1.0347092151641846,
"rewards/rejected": -1.4167543649673462,
"step": 830
},
{
"epoch": 0.87,
"learning_rate": 3.949483352468427e-07,
"logits/chosen": 23.311033248901367,
"logits/rejected": 23.248620986938477,
"logps/chosen": -281.490966796875,
"logps/rejected": -240.92086791992188,
"loss": 0.556,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.47001034021377563,
"rewards/margins": 0.762096643447876,
"rewards/rejected": -1.2321069240570068,
"step": 840
},
{
"epoch": 0.88,
"learning_rate": 3.9303482587064674e-07,
"logits/chosen": 23.50173568725586,
"logits/rejected": 23.377094268798828,
"logps/chosen": -290.4707336425781,
"logps/rejected": -248.4992218017578,
"loss": 0.5133,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.25379228591918945,
"rewards/margins": 0.8668729662895203,
"rewards/rejected": -1.1206653118133545,
"step": 850
},
{
"epoch": 0.89,
"learning_rate": 3.911213164944508e-07,
"logits/chosen": 23.733707427978516,
"logits/rejected": 23.433372497558594,
"logps/chosen": -346.18353271484375,
"logps/rejected": -291.7870788574219,
"loss": 0.5163,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3051421046257019,
"rewards/margins": 0.9217368364334106,
"rewards/rejected": -1.2268788814544678,
"step": 860
},
{
"epoch": 0.9,
"learning_rate": 3.8920780711825487e-07,
"logits/chosen": 23.664628982543945,
"logits/rejected": 23.414520263671875,
"logps/chosen": -396.57269287109375,
"logps/rejected": -270.23681640625,
"loss": 0.5127,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.19412247836589813,
"rewards/margins": 0.9694843292236328,
"rewards/rejected": -1.163606882095337,
"step": 870
},
{
"epoch": 0.91,
"learning_rate": 3.8729429774205893e-07,
"logits/chosen": 23.33928680419922,
"logits/rejected": 23.3987979888916,
"logps/chosen": -381.9424133300781,
"logps/rejected": -267.4436340332031,
"loss": 0.5481,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.1822233498096466,
"rewards/margins": 1.0088589191436768,
"rewards/rejected": -1.191082239151001,
"step": 880
},
{
"epoch": 0.92,
"learning_rate": 3.8538078836586294e-07,
"logits/chosen": 23.547710418701172,
"logits/rejected": 23.508426666259766,
"logps/chosen": -332.8504638671875,
"logps/rejected": -287.86712646484375,
"loss": 0.5454,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.26958388090133667,
"rewards/margins": 0.755517840385437,
"rewards/rejected": -1.025101661682129,
"step": 890
},
{
"epoch": 0.93,
"learning_rate": 3.83467278989667e-07,
"logits/chosen": 23.69491195678711,
"logits/rejected": 23.59137725830078,
"logps/chosen": -287.8290100097656,
"logps/rejected": -236.21438598632812,
"loss": 0.5242,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.07682293653488159,
"rewards/margins": 0.8297051191329956,
"rewards/rejected": -0.906528115272522,
"step": 900
},
{
"epoch": 0.93,
"eval_logits/chosen": 23.614517211914062,
"eval_logits/rejected": 23.403518676757812,
"eval_logps/chosen": -354.952880859375,
"eval_logps/rejected": -283.23175048828125,
"eval_loss": 0.5089067220687866,
"eval_rewards/accuracies": 0.7658730149269104,
"eval_rewards/chosen": -0.03827480971813202,
"eval_rewards/margins": 0.8765569925308228,
"eval_rewards/rejected": -0.914831817150116,
"eval_runtime": 210.9611,
"eval_samples_per_second": 9.48,
"eval_steps_per_second": 0.299,
"step": 900
},
{
"epoch": 0.94,
"learning_rate": 3.8155376961347106e-07,
"logits/chosen": 23.2309627532959,
"logits/rejected": 23.20724105834961,
"logps/chosen": -246.0962677001953,
"logps/rejected": -230.4759979248047,
"loss": 0.5286,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.24791307747364044,
"rewards/margins": 0.7959302663803101,
"rewards/rejected": -1.0438432693481445,
"step": 910
},
{
"epoch": 0.95,
"learning_rate": 3.796402602372751e-07,
"logits/chosen": 23.323627471923828,
"logits/rejected": 23.179901123046875,
"logps/chosen": -294.0438537597656,
"logps/rejected": -236.05673217773438,
"loss": 0.5097,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.21511948108673096,
"rewards/margins": 0.6154388189315796,
"rewards/rejected": -0.8305583000183105,
"step": 920
},
{
"epoch": 0.96,
"learning_rate": 3.777267508610792e-07,
"logits/chosen": 23.346338272094727,
"logits/rejected": 23.224199295043945,
"logps/chosen": -318.80096435546875,
"logps/rejected": -251.92593383789062,
"loss": 0.5228,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.1629999428987503,
"rewards/margins": 0.8454955816268921,
"rewards/rejected": -1.008495569229126,
"step": 930
},
{
"epoch": 0.97,
"learning_rate": 3.7581324148488325e-07,
"logits/chosen": 23.233306884765625,
"logits/rejected": 23.179094314575195,
"logps/chosen": -330.75469970703125,
"logps/rejected": -246.1700897216797,
"loss": 0.5374,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.317868173122406,
"rewards/margins": 0.8588002920150757,
"rewards/rejected": -1.176668405532837,
"step": 940
},
{
"epoch": 0.98,
"learning_rate": 3.738997321086873e-07,
"logits/chosen": 23.4466495513916,
"logits/rejected": 23.421428680419922,
"logps/chosen": -325.29388427734375,
"logps/rejected": -277.3059387207031,
"loss": 0.5251,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.012738706544041634,
"rewards/margins": 0.8846112489700317,
"rewards/rejected": -0.8973498344421387,
"step": 950
},
{
"epoch": 0.99,
"learning_rate": 3.7198622273249137e-07,
"logits/chosen": 23.535139083862305,
"logits/rejected": 23.37562370300293,
"logps/chosen": -335.9436340332031,
"logps/rejected": -274.2239990234375,
"loss": 0.5143,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.1785895973443985,
"rewards/margins": 0.5961320400238037,
"rewards/rejected": -0.7747215628623962,
"step": 960
},
{
"epoch": 1.0,
"learning_rate": 3.7007271335629544e-07,
"logits/chosen": 23.633747100830078,
"logits/rejected": 23.46231460571289,
"logps/chosen": -295.21759033203125,
"logps/rejected": -258.9006042480469,
"loss": 0.4828,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.19572855532169342,
"rewards/margins": 0.8321182131767273,
"rewards/rejected": -1.0278469324111938,
"step": 970
},
{
"epoch": 1.01,
"learning_rate": 3.681592039800995e-07,
"logits/chosen": 23.68822479248047,
"logits/rejected": 23.501148223876953,
"logps/chosen": -299.13140869140625,
"logps/rejected": -279.1519470214844,
"loss": 0.4402,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.023116961121559143,
"rewards/margins": 1.1388248205184937,
"rewards/rejected": -1.1157079935073853,
"step": 980
},
{
"epoch": 1.02,
"learning_rate": 3.662456946039035e-07,
"logits/chosen": 23.261262893676758,
"logits/rejected": 22.952524185180664,
"logps/chosen": -308.18536376953125,
"logps/rejected": -273.8042907714844,
"loss": 0.4855,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.0076486109755933285,
"rewards/margins": 1.2322968244552612,
"rewards/rejected": -1.2399452924728394,
"step": 990
},
{
"epoch": 1.03,
"learning_rate": 3.6433218522770757e-07,
"logits/chosen": 23.609331130981445,
"logits/rejected": 23.460206985473633,
"logps/chosen": -298.27410888671875,
"logps/rejected": -291.9101867675781,
"loss": 0.4618,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.07892550528049469,
"rewards/margins": 0.973551869392395,
"rewards/rejected": -1.0524773597717285,
"step": 1000
},
{
"epoch": 1.03,
"eval_logits/chosen": 23.585590362548828,
"eval_logits/rejected": 23.38045883178711,
"eval_logps/chosen": -355.79290771484375,
"eval_logps/rejected": -284.2840881347656,
"eval_loss": 0.5076952576637268,
"eval_rewards/accuracies": 0.7777777910232544,
"eval_rewards/chosen": -0.12227805703878403,
"eval_rewards/margins": 0.8977885842323303,
"eval_rewards/rejected": -1.020066499710083,
"eval_runtime": 209.3271,
"eval_samples_per_second": 9.554,
"eval_steps_per_second": 0.301,
"step": 1000
},
{
"epoch": 1.04,
"learning_rate": 3.6241867585151163e-07,
"logits/chosen": 23.0650577545166,
"logits/rejected": 22.977046966552734,
"logps/chosen": -336.06597900390625,
"logps/rejected": -278.80828857421875,
"loss": 0.4487,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.024078911170363426,
"rewards/margins": 1.0799994468688965,
"rewards/rejected": -1.1040784120559692,
"step": 1010
},
{
"epoch": 1.05,
"learning_rate": 3.605051664753157e-07,
"logits/chosen": 23.251041412353516,
"logits/rejected": 23.117984771728516,
"logps/chosen": -303.11199951171875,
"logps/rejected": -243.7981719970703,
"loss": 0.415,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.18380531668663025,
"rewards/margins": 1.1280765533447266,
"rewards/rejected": -1.3118817806243896,
"step": 1020
},
{
"epoch": 1.06,
"learning_rate": 3.5859165709911975e-07,
"logits/chosen": 23.297225952148438,
"logits/rejected": 23.318119049072266,
"logps/chosen": -334.6638488769531,
"logps/rejected": -316.2551574707031,
"loss": 0.4302,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2101324051618576,
"rewards/margins": 0.9589886665344238,
"rewards/rejected": -1.1691210269927979,
"step": 1030
},
{
"epoch": 1.07,
"learning_rate": 3.566781477229238e-07,
"logits/chosen": 23.525909423828125,
"logits/rejected": 23.159460067749023,
"logps/chosen": -318.52093505859375,
"logps/rejected": -268.981201171875,
"loss": 0.4484,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.31944024562835693,
"rewards/margins": 0.7762435674667358,
"rewards/rejected": -1.0956838130950928,
"step": 1040
},
{
"epoch": 1.08,
"learning_rate": 3.547646383467279e-07,
"logits/chosen": 23.090410232543945,
"logits/rejected": 23.27143669128418,
"logps/chosen": -314.944580078125,
"logps/rejected": -246.20974731445312,
"loss": 0.4362,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.34100785851478577,
"rewards/margins": 0.8710842132568359,
"rewards/rejected": -1.2120921611785889,
"step": 1050
},
{
"epoch": 1.09,
"learning_rate": 3.5285112897053194e-07,
"logits/chosen": 23.36246681213379,
"logits/rejected": 23.223459243774414,
"logps/chosen": -286.5093078613281,
"logps/rejected": -283.33514404296875,
"loss": 0.4188,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.0016206980217248201,
"rewards/margins": 1.3344464302062988,
"rewards/rejected": -1.3328258991241455,
"step": 1060
},
{
"epoch": 1.1,
"learning_rate": 3.50937619594336e-07,
"logits/chosen": 23.439857482910156,
"logits/rejected": 23.41635513305664,
"logps/chosen": -319.5511169433594,
"logps/rejected": -310.8269348144531,
"loss": 0.4497,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.29633527994155884,
"rewards/margins": 0.8939793705940247,
"rewards/rejected": -1.190314531326294,
"step": 1070
},
{
"epoch": 1.12,
"learning_rate": 3.4902411021814007e-07,
"logits/chosen": 23.304676055908203,
"logits/rejected": 23.236148834228516,
"logps/chosen": -329.2352600097656,
"logps/rejected": -261.53082275390625,
"loss": 0.4335,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.15099604427814484,
"rewards/margins": 1.2369451522827148,
"rewards/rejected": -1.3879411220550537,
"step": 1080
},
{
"epoch": 1.13,
"learning_rate": 3.4711060084194413e-07,
"logits/chosen": 23.37632179260254,
"logits/rejected": 23.350784301757812,
"logps/chosen": -360.5380859375,
"logps/rejected": -272.3520812988281,
"loss": 0.442,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.015167620964348316,
"rewards/margins": 1.156204342842102,
"rewards/rejected": -1.1410366296768188,
"step": 1090
},
{
"epoch": 1.14,
"learning_rate": 3.4519709146574814e-07,
"logits/chosen": 23.032573699951172,
"logits/rejected": 22.9952449798584,
"logps/chosen": -225.4923858642578,
"logps/rejected": -196.6672821044922,
"loss": 0.4484,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4950261116027832,
"rewards/margins": 0.9079095721244812,
"rewards/rejected": -1.4029356241226196,
"step": 1100
},
{
"epoch": 1.14,
"eval_logits/chosen": 23.538101196289062,
"eval_logits/rejected": 23.34269142150879,
"eval_logps/chosen": -357.8807373046875,
"eval_logps/rejected": -287.3826599121094,
"eval_loss": 0.5019155144691467,
"eval_rewards/accuracies": 0.7777777910232544,
"eval_rewards/chosen": -0.3310595154762268,
"eval_rewards/margins": 0.9988633990287781,
"eval_rewards/rejected": -1.3299229145050049,
"eval_runtime": 210.9987,
"eval_samples_per_second": 9.479,
"eval_steps_per_second": 0.299,
"step": 1100
},
{
"epoch": 1.15,
"learning_rate": 3.432835820895522e-07,
"logits/chosen": 23.460542678833008,
"logits/rejected": 23.26938247680664,
"logps/chosen": -358.67877197265625,
"logps/rejected": -289.0791931152344,
"loss": 0.4235,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.26543277502059937,
"rewards/margins": 1.2576963901519775,
"rewards/rejected": -1.5231291055679321,
"step": 1110
},
{
"epoch": 1.16,
"learning_rate": 3.4137007271335626e-07,
"logits/chosen": 23.63456916809082,
"logits/rejected": 23.502344131469727,
"logps/chosen": -284.9480895996094,
"logps/rejected": -279.4847412109375,
"loss": 0.4245,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.3471234440803528,
"rewards/margins": 1.1683541536331177,
"rewards/rejected": -1.5154775381088257,
"step": 1120
},
{
"epoch": 1.17,
"learning_rate": 3.394565633371603e-07,
"logits/chosen": 23.349411010742188,
"logits/rejected": 23.35630226135254,
"logps/chosen": -341.84881591796875,
"logps/rejected": -319.86358642578125,
"loss": 0.4209,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.29219168424606323,
"rewards/margins": 1.1724560260772705,
"rewards/rejected": -1.464647889137268,
"step": 1130
},
{
"epoch": 1.18,
"learning_rate": 3.375430539609644e-07,
"logits/chosen": 23.028972625732422,
"logits/rejected": 23.105947494506836,
"logps/chosen": -298.8593444824219,
"logps/rejected": -308.6123046875,
"loss": 0.4111,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.41705456376075745,
"rewards/margins": 0.954562783241272,
"rewards/rejected": -1.3716174364089966,
"step": 1140
},
{
"epoch": 1.19,
"learning_rate": 3.3562954458476845e-07,
"logits/chosen": 23.36569595336914,
"logits/rejected": 23.174901962280273,
"logps/chosen": -414.62567138671875,
"logps/rejected": -282.6720275878906,
"loss": 0.4634,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.30735766887664795,
"rewards/margins": 1.2201875448226929,
"rewards/rejected": -1.5275452136993408,
"step": 1150
},
{
"epoch": 1.2,
"learning_rate": 3.337160352085725e-07,
"logits/chosen": 23.13878059387207,
"logits/rejected": 23.121612548828125,
"logps/chosen": -331.0238342285156,
"logps/rejected": -275.01129150390625,
"loss": 0.4363,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.260793000459671,
"rewards/margins": 1.1585649251937866,
"rewards/rejected": -1.4193580150604248,
"step": 1160
},
{
"epoch": 1.21,
"learning_rate": 3.3180252583237657e-07,
"logits/chosen": 23.040502548217773,
"logits/rejected": 22.87631607055664,
"logps/chosen": -301.6041564941406,
"logps/rejected": -246.01254272460938,
"loss": 0.4526,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.19479836523532867,
"rewards/margins": 1.1458656787872314,
"rewards/rejected": -1.340664029121399,
"step": 1170
},
{
"epoch": 1.22,
"learning_rate": 3.2988901645618063e-07,
"logits/chosen": 23.21322250366211,
"logits/rejected": 22.910724639892578,
"logps/chosen": -270.7756652832031,
"logps/rejected": -233.3585968017578,
"loss": 0.4396,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.4070712924003601,
"rewards/margins": 0.8780719041824341,
"rewards/rejected": -1.2851431369781494,
"step": 1180
},
{
"epoch": 1.23,
"learning_rate": 3.279755070799847e-07,
"logits/chosen": 23.265064239501953,
"logits/rejected": 23.274433135986328,
"logps/chosen": -315.3988037109375,
"logps/rejected": -291.09375,
"loss": 0.4049,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.24129147827625275,
"rewards/margins": 1.1495540142059326,
"rewards/rejected": -1.390845537185669,
"step": 1190
},
{
"epoch": 1.24,
"learning_rate": 3.260619977037887e-07,
"logits/chosen": 23.321971893310547,
"logits/rejected": 23.265857696533203,
"logps/chosen": -304.5440368652344,
"logps/rejected": -283.64764404296875,
"loss": 0.4228,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.15949824452400208,
"rewards/margins": 1.1103687286376953,
"rewards/rejected": -1.269866704940796,
"step": 1200
},
{
"epoch": 1.24,
"eval_logits/chosen": 23.51008415222168,
"eval_logits/rejected": 23.319059371948242,
"eval_logps/chosen": -355.1871337890625,
"eval_logps/rejected": -285.07257080078125,
"eval_loss": 0.5033829212188721,
"eval_rewards/accuracies": 0.761904776096344,
"eval_rewards/chosen": -0.06169680133461952,
"eval_rewards/margins": 1.0372183322906494,
"eval_rewards/rejected": -1.0989152193069458,
"eval_runtime": 207.9261,
"eval_samples_per_second": 9.619,
"eval_steps_per_second": 0.303,
"step": 1200
},
{
"epoch": 1.25,
"learning_rate": 3.2414848832759277e-07,
"logits/chosen": 23.359235763549805,
"logits/rejected": 23.241931915283203,
"logps/chosen": -253.0906219482422,
"logps/rejected": -248.0492706298828,
"loss": 0.4028,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.09821876138448715,
"rewards/margins": 1.3042573928833008,
"rewards/rejected": -1.4024760723114014,
"step": 1210
},
{
"epoch": 1.26,
"learning_rate": 3.2223497895139683e-07,
"logits/chosen": 23.251922607421875,
"logits/rejected": 23.224475860595703,
"logps/chosen": -323.7980041503906,
"logps/rejected": -312.51934814453125,
"loss": 0.4376,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.02340073511004448,
"rewards/margins": 1.2130701541900635,
"rewards/rejected": -1.2364708185195923,
"step": 1220
},
{
"epoch": 1.27,
"learning_rate": 3.203214695752009e-07,
"logits/chosen": 23.745332717895508,
"logits/rejected": 23.615753173828125,
"logps/chosen": -313.46453857421875,
"logps/rejected": -308.4986877441406,
"loss": 0.4391,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.15489891171455383,
"rewards/margins": 0.8978082537651062,
"rewards/rejected": -1.0527071952819824,
"step": 1230
},
{
"epoch": 1.28,
"learning_rate": 3.1840796019900495e-07,
"logits/chosen": 23.76753044128418,
"logits/rejected": 23.41860580444336,
"logps/chosen": -348.83258056640625,
"logps/rejected": -300.46893310546875,
"loss": 0.4562,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.1853850781917572,
"rewards/margins": 1.198961853981018,
"rewards/rejected": -1.3843467235565186,
"step": 1240
},
{
"epoch": 1.29,
"learning_rate": 3.16494450822809e-07,
"logits/chosen": 23.356491088867188,
"logits/rejected": 23.26506996154785,
"logps/chosen": -251.2193145751953,
"logps/rejected": -268.14215087890625,
"loss": 0.4412,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5262545347213745,
"rewards/margins": 0.6762484312057495,
"rewards/rejected": -1.202502965927124,
"step": 1250
},
{
"epoch": 1.3,
"learning_rate": 3.145809414466131e-07,
"logits/chosen": 23.517498016357422,
"logits/rejected": 23.316844940185547,
"logps/chosen": -342.25604248046875,
"logps/rejected": -239.2180633544922,
"loss": 0.4239,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.42340001463890076,
"rewards/margins": 1.1046512126922607,
"rewards/rejected": -1.5280513763427734,
"step": 1260
},
{
"epoch": 1.31,
"learning_rate": 3.1266743207041714e-07,
"logits/chosen": 23.371051788330078,
"logits/rejected": 23.382333755493164,
"logps/chosen": -388.79425048828125,
"logps/rejected": -311.9518127441406,
"loss": 0.4435,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.25299787521362305,
"rewards/margins": 1.0895098447799683,
"rewards/rejected": -1.3425077199935913,
"step": 1270
},
{
"epoch": 1.32,
"learning_rate": 3.107539226942212e-07,
"logits/chosen": 23.398571014404297,
"logits/rejected": 23.297183990478516,
"logps/chosen": -284.0433349609375,
"logps/rejected": -269.79901123046875,
"loss": 0.4612,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.34444212913513184,
"rewards/margins": 1.0336390733718872,
"rewards/rejected": -1.3780810832977295,
"step": 1280
},
{
"epoch": 1.33,
"learning_rate": 3.0884041331802526e-07,
"logits/chosen": 23.48178482055664,
"logits/rejected": 23.24820899963379,
"logps/chosen": -316.18109130859375,
"logps/rejected": -268.31182861328125,
"loss": 0.4396,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.26420363783836365,
"rewards/margins": 1.2460224628448486,
"rewards/rejected": -1.5102260112762451,
"step": 1290
},
{
"epoch": 1.34,
"learning_rate": 3.0692690394182927e-07,
"logits/chosen": 23.08510971069336,
"logits/rejected": 22.929636001586914,
"logps/chosen": -303.6866760253906,
"logps/rejected": -280.0844421386719,
"loss": 0.4306,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.46213406324386597,
"rewards/margins": 1.072772741317749,
"rewards/rejected": -1.5349067449569702,
"step": 1300
},
{
"epoch": 1.34,
"eval_logits/chosen": 23.478702545166016,
"eval_logits/rejected": 23.2889404296875,
"eval_logps/chosen": -356.1548767089844,
"eval_logps/rejected": -285.9320373535156,
"eval_loss": 0.5032446384429932,
"eval_rewards/accuracies": 0.7698412537574768,
"eval_rewards/chosen": -0.15847428143024445,
"eval_rewards/margins": 1.0263888835906982,
"eval_rewards/rejected": -1.1848632097244263,
"eval_runtime": 214.9168,
"eval_samples_per_second": 9.306,
"eval_steps_per_second": 0.293,
"step": 1300
},
{
"epoch": 1.35,
"learning_rate": 3.0501339456563334e-07,
"logits/chosen": 23.21750831604004,
"logits/rejected": 23.04998016357422,
"logps/chosen": -319.6648254394531,
"logps/rejected": -272.9951477050781,
"loss": 0.42,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.12266921997070312,
"rewards/margins": 1.1443729400634766,
"rewards/rejected": -1.2670420408248901,
"step": 1310
},
{
"epoch": 1.36,
"learning_rate": 3.030998851894374e-07,
"logits/chosen": 22.977802276611328,
"logits/rejected": 22.969890594482422,
"logps/chosen": -301.5118408203125,
"logps/rejected": -249.35372924804688,
"loss": 0.4142,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.41769084334373474,
"rewards/margins": 0.9436267614364624,
"rewards/rejected": -1.3613176345825195,
"step": 1320
},
{
"epoch": 1.37,
"learning_rate": 3.0118637581324146e-07,
"logits/chosen": 23.275325775146484,
"logits/rejected": 23.174297332763672,
"logps/chosen": -339.08258056640625,
"logps/rejected": -282.3678283691406,
"loss": 0.422,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.15730026364326477,
"rewards/margins": 1.1286394596099854,
"rewards/rejected": -1.2859396934509277,
"step": 1330
},
{
"epoch": 1.38,
"learning_rate": 2.992728664370455e-07,
"logits/chosen": 23.317642211914062,
"logits/rejected": 23.297130584716797,
"logps/chosen": -263.40313720703125,
"logps/rejected": -259.40655517578125,
"loss": 0.4378,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.5026736259460449,
"rewards/margins": 0.7866870760917664,
"rewards/rejected": -1.289360761642456,
"step": 1340
},
{
"epoch": 1.39,
"learning_rate": 2.973593570608496e-07,
"logits/chosen": 23.30654525756836,
"logits/rejected": 23.131305694580078,
"logps/chosen": -299.3922424316406,
"logps/rejected": -269.0783386230469,
"loss": 0.4366,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.43080934882164,
"rewards/margins": 1.0835515260696411,
"rewards/rejected": -1.514360785484314,
"step": 1350
},
{
"epoch": 1.4,
"learning_rate": 2.9544584768465365e-07,
"logits/chosen": 23.53885269165039,
"logits/rejected": 23.299760818481445,
"logps/chosen": -324.5937194824219,
"logps/rejected": -272.784912109375,
"loss": 0.4427,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.40283799171447754,
"rewards/margins": 1.1869771480560303,
"rewards/rejected": -1.5898151397705078,
"step": 1360
},
{
"epoch": 1.41,
"learning_rate": 2.935323383084577e-07,
"logits/chosen": 23.592838287353516,
"logits/rejected": 23.14777183532715,
"logps/chosen": -392.72662353515625,
"logps/rejected": -286.9781494140625,
"loss": 0.4427,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.39330539107322693,
"rewards/margins": 1.0419480800628662,
"rewards/rejected": -1.4352535009384155,
"step": 1370
},
{
"epoch": 1.42,
"learning_rate": 2.9161882893226177e-07,
"logits/chosen": 23.269372940063477,
"logits/rejected": 22.95911979675293,
"logps/chosen": -316.82574462890625,
"logps/rejected": -261.1885070800781,
"loss": 0.433,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.317082941532135,
"rewards/margins": 1.2134672403335571,
"rewards/rejected": -1.530550241470337,
"step": 1380
},
{
"epoch": 1.44,
"learning_rate": 2.8970531955606583e-07,
"logits/chosen": 23.39419937133789,
"logits/rejected": 23.142974853515625,
"logps/chosen": -336.1145935058594,
"logps/rejected": -242.3621826171875,
"loss": 0.4514,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.19971036911010742,
"rewards/margins": 1.381958246231079,
"rewards/rejected": -1.5816686153411865,
"step": 1390
},
{
"epoch": 1.45,
"learning_rate": 2.8779181017986984e-07,
"logits/chosen": 23.005327224731445,
"logits/rejected": 22.998939514160156,
"logps/chosen": -389.1512756347656,
"logps/rejected": -290.7593688964844,
"loss": 0.4678,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.16274584829807281,
"rewards/margins": 1.0353623628616333,
"rewards/rejected": -1.198108196258545,
"step": 1400
},
{
"epoch": 1.45,
"eval_logits/chosen": 23.455062866210938,
"eval_logits/rejected": 23.266075134277344,
"eval_logps/chosen": -356.9206848144531,
"eval_logps/rejected": -285.68414306640625,
"eval_loss": 0.5029928684234619,
"eval_rewards/accuracies": 0.7817460298538208,
"eval_rewards/chosen": -0.23505355417728424,
"eval_rewards/margins": 0.9250208735466003,
"eval_rewards/rejected": -1.1600743532180786,
"eval_runtime": 212.5498,
"eval_samples_per_second": 9.41,
"eval_steps_per_second": 0.296,
"step": 1400
},
{
"epoch": 1.46,
"learning_rate": 2.858783008036739e-07,
"logits/chosen": 23.323863983154297,
"logits/rejected": 23.302270889282227,
"logps/chosen": -362.5963134765625,
"logps/rejected": -284.63519287109375,
"loss": 0.4375,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.15233711898326874,
"rewards/margins": 1.4009491205215454,
"rewards/rejected": -1.5532863140106201,
"step": 1410
},
{
"epoch": 1.47,
"learning_rate": 2.8396479142747797e-07,
"logits/chosen": 23.571504592895508,
"logits/rejected": 23.417720794677734,
"logps/chosen": -313.31170654296875,
"logps/rejected": -293.6142272949219,
"loss": 0.4351,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.16742083430290222,
"rewards/margins": 1.0415483713150024,
"rewards/rejected": -1.208969235420227,
"step": 1420
},
{
"epoch": 1.48,
"learning_rate": 2.8205128205128203e-07,
"logits/chosen": 23.33370590209961,
"logits/rejected": 23.32365608215332,
"logps/chosen": -299.41351318359375,
"logps/rejected": -302.18939208984375,
"loss": 0.4146,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.1101953536272049,
"rewards/margins": 1.1390321254730225,
"rewards/rejected": -1.2492274045944214,
"step": 1430
},
{
"epoch": 1.49,
"learning_rate": 2.801377726750861e-07,
"logits/chosen": 23.091251373291016,
"logits/rejected": 23.06249237060547,
"logps/chosen": -288.70050048828125,
"logps/rejected": -261.63494873046875,
"loss": 0.4288,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.41927942633628845,
"rewards/margins": 0.8092526197433472,
"rewards/rejected": -1.228532075881958,
"step": 1440
},
{
"epoch": 1.5,
"learning_rate": 2.7822426329889015e-07,
"logits/chosen": 22.919397354125977,
"logits/rejected": 22.959392547607422,
"logps/chosen": -316.126953125,
"logps/rejected": -247.66845703125,
"loss": 0.4636,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.22512026131153107,
"rewards/margins": 1.1928333044052124,
"rewards/rejected": -1.4179537296295166,
"step": 1450
},
{
"epoch": 1.51,
"learning_rate": 2.763107539226942e-07,
"logits/chosen": 23.120534896850586,
"logits/rejected": 22.996898651123047,
"logps/chosen": -366.4676513671875,
"logps/rejected": -257.8288879394531,
"loss": 0.4564,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.3532385230064392,
"rewards/margins": 1.007889986038208,
"rewards/rejected": -1.361128568649292,
"step": 1460
},
{
"epoch": 1.52,
"learning_rate": 2.743972445464983e-07,
"logits/chosen": 23.175884246826172,
"logits/rejected": 23.074565887451172,
"logps/chosen": -285.8373718261719,
"logps/rejected": -230.23263549804688,
"loss": 0.4406,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.30376359820365906,
"rewards/margins": 0.9630700945854187,
"rewards/rejected": -1.2668339014053345,
"step": 1470
},
{
"epoch": 1.53,
"learning_rate": 2.7248373517030234e-07,
"logits/chosen": 23.030946731567383,
"logits/rejected": 23.114126205444336,
"logps/chosen": -342.0677185058594,
"logps/rejected": -273.91558837890625,
"loss": 0.4265,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3027496933937073,
"rewards/margins": 0.7853070497512817,
"rewards/rejected": -1.0880568027496338,
"step": 1480
},
{
"epoch": 1.54,
"learning_rate": 2.705702257941064e-07,
"logits/chosen": 23.109298706054688,
"logits/rejected": 22.95934295654297,
"logps/chosen": -353.9482727050781,
"logps/rejected": -293.26165771484375,
"loss": 0.3973,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.19786301255226135,
"rewards/margins": 1.2528201341629028,
"rewards/rejected": -1.4506832361221313,
"step": 1490
},
{
"epoch": 1.55,
"learning_rate": 2.686567164179104e-07,
"logits/chosen": 23.07802391052246,
"logits/rejected": 22.99662208557129,
"logps/chosen": -328.4552001953125,
"logps/rejected": -250.57901000976562,
"loss": 0.4317,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.1944916695356369,
"rewards/margins": 1.2932884693145752,
"rewards/rejected": -1.4877803325653076,
"step": 1500
},
{
"epoch": 1.55,
"eval_logits/chosen": 23.452411651611328,
"eval_logits/rejected": 23.262121200561523,
"eval_logps/chosen": -355.9715576171875,
"eval_logps/rejected": -285.541748046875,
"eval_loss": 0.49968841671943665,
"eval_rewards/accuracies": 0.761904776096344,
"eval_rewards/chosen": -0.1401444375514984,
"eval_rewards/margins": 1.0056895017623901,
"eval_rewards/rejected": -1.1458338499069214,
"eval_runtime": 210.203,
"eval_samples_per_second": 9.515,
"eval_steps_per_second": 0.3,
"step": 1500
},
{
"epoch": 1.56,
"learning_rate": 2.6674320704171447e-07,
"logits/chosen": 23.38498306274414,
"logits/rejected": 23.101451873779297,
"logps/chosen": -313.84991455078125,
"logps/rejected": -229.9058837890625,
"loss": 0.4147,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2590484023094177,
"rewards/margins": 0.8956319689750671,
"rewards/rejected": -1.1546803712844849,
"step": 1510
},
{
"epoch": 1.57,
"learning_rate": 2.6482969766551853e-07,
"logits/chosen": 23.376522064208984,
"logits/rejected": 23.071407318115234,
"logps/chosen": -294.4135437011719,
"logps/rejected": -286.2037658691406,
"loss": 0.4243,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.2608863413333893,
"rewards/margins": 1.1486496925354004,
"rewards/rejected": -1.4095360040664673,
"step": 1520
},
{
"epoch": 1.58,
"learning_rate": 2.629161882893226e-07,
"logits/chosen": 23.766990661621094,
"logits/rejected": 23.534847259521484,
"logps/chosen": -363.1257629394531,
"logps/rejected": -257.43377685546875,
"loss": 0.4044,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.28155818581581116,
"rewards/margins": 1.1225359439849854,
"rewards/rejected": -1.4040942192077637,
"step": 1530
},
{
"epoch": 1.59,
"learning_rate": 2.6100267891312666e-07,
"logits/chosen": 23.727947235107422,
"logits/rejected": 23.551546096801758,
"logps/chosen": -354.59808349609375,
"logps/rejected": -309.74041748046875,
"loss": 0.4358,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.2375718653202057,
"rewards/margins": 0.9404077529907227,
"rewards/rejected": -1.177979588508606,
"step": 1540
},
{
"epoch": 1.6,
"learning_rate": 2.590891695369307e-07,
"logits/chosen": 23.435352325439453,
"logits/rejected": 23.340909957885742,
"logps/chosen": -323.2891845703125,
"logps/rejected": -256.3253479003906,
"loss": 0.4108,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.21504418551921844,
"rewards/margins": 1.1017714738845825,
"rewards/rejected": -1.316815733909607,
"step": 1550
},
{
"epoch": 1.61,
"learning_rate": 2.571756601607348e-07,
"logits/chosen": 23.279882431030273,
"logits/rejected": 22.866252899169922,
"logps/chosen": -376.61431884765625,
"logps/rejected": -252.8503875732422,
"loss": 0.4265,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.24882745742797852,
"rewards/margins": 1.2063392400741577,
"rewards/rejected": -1.4551665782928467,
"step": 1560
},
{
"epoch": 1.62,
"learning_rate": 2.5526215078453884e-07,
"logits/chosen": 23.572551727294922,
"logits/rejected": 23.359222412109375,
"logps/chosen": -348.06396484375,
"logps/rejected": -301.94830322265625,
"loss": 0.4353,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.1706874668598175,
"rewards/margins": 1.228468656539917,
"rewards/rejected": -1.399156093597412,
"step": 1570
},
{
"epoch": 1.63,
"learning_rate": 2.533486414083429e-07,
"logits/chosen": 23.559356689453125,
"logits/rejected": 23.3609676361084,
"logps/chosen": -354.1952209472656,
"logps/rejected": -299.01385498046875,
"loss": 0.392,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.025229115039110184,
"rewards/margins": 1.3790032863616943,
"rewards/rejected": -1.3537743091583252,
"step": 1580
},
{
"epoch": 1.64,
"learning_rate": 2.5143513203214697e-07,
"logits/chosen": 23.176847457885742,
"logits/rejected": 23.128990173339844,
"logps/chosen": -395.842529296875,
"logps/rejected": -295.98162841796875,
"loss": 0.4379,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.10837908089160919,
"rewards/margins": 1.3724091053009033,
"rewards/rejected": -1.480788230895996,
"step": 1590
},
{
"epoch": 1.65,
"learning_rate": 2.49521622655951e-07,
"logits/chosen": 23.313915252685547,
"logits/rejected": 23.103626251220703,
"logps/chosen": -350.606689453125,
"logps/rejected": -294.15594482421875,
"loss": 0.4363,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.2652584910392761,
"rewards/margins": 1.1176103353500366,
"rewards/rejected": -1.382868766784668,
"step": 1600
},
{
"epoch": 1.65,
"eval_logits/chosen": 23.417835235595703,
"eval_logits/rejected": 23.231985092163086,
"eval_logps/chosen": -357.8829650878906,
"eval_logps/rejected": -287.6752014160156,
"eval_loss": 0.5009579062461853,
"eval_rewards/accuracies": 0.773809552192688,
"eval_rewards/chosen": -0.3312842845916748,
"eval_rewards/margins": 1.0278921127319336,
"eval_rewards/rejected": -1.3591763973236084,
"eval_runtime": 211.3907,
"eval_samples_per_second": 9.461,
"eval_steps_per_second": 0.298,
"step": 1600
},
{
"epoch": 1.66,
"learning_rate": 2.4760811327975504e-07,
"logits/chosen": 23.531373977661133,
"logits/rejected": 23.429351806640625,
"logps/chosen": -347.46490478515625,
"logps/rejected": -289.24176025390625,
"loss": 0.4249,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.28774064779281616,
"rewards/margins": 1.0696442127227783,
"rewards/rejected": -1.3573849201202393,
"step": 1610
},
{
"epoch": 1.67,
"learning_rate": 2.456946039035591e-07,
"logits/chosen": 23.160110473632812,
"logits/rejected": 23.07761001586914,
"logps/chosen": -372.6455993652344,
"logps/rejected": -254.6509246826172,
"loss": 0.4312,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.29867392778396606,
"rewards/margins": 1.093621850013733,
"rewards/rejected": -1.3922955989837646,
"step": 1620
},
{
"epoch": 1.68,
"learning_rate": 2.4378109452736316e-07,
"logits/chosen": 23.148435592651367,
"logits/rejected": 23.090463638305664,
"logps/chosen": -316.5802917480469,
"logps/rejected": -288.7501220703125,
"loss": 0.4262,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.3632759749889374,
"rewards/margins": 1.0216796398162842,
"rewards/rejected": -1.3849557638168335,
"step": 1630
},
{
"epoch": 1.69,
"learning_rate": 2.418675851511672e-07,
"logits/chosen": 23.17940330505371,
"logits/rejected": 23.190204620361328,
"logps/chosen": -346.47625732421875,
"logps/rejected": -270.1147155761719,
"loss": 0.4333,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2974759638309479,
"rewards/margins": 1.0174903869628906,
"rewards/rejected": -1.3149662017822266,
"step": 1640
},
{
"epoch": 1.7,
"learning_rate": 2.399540757749713e-07,
"logits/chosen": 23.138202667236328,
"logits/rejected": 22.985610961914062,
"logps/chosen": -340.69940185546875,
"logps/rejected": -311.3159484863281,
"loss": 0.4341,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.1574067324399948,
"rewards/margins": 1.2777574062347412,
"rewards/rejected": -1.4351643323898315,
"step": 1650
},
{
"epoch": 1.71,
"learning_rate": 2.3804056639877535e-07,
"logits/chosen": 23.348569869995117,
"logits/rejected": 23.167980194091797,
"logps/chosen": -273.6280822753906,
"logps/rejected": -238.4679412841797,
"loss": 0.4392,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.03210877254605293,
"rewards/margins": 1.156890630722046,
"rewards/rejected": -1.1889994144439697,
"step": 1660
},
{
"epoch": 1.72,
"learning_rate": 2.361270570225794e-07,
"logits/chosen": 23.42279052734375,
"logits/rejected": 23.08903694152832,
"logps/chosen": -358.487548828125,
"logps/rejected": -266.75341796875,
"loss": 0.3848,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.0037168667186051607,
"rewards/margins": 1.2687807083129883,
"rewards/rejected": -1.272497534751892,
"step": 1670
},
{
"epoch": 1.73,
"learning_rate": 2.3421354764638345e-07,
"logits/chosen": 23.328954696655273,
"logits/rejected": 23.21335792541504,
"logps/chosen": -294.31402587890625,
"logps/rejected": -263.2884826660156,
"loss": 0.44,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.1213906854391098,
"rewards/margins": 0.9413496255874634,
"rewards/rejected": -1.0627403259277344,
"step": 1680
},
{
"epoch": 1.74,
"learning_rate": 2.323000382701875e-07,
"logits/chosen": 23.385282516479492,
"logits/rejected": 23.229637145996094,
"logps/chosen": -392.8078308105469,
"logps/rejected": -314.957275390625,
"loss": 0.4084,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.1200430616736412,
"rewards/margins": 1.2001924514770508,
"rewards/rejected": -1.3202354907989502,
"step": 1690
},
{
"epoch": 1.76,
"learning_rate": 2.3038652889399157e-07,
"logits/chosen": 23.32499122619629,
"logits/rejected": 23.208293914794922,
"logps/chosen": -338.33892822265625,
"logps/rejected": -305.3815612792969,
"loss": 0.408,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.3193683624267578,
"rewards/margins": 1.028808832168579,
"rewards/rejected": -1.348177433013916,
"step": 1700
},
{
"epoch": 1.76,
"eval_logits/chosen": 23.395021438598633,
"eval_logits/rejected": 23.213520050048828,
"eval_logps/chosen": -357.0264892578125,
"eval_logps/rejected": -287.1567687988281,
"eval_loss": 0.4989284873008728,
"eval_rewards/accuracies": 0.7777777910232544,
"eval_rewards/chosen": -0.24563594162464142,
"eval_rewards/margins": 1.0617001056671143,
"eval_rewards/rejected": -1.3073359727859497,
"eval_runtime": 212.6457,
"eval_samples_per_second": 9.405,
"eval_steps_per_second": 0.296,
"step": 1700
},
{
"epoch": 1.77,
"learning_rate": 2.2847301951779563e-07,
"logits/chosen": 23.246747970581055,
"logits/rejected": 23.268218994140625,
"logps/chosen": -298.40838623046875,
"logps/rejected": -294.05877685546875,
"loss": 0.4063,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.37956395745277405,
"rewards/margins": 1.2896459102630615,
"rewards/rejected": -1.6692098379135132,
"step": 1710
},
{
"epoch": 1.78,
"learning_rate": 2.265595101415997e-07,
"logits/chosen": 23.170940399169922,
"logits/rejected": 23.139057159423828,
"logps/chosen": -333.7245178222656,
"logps/rejected": -287.9451599121094,
"loss": 0.4352,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.23647813498973846,
"rewards/margins": 1.159517526626587,
"rewards/rejected": -1.3959954977035522,
"step": 1720
},
{
"epoch": 1.79,
"learning_rate": 2.2464600076540373e-07,
"logits/chosen": 23.052528381347656,
"logits/rejected": 22.96520233154297,
"logps/chosen": -327.0295104980469,
"logps/rejected": -272.34539794921875,
"loss": 0.4108,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3730023503303528,
"rewards/margins": 1.0741784572601318,
"rewards/rejected": -1.4471808671951294,
"step": 1730
},
{
"epoch": 1.8,
"learning_rate": 2.227324913892078e-07,
"logits/chosen": 23.350069046020508,
"logits/rejected": 23.162134170532227,
"logps/chosen": -338.16583251953125,
"logps/rejected": -292.6080627441406,
"loss": 0.4213,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.24311116337776184,
"rewards/margins": 1.0276445150375366,
"rewards/rejected": -1.270755648612976,
"step": 1740
},
{
"epoch": 1.81,
"learning_rate": 2.2081898201301186e-07,
"logits/chosen": 23.173582077026367,
"logits/rejected": 23.145183563232422,
"logps/chosen": -329.4697265625,
"logps/rejected": -266.3700256347656,
"loss": 0.407,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.023505190387368202,
"rewards/margins": 1.2212746143341064,
"rewards/rejected": -1.2447797060012817,
"step": 1750
},
{
"epoch": 1.82,
"learning_rate": 2.1890547263681592e-07,
"logits/chosen": 23.282878875732422,
"logits/rejected": 23.070077896118164,
"logps/chosen": -321.710205078125,
"logps/rejected": -277.3352966308594,
"loss": 0.4071,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.25008895993232727,
"rewards/margins": 1.3580764532089233,
"rewards/rejected": -1.6081653833389282,
"step": 1760
},
{
"epoch": 1.83,
"learning_rate": 2.1699196326061998e-07,
"logits/chosen": 23.08016586303711,
"logits/rejected": 23.203523635864258,
"logps/chosen": -297.59130859375,
"logps/rejected": -280.3871765136719,
"loss": 0.4554,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.3134928345680237,
"rewards/margins": 1.07438063621521,
"rewards/rejected": -1.387873649597168,
"step": 1770
},
{
"epoch": 1.84,
"learning_rate": 2.1507845388442402e-07,
"logits/chosen": 23.223857879638672,
"logits/rejected": 23.118579864501953,
"logps/chosen": -307.32745361328125,
"logps/rejected": -264.244384765625,
"loss": 0.4215,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4751613140106201,
"rewards/margins": 1.2573087215423584,
"rewards/rejected": -1.732469916343689,
"step": 1780
},
{
"epoch": 1.85,
"learning_rate": 2.1316494450822808e-07,
"logits/chosen": 23.08175277709961,
"logits/rejected": 23.210302352905273,
"logps/chosen": -348.2358093261719,
"logps/rejected": -269.00909423828125,
"loss": 0.4148,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.5350313186645508,
"rewards/margins": 1.2447912693023682,
"rewards/rejected": -1.779822587966919,
"step": 1790
},
{
"epoch": 1.86,
"learning_rate": 2.1125143513203214e-07,
"logits/chosen": 23.117109298706055,
"logits/rejected": 23.03956413269043,
"logps/chosen": -350.7597961425781,
"logps/rejected": -257.7859191894531,
"loss": 0.4076,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.42596331238746643,
"rewards/margins": 1.1088837385177612,
"rewards/rejected": -1.5348470211029053,
"step": 1800
},
{
"epoch": 1.86,
"eval_logits/chosen": 23.361677169799805,
"eval_logits/rejected": 23.18657112121582,
"eval_logps/chosen": -358.4737854003906,
"eval_logps/rejected": -288.44818115234375,
"eval_loss": 0.4995974004268646,
"eval_rewards/accuracies": 0.7658730149269104,
"eval_rewards/chosen": -0.39036476612091064,
"eval_rewards/margins": 1.0461114645004272,
"eval_rewards/rejected": -1.4364763498306274,
"eval_runtime": 207.254,
"eval_samples_per_second": 9.65,
"eval_steps_per_second": 0.304,
"step": 1800
},
{
"epoch": 1.87,
"learning_rate": 2.093379257558362e-07,
"logits/chosen": 23.419483184814453,
"logits/rejected": 23.20507049560547,
"logps/chosen": -321.2181396484375,
"logps/rejected": -264.3491516113281,
"loss": 0.4189,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.5122248530387878,
"rewards/margins": 1.2178277969360352,
"rewards/rejected": -1.7300525903701782,
"step": 1810
},
{
"epoch": 1.88,
"learning_rate": 2.0742441637964026e-07,
"logits/chosen": 22.683643341064453,
"logits/rejected": 22.83184242248535,
"logps/chosen": -339.54498291015625,
"logps/rejected": -250.641845703125,
"loss": 0.4102,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.3275443911552429,
"rewards/margins": 1.4303803443908691,
"rewards/rejected": -1.7579247951507568,
"step": 1820
},
{
"epoch": 1.89,
"learning_rate": 2.055109070034443e-07,
"logits/chosen": 23.576740264892578,
"logits/rejected": 23.395267486572266,
"logps/chosen": -349.0409240722656,
"logps/rejected": -268.94268798828125,
"loss": 0.4055,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.29274967312812805,
"rewards/margins": 1.1743746995925903,
"rewards/rejected": -1.4671242237091064,
"step": 1830
},
{
"epoch": 1.9,
"learning_rate": 2.0359739762724836e-07,
"logits/chosen": 23.352815628051758,
"logits/rejected": 23.3568058013916,
"logps/chosen": -352.31304931640625,
"logps/rejected": -284.7132873535156,
"loss": 0.4133,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.33104512095451355,
"rewards/margins": 1.168330192565918,
"rewards/rejected": -1.499375343322754,
"step": 1840
},
{
"epoch": 1.91,
"learning_rate": 2.0168388825105242e-07,
"logits/chosen": 23.261768341064453,
"logits/rejected": 23.228496551513672,
"logps/chosen": -345.1585998535156,
"logps/rejected": -339.8830871582031,
"loss": 0.3868,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.12148020416498184,
"rewards/margins": 1.3025624752044678,
"rewards/rejected": -1.424042820930481,
"step": 1850
},
{
"epoch": 1.92,
"learning_rate": 1.997703788748565e-07,
"logits/chosen": 23.22171401977539,
"logits/rejected": 22.969928741455078,
"logps/chosen": -328.1639099121094,
"logps/rejected": -222.90762329101562,
"loss": 0.406,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.09366317093372345,
"rewards/margins": 1.3427413702011108,
"rewards/rejected": -1.4364044666290283,
"step": 1860
},
{
"epoch": 1.93,
"learning_rate": 1.9785686949866055e-07,
"logits/chosen": 22.87033462524414,
"logits/rejected": 23.068653106689453,
"logps/chosen": -331.31146240234375,
"logps/rejected": -285.5386962890625,
"loss": 0.4169,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.057704973965883255,
"rewards/margins": 1.1090171337127686,
"rewards/rejected": -1.1667221784591675,
"step": 1870
},
{
"epoch": 1.94,
"learning_rate": 1.9594336012246458e-07,
"logits/chosen": 23.055828094482422,
"logits/rejected": 23.081539154052734,
"logps/chosen": -312.4957275390625,
"logps/rejected": -268.7204895019531,
"loss": 0.3909,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.07027649134397507,
"rewards/margins": 1.1952614784240723,
"rewards/rejected": -1.2655378580093384,
"step": 1880
},
{
"epoch": 1.95,
"learning_rate": 1.9402985074626865e-07,
"logits/chosen": 22.958328247070312,
"logits/rejected": 23.175926208496094,
"logps/chosen": -312.7813720703125,
"logps/rejected": -272.52215576171875,
"loss": 0.4153,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.2530692219734192,
"rewards/margins": 1.2337100505828857,
"rewards/rejected": -1.4867792129516602,
"step": 1890
},
{
"epoch": 1.96,
"learning_rate": 1.921163413700727e-07,
"logits/chosen": 23.173688888549805,
"logits/rejected": 23.038455963134766,
"logps/chosen": -327.50531005859375,
"logps/rejected": -257.269287109375,
"loss": 0.4547,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.3794935941696167,
"rewards/margins": 1.0488073825836182,
"rewards/rejected": -1.4283010959625244,
"step": 1900
},
{
"epoch": 1.96,
"eval_logits/chosen": 23.329803466796875,
"eval_logits/rejected": 23.160478591918945,
"eval_logps/chosen": -357.08575439453125,
"eval_logps/rejected": -286.7316589355469,
"eval_loss": 0.5008072853088379,
"eval_rewards/accuracies": 0.7857142686843872,
"eval_rewards/chosen": -0.25156161189079285,
"eval_rewards/margins": 1.0132601261138916,
"eval_rewards/rejected": -1.2648216485977173,
"eval_runtime": 212.8249,
"eval_samples_per_second": 9.397,
"eval_steps_per_second": 0.296,
"step": 1900
},
{
"epoch": 1.97,
"learning_rate": 1.9020283199387677e-07,
"logits/chosen": 23.098756790161133,
"logits/rejected": 23.10630226135254,
"logps/chosen": -345.1263122558594,
"logps/rejected": -260.34613037109375,
"loss": 0.4359,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.14109382033348083,
"rewards/margins": 1.0066546201705933,
"rewards/rejected": -1.147748589515686,
"step": 1910
},
{
"epoch": 1.98,
"learning_rate": 1.8828932261768083e-07,
"logits/chosen": 23.233016967773438,
"logits/rejected": 23.24778175354004,
"logps/chosen": -295.8356628417969,
"logps/rejected": -244.9901885986328,
"loss": 0.4224,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.49083733558654785,
"rewards/margins": 1.3773233890533447,
"rewards/rejected": -1.868160605430603,
"step": 1920
},
{
"epoch": 1.99,
"learning_rate": 1.8637581324148487e-07,
"logits/chosen": 23.030765533447266,
"logits/rejected": 22.91606330871582,
"logps/chosen": -311.2125549316406,
"logps/rejected": -287.0599670410156,
"loss": 0.4249,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.3956405520439148,
"rewards/margins": 1.194427728652954,
"rewards/rejected": -1.5900681018829346,
"step": 1930
},
{
"epoch": 2.0,
"learning_rate": 1.8446230386528893e-07,
"logits/chosen": 22.784257888793945,
"logits/rejected": 22.93459701538086,
"logps/chosen": -270.6285095214844,
"logps/rejected": -242.1689453125,
"loss": 0.3953,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.2096777856349945,
"rewards/margins": 1.2774814367294312,
"rewards/rejected": -1.487159252166748,
"step": 1940
},
{
"epoch": 2.01,
"learning_rate": 1.82548794489093e-07,
"logits/chosen": 22.975915908813477,
"logits/rejected": 23.076732635498047,
"logps/chosen": -285.87164306640625,
"logps/rejected": -285.27105712890625,
"loss": 0.3506,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.4368739724159241,
"rewards/margins": 1.2533791065216064,
"rewards/rejected": -1.6902532577514648,
"step": 1950
},
{
"epoch": 2.02,
"learning_rate": 1.8063528511289706e-07,
"logits/chosen": 23.094058990478516,
"logits/rejected": 23.000532150268555,
"logps/chosen": -311.85418701171875,
"logps/rejected": -342.6611633300781,
"loss": 0.3454,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.3331337571144104,
"rewards/margins": 1.497775912284851,
"rewards/rejected": -1.8309099674224854,
"step": 1960
},
{
"epoch": 2.03,
"learning_rate": 1.7872177573670112e-07,
"logits/chosen": 23.15587043762207,
"logits/rejected": 22.996431350708008,
"logps/chosen": -284.33099365234375,
"logps/rejected": -247.60586547851562,
"loss": 0.3782,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.3313903212547302,
"rewards/margins": 1.394980549812317,
"rewards/rejected": -1.7263710498809814,
"step": 1970
},
{
"epoch": 2.04,
"learning_rate": 1.7680826636050515e-07,
"logits/chosen": 23.24311637878418,
"logits/rejected": 23.19965171813965,
"logps/chosen": -307.9765930175781,
"logps/rejected": -258.64697265625,
"loss": 0.3464,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.12858574092388153,
"rewards/margins": 1.5073888301849365,
"rewards/rejected": -1.635974645614624,
"step": 1980
},
{
"epoch": 2.05,
"learning_rate": 1.7489475698430921e-07,
"logits/chosen": 23.421756744384766,
"logits/rejected": 23.180278778076172,
"logps/chosen": -351.59796142578125,
"logps/rejected": -271.3390808105469,
"loss": 0.3522,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.3069804012775421,
"rewards/margins": 1.5511845350265503,
"rewards/rejected": -1.8581645488739014,
"step": 1990
},
{
"epoch": 2.07,
"learning_rate": 1.7298124760811328e-07,
"logits/chosen": 23.29781723022461,
"logits/rejected": 23.215654373168945,
"logps/chosen": -335.22943115234375,
"logps/rejected": -245.5157470703125,
"loss": 0.3469,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.39654579758644104,
"rewards/margins": 1.623525857925415,
"rewards/rejected": -2.020071506500244,
"step": 2000
},
{
"epoch": 2.07,
"eval_logits/chosen": 23.29904556274414,
"eval_logits/rejected": 23.136056900024414,
"eval_logps/chosen": -357.43829345703125,
"eval_logps/rejected": -287.9998779296875,
"eval_loss": 0.49774664640426636,
"eval_rewards/accuracies": 0.7777777910232544,
"eval_rewards/chosen": -0.28681743144989014,
"eval_rewards/margins": 1.10482656955719,
"eval_rewards/rejected": -1.39164400100708,
"eval_runtime": 207.5885,
"eval_samples_per_second": 9.634,
"eval_steps_per_second": 0.303,
"step": 2000
},
{
"epoch": 2.08,
"learning_rate": 1.7106773823191734e-07,
"logits/chosen": 23.3192138671875,
"logits/rejected": 23.1693115234375,
"logps/chosen": -360.62701416015625,
"logps/rejected": -267.00836181640625,
"loss": 0.3505,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.2135746031999588,
"rewards/margins": 1.6586072444915771,
"rewards/rejected": -1.8721816539764404,
"step": 2010
},
{
"epoch": 2.09,
"learning_rate": 1.691542288557214e-07,
"logits/chosen": 22.96431541442871,
"logits/rejected": 22.824430465698242,
"logps/chosen": -327.3959655761719,
"logps/rejected": -293.3819580078125,
"loss": 0.3412,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.4780098795890808,
"rewards/margins": 1.3323651552200317,
"rewards/rejected": -1.8103749752044678,
"step": 2020
},
{
"epoch": 2.1,
"learning_rate": 1.6724071947952544e-07,
"logits/chosen": 22.99736213684082,
"logits/rejected": 22.809282302856445,
"logps/chosen": -286.00433349609375,
"logps/rejected": -253.54635620117188,
"loss": 0.3509,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.5133158564567566,
"rewards/margins": 1.4120731353759766,
"rewards/rejected": -1.9253889322280884,
"step": 2030
},
{
"epoch": 2.11,
"learning_rate": 1.653272101033295e-07,
"logits/chosen": 22.923995971679688,
"logits/rejected": 22.87368392944336,
"logps/chosen": -293.9073791503906,
"logps/rejected": -263.51397705078125,
"loss": 0.3527,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.28854408860206604,
"rewards/margins": 1.3332937955856323,
"rewards/rejected": -1.6218379735946655,
"step": 2040
},
{
"epoch": 2.12,
"learning_rate": 1.6341370072713356e-07,
"logits/chosen": 23.294551849365234,
"logits/rejected": 23.248403549194336,
"logps/chosen": -354.1997985839844,
"logps/rejected": -338.943603515625,
"loss": 0.3538,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.2328769713640213,
"rewards/margins": 1.2254259586334229,
"rewards/rejected": -1.4583029747009277,
"step": 2050
},
{
"epoch": 2.13,
"learning_rate": 1.6150019135093762e-07,
"logits/chosen": 23.40046501159668,
"logits/rejected": 23.170814514160156,
"logps/chosen": -376.11871337890625,
"logps/rejected": -280.2356872558594,
"loss": 0.3409,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.1297599822282791,
"rewards/margins": 1.5910810232162476,
"rewards/rejected": -1.7208411693572998,
"step": 2060
},
{
"epoch": 2.14,
"learning_rate": 1.5958668197474169e-07,
"logits/chosen": 23.402286529541016,
"logits/rejected": 23.344829559326172,
"logps/chosen": -316.00634765625,
"logps/rejected": -308.0472717285156,
"loss": 0.3556,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.00810793973505497,
"rewards/margins": 1.592138409614563,
"rewards/rejected": -1.5840303897857666,
"step": 2070
},
{
"epoch": 2.15,
"learning_rate": 1.5767317259854572e-07,
"logits/chosen": 23.049579620361328,
"logits/rejected": 23.11331558227539,
"logps/chosen": -320.4759826660156,
"logps/rejected": -237.2650604248047,
"loss": 0.3655,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.1983695775270462,
"rewards/margins": 1.2847638130187988,
"rewards/rejected": -1.4831334352493286,
"step": 2080
},
{
"epoch": 2.16,
"learning_rate": 1.5575966322234978e-07,
"logits/chosen": 23.234739303588867,
"logits/rejected": 22.95262336730957,
"logps/chosen": -360.20367431640625,
"logps/rejected": -292.31317138671875,
"loss": 0.3307,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.17867961525917053,
"rewards/margins": 1.6552801132202148,
"rewards/rejected": -1.8339598178863525,
"step": 2090
},
{
"epoch": 2.17,
"learning_rate": 1.5384615384615385e-07,
"logits/chosen": 23.25923728942871,
"logits/rejected": 23.007633209228516,
"logps/chosen": -305.6047058105469,
"logps/rejected": -246.4465789794922,
"loss": 0.3547,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.2684822380542755,
"rewards/margins": 1.560947060585022,
"rewards/rejected": -1.829429268836975,
"step": 2100
},
{
"epoch": 2.17,
"eval_logits/chosen": 23.273019790649414,
"eval_logits/rejected": 23.114229202270508,
"eval_logps/chosen": -358.821044921875,
"eval_logps/rejected": -289.5934753417969,
"eval_loss": 0.49868160486221313,
"eval_rewards/accuracies": 0.761904776096344,
"eval_rewards/chosen": -0.42508772015571594,
"eval_rewards/margins": 1.125916838645935,
"eval_rewards/rejected": -1.5510046482086182,
"eval_runtime": 211.1219,
"eval_samples_per_second": 9.473,
"eval_steps_per_second": 0.298,
"step": 2100
},
{
"epoch": 2.18,
"learning_rate": 1.519326444699579e-07,
"logits/chosen": 23.230274200439453,
"logits/rejected": 23.15807342529297,
"logps/chosen": -334.8540954589844,
"logps/rejected": -263.5167236328125,
"loss": 0.3289,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.251259982585907,
"rewards/margins": 1.5572983026504517,
"rewards/rejected": -1.8085582256317139,
"step": 2110
},
{
"epoch": 2.19,
"learning_rate": 1.5001913509376197e-07,
"logits/chosen": 23.217041015625,
"logits/rejected": 23.1701717376709,
"logps/chosen": -330.7687683105469,
"logps/rejected": -285.98907470703125,
"loss": 0.3463,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.2551344037055969,
"rewards/margins": 1.4968717098236084,
"rewards/rejected": -1.7520062923431396,
"step": 2120
},
{
"epoch": 2.2,
"learning_rate": 1.4810562571756603e-07,
"logits/chosen": 23.419551849365234,
"logits/rejected": 23.216039657592773,
"logps/chosen": -312.8890380859375,
"logps/rejected": -267.16729736328125,
"loss": 0.3688,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.34911391139030457,
"rewards/margins": 1.548995018005371,
"rewards/rejected": -1.8981088399887085,
"step": 2130
},
{
"epoch": 2.21,
"learning_rate": 1.4619211634137007e-07,
"logits/chosen": 22.947824478149414,
"logits/rejected": 22.82015037536621,
"logps/chosen": -313.2303771972656,
"logps/rejected": -252.9609375,
"loss": 0.3447,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.2606348693370819,
"rewards/margins": 1.5146172046661377,
"rewards/rejected": -1.7752519845962524,
"step": 2140
},
{
"epoch": 2.22,
"learning_rate": 1.4427860696517413e-07,
"logits/chosen": 23.21828269958496,
"logits/rejected": 23.22684097290039,
"logps/chosen": -369.90924072265625,
"logps/rejected": -314.48016357421875,
"loss": 0.3561,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.12711670994758606,
"rewards/margins": 1.666666030883789,
"rewards/rejected": -1.7937828302383423,
"step": 2150
},
{
"epoch": 2.23,
"learning_rate": 1.423650975889782e-07,
"logits/chosen": 23.231754302978516,
"logits/rejected": 23.19542694091797,
"logps/chosen": -351.79913330078125,
"logps/rejected": -271.44427490234375,
"loss": 0.3303,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.14719603955745697,
"rewards/margins": 1.6395785808563232,
"rewards/rejected": -1.7867748737335205,
"step": 2160
},
{
"epoch": 2.24,
"learning_rate": 1.4045158821278225e-07,
"logits/chosen": 22.95505142211914,
"logits/rejected": 22.855873107910156,
"logps/chosen": -340.75238037109375,
"logps/rejected": -316.02386474609375,
"loss": 0.3293,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.26123401522636414,
"rewards/margins": 1.4234856367111206,
"rewards/rejected": -1.6847198009490967,
"step": 2170
},
{
"epoch": 2.25,
"learning_rate": 1.3853807883658632e-07,
"logits/chosen": 23.029155731201172,
"logits/rejected": 23.120052337646484,
"logps/chosen": -371.22100830078125,
"logps/rejected": -319.261474609375,
"loss": 0.3694,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.3324764668941498,
"rewards/margins": 1.6106780767440796,
"rewards/rejected": -1.9431545734405518,
"step": 2180
},
{
"epoch": 2.26,
"learning_rate": 1.3662456946039035e-07,
"logits/chosen": 23.185413360595703,
"logits/rejected": 22.925167083740234,
"logps/chosen": -305.14410400390625,
"logps/rejected": -296.2259521484375,
"loss": 0.3664,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.40003857016563416,
"rewards/margins": 1.3513580560684204,
"rewards/rejected": -1.7513965368270874,
"step": 2190
},
{
"epoch": 2.27,
"learning_rate": 1.3471106008419441e-07,
"logits/chosen": 23.18272590637207,
"logits/rejected": 23.115558624267578,
"logps/chosen": -306.1884460449219,
"logps/rejected": -288.0213928222656,
"loss": 0.3468,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.359417200088501,
"rewards/margins": 1.435274362564087,
"rewards/rejected": -1.7946914434432983,
"step": 2200
},
{
"epoch": 2.27,
"eval_logits/chosen": 23.256072998046875,
"eval_logits/rejected": 23.099788665771484,
"eval_logps/chosen": -357.2442932128906,
"eval_logps/rejected": -288.0285339355469,
"eval_loss": 0.49792206287384033,
"eval_rewards/accuracies": 0.7777777910232544,
"eval_rewards/chosen": -0.26741600036621094,
"eval_rewards/margins": 1.1270908117294312,
"eval_rewards/rejected": -1.394506812095642,
"eval_runtime": 210.9966,
"eval_samples_per_second": 9.479,
"eval_steps_per_second": 0.299,
"step": 2200
},
{
"epoch": 2.28,
"learning_rate": 1.3279755070799848e-07,
"logits/chosen": 23.182937622070312,
"logits/rejected": 23.00518035888672,
"logps/chosen": -325.1390075683594,
"logps/rejected": -254.9105224609375,
"loss": 0.3562,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.2701284885406494,
"rewards/margins": 1.4486573934555054,
"rewards/rejected": -1.7187858819961548,
"step": 2210
},
{
"epoch": 2.29,
"learning_rate": 1.3088404133180254e-07,
"logits/chosen": 23.192270278930664,
"logits/rejected": 22.86314582824707,
"logps/chosen": -354.3294677734375,
"logps/rejected": -277.08319091796875,
"loss": 0.3275,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.3135210871696472,
"rewards/margins": 1.6535711288452148,
"rewards/rejected": -1.9670922756195068,
"step": 2220
},
{
"epoch": 2.3,
"learning_rate": 1.289705319556066e-07,
"logits/chosen": 23.285938262939453,
"logits/rejected": 23.20859146118164,
"logps/chosen": -341.8558044433594,
"logps/rejected": -260.49853515625,
"loss": 0.3339,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.49324899911880493,
"rewards/margins": 1.3661364316940308,
"rewards/rejected": -1.8593854904174805,
"step": 2230
},
{
"epoch": 2.31,
"learning_rate": 1.2705702257941064e-07,
"logits/chosen": 23.054574966430664,
"logits/rejected": 22.94180679321289,
"logps/chosen": -314.9513244628906,
"logps/rejected": -253.89779663085938,
"loss": 0.3683,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.5942099690437317,
"rewards/margins": 1.3034783601760864,
"rewards/rejected": -1.8976882696151733,
"step": 2240
},
{
"epoch": 2.32,
"learning_rate": 1.251435132032147e-07,
"logits/chosen": 23.040285110473633,
"logits/rejected": 23.092458724975586,
"logps/chosen": -347.84820556640625,
"logps/rejected": -300.02069091796875,
"loss": 0.3424,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.3747033476829529,
"rewards/margins": 1.4116895198822021,
"rewards/rejected": -1.7863928079605103,
"step": 2250
},
{
"epoch": 2.33,
"learning_rate": 1.2323000382701873e-07,
"logits/chosen": 23.061431884765625,
"logits/rejected": 22.941814422607422,
"logps/chosen": -363.65850830078125,
"logps/rejected": -299.712890625,
"loss": 0.3701,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.34447842836380005,
"rewards/margins": 1.6744133234024048,
"rewards/rejected": -2.0188918113708496,
"step": 2260
},
{
"epoch": 2.34,
"learning_rate": 1.213164944508228e-07,
"logits/chosen": 23.18351936340332,
"logits/rejected": 23.060955047607422,
"logps/chosen": -386.75701904296875,
"logps/rejected": -311.6101379394531,
"loss": 0.3375,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.2950562834739685,
"rewards/margins": 1.5026452541351318,
"rewards/rejected": -1.7977014780044556,
"step": 2270
},
{
"epoch": 2.35,
"learning_rate": 1.1940298507462686e-07,
"logits/chosen": 23.240421295166016,
"logits/rejected": 23.235857009887695,
"logps/chosen": -306.81561279296875,
"logps/rejected": -249.85324096679688,
"loss": 0.3425,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.47128137946128845,
"rewards/margins": 1.526531457901001,
"rewards/rejected": -1.9978128671646118,
"step": 2280
},
{
"epoch": 2.36,
"learning_rate": 1.1748947569843092e-07,
"logits/chosen": 23.31965446472168,
"logits/rejected": 23.00503158569336,
"logps/chosen": -366.4286193847656,
"logps/rejected": -287.65399169921875,
"loss": 0.3404,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.20483896136283875,
"rewards/margins": 1.4863460063934326,
"rewards/rejected": -1.6911849975585938,
"step": 2290
},
{
"epoch": 2.37,
"learning_rate": 1.1557596632223497e-07,
"logits/chosen": 23.109445571899414,
"logits/rejected": 23.121734619140625,
"logps/chosen": -339.75689697265625,
"logps/rejected": -266.4504699707031,
"loss": 0.3432,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.32633692026138306,
"rewards/margins": 1.3159123659133911,
"rewards/rejected": -1.642249345779419,
"step": 2300
},
{
"epoch": 2.37,
"eval_logits/chosen": 23.223342895507812,
"eval_logits/rejected": 23.0726318359375,
"eval_logps/chosen": -358.362060546875,
"eval_logps/rejected": -288.7130126953125,
"eval_loss": 0.5026321411132812,
"eval_rewards/accuracies": 0.773809552192688,
"eval_rewards/chosen": -0.3791937828063965,
"eval_rewards/margins": 1.0837651491165161,
"eval_rewards/rejected": -1.4629590511322021,
"eval_runtime": 212.4288,
"eval_samples_per_second": 9.415,
"eval_steps_per_second": 0.297,
"step": 2300
},
{
"epoch": 2.39,
"learning_rate": 1.1366245694603903e-07,
"logits/chosen": 22.923072814941406,
"logits/rejected": 22.96480369567871,
"logps/chosen": -329.4259338378906,
"logps/rejected": -294.24127197265625,
"loss": 0.3706,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.3439275026321411,
"rewards/margins": 1.2762069702148438,
"rewards/rejected": -1.6201345920562744,
"step": 2310
},
{
"epoch": 2.4,
"learning_rate": 1.1174894756984308e-07,
"logits/chosen": 23.266117095947266,
"logits/rejected": 23.19167137145996,
"logps/chosen": -274.62237548828125,
"logps/rejected": -247.6970672607422,
"loss": 0.3588,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.4183998107910156,
"rewards/margins": 1.5969486236572266,
"rewards/rejected": -2.015348434448242,
"step": 2320
},
{
"epoch": 2.41,
"learning_rate": 1.0983543819364714e-07,
"logits/chosen": 23.18532943725586,
"logits/rejected": 23.074344635009766,
"logps/chosen": -333.7981262207031,
"logps/rejected": -262.7727966308594,
"loss": 0.3035,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.3411490321159363,
"rewards/margins": 1.6993077993392944,
"rewards/rejected": -2.040456771850586,
"step": 2330
},
{
"epoch": 2.42,
"learning_rate": 1.079219288174512e-07,
"logits/chosen": 22.650278091430664,
"logits/rejected": 22.700809478759766,
"logps/chosen": -256.5923156738281,
"logps/rejected": -267.3676452636719,
"loss": 0.3527,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.5605247616767883,
"rewards/margins": 1.3785268068313599,
"rewards/rejected": -1.939051628112793,
"step": 2340
},
{
"epoch": 2.43,
"learning_rate": 1.0600841944125525e-07,
"logits/chosen": 23.22505760192871,
"logits/rejected": 23.044265747070312,
"logps/chosen": -377.84765625,
"logps/rejected": -308.9931945800781,
"loss": 0.3613,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.2691905200481415,
"rewards/margins": 1.7195707559585571,
"rewards/rejected": -1.988761305809021,
"step": 2350
},
{
"epoch": 2.44,
"learning_rate": 1.0409491006505931e-07,
"logits/chosen": 23.209278106689453,
"logits/rejected": 23.045013427734375,
"logps/chosen": -341.085205078125,
"logps/rejected": -284.80987548828125,
"loss": 0.3373,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.32976096868515015,
"rewards/margins": 1.4313738346099854,
"rewards/rejected": -1.7611347436904907,
"step": 2360
},
{
"epoch": 2.45,
"learning_rate": 1.0218140068886336e-07,
"logits/chosen": 22.989057540893555,
"logits/rejected": 22.930797576904297,
"logps/chosen": -349.1778564453125,
"logps/rejected": -276.46905517578125,
"loss": 0.351,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.3507656157016754,
"rewards/margins": 1.4063034057617188,
"rewards/rejected": -1.7570692300796509,
"step": 2370
},
{
"epoch": 2.46,
"learning_rate": 1.0026789131266743e-07,
"logits/chosen": 22.69498062133789,
"logits/rejected": 22.762619018554688,
"logps/chosen": -319.37701416015625,
"logps/rejected": -285.0171813964844,
"loss": 0.3298,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.41146141290664673,
"rewards/margins": 1.4631173610687256,
"rewards/rejected": -1.874578833580017,
"step": 2380
},
{
"epoch": 2.47,
"learning_rate": 9.835438193647149e-08,
"logits/chosen": 22.913543701171875,
"logits/rejected": 22.7869873046875,
"logps/chosen": -328.5509948730469,
"logps/rejected": -253.9607696533203,
"loss": 0.3565,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.4099615216255188,
"rewards/margins": 1.3734912872314453,
"rewards/rejected": -1.7834527492523193,
"step": 2390
},
{
"epoch": 2.48,
"learning_rate": 9.644087256027554e-08,
"logits/chosen": 23.1809139251709,
"logits/rejected": 23.06944465637207,
"logps/chosen": -286.20904541015625,
"logps/rejected": -235.44140625,
"loss": 0.324,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.5443722009658813,
"rewards/margins": 1.3517777919769287,
"rewards/rejected": -1.8961498737335205,
"step": 2400
},
{
"epoch": 2.48,
"eval_logits/chosen": 23.200559616088867,
"eval_logits/rejected": 23.054319381713867,
"eval_logps/chosen": -359.46197509765625,
"eval_logps/rejected": -290.1737060546875,
"eval_loss": 0.5021990537643433,
"eval_rewards/accuracies": 0.7698412537574768,
"eval_rewards/chosen": -0.48918139934539795,
"eval_rewards/margins": 1.1198451519012451,
"eval_rewards/rejected": -1.609026551246643,
"eval_runtime": 211.6095,
"eval_samples_per_second": 9.451,
"eval_steps_per_second": 0.298,
"step": 2400
},
{
"epoch": 2.49,
"learning_rate": 9.45273631840796e-08,
"logits/chosen": 22.888259887695312,
"logits/rejected": 22.884002685546875,
"logps/chosen": -351.7564392089844,
"logps/rejected": -299.20611572265625,
"loss": 0.3645,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.16312038898468018,
"rewards/margins": 1.5724434852600098,
"rewards/rejected": -1.73556387424469,
"step": 2410
},
{
"epoch": 2.5,
"learning_rate": 9.261385380788366e-08,
"logits/chosen": 23.231523513793945,
"logits/rejected": 23.333255767822266,
"logps/chosen": -338.1724548339844,
"logps/rejected": -284.2950744628906,
"loss": 0.3772,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.48969048261642456,
"rewards/margins": 1.1433426141738892,
"rewards/rejected": -1.633033037185669,
"step": 2420
},
{
"epoch": 2.51,
"learning_rate": 9.070034443168771e-08,
"logits/chosen": 23.16311264038086,
"logits/rejected": 22.952455520629883,
"logps/chosen": -304.960205078125,
"logps/rejected": -273.458251953125,
"loss": 0.3659,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.46648114919662476,
"rewards/margins": 1.2427117824554443,
"rewards/rejected": -1.7091929912567139,
"step": 2430
},
{
"epoch": 2.52,
"learning_rate": 8.878683505549177e-08,
"logits/chosen": 23.104694366455078,
"logits/rejected": 23.080604553222656,
"logps/chosen": -287.4930419921875,
"logps/rejected": -265.7315368652344,
"loss": 0.3395,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.0566287636756897,
"rewards/margins": 1.7359685897827148,
"rewards/rejected": -1.7925974130630493,
"step": 2440
},
{
"epoch": 2.53,
"learning_rate": 8.687332567929582e-08,
"logits/chosen": 22.983219146728516,
"logits/rejected": 23.094844818115234,
"logps/chosen": -316.54547119140625,
"logps/rejected": -292.8838806152344,
"loss": 0.3579,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.4759410321712494,
"rewards/margins": 1.2076809406280518,
"rewards/rejected": -1.6836220026016235,
"step": 2450
},
{
"epoch": 2.54,
"learning_rate": 8.495981630309988e-08,
"logits/chosen": 22.804758071899414,
"logits/rejected": 22.753246307373047,
"logps/chosen": -366.3809509277344,
"logps/rejected": -290.44805908203125,
"loss": 0.3613,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.37290042638778687,
"rewards/margins": 1.3757580518722534,
"rewards/rejected": -1.7486584186553955,
"step": 2460
},
{
"epoch": 2.55,
"learning_rate": 8.304630692690395e-08,
"logits/chosen": 22.425283432006836,
"logits/rejected": 22.801471710205078,
"logps/chosen": -316.58416748046875,
"logps/rejected": -268.90423583984375,
"loss": 0.3686,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.44941702485084534,
"rewards/margins": 1.3106696605682373,
"rewards/rejected": -1.7600864171981812,
"step": 2470
},
{
"epoch": 2.56,
"learning_rate": 8.1132797550708e-08,
"logits/chosen": 23.177642822265625,
"logits/rejected": 23.025049209594727,
"logps/chosen": -345.25958251953125,
"logps/rejected": -269.18951416015625,
"loss": 0.3337,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.33066803216934204,
"rewards/margins": 1.614101767539978,
"rewards/rejected": -1.9447696208953857,
"step": 2480
},
{
"epoch": 2.57,
"learning_rate": 7.921928817451206e-08,
"logits/chosen": 23.251216888427734,
"logits/rejected": 23.127471923828125,
"logps/chosen": -321.37078857421875,
"logps/rejected": -251.3941192626953,
"loss": 0.3158,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.3629501461982727,
"rewards/margins": 1.3023030757904053,
"rewards/rejected": -1.6652530431747437,
"step": 2490
},
{
"epoch": 2.58,
"learning_rate": 7.73057787983161e-08,
"logits/chosen": 23.11884880065918,
"logits/rejected": 23.025787353515625,
"logps/chosen": -372.3116760253906,
"logps/rejected": -297.9188232421875,
"loss": 0.3556,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.08432115614414215,
"rewards/margins": 1.5375818014144897,
"rewards/rejected": -1.6219028234481812,
"step": 2500
},
{
"epoch": 2.58,
"eval_logits/chosen": 23.198068618774414,
"eval_logits/rejected": 23.05204963684082,
"eval_logps/chosen": -359.8403625488281,
"eval_logps/rejected": -290.6595458984375,
"eval_loss": 0.5010030269622803,
"eval_rewards/accuracies": 0.7817460298538208,
"eval_rewards/chosen": -0.5270243287086487,
"eval_rewards/margins": 1.130587100982666,
"eval_rewards/rejected": -1.6576114892959595,
"eval_runtime": 208.0836,
"eval_samples_per_second": 9.612,
"eval_steps_per_second": 0.303,
"step": 2500
},
{
"epoch": 2.59,
"learning_rate": 7.539226942212017e-08,
"logits/chosen": 22.56097412109375,
"logits/rejected": 22.520360946655273,
"logps/chosen": -299.781982421875,
"logps/rejected": -319.82171630859375,
"loss": 0.3419,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.4395579397678375,
"rewards/margins": 1.4742848873138428,
"rewards/rejected": -1.9138429164886475,
"step": 2510
},
{
"epoch": 2.6,
"learning_rate": 7.347876004592423e-08,
"logits/chosen": 23.056285858154297,
"logits/rejected": 22.88377571105957,
"logps/chosen": -274.74249267578125,
"logps/rejected": -217.62075805664062,
"loss": 0.3617,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.48249778151512146,
"rewards/margins": 1.2477834224700928,
"rewards/rejected": -1.730281114578247,
"step": 2520
},
{
"epoch": 2.61,
"learning_rate": 7.156525066972828e-08,
"logits/chosen": 23.401355743408203,
"logits/rejected": 23.3753604888916,
"logps/chosen": -305.857666015625,
"logps/rejected": -269.84344482421875,
"loss": 0.3566,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.5294305086135864,
"rewards/margins": 1.2906124591827393,
"rewards/rejected": -1.8200428485870361,
"step": 2530
},
{
"epoch": 2.62,
"learning_rate": 6.965174129353234e-08,
"logits/chosen": 23.210468292236328,
"logits/rejected": 23.068767547607422,
"logps/chosen": -418.8457946777344,
"logps/rejected": -322.9905700683594,
"loss": 0.3506,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.12323112785816193,
"rewards/margins": 1.8070284128189087,
"rewards/rejected": -1.9302597045898438,
"step": 2540
},
{
"epoch": 2.63,
"learning_rate": 6.773823191733639e-08,
"logits/chosen": 23.159671783447266,
"logits/rejected": 23.031639099121094,
"logps/chosen": -341.6581726074219,
"logps/rejected": -329.7276611328125,
"loss": 0.3489,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.43813997507095337,
"rewards/margins": 1.6730334758758545,
"rewards/rejected": -2.111173152923584,
"step": 2550
},
{
"epoch": 2.64,
"learning_rate": 6.582472254114045e-08,
"logits/chosen": 22.655208587646484,
"logits/rejected": 22.458200454711914,
"logps/chosen": -298.61737060546875,
"logps/rejected": -273.9313659667969,
"loss": 0.3689,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.464535653591156,
"rewards/margins": 1.3690606355667114,
"rewards/rejected": -1.8335964679718018,
"step": 2560
},
{
"epoch": 2.65,
"learning_rate": 6.391121316494451e-08,
"logits/chosen": 23.089435577392578,
"logits/rejected": 23.145009994506836,
"logps/chosen": -347.81793212890625,
"logps/rejected": -289.2439880371094,
"loss": 0.3245,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.26666170358657837,
"rewards/margins": 1.7196756601333618,
"rewards/rejected": -1.9863373041152954,
"step": 2570
},
{
"epoch": 2.66,
"learning_rate": 6.199770378874856e-08,
"logits/chosen": 23.03936767578125,
"logits/rejected": 22.83783531188965,
"logps/chosen": -326.26123046875,
"logps/rejected": -285.95294189453125,
"loss": 0.3369,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.3959008455276489,
"rewards/margins": 1.5528860092163086,
"rewards/rejected": -1.948786735534668,
"step": 2580
},
{
"epoch": 2.67,
"learning_rate": 6.008419441255262e-08,
"logits/chosen": 22.73525047302246,
"logits/rejected": 22.791269302368164,
"logps/chosen": -291.1043395996094,
"logps/rejected": -250.67880249023438,
"loss": 0.3344,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.7139278650283813,
"rewards/margins": 1.2345958948135376,
"rewards/rejected": -1.948523759841919,
"step": 2590
},
{
"epoch": 2.68,
"learning_rate": 5.817068503635668e-08,
"logits/chosen": 23.139039993286133,
"logits/rejected": 23.094404220581055,
"logps/chosen": -375.664794921875,
"logps/rejected": -297.1061706542969,
"loss": 0.3277,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.29812633991241455,
"rewards/margins": 1.5680840015411377,
"rewards/rejected": -1.8662105798721313,
"step": 2600
},
{
"epoch": 2.68,
"eval_logits/chosen": 23.19009780883789,
"eval_logits/rejected": 23.044872283935547,
"eval_logps/chosen": -359.9708251953125,
"eval_logps/rejected": -290.89959716796875,
"eval_loss": 0.49901142716407776,
"eval_rewards/accuracies": 0.7777777910232544,
"eval_rewards/chosen": -0.5400659441947937,
"eval_rewards/margins": 1.141547679901123,
"eval_rewards/rejected": -1.6816134452819824,
"eval_runtime": 212.6416,
"eval_samples_per_second": 9.405,
"eval_steps_per_second": 0.296,
"step": 2600
},
{
"epoch": 2.69,
"learning_rate": 5.6257175660160735e-08,
"logits/chosen": 23.452491760253906,
"logits/rejected": 23.291522979736328,
"logps/chosen": -321.8898010253906,
"logps/rejected": -302.38250732421875,
"loss": 0.3198,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.5151566863059998,
"rewards/margins": 1.5868747234344482,
"rewards/rejected": -2.1020312309265137,
"step": 2610
},
{
"epoch": 2.71,
"learning_rate": 5.4343666283964784e-08,
"logits/chosen": 22.93613052368164,
"logits/rejected": 23.02700424194336,
"logps/chosen": -337.6763000488281,
"logps/rejected": -263.98406982421875,
"loss": 0.3544,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.38289493322372437,
"rewards/margins": 1.6999647617340088,
"rewards/rejected": -2.0828592777252197,
"step": 2620
},
{
"epoch": 2.72,
"learning_rate": 5.243015690776884e-08,
"logits/chosen": 23.027408599853516,
"logits/rejected": 23.09657096862793,
"logps/chosen": -300.5941162109375,
"logps/rejected": -263.323486328125,
"loss": 0.3481,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.6709809899330139,
"rewards/margins": 1.2178490161895752,
"rewards/rejected": -1.8888299465179443,
"step": 2630
},
{
"epoch": 2.73,
"learning_rate": 5.05166475315729e-08,
"logits/chosen": 23.28525161743164,
"logits/rejected": 23.195045471191406,
"logps/chosen": -272.472900390625,
"logps/rejected": -262.8435974121094,
"loss": 0.3379,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.8382778167724609,
"rewards/margins": 1.3932462930679321,
"rewards/rejected": -2.2315242290496826,
"step": 2640
},
{
"epoch": 2.74,
"learning_rate": 4.860313815537696e-08,
"logits/chosen": 22.973094940185547,
"logits/rejected": 22.961816787719727,
"logps/chosen": -367.30596923828125,
"logps/rejected": -294.1488952636719,
"loss": 0.3489,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.19501671195030212,
"rewards/margins": 1.6359647512435913,
"rewards/rejected": -1.8309814929962158,
"step": 2650
},
{
"epoch": 2.75,
"learning_rate": 4.668962877918101e-08,
"logits/chosen": 23.02678680419922,
"logits/rejected": 22.784521102905273,
"logps/chosen": -329.62030029296875,
"logps/rejected": -373.6632080078125,
"loss": 0.3306,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.4452723562717438,
"rewards/margins": 1.5809751749038696,
"rewards/rejected": -2.026247501373291,
"step": 2660
},
{
"epoch": 2.76,
"learning_rate": 4.477611940298507e-08,
"logits/chosen": 22.902379989624023,
"logits/rejected": 22.912425994873047,
"logps/chosen": -332.11102294921875,
"logps/rejected": -280.44976806640625,
"loss": 0.3247,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.35678499937057495,
"rewards/margins": 1.8047094345092773,
"rewards/rejected": -2.161494493484497,
"step": 2670
},
{
"epoch": 2.77,
"learning_rate": 4.2862610026789124e-08,
"logits/chosen": 23.08858871459961,
"logits/rejected": 22.95041275024414,
"logps/chosen": -337.7413330078125,
"logps/rejected": -293.63623046875,
"loss": 0.3618,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.3582797646522522,
"rewards/margins": 1.579685091972351,
"rewards/rejected": -1.9379650354385376,
"step": 2680
},
{
"epoch": 2.78,
"learning_rate": 4.0949100650593186e-08,
"logits/chosen": 22.903911590576172,
"logits/rejected": 22.945873260498047,
"logps/chosen": -272.99493408203125,
"logps/rejected": -277.4879455566406,
"loss": 0.3657,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.6198464035987854,
"rewards/margins": 1.3183465003967285,
"rewards/rejected": -1.9381929636001587,
"step": 2690
},
{
"epoch": 2.79,
"learning_rate": 3.903559127439724e-08,
"logits/chosen": 23.07727813720703,
"logits/rejected": 22.927719116210938,
"logps/chosen": -282.8468933105469,
"logps/rejected": -237.0935821533203,
"loss": 0.3262,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.40676426887512207,
"rewards/margins": 1.3733875751495361,
"rewards/rejected": -1.7801517248153687,
"step": 2700
},
{
"epoch": 2.79,
"eval_logits/chosen": 23.187774658203125,
"eval_logits/rejected": 23.043867111206055,
"eval_logps/chosen": -359.5220031738281,
"eval_logps/rejected": -290.49322509765625,
"eval_loss": 0.4993184804916382,
"eval_rewards/accuracies": 0.7777777910232544,
"eval_rewards/chosen": -0.4951845407485962,
"eval_rewards/margins": 1.1457940340042114,
"eval_rewards/rejected": -1.6409783363342285,
"eval_runtime": 210.8376,
"eval_samples_per_second": 9.486,
"eval_steps_per_second": 0.299,
"step": 2700
},
{
"epoch": 2.8,
"learning_rate": 3.71220818982013e-08,
"logits/chosen": 23.060585021972656,
"logits/rejected": 22.836994171142578,
"logps/chosen": -348.59539794921875,
"logps/rejected": -282.60064697265625,
"loss": 0.3585,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.4623526632785797,
"rewards/margins": 1.4493197202682495,
"rewards/rejected": -1.911672592163086,
"step": 2710
},
{
"epoch": 2.81,
"learning_rate": 3.520857252200535e-08,
"logits/chosen": 23.332260131835938,
"logits/rejected": 23.22934341430664,
"logps/chosen": -373.9750061035156,
"logps/rejected": -321.8055725097656,
"loss": 0.334,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.24776187539100647,
"rewards/margins": 1.6318897008895874,
"rewards/rejected": -1.879651427268982,
"step": 2720
},
{
"epoch": 2.82,
"learning_rate": 3.3295063145809414e-08,
"logits/chosen": 23.10513687133789,
"logits/rejected": 23.070053100585938,
"logps/chosen": -295.7916259765625,
"logps/rejected": -298.29132080078125,
"loss": 0.3567,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.4222725033760071,
"rewards/margins": 1.4338531494140625,
"rewards/rejected": -1.8561254739761353,
"step": 2730
},
{
"epoch": 2.83,
"learning_rate": 3.138155376961347e-08,
"logits/chosen": 22.993267059326172,
"logits/rejected": 22.975433349609375,
"logps/chosen": -340.28515625,
"logps/rejected": -270.3987731933594,
"loss": 0.3505,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.4524230360984802,
"rewards/margins": 1.500270962715149,
"rewards/rejected": -1.9526941776275635,
"step": 2740
},
{
"epoch": 2.84,
"learning_rate": 2.9468044393417525e-08,
"logits/chosen": 22.807130813598633,
"logits/rejected": 22.657257080078125,
"logps/chosen": -302.7679748535156,
"logps/rejected": -253.1012420654297,
"loss": 0.3457,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.3587990403175354,
"rewards/margins": 1.4861528873443604,
"rewards/rejected": -1.8449519872665405,
"step": 2750
},
{
"epoch": 2.85,
"learning_rate": 2.755453501722158e-08,
"logits/chosen": 22.716732025146484,
"logits/rejected": 22.806201934814453,
"logps/chosen": -340.51287841796875,
"logps/rejected": -296.96673583984375,
"loss": 0.3386,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.4456048011779785,
"rewards/margins": 1.4643114805221558,
"rewards/rejected": -1.9099165201187134,
"step": 2760
},
{
"epoch": 2.86,
"learning_rate": 2.564102564102564e-08,
"logits/chosen": 23.153486251831055,
"logits/rejected": 23.201038360595703,
"logps/chosen": -308.52288818359375,
"logps/rejected": -289.1993408203125,
"loss": 0.3403,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.3693477213382721,
"rewards/margins": 1.4097144603729248,
"rewards/rejected": -1.779062032699585,
"step": 2770
},
{
"epoch": 2.87,
"learning_rate": 2.3727516264829695e-08,
"logits/chosen": 22.77389907836914,
"logits/rejected": 22.64432144165039,
"logps/chosen": -388.85552978515625,
"logps/rejected": -363.8034362792969,
"loss": 0.3601,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.32134681940078735,
"rewards/margins": 1.4628154039382935,
"rewards/rejected": -1.784161925315857,
"step": 2780
},
{
"epoch": 2.88,
"learning_rate": 2.1814006888633754e-08,
"logits/chosen": 22.792617797851562,
"logits/rejected": 22.739648818969727,
"logps/chosen": -340.0162353515625,
"logps/rejected": -269.2567443847656,
"loss": 0.3476,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.3537456691265106,
"rewards/margins": 1.403322696685791,
"rewards/rejected": -1.7570682764053345,
"step": 2790
},
{
"epoch": 2.89,
"learning_rate": 1.990049751243781e-08,
"logits/chosen": 23.177873611450195,
"logits/rejected": 23.13758087158203,
"logps/chosen": -343.5755615234375,
"logps/rejected": -288.75555419921875,
"loss": 0.3566,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.39105096459388733,
"rewards/margins": 1.0802559852600098,
"rewards/rejected": -1.4713070392608643,
"step": 2800
},
{
"epoch": 2.89,
"eval_logits/chosen": 23.187063217163086,
"eval_logits/rejected": 23.043275833129883,
"eval_logps/chosen": -359.0445251464844,
"eval_logps/rejected": -290.0010070800781,
"eval_loss": 0.4985302686691284,
"eval_rewards/accuracies": 0.7777777910232544,
"eval_rewards/chosen": -0.44743794202804565,
"eval_rewards/margins": 1.144317388534546,
"eval_rewards/rejected": -1.5917555093765259,
"eval_runtime": 208.7121,
"eval_samples_per_second": 9.583,
"eval_steps_per_second": 0.302,
"step": 2800
},
{
"epoch": 2.9,
"learning_rate": 1.7986988136241865e-08,
"logits/chosen": 22.997955322265625,
"logits/rejected": 23.055164337158203,
"logps/chosen": -362.84918212890625,
"logps/rejected": -298.51922607421875,
"loss": 0.3433,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.2676233649253845,
"rewards/margins": 1.3830516338348389,
"rewards/rejected": -1.650674819946289,
"step": 2810
},
{
"epoch": 2.91,
"learning_rate": 1.6073478760045924e-08,
"logits/chosen": 23.13959312438965,
"logits/rejected": 22.91689682006836,
"logps/chosen": -358.01666259765625,
"logps/rejected": -246.228515625,
"loss": 0.3319,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.24576838314533234,
"rewards/margins": 1.788368582725525,
"rewards/rejected": -2.03413724899292,
"step": 2820
},
{
"epoch": 2.92,
"learning_rate": 1.4159969383849981e-08,
"logits/chosen": 22.98459243774414,
"logits/rejected": 23.025390625,
"logps/chosen": -346.28973388671875,
"logps/rejected": -279.1742858886719,
"loss": 0.3331,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.40038982033729553,
"rewards/margins": 1.4257802963256836,
"rewards/rejected": -1.8261702060699463,
"step": 2830
},
{
"epoch": 2.93,
"learning_rate": 1.2246460007654037e-08,
"logits/chosen": 23.19771957397461,
"logits/rejected": 23.1368408203125,
"logps/chosen": -349.0854187011719,
"logps/rejected": -281.1717529296875,
"loss": 0.3685,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.481764018535614,
"rewards/margins": 1.4477870464324951,
"rewards/rejected": -1.929551124572754,
"step": 2840
},
{
"epoch": 2.94,
"learning_rate": 1.0332950631458094e-08,
"logits/chosen": 23.161651611328125,
"logits/rejected": 23.000064849853516,
"logps/chosen": -338.02288818359375,
"logps/rejected": -283.4983215332031,
"loss": 0.3501,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.45347729325294495,
"rewards/margins": 1.2738498449325562,
"rewards/rejected": -1.7273271083831787,
"step": 2850
},
{
"epoch": 2.95,
"learning_rate": 8.419441255262151e-09,
"logits/chosen": 22.9562931060791,
"logits/rejected": 22.93158531188965,
"logps/chosen": -301.29132080078125,
"logps/rejected": -239.3927001953125,
"loss": 0.3382,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.4900715947151184,
"rewards/margins": 1.371382713317871,
"rewards/rejected": -1.8614543676376343,
"step": 2860
},
{
"epoch": 2.96,
"learning_rate": 6.505931879066207e-09,
"logits/chosen": 22.94473648071289,
"logits/rejected": 23.008617401123047,
"logps/chosen": -302.16436767578125,
"logps/rejected": -269.48828125,
"loss": 0.3549,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.308788925409317,
"rewards/margins": 1.3573650121688843,
"rewards/rejected": -1.666154146194458,
"step": 2870
},
{
"epoch": 2.97,
"learning_rate": 4.592422502870264e-09,
"logits/chosen": 23.0222225189209,
"logits/rejected": 22.979480743408203,
"logps/chosen": -329.9389343261719,
"logps/rejected": -272.41351318359375,
"loss": 0.3559,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.35177531838417053,
"rewards/margins": 1.4292513132095337,
"rewards/rejected": -1.781026840209961,
"step": 2880
},
{
"epoch": 2.98,
"learning_rate": 2.6789131266743202e-09,
"logits/chosen": 23.136159896850586,
"logits/rejected": 23.02133560180664,
"logps/chosen": -328.77093505859375,
"logps/rejected": -275.63995361328125,
"loss": 0.3498,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.4196252226829529,
"rewards/margins": 1.4171664714813232,
"rewards/rejected": -1.8367916345596313,
"step": 2890
},
{
"epoch": 2.99,
"learning_rate": 7.654037504783773e-10,
"logits/chosen": 23.257701873779297,
"logits/rejected": 23.05466079711914,
"logps/chosen": -311.91217041015625,
"logps/rejected": -304.32501220703125,
"loss": 0.3386,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.36090317368507385,
"rewards/margins": 1.4512748718261719,
"rewards/rejected": -1.8121780157089233,
"step": 2900
},
{
"epoch": 2.99,
"eval_logits/chosen": 23.18655014038086,
"eval_logits/rejected": 23.042728424072266,
"eval_logps/chosen": -359.16790771484375,
"eval_logps/rejected": -290.12347412109375,
"eval_loss": 0.4982847273349762,
"eval_rewards/accuracies": 0.7817460298538208,
"eval_rewards/chosen": -0.4597766697406769,
"eval_rewards/margins": 1.144227385520935,
"eval_rewards/rejected": -1.6040042638778687,
"eval_runtime": 212.9399,
"eval_samples_per_second": 9.392,
"eval_steps_per_second": 0.296,
"step": 2900
},
{
"epoch": 3.0,
"step": 2904,
"total_flos": 0.0,
"train_loss": 0.446941960284861,
"train_runtime": 57869.3533,
"train_samples_per_second": 3.212,
"train_steps_per_second": 0.05
}
],
"logging_steps": 10,
"max_steps": 2904,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 0.0,
"trial_name": null,
"trial_params": null
}