IKEA-7b-UC-0 / trainer_state.json
weijie210's picture
Model save
a0b8529 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1388,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 3.5971223021582734e-09,
"logits/chosen": -2.8839163780212402,
"logits/rejected": -2.699483633041382,
"logps/chosen": -106.361572265625,
"logps/rejected": -50.8937873840332,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 3.597122302158273e-08,
"logits/chosen": -2.9716877937316895,
"logits/rejected": -2.8243343830108643,
"logps/chosen": -148.80015563964844,
"logps/rejected": -84.43142700195312,
"loss": 0.6918,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.006020313128829002,
"rewards/margins": 0.0030713342130184174,
"rewards/rejected": 0.0029489779844880104,
"step": 10
},
{
"epoch": 0.03,
"learning_rate": 7.194244604316546e-08,
"logits/chosen": -2.9206809997558594,
"logits/rejected": -2.7788352966308594,
"logps/chosen": -167.4009246826172,
"logps/rejected": -95.04873657226562,
"loss": 0.6525,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.06761552393436432,
"rewards/margins": 0.0887872725725174,
"rewards/rejected": -0.021171752363443375,
"step": 20
},
{
"epoch": 0.04,
"learning_rate": 1.0791366906474819e-07,
"logits/chosen": -2.907208204269409,
"logits/rejected": -2.7389509677886963,
"logps/chosen": -128.09487915039062,
"logps/rejected": -80.83646392822266,
"loss": 0.5577,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.2007008045911789,
"rewards/margins": 0.2701273560523987,
"rewards/rejected": -0.06942657381296158,
"step": 30
},
{
"epoch": 0.06,
"learning_rate": 1.4388489208633092e-07,
"logits/chosen": -2.9200387001037598,
"logits/rejected": -2.8407883644104004,
"logps/chosen": -148.62106323242188,
"logps/rejected": -105.0569839477539,
"loss": 0.3744,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6417331099510193,
"rewards/margins": 1.1058695316314697,
"rewards/rejected": -0.46413642168045044,
"step": 40
},
{
"epoch": 0.07,
"learning_rate": 1.7985611510791365e-07,
"logits/chosen": -2.7872376441955566,
"logits/rejected": -2.709198236465454,
"logps/chosen": -146.15286254882812,
"logps/rejected": -104.78489685058594,
"loss": 0.2995,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.24810883402824402,
"rewards/margins": 1.5657349824905396,
"rewards/rejected": -1.3176262378692627,
"step": 50
},
{
"epoch": 0.09,
"learning_rate": 2.1582733812949638e-07,
"logits/chosen": -2.8873581886291504,
"logits/rejected": -2.7115185260772705,
"logps/chosen": -146.1516571044922,
"logps/rejected": -108.72274017333984,
"loss": 0.1946,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5725937485694885,
"rewards/margins": 2.431591033935547,
"rewards/rejected": -1.8589973449707031,
"step": 60
},
{
"epoch": 0.1,
"learning_rate": 2.517985611510791e-07,
"logits/chosen": -2.838667392730713,
"logits/rejected": -2.7343862056732178,
"logps/chosen": -130.49063110351562,
"logps/rejected": -113.7320327758789,
"loss": 0.1533,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.05752415582537651,
"rewards/margins": 2.663848400115967,
"rewards/rejected": -2.6063244342803955,
"step": 70
},
{
"epoch": 0.12,
"learning_rate": 2.8776978417266184e-07,
"logits/chosen": -2.8950698375701904,
"logits/rejected": -2.691622495651245,
"logps/chosen": -138.45028686523438,
"logps/rejected": -100.14655303955078,
"loss": 0.1717,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.42671164870262146,
"rewards/margins": 2.7645459175109863,
"rewards/rejected": -2.337834358215332,
"step": 80
},
{
"epoch": 0.13,
"learning_rate": 3.2374100719424457e-07,
"logits/chosen": -2.7722439765930176,
"logits/rejected": -2.690833330154419,
"logps/chosen": -135.5113067626953,
"logps/rejected": -121.447021484375,
"loss": 0.1075,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.4925897717475891,
"rewards/margins": 4.084370136260986,
"rewards/rejected": -3.591780185699463,
"step": 90
},
{
"epoch": 0.14,
"learning_rate": 3.597122302158273e-07,
"logits/chosen": -2.888807773590088,
"logits/rejected": -2.7131495475769043,
"logps/chosen": -164.85647583007812,
"logps/rejected": -124.16983795166016,
"loss": 0.1169,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2238633632659912,
"rewards/margins": 3.586012601852417,
"rewards/rejected": -3.362149715423584,
"step": 100
},
{
"epoch": 0.16,
"learning_rate": 3.9568345323741003e-07,
"logits/chosen": -2.7212650775909424,
"logits/rejected": -2.576204538345337,
"logps/chosen": -136.82293701171875,
"logps/rejected": -123.34732818603516,
"loss": 0.0757,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.43352875113487244,
"rewards/margins": 4.669638633728027,
"rewards/rejected": -4.236109733581543,
"step": 110
},
{
"epoch": 0.17,
"learning_rate": 4.3165467625899276e-07,
"logits/chosen": -2.7657124996185303,
"logits/rejected": -2.6242692470550537,
"logps/chosen": -165.17178344726562,
"logps/rejected": -137.9097442626953,
"loss": 0.0741,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4255678057670593,
"rewards/margins": 4.908309459686279,
"rewards/rejected": -4.482741355895996,
"step": 120
},
{
"epoch": 0.19,
"learning_rate": 4.676258992805755e-07,
"logits/chosen": -2.7995333671569824,
"logits/rejected": -2.628948926925659,
"logps/chosen": -143.13916015625,
"logps/rejected": -135.01576232910156,
"loss": 0.0557,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4051126539707184,
"rewards/margins": 6.454569339752197,
"rewards/rejected": -6.0494561195373535,
"step": 130
},
{
"epoch": 0.2,
"learning_rate": 4.99599679743795e-07,
"logits/chosen": -2.856595516204834,
"logits/rejected": -2.64650297164917,
"logps/chosen": -182.11863708496094,
"logps/rejected": -170.402587890625,
"loss": 0.0904,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6085208058357239,
"rewards/margins": 5.310309886932373,
"rewards/rejected": -5.918830871582031,
"step": 140
},
{
"epoch": 0.22,
"learning_rate": 4.955964771817453e-07,
"logits/chosen": -2.720083475112915,
"logits/rejected": -2.5524630546569824,
"logps/chosen": -138.2317352294922,
"logps/rejected": -122.82208251953125,
"loss": 0.0887,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.38773685693740845,
"rewards/margins": 3.9186530113220215,
"rewards/rejected": -4.306389808654785,
"step": 150
},
{
"epoch": 0.23,
"learning_rate": 4.915932746196957e-07,
"logits/chosen": -2.6586403846740723,
"logits/rejected": -2.5184950828552246,
"logps/chosen": -146.91192626953125,
"logps/rejected": -146.19537353515625,
"loss": 0.0528,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4298267364501953,
"rewards/margins": 5.707052707672119,
"rewards/rejected": -6.136878490447998,
"step": 160
},
{
"epoch": 0.24,
"learning_rate": 4.875900720576461e-07,
"logits/chosen": -2.7607617378234863,
"logits/rejected": -2.59885573387146,
"logps/chosen": -161.85403442382812,
"logps/rejected": -172.4330596923828,
"loss": 0.0589,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16717226803302765,
"rewards/margins": 7.492938041687012,
"rewards/rejected": -7.325766086578369,
"step": 170
},
{
"epoch": 0.26,
"learning_rate": 4.835868694955965e-07,
"logits/chosen": -2.6541225910186768,
"logits/rejected": -2.5471792221069336,
"logps/chosen": -148.55508422851562,
"logps/rejected": -166.07257080078125,
"loss": 0.0758,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.3034805059432983,
"rewards/margins": 6.427194118499756,
"rewards/rejected": -7.730674743652344,
"step": 180
},
{
"epoch": 0.27,
"learning_rate": 4.795836669335467e-07,
"logits/chosen": -2.652890205383301,
"logits/rejected": -2.4126124382019043,
"logps/chosen": -134.67652893066406,
"logps/rejected": -130.59608459472656,
"loss": 0.0909,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.3820854127407074,
"rewards/margins": 5.234072685241699,
"rewards/rejected": -5.616158485412598,
"step": 190
},
{
"epoch": 0.29,
"learning_rate": 4.755804643714972e-07,
"logits/chosen": -2.6649863719940186,
"logits/rejected": -2.4534084796905518,
"logps/chosen": -154.67408752441406,
"logps/rejected": -138.3970947265625,
"loss": 0.1012,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.47632989287376404,
"rewards/margins": 5.756840229034424,
"rewards/rejected": -5.2805094718933105,
"step": 200
},
{
"epoch": 0.3,
"learning_rate": 4.715772618094475e-07,
"logits/chosen": -2.7960267066955566,
"logits/rejected": -2.5353095531463623,
"logps/chosen": -171.26986694335938,
"logps/rejected": -157.50350952148438,
"loss": 0.0516,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2727116644382477,
"rewards/margins": 5.267422676086426,
"rewards/rejected": -5.540134429931641,
"step": 210
},
{
"epoch": 0.32,
"learning_rate": 4.675740592473979e-07,
"logits/chosen": -2.5903918743133545,
"logits/rejected": -2.481639862060547,
"logps/chosen": -164.92575073242188,
"logps/rejected": -157.99099731445312,
"loss": 0.0328,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6272125244140625,
"rewards/margins": 6.131289005279541,
"rewards/rejected": -6.7585015296936035,
"step": 220
},
{
"epoch": 0.33,
"learning_rate": 4.635708566853482e-07,
"logits/chosen": -2.683683156967163,
"logits/rejected": -2.465529441833496,
"logps/chosen": -179.80831909179688,
"logps/rejected": -153.77401733398438,
"loss": 0.0659,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9624654054641724,
"rewards/margins": 5.732499599456787,
"rewards/rejected": -6.6949663162231445,
"step": 230
},
{
"epoch": 0.35,
"learning_rate": 4.595676541232986e-07,
"logits/chosen": -2.538198709487915,
"logits/rejected": -2.472057580947876,
"logps/chosen": -134.72840881347656,
"logps/rejected": -163.64105224609375,
"loss": 0.0385,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.4930785596370697,
"rewards/margins": 7.688417911529541,
"rewards/rejected": -8.181497573852539,
"step": 240
},
{
"epoch": 0.36,
"learning_rate": 4.5556445156124894e-07,
"logits/chosen": -2.7781453132629395,
"logits/rejected": -2.5276522636413574,
"logps/chosen": -143.19754028320312,
"logps/rejected": -150.41925048828125,
"loss": 0.0375,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.8589959144592285,
"rewards/margins": 6.875401973724365,
"rewards/rejected": -7.73439884185791,
"step": 250
},
{
"epoch": 0.37,
"learning_rate": 4.515612489991993e-07,
"logits/chosen": -2.6978352069854736,
"logits/rejected": -2.4410510063171387,
"logps/chosen": -163.37667846679688,
"logps/rejected": -157.6444854736328,
"loss": 0.0255,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5269836783409119,
"rewards/margins": 6.9025750160217285,
"rewards/rejected": -7.429558753967285,
"step": 260
},
{
"epoch": 0.39,
"learning_rate": 4.4755804643714965e-07,
"logits/chosen": -2.6981711387634277,
"logits/rejected": -2.4240591526031494,
"logps/chosen": -145.78114318847656,
"logps/rejected": -153.4759521484375,
"loss": 0.0466,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.961786150932312,
"rewards/margins": 6.817984580993652,
"rewards/rejected": -7.779770851135254,
"step": 270
},
{
"epoch": 0.4,
"learning_rate": 4.4355484387510004e-07,
"logits/chosen": -2.6088438034057617,
"logits/rejected": -2.49806809425354,
"logps/chosen": -167.5294952392578,
"logps/rejected": -196.126953125,
"loss": 0.0326,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.763117790222168,
"rewards/margins": 8.220617294311523,
"rewards/rejected": -9.983735084533691,
"step": 280
},
{
"epoch": 0.42,
"learning_rate": 4.3955164131305047e-07,
"logits/chosen": -2.692411184310913,
"logits/rejected": -2.426178455352783,
"logps/chosen": -149.58558654785156,
"logps/rejected": -154.73678588867188,
"loss": 0.0126,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.13495102524757385,
"rewards/margins": 8.114767074584961,
"rewards/rejected": -8.249719619750977,
"step": 290
},
{
"epoch": 0.43,
"learning_rate": 4.355484387510008e-07,
"logits/chosen": -2.507620334625244,
"logits/rejected": -2.3317294120788574,
"logps/chosen": -165.7401123046875,
"logps/rejected": -168.38839721679688,
"loss": 0.0765,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.2425320148468018,
"rewards/margins": 5.922076225280762,
"rewards/rejected": -8.164608001708984,
"step": 300
},
{
"epoch": 0.45,
"learning_rate": 4.315452361889512e-07,
"logits/chosen": -2.6149442195892334,
"logits/rejected": -2.3606739044189453,
"logps/chosen": -135.47378540039062,
"logps/rejected": -146.0089569091797,
"loss": 0.0298,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.3370214700698853,
"rewards/margins": 6.675353050231934,
"rewards/rejected": -8.012373924255371,
"step": 310
},
{
"epoch": 0.46,
"learning_rate": 4.275420336269015e-07,
"logits/chosen": -2.5880959033966064,
"logits/rejected": -2.35605788230896,
"logps/chosen": -182.91497802734375,
"logps/rejected": -189.45333862304688,
"loss": 0.0479,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.185288190841675,
"rewards/margins": 7.346780300140381,
"rewards/rejected": -9.532068252563477,
"step": 320
},
{
"epoch": 0.48,
"learning_rate": 4.235388310648519e-07,
"logits/chosen": -2.5866305828094482,
"logits/rejected": -2.3037662506103516,
"logps/chosen": -184.96115112304688,
"logps/rejected": -184.9438934326172,
"loss": 0.0831,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.369056224822998,
"rewards/margins": 7.428493499755859,
"rewards/rejected": -10.797548294067383,
"step": 330
},
{
"epoch": 0.49,
"learning_rate": 4.1953562850280223e-07,
"logits/chosen": -2.519735336303711,
"logits/rejected": -2.334322690963745,
"logps/chosen": -173.40858459472656,
"logps/rejected": -215.0013885498047,
"loss": 0.031,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.5725579261779785,
"rewards/margins": 9.760233879089355,
"rewards/rejected": -12.332793235778809,
"step": 340
},
{
"epoch": 0.5,
"learning_rate": 4.155324259407526e-07,
"logits/chosen": -2.586958408355713,
"logits/rejected": -2.3933067321777344,
"logps/chosen": -180.2490234375,
"logps/rejected": -214.9405059814453,
"loss": 0.0285,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.519786834716797,
"rewards/margins": 9.661985397338867,
"rewards/rejected": -12.181772232055664,
"step": 350
},
{
"epoch": 0.52,
"learning_rate": 4.1152922337870295e-07,
"logits/chosen": -2.4591023921966553,
"logits/rejected": -2.2511239051818848,
"logps/chosen": -131.71694946289062,
"logps/rejected": -165.09132385253906,
"loss": 0.0247,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7444407939910889,
"rewards/margins": 9.10871410369873,
"rewards/rejected": -9.853155136108398,
"step": 360
},
{
"epoch": 0.53,
"learning_rate": 4.0752602081665333e-07,
"logits/chosen": -2.651655912399292,
"logits/rejected": -2.4027347564697266,
"logps/chosen": -175.1227569580078,
"logps/rejected": -186.62240600585938,
"loss": 0.0247,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.8536550998687744,
"rewards/margins": 8.069160461425781,
"rewards/rejected": -9.922816276550293,
"step": 370
},
{
"epoch": 0.55,
"learning_rate": 4.0352281825460366e-07,
"logits/chosen": -2.515545606613159,
"logits/rejected": -2.379769802093506,
"logps/chosen": -159.44683837890625,
"logps/rejected": -185.20753479003906,
"loss": 0.0349,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.584536612033844,
"rewards/margins": 8.809714317321777,
"rewards/rejected": -9.394251823425293,
"step": 380
},
{
"epoch": 0.56,
"learning_rate": 3.9951961569255404e-07,
"logits/chosen": -2.880056142807007,
"logits/rejected": -2.5667223930358887,
"logps/chosen": -166.15879821777344,
"logps/rejected": -169.60914611816406,
"loss": 0.0666,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2807844877243042,
"rewards/margins": 7.306063652038574,
"rewards/rejected": -8.586848258972168,
"step": 390
},
{
"epoch": 0.58,
"learning_rate": 3.9551641313050437e-07,
"logits/chosen": -2.6910109519958496,
"logits/rejected": -2.438204526901245,
"logps/chosen": -167.52279663085938,
"logps/rejected": -193.27291870117188,
"loss": 0.0591,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.8132003545761108,
"rewards/margins": 8.618528366088867,
"rewards/rejected": -10.431727409362793,
"step": 400
},
{
"epoch": 0.59,
"learning_rate": 3.9151321056845476e-07,
"logits/chosen": -2.5091681480407715,
"logits/rejected": -2.2911810874938965,
"logps/chosen": -140.93154907226562,
"logps/rejected": -179.24794006347656,
"loss": 0.0231,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.7046592235565186,
"rewards/margins": 9.365106582641602,
"rewards/rejected": -11.0697660446167,
"step": 410
},
{
"epoch": 0.61,
"learning_rate": 3.875100080064051e-07,
"logits/chosen": -2.5162720680236816,
"logits/rejected": -2.318171501159668,
"logps/chosen": -140.7128143310547,
"logps/rejected": -169.2827911376953,
"loss": 0.0437,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0878578424453735,
"rewards/margins": 8.321041107177734,
"rewards/rejected": -9.408899307250977,
"step": 420
},
{
"epoch": 0.62,
"learning_rate": 3.8350680544435547e-07,
"logits/chosen": -2.5058627128601074,
"logits/rejected": -2.285526752471924,
"logps/chosen": -158.39208984375,
"logps/rejected": -191.4017791748047,
"loss": 0.0205,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.077756643295288,
"rewards/margins": 9.962626457214355,
"rewards/rejected": -11.040384292602539,
"step": 430
},
{
"epoch": 0.63,
"learning_rate": 3.795036028823058e-07,
"logits/chosen": -2.5467917919158936,
"logits/rejected": -2.293295383453369,
"logps/chosen": -165.51400756835938,
"logps/rejected": -176.27957153320312,
"loss": 0.0429,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.651084542274475,
"rewards/margins": 8.067974090576172,
"rewards/rejected": -9.719058990478516,
"step": 440
},
{
"epoch": 0.65,
"learning_rate": 3.755004003202562e-07,
"logits/chosen": -2.5901718139648438,
"logits/rejected": -2.3814101219177246,
"logps/chosen": -149.50888061523438,
"logps/rejected": -192.05587768554688,
"loss": 0.0323,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.6450309753417969,
"rewards/margins": 9.971355438232422,
"rewards/rejected": -11.616386413574219,
"step": 450
},
{
"epoch": 0.66,
"learning_rate": 3.714971977582065e-07,
"logits/chosen": -2.676997661590576,
"logits/rejected": -2.5134191513061523,
"logps/chosen": -156.29513549804688,
"logps/rejected": -190.77088928222656,
"loss": 0.0357,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.1767628192901611,
"rewards/margins": 9.212038040161133,
"rewards/rejected": -10.388800621032715,
"step": 460
},
{
"epoch": 0.68,
"learning_rate": 3.674939951961569e-07,
"logits/chosen": -2.75260853767395,
"logits/rejected": -2.4446868896484375,
"logps/chosen": -167.54098510742188,
"logps/rejected": -198.13467407226562,
"loss": 0.0099,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5461426973342896,
"rewards/margins": 10.665987968444824,
"rewards/rejected": -11.212130546569824,
"step": 470
},
{
"epoch": 0.69,
"learning_rate": 3.634907926341073e-07,
"logits/chosen": -2.7042384147644043,
"logits/rejected": -2.5021824836730957,
"logps/chosen": -177.01632690429688,
"logps/rejected": -194.0543670654297,
"loss": 0.0369,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.1976065635681152,
"rewards/margins": 7.7528533935546875,
"rewards/rejected": -9.950460433959961,
"step": 480
},
{
"epoch": 0.71,
"learning_rate": 3.5948759007205767e-07,
"logits/chosen": -2.609654188156128,
"logits/rejected": -2.442094326019287,
"logps/chosen": -186.2327423095703,
"logps/rejected": -238.8030242919922,
"loss": 0.0343,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.969175934791565,
"rewards/margins": 12.21094036102295,
"rewards/rejected": -14.18011474609375,
"step": 490
},
{
"epoch": 0.72,
"learning_rate": 3.55484387510008e-07,
"logits/chosen": -2.5849597454071045,
"logits/rejected": -2.4151904582977295,
"logps/chosen": -184.76492309570312,
"logps/rejected": -203.13241577148438,
"loss": 0.0268,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -2.5779895782470703,
"rewards/margins": 8.900407791137695,
"rewards/rejected": -11.478398323059082,
"step": 500
},
{
"epoch": 0.72,
"eval_logits/chosen": -2.4085986614227295,
"eval_logits/rejected": -2.233201026916504,
"eval_logps/chosen": -157.914306640625,
"eval_logps/rejected": -183.62203979492188,
"eval_loss": 0.03143342584371567,
"eval_rewards/accuracies": 0.9960317611694336,
"eval_rewards/chosen": -0.9699568152427673,
"eval_rewards/margins": 8.822220802307129,
"eval_rewards/rejected": -9.7921781539917,
"eval_runtime": 869.9338,
"eval_samples_per_second": 2.299,
"eval_steps_per_second": 0.072,
"step": 500
},
{
"epoch": 0.73,
"learning_rate": 3.514811849479584e-07,
"logits/chosen": -2.4951071739196777,
"logits/rejected": -2.249694585800171,
"logps/chosen": -164.900634765625,
"logps/rejected": -189.8306427001953,
"loss": 0.0301,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.6467971801757812,
"rewards/margins": 9.143733024597168,
"rewards/rejected": -10.79053020477295,
"step": 510
},
{
"epoch": 0.75,
"learning_rate": 3.474779823859087e-07,
"logits/chosen": -2.575314521789551,
"logits/rejected": -2.233131170272827,
"logps/chosen": -182.89932250976562,
"logps/rejected": -243.9858856201172,
"loss": 0.0217,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4116153717041016,
"rewards/margins": 13.464820861816406,
"rewards/rejected": -14.876436233520508,
"step": 520
},
{
"epoch": 0.76,
"learning_rate": 3.434747798238591e-07,
"logits/chosen": -2.4697818756103516,
"logits/rejected": -2.356581926345825,
"logps/chosen": -201.47634887695312,
"logps/rejected": -548.7425537109375,
"loss": 0.0351,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -3.6284077167510986,
"rewards/margins": 39.81824493408203,
"rewards/rejected": -43.44664764404297,
"step": 530
},
{
"epoch": 0.78,
"learning_rate": 3.394715772618094e-07,
"logits/chosen": -2.418208599090576,
"logits/rejected": -2.22477126121521,
"logps/chosen": -165.20034790039062,
"logps/rejected": -628.0509643554688,
"loss": 0.0124,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.872553825378418,
"rewards/margins": 52.413848876953125,
"rewards/rejected": -54.286399841308594,
"step": 540
},
{
"epoch": 0.79,
"learning_rate": 3.354683746997598e-07,
"logits/chosen": -2.5192372798919678,
"logits/rejected": -2.295382261276245,
"logps/chosen": -176.81497192382812,
"logps/rejected": -363.97174072265625,
"loss": 0.0275,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.7440744638442993,
"rewards/margins": 24.896778106689453,
"rewards/rejected": -26.640857696533203,
"step": 550
},
{
"epoch": 0.81,
"learning_rate": 3.3146517213771014e-07,
"logits/chosen": -2.6569995880126953,
"logits/rejected": -2.2872815132141113,
"logps/chosen": -215.7018585205078,
"logps/rejected": -305.87799072265625,
"loss": 0.0359,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.8822388648986816,
"rewards/margins": 16.257402420043945,
"rewards/rejected": -19.1396427154541,
"step": 560
},
{
"epoch": 0.82,
"learning_rate": 3.274619695756605e-07,
"logits/chosen": -2.393214702606201,
"logits/rejected": -2.0127763748168945,
"logps/chosen": -185.98187255859375,
"logps/rejected": -338.15576171875,
"loss": 0.0316,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -3.6043496131896973,
"rewards/margins": 21.948997497558594,
"rewards/rejected": -25.553346633911133,
"step": 570
},
{
"epoch": 0.84,
"learning_rate": 3.2345876701361085e-07,
"logits/chosen": -2.4011263847351074,
"logits/rejected": -2.0070765018463135,
"logps/chosen": -196.2584228515625,
"logps/rejected": -484.80926513671875,
"loss": 0.0152,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4227471351623535,
"rewards/margins": 34.711463928222656,
"rewards/rejected": -38.13420867919922,
"step": 580
},
{
"epoch": 0.85,
"learning_rate": 3.1945556445156124e-07,
"logits/chosen": -1.9645344018936157,
"logits/rejected": -1.2225711345672607,
"logps/chosen": -209.5574951171875,
"logps/rejected": -543.1734008789062,
"loss": 0.0257,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.6722359657287598,
"rewards/margins": 40.712772369384766,
"rewards/rejected": -44.385005950927734,
"step": 590
},
{
"epoch": 0.86,
"learning_rate": 3.1545236188951157e-07,
"logits/chosen": -1.978044867515564,
"logits/rejected": -1.2324669361114502,
"logps/chosen": -192.2949981689453,
"logps/rejected": -480.520751953125,
"loss": 0.0255,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.5515990257263184,
"rewards/margins": 35.85133743286133,
"rewards/rejected": -39.40293884277344,
"step": 600
},
{
"epoch": 0.88,
"learning_rate": 3.1144915932746195e-07,
"logits/chosen": -2.275550365447998,
"logits/rejected": -1.6282291412353516,
"logps/chosen": -196.13803100585938,
"logps/rejected": -297.6045837402344,
"loss": 0.0368,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.565072059631348,
"rewards/margins": 17.173152923583984,
"rewards/rejected": -21.738224029541016,
"step": 610
},
{
"epoch": 0.89,
"learning_rate": 3.074459567654123e-07,
"logits/chosen": -2.4316773414611816,
"logits/rejected": -1.7663853168487549,
"logps/chosen": -189.8267822265625,
"logps/rejected": -321.9093017578125,
"loss": 0.0363,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -3.32307505607605,
"rewards/margins": 19.51993179321289,
"rewards/rejected": -22.843008041381836,
"step": 620
},
{
"epoch": 0.91,
"learning_rate": 3.0344275420336267e-07,
"logits/chosen": -2.2213199138641357,
"logits/rejected": -1.702415108680725,
"logps/chosen": -189.29393005371094,
"logps/rejected": -392.8915100097656,
"loss": 0.0107,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.8561224937438965,
"rewards/margins": 26.04262924194336,
"rewards/rejected": -29.898754119873047,
"step": 630
},
{
"epoch": 0.92,
"learning_rate": 2.99439551641313e-07,
"logits/chosen": -2.0172629356384277,
"logits/rejected": -1.2431235313415527,
"logps/chosen": -208.375732421875,
"logps/rejected": -378.9666748046875,
"loss": 0.0228,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.202768325805664,
"rewards/margins": 22.777095794677734,
"rewards/rejected": -28.9798641204834,
"step": 640
},
{
"epoch": 0.94,
"learning_rate": 2.954363490792634e-07,
"logits/chosen": -1.7751468420028687,
"logits/rejected": -1.2209514379501343,
"logps/chosen": -241.2014617919922,
"logps/rejected": -514.4192504882812,
"loss": 0.0131,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.191336154937744,
"rewards/margins": 33.45779800415039,
"rewards/rejected": -40.649131774902344,
"step": 650
},
{
"epoch": 0.95,
"learning_rate": 2.914331465172137e-07,
"logits/chosen": -1.6847556829452515,
"logits/rejected": -1.00700843334198,
"logps/chosen": -197.2582550048828,
"logps/rejected": -443.4234313964844,
"loss": 0.0166,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.894466400146484,
"rewards/margins": 29.733402252197266,
"rewards/rejected": -35.627864837646484,
"step": 660
},
{
"epoch": 0.97,
"learning_rate": 2.8742994395516415e-07,
"logits/chosen": -2.154357433319092,
"logits/rejected": -1.1991710662841797,
"logps/chosen": -189.2927703857422,
"logps/rejected": -484.61785888671875,
"loss": 0.0175,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.121435642242432,
"rewards/margins": 36.48408508300781,
"rewards/rejected": -40.60551834106445,
"step": 670
},
{
"epoch": 0.98,
"learning_rate": 2.834267413931145e-07,
"logits/chosen": -1.9125760793685913,
"logits/rejected": -1.0993740558624268,
"logps/chosen": -212.220947265625,
"logps/rejected": -427.8121643066406,
"loss": 0.0128,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -5.176814079284668,
"rewards/margins": 28.130443572998047,
"rewards/rejected": -33.30725860595703,
"step": 680
},
{
"epoch": 0.99,
"learning_rate": 2.7942353883106486e-07,
"logits/chosen": -2.2864699363708496,
"logits/rejected": -1.399320363998413,
"logps/chosen": -189.4803924560547,
"logps/rejected": -217.796142578125,
"loss": 0.0196,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -3.550410032272339,
"rewards/margins": 9.777883529663086,
"rewards/rejected": -13.328292846679688,
"step": 690
},
{
"epoch": 1.01,
"learning_rate": 2.754203362690152e-07,
"logits/chosen": -2.773916244506836,
"logits/rejected": -2.527047872543335,
"logps/chosen": -183.30543518066406,
"logps/rejected": -429.06365966796875,
"loss": 0.0442,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.4481396675109863,
"rewards/margins": 30.67641258239746,
"rewards/rejected": -33.124549865722656,
"step": 700
},
{
"epoch": 1.02,
"learning_rate": 2.714171337069656e-07,
"logits/chosen": -2.882967948913574,
"logits/rejected": -2.72076153755188,
"logps/chosen": -167.57542419433594,
"logps/rejected": -250.15274047851562,
"loss": 0.0082,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.485212802886963,
"rewards/margins": 14.637022018432617,
"rewards/rejected": -16.122234344482422,
"step": 710
},
{
"epoch": 1.04,
"learning_rate": 2.674139311449159e-07,
"logits/chosen": -2.7207860946655273,
"logits/rejected": -2.5453267097473145,
"logps/chosen": -151.00723266601562,
"logps/rejected": -295.5516357421875,
"loss": 0.0062,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0635440349578857,
"rewards/margins": 20.84624671936035,
"rewards/rejected": -21.9097900390625,
"step": 720
},
{
"epoch": 1.05,
"learning_rate": 2.634107285828663e-07,
"logits/chosen": -2.799225091934204,
"logits/rejected": -2.6195671558380127,
"logps/chosen": -207.414306640625,
"logps/rejected": -471.4579162597656,
"loss": 0.015,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.837430477142334,
"rewards/margins": 32.82604217529297,
"rewards/rejected": -38.66347885131836,
"step": 730
},
{
"epoch": 1.07,
"learning_rate": 2.594075260208166e-07,
"logits/chosen": -2.8809666633605957,
"logits/rejected": -2.6668760776519775,
"logps/chosen": -189.98104858398438,
"logps/rejected": -271.88641357421875,
"loss": 0.0102,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.0119662284851074,
"rewards/margins": 15.516934394836426,
"rewards/rejected": -17.528902053833008,
"step": 740
},
{
"epoch": 1.08,
"learning_rate": 2.55404323458767e-07,
"logits/chosen": -2.8235630989074707,
"logits/rejected": -2.6322312355041504,
"logps/chosen": -159.7130584716797,
"logps/rejected": -270.10101318359375,
"loss": 0.0086,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.5178642272949219,
"rewards/margins": 17.20760154724121,
"rewards/rejected": -18.725465774536133,
"step": 750
},
{
"epoch": 1.1,
"learning_rate": 2.514011208967174e-07,
"logits/chosen": -2.6800270080566406,
"logits/rejected": -2.516126871109009,
"logps/chosen": -163.38233947753906,
"logps/rejected": -561.2886352539062,
"loss": 0.0059,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.2010345458984375,
"rewards/margins": 45.494956970214844,
"rewards/rejected": -47.69599914550781,
"step": 760
},
{
"epoch": 1.11,
"learning_rate": 2.473979183346677e-07,
"logits/chosen": -2.7506349086761475,
"logits/rejected": -2.5886902809143066,
"logps/chosen": -141.7732696533203,
"logps/rejected": -377.6195373535156,
"loss": 0.0144,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.402200698852539,
"rewards/margins": 28.883886337280273,
"rewards/rejected": -30.286090850830078,
"step": 770
},
{
"epoch": 1.12,
"learning_rate": 2.433947157726181e-07,
"logits/chosen": -2.842419147491455,
"logits/rejected": -2.6155142784118652,
"logps/chosen": -180.7171630859375,
"logps/rejected": -282.32220458984375,
"loss": 0.0039,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.1550252437591553,
"rewards/margins": 17.016422271728516,
"rewards/rejected": -19.17144775390625,
"step": 780
},
{
"epoch": 1.14,
"learning_rate": 2.3939151321056843e-07,
"logits/chosen": -2.924471378326416,
"logits/rejected": -2.6854348182678223,
"logps/chosen": -170.48342895507812,
"logps/rejected": -230.1216583251953,
"loss": 0.0056,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.0587759017944336,
"rewards/margins": 12.352742195129395,
"rewards/rejected": -14.411517143249512,
"step": 790
},
{
"epoch": 1.15,
"learning_rate": 2.353883106485188e-07,
"logits/chosen": -2.8059592247009277,
"logits/rejected": -2.5042202472686768,
"logps/chosen": -162.3031463623047,
"logps/rejected": -295.27105712890625,
"loss": 0.0055,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.9291874170303345,
"rewards/margins": 20.166751861572266,
"rewards/rejected": -22.09593963623047,
"step": 800
},
{
"epoch": 1.17,
"learning_rate": 2.3138510808646917e-07,
"logits/chosen": -2.7831666469573975,
"logits/rejected": -2.5290932655334473,
"logps/chosen": -156.5865478515625,
"logps/rejected": -512.7086791992188,
"loss": 0.0045,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6812311410903931,
"rewards/margins": 43.404396057128906,
"rewards/rejected": -44.085628509521484,
"step": 810
},
{
"epoch": 1.18,
"learning_rate": 2.2738190552441953e-07,
"logits/chosen": -2.8566126823425293,
"logits/rejected": -2.582984447479248,
"logps/chosen": -172.5872802734375,
"logps/rejected": -197.41136169433594,
"loss": 0.0106,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.777562141418457,
"rewards/margins": 9.757658958435059,
"rewards/rejected": -11.535221099853516,
"step": 820
},
{
"epoch": 1.2,
"learning_rate": 2.2337870296236989e-07,
"logits/chosen": -2.78861927986145,
"logits/rejected": -2.546877384185791,
"logps/chosen": -144.1377716064453,
"logps/rejected": -255.5492706298828,
"loss": 0.0064,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.23271174728870392,
"rewards/margins": 17.866634368896484,
"rewards/rejected": -18.09934425354004,
"step": 830
},
{
"epoch": 1.21,
"learning_rate": 2.1937550040032024e-07,
"logits/chosen": -2.7955825328826904,
"logits/rejected": -2.6178054809570312,
"logps/chosen": -174.67660522460938,
"logps/rejected": -284.92230224609375,
"loss": 0.0046,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.2215218544006348,
"rewards/margins": 17.042997360229492,
"rewards/rejected": -19.264522552490234,
"step": 840
},
{
"epoch": 1.22,
"learning_rate": 2.153722978382706e-07,
"logits/chosen": -2.6289403438568115,
"logits/rejected": -2.4111552238464355,
"logps/chosen": -174.08555603027344,
"logps/rejected": -477.3523864746094,
"loss": 0.0051,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.3877720832824707,
"rewards/margins": 36.872703552246094,
"rewards/rejected": -39.260475158691406,
"step": 850
},
{
"epoch": 1.24,
"learning_rate": 2.1136909527622096e-07,
"logits/chosen": -2.8065085411071777,
"logits/rejected": -2.5915045738220215,
"logps/chosen": -155.339599609375,
"logps/rejected": -247.944091796875,
"loss": 0.0049,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.14878419041633606,
"rewards/margins": 16.316041946411133,
"rewards/rejected": -16.464826583862305,
"step": 860
},
{
"epoch": 1.25,
"learning_rate": 2.0736589271417131e-07,
"logits/chosen": -2.9013023376464844,
"logits/rejected": -2.6518332958221436,
"logps/chosen": -196.78958129882812,
"logps/rejected": -258.1006774902344,
"loss": 0.0071,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -3.3248391151428223,
"rewards/margins": 12.6865816116333,
"rewards/rejected": -16.011423110961914,
"step": 870
},
{
"epoch": 1.27,
"learning_rate": 2.0336269015212167e-07,
"logits/chosen": -2.7131872177124023,
"logits/rejected": -2.4915966987609863,
"logps/chosen": -186.66929626464844,
"logps/rejected": -523.0761108398438,
"loss": 0.0042,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.9156155586242676,
"rewards/margins": 41.05046844482422,
"rewards/rejected": -43.96608352661133,
"step": 880
},
{
"epoch": 1.28,
"learning_rate": 1.9935948759007203e-07,
"logits/chosen": -2.802422285079956,
"logits/rejected": -2.6357274055480957,
"logps/chosen": -171.12216186523438,
"logps/rejected": -279.912109375,
"loss": 0.0198,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -3.927133560180664,
"rewards/margins": 16.069978713989258,
"rewards/rejected": -19.997112274169922,
"step": 890
},
{
"epoch": 1.3,
"learning_rate": 1.953562850280224e-07,
"logits/chosen": -2.743924140930176,
"logits/rejected": -2.569491147994995,
"logps/chosen": -167.91322326660156,
"logps/rejected": -284.1127624511719,
"loss": 0.0058,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.3164217472076416,
"rewards/margins": 17.26920509338379,
"rewards/rejected": -19.585628509521484,
"step": 900
},
{
"epoch": 1.31,
"learning_rate": 1.9135308246597277e-07,
"logits/chosen": -2.8673033714294434,
"logits/rejected": -2.6498348712921143,
"logps/chosen": -142.06503295898438,
"logps/rejected": -239.8588409423828,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7565892338752747,
"rewards/margins": 15.723353385925293,
"rewards/rejected": -16.47994613647461,
"step": 910
},
{
"epoch": 1.33,
"learning_rate": 1.8734987990392313e-07,
"logits/chosen": -2.821174144744873,
"logits/rejected": -2.6474757194519043,
"logps/chosen": -171.9628143310547,
"logps/rejected": -232.91439819335938,
"loss": 0.0072,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.7939846515655518,
"rewards/margins": 12.49173355102539,
"rewards/rejected": -14.285717964172363,
"step": 920
},
{
"epoch": 1.34,
"learning_rate": 1.8334667734187348e-07,
"logits/chosen": -2.755138397216797,
"logits/rejected": -2.5319771766662598,
"logps/chosen": -180.15707397460938,
"logps/rejected": -481.31512451171875,
"loss": 0.0056,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.5247740745544434,
"rewards/margins": 36.42875289916992,
"rewards/rejected": -38.95352554321289,
"step": 930
},
{
"epoch": 1.35,
"learning_rate": 1.7934347477982384e-07,
"logits/chosen": -2.872758388519287,
"logits/rejected": -2.609778881072998,
"logps/chosen": -157.4242401123047,
"logps/rejected": -223.9275360107422,
"loss": 0.0048,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7862883806228638,
"rewards/margins": 13.082984924316406,
"rewards/rejected": -13.869272232055664,
"step": 940
},
{
"epoch": 1.37,
"learning_rate": 1.753402722177742e-07,
"logits/chosen": -2.79685115814209,
"logits/rejected": -2.5169830322265625,
"logps/chosen": -157.01585388183594,
"logps/rejected": -436.61602783203125,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8217869997024536,
"rewards/margins": 34.775390625,
"rewards/rejected": -35.59718322753906,
"step": 950
},
{
"epoch": 1.38,
"learning_rate": 1.7133706965572455e-07,
"logits/chosen": -2.896519660949707,
"logits/rejected": -2.5488381385803223,
"logps/chosen": -175.59397888183594,
"logps/rejected": -218.03466796875,
"loss": 0.0058,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.972507119178772,
"rewards/margins": 11.009244918823242,
"rewards/rejected": -12.981752395629883,
"step": 960
},
{
"epoch": 1.4,
"learning_rate": 1.673338670936749e-07,
"logits/chosen": -2.8404831886291504,
"logits/rejected": -2.5836331844329834,
"logps/chosen": -184.15939331054688,
"logps/rejected": -292.54693603515625,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.556754469871521,
"rewards/margins": 17.97411346435547,
"rewards/rejected": -19.530866622924805,
"step": 970
},
{
"epoch": 1.41,
"learning_rate": 1.633306645316253e-07,
"logits/chosen": -2.78204345703125,
"logits/rejected": -2.52099609375,
"logps/chosen": -180.96939086914062,
"logps/rejected": -375.949462890625,
"loss": 0.0131,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.8274238109588623,
"rewards/margins": 26.82219886779785,
"rewards/rejected": -28.64962387084961,
"step": 980
},
{
"epoch": 1.43,
"learning_rate": 1.5932746196957568e-07,
"logits/chosen": -2.8116698265075684,
"logits/rejected": -2.564847707748413,
"logps/chosen": -159.66482543945312,
"logps/rejected": -267.16583251953125,
"loss": 0.0067,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.7409461736679077,
"rewards/margins": 17.136062622070312,
"rewards/rejected": -18.87700843811035,
"step": 990
},
{
"epoch": 1.44,
"learning_rate": 1.5532425940752604e-07,
"logits/chosen": -2.7542591094970703,
"logits/rejected": -2.5157008171081543,
"logps/chosen": -183.57919311523438,
"logps/rejected": -581.8060913085938,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.2423195838928223,
"rewards/margins": 46.37403869628906,
"rewards/rejected": -49.616355895996094,
"step": 1000
},
{
"epoch": 1.44,
"eval_logits/chosen": -2.633044958114624,
"eval_logits/rejected": -2.436069965362549,
"eval_logps/chosen": -164.32315063476562,
"eval_logps/rejected": -331.22625732421875,
"eval_loss": 0.022912979125976562,
"eval_rewards/accuracies": 0.9960317611694336,
"eval_rewards/chosen": -1.6108430624008179,
"eval_rewards/margins": 22.94175910949707,
"eval_rewards/rejected": -24.55260467529297,
"eval_runtime": 924.6056,
"eval_samples_per_second": 2.163,
"eval_steps_per_second": 0.068,
"step": 1000
},
{
"epoch": 1.46,
"learning_rate": 1.513210568454764e-07,
"logits/chosen": -2.764820098876953,
"logits/rejected": -2.5704541206359863,
"logps/chosen": -163.67437744140625,
"logps/rejected": -395.3125915527344,
"loss": 0.0031,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.4941201210021973,
"rewards/margins": 27.328876495361328,
"rewards/rejected": -29.8229923248291,
"step": 1010
},
{
"epoch": 1.47,
"learning_rate": 1.4731785428342675e-07,
"logits/chosen": -2.8067145347595215,
"logits/rejected": -2.502478837966919,
"logps/chosen": -146.5375518798828,
"logps/rejected": -257.7701110839844,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.222342848777771,
"rewards/margins": 17.490814208984375,
"rewards/rejected": -18.713157653808594,
"step": 1020
},
{
"epoch": 1.48,
"learning_rate": 1.433146517213771e-07,
"logits/chosen": -2.8830108642578125,
"logits/rejected": -2.599515438079834,
"logps/chosen": -164.00958251953125,
"logps/rejected": -202.6780242919922,
"loss": 0.003,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.6045265197753906,
"rewards/margins": 10.405461311340332,
"rewards/rejected": -12.009988784790039,
"step": 1030
},
{
"epoch": 1.5,
"learning_rate": 1.3931144915932746e-07,
"logits/chosen": -2.8134148120880127,
"logits/rejected": -2.5562379360198975,
"logps/chosen": -167.71530151367188,
"logps/rejected": -212.62570190429688,
"loss": 0.0036,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.176457166671753,
"rewards/margins": 11.668563842773438,
"rewards/rejected": -12.84502124786377,
"step": 1040
},
{
"epoch": 1.51,
"learning_rate": 1.3530824659727782e-07,
"logits/chosen": -2.564943790435791,
"logits/rejected": -2.4214589595794678,
"logps/chosen": -144.50045776367188,
"logps/rejected": -427.9203186035156,
"loss": 0.0074,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0412085056304932,
"rewards/margins": 33.99787139892578,
"rewards/rejected": -35.03908157348633,
"step": 1050
},
{
"epoch": 1.53,
"learning_rate": 1.3130504403522818e-07,
"logits/chosen": -2.8820528984069824,
"logits/rejected": -2.634192943572998,
"logps/chosen": -147.8389129638672,
"logps/rejected": -352.130859375,
"loss": 0.0042,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7727874517440796,
"rewards/margins": 26.576446533203125,
"rewards/rejected": -27.3492374420166,
"step": 1060
},
{
"epoch": 1.54,
"learning_rate": 1.2730184147317853e-07,
"logits/chosen": -2.7832694053649902,
"logits/rejected": -2.5845096111297607,
"logps/chosen": -170.01785278320312,
"logps/rejected": -473.90863037109375,
"loss": 0.0036,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1985461711883545,
"rewards/margins": 36.935890197753906,
"rewards/rejected": -38.13444137573242,
"step": 1070
},
{
"epoch": 1.56,
"learning_rate": 1.232986389111289e-07,
"logits/chosen": -2.7855515480041504,
"logits/rejected": -2.5294415950775146,
"logps/chosen": -158.38758850097656,
"logps/rejected": -244.84619140625,
"loss": 0.0049,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8522005081176758,
"rewards/margins": 15.829713821411133,
"rewards/rejected": -16.681913375854492,
"step": 1080
},
{
"epoch": 1.57,
"learning_rate": 1.1929543634907927e-07,
"logits/chosen": -2.898603916168213,
"logits/rejected": -2.705836772918701,
"logps/chosen": -159.56637573242188,
"logps/rejected": -371.81805419921875,
"loss": 0.0088,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.5499234199523926,
"rewards/margins": 27.168338775634766,
"rewards/rejected": -28.71826171875,
"step": 1090
},
{
"epoch": 1.59,
"learning_rate": 1.1529223378702962e-07,
"logits/chosen": -2.733206272125244,
"logits/rejected": -2.5612919330596924,
"logps/chosen": -172.57748413085938,
"logps/rejected": -559.9437255859375,
"loss": 0.0195,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0921106338500977,
"rewards/margins": 45.248077392578125,
"rewards/rejected": -46.340187072753906,
"step": 1100
},
{
"epoch": 1.6,
"learning_rate": 1.1128903122497999e-07,
"logits/chosen": -2.8048062324523926,
"logits/rejected": -2.533332347869873,
"logps/chosen": -145.4043731689453,
"logps/rejected": -206.748291015625,
"loss": 0.0061,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4871163368225098,
"rewards/margins": 12.425850868225098,
"rewards/rejected": -13.91296672821045,
"step": 1110
},
{
"epoch": 1.61,
"learning_rate": 1.0728582866293035e-07,
"logits/chosen": -2.8180341720581055,
"logits/rejected": -2.671854257583618,
"logps/chosen": -164.4940948486328,
"logps/rejected": -299.62310791015625,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2362325191497803,
"rewards/margins": 19.141382217407227,
"rewards/rejected": -20.377614974975586,
"step": 1120
},
{
"epoch": 1.63,
"learning_rate": 1.032826261008807e-07,
"logits/chosen": -2.7941746711730957,
"logits/rejected": -2.4854462146759033,
"logps/chosen": -170.0912628173828,
"logps/rejected": -517.0354614257812,
"loss": 0.0037,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.6481285095214844,
"rewards/margins": 41.76611328125,
"rewards/rejected": -43.414241790771484,
"step": 1130
},
{
"epoch": 1.64,
"learning_rate": 9.927942353883106e-08,
"logits/chosen": -2.7304294109344482,
"logits/rejected": -2.458939790725708,
"logps/chosen": -147.95701599121094,
"logps/rejected": -289.230224609375,
"loss": 0.0069,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3893832266330719,
"rewards/margins": 20.953866958618164,
"rewards/rejected": -21.343250274658203,
"step": 1140
},
{
"epoch": 1.66,
"learning_rate": 9.527622097678143e-08,
"logits/chosen": -2.786956310272217,
"logits/rejected": -2.565520763397217,
"logps/chosen": -172.35365295410156,
"logps/rejected": -397.92059326171875,
"loss": 0.0058,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4105862379074097,
"rewards/margins": 28.689733505249023,
"rewards/rejected": -30.10032081604004,
"step": 1150
},
{
"epoch": 1.67,
"learning_rate": 9.127301841473179e-08,
"logits/chosen": -2.803377151489258,
"logits/rejected": -2.601539134979248,
"logps/chosen": -152.02151489257812,
"logps/rejected": -315.120849609375,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8650063276290894,
"rewards/margins": 22.168514251708984,
"rewards/rejected": -23.033519744873047,
"step": 1160
},
{
"epoch": 1.69,
"learning_rate": 8.726981585268214e-08,
"logits/chosen": -2.8182191848754883,
"logits/rejected": -2.570002555847168,
"logps/chosen": -149.609619140625,
"logps/rejected": -231.5469512939453,
"loss": 0.0062,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5421051979064941,
"rewards/margins": 14.738876342773438,
"rewards/rejected": -15.280984878540039,
"step": 1170
},
{
"epoch": 1.7,
"learning_rate": 8.32666132906325e-08,
"logits/chosen": -2.7290663719177246,
"logits/rejected": -2.519298791885376,
"logps/chosen": -149.35226440429688,
"logps/rejected": -345.97222900390625,
"loss": 0.002,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2555046081542969,
"rewards/margins": 24.839550018310547,
"rewards/rejected": -26.09505271911621,
"step": 1180
},
{
"epoch": 1.71,
"learning_rate": 7.926341072858286e-08,
"logits/chosen": -2.8426907062530518,
"logits/rejected": -2.560478448867798,
"logps/chosen": -147.8146209716797,
"logps/rejected": -243.75064086914062,
"loss": 0.0042,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8054535984992981,
"rewards/margins": 15.806452751159668,
"rewards/rejected": -16.61190414428711,
"step": 1190
},
{
"epoch": 1.73,
"learning_rate": 7.526020816653323e-08,
"logits/chosen": -2.674760341644287,
"logits/rejected": -2.367633819580078,
"logps/chosen": -151.23182678222656,
"logps/rejected": -421.35333251953125,
"loss": 0.0018,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.7838943004608154,
"rewards/margins": 32.74809646606445,
"rewards/rejected": -35.5319938659668,
"step": 1200
},
{
"epoch": 1.74,
"learning_rate": 7.125700560448359e-08,
"logits/chosen": -2.726081371307373,
"logits/rejected": -2.4065871238708496,
"logps/chosen": -150.547119140625,
"logps/rejected": -347.12457275390625,
"loss": 0.003,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.623246431350708,
"rewards/margins": 26.384979248046875,
"rewards/rejected": -27.008224487304688,
"step": 1210
},
{
"epoch": 1.76,
"learning_rate": 6.725380304243394e-08,
"logits/chosen": -2.7504703998565674,
"logits/rejected": -2.449279308319092,
"logps/chosen": -158.4932861328125,
"logps/rejected": -315.344482421875,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7700117826461792,
"rewards/margins": 22.627595901489258,
"rewards/rejected": -23.397607803344727,
"step": 1220
},
{
"epoch": 1.77,
"learning_rate": 6.32506004803843e-08,
"logits/chosen": -2.7155704498291016,
"logits/rejected": -2.4125099182128906,
"logps/chosen": -149.95974731445312,
"logps/rejected": -408.9559631347656,
"loss": 0.0061,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.43645238876342773,
"rewards/margins": 32.31442642211914,
"rewards/rejected": -32.750877380371094,
"step": 1230
},
{
"epoch": 1.79,
"learning_rate": 5.9247397918334664e-08,
"logits/chosen": -2.7009987831115723,
"logits/rejected": -2.5183584690093994,
"logps/chosen": -175.1053466796875,
"logps/rejected": -374.5074157714844,
"loss": 0.0018,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2859503030776978,
"rewards/margins": 26.71976089477539,
"rewards/rejected": -28.005706787109375,
"step": 1240
},
{
"epoch": 1.8,
"learning_rate": 5.524419535628502e-08,
"logits/chosen": -2.724604368209839,
"logits/rejected": -2.4586219787597656,
"logps/chosen": -204.1591033935547,
"logps/rejected": -626.8215942382812,
"loss": 0.002,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.939435958862305,
"rewards/margins": 48.43818283081055,
"rewards/rejected": -53.37761688232422,
"step": 1250
},
{
"epoch": 1.82,
"learning_rate": 5.1240992794235385e-08,
"logits/chosen": -2.8005754947662354,
"logits/rejected": -2.5299735069274902,
"logps/chosen": -153.12969970703125,
"logps/rejected": -334.99786376953125,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5083599090576172,
"rewards/margins": 24.945537567138672,
"rewards/rejected": -25.45389747619629,
"step": 1260
},
{
"epoch": 1.83,
"learning_rate": 4.723779023218575e-08,
"logits/chosen": -2.721325159072876,
"logits/rejected": -2.4367713928222656,
"logps/chosen": -154.41665649414062,
"logps/rejected": -428.780517578125,
"loss": 0.0082,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.7341115474700928,
"rewards/margins": 34.29324722290039,
"rewards/rejected": -36.02735137939453,
"step": 1270
},
{
"epoch": 1.84,
"learning_rate": 4.323458767013611e-08,
"logits/chosen": -2.7661736011505127,
"logits/rejected": -2.489382028579712,
"logps/chosen": -153.54531860351562,
"logps/rejected": -436.8839416503906,
"loss": 0.006,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.38232460618019104,
"rewards/margins": 35.33488082885742,
"rewards/rejected": -35.71720504760742,
"step": 1280
},
{
"epoch": 1.86,
"learning_rate": 3.923138510808647e-08,
"logits/chosen": -2.6737539768218994,
"logits/rejected": -2.4625658988952637,
"logps/chosen": -154.32327270507812,
"logps/rejected": -682.4031982421875,
"loss": 0.0042,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4486008882522583,
"rewards/margins": 58.11639404296875,
"rewards/rejected": -59.56499481201172,
"step": 1290
},
{
"epoch": 1.87,
"learning_rate": 3.5228182546036826e-08,
"logits/chosen": -2.5800118446350098,
"logits/rejected": -2.395155191421509,
"logps/chosen": -140.3927764892578,
"logps/rejected": -396.32806396484375,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.859173595905304,
"rewards/margins": 30.5933780670166,
"rewards/rejected": -31.452550888061523,
"step": 1300
},
{
"epoch": 1.89,
"learning_rate": 3.122497998398719e-08,
"logits/chosen": -2.6436543464660645,
"logits/rejected": -2.371372699737549,
"logps/chosen": -123.04959869384766,
"logps/rejected": -331.83624267578125,
"loss": 0.0034,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4394907057285309,
"rewards/margins": 26.18316650390625,
"rewards/rejected": -26.622655868530273,
"step": 1310
},
{
"epoch": 1.9,
"learning_rate": 2.722177742193755e-08,
"logits/chosen": -2.7385857105255127,
"logits/rejected": -2.4508581161499023,
"logps/chosen": -175.63052368164062,
"logps/rejected": -294.7503356933594,
"loss": 0.0035,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2934653759002686,
"rewards/margins": 19.555923461914062,
"rewards/rejected": -20.849384307861328,
"step": 1320
},
{
"epoch": 1.92,
"learning_rate": 2.3218574859887907e-08,
"logits/chosen": -2.794159412384033,
"logits/rejected": -2.5210330486297607,
"logps/chosen": -155.2234344482422,
"logps/rejected": -240.988037109375,
"loss": 0.0078,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8834775686264038,
"rewards/margins": 14.685577392578125,
"rewards/rejected": -15.569055557250977,
"step": 1330
},
{
"epoch": 1.93,
"learning_rate": 1.9215372297838268e-08,
"logits/chosen": -2.6487419605255127,
"logits/rejected": -2.3688254356384277,
"logps/chosen": -140.86117553710938,
"logps/rejected": -280.75250244140625,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.05746353790163994,
"rewards/margins": 20.066030502319336,
"rewards/rejected": -20.008569717407227,
"step": 1340
},
{
"epoch": 1.95,
"learning_rate": 1.521216973578863e-08,
"logits/chosen": -2.875211715698242,
"logits/rejected": -2.490218162536621,
"logps/chosen": -172.50912475585938,
"logps/rejected": -273.8616943359375,
"loss": 0.0021,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1841375827789307,
"rewards/margins": 17.2589054107666,
"rewards/rejected": -18.443042755126953,
"step": 1350
},
{
"epoch": 1.96,
"learning_rate": 1.120896717373899e-08,
"logits/chosen": -2.7332968711853027,
"logits/rejected": -2.491285800933838,
"logps/chosen": -150.06475830078125,
"logps/rejected": -376.63824462890625,
"loss": 0.0102,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.1961749792099,
"rewards/margins": 28.852294921875,
"rewards/rejected": -30.048471450805664,
"step": 1360
},
{
"epoch": 1.97,
"learning_rate": 7.205764611689351e-09,
"logits/chosen": -2.6668992042541504,
"logits/rejected": -2.389853000640869,
"logps/chosen": -170.96156311035156,
"logps/rejected": -512.9969482421875,
"loss": 0.0035,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6127735376358032,
"rewards/margins": 40.84336471557617,
"rewards/rejected": -41.456138610839844,
"step": 1370
},
{
"epoch": 1.99,
"learning_rate": 3.2025620496397115e-09,
"logits/chosen": -2.7764010429382324,
"logits/rejected": -2.528985023498535,
"logps/chosen": -173.30990600585938,
"logps/rejected": -363.98358154296875,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7063196897506714,
"rewards/margins": 26.459331512451172,
"rewards/rejected": -27.165653228759766,
"step": 1380
},
{
"epoch": 2.0,
"step": 1388,
"total_flos": 0.0,
"train_loss": 0.04402416471998484,
"train_runtime": 16535.8816,
"train_samples_per_second": 1.341,
"train_steps_per_second": 0.084
}
],
"logging_steps": 10,
"max_steps": 1388,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}