CantoneseLLMChat-v1.0-7B / trainer_state.json
indiejoseph's picture
Upload folder using huggingface_hub
c4ef698 verified
raw
history blame
117 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9959514170040484,
"eval_steps": 500,
"global_step": 1110,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01349527665317139,
"grad_norm": 85.5,
"learning_rate": 2.2522522522522524e-07,
"logits/chosen": -1.500240683555603,
"logits/rejected": -1.5190627574920654,
"logps/chosen": -159.05484008789062,
"logps/rejected": -164.59542846679688,
"loss": 0.6946,
"rewards/accuracies": 0.3499999940395355,
"rewards/chosen": 0.006750366650521755,
"rewards/margins": -0.002313111675903201,
"rewards/rejected": 0.0090634785592556,
"step": 5
},
{
"epoch": 0.02699055330634278,
"grad_norm": 92.5,
"learning_rate": 4.504504504504505e-07,
"logits/chosen": -1.4508098363876343,
"logits/rejected": -1.4352288246154785,
"logps/chosen": -141.31773376464844,
"logps/rejected": -167.95175170898438,
"loss": 0.7035,
"rewards/accuracies": 0.30000001192092896,
"rewards/chosen": -0.00739473570138216,
"rewards/margins": -0.01960981823503971,
"rewards/rejected": 0.01221508253365755,
"step": 10
},
{
"epoch": 0.04048582995951417,
"grad_norm": 74.0,
"learning_rate": 6.756756756756758e-07,
"logits/chosen": -1.3884494304656982,
"logits/rejected": -1.3975419998168945,
"logps/chosen": -192.84548950195312,
"logps/rejected": -180.82046508789062,
"loss": 0.6966,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.004980484023690224,
"rewards/margins": -0.006102551706135273,
"rewards/rejected": 0.011083034798502922,
"step": 15
},
{
"epoch": 0.05398110661268556,
"grad_norm": 99.0,
"learning_rate": 9.00900900900901e-07,
"logits/chosen": -1.4855096340179443,
"logits/rejected": -1.4922425746917725,
"logps/chosen": -148.1718292236328,
"logps/rejected": -152.18133544921875,
"loss": 0.6843,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.002431074623018503,
"rewards/margins": 0.018751021474599838,
"rewards/rejected": -0.016319945454597473,
"step": 20
},
{
"epoch": 0.06747638326585695,
"grad_norm": 113.0,
"learning_rate": 1.1261261261261262e-06,
"logits/chosen": -1.4175087213516235,
"logits/rejected": -1.4836245775222778,
"logps/chosen": -264.17132568359375,
"logps/rejected": -193.3080596923828,
"loss": 0.6911,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.002699580043554306,
"rewards/margins": 0.005426598247140646,
"rewards/rejected": -0.00272701820358634,
"step": 25
},
{
"epoch": 0.08097165991902834,
"grad_norm": 89.0,
"learning_rate": 1.3513513513513515e-06,
"logits/chosen": -1.3333433866500854,
"logits/rejected": -1.4199435710906982,
"logps/chosen": -220.9799041748047,
"logps/rejected": -186.35690307617188,
"loss": 0.688,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.009898080490529537,
"rewards/margins": 0.012090040370821953,
"rewards/rejected": -0.0021919584833085537,
"step": 30
},
{
"epoch": 0.09446693657219973,
"grad_norm": 66.5,
"learning_rate": 1.5765765765765766e-06,
"logits/chosen": -1.5576092004776,
"logits/rejected": -1.493931770324707,
"logps/chosen": -148.85377502441406,
"logps/rejected": -168.85574340820312,
"loss": 0.6811,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.014485938474535942,
"rewards/margins": 0.025426441803574562,
"rewards/rejected": -0.010940502397716045,
"step": 35
},
{
"epoch": 0.10796221322537113,
"grad_norm": 87.5,
"learning_rate": 1.801801801801802e-06,
"logits/chosen": -1.460998296737671,
"logits/rejected": -1.4714558124542236,
"logps/chosen": -165.34341430664062,
"logps/rejected": -167.67092895507812,
"loss": 0.6808,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.018663501366972923,
"rewards/margins": 0.027817577123641968,
"rewards/rejected": -0.009154075756669044,
"step": 40
},
{
"epoch": 0.1214574898785425,
"grad_norm": 93.0,
"learning_rate": 2.0270270270270273e-06,
"logits/chosen": -1.3859444856643677,
"logits/rejected": -1.4024606943130493,
"logps/chosen": -162.58734130859375,
"logps/rejected": -191.04025268554688,
"loss": 0.6846,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.009018613025546074,
"rewards/margins": 0.019761864095926285,
"rewards/rejected": -0.010743250139057636,
"step": 45
},
{
"epoch": 0.1349527665317139,
"grad_norm": 89.5,
"learning_rate": 2.2522522522522524e-06,
"logits/chosen": -1.4222023487091064,
"logits/rejected": -1.54598069190979,
"logps/chosen": -285.5871276855469,
"logps/rejected": -167.19281005859375,
"loss": 0.6684,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.02634511888027191,
"rewards/margins": 0.052618540823459625,
"rewards/rejected": -0.026273420080542564,
"step": 50
},
{
"epoch": 0.1484480431848853,
"grad_norm": 69.5,
"learning_rate": 2.4774774774774775e-06,
"logits/chosen": -1.5841736793518066,
"logits/rejected": -1.516913890838623,
"logps/chosen": -170.33505249023438,
"logps/rejected": -188.19314575195312,
"loss": 0.6639,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.004526221659034491,
"rewards/margins": 0.06425820291042328,
"rewards/rejected": -0.06878442317247391,
"step": 55
},
{
"epoch": 0.16194331983805668,
"grad_norm": 72.0,
"learning_rate": 2.702702702702703e-06,
"logits/chosen": -1.438759207725525,
"logits/rejected": -1.3985353708267212,
"logps/chosen": -198.15411376953125,
"logps/rejected": -208.3758544921875,
"loss": 0.6501,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.047606997191905975,
"rewards/margins": 0.09706764668226242,
"rewards/rejected": -0.049460653215646744,
"step": 60
},
{
"epoch": 0.17543859649122806,
"grad_norm": 164.0,
"learning_rate": 2.927927927927928e-06,
"logits/chosen": -1.4191879034042358,
"logits/rejected": -1.5293009281158447,
"logps/chosen": -217.4423370361328,
"logps/rejected": -202.1327362060547,
"loss": 0.6846,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.014217356219887733,
"rewards/margins": 0.027354473248124123,
"rewards/rejected": -0.013137114234268665,
"step": 65
},
{
"epoch": 0.18893387314439947,
"grad_norm": 75.5,
"learning_rate": 3.1531531531531532e-06,
"logits/chosen": -1.510615587234497,
"logits/rejected": -1.5524317026138306,
"logps/chosen": -277.9597473144531,
"logps/rejected": -174.99221801757812,
"loss": 0.6538,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.01016303151845932,
"rewards/margins": 0.08965723216533661,
"rewards/rejected": -0.07949419319629669,
"step": 70
},
{
"epoch": 0.20242914979757085,
"grad_norm": 127.5,
"learning_rate": 3.3783783783783788e-06,
"logits/chosen": -1.5467108488082886,
"logits/rejected": -1.7057151794433594,
"logps/chosen": -236.87759399414062,
"logps/rejected": -171.19088745117188,
"loss": 0.6316,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.024651767686009407,
"rewards/margins": 0.13629736006259918,
"rewards/rejected": -0.11164556443691254,
"step": 75
},
{
"epoch": 0.21592442645074225,
"grad_norm": 67.0,
"learning_rate": 3.603603603603604e-06,
"logits/chosen": -1.3438420295715332,
"logits/rejected": -1.5014269351959229,
"logps/chosen": -211.7142791748047,
"logps/rejected": -149.79403686523438,
"loss": 0.6296,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.016034509986639023,
"rewards/margins": 0.1428973227739334,
"rewards/rejected": -0.12686282396316528,
"step": 80
},
{
"epoch": 0.22941970310391363,
"grad_norm": 67.0,
"learning_rate": 3.828828828828829e-06,
"logits/chosen": -1.580759048461914,
"logits/rejected": -1.5942776203155518,
"logps/chosen": -186.5341339111328,
"logps/rejected": -198.06871032714844,
"loss": 0.6112,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.011369394138455391,
"rewards/margins": 0.18740348517894745,
"rewards/rejected": -0.1987728774547577,
"step": 85
},
{
"epoch": 0.242914979757085,
"grad_norm": 104.5,
"learning_rate": 4.0540540540540545e-06,
"logits/chosen": -1.5142263174057007,
"logits/rejected": -1.526908040046692,
"logps/chosen": -172.0498504638672,
"logps/rejected": -204.1090545654297,
"loss": 0.5947,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.001767634996213019,
"rewards/margins": 0.23096399009227753,
"rewards/rejected": -0.2327316552400589,
"step": 90
},
{
"epoch": 0.2564102564102564,
"grad_norm": 67.0,
"learning_rate": 4.27927927927928e-06,
"logits/chosen": -1.2964483499526978,
"logits/rejected": -1.287847876548767,
"logps/chosen": -152.49652099609375,
"logps/rejected": -162.25242614746094,
"loss": 0.6261,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.031346581876277924,
"rewards/margins": 0.17656004428863525,
"rewards/rejected": -0.20790663361549377,
"step": 95
},
{
"epoch": 0.2699055330634278,
"grad_norm": 122.0,
"learning_rate": 4.504504504504505e-06,
"logits/chosen": -1.6146646738052368,
"logits/rejected": -1.6288648843765259,
"logps/chosen": -245.85440063476562,
"logps/rejected": -252.33163452148438,
"loss": 0.5388,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.008112089708447456,
"rewards/margins": 0.4617583155632019,
"rewards/rejected": -0.4536462426185608,
"step": 100
},
{
"epoch": 0.2834008097165992,
"grad_norm": 54.75,
"learning_rate": 4.72972972972973e-06,
"logits/chosen": -1.7181060314178467,
"logits/rejected": -1.6348508596420288,
"logps/chosen": -181.34054565429688,
"logps/rejected": -187.49969482421875,
"loss": 0.5332,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.032877303659915924,
"rewards/margins": 0.5002557635307312,
"rewards/rejected": -0.5331330895423889,
"step": 105
},
{
"epoch": 0.2968960863697706,
"grad_norm": 93.5,
"learning_rate": 4.954954954954955e-06,
"logits/chosen": -1.471880555152893,
"logits/rejected": -1.4882009029388428,
"logps/chosen": -239.46017456054688,
"logps/rejected": -203.43408203125,
"loss": 0.639,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.20699986815452576,
"rewards/margins": 0.2872315049171448,
"rewards/rejected": -0.49423137307167053,
"step": 110
},
{
"epoch": 0.31039136302294196,
"grad_norm": 83.5,
"learning_rate": 4.999802215142814e-06,
"logits/chosen": -1.572249174118042,
"logits/rejected": -1.5214914083480835,
"logps/chosen": -181.75244140625,
"logps/rejected": -206.9883270263672,
"loss": 0.4953,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.2786533534526825,
"rewards/margins": 0.6539293527603149,
"rewards/rejected": -0.932582676410675,
"step": 115
},
{
"epoch": 0.32388663967611336,
"grad_norm": 63.25,
"learning_rate": 4.998998767795805e-06,
"logits/chosen": -1.3965647220611572,
"logits/rejected": -1.5122724771499634,
"logps/chosen": -185.1367645263672,
"logps/rejected": -141.9375457763672,
"loss": 0.5188,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.12487339973449707,
"rewards/margins": 0.5116696357727051,
"rewards/rejected": -0.6365430951118469,
"step": 120
},
{
"epoch": 0.33738191632928477,
"grad_norm": 94.5,
"learning_rate": 4.9975774948882615e-06,
"logits/chosen": -1.5592033863067627,
"logits/rejected": -1.5545122623443604,
"logps/chosen": -134.59095764160156,
"logps/rejected": -159.44424438476562,
"loss": 0.5878,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.218244269490242,
"rewards/margins": 0.560061514377594,
"rewards/rejected": -0.7783057689666748,
"step": 125
},
{
"epoch": 0.3508771929824561,
"grad_norm": 159.0,
"learning_rate": 4.995538747800403e-06,
"logits/chosen": -1.5116926431655884,
"logits/rejected": -1.5991663932800293,
"logps/chosen": -196.37417602539062,
"logps/rejected": -162.26467895507812,
"loss": 0.555,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6864209175109863,
"rewards/margins": 0.5580738186836243,
"rewards/rejected": -1.2444946765899658,
"step": 130
},
{
"epoch": 0.3643724696356275,
"grad_norm": 77.5,
"learning_rate": 4.9928830305701164e-06,
"logits/chosen": -1.4444091320037842,
"logits/rejected": -1.404262661933899,
"logps/chosen": -185.04042053222656,
"logps/rejected": -186.958740234375,
"loss": 0.4598,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.22133490443229675,
"rewards/margins": 0.7992109060287476,
"rewards/rejected": -1.0205457210540771,
"step": 135
},
{
"epoch": 0.37786774628879893,
"grad_norm": 50.25,
"learning_rate": 4.98961099976835e-06,
"logits/chosen": -1.5445549488067627,
"logits/rejected": -1.586544156074524,
"logps/chosen": -199.28408813476562,
"logps/rejected": -183.11032104492188,
"loss": 0.4536,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.06479507684707642,
"rewards/margins": 0.9296582341194153,
"rewards/rejected": -0.9944533109664917,
"step": 140
},
{
"epoch": 0.3913630229419703,
"grad_norm": 68.0,
"learning_rate": 4.985723464336783e-06,
"logits/chosen": -1.4274847507476807,
"logits/rejected": -1.4104160070419312,
"logps/chosen": -185.9368896484375,
"logps/rejected": -188.2207489013672,
"loss": 0.4902,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.17553560435771942,
"rewards/margins": 0.6832131743431091,
"rewards/rejected": -0.8587487936019897,
"step": 145
},
{
"epoch": 0.4048582995951417,
"grad_norm": 65.0,
"learning_rate": 4.9812213853878376e-06,
"logits/chosen": -1.6410919427871704,
"logits/rejected": -1.6832342147827148,
"logps/chosen": -168.22726440429688,
"logps/rejected": -165.28591918945312,
"loss": 0.4942,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.19691412150859833,
"rewards/margins": 0.8052200078964233,
"rewards/rejected": -1.002134084701538,
"step": 150
},
{
"epoch": 0.4183535762483131,
"grad_norm": 84.0,
"learning_rate": 4.9761058759670625e-06,
"logits/chosen": -1.4086945056915283,
"logits/rejected": -1.3933309316635132,
"logps/chosen": -200.54226684570312,
"logps/rejected": -191.30516052246094,
"loss": 0.5805,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.38961219787597656,
"rewards/margins": 0.6619648337364197,
"rewards/rejected": -1.051577091217041,
"step": 155
},
{
"epoch": 0.4318488529014845,
"grad_norm": 48.75,
"learning_rate": 4.970378200777949e-06,
"logits/chosen": -1.4240281581878662,
"logits/rejected": -1.5275284051895142,
"logps/chosen": -149.6121826171875,
"logps/rejected": -153.7329864501953,
"loss": 0.3726,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.22904136776924133,
"rewards/margins": 1.2087788581848145,
"rewards/rejected": -1.4378201961517334,
"step": 160
},
{
"epoch": 0.44534412955465585,
"grad_norm": 57.5,
"learning_rate": 4.964039775869271e-06,
"logits/chosen": -1.5353929996490479,
"logits/rejected": -1.5400171279907227,
"logps/chosen": -172.69320678710938,
"logps/rejected": -186.09596252441406,
"loss": 0.4821,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.14687059819698334,
"rewards/margins": 1.0381742715835571,
"rewards/rejected": -1.1850450038909912,
"step": 165
},
{
"epoch": 0.45883940620782726,
"grad_norm": 68.5,
"learning_rate": 4.957092168284987e-06,
"logits/chosen": -1.5351091623306274,
"logits/rejected": -1.480067253112793,
"logps/chosen": -224.7134246826172,
"logps/rejected": -280.2825012207031,
"loss": 0.4522,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.15150094032287598,
"rewards/margins": 0.8322998881340027,
"rewards/rejected": -0.9838007092475891,
"step": 170
},
{
"epoch": 0.47233468286099867,
"grad_norm": 47.25,
"learning_rate": 4.949537095676824e-06,
"logits/chosen": -1.5415345430374146,
"logits/rejected": -1.4604427814483643,
"logps/chosen": -173.94085693359375,
"logps/rejected": -215.93075561523438,
"loss": 0.45,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.3776322901248932,
"rewards/margins": 1.5937398672103882,
"rewards/rejected": -1.9713722467422485,
"step": 175
},
{
"epoch": 0.48582995951417,
"grad_norm": 95.5,
"learning_rate": 4.9413764258796236e-06,
"logits/chosen": -1.5088344812393188,
"logits/rejected": -1.6158044338226318,
"logps/chosen": -273.03594970703125,
"logps/rejected": -221.93997192382812,
"loss": 0.5881,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.25630080699920654,
"rewards/margins": 0.5983410477638245,
"rewards/rejected": -0.8546417951583862,
"step": 180
},
{
"epoch": 0.4993252361673414,
"grad_norm": 83.0,
"learning_rate": 4.93261217644956e-06,
"logits/chosen": -1.3866004943847656,
"logits/rejected": -1.363396406173706,
"logps/chosen": -211.2840576171875,
"logps/rejected": -256.87811279296875,
"loss": 0.4912,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.24753907322883606,
"rewards/margins": 0.9087351560592651,
"rewards/rejected": -1.1562741994857788,
"step": 185
},
{
"epoch": 0.5128205128205128,
"grad_norm": 79.0,
"learning_rate": 4.923246514165339e-06,
"logits/chosen": -1.357788324356079,
"logits/rejected": -1.322389841079712,
"logps/chosen": -221.6494598388672,
"logps/rejected": -238.56637573242188,
"loss": 0.3841,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.21661829948425293,
"rewards/margins": 1.6020748615264893,
"rewards/rejected": -1.8186931610107422,
"step": 190
},
{
"epoch": 0.5263157894736842,
"grad_norm": 78.0,
"learning_rate": 4.913281754492509e-06,
"logits/chosen": -1.5164716243743896,
"logits/rejected": -1.5658130645751953,
"logps/chosen": -211.942138671875,
"logps/rejected": -251.4232177734375,
"loss": 0.439,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.2759682238101959,
"rewards/margins": 1.2201299667358398,
"rewards/rejected": -1.4960981607437134,
"step": 195
},
{
"epoch": 0.5398110661268556,
"grad_norm": 68.0,
"learning_rate": 4.902720361011007e-06,
"logits/chosen": -1.43938148021698,
"logits/rejected": -1.4012665748596191,
"logps/chosen": -198.0753936767578,
"logps/rejected": -230.1431121826172,
"loss": 0.436,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.4660988748073578,
"rewards/margins": 1.3129799365997314,
"rewards/rejected": -1.7790788412094116,
"step": 200
},
{
"epoch": 0.553306342780027,
"grad_norm": 116.0,
"learning_rate": 4.891564944806095e-06,
"logits/chosen": -1.3829123973846436,
"logits/rejected": -1.4532912969589233,
"logps/chosen": -204.92056274414062,
"logps/rejected": -184.2178192138672,
"loss": 0.4408,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4832437038421631,
"rewards/margins": 1.4000451564788818,
"rewards/rejected": -1.8832887411117554,
"step": 205
},
{
"epoch": 0.5668016194331984,
"grad_norm": 39.0,
"learning_rate": 4.879818263822816e-06,
"logits/chosen": -1.5301909446716309,
"logits/rejected": -1.4669263362884521,
"logps/chosen": -176.71139526367188,
"logps/rejected": -210.8941192626953,
"loss": 0.4359,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.7508934140205383,
"rewards/margins": 1.5884822607040405,
"rewards/rejected": -2.3393757343292236,
"step": 210
},
{
"epoch": 0.5802968960863698,
"grad_norm": 118.5,
"learning_rate": 4.867483222184158e-06,
"logits/chosen": -1.4969114065170288,
"logits/rejected": -1.4513076543807983,
"logps/chosen": -183.51742553710938,
"logps/rejected": -234.21078491210938,
"loss": 0.4083,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.1092641353607178,
"rewards/margins": 2.7672932147979736,
"rewards/rejected": -4.876556873321533,
"step": 215
},
{
"epoch": 0.5937921727395412,
"grad_norm": 82.5,
"learning_rate": 4.854562869473063e-06,
"logits/chosen": -1.6114156246185303,
"logits/rejected": -1.6086403131484985,
"logps/chosen": -158.5917510986328,
"logps/rejected": -182.981689453125,
"loss": 0.5288,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8133976459503174,
"rewards/margins": 2.3693175315856934,
"rewards/rejected": -4.182714939117432,
"step": 220
},
{
"epoch": 0.6072874493927125,
"grad_norm": 64.5,
"learning_rate": 4.841060399978481e-06,
"logits/chosen": -1.4258265495300293,
"logits/rejected": -1.5041557550430298,
"logps/chosen": -203.29505920410156,
"logps/rejected": -173.55667114257812,
"loss": 0.467,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.451561838388443,
"rewards/margins": 0.9895628094673157,
"rewards/rejected": -1.4411247968673706,
"step": 225
},
{
"epoch": 0.6207827260458839,
"grad_norm": 53.75,
"learning_rate": 4.826979151905655e-06,
"logits/chosen": -1.3954380750656128,
"logits/rejected": -1.4369020462036133,
"logps/chosen": -133.7052764892578,
"logps/rejected": -152.63189697265625,
"loss": 0.3819,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.21247024834156036,
"rewards/margins": 1.1218936443328857,
"rewards/rejected": -1.3343639373779297,
"step": 230
},
{
"epoch": 0.6342780026990553,
"grad_norm": 34.25,
"learning_rate": 4.812322606550813e-06,
"logits/chosen": -1.477416753768921,
"logits/rejected": -1.35099196434021,
"logps/chosen": -183.8603057861328,
"logps/rejected": -200.47122192382812,
"loss": 0.403,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.22856561839580536,
"rewards/margins": 1.1782000064849854,
"rewards/rejected": -1.4067654609680176,
"step": 235
},
{
"epoch": 0.6477732793522267,
"grad_norm": 142.0,
"learning_rate": 4.7970943874404904e-06,
"logits/chosen": -1.5746204853057861,
"logits/rejected": -1.5317301750183105,
"logps/chosen": -132.62966918945312,
"logps/rejected": -169.4604034423828,
"loss": 0.4905,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.2887588441371918,
"rewards/margins": 1.0178512334823608,
"rewards/rejected": -1.3066102266311646,
"step": 240
},
{
"epoch": 0.6612685560053981,
"grad_norm": 81.5,
"learning_rate": 4.781298259435691e-06,
"logits/chosen": -1.4620139598846436,
"logits/rejected": -1.5366100072860718,
"logps/chosen": -207.0232696533203,
"logps/rejected": -182.5987548828125,
"loss": 0.3498,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.38011789321899414,
"rewards/margins": 1.517073392868042,
"rewards/rejected": -1.8971912860870361,
"step": 245
},
{
"epoch": 0.6747638326585695,
"grad_norm": 59.0,
"learning_rate": 4.7649381278011e-06,
"logits/chosen": -1.525059700012207,
"logits/rejected": -1.4892899990081787,
"logps/chosen": -132.02548217773438,
"logps/rejected": -172.75595092773438,
"loss": 0.4596,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.47625675797462463,
"rewards/margins": 1.6200672388076782,
"rewards/rejected": -2.0963237285614014,
"step": 250
},
{
"epoch": 0.6882591093117408,
"grad_norm": 93.5,
"learning_rate": 4.748018037239592e-06,
"logits/chosen": -1.6185624599456787,
"logits/rejected": -1.6007747650146484,
"logps/chosen": -190.04196166992188,
"logps/rejected": -271.9373474121094,
"loss": 0.377,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.29186195135116577,
"rewards/margins": 1.4247747659683228,
"rewards/rejected": -1.7166366577148438,
"step": 255
},
{
"epoch": 0.7017543859649122,
"grad_norm": 54.75,
"learning_rate": 4.7305421708922596e-06,
"logits/chosen": -1.5387685298919678,
"logits/rejected": -1.4462766647338867,
"logps/chosen": -199.54568481445312,
"logps/rejected": -219.14901733398438,
"loss": 0.5013,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.4976847767829895,
"rewards/margins": 1.649714708328247,
"rewards/rejected": -2.147399425506592,
"step": 260
},
{
"epoch": 0.7152496626180836,
"grad_norm": 92.0,
"learning_rate": 4.712514849304219e-06,
"logits/chosen": -1.4592026472091675,
"logits/rejected": -1.5086675882339478,
"logps/chosen": -203.43939208984375,
"logps/rejected": -182.27008056640625,
"loss": 0.3704,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.30615222454071045,
"rewards/margins": 1.7558097839355469,
"rewards/rejected": -2.0619618892669678,
"step": 265
},
{
"epoch": 0.728744939271255,
"grad_norm": 94.0,
"learning_rate": 4.693940529356444e-06,
"logits/chosen": -1.5462654829025269,
"logits/rejected": -1.5494886636734009,
"logps/chosen": -204.8282470703125,
"logps/rejected": -262.1166076660156,
"loss": 0.4081,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.18543025851249695,
"rewards/margins": 1.581555724143982,
"rewards/rejected": -1.7669861316680908,
"step": 270
},
{
"epoch": 0.7422402159244265,
"grad_norm": 49.5,
"learning_rate": 4.674823803163899e-06,
"logits/chosen": -1.5121240615844727,
"logits/rejected": -1.378418207168579,
"logps/chosen": -176.5196533203125,
"logps/rejected": -259.83154296875,
"loss": 0.2792,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.2915424704551697,
"rewards/margins": 2.276181697845459,
"rewards/rejected": -2.5677244663238525,
"step": 275
},
{
"epoch": 0.7557354925775979,
"grad_norm": 63.5,
"learning_rate": 4.655169396940229e-06,
"logits/chosen": -1.488743782043457,
"logits/rejected": -1.4984915256500244,
"logps/chosen": -227.04574584960938,
"logps/rejected": -223.5692596435547,
"loss": 0.3756,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.3987753987312317,
"rewards/margins": 1.6648337841033936,
"rewards/rejected": -2.0636088848114014,
"step": 280
},
{
"epoch": 0.7692307692307693,
"grad_norm": 62.75,
"learning_rate": 4.6349821698293025e-06,
"logits/chosen": -1.4782928228378296,
"logits/rejected": -1.480554223060608,
"logps/chosen": -168.77146911621094,
"logps/rejected": -283.3312683105469,
"loss": 0.3639,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.32490164041519165,
"rewards/margins": 1.6070611476898193,
"rewards/rejected": -1.9319626092910767,
"step": 285
},
{
"epoch": 0.7827260458839406,
"grad_norm": 85.0,
"learning_rate": 4.6142671127038905e-06,
"logits/chosen": -1.5204181671142578,
"logits/rejected": -1.4846007823944092,
"logps/chosen": -122.49859619140625,
"logps/rejected": -159.67666625976562,
"loss": 0.3855,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.5428152680397034,
"rewards/margins": 1.4056587219238281,
"rewards/rejected": -1.9484741687774658,
"step": 290
},
{
"epoch": 0.796221322537112,
"grad_norm": 124.5,
"learning_rate": 4.593029346931777e-06,
"logits/chosen": -1.5233218669891357,
"logits/rejected": -1.4880311489105225,
"logps/chosen": -190.8978271484375,
"logps/rejected": -212.50808715820312,
"loss": 0.4094,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.5791584253311157,
"rewards/margins": 1.7821210622787476,
"rewards/rejected": -2.3612794876098633,
"step": 295
},
{
"epoch": 0.8097165991902834,
"grad_norm": 121.0,
"learning_rate": 4.571274123109606e-06,
"logits/chosen": -1.5600152015686035,
"logits/rejected": -1.5772325992584229,
"logps/chosen": -211.6980438232422,
"logps/rejected": -159.11520385742188,
"loss": 0.5103,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5033570528030396,
"rewards/margins": 1.3233957290649414,
"rewards/rejected": -1.8267529010772705,
"step": 300
},
{
"epoch": 0.8232118758434548,
"grad_norm": 87.0,
"learning_rate": 4.549006819764779e-06,
"logits/chosen": -1.3667839765548706,
"logits/rejected": -1.408111333847046,
"logps/chosen": -252.8665008544922,
"logps/rejected": -246.56600952148438,
"loss": 0.6645,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4156951904296875,
"rewards/margins": 0.9969050288200378,
"rewards/rejected": -1.4126002788543701,
"step": 305
},
{
"epoch": 0.8367071524966262,
"grad_norm": 65.0,
"learning_rate": 4.52623294202573e-06,
"logits/chosen": -1.5357733964920044,
"logits/rejected": -1.6000627279281616,
"logps/chosen": -203.2954864501953,
"logps/rejected": -178.47378540039062,
"loss": 0.3625,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.1179068312048912,
"rewards/margins": 1.5459201335906982,
"rewards/rejected": -1.6638271808624268,
"step": 310
},
{
"epoch": 0.8502024291497976,
"grad_norm": 38.75,
"learning_rate": 4.502958120260894e-06,
"logits/chosen": -1.4177687168121338,
"logits/rejected": -1.466953992843628,
"logps/chosen": -208.93142700195312,
"logps/rejected": -204.0532989501953,
"loss": 0.3943,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.10137398540973663,
"rewards/margins": 1.5527517795562744,
"rewards/rejected": -1.6541255712509155,
"step": 315
},
{
"epoch": 0.863697705802969,
"grad_norm": 94.5,
"learning_rate": 4.479188108686714e-06,
"logits/chosen": -1.543738603591919,
"logits/rejected": -1.5562658309936523,
"logps/chosen": -195.75601196289062,
"logps/rejected": -243.9476776123047,
"loss": 0.393,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.09615819901227951,
"rewards/margins": 1.808638334274292,
"rewards/rejected": -1.9047966003417969,
"step": 320
},
{
"epoch": 0.8771929824561403,
"grad_norm": 53.25,
"learning_rate": 4.454928783945033e-06,
"logits/chosen": -1.4368815422058105,
"logits/rejected": -1.465288519859314,
"logps/chosen": -182.02488708496094,
"logps/rejected": -166.5155487060547,
"loss": 0.3673,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.09929310530424118,
"rewards/margins": 1.477452039718628,
"rewards/rejected": -1.5767452716827393,
"step": 325
},
{
"epoch": 0.8906882591093117,
"grad_norm": 94.5,
"learning_rate": 4.430186143650216e-06,
"logits/chosen": -1.3891671895980835,
"logits/rejected": -1.3638372421264648,
"logps/chosen": -167.63204956054688,
"logps/rejected": -166.39913940429688,
"loss": 0.4332,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.18427793681621552,
"rewards/margins": 1.2914403676986694,
"rewards/rejected": -1.4757182598114014,
"step": 330
},
{
"epoch": 0.9041835357624831,
"grad_norm": 68.5,
"learning_rate": 4.404966304906363e-06,
"logits/chosen": -1.5300304889678955,
"logits/rejected": -1.541245698928833,
"logps/chosen": -237.1887969970703,
"logps/rejected": -258.4833984375,
"loss": 0.2851,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2509092092514038,
"rewards/margins": 2.2454378604888916,
"rewards/rejected": -2.496346950531006,
"step": 335
},
{
"epoch": 0.9176788124156545,
"grad_norm": 91.5,
"learning_rate": 4.379275502794984e-06,
"logits/chosen": -1.4159671068191528,
"logits/rejected": -1.3942148685455322,
"logps/chosen": -204.76268005371094,
"logps/rejected": -194.83755493164062,
"loss": 0.3974,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.590856671333313,
"rewards/margins": 1.8947960138320923,
"rewards/rejected": -2.4856529235839844,
"step": 340
},
{
"epoch": 0.9311740890688259,
"grad_norm": 24.875,
"learning_rate": 4.3531200888335015e-06,
"logits/chosen": -1.499260663986206,
"logits/rejected": -1.5041369199752808,
"logps/chosen": -158.403076171875,
"logps/rejected": -188.42300415039062,
"loss": 0.3399,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.4716406464576721,
"rewards/margins": 2.255904197692871,
"rewards/rejected": -2.7275447845458984,
"step": 345
},
{
"epoch": 0.9446693657219973,
"grad_norm": 49.0,
"learning_rate": 4.326506529404973e-06,
"logits/chosen": -1.4987239837646484,
"logits/rejected": -1.5489791631698608,
"logps/chosen": -228.030517578125,
"logps/rejected": -199.24453735351562,
"loss": 0.4954,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.5366212129592896,
"rewards/margins": 1.576836347579956,
"rewards/rejected": -2.113457441329956,
"step": 350
},
{
"epoch": 0.9581646423751687,
"grad_norm": 50.5,
"learning_rate": 4.299441404159409e-06,
"logits/chosen": -1.4427543878555298,
"logits/rejected": -1.4410443305969238,
"logps/chosen": -142.67196655273438,
"logps/rejected": -182.15530395507812,
"loss": 0.3882,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.45489731431007385,
"rewards/margins": 1.885206937789917,
"rewards/rejected": -2.340104341506958,
"step": 355
},
{
"epoch": 0.97165991902834,
"grad_norm": 71.0,
"learning_rate": 4.271931404387096e-06,
"logits/chosen": -1.4958666563034058,
"logits/rejected": -1.4852968454360962,
"logps/chosen": -203.7172088623047,
"logps/rejected": -223.72958374023438,
"loss": 0.3129,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.4084866940975189,
"rewards/margins": 2.0505545139312744,
"rewards/rejected": -2.459041118621826,
"step": 360
},
{
"epoch": 0.9851551956815114,
"grad_norm": 72.0,
"learning_rate": 4.243983331364307e-06,
"logits/chosen": -1.6051279306411743,
"logits/rejected": -1.5763704776763916,
"logps/chosen": -156.02700805664062,
"logps/rejected": -212.16317749023438,
"loss": 0.4821,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.6169974207878113,
"rewards/margins": 1.195291519165039,
"rewards/rejected": -1.8122888803482056,
"step": 365
},
{
"epoch": 0.9986504723346828,
"grad_norm": 91.0,
"learning_rate": 4.215604094671835e-06,
"logits/chosen": -1.5946276187896729,
"logits/rejected": -1.525407075881958,
"logps/chosen": -190.231689453125,
"logps/rejected": -210.0182342529297,
"loss": 0.4743,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.5886441469192505,
"rewards/margins": 1.6572681665420532,
"rewards/rejected": -2.245912551879883,
"step": 370
},
{
"epoch": 1.0121457489878543,
"grad_norm": 71.5,
"learning_rate": 4.186800710486732e-06,
"logits/chosen": -1.503097414970398,
"logits/rejected": -1.4615429639816284,
"logps/chosen": -177.4516143798828,
"logps/rejected": -223.7339324951172,
"loss": 0.2691,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.2226639688014984,
"rewards/margins": 2.2762439250946045,
"rewards/rejected": -2.4989078044891357,
"step": 375
},
{
"epoch": 1.0256410256410255,
"grad_norm": 16.75,
"learning_rate": 4.157580299847717e-06,
"logits/chosen": -1.4365036487579346,
"logits/rejected": -1.4489128589630127,
"logps/chosen": -185.9925994873047,
"logps/rejected": -210.19802856445312,
"loss": 0.126,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1753823310136795,
"rewards/margins": 3.160768508911133,
"rewards/rejected": -3.336151123046875,
"step": 380
},
{
"epoch": 1.039136302294197,
"grad_norm": 27.125,
"learning_rate": 4.12795008689464e-06,
"logits/chosen": -1.4434540271759033,
"logits/rejected": -1.5021578073501587,
"logps/chosen": -210.2549591064453,
"logps/rejected": -247.6964569091797,
"loss": 0.2329,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.21405327320098877,
"rewards/margins": 2.4333832263946533,
"rewards/rejected": -2.219329833984375,
"step": 385
},
{
"epoch": 1.0526315789473684,
"grad_norm": 29.5,
"learning_rate": 4.0979173970824626e-06,
"logits/chosen": -1.5133657455444336,
"logits/rejected": -1.5038350820541382,
"logps/chosen": -187.3416290283203,
"logps/rejected": -197.63766479492188,
"loss": 0.1885,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0815470814704895,
"rewards/margins": 2.5452542304992676,
"rewards/rejected": -2.463707447052002,
"step": 390
},
{
"epoch": 1.0661268556005399,
"grad_norm": 11.3125,
"learning_rate": 4.067489655370197e-06,
"logits/chosen": -1.486011028289795,
"logits/rejected": -1.5427876710891724,
"logps/chosen": -248.8966064453125,
"logps/rejected": -205.6848602294922,
"loss": 0.1103,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.543197751045227,
"rewards/margins": 3.468106746673584,
"rewards/rejected": -2.9249091148376465,
"step": 395
},
{
"epoch": 1.0796221322537112,
"grad_norm": 21.625,
"learning_rate": 4.0366743843852315e-06,
"logits/chosen": -1.4536128044128418,
"logits/rejected": -1.39426851272583,
"logps/chosen": -157.4046173095703,
"logps/rejected": -206.4637451171875,
"loss": 0.1189,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.14299368858337402,
"rewards/margins": 3.642580032348633,
"rewards/rejected": -3.7855734825134277,
"step": 400
},
{
"epoch": 1.0931174089068827,
"grad_norm": 73.0,
"learning_rate": 4.005479202563524e-06,
"logits/chosen": -1.4207379817962646,
"logits/rejected": -1.4653427600860596,
"logps/chosen": -175.64657592773438,
"logps/rejected": -188.96347045898438,
"loss": 0.113,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.22152826189994812,
"rewards/margins": 3.9064407348632812,
"rewards/rejected": -4.127968788146973,
"step": 405
},
{
"epoch": 1.106612685560054,
"grad_norm": 22.5,
"learning_rate": 3.973911822266099e-06,
"logits/chosen": -1.3683284521102905,
"logits/rejected": -1.4073810577392578,
"logps/chosen": -200.2495880126953,
"logps/rejected": -196.02499389648438,
"loss": 0.1506,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.4190312922000885,
"rewards/margins": 3.017284393310547,
"rewards/rejected": -3.4363160133361816,
"step": 410
},
{
"epoch": 1.1201079622132253,
"grad_norm": 61.0,
"learning_rate": 3.941980047872324e-06,
"logits/chosen": -1.3142037391662598,
"logits/rejected": -1.3677208423614502,
"logps/chosen": -200.49827575683594,
"logps/rejected": -213.0048828125,
"loss": 0.2229,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.29499849677085876,
"rewards/margins": 2.430476188659668,
"rewards/rejected": -2.7254748344421387,
"step": 415
},
{
"epoch": 1.1336032388663968,
"grad_norm": 33.5,
"learning_rate": 3.9096917738504445e-06,
"logits/chosen": -1.5029326677322388,
"logits/rejected": -1.522037386894226,
"logps/chosen": -211.3799285888672,
"logps/rejected": -195.49777221679688,
"loss": 0.2023,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.20138970017433167,
"rewards/margins": 3.0471653938293457,
"rewards/rejected": -3.2485554218292236,
"step": 420
},
{
"epoch": 1.147098515519568,
"grad_norm": 67.5,
"learning_rate": 3.877054982805835e-06,
"logits/chosen": -1.503327488899231,
"logits/rejected": -1.5182857513427734,
"logps/chosen": -206.69345092773438,
"logps/rejected": -220.8511505126953,
"loss": 0.2,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.075591079890728,
"rewards/margins": 3.3199775218963623,
"rewards/rejected": -3.39556884765625,
"step": 425
},
{
"epoch": 1.1605937921727396,
"grad_norm": 41.25,
"learning_rate": 3.844077743507468e-06,
"logits/chosen": -1.4972890615463257,
"logits/rejected": -1.4547359943389893,
"logps/chosen": -190.38272094726562,
"logps/rejected": -237.7483367919922,
"loss": 0.1763,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.00510750338435173,
"rewards/margins": 3.4418201446533203,
"rewards/rejected": -3.446927309036255,
"step": 430
},
{
"epoch": 1.174089068825911,
"grad_norm": 43.0,
"learning_rate": 3.8107682088930797e-06,
"logits/chosen": -1.5898491144180298,
"logits/rejected": -1.628394365310669,
"logps/chosen": -209.7681884765625,
"logps/rejected": -223.9811248779297,
"loss": 0.2875,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.15720273554325104,
"rewards/margins": 2.540311336517334,
"rewards/rejected": -2.697514057159424,
"step": 435
},
{
"epoch": 1.1875843454790824,
"grad_norm": 19.875,
"learning_rate": 3.777134614053522e-06,
"logits/chosen": -1.3833550214767456,
"logits/rejected": -1.3048458099365234,
"logps/chosen": -153.44886779785156,
"logps/rejected": -187.23211669921875,
"loss": 0.2094,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.15450401604175568,
"rewards/margins": 2.7406177520751953,
"rewards/rejected": -2.8951218128204346,
"step": 440
},
{
"epoch": 1.2010796221322537,
"grad_norm": 25.25,
"learning_rate": 3.7431852741968104e-06,
"logits/chosen": -1.5894601345062256,
"logits/rejected": -1.4398654699325562,
"logps/chosen": -161.95870971679688,
"logps/rejected": -259.89544677734375,
"loss": 0.2674,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.3261250853538513,
"rewards/margins": 2.652719497680664,
"rewards/rejected": -2.9788451194763184,
"step": 445
},
{
"epoch": 1.214574898785425,
"grad_norm": 25.625,
"learning_rate": 3.7089285825923614e-06,
"logits/chosen": -1.481194257736206,
"logits/rejected": -1.4744828939437866,
"logps/chosen": -136.75341796875,
"logps/rejected": -182.98255920410156,
"loss": 0.216,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.15366610884666443,
"rewards/margins": 2.4346675872802734,
"rewards/rejected": -2.5883336067199707,
"step": 450
},
{
"epoch": 1.2280701754385965,
"grad_norm": 59.0,
"learning_rate": 3.6743730084959275e-06,
"logits/chosen": -1.4641847610473633,
"logits/rejected": -1.4495608806610107,
"logps/chosen": -226.5570068359375,
"logps/rejected": -231.99484252929688,
"loss": 0.1606,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.0351928249001503,
"rewards/margins": 2.678950071334839,
"rewards/rejected": -2.6437573432922363,
"step": 455
},
{
"epoch": 1.2415654520917678,
"grad_norm": 29.125,
"learning_rate": 3.639527095055753e-06,
"logits/chosen": -1.4890583753585815,
"logits/rejected": -1.4146323204040527,
"logps/chosen": -211.8848419189453,
"logps/rejected": -223.7265167236328,
"loss": 0.1515,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.1912011355161667,
"rewards/margins": 3.216825008392334,
"rewards/rejected": -3.4080262184143066,
"step": 460
},
{
"epoch": 1.2550607287449393,
"grad_norm": 28.5,
"learning_rate": 3.604399457200458e-06,
"logits/chosen": -1.5582194328308105,
"logits/rejected": -1.530056357383728,
"logps/chosen": -174.59786987304688,
"logps/rejected": -235.314697265625,
"loss": 0.1586,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.02259807661175728,
"rewards/margins": 3.3205082416534424,
"rewards/rejected": -3.343106508255005,
"step": 465
},
{
"epoch": 1.2685560053981106,
"grad_norm": 47.0,
"learning_rate": 3.5689987795091735e-06,
"logits/chosen": -1.5336169004440308,
"logits/rejected": -1.5555146932601929,
"logps/chosen": -192.9527587890625,
"logps/rejected": -217.05029296875,
"loss": 0.1666,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.11079835891723633,
"rewards/margins": 2.9511632919311523,
"rewards/rejected": -3.0619616508483887,
"step": 470
},
{
"epoch": 1.282051282051282,
"grad_norm": 31.5,
"learning_rate": 3.5333338140644602e-06,
"logits/chosen": -1.567378044128418,
"logits/rejected": -1.5020748376846313,
"logps/chosen": -151.2008819580078,
"logps/rejected": -193.5251007080078,
"loss": 0.1562,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.06739845871925354,
"rewards/margins": 2.88545560836792,
"rewards/rejected": -2.81805682182312,
"step": 475
},
{
"epoch": 1.2955465587044535,
"grad_norm": 27.625,
"learning_rate": 3.497413378288541e-06,
"logits/chosen": -1.558091402053833,
"logits/rejected": -1.5880284309387207,
"logps/chosen": -208.2618408203125,
"logps/rejected": -215.33065795898438,
"loss": 0.1537,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.08185993134975433,
"rewards/margins": 2.7448792457580566,
"rewards/rejected": -2.8267390727996826,
"step": 480
},
{
"epoch": 1.3090418353576248,
"grad_norm": 21.0,
"learning_rate": 3.4612463527633728e-06,
"logits/chosen": -1.517230749130249,
"logits/rejected": -1.5125114917755127,
"logps/chosen": -165.6942138671875,
"logps/rejected": -177.20965576171875,
"loss": 0.1097,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.21497321128845215,
"rewards/margins": 3.283679485321045,
"rewards/rejected": -3.498652935028076,
"step": 485
},
{
"epoch": 1.3225371120107963,
"grad_norm": 58.25,
"learning_rate": 3.4248416790351086e-06,
"logits/chosen": -1.4563219547271729,
"logits/rejected": -1.4463237524032593,
"logps/chosen": -222.70803833007812,
"logps/rejected": -276.1205139160156,
"loss": 0.1741,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.18195849657058716,
"rewards/margins": 3.079150438308716,
"rewards/rejected": -3.2611091136932373,
"step": 490
},
{
"epoch": 1.3360323886639676,
"grad_norm": 26.5,
"learning_rate": 3.3882083574034847e-06,
"logits/chosen": -1.495981216430664,
"logits/rejected": -1.510833501815796,
"logps/chosen": -217.92416381835938,
"logps/rejected": -232.9659881591797,
"loss": 0.1075,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07798905670642853,
"rewards/margins": 3.7004494667053223,
"rewards/rejected": -3.6224606037139893,
"step": 495
},
{
"epoch": 1.349527665317139,
"grad_norm": 11.625,
"learning_rate": 3.3513554446966846e-06,
"logits/chosen": -1.607877492904663,
"logits/rejected": -1.5209126472473145,
"logps/chosen": -145.24710083007812,
"logps/rejected": -269.81951904296875,
"loss": 0.0835,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.1404910534620285,
"rewards/margins": 3.947847843170166,
"rewards/rejected": -4.088338375091553,
"step": 500
},
{
"epoch": 1.349527665317139,
"eval_logits/chosen": -1.5215187072753906,
"eval_logits/rejected": -1.5562808513641357,
"eval_logps/chosen": -190.62527465820312,
"eval_logps/rejected": -222.86770629882812,
"eval_loss": 0.3281523883342743,
"eval_rewards/accuracies": 0.849397599697113,
"eval_rewards/chosen": -0.6181024312973022,
"eval_rewards/margins": 2.1862471103668213,
"eval_rewards/rejected": -2.804349660873413,
"eval_runtime": 23.4839,
"eval_samples_per_second": 14.052,
"eval_steps_per_second": 3.534,
"step": 500
},
{
"epoch": 1.3630229419703104,
"grad_norm": 25.625,
"learning_rate": 3.314292052032227e-06,
"logits/chosen": -1.4269988536834717,
"logits/rejected": -1.5553017854690552,
"logps/chosen": -245.88330078125,
"logps/rejected": -144.62518310546875,
"loss": 0.2057,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.03630426153540611,
"rewards/margins": 2.8089230060577393,
"rewards/rejected": -2.8452274799346924,
"step": 505
},
{
"epoch": 1.376518218623482,
"grad_norm": 42.75,
"learning_rate": 3.2770273425644285e-06,
"logits/chosen": -1.3818541765213013,
"logits/rejected": -1.31718909740448,
"logps/chosen": -194.84194946289062,
"logps/rejected": -197.08505249023438,
"loss": 0.1862,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.11487498134374619,
"rewards/margins": 3.057730197906494,
"rewards/rejected": -3.172605276107788,
"step": 510
},
{
"epoch": 1.3900134952766532,
"grad_norm": 29.0,
"learning_rate": 3.2395705292190067e-06,
"logits/chosen": -1.467614769935608,
"logits/rejected": -1.438024640083313,
"logps/chosen": -180.7233428955078,
"logps/rejected": -217.57559204101562,
"loss": 0.1711,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.10827420651912689,
"rewards/margins": 3.1877129077911377,
"rewards/rejected": -3.295987367630005,
"step": 515
},
{
"epoch": 1.4035087719298245,
"grad_norm": 12.125,
"learning_rate": 3.2019308724153743e-06,
"logits/chosen": -1.4175347089767456,
"logits/rejected": -1.5785712003707886,
"logps/chosen": -196.76730346679688,
"logps/rejected": -179.5243377685547,
"loss": 0.114,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12531700730323792,
"rewards/margins": 3.2615838050842285,
"rewards/rejected": -3.1362667083740234,
"step": 520
},
{
"epoch": 1.417004048582996,
"grad_norm": 27.375,
"learning_rate": 3.164117677777191e-06,
"logits/chosen": -1.5264801979064941,
"logits/rejected": -1.6040115356445312,
"logps/chosen": -150.361328125,
"logps/rejected": -164.02816772460938,
"loss": 0.1757,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.4109339118003845,
"rewards/margins": 3.098153591156006,
"rewards/rejected": -3.509087324142456,
"step": 525
},
{
"epoch": 1.4304993252361673,
"grad_norm": 38.25,
"learning_rate": 3.1261402938317465e-06,
"logits/chosen": -1.5730303525924683,
"logits/rejected": -1.6026499271392822,
"logps/chosen": -164.3070831298828,
"logps/rejected": -246.06338500976562,
"loss": 0.1532,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.005527207162231207,
"rewards/margins": 3.9187304973602295,
"rewards/rejected": -3.913203477859497,
"step": 530
},
{
"epoch": 1.4439946018893388,
"grad_norm": 20.375,
"learning_rate": 3.088008109698726e-06,
"logits/chosen": -1.444838285446167,
"logits/rejected": -1.5232534408569336,
"logps/chosen": -194.70555114746094,
"logps/rejected": -218.77590942382812,
"loss": 0.1892,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.10898719727993011,
"rewards/margins": 3.2815093994140625,
"rewards/rejected": -3.1725223064422607,
"step": 535
},
{
"epoch": 1.45748987854251,
"grad_norm": 43.0,
"learning_rate": 3.0497305527689446e-06,
"logits/chosen": -1.4176692962646484,
"logits/rejected": -1.4581646919250488,
"logps/chosen": -190.53550720214844,
"logps/rejected": -202.92530822753906,
"loss": 0.1852,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.12854930758476257,
"rewards/margins": 3.113678216934204,
"rewards/rejected": -3.242227554321289,
"step": 540
},
{
"epoch": 1.4709851551956814,
"grad_norm": 42.0,
"learning_rate": 3.011317086373628e-06,
"logits/chosen": -1.4024337530136108,
"logits/rejected": -1.4260265827178955,
"logps/chosen": -222.62124633789062,
"logps/rejected": -228.56295776367188,
"loss": 0.1847,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.02214776910841465,
"rewards/margins": 3.127570629119873,
"rewards/rejected": -3.1497180461883545,
"step": 545
},
{
"epoch": 1.484480431848853,
"grad_norm": 38.5,
"learning_rate": 2.9727772074447916e-06,
"logits/chosen": -1.4362146854400635,
"logits/rejected": -1.4737937450408936,
"logps/chosen": -190.13218688964844,
"logps/rejected": -182.9353790283203,
"loss": 0.1473,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.015029204078018665,
"rewards/margins": 3.5559723377227783,
"rewards/rejected": -3.5710015296936035,
"step": 550
},
{
"epoch": 1.4979757085020242,
"grad_norm": 105.0,
"learning_rate": 2.9341204441673267e-06,
"logits/chosen": -1.5892771482467651,
"logits/rejected": -1.5846550464630127,
"logps/chosen": -128.55142211914062,
"logps/rejected": -169.93356323242188,
"loss": 0.2029,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.536339282989502,
"rewards/margins": 2.9536054134368896,
"rewards/rejected": -3.4899444580078125,
"step": 555
},
{
"epoch": 1.5114709851551957,
"grad_norm": 49.75,
"learning_rate": 2.8953563536233525e-06,
"logits/chosen": -1.650007963180542,
"logits/rejected": -1.6943776607513428,
"logps/chosen": -168.49082946777344,
"logps/rejected": -202.83924865722656,
"loss": 0.186,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.5661638379096985,
"rewards/margins": 3.2901504039764404,
"rewards/rejected": -3.856314182281494,
"step": 560
},
{
"epoch": 1.524966261808367,
"grad_norm": 21.0,
"learning_rate": 2.8564945194294273e-06,
"logits/chosen": -1.5658307075500488,
"logits/rejected": -1.46593177318573,
"logps/chosen": -162.1931915283203,
"logps/rejected": -254.7098388671875,
"loss": 0.168,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.5101041793823242,
"rewards/margins": 3.1685078144073486,
"rewards/rejected": -3.6786117553710938,
"step": 565
},
{
"epoch": 1.5384615384615383,
"grad_norm": 13.125,
"learning_rate": 2.817544549367197e-06,
"logits/chosen": -1.4567762613296509,
"logits/rejected": -1.4438632726669312,
"logps/chosen": -173.05821228027344,
"logps/rejected": -226.567626953125,
"loss": 0.1935,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.4246034622192383,
"rewards/margins": 3.5222201347351074,
"rewards/rejected": -3.946824312210083,
"step": 570
},
{
"epoch": 1.5519568151147098,
"grad_norm": 18.875,
"learning_rate": 2.778516073008071e-06,
"logits/chosen": -1.3770719766616821,
"logits/rejected": -1.4858124256134033,
"logps/chosen": -178.8583221435547,
"logps/rejected": -180.4306640625,
"loss": 0.2049,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.34754911065101624,
"rewards/margins": 2.8492796421051025,
"rewards/rejected": -3.196829080581665,
"step": 575
},
{
"epoch": 1.5654520917678814,
"grad_norm": 51.0,
"learning_rate": 2.7394187393325107e-06,
"logits/chosen": -1.4935017824172974,
"logits/rejected": -1.482154130935669,
"logps/chosen": -183.38815307617188,
"logps/rejected": -203.4325714111328,
"loss": 0.2601,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.4926990866661072,
"rewards/margins": 2.8543787002563477,
"rewards/rejected": -3.347078323364258,
"step": 580
},
{
"epoch": 1.5789473684210527,
"grad_norm": 11.875,
"learning_rate": 2.7002622143445177e-06,
"logits/chosen": -1.5763792991638184,
"logits/rejected": -1.581122875213623,
"logps/chosen": -230.5819854736328,
"logps/rejected": -290.4792175292969,
"loss": 0.1305,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.07175219804048538,
"rewards/margins": 4.184114933013916,
"rewards/rejected": -4.112362861633301,
"step": 585
},
{
"epoch": 1.592442645074224,
"grad_norm": 46.75,
"learning_rate": 2.6610561786819207e-06,
"logits/chosen": -1.6340926885604858,
"logits/rejected": -1.5590074062347412,
"logps/chosen": -145.62442016601562,
"logps/rejected": -248.79403686523438,
"loss": 0.1715,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.3683429956436157,
"rewards/margins": 3.4242138862609863,
"rewards/rejected": -3.7925562858581543,
"step": 590
},
{
"epoch": 1.6059379217273952,
"grad_norm": 8.5625,
"learning_rate": 2.6218103252230302e-06,
"logits/chosen": -1.5815064907073975,
"logits/rejected": -1.558189868927002,
"logps/chosen": -145.986572265625,
"logps/rejected": -209.48776245117188,
"loss": 0.1382,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.15136297047138214,
"rewards/margins": 3.156534194946289,
"rewards/rejected": -3.3078970909118652,
"step": 595
},
{
"epoch": 1.6194331983805668,
"grad_norm": 33.5,
"learning_rate": 2.582534356690284e-06,
"logits/chosen": -1.4829189777374268,
"logits/rejected": -1.5618332624435425,
"logps/chosen": -280.50482177734375,
"logps/rejected": -227.37191772460938,
"loss": 0.111,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014963224530220032,
"rewards/margins": 3.7380282878875732,
"rewards/rejected": -3.723065137863159,
"step": 600
},
{
"epoch": 1.6329284750337383,
"grad_norm": 19.25,
"learning_rate": 2.5432379832514437e-06,
"logits/chosen": -1.5892632007598877,
"logits/rejected": -1.6352291107177734,
"logps/chosen": -158.56002807617188,
"logps/rejected": -202.90060424804688,
"loss": 0.2301,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.49658140540122986,
"rewards/margins": 3.0457987785339355,
"rewards/rejected": -3.5423800945281982,
"step": 605
},
{
"epoch": 1.6464237516869096,
"grad_norm": 18.5,
"learning_rate": 2.5039309201189618e-06,
"logits/chosen": -1.6018474102020264,
"logits/rejected": -1.6965217590332031,
"logps/chosen": -161.53518676757812,
"logps/rejected": -185.10025024414062,
"loss": 0.1597,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.14256028831005096,
"rewards/margins": 3.0957140922546387,
"rewards/rejected": -3.238274335861206,
"step": 610
},
{
"epoch": 1.6599190283400809,
"grad_norm": 22.375,
"learning_rate": 2.4646228851480957e-06,
"logits/chosen": -1.391078233718872,
"logits/rejected": -1.3691911697387695,
"logps/chosen": -206.93734741210938,
"logps/rejected": -213.29428100585938,
"loss": 0.2172,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.01679610088467598,
"rewards/margins": 2.988704204559326,
"rewards/rejected": -2.9719078540802,
"step": 615
},
{
"epoch": 1.6734143049932524,
"grad_norm": 13.25,
"learning_rate": 2.4253235964343677e-06,
"logits/chosen": -1.590201497077942,
"logits/rejected": -1.4947328567504883,
"logps/chosen": -162.37301635742188,
"logps/rejected": -259.95294189453125,
"loss": 0.1116,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.33501359820365906,
"rewards/margins": 4.118770599365234,
"rewards/rejected": -4.453783988952637,
"step": 620
},
{
"epoch": 1.686909581646424,
"grad_norm": 73.0,
"learning_rate": 2.3860427699109726e-06,
"logits/chosen": -1.6217790842056274,
"logits/rejected": -1.6454839706420898,
"logps/chosen": -172.94483947753906,
"logps/rejected": -205.34475708007812,
"loss": 0.2869,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.9174480438232422,
"rewards/margins": 3.128140449523926,
"rewards/rejected": -4.045588493347168,
"step": 625
},
{
"epoch": 1.7004048582995952,
"grad_norm": 32.5,
"learning_rate": 2.34679011694671e-06,
"logits/chosen": -1.5026500225067139,
"logits/rejected": -1.6494897603988647,
"logps/chosen": -268.9452209472656,
"logps/rejected": -212.0578155517578,
"loss": 0.1194,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.23328566551208496,
"rewards/margins": 4.139514446258545,
"rewards/rejected": -4.372800350189209,
"step": 630
},
{
"epoch": 1.7139001349527665,
"grad_norm": 70.5,
"learning_rate": 2.3075753419450524e-06,
"logits/chosen": -1.5526963472366333,
"logits/rejected": -1.6195096969604492,
"logps/chosen": -205.20431518554688,
"logps/rejected": -197.59744262695312,
"loss": 0.2026,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.3273767828941345,
"rewards/margins": 2.9169745445251465,
"rewards/rejected": -3.2443511486053467,
"step": 635
},
{
"epoch": 1.7273954116059378,
"grad_norm": 38.5,
"learning_rate": 2.2684081399449327e-06,
"logits/chosen": -1.4865336418151855,
"logits/rejected": -1.479229211807251,
"logps/chosen": -188.85787963867188,
"logps/rejected": -203.17514038085938,
"loss": 0.269,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.778891921043396,
"rewards/margins": 3.5117366313934326,
"rewards/rejected": -4.290627956390381,
"step": 640
},
{
"epoch": 1.7408906882591093,
"grad_norm": 116.0,
"learning_rate": 2.2292981942238454e-06,
"logits/chosen": -1.598434329032898,
"logits/rejected": -1.6193567514419556,
"logps/chosen": -170.999267578125,
"logps/rejected": -234.42391967773438,
"loss": 0.3528,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.586583137512207,
"rewards/margins": 3.1737523078918457,
"rewards/rejected": -3.7603354454040527,
"step": 645
},
{
"epoch": 1.7543859649122808,
"grad_norm": 36.0,
"learning_rate": 2.1902551739038624e-06,
"logits/chosen": -1.5177044868469238,
"logits/rejected": -1.4585306644439697,
"logps/chosen": -171.92758178710938,
"logps/rejected": -219.8982696533203,
"loss": 0.2386,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.5028108954429626,
"rewards/margins": 3.118129253387451,
"rewards/rejected": -3.6209399700164795,
"step": 650
},
{
"epoch": 1.7678812415654521,
"grad_norm": 11.0625,
"learning_rate": 2.151288731561136e-06,
"logits/chosen": -1.532063364982605,
"logits/rejected": -1.4071648120880127,
"logps/chosen": -211.4221649169922,
"logps/rejected": -240.8402099609375,
"loss": 0.1651,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.42295771837234497,
"rewards/margins": 3.7488913536071777,
"rewards/rejected": -4.171849250793457,
"step": 655
},
{
"epoch": 1.7813765182186234,
"grad_norm": 23.625,
"learning_rate": 2.1124085008395056e-06,
"logits/chosen": -1.4962142705917358,
"logits/rejected": -1.4677404165267944,
"logps/chosen": -197.39447021484375,
"logps/rejected": -263.4613342285156,
"loss": 0.1999,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.16278569400310516,
"rewards/margins": 3.5397281646728516,
"rewards/rejected": -3.7025134563446045,
"step": 660
},
{
"epoch": 1.7948717948717947,
"grad_norm": 20.625,
"learning_rate": 2.073624094068776e-06,
"logits/chosen": -1.5467997789382935,
"logits/rejected": -1.540650725364685,
"logps/chosen": -186.6321563720703,
"logps/rejected": -259.65045166015625,
"loss": 0.2781,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.22982105612754822,
"rewards/margins": 3.7238681316375732,
"rewards/rejected": -3.9536895751953125,
"step": 665
},
{
"epoch": 1.8083670715249662,
"grad_norm": 12.875,
"learning_rate": 2.03494509988827e-06,
"logits/chosen": -1.6044431924819946,
"logits/rejected": -1.627730131149292,
"logps/chosen": -184.64320373535156,
"logps/rejected": -204.9185791015625,
"loss": 0.1212,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.07731951773166656,
"rewards/margins": 3.6050572395324707,
"rewards/rejected": -3.6823768615722656,
"step": 670
},
{
"epoch": 1.8218623481781377,
"grad_norm": 22.375,
"learning_rate": 1.996381080876237e-06,
"logits/chosen": -1.6212413311004639,
"logits/rejected": -1.5563671588897705,
"logps/chosen": -219.73171997070312,
"logps/rejected": -281.0826721191406,
"loss": 0.1177,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.023561427369713783,
"rewards/margins": 3.5450756549835205,
"rewards/rejected": -3.521514415740967,
"step": 675
},
{
"epoch": 1.835357624831309,
"grad_norm": 42.5,
"learning_rate": 1.957941571185702e-06,
"logits/chosen": -1.4472072124481201,
"logits/rejected": -1.5231066942214966,
"logps/chosen": -256.3811950683594,
"logps/rejected": -225.1781768798828,
"loss": 0.2672,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.01850978098809719,
"rewards/margins": 3.1582770347595215,
"rewards/rejected": -3.1767868995666504,
"step": 680
},
{
"epoch": 1.8488529014844803,
"grad_norm": 30.625,
"learning_rate": 1.919636074187346e-06,
"logits/chosen": -1.388319730758667,
"logits/rejected": -1.4473168849945068,
"logps/chosen": -253.48312377929688,
"logps/rejected": -212.169189453125,
"loss": 0.1468,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.18813356757164001,
"rewards/margins": 3.097019672393799,
"rewards/rejected": -2.908886194229126,
"step": 685
},
{
"epoch": 1.8623481781376519,
"grad_norm": 90.0,
"learning_rate": 1.8814740601199943e-06,
"logits/chosen": -1.4006351232528687,
"logits/rejected": -1.4068963527679443,
"logps/chosen": -164.6719970703125,
"logps/rejected": -193.83538818359375,
"loss": 0.2666,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.42647585272789,
"rewards/margins": 2.7546064853668213,
"rewards/rejected": -3.181082248687744,
"step": 690
},
{
"epoch": 1.8758434547908234,
"grad_norm": 25.75,
"learning_rate": 1.8434649637492952e-06,
"logits/chosen": -1.341395616531372,
"logits/rejected": -1.3592100143432617,
"logps/chosen": -181.58978271484375,
"logps/rejected": -235.27456665039062,
"loss": 0.1718,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.15408626198768616,
"rewards/margins": 3.203856945037842,
"rewards/rejected": -3.357943296432495,
"step": 695
},
{
"epoch": 1.8893387314439947,
"grad_norm": 18.625,
"learning_rate": 1.8056181820351737e-06,
"logits/chosen": -1.565199613571167,
"logits/rejected": -1.5012518167495728,
"logps/chosen": -241.5365753173828,
"logps/rejected": -229.1800079345703,
"loss": 0.1734,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.34816664457321167,
"rewards/margins": 4.178171634674072,
"rewards/rejected": -3.830005168914795,
"step": 700
},
{
"epoch": 1.902834008097166,
"grad_norm": 8.875,
"learning_rate": 1.7679430718086244e-06,
"logits/chosen": -1.5023219585418701,
"logits/rejected": -1.4059240818023682,
"logps/chosen": -240.8516082763672,
"logps/rejected": -287.47955322265625,
"loss": 0.0641,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.22036854922771454,
"rewards/margins": 4.166906833648682,
"rewards/rejected": -3.946538209915161,
"step": 705
},
{
"epoch": 1.9163292847503373,
"grad_norm": 35.5,
"learning_rate": 1.7304489474584307e-06,
"logits/chosen": -1.565582036972046,
"logits/rejected": -1.4994531869888306,
"logps/chosen": -148.25338745117188,
"logps/rejected": -231.37741088867188,
"loss": 0.123,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.1026371493935585,
"rewards/margins": 3.7467575073242188,
"rewards/rejected": -3.6441197395324707,
"step": 710
},
{
"epoch": 1.9298245614035088,
"grad_norm": 28.125,
"learning_rate": 1.693145078628377e-06,
"logits/chosen": -1.6054456233978271,
"logits/rejected": -1.6087411642074585,
"logps/chosen": -159.12234497070312,
"logps/rejected": -214.5330352783203,
"loss": 0.1255,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.13751724362373352,
"rewards/margins": 3.8032824993133545,
"rewards/rejected": -3.940800428390503,
"step": 715
},
{
"epoch": 1.9433198380566803,
"grad_norm": 18.375,
"learning_rate": 1.6560406879255192e-06,
"logits/chosen": -1.615686058998108,
"logits/rejected": -1.678998351097107,
"logps/chosen": -179.3768768310547,
"logps/rejected": -188.79124450683594,
"loss": 0.1608,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.13511483371257782,
"rewards/margins": 3.1098551750183105,
"rewards/rejected": -3.2449698448181152,
"step": 720
},
{
"epoch": 1.9568151147098516,
"grad_norm": 20.5,
"learning_rate": 1.6191449486400893e-06,
"logits/chosen": -1.5641348361968994,
"logits/rejected": -1.5269627571105957,
"logps/chosen": -190.90200805664062,
"logps/rejected": -200.14797973632812,
"loss": 0.1858,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.057850100100040436,
"rewards/margins": 3.392789363861084,
"rewards/rejected": -3.4506402015686035,
"step": 725
},
{
"epoch": 1.9703103913630229,
"grad_norm": 46.25,
"learning_rate": 1.5824669824775868e-06,
"logits/chosen": -1.6585397720336914,
"logits/rejected": -1.6145107746124268,
"logps/chosen": -153.5370330810547,
"logps/rejected": -246.87869262695312,
"loss": 0.1935,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.1705460101366043,
"rewards/margins": 3.2724738121032715,
"rewards/rejected": -3.4430203437805176,
"step": 730
},
{
"epoch": 1.9838056680161942,
"grad_norm": 21.125,
"learning_rate": 1.5460158573036288e-06,
"logits/chosen": -1.425318956375122,
"logits/rejected": -1.5616633892059326,
"logps/chosen": -228.63955688476562,
"logps/rejected": -232.26235961914062,
"loss": 0.1763,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.15325972437858582,
"rewards/margins": 2.676074504852295,
"rewards/rejected": -2.8293344974517822,
"step": 735
},
{
"epoch": 1.9973009446693657,
"grad_norm": 57.75,
"learning_rate": 1.509800584902108e-06,
"logits/chosen": -1.4701238870620728,
"logits/rejected": -1.335039496421814,
"logps/chosen": -165.36788940429688,
"logps/rejected": -248.84915161132812,
"loss": 0.2088,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.21872563660144806,
"rewards/margins": 3.6630032062530518,
"rewards/rejected": -3.8817286491394043,
"step": 740
},
{
"epoch": 2.010796221322537,
"grad_norm": 13.0625,
"learning_rate": 1.473830118747216e-06,
"logits/chosen": -1.3533880710601807,
"logits/rejected": -1.4392606019973755,
"logps/chosen": -173.4610595703125,
"logps/rejected": -189.3846435546875,
"loss": 0.1035,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.04286568984389305,
"rewards/margins": 3.4945671558380127,
"rewards/rejected": -3.537432909011841,
"step": 745
},
{
"epoch": 2.0242914979757085,
"grad_norm": 5.46875,
"learning_rate": 1.4381133517898803e-06,
"logits/chosen": -1.5612472295761108,
"logits/rejected": -1.6096746921539307,
"logps/chosen": -244.1045684814453,
"logps/rejected": -227.0,
"loss": 0.068,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.28070664405822754,
"rewards/margins": 4.070573329925537,
"rewards/rejected": -3.7898666858673096,
"step": 750
},
{
"epoch": 2.03778677462888,
"grad_norm": 46.75,
"learning_rate": 1.4026591142591733e-06,
"logits/chosen": -1.4181170463562012,
"logits/rejected": -1.5695334672927856,
"logps/chosen": -218.1271514892578,
"logps/rejected": -171.77000427246094,
"loss": 0.1633,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.2111760377883911,
"rewards/margins": 2.9384093284606934,
"rewards/rejected": -3.149585247039795,
"step": 755
},
{
"epoch": 2.051282051282051,
"grad_norm": 24.5,
"learning_rate": 1.3674761714792153e-06,
"logits/chosen": -1.5777294635772705,
"logits/rejected": -1.6976985931396484,
"logps/chosen": -224.3392791748047,
"logps/rejected": -254.0798797607422,
"loss": 0.0739,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.15005287528038025,
"rewards/margins": 4.0651535987854,
"rewards/rejected": -3.91510009765625,
"step": 760
},
{
"epoch": 2.064777327935223,
"grad_norm": 23.75,
"learning_rate": 1.33257322170213e-06,
"logits/chosen": -1.4911249876022339,
"logits/rejected": -1.500860571861267,
"logps/chosen": -172.9776611328125,
"logps/rejected": -201.8260040283203,
"loss": 0.1002,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.21206021308898926,
"rewards/margins": 3.810685634613037,
"rewards/rejected": -3.598625659942627,
"step": 765
},
{
"epoch": 2.078272604588394,
"grad_norm": 28.375,
"learning_rate": 1.2979588939575879e-06,
"logits/chosen": -1.5784046649932861,
"logits/rejected": -1.5579355955123901,
"logps/chosen": -192.16024780273438,
"logps/rejected": -219.4779510498047,
"loss": 0.1696,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.06577552855014801,
"rewards/margins": 3.573701858520508,
"rewards/rejected": -3.5079262256622314,
"step": 770
},
{
"epoch": 2.0917678812415654,
"grad_norm": 14.8125,
"learning_rate": 1.2636417459194536e-06,
"logits/chosen": -1.5944167375564575,
"logits/rejected": -1.6392465829849243,
"logps/chosen": -235.58633422851562,
"logps/rejected": -274.0408630371094,
"loss": 0.0593,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.08247147500514984,
"rewards/margins": 4.281358242034912,
"rewards/rejected": -4.363830089569092,
"step": 775
},
{
"epoch": 2.1052631578947367,
"grad_norm": 5.21875,
"learning_rate": 1.2296302617900772e-06,
"logits/chosen": -1.5774985551834106,
"logits/rejected": -1.6413581371307373,
"logps/chosen": -171.0308074951172,
"logps/rejected": -183.9725341796875,
"loss": 0.0845,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.016073107719421387,
"rewards/margins": 3.9465243816375732,
"rewards/rejected": -3.9304511547088623,
"step": 780
},
{
"epoch": 2.118758434547908,
"grad_norm": 15.0,
"learning_rate": 1.1959328502027556e-06,
"logits/chosen": -1.5672693252563477,
"logits/rejected": -1.5724976062774658,
"logps/chosen": -161.8846435546875,
"logps/rejected": -190.6571807861328,
"loss": 0.1138,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02019577845931053,
"rewards/margins": 3.7148475646972656,
"rewards/rejected": -3.6946518421173096,
"step": 785
},
{
"epoch": 2.1322537112010798,
"grad_norm": 19.125,
"learning_rate": 1.1625578421428714e-06,
"logits/chosen": -1.4088555574417114,
"logits/rejected": -1.331659197807312,
"logps/chosen": -197.23593139648438,
"logps/rejected": -279.7657470703125,
"loss": 0.1239,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.08394167572259903,
"rewards/margins": 3.702916383743286,
"rewards/rejected": -3.786858081817627,
"step": 790
},
{
"epoch": 2.145748987854251,
"grad_norm": 19.625,
"learning_rate": 1.1295134888882258e-06,
"logits/chosen": -1.5858689546585083,
"logits/rejected": -1.6758959293365479,
"logps/chosen": -194.56253051757812,
"logps/rejected": -206.4073028564453,
"loss": 0.0922,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.14167055487632751,
"rewards/margins": 3.8033957481384277,
"rewards/rejected": -3.945065975189209,
"step": 795
},
{
"epoch": 2.1592442645074224,
"grad_norm": 16.25,
"learning_rate": 1.0968079599690872e-06,
"logits/chosen": -1.5427080392837524,
"logits/rejected": -1.509251356124878,
"logps/chosen": -227.91281127929688,
"logps/rejected": -196.93661499023438,
"loss": 0.112,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.0069570960476994514,
"rewards/margins": 3.6783218383789062,
"rewards/rejected": -3.6852786540985107,
"step": 800
},
{
"epoch": 2.1727395411605936,
"grad_norm": 19.875,
"learning_rate": 1.064449341148442e-06,
"logits/chosen": -1.624629020690918,
"logits/rejected": -1.647383689880371,
"logps/chosen": -203.95071411132812,
"logps/rejected": -221.9706573486328,
"loss": 0.1216,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.1412554681301117,
"rewards/margins": 3.5126278400421143,
"rewards/rejected": -3.6538829803466797,
"step": 805
},
{
"epoch": 2.1862348178137654,
"grad_norm": 14.5,
"learning_rate": 1.0324456324229536e-06,
"logits/chosen": -1.4194597005844116,
"logits/rejected": -1.3489387035369873,
"logps/chosen": -166.34426879882812,
"logps/rejected": -239.3138427734375,
"loss": 0.0775,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0588313452899456,
"rewards/margins": 3.9181437492370605,
"rewards/rejected": -3.976975202560425,
"step": 810
},
{
"epoch": 2.1997300944669367,
"grad_norm": 35.5,
"learning_rate": 1.000804746045138e-06,
"logits/chosen": -1.3923031091690063,
"logits/rejected": -1.4646499156951904,
"logps/chosen": -191.46279907226562,
"logps/rejected": -184.79953002929688,
"loss": 0.1111,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.008677695877850056,
"rewards/margins": 3.193809986114502,
"rewards/rejected": -3.2024874687194824,
"step": 815
},
{
"epoch": 2.213225371120108,
"grad_norm": 16.0,
"learning_rate": 9.695345045672167e-07,
"logits/chosen": -1.4313310384750366,
"logits/rejected": -1.4792088270187378,
"logps/chosen": -191.17092895507812,
"logps/rejected": -196.5364532470703,
"loss": 0.118,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.33248454332351685,
"rewards/margins": 3.7640583515167236,
"rewards/rejected": -4.096542835235596,
"step": 820
},
{
"epoch": 2.2267206477732793,
"grad_norm": 15.5,
"learning_rate": 9.386426389071532e-07,
"logits/chosen": -1.4152162075042725,
"logits/rejected": -1.363843321800232,
"logps/chosen": -229.3914031982422,
"logps/rejected": -278.37847900390625,
"loss": 0.0961,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.30344587564468384,
"rewards/margins": 4.63069486618042,
"rewards/rejected": -4.934141635894775,
"step": 825
},
{
"epoch": 2.2402159244264506,
"grad_norm": 17.625,
"learning_rate": 9.081367864373489e-07,
"logits/chosen": -1.3973594903945923,
"logits/rejected": -1.524677038192749,
"logps/chosen": -168.33126831054688,
"logps/rejected": -156.55892944335938,
"loss": 0.0944,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1414262354373932,
"rewards/margins": 3.3840813636779785,
"rewards/rejected": -3.5255074501037598,
"step": 830
},
{
"epoch": 2.2537112010796223,
"grad_norm": 11.8125,
"learning_rate": 8.780244890964567e-07,
"logits/chosen": -1.4209728240966797,
"logits/rejected": -1.2569080591201782,
"logps/chosen": -177.04782104492188,
"logps/rejected": -275.0938415527344,
"loss": 0.0698,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16639022529125214,
"rewards/margins": 3.9153380393981934,
"rewards/rejected": -3.748948335647583,
"step": 835
},
{
"epoch": 2.2672064777327936,
"grad_norm": 10.625,
"learning_rate": 8.483131915247969e-07,
"logits/chosen": -1.563407301902771,
"logits/rejected": -1.534883975982666,
"logps/chosen": -171.35104370117188,
"logps/rejected": -242.4336700439453,
"loss": 0.0949,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.2416602075099945,
"rewards/margins": 4.914166450500488,
"rewards/rejected": -5.155826568603516,
"step": 840
},
{
"epoch": 2.280701754385965,
"grad_norm": 19.75,
"learning_rate": 8.190102392238191e-07,
"logits/chosen": -1.4438880681991577,
"logits/rejected": -1.4186255931854248,
"logps/chosen": -154.63705444335938,
"logps/rejected": -207.8048858642578,
"loss": 0.0783,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.18597714602947235,
"rewards/margins": 4.108304500579834,
"rewards/rejected": -4.294281959533691,
"step": 845
},
{
"epoch": 2.294197031039136,
"grad_norm": 32.25,
"learning_rate": 7.90122876740086e-07,
"logits/chosen": -1.63836669921875,
"logits/rejected": -1.5565919876098633,
"logps/chosen": -226.85037231445312,
"logps/rejected": -326.13421630859375,
"loss": 0.0577,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0758393257856369,
"rewards/margins": 4.579066276550293,
"rewards/rejected": -4.503227233886719,
"step": 850
},
{
"epoch": 2.3076923076923075,
"grad_norm": 5.0625,
"learning_rate": 7.616582458742059e-07,
"logits/chosen": -1.4565999507904053,
"logits/rejected": -1.455143928527832,
"logps/chosen": -212.2303009033203,
"logps/rejected": -276.86834716796875,
"loss": 0.0519,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1173635721206665,
"rewards/margins": 4.344286918640137,
"rewards/rejected": -4.46165132522583,
"step": 855
},
{
"epoch": 2.3211875843454792,
"grad_norm": 9.6875,
"learning_rate": 7.336233839151693e-07,
"logits/chosen": -1.6497745513916016,
"logits/rejected": -1.6588242053985596,
"logps/chosen": -169.42959594726562,
"logps/rejected": -258.19207763671875,
"loss": 0.1057,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.21658802032470703,
"rewards/margins": 3.805851697921753,
"rewards/rejected": -4.022439479827881,
"step": 860
},
{
"epoch": 2.3346828609986505,
"grad_norm": 21.5,
"learning_rate": 7.060252219005304e-07,
"logits/chosen": -1.520618200302124,
"logits/rejected": -1.5337458848953247,
"logps/chosen": -227.05679321289062,
"logps/rejected": -317.5985107421875,
"loss": 0.0774,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.06503160297870636,
"rewards/margins": 4.4666852951049805,
"rewards/rejected": -4.531716823577881,
"step": 865
},
{
"epoch": 2.348178137651822,
"grad_norm": 12.5,
"learning_rate": 6.788705829028483e-07,
"logits/chosen": -1.5424460172653198,
"logits/rejected": -1.527999997138977,
"logps/chosen": -186.46414184570312,
"logps/rejected": -190.83157348632812,
"loss": 0.1022,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1064692884683609,
"rewards/margins": 3.359034776687622,
"rewards/rejected": -3.2525649070739746,
"step": 870
},
{
"epoch": 2.361673414304993,
"grad_norm": 66.5,
"learning_rate": 6.521661803428225e-07,
"logits/chosen": -1.5013136863708496,
"logits/rejected": -1.5206286907196045,
"logps/chosen": -201.0956268310547,
"logps/rejected": -198.01573181152344,
"loss": 0.0955,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.13122853636741638,
"rewards/margins": 3.767671585083008,
"rewards/rejected": -3.898899793624878,
"step": 875
},
{
"epoch": 2.375168690958165,
"grad_norm": 11.3125,
"learning_rate": 6.259186163295439e-07,
"logits/chosen": -1.2552602291107178,
"logits/rejected": -1.3482682704925537,
"logps/chosen": -246.9757080078125,
"logps/rejected": -239.8274383544922,
"loss": 0.0983,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1879548728466034,
"rewards/margins": 3.7479751110076904,
"rewards/rejected": -3.935929775238037,
"step": 880
},
{
"epoch": 2.388663967611336,
"grad_norm": 16.0,
"learning_rate": 6.001343800282569e-07,
"logits/chosen": -1.5184439420700073,
"logits/rejected": -1.4158121347427368,
"logps/chosen": -145.63616943359375,
"logps/rejected": -212.58468627929688,
"loss": 0.0783,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.3523162603378296,
"rewards/margins": 4.166034698486328,
"rewards/rejected": -4.5183515548706055,
"step": 885
},
{
"epoch": 2.4021592442645074,
"grad_norm": 12.0625,
"learning_rate": 5.748198460560475e-07,
"logits/chosen": -1.602419137954712,
"logits/rejected": -1.6869083642959595,
"logps/chosen": -211.70947265625,
"logps/rejected": -220.8863525390625,
"loss": 0.0806,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16532480716705322,
"rewards/margins": 4.41878080368042,
"rewards/rejected": -4.253456115722656,
"step": 890
},
{
"epoch": 2.4156545209176787,
"grad_norm": 32.75,
"learning_rate": 5.499812729058546e-07,
"logits/chosen": -1.56089186668396,
"logits/rejected": -1.5883516073226929,
"logps/chosen": -181.11459350585938,
"logps/rejected": -161.60299682617188,
"loss": 0.1433,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.2707998752593994,
"rewards/margins": 3.216136932373047,
"rewards/rejected": -3.4869370460510254,
"step": 895
},
{
"epoch": 2.42914979757085,
"grad_norm": 14.8125,
"learning_rate": 5.256248013991857e-07,
"logits/chosen": -1.5014961957931519,
"logits/rejected": -1.4206339120864868,
"logps/chosen": -226.8283233642578,
"logps/rejected": -266.60333251953125,
"loss": 0.0715,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.00030528902425430715,
"rewards/margins": 4.552371978759766,
"rewards/rejected": -4.552066802978516,
"step": 900
},
{
"epoch": 2.4426450742240218,
"grad_norm": 23.0,
"learning_rate": 5.01756453167925e-07,
"logits/chosen": -1.5279182195663452,
"logits/rejected": -1.5130751132965088,
"logps/chosen": -199.68397521972656,
"logps/rejected": -246.5128936767578,
"loss": 0.0683,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.14631351828575134,
"rewards/margins": 4.73899507522583,
"rewards/rejected": -4.592680931091309,
"step": 905
},
{
"epoch": 2.456140350877193,
"grad_norm": 21.375,
"learning_rate": 4.78382129165613e-07,
"logits/chosen": -1.4500765800476074,
"logits/rejected": -1.5014575719833374,
"logps/chosen": -185.51475524902344,
"logps/rejected": -181.7137908935547,
"loss": 0.1049,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09951256215572357,
"rewards/margins": 3.4707932472229004,
"rewards/rejected": -3.371281147003174,
"step": 910
},
{
"epoch": 2.4696356275303644,
"grad_norm": 32.5,
"learning_rate": 4.5550760820855633e-07,
"logits/chosen": -1.557877779006958,
"logits/rejected": -1.4586069583892822,
"logps/chosen": -209.05062866210938,
"logps/rejected": -308.66424560546875,
"loss": 0.118,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.2664136290550232,
"rewards/margins": 4.0513434410095215,
"rewards/rejected": -4.3177571296691895,
"step": 915
},
{
"epoch": 2.4831309041835357,
"grad_norm": 22.5,
"learning_rate": 4.3313854554713457e-07,
"logits/chosen": -1.5593338012695312,
"logits/rejected": -1.5647127628326416,
"logps/chosen": -197.6747283935547,
"logps/rejected": -253.01876831054688,
"loss": 0.0716,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0987640991806984,
"rewards/margins": 4.090095043182373,
"rewards/rejected": -3.9913315773010254,
"step": 920
},
{
"epoch": 2.4966261808367074,
"grad_norm": 20.125,
"learning_rate": 4.1128047146765936e-07,
"logits/chosen": -1.435847520828247,
"logits/rejected": -1.453253149986267,
"logps/chosen": -141.46656799316406,
"logps/rejected": -162.93905639648438,
"loss": 0.1007,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.20369374752044678,
"rewards/margins": 3.790607452392578,
"rewards/rejected": -3.586913585662842,
"step": 925
},
{
"epoch": 2.5101214574898787,
"grad_norm": 32.5,
"learning_rate": 3.899387899251242e-07,
"logits/chosen": -1.499912142753601,
"logits/rejected": -1.5055288076400757,
"logps/chosen": -179.4788360595703,
"logps/rejected": -202.9369354248047,
"loss": 0.0808,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.04291580989956856,
"rewards/margins": 3.4943645000457764,
"rewards/rejected": -3.537280321121216,
"step": 930
},
{
"epoch": 2.52361673414305,
"grad_norm": 6.59375,
"learning_rate": 3.6911877720719053e-07,
"logits/chosen": -1.6243568658828735,
"logits/rejected": -1.5396671295166016,
"logps/chosen": -155.4473419189453,
"logps/rejected": -191.9477081298828,
"loss": 0.1245,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.33543360233306885,
"rewards/margins": 4.113525867462158,
"rewards/rejected": -4.4489593505859375,
"step": 935
},
{
"epoch": 2.5371120107962213,
"grad_norm": 10.3125,
"learning_rate": 3.488255806297311e-07,
"logits/chosen": -1.4612650871276855,
"logits/rejected": -1.6070709228515625,
"logps/chosen": -164.7592010498047,
"logps/rejected": -161.7231903076172,
"loss": 0.1901,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.06939269602298737,
"rewards/margins": 3.406930446624756,
"rewards/rejected": -3.3375372886657715,
"step": 940
},
{
"epoch": 2.5506072874493926,
"grad_norm": 7.46875,
"learning_rate": 3.2906421726426857e-07,
"logits/chosen": -1.4703078269958496,
"logits/rejected": -1.4379500150680542,
"logps/chosen": -204.19473266601562,
"logps/rejected": -244.11965942382812,
"loss": 0.0904,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6960457563400269,
"rewards/margins": 4.154335975646973,
"rewards/rejected": -4.850381851196289,
"step": 945
},
{
"epoch": 2.564102564102564,
"grad_norm": 17.375,
"learning_rate": 3.09839572697605e-07,
"logits/chosen": -1.560767412185669,
"logits/rejected": -1.4427921772003174,
"logps/chosen": -243.10568237304688,
"logps/rejected": -232.52108764648438,
"loss": 0.0844,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.049421075731515884,
"rewards/margins": 4.088489055633545,
"rewards/rejected": -4.137909889221191,
"step": 950
},
{
"epoch": 2.5775978407557356,
"grad_norm": 19.75,
"learning_rate": 2.9115639982396166e-07,
"logits/chosen": -1.515772819519043,
"logits/rejected": -1.6191974878311157,
"logps/chosen": -210.3816375732422,
"logps/rejected": -198.30801391601562,
"loss": 0.1289,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.32715946435928345,
"rewards/margins": 3.6732399463653564,
"rewards/rejected": -4.000399589538574,
"step": 955
},
{
"epoch": 2.591093117408907,
"grad_norm": 16.875,
"learning_rate": 2.7301931766992916e-07,
"logits/chosen": -1.53992760181427,
"logits/rejected": -1.6426169872283936,
"logps/chosen": -202.2464599609375,
"logps/rejected": -200.73020935058594,
"loss": 0.0916,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2096923142671585,
"rewards/margins": 3.49652361869812,
"rewards/rejected": -3.2868313789367676,
"step": 960
},
{
"epoch": 2.604588394062078,
"grad_norm": 15.875,
"learning_rate": 2.554328102525022e-07,
"logits/chosen": -1.468806505203247,
"logits/rejected": -1.5037376880645752,
"logps/chosen": -225.407470703125,
"logps/rejected": -265.16326904296875,
"loss": 0.0516,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1482563018798828,
"rewards/margins": 3.908936023712158,
"rewards/rejected": -3.760679244995117,
"step": 965
},
{
"epoch": 2.6180836707152495,
"grad_norm": 28.25,
"learning_rate": 2.3840122547050482e-07,
"logits/chosen": -1.4675546884536743,
"logits/rejected": -1.427056074142456,
"logps/chosen": -189.55482482910156,
"logps/rejected": -238.43399047851562,
"loss": 0.128,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.15342381596565247,
"rewards/margins": 4.185477256774902,
"rewards/rejected": -4.338901042938232,
"step": 970
},
{
"epoch": 2.6315789473684212,
"grad_norm": 10.6875,
"learning_rate": 2.219287740296605e-07,
"logits/chosen": -1.5017975568771362,
"logits/rejected": -1.5283129215240479,
"logps/chosen": -185.2952117919922,
"logps/rejected": -218.5054168701172,
"loss": 0.0971,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.2838120460510254,
"rewards/margins": 4.120657444000244,
"rewards/rejected": -4.4044694900512695,
"step": 975
},
{
"epoch": 2.6450742240215925,
"grad_norm": 21.0,
"learning_rate": 2.060195284015837e-07,
"logits/chosen": -1.662113904953003,
"logits/rejected": -1.6862503290176392,
"logps/chosen": -150.606689453125,
"logps/rejected": -198.61793518066406,
"loss": 0.1152,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2786501944065094,
"rewards/margins": 3.8265221118927,
"rewards/rejected": -4.105172157287598,
"step": 980
},
{
"epoch": 2.658569500674764,
"grad_norm": 19.75,
"learning_rate": 1.9067742181694353e-07,
"logits/chosen": -1.4568703174591064,
"logits/rejected": -1.4512639045715332,
"logps/chosen": -171.15443420410156,
"logps/rejected": -221.99526977539062,
"loss": 0.096,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.17055651545524597,
"rewards/margins": 5.160454273223877,
"rewards/rejected": -5.3310112953186035,
"step": 985
},
{
"epoch": 2.672064777327935,
"grad_norm": 93.0,
"learning_rate": 1.75906247293057e-07,
"logits/chosen": -1.6594133377075195,
"logits/rejected": -1.5529086589813232,
"logps/chosen": -156.86392211914062,
"logps/rejected": -285.59197998046875,
"loss": 0.118,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.5136551856994629,
"rewards/margins": 4.625790596008301,
"rewards/rejected": -5.139446258544922,
"step": 990
},
{
"epoch": 2.6855600539811064,
"grad_norm": 11.3125,
"learning_rate": 1.617096566961429e-07,
"logits/chosen": -1.466498613357544,
"logits/rejected": -1.4549661874771118,
"logps/chosen": -155.0102081298828,
"logps/rejected": -232.1795654296875,
"loss": 0.152,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.09109257161617279,
"rewards/margins": 3.467794418334961,
"rewards/rejected": -3.558886766433716,
"step": 995
},
{
"epoch": 2.699055330634278,
"grad_norm": 15.9375,
"learning_rate": 1.4809115983847267e-07,
"logits/chosen": -1.377762794494629,
"logits/rejected": -1.3253929615020752,
"logps/chosen": -148.2834014892578,
"logps/rejected": -208.0382080078125,
"loss": 0.1151,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.24366268515586853,
"rewards/margins": 3.6103515625,
"rewards/rejected": -3.8540141582489014,
"step": 1000
},
{
"epoch": 2.699055330634278,
"eval_logits/chosen": -1.536294937133789,
"eval_logits/rejected": -1.5776937007904053,
"eval_logps/chosen": -191.7211456298828,
"eval_logps/rejected": -226.05455017089844,
"eval_loss": 0.31860384345054626,
"eval_rewards/accuracies": 0.8524096608161926,
"eval_rewards/chosen": -0.7276893258094788,
"eval_rewards/margins": 2.395343065261841,
"eval_rewards/rejected": -3.1230320930480957,
"eval_runtime": 23.3449,
"eval_samples_per_second": 14.136,
"eval_steps_per_second": 3.555,
"step": 1000
},
{
"epoch": 2.7125506072874495,
"grad_norm": 23.625,
"learning_rate": 1.3505412361064395e-07,
"logits/chosen": -1.4981733560562134,
"logits/rejected": -1.5207927227020264,
"logps/chosen": -192.99154663085938,
"logps/rejected": -194.6613311767578,
"loss": 0.0649,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07562440633773804,
"rewards/margins": 4.270889759063721,
"rewards/rejected": -4.195265769958496,
"step": 1005
},
{
"epoch": 2.7260458839406208,
"grad_norm": 20.5,
"learning_rate": 1.226017711491867e-07,
"logits/chosen": -1.5061196088790894,
"logits/rejected": -1.5956671237945557,
"logps/chosen": -170.25169372558594,
"logps/rejected": -240.0498046875,
"loss": 0.0805,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.27517637610435486,
"rewards/margins": 3.623337507247925,
"rewards/rejected": -3.89851450920105,
"step": 1010
},
{
"epoch": 2.739541160593792,
"grad_norm": 31.25,
"learning_rate": 1.107371810397076e-07,
"logits/chosen": -1.4881411790847778,
"logits/rejected": -1.5475780963897705,
"logps/chosen": -237.45504760742188,
"logps/rejected": -212.13330078125,
"loss": 0.0625,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.10645435005426407,
"rewards/margins": 4.086081027984619,
"rewards/rejected": -4.192535400390625,
"step": 1015
},
{
"epoch": 2.753036437246964,
"grad_norm": 16.75,
"learning_rate": 9.946328655577625e-08,
"logits/chosen": -1.5837833881378174,
"logits/rejected": -1.6130040884017944,
"logps/chosen": -137.10398864746094,
"logps/rejected": -171.19357299804688,
"loss": 0.0725,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.27897781133651733,
"rewards/margins": 3.8964107036590576,
"rewards/rejected": -4.175389289855957,
"step": 1020
},
{
"epoch": 2.766531713900135,
"grad_norm": 28.25,
"learning_rate": 8.878287493373245e-08,
"logits/chosen": -1.5753690004348755,
"logits/rejected": -1.6070302724838257,
"logps/chosen": -214.03018188476562,
"logps/rejected": -189.55850219726562,
"loss": 0.1188,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.10192601382732391,
"rewards/margins": 3.4568443298339844,
"rewards/rejected": -3.558769941329956,
"step": 1025
},
{
"epoch": 2.7800269905533064,
"grad_norm": 26.625,
"learning_rate": 7.869858668360042e-08,
"logits/chosen": -1.4193127155303955,
"logits/rejected": -1.2717030048370361,
"logps/chosen": -187.0641632080078,
"logps/rejected": -224.65066528320312,
"loss": 0.1028,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.18936040997505188,
"rewards/margins": 4.242377758026123,
"rewards/rejected": -4.43173885345459,
"step": 1030
},
{
"epoch": 2.7935222672064777,
"grad_norm": 24.75,
"learning_rate": 6.921291493627747e-08,
"logits/chosen": -1.6177479028701782,
"logits/rejected": -1.6725289821624756,
"logps/chosen": -248.9903564453125,
"logps/rejected": -230.86611938476562,
"loss": 0.0856,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.29994627833366394,
"rewards/margins": 3.9232945442199707,
"rewards/rejected": -3.6233487129211426,
"step": 1035
},
{
"epoch": 2.807017543859649,
"grad_norm": 42.5,
"learning_rate": 6.032820482716001e-08,
"logits/chosen": -1.5851434469223022,
"logits/rejected": -1.5880482196807861,
"logps/chosen": -155.3755340576172,
"logps/rejected": -186.6389617919922,
"loss": 0.1754,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.19228528439998627,
"rewards/margins": 3.5754799842834473,
"rewards/rejected": -3.7677650451660156,
"step": 1040
},
{
"epoch": 2.8205128205128203,
"grad_norm": 43.0,
"learning_rate": 5.204665291635519e-08,
"logits/chosen": -1.496819019317627,
"logits/rejected": -1.5007538795471191,
"logps/chosen": -179.5200653076172,
"logps/rejected": -266.001953125,
"loss": 0.1038,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.35474246740341187,
"rewards/margins": 3.8987841606140137,
"rewards/rejected": -4.253526210784912,
"step": 1045
},
{
"epoch": 2.834008097165992,
"grad_norm": 27.5,
"learning_rate": 4.437030664562969e-08,
"logits/chosen": -1.470956563949585,
"logits/rejected": -1.52825927734375,
"logps/chosen": -203.93551635742188,
"logps/rejected": -220.02639770507812,
"loss": 0.1639,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.0977933406829834,
"rewards/margins": 3.205706834793091,
"rewards/rejected": -3.3035004138946533,
"step": 1050
},
{
"epoch": 2.8475033738191633,
"grad_norm": 65.0,
"learning_rate": 3.730106383222132e-08,
"logits/chosen": -1.5251743793487549,
"logits/rejected": -1.3242510557174683,
"logps/chosen": -186.79141235351562,
"logps/rejected": -250.46566772460938,
"loss": 0.0909,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.299465537071228,
"rewards/margins": 4.545691967010498,
"rewards/rejected": -4.845158100128174,
"step": 1055
},
{
"epoch": 2.8609986504723346,
"grad_norm": 19.75,
"learning_rate": 3.084067219964182e-08,
"logits/chosen": -1.527754783630371,
"logits/rejected": -1.5058457851409912,
"logps/chosen": -173.50900268554688,
"logps/rejected": -246.65628051757812,
"loss": 0.2529,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.42231351137161255,
"rewards/margins": 3.4638805389404297,
"rewards/rejected": -3.8861937522888184,
"step": 1060
},
{
"epoch": 2.8744939271255063,
"grad_norm": 43.75,
"learning_rate": 2.499072894559057e-08,
"logits/chosen": -1.6412513256072998,
"logits/rejected": -1.6829668283462524,
"logps/chosen": -180.06788635253906,
"logps/rejected": -219.94528198242188,
"loss": 0.1089,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.23302574455738068,
"rewards/margins": 3.369854688644409,
"rewards/rejected": -3.6028804779052734,
"step": 1065
},
{
"epoch": 2.8879892037786776,
"grad_norm": 13.75,
"learning_rate": 1.975268034707878e-08,
"logits/chosen": -1.4751927852630615,
"logits/rejected": -1.5141003131866455,
"logps/chosen": -204.79470825195312,
"logps/rejected": -223.97509765625,
"loss": 0.0822,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.15369097888469696,
"rewards/margins": 3.9829258918762207,
"rewards/rejected": -3.8292346000671387,
"step": 1070
},
{
"epoch": 2.901484480431849,
"grad_norm": 39.0,
"learning_rate": 1.512782140286939e-08,
"logits/chosen": -1.4587006568908691,
"logits/rejected": -1.5042657852172852,
"logps/chosen": -156.6952667236328,
"logps/rejected": -263.0159912109375,
"loss": 0.0959,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.11610189825296402,
"rewards/margins": 3.9884142875671387,
"rewards/rejected": -4.10451602935791,
"step": 1075
},
{
"epoch": 2.91497975708502,
"grad_norm": 17.75,
"learning_rate": 1.1117295513313475e-08,
"logits/chosen": -1.665400743484497,
"logits/rejected": -1.6617343425750732,
"logps/chosen": -161.07443237304688,
"logps/rejected": -207.7931671142578,
"loss": 0.0872,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.10158289968967438,
"rewards/margins": 4.066722869873047,
"rewards/rejected": -3.965139865875244,
"step": 1080
},
{
"epoch": 2.9284750337381915,
"grad_norm": 20.75,
"learning_rate": 7.72209419766995e-09,
"logits/chosen": -1.4860131740570068,
"logits/rejected": -1.3406977653503418,
"logps/chosen": -168.0951690673828,
"logps/rejected": -274.35113525390625,
"loss": 0.1053,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.44807571172714233,
"rewards/margins": 3.924337863922119,
"rewards/rejected": -4.372413635253906,
"step": 1085
},
{
"epoch": 2.941970310391363,
"grad_norm": 16.25,
"learning_rate": 4.943056848972227e-09,
"logits/chosen": -1.493690848350525,
"logits/rejected": -1.5224257707595825,
"logps/chosen": -209.3112335205078,
"logps/rejected": -208.22988891601562,
"loss": 0.0878,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.035886406898498535,
"rewards/margins": 3.8605358600616455,
"rewards/rejected": -3.8246493339538574,
"step": 1090
},
{
"epoch": 2.9554655870445345,
"grad_norm": 14.8125,
"learning_rate": 2.7808705265053305e-09,
"logits/chosen": -1.571223497390747,
"logits/rejected": -1.5577231645584106,
"logps/chosen": -169.42562866210938,
"logps/rejected": -181.50631713867188,
"loss": 0.1109,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.13199149072170258,
"rewards/margins": 3.612278699874878,
"rewards/rejected": -3.744269847869873,
"step": 1095
},
{
"epoch": 2.968960863697706,
"grad_norm": 27.75,
"learning_rate": 1.2360697859462035e-09,
"logits/chosen": -1.5886671543121338,
"logits/rejected": -1.562727928161621,
"logps/chosen": -162.84046936035156,
"logps/rejected": -219.8025360107422,
"loss": 0.118,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.3454614281654358,
"rewards/margins": 4.1069793701171875,
"rewards/rejected": -4.4524407386779785,
"step": 1100
},
{
"epoch": 2.982456140350877,
"grad_norm": 13.0625,
"learning_rate": 3.090365472041557e-10,
"logits/chosen": -1.5336341857910156,
"logits/rejected": -1.5714600086212158,
"logps/chosen": -217.091064453125,
"logps/rejected": -239.0583953857422,
"loss": 0.1793,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.2769380509853363,
"rewards/margins": 3.7623977661132812,
"rewards/rejected": -4.039335250854492,
"step": 1105
},
{
"epoch": 2.9959514170040484,
"grad_norm": 16.75,
"learning_rate": 0.0,
"logits/chosen": -1.4733049869537354,
"logits/rejected": -1.4821723699569702,
"logps/chosen": -191.77438354492188,
"logps/rejected": -275.19158935546875,
"loss": 0.0661,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11918088048696518,
"rewards/margins": 4.132817268371582,
"rewards/rejected": -4.013636589050293,
"step": 1110
},
{
"epoch": 2.9959514170040484,
"step": 1110,
"total_flos": 4.5615607240812134e+17,
"train_loss": 0.26195224279218965,
"train_runtime": 3105.2921,
"train_samples_per_second": 2.862,
"train_steps_per_second": 0.357
}
],
"logging_steps": 5,
"max_steps": 1110,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.5615607240812134e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}