martimfasantos's picture
Model save
c23b120 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100,
"global_step": 2776,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007204610951008645,
"grad_norm": 16.86738074547546,
"learning_rate": 1.7985611510791367e-10,
"logits/chosen": -1.901450514793396,
"logits/rejected": -1.9076323509216309,
"logps/chosen": -0.8524526953697205,
"logps/rejected": -0.9626365900039673,
"loss": 1.6316,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.704905390739441,
"rewards/margins": 0.22036786377429962,
"rewards/rejected": -1.9252731800079346,
"step": 1
},
{
"epoch": 0.007204610951008645,
"grad_norm": 20.67220170920981,
"learning_rate": 1.7985611510791365e-09,
"logits/chosen": -2.020613670349121,
"logits/rejected": -2.006347894668579,
"logps/chosen": -1.005244255065918,
"logps/rejected": -1.1096515655517578,
"loss": 1.6546,
"rewards/accuracies": 0.5208333134651184,
"rewards/chosen": -2.010488510131836,
"rewards/margins": 0.20881448686122894,
"rewards/rejected": -2.2193031311035156,
"step": 10
},
{
"epoch": 0.01440922190201729,
"grad_norm": 26.108277039722253,
"learning_rate": 3.597122302158273e-09,
"logits/chosen": -2.0260705947875977,
"logits/rejected": -2.022770643234253,
"logps/chosen": -1.052295446395874,
"logps/rejected": -1.1837208271026611,
"loss": 1.6167,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.104590892791748,
"rewards/margins": 0.26285091042518616,
"rewards/rejected": -2.3674416542053223,
"step": 20
},
{
"epoch": 0.021613832853025938,
"grad_norm": 20.47682519715639,
"learning_rate": 5.3956834532374095e-09,
"logits/chosen": -1.9848406314849854,
"logits/rejected": -1.9775378704071045,
"logps/chosen": -1.0540497303009033,
"logps/rejected": -1.1514469385147095,
"loss": 1.6715,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1080994606018066,
"rewards/margins": 0.19479455053806305,
"rewards/rejected": -2.302893877029419,
"step": 30
},
{
"epoch": 0.02881844380403458,
"grad_norm": 22.578054082763025,
"learning_rate": 7.194244604316546e-09,
"logits/chosen": -2.0309205055236816,
"logits/rejected": -2.030827045440674,
"logps/chosen": -1.0357428789138794,
"logps/rejected": -1.1376559734344482,
"loss": 1.674,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.071485757827759,
"rewards/margins": 0.20382657647132874,
"rewards/rejected": -2.2753119468688965,
"step": 40
},
{
"epoch": 0.03602305475504323,
"grad_norm": 17.189127890707947,
"learning_rate": 8.992805755395683e-09,
"logits/chosen": -1.9604355096817017,
"logits/rejected": -1.9610908031463623,
"logps/chosen": -0.9419905543327332,
"logps/rejected": -1.0071475505828857,
"loss": 1.7048,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.8839811086654663,
"rewards/margins": 0.13031414151191711,
"rewards/rejected": -2.0142951011657715,
"step": 50
},
{
"epoch": 0.043227665706051875,
"grad_norm": 24.394161121983817,
"learning_rate": 1.0791366906474819e-08,
"logits/chosen": -2.0403716564178467,
"logits/rejected": -2.035911798477173,
"logps/chosen": -1.0892378091812134,
"logps/rejected": -1.1461578607559204,
"loss": 1.7173,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.1784756183624268,
"rewards/margins": 0.11384035646915436,
"rewards/rejected": -2.292315721511841,
"step": 60
},
{
"epoch": 0.05043227665706052,
"grad_norm": 23.080584749878106,
"learning_rate": 1.2589928057553956e-08,
"logits/chosen": -2.0298831462860107,
"logits/rejected": -2.0174343585968018,
"logps/chosen": -1.109933614730835,
"logps/rejected": -1.2047233581542969,
"loss": 1.6667,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.21986722946167,
"rewards/margins": 0.18957947194576263,
"rewards/rejected": -2.4094467163085938,
"step": 70
},
{
"epoch": 0.05763688760806916,
"grad_norm": 28.510083775511152,
"learning_rate": 1.4388489208633092e-08,
"logits/chosen": -2.0415005683898926,
"logits/rejected": -2.0385377407073975,
"logps/chosen": -1.1662975549697876,
"logps/rejected": -1.2378699779510498,
"loss": 1.7003,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.332595109939575,
"rewards/margins": 0.1431449055671692,
"rewards/rejected": -2.4757399559020996,
"step": 80
},
{
"epoch": 0.06484149855907781,
"grad_norm": 18.099831598265492,
"learning_rate": 1.618705035971223e-08,
"logits/chosen": -2.003298044204712,
"logits/rejected": -2.004725933074951,
"logps/chosen": -1.0415083169937134,
"logps/rejected": -1.149029016494751,
"loss": 1.6519,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.0830166339874268,
"rewards/margins": 0.21504120528697968,
"rewards/rejected": -2.298058032989502,
"step": 90
},
{
"epoch": 0.07204610951008646,
"grad_norm": 21.6296417312396,
"learning_rate": 1.7985611510791365e-08,
"logits/chosen": -2.036734104156494,
"logits/rejected": -2.0305848121643066,
"logps/chosen": -1.0069749355316162,
"logps/rejected": -1.1141220331192017,
"loss": 1.654,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.0139498710632324,
"rewards/margins": 0.21429400146007538,
"rewards/rejected": -2.2282440662384033,
"step": 100
},
{
"epoch": 0.0792507204610951,
"grad_norm": 18.402417176588862,
"learning_rate": 1.9784172661870502e-08,
"logits/chosen": -1.9797817468643188,
"logits/rejected": -1.9685176610946655,
"logps/chosen": -1.0294291973114014,
"logps/rejected": -1.1286334991455078,
"loss": 1.6659,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0588583946228027,
"rewards/margins": 0.19840820133686066,
"rewards/rejected": -2.2572669982910156,
"step": 110
},
{
"epoch": 0.08645533141210375,
"grad_norm": 20.729624339345094,
"learning_rate": 2.1582733812949638e-08,
"logits/chosen": -1.9758269786834717,
"logits/rejected": -1.974029541015625,
"logps/chosen": -0.964306652545929,
"logps/rejected": -1.0657222270965576,
"loss": 1.6486,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.928613305091858,
"rewards/margins": 0.20283102989196777,
"rewards/rejected": -2.1314444541931152,
"step": 120
},
{
"epoch": 0.0936599423631124,
"grad_norm": 20.17439332595769,
"learning_rate": 2.3381294964028775e-08,
"logits/chosen": -2.0696139335632324,
"logits/rejected": -2.068974733352661,
"logps/chosen": -1.0797998905181885,
"logps/rejected": -1.1516422033309937,
"loss": 1.7012,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.159599781036377,
"rewards/margins": 0.14368471503257751,
"rewards/rejected": -2.3032844066619873,
"step": 130
},
{
"epoch": 0.10086455331412104,
"grad_norm": 24.064780949371126,
"learning_rate": 2.517985611510791e-08,
"logits/chosen": -1.9815738201141357,
"logits/rejected": -1.9751752614974976,
"logps/chosen": -0.9776951670646667,
"logps/rejected": -1.1230800151824951,
"loss": 1.5974,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9553903341293335,
"rewards/margins": 0.2907695770263672,
"rewards/rejected": -2.2461600303649902,
"step": 140
},
{
"epoch": 0.10806916426512968,
"grad_norm": 23.00427709241572,
"learning_rate": 2.6978417266187048e-08,
"logits/chosen": -1.99484121799469,
"logits/rejected": -1.9905335903167725,
"logps/chosen": -1.0193841457366943,
"logps/rejected": -1.1368898153305054,
"loss": 1.6404,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0387682914733887,
"rewards/margins": 0.23501136898994446,
"rewards/rejected": -2.2737796306610107,
"step": 150
},
{
"epoch": 0.11527377521613832,
"grad_norm": 20.432294969630874,
"learning_rate": 2.8776978417266184e-08,
"logits/chosen": -1.997571587562561,
"logits/rejected": -1.9914041757583618,
"logps/chosen": -0.947496771812439,
"logps/rejected": -1.0964053869247437,
"loss": 1.5792,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.894993543624878,
"rewards/margins": 0.2978169918060303,
"rewards/rejected": -2.1928107738494873,
"step": 160
},
{
"epoch": 0.12247838616714697,
"grad_norm": 25.103842505396916,
"learning_rate": 3.057553956834532e-08,
"logits/chosen": -2.006762981414795,
"logits/rejected": -1.9991118907928467,
"logps/chosen": -1.0366116762161255,
"logps/rejected": -1.1614980697631836,
"loss": 1.6344,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.073223352432251,
"rewards/margins": 0.2497730255126953,
"rewards/rejected": -2.322996139526367,
"step": 170
},
{
"epoch": 0.12968299711815562,
"grad_norm": 26.27968111051558,
"learning_rate": 3.237410071942446e-08,
"logits/chosen": -2.0409793853759766,
"logits/rejected": -2.034149646759033,
"logps/chosen": -1.0202006101608276,
"logps/rejected": -1.108983039855957,
"loss": 1.6865,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0404012203216553,
"rewards/margins": 0.177564799785614,
"rewards/rejected": -2.217966079711914,
"step": 180
},
{
"epoch": 0.13688760806916425,
"grad_norm": 25.939125627248345,
"learning_rate": 3.4172661870503594e-08,
"logits/chosen": -2.0743298530578613,
"logits/rejected": -2.072180986404419,
"logps/chosen": -0.9696714282035828,
"logps/rejected": -1.065748929977417,
"loss": 1.6537,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9393428564071655,
"rewards/margins": 0.19215507805347443,
"rewards/rejected": -2.131497859954834,
"step": 190
},
{
"epoch": 0.1440922190201729,
"grad_norm": 26.153404922121894,
"learning_rate": 3.597122302158273e-08,
"logits/chosen": -2.0394513607025146,
"logits/rejected": -2.0364089012145996,
"logps/chosen": -1.0258630514144897,
"logps/rejected": -1.1529974937438965,
"loss": 1.6189,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0517261028289795,
"rewards/margins": 0.2542688548564911,
"rewards/rejected": -2.305994987487793,
"step": 200
},
{
"epoch": 0.15129682997118155,
"grad_norm": 23.72175833145625,
"learning_rate": 3.776978417266187e-08,
"logits/chosen": -2.034412384033203,
"logits/rejected": -2.0315709114074707,
"logps/chosen": -1.073925256729126,
"logps/rejected": -1.1507259607315063,
"loss": 1.6945,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.147850513458252,
"rewards/margins": 0.1536014825105667,
"rewards/rejected": -2.3014519214630127,
"step": 210
},
{
"epoch": 0.1585014409221902,
"grad_norm": 18.049421620748525,
"learning_rate": 3.9568345323741003e-08,
"logits/chosen": -1.9837948083877563,
"logits/rejected": -1.9797385931015015,
"logps/chosen": -1.007852554321289,
"logps/rejected": -1.1767760515213013,
"loss": 1.5721,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.015705108642578,
"rewards/margins": 0.33784690499305725,
"rewards/rejected": -2.3535521030426025,
"step": 220
},
{
"epoch": 0.16570605187319884,
"grad_norm": 19.432289400345837,
"learning_rate": 4.136690647482014e-08,
"logits/chosen": -2.0252606868743896,
"logits/rejected": -2.025735378265381,
"logps/chosen": -1.0125794410705566,
"logps/rejected": -1.1261564493179321,
"loss": 1.6379,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.0251588821411133,
"rewards/margins": 0.22715386748313904,
"rewards/rejected": -2.2523128986358643,
"step": 230
},
{
"epoch": 0.1729106628242075,
"grad_norm": 25.42149329995876,
"learning_rate": 4.3165467625899276e-08,
"logits/chosen": -2.0474588871002197,
"logits/rejected": -2.042466163635254,
"logps/chosen": -1.0612871646881104,
"logps/rejected": -1.1391594409942627,
"loss": 1.7029,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.1225743293762207,
"rewards/margins": 0.15574422478675842,
"rewards/rejected": -2.2783188819885254,
"step": 240
},
{
"epoch": 0.18011527377521613,
"grad_norm": 21.827295632014227,
"learning_rate": 4.496402877697841e-08,
"logits/chosen": -1.9690139293670654,
"logits/rejected": -1.9651823043823242,
"logps/chosen": -1.081837773323059,
"logps/rejected": -1.173208236694336,
"loss": 1.6762,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.163675546646118,
"rewards/margins": 0.18274101614952087,
"rewards/rejected": -2.346416473388672,
"step": 250
},
{
"epoch": 0.1873198847262248,
"grad_norm": 23.883537952919536,
"learning_rate": 4.676258992805755e-08,
"logits/chosen": -1.9890559911727905,
"logits/rejected": -1.9971050024032593,
"logps/chosen": -1.1051918268203735,
"logps/rejected": -1.2165734767913818,
"loss": 1.6485,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.210383653640747,
"rewards/margins": 0.22276310622692108,
"rewards/rejected": -2.4331469535827637,
"step": 260
},
{
"epoch": 0.19452449567723343,
"grad_norm": 23.452229796100493,
"learning_rate": 4.8561151079136686e-08,
"logits/chosen": -2.0651626586914062,
"logits/rejected": -2.0570404529571533,
"logps/chosen": -1.0715770721435547,
"logps/rejected": -1.2007033824920654,
"loss": 1.6136,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.1431541442871094,
"rewards/margins": 0.2582527697086334,
"rewards/rejected": -2.401406764984131,
"step": 270
},
{
"epoch": 0.2017291066282421,
"grad_norm": 29.05416335946149,
"learning_rate": 4.999992091672379e-08,
"logits/chosen": -2.0108678340911865,
"logits/rejected": -2.0090978145599365,
"logps/chosen": -0.9353054761886597,
"logps/rejected": -1.0496169328689575,
"loss": 1.6345,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.8706109523773193,
"rewards/margins": 0.22862282395362854,
"rewards/rejected": -2.099233865737915,
"step": 280
},
{
"epoch": 0.20893371757925072,
"grad_norm": 24.746568624535307,
"learning_rate": 4.999715305459108e-08,
"logits/chosen": -2.0434165000915527,
"logits/rejected": -2.045293092727661,
"logps/chosen": -1.0135209560394287,
"logps/rejected": -1.1082584857940674,
"loss": 1.6735,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.0270419120788574,
"rewards/margins": 0.18947532773017883,
"rewards/rejected": -2.2165169715881348,
"step": 290
},
{
"epoch": 0.21613832853025935,
"grad_norm": 23.704466162259774,
"learning_rate": 4.9990431528966836e-08,
"logits/chosen": -2.0209240913391113,
"logits/rejected": -2.012465476989746,
"logps/chosen": -1.0895938873291016,
"logps/rejected": -1.1909050941467285,
"loss": 1.653,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.179187774658203,
"rewards/margins": 0.2026222050189972,
"rewards/rejected": -2.381810188293457,
"step": 300
},
{
"epoch": 0.22334293948126802,
"grad_norm": 21.29734105171553,
"learning_rate": 4.997975740295813e-08,
"logits/chosen": -1.9576565027236938,
"logits/rejected": -1.9576654434204102,
"logps/chosen": -1.0862493515014648,
"logps/rejected": -1.172272801399231,
"loss": 1.6787,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1724987030029297,
"rewards/margins": 0.17204709351062775,
"rewards/rejected": -2.344545602798462,
"step": 310
},
{
"epoch": 0.23054755043227665,
"grad_norm": 18.963202937894597,
"learning_rate": 4.996513236483331e-08,
"logits/chosen": -2.033639907836914,
"logits/rejected": -2.025113821029663,
"logps/chosen": -1.008597493171692,
"logps/rejected": -1.1411330699920654,
"loss": 1.623,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.017194986343384,
"rewards/margins": 0.26507094502449036,
"rewards/rejected": -2.282266139984131,
"step": 320
},
{
"epoch": 0.2377521613832853,
"grad_norm": 18.118259435513053,
"learning_rate": 4.9946558727754974e-08,
"logits/chosen": -2.013747453689575,
"logits/rejected": -2.015979528427124,
"logps/chosen": -1.0458552837371826,
"logps/rejected": -1.0690838098526,
"loss": 1.7903,
"rewards/accuracies": 0.46875,
"rewards/chosen": -2.0917105674743652,
"rewards/margins": 0.04645707830786705,
"rewards/rejected": -2.1381676197052,
"step": 330
},
{
"epoch": 0.24495677233429394,
"grad_norm": 21.713334152733406,
"learning_rate": 4.9924039429414086e-08,
"logits/chosen": -2.0605921745300293,
"logits/rejected": -2.0548267364501953,
"logps/chosen": -1.0870946645736694,
"logps/rejected": -1.1672402620315552,
"loss": 1.6817,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.174189329147339,
"rewards/margins": 0.1602911800146103,
"rewards/rejected": -2.3344805240631104,
"step": 340
},
{
"epoch": 0.2521613832853026,
"grad_norm": 21.7230708429387,
"learning_rate": 4.989757803156537e-08,
"logits/chosen": -1.9891109466552734,
"logits/rejected": -1.983432412147522,
"logps/chosen": -0.988193690776825,
"logps/rejected": -1.115260362625122,
"loss": 1.6191,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.97638738155365,
"rewards/margins": 0.2541332542896271,
"rewards/rejected": -2.230520725250244,
"step": 350
},
{
"epoch": 0.25936599423631124,
"grad_norm": 24.29869182695538,
"learning_rate": 4.986717871946393e-08,
"logits/chosen": -1.995234727859497,
"logits/rejected": -1.9911472797393799,
"logps/chosen": -1.0861847400665283,
"logps/rejected": -1.202515959739685,
"loss": 1.6318,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.1723694801330566,
"rewards/margins": 0.23266229033470154,
"rewards/rejected": -2.40503191947937,
"step": 360
},
{
"epoch": 0.2665706051873199,
"grad_norm": 21.151241640685495,
"learning_rate": 4.983284630120331e-08,
"logits/chosen": -2.0005943775177,
"logits/rejected": -2.0005276203155518,
"logps/chosen": -1.050954818725586,
"logps/rejected": -1.180293083190918,
"loss": 1.6091,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.101909637451172,
"rewards/margins": 0.25867652893066406,
"rewards/rejected": -2.360586166381836,
"step": 370
},
{
"epoch": 0.2737752161383285,
"grad_norm": 18.702979404643415,
"learning_rate": 4.979458620695505e-08,
"logits/chosen": -2.0280709266662598,
"logits/rejected": -2.0321407318115234,
"logps/chosen": -1.01195228099823,
"logps/rejected": -1.0852843523025513,
"loss": 1.7127,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.02390456199646,
"rewards/margins": 0.14666402339935303,
"rewards/rejected": -2.1705687046051025,
"step": 380
},
{
"epoch": 0.28097982708933716,
"grad_norm": 18.04619323368983,
"learning_rate": 4.975240448810977e-08,
"logits/chosen": -2.0287792682647705,
"logits/rejected": -2.0225093364715576,
"logps/chosen": -1.0217763185501099,
"logps/rejected": -1.1489847898483276,
"loss": 1.614,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0435526371002197,
"rewards/margins": 0.25441741943359375,
"rewards/rejected": -2.2979695796966553,
"step": 390
},
{
"epoch": 0.2881844380403458,
"grad_norm": 22.031189167579363,
"learning_rate": 4.970630781632009e-08,
"logits/chosen": -2.034381628036499,
"logits/rejected": -2.034792423248291,
"logps/chosen": -0.9954347610473633,
"logps/rejected": -1.0486609935760498,
"loss": 1.7224,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.9908695220947266,
"rewards/margins": 0.10645285993814468,
"rewards/rejected": -2.0973219871520996,
"step": 400
},
{
"epoch": 0.2953890489913545,
"grad_norm": 21.234234457439968,
"learning_rate": 4.965630348244542e-08,
"logits/chosen": -2.0295231342315674,
"logits/rejected": -2.027569532394409,
"logps/chosen": -1.0738043785095215,
"logps/rejected": -1.1459261178970337,
"loss": 1.7042,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.147608757019043,
"rewards/margins": 0.14424346387386322,
"rewards/rejected": -2.2918522357940674,
"step": 410
},
{
"epoch": 0.3025936599423631,
"grad_norm": 19.466288881985548,
"learning_rate": 4.9602399395398786e-08,
"logits/chosen": -2.0115177631378174,
"logits/rejected": -2.0157418251037598,
"logps/chosen": -1.0445759296417236,
"logps/rejected": -1.1231411695480347,
"loss": 1.6872,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0891518592834473,
"rewards/margins": 0.15713071823120117,
"rewards/rejected": -2.2462823390960693,
"step": 420
},
{
"epoch": 0.30979827089337175,
"grad_norm": 21.116060561215924,
"learning_rate": 4.95446040808959e-08,
"logits/chosen": -1.9870742559432983,
"logits/rejected": -1.9879848957061768,
"logps/chosen": -1.0581797361373901,
"logps/rejected": -1.099675178527832,
"loss": 1.7526,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -2.1163594722747803,
"rewards/margins": 0.08299090713262558,
"rewards/rejected": -2.199350357055664,
"step": 430
},
{
"epoch": 0.3170028818443804,
"grad_norm": 18.736237528011962,
"learning_rate": 4.948292668010676e-08,
"logits/chosen": -1.9880409240722656,
"logits/rejected": -1.988071084022522,
"logps/chosen": -1.0214247703552246,
"logps/rejected": -1.1438568830490112,
"loss": 1.6286,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.042849540710449,
"rewards/margins": 0.2448642998933792,
"rewards/rejected": -2.2877137660980225,
"step": 440
},
{
"epoch": 0.3242074927953891,
"grad_norm": 20.43882931641836,
"learning_rate": 4.941737694820975e-08,
"logits/chosen": -2.0112996101379395,
"logits/rejected": -2.0076920986175537,
"logps/chosen": -1.144315242767334,
"logps/rejected": -1.1844433546066284,
"loss": 1.7537,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.288630485534668,
"rewards/margins": 0.08025630563497543,
"rewards/rejected": -2.368886709213257,
"step": 450
},
{
"epoch": 0.3314121037463977,
"grad_norm": 28.239708991055796,
"learning_rate": 4.93479652528488e-08,
"logits/chosen": -2.007514476776123,
"logits/rejected": -2.0019874572753906,
"logps/chosen": -1.1697793006896973,
"logps/rejected": -1.2875298261642456,
"loss": 1.6351,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.3395586013793945,
"rewards/margins": 0.23550114035606384,
"rewards/rejected": -2.575059652328491,
"step": 460
},
{
"epoch": 0.33861671469740634,
"grad_norm": 26.337152572859576,
"learning_rate": 4.9274702572493555e-08,
"logits/chosen": -2.040773868560791,
"logits/rejected": -2.028566598892212,
"logps/chosen": -1.0992854833602905,
"logps/rejected": -1.2018510103225708,
"loss": 1.6561,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.198570966720581,
"rewards/margins": 0.20513089001178741,
"rewards/rejected": -2.4037020206451416,
"step": 470
},
{
"epoch": 0.345821325648415,
"grad_norm": 25.04785849160215,
"learning_rate": 4.9197600494702955e-08,
"logits/chosen": -2.10146164894104,
"logits/rejected": -2.091294765472412,
"logps/chosen": -0.9840625524520874,
"logps/rejected": -1.106227993965149,
"loss": 1.6196,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.9681251049041748,
"rewards/margins": 0.24433092772960663,
"rewards/rejected": -2.212455987930298,
"step": 480
},
{
"epoch": 0.3530259365994236,
"grad_norm": 21.442417002640138,
"learning_rate": 4.9116671214292526e-08,
"logits/chosen": -2.017040491104126,
"logits/rejected": -2.0160226821899414,
"logps/chosen": -0.98698490858078,
"logps/rejected": -1.1014493703842163,
"loss": 1.6301,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.97396981716156,
"rewards/margins": 0.22892877459526062,
"rewards/rejected": -2.2028987407684326,
"step": 490
},
{
"epoch": 0.36023054755043227,
"grad_norm": 20.760566848988777,
"learning_rate": 4.903192753140557e-08,
"logits/chosen": -2.019878387451172,
"logits/rejected": -2.0063798427581787,
"logps/chosen": -1.0110963582992554,
"logps/rejected": -1.1396064758300781,
"loss": 1.6125,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0221927165985107,
"rewards/margins": 0.2570200562477112,
"rewards/rejected": -2.2792129516601562,
"step": 500
},
{
"epoch": 0.36743515850144093,
"grad_norm": 21.964679850201083,
"learning_rate": 4.894338284948866e-08,
"logits/chosen": -2.088066577911377,
"logits/rejected": -2.081502676010132,
"logps/chosen": -1.0429285764694214,
"logps/rejected": -1.1580368280410767,
"loss": 1.6406,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.0858571529388428,
"rewards/margins": 0.23021626472473145,
"rewards/rejected": -2.3160736560821533,
"step": 510
},
{
"epoch": 0.3746397694524496,
"grad_norm": 18.869869897878324,
"learning_rate": 4.8851051173171656e-08,
"logits/chosen": -2.0141520500183105,
"logits/rejected": -2.0126335620880127,
"logps/chosen": -1.1057158708572388,
"logps/rejected": -1.2025840282440186,
"loss": 1.6603,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.2114317417144775,
"rewards/margins": 0.1937362551689148,
"rewards/rejected": -2.405168056488037,
"step": 520
},
{
"epoch": 0.3818443804034582,
"grad_norm": 20.027593416077924,
"learning_rate": 4.8754947106052696e-08,
"logits/chosen": -1.9756828546524048,
"logits/rejected": -1.9658939838409424,
"logps/chosen": -0.983010470867157,
"logps/rejected": -1.060978651046753,
"loss": 1.6924,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.966020941734314,
"rewards/margins": 0.15593627095222473,
"rewards/rejected": -2.121957302093506,
"step": 530
},
{
"epoch": 0.38904899135446686,
"grad_norm": 25.513748888084603,
"learning_rate": 4.865508584838841e-08,
"logits/chosen": -2.0021884441375732,
"logits/rejected": -1.9952195882797241,
"logps/chosen": -1.02981436252594,
"logps/rejected": -1.1320369243621826,
"loss": 1.6593,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.05962872505188,
"rewards/margins": 0.2044449746608734,
"rewards/rejected": -2.2640738487243652,
"step": 540
},
{
"epoch": 0.3962536023054755,
"grad_norm": 20.013888186622026,
"learning_rate": 4.855148319468979e-08,
"logits/chosen": -1.9607187509536743,
"logits/rejected": -1.9611743688583374,
"logps/chosen": -0.9991506338119507,
"logps/rejected": -1.0771278142929077,
"loss": 1.7089,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -1.9983012676239014,
"rewards/margins": 0.15595442056655884,
"rewards/rejected": -2.1542556285858154,
"step": 550
},
{
"epoch": 0.4034582132564842,
"grad_norm": 19.766020419371234,
"learning_rate": 4.8444155531224065e-08,
"logits/chosen": -2.0277419090270996,
"logits/rejected": -2.0282044410705566,
"logps/chosen": -1.0792930126190186,
"logps/rejected": -1.1801952123641968,
"loss": 1.6624,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.158586025238037,
"rewards/margins": 0.20180411636829376,
"rewards/rejected": -2.3603904247283936,
"step": 560
},
{
"epoch": 0.4106628242074928,
"grad_norm": 18.138026924157977,
"learning_rate": 4.833311983342292e-08,
"logits/chosen": -2.031890869140625,
"logits/rejected": -2.0176992416381836,
"logps/chosen": -1.0929630994796753,
"logps/rejected": -1.2075343132019043,
"loss": 1.6422,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.1859261989593506,
"rewards/margins": 0.22914230823516846,
"rewards/rejected": -2.4150686264038086,
"step": 570
},
{
"epoch": 0.41786743515850144,
"grad_norm": 21.60868876989794,
"learning_rate": 4.821839366319768e-08,
"logits/chosen": -2.0339295864105225,
"logits/rejected": -2.0323901176452637,
"logps/chosen": -0.9861429929733276,
"logps/rejected": -1.0666794776916504,
"loss": 1.6891,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.9722859859466553,
"rewards/margins": 0.16107279062271118,
"rewards/rejected": -2.133358955383301,
"step": 580
},
{
"epoch": 0.4250720461095101,
"grad_norm": 24.05826502769421,
"learning_rate": 4.8099995166161536e-08,
"logits/chosen": -2.0223276615142822,
"logits/rejected": -2.022588014602661,
"logps/chosen": -1.0257574319839478,
"logps/rejected": -1.1649348735809326,
"loss": 1.5961,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.0515148639678955,
"rewards/margins": 0.27835482358932495,
"rewards/rejected": -2.3298697471618652,
"step": 590
},
{
"epoch": 0.4322766570605187,
"grad_norm": 22.540205396639372,
"learning_rate": 4.797794306875963e-08,
"logits/chosen": -2.0817413330078125,
"logits/rejected": -2.0777947902679443,
"logps/chosen": -1.032037615776062,
"logps/rejected": -1.1749569177627563,
"loss": 1.597,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.064075231552124,
"rewards/margins": 0.28583866357803345,
"rewards/rejected": -2.3499138355255127,
"step": 600
},
{
"epoch": 0.43948126801152737,
"grad_norm": 24.185799296259226,
"learning_rate": 4.785225667530716e-08,
"logits/chosen": -2.0293679237365723,
"logits/rejected": -2.019531488418579,
"logps/chosen": -1.0911657810211182,
"logps/rejected": -1.1501439809799194,
"loss": 1.7163,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.1823315620422363,
"rewards/margins": 0.11795620620250702,
"rewards/rejected": -2.300287961959839,
"step": 610
},
{
"epoch": 0.44668587896253603,
"grad_norm": 28.363607554135946,
"learning_rate": 4.772295586493613e-08,
"logits/chosen": -2.0295448303222656,
"logits/rejected": -2.0222747325897217,
"logps/chosen": -0.9944518804550171,
"logps/rejected": -1.082177758216858,
"loss": 1.6763,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -1.9889037609100342,
"rewards/margins": 0.1754516065120697,
"rewards/rejected": -2.164355516433716,
"step": 620
},
{
"epoch": 0.4538904899135447,
"grad_norm": 23.047596709189968,
"learning_rate": 4.759006108845116e-08,
"logits/chosen": -2.039217472076416,
"logits/rejected": -2.0391504764556885,
"logps/chosen": -1.0260940790176392,
"logps/rejected": -1.1543083190917969,
"loss": 1.6159,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.0521881580352783,
"rewards/margins": 0.2564285397529602,
"rewards/rejected": -2.3086166381835938,
"step": 630
},
{
"epoch": 0.4610951008645533,
"grad_norm": 18.761921757090445,
"learning_rate": 4.7453593365094926e-08,
"logits/chosen": -1.9543355703353882,
"logits/rejected": -1.955255150794983,
"logps/chosen": -1.0088173151016235,
"logps/rejected": -1.116272211074829,
"loss": 1.6458,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.017634630203247,
"rewards/margins": 0.21490976214408875,
"rewards/rejected": -2.232544422149658,
"step": 640
},
{
"epoch": 0.46829971181556196,
"grad_norm": 24.54554910010224,
"learning_rate": 4.731357427922361e-08,
"logits/chosen": -2.053588390350342,
"logits/rejected": -2.038914442062378,
"logps/chosen": -1.0363117456436157,
"logps/rejected": -1.1169908046722412,
"loss": 1.6897,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.0726234912872314,
"rewards/margins": 0.1613583266735077,
"rewards/rejected": -2.2339816093444824,
"step": 650
},
{
"epoch": 0.4755043227665706,
"grad_norm": 26.37090068258741,
"learning_rate": 4.71700259768931e-08,
"logits/chosen": -2.032832145690918,
"logits/rejected": -2.0338807106018066,
"logps/chosen": -1.0859205722808838,
"logps/rejected": -1.174154281616211,
"loss": 1.6831,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.1718411445617676,
"rewards/margins": 0.17646734416484833,
"rewards/rejected": -2.348308563232422,
"step": 660
},
{
"epoch": 0.4827089337175792,
"grad_norm": 24.073708164903586,
"learning_rate": 4.7022971162356176e-08,
"logits/chosen": -2.0062692165374756,
"logits/rejected": -1.9969465732574463,
"logps/chosen": -1.060937523841858,
"logps/rejected": -1.1657707691192627,
"loss": 1.6525,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.121875047683716,
"rewards/margins": 0.20966656506061554,
"rewards/rejected": -2.3315415382385254,
"step": 670
},
{
"epoch": 0.4899135446685879,
"grad_norm": 21.794836443162072,
"learning_rate": 4.6872433094471577e-08,
"logits/chosen": -1.962633490562439,
"logits/rejected": -1.953546166419983,
"logps/chosen": -1.0352494716644287,
"logps/rejected": -1.1061433553695679,
"loss": 1.7057,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0704989433288574,
"rewards/margins": 0.1417877972126007,
"rewards/rejected": -2.2122867107391357,
"step": 680
},
{
"epoch": 0.49711815561959655,
"grad_norm": 29.899603243129143,
"learning_rate": 4.671843558302522e-08,
"logits/chosen": -2.025979518890381,
"logits/rejected": -2.020822525024414,
"logps/chosen": -1.1030082702636719,
"logps/rejected": -1.2090809345245361,
"loss": 1.6608,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.2060165405273438,
"rewards/margins": 0.21214473247528076,
"rewards/rejected": -2.4181618690490723,
"step": 690
},
{
"epoch": 0.5043227665706052,
"grad_norm": 23.0987112187863,
"learning_rate": 4.656100298496439e-08,
"logits/chosen": -2.0181777477264404,
"logits/rejected": -2.0126051902770996,
"logps/chosen": -1.017418622970581,
"logps/rejected": -1.1510635614395142,
"loss": 1.6139,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.034837245941162,
"rewards/margins": 0.267289936542511,
"rewards/rejected": -2.3021271228790283,
"step": 700
},
{
"epoch": 0.5115273775216138,
"grad_norm": 21.978629277918778,
"learning_rate": 4.640016020054527e-08,
"logits/chosen": -1.9824316501617432,
"logits/rejected": -1.979188323020935,
"logps/chosen": -0.8954793810844421,
"logps/rejected": -1.0217931270599365,
"loss": 1.6279,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.7909587621688843,
"rewards/margins": 0.25262752175331116,
"rewards/rejected": -2.043586254119873,
"step": 710
},
{
"epoch": 0.5187319884726225,
"grad_norm": 21.932059439411073,
"learning_rate": 4.6235932669394676e-08,
"logits/chosen": -2.011847972869873,
"logits/rejected": -2.0056471824645996,
"logps/chosen": -1.0409430265426636,
"logps/rejected": -1.1656509637832642,
"loss": 1.6179,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.081886053085327,
"rewards/margins": 0.24941587448120117,
"rewards/rejected": -2.3313019275665283,
"step": 720
},
{
"epoch": 0.5259365994236311,
"grad_norm": 23.306210789596093,
"learning_rate": 4.6068346366486325e-08,
"logits/chosen": -2.013566732406616,
"logits/rejected": -2.002554178237915,
"logps/chosen": -1.0203325748443604,
"logps/rejected": -1.104425072669983,
"loss": 1.6898,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0406651496887207,
"rewards/margins": 0.168185293674469,
"rewards/rejected": -2.208850145339966,
"step": 730
},
{
"epoch": 0.5331412103746398,
"grad_norm": 20.259951331338133,
"learning_rate": 4.589742779803259e-08,
"logits/chosen": -1.9962953329086304,
"logits/rejected": -2.003087043762207,
"logps/chosen": -1.017392635345459,
"logps/rejected": -1.0792462825775146,
"loss": 1.7292,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -2.034785270690918,
"rewards/margins": 0.12370713800191879,
"rewards/rejected": -2.1584925651550293,
"step": 740
},
{
"epoch": 0.5403458213256485,
"grad_norm": 26.41919083462638,
"learning_rate": 4.5723203997292146e-08,
"logits/chosen": -2.014768123626709,
"logits/rejected": -2.009498119354248,
"logps/chosen": -1.0986969470977783,
"logps/rejected": -1.1911667585372925,
"loss": 1.6761,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.1973938941955566,
"rewards/margins": 0.18493981659412384,
"rewards/rejected": -2.382333517074585,
"step": 750
},
{
"epoch": 0.547550432276657,
"grad_norm": 22.856174101882264,
"learning_rate": 4.554570252029421e-08,
"logits/chosen": -1.9752864837646484,
"logits/rejected": -1.9790761470794678,
"logps/chosen": -1.0023285150527954,
"logps/rejected": -1.1136945486068726,
"loss": 1.6506,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.004657030105591,
"rewards/margins": 0.22273211181163788,
"rewards/rejected": -2.227389097213745,
"step": 760
},
{
"epoch": 0.5547550432276657,
"grad_norm": 24.809677260951396,
"learning_rate": 4.536495144148021e-08,
"logits/chosen": -1.9749062061309814,
"logits/rejected": -1.9776439666748047,
"logps/chosen": -0.974290668964386,
"logps/rejected": -1.1291836500167847,
"loss": 1.5873,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.948581337928772,
"rewards/margins": 0.3097858428955078,
"rewards/rejected": -2.2583673000335693,
"step": 770
},
{
"epoch": 0.5619596541786743,
"grad_norm": 25.374366616475292,
"learning_rate": 4.518097934926339e-08,
"logits/chosen": -1.9943599700927734,
"logits/rejected": -1.993017554283142,
"logps/chosen": -1.0385878086090088,
"logps/rejected": -1.1217256784439087,
"loss": 1.681,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0771756172180176,
"rewards/margins": 0.16627538204193115,
"rewards/rejected": -2.2434513568878174,
"step": 780
},
{
"epoch": 0.569164265129683,
"grad_norm": 20.014037840378,
"learning_rate": 4.499381534150714e-08,
"logits/chosen": -2.0125200748443604,
"logits/rejected": -2.0065932273864746,
"logps/chosen": -1.0741461515426636,
"logps/rejected": -1.2335011959075928,
"loss": 1.5826,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.148292303085327,
"rewards/margins": 0.31871041655540466,
"rewards/rejected": -2.4670023918151855,
"step": 790
},
{
"epoch": 0.5763688760806917,
"grad_norm": 21.859452356308726,
"learning_rate": 4.48034890209227e-08,
"logits/chosen": -2.037662982940674,
"logits/rejected": -2.0339465141296387,
"logps/chosen": -1.0423095226287842,
"logps/rejected": -1.1221187114715576,
"loss": 1.6972,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.0846190452575684,
"rewards/margins": 0.1596187800168991,
"rewards/rejected": -2.2442374229431152,
"step": 800
},
{
"epoch": 0.5835734870317003,
"grad_norm": 18.638956420354962,
"learning_rate": 4.4610030490387154e-08,
"logits/chosen": -2.01869797706604,
"logits/rejected": -2.0211963653564453,
"logps/chosen": -1.0117915868759155,
"logps/rejected": -1.1026709079742432,
"loss": 1.6729,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.023583173751831,
"rewards/margins": 0.18175864219665527,
"rewards/rejected": -2.2053418159484863,
"step": 810
},
{
"epoch": 0.590778097982709,
"grad_norm": 24.338984987337728,
"learning_rate": 4.4413470348182124e-08,
"logits/chosen": -2.020242214202881,
"logits/rejected": -2.0103182792663574,
"logps/chosen": -1.0275824069976807,
"logps/rejected": -1.1156085729599,
"loss": 1.6885,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0551648139953613,
"rewards/margins": 0.1760522723197937,
"rewards/rejected": -2.2312171459198,
"step": 820
},
{
"epoch": 0.5979827089337176,
"grad_norm": 21.286386599253383,
"learning_rate": 4.421383968315427e-08,
"logits/chosen": -2.000786781311035,
"logits/rejected": -1.9987096786499023,
"logps/chosen": -0.9581828117370605,
"logps/rejected": -1.079776644706726,
"loss": 1.6345,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.916365623474121,
"rewards/margins": 0.24318790435791016,
"rewards/rejected": -2.159553289413452,
"step": 830
},
{
"epoch": 0.6051873198847262,
"grad_norm": 18.656428501584173,
"learning_rate": 4.4011170069798126e-08,
"logits/chosen": -2.02819561958313,
"logits/rejected": -2.0282022953033447,
"logps/chosen": -1.0859931707382202,
"logps/rejected": -1.1599055528640747,
"loss": 1.7042,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.1719863414764404,
"rewards/margins": 0.14782461524009705,
"rewards/rejected": -2.3198111057281494,
"step": 840
},
{
"epoch": 0.6123919308357348,
"grad_norm": 18.413528164156084,
"learning_rate": 4.380549356326208e-08,
"logits/chosen": -2.0375325679779053,
"logits/rejected": -2.031755208969116,
"logps/chosen": -1.0479528903961182,
"logps/rejected": -1.1553928852081299,
"loss": 1.6564,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0959057807922363,
"rewards/margins": 0.21487931907176971,
"rewards/rejected": -2.3107857704162598,
"step": 850
},
{
"epoch": 0.6195965417867435,
"grad_norm": 20.693940592783264,
"learning_rate": 4.359684269427848e-08,
"logits/chosen": -2.009840726852417,
"logits/rejected": -2.0057568550109863,
"logps/chosen": -1.1601811647415161,
"logps/rejected": -1.2186861038208008,
"loss": 1.7248,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.3203623294830322,
"rewards/margins": 0.11700980365276337,
"rewards/rejected": -2.4373722076416016,
"step": 860
},
{
"epoch": 0.6268011527377522,
"grad_norm": 23.036875621985878,
"learning_rate": 4.3385250464018355e-08,
"logits/chosen": -2.0459847450256348,
"logits/rejected": -2.0400002002716064,
"logps/chosen": -1.0027254819869995,
"logps/rejected": -1.122941017150879,
"loss": 1.6293,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.005450963973999,
"rewards/margins": 0.2404308319091797,
"rewards/rejected": -2.245882034301758,
"step": 870
},
{
"epoch": 0.6340057636887608,
"grad_norm": 23.415166967310086,
"learning_rate": 4.3170750338871806e-08,
"logits/chosen": -2.034942626953125,
"logits/rejected": -2.032872438430786,
"logps/chosen": -1.0137414932250977,
"logps/rejected": -1.0794751644134521,
"loss": 1.7127,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0274829864501953,
"rewards/margins": 0.13146750628948212,
"rewards/rejected": -2.1589503288269043,
"step": 880
},
{
"epoch": 0.6412103746397695,
"grad_norm": 21.97241223292353,
"learning_rate": 4.295337624515485e-08,
"logits/chosen": -2.0610389709472656,
"logits/rejected": -2.0595154762268066,
"logps/chosen": -1.014024019241333,
"logps/rejected": -1.1112914085388184,
"loss": 1.6585,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.028048038482666,
"rewards/margins": 0.1945347934961319,
"rewards/rejected": -2.2225828170776367,
"step": 890
},
{
"epoch": 0.6484149855907781,
"grad_norm": 23.8540421436762,
"learning_rate": 4.273316256374342e-08,
"logits/chosen": -1.9776846170425415,
"logits/rejected": -1.9790922403335571,
"logps/chosen": -1.14115309715271,
"logps/rejected": -1.2139250040054321,
"loss": 1.7115,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.28230619430542,
"rewards/margins": 0.14554361999034882,
"rewards/rejected": -2.4278500080108643,
"step": 900
},
{
"epoch": 0.6556195965417867,
"grad_norm": 22.88947778871445,
"learning_rate": 4.2510144124635605e-08,
"logits/chosen": -1.9872970581054688,
"logits/rejected": -1.9910697937011719,
"logps/chosen": -1.0412156581878662,
"logps/rejected": -1.1080083847045898,
"loss": 1.7088,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.0824313163757324,
"rewards/margins": 0.1335853785276413,
"rewards/rejected": -2.2160167694091797,
"step": 910
},
{
"epoch": 0.6628242074927954,
"grad_norm": 25.19603603966565,
"learning_rate": 4.22843562014427e-08,
"logits/chosen": -2.016618490219116,
"logits/rejected": -2.0112040042877197,
"logps/chosen": -0.9714315533638,
"logps/rejected": -1.0625754594802856,
"loss": 1.675,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.9428631067276,
"rewards/margins": 0.18228788673877716,
"rewards/rejected": -2.1251509189605713,
"step": 920
},
{
"epoch": 0.670028818443804,
"grad_norm": 20.60480228984638,
"learning_rate": 4.205583450581023e-08,
"logits/chosen": -2.050994873046875,
"logits/rejected": -2.0482630729675293,
"logps/chosen": -1.0324729681015015,
"logps/rejected": -1.1509991884231567,
"loss": 1.6261,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.064945936203003,
"rewards/margins": 0.23705241084098816,
"rewards/rejected": -2.3019983768463135,
"step": 930
},
{
"epoch": 0.6772334293948127,
"grad_norm": 22.35823019195503,
"learning_rate": 4.1824615181769577e-08,
"logits/chosen": -2.0053532123565674,
"logits/rejected": -2.0111613273620605,
"logps/chosen": -1.1246191263198853,
"logps/rejected": -1.1943109035491943,
"loss": 1.6997,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.2492382526397705,
"rewards/margins": 0.13938355445861816,
"rewards/rejected": -2.3886218070983887,
"step": 940
},
{
"epoch": 0.6844380403458213,
"grad_norm": 18.162143432621694,
"learning_rate": 4.1590734800021354e-08,
"logits/chosen": -1.9661105871200562,
"logits/rejected": -1.9702253341674805,
"logps/chosen": -1.0093214511871338,
"logps/rejected": -1.1327455043792725,
"loss": 1.633,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0186429023742676,
"rewards/margins": 0.24684815108776093,
"rewards/rejected": -2.265491008758545,
"step": 950
},
{
"epoch": 0.69164265129683,
"grad_norm": 23.965182376610645,
"learning_rate": 4.1354230352151143e-08,
"logits/chosen": -2.041020154953003,
"logits/rejected": -2.040239095687866,
"logps/chosen": -1.0478591918945312,
"logps/rejected": -1.1593987941741943,
"loss": 1.6429,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0957183837890625,
"rewards/margins": 0.223079115152359,
"rewards/rejected": -2.3187975883483887,
"step": 960
},
{
"epoch": 0.6988472622478387,
"grad_norm": 24.464714064797324,
"learning_rate": 4.111513924477878e-08,
"logits/chosen": -2.043121337890625,
"logits/rejected": -2.0391170978546143,
"logps/chosen": -0.9660174250602722,
"logps/rejected": -1.0939273834228516,
"loss": 1.6115,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.9320348501205444,
"rewards/margins": 0.2558196187019348,
"rewards/rejected": -2.187854766845703,
"step": 970
},
{
"epoch": 0.7060518731988472,
"grad_norm": 20.201261103575792,
"learning_rate": 4.087349929364192e-08,
"logits/chosen": -1.9678367376327515,
"logits/rejected": -1.9675403833389282,
"logps/chosen": -1.1067652702331543,
"logps/rejected": -1.2355201244354248,
"loss": 1.6115,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.2135305404663086,
"rewards/margins": 0.25750917196273804,
"rewards/rejected": -2.4710402488708496,
"step": 980
},
{
"epoch": 0.7132564841498559,
"grad_norm": 25.954821168667177,
"learning_rate": 4.062934871761497e-08,
"logits/chosen": -2.0314321517944336,
"logits/rejected": -2.0284628868103027,
"logps/chosen": -1.1079853773117065,
"logps/rejected": -1.2067363262176514,
"loss": 1.6694,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.215970754623413,
"rewards/margins": 0.19750212132930756,
"rewards/rejected": -2.4134726524353027,
"step": 990
},
{
"epoch": 0.7204610951008645,
"grad_norm": 22.341732516108657,
"learning_rate": 4.038272613266419e-08,
"logits/chosen": -2.0092735290527344,
"logits/rejected": -2.0061213970184326,
"logps/chosen": -1.00548255443573,
"logps/rejected": -1.1272741556167603,
"loss": 1.6421,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.01096510887146,
"rewards/margins": 0.24358315765857697,
"rewards/rejected": -2.2545483112335205,
"step": 1000
},
{
"epoch": 0.7276657060518732,
"grad_norm": 26.857801922416726,
"learning_rate": 4.0133670545740014e-08,
"logits/chosen": -2.0216879844665527,
"logits/rejected": -2.0182127952575684,
"logps/chosen": -0.9998480677604675,
"logps/rejected": -1.0946764945983887,
"loss": 1.6911,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -1.999696135520935,
"rewards/margins": 0.1896568387746811,
"rewards/rejected": -2.1893529891967773,
"step": 1010
},
{
"epoch": 0.7348703170028819,
"grad_norm": 20.733064848722005,
"learning_rate": 3.988222134860755e-08,
"logits/chosen": -2.016014337539673,
"logits/rejected": -2.0111701488494873,
"logps/chosen": -1.0305395126342773,
"logps/rejected": -1.1271092891693115,
"loss": 1.6525,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0610790252685547,
"rewards/margins": 0.19313934445381165,
"rewards/rejected": -2.254218578338623,
"step": 1020
},
{
"epoch": 0.7420749279538905,
"grad_norm": 20.161762859239953,
"learning_rate": 3.962841831161617e-08,
"logits/chosen": -1.9683122634887695,
"logits/rejected": -1.9676278829574585,
"logps/chosen": -1.0196747779846191,
"logps/rejected": -1.1555341482162476,
"loss": 1.6269,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.0393495559692383,
"rewards/margins": 0.2717186510562897,
"rewards/rejected": -2.311068296432495,
"step": 1030
},
{
"epoch": 0.7492795389048992,
"grad_norm": 20.049038068269237,
"learning_rate": 3.937230157740931e-08,
"logits/chosen": -2.0240588188171387,
"logits/rejected": -2.018101692199707,
"logps/chosen": -1.0107640027999878,
"logps/rejected": -1.0943820476531982,
"loss": 1.6788,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.0215280055999756,
"rewards/margins": 0.1672360599040985,
"rewards/rejected": -2.1887640953063965,
"step": 1040
},
{
"epoch": 0.7564841498559077,
"grad_norm": 21.02175811345582,
"learning_rate": 3.9113911654575246e-08,
"logits/chosen": -1.967104196548462,
"logits/rejected": -1.9632108211517334,
"logps/chosen": -0.935411810874939,
"logps/rejected": -1.0687172412872314,
"loss": 1.6138,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.870823621749878,
"rewards/margins": 0.2666108012199402,
"rewards/rejected": -2.137434482574463,
"step": 1050
},
{
"epoch": 0.7636887608069164,
"grad_norm": 21.519122032376863,
"learning_rate": 3.885328941124014e-08,
"logits/chosen": -1.9991518259048462,
"logits/rejected": -1.993080496788025,
"logps/chosen": -1.0368106365203857,
"logps/rejected": -1.1475738286972046,
"loss": 1.6499,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0736212730407715,
"rewards/margins": 0.2215261161327362,
"rewards/rejected": -2.295147657394409,
"step": 1060
},
{
"epoch": 0.770893371757925,
"grad_norm": 23.262898735103672,
"learning_rate": 3.8590476068604106e-08,
"logits/chosen": -2.00036358833313,
"logits/rejected": -1.998552918434143,
"logps/chosen": -1.071908712387085,
"logps/rejected": -1.2022292613983154,
"loss": 1.6331,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.14381742477417,
"rewards/margins": 0.2606413960456848,
"rewards/rejected": -2.404458522796631,
"step": 1070
},
{
"epoch": 0.7780979827089337,
"grad_norm": 24.381883140284064,
"learning_rate": 3.832551319442151e-08,
"logits/chosen": -2.025217056274414,
"logits/rejected": -2.0259487628936768,
"logps/chosen": -1.0844666957855225,
"logps/rejected": -1.1969218254089355,
"loss": 1.6485,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.168933391571045,
"rewards/margins": 0.2249099314212799,
"rewards/rejected": -2.393843650817871,
"step": 1080
},
{
"epoch": 0.7853025936599424,
"grad_norm": 27.86719418333446,
"learning_rate": 3.8058442696426404e-08,
"logits/chosen": -2.0382745265960693,
"logits/rejected": -2.030484676361084,
"logps/chosen": -1.1074187755584717,
"logps/rejected": -1.211817979812622,
"loss": 1.6631,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.2148375511169434,
"rewards/margins": 0.20879819989204407,
"rewards/rejected": -2.423635959625244,
"step": 1090
},
{
"epoch": 0.792507204610951,
"grad_norm": 27.071495780290135,
"learning_rate": 3.7789306815704216e-08,
"logits/chosen": -2.026120185852051,
"logits/rejected": -2.020829916000366,
"logps/chosen": -1.0431629419326782,
"logps/rejected": -1.1798484325408936,
"loss": 1.6059,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.0863258838653564,
"rewards/margins": 0.27337145805358887,
"rewards/rejected": -2.359696865081787,
"step": 1100
},
{
"epoch": 0.7997118155619597,
"grad_norm": 18.88096690716272,
"learning_rate": 3.7518148120010705e-08,
"logits/chosen": -2.0271174907684326,
"logits/rejected": -2.0201258659362793,
"logps/chosen": -1.0074636936187744,
"logps/rejected": -1.1304162740707397,
"loss": 1.6266,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.014927387237549,
"rewards/margins": 0.24590542912483215,
"rewards/rejected": -2.2608325481414795,
"step": 1110
},
{
"epoch": 0.8069164265129684,
"grad_norm": 21.911720790106997,
"learning_rate": 3.7245009497039244e-08,
"logits/chosen": -1.9778887033462524,
"logits/rejected": -1.9702438116073608,
"logps/chosen": -1.045611023902893,
"logps/rejected": -1.1082854270935059,
"loss": 1.7162,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.091222047805786,
"rewards/margins": 0.1253490000963211,
"rewards/rejected": -2.2165708541870117,
"step": 1120
},
{
"epoch": 0.8141210374639769,
"grad_norm": 20.41807103685389,
"learning_rate": 3.696993414763753e-08,
"logits/chosen": -2.0123400688171387,
"logits/rejected": -2.0082285404205322,
"logps/chosen": -0.9978957176208496,
"logps/rejected": -1.085761308670044,
"loss": 1.6839,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -1.9957914352416992,
"rewards/margins": 0.17573121190071106,
"rewards/rejected": -2.171522617340088,
"step": 1130
},
{
"epoch": 0.8213256484149856,
"grad_norm": 19.092407867541205,
"learning_rate": 3.66929655789747e-08,
"logits/chosen": -2.0528008937835693,
"logits/rejected": -2.051527738571167,
"logps/chosen": -1.046097993850708,
"logps/rejected": -1.165477991104126,
"loss": 1.6311,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.092195987701416,
"rewards/margins": 0.23876003921031952,
"rewards/rejected": -2.330955982208252,
"step": 1140
},
{
"epoch": 0.8285302593659942,
"grad_norm": 21.260673310622927,
"learning_rate": 3.64141475976601e-08,
"logits/chosen": -2.041018009185791,
"logits/rejected": -2.0345215797424316,
"logps/chosen": -1.0739690065383911,
"logps/rejected": -1.1673578023910522,
"loss": 1.6763,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.1479380130767822,
"rewards/margins": 0.18677765130996704,
"rewards/rejected": -2.3347156047821045,
"step": 1150
},
{
"epoch": 0.8357348703170029,
"grad_norm": 26.34484214118367,
"learning_rate": 3.61335243028146e-08,
"logits/chosen": -1.9964382648468018,
"logits/rejected": -1.9950309991836548,
"logps/chosen": -1.0375313758850098,
"logps/rejected": -1.1169970035552979,
"loss": 1.706,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0750627517700195,
"rewards/margins": 0.15893153846263885,
"rewards/rejected": -2.2339940071105957,
"step": 1160
},
{
"epoch": 0.8429394812680115,
"grad_norm": 24.32927550103409,
"learning_rate": 3.585114007909562e-08,
"logits/chosen": -1.9961084127426147,
"logits/rejected": -1.9874632358551025,
"logps/chosen": -1.0144343376159668,
"logps/rejected": -1.1264407634735107,
"loss": 1.6405,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.0288686752319336,
"rewards/margins": 0.2240130603313446,
"rewards/rejected": -2.2528815269470215,
"step": 1170
},
{
"epoch": 0.8501440922190202,
"grad_norm": 26.36978810424015,
"learning_rate": 3.556703958967716e-08,
"logits/chosen": -1.9620872735977173,
"logits/rejected": -1.9580965042114258,
"logps/chosen": -1.059852123260498,
"logps/rejected": -1.1655702590942383,
"loss": 1.6635,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.119704246520996,
"rewards/margins": 0.21143603324890137,
"rewards/rejected": -2.3311405181884766,
"step": 1180
},
{
"epoch": 0.8573487031700289,
"grad_norm": 20.25819384222818,
"learning_rate": 3.528126776918559e-08,
"logits/chosen": -2.0512845516204834,
"logits/rejected": -2.0443384647369385,
"logps/chosen": -1.071276068687439,
"logps/rejected": -1.1480839252471924,
"loss": 1.6952,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.142552137374878,
"rewards/margins": 0.15361574292182922,
"rewards/rejected": -2.2961678504943848,
"step": 1190
},
{
"epoch": 0.8645533141210374,
"grad_norm": 25.590655270691236,
"learning_rate": 3.499386981659262e-08,
"logits/chosen": -1.9815905094146729,
"logits/rejected": -1.9693584442138672,
"logps/chosen": -1.086232304573059,
"logps/rejected": -1.1749858856201172,
"loss": 1.6708,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.172464609146118,
"rewards/margins": 0.17750723659992218,
"rewards/rejected": -2.3499717712402344,
"step": 1200
},
{
"epoch": 0.8717579250720461,
"grad_norm": 22.74687405755938,
"learning_rate": 3.47048911880664e-08,
"logits/chosen": -1.971374750137329,
"logits/rejected": -1.979832410812378,
"logps/chosen": -0.9375821352005005,
"logps/rejected": -1.0928103923797607,
"loss": 1.5833,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.875164270401001,
"rewards/margins": 0.31045612692832947,
"rewards/rejected": -2.1856207847595215,
"step": 1210
},
{
"epoch": 0.8789625360230547,
"grad_norm": 20.348332311917044,
"learning_rate": 3.4414377589782e-08,
"logits/chosen": -1.998797059059143,
"logits/rejected": -1.9944807291030884,
"logps/chosen": -1.008535623550415,
"logps/rejected": -1.1152924299240112,
"loss": 1.6564,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.01707124710083,
"rewards/margins": 0.21351365745067596,
"rewards/rejected": -2.2305848598480225,
"step": 1220
},
{
"epoch": 0.8861671469740634,
"grad_norm": 20.10685961084486,
"learning_rate": 3.412237497069226e-08,
"logits/chosen": -1.9737958908081055,
"logits/rejected": -1.9617822170257568,
"logps/chosen": -0.9835951924324036,
"logps/rejected": -1.0770342350006104,
"loss": 1.6699,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.9671903848648071,
"rewards/margins": 0.1868787556886673,
"rewards/rejected": -2.1540684700012207,
"step": 1230
},
{
"epoch": 0.8933717579250721,
"grad_norm": 24.001680453622406,
"learning_rate": 3.382892951526036e-08,
"logits/chosen": -1.9840329885482788,
"logits/rejected": -1.9820177555084229,
"logps/chosen": -0.9354456067085266,
"logps/rejected": -1.0733263492584229,
"loss": 1.6076,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.8708912134170532,
"rewards/margins": 0.2757616937160492,
"rewards/rejected": -2.1466526985168457,
"step": 1240
},
{
"epoch": 0.9005763688760807,
"grad_norm": 21.463794442952526,
"learning_rate": 3.353408763615502e-08,
"logits/chosen": -2.019768238067627,
"logits/rejected": -2.0219955444335938,
"logps/chosen": -1.0638011693954468,
"logps/rejected": -1.219588041305542,
"loss": 1.5926,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.1276023387908936,
"rewards/margins": 0.31157371401786804,
"rewards/rejected": -2.439176082611084,
"step": 1250
},
{
"epoch": 0.9077809798270894,
"grad_norm": 28.944721871130742,
"learning_rate": 3.323789596690971e-08,
"logits/chosen": -2.0201096534729004,
"logits/rejected": -2.02502703666687,
"logps/chosen": -1.1166651248931885,
"logps/rejected": -1.2446470260620117,
"loss": 1.6227,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.233330249786377,
"rewards/margins": 0.25596344470977783,
"rewards/rejected": -2.4892940521240234,
"step": 1260
},
{
"epoch": 0.9149855907780979,
"grad_norm": 20.87871503781233,
"learning_rate": 3.294040135454681e-08,
"logits/chosen": -1.9817100763320923,
"logits/rejected": -1.975229024887085,
"logps/chosen": -0.9639909863471985,
"logps/rejected": -1.0865840911865234,
"loss": 1.6229,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.927981972694397,
"rewards/margins": 0.2451862096786499,
"rewards/rejected": -2.173168182373047,
"step": 1270
},
{
"epoch": 0.9221902017291066,
"grad_norm": 24.375366521893024,
"learning_rate": 3.264165085216817e-08,
"logits/chosen": -2.073403835296631,
"logits/rejected": -2.0691773891448975,
"logps/chosen": -1.0255587100982666,
"logps/rejected": -1.1626403331756592,
"loss": 1.6102,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.051117420196533,
"rewards/margins": 0.2741633653640747,
"rewards/rejected": -2.3252806663513184,
"step": 1280
},
{
"epoch": 0.9293948126801153,
"grad_norm": 19.708714819889934,
"learning_rate": 3.2341691711512854e-08,
"logits/chosen": -2.0348494052886963,
"logits/rejected": -2.0337963104248047,
"logps/chosen": -0.993812084197998,
"logps/rejected": -1.0996659994125366,
"loss": 1.648,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.987624168395996,
"rewards/margins": 0.2117079198360443,
"rewards/rejected": -2.1993319988250732,
"step": 1290
},
{
"epoch": 0.9365994236311239,
"grad_norm": 25.666809748463557,
"learning_rate": 3.204057137548371e-08,
"logits/chosen": -2.0750679969787598,
"logits/rejected": -2.077117919921875,
"logps/chosen": -1.0476573705673218,
"logps/rejected": -1.1119945049285889,
"loss": 1.7181,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.0953147411346436,
"rewards/margins": 0.12867406010627747,
"rewards/rejected": -2.2239890098571777,
"step": 1300
},
{
"epoch": 0.9438040345821326,
"grad_norm": 24.64906958910084,
"learning_rate": 3.173833747064351e-08,
"logits/chosen": -2.0429582595825195,
"logits/rejected": -2.0435373783111572,
"logps/chosen": -0.9831833839416504,
"logps/rejected": -1.0801106691360474,
"loss": 1.6616,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9663667678833008,
"rewards/margins": 0.19385461509227753,
"rewards/rejected": -2.1602213382720947,
"step": 1310
},
{
"epoch": 0.9510086455331412,
"grad_norm": 24.10994908767819,
"learning_rate": 3.143503779968213e-08,
"logits/chosen": -2.0116159915924072,
"logits/rejected": -2.005117893218994,
"logps/chosen": -1.0751299858093262,
"logps/rejected": -1.2202341556549072,
"loss": 1.594,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.1502599716186523,
"rewards/margins": 0.29020795226097107,
"rewards/rejected": -2.4404683113098145,
"step": 1320
},
{
"epoch": 0.9582132564841499,
"grad_norm": 17.63664818825136,
"learning_rate": 3.113072033385589e-08,
"logits/chosen": -2.0390655994415283,
"logits/rejected": -2.0347390174865723,
"logps/chosen": -1.0593435764312744,
"logps/rejected": -1.1827442646026611,
"loss": 1.6315,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.118687152862549,
"rewards/margins": 0.24680104851722717,
"rewards/rejected": -2.3654885292053223,
"step": 1330
},
{
"epoch": 0.9654178674351584,
"grad_norm": 29.367574205793645,
"learning_rate": 3.082543320540015e-08,
"logits/chosen": -2.010270357131958,
"logits/rejected": -2.0109667778015137,
"logps/chosen": -1.0154287815093994,
"logps/rejected": -1.132817029953003,
"loss": 1.641,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.030857563018799,
"rewards/margins": 0.23477670550346375,
"rewards/rejected": -2.265634059906006,
"step": 1340
},
{
"epoch": 0.9726224783861671,
"grad_norm": 18.34859679144091,
"learning_rate": 3.051922469991655e-08,
"logits/chosen": -1.9400978088378906,
"logits/rejected": -1.9382463693618774,
"logps/chosen": -1.0125986337661743,
"logps/rejected": -1.087135910987854,
"loss": 1.7066,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.0251972675323486,
"rewards/margins": 0.149074524641037,
"rewards/rejected": -2.174271821975708,
"step": 1350
},
{
"epoch": 0.9798270893371758,
"grad_norm": 18.971281667579635,
"learning_rate": 3.0212143248735886e-08,
"logits/chosen": -1.9810125827789307,
"logits/rejected": -1.9789069890975952,
"logps/chosen": -0.961616039276123,
"logps/rejected": -1.0554723739624023,
"loss": 1.676,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.923232078552246,
"rewards/margins": 0.18771259486675262,
"rewards/rejected": -2.1109447479248047,
"step": 1360
},
{
"epoch": 0.9870317002881844,
"grad_norm": 24.96298831746159,
"learning_rate": 2.9904237421258046e-08,
"logits/chosen": -2.00824236869812,
"logits/rejected": -2.0035648345947266,
"logps/chosen": -0.9997411966323853,
"logps/rejected": -1.1224539279937744,
"loss": 1.6216,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9994823932647705,
"rewards/margins": 0.24542562663555145,
"rewards/rejected": -2.244907855987549,
"step": 1370
},
{
"epoch": 0.9942363112391931,
"grad_norm": 23.916004353098554,
"learning_rate": 2.9595555917269997e-08,
"logits/chosen": -1.9720462560653687,
"logits/rejected": -1.968483328819275,
"logps/chosen": -1.0500491857528687,
"logps/rejected": -1.1262010335922241,
"loss": 1.6923,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.1000983715057373,
"rewards/margins": 0.1523038148880005,
"rewards/rejected": -2.2524020671844482,
"step": 1380
},
{
"epoch": 1.0014409221902016,
"grad_norm": 31.544680289629245,
"learning_rate": 2.928614755924327e-08,
"logits/chosen": -2.049835443496704,
"logits/rejected": -2.050297737121582,
"logps/chosen": -1.0141699314117432,
"logps/rejected": -1.12843656539917,
"loss": 1.645,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0283398628234863,
"rewards/margins": 0.22853314876556396,
"rewards/rejected": -2.25687313079834,
"step": 1390
},
{
"epoch": 1.0086455331412103,
"grad_norm": 22.171264495293624,
"learning_rate": 2.8976061284611908e-08,
"logits/chosen": -1.9490327835083008,
"logits/rejected": -1.9464311599731445,
"logps/chosen": -1.058870553970337,
"logps/rejected": -1.195708990097046,
"loss": 1.627,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.117741107940674,
"rewards/margins": 0.27367717027664185,
"rewards/rejected": -2.391417980194092,
"step": 1400
},
{
"epoch": 1.015850144092219,
"grad_norm": 26.077670247013415,
"learning_rate": 2.8665346138032327e-08,
"logits/chosen": -1.9846904277801514,
"logits/rejected": -1.9889650344848633,
"logps/chosen": -1.0112468004226685,
"logps/rejected": -1.1389497518539429,
"loss": 1.6326,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.022493600845337,
"rewards/margins": 0.2554059326648712,
"rewards/rejected": -2.2778995037078857,
"step": 1410
},
{
"epoch": 1.0230547550432276,
"grad_norm": 21.678779034806855,
"learning_rate": 2.8354051263626227e-08,
"logits/chosen": -2.014312505722046,
"logits/rejected": -2.0092453956604004,
"logps/chosen": -1.1152961254119873,
"logps/rejected": -1.2331750392913818,
"loss": 1.6569,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.2305922508239746,
"rewards/margins": 0.23575782775878906,
"rewards/rejected": -2.4663500785827637,
"step": 1420
},
{
"epoch": 1.0302593659942363,
"grad_norm": 24.392573956049755,
"learning_rate": 2.8042225897207648e-08,
"logits/chosen": -2.05131196975708,
"logits/rejected": -2.0500950813293457,
"logps/chosen": -0.9399210214614868,
"logps/rejected": -1.0278102159500122,
"loss": 1.6851,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -1.8798420429229736,
"rewards/margins": 0.17577846348285675,
"rewards/rejected": -2.0556204319000244,
"step": 1430
},
{
"epoch": 1.037463976945245,
"grad_norm": 26.584656132405183,
"learning_rate": 2.7729919358495728e-08,
"logits/chosen": -2.0092933177948,
"logits/rejected": -2.0025553703308105,
"logps/chosen": -1.1368526220321655,
"logps/rejected": -1.2202892303466797,
"loss": 1.6972,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.273705244064331,
"rewards/margins": 0.16687336564064026,
"rewards/rejected": -2.4405784606933594,
"step": 1440
},
{
"epoch": 1.0446685878962536,
"grad_norm": 19.430388178160936,
"learning_rate": 2.741718104331393e-08,
"logits/chosen": -2.06870698928833,
"logits/rejected": -2.0779881477355957,
"logps/chosen": -0.9939098358154297,
"logps/rejected": -1.1466522216796875,
"loss": 1.593,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.9878196716308594,
"rewards/margins": 0.30548471212387085,
"rewards/rejected": -2.293304443359375,
"step": 1450
},
{
"epoch": 1.0518731988472623,
"grad_norm": 19.07915573550666,
"learning_rate": 2.710406041577751e-08,
"logits/chosen": -1.9887897968292236,
"logits/rejected": -1.9764807224273682,
"logps/chosen": -0.9991080164909363,
"logps/rejected": -1.1605703830718994,
"loss": 1.568,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.9982160329818726,
"rewards/margins": 0.3229246735572815,
"rewards/rejected": -2.321140766143799,
"step": 1460
},
{
"epoch": 1.059077809798271,
"grad_norm": 19.453172319867473,
"learning_rate": 2.679060700046994e-08,
"logits/chosen": -2.0260889530181885,
"logits/rejected": -2.0165085792541504,
"logps/chosen": -0.9590311050415039,
"logps/rejected": -1.092279076576233,
"loss": 1.6143,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.9180622100830078,
"rewards/margins": 0.266495943069458,
"rewards/rejected": -2.184558153152466,
"step": 1470
},
{
"epoch": 1.0662824207492796,
"grad_norm": 19.938817408083903,
"learning_rate": 2.647687037460996e-08,
"logits/chosen": -1.9772266149520874,
"logits/rejected": -1.9762458801269531,
"logps/chosen": -1.005311369895935,
"logps/rejected": -1.1241614818572998,
"loss": 1.6424,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.01062273979187,
"rewards/margins": 0.23770026862621307,
"rewards/rejected": -2.2483229637145996,
"step": 1480
},
{
"epoch": 1.0734870317002883,
"grad_norm": 22.895606799094736,
"learning_rate": 2.616290016021016e-08,
"logits/chosen": -1.9965251684188843,
"logits/rejected": -1.993642807006836,
"logps/chosen": -1.1106324195861816,
"logps/rejected": -1.1621644496917725,
"loss": 1.736,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.2212648391723633,
"rewards/margins": 0.10306410491466522,
"rewards/rejected": -2.324328899383545,
"step": 1490
},
{
"epoch": 1.080691642651297,
"grad_norm": 21.445560487885672,
"learning_rate": 2.584874601622854e-08,
"logits/chosen": -1.999669075012207,
"logits/rejected": -1.986670732498169,
"logps/chosen": -1.0090255737304688,
"logps/rejected": -1.12135910987854,
"loss": 1.6376,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0180511474609375,
"rewards/margins": 0.2246670424938202,
"rewards/rejected": -2.24271821975708,
"step": 1500
},
{
"epoch": 1.0878962536023056,
"grad_norm": 20.277040631917046,
"learning_rate": 2.5534457630714267e-08,
"logits/chosen": -2.044276714324951,
"logits/rejected": -2.0462608337402344,
"logps/chosen": -1.014711618423462,
"logps/rejected": -1.1253163814544678,
"loss": 1.65,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.029423236846924,
"rewards/margins": 0.22120928764343262,
"rewards/rejected": -2.2506327629089355,
"step": 1510
},
{
"epoch": 1.0951008645533142,
"grad_norm": 18.830969455717604,
"learning_rate": 2.5220084712948764e-08,
"logits/chosen": -1.985337495803833,
"logits/rejected": -1.9856802225112915,
"logps/chosen": -1.097121000289917,
"logps/rejected": -1.1095194816589355,
"loss": 1.7974,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -2.194242000579834,
"rewards/margins": 0.024796944111585617,
"rewards/rejected": -2.219038963317871,
"step": 1520
},
{
"epoch": 1.1023054755043227,
"grad_norm": 20.22397260920369,
"learning_rate": 2.490567698558343e-08,
"logits/chosen": -2.030097723007202,
"logits/rejected": -2.0213112831115723,
"logps/chosen": -0.9496325254440308,
"logps/rejected": -1.1169651746749878,
"loss": 1.5633,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.8992650508880615,
"rewards/margins": 0.3346652686595917,
"rewards/rejected": -2.2339303493499756,
"step": 1530
},
{
"epoch": 1.1095100864553313,
"grad_norm": 27.8479778302733,
"learning_rate": 2.4591284176775326e-08,
"logits/chosen": -2.033017158508301,
"logits/rejected": -2.0267956256866455,
"logps/chosen": -1.0237690210342407,
"logps/rejected": -1.148371934890747,
"loss": 1.6197,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.0475380420684814,
"rewards/margins": 0.24920621514320374,
"rewards/rejected": -2.296743869781494,
"step": 1540
},
{
"epoch": 1.11671469740634,
"grad_norm": 21.41619902791393,
"learning_rate": 2.4276956012321926e-08,
"logits/chosen": -2.0108845233917236,
"logits/rejected": -2.0055813789367676,
"logps/chosen": -1.026641607284546,
"logps/rejected": -1.133044719696045,
"loss": 1.6573,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.053283214569092,
"rewards/margins": 0.21280638873577118,
"rewards/rejected": -2.26608943939209,
"step": 1550
},
{
"epoch": 1.1239193083573487,
"grad_norm": 29.804837301654878,
"learning_rate": 2.3962742207796268e-08,
"logits/chosen": -2.0699496269226074,
"logits/rejected": -2.0638415813446045,
"logps/chosen": -1.0469386577606201,
"logps/rejected": -1.1839386224746704,
"loss": 1.6112,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0938773155212402,
"rewards/margins": 0.2739998698234558,
"rewards/rejected": -2.367877244949341,
"step": 1560
},
{
"epoch": 1.1311239193083573,
"grad_norm": 19.072257470295916,
"learning_rate": 2.364869246068368e-08,
"logits/chosen": -2.049614429473877,
"logits/rejected": -2.047759771347046,
"logps/chosen": -1.0033305883407593,
"logps/rejected": -1.1312639713287354,
"loss": 1.6191,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0066611766815186,
"rewards/margins": 0.255866676568985,
"rewards/rejected": -2.2625279426574707,
"step": 1570
},
{
"epoch": 1.138328530259366,
"grad_norm": 35.24129814573759,
"learning_rate": 2.3334856442521435e-08,
"logits/chosen": -2.021381139755249,
"logits/rejected": -2.0155997276306152,
"logps/chosen": -1.0470027923583984,
"logps/rejected": -1.1944993734359741,
"loss": 1.6022,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.094005584716797,
"rewards/margins": 0.29499319195747375,
"rewards/rejected": -2.3889987468719482,
"step": 1580
},
{
"epoch": 1.1455331412103746,
"grad_norm": 19.787153457776164,
"learning_rate": 2.3021283791042474e-08,
"logits/chosen": -1.9919068813323975,
"logits/rejected": -1.9873685836791992,
"logps/chosen": -0.9659102559089661,
"logps/rejected": -1.1006077527999878,
"loss": 1.6015,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.9318205118179321,
"rewards/margins": 0.26939502358436584,
"rewards/rejected": -2.2012155055999756,
"step": 1590
},
{
"epoch": 1.1527377521613833,
"grad_norm": 24.041299463029606,
"learning_rate": 2.2708024102324454e-08,
"logits/chosen": -2.0335323810577393,
"logits/rejected": -2.0249056816101074,
"logps/chosen": -1.0667665004730225,
"logps/rejected": -1.1801570653915405,
"loss": 1.6442,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.133533000946045,
"rewards/margins": 0.2267809808254242,
"rewards/rejected": -2.360314130783081,
"step": 1600
},
{
"epoch": 1.159942363112392,
"grad_norm": 24.12111757342678,
"learning_rate": 2.23951269229454e-08,
"logits/chosen": -1.9858362674713135,
"logits/rejected": -1.9893901348114014,
"logps/chosen": -1.0084689855575562,
"logps/rejected": -1.100411295890808,
"loss": 1.6765,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0169379711151123,
"rewards/margins": 0.18388447165489197,
"rewards/rejected": -2.200822591781616,
"step": 1610
},
{
"epoch": 1.1671469740634006,
"grad_norm": 24.80127928292751,
"learning_rate": 2.2082641742147238e-08,
"logits/chosen": -2.059333562850952,
"logits/rejected": -2.060598373413086,
"logps/chosen": -1.0564236640930176,
"logps/rejected": -1.1844590902328491,
"loss": 1.6208,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.112847328186035,
"rewards/margins": 0.25607064366340637,
"rewards/rejected": -2.3689181804656982,
"step": 1620
},
{
"epoch": 1.1743515850144093,
"grad_norm": 20.13313537520615,
"learning_rate": 2.177061798400832e-08,
"logits/chosen": -1.9521719217300415,
"logits/rejected": -1.9484403133392334,
"logps/chosen": -1.0265686511993408,
"logps/rejected": -1.0971999168395996,
"loss": 1.7039,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0531373023986816,
"rewards/margins": 0.14126275479793549,
"rewards/rejected": -2.194399833679199,
"step": 1630
},
{
"epoch": 1.181556195965418,
"grad_norm": 19.495579780217277,
"learning_rate": 2.145910499962628e-08,
"logits/chosen": -1.9787580966949463,
"logits/rejected": -1.9796241521835327,
"logps/chosen": -1.0164722204208374,
"logps/rejected": -1.1089410781860352,
"loss": 1.6796,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.032944440841675,
"rewards/margins": 0.1849372386932373,
"rewards/rejected": -2.2178821563720703,
"step": 1640
},
{
"epoch": 1.1887608069164266,
"grad_norm": 25.48988945819277,
"learning_rate": 2.1148152059312437e-08,
"logits/chosen": -2.0072319507598877,
"logits/rejected": -2.0051167011260986,
"logps/chosen": -1.005936861038208,
"logps/rejected": -1.077120304107666,
"loss": 1.707,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.011873722076416,
"rewards/margins": 0.14236697554588318,
"rewards/rejected": -2.154240608215332,
"step": 1650
},
{
"epoch": 1.195965417867435,
"grad_norm": 21.152990734847933,
"learning_rate": 2.0837808344799028e-08,
"logits/chosen": -1.9343931674957275,
"logits/rejected": -1.938122034072876,
"logps/chosen": -0.9725497961044312,
"logps/rejected": -1.0786354541778564,
"loss": 1.659,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9450995922088623,
"rewards/margins": 0.21217119693756104,
"rewards/rejected": -2.157270908355713,
"step": 1660
},
{
"epoch": 1.2031700288184437,
"grad_norm": 26.377266925973135,
"learning_rate": 2.052812294146033e-08,
"logits/chosen": -2.03047776222229,
"logits/rejected": -2.027374744415283,
"logps/chosen": -1.0395774841308594,
"logps/rejected": -1.1767067909240723,
"loss": 1.6092,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0791549682617188,
"rewards/margins": 0.27425867319107056,
"rewards/rejected": -2.3534135818481445,
"step": 1670
},
{
"epoch": 1.2103746397694524,
"grad_norm": 23.34792034910019,
"learning_rate": 2.0219144830549163e-08,
"logits/chosen": -1.971253752708435,
"logits/rejected": -1.9633668661117554,
"logps/chosen": -1.011054515838623,
"logps/rejected": -1.148241400718689,
"loss": 1.6033,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.022109031677246,
"rewards/margins": 0.2743736207485199,
"rewards/rejected": -2.296482801437378,
"step": 1680
},
{
"epoch": 1.217579250720461,
"grad_norm": 21.51470689698241,
"learning_rate": 1.9910922881449716e-08,
"logits/chosen": -2.011819362640381,
"logits/rejected": -2.01347017288208,
"logps/chosen": -1.0252724885940552,
"logps/rejected": -1.1493511199951172,
"loss": 1.6223,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0505449771881104,
"rewards/margins": 0.2481573075056076,
"rewards/rejected": -2.2987022399902344,
"step": 1690
},
{
"epoch": 1.2247838616714697,
"grad_norm": 26.201233097415667,
"learning_rate": 1.9603505843948214e-08,
"logits/chosen": -1.9847558736801147,
"logits/rejected": -1.9857571125030518,
"logps/chosen": -1.078355073928833,
"logps/rejected": -1.2002760171890259,
"loss": 1.6286,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.156710147857666,
"rewards/margins": 0.2438419610261917,
"rewards/rejected": -2.4005520343780518,
"step": 1700
},
{
"epoch": 1.2319884726224783,
"grad_norm": 23.36257553020346,
"learning_rate": 1.929694234052239e-08,
"logits/chosen": -2.0325675010681152,
"logits/rejected": -2.021353244781494,
"logps/chosen": -0.9391233325004578,
"logps/rejected": -1.0919990539550781,
"loss": 1.5888,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.8782466650009155,
"rewards/margins": 0.3057512640953064,
"rewards/rejected": -2.1839981079101562,
"step": 1710
},
{
"epoch": 1.239193083573487,
"grad_norm": 19.47841832843257,
"learning_rate": 1.8991280858651157e-08,
"logits/chosen": -1.9777787923812866,
"logits/rejected": -1.9743425846099854,
"logps/chosen": -0.9831492304801941,
"logps/rejected": -1.1153368949890137,
"loss": 1.6254,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9662984609603882,
"rewards/margins": 0.2643755078315735,
"rewards/rejected": -2.2306737899780273,
"step": 1720
},
{
"epoch": 1.2463976945244957,
"grad_norm": 29.21200511886615,
"learning_rate": 1.868656974314557e-08,
"logits/chosen": -2.0204837322235107,
"logits/rejected": -2.0202364921569824,
"logps/chosen": -1.032915711402893,
"logps/rejected": -1.1591503620147705,
"loss": 1.6285,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.065831422805786,
"rewards/margins": 0.2524695098400116,
"rewards/rejected": -2.318300724029541,
"step": 1730
},
{
"epoch": 1.2536023054755043,
"grad_norm": 19.748132847469815,
"learning_rate": 1.8382857188502422e-08,
"logits/chosen": -2.005788803100586,
"logits/rejected": -2.0102851390838623,
"logps/chosen": -1.0914397239685059,
"logps/rejected": -1.2216801643371582,
"loss": 1.6228,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1828794479370117,
"rewards/margins": 0.26048025488853455,
"rewards/rejected": -2.4433603286743164,
"step": 1740
},
{
"epoch": 1.260806916426513,
"grad_norm": 21.739700200932262,
"learning_rate": 1.8080191231281594e-08,
"logits/chosen": -1.9720125198364258,
"logits/rejected": -1.9585212469100952,
"logps/chosen": -1.0438520908355713,
"logps/rejected": -1.120289921760559,
"loss": 1.7131,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.0877041816711426,
"rewards/margins": 0.15287601947784424,
"rewards/rejected": -2.240579843521118,
"step": 1750
},
{
"epoch": 1.2680115273775217,
"grad_norm": 22.41910419325543,
"learning_rate": 1.7778619742508345e-08,
"logits/chosen": -2.0582499504089355,
"logits/rejected": -2.063490390777588,
"logps/chosen": -1.131137728691101,
"logps/rejected": -1.2315350770950317,
"loss": 1.6769,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.262275457382202,
"rewards/margins": 0.20079448819160461,
"rewards/rejected": -2.4630701541900635,
"step": 1760
},
{
"epoch": 1.2752161383285303,
"grad_norm": 21.482297600422083,
"learning_rate": 1.7478190420101796e-08,
"logits/chosen": -2.036742687225342,
"logits/rejected": -2.032109260559082,
"logps/chosen": -1.050010085105896,
"logps/rejected": -1.183793544769287,
"loss": 1.6195,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.100020170211792,
"rewards/margins": 0.2675670385360718,
"rewards/rejected": -2.367587089538574,
"step": 1770
},
{
"epoch": 1.282420749279539,
"grad_norm": 27.784893259936197,
"learning_rate": 1.717895078133088e-08,
"logits/chosen": -2.0239245891571045,
"logits/rejected": -2.0181150436401367,
"logps/chosen": -1.1388823986053467,
"logps/rejected": -1.2011306285858154,
"loss": 1.7186,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.2777647972106934,
"rewards/margins": 0.12449675798416138,
"rewards/rejected": -2.402261257171631,
"step": 1780
},
{
"epoch": 1.2896253602305476,
"grad_norm": 15.507226212932258,
"learning_rate": 1.688094815529873e-08,
"logits/chosen": -1.978154182434082,
"logits/rejected": -1.9796226024627686,
"logps/chosen": -0.9750539660453796,
"logps/rejected": -1.0688438415527344,
"loss": 1.6649,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.9501079320907593,
"rewards/margins": 0.18757975101470947,
"rewards/rejected": -2.1376876831054688,
"step": 1790
},
{
"epoch": 1.2968299711815563,
"grad_norm": 23.291152738627,
"learning_rate": 1.658422967545693e-08,
"logits/chosen": -2.0640625953674316,
"logits/rejected": -2.0586647987365723,
"logps/chosen": -1.0171083211898804,
"logps/rejected": -1.207260251045227,
"loss": 1.5423,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.0342166423797607,
"rewards/margins": 0.38030365109443665,
"rewards/rejected": -2.414520502090454,
"step": 1800
},
{
"epoch": 1.304034582132565,
"grad_norm": 24.79688067458518,
"learning_rate": 1.6288842272150614e-08,
"logits/chosen": -1.9829915761947632,
"logits/rejected": -1.9848480224609375,
"logps/chosen": -0.9952167272567749,
"logps/rejected": -1.1160945892333984,
"loss": 1.6428,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9904334545135498,
"rewards/margins": 0.24175576865673065,
"rewards/rejected": -2.232189178466797,
"step": 1810
},
{
"epoch": 1.3112391930835736,
"grad_norm": 31.41860730573992,
"learning_rate": 1.5994832665195853e-08,
"logits/chosen": -2.014812469482422,
"logits/rejected": -2.0102453231811523,
"logps/chosen": -0.9722223281860352,
"logps/rejected": -1.119134545326233,
"loss": 1.5972,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.9444446563720703,
"rewards/margins": 0.29382452368736267,
"rewards/rejected": -2.238269090652466,
"step": 1820
},
{
"epoch": 1.318443804034582,
"grad_norm": 18.540859576791913,
"learning_rate": 1.5702247356490134e-08,
"logits/chosen": -1.985517144203186,
"logits/rejected": -1.9945405721664429,
"logps/chosen": -1.0170261859893799,
"logps/rejected": -1.1490360498428345,
"loss": 1.6267,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0340523719787598,
"rewards/margins": 0.2640196681022644,
"rewards/rejected": -2.298072099685669,
"step": 1830
},
{
"epoch": 1.3256484149855907,
"grad_norm": 20.824160350380623,
"learning_rate": 1.541113262265748e-08,
"logits/chosen": -1.9940106868743896,
"logits/rejected": -1.9816381931304932,
"logps/chosen": -0.9921188354492188,
"logps/rejected": -1.1125357151031494,
"loss": 1.6419,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.9842376708984375,
"rewards/margins": 0.2408340871334076,
"rewards/rejected": -2.225071430206299,
"step": 1840
},
{
"epoch": 1.3328530259365994,
"grad_norm": 20.85405330465015,
"learning_rate": 1.5121534507729073e-08,
"logits/chosen": -2.0356698036193848,
"logits/rejected": -2.029043197631836,
"logps/chosen": -0.9926624298095703,
"logps/rejected": -1.119905710220337,
"loss": 1.6287,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.9853248596191406,
"rewards/margins": 0.25448688864707947,
"rewards/rejected": -2.239811420440674,
"step": 1850
},
{
"epoch": 1.340057636887608,
"grad_norm": 20.598069658664354,
"learning_rate": 1.4833498815860756e-08,
"logits/chosen": -2.0153400897979736,
"logits/rejected": -2.012422800064087,
"logps/chosen": -1.0502384901046753,
"logps/rejected": -1.1984398365020752,
"loss": 1.5909,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.1004769802093506,
"rewards/margins": 0.2964025139808655,
"rewards/rejected": -2.3968796730041504,
"step": 1860
},
{
"epoch": 1.3472622478386167,
"grad_norm": 24.169363767878462,
"learning_rate": 1.4547071104088443e-08,
"logits/chosen": -1.9878826141357422,
"logits/rejected": -1.976165771484375,
"logps/chosen": -0.9248664975166321,
"logps/rejected": -1.0936037302017212,
"loss": 1.5519,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.8497329950332642,
"rewards/margins": 0.33747443556785583,
"rewards/rejected": -2.1872074604034424,
"step": 1870
},
{
"epoch": 1.3544668587896254,
"grad_norm": 22.28586277053279,
"learning_rate": 1.4262296675122592e-08,
"logits/chosen": -1.9980617761611938,
"logits/rejected": -1.993934988975525,
"logps/chosen": -1.0281975269317627,
"logps/rejected": -1.1155600547790527,
"loss": 1.6881,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0563950538635254,
"rewards/margins": 0.1747252643108368,
"rewards/rejected": -2.2311201095581055,
"step": 1880
},
{
"epoch": 1.361671469740634,
"grad_norm": 25.91029433001521,
"learning_rate": 1.3979220570182902e-08,
"logits/chosen": -1.9705870151519775,
"logits/rejected": -1.9714816808700562,
"logps/chosen": -1.0206265449523926,
"logps/rejected": -1.153298258781433,
"loss": 1.612,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.041253089904785,
"rewards/margins": 0.26534369587898254,
"rewards/rejected": -2.306596517562866,
"step": 1890
},
{
"epoch": 1.3688760806916427,
"grad_norm": 18.134528292331787,
"learning_rate": 1.369788756187445e-08,
"logits/chosen": -1.9956543445587158,
"logits/rejected": -2.0001983642578125,
"logps/chosen": -1.0346391201019287,
"logps/rejected": -1.1629480123519897,
"loss": 1.6253,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0692782402038574,
"rewards/margins": 0.2566176652908325,
"rewards/rejected": -2.3258960247039795,
"step": 1900
},
{
"epoch": 1.3760806916426513,
"grad_norm": 24.41945869354139,
"learning_rate": 1.3418342147106212e-08,
"logits/chosen": -2.027116537094116,
"logits/rejected": -2.0312001705169678,
"logps/chosen": -1.052958607673645,
"logps/rejected": -1.2006646394729614,
"loss": 1.5902,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.10591721534729,
"rewards/margins": 0.29541200399398804,
"rewards/rejected": -2.401329278945923,
"step": 1910
},
{
"epoch": 1.38328530259366,
"grad_norm": 20.07983364842535,
"learning_rate": 1.3140628540053218e-08,
"logits/chosen": -2.0397636890411377,
"logits/rejected": -2.039858341217041,
"logps/chosen": -0.9341287612915039,
"logps/rejected": -1.1015546321868896,
"loss": 1.5639,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.8682575225830078,
"rewards/margins": 0.334852010011673,
"rewards/rejected": -2.2031092643737793,
"step": 1920
},
{
"epoch": 1.3904899135446687,
"grad_norm": 21.51361282786986,
"learning_rate": 1.286479066516345e-08,
"logits/chosen": -1.9665225744247437,
"logits/rejected": -1.9671493768692017,
"logps/chosen": -1.0339243412017822,
"logps/rejected": -1.096381425857544,
"loss": 1.7206,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.0678486824035645,
"rewards/margins": 0.12491452693939209,
"rewards/rejected": -2.192762851715088,
"step": 1930
},
{
"epoch": 1.397694524495677,
"grad_norm": 18.366780492328118,
"learning_rate": 1.2590872150210574e-08,
"logits/chosen": -2.0531787872314453,
"logits/rejected": -2.0568125247955322,
"logps/chosen": -0.9896559715270996,
"logps/rejected": -1.0928928852081299,
"loss": 1.6574,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.9793119430541992,
"rewards/margins": 0.20647385716438293,
"rewards/rejected": -2.1857857704162598,
"step": 1940
},
{
"epoch": 1.4048991354466858,
"grad_norm": 20.45388742088791,
"learning_rate": 1.2318916319393555e-08,
"logits/chosen": -2.0167720317840576,
"logits/rejected": -2.011418581008911,
"logps/chosen": -0.9759003520011902,
"logps/rejected": -1.079329490661621,
"loss": 1.6557,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9518007040023804,
"rewards/margins": 0.20685818791389465,
"rewards/rejected": -2.158658981323242,
"step": 1950
},
{
"epoch": 1.4121037463976944,
"grad_norm": 22.381857191050415,
"learning_rate": 1.2048966186484282e-08,
"logits/chosen": -1.9989935159683228,
"logits/rejected": -1.995570421218872,
"logps/chosen": -1.0002979040145874,
"logps/rejected": -1.1596596240997314,
"loss": 1.5749,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.000595808029175,
"rewards/margins": 0.3187234103679657,
"rewards/rejected": -2.319319248199463,
"step": 1960
},
{
"epoch": 1.419308357348703,
"grad_norm": 26.68547824879629,
"learning_rate": 1.1781064448024333e-08,
"logits/chosen": -2.0441131591796875,
"logits/rejected": -2.0380544662475586,
"logps/chosen": -1.0262136459350586,
"logps/rejected": -1.1702954769134521,
"loss": 1.6003,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.052427291870117,
"rewards/margins": 0.288163959980011,
"rewards/rejected": -2.3405909538269043,
"step": 1970
},
{
"epoch": 1.4265129682997117,
"grad_norm": 22.583692034377382,
"learning_rate": 1.1515253476571923e-08,
"logits/chosen": -2.01566481590271,
"logits/rejected": -2.0159027576446533,
"logps/chosen": -1.0137196779251099,
"logps/rejected": -1.1474217176437378,
"loss": 1.6281,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0274393558502197,
"rewards/margins": 0.2674042582511902,
"rewards/rejected": -2.2948434352874756,
"step": 1980
},
{
"epoch": 1.4337175792507204,
"grad_norm": 20.384401270587638,
"learning_rate": 1.1251575314000034e-08,
"logits/chosen": -1.9947608709335327,
"logits/rejected": -1.9923511743545532,
"logps/chosen": -0.996396541595459,
"logps/rejected": -1.1149427890777588,
"loss": 1.6386,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.992793083190918,
"rewards/margins": 0.23709246516227722,
"rewards/rejected": -2.2298855781555176,
"step": 1990
},
{
"epoch": 1.440922190201729,
"grad_norm": 18.925724233722455,
"learning_rate": 1.0990071664846861e-08,
"logits/chosen": -2.055290460586548,
"logits/rejected": -2.047081232070923,
"logps/chosen": -0.9910066723823547,
"logps/rejected": -1.1598938703536987,
"loss": 1.5608,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.9820133447647095,
"rewards/margins": 0.33777478337287903,
"rewards/rejected": -2.3197877407073975,
"step": 2000
},
{
"epoch": 1.4481268011527377,
"grad_norm": 19.05620802651582,
"learning_rate": 1.0730783889719711e-08,
"logits/chosen": -1.999265432357788,
"logits/rejected": -1.992251992225647,
"logps/chosen": -1.0045114755630493,
"logps/rejected": -1.145806908607483,
"loss": 1.6002,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0090229511260986,
"rewards/margins": 0.2825910151004791,
"rewards/rejected": -2.291613817214966,
"step": 2010
},
{
"epoch": 1.4553314121037464,
"grad_norm": 21.429538223490898,
"learning_rate": 1.0473752998753114e-08,
"logits/chosen": -2.030794143676758,
"logits/rejected": -2.0306010246276855,
"logps/chosen": -1.0277831554412842,
"logps/rejected": -1.1082103252410889,
"loss": 1.6945,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.0555663108825684,
"rewards/margins": 0.16085436940193176,
"rewards/rejected": -2.2164206504821777,
"step": 2020
},
{
"epoch": 1.462536023054755,
"grad_norm": 22.050870113308804,
"learning_rate": 1.0219019645122575e-08,
"logits/chosen": -2.045313835144043,
"logits/rejected": -2.04082989692688,
"logps/chosen": -0.9714498519897461,
"logps/rejected": -1.1051620244979858,
"loss": 1.6095,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.9428997039794922,
"rewards/margins": 0.26742416620254517,
"rewards/rejected": -2.2103240489959717,
"step": 2030
},
{
"epoch": 1.4697406340057637,
"grad_norm": 26.157508425601968,
"learning_rate": 9.966624118614611e-09,
"logits/chosen": -2.032585620880127,
"logits/rejected": -2.0330874919891357,
"logps/chosen": -0.9972192645072937,
"logps/rejected": -1.1311516761779785,
"loss": 1.6103,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9944385290145874,
"rewards/margins": 0.26786476373672485,
"rewards/rejected": -2.262303352355957,
"step": 2040
},
{
"epoch": 1.4769452449567724,
"grad_norm": 23.13602722680495,
"learning_rate": 9.71660633925438e-09,
"logits/chosen": -2.020906925201416,
"logits/rejected": -2.011904001235962,
"logps/chosen": -1.1174659729003906,
"logps/rejected": -1.2822870016098022,
"loss": 1.5902,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.2349319458007812,
"rewards/margins": 0.32964205741882324,
"rewards/rejected": -2.5645740032196045,
"step": 2050
},
{
"epoch": 1.484149855907781,
"grad_norm": 24.5077374796,
"learning_rate": 9.469005850991705e-09,
"logits/chosen": -2.0096890926361084,
"logits/rejected": -2.0027241706848145,
"logps/chosen": -1.0285447835922241,
"logps/rejected": -1.0884907245635986,
"loss": 1.726,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.0570895671844482,
"rewards/margins": 0.11989171802997589,
"rewards/rejected": -2.1769814491271973,
"step": 2060
},
{
"epoch": 1.4913544668587897,
"grad_norm": 28.391707951147488,
"learning_rate": 9.223861815446682e-09,
"logits/chosen": -2.0371224880218506,
"logits/rejected": -2.022277593612671,
"logps/chosen": -1.1392552852630615,
"logps/rejected": -1.2324645519256592,
"loss": 1.6607,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.278510570526123,
"rewards/margins": 0.18641893565654755,
"rewards/rejected": -2.4649291038513184,
"step": 2070
},
{
"epoch": 1.4985590778097984,
"grad_norm": 21.381110932577513,
"learning_rate": 8.981213005715627e-09,
"logits/chosen": -1.949776291847229,
"logits/rejected": -1.952210783958435,
"logps/chosen": -1.0574856996536255,
"logps/rejected": -1.1887096166610718,
"loss": 1.6153,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.114971399307251,
"rewards/margins": 0.26244813203811646,
"rewards/rejected": -2.3774192333221436,
"step": 2080
},
{
"epoch": 1.505763688760807,
"grad_norm": 22.794596171681757,
"learning_rate": 8.741097800238617e-09,
"logits/chosen": -2.036937713623047,
"logits/rejected": -2.031627893447876,
"logps/chosen": -1.0450177192687988,
"logps/rejected": -1.1693742275238037,
"loss": 1.6249,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0900354385375977,
"rewards/margins": 0.24871286749839783,
"rewards/rejected": -2.3387484550476074,
"step": 2090
},
{
"epoch": 1.5129682997118157,
"grad_norm": 21.632409202600087,
"learning_rate": 8.503554176729341e-09,
"logits/chosen": -1.989061713218689,
"logits/rejected": -1.9979686737060547,
"logps/chosen": -0.9342214465141296,
"logps/rejected": -1.0600559711456299,
"loss": 1.6299,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.8684428930282593,
"rewards/margins": 0.2516690790653229,
"rewards/rejected": -2.1201119422912598,
"step": 2100
},
{
"epoch": 1.5201729106628243,
"grad_norm": 24.73490951688704,
"learning_rate": 8.268619706168376e-09,
"logits/chosen": -1.9841842651367188,
"logits/rejected": -1.9747326374053955,
"logps/chosen": -1.015921950340271,
"logps/rejected": -1.1286394596099854,
"loss": 1.6542,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.031843900680542,
"rewards/margins": 0.2254345417022705,
"rewards/rejected": -2.2572789192199707,
"step": 2110
},
{
"epoch": 1.527377521613833,
"grad_norm": 21.05879085590256,
"learning_rate": 8.036331546860777e-09,
"logits/chosen": -1.908087134361267,
"logits/rejected": -1.9043972492218018,
"logps/chosen": -1.0467476844787598,
"logps/rejected": -1.1627973318099976,
"loss": 1.6392,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0934953689575195,
"rewards/margins": 0.232099249958992,
"rewards/rejected": -2.325594663619995,
"step": 2120
},
{
"epoch": 1.5345821325648417,
"grad_norm": 22.29777503336236,
"learning_rate": 7.806726438559003e-09,
"logits/chosen": -1.9843000173568726,
"logits/rejected": -1.9898990392684937,
"logps/chosen": -1.0579800605773926,
"logps/rejected": -1.1668568849563599,
"loss": 1.6476,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.115960121154785,
"rewards/margins": 0.21775400638580322,
"rewards/rejected": -2.3337137699127197,
"step": 2130
},
{
"epoch": 1.54178674351585,
"grad_norm": 23.19617849497256,
"learning_rate": 7.579840696651938e-09,
"logits/chosen": -1.9837186336517334,
"logits/rejected": -1.975818395614624,
"logps/chosen": -1.062901258468628,
"logps/rejected": -1.1917634010314941,
"loss": 1.624,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.125802516937256,
"rewards/margins": 0.25772443413734436,
"rewards/rejected": -2.3835268020629883,
"step": 2140
},
{
"epoch": 1.5489913544668588,
"grad_norm": 28.073689367264755,
"learning_rate": 7.355710206421098e-09,
"logits/chosen": -1.939091444015503,
"logits/rejected": -1.9357961416244507,
"logps/chosen": -1.0485239028930664,
"logps/rejected": -1.1550390720367432,
"loss": 1.6545,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.097047805786133,
"rewards/margins": 0.2130306214094162,
"rewards/rejected": -2.3100781440734863,
"step": 2150
},
{
"epoch": 1.5561959654178674,
"grad_norm": 22.523519226977555,
"learning_rate": 7.134370417364849e-09,
"logits/chosen": -2.002406120300293,
"logits/rejected": -2.003296136856079,
"logps/chosen": -1.1109195947647095,
"logps/rejected": -1.1835315227508545,
"loss": 1.7081,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.221839189529419,
"rewards/margins": 0.14522375166416168,
"rewards/rejected": -2.367063045501709,
"step": 2160
},
{
"epoch": 1.563400576368876,
"grad_norm": 21.935686976360966,
"learning_rate": 6.915856337591572e-09,
"logits/chosen": -1.994996428489685,
"logits/rejected": -1.9938243627548218,
"logps/chosen": -0.9442762136459351,
"logps/rejected": -1.0544160604476929,
"loss": 1.6479,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.8885524272918701,
"rewards/margins": 0.22027930617332458,
"rewards/rejected": -2.1088321208953857,
"step": 2170
},
{
"epoch": 1.5706051873198847,
"grad_norm": 25.455047703257026,
"learning_rate": 6.700202528282603e-09,
"logits/chosen": -1.9900974035263062,
"logits/rejected": -1.9881980419158936,
"logps/chosen": -1.0776493549346924,
"logps/rejected": -1.2152760028839111,
"loss": 1.6069,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.1552987098693848,
"rewards/margins": 0.27525344491004944,
"rewards/rejected": -2.4305520057678223,
"step": 2180
},
{
"epoch": 1.5778097982708934,
"grad_norm": 24.84944470319713,
"learning_rate": 6.487443098225892e-09,
"logits/chosen": -2.0501418113708496,
"logits/rejected": -2.0468356609344482,
"logps/chosen": -1.031200885772705,
"logps/rejected": -1.1816002130508423,
"loss": 1.5922,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.06240177154541,
"rewards/margins": 0.30079856514930725,
"rewards/rejected": -2.3632004261016846,
"step": 2190
},
{
"epoch": 1.585014409221902,
"grad_norm": 20.909336414895183,
"learning_rate": 6.277611698421179e-09,
"logits/chosen": -2.01149845123291,
"logits/rejected": -2.007169723510742,
"logps/chosen": -1.0256855487823486,
"logps/rejected": -1.1708935499191284,
"loss": 1.5954,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.0513710975646973,
"rewards/margins": 0.29041624069213867,
"rewards/rejected": -2.341787099838257,
"step": 2200
},
{
"epoch": 1.5922190201729105,
"grad_norm": 20.206486553431677,
"learning_rate": 6.070741516757608e-09,
"logits/chosen": -2.0089163780212402,
"logits/rejected": -2.0105767250061035,
"logps/chosen": -1.0603289604187012,
"logps/rejected": -1.1521384716033936,
"loss": 1.6878,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.1206579208374023,
"rewards/margins": 0.18361885845661163,
"rewards/rejected": -2.304276943206787,
"step": 2210
},
{
"epoch": 1.5994236311239192,
"grad_norm": 22.350708958975336,
"learning_rate": 5.866865272764607e-09,
"logits/chosen": -2.014432668685913,
"logits/rejected": -2.0136168003082275,
"logps/chosen": -1.0870046615600586,
"logps/rejected": -1.2768774032592773,
"loss": 1.5475,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.174009323120117,
"rewards/margins": 0.3797454833984375,
"rewards/rejected": -2.5537548065185547,
"step": 2220
},
{
"epoch": 1.6066282420749278,
"grad_norm": 23.288337045237636,
"learning_rate": 5.666015212436795e-09,
"logits/chosen": -2.0216281414031982,
"logits/rejected": -2.0152599811553955,
"logps/chosen": -1.0820379257202148,
"logps/rejected": -1.2009848356246948,
"loss": 1.6402,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.1640758514404297,
"rewards/margins": 0.2378939837217331,
"rewards/rejected": -2.4019696712493896,
"step": 2230
},
{
"epoch": 1.6138328530259365,
"grad_norm": 27.01583069454552,
"learning_rate": 5.46822310313379e-09,
"logits/chosen": -1.9930493831634521,
"logits/rejected": -1.9963033199310303,
"logps/chosen": -0.9150048494338989,
"logps/rejected": -1.0773169994354248,
"loss": 1.5742,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.8300096988677979,
"rewards/margins": 0.32462412118911743,
"rewards/rejected": -2.1546339988708496,
"step": 2240
},
{
"epoch": 1.6210374639769451,
"grad_norm": 21.967134306318375,
"learning_rate": 5.273520228555767e-09,
"logits/chosen": -2.0560572147369385,
"logits/rejected": -2.0469000339508057,
"logps/chosen": -1.0821588039398193,
"logps/rejected": -1.2086551189422607,
"loss": 1.6406,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.1643176078796387,
"rewards/margins": 0.25299257040023804,
"rewards/rejected": -2.4173102378845215,
"step": 2250
},
{
"epoch": 1.6282420749279538,
"grad_norm": 24.925763712447488,
"learning_rate": 5.081937383795484e-09,
"logits/chosen": -1.9834489822387695,
"logits/rejected": -1.9737904071807861,
"logps/chosen": -0.9765061140060425,
"logps/rejected": -1.092413306236267,
"loss": 1.6417,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.953012228012085,
"rewards/margins": 0.2318144291639328,
"rewards/rejected": -2.184826612472534,
"step": 2260
},
{
"epoch": 1.6354466858789625,
"grad_norm": 24.22018193079342,
"learning_rate": 4.893504870467588e-09,
"logits/chosen": -2.0181922912597656,
"logits/rejected": -2.016724109649658,
"logps/chosen": -1.0404140949249268,
"logps/rejected": -1.1570017337799072,
"loss": 1.6379,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0808281898498535,
"rewards/margins": 0.2331748753786087,
"rewards/rejected": -2.3140034675598145,
"step": 2270
},
{
"epoch": 1.6426512968299711,
"grad_norm": 26.66835743730832,
"learning_rate": 4.708252491915951e-09,
"logits/chosen": -1.979501724243164,
"logits/rejected": -1.9686695337295532,
"logps/chosen": -1.1171302795410156,
"logps/rejected": -1.232032060623169,
"loss": 1.6298,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.2342605590820312,
"rewards/margins": 0.22980327904224396,
"rewards/rejected": -2.464064121246338,
"step": 2280
},
{
"epoch": 1.6498559077809798,
"grad_norm": 22.00835339861606,
"learning_rate": 4.526209548499877e-09,
"logits/chosen": -1.9684820175170898,
"logits/rejected": -1.9653692245483398,
"logps/chosen": -1.0476069450378418,
"logps/rejected": -1.1009780168533325,
"loss": 1.7427,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0952138900756836,
"rewards/margins": 0.10674209892749786,
"rewards/rejected": -2.201956033706665,
"step": 2290
},
{
"epoch": 1.6570605187319885,
"grad_norm": 24.136328386644493,
"learning_rate": 4.347404832959775e-09,
"logits/chosen": -2.0073463916778564,
"logits/rejected": -1.9953422546386719,
"logps/chosen": -1.00043785572052,
"logps/rejected": -1.1069265604019165,
"loss": 1.6592,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.00087571144104,
"rewards/margins": 0.21297720074653625,
"rewards/rejected": -2.213853120803833,
"step": 2300
},
{
"epoch": 1.6642651296829971,
"grad_norm": 22.057010770613108,
"learning_rate": 4.171866625863229e-09,
"logits/chosen": -1.9719337224960327,
"logits/rejected": -1.968071699142456,
"logps/chosen": -1.0742835998535156,
"logps/rejected": -1.1518973112106323,
"loss": 1.7035,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.1485671997070312,
"rewards/margins": 0.15522751212120056,
"rewards/rejected": -2.3037946224212646,
"step": 2310
},
{
"epoch": 1.6714697406340058,
"grad_norm": 24.46288263177118,
"learning_rate": 3.9996226911319546e-09,
"logits/chosen": -2.044360637664795,
"logits/rejected": -2.044651746749878,
"logps/chosen": -1.0162107944488525,
"logps/rejected": -1.1387598514556885,
"loss": 1.635,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.032421588897705,
"rewards/margins": 0.24509792029857635,
"rewards/rejected": -2.277519702911377,
"step": 2320
},
{
"epoch": 1.6786743515850144,
"grad_norm": 22.36911873035069,
"learning_rate": 3.830700271650567e-09,
"logits/chosen": -2.045510768890381,
"logits/rejected": -2.0477523803710938,
"logps/chosen": -0.9628473520278931,
"logps/rejected": -1.124125599861145,
"loss": 1.5848,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.9256947040557861,
"rewards/margins": 0.3225559890270233,
"rewards/rejected": -2.24825119972229,
"step": 2330
},
{
"epoch": 1.685878962536023,
"grad_norm": 22.36024023960374,
"learning_rate": 3.665126084957723e-09,
"logits/chosen": -1.9869651794433594,
"logits/rejected": -1.9849302768707275,
"logps/chosen": -0.9548214077949524,
"logps/rejected": -1.1073800325393677,
"loss": 1.5961,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9096428155899048,
"rewards/margins": 0.30511730909347534,
"rewards/rejected": -2.2147600650787354,
"step": 2340
},
{
"epoch": 1.6930835734870318,
"grad_norm": 26.20538266440962,
"learning_rate": 3.502926319020327e-09,
"logits/chosen": -1.953912377357483,
"logits/rejected": -1.943302869796753,
"logps/chosen": -1.060532569885254,
"logps/rejected": -1.1716785430908203,
"loss": 1.6529,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.121065139770508,
"rewards/margins": 0.22229187190532684,
"rewards/rejected": -2.3433570861816406,
"step": 2350
},
{
"epoch": 1.7002881844380404,
"grad_norm": 21.248350350748197,
"learning_rate": 3.3441266280915427e-09,
"logits/chosen": -1.9893739223480225,
"logits/rejected": -1.9806448221206665,
"logps/chosen": -0.9349273443222046,
"logps/rejected": -1.122243046760559,
"loss": 1.5398,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.8698546886444092,
"rewards/margins": 0.3746311664581299,
"rewards/rejected": -2.244486093521118,
"step": 2360
},
{
"epoch": 1.707492795389049,
"grad_norm": 21.37718363400938,
"learning_rate": 3.1887521286532023e-09,
"logits/chosen": -2.0375638008117676,
"logits/rejected": -2.030097484588623,
"logps/chosen": -1.0926265716552734,
"logps/rejected": -1.156776785850525,
"loss": 1.7217,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.185253143310547,
"rewards/margins": 0.12830035388469696,
"rewards/rejected": -2.31355357170105,
"step": 2370
},
{
"epoch": 1.7146974063400577,
"grad_norm": 22.764740803633266,
"learning_rate": 3.0368273954432698e-09,
"logits/chosen": -2.0525734424591064,
"logits/rejected": -2.0523922443389893,
"logps/chosen": -1.1625360250473022,
"logps/rejected": -1.2549631595611572,
"loss": 1.6774,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.3250720500946045,
"rewards/margins": 0.18485429883003235,
"rewards/rejected": -2.5099263191223145,
"step": 2380
},
{
"epoch": 1.7219020172910664,
"grad_norm": 23.35653645378998,
"learning_rate": 2.888376457568964e-09,
"logits/chosen": -2.067924737930298,
"logits/rejected": -2.062840461730957,
"logps/chosen": -1.0438759326934814,
"logps/rejected": -1.1699734926223755,
"loss": 1.6262,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.087751865386963,
"rewards/margins": 0.25219520926475525,
"rewards/rejected": -2.339946985244751,
"step": 2390
},
{
"epoch": 1.729106628242075,
"grad_norm": 18.117475149068106,
"learning_rate": 2.7434227947062324e-09,
"logits/chosen": -2.028148651123047,
"logits/rejected": -2.022372245788574,
"logps/chosen": -1.0291882753372192,
"logps/rejected": -1.1964164972305298,
"loss": 1.5828,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0583765506744385,
"rewards/margins": 0.3344564139842987,
"rewards/rejected": -2.3928329944610596,
"step": 2400
},
{
"epoch": 1.7363112391930837,
"grad_norm": 25.38586926098497,
"learning_rate": 2.6019893333860954e-09,
"logits/chosen": -2.0227832794189453,
"logits/rejected": -2.0191638469696045,
"logps/chosen": -1.0688263177871704,
"logps/rejected": -1.1506297588348389,
"loss": 1.6948,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.137652635574341,
"rewards/margins": 0.16360695660114288,
"rewards/rejected": -2.3012595176696777,
"step": 2410
},
{
"epoch": 1.7435158501440924,
"grad_norm": 23.482865925242617,
"learning_rate": 2.4640984433684758e-09,
"logits/chosen": -2.0008485317230225,
"logits/rejected": -1.9945675134658813,
"logps/chosen": -0.9947766065597534,
"logps/rejected": -1.1175730228424072,
"loss": 1.6519,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.9895532131195068,
"rewards/margins": 0.24559286236763,
"rewards/rejected": -2.2351460456848145,
"step": 2420
},
{
"epoch": 1.7507204610951008,
"grad_norm": 21.083704816516438,
"learning_rate": 2.3297719341040856e-09,
"logits/chosen": -1.9876625537872314,
"logits/rejected": -1.9809608459472656,
"logps/chosen": -1.0115702152252197,
"logps/rejected": -1.1943610906600952,
"loss": 1.5407,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.0231404304504395,
"rewards/margins": 0.3655821681022644,
"rewards/rejected": -2.3887221813201904,
"step": 2430
},
{
"epoch": 1.7579250720461095,
"grad_norm": 22.214397294857353,
"learning_rate": 2.199031051284972e-09,
"logits/chosen": -2.029468297958374,
"logits/rejected": -2.0226287841796875,
"logps/chosen": -1.0236752033233643,
"logps/rejected": -1.0963797569274902,
"loss": 1.7051,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.0473504066467285,
"rewards/margins": 0.14540909230709076,
"rewards/rejected": -2.1927595138549805,
"step": 2440
},
{
"epoch": 1.7651296829971181,
"grad_norm": 22.659240095278513,
"learning_rate": 2.0718964734841667e-09,
"logits/chosen": -2.0216195583343506,
"logits/rejected": -2.0182180404663086,
"logps/chosen": -1.1063746213912964,
"logps/rejected": -1.191334843635559,
"loss": 1.6945,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.2127492427825928,
"rewards/margins": 0.16992013156414032,
"rewards/rejected": -2.382669687271118,
"step": 2450
},
{
"epoch": 1.7723342939481268,
"grad_norm": 17.951670482135615,
"learning_rate": 1.948388308885102e-09,
"logits/chosen": -2.0658774375915527,
"logits/rejected": -2.0580222606658936,
"logps/chosen": -0.9547419548034668,
"logps/rejected": -1.0859801769256592,
"loss": 1.6238,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9094839096069336,
"rewards/margins": 0.2624765932559967,
"rewards/rejected": -2.1719603538513184,
"step": 2460
},
{
"epoch": 1.7795389048991355,
"grad_norm": 26.064153513909826,
"learning_rate": 1.8285260921011846e-09,
"logits/chosen": -2.047785520553589,
"logits/rejected": -2.038276433944702,
"logps/chosen": -1.1647402048110962,
"logps/rejected": -1.2458956241607666,
"loss": 1.7181,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.3294804096221924,
"rewards/margins": 0.16231082379817963,
"rewards/rejected": -2.491791248321533,
"step": 2470
},
{
"epoch": 1.7867435158501441,
"grad_norm": 19.498652452320453,
"learning_rate": 1.712328781086131e-09,
"logits/chosen": -1.9989614486694336,
"logits/rejected": -1.9959602355957031,
"logps/chosen": -1.0413289070129395,
"logps/rejected": -1.1756139993667603,
"loss": 1.6205,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.082657814025879,
"rewards/margins": 0.2685699760913849,
"rewards/rejected": -2.3512279987335205,
"step": 2480
},
{
"epoch": 1.7939481268011528,
"grad_norm": 19.579842521680934,
"learning_rate": 1.59981475413547e-09,
"logits/chosen": -1.982996940612793,
"logits/rejected": -1.9784364700317383,
"logps/chosen": -0.9387162923812866,
"logps/rejected": -1.0598050355911255,
"loss": 1.6288,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.8774325847625732,
"rewards/margins": 0.24217729270458221,
"rewards/rejected": -2.119610071182251,
"step": 2490
},
{
"epoch": 1.8011527377521612,
"grad_norm": 21.279275035018046,
"learning_rate": 1.491001806979772e-09,
"logits/chosen": -1.986486792564392,
"logits/rejected": -1.9824678897857666,
"logps/chosen": -1.0033233165740967,
"logps/rejected": -1.1795743703842163,
"loss": 1.5452,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.0066466331481934,
"rewards/margins": 0.35250231623649597,
"rewards/rejected": -2.3591487407684326,
"step": 2500
},
{
"epoch": 1.8083573487031699,
"grad_norm": 22.81377086234191,
"learning_rate": 1.3859071499699698e-09,
"logits/chosen": -1.9927501678466797,
"logits/rejected": -1.9867427349090576,
"logps/chosen": -1.0045326948165894,
"logps/rejected": -1.1114981174468994,
"loss": 1.6603,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0090653896331787,
"rewards/margins": 0.2139308750629425,
"rewards/rejected": -2.222996234893799,
"step": 2510
},
{
"epoch": 1.8155619596541785,
"grad_norm": 22.243359575974143,
"learning_rate": 1.2845474053553156e-09,
"logits/chosen": -1.967153549194336,
"logits/rejected": -1.9660823345184326,
"logps/chosen": -1.0100979804992676,
"logps/rejected": -1.1443712711334229,
"loss": 1.6266,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.020195960998535,
"rewards/margins": 0.26854681968688965,
"rewards/rejected": -2.2887425422668457,
"step": 2520
},
{
"epoch": 1.8227665706051872,
"grad_norm": 20.940981692691256,
"learning_rate": 1.1869386046543222e-09,
"logits/chosen": -1.9824377298355103,
"logits/rejected": -1.9759935140609741,
"logps/chosen": -1.0096970796585083,
"logps/rejected": -1.1292235851287842,
"loss": 1.6418,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0193941593170166,
"rewards/margins": 0.23905305564403534,
"rewards/rejected": -2.2584471702575684,
"step": 2530
},
{
"epoch": 1.8299711815561959,
"grad_norm": 23.31599170849686,
"learning_rate": 1.0930961861191302e-09,
"logits/chosen": -1.979270339012146,
"logits/rejected": -1.9758975505828857,
"logps/chosen": -0.9248117208480835,
"logps/rejected": -1.046858549118042,
"loss": 1.6419,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.849623441696167,
"rewards/margins": 0.2440938651561737,
"rewards/rejected": -2.093717098236084,
"step": 2540
},
{
"epoch": 1.8371757925072045,
"grad_norm": 25.087005895011544,
"learning_rate": 1.003034992293733e-09,
"logits/chosen": -2.0150609016418457,
"logits/rejected": -2.0050504207611084,
"logps/chosen": -0.9422048330307007,
"logps/rejected": -1.1044690608978271,
"loss": 1.5718,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.8844096660614014,
"rewards/margins": 0.3245285153388977,
"rewards/rejected": -2.2089381217956543,
"step": 2550
},
{
"epoch": 1.8443804034582132,
"grad_norm": 21.9012224134329,
"learning_rate": 9.16769267666434e-10,
"logits/chosen": -1.9812209606170654,
"logits/rejected": -1.9651670455932617,
"logps/chosen": -0.984574019908905,
"logps/rejected": -1.0492041110992432,
"loss": 1.7224,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.96914803981781,
"rewards/margins": 0.12925995886325836,
"rewards/rejected": -2.0984082221984863,
"step": 2560
},
{
"epoch": 1.8515850144092219,
"grad_norm": 29.798850096367598,
"learning_rate": 8.343126564168412e-10,
"logits/chosen": -2.023308277130127,
"logits/rejected": -2.0172171592712402,
"logps/chosen": -1.0593010187149048,
"logps/rejected": -1.1926862001419067,
"loss": 1.6084,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.1186020374298096,
"rewards/margins": 0.2667704224586487,
"rewards/rejected": -2.3853724002838135,
"step": 2570
},
{
"epoch": 1.8587896253602305,
"grad_norm": 23.081298452800397,
"learning_rate": 7.55678200257856e-10,
"logits/chosen": -1.986285924911499,
"logits/rejected": -1.9805923700332642,
"logps/chosen": -1.0580508708953857,
"logps/rejected": -1.134381651878357,
"loss": 1.7026,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.1161017417907715,
"rewards/margins": 0.1526612788438797,
"rewards/rejected": -2.268763303756714,
"step": 2580
},
{
"epoch": 1.8659942363112392,
"grad_norm": 19.289572768646487,
"learning_rate": 6.808783363729364e-10,
"logits/chosen": -1.9875192642211914,
"logits/rejected": -1.9795039892196655,
"logps/chosen": -0.9947047233581543,
"logps/rejected": -1.128720998764038,
"loss": 1.6133,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.9894094467163086,
"rewards/margins": 0.2680323123931885,
"rewards/rejected": -2.257441997528076,
"step": 2590
},
{
"epoch": 1.8731988472622478,
"grad_norm": 19.644352031599038,
"learning_rate": 6.099248954489794e-10,
"logits/chosen": -2.0019712448120117,
"logits/rejected": -2.0046229362487793,
"logps/chosen": -0.9696542024612427,
"logps/rejected": -1.0707123279571533,
"loss": 1.6688,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -1.9393084049224854,
"rewards/margins": 0.20211617648601532,
"rewards/rejected": -2.1414246559143066,
"step": 2600
},
{
"epoch": 1.8804034582132565,
"grad_norm": 20.13597620650645,
"learning_rate": 5.428290998051116e-10,
"logits/chosen": -1.991913080215454,
"logits/rejected": -1.9869178533554077,
"logps/chosen": -0.9798202514648438,
"logps/rejected": -1.0955702066421509,
"loss": 1.63,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9596405029296875,
"rewards/margins": 0.2314998209476471,
"rewards/rejected": -2.1911404132843018,
"step": 2610
},
{
"epoch": 1.8876080691642652,
"grad_norm": 24.79934802241825,
"learning_rate": 4.796015616177401e-10,
"logits/chosen": -1.9711564779281616,
"logits/rejected": -1.9690234661102295,
"logps/chosen": -1.028472900390625,
"logps/rejected": -1.1603518724441528,
"loss": 1.6215,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.05694580078125,
"rewards/margins": 0.2637581527233124,
"rewards/rejected": -2.3207037448883057,
"step": 2620
},
{
"epoch": 1.8948126801152738,
"grad_norm": 22.569211582194647,
"learning_rate": 4.2025228124205335e-10,
"logits/chosen": -2.0415008068084717,
"logits/rejected": -2.0464255809783936,
"logps/chosen": -1.1165236234664917,
"logps/rejected": -1.1973609924316406,
"loss": 1.6887,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.2330472469329834,
"rewards/margins": 0.1616745889186859,
"rewards/rejected": -2.3947219848632812,
"step": 2630
},
{
"epoch": 1.9020172910662825,
"grad_norm": 29.251729021197608,
"learning_rate": 3.64790645630339e-10,
"logits/chosen": -2.0050177574157715,
"logits/rejected": -1.9978790283203125,
"logps/chosen": -1.0866100788116455,
"logps/rejected": -1.1683754920959473,
"loss": 1.7045,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.173220157623291,
"rewards/margins": 0.16353097558021545,
"rewards/rejected": -2.3367509841918945,
"step": 2640
},
{
"epoch": 1.9092219020172911,
"grad_norm": 26.686501597422076,
"learning_rate": 3.1322542684729945e-10,
"logits/chosen": -1.9958645105361938,
"logits/rejected": -1.9863427877426147,
"logps/chosen": -1.1009094715118408,
"logps/rejected": -1.2481192350387573,
"loss": 1.5988,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.2018189430236816,
"rewards/margins": 0.29441922903060913,
"rewards/rejected": -2.4962384700775146,
"step": 2650
},
{
"epoch": 1.9164265129682998,
"grad_norm": 24.025977247368846,
"learning_rate": 2.6556478068261447e-10,
"logits/chosen": -2.011669158935547,
"logits/rejected": -2.0208234786987305,
"logps/chosen": -1.0808687210083008,
"logps/rejected": -1.210559606552124,
"loss": 1.6254,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.1617374420166016,
"rewards/margins": 0.2593816816806793,
"rewards/rejected": -2.421119213104248,
"step": 2660
},
{
"epoch": 1.9236311239193085,
"grad_norm": 23.159728407542556,
"learning_rate": 2.2181624536098952e-10,
"logits/chosen": -2.06213641166687,
"logits/rejected": -2.0583481788635254,
"logps/chosen": -1.054503321647644,
"logps/rejected": -1.183261513710022,
"loss": 1.624,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.109006643295288,
"rewards/margins": 0.2575165629386902,
"rewards/rejected": -2.366523027420044,
"step": 2670
},
{
"epoch": 1.9308357348703171,
"grad_norm": 24.285074754554824,
"learning_rate": 1.819867403498737e-10,
"logits/chosen": -2.0223278999328613,
"logits/rejected": -2.0300443172454834,
"logps/chosen": -1.0237973928451538,
"logps/rejected": -1.1488239765167236,
"loss": 1.616,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.0475947856903076,
"rewards/margins": 0.250053346157074,
"rewards/rejected": -2.2976479530334473,
"step": 2680
},
{
"epoch": 1.9380403458213258,
"grad_norm": 22.444579015136416,
"learning_rate": 1.4608256526505157e-10,
"logits/chosen": -1.9494588375091553,
"logits/rejected": -1.9468958377838135,
"logps/chosen": -1.1418302059173584,
"logps/rejected": -1.2206158638000488,
"loss": 1.6977,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.283660411834717,
"rewards/margins": 0.15757131576538086,
"rewards/rejected": -2.4412317276000977,
"step": 2690
},
{
"epoch": 1.9452449567723344,
"grad_norm": 26.381118967286472,
"learning_rate": 1.1410939887425141e-10,
"logits/chosen": -2.0545475482940674,
"logits/rejected": -2.041222095489502,
"logps/chosen": -0.9961267709732056,
"logps/rejected": -1.0998473167419434,
"loss": 1.6658,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.9922535419464111,
"rewards/margins": 0.20744113624095917,
"rewards/rejected": -2.1996946334838867,
"step": 2700
},
{
"epoch": 1.952449567723343,
"grad_norm": 24.607639118161103,
"learning_rate": 8.607229819898865e-11,
"logits/chosen": -2.02439022064209,
"logits/rejected": -2.0220091342926025,
"logps/chosen": -1.0608162879943848,
"logps/rejected": -1.1870168447494507,
"loss": 1.6251,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.1216325759887695,
"rewards/margins": 0.2524010241031647,
"rewards/rejected": -2.3740336894989014,
"step": 2710
},
{
"epoch": 1.9596541786743515,
"grad_norm": 25.59822036865297,
"learning_rate": 6.19756977147029e-11,
"logits/chosen": -2.024225950241089,
"logits/rejected": -2.0201265811920166,
"logps/chosen": -0.9928004145622253,
"logps/rejected": -1.1591542959213257,
"loss": 1.5624,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.9856008291244507,
"rewards/margins": 0.3327081501483917,
"rewards/rejected": -2.3183085918426514,
"step": 2720
},
{
"epoch": 1.9668587896253602,
"grad_norm": 19.72589484365043,
"learning_rate": 4.1823408649391265e-11,
"logits/chosen": -1.9768123626708984,
"logits/rejected": -1.9772489070892334,
"logps/chosen": -1.0286433696746826,
"logps/rejected": -1.1329104900360107,
"loss": 1.6564,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0572867393493652,
"rewards/margins": 0.2085343301296234,
"rewards/rejected": -2.2658209800720215,
"step": 2730
},
{
"epoch": 1.9740634005763689,
"grad_norm": 22.64173380411296,
"learning_rate": 2.5618618380812694e-11,
"logits/chosen": -2.074462413787842,
"logits/rejected": -2.0713300704956055,
"logps/chosen": -0.9406960606575012,
"logps/rejected": -1.0647523403167725,
"loss": 1.6234,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.8813921213150024,
"rewards/margins": 0.24811263382434845,
"rewards/rejected": -2.129504680633545,
"step": 2740
},
{
"epoch": 1.9812680115273775,
"grad_norm": 18.289142985147425,
"learning_rate": 1.3363889932338501e-11,
"logits/chosen": -1.980743408203125,
"logits/rejected": -1.9810924530029297,
"logps/chosen": -1.0640665292739868,
"logps/rejected": -1.2089464664459229,
"loss": 1.5996,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.1281330585479736,
"rewards/margins": 0.2897598147392273,
"rewards/rejected": -2.4178929328918457,
"step": 2750
},
{
"epoch": 1.9884726224783862,
"grad_norm": 21.489973290523885,
"learning_rate": 5.061161567596061e-12,
"logits/chosen": -2.085202217102051,
"logits/rejected": -2.083030939102173,
"logps/chosen": -1.0205986499786377,
"logps/rejected": -1.12671959400177,
"loss": 1.6554,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0411972999572754,
"rewards/margins": 0.2122419774532318,
"rewards/rejected": -2.25343918800354,
"step": 2760
},
{
"epoch": 1.9956772334293948,
"grad_norm": 27.92987313520658,
"learning_rate": 7.11746483889053e-13,
"logits/chosen": -2.0238137245178223,
"logits/rejected": -2.018078327178955,
"logps/chosen": -1.0771089792251587,
"logps/rejected": -1.1666157245635986,
"loss": 1.6928,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.1542179584503174,
"rewards/margins": 0.17901378870010376,
"rewards/rejected": -2.3332314491271973,
"step": 2770
},
{
"epoch": 2.0,
"step": 2776,
"total_flos": 0.0,
"train_loss": 1.6477044489696322,
"train_runtime": 3633.5789,
"train_samples_per_second": 12.22,
"train_steps_per_second": 0.764
}
],
"logging_steps": 10,
"max_steps": 2776,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}