Llama-3-Instruct-8B-CPO / trainer_state.json
haoranxu's picture
Upload folder using huggingface_hub
fea62aa verified
raw
history blame
54.7 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 400,
"global_step": 468,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010683760683760684,
"grad_norm": 53.325706395174244,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -0.129314124584198,
"logits/rejected": -0.1248931884765625,
"logps/chosen": -135.08358764648438,
"logps/rejected": -137.43325805664062,
"loss": 1.7058,
"nll_loss": 0.33312344551086426,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -13.508357048034668,
"rewards/margins": 0.23496881127357483,
"rewards/rejected": -13.743327140808105,
"step": 5
},
{
"epoch": 0.021367521367521368,
"grad_norm": 55.47829235394904,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -0.14523069560527802,
"logits/rejected": -0.12604156136512756,
"logps/chosen": -138.52426147460938,
"logps/rejected": -138.5316619873047,
"loss": 1.8327,
"nll_loss": 0.3906691372394562,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -13.85242748260498,
"rewards/margins": 0.0007393002742901444,
"rewards/rejected": -13.853166580200195,
"step": 10
},
{
"epoch": 0.03205128205128205,
"grad_norm": 49.43226333748803,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -0.11341211944818497,
"logits/rejected": -0.049495745450258255,
"logps/chosen": -137.65377807617188,
"logps/rejected": -135.8640594482422,
"loss": 1.8016,
"nll_loss": 0.38431602716445923,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -13.765378952026367,
"rewards/margins": -0.1789727509021759,
"rewards/rejected": -13.586407661437988,
"step": 15
},
{
"epoch": 0.042735042735042736,
"grad_norm": 49.78305628254871,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -0.016971366479992867,
"logits/rejected": 0.0019870593678206205,
"logps/chosen": -114.83842468261719,
"logps/rejected": -117.45811462402344,
"loss": 1.9135,
"nll_loss": 0.3634462356567383,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -11.483842849731445,
"rewards/margins": 0.2619696259498596,
"rewards/rejected": -11.74581241607666,
"step": 20
},
{
"epoch": 0.053418803418803416,
"grad_norm": 51.145326847505586,
"learning_rate": 5.319148936170212e-07,
"logits/chosen": -0.07376699149608612,
"logits/rejected": -0.0683017149567604,
"logps/chosen": -117.89664459228516,
"logps/rejected": -117.4900894165039,
"loss": 1.9047,
"nll_loss": 0.36660850048065186,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -11.789665222167969,
"rewards/margins": -0.040656279772520065,
"rewards/rejected": -11.749008178710938,
"step": 25
},
{
"epoch": 0.0641025641025641,
"grad_norm": 49.16493361461768,
"learning_rate": 6.382978723404255e-07,
"logits/chosen": -0.025973210111260414,
"logits/rejected": -0.06585584580898285,
"logps/chosen": -125.08711242675781,
"logps/rejected": -131.66029357910156,
"loss": 1.7073,
"nll_loss": 0.32747945189476013,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -12.508711814880371,
"rewards/margins": 0.6573159694671631,
"rewards/rejected": -13.166028022766113,
"step": 30
},
{
"epoch": 0.07478632478632478,
"grad_norm": 47.421405220624315,
"learning_rate": 7.446808510638297e-07,
"logits/chosen": -0.21625056862831116,
"logits/rejected": -0.2136630266904831,
"logps/chosen": -132.8556671142578,
"logps/rejected": -141.15939331054688,
"loss": 1.7525,
"nll_loss": 0.34708237648010254,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -13.285565376281738,
"rewards/margins": 0.8303732872009277,
"rewards/rejected": -14.115941047668457,
"step": 35
},
{
"epoch": 0.08547008547008547,
"grad_norm": 51.87443941562729,
"learning_rate": 8.51063829787234e-07,
"logits/chosen": -0.17511573433876038,
"logits/rejected": -0.1541801393032074,
"logps/chosen": -90.94963073730469,
"logps/rejected": -96.69755554199219,
"loss": 1.6671,
"nll_loss": 0.3230934143066406,
"rewards/accuracies": 0.625,
"rewards/chosen": -9.094963073730469,
"rewards/margins": 0.5747929215431213,
"rewards/rejected": -9.669755935668945,
"step": 40
},
{
"epoch": 0.09615384615384616,
"grad_norm": 52.47331055200087,
"learning_rate": 9.574468085106384e-07,
"logits/chosen": -0.1329955905675888,
"logits/rejected": -0.1524827778339386,
"logps/chosen": -109.51045989990234,
"logps/rejected": -113.86258697509766,
"loss": 1.7698,
"nll_loss": 0.35021230578422546,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -10.951045036315918,
"rewards/margins": 0.43521156907081604,
"rewards/rejected": -11.386259078979492,
"step": 45
},
{
"epoch": 0.10683760683760683,
"grad_norm": 47.963761732132916,
"learning_rate": 9.998747147528373e-07,
"logits/chosen": -0.19674669206142426,
"logits/rejected": -0.17990216612815857,
"logps/chosen": -133.21511840820312,
"logps/rejected": -130.31539916992188,
"loss": 1.632,
"nll_loss": 0.3093932569026947,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -13.321512222290039,
"rewards/margins": -0.28997141122817993,
"rewards/rejected": -13.031539916992188,
"step": 50
},
{
"epoch": 0.11752136752136752,
"grad_norm": 51.94560167409897,
"learning_rate": 9.991093100466482e-07,
"logits/chosen": -0.21401865780353546,
"logits/rejected": -0.19104179739952087,
"logps/chosen": -110.11128234863281,
"logps/rejected": -112.35163879394531,
"loss": 1.5823,
"nll_loss": 0.2831841707229614,
"rewards/accuracies": 0.375,
"rewards/chosen": -11.011127471923828,
"rewards/margins": 0.22403621673583984,
"rewards/rejected": -11.235164642333984,
"step": 55
},
{
"epoch": 0.1282051282051282,
"grad_norm": 58.49154963276172,
"learning_rate": 9.976491676662678e-07,
"logits/chosen": -0.13603931665420532,
"logits/rejected": -0.18451443314552307,
"logps/chosen": -119.95387268066406,
"logps/rejected": -133.7266845703125,
"loss": 1.6162,
"nll_loss": 0.2651820778846741,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -11.995387077331543,
"rewards/margins": 1.3772820234298706,
"rewards/rejected": -13.37267017364502,
"step": 60
},
{
"epoch": 0.1388888888888889,
"grad_norm": 54.3184236422879,
"learning_rate": 9.95496320064109e-07,
"logits/chosen": -0.2574421763420105,
"logits/rejected": -0.1615368276834488,
"logps/chosen": -102.09004211425781,
"logps/rejected": -92.2038345336914,
"loss": 1.5964,
"nll_loss": 0.27036991715431213,
"rewards/accuracies": 0.2750000059604645,
"rewards/chosen": -10.209004402160645,
"rewards/margins": -0.9886210560798645,
"rewards/rejected": -9.220383644104004,
"step": 65
},
{
"epoch": 0.14957264957264957,
"grad_norm": 59.11629390354106,
"learning_rate": 9.926537639070456e-07,
"logits/chosen": -0.28000539541244507,
"logits/rejected": -0.23213541507720947,
"logps/chosen": -119.5620346069336,
"logps/rejected": -123.65787506103516,
"loss": 1.7745,
"nll_loss": 0.36628904938697815,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -11.95620346069336,
"rewards/margins": 0.4095849096775055,
"rewards/rejected": -12.365787506103516,
"step": 70
},
{
"epoch": 0.16025641025641027,
"grad_norm": 57.53497747164318,
"learning_rate": 9.891254559051884e-07,
"logits/chosen": -0.1931193619966507,
"logits/rejected": -0.15742453932762146,
"logps/chosen": -115.27913665771484,
"logps/rejected": -126.52079772949219,
"loss": 1.5256,
"nll_loss": 0.3370510935783386,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -11.527913093566895,
"rewards/margins": 1.1241672039031982,
"rewards/rejected": -12.652081489562988,
"step": 75
},
{
"epoch": 0.17094017094017094,
"grad_norm": 52.58145471881646,
"learning_rate": 9.849163073043223e-07,
"logits/chosen": -0.12034114450216293,
"logits/rejected": -0.09974785149097443,
"logps/chosen": -134.50930786132812,
"logps/rejected": -125.11856842041016,
"loss": 1.7211,
"nll_loss": 0.3088015913963318,
"rewards/accuracies": 0.375,
"rewards/chosen": -13.450933456420898,
"rewards/margins": -0.939074695110321,
"rewards/rejected": -12.511857986450195,
"step": 80
},
{
"epoch": 0.18162393162393162,
"grad_norm": 52.38736187449637,
"learning_rate": 9.800321770496724e-07,
"logits/chosen": -0.0975460559129715,
"logits/rejected": -0.0958404392004013,
"logps/chosen": -89.18054962158203,
"logps/rejected": -94.65280151367188,
"loss": 1.4466,
"nll_loss": 0.301828533411026,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -8.918054580688477,
"rewards/margins": 0.5472255945205688,
"rewards/rejected": -9.465280532836914,
"step": 85
},
{
"epoch": 0.19230769230769232,
"grad_norm": 52.449979107103914,
"learning_rate": 9.744798636305187e-07,
"logits/chosen": -0.18882372975349426,
"logits/rejected": -0.18104039132595062,
"logps/chosen": -94.19184875488281,
"logps/rejected": -104.16746520996094,
"loss": 1.3906,
"nll_loss": 0.26845496892929077,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -9.419185638427734,
"rewards/margins": 0.9975622892379761,
"rewards/rejected": -10.416748046875,
"step": 90
},
{
"epoch": 0.202991452991453,
"grad_norm": 56.906972668285476,
"learning_rate": 9.68267095617003e-07,
"logits/chosen": -0.17206290364265442,
"logits/rejected": -0.12139072269201279,
"logps/chosen": -85.0337142944336,
"logps/rejected": -85.30671691894531,
"loss": 1.627,
"nll_loss": 0.3232787847518921,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -8.50337028503418,
"rewards/margins": 0.027300655841827393,
"rewards/rejected": -8.530672073364258,
"step": 95
},
{
"epoch": 0.21367521367521367,
"grad_norm": 56.063471274711276,
"learning_rate": 9.614025209023083e-07,
"logits/chosen": -0.21710339188575745,
"logits/rejected": -0.187973290681839,
"logps/chosen": -129.84634399414062,
"logps/rejected": -132.91452026367188,
"loss": 1.4296,
"nll_loss": 0.2875466048717499,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -12.984634399414062,
"rewards/margins": 0.3068179488182068,
"rewards/rejected": -13.291452407836914,
"step": 100
},
{
"epoch": 0.22435897435897437,
"grad_norm": 54.253076126268894,
"learning_rate": 9.538956946651815e-07,
"logits/chosen": -0.05166339874267578,
"logits/rejected": 0.029095903038978577,
"logps/chosen": -104.00286865234375,
"logps/rejected": -113.63838958740234,
"loss": 1.4212,
"nll_loss": 0.2850767970085144,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -10.400287628173828,
"rewards/margins": 0.9635505676269531,
"rewards/rejected": -11.363838195800781,
"step": 105
},
{
"epoch": 0.23504273504273504,
"grad_norm": 65.81870203591372,
"learning_rate": 9.457570660695539e-07,
"logits/chosen": -0.07271315902471542,
"logits/rejected": -0.11916762590408325,
"logps/chosen": -127.29766845703125,
"logps/rejected": -130.58584594726562,
"loss": 1.5306,
"nll_loss": 0.33747240900993347,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -12.729766845703125,
"rewards/margins": 0.3288170099258423,
"rewards/rejected": -13.05858325958252,
"step": 110
},
{
"epoch": 0.24572649572649571,
"grad_norm": 58.367624421766024,
"learning_rate": 9.369979637197774e-07,
"logits/chosen": -0.12270005792379379,
"logits/rejected": -0.17853489518165588,
"logps/chosen": -106.76663970947266,
"logps/rejected": -109.66035461425781,
"loss": 1.5831,
"nll_loss": 0.2727692425251007,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -10.676663398742676,
"rewards/margins": 0.2893708348274231,
"rewards/rejected": -10.966034889221191,
"step": 115
},
{
"epoch": 0.2564102564102564,
"grad_norm": 53.75246999549208,
"learning_rate": 9.276305798917158e-07,
"logits/chosen": -0.047394849359989166,
"logits/rejected": -0.08673441410064697,
"logps/chosen": -113.4908447265625,
"logps/rejected": -123.5615234375,
"loss": 1.551,
"nll_loss": 0.28447219729423523,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -11.34908390045166,
"rewards/margins": 1.0070677995681763,
"rewards/rejected": -12.35615348815918,
"step": 120
},
{
"epoch": 0.2670940170940171,
"grad_norm": 64.46832867529025,
"learning_rate": 9.176679535616476e-07,
"logits/chosen": 0.08038081228733063,
"logits/rejected": 0.05699559301137924,
"logps/chosen": -112.93634033203125,
"logps/rejected": -125.99974060058594,
"loss": 1.4662,
"nll_loss": 0.35629984736442566,
"rewards/accuracies": 0.75,
"rewards/chosen": -11.293634414672852,
"rewards/margins": 1.3063404560089111,
"rewards/rejected": -12.599973678588867,
"step": 125
},
{
"epoch": 0.2777777777777778,
"grad_norm": 55.82818467114938,
"learning_rate": 9.071239522565976e-07,
"logits/chosen": -0.07054271548986435,
"logits/rejected": -0.024266820400953293,
"logps/chosen": -113.1104507446289,
"logps/rejected": -116.25431060791016,
"loss": 1.4477,
"nll_loss": 0.33033448457717896,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -11.311044692993164,
"rewards/margins": 0.3143869638442993,
"rewards/rejected": -11.625432014465332,
"step": 130
},
{
"epoch": 0.28846153846153844,
"grad_norm": 59.467620258292605,
"learning_rate": 8.960132527513642e-07,
"logits/chosen": -0.06084425374865532,
"logits/rejected": -0.062250006943941116,
"logps/chosen": -127.11384582519531,
"logps/rejected": -127.90572357177734,
"loss": 1.5105,
"nll_loss": 0.354878306388855,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -12.711384773254395,
"rewards/margins": 0.07918860763311386,
"rewards/rejected": -12.790571212768555,
"step": 135
},
{
"epoch": 0.29914529914529914,
"grad_norm": 67.52329899130741,
"learning_rate": 8.8435132063911e-07,
"logits/chosen": -0.06606234610080719,
"logits/rejected": -0.02088196575641632,
"logps/chosen": -135.88082885742188,
"logps/rejected": -141.05441284179688,
"loss": 1.3854,
"nll_loss": 0.35649529099464417,
"rewards/accuracies": 0.625,
"rewards/chosen": -13.58808422088623,
"rewards/margins": 0.5173591375350952,
"rewards/rejected": -14.105443000793457,
"step": 140
},
{
"epoch": 0.30982905982905984,
"grad_norm": 66.6282576658114,
"learning_rate": 8.721543888039532e-07,
"logits/chosen": -0.14654412865638733,
"logits/rejected": -0.15081673860549927,
"logps/chosen": -135.72756958007812,
"logps/rejected": -131.85450744628906,
"loss": 1.4429,
"nll_loss": 0.3281570076942444,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -13.57275676727295,
"rewards/margins": -0.38730812072753906,
"rewards/rejected": -13.185449600219727,
"step": 145
},
{
"epoch": 0.32051282051282054,
"grad_norm": 57.916364691190665,
"learning_rate": 8.594394348255237e-07,
"logits/chosen": -0.2932497560977936,
"logits/rejected": -0.25581642985343933,
"logps/chosen": -141.12599182128906,
"logps/rejected": -141.9236602783203,
"loss": 1.4486,
"nll_loss": 0.329673707485199,
"rewards/accuracies": 0.625,
"rewards/chosen": -14.11259937286377,
"rewards/margins": 0.07976653426885605,
"rewards/rejected": -14.192365646362305,
"step": 150
},
{
"epoch": 0.3311965811965812,
"grad_norm": 54.1500700625341,
"learning_rate": 8.462241573469377e-07,
"logits/chosen": -0.19544358551502228,
"logits/rejected": -0.1730412244796753,
"logps/chosen": -145.17381286621094,
"logps/rejected": -144.12461853027344,
"loss": 1.4251,
"nll_loss": 0.34274980425834656,
"rewards/accuracies": 0.5,
"rewards/chosen": -14.51738166809082,
"rewards/margins": -0.10491929203271866,
"rewards/rejected": -14.412463188171387,
"step": 155
},
{
"epoch": 0.3418803418803419,
"grad_norm": 62.757464156888936,
"learning_rate": 8.325269514390834e-07,
"logits/chosen": 0.013430899009108543,
"logits/rejected": -0.037021975964307785,
"logps/chosen": -111.84146881103516,
"logps/rejected": -128.94786071777344,
"loss": 1.4507,
"nll_loss": 0.315662145614624,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -11.184146881103516,
"rewards/margins": 1.7106406688690186,
"rewards/rejected": -12.894787788391113,
"step": 160
},
{
"epoch": 0.3525641025641026,
"grad_norm": 50.800038470428575,
"learning_rate": 8.183668829955111e-07,
"logits/chosen": -0.3039621412754059,
"logits/rejected": -0.287003755569458,
"logps/chosen": -139.82981872558594,
"logps/rejected": -146.7205352783203,
"loss": 1.37,
"nll_loss": 0.3524821996688843,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -13.982983589172363,
"rewards/margins": 0.6890703439712524,
"rewards/rejected": -14.672053337097168,
"step": 165
},
{
"epoch": 0.36324786324786323,
"grad_norm": 59.841427525159546,
"learning_rate": 8.037636621935684e-07,
"logits/chosen": -0.31735068559646606,
"logits/rejected": -0.23542420566082,
"logps/chosen": -103.27725982666016,
"logps/rejected": -106.25889587402344,
"loss": 1.3626,
"nll_loss": 0.3173479437828064,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -10.327725410461426,
"rewards/margins": 0.2981634736061096,
"rewards/rejected": -10.625889778137207,
"step": 170
},
{
"epoch": 0.37393162393162394,
"grad_norm": 52.9514862200353,
"learning_rate": 7.887376160587213e-07,
"logits/chosen": -0.20337620377540588,
"logits/rejected": -0.17184031009674072,
"logps/chosen": -116.390869140625,
"logps/rejected": -118.56620788574219,
"loss": 1.3521,
"nll_loss": 0.3265441358089447,
"rewards/accuracies": 0.625,
"rewards/chosen": -11.639086723327637,
"rewards/margins": 0.21753445267677307,
"rewards/rejected": -11.856620788574219,
"step": 175
},
{
"epoch": 0.38461538461538464,
"grad_norm": 50.63958304716666,
"learning_rate": 7.733096601702507e-07,
"logits/chosen": -0.012370765209197998,
"logits/rejected": 0.05527879670262337,
"logps/chosen": -113.68470764160156,
"logps/rejected": -110.1121597290039,
"loss": 1.3928,
"nll_loss": 0.3376830518245697,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -11.368470191955566,
"rewards/margins": -0.3572540283203125,
"rewards/rejected": -11.01121711730957,
"step": 180
},
{
"epoch": 0.3952991452991453,
"grad_norm": 53.743159898285576,
"learning_rate": 7.575012695477076e-07,
"logits/chosen": 0.06333889812231064,
"logits/rejected": 0.09429727494716644,
"logps/chosen": -110.7640609741211,
"logps/rejected": -115.84847259521484,
"loss": 1.5223,
"nll_loss": 0.3274967670440674,
"rewards/accuracies": 0.625,
"rewards/chosen": -11.076406478881836,
"rewards/margins": 0.5084413290023804,
"rewards/rejected": -11.584847450256348,
"step": 185
},
{
"epoch": 0.405982905982906,
"grad_norm": 56.40313393417172,
"learning_rate": 7.413344487586542e-07,
"logits/chosen": -0.08225846290588379,
"logits/rejected": -0.055325210094451904,
"logps/chosen": -116.2511978149414,
"logps/rejected": -132.88140869140625,
"loss": 1.4658,
"nll_loss": 0.3596312403678894,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -11.62511920928955,
"rewards/margins": 1.6630210876464844,
"rewards/rejected": -13.288141250610352,
"step": 190
},
{
"epoch": 0.4166666666666667,
"grad_norm": 54.645494345995765,
"learning_rate": 7.248317012892968e-07,
"logits/chosen": -0.1261519491672516,
"logits/rejected": -0.16661445796489716,
"logps/chosen": -125.1494140625,
"logps/rejected": -137.20791625976562,
"loss": 1.4354,
"nll_loss": 0.3696037232875824,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -12.514939308166504,
"rewards/margins": 1.2058517932891846,
"rewards/rejected": -13.720791816711426,
"step": 195
},
{
"epoch": 0.42735042735042733,
"grad_norm": 61.519785155196736,
"learning_rate": 7.08015998220647e-07,
"logits/chosen": -0.13151851296424866,
"logits/rejected": -0.08859863132238388,
"logps/chosen": -165.24661254882812,
"logps/rejected": -170.71624755859375,
"loss": 1.4629,
"nll_loss": 0.34175121784210205,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -16.524662017822266,
"rewards/margins": 0.5469658374786377,
"rewards/rejected": -17.07162857055664,
"step": 200
},
{
"epoch": 0.43803418803418803,
"grad_norm": 55.57722114588639,
"learning_rate": 6.909107462538111e-07,
"logits/chosen": -0.19121451675891876,
"logits/rejected": -0.20065097510814667,
"logps/chosen": -142.35458374023438,
"logps/rejected": -150.1237030029297,
"loss": 1.4106,
"nll_loss": 0.34670525789260864,
"rewards/accuracies": 0.625,
"rewards/chosen": -14.235458374023438,
"rewards/margins": 0.7769120335578918,
"rewards/rejected": -15.012370109558105,
"step": 205
},
{
"epoch": 0.44871794871794873,
"grad_norm": 69.86075329234693,
"learning_rate": 6.735397551289178e-07,
"logits/chosen": -0.1635718047618866,
"logits/rejected": -0.1015796884894371,
"logps/chosen": -132.5345458984375,
"logps/rejected": -135.8687286376953,
"loss": 1.4962,
"nll_loss": 0.32979699969291687,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -13.253456115722656,
"rewards/margins": 0.33341652154922485,
"rewards/rejected": -13.586873054504395,
"step": 210
},
{
"epoch": 0.4594017094017094,
"grad_norm": 58.3273614395911,
"learning_rate": 6.559272044830316e-07,
"logits/chosen": -0.07360713183879852,
"logits/rejected": -6.962567567825317e-05,
"logps/chosen": -122.03056335449219,
"logps/rejected": -129.28549194335938,
"loss": 1.3599,
"nll_loss": 0.3558313846588135,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -12.203059196472168,
"rewards/margins": 0.7254913449287415,
"rewards/rejected": -12.928548812866211,
"step": 215
},
{
"epoch": 0.4700854700854701,
"grad_norm": 50.10334120589398,
"learning_rate": 6.380976101931879e-07,
"logits/chosen": 0.01870536431670189,
"logits/rejected": 0.0029756189323961735,
"logps/chosen": -107.1243896484375,
"logps/rejected": -112.00141906738281,
"loss": 1.3138,
"nll_loss": 0.3773984909057617,
"rewards/accuracies": 0.625,
"rewards/chosen": -10.712437629699707,
"rewards/margins": 0.48770326375961304,
"rewards/rejected": -11.200141906738281,
"step": 220
},
{
"epoch": 0.4807692307692308,
"grad_norm": 57.01257456037248,
"learning_rate": 6.200757902513962e-07,
"logits/chosen": -0.0710291862487793,
"logits/rejected": -0.1122066006064415,
"logps/chosen": -132.7135467529297,
"logps/rejected": -147.7503204345703,
"loss": 1.3819,
"nll_loss": 0.3411773443222046,
"rewards/accuracies": 0.625,
"rewards/chosen": -13.271354675292969,
"rewards/margins": 1.5036779642105103,
"rewards/rejected": -14.775032043457031,
"step": 225
},
{
"epoch": 0.49145299145299143,
"grad_norm": 56.10732954543998,
"learning_rate": 6.018868302191139e-07,
"logits/chosen": -0.02616579458117485,
"logits/rejected": -0.08220602571964264,
"logps/chosen": -109.6393051147461,
"logps/rejected": -119.71272277832031,
"loss": 1.4894,
"nll_loss": 0.32466286420822144,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -10.963930130004883,
"rewards/margins": 1.0073410272598267,
"rewards/rejected": -11.971270561218262,
"step": 230
},
{
"epoch": 0.5021367521367521,
"grad_norm": 57.150169644968244,
"learning_rate": 5.835560483092742e-07,
"logits/chosen": -0.08456510305404663,
"logits/rejected": -0.052301835268735886,
"logps/chosen": -110.45853424072266,
"logps/rejected": -109.49703216552734,
"loss": 1.5332,
"nll_loss": 0.35862648487091064,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -11.045854568481445,
"rewards/margins": -0.09615027904510498,
"rewards/rejected": -10.949705123901367,
"step": 235
},
{
"epoch": 0.5128205128205128,
"grad_norm": 56.92818984806646,
"learning_rate": 5.651089601444752e-07,
"logits/chosen": -0.25266486406326294,
"logits/rejected": -0.27402007579803467,
"logps/chosen": -163.11080932617188,
"logps/rejected": -170.6371612548828,
"loss": 1.3947,
"nll_loss": 0.3526006042957306,
"rewards/accuracies": 0.625,
"rewards/chosen": -16.311084747314453,
"rewards/margins": 0.7526326775550842,
"rewards/rejected": -17.063716888427734,
"step": 240
},
{
"epoch": 0.5235042735042735,
"grad_norm": 55.824252362448014,
"learning_rate": 5.465712432403811e-07,
"logits/chosen": -0.1512390673160553,
"logits/rejected": -0.15393702685832977,
"logps/chosen": -140.1497039794922,
"logps/rejected": -155.41702270507812,
"loss": 1.2664,
"nll_loss": 0.34822720289230347,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -14.014970779418945,
"rewards/margins": 1.5267311334609985,
"rewards/rejected": -15.541702270507812,
"step": 245
},
{
"epoch": 0.5341880341880342,
"grad_norm": 61.259217624033624,
"learning_rate": 5.279687012637798e-07,
"logits/chosen": 0.047980885952711105,
"logits/rejected": 0.0825057402253151,
"logps/chosen": -131.10092163085938,
"logps/rejected": -141.0630645751953,
"loss": 1.3979,
"nll_loss": 0.3648400902748108,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -13.110092163085938,
"rewards/margins": 0.9962142109870911,
"rewards/rejected": -14.106305122375488,
"step": 250
},
{
"epoch": 0.5448717948717948,
"grad_norm": 54.05289475380886,
"learning_rate": 5.093272281150382e-07,
"logits/chosen": -0.019353587180376053,
"logits/rejected": 0.09568696469068527,
"logps/chosen": -133.12542724609375,
"logps/rejected": -135.47377014160156,
"loss": 1.4605,
"nll_loss": 0.3375224173069,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -13.312542915344238,
"rewards/margins": 0.2348347008228302,
"rewards/rejected": -13.54737663269043,
"step": 255
},
{
"epoch": 0.5555555555555556,
"grad_norm": 55.813930730207794,
"learning_rate": 4.906727718849618e-07,
"logits/chosen": 0.031209534034132957,
"logits/rejected": 0.0828268900513649,
"logps/chosen": -109.07014465332031,
"logps/rejected": -123.6259994506836,
"loss": 1.3422,
"nll_loss": 0.2994317412376404,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -10.907014846801758,
"rewards/margins": 1.4555847644805908,
"rewards/rejected": -12.362600326538086,
"step": 260
},
{
"epoch": 0.5662393162393162,
"grad_norm": 55.164171360826785,
"learning_rate": 4.7203129873622036e-07,
"logits/chosen": -0.12409428507089615,
"logits/rejected": -0.08072350919246674,
"logps/chosen": -142.841552734375,
"logps/rejected": -144.4403533935547,
"loss": 1.3871,
"nll_loss": 0.36165937781333923,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -14.284154891967773,
"rewards/margins": 0.15988163650035858,
"rewards/rejected": -14.444036483764648,
"step": 265
},
{
"epoch": 0.5769230769230769,
"grad_norm": 61.02385741830308,
"learning_rate": 4.534287567596188e-07,
"logits/chosen": -0.16588857769966125,
"logits/rejected": -0.10881330817937851,
"logps/chosen": -142.35317993164062,
"logps/rejected": -145.8653106689453,
"loss": 1.4096,
"nll_loss": 0.3703126013278961,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -14.235316276550293,
"rewards/margins": 0.35121291875839233,
"rewards/rejected": -14.586529731750488,
"step": 270
},
{
"epoch": 0.5876068376068376,
"grad_norm": 58.004085751417605,
"learning_rate": 4.348910398555249e-07,
"logits/chosen": 0.025946801528334618,
"logits/rejected": 0.05849064514040947,
"logps/chosen": -112.86296081542969,
"logps/rejected": -121.48736572265625,
"loss": 1.2885,
"nll_loss": 0.3448982238769531,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -11.286294937133789,
"rewards/margins": 0.8624418377876282,
"rewards/rejected": -12.148736953735352,
"step": 275
},
{
"epoch": 0.5982905982905983,
"grad_norm": 60.093543566265524,
"learning_rate": 4.1644395169072575e-07,
"logits/chosen": -0.1264003962278366,
"logits/rejected": -0.11314131319522858,
"logps/chosen": -158.03598022460938,
"logps/rejected": -167.41009521484375,
"loss": 1.3685,
"nll_loss": 0.3446425497531891,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -15.803598403930664,
"rewards/margins": 0.9374116063117981,
"rewards/rejected": -16.741008758544922,
"step": 280
},
{
"epoch": 0.6089743589743589,
"grad_norm": 63.57104854646875,
"learning_rate": 3.9811316978088615e-07,
"logits/chosen": -0.08768756687641144,
"logits/rejected": -0.06864931434392929,
"logps/chosen": -114.5308837890625,
"logps/rejected": -113.0389633178711,
"loss": 1.3041,
"nll_loss": 0.3934231698513031,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -11.453088760375977,
"rewards/margins": -0.14919374883174896,
"rewards/rejected": -11.303895950317383,
"step": 285
},
{
"epoch": 0.6196581196581197,
"grad_norm": 46.88181022689408,
"learning_rate": 3.799242097486038e-07,
"logits/chosen": -0.06880898773670197,
"logits/rejected": -0.0963631272315979,
"logps/chosen": -118.00128173828125,
"logps/rejected": -126.088134765625,
"loss": 1.3619,
"nll_loss": 0.3445819616317749,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -11.800127983093262,
"rewards/margins": 0.8086857795715332,
"rewards/rejected": -12.60881519317627,
"step": 290
},
{
"epoch": 0.6303418803418803,
"grad_norm": 52.868293692367715,
"learning_rate": 3.619023898068123e-07,
"logits/chosen": 0.04794033616781235,
"logits/rejected": 0.02935163304209709,
"logps/chosen": -108.1368408203125,
"logps/rejected": -113.2763442993164,
"loss": 1.3385,
"nll_loss": 0.38119053840637207,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -10.813684463500977,
"rewards/margins": 0.5139498710632324,
"rewards/rejected": -11.327634811401367,
"step": 295
},
{
"epoch": 0.6410256410256411,
"grad_norm": 51.20036019131344,
"learning_rate": 3.4407279551696846e-07,
"logits/chosen": 0.03451260179281235,
"logits/rejected": 0.060915928333997726,
"logps/chosen": -117.63232421875,
"logps/rejected": -124.12806701660156,
"loss": 1.3697,
"nll_loss": 0.3462775647640228,
"rewards/accuracies": 0.5,
"rewards/chosen": -11.763232231140137,
"rewards/margins": 0.6495749354362488,
"rewards/rejected": -12.412806510925293,
"step": 300
},
{
"epoch": 0.6517094017094017,
"grad_norm": 58.582600175811734,
"learning_rate": 3.2646024487108213e-07,
"logits/chosen": -0.004241435322910547,
"logits/rejected": -0.03947947174310684,
"logps/chosen": -144.97406005859375,
"logps/rejected": -152.7334442138672,
"loss": 1.4605,
"nll_loss": 0.34281834959983826,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -14.497406005859375,
"rewards/margins": 0.7759408354759216,
"rewards/rejected": -15.273347854614258,
"step": 305
},
{
"epoch": 0.6623931623931624,
"grad_norm": 57.95314258124262,
"learning_rate": 3.0908925374618887e-07,
"logits/chosen": -0.12898002564907074,
"logits/rejected": -0.06738940626382828,
"logps/chosen": -147.439697265625,
"logps/rejected": -149.19259643554688,
"loss": 1.3813,
"nll_loss": 0.3252050578594208,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -14.74397087097168,
"rewards/margins": 0.1752903163433075,
"rewards/rejected": -14.91926097869873,
"step": 310
},
{
"epoch": 0.6730769230769231,
"grad_norm": 62.09918976711039,
"learning_rate": 2.91984001779353e-07,
"logits/chosen": -0.05374965816736221,
"logits/rejected": 0.0732099637389183,
"logps/chosen": -148.73020935058594,
"logps/rejected": -150.61476135253906,
"loss": 1.3364,
"nll_loss": 0.31267058849334717,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -14.873019218444824,
"rewards/margins": 0.18845567107200623,
"rewards/rejected": -15.06147575378418,
"step": 315
},
{
"epoch": 0.6837606837606838,
"grad_norm": 50.667972535787875,
"learning_rate": 2.751682987107029e-07,
"logits/chosen": 0.08584196865558624,
"logits/rejected": 0.1222977414727211,
"logps/chosen": -110.92390441894531,
"logps/rejected": -119.34083557128906,
"loss": 1.306,
"nll_loss": 0.3358796238899231,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -11.092391967773438,
"rewards/margins": 0.841692328453064,
"rewards/rejected": -11.934083938598633,
"step": 320
},
{
"epoch": 0.6944444444444444,
"grad_norm": 55.51527877131126,
"learning_rate": 2.5866555124134577e-07,
"logits/chosen": 0.03731069713830948,
"logits/rejected": 0.012952113524079323,
"logps/chosen": -145.85842895507812,
"logps/rejected": -154.74974060058594,
"loss": 1.3157,
"nll_loss": 0.31585749983787537,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -14.585843086242676,
"rewards/margins": 0.8891321420669556,
"rewards/rejected": -15.474973678588867,
"step": 325
},
{
"epoch": 0.7051282051282052,
"grad_norm": 52.22477831416299,
"learning_rate": 2.424987304522924e-07,
"logits/chosen": 0.07864940166473389,
"logits/rejected": 0.1205131784081459,
"logps/chosen": -111.40937805175781,
"logps/rejected": -114.6185073852539,
"loss": 1.3989,
"nll_loss": 0.3170923590660095,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -11.140935897827148,
"rewards/margins": 0.3209128677845001,
"rewards/rejected": -11.4618501663208,
"step": 330
},
{
"epoch": 0.7158119658119658,
"grad_norm": 59.70611600791039,
"learning_rate": 2.2669033982974944e-07,
"logits/chosen": -0.060904957354068756,
"logits/rejected": 0.005220590624958277,
"logps/chosen": -142.15638732910156,
"logps/rejected": -146.87728881835938,
"loss": 1.3442,
"nll_loss": 0.3452945053577423,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -14.2156400680542,
"rewards/margins": 0.472089946269989,
"rewards/rejected": -14.687728881835938,
"step": 335
},
{
"epoch": 0.7264957264957265,
"grad_norm": 71.85103597981242,
"learning_rate": 2.1126238394127867e-07,
"logits/chosen": -0.016889113932847977,
"logits/rejected": -0.0013866141671314836,
"logps/chosen": -133.07833862304688,
"logps/rejected": -141.3354949951172,
"loss": 1.3755,
"nll_loss": 0.294593870639801,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -13.307835578918457,
"rewards/margins": 0.8257135152816772,
"rewards/rejected": -14.133550643920898,
"step": 340
},
{
"epoch": 0.7371794871794872,
"grad_norm": 55.16644312498042,
"learning_rate": 1.9623633780643155e-07,
"logits/chosen": 0.07130751758813858,
"logits/rejected": 0.010698718018829823,
"logps/chosen": -111.66865539550781,
"logps/rejected": -113.45048522949219,
"loss": 1.3458,
"nll_loss": 0.42604750394821167,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -11.166866302490234,
"rewards/margins": 0.1781845986843109,
"rewards/rejected": -11.345048904418945,
"step": 345
},
{
"epoch": 0.7478632478632479,
"grad_norm": 56.371622200044676,
"learning_rate": 1.8163311700448898e-07,
"logits/chosen": -0.0578581877052784,
"logits/rejected": -0.026563648134469986,
"logps/chosen": -113.5242919921875,
"logps/rejected": -123.50186920166016,
"loss": 1.3004,
"nll_loss": 0.35681912302970886,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -11.352429389953613,
"rewards/margins": 0.9977572560310364,
"rewards/rejected": -12.350186347961426,
"step": 350
},
{
"epoch": 0.7585470085470085,
"grad_norm": 62.52480406001481,
"learning_rate": 1.674730485609166e-07,
"logits/chosen": -0.012843991629779339,
"logits/rejected": -0.071006640791893,
"logps/chosen": -118.711181640625,
"logps/rejected": -130.95870971679688,
"loss": 1.2971,
"nll_loss": 0.3193722665309906,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -11.871118545532227,
"rewards/margins": 1.22475266456604,
"rewards/rejected": -13.095870971679688,
"step": 355
},
{
"epoch": 0.7692307692307693,
"grad_norm": 55.56720356363359,
"learning_rate": 1.537758426530622e-07,
"logits/chosen": -0.01560185570269823,
"logits/rejected": 0.07330085337162018,
"logps/chosen": -118.5744400024414,
"logps/rejected": -115.8942642211914,
"loss": 1.4566,
"nll_loss": 0.38159191608428955,
"rewards/accuracies": 0.5,
"rewards/chosen": -11.857443809509277,
"rewards/margins": -0.26801711320877075,
"rewards/rejected": -11.589426040649414,
"step": 360
},
{
"epoch": 0.7799145299145299,
"grad_norm": 53.63595812481311,
"learning_rate": 1.4056056517447634e-07,
"logits/chosen": 0.0017164063174277544,
"logits/rejected": 0.017641058191657066,
"logps/chosen": -107.86031341552734,
"logps/rejected": -113.5797119140625,
"loss": 1.3115,
"nll_loss": 0.328066885471344,
"rewards/accuracies": 0.625,
"rewards/chosen": -10.786030769348145,
"rewards/margins": 0.5719406008720398,
"rewards/rejected": -11.357972145080566,
"step": 365
},
{
"epoch": 0.7905982905982906,
"grad_norm": 46.826432951695814,
"learning_rate": 1.2784561119604682e-07,
"logits/chosen": -0.038930945098400116,
"logits/rejected": -0.10993669927120209,
"logps/chosen": -128.37625122070312,
"logps/rejected": -142.15855407714844,
"loss": 1.3305,
"nll_loss": 0.38407421112060547,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -12.837625503540039,
"rewards/margins": 1.3782320022583008,
"rewards/rejected": -14.215856552124023,
"step": 370
},
{
"epoch": 0.8012820512820513,
"grad_norm": 54.285824433116986,
"learning_rate": 1.156486793608899e-07,
"logits/chosen": -0.0009039134019985795,
"logits/rejected": -0.0710187703371048,
"logps/chosen": -118.76985168457031,
"logps/rejected": -133.30014038085938,
"loss": 1.2896,
"nll_loss": 0.3520449101924896,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -11.876985549926758,
"rewards/margins": 1.4530264139175415,
"rewards/rejected": -13.330012321472168,
"step": 375
},
{
"epoch": 0.811965811965812,
"grad_norm": 52.729935383003465,
"learning_rate": 1.0398674724863581e-07,
"logits/chosen": -0.009277289733290672,
"logits/rejected": 0.07069944590330124,
"logps/chosen": -118.21263122558594,
"logps/rejected": -118.44798278808594,
"loss": 1.3282,
"nll_loss": 0.3493942320346832,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -11.821264266967773,
"rewards/margins": 0.02353498339653015,
"rewards/rejected": -11.844799041748047,
"step": 380
},
{
"epoch": 0.8226495726495726,
"grad_norm": 64.0051849097744,
"learning_rate": 9.287604774340235e-08,
"logits/chosen": 0.08045725524425507,
"logits/rejected": 0.1136372834444046,
"logps/chosen": -118.67901611328125,
"logps/rejected": -126.129638671875,
"loss": 1.2627,
"nll_loss": 0.3460482656955719,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -11.867900848388672,
"rewards/margins": 0.745063304901123,
"rewards/rejected": -12.612964630126953,
"step": 385
},
{
"epoch": 0.8333333333333334,
"grad_norm": 57.70135234770639,
"learning_rate": 8.233204643835234e-08,
"logits/chosen": -0.02369830384850502,
"logits/rejected": -0.13596853613853455,
"logps/chosen": -144.19119262695312,
"logps/rejected": -155.91761779785156,
"loss": 1.2748,
"nll_loss": 0.3630937933921814,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -14.419118881225586,
"rewards/margins": 1.1726421117782593,
"rewards/rejected": -15.591761589050293,
"step": 390
},
{
"epoch": 0.844017094017094,
"grad_norm": 58.46522179629448,
"learning_rate": 7.236942010828429e-08,
"logits/chosen": -0.08849911391735077,
"logits/rejected": -0.05617945268750191,
"logps/chosen": -123.7318344116211,
"logps/rejected": -124.00958251953125,
"loss": 1.3258,
"nll_loss": 0.3757060170173645,
"rewards/accuracies": 0.5,
"rewards/chosen": -12.373184204101562,
"rewards/margins": 0.027775108814239502,
"rewards/rejected": -12.400957107543945,
"step": 395
},
{
"epoch": 0.8547008547008547,
"grad_norm": 57.911987107984345,
"learning_rate": 6.300203628022271e-08,
"logits/chosen": -0.08188799023628235,
"logits/rejected": -0.1166069284081459,
"logps/chosen": -125.78984069824219,
"logps/rejected": -133.1624755859375,
"loss": 1.2968,
"nll_loss": 0.37085098028182983,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -12.578984260559082,
"rewards/margins": 0.7372626066207886,
"rewards/rejected": -13.316246032714844,
"step": 400
},
{
"epoch": 0.8547008547008547,
"eval_logits/chosen": 0.1708301454782486,
"eval_logits/rejected": 0.21349689364433289,
"eval_logps/chosen": -131.8848114013672,
"eval_logps/rejected": -143.98866271972656,
"eval_loss": 1.3273688554763794,
"eval_nll_loss": 0.3287227749824524,
"eval_rewards/accuracies": 0.7016128897666931,
"eval_rewards/chosen": -13.188480377197266,
"eval_rewards/margins": 1.2103854417800903,
"eval_rewards/rejected": -14.398866653442383,
"eval_runtime": 102.9227,
"eval_samples_per_second": 19.053,
"eval_steps_per_second": 0.301,
"step": 400
},
{
"epoch": 0.8653846153846154,
"grad_norm": 52.22311027332078,
"learning_rate": 5.42429339304461e-08,
"logits/chosen": 0.03789149597287178,
"logits/rejected": -0.024609360843896866,
"logps/chosen": -125.41561126708984,
"logps/rejected": -138.07968139648438,
"loss": 1.1837,
"nll_loss": 0.32930082082748413,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -12.541561126708984,
"rewards/margins": 1.2664083242416382,
"rewards/rejected": -13.807971000671387,
"step": 405
},
{
"epoch": 0.8760683760683761,
"grad_norm": 59.39997515872638,
"learning_rate": 4.610430533481857e-08,
"logits/chosen": -0.09911607950925827,
"logits/rejected": -0.08993721008300781,
"logps/chosen": -114.1660385131836,
"logps/rejected": -119.9376449584961,
"loss": 1.3091,
"nll_loss": 0.3979756832122803,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -11.416604042053223,
"rewards/margins": 0.5771591067314148,
"rewards/rejected": -11.993764877319336,
"step": 410
},
{
"epoch": 0.8867521367521367,
"grad_norm": 60.29868077187127,
"learning_rate": 3.859747909769162e-08,
"logits/chosen": -0.0670127421617508,
"logits/rejected": -0.014563268050551414,
"logps/chosen": -157.24630737304688,
"logps/rejected": -162.0565948486328,
"loss": 1.413,
"nll_loss": 0.33768731355667114,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -15.724632263183594,
"rewards/margins": 0.48102617263793945,
"rewards/rejected": -16.205659866333008,
"step": 415
},
{
"epoch": 0.8974358974358975,
"grad_norm": 60.41014318466645,
"learning_rate": 3.173290438299697e-08,
"logits/chosen": 0.08699943125247955,
"logits/rejected": 0.08032406866550446,
"logps/chosen": -125.44620513916016,
"logps/rejected": -134.0313720703125,
"loss": 1.3516,
"nll_loss": 0.35607296228408813,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -12.5446195602417,
"rewards/margins": 0.8585165739059448,
"rewards/rejected": -13.40313720703125,
"step": 420
},
{
"epoch": 0.9081196581196581,
"grad_norm": 55.3705487410181,
"learning_rate": 2.5520136369481194e-08,
"logits/chosen": -0.04224336892366409,
"logits/rejected": -0.06296161562204361,
"logps/chosen": -165.10733032226562,
"logps/rejected": -172.8993682861328,
"loss": 1.2141,
"nll_loss": 0.3679961562156677,
"rewards/accuracies": 0.625,
"rewards/chosen": -16.510732650756836,
"rewards/margins": 0.7792031168937683,
"rewards/rejected": -17.289936065673828,
"step": 425
},
{
"epoch": 0.9188034188034188,
"grad_norm": 54.089208241870814,
"learning_rate": 1.996782295032745e-08,
"logits/chosen": -0.14895446598529816,
"logits/rejected": -0.13183912634849548,
"logps/chosen": -142.65121459960938,
"logps/rejected": -148.99490356445312,
"loss": 1.2328,
"nll_loss": 0.31830939650535583,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -14.265121459960938,
"rewards/margins": 0.6343703269958496,
"rewards/rejected": -14.899490356445312,
"step": 430
},
{
"epoch": 0.9294871794871795,
"grad_norm": 55.028012487261535,
"learning_rate": 1.508369269567783e-08,
"logits/chosen": 0.002587652299553156,
"logits/rejected": -0.038656361401081085,
"logps/chosen": -132.2308349609375,
"logps/rejected": -143.6236114501953,
"loss": 1.3378,
"nll_loss": 0.35204577445983887,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -13.22308349609375,
"rewards/margins": 1.1392773389816284,
"rewards/rejected": -14.362360954284668,
"step": 435
},
{
"epoch": 0.9401709401709402,
"grad_norm": 65.09993319908288,
"learning_rate": 1.0874544094811422e-08,
"logits/chosen": 0.02723981812596321,
"logits/rejected": 0.026110615581274033,
"logps/chosen": -121.16329193115234,
"logps/rejected": -125.1588134765625,
"loss": 1.3971,
"nll_loss": 0.3366636037826538,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -12.116328239440918,
"rewards/margins": 0.3995509743690491,
"rewards/rejected": -12.515880584716797,
"step": 440
},
{
"epoch": 0.9508547008547008,
"grad_norm": 58.673331587997495,
"learning_rate": 7.346236092954316e-09,
"logits/chosen": -0.07683765143156052,
"logits/rejected": -0.08344952762126923,
"logps/chosen": -159.8972625732422,
"logps/rejected": -165.53884887695312,
"loss": 1.3386,
"nll_loss": 0.3503979444503784,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -15.989726066589355,
"rewards/margins": 0.5641571879386902,
"rewards/rejected": -16.553882598876953,
"step": 445
},
{
"epoch": 0.9615384615384616,
"grad_norm": 54.28528572494338,
"learning_rate": 4.50367993589107e-09,
"logits/chosen": -0.16253122687339783,
"logits/rejected": -0.07185138761997223,
"logps/chosen": -142.61741638183594,
"logps/rejected": -142.0267791748047,
"loss": 1.3953,
"nll_loss": 0.4227580428123474,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -14.261739730834961,
"rewards/margins": -0.05906330421566963,
"rewards/rejected": -14.202677726745605,
"step": 450
},
{
"epoch": 0.9722222222222222,
"grad_norm": 57.751616603455034,
"learning_rate": 2.3508323337321224e-09,
"logits/chosen": 0.05629957839846611,
"logits/rejected": 0.04388625547289848,
"logps/chosen": -109.04362487792969,
"logps/rejected": -116.85518646240234,
"loss": 1.3543,
"nll_loss": 0.4259931445121765,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -10.904363632202148,
"rewards/margins": 0.7811557650566101,
"rewards/rejected": -11.685519218444824,
"step": 455
},
{
"epoch": 0.9829059829059829,
"grad_norm": 53.54066392797414,
"learning_rate": 8.906899533517864e-10,
"logits/chosen": 0.12563523650169373,
"logits/rejected": 0.1722358763217926,
"logps/chosen": -120.39036560058594,
"logps/rejected": -125.317138671875,
"loss": 1.31,
"nll_loss": 0.3701631426811218,
"rewards/accuracies": 0.625,
"rewards/chosen": -12.039037704467773,
"rewards/margins": 0.49267855286598206,
"rewards/rejected": -12.531715393066406,
"step": 460
},
{
"epoch": 0.9935897435897436,
"grad_norm": 54.485812172434166,
"learning_rate": 1.252852471625987e-10,
"logits/chosen": 0.047066349536180496,
"logits/rejected": 0.08690531551837921,
"logps/chosen": -110.74609375,
"logps/rejected": -122.6104965209961,
"loss": 1.1667,
"nll_loss": 0.3447542190551758,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -11.074609756469727,
"rewards/margins": 1.1864404678344727,
"rewards/rejected": -12.261051177978516,
"step": 465
},
{
"epoch": 1.0,
"step": 468,
"total_flos": 0.0,
"train_loss": 1.4382328207676227,
"train_runtime": 9608.0854,
"train_samples_per_second": 6.232,
"train_steps_per_second": 0.049
}
],
"logging_steps": 5,
"max_steps": 468,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}