{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 400, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010683760683760684, "grad_norm": 53.325706395174244, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -0.129314124584198, "logits/rejected": -0.1248931884765625, "logps/chosen": -135.08358764648438, "logps/rejected": -137.43325805664062, "loss": 1.7058, "nll_loss": 0.33312344551086426, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -13.508357048034668, "rewards/margins": 0.23496881127357483, "rewards/rejected": -13.743327140808105, "step": 5 }, { "epoch": 0.021367521367521368, "grad_norm": 55.47829235394904, "learning_rate": 2.127659574468085e-07, "logits/chosen": -0.14523069560527802, "logits/rejected": -0.12604156136512756, "logps/chosen": -138.52426147460938, "logps/rejected": -138.5316619873047, "loss": 1.8327, "nll_loss": 0.3906691372394562, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -13.85242748260498, "rewards/margins": 0.0007393002742901444, "rewards/rejected": -13.853166580200195, "step": 10 }, { "epoch": 0.03205128205128205, "grad_norm": 49.43226333748803, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.11341211944818497, "logits/rejected": -0.049495745450258255, "logps/chosen": -137.65377807617188, "logps/rejected": -135.8640594482422, "loss": 1.8016, "nll_loss": 0.38431602716445923, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -13.765378952026367, "rewards/margins": -0.1789727509021759, "rewards/rejected": -13.586407661437988, "step": 15 }, { "epoch": 0.042735042735042736, "grad_norm": 49.78305628254871, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.016971366479992867, "logits/rejected": 0.0019870593678206205, "logps/chosen": -114.83842468261719, "logps/rejected": -117.45811462402344, "loss": 1.9135, "nll_loss": 0.3634462356567383, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -11.483842849731445, "rewards/margins": 0.2619696259498596, "rewards/rejected": -11.74581241607666, "step": 20 }, { "epoch": 0.053418803418803416, "grad_norm": 51.145326847505586, "learning_rate": 5.319148936170212e-07, "logits/chosen": -0.07376699149608612, "logits/rejected": -0.0683017149567604, "logps/chosen": -117.89664459228516, "logps/rejected": -117.4900894165039, "loss": 1.9047, "nll_loss": 0.36660850048065186, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -11.789665222167969, "rewards/margins": -0.040656279772520065, "rewards/rejected": -11.749008178710938, "step": 25 }, { "epoch": 0.0641025641025641, "grad_norm": 49.16493361461768, "learning_rate": 6.382978723404255e-07, "logits/chosen": -0.025973210111260414, "logits/rejected": -0.06585584580898285, "logps/chosen": -125.08711242675781, "logps/rejected": -131.66029357910156, "loss": 1.7073, "nll_loss": 0.32747945189476013, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -12.508711814880371, "rewards/margins": 0.6573159694671631, "rewards/rejected": -13.166028022766113, "step": 30 }, { "epoch": 0.07478632478632478, "grad_norm": 47.421405220624315, "learning_rate": 7.446808510638297e-07, "logits/chosen": -0.21625056862831116, "logits/rejected": -0.2136630266904831, "logps/chosen": -132.8556671142578, "logps/rejected": -141.15939331054688, "loss": 1.7525, "nll_loss": 0.34708237648010254, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -13.285565376281738, "rewards/margins": 0.8303732872009277, "rewards/rejected": -14.115941047668457, "step": 35 }, { "epoch": 0.08547008547008547, "grad_norm": 51.87443941562729, "learning_rate": 8.51063829787234e-07, "logits/chosen": -0.17511573433876038, "logits/rejected": -0.1541801393032074, "logps/chosen": -90.94963073730469, "logps/rejected": -96.69755554199219, "loss": 1.6671, "nll_loss": 0.3230934143066406, "rewards/accuracies": 0.625, "rewards/chosen": -9.094963073730469, "rewards/margins": 0.5747929215431213, "rewards/rejected": -9.669755935668945, "step": 40 }, { "epoch": 0.09615384615384616, "grad_norm": 52.47331055200087, "learning_rate": 9.574468085106384e-07, "logits/chosen": -0.1329955905675888, "logits/rejected": -0.1524827778339386, "logps/chosen": -109.51045989990234, "logps/rejected": -113.86258697509766, "loss": 1.7698, "nll_loss": 0.35021230578422546, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -10.951045036315918, "rewards/margins": 0.43521156907081604, "rewards/rejected": -11.386259078979492, "step": 45 }, { "epoch": 0.10683760683760683, "grad_norm": 47.963761732132916, "learning_rate": 9.998747147528373e-07, "logits/chosen": -0.19674669206142426, "logits/rejected": -0.17990216612815857, "logps/chosen": -133.21511840820312, "logps/rejected": -130.31539916992188, "loss": 1.632, "nll_loss": 0.3093932569026947, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -13.321512222290039, "rewards/margins": -0.28997141122817993, "rewards/rejected": -13.031539916992188, "step": 50 }, { "epoch": 0.11752136752136752, "grad_norm": 51.94560167409897, "learning_rate": 9.991093100466482e-07, "logits/chosen": -0.21401865780353546, "logits/rejected": -0.19104179739952087, "logps/chosen": -110.11128234863281, "logps/rejected": -112.35163879394531, "loss": 1.5823, "nll_loss": 0.2831841707229614, "rewards/accuracies": 0.375, "rewards/chosen": -11.011127471923828, "rewards/margins": 0.22403621673583984, "rewards/rejected": -11.235164642333984, "step": 55 }, { "epoch": 0.1282051282051282, "grad_norm": 58.49154963276172, "learning_rate": 9.976491676662678e-07, "logits/chosen": -0.13603931665420532, "logits/rejected": -0.18451443314552307, "logps/chosen": -119.95387268066406, "logps/rejected": -133.7266845703125, "loss": 1.6162, "nll_loss": 0.2651820778846741, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -11.995387077331543, "rewards/margins": 1.3772820234298706, "rewards/rejected": -13.37267017364502, "step": 60 }, { "epoch": 0.1388888888888889, "grad_norm": 54.3184236422879, "learning_rate": 9.95496320064109e-07, "logits/chosen": -0.2574421763420105, "logits/rejected": -0.1615368276834488, "logps/chosen": -102.09004211425781, "logps/rejected": -92.2038345336914, "loss": 1.5964, "nll_loss": 0.27036991715431213, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -10.209004402160645, "rewards/margins": -0.9886210560798645, "rewards/rejected": -9.220383644104004, "step": 65 }, { "epoch": 0.14957264957264957, "grad_norm": 59.11629390354106, "learning_rate": 9.926537639070456e-07, "logits/chosen": -0.28000539541244507, "logits/rejected": -0.23213541507720947, "logps/chosen": -119.5620346069336, "logps/rejected": -123.65787506103516, "loss": 1.7745, "nll_loss": 0.36628904938697815, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -11.95620346069336, "rewards/margins": 0.4095849096775055, "rewards/rejected": -12.365787506103516, "step": 70 }, { "epoch": 0.16025641025641027, "grad_norm": 57.53497747164318, "learning_rate": 9.891254559051884e-07, "logits/chosen": -0.1931193619966507, "logits/rejected": -0.15742453932762146, "logps/chosen": -115.27913665771484, "logps/rejected": -126.52079772949219, "loss": 1.5256, "nll_loss": 0.3370510935783386, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -11.527913093566895, "rewards/margins": 1.1241672039031982, "rewards/rejected": -12.652081489562988, "step": 75 }, { "epoch": 0.17094017094017094, "grad_norm": 52.58145471881646, "learning_rate": 9.849163073043223e-07, "logits/chosen": -0.12034114450216293, "logits/rejected": -0.09974785149097443, "logps/chosen": -134.50930786132812, "logps/rejected": -125.11856842041016, "loss": 1.7211, "nll_loss": 0.3088015913963318, "rewards/accuracies": 0.375, "rewards/chosen": -13.450933456420898, "rewards/margins": -0.939074695110321, "rewards/rejected": -12.511857986450195, "step": 80 }, { "epoch": 0.18162393162393162, "grad_norm": 52.38736187449637, "learning_rate": 9.800321770496724e-07, "logits/chosen": -0.0975460559129715, "logits/rejected": -0.0958404392004013, "logps/chosen": -89.18054962158203, "logps/rejected": -94.65280151367188, "loss": 1.4466, "nll_loss": 0.301828533411026, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -8.918054580688477, "rewards/margins": 0.5472255945205688, "rewards/rejected": -9.465280532836914, "step": 85 }, { "epoch": 0.19230769230769232, "grad_norm": 52.449979107103914, "learning_rate": 9.744798636305187e-07, "logits/chosen": -0.18882372975349426, "logits/rejected": -0.18104039132595062, "logps/chosen": -94.19184875488281, "logps/rejected": -104.16746520996094, "loss": 1.3906, "nll_loss": 0.26845496892929077, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9.419185638427734, "rewards/margins": 0.9975622892379761, "rewards/rejected": -10.416748046875, "step": 90 }, { "epoch": 0.202991452991453, "grad_norm": 56.906972668285476, "learning_rate": 9.68267095617003e-07, "logits/chosen": -0.17206290364265442, "logits/rejected": -0.12139072269201279, "logps/chosen": -85.0337142944336, "logps/rejected": -85.30671691894531, "loss": 1.627, "nll_loss": 0.3232787847518921, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -8.50337028503418, "rewards/margins": 0.027300655841827393, "rewards/rejected": -8.530672073364258, "step": 95 }, { "epoch": 0.21367521367521367, "grad_norm": 56.063471274711276, "learning_rate": 9.614025209023083e-07, "logits/chosen": -0.21710339188575745, "logits/rejected": -0.187973290681839, "logps/chosen": -129.84634399414062, "logps/rejected": -132.91452026367188, "loss": 1.4296, "nll_loss": 0.2875466048717499, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -12.984634399414062, "rewards/margins": 0.3068179488182068, "rewards/rejected": -13.291452407836914, "step": 100 }, { "epoch": 0.22435897435897437, "grad_norm": 54.253076126268894, "learning_rate": 9.538956946651815e-07, "logits/chosen": -0.05166339874267578, "logits/rejected": 0.029095903038978577, "logps/chosen": -104.00286865234375, "logps/rejected": -113.63838958740234, "loss": 1.4212, "nll_loss": 0.2850767970085144, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -10.400287628173828, "rewards/margins": 0.9635505676269531, "rewards/rejected": -11.363838195800781, "step": 105 }, { "epoch": 0.23504273504273504, "grad_norm": 65.81870203591372, "learning_rate": 9.457570660695539e-07, "logits/chosen": -0.07271315902471542, "logits/rejected": -0.11916762590408325, "logps/chosen": -127.29766845703125, "logps/rejected": -130.58584594726562, "loss": 1.5306, "nll_loss": 0.33747240900993347, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -12.729766845703125, "rewards/margins": 0.3288170099258423, "rewards/rejected": -13.05858325958252, "step": 110 }, { "epoch": 0.24572649572649571, "grad_norm": 58.367624421766024, "learning_rate": 9.369979637197774e-07, "logits/chosen": -0.12270005792379379, "logits/rejected": -0.17853489518165588, "logps/chosen": -106.76663970947266, "logps/rejected": -109.66035461425781, "loss": 1.5831, "nll_loss": 0.2727692425251007, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -10.676663398742676, "rewards/margins": 0.2893708348274231, "rewards/rejected": -10.966034889221191, "step": 115 }, { "epoch": 0.2564102564102564, "grad_norm": 53.75246999549208, "learning_rate": 9.276305798917158e-07, "logits/chosen": -0.047394849359989166, "logits/rejected": -0.08673441410064697, "logps/chosen": -113.4908447265625, "logps/rejected": -123.5615234375, "loss": 1.551, "nll_loss": 0.28447219729423523, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -11.34908390045166, "rewards/margins": 1.0070677995681763, "rewards/rejected": -12.35615348815918, "step": 120 }, { "epoch": 0.2670940170940171, "grad_norm": 64.46832867529025, "learning_rate": 9.176679535616476e-07, "logits/chosen": 0.08038081228733063, "logits/rejected": 0.05699559301137924, "logps/chosen": -112.93634033203125, "logps/rejected": -125.99974060058594, "loss": 1.4662, "nll_loss": 0.35629984736442566, "rewards/accuracies": 0.75, "rewards/chosen": -11.293634414672852, "rewards/margins": 1.3063404560089111, "rewards/rejected": -12.599973678588867, "step": 125 }, { "epoch": 0.2777777777777778, "grad_norm": 55.82818467114938, "learning_rate": 9.071239522565976e-07, "logits/chosen": -0.07054271548986435, "logits/rejected": -0.024266820400953293, "logps/chosen": -113.1104507446289, "logps/rejected": -116.25431060791016, "loss": 1.4477, "nll_loss": 0.33033448457717896, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -11.311044692993164, "rewards/margins": 0.3143869638442993, "rewards/rejected": -11.625432014465332, "step": 130 }, { "epoch": 0.28846153846153844, "grad_norm": 59.467620258292605, "learning_rate": 8.960132527513642e-07, "logits/chosen": -0.06084425374865532, "logits/rejected": -0.062250006943941116, "logps/chosen": -127.11384582519531, "logps/rejected": -127.90572357177734, "loss": 1.5105, "nll_loss": 0.354878306388855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -12.711384773254395, "rewards/margins": 0.07918860763311386, "rewards/rejected": -12.790571212768555, "step": 135 }, { "epoch": 0.29914529914529914, "grad_norm": 67.52329899130741, "learning_rate": 8.8435132063911e-07, "logits/chosen": -0.06606234610080719, "logits/rejected": -0.02088196575641632, "logps/chosen": -135.88082885742188, "logps/rejected": -141.05441284179688, "loss": 1.3854, "nll_loss": 0.35649529099464417, "rewards/accuracies": 0.625, "rewards/chosen": -13.58808422088623, "rewards/margins": 0.5173591375350952, "rewards/rejected": -14.105443000793457, "step": 140 }, { "epoch": 0.30982905982905984, "grad_norm": 66.6282576658114, "learning_rate": 8.721543888039532e-07, "logits/chosen": -0.14654412865638733, "logits/rejected": -0.15081673860549927, "logps/chosen": -135.72756958007812, "logps/rejected": -131.85450744628906, "loss": 1.4429, "nll_loss": 0.3281570076942444, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -13.57275676727295, "rewards/margins": -0.38730812072753906, "rewards/rejected": -13.185449600219727, "step": 145 }, { "epoch": 0.32051282051282054, "grad_norm": 57.916364691190665, "learning_rate": 8.594394348255237e-07, "logits/chosen": -0.2932497560977936, "logits/rejected": -0.25581642985343933, "logps/chosen": -141.12599182128906, "logps/rejected": -141.9236602783203, "loss": 1.4486, "nll_loss": 0.329673707485199, "rewards/accuracies": 0.625, "rewards/chosen": -14.11259937286377, "rewards/margins": 0.07976653426885605, "rewards/rejected": -14.192365646362305, "step": 150 }, { "epoch": 0.3311965811965812, "grad_norm": 54.1500700625341, "learning_rate": 8.462241573469377e-07, "logits/chosen": -0.19544358551502228, "logits/rejected": -0.1730412244796753, "logps/chosen": -145.17381286621094, "logps/rejected": -144.12461853027344, "loss": 1.4251, "nll_loss": 0.34274980425834656, "rewards/accuracies": 0.5, "rewards/chosen": -14.51738166809082, "rewards/margins": -0.10491929203271866, "rewards/rejected": -14.412463188171387, "step": 155 }, { "epoch": 0.3418803418803419, "grad_norm": 62.757464156888936, "learning_rate": 8.325269514390834e-07, "logits/chosen": 0.013430899009108543, "logits/rejected": -0.037021975964307785, "logps/chosen": -111.84146881103516, "logps/rejected": -128.94786071777344, "loss": 1.4507, "nll_loss": 0.315662145614624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -11.184146881103516, "rewards/margins": 1.7106406688690186, "rewards/rejected": -12.894787788391113, "step": 160 }, { "epoch": 0.3525641025641026, "grad_norm": 50.800038470428575, "learning_rate": 8.183668829955111e-07, "logits/chosen": -0.3039621412754059, "logits/rejected": -0.287003755569458, "logps/chosen": -139.82981872558594, "logps/rejected": -146.7205352783203, "loss": 1.37, "nll_loss": 0.3524821996688843, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -13.982983589172363, "rewards/margins": 0.6890703439712524, "rewards/rejected": -14.672053337097168, "step": 165 }, { "epoch": 0.36324786324786323, "grad_norm": 59.841427525159546, "learning_rate": 8.037636621935684e-07, "logits/chosen": -0.31735068559646606, "logits/rejected": -0.23542420566082, "logps/chosen": -103.27725982666016, "logps/rejected": -106.25889587402344, "loss": 1.3626, "nll_loss": 0.3173479437828064, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -10.327725410461426, "rewards/margins": 0.2981634736061096, "rewards/rejected": -10.625889778137207, "step": 170 }, { "epoch": 0.37393162393162394, "grad_norm": 52.9514862200353, "learning_rate": 7.887376160587213e-07, "logits/chosen": -0.20337620377540588, "logits/rejected": -0.17184031009674072, "logps/chosen": -116.390869140625, "logps/rejected": -118.56620788574219, "loss": 1.3521, "nll_loss": 0.3265441358089447, "rewards/accuracies": 0.625, "rewards/chosen": -11.639086723327637, "rewards/margins": 0.21753445267677307, "rewards/rejected": -11.856620788574219, "step": 175 }, { "epoch": 0.38461538461538464, "grad_norm": 50.63958304716666, "learning_rate": 7.733096601702507e-07, "logits/chosen": -0.012370765209197998, "logits/rejected": 0.05527879670262337, "logps/chosen": -113.68470764160156, "logps/rejected": -110.1121597290039, "loss": 1.3928, "nll_loss": 0.3376830518245697, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -11.368470191955566, "rewards/margins": -0.3572540283203125, "rewards/rejected": -11.01121711730957, "step": 180 }, { "epoch": 0.3952991452991453, "grad_norm": 53.743159898285576, "learning_rate": 7.575012695477076e-07, "logits/chosen": 0.06333889812231064, "logits/rejected": 0.09429727494716644, "logps/chosen": -110.7640609741211, "logps/rejected": -115.84847259521484, "loss": 1.5223, "nll_loss": 0.3274967670440674, "rewards/accuracies": 0.625, "rewards/chosen": -11.076406478881836, "rewards/margins": 0.5084413290023804, "rewards/rejected": -11.584847450256348, "step": 185 }, { "epoch": 0.405982905982906, "grad_norm": 56.40313393417172, "learning_rate": 7.413344487586542e-07, "logits/chosen": -0.08225846290588379, "logits/rejected": -0.055325210094451904, "logps/chosen": -116.2511978149414, "logps/rejected": -132.88140869140625, "loss": 1.4658, "nll_loss": 0.3596312403678894, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -11.62511920928955, "rewards/margins": 1.6630210876464844, "rewards/rejected": -13.288141250610352, "step": 190 }, { "epoch": 0.4166666666666667, "grad_norm": 54.645494345995765, "learning_rate": 7.248317012892968e-07, "logits/chosen": -0.1261519491672516, "logits/rejected": -0.16661445796489716, "logps/chosen": -125.1494140625, "logps/rejected": -137.20791625976562, "loss": 1.4354, "nll_loss": 0.3696037232875824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -12.514939308166504, "rewards/margins": 1.2058517932891846, "rewards/rejected": -13.720791816711426, "step": 195 }, { "epoch": 0.42735042735042733, "grad_norm": 61.519785155196736, "learning_rate": 7.08015998220647e-07, "logits/chosen": -0.13151851296424866, "logits/rejected": -0.08859863132238388, "logps/chosen": -165.24661254882812, "logps/rejected": -170.71624755859375, "loss": 1.4629, "nll_loss": 0.34175121784210205, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -16.524662017822266, "rewards/margins": 0.5469658374786377, "rewards/rejected": -17.07162857055664, "step": 200 }, { "epoch": 0.43803418803418803, "grad_norm": 55.57722114588639, "learning_rate": 6.909107462538111e-07, "logits/chosen": -0.19121451675891876, "logits/rejected": -0.20065097510814667, "logps/chosen": -142.35458374023438, "logps/rejected": -150.1237030029297, "loss": 1.4106, "nll_loss": 0.34670525789260864, "rewards/accuracies": 0.625, "rewards/chosen": -14.235458374023438, "rewards/margins": 0.7769120335578918, "rewards/rejected": -15.012370109558105, "step": 205 }, { "epoch": 0.44871794871794873, "grad_norm": 69.86075329234693, "learning_rate": 6.735397551289178e-07, "logits/chosen": -0.1635718047618866, "logits/rejected": -0.1015796884894371, "logps/chosen": -132.5345458984375, "logps/rejected": -135.8687286376953, "loss": 1.4962, "nll_loss": 0.32979699969291687, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -13.253456115722656, "rewards/margins": 0.33341652154922485, "rewards/rejected": -13.586873054504395, "step": 210 }, { "epoch": 0.4594017094017094, "grad_norm": 58.3273614395911, "learning_rate": 6.559272044830316e-07, "logits/chosen": -0.07360713183879852, "logits/rejected": -6.962567567825317e-05, "logps/chosen": -122.03056335449219, "logps/rejected": -129.28549194335938, "loss": 1.3599, "nll_loss": 0.3558313846588135, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -12.203059196472168, "rewards/margins": 0.7254913449287415, "rewards/rejected": -12.928548812866211, "step": 215 }, { "epoch": 0.4700854700854701, "grad_norm": 50.10334120589398, "learning_rate": 6.380976101931879e-07, "logits/chosen": 0.01870536431670189, "logits/rejected": 0.0029756189323961735, "logps/chosen": -107.1243896484375, "logps/rejected": -112.00141906738281, "loss": 1.3138, "nll_loss": 0.3773984909057617, "rewards/accuracies": 0.625, "rewards/chosen": -10.712437629699707, "rewards/margins": 0.48770326375961304, "rewards/rejected": -11.200141906738281, "step": 220 }, { "epoch": 0.4807692307692308, "grad_norm": 57.01257456037248, "learning_rate": 6.200757902513962e-07, "logits/chosen": -0.0710291862487793, "logits/rejected": -0.1122066006064415, "logps/chosen": -132.7135467529297, "logps/rejected": -147.7503204345703, "loss": 1.3819, "nll_loss": 0.3411773443222046, "rewards/accuracies": 0.625, "rewards/chosen": -13.271354675292969, "rewards/margins": 1.5036779642105103, "rewards/rejected": -14.775032043457031, "step": 225 }, { "epoch": 0.49145299145299143, "grad_norm": 56.10732954543998, "learning_rate": 6.018868302191139e-07, "logits/chosen": -0.02616579458117485, "logits/rejected": -0.08220602571964264, "logps/chosen": -109.6393051147461, "logps/rejected": -119.71272277832031, "loss": 1.4894, "nll_loss": 0.32466286420822144, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10.963930130004883, "rewards/margins": 1.0073410272598267, "rewards/rejected": -11.971270561218262, "step": 230 }, { "epoch": 0.5021367521367521, "grad_norm": 57.150169644968244, "learning_rate": 5.835560483092742e-07, "logits/chosen": -0.08456510305404663, "logits/rejected": -0.052301835268735886, "logps/chosen": -110.45853424072266, "logps/rejected": -109.49703216552734, "loss": 1.5332, "nll_loss": 0.35862648487091064, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -11.045854568481445, "rewards/margins": -0.09615027904510498, "rewards/rejected": -10.949705123901367, "step": 235 }, { "epoch": 0.5128205128205128, "grad_norm": 56.92818984806646, "learning_rate": 5.651089601444752e-07, "logits/chosen": -0.25266486406326294, "logits/rejected": -0.27402007579803467, "logps/chosen": -163.11080932617188, "logps/rejected": -170.6371612548828, "loss": 1.3947, "nll_loss": 0.3526006042957306, "rewards/accuracies": 0.625, "rewards/chosen": -16.311084747314453, "rewards/margins": 0.7526326775550842, "rewards/rejected": -17.063716888427734, "step": 240 }, { "epoch": 0.5235042735042735, "grad_norm": 55.824252362448014, "learning_rate": 5.465712432403811e-07, "logits/chosen": -0.1512390673160553, "logits/rejected": -0.15393702685832977, "logps/chosen": -140.1497039794922, "logps/rejected": -155.41702270507812, "loss": 1.2664, "nll_loss": 0.34822720289230347, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -14.014970779418945, "rewards/margins": 1.5267311334609985, "rewards/rejected": -15.541702270507812, "step": 245 }, { "epoch": 0.5341880341880342, "grad_norm": 61.259217624033624, "learning_rate": 5.279687012637798e-07, "logits/chosen": 0.047980885952711105, "logits/rejected": 0.0825057402253151, "logps/chosen": -131.10092163085938, "logps/rejected": -141.0630645751953, "loss": 1.3979, "nll_loss": 0.3648400902748108, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -13.110092163085938, "rewards/margins": 0.9962142109870911, "rewards/rejected": -14.106305122375488, "step": 250 }, { "epoch": 0.5448717948717948, "grad_norm": 54.05289475380886, "learning_rate": 5.093272281150382e-07, "logits/chosen": -0.019353587180376053, "logits/rejected": 0.09568696469068527, "logps/chosen": -133.12542724609375, "logps/rejected": -135.47377014160156, "loss": 1.4605, "nll_loss": 0.3375224173069, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -13.312542915344238, "rewards/margins": 0.2348347008228302, "rewards/rejected": -13.54737663269043, "step": 255 }, { "epoch": 0.5555555555555556, "grad_norm": 55.813930730207794, "learning_rate": 4.906727718849618e-07, "logits/chosen": 0.031209534034132957, "logits/rejected": 0.0828268900513649, "logps/chosen": -109.07014465332031, "logps/rejected": -123.6259994506836, "loss": 1.3422, "nll_loss": 0.2994317412376404, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -10.907014846801758, "rewards/margins": 1.4555847644805908, "rewards/rejected": -12.362600326538086, "step": 260 }, { "epoch": 0.5662393162393162, "grad_norm": 55.164171360826785, "learning_rate": 4.7203129873622036e-07, "logits/chosen": -0.12409428507089615, "logits/rejected": -0.08072350919246674, "logps/chosen": -142.841552734375, "logps/rejected": -144.4403533935547, "loss": 1.3871, "nll_loss": 0.36165937781333923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.284154891967773, "rewards/margins": 0.15988163650035858, "rewards/rejected": -14.444036483764648, "step": 265 }, { "epoch": 0.5769230769230769, "grad_norm": 61.02385741830308, "learning_rate": 4.534287567596188e-07, "logits/chosen": -0.16588857769966125, "logits/rejected": -0.10881330817937851, "logps/chosen": -142.35317993164062, "logps/rejected": -145.8653106689453, "loss": 1.4096, "nll_loss": 0.3703126013278961, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -14.235316276550293, "rewards/margins": 0.35121291875839233, "rewards/rejected": -14.586529731750488, "step": 270 }, { "epoch": 0.5876068376068376, "grad_norm": 58.004085751417605, "learning_rate": 4.348910398555249e-07, "logits/chosen": 0.025946801528334618, "logits/rejected": 0.05849064514040947, "logps/chosen": -112.86296081542969, "logps/rejected": -121.48736572265625, "loss": 1.2885, "nll_loss": 0.3448982238769531, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -11.286294937133789, "rewards/margins": 0.8624418377876282, "rewards/rejected": -12.148736953735352, "step": 275 }, { "epoch": 0.5982905982905983, "grad_norm": 60.093543566265524, "learning_rate": 4.1644395169072575e-07, "logits/chosen": -0.1264003962278366, "logits/rejected": -0.11314131319522858, "logps/chosen": -158.03598022460938, "logps/rejected": -167.41009521484375, "loss": 1.3685, "nll_loss": 0.3446425497531891, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -15.803598403930664, "rewards/margins": 0.9374116063117981, "rewards/rejected": -16.741008758544922, "step": 280 }, { "epoch": 0.6089743589743589, "grad_norm": 63.57104854646875, "learning_rate": 3.9811316978088615e-07, "logits/chosen": -0.08768756687641144, "logits/rejected": -0.06864931434392929, "logps/chosen": -114.5308837890625, "logps/rejected": -113.0389633178711, "loss": 1.3041, "nll_loss": 0.3934231698513031, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -11.453088760375977, "rewards/margins": -0.14919374883174896, "rewards/rejected": -11.303895950317383, "step": 285 }, { "epoch": 0.6196581196581197, "grad_norm": 46.88181022689408, "learning_rate": 3.799242097486038e-07, "logits/chosen": -0.06880898773670197, "logits/rejected": -0.0963631272315979, "logps/chosen": -118.00128173828125, "logps/rejected": -126.088134765625, "loss": 1.3619, "nll_loss": 0.3445819616317749, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -11.800127983093262, "rewards/margins": 0.8086857795715332, "rewards/rejected": -12.60881519317627, "step": 290 }, { "epoch": 0.6303418803418803, "grad_norm": 52.868293692367715, "learning_rate": 3.619023898068123e-07, "logits/chosen": 0.04794033616781235, "logits/rejected": 0.02935163304209709, "logps/chosen": -108.1368408203125, "logps/rejected": -113.2763442993164, "loss": 1.3385, "nll_loss": 0.38119053840637207, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10.813684463500977, "rewards/margins": 0.5139498710632324, "rewards/rejected": -11.327634811401367, "step": 295 }, { "epoch": 0.6410256410256411, "grad_norm": 51.20036019131344, "learning_rate": 3.4407279551696846e-07, "logits/chosen": 0.03451260179281235, "logits/rejected": 0.060915928333997726, "logps/chosen": -117.63232421875, "logps/rejected": -124.12806701660156, "loss": 1.3697, "nll_loss": 0.3462775647640228, "rewards/accuracies": 0.5, "rewards/chosen": -11.763232231140137, "rewards/margins": 0.6495749354362488, "rewards/rejected": -12.412806510925293, "step": 300 }, { "epoch": 0.6517094017094017, "grad_norm": 58.582600175811734, "learning_rate": 3.2646024487108213e-07, "logits/chosen": -0.004241435322910547, "logits/rejected": -0.03947947174310684, "logps/chosen": -144.97406005859375, "logps/rejected": -152.7334442138672, "loss": 1.4605, "nll_loss": 0.34281834959983826, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -14.497406005859375, "rewards/margins": 0.7759408354759216, "rewards/rejected": -15.273347854614258, "step": 305 }, { "epoch": 0.6623931623931624, "grad_norm": 57.95314258124262, "learning_rate": 3.0908925374618887e-07, "logits/chosen": -0.12898002564907074, "logits/rejected": -0.06738940626382828, "logps/chosen": -147.439697265625, "logps/rejected": -149.19259643554688, "loss": 1.3813, "nll_loss": 0.3252050578594208, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -14.74397087097168, "rewards/margins": 0.1752903163433075, "rewards/rejected": -14.91926097869873, "step": 310 }, { "epoch": 0.6730769230769231, "grad_norm": 62.09918976711039, "learning_rate": 2.91984001779353e-07, "logits/chosen": -0.05374965816736221, "logits/rejected": 0.0732099637389183, "logps/chosen": -148.73020935058594, "logps/rejected": -150.61476135253906, "loss": 1.3364, "nll_loss": 0.31267058849334717, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -14.873019218444824, "rewards/margins": 0.18845567107200623, "rewards/rejected": -15.06147575378418, "step": 315 }, { "epoch": 0.6837606837606838, "grad_norm": 50.667972535787875, "learning_rate": 2.751682987107029e-07, "logits/chosen": 0.08584196865558624, "logits/rejected": 0.1222977414727211, "logps/chosen": -110.92390441894531, "logps/rejected": -119.34083557128906, "loss": 1.306, "nll_loss": 0.3358796238899231, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -11.092391967773438, "rewards/margins": 0.841692328453064, "rewards/rejected": -11.934083938598633, "step": 320 }, { "epoch": 0.6944444444444444, "grad_norm": 55.51527877131126, "learning_rate": 2.5866555124134577e-07, "logits/chosen": 0.03731069713830948, "logits/rejected": 0.012952113524079323, "logps/chosen": -145.85842895507812, "logps/rejected": -154.74974060058594, "loss": 1.3157, "nll_loss": 0.31585749983787537, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.585843086242676, "rewards/margins": 0.8891321420669556, "rewards/rejected": -15.474973678588867, "step": 325 }, { "epoch": 0.7051282051282052, "grad_norm": 52.22477831416299, "learning_rate": 2.424987304522924e-07, "logits/chosen": 0.07864940166473389, "logits/rejected": 0.1205131784081459, "logps/chosen": -111.40937805175781, "logps/rejected": -114.6185073852539, "loss": 1.3989, "nll_loss": 0.3170923590660095, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -11.140935897827148, "rewards/margins": 0.3209128677845001, "rewards/rejected": -11.4618501663208, "step": 330 }, { "epoch": 0.7158119658119658, "grad_norm": 59.70611600791039, "learning_rate": 2.2669033982974944e-07, "logits/chosen": -0.060904957354068756, "logits/rejected": 0.005220590624958277, "logps/chosen": -142.15638732910156, "logps/rejected": -146.87728881835938, "loss": 1.3442, "nll_loss": 0.3452945053577423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.2156400680542, "rewards/margins": 0.472089946269989, "rewards/rejected": -14.687728881835938, "step": 335 }, { "epoch": 0.7264957264957265, "grad_norm": 71.85103597981242, "learning_rate": 2.1126238394127867e-07, "logits/chosen": -0.016889113932847977, "logits/rejected": -0.0013866141671314836, "logps/chosen": -133.07833862304688, "logps/rejected": -141.3354949951172, "loss": 1.3755, "nll_loss": 0.294593870639801, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -13.307835578918457, "rewards/margins": 0.8257135152816772, "rewards/rejected": -14.133550643920898, "step": 340 }, { "epoch": 0.7371794871794872, "grad_norm": 55.16644312498042, "learning_rate": 1.9623633780643155e-07, "logits/chosen": 0.07130751758813858, "logits/rejected": 0.010698718018829823, "logps/chosen": -111.66865539550781, "logps/rejected": -113.45048522949219, "loss": 1.3458, "nll_loss": 0.42604750394821167, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -11.166866302490234, "rewards/margins": 0.1781845986843109, "rewards/rejected": -11.345048904418945, "step": 345 }, { "epoch": 0.7478632478632479, "grad_norm": 56.371622200044676, "learning_rate": 1.8163311700448898e-07, "logits/chosen": -0.0578581877052784, "logits/rejected": -0.026563648134469986, "logps/chosen": -113.5242919921875, "logps/rejected": -123.50186920166016, "loss": 1.3004, "nll_loss": 0.35681912302970886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -11.352429389953613, "rewards/margins": 0.9977572560310364, "rewards/rejected": -12.350186347961426, "step": 350 }, { "epoch": 0.7585470085470085, "grad_norm": 62.52480406001481, "learning_rate": 1.674730485609166e-07, "logits/chosen": -0.012843991629779339, "logits/rejected": -0.071006640791893, "logps/chosen": -118.711181640625, "logps/rejected": -130.95870971679688, "loss": 1.2971, "nll_loss": 0.3193722665309906, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -11.871118545532227, "rewards/margins": 1.22475266456604, "rewards/rejected": -13.095870971679688, "step": 355 }, { "epoch": 0.7692307692307693, "grad_norm": 55.56720356363359, "learning_rate": 1.537758426530622e-07, "logits/chosen": -0.01560185570269823, "logits/rejected": 0.07330085337162018, "logps/chosen": -118.5744400024414, "logps/rejected": -115.8942642211914, "loss": 1.4566, "nll_loss": 0.38159191608428955, "rewards/accuracies": 0.5, "rewards/chosen": -11.857443809509277, "rewards/margins": -0.26801711320877075, "rewards/rejected": -11.589426040649414, "step": 360 }, { "epoch": 0.7799145299145299, "grad_norm": 53.63595812481311, "learning_rate": 1.4056056517447634e-07, "logits/chosen": 0.0017164063174277544, "logits/rejected": 0.017641058191657066, "logps/chosen": -107.86031341552734, "logps/rejected": -113.5797119140625, "loss": 1.3115, "nll_loss": 0.328066885471344, "rewards/accuracies": 0.625, "rewards/chosen": -10.786030769348145, "rewards/margins": 0.5719406008720398, "rewards/rejected": -11.357972145080566, "step": 365 }, { "epoch": 0.7905982905982906, "grad_norm": 46.826432951695814, "learning_rate": 1.2784561119604682e-07, "logits/chosen": -0.038930945098400116, "logits/rejected": -0.10993669927120209, "logps/chosen": -128.37625122070312, "logps/rejected": -142.15855407714844, "loss": 1.3305, "nll_loss": 0.38407421112060547, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -12.837625503540039, "rewards/margins": 1.3782320022583008, "rewards/rejected": -14.215856552124023, "step": 370 }, { "epoch": 0.8012820512820513, "grad_norm": 54.285824433116986, "learning_rate": 1.156486793608899e-07, "logits/chosen": -0.0009039134019985795, "logits/rejected": -0.0710187703371048, "logps/chosen": -118.76985168457031, "logps/rejected": -133.30014038085938, "loss": 1.2896, "nll_loss": 0.3520449101924896, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -11.876985549926758, "rewards/margins": 1.4530264139175415, "rewards/rejected": -13.330012321472168, "step": 375 }, { "epoch": 0.811965811965812, "grad_norm": 52.729935383003465, "learning_rate": 1.0398674724863581e-07, "logits/chosen": -0.009277289733290672, "logits/rejected": 0.07069944590330124, "logps/chosen": -118.21263122558594, "logps/rejected": -118.44798278808594, "loss": 1.3282, "nll_loss": 0.3493942320346832, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -11.821264266967773, "rewards/margins": 0.02353498339653015, "rewards/rejected": -11.844799041748047, "step": 380 }, { "epoch": 0.8226495726495726, "grad_norm": 64.0051849097744, "learning_rate": 9.287604774340235e-08, "logits/chosen": 0.08045725524425507, "logits/rejected": 0.1136372834444046, "logps/chosen": -118.67901611328125, "logps/rejected": -126.129638671875, "loss": 1.2627, "nll_loss": 0.3460482656955719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -11.867900848388672, "rewards/margins": 0.745063304901123, "rewards/rejected": -12.612964630126953, "step": 385 }, { "epoch": 0.8333333333333334, "grad_norm": 57.70135234770639, "learning_rate": 8.233204643835234e-08, "logits/chosen": -0.02369830384850502, "logits/rejected": -0.13596853613853455, "logps/chosen": -144.19119262695312, "logps/rejected": -155.91761779785156, "loss": 1.2748, "nll_loss": 0.3630937933921814, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -14.419118881225586, "rewards/margins": 1.1726421117782593, "rewards/rejected": -15.591761589050293, "step": 390 }, { "epoch": 0.844017094017094, "grad_norm": 58.46522179629448, "learning_rate": 7.236942010828429e-08, "logits/chosen": -0.08849911391735077, "logits/rejected": -0.05617945268750191, "logps/chosen": -123.7318344116211, "logps/rejected": -124.00958251953125, "loss": 1.3258, "nll_loss": 0.3757060170173645, "rewards/accuracies": 0.5, "rewards/chosen": -12.373184204101562, "rewards/margins": 0.027775108814239502, "rewards/rejected": -12.400957107543945, "step": 395 }, { "epoch": 0.8547008547008547, "grad_norm": 57.911987107984345, "learning_rate": 6.300203628022271e-08, "logits/chosen": -0.08188799023628235, "logits/rejected": -0.1166069284081459, "logps/chosen": -125.78984069824219, "logps/rejected": -133.1624755859375, "loss": 1.2968, "nll_loss": 0.37085098028182983, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -12.578984260559082, "rewards/margins": 0.7372626066207886, "rewards/rejected": -13.316246032714844, "step": 400 }, { "epoch": 0.8547008547008547, "eval_logits/chosen": 0.1708301454782486, "eval_logits/rejected": 0.21349689364433289, "eval_logps/chosen": -131.8848114013672, "eval_logps/rejected": -143.98866271972656, "eval_loss": 1.3273688554763794, "eval_nll_loss": 0.3287227749824524, "eval_rewards/accuracies": 0.7016128897666931, "eval_rewards/chosen": -13.188480377197266, "eval_rewards/margins": 1.2103854417800903, "eval_rewards/rejected": -14.398866653442383, "eval_runtime": 102.9227, "eval_samples_per_second": 19.053, "eval_steps_per_second": 0.301, "step": 400 }, { "epoch": 0.8653846153846154, "grad_norm": 52.22311027332078, "learning_rate": 5.42429339304461e-08, "logits/chosen": 0.03789149597287178, "logits/rejected": -0.024609360843896866, "logps/chosen": -125.41561126708984, "logps/rejected": -138.07968139648438, "loss": 1.1837, "nll_loss": 0.32930082082748413, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -12.541561126708984, "rewards/margins": 1.2664083242416382, "rewards/rejected": -13.807971000671387, "step": 405 }, { "epoch": 0.8760683760683761, "grad_norm": 59.39997515872638, "learning_rate": 4.610430533481857e-08, "logits/chosen": -0.09911607950925827, "logits/rejected": -0.08993721008300781, "logps/chosen": -114.1660385131836, "logps/rejected": -119.9376449584961, "loss": 1.3091, "nll_loss": 0.3979756832122803, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -11.416604042053223, "rewards/margins": 0.5771591067314148, "rewards/rejected": -11.993764877319336, "step": 410 }, { "epoch": 0.8867521367521367, "grad_norm": 60.29868077187127, "learning_rate": 3.859747909769162e-08, "logits/chosen": -0.0670127421617508, "logits/rejected": -0.014563268050551414, "logps/chosen": -157.24630737304688, "logps/rejected": -162.0565948486328, "loss": 1.413, "nll_loss": 0.33768731355667114, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -15.724632263183594, "rewards/margins": 0.48102617263793945, "rewards/rejected": -16.205659866333008, "step": 415 }, { "epoch": 0.8974358974358975, "grad_norm": 60.41014318466645, "learning_rate": 3.173290438299697e-08, "logits/chosen": 0.08699943125247955, "logits/rejected": 0.08032406866550446, "logps/chosen": -125.44620513916016, "logps/rejected": -134.0313720703125, "loss": 1.3516, "nll_loss": 0.35607296228408813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -12.5446195602417, "rewards/margins": 0.8585165739059448, "rewards/rejected": -13.40313720703125, "step": 420 }, { "epoch": 0.9081196581196581, "grad_norm": 55.3705487410181, "learning_rate": 2.5520136369481194e-08, "logits/chosen": -0.04224336892366409, "logits/rejected": -0.06296161562204361, "logps/chosen": -165.10733032226562, "logps/rejected": -172.8993682861328, "loss": 1.2141, "nll_loss": 0.3679961562156677, "rewards/accuracies": 0.625, "rewards/chosen": -16.510732650756836, "rewards/margins": 0.7792031168937683, "rewards/rejected": -17.289936065673828, "step": 425 }, { "epoch": 0.9188034188034188, "grad_norm": 54.089208241870814, "learning_rate": 1.996782295032745e-08, "logits/chosen": -0.14895446598529816, "logits/rejected": -0.13183912634849548, "logps/chosen": -142.65121459960938, "logps/rejected": -148.99490356445312, "loss": 1.2328, "nll_loss": 0.31830939650535583, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -14.265121459960938, "rewards/margins": 0.6343703269958496, "rewards/rejected": -14.899490356445312, "step": 430 }, { "epoch": 0.9294871794871795, "grad_norm": 55.028012487261535, "learning_rate": 1.508369269567783e-08, "logits/chosen": 0.002587652299553156, "logits/rejected": -0.038656361401081085, "logps/chosen": -132.2308349609375, "logps/rejected": -143.6236114501953, "loss": 1.3378, "nll_loss": 0.35204577445983887, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -13.22308349609375, "rewards/margins": 1.1392773389816284, "rewards/rejected": -14.362360954284668, "step": 435 }, { "epoch": 0.9401709401709402, "grad_norm": 65.09993319908288, "learning_rate": 1.0874544094811422e-08, "logits/chosen": 0.02723981812596321, "logits/rejected": 0.026110615581274033, "logps/chosen": -121.16329193115234, "logps/rejected": -125.1588134765625, "loss": 1.3971, "nll_loss": 0.3366636037826538, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -12.116328239440918, "rewards/margins": 0.3995509743690491, "rewards/rejected": -12.515880584716797, "step": 440 }, { "epoch": 0.9508547008547008, "grad_norm": 58.673331587997495, "learning_rate": 7.346236092954316e-09, "logits/chosen": -0.07683765143156052, "logits/rejected": -0.08344952762126923, "logps/chosen": -159.8972625732422, "logps/rejected": -165.53884887695312, "loss": 1.3386, "nll_loss": 0.3503979444503784, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -15.989726066589355, "rewards/margins": 0.5641571879386902, "rewards/rejected": -16.553882598876953, "step": 445 }, { "epoch": 0.9615384615384616, "grad_norm": 54.28528572494338, "learning_rate": 4.50367993589107e-09, "logits/chosen": -0.16253122687339783, "logits/rejected": -0.07185138761997223, "logps/chosen": -142.61741638183594, "logps/rejected": -142.0267791748047, "loss": 1.3953, "nll_loss": 0.4227580428123474, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -14.261739730834961, "rewards/margins": -0.05906330421566963, "rewards/rejected": -14.202677726745605, "step": 450 }, { "epoch": 0.9722222222222222, "grad_norm": 57.751616603455034, "learning_rate": 2.3508323337321224e-09, "logits/chosen": 0.05629957839846611, "logits/rejected": 0.04388625547289848, "logps/chosen": -109.04362487792969, "logps/rejected": -116.85518646240234, "loss": 1.3543, "nll_loss": 0.4259931445121765, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -10.904363632202148, "rewards/margins": 0.7811557650566101, "rewards/rejected": -11.685519218444824, "step": 455 }, { "epoch": 0.9829059829059829, "grad_norm": 53.54066392797414, "learning_rate": 8.906899533517864e-10, "logits/chosen": 0.12563523650169373, "logits/rejected": 0.1722358763217926, "logps/chosen": -120.39036560058594, "logps/rejected": -125.317138671875, "loss": 1.31, "nll_loss": 0.3701631426811218, "rewards/accuracies": 0.625, "rewards/chosen": -12.039037704467773, "rewards/margins": 0.49267855286598206, "rewards/rejected": -12.531715393066406, "step": 460 }, { "epoch": 0.9935897435897436, "grad_norm": 54.485812172434166, "learning_rate": 1.252852471625987e-10, "logits/chosen": 0.047066349536180496, "logits/rejected": 0.08690531551837921, "logps/chosen": -110.74609375, "logps/rejected": -122.6104965209961, "loss": 1.1667, "nll_loss": 0.3447542190551758, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -11.074609756469727, "rewards/margins": 1.1864404678344727, "rewards/rejected": -12.261051177978516, "step": 465 }, { "epoch": 1.0, "step": 468, "total_flos": 0.0, "train_loss": 1.4382328207676227, "train_runtime": 9608.0854, "train_samples_per_second": 6.232, "train_steps_per_second": 0.049 } ], "logging_steps": 5, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }