{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.989547038327526, "eval_steps": 50, "global_step": 429, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06968641114982578, "grad_norm": 8.229574175695925, "learning_rate": 1.1627906976744186e-07, "logits/chosen": -2.7034733295440674, "logits/rejected": -2.7302405834198, "logps/chosen": -301.81427001953125, "logps/rejected": -331.369140625, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0005243912455625832, "rewards/margins": 0.0006931144162081182, "rewards/rejected": -0.00016872311243787408, "step": 10 }, { "epoch": 0.13937282229965156, "grad_norm": 7.845336267328567, "learning_rate": 2.3255813953488372e-07, "logits/chosen": -2.762924909591675, "logits/rejected": -2.7517170906066895, "logps/chosen": -320.4908752441406, "logps/rejected": -314.0067138671875, "loss": 0.6931, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.00030491696088574827, "rewards/margins": -0.00010313927487004548, "rewards/rejected": 0.00040805633761920035, "step": 20 }, { "epoch": 0.20905923344947736, "grad_norm": 9.709940066560216, "learning_rate": 3.4883720930232557e-07, "logits/chosen": -2.702960252761841, "logits/rejected": -2.6881918907165527, "logps/chosen": -295.65521240234375, "logps/rejected": -309.1160888671875, "loss": 0.6926, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.005805189721286297, "rewards/margins": -1.923509444168303e-05, "rewards/rejected": 0.005824424792081118, "step": 30 }, { "epoch": 0.2787456445993031, "grad_norm": 7.6570471523786505, "learning_rate": 4.6511627906976743e-07, "logits/chosen": -2.6843132972717285, "logits/rejected": -2.6987385749816895, "logps/chosen": -289.13720703125, "logps/rejected": -296.2955322265625, "loss": 0.6902, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.010863080620765686, "rewards/margins": 0.004013759084045887, "rewards/rejected": 0.006849322468042374, "step": 40 }, { "epoch": 0.34843205574912894, "grad_norm": 7.937140028593177, "learning_rate": 4.995943852340362e-07, "logits/chosen": -2.6573901176452637, "logits/rejected": -2.66682767868042, "logps/chosen": -327.9737854003906, "logps/rejected": -309.1617431640625, "loss": 0.6871, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.006907278206199408, "rewards/margins": 0.010998749174177647, "rewards/rejected": -0.004091470967978239, "step": 50 }, { "epoch": 0.34843205574912894, "eval_logits/chosen": -2.632770538330078, "eval_logits/rejected": -2.5992560386657715, "eval_logps/chosen": -262.7750244140625, "eval_logps/rejected": -265.65740966796875, "eval_loss": 0.6798189878463745, "eval_rewards/accuracies": 0.59765625, "eval_rewards/chosen": -0.001450682058930397, "eval_rewards/margins": 0.02850232645869255, "eval_rewards/rejected": -0.02995300479233265, "eval_runtime": 103.8301, "eval_samples_per_second": 19.262, "eval_steps_per_second": 0.308, "step": 50 }, { "epoch": 0.4181184668989547, "grad_norm": 9.169375441320238, "learning_rate": 4.976108685115826e-07, "logits/chosen": -2.6978957653045654, "logits/rejected": -2.672510862350464, "logps/chosen": -300.9576110839844, "logps/rejected": -301.5975341796875, "loss": 0.6817, "rewards/accuracies": 0.5625, "rewards/chosen": -0.012056882493197918, "rewards/margins": 0.029647041112184525, "rewards/rejected": -0.04170392453670502, "step": 60 }, { "epoch": 0.4878048780487805, "grad_norm": 9.090563868123345, "learning_rate": 4.939880644182383e-07, "logits/chosen": -2.6447739601135254, "logits/rejected": -2.6350605487823486, "logps/chosen": -327.4669494628906, "logps/rejected": -307.3121643066406, "loss": 0.6832, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.017700044438242912, "rewards/margins": 0.042197179049253464, "rewards/rejected": -0.05989723280072212, "step": 70 }, { "epoch": 0.5574912891986062, "grad_norm": 10.380422622809006, "learning_rate": 4.887499574302625e-07, "logits/chosen": -2.6520066261291504, "logits/rejected": -2.61989164352417, "logps/chosen": -279.8045349121094, "logps/rejected": -279.3233337402344, "loss": 0.6793, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.08412985503673553, "rewards/margins": 0.039986394345760345, "rewards/rejected": 0.044143468141555786, "step": 80 }, { "epoch": 0.627177700348432, "grad_norm": 13.408348648381818, "learning_rate": 4.819312260037522e-07, "logits/chosen": -2.5424866676330566, "logits/rejected": -2.5196774005889893, "logps/chosen": -313.853515625, "logps/rejected": -311.6841125488281, "loss": 0.6786, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08200756460428238, "rewards/margins": 0.05388826131820679, "rewards/rejected": -0.13589580357074738, "step": 90 }, { "epoch": 0.6968641114982579, "grad_norm": 10.057954318267676, "learning_rate": 4.7357701298877766e-07, "logits/chosen": -2.5143790245056152, "logits/rejected": -2.5063555240631104, "logps/chosen": -316.9302978515625, "logps/rejected": -336.63006591796875, "loss": 0.6724, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.2273595631122589, "rewards/margins": 0.014442856423556805, "rewards/rejected": -0.2418024092912674, "step": 100 }, { "epoch": 0.6968641114982579, "eval_logits/chosen": -2.5339930057525635, "eval_logits/rejected": -2.4994165897369385, "eval_logps/chosen": -266.2806091308594, "eval_logps/rejected": -272.15484619140625, "eval_loss": 0.6720507144927979, "eval_rewards/accuracies": 0.59375, "eval_rewards/chosen": -0.03650704026222229, "eval_rewards/margins": 0.05842053145170212, "eval_rewards/rejected": -0.094927579164505, "eval_runtime": 104.7452, "eval_samples_per_second": 19.094, "eval_steps_per_second": 0.306, "step": 100 }, { "epoch": 0.7665505226480837, "grad_norm": 9.469148029484852, "learning_rate": 4.637426267648599e-07, "logits/chosen": -2.615734338760376, "logits/rejected": -2.6145644187927246, "logps/chosen": -302.81866455078125, "logps/rejected": -306.31378173828125, "loss": 0.6774, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.011746838688850403, "rewards/margins": 0.01884249597787857, "rewards/rejected": -0.007095657289028168, "step": 110 }, { "epoch": 0.8362369337979094, "grad_norm": 8.913491607759642, "learning_rate": 4.5249317507639726e-07, "logits/chosen": -2.541506290435791, "logits/rejected": -2.5293049812316895, "logps/chosen": -251.04202270507812, "logps/rejected": -270.9402770996094, "loss": 0.6769, "rewards/accuracies": 0.53125, "rewards/chosen": -0.024001404643058777, "rewards/margins": 0.0283985435962677, "rewards/rejected": -0.052399951964616776, "step": 120 }, { "epoch": 0.9059233449477352, "grad_norm": 10.141409304343389, "learning_rate": 4.399031339922038e-07, "logits/chosen": -2.622816801071167, "logits/rejected": -2.620767116546631, "logps/chosen": -305.6891174316406, "logps/rejected": -307.5406188964844, "loss": 0.673, "rewards/accuracies": 0.53125, "rewards/chosen": -0.04585634917020798, "rewards/margins": 0.06582097709178925, "rewards/rejected": -0.11167732626199722, "step": 130 }, { "epoch": 0.975609756097561, "grad_norm": 8.960281893519205, "learning_rate": 4.2605585484282636e-07, "logits/chosen": -2.6274361610412598, "logits/rejected": -2.6125521659851074, "logps/chosen": -332.13348388671875, "logps/rejected": -312.84503173828125, "loss": 0.6721, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.06096304580569267, "rewards/margins": 0.05341456085443497, "rewards/rejected": -0.11437759548425674, "step": 140 }, { "epoch": 1.0452961672473868, "grad_norm": 9.355042641000702, "learning_rate": 4.110430123999227e-07, "logits/chosen": -2.6396541595458984, "logits/rejected": -2.5929980278015137, "logps/chosen": -314.0408630371094, "logps/rejected": -329.1007080078125, "loss": 0.6047, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0018128050724044442, "rewards/margins": 0.18482232093811035, "rewards/rejected": -0.18663513660430908, "step": 150 }, { "epoch": 1.0452961672473868, "eval_logits/chosen": -2.5443286895751953, "eval_logits/rejected": -2.5025253295898438, "eval_logps/chosen": -279.22698974609375, "eval_logps/rejected": -285.9855041503906, "eval_loss": 0.6797215938568115, "eval_rewards/accuracies": 0.58984375, "eval_rewards/chosen": -0.16597062349319458, "eval_rewards/margins": 0.06726360321044922, "eval_rewards/rejected": -0.233234241604805, "eval_runtime": 105.0322, "eval_samples_per_second": 19.042, "eval_steps_per_second": 0.305, "step": 150 }, { "epoch": 1.1149825783972125, "grad_norm": 9.102186975626122, "learning_rate": 3.9496399795098266e-07, "logits/chosen": -2.611131191253662, "logits/rejected": -2.577604293823242, "logps/chosen": -355.5671081542969, "logps/rejected": -354.0496520996094, "loss": 0.5626, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.13642819225788116, "rewards/margins": 0.3638991117477417, "rewards/rejected": -0.5003272294998169, "step": 160 }, { "epoch": 1.1846689895470384, "grad_norm": 10.030074101152461, "learning_rate": 3.779252612874913e-07, "logits/chosen": -2.52521014213562, "logits/rejected": -2.461188793182373, "logps/chosen": -291.77777099609375, "logps/rejected": -298.83343505859375, "loss": 0.5548, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.029370594769716263, "rewards/margins": 0.332529217004776, "rewards/rejected": -0.36189982295036316, "step": 170 }, { "epoch": 1.254355400696864, "grad_norm": 10.899159922293036, "learning_rate": 3.60039605962848e-07, "logits/chosen": -2.48093843460083, "logits/rejected": -2.453683376312256, "logps/chosen": -336.80316162109375, "logps/rejected": -371.10650634765625, "loss": 0.5374, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.12406754493713379, "rewards/margins": 0.418355792760849, "rewards/rejected": -0.5424233675003052, "step": 180 }, { "epoch": 1.32404181184669, "grad_norm": 10.743619932904148, "learning_rate": 3.414254424857272e-07, "logits/chosen": -2.402945041656494, "logits/rejected": -2.4313735961914062, "logps/chosen": -327.4953308105469, "logps/rejected": -387.18035888671875, "loss": 0.5441, "rewards/accuracies": 0.875, "rewards/chosen": -0.2883428931236267, "rewards/margins": 0.4703540802001953, "rewards/rejected": -0.758696973323822, "step": 190 }, { "epoch": 1.3937282229965158, "grad_norm": 13.29972806953709, "learning_rate": 3.2220600439305403e-07, "logits/chosen": -2.356320858001709, "logits/rejected": -2.3801255226135254, "logps/chosen": -323.59130859375, "logps/rejected": -368.29425048828125, "loss": 0.5265, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.3764956295490265, "rewards/margins": 0.45888185501098633, "rewards/rejected": -0.8353773951530457, "step": 200 }, { "epoch": 1.3937282229965158, "eval_logits/chosen": -2.332766532897949, "eval_logits/rejected": -2.2717788219451904, "eval_logps/chosen": -320.0576477050781, "eval_logps/rejected": -335.9707946777344, "eval_loss": 0.6762288808822632, "eval_rewards/accuracies": 0.671875, "eval_rewards/chosen": -0.5742772817611694, "eval_rewards/margins": 0.15881015360355377, "eval_rewards/rejected": -0.733087420463562, "eval_runtime": 103.8887, "eval_samples_per_second": 19.251, "eval_steps_per_second": 0.308, "step": 200 }, { "epoch": 1.4634146341463414, "grad_norm": 15.19540789655145, "learning_rate": 3.025085323925175e-07, "logits/chosen": -2.238861083984375, "logits/rejected": -2.186549663543701, "logps/chosen": -322.6126403808594, "logps/rejected": -375.65850830078125, "loss": 0.5251, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.4552794396877289, "rewards/margins": 0.5174066424369812, "rewards/rejected": -0.9726861119270325, "step": 210 }, { "epoch": 1.533101045296167, "grad_norm": 14.440109215320728, "learning_rate": 2.8246343197594046e-07, "logits/chosen": -2.1742804050445557, "logits/rejected": -2.0851190090179443, "logps/chosen": -388.40509033203125, "logps/rejected": -404.67388916015625, "loss": 0.5096, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5906688570976257, "rewards/margins": 0.48450303077697754, "rewards/rejected": -1.075171947479248, "step": 220 }, { "epoch": 1.6027874564459932, "grad_norm": 16.964461935174956, "learning_rate": 2.622034100804566e-07, "logits/chosen": -1.8995920419692993, "logits/rejected": -2.022343397140503, "logps/chosen": -324.2644958496094, "logps/rejected": -404.7146911621094, "loss": 0.5032, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.6232214570045471, "rewards/margins": 0.5319386720657349, "rewards/rejected": -1.1551600694656372, "step": 230 }, { "epoch": 1.6724738675958188, "grad_norm": 27.37981327191052, "learning_rate": 2.418625965131574e-07, "logits/chosen": -1.7780876159667969, "logits/rejected": -1.6987870931625366, "logps/chosen": -376.9257507324219, "logps/rejected": -409.15887451171875, "loss": 0.5097, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.7337759137153625, "rewards/margins": 0.56010901927948, "rewards/rejected": -1.2938848733901978, "step": 240 }, { "epoch": 1.7421602787456445, "grad_norm": 18.86770347379352, "learning_rate": 2.2157565595574668e-07, "logits/chosen": -1.6350816488265991, "logits/rejected": -1.648374319076538, "logps/chosen": -389.5054931640625, "logps/rejected": -428.66802978515625, "loss": 0.4984, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.949749767780304, "rewards/margins": 0.5309340953826904, "rewards/rejected": -1.4806839227676392, "step": 250 }, { "epoch": 1.7421602787456445, "eval_logits/chosen": -1.543340802192688, "eval_logits/rejected": -1.445103645324707, "eval_logps/chosen": -383.83807373046875, "eval_logps/rejected": -407.1153869628906, "eval_loss": 0.6731657981872559, "eval_rewards/accuracies": 0.65625, "eval_rewards/chosen": -1.2120810747146606, "eval_rewards/margins": 0.23245161771774292, "eval_rewards/rejected": -1.4445327520370483, "eval_runtime": 104.3313, "eval_samples_per_second": 19.17, "eval_steps_per_second": 0.307, "step": 250 }, { "epoch": 1.8118466898954704, "grad_norm": 17.331986790708136, "learning_rate": 2.0147689642810138e-07, "logits/chosen": -1.6317332983016968, "logits/rejected": -1.572344183921814, "logps/chosen": -410.221923828125, "logps/rejected": -480.1829528808594, "loss": 0.4902, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.0687463283538818, "rewards/margins": 0.6032934188842773, "rewards/rejected": -1.6720397472381592, "step": 260 }, { "epoch": 1.8815331010452963, "grad_norm": 17.843882545206057, "learning_rate": 1.8169938011308233e-07, "logits/chosen": -1.4750444889068604, "logits/rejected": -1.417011022567749, "logps/chosen": -395.61480712890625, "logps/rejected": -437.10577392578125, "loss": 0.492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9328581094741821, "rewards/margins": 0.6057752370834351, "rewards/rejected": -1.538633108139038, "step": 270 }, { "epoch": 1.951219512195122, "grad_norm": 19.896351603470077, "learning_rate": 1.6237404242930697e-07, "logits/chosen": -1.417770266532898, "logits/rejected": -1.3385121822357178, "logps/chosen": -375.705078125, "logps/rejected": -404.3011169433594, "loss": 0.4867, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9175931215286255, "rewards/margins": 0.4988563060760498, "rewards/rejected": -1.4164493083953857, "step": 280 }, { "epoch": 2.0209059233449476, "grad_norm": 16.576113766725356, "learning_rate": 1.4362882518398945e-07, "logits/chosen": -1.3333556652069092, "logits/rejected": -1.3687762022018433, "logps/chosen": -405.6753234863281, "logps/rejected": -468.34490966796875, "loss": 0.4573, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9316972494125366, "rewards/margins": 0.6436307430267334, "rewards/rejected": -1.5753281116485596, "step": 290 }, { "epoch": 2.0905923344947737, "grad_norm": 18.642417597691527, "learning_rate": 1.2558782954473823e-07, "logits/chosen": -1.1027063131332397, "logits/rejected": -1.0710804462432861, "logps/chosen": -400.2093505859375, "logps/rejected": -480.50518798828125, "loss": 0.3569, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.9277191162109375, "rewards/margins": 1.0768729448318481, "rewards/rejected": -2.004591941833496, "step": 300 }, { "epoch": 2.0905923344947737, "eval_logits/chosen": -0.9999401569366455, "eval_logits/rejected": -0.870820164680481, "eval_logps/chosen": -397.18048095703125, "eval_logps/rejected": -429.4680480957031, "eval_loss": 0.6527448892593384, "eval_rewards/accuracies": 0.67578125, "eval_rewards/chosen": -1.3455055952072144, "eval_rewards/margins": 0.32255375385284424, "eval_rewards/rejected": -1.6680593490600586, "eval_runtime": 102.8366, "eval_samples_per_second": 19.448, "eval_steps_per_second": 0.311, "step": 300 }, { "epoch": 2.1602787456445993, "grad_norm": 19.642342649385267, "learning_rate": 1.0837049443799279e-07, "logits/chosen": -0.9505928158760071, "logits/rejected": -0.9347362518310547, "logps/chosen": -373.06011962890625, "logps/rejected": -473.77880859375, "loss": 0.3689, "rewards/accuracies": 0.875, "rewards/chosen": -0.9760599136352539, "rewards/margins": 1.0135242938995361, "rewards/rejected": -1.98958420753479, "step": 310 }, { "epoch": 2.229965156794425, "grad_norm": 20.908278202953714, "learning_rate": 9.209080581344306e-08, "logits/chosen": -0.6433783173561096, "logits/rejected": -0.6169986128807068, "logps/chosen": -395.59600830078125, "logps/rejected": -535.1781005859375, "loss": 0.3438, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.1918704509735107, "rewards/margins": 1.1342499256134033, "rewards/rejected": -2.326120615005493, "step": 320 }, { "epoch": 2.2996515679442506, "grad_norm": 22.95505810018377, "learning_rate": 7.685654200943378e-08, "logits/chosen": -0.7587612867355347, "logits/rejected": -0.6282288432121277, "logps/chosen": -465.53985595703125, "logps/rejected": -561.067626953125, "loss": 0.3429, "rewards/accuracies": 0.9375, "rewards/chosen": -1.326777696609497, "rewards/margins": 1.2314026355743408, "rewards/rejected": -2.558180332183838, "step": 330 }, { "epoch": 2.3693379790940767, "grad_norm": 23.693323873855398, "learning_rate": 6.27685602153478e-08, "logits/chosen": -0.5435560941696167, "logits/rejected": -0.4107975959777832, "logps/chosen": -466.288818359375, "logps/rejected": -571.3019409179688, "loss": 0.3396, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.5259153842926025, "rewards/margins": 1.2022850513458252, "rewards/rejected": -2.7282001972198486, "step": 340 }, { "epoch": 2.4390243902439024, "grad_norm": 24.606275846522493, "learning_rate": 4.992012875488669e-08, "logits/chosen": -0.3131232261657715, "logits/rejected": -0.2396572083234787, "logps/chosen": -431.25140380859375, "logps/rejected": -555.533203125, "loss": 0.3329, "rewards/accuracies": 0.90625, "rewards/chosen": -1.5053646564483643, "rewards/margins": 1.0768215656280518, "rewards/rejected": -2.582186222076416, "step": 350 }, { "epoch": 2.4390243902439024, "eval_logits/chosen": -0.24473249912261963, "eval_logits/rejected": -0.10840671509504318, "eval_logps/chosen": -453.08160400390625, "eval_logps/rejected": -488.3669738769531, "eval_loss": 0.6840001940727234, "eval_rewards/accuracies": 0.66015625, "eval_rewards/chosen": -1.904516339302063, "eval_rewards/margins": 0.35253193974494934, "eval_rewards/rejected": -2.2570483684539795, "eval_runtime": 106.925, "eval_samples_per_second": 18.705, "eval_steps_per_second": 0.299, "step": 350 }, { "epoch": 2.508710801393728, "grad_norm": 27.353529501179093, "learning_rate": 3.8396309610812086e-08, "logits/chosen": -0.23779411613941193, "logits/rejected": -0.15696097910404205, "logps/chosen": -438.49298095703125, "logps/rejected": -567.0818481445312, "loss": 0.3351, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.482677698135376, "rewards/margins": 1.268634557723999, "rewards/rejected": -2.751312255859375, "step": 360 }, { "epoch": 2.578397212543554, "grad_norm": 26.645676298973676, "learning_rate": 2.8273395279091005e-08, "logits/chosen": -0.31052619218826294, "logits/rejected": -0.2516113221645355, "logps/chosen": -440.5013122558594, "logps/rejected": -570.08056640625, "loss": 0.3373, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.4818499088287354, "rewards/margins": 1.3230129480361938, "rewards/rejected": -2.8048629760742188, "step": 370 }, { "epoch": 2.64808362369338, "grad_norm": 23.8312792481954, "learning_rate": 1.9618403680707053e-08, "logits/chosen": -0.3248611092567444, "logits/rejected": -0.3839193284511566, "logps/chosen": -462.94873046875, "logps/rejected": -592.1238403320312, "loss": 0.34, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.5153675079345703, "rewards/margins": 1.3629937171936035, "rewards/rejected": -2.878361225128174, "step": 380 }, { "epoch": 2.7177700348432055, "grad_norm": 25.615650685832552, "learning_rate": 1.2488634475031761e-08, "logits/chosen": -0.10789848864078522, "logits/rejected": -0.0006875753169879317, "logps/chosen": -432.10968017578125, "logps/rejected": -547.6038208007812, "loss": 0.3334, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.524472951889038, "rewards/margins": 1.2026252746582031, "rewards/rejected": -2.727097988128662, "step": 390 }, { "epoch": 2.7874564459930316, "grad_norm": 29.507902477027233, "learning_rate": 6.9312897121466815e-09, "logits/chosen": -0.16302387416362762, "logits/rejected": -0.1776006668806076, "logps/chosen": -456.0896911621094, "logps/rejected": -589.2247924804688, "loss": 0.3368, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.5656400918960571, "rewards/margins": 1.24466872215271, "rewards/rejected": -2.8103089332580566, "step": 400 }, { "epoch": 2.7874564459930316, "eval_logits/chosen": -0.3103621006011963, "eval_logits/rejected": -0.1808159053325653, "eval_logps/chosen": -455.8002624511719, "eval_logps/rejected": -491.1397705078125, "eval_loss": 0.681273877620697, "eval_rewards/accuracies": 0.6796875, "eval_rewards/chosen": -1.9317032098770142, "eval_rewards/margins": 0.35307374596595764, "eval_rewards/rejected": -2.2847771644592285, "eval_runtime": 103.4584, "eval_samples_per_second": 19.331, "eval_steps_per_second": 0.309, "step": 400 }, { "epoch": 2.857142857142857, "grad_norm": 23.881676504304508, "learning_rate": 2.983161335556761e-09, "logits/chosen": -0.464876651763916, "logits/rejected": -0.40461286902427673, "logps/chosen": -434.8916015625, "logps/rejected": -576.1773681640625, "loss": 0.3237, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5066254138946533, "rewards/margins": 1.3078076839447021, "rewards/rejected": -2.8144326210021973, "step": 410 }, { "epoch": 2.926829268292683, "grad_norm": 24.99330232547937, "learning_rate": 6.703876041571077e-10, "logits/chosen": -0.48426467180252075, "logits/rejected": -0.1355866938829422, "logps/chosen": -447.3482971191406, "logps/rejected": -561.0531005859375, "loss": 0.3235, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.4588124752044678, "rewards/margins": 1.2296662330627441, "rewards/rejected": -2.688478946685791, "step": 420 }, { "epoch": 2.989547038327526, "step": 429, "total_flos": 0.0, "train_loss": 0.5141507484418251, "train_runtime": 11380.2127, "train_samples_per_second": 4.835, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 429, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }