{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 684, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_losses": 0.6931471824645996, "epoch": 0.0, "grad_norm": 2.025402631880394, "learning_rate": 7.246376811594204e-08, "logits/chosen": -2.961127519607544, "logits/rejected": -2.9461119174957275, "logps/chosen": -261.90582275390625, "logps/rejected": -270.03265380859375, "loss": 0.6931, "positive_losses": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "dpo_losses": 0.6932222843170166, "epoch": 0.01, "grad_norm": 1.892116368261662, "learning_rate": 7.246376811594204e-07, "logits/chosen": -2.875087022781372, "logits/rejected": -2.855910062789917, "logps/chosen": -217.50634765625, "logps/rejected": -222.0803985595703, "loss": 0.6974, "positive_losses": 0.04892720282077789, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.00041001950739882886, "rewards/margins": -0.0001489536080043763, "rewards/margins_max": 0.0012003988958895206, "rewards/margins_min": -0.0014983059372752905, "rewards/margins_std": 0.0019082725048065186, "rewards/rejected": 0.0005589731154032052, "step": 10 }, { "dpo_losses": 0.6928491592407227, "epoch": 0.03, "grad_norm": 10.889312446124245, "learning_rate": 1.4492753623188408e-06, "logits/chosen": -2.855677366256714, "logits/rejected": -2.8727664947509766, "logps/chosen": -228.65463256835938, "logps/rejected": -176.28146362304688, "loss": 0.695, "positive_losses": 0.026834487915039062, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0038786642253398895, "rewards/margins": 0.0005991062498651445, "rewards/margins_max": 0.0024178135208785534, "rewards/margins_min": -0.001219600671902299, "rewards/margins_std": 0.0025720400735735893, "rewards/rejected": 0.0032795581500977278, "step": 20 }, { "dpo_losses": 0.691431999206543, "epoch": 0.04, "grad_norm": 2.0735563380689697, "learning_rate": 2.173913043478261e-06, "logits/chosen": -2.932262420654297, "logits/rejected": -2.8772940635681152, "logps/chosen": -258.99334716796875, "logps/rejected": -237.83096313476562, "loss": 0.692, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.015487673692405224, "rewards/margins": 0.0034467983059585094, "rewards/margins_max": 0.008276171050965786, "rewards/margins_min": -0.001382575137540698, "rewards/margins_std": 0.006829765625298023, "rewards/rejected": 0.012040875852108002, "step": 30 }, { "dpo_losses": 0.6876205205917358, "epoch": 0.06, "grad_norm": 1.9226295416634294, "learning_rate": 2.8985507246376816e-06, "logits/chosen": -2.8300986289978027, "logits/rejected": -2.7832179069519043, "logps/chosen": -325.06231689453125, "logps/rejected": -363.68426513671875, "loss": 0.6887, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.028436576947569847, "rewards/margins": 0.011200213804841042, "rewards/margins_max": 0.022198233753442764, "rewards/margins_min": 0.00020219237194396555, "rewards/margins_std": 0.015553551726043224, "rewards/rejected": 0.017236361280083656, "step": 40 }, { "dpo_losses": 0.6896201968193054, "epoch": 0.07, "grad_norm": 9.118712466857515, "learning_rate": 3.6231884057971017e-06, "logits/chosen": -2.895482301712036, "logits/rejected": -2.8222224712371826, "logps/chosen": -247.339111328125, "logps/rejected": -244.00790405273438, "loss": 0.6891, "positive_losses": 0.0010955811012536287, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03269472345709801, "rewards/margins": 0.007144673261791468, "rewards/margins_max": 0.017206599935889244, "rewards/margins_min": -0.002917253179475665, "rewards/margins_std": 0.014229713007807732, "rewards/rejected": 0.025550048798322678, "step": 50 }, { "dpo_losses": 0.678604006767273, "epoch": 0.09, "grad_norm": 1.6865302230972352, "learning_rate": 4.347826086956522e-06, "logits/chosen": -3.02363920211792, "logits/rejected": -2.9497618675231934, "logps/chosen": -302.65142822265625, "logps/rejected": -242.8329620361328, "loss": 0.6829, "positive_losses": 0.0033214569557458162, "rewards/accuracies": 0.75, "rewards/chosen": 0.06167557090520859, "rewards/margins": 0.02971900999546051, "rewards/margins_max": 0.05635114759206772, "rewards/margins_min": 0.003086873795837164, "rewards/margins_std": 0.0376635305583477, "rewards/rejected": 0.03195656090974808, "step": 60 }, { "dpo_losses": 0.6720460653305054, "epoch": 0.1, "grad_norm": 7.877876119832288, "learning_rate": 4.999967381905813e-06, "logits/chosen": -3.0418922901153564, "logits/rejected": -2.9664931297302246, "logps/chosen": -266.40692138671875, "logps/rejected": -203.53610229492188, "loss": 0.6792, "positive_losses": 0.09538726508617401, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.05729568004608154, "rewards/margins": 0.04375966638326645, "rewards/margins_max": 0.05517454072833061, "rewards/margins_min": 0.032344792038202286, "rewards/margins_std": 0.0161430723965168, "rewards/rejected": 0.013536013662815094, "step": 70 }, { "dpo_losses": 0.6691193580627441, "epoch": 0.12, "grad_norm": 9.317121958881922, "learning_rate": 4.9960542403925095e-06, "logits/chosen": -2.8223726749420166, "logits/rejected": -2.7410635948181152, "logps/chosen": -249.7649383544922, "logps/rejected": -231.88986206054688, "loss": 0.6716, "positive_losses": 0.03403167799115181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.07761463522911072, "rewards/margins": 0.05084484815597534, "rewards/margins_max": 0.10426272451877594, "rewards/margins_min": -0.0025730193592607975, "rewards/margins_std": 0.07554427534341812, "rewards/rejected": 0.026769787073135376, "step": 80 }, { "dpo_losses": 0.6618136167526245, "epoch": 0.13, "grad_norm": 4.628284010129237, "learning_rate": 4.98562917836165e-06, "logits/chosen": -2.881012201309204, "logits/rejected": -2.8446288108825684, "logps/chosen": -254.67495727539062, "logps/rejected": -195.2042694091797, "loss": 0.6694, "positive_losses": 0.05231323093175888, "rewards/accuracies": 0.75, "rewards/chosen": 0.08632297068834305, "rewards/margins": 0.06475184857845306, "rewards/margins_max": 0.08643685281276703, "rewards/margins_min": 0.0430668406188488, "rewards/margins_std": 0.030667226761579514, "rewards/rejected": 0.021571118384599686, "step": 90 }, { "dpo_losses": 0.6343039274215698, "epoch": 0.15, "grad_norm": 12.853627066309556, "learning_rate": 4.968719393609757e-06, "logits/chosen": -2.973792552947998, "logits/rejected": -2.9188525676727295, "logps/chosen": -364.15399169921875, "logps/rejected": -228.38418579101562, "loss": 0.6612, "positive_losses": 0.0, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.1419464498758316, "rewards/margins": 0.12534113228321075, "rewards/margins_max": 0.19193264842033386, "rewards/margins_min": 0.05874960869550705, "rewards/margins_std": 0.09417462348937988, "rewards/rejected": 0.016605319455266, "step": 100 }, { "epoch": 0.15, "eval_dpo_losses": 0.6805257797241211, "eval_logits/chosen": -2.8549489974975586, "eval_logits/rejected": -2.8096561431884766, "eval_logps/chosen": -277.0838928222656, "eval_logps/rejected": -253.83172607421875, "eval_loss": 0.7167356014251709, "eval_positive_losses": 0.34227606654167175, "eval_rewards/accuracies": 0.591269850730896, "eval_rewards/chosen": 0.0813729390501976, "eval_rewards/margins": 0.027863360941410065, "eval_rewards/margins_max": 0.13411983847618103, "eval_rewards/margins_min": -0.06980105489492416, "eval_rewards/margins_std": 0.0907549038529396, "eval_rewards/rejected": 0.05350957810878754, "eval_runtime": 284.067, "eval_samples_per_second": 7.041, "eval_steps_per_second": 0.222, "step": 100 }, { "dpo_losses": 0.646482527256012, "epoch": 0.16, "grad_norm": 7.886066230281033, "learning_rate": 4.9453690018345144e-06, "logits/chosen": -2.90840482711792, "logits/rejected": -2.870948553085327, "logps/chosen": -332.84967041015625, "logps/rejected": -298.0467529296875, "loss": 0.6608, "positive_losses": 0.0, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.11125056445598602, "rewards/margins": 0.09774880856275558, "rewards/margins_max": 0.12077156454324722, "rewards/margins_min": 0.07472606003284454, "rewards/margins_std": 0.03255908936262131, "rewards/rejected": 0.01350175030529499, "step": 110 }, { "dpo_losses": 0.6566277742385864, "epoch": 0.18, "grad_norm": 1.913254174024933, "learning_rate": 4.915638921541952e-06, "logits/chosen": -2.8616955280303955, "logits/rejected": -2.8522956371307373, "logps/chosen": -277.81402587890625, "logps/rejected": -261.228759765625, "loss": 0.6473, "positive_losses": 0.04311790317296982, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0988793820142746, "rewards/margins": 0.07747145742177963, "rewards/margins_max": 0.14120900630950928, "rewards/margins_min": 0.013733914121985435, "rewards/margins_std": 0.0901385098695755, "rewards/rejected": 0.02140791341662407, "step": 120 }, { "dpo_losses": 0.6377557516098022, "epoch": 0.19, "grad_norm": 12.904038948771282, "learning_rate": 4.879606715117019e-06, "logits/chosen": -2.95271897315979, "logits/rejected": -2.8435444831848145, "logps/chosen": -294.14813232421875, "logps/rejected": -240.53207397460938, "loss": 0.6576, "positive_losses": 0.14576569199562073, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.12051650136709213, "rewards/margins": 0.12185319513082504, "rewards/margins_max": 0.16484162211418152, "rewards/margins_min": 0.07886476814746857, "rewards/margins_std": 0.06079481169581413, "rewards/rejected": -0.0013366841012611985, "step": 130 }, { "dpo_losses": 0.640169620513916, "epoch": 0.2, "grad_norm": 16.18813154837036, "learning_rate": 4.837366386472175e-06, "logits/chosen": -3.0178513526916504, "logits/rejected": -2.9269156455993652, "logps/chosen": -279.7732238769531, "logps/rejected": -236.86874389648438, "loss": 0.6767, "positive_losses": 0.29906386137008667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.11848002672195435, "rewards/margins": 0.11471493542194366, "rewards/margins_max": 0.2023201882839203, "rewards/margins_min": 0.02710966393351555, "rewards/margins_std": 0.12389256060123444, "rewards/rejected": 0.0037650964222848415, "step": 140 }, { "dpo_losses": 0.6285568475723267, "epoch": 0.22, "grad_norm": 6.5324825602261045, "learning_rate": 4.789028135801919e-06, "logits/chosen": -2.973548650741577, "logits/rejected": -2.9277901649475098, "logps/chosen": -281.6429443359375, "logps/rejected": -295.3494873046875, "loss": 0.8701, "positive_losses": 0.2579152584075928, "rewards/accuracies": 0.75, "rewards/chosen": 0.16488775610923767, "rewards/margins": 0.14175625145435333, "rewards/margins_max": 0.25869402289390564, "rewards/margins_min": 0.02481846883893013, "rewards/margins_std": 0.1653749793767929, "rewards/rejected": 0.023131517693400383, "step": 150 }, { "dpo_losses": 0.6205035448074341, "epoch": 0.23, "grad_norm": 2.7066336555848176, "learning_rate": 4.7347180720830635e-06, "logits/chosen": -2.9675240516662598, "logits/rejected": -2.804438829421997, "logps/chosen": -316.6290283203125, "logps/rejected": -282.70538330078125, "loss": 0.6598, "positive_losses": 0.0, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.1802542507648468, "rewards/margins": 0.15980175137519836, "rewards/margins_max": 0.272554486989975, "rewards/margins_min": 0.047049008309841156, "rewards/margins_std": 0.15945644676685333, "rewards/rejected": 0.02045249193906784, "step": 160 }, { "dpo_losses": 0.6271503567695618, "epoch": 0.25, "grad_norm": 1.8801745537623247, "learning_rate": 4.674577884070811e-06, "logits/chosen": -2.929814577102661, "logits/rejected": -2.8838062286376953, "logps/chosen": -308.89947509765625, "logps/rejected": -252.41775512695312, "loss": 0.6279, "positive_losses": 0.5797370672225952, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.1764679253101349, "rewards/margins": 0.14800240099430084, "rewards/margins_max": 0.237053781747818, "rewards/margins_min": 0.058951012790203094, "rewards/margins_std": 0.12593765556812286, "rewards/rejected": 0.02846553362905979, "step": 170 }, { "dpo_losses": 0.6267830729484558, "epoch": 0.26, "grad_norm": 2.1466440934520645, "learning_rate": 4.608764470648971e-06, "logits/chosen": -2.9405906200408936, "logits/rejected": -2.8729405403137207, "logps/chosen": -290.3728942871094, "logps/rejected": -330.1247863769531, "loss": 0.6545, "positive_losses": 0.3962584435939789, "rewards/accuracies": 0.75, "rewards/chosen": 0.17637397348880768, "rewards/margins": 0.14918141067028046, "rewards/margins_max": 0.28907179832458496, "rewards/margins_min": 0.00929100438952446, "rewards/margins_std": 0.19783492386341095, "rewards/rejected": 0.027192572131752968, "step": 180 }, { "dpo_losses": 0.5917172431945801, "epoch": 0.28, "grad_norm": 2.287767271676791, "learning_rate": 4.5374495314986874e-06, "logits/chosen": -2.7434186935424805, "logits/rejected": -2.7498998641967773, "logps/chosen": -312.6622619628906, "logps/rejected": -238.92544555664062, "loss": 0.6419, "positive_losses": 0.6725692749023438, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.21900752186775208, "rewards/margins": 0.22816917300224304, "rewards/margins_max": 0.34974128007888794, "rewards/margins_min": 0.10659710317850113, "rewards/margins_std": 0.17192888259887695, "rewards/rejected": -0.00916165579110384, "step": 190 }, { "dpo_losses": 0.6394025087356567, "epoch": 0.29, "grad_norm": 10.012997763234694, "learning_rate": 4.460819119153574e-06, "logits/chosen": -2.8694872856140137, "logits/rejected": -2.831519603729248, "logps/chosen": -254.72994995117188, "logps/rejected": -285.6749572753906, "loss": 0.6652, "positive_losses": 0.53265380859375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1096222847700119, "rewards/margins": 0.11550626903772354, "rewards/margins_max": 0.21246998012065887, "rewards/margins_min": 0.018542537465691566, "rewards/margins_std": 0.13712741434574127, "rewards/rejected": -0.005883966572582722, "step": 200 }, { "epoch": 0.29, "eval_dpo_losses": 0.6683889627456665, "eval_logits/chosen": -2.804701566696167, "eval_logits/rejected": -2.761190176010132, "eval_logps/chosen": -270.78045654296875, "eval_logps/rejected": -250.50535583496094, "eval_loss": 0.7424377202987671, "eval_positive_losses": 0.6157510280609131, "eval_rewards/accuracies": 0.6071428656578064, "eval_rewards/chosen": 0.14440776407718658, "eval_rewards/margins": 0.05763502046465874, "eval_rewards/margins_max": 0.23871682584285736, "eval_rewards/margins_min": -0.10859239846467972, "eval_rewards/margins_std": 0.15644660592079163, "eval_rewards/rejected": 0.08677274733781815, "eval_runtime": 283.023, "eval_samples_per_second": 7.067, "eval_steps_per_second": 0.223, "step": 200 }, { "dpo_losses": 0.6189439296722412, "epoch": 0.31, "grad_norm": 6.2964509201178664, "learning_rate": 4.379073153609896e-06, "logits/chosen": -2.980807304382324, "logits/rejected": -2.9285852909088135, "logps/chosen": -310.4910583496094, "logps/rejected": -283.2040100097656, "loss": 0.66, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.17465968430042267, "rewards/margins": 0.16720545291900635, "rewards/margins_max": 0.2678540349006653, "rewards/margins_min": 0.0665569081902504, "rewards/margins_std": 0.14233854413032532, "rewards/rejected": 0.007454232778400183, "step": 210 }, { "dpo_losses": 0.6231792569160461, "epoch": 0.32, "grad_norm": 1.9190071231894708, "learning_rate": 4.292424900758129e-06, "logits/chosen": -2.8317534923553467, "logits/rejected": -2.745687961578369, "logps/chosen": -241.54098510742188, "logps/rejected": -261.16180419921875, "loss": 0.6416, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.17161966860294342, "rewards/margins": 0.15514414012432098, "rewards/margins_max": 0.2861190140247345, "rewards/margins_min": 0.024169281125068665, "rewards/margins_std": 0.1852264106273651, "rewards/rejected": 0.016475532203912735, "step": 220 }, { "dpo_losses": 0.6433408260345459, "epoch": 0.34, "grad_norm": 2.0868151243656565, "learning_rate": 4.201100415996598e-06, "logits/chosen": -2.727468490600586, "logits/rejected": -2.6615824699401855, "logps/chosen": -234.8723602294922, "logps/rejected": -252.6638641357422, "loss": 0.6425, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1352681964635849, "rewards/margins": 0.10586099326610565, "rewards/margins_max": 0.18672436475753784, "rewards/margins_min": 0.024997618049383163, "rewards/margins_std": 0.11435806751251221, "rewards/rejected": 0.029407206922769547, "step": 230 }, { "dpo_losses": 0.5827513933181763, "epoch": 0.35, "grad_norm": 8.16859688297942, "learning_rate": 4.105337954478756e-06, "logits/chosen": -2.9261674880981445, "logits/rejected": -2.7930915355682373, "logps/chosen": -369.94781494140625, "logps/rejected": -238.9222869873047, "loss": 0.6532, "positive_losses": 0.4402889311313629, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.21584495902061462, "rewards/margins": 0.2518808841705322, "rewards/margins_max": 0.37274229526519775, "rewards/margins_min": 0.1310194730758667, "rewards/margins_std": 0.1709238588809967, "rewards/rejected": -0.0360359326004982, "step": 240 }, { "dpo_losses": 0.6088570356369019, "epoch": 0.37, "grad_norm": 7.483775073000372, "learning_rate": 4.005387349532697e-06, "logits/chosen": -2.9306282997131348, "logits/rejected": -2.87394642829895, "logps/chosen": -293.2420654296875, "logps/rejected": -272.403076171875, "loss": 0.6241, "positive_losses": 0.6371370553970337, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.14793848991394043, "rewards/margins": 0.18766427040100098, "rewards/margins_max": 0.2927020192146301, "rewards/margins_min": 0.08262647688388824, "rewards/margins_std": 0.14854584634304047, "rewards/rejected": -0.039725758135318756, "step": 250 }, { "dpo_losses": 0.630215048789978, "epoch": 0.38, "grad_norm": 13.905695323383071, "learning_rate": 3.901509360874515e-06, "logits/chosen": -2.832815170288086, "logits/rejected": -2.8140697479248047, "logps/chosen": -197.61141967773438, "logps/rejected": -194.48846435546875, "loss": 0.6319, "positive_losses": 0.17327670753002167, "rewards/accuracies": 0.75, "rewards/chosen": 0.15451067686080933, "rewards/margins": 0.14662909507751465, "rewards/margins_max": 0.2917208671569824, "rewards/margins_min": 0.0015373497735708952, "rewards/margins_std": 0.2051907330751419, "rewards/rejected": 0.00788155198097229, "step": 260 }, { "dpo_losses": 0.6374494433403015, "epoch": 0.39, "grad_norm": 10.795361999407069, "learning_rate": 3.793974994315991e-06, "logits/chosen": -2.788649797439575, "logits/rejected": -2.7856884002685547, "logps/chosen": -167.8430938720703, "logps/rejected": -188.2590789794922, "loss": 0.6446, "positive_losses": 0.739077091217041, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.13068287074565887, "rewards/margins": 0.11988089978694916, "rewards/margins_max": 0.2015564739704132, "rewards/margins_min": 0.0382053516805172, "rewards/margins_std": 0.11550667136907578, "rewards/rejected": 0.010801966302096844, "step": 270 }, { "dpo_losses": 0.601833701133728, "epoch": 0.41, "grad_norm": 1.9286233539251623, "learning_rate": 3.68306479474137e-06, "logits/chosen": -3.023601531982422, "logits/rejected": -2.880030632019043, "logps/chosen": -352.0651550292969, "logps/rejected": -203.5298309326172, "loss": 0.6428, "positive_losses": 0.44579315185546875, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.21998050808906555, "rewards/margins": 0.2220487892627716, "rewards/margins_max": 0.3289792239665985, "rewards/margins_min": 0.11511830985546112, "rewards/margins_std": 0.15122249722480774, "rewards/rejected": -0.002068266272544861, "step": 280 }, { "dpo_losses": 0.6319935321807861, "epoch": 0.42, "grad_norm": 2.944032588280805, "learning_rate": 3.569068114197784e-06, "logits/chosen": -2.928798198699951, "logits/rejected": -2.8589279651641846, "logps/chosen": -206.49636840820312, "logps/rejected": -176.32420349121094, "loss": 0.6206, "positive_losses": 0.1757659912109375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.161897212266922, "rewards/margins": 0.13675031065940857, "rewards/margins_max": 0.2698245942592621, "rewards/margins_min": 0.003676059888675809, "rewards/margins_std": 0.18819543719291687, "rewards/rejected": 0.02514689229428768, "step": 290 }, { "dpo_losses": 0.5872830748558044, "epoch": 0.44, "grad_norm": 8.216053799669432, "learning_rate": 3.4522823570088073e-06, "logits/chosen": -2.855262279510498, "logits/rejected": -2.8310704231262207, "logps/chosen": -240.1536407470703, "logps/rejected": -225.31997680664062, "loss": 0.6493, "positive_losses": 0.42629069089889526, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.14640358090400696, "rewards/margins": 0.24259230494499207, "rewards/margins_max": 0.3872792422771454, "rewards/margins_min": 0.09790538251399994, "rewards/margins_std": 0.20461821556091309, "rewards/rejected": -0.0961887389421463, "step": 300 }, { "epoch": 0.44, "eval_dpo_losses": 0.6609499454498291, "eval_logits/chosen": -2.8105618953704834, "eval_logits/rejected": -2.7656409740448, "eval_logps/chosen": -268.21075439453125, "eval_logps/rejected": -249.68173217773438, "eval_loss": 0.7585510015487671, "eval_positive_losses": 0.735747218132019, "eval_rewards/accuracies": 0.6150793433189392, "eval_rewards/chosen": 0.17010442912578583, "eval_rewards/margins": 0.0750950425863266, "eval_rewards/margins_max": 0.2765759229660034, "eval_rewards/margins_min": -0.10647378861904144, "eval_rewards/margins_std": 0.17135697603225708, "eval_rewards/rejected": 0.09500937908887863, "eval_runtime": 283.2506, "eval_samples_per_second": 7.061, "eval_steps_per_second": 0.222, "step": 300 }, { "dpo_losses": 0.6002415418624878, "epoch": 0.45, "grad_norm": 15.987450446439036, "learning_rate": 3.333012203880528e-06, "logits/chosen": -2.9334702491760254, "logits/rejected": -2.8857316970825195, "logps/chosen": -221.07736206054688, "logps/rejected": -165.5961456298828, "loss": 0.6259, "positive_losses": 0.29423028230667114, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.20694272220134735, "rewards/margins": 0.22076299786567688, "rewards/margins_max": 0.40377649664878845, "rewards/margins_min": 0.03774946928024292, "rewards/margins_std": 0.25882020592689514, "rewards/rejected": -0.013820228166878223, "step": 310 }, { "dpo_losses": 0.5651403069496155, "epoch": 0.47, "grad_norm": 41.49020086210869, "learning_rate": 3.2115688170243735e-06, "logits/chosen": -2.916748523712158, "logits/rejected": -2.9150567054748535, "logps/chosen": -294.4731750488281, "logps/rejected": -308.61346435546875, "loss": 0.6257, "positive_losses": 0.09651489555835724, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2138613760471344, "rewards/margins": 0.30642035603523254, "rewards/margins_max": 0.4777792990207672, "rewards/margins_min": 0.13506139814853668, "rewards/margins_std": 0.24233810603618622, "rewards/rejected": -0.09255897253751755, "step": 320 }, { "dpo_losses": 0.6225263476371765, "epoch": 0.48, "grad_norm": 10.387168274872021, "learning_rate": 3.0882690283704355e-06, "logits/chosen": -2.800654649734497, "logits/rejected": -2.744640588760376, "logps/chosen": -230.0958709716797, "logps/rejected": -211.9070587158203, "loss": 0.6505, "positive_losses": 0.48747172951698303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.16328363120555878, "rewards/margins": 0.16463473439216614, "rewards/margins_max": 0.3351263403892517, "rewards/margins_min": -0.005856870673596859, "rewards/margins_std": 0.24111154675483704, "rewards/rejected": -0.001351101673208177, "step": 330 }, { "dpo_losses": 0.5763748288154602, "epoch": 0.5, "grad_norm": 11.15788842573243, "learning_rate": 2.9634345129891296e-06, "logits/chosen": -2.82387638092041, "logits/rejected": -2.72804594039917, "logps/chosen": -288.6628112792969, "logps/rejected": -272.2504577636719, "loss": 0.6348, "positive_losses": 0.106109619140625, "rewards/accuracies": 0.75, "rewards/chosen": 0.21110180020332336, "rewards/margins": 0.2855163514614105, "rewards/margins_max": 0.5410599708557129, "rewards/margins_min": 0.029972758144140244, "rewards/margins_std": 0.36139318346977234, "rewards/rejected": -0.07441455870866776, "step": 340 }, { "dpo_losses": 0.6027976870536804, "epoch": 0.51, "grad_norm": 15.405775577444263, "learning_rate": 2.8373909498776746e-06, "logits/chosen": -2.9628233909606934, "logits/rejected": -2.954535961151123, "logps/chosen": -264.20440673828125, "logps/rejected": -265.29058837890625, "loss": 0.6444, "positive_losses": 0.14543990790843964, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.18934306502342224, "rewards/margins": 0.20881037414073944, "rewards/margins_max": 0.3755717873573303, "rewards/margins_min": 0.04204897955060005, "rewards/margins_std": 0.2358362227678299, "rewards/rejected": -0.019467316567897797, "step": 350 }, { "dpo_losses": 0.5799789428710938, "epoch": 0.53, "grad_norm": 2.0183316768741295, "learning_rate": 2.710467172300768e-06, "logits/chosen": -2.8956894874572754, "logits/rejected": -2.8347909450531006, "logps/chosen": -329.68585205078125, "logps/rejected": -321.13037109375, "loss": 0.6196, "positive_losses": 0.2545227110385895, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.23316507041454315, "rewards/margins": 0.2625313103199005, "rewards/margins_max": 0.398255318403244, "rewards/margins_min": 0.12680727243423462, "rewards/margins_std": 0.19194276630878448, "rewards/rejected": -0.029366234317421913, "step": 360 }, { "dpo_losses": 0.5937001705169678, "epoch": 0.54, "grad_norm": 12.06051129169852, "learning_rate": 2.582994309902146e-06, "logits/chosen": -2.884967565536499, "logits/rejected": -2.743847370147705, "logps/chosen": -285.45123291015625, "logps/rejected": -247.1053466796875, "loss": 0.6401, "positive_losses": 0.0, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.1828571856021881, "rewards/margins": 0.22877776622772217, "rewards/margins_max": 0.3255845904350281, "rewards/margins_min": 0.13197092711925507, "rewards/margins_std": 0.13690553605556488, "rewards/rejected": -0.04592058062553406, "step": 370 }, { "dpo_losses": 0.595887303352356, "epoch": 0.56, "grad_norm": 3.3756204670545618, "learning_rate": 2.4553049248251512e-06, "logits/chosen": -2.788328170776367, "logits/rejected": -2.836977958679199, "logps/chosen": -214.0673065185547, "logps/rejected": -235.258544921875, "loss": 0.5859, "positive_losses": 0.17899170517921448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1966702789068222, "rewards/margins": 0.2272748053073883, "rewards/margins_max": 0.33700865507125854, "rewards/margins_min": 0.11754089593887329, "rewards/margins_std": 0.15518715977668762, "rewards/rejected": -0.03060450591146946, "step": 380 }, { "dpo_losses": 0.6092433333396912, "epoch": 0.57, "grad_norm": 2.5893672084668107, "learning_rate": 2.3277321440960733e-06, "logits/chosen": -2.9706382751464844, "logits/rejected": -2.9543769359588623, "logps/chosen": -256.9505615234375, "logps/rejected": -262.0420227050781, "loss": 0.6297, "positive_losses": 0.7152191400527954, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16275997459888458, "rewards/margins": 0.20270125567913055, "rewards/margins_max": 0.401836633682251, "rewards/margins_min": 0.0035658925771713257, "rewards/margins_std": 0.28161993622779846, "rewards/rejected": -0.039941295981407166, "step": 390 }, { "dpo_losses": 0.5526755452156067, "epoch": 0.58, "grad_norm": 21.63358202743805, "learning_rate": 2.20060879053377e-06, "logits/chosen": -2.8260955810546875, "logits/rejected": -2.7942147254943848, "logps/chosen": -184.72544860839844, "logps/rejected": -232.3735809326172, "loss": 0.6224, "positive_losses": 0.5248996019363403, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.16737225651741028, "rewards/margins": 0.3254551887512207, "rewards/margins_max": 0.506883442401886, "rewards/margins_min": 0.14402692019939423, "rewards/margins_std": 0.256578266620636, "rewards/rejected": -0.15808293223381042, "step": 400 }, { "epoch": 0.58, "eval_dpo_losses": 0.6528598070144653, "eval_logits/chosen": -2.819880723953247, "eval_logits/rejected": -2.7766764163970947, "eval_logps/chosen": -274.035888671875, "eval_logps/rejected": -258.0920715332031, "eval_loss": 0.9942816495895386, "eval_positive_losses": 3.3746726512908936, "eval_rewards/accuracies": 0.6388888955116272, "eval_rewards/chosen": 0.11185282468795776, "eval_rewards/margins": 0.10094699263572693, "eval_rewards/margins_max": 0.38355645537376404, "eval_rewards/margins_min": -0.1620650738477707, "eval_rewards/margins_std": 0.24338804185390472, "eval_rewards/rejected": 0.01090583112090826, "eval_runtime": 283.2899, "eval_samples_per_second": 7.06, "eval_steps_per_second": 0.222, "step": 400 }, { "dpo_losses": 0.531283438205719, "epoch": 0.6, "grad_norm": 11.192408672928963, "learning_rate": 2.0742665144529374e-06, "logits/chosen": -2.8772964477539062, "logits/rejected": -2.7996630668640137, "logps/chosen": -310.952392578125, "logps/rejected": -263.61370849609375, "loss": 0.6614, "positive_losses": 0.6912002563476562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3101710379123688, "rewards/margins": 0.4205913543701172, "rewards/margins_max": 0.6372691988945007, "rewards/margins_min": 0.20391342043876648, "rewards/margins_std": 0.30642884969711304, "rewards/rejected": -0.11042030900716782, "step": 410 }, { "dpo_losses": 0.5354982614517212, "epoch": 0.61, "grad_norm": 15.45856376263706, "learning_rate": 1.9490349284263036e-06, "logits/chosen": -2.8157570362091064, "logits/rejected": -2.758150577545166, "logps/chosen": -286.8282775878906, "logps/rejected": -253.0894775390625, "loss": 0.6483, "positive_losses": 0.7530021667480469, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2359578162431717, "rewards/margins": 0.3807125985622406, "rewards/margins_max": 0.5745470523834229, "rewards/margins_min": 0.18687808513641357, "rewards/margins_std": 0.27412334084510803, "rewards/rejected": -0.14475473761558533, "step": 420 }, { "dpo_losses": 0.5478672385215759, "epoch": 0.63, "grad_norm": 9.397460886758937, "learning_rate": 1.8252407473630606e-06, "logits/chosen": -2.981616735458374, "logits/rejected": -2.9860920906066895, "logps/chosen": -268.0840148925781, "logps/rejected": -284.7814025878906, "loss": 0.6187, "positive_losses": 0.08653469383716583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.19860908389091492, "rewards/margins": 0.35602104663848877, "rewards/margins_max": 0.5557164549827576, "rewards/margins_min": 0.15632562339305878, "rewards/margins_std": 0.28241199254989624, "rewards/rejected": -0.15741200745105743, "step": 430 }, { "dpo_losses": 0.5760594010353088, "epoch": 0.64, "grad_norm": 23.342372051867663, "learning_rate": 1.7032069361469765e-06, "logits/chosen": -2.775434732437134, "logits/rejected": -2.77290678024292, "logps/chosen": -213.5185089111328, "logps/rejected": -278.1703186035156, "loss": 0.6391, "positive_losses": 0.018738174811005592, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.19827936589717865, "rewards/margins": 0.2895718812942505, "rewards/margins_max": 0.5261528491973877, "rewards/margins_min": 0.05299092084169388, "rewards/margins_std": 0.3345760405063629, "rewards/rejected": -0.09129253774881363, "step": 440 }, { "dpo_losses": 0.5661223530769348, "epoch": 0.66, "grad_norm": 2.1591786937116098, "learning_rate": 1.5832518670578802e-06, "logits/chosen": -3.0003552436828613, "logits/rejected": -2.9453907012939453, "logps/chosen": -272.0791015625, "logps/rejected": -293.05645751953125, "loss": 0.6369, "positive_losses": 0.25814515352249146, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2058447152376175, "rewards/margins": 0.2981029152870178, "rewards/margins_max": 0.49641966819763184, "rewards/margins_min": 0.099786177277565, "rewards/margins_std": 0.28046220541000366, "rewards/rejected": -0.09225818514823914, "step": 450 }, { "dpo_losses": 0.5694680213928223, "epoch": 0.67, "grad_norm": 2.6035617787075602, "learning_rate": 1.4656884891747398e-06, "logits/chosen": -2.813758373260498, "logits/rejected": -2.838399887084961, "logps/chosen": -266.3056945800781, "logps/rejected": -263.6679382324219, "loss": 0.6275, "positive_losses": 0.17335128784179688, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.19134476780891418, "rewards/margins": 0.3021687865257263, "rewards/margins_max": 0.5000616312026978, "rewards/margins_min": 0.10427598655223846, "rewards/margins_std": 0.2798627018928528, "rewards/rejected": -0.11082406342029572, "step": 460 }, { "dpo_losses": 0.6458374261856079, "epoch": 0.69, "grad_norm": 2.5506219741141543, "learning_rate": 1.3508235119272466e-06, "logits/chosen": -2.8581721782684326, "logits/rejected": -2.8341779708862305, "logps/chosen": -254.4308319091797, "logps/rejected": -253.99722290039062, "loss": 0.6175, "positive_losses": 2.3088302612304688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.11066363006830215, "rewards/margins": 0.1266460120677948, "rewards/margins_max": 0.2916102409362793, "rewards/margins_min": -0.0383182056248188, "rewards/margins_std": 0.23329463601112366, "rewards/rejected": -0.01598239876329899, "step": 470 }, { "dpo_losses": 0.5230454206466675, "epoch": 0.7, "grad_norm": 24.43956168373256, "learning_rate": 1.238956604925934e-06, "logits/chosen": -2.7936887741088867, "logits/rejected": -2.7815768718719482, "logps/chosen": -241.1090850830078, "logps/rejected": -284.8046569824219, "loss": 0.6176, "positive_losses": 0.7508819699287415, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.23765699565410614, "rewards/margins": 0.422058641910553, "rewards/margins_max": 0.6369160413742065, "rewards/margins_min": 0.20720121264457703, "rewards/margins_std": 0.3038543164730072, "rewards/rejected": -0.18440163135528564, "step": 480 }, { "dpo_losses": 0.5921580195426941, "epoch": 0.72, "grad_norm": 9.14828905462399, "learning_rate": 1.1303796161583763e-06, "logits/chosen": -2.953059673309326, "logits/rejected": -2.935351848602295, "logps/chosen": -277.16998291015625, "logps/rejected": -322.24908447265625, "loss": 0.5856, "positive_losses": 0.24810238182544708, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.18392740190029144, "rewards/margins": 0.252055287361145, "rewards/margins_max": 0.42164430022239685, "rewards/margins_min": 0.08246620744466782, "rewards/margins_std": 0.23983514308929443, "rewards/rejected": -0.06812787055969238, "step": 490 }, { "dpo_losses": 0.5570933222770691, "epoch": 0.73, "grad_norm": 26.824091044275153, "learning_rate": 1.0253758105911169e-06, "logits/chosen": -2.970041275024414, "logits/rejected": -2.90769362449646, "logps/chosen": -304.5309143066406, "logps/rejected": -356.88641357421875, "loss": 0.5674, "positive_losses": 0.19576720893383026, "rewards/accuracies": 0.75, "rewards/chosen": 0.23450036346912384, "rewards/margins": 0.35680580139160156, "rewards/margins_max": 0.5582734942436218, "rewards/margins_min": 0.15533806383609772, "rewards/margins_std": 0.2849184274673462, "rewards/rejected": -0.12230543792247772, "step": 500 }, { "epoch": 0.73, "eval_dpo_losses": 0.6564862728118896, "eval_logits/chosen": -2.8376405239105225, "eval_logits/rejected": -2.793374538421631, "eval_logps/chosen": -278.80975341796875, "eval_logps/rejected": -262.5242004394531, "eval_loss": 1.1831417083740234, "eval_positive_losses": 5.736485481262207, "eval_rewards/accuracies": 0.6269841194152832, "eval_rewards/chosen": 0.06411468237638474, "eval_rewards/margins": 0.09753014147281647, "eval_rewards/margins_max": 0.41429367661476135, "eval_rewards/margins_min": -0.18840637803077698, "eval_rewards/margins_std": 0.27024951577186584, "eval_rewards/rejected": -0.033415455371141434, "eval_runtime": 283.1365, "eval_samples_per_second": 7.064, "eval_steps_per_second": 0.223, "step": 500 }, { "dpo_losses": 0.5341383218765259, "epoch": 0.75, "grad_norm": 2.630596915267566, "learning_rate": 9.24219131163705e-07, "logits/chosen": -2.8331634998321533, "logits/rejected": -2.8200929164886475, "logps/chosen": -288.0255126953125, "logps/rejected": -313.35601806640625, "loss": 0.6361, "positive_losses": 1.7603362798690796, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.21840150654315948, "rewards/margins": 0.4240838885307312, "rewards/margins_max": 0.7840636968612671, "rewards/margins_min": 0.06410404294729233, "rewards/margins_std": 0.509088397026062, "rewards/rejected": -0.2056823968887329, "step": 510 }, { "dpo_losses": 0.49555259943008423, "epoch": 0.76, "grad_norm": 13.21991755241966, "learning_rate": 8.271734841028553e-07, "logits/chosen": -2.72208833694458, "logits/rejected": -2.6312568187713623, "logps/chosen": -286.3044128417969, "logps/rejected": -246.79788208007812, "loss": 0.6459, "positive_losses": 0.1013515442609787, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.28137513995170593, "rewards/margins": 0.4984091818332672, "rewards/margins_max": 0.7110401391983032, "rewards/margins_min": 0.28577831387519836, "rewards/margins_std": 0.3007054626941681, "rewards/rejected": -0.21703402698040009, "step": 520 }, { "dpo_losses": 0.5471224188804626, "epoch": 0.77, "grad_norm": 7.226241019031729, "learning_rate": 7.344920504212244e-07, "logits/chosen": -2.9117705821990967, "logits/rejected": -2.8744723796844482, "logps/chosen": -206.4365997314453, "logps/rejected": -208.7721710205078, "loss": 0.6331, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.20657558739185333, "rewards/margins": 0.34465348720550537, "rewards/margins_max": 0.5002694725990295, "rewards/margins_min": 0.1890375316143036, "rewards/margins_std": 0.22007422149181366, "rewards/rejected": -0.13807791471481323, "step": 530 }, { "dpo_losses": 0.5800082087516785, "epoch": 0.79, "grad_norm": 48.796149530310025, "learning_rate": 6.464166253970672e-07, "logits/chosen": -2.8512115478515625, "logits/rejected": -2.8704025745391846, "logps/chosen": -307.69915771484375, "logps/rejected": -285.3216857910156, "loss": 0.5932, "positive_losses": 0.1961948424577713, "rewards/accuracies": 0.75, "rewards/chosen": 0.1846921294927597, "rewards/margins": 0.2769491672515869, "rewards/margins_max": 0.47591620683670044, "rewards/margins_min": 0.077982097864151, "rewards/margins_std": 0.28138190507888794, "rewards/rejected": -0.09225703775882721, "step": 540 }, { "dpo_losses": 0.5755653381347656, "epoch": 0.8, "grad_norm": 2.795078318527347, "learning_rate": 5.631769877579535e-07, "logits/chosen": -2.9338223934173584, "logits/rejected": -2.885927200317383, "logps/chosen": -237.5154266357422, "logps/rejected": -245.45254516601562, "loss": 0.6043, "positive_losses": 1.3403419256210327, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.13946720957756042, "rewards/margins": 0.28026267886161804, "rewards/margins_max": 0.4180312752723694, "rewards/margins_min": 0.1424940675497055, "rewards/margins_std": 0.19483418762683868, "rewards/rejected": -0.14079545438289642, "step": 550 }, { "dpo_losses": 0.5164459943771362, "epoch": 0.82, "grad_norm": 3.090907013989327, "learning_rate": 4.849903002143114e-07, "logits/chosen": -3.0459659099578857, "logits/rejected": -2.9688525199890137, "logps/chosen": -348.83740234375, "logps/rejected": -346.1819763183594, "loss": 0.5621, "positive_losses": 0.1016082763671875, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.23445944488048553, "rewards/margins": 0.4368625283241272, "rewards/margins_max": 0.6422832012176514, "rewards/margins_min": 0.23144181072711945, "rewards/margins_std": 0.2905087471008301, "rewards/rejected": -0.20240306854248047, "step": 560 }, { "dpo_losses": 0.5176469087600708, "epoch": 0.83, "grad_norm": 7.846420016024575, "learning_rate": 4.1206054290670537e-07, "logits/chosen": -2.894327163696289, "logits/rejected": -2.896099805831909, "logps/chosen": -234.593994140625, "logps/rejected": -323.6400146484375, "loss": 0.6136, "positive_losses": 1.2456893920898438, "rewards/accuracies": 0.75, "rewards/chosen": 0.17635245621204376, "rewards/margins": 0.4464770257472992, "rewards/margins_max": 0.6318201422691345, "rewards/margins_min": 0.2611338794231415, "rewards/margins_std": 0.2621147930622101, "rewards/rejected": -0.27012452483177185, "step": 570 }, { "dpo_losses": 0.619495689868927, "epoch": 0.85, "grad_norm": 47.00247250810323, "learning_rate": 3.44577981244944e-07, "logits/chosen": -2.9653825759887695, "logits/rejected": -2.980112314224243, "logps/chosen": -235.0575408935547, "logps/rejected": -250.17593383789062, "loss": 0.6852, "positive_losses": 1.9925556182861328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1124948263168335, "rewards/margins": 0.1802656650543213, "rewards/margins_max": 0.35439062118530273, "rewards/margins_min": 0.006140687968581915, "rewards/margins_std": 0.24624991416931152, "rewards/rejected": -0.06777085363864899, "step": 580 }, { "dpo_losses": 0.5588719248771667, "epoch": 0.86, "grad_norm": 4.545382225154878, "learning_rate": 2.827186695273482e-07, "logits/chosen": -3.0688188076019287, "logits/rejected": -2.9839656352996826, "logps/chosen": -374.6413269042969, "logps/rejected": -343.8844909667969, "loss": 0.6152, "positive_losses": 0.27655029296875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2746116518974304, "rewards/margins": 0.3599032461643219, "rewards/margins_max": 0.609930157661438, "rewards/margins_min": 0.10987631976604462, "rewards/margins_std": 0.3535914719104767, "rewards/rejected": -0.08529156446456909, "step": 590 }, { "dpo_losses": 0.5024896860122681, "epoch": 0.88, "grad_norm": 10.429875129983618, "learning_rate": 2.2664399163518786e-07, "logits/chosen": -2.891846179962158, "logits/rejected": -2.812058925628662, "logps/chosen": -292.8556213378906, "logps/rejected": -254.47628784179688, "loss": 0.5749, "positive_losses": 0.0, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.27775660157203674, "rewards/margins": 0.49465060234069824, "rewards/margins_max": 0.8027955293655396, "rewards/margins_min": 0.18650567531585693, "rewards/margins_std": 0.4357827305793762, "rewards/rejected": -0.2168940007686615, "step": 600 }, { "epoch": 0.88, "eval_dpo_losses": 0.6511540412902832, "eval_logits/chosen": -2.8279476165771484, "eval_logits/rejected": -2.7839229106903076, "eval_logps/chosen": -274.8698425292969, "eval_logps/rejected": -259.9163513183594, "eval_loss": 1.0992192029953003, "eval_positive_losses": 4.5979323387146, "eval_rewards/accuracies": 0.6190476417541504, "eval_rewards/chosen": 0.1035134568810463, "eval_rewards/margins": 0.11085036396980286, "eval_rewards/margins_max": 0.4368113875389099, "eval_rewards/margins_min": -0.18844377994537354, "eval_rewards/margins_std": 0.2790098786354065, "eval_rewards/rejected": -0.0073369028978049755, "eval_runtime": 283.0506, "eval_samples_per_second": 7.066, "eval_steps_per_second": 0.223, "step": 600 }, { "dpo_losses": 0.5908172726631165, "epoch": 0.89, "grad_norm": 8.899268268122263, "learning_rate": 1.7650024000056415e-07, "logits/chosen": -2.910013198852539, "logits/rejected": -2.8941874504089355, "logps/chosen": -202.37757873535156, "logps/rejected": -226.99508666992188, "loss": 0.6342, "positive_losses": 1.9391590356826782, "rewards/accuracies": 0.75, "rewards/chosen": 0.1791270673274994, "rewards/margins": 0.25547030568122864, "rewards/margins_max": 0.4546757638454437, "rewards/margins_min": 0.05626480653882027, "rewards/margins_std": 0.2817191183567047, "rewards/rejected": -0.07634319365024567, "step": 610 }, { "dpo_losses": 0.5537667870521545, "epoch": 0.91, "grad_norm": 27.10072020527372, "learning_rate": 1.324182339461544e-07, "logits/chosen": -2.87471342086792, "logits/rejected": -2.8323957920074463, "logps/chosen": -255.8001251220703, "logps/rejected": -216.3787841796875, "loss": 0.6711, "positive_losses": 0.1423923522233963, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.22555597126483917, "rewards/margins": 0.34654873609542847, "rewards/margins_max": 0.5156995058059692, "rewards/margins_min": 0.17739805579185486, "rewards/margins_std": 0.23921525478363037, "rewards/rejected": -0.12099279463291168, "step": 620 }, { "dpo_losses": 0.48896676301956177, "epoch": 0.92, "grad_norm": 2.6767258362110957, "learning_rate": 9.451297839253915e-08, "logits/chosen": -2.836315393447876, "logits/rejected": -2.7731261253356934, "logps/chosen": -304.1548156738281, "logps/rejected": -336.2890319824219, "loss": 0.6356, "positive_losses": 0.17710499465465546, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.31015846133232117, "rewards/margins": 0.5357273817062378, "rewards/margins_max": 0.8079883456230164, "rewards/margins_min": 0.263466477394104, "rewards/margins_std": 0.38503509759902954, "rewards/rejected": -0.2255689650774002, "step": 630 }, { "dpo_losses": 0.5176305174827576, "epoch": 0.94, "grad_norm": 18.280815132381182, "learning_rate": 6.288336382349463e-08, "logits/chosen": -2.8430378437042236, "logits/rejected": -2.7495901584625244, "logps/chosen": -359.60797119140625, "logps/rejected": -298.79827880859375, "loss": 0.5757, "positive_losses": 1.0459808111190796, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.24247094988822937, "rewards/margins": 0.44258061051368713, "rewards/margins_max": 0.6816617846488953, "rewards/margins_min": 0.20349940657615662, "rewards/margins_std": 0.33811187744140625, "rewards/rejected": -0.20010964572429657, "step": 640 }, { "dpo_losses": 0.5474685430526733, "epoch": 0.95, "grad_norm": 2.7207730538280863, "learning_rate": 3.761190829201067e-08, "logits/chosen": -2.8519351482391357, "logits/rejected": -2.805759906768799, "logps/chosen": -365.9830627441406, "logps/rejected": -278.6522521972656, "loss": 0.5966, "positive_losses": 0.05963592603802681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2809067666530609, "rewards/margins": 0.4035716652870178, "rewards/margins_max": 0.7429118156433105, "rewards/margins_min": 0.06423152983188629, "rewards/margins_std": 0.47989946603775024, "rewards/rejected": -0.12266488373279572, "step": 650 }, { "dpo_losses": 0.5853177309036255, "epoch": 0.96, "grad_norm": 40.690683079215134, "learning_rate": 1.876454214011253e-08, "logits/chosen": -2.9034006595611572, "logits/rejected": -2.8463714122772217, "logps/chosen": -243.8646697998047, "logps/rejected": -225.9236602783203, "loss": 0.6752, "positive_losses": 0.5306800603866577, "rewards/accuracies": 0.75, "rewards/chosen": 0.21680143475532532, "rewards/margins": 0.2727014422416687, "rewards/margins_max": 0.4745156764984131, "rewards/margins_min": 0.07088717073202133, "rewards/margins_std": 0.2854084372520447, "rewards/rejected": -0.05589999631047249, "step": 660 }, { "dpo_losses": 0.5261252522468567, "epoch": 0.98, "grad_norm": 8.821965322055776, "learning_rate": 6.390435994127753e-09, "logits/chosen": -2.8224921226501465, "logits/rejected": -2.8528945446014404, "logps/chosen": -284.4530944824219, "logps/rejected": -375.3456115722656, "loss": 0.6203, "positive_losses": 0.12337493896484375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2311427891254425, "rewards/margins": 0.4244818687438965, "rewards/margins_max": 0.6252471208572388, "rewards/margins_min": 0.2237166464328766, "rewards/margins_std": 0.2839249074459076, "rewards/rejected": -0.19333907961845398, "step": 670 }, { "dpo_losses": 0.5133123397827148, "epoch": 0.99, "grad_norm": 23.801829753390283, "learning_rate": 5.218724841346556e-10, "logits/chosen": -2.689457416534424, "logits/rejected": -2.6667604446411133, "logps/chosen": -358.9125061035156, "logps/rejected": -298.8949890136719, "loss": 0.6128, "positive_losses": 1.849574327468872, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.30895963311195374, "rewards/margins": 0.4615735113620758, "rewards/margins_max": 0.7685288190841675, "rewards/margins_min": 0.15461814403533936, "rewards/margins_std": 0.43410032987594604, "rewards/rejected": -0.15261384844779968, "step": 680 }, { "epoch": 1.0, "step": 684, "total_flos": 0.0, "train_loss": 0.641208895814349, "train_runtime": 6249.598, "train_samples_per_second": 1.751, "train_steps_per_second": 0.109 } ], "logging_steps": 10, "max_steps": 684, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }