just1nseo's picture
Model save
cde0a77 verified
raw
history blame contribute delete
No virus
57.5 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 684,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"dpo_losses": 0.6931471824645996,
"epoch": 0.0,
"grad_norm": 2.025402631880394,
"learning_rate": 7.246376811594204e-08,
"logits/chosen": -2.961127519607544,
"logits/rejected": -2.9461119174957275,
"logps/chosen": -261.90582275390625,
"logps/rejected": -270.03265380859375,
"loss": 0.6931,
"positive_losses": 0.0,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/margins_max": 0.0,
"rewards/margins_min": 0.0,
"rewards/margins_std": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"dpo_losses": 0.6932222843170166,
"epoch": 0.01,
"grad_norm": 1.892116368261662,
"learning_rate": 7.246376811594204e-07,
"logits/chosen": -2.875087022781372,
"logits/rejected": -2.855910062789917,
"logps/chosen": -217.50634765625,
"logps/rejected": -222.0803985595703,
"loss": 0.6974,
"positive_losses": 0.04892720282077789,
"rewards/accuracies": 0.4444444477558136,
"rewards/chosen": 0.00041001950739882886,
"rewards/margins": -0.0001489536080043763,
"rewards/margins_max": 0.0012003988958895206,
"rewards/margins_min": -0.0014983059372752905,
"rewards/margins_std": 0.0019082725048065186,
"rewards/rejected": 0.0005589731154032052,
"step": 10
},
{
"dpo_losses": 0.6928491592407227,
"epoch": 0.03,
"grad_norm": 10.889312446124245,
"learning_rate": 1.4492753623188408e-06,
"logits/chosen": -2.855677366256714,
"logits/rejected": -2.8727664947509766,
"logps/chosen": -228.65463256835938,
"logps/rejected": -176.28146362304688,
"loss": 0.695,
"positive_losses": 0.026834487915039062,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.0038786642253398895,
"rewards/margins": 0.0005991062498651445,
"rewards/margins_max": 0.0024178135208785534,
"rewards/margins_min": -0.001219600671902299,
"rewards/margins_std": 0.0025720400735735893,
"rewards/rejected": 0.0032795581500977278,
"step": 20
},
{
"dpo_losses": 0.691431999206543,
"epoch": 0.04,
"grad_norm": 2.0735563380689697,
"learning_rate": 2.173913043478261e-06,
"logits/chosen": -2.932262420654297,
"logits/rejected": -2.8772940635681152,
"logps/chosen": -258.99334716796875,
"logps/rejected": -237.83096313476562,
"loss": 0.692,
"positive_losses": 0.0,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.015487673692405224,
"rewards/margins": 0.0034467983059585094,
"rewards/margins_max": 0.008276171050965786,
"rewards/margins_min": -0.001382575137540698,
"rewards/margins_std": 0.006829765625298023,
"rewards/rejected": 0.012040875852108002,
"step": 30
},
{
"dpo_losses": 0.6876205205917358,
"epoch": 0.06,
"grad_norm": 1.9226295416634294,
"learning_rate": 2.8985507246376816e-06,
"logits/chosen": -2.8300986289978027,
"logits/rejected": -2.7832179069519043,
"logps/chosen": -325.06231689453125,
"logps/rejected": -363.68426513671875,
"loss": 0.6887,
"positive_losses": 0.0,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.028436576947569847,
"rewards/margins": 0.011200213804841042,
"rewards/margins_max": 0.022198233753442764,
"rewards/margins_min": 0.00020219237194396555,
"rewards/margins_std": 0.015553551726043224,
"rewards/rejected": 0.017236361280083656,
"step": 40
},
{
"dpo_losses": 0.6896201968193054,
"epoch": 0.07,
"grad_norm": 9.118712466857515,
"learning_rate": 3.6231884057971017e-06,
"logits/chosen": -2.895482301712036,
"logits/rejected": -2.8222224712371826,
"logps/chosen": -247.339111328125,
"logps/rejected": -244.00790405273438,
"loss": 0.6891,
"positive_losses": 0.0010955811012536287,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.03269472345709801,
"rewards/margins": 0.007144673261791468,
"rewards/margins_max": 0.017206599935889244,
"rewards/margins_min": -0.002917253179475665,
"rewards/margins_std": 0.014229713007807732,
"rewards/rejected": 0.025550048798322678,
"step": 50
},
{
"dpo_losses": 0.678604006767273,
"epoch": 0.09,
"grad_norm": 1.6865302230972352,
"learning_rate": 4.347826086956522e-06,
"logits/chosen": -3.02363920211792,
"logits/rejected": -2.9497618675231934,
"logps/chosen": -302.65142822265625,
"logps/rejected": -242.8329620361328,
"loss": 0.6829,
"positive_losses": 0.0033214569557458162,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.06167557090520859,
"rewards/margins": 0.02971900999546051,
"rewards/margins_max": 0.05635114759206772,
"rewards/margins_min": 0.003086873795837164,
"rewards/margins_std": 0.0376635305583477,
"rewards/rejected": 0.03195656090974808,
"step": 60
},
{
"dpo_losses": 0.6720460653305054,
"epoch": 0.1,
"grad_norm": 7.877876119832288,
"learning_rate": 4.999967381905813e-06,
"logits/chosen": -3.0418922901153564,
"logits/rejected": -2.9664931297302246,
"logps/chosen": -266.40692138671875,
"logps/rejected": -203.53610229492188,
"loss": 0.6792,
"positive_losses": 0.09538726508617401,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.05729568004608154,
"rewards/margins": 0.04375966638326645,
"rewards/margins_max": 0.05517454072833061,
"rewards/margins_min": 0.032344792038202286,
"rewards/margins_std": 0.0161430723965168,
"rewards/rejected": 0.013536013662815094,
"step": 70
},
{
"dpo_losses": 0.6691193580627441,
"epoch": 0.12,
"grad_norm": 9.317121958881922,
"learning_rate": 4.9960542403925095e-06,
"logits/chosen": -2.8223726749420166,
"logits/rejected": -2.7410635948181152,
"logps/chosen": -249.7649383544922,
"logps/rejected": -231.88986206054688,
"loss": 0.6716,
"positive_losses": 0.03403167799115181,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.07761463522911072,
"rewards/margins": 0.05084484815597534,
"rewards/margins_max": 0.10426272451877594,
"rewards/margins_min": -0.0025730193592607975,
"rewards/margins_std": 0.07554427534341812,
"rewards/rejected": 0.026769787073135376,
"step": 80
},
{
"dpo_losses": 0.6618136167526245,
"epoch": 0.13,
"grad_norm": 4.628284010129237,
"learning_rate": 4.98562917836165e-06,
"logits/chosen": -2.881012201309204,
"logits/rejected": -2.8446288108825684,
"logps/chosen": -254.67495727539062,
"logps/rejected": -195.2042694091797,
"loss": 0.6694,
"positive_losses": 0.05231323093175888,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.08632297068834305,
"rewards/margins": 0.06475184857845306,
"rewards/margins_max": 0.08643685281276703,
"rewards/margins_min": 0.0430668406188488,
"rewards/margins_std": 0.030667226761579514,
"rewards/rejected": 0.021571118384599686,
"step": 90
},
{
"dpo_losses": 0.6343039274215698,
"epoch": 0.15,
"grad_norm": 12.853627066309556,
"learning_rate": 4.968719393609757e-06,
"logits/chosen": -2.973792552947998,
"logits/rejected": -2.9188525676727295,
"logps/chosen": -364.15399169921875,
"logps/rejected": -228.38418579101562,
"loss": 0.6612,
"positive_losses": 0.0,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.1419464498758316,
"rewards/margins": 0.12534113228321075,
"rewards/margins_max": 0.19193264842033386,
"rewards/margins_min": 0.05874960869550705,
"rewards/margins_std": 0.09417462348937988,
"rewards/rejected": 0.016605319455266,
"step": 100
},
{
"epoch": 0.15,
"eval_dpo_losses": 0.6805257797241211,
"eval_logits/chosen": -2.8549489974975586,
"eval_logits/rejected": -2.8096561431884766,
"eval_logps/chosen": -277.0838928222656,
"eval_logps/rejected": -253.83172607421875,
"eval_loss": 0.7167356014251709,
"eval_positive_losses": 0.34227606654167175,
"eval_rewards/accuracies": 0.591269850730896,
"eval_rewards/chosen": 0.0813729390501976,
"eval_rewards/margins": 0.027863360941410065,
"eval_rewards/margins_max": 0.13411983847618103,
"eval_rewards/margins_min": -0.06980105489492416,
"eval_rewards/margins_std": 0.0907549038529396,
"eval_rewards/rejected": 0.05350957810878754,
"eval_runtime": 284.067,
"eval_samples_per_second": 7.041,
"eval_steps_per_second": 0.222,
"step": 100
},
{
"dpo_losses": 0.646482527256012,
"epoch": 0.16,
"grad_norm": 7.886066230281033,
"learning_rate": 4.9453690018345144e-06,
"logits/chosen": -2.90840482711792,
"logits/rejected": -2.870948553085327,
"logps/chosen": -332.84967041015625,
"logps/rejected": -298.0467529296875,
"loss": 0.6608,
"positive_losses": 0.0,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.11125056445598602,
"rewards/margins": 0.09774880856275558,
"rewards/margins_max": 0.12077156454324722,
"rewards/margins_min": 0.07472606003284454,
"rewards/margins_std": 0.03255908936262131,
"rewards/rejected": 0.01350175030529499,
"step": 110
},
{
"dpo_losses": 0.6566277742385864,
"epoch": 0.18,
"grad_norm": 1.913254174024933,
"learning_rate": 4.915638921541952e-06,
"logits/chosen": -2.8616955280303955,
"logits/rejected": -2.8522956371307373,
"logps/chosen": -277.81402587890625,
"logps/rejected": -261.228759765625,
"loss": 0.6473,
"positive_losses": 0.04311790317296982,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.0988793820142746,
"rewards/margins": 0.07747145742177963,
"rewards/margins_max": 0.14120900630950928,
"rewards/margins_min": 0.013733914121985435,
"rewards/margins_std": 0.0901385098695755,
"rewards/rejected": 0.02140791341662407,
"step": 120
},
{
"dpo_losses": 0.6377557516098022,
"epoch": 0.19,
"grad_norm": 12.904038948771282,
"learning_rate": 4.879606715117019e-06,
"logits/chosen": -2.95271897315979,
"logits/rejected": -2.8435444831848145,
"logps/chosen": -294.14813232421875,
"logps/rejected": -240.53207397460938,
"loss": 0.6576,
"positive_losses": 0.14576569199562073,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.12051650136709213,
"rewards/margins": 0.12185319513082504,
"rewards/margins_max": 0.16484162211418152,
"rewards/margins_min": 0.07886476814746857,
"rewards/margins_std": 0.06079481169581413,
"rewards/rejected": -0.0013366841012611985,
"step": 130
},
{
"dpo_losses": 0.640169620513916,
"epoch": 0.2,
"grad_norm": 16.18813154837036,
"learning_rate": 4.837366386472175e-06,
"logits/chosen": -3.0178513526916504,
"logits/rejected": -2.9269156455993652,
"logps/chosen": -279.7732238769531,
"logps/rejected": -236.86874389648438,
"loss": 0.6767,
"positive_losses": 0.29906386137008667,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.11848002672195435,
"rewards/margins": 0.11471493542194366,
"rewards/margins_max": 0.2023201882839203,
"rewards/margins_min": 0.02710966393351555,
"rewards/margins_std": 0.12389256060123444,
"rewards/rejected": 0.0037650964222848415,
"step": 140
},
{
"dpo_losses": 0.6285568475723267,
"epoch": 0.22,
"grad_norm": 6.5324825602261045,
"learning_rate": 4.789028135801919e-06,
"logits/chosen": -2.973548650741577,
"logits/rejected": -2.9277901649475098,
"logps/chosen": -281.6429443359375,
"logps/rejected": -295.3494873046875,
"loss": 0.8701,
"positive_losses": 0.2579152584075928,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.16488775610923767,
"rewards/margins": 0.14175625145435333,
"rewards/margins_max": 0.25869402289390564,
"rewards/margins_min": 0.02481846883893013,
"rewards/margins_std": 0.1653749793767929,
"rewards/rejected": 0.023131517693400383,
"step": 150
},
{
"dpo_losses": 0.6205035448074341,
"epoch": 0.23,
"grad_norm": 2.7066336555848176,
"learning_rate": 4.7347180720830635e-06,
"logits/chosen": -2.9675240516662598,
"logits/rejected": -2.804438829421997,
"logps/chosen": -316.6290283203125,
"logps/rejected": -282.70538330078125,
"loss": 0.6598,
"positive_losses": 0.0,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.1802542507648468,
"rewards/margins": 0.15980175137519836,
"rewards/margins_max": 0.272554486989975,
"rewards/margins_min": 0.047049008309841156,
"rewards/margins_std": 0.15945644676685333,
"rewards/rejected": 0.02045249193906784,
"step": 160
},
{
"dpo_losses": 0.6271503567695618,
"epoch": 0.25,
"grad_norm": 1.8801745537623247,
"learning_rate": 4.674577884070811e-06,
"logits/chosen": -2.929814577102661,
"logits/rejected": -2.8838062286376953,
"logps/chosen": -308.89947509765625,
"logps/rejected": -252.41775512695312,
"loss": 0.6279,
"positive_losses": 0.5797370672225952,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.1764679253101349,
"rewards/margins": 0.14800240099430084,
"rewards/margins_max": 0.237053781747818,
"rewards/margins_min": 0.058951012790203094,
"rewards/margins_std": 0.12593765556812286,
"rewards/rejected": 0.02846553362905979,
"step": 170
},
{
"dpo_losses": 0.6267830729484558,
"epoch": 0.26,
"grad_norm": 2.1466440934520645,
"learning_rate": 4.608764470648971e-06,
"logits/chosen": -2.9405906200408936,
"logits/rejected": -2.8729405403137207,
"logps/chosen": -290.3728942871094,
"logps/rejected": -330.1247863769531,
"loss": 0.6545,
"positive_losses": 0.3962584435939789,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.17637397348880768,
"rewards/margins": 0.14918141067028046,
"rewards/margins_max": 0.28907179832458496,
"rewards/margins_min": 0.00929100438952446,
"rewards/margins_std": 0.19783492386341095,
"rewards/rejected": 0.027192572131752968,
"step": 180
},
{
"dpo_losses": 0.5917172431945801,
"epoch": 0.28,
"grad_norm": 2.287767271676791,
"learning_rate": 4.5374495314986874e-06,
"logits/chosen": -2.7434186935424805,
"logits/rejected": -2.7498998641967773,
"logps/chosen": -312.6622619628906,
"logps/rejected": -238.92544555664062,
"loss": 0.6419,
"positive_losses": 0.6725692749023438,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.21900752186775208,
"rewards/margins": 0.22816917300224304,
"rewards/margins_max": 0.34974128007888794,
"rewards/margins_min": 0.10659710317850113,
"rewards/margins_std": 0.17192888259887695,
"rewards/rejected": -0.00916165579110384,
"step": 190
},
{
"dpo_losses": 0.6394025087356567,
"epoch": 0.29,
"grad_norm": 10.012997763234694,
"learning_rate": 4.460819119153574e-06,
"logits/chosen": -2.8694872856140137,
"logits/rejected": -2.831519603729248,
"logps/chosen": -254.72994995117188,
"logps/rejected": -285.6749572753906,
"loss": 0.6652,
"positive_losses": 0.53265380859375,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.1096222847700119,
"rewards/margins": 0.11550626903772354,
"rewards/margins_max": 0.21246998012065887,
"rewards/margins_min": 0.018542537465691566,
"rewards/margins_std": 0.13712741434574127,
"rewards/rejected": -0.005883966572582722,
"step": 200
},
{
"epoch": 0.29,
"eval_dpo_losses": 0.6683889627456665,
"eval_logits/chosen": -2.804701566696167,
"eval_logits/rejected": -2.761190176010132,
"eval_logps/chosen": -270.78045654296875,
"eval_logps/rejected": -250.50535583496094,
"eval_loss": 0.7424377202987671,
"eval_positive_losses": 0.6157510280609131,
"eval_rewards/accuracies": 0.6071428656578064,
"eval_rewards/chosen": 0.14440776407718658,
"eval_rewards/margins": 0.05763502046465874,
"eval_rewards/margins_max": 0.23871682584285736,
"eval_rewards/margins_min": -0.10859239846467972,
"eval_rewards/margins_std": 0.15644660592079163,
"eval_rewards/rejected": 0.08677274733781815,
"eval_runtime": 283.023,
"eval_samples_per_second": 7.067,
"eval_steps_per_second": 0.223,
"step": 200
},
{
"dpo_losses": 0.6189439296722412,
"epoch": 0.31,
"grad_norm": 6.2964509201178664,
"learning_rate": 4.379073153609896e-06,
"logits/chosen": -2.980807304382324,
"logits/rejected": -2.9285852909088135,
"logps/chosen": -310.4910583496094,
"logps/rejected": -283.2040100097656,
"loss": 0.66,
"positive_losses": 0.0,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.17465968430042267,
"rewards/margins": 0.16720545291900635,
"rewards/margins_max": 0.2678540349006653,
"rewards/margins_min": 0.0665569081902504,
"rewards/margins_std": 0.14233854413032532,
"rewards/rejected": 0.007454232778400183,
"step": 210
},
{
"dpo_losses": 0.6231792569160461,
"epoch": 0.32,
"grad_norm": 1.9190071231894708,
"learning_rate": 4.292424900758129e-06,
"logits/chosen": -2.8317534923553467,
"logits/rejected": -2.745687961578369,
"logps/chosen": -241.54098510742188,
"logps/rejected": -261.16180419921875,
"loss": 0.6416,
"positive_losses": 0.0,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.17161966860294342,
"rewards/margins": 0.15514414012432098,
"rewards/margins_max": 0.2861190140247345,
"rewards/margins_min": 0.024169281125068665,
"rewards/margins_std": 0.1852264106273651,
"rewards/rejected": 0.016475532203912735,
"step": 220
},
{
"dpo_losses": 0.6433408260345459,
"epoch": 0.34,
"grad_norm": 2.0868151243656565,
"learning_rate": 4.201100415996598e-06,
"logits/chosen": -2.727468490600586,
"logits/rejected": -2.6615824699401855,
"logps/chosen": -234.8723602294922,
"logps/rejected": -252.6638641357422,
"loss": 0.6425,
"positive_losses": 0.0,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.1352681964635849,
"rewards/margins": 0.10586099326610565,
"rewards/margins_max": 0.18672436475753784,
"rewards/margins_min": 0.024997618049383163,
"rewards/margins_std": 0.11435806751251221,
"rewards/rejected": 0.029407206922769547,
"step": 230
},
{
"dpo_losses": 0.5827513933181763,
"epoch": 0.35,
"grad_norm": 8.16859688297942,
"learning_rate": 4.105337954478756e-06,
"logits/chosen": -2.9261674880981445,
"logits/rejected": -2.7930915355682373,
"logps/chosen": -369.94781494140625,
"logps/rejected": -238.9222869873047,
"loss": 0.6532,
"positive_losses": 0.4402889311313629,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.21584495902061462,
"rewards/margins": 0.2518808841705322,
"rewards/margins_max": 0.37274229526519775,
"rewards/margins_min": 0.1310194730758667,
"rewards/margins_std": 0.1709238588809967,
"rewards/rejected": -0.0360359326004982,
"step": 240
},
{
"dpo_losses": 0.6088570356369019,
"epoch": 0.37,
"grad_norm": 7.483775073000372,
"learning_rate": 4.005387349532697e-06,
"logits/chosen": -2.9306282997131348,
"logits/rejected": -2.87394642829895,
"logps/chosen": -293.2420654296875,
"logps/rejected": -272.403076171875,
"loss": 0.6241,
"positive_losses": 0.6371370553970337,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.14793848991394043,
"rewards/margins": 0.18766427040100098,
"rewards/margins_max": 0.2927020192146301,
"rewards/margins_min": 0.08262647688388824,
"rewards/margins_std": 0.14854584634304047,
"rewards/rejected": -0.039725758135318756,
"step": 250
},
{
"dpo_losses": 0.630215048789978,
"epoch": 0.38,
"grad_norm": 13.905695323383071,
"learning_rate": 3.901509360874515e-06,
"logits/chosen": -2.832815170288086,
"logits/rejected": -2.8140697479248047,
"logps/chosen": -197.61141967773438,
"logps/rejected": -194.48846435546875,
"loss": 0.6319,
"positive_losses": 0.17327670753002167,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.15451067686080933,
"rewards/margins": 0.14662909507751465,
"rewards/margins_max": 0.2917208671569824,
"rewards/margins_min": 0.0015373497735708952,
"rewards/margins_std": 0.2051907330751419,
"rewards/rejected": 0.00788155198097229,
"step": 260
},
{
"dpo_losses": 0.6374494433403015,
"epoch": 0.39,
"grad_norm": 10.795361999407069,
"learning_rate": 3.793974994315991e-06,
"logits/chosen": -2.788649797439575,
"logits/rejected": -2.7856884002685547,
"logps/chosen": -167.8430938720703,
"logps/rejected": -188.2590789794922,
"loss": 0.6446,
"positive_losses": 0.739077091217041,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.13068287074565887,
"rewards/margins": 0.11988089978694916,
"rewards/margins_max": 0.2015564739704132,
"rewards/margins_min": 0.0382053516805172,
"rewards/margins_std": 0.11550667136907578,
"rewards/rejected": 0.010801966302096844,
"step": 270
},
{
"dpo_losses": 0.601833701133728,
"epoch": 0.41,
"grad_norm": 1.9286233539251623,
"learning_rate": 3.68306479474137e-06,
"logits/chosen": -3.023601531982422,
"logits/rejected": -2.880030632019043,
"logps/chosen": -352.0651550292969,
"logps/rejected": -203.5298309326172,
"loss": 0.6428,
"positive_losses": 0.44579315185546875,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.21998050808906555,
"rewards/margins": 0.2220487892627716,
"rewards/margins_max": 0.3289792239665985,
"rewards/margins_min": 0.11511830985546112,
"rewards/margins_std": 0.15122249722480774,
"rewards/rejected": -0.002068266272544861,
"step": 280
},
{
"dpo_losses": 0.6319935321807861,
"epoch": 0.42,
"grad_norm": 2.944032588280805,
"learning_rate": 3.569068114197784e-06,
"logits/chosen": -2.928798198699951,
"logits/rejected": -2.8589279651641846,
"logps/chosen": -206.49636840820312,
"logps/rejected": -176.32420349121094,
"loss": 0.6206,
"positive_losses": 0.1757659912109375,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.161897212266922,
"rewards/margins": 0.13675031065940857,
"rewards/margins_max": 0.2698245942592621,
"rewards/margins_min": 0.003676059888675809,
"rewards/margins_std": 0.18819543719291687,
"rewards/rejected": 0.02514689229428768,
"step": 290
},
{
"dpo_losses": 0.5872830748558044,
"epoch": 0.44,
"grad_norm": 8.216053799669432,
"learning_rate": 3.4522823570088073e-06,
"logits/chosen": -2.855262279510498,
"logits/rejected": -2.8310704231262207,
"logps/chosen": -240.1536407470703,
"logps/rejected": -225.31997680664062,
"loss": 0.6493,
"positive_losses": 0.42629069089889526,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.14640358090400696,
"rewards/margins": 0.24259230494499207,
"rewards/margins_max": 0.3872792422771454,
"rewards/margins_min": 0.09790538251399994,
"rewards/margins_std": 0.20461821556091309,
"rewards/rejected": -0.0961887389421463,
"step": 300
},
{
"epoch": 0.44,
"eval_dpo_losses": 0.6609499454498291,
"eval_logits/chosen": -2.8105618953704834,
"eval_logits/rejected": -2.7656409740448,
"eval_logps/chosen": -268.21075439453125,
"eval_logps/rejected": -249.68173217773438,
"eval_loss": 0.7585510015487671,
"eval_positive_losses": 0.735747218132019,
"eval_rewards/accuracies": 0.6150793433189392,
"eval_rewards/chosen": 0.17010442912578583,
"eval_rewards/margins": 0.0750950425863266,
"eval_rewards/margins_max": 0.2765759229660034,
"eval_rewards/margins_min": -0.10647378861904144,
"eval_rewards/margins_std": 0.17135697603225708,
"eval_rewards/rejected": 0.09500937908887863,
"eval_runtime": 283.2506,
"eval_samples_per_second": 7.061,
"eval_steps_per_second": 0.222,
"step": 300
},
{
"dpo_losses": 0.6002415418624878,
"epoch": 0.45,
"grad_norm": 15.987450446439036,
"learning_rate": 3.333012203880528e-06,
"logits/chosen": -2.9334702491760254,
"logits/rejected": -2.8857316970825195,
"logps/chosen": -221.07736206054688,
"logps/rejected": -165.5961456298828,
"loss": 0.6259,
"positive_losses": 0.29423028230667114,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.20694272220134735,
"rewards/margins": 0.22076299786567688,
"rewards/margins_max": 0.40377649664878845,
"rewards/margins_min": 0.03774946928024292,
"rewards/margins_std": 0.25882020592689514,
"rewards/rejected": -0.013820228166878223,
"step": 310
},
{
"dpo_losses": 0.5651403069496155,
"epoch": 0.47,
"grad_norm": 41.49020086210869,
"learning_rate": 3.2115688170243735e-06,
"logits/chosen": -2.916748523712158,
"logits/rejected": -2.9150567054748535,
"logps/chosen": -294.4731750488281,
"logps/rejected": -308.61346435546875,
"loss": 0.6257,
"positive_losses": 0.09651489555835724,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.2138613760471344,
"rewards/margins": 0.30642035603523254,
"rewards/margins_max": 0.4777792990207672,
"rewards/margins_min": 0.13506139814853668,
"rewards/margins_std": 0.24233810603618622,
"rewards/rejected": -0.09255897253751755,
"step": 320
},
{
"dpo_losses": 0.6225263476371765,
"epoch": 0.48,
"grad_norm": 10.387168274872021,
"learning_rate": 3.0882690283704355e-06,
"logits/chosen": -2.800654649734497,
"logits/rejected": -2.744640588760376,
"logps/chosen": -230.0958709716797,
"logps/rejected": -211.9070587158203,
"loss": 0.6505,
"positive_losses": 0.48747172951698303,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.16328363120555878,
"rewards/margins": 0.16463473439216614,
"rewards/margins_max": 0.3351263403892517,
"rewards/margins_min": -0.005856870673596859,
"rewards/margins_std": 0.24111154675483704,
"rewards/rejected": -0.001351101673208177,
"step": 330
},
{
"dpo_losses": 0.5763748288154602,
"epoch": 0.5,
"grad_norm": 11.15788842573243,
"learning_rate": 2.9634345129891296e-06,
"logits/chosen": -2.82387638092041,
"logits/rejected": -2.72804594039917,
"logps/chosen": -288.6628112792969,
"logps/rejected": -272.2504577636719,
"loss": 0.6348,
"positive_losses": 0.106109619140625,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.21110180020332336,
"rewards/margins": 0.2855163514614105,
"rewards/margins_max": 0.5410599708557129,
"rewards/margins_min": 0.029972758144140244,
"rewards/margins_std": 0.36139318346977234,
"rewards/rejected": -0.07441455870866776,
"step": 340
},
{
"dpo_losses": 0.6027976870536804,
"epoch": 0.51,
"grad_norm": 15.405775577444263,
"learning_rate": 2.8373909498776746e-06,
"logits/chosen": -2.9628233909606934,
"logits/rejected": -2.954535961151123,
"logps/chosen": -264.20440673828125,
"logps/rejected": -265.29058837890625,
"loss": 0.6444,
"positive_losses": 0.14543990790843964,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.18934306502342224,
"rewards/margins": 0.20881037414073944,
"rewards/margins_max": 0.3755717873573303,
"rewards/margins_min": 0.04204897955060005,
"rewards/margins_std": 0.2358362227678299,
"rewards/rejected": -0.019467316567897797,
"step": 350
},
{
"dpo_losses": 0.5799789428710938,
"epoch": 0.53,
"grad_norm": 2.0183316768741295,
"learning_rate": 2.710467172300768e-06,
"logits/chosen": -2.8956894874572754,
"logits/rejected": -2.8347909450531006,
"logps/chosen": -329.68585205078125,
"logps/rejected": -321.13037109375,
"loss": 0.6196,
"positive_losses": 0.2545227110385895,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.23316507041454315,
"rewards/margins": 0.2625313103199005,
"rewards/margins_max": 0.398255318403244,
"rewards/margins_min": 0.12680727243423462,
"rewards/margins_std": 0.19194276630878448,
"rewards/rejected": -0.029366234317421913,
"step": 360
},
{
"dpo_losses": 0.5937001705169678,
"epoch": 0.54,
"grad_norm": 12.06051129169852,
"learning_rate": 2.582994309902146e-06,
"logits/chosen": -2.884967565536499,
"logits/rejected": -2.743847370147705,
"logps/chosen": -285.45123291015625,
"logps/rejected": -247.1053466796875,
"loss": 0.6401,
"positive_losses": 0.0,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.1828571856021881,
"rewards/margins": 0.22877776622772217,
"rewards/margins_max": 0.3255845904350281,
"rewards/margins_min": 0.13197092711925507,
"rewards/margins_std": 0.13690553605556488,
"rewards/rejected": -0.04592058062553406,
"step": 370
},
{
"dpo_losses": 0.595887303352356,
"epoch": 0.56,
"grad_norm": 3.3756204670545618,
"learning_rate": 2.4553049248251512e-06,
"logits/chosen": -2.788328170776367,
"logits/rejected": -2.836977958679199,
"logps/chosen": -214.0673065185547,
"logps/rejected": -235.258544921875,
"loss": 0.5859,
"positive_losses": 0.17899170517921448,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.1966702789068222,
"rewards/margins": 0.2272748053073883,
"rewards/margins_max": 0.33700865507125854,
"rewards/margins_min": 0.11754089593887329,
"rewards/margins_std": 0.15518715977668762,
"rewards/rejected": -0.03060450591146946,
"step": 380
},
{
"dpo_losses": 0.6092433333396912,
"epoch": 0.57,
"grad_norm": 2.5893672084668107,
"learning_rate": 2.3277321440960733e-06,
"logits/chosen": -2.9706382751464844,
"logits/rejected": -2.9543769359588623,
"logps/chosen": -256.9505615234375,
"logps/rejected": -262.0420227050781,
"loss": 0.6297,
"positive_losses": 0.7152191400527954,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.16275997459888458,
"rewards/margins": 0.20270125567913055,
"rewards/margins_max": 0.401836633682251,
"rewards/margins_min": 0.0035658925771713257,
"rewards/margins_std": 0.28161993622779846,
"rewards/rejected": -0.039941295981407166,
"step": 390
},
{
"dpo_losses": 0.5526755452156067,
"epoch": 0.58,
"grad_norm": 21.63358202743805,
"learning_rate": 2.20060879053377e-06,
"logits/chosen": -2.8260955810546875,
"logits/rejected": -2.7942147254943848,
"logps/chosen": -184.72544860839844,
"logps/rejected": -232.3735809326172,
"loss": 0.6224,
"positive_losses": 0.5248996019363403,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.16737225651741028,
"rewards/margins": 0.3254551887512207,
"rewards/margins_max": 0.506883442401886,
"rewards/margins_min": 0.14402692019939423,
"rewards/margins_std": 0.256578266620636,
"rewards/rejected": -0.15808293223381042,
"step": 400
},
{
"epoch": 0.58,
"eval_dpo_losses": 0.6528598070144653,
"eval_logits/chosen": -2.819880723953247,
"eval_logits/rejected": -2.7766764163970947,
"eval_logps/chosen": -274.035888671875,
"eval_logps/rejected": -258.0920715332031,
"eval_loss": 0.9942816495895386,
"eval_positive_losses": 3.3746726512908936,
"eval_rewards/accuracies": 0.6388888955116272,
"eval_rewards/chosen": 0.11185282468795776,
"eval_rewards/margins": 0.10094699263572693,
"eval_rewards/margins_max": 0.38355645537376404,
"eval_rewards/margins_min": -0.1620650738477707,
"eval_rewards/margins_std": 0.24338804185390472,
"eval_rewards/rejected": 0.01090583112090826,
"eval_runtime": 283.2899,
"eval_samples_per_second": 7.06,
"eval_steps_per_second": 0.222,
"step": 400
},
{
"dpo_losses": 0.531283438205719,
"epoch": 0.6,
"grad_norm": 11.192408672928963,
"learning_rate": 2.0742665144529374e-06,
"logits/chosen": -2.8772964477539062,
"logits/rejected": -2.7996630668640137,
"logps/chosen": -310.952392578125,
"logps/rejected": -263.61370849609375,
"loss": 0.6614,
"positive_losses": 0.6912002563476562,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.3101710379123688,
"rewards/margins": 0.4205913543701172,
"rewards/margins_max": 0.6372691988945007,
"rewards/margins_min": 0.20391342043876648,
"rewards/margins_std": 0.30642884969711304,
"rewards/rejected": -0.11042030900716782,
"step": 410
},
{
"dpo_losses": 0.5354982614517212,
"epoch": 0.61,
"grad_norm": 15.45856376263706,
"learning_rate": 1.9490349284263036e-06,
"logits/chosen": -2.8157570362091064,
"logits/rejected": -2.758150577545166,
"logps/chosen": -286.8282775878906,
"logps/rejected": -253.0894775390625,
"loss": 0.6483,
"positive_losses": 0.7530021667480469,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.2359578162431717,
"rewards/margins": 0.3807125985622406,
"rewards/margins_max": 0.5745470523834229,
"rewards/margins_min": 0.18687808513641357,
"rewards/margins_std": 0.27412334084510803,
"rewards/rejected": -0.14475473761558533,
"step": 420
},
{
"dpo_losses": 0.5478672385215759,
"epoch": 0.63,
"grad_norm": 9.397460886758937,
"learning_rate": 1.8252407473630606e-06,
"logits/chosen": -2.981616735458374,
"logits/rejected": -2.9860920906066895,
"logps/chosen": -268.0840148925781,
"logps/rejected": -284.7814025878906,
"loss": 0.6187,
"positive_losses": 0.08653469383716583,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.19860908389091492,
"rewards/margins": 0.35602104663848877,
"rewards/margins_max": 0.5557164549827576,
"rewards/margins_min": 0.15632562339305878,
"rewards/margins_std": 0.28241199254989624,
"rewards/rejected": -0.15741200745105743,
"step": 430
},
{
"dpo_losses": 0.5760594010353088,
"epoch": 0.64,
"grad_norm": 23.342372051867663,
"learning_rate": 1.7032069361469765e-06,
"logits/chosen": -2.775434732437134,
"logits/rejected": -2.77290678024292,
"logps/chosen": -213.5185089111328,
"logps/rejected": -278.1703186035156,
"loss": 0.6391,
"positive_losses": 0.018738174811005592,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.19827936589717865,
"rewards/margins": 0.2895718812942505,
"rewards/margins_max": 0.5261528491973877,
"rewards/margins_min": 0.05299092084169388,
"rewards/margins_std": 0.3345760405063629,
"rewards/rejected": -0.09129253774881363,
"step": 440
},
{
"dpo_losses": 0.5661223530769348,
"epoch": 0.66,
"grad_norm": 2.1591786937116098,
"learning_rate": 1.5832518670578802e-06,
"logits/chosen": -3.0003552436828613,
"logits/rejected": -2.9453907012939453,
"logps/chosen": -272.0791015625,
"logps/rejected": -293.05645751953125,
"loss": 0.6369,
"positive_losses": 0.25814515352249146,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.2058447152376175,
"rewards/margins": 0.2981029152870178,
"rewards/margins_max": 0.49641966819763184,
"rewards/margins_min": 0.099786177277565,
"rewards/margins_std": 0.28046220541000366,
"rewards/rejected": -0.09225818514823914,
"step": 450
},
{
"dpo_losses": 0.5694680213928223,
"epoch": 0.67,
"grad_norm": 2.6035617787075602,
"learning_rate": 1.4656884891747398e-06,
"logits/chosen": -2.813758373260498,
"logits/rejected": -2.838399887084961,
"logps/chosen": -266.3056945800781,
"logps/rejected": -263.6679382324219,
"loss": 0.6275,
"positive_losses": 0.17335128784179688,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.19134476780891418,
"rewards/margins": 0.3021687865257263,
"rewards/margins_max": 0.5000616312026978,
"rewards/margins_min": 0.10427598655223846,
"rewards/margins_std": 0.2798627018928528,
"rewards/rejected": -0.11082406342029572,
"step": 460
},
{
"dpo_losses": 0.6458374261856079,
"epoch": 0.69,
"grad_norm": 2.5506219741141543,
"learning_rate": 1.3508235119272466e-06,
"logits/chosen": -2.8581721782684326,
"logits/rejected": -2.8341779708862305,
"logps/chosen": -254.4308319091797,
"logps/rejected": -253.99722290039062,
"loss": 0.6175,
"positive_losses": 2.3088302612304688,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.11066363006830215,
"rewards/margins": 0.1266460120677948,
"rewards/margins_max": 0.2916102409362793,
"rewards/margins_min": -0.0383182056248188,
"rewards/margins_std": 0.23329463601112366,
"rewards/rejected": -0.01598239876329899,
"step": 470
},
{
"dpo_losses": 0.5230454206466675,
"epoch": 0.7,
"grad_norm": 24.43956168373256,
"learning_rate": 1.238956604925934e-06,
"logits/chosen": -2.7936887741088867,
"logits/rejected": -2.7815768718719482,
"logps/chosen": -241.1090850830078,
"logps/rejected": -284.8046569824219,
"loss": 0.6176,
"positive_losses": 0.7508819699287415,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.23765699565410614,
"rewards/margins": 0.422058641910553,
"rewards/margins_max": 0.6369160413742065,
"rewards/margins_min": 0.20720121264457703,
"rewards/margins_std": 0.3038543164730072,
"rewards/rejected": -0.18440163135528564,
"step": 480
},
{
"dpo_losses": 0.5921580195426941,
"epoch": 0.72,
"grad_norm": 9.14828905462399,
"learning_rate": 1.1303796161583763e-06,
"logits/chosen": -2.953059673309326,
"logits/rejected": -2.935351848602295,
"logps/chosen": -277.16998291015625,
"logps/rejected": -322.24908447265625,
"loss": 0.5856,
"positive_losses": 0.24810238182544708,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.18392740190029144,
"rewards/margins": 0.252055287361145,
"rewards/margins_max": 0.42164430022239685,
"rewards/margins_min": 0.08246620744466782,
"rewards/margins_std": 0.23983514308929443,
"rewards/rejected": -0.06812787055969238,
"step": 490
},
{
"dpo_losses": 0.5570933222770691,
"epoch": 0.73,
"grad_norm": 26.824091044275153,
"learning_rate": 1.0253758105911169e-06,
"logits/chosen": -2.970041275024414,
"logits/rejected": -2.90769362449646,
"logps/chosen": -304.5309143066406,
"logps/rejected": -356.88641357421875,
"loss": 0.5674,
"positive_losses": 0.19576720893383026,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.23450036346912384,
"rewards/margins": 0.35680580139160156,
"rewards/margins_max": 0.5582734942436218,
"rewards/margins_min": 0.15533806383609772,
"rewards/margins_std": 0.2849184274673462,
"rewards/rejected": -0.12230543792247772,
"step": 500
},
{
"epoch": 0.73,
"eval_dpo_losses": 0.6564862728118896,
"eval_logits/chosen": -2.8376405239105225,
"eval_logits/rejected": -2.793374538421631,
"eval_logps/chosen": -278.80975341796875,
"eval_logps/rejected": -262.5242004394531,
"eval_loss": 1.1831417083740234,
"eval_positive_losses": 5.736485481262207,
"eval_rewards/accuracies": 0.6269841194152832,
"eval_rewards/chosen": 0.06411468237638474,
"eval_rewards/margins": 0.09753014147281647,
"eval_rewards/margins_max": 0.41429367661476135,
"eval_rewards/margins_min": -0.18840637803077698,
"eval_rewards/margins_std": 0.27024951577186584,
"eval_rewards/rejected": -0.033415455371141434,
"eval_runtime": 283.1365,
"eval_samples_per_second": 7.064,
"eval_steps_per_second": 0.223,
"step": 500
},
{
"dpo_losses": 0.5341383218765259,
"epoch": 0.75,
"grad_norm": 2.630596915267566,
"learning_rate": 9.24219131163705e-07,
"logits/chosen": -2.8331634998321533,
"logits/rejected": -2.8200929164886475,
"logps/chosen": -288.0255126953125,
"logps/rejected": -313.35601806640625,
"loss": 0.6361,
"positive_losses": 1.7603362798690796,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.21840150654315948,
"rewards/margins": 0.4240838885307312,
"rewards/margins_max": 0.7840636968612671,
"rewards/margins_min": 0.06410404294729233,
"rewards/margins_std": 0.509088397026062,
"rewards/rejected": -0.2056823968887329,
"step": 510
},
{
"dpo_losses": 0.49555259943008423,
"epoch": 0.76,
"grad_norm": 13.21991755241966,
"learning_rate": 8.271734841028553e-07,
"logits/chosen": -2.72208833694458,
"logits/rejected": -2.6312568187713623,
"logps/chosen": -286.3044128417969,
"logps/rejected": -246.79788208007812,
"loss": 0.6459,
"positive_losses": 0.1013515442609787,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.28137513995170593,
"rewards/margins": 0.4984091818332672,
"rewards/margins_max": 0.7110401391983032,
"rewards/margins_min": 0.28577831387519836,
"rewards/margins_std": 0.3007054626941681,
"rewards/rejected": -0.21703402698040009,
"step": 520
},
{
"dpo_losses": 0.5471224188804626,
"epoch": 0.77,
"grad_norm": 7.226241019031729,
"learning_rate": 7.344920504212244e-07,
"logits/chosen": -2.9117705821990967,
"logits/rejected": -2.8744723796844482,
"logps/chosen": -206.4365997314453,
"logps/rejected": -208.7721710205078,
"loss": 0.6331,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.20657558739185333,
"rewards/margins": 0.34465348720550537,
"rewards/margins_max": 0.5002694725990295,
"rewards/margins_min": 0.1890375316143036,
"rewards/margins_std": 0.22007422149181366,
"rewards/rejected": -0.13807791471481323,
"step": 530
},
{
"dpo_losses": 0.5800082087516785,
"epoch": 0.79,
"grad_norm": 48.796149530310025,
"learning_rate": 6.464166253970672e-07,
"logits/chosen": -2.8512115478515625,
"logits/rejected": -2.8704025745391846,
"logps/chosen": -307.69915771484375,
"logps/rejected": -285.3216857910156,
"loss": 0.5932,
"positive_losses": 0.1961948424577713,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.1846921294927597,
"rewards/margins": 0.2769491672515869,
"rewards/margins_max": 0.47591620683670044,
"rewards/margins_min": 0.077982097864151,
"rewards/margins_std": 0.28138190507888794,
"rewards/rejected": -0.09225703775882721,
"step": 540
},
{
"dpo_losses": 0.5755653381347656,
"epoch": 0.8,
"grad_norm": 2.795078318527347,
"learning_rate": 5.631769877579535e-07,
"logits/chosen": -2.9338223934173584,
"logits/rejected": -2.885927200317383,
"logps/chosen": -237.5154266357422,
"logps/rejected": -245.45254516601562,
"loss": 0.6043,
"positive_losses": 1.3403419256210327,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.13946720957756042,
"rewards/margins": 0.28026267886161804,
"rewards/margins_max": 0.4180312752723694,
"rewards/margins_min": 0.1424940675497055,
"rewards/margins_std": 0.19483418762683868,
"rewards/rejected": -0.14079545438289642,
"step": 550
},
{
"dpo_losses": 0.5164459943771362,
"epoch": 0.82,
"grad_norm": 3.090907013989327,
"learning_rate": 4.849903002143114e-07,
"logits/chosen": -3.0459659099578857,
"logits/rejected": -2.9688525199890137,
"logps/chosen": -348.83740234375,
"logps/rejected": -346.1819763183594,
"loss": 0.5621,
"positive_losses": 0.1016082763671875,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.23445944488048553,
"rewards/margins": 0.4368625283241272,
"rewards/margins_max": 0.6422832012176514,
"rewards/margins_min": 0.23144181072711945,
"rewards/margins_std": 0.2905087471008301,
"rewards/rejected": -0.20240306854248047,
"step": 560
},
{
"dpo_losses": 0.5176469087600708,
"epoch": 0.83,
"grad_norm": 7.846420016024575,
"learning_rate": 4.1206054290670537e-07,
"logits/chosen": -2.894327163696289,
"logits/rejected": -2.896099805831909,
"logps/chosen": -234.593994140625,
"logps/rejected": -323.6400146484375,
"loss": 0.6136,
"positive_losses": 1.2456893920898438,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.17635245621204376,
"rewards/margins": 0.4464770257472992,
"rewards/margins_max": 0.6318201422691345,
"rewards/margins_min": 0.2611338794231415,
"rewards/margins_std": 0.2621147930622101,
"rewards/rejected": -0.27012452483177185,
"step": 570
},
{
"dpo_losses": 0.619495689868927,
"epoch": 0.85,
"grad_norm": 47.00247250810323,
"learning_rate": 3.44577981244944e-07,
"logits/chosen": -2.9653825759887695,
"logits/rejected": -2.980112314224243,
"logps/chosen": -235.0575408935547,
"logps/rejected": -250.17593383789062,
"loss": 0.6852,
"positive_losses": 1.9925556182861328,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.1124948263168335,
"rewards/margins": 0.1802656650543213,
"rewards/margins_max": 0.35439062118530273,
"rewards/margins_min": 0.006140687968581915,
"rewards/margins_std": 0.24624991416931152,
"rewards/rejected": -0.06777085363864899,
"step": 580
},
{
"dpo_losses": 0.5588719248771667,
"epoch": 0.86,
"grad_norm": 4.545382225154878,
"learning_rate": 2.827186695273482e-07,
"logits/chosen": -3.0688188076019287,
"logits/rejected": -2.9839656352996826,
"logps/chosen": -374.6413269042969,
"logps/rejected": -343.8844909667969,
"loss": 0.6152,
"positive_losses": 0.27655029296875,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.2746116518974304,
"rewards/margins": 0.3599032461643219,
"rewards/margins_max": 0.609930157661438,
"rewards/margins_min": 0.10987631976604462,
"rewards/margins_std": 0.3535914719104767,
"rewards/rejected": -0.08529156446456909,
"step": 590
},
{
"dpo_losses": 0.5024896860122681,
"epoch": 0.88,
"grad_norm": 10.429875129983618,
"learning_rate": 2.2664399163518786e-07,
"logits/chosen": -2.891846179962158,
"logits/rejected": -2.812058925628662,
"logps/chosen": -292.8556213378906,
"logps/rejected": -254.47628784179688,
"loss": 0.5749,
"positive_losses": 0.0,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.27775660157203674,
"rewards/margins": 0.49465060234069824,
"rewards/margins_max": 0.8027955293655396,
"rewards/margins_min": 0.18650567531585693,
"rewards/margins_std": 0.4357827305793762,
"rewards/rejected": -0.2168940007686615,
"step": 600
},
{
"epoch": 0.88,
"eval_dpo_losses": 0.6511540412902832,
"eval_logits/chosen": -2.8279476165771484,
"eval_logits/rejected": -2.7839229106903076,
"eval_logps/chosen": -274.8698425292969,
"eval_logps/rejected": -259.9163513183594,
"eval_loss": 1.0992192029953003,
"eval_positive_losses": 4.5979323387146,
"eval_rewards/accuracies": 0.6190476417541504,
"eval_rewards/chosen": 0.1035134568810463,
"eval_rewards/margins": 0.11085036396980286,
"eval_rewards/margins_max": 0.4368113875389099,
"eval_rewards/margins_min": -0.18844377994537354,
"eval_rewards/margins_std": 0.2790098786354065,
"eval_rewards/rejected": -0.0073369028978049755,
"eval_runtime": 283.0506,
"eval_samples_per_second": 7.066,
"eval_steps_per_second": 0.223,
"step": 600
},
{
"dpo_losses": 0.5908172726631165,
"epoch": 0.89,
"grad_norm": 8.899268268122263,
"learning_rate": 1.7650024000056415e-07,
"logits/chosen": -2.910013198852539,
"logits/rejected": -2.8941874504089355,
"logps/chosen": -202.37757873535156,
"logps/rejected": -226.99508666992188,
"loss": 0.6342,
"positive_losses": 1.9391590356826782,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.1791270673274994,
"rewards/margins": 0.25547030568122864,
"rewards/margins_max": 0.4546757638454437,
"rewards/margins_min": 0.05626480653882027,
"rewards/margins_std": 0.2817191183567047,
"rewards/rejected": -0.07634319365024567,
"step": 610
},
{
"dpo_losses": 0.5537667870521545,
"epoch": 0.91,
"grad_norm": 27.10072020527372,
"learning_rate": 1.324182339461544e-07,
"logits/chosen": -2.87471342086792,
"logits/rejected": -2.8323957920074463,
"logps/chosen": -255.8001251220703,
"logps/rejected": -216.3787841796875,
"loss": 0.6711,
"positive_losses": 0.1423923522233963,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.22555597126483917,
"rewards/margins": 0.34654873609542847,
"rewards/margins_max": 0.5156995058059692,
"rewards/margins_min": 0.17739805579185486,
"rewards/margins_std": 0.23921525478363037,
"rewards/rejected": -0.12099279463291168,
"step": 620
},
{
"dpo_losses": 0.48896676301956177,
"epoch": 0.92,
"grad_norm": 2.6767258362110957,
"learning_rate": 9.451297839253915e-08,
"logits/chosen": -2.836315393447876,
"logits/rejected": -2.7731261253356934,
"logps/chosen": -304.1548156738281,
"logps/rejected": -336.2890319824219,
"loss": 0.6356,
"positive_losses": 0.17710499465465546,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.31015846133232117,
"rewards/margins": 0.5357273817062378,
"rewards/margins_max": 0.8079883456230164,
"rewards/margins_min": 0.263466477394104,
"rewards/margins_std": 0.38503509759902954,
"rewards/rejected": -0.2255689650774002,
"step": 630
},
{
"dpo_losses": 0.5176305174827576,
"epoch": 0.94,
"grad_norm": 18.280815132381182,
"learning_rate": 6.288336382349463e-08,
"logits/chosen": -2.8430378437042236,
"logits/rejected": -2.7495901584625244,
"logps/chosen": -359.60797119140625,
"logps/rejected": -298.79827880859375,
"loss": 0.5757,
"positive_losses": 1.0459808111190796,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.24247094988822937,
"rewards/margins": 0.44258061051368713,
"rewards/margins_max": 0.6816617846488953,
"rewards/margins_min": 0.20349940657615662,
"rewards/margins_std": 0.33811187744140625,
"rewards/rejected": -0.20010964572429657,
"step": 640
},
{
"dpo_losses": 0.5474685430526733,
"epoch": 0.95,
"grad_norm": 2.7207730538280863,
"learning_rate": 3.761190829201067e-08,
"logits/chosen": -2.8519351482391357,
"logits/rejected": -2.805759906768799,
"logps/chosen": -365.9830627441406,
"logps/rejected": -278.6522521972656,
"loss": 0.5966,
"positive_losses": 0.05963592603802681,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.2809067666530609,
"rewards/margins": 0.4035716652870178,
"rewards/margins_max": 0.7429118156433105,
"rewards/margins_min": 0.06423152983188629,
"rewards/margins_std": 0.47989946603775024,
"rewards/rejected": -0.12266488373279572,
"step": 650
},
{
"dpo_losses": 0.5853177309036255,
"epoch": 0.96,
"grad_norm": 40.690683079215134,
"learning_rate": 1.876454214011253e-08,
"logits/chosen": -2.9034006595611572,
"logits/rejected": -2.8463714122772217,
"logps/chosen": -243.8646697998047,
"logps/rejected": -225.9236602783203,
"loss": 0.6752,
"positive_losses": 0.5306800603866577,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.21680143475532532,
"rewards/margins": 0.2727014422416687,
"rewards/margins_max": 0.4745156764984131,
"rewards/margins_min": 0.07088717073202133,
"rewards/margins_std": 0.2854084372520447,
"rewards/rejected": -0.05589999631047249,
"step": 660
},
{
"dpo_losses": 0.5261252522468567,
"epoch": 0.98,
"grad_norm": 8.821965322055776,
"learning_rate": 6.390435994127753e-09,
"logits/chosen": -2.8224921226501465,
"logits/rejected": -2.8528945446014404,
"logps/chosen": -284.4530944824219,
"logps/rejected": -375.3456115722656,
"loss": 0.6203,
"positive_losses": 0.12337493896484375,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.2311427891254425,
"rewards/margins": 0.4244818687438965,
"rewards/margins_max": 0.6252471208572388,
"rewards/margins_min": 0.2237166464328766,
"rewards/margins_std": 0.2839249074459076,
"rewards/rejected": -0.19333907961845398,
"step": 670
},
{
"dpo_losses": 0.5133123397827148,
"epoch": 0.99,
"grad_norm": 23.801829753390283,
"learning_rate": 5.218724841346556e-10,
"logits/chosen": -2.689457416534424,
"logits/rejected": -2.6667604446411133,
"logps/chosen": -358.9125061035156,
"logps/rejected": -298.8949890136719,
"loss": 0.6128,
"positive_losses": 1.849574327468872,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.30895963311195374,
"rewards/margins": 0.4615735113620758,
"rewards/margins_max": 0.7685288190841675,
"rewards/margins_min": 0.15461814403533936,
"rewards/margins_std": 0.43410032987594604,
"rewards/rejected": -0.15261384844779968,
"step": 680
},
{
"epoch": 1.0,
"step": 684,
"total_flos": 0.0,
"train_loss": 0.641208895814349,
"train_runtime": 6249.598,
"train_samples_per_second": 1.751,
"train_steps_per_second": 0.109
}
],
"logging_steps": 10,
"max_steps": 684,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}