{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 4176, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_losses": 0.6931471824645996, "epoch": 0.0, "grad_norm": 1.8738785250853705, "learning_rate": 1.1961722488038277e-09, "logits/chosen": -2.8505566120147705, "logits/rejected": -2.908921003341675, "logps/chosen": -429.770751953125, "logps/rejected": -264.9197998046875, "loss": 0.6931, "positive_losses": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "dpo_losses": 0.6931761503219604, "epoch": 0.0, "grad_norm": 26.37350040168161, "learning_rate": 1.1961722488038278e-08, "logits/chosen": -2.7373692989349365, "logits/rejected": -2.725682258605957, "logps/chosen": -308.5892333984375, "logps/rejected": -256.5108642578125, "loss": 0.6953, "positive_losses": 0.026859350502490997, "rewards/accuracies": 0.0972222238779068, "rewards/chosen": -6.675786426058039e-05, "rewards/margins": -5.705528019461781e-05, "rewards/margins_max": 0.0008998822886496782, "rewards/margins_min": -0.0008845789707265794, "rewards/margins_std": 0.0007663692813366652, "rewards/rejected": -9.702583156467881e-06, "step": 10 }, { "dpo_losses": 0.6932184100151062, "epoch": 0.0, "grad_norm": 22.696087013462392, "learning_rate": 2.3923444976076555e-08, "logits/chosen": -2.7448391914367676, "logits/rejected": -2.7249255180358887, "logps/chosen": -240.10604858398438, "logps/rejected": -258.03729248046875, "loss": 0.7007, "positive_losses": 0.06958901882171631, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.00010348296200390905, "rewards/margins": -0.0001402676134603098, "rewards/margins_max": 0.0031580321956425905, "rewards/margins_min": -0.0029335268773138523, "rewards/margins_std": 0.002679151948541403, "rewards/rejected": 3.6784633266506717e-05, "step": 20 }, { "dpo_losses": 0.6930397152900696, "epoch": 0.01, "grad_norm": 18.25577401569458, "learning_rate": 3.588516746411483e-08, "logits/chosen": -2.881375789642334, "logits/rejected": -2.8493809700012207, "logps/chosen": -340.65179443359375, "logps/rejected": -264.988525390625, "loss": 0.7024, "positive_losses": 0.07482187449932098, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00010021973866969347, "rewards/margins": 0.00021672896400559694, "rewards/margins_max": 0.0028204049449414015, "rewards/margins_min": -0.0023398713674396276, "rewards/margins_std": 0.002330223796889186, "rewards/rejected": -0.00011650919623207301, "step": 30 }, { "dpo_losses": 0.6930765509605408, "epoch": 0.01, "grad_norm": 21.327589135896474, "learning_rate": 4.784688995215311e-08, "logits/chosen": -2.797985792160034, "logits/rejected": -2.767072916030884, "logps/chosen": -264.2638244628906, "logps/rejected": -238.094482421875, "loss": 0.698, "positive_losses": 0.03669625520706177, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0008656598511151969, "rewards/margins": 0.0001437961182091385, "rewards/margins_max": 0.0027192423585802317, "rewards/margins_min": -0.002517946297302842, "rewards/margins_std": 0.002373906783759594, "rewards/rejected": 0.0007218637620098889, "step": 40 }, { "dpo_losses": 0.6932223439216614, "epoch": 0.01, "grad_norm": 18.405429271281783, "learning_rate": 5.980861244019139e-08, "logits/chosen": -2.8726441860198975, "logits/rejected": -2.8564815521240234, "logps/chosen": -328.0624694824219, "logps/rejected": -322.01611328125, "loss": 0.6977, "positive_losses": 0.05418548732995987, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0005599832511506975, "rewards/margins": -0.00014783260121475905, "rewards/margins_max": 0.003490965813398361, "rewards/margins_min": -0.003444510977715254, "rewards/margins_std": 0.0031294028740376234, "rewards/rejected": 0.0007078158669173717, "step": 50 }, { "dpo_losses": 0.6930328607559204, "epoch": 0.01, "grad_norm": 14.852713165577065, "learning_rate": 7.177033492822967e-08, "logits/chosen": -2.8414463996887207, "logits/rejected": -2.7681431770324707, "logps/chosen": -306.5771179199219, "logps/rejected": -258.8722839355469, "loss": 0.6959, "positive_losses": 0.039434053003787994, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0014845133991912007, "rewards/margins": 0.00023235660046339035, "rewards/margins_max": 0.0038300990127027035, "rewards/margins_min": -0.0034206905402243137, "rewards/margins_std": 0.0032653622329235077, "rewards/rejected": 0.0012521569151431322, "step": 60 }, { "dpo_losses": 0.6928664445877075, "epoch": 0.02, "grad_norm": 45.0276661915344, "learning_rate": 8.373205741626794e-08, "logits/chosen": -2.7530107498168945, "logits/rejected": -2.7484545707702637, "logps/chosen": -288.3777160644531, "logps/rejected": -252.92489624023438, "loss": 0.6941, "positive_losses": 0.00402488699182868, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0030399146489799023, "rewards/margins": 0.0005633073160424829, "rewards/margins_max": 0.003213726682588458, "rewards/margins_min": -0.0020912655163556337, "rewards/margins_std": 0.002359007950872183, "rewards/rejected": 0.0024766074493527412, "step": 70 }, { "dpo_losses": 0.6931138634681702, "epoch": 0.02, "grad_norm": 42.94247492103535, "learning_rate": 9.569377990430622e-08, "logits/chosen": -2.706998348236084, "logits/rejected": -2.738135814666748, "logps/chosen": -233.3713836669922, "logps/rejected": -252.3106689453125, "loss": 0.6948, "positive_losses": 0.021985817700624466, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.0030463621951639652, "rewards/margins": 6.902242603246123e-05, "rewards/margins_max": 0.0031026701908558607, "rewards/margins_min": -0.0031084944494068623, "rewards/margins_std": 0.002789227059110999, "rewards/rejected": 0.0029773395508527756, "step": 80 }, { "dpo_losses": 0.6932356357574463, "epoch": 0.02, "grad_norm": 9.363035985784405, "learning_rate": 1.076555023923445e-07, "logits/chosen": -2.8225021362304688, "logits/rejected": -2.791781187057495, "logps/chosen": -282.68133544921875, "logps/rejected": -248.1925506591797, "loss": 0.6957, "positive_losses": 0.03330497816205025, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.003587817307561636, "rewards/margins": -0.00017320421466138214, "rewards/margins_max": 0.0033458396792411804, "rewards/margins_min": -0.003827576292678714, "rewards/margins_std": 0.0031936608720570803, "rewards/rejected": 0.003761020954698324, "step": 90 }, { "dpo_losses": 0.6928516626358032, "epoch": 0.02, "grad_norm": 8.675280422988093, "learning_rate": 1.1961722488038278e-07, "logits/chosen": -2.8072657585144043, "logits/rejected": -2.800027370452881, "logps/chosen": -300.4829406738281, "logps/rejected": -310.3061218261719, "loss": 0.694, "positive_losses": 0.015417861752212048, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004827133379876614, "rewards/margins": 0.0005946685560047626, "rewards/margins_max": 0.004386828280985355, "rewards/margins_min": -0.0035438486374914646, "rewards/margins_std": 0.0035532943438738585, "rewards/rejected": 0.004232465289533138, "step": 100 }, { "epoch": 0.02, "eval_dpo_losses": 0.6931200623512268, "eval_logits/chosen": -2.80008602142334, "eval_logits/rejected": -2.7647652626037598, "eval_logps/chosen": -283.9624938964844, "eval_logps/rejected": -265.3660888671875, "eval_loss": 0.6936664581298828, "eval_positive_losses": 0.006365585140883923, "eval_rewards/accuracies": 0.5074999928474426, "eval_rewards/chosen": 0.004928573966026306, "eval_rewards/margins": 5.705761941499077e-05, "eval_rewards/margins_max": 0.004903213120996952, "eval_rewards/margins_min": -0.004646263550966978, "eval_rewards/margins_std": 0.0031743666622787714, "eval_rewards/rejected": 0.004871516488492489, "eval_runtime": 859.4817, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 100 }, { "dpo_losses": 0.6931909918785095, "epoch": 0.03, "grad_norm": 11.348226373371702, "learning_rate": 1.3157894736842104e-07, "logits/chosen": -2.815725803375244, "logits/rejected": -2.780757188796997, "logps/chosen": -273.94024658203125, "logps/rejected": -254.77865600585938, "loss": 0.694, "positive_losses": 0.003559112548828125, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.004847392439842224, "rewards/margins": -8.494545181747526e-05, "rewards/margins_max": 0.0033095120452344418, "rewards/margins_min": -0.0033569135703146458, "rewards/margins_std": 0.0030814209021627903, "rewards/rejected": 0.004932337906211615, "step": 110 }, { "dpo_losses": 0.6927813291549683, "epoch": 0.03, "grad_norm": 1.6128773033458539, "learning_rate": 1.4354066985645933e-07, "logits/chosen": -2.800926685333252, "logits/rejected": -2.731950283050537, "logps/chosen": -269.0350646972656, "logps/rejected": -221.18896484375, "loss": 0.6935, "positive_losses": 0.0013689041370525956, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.006401193328201771, "rewards/margins": 0.0007349281222559512, "rewards/margins_max": 0.004148256964981556, "rewards/margins_min": -0.0029832981526851654, "rewards/margins_std": 0.0031906559597700834, "rewards/rejected": 0.005666264332830906, "step": 120 }, { "dpo_losses": 0.6930662393569946, "epoch": 0.03, "grad_norm": 2.1252776693802984, "learning_rate": 1.555023923444976e-07, "logits/chosen": -2.853867769241333, "logits/rejected": -2.8054678440093994, "logps/chosen": -318.3598937988281, "logps/rejected": -284.2847900390625, "loss": 0.6941, "positive_losses": 0.010669326409697533, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.005761197302490473, "rewards/margins": 0.0001656186068430543, "rewards/margins_max": 0.004356847610324621, "rewards/margins_min": -0.0034413025714457035, "rewards/margins_std": 0.0035165701992809772, "rewards/rejected": 0.00559557881206274, "step": 130 }, { "dpo_losses": 0.6930587887763977, "epoch": 0.03, "grad_norm": 2.1778891743608857, "learning_rate": 1.6746411483253589e-07, "logits/chosen": -2.83146595954895, "logits/rejected": -2.819706916809082, "logps/chosen": -287.96173095703125, "logps/rejected": -252.7426300048828, "loss": 0.6936, "positive_losses": 0.0007701873546466231, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.0061411140486598015, "rewards/margins": 0.00018110910605173558, "rewards/margins_max": 0.004532166291028261, "rewards/margins_min": -0.0035957382060587406, "rewards/margins_std": 0.0036239526234567165, "rewards/rejected": 0.0059600044041872025, "step": 140 }, { "dpo_losses": 0.6930786967277527, "epoch": 0.04, "grad_norm": 10.797391534089597, "learning_rate": 1.7942583732057415e-07, "logits/chosen": -2.902834177017212, "logits/rejected": -2.828998565673828, "logps/chosen": -322.3126525878906, "logps/rejected": -310.7928161621094, "loss": 0.6934, "positive_losses": 0.005268859677016735, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.006560119334608316, "rewards/margins": 0.00014082933194004, "rewards/margins_max": 0.004460381343960762, "rewards/margins_min": -0.004094877280294895, "rewards/margins_std": 0.0038844309747219086, "rewards/rejected": 0.006419290788471699, "step": 150 }, { "dpo_losses": 0.6930662393569946, "epoch": 0.04, "grad_norm": 10.058933550085667, "learning_rate": 1.9138755980861244e-07, "logits/chosen": -2.8338558673858643, "logits/rejected": -2.84671688079834, "logps/chosen": -256.7803649902344, "logps/rejected": -248.265380859375, "loss": 0.6933, "positive_losses": 0.0003303527773823589, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.006678731646388769, "rewards/margins": 0.00016627379227429628, "rewards/margins_max": 0.00447862409055233, "rewards/margins_min": -0.004200800787657499, "rewards/margins_std": 0.0038912042509764433, "rewards/rejected": 0.006512458436191082, "step": 160 }, { "dpo_losses": 0.6929908394813538, "epoch": 0.04, "grad_norm": 2.137830706439247, "learning_rate": 2.033492822966507e-07, "logits/chosen": -2.7813174724578857, "logits/rejected": -2.7461562156677246, "logps/chosen": -297.31866455078125, "logps/rejected": -237.05050659179688, "loss": 0.6929, "positive_losses": 0.0023521422408521175, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0067475782707333565, "rewards/margins": 0.00031619667424820364, "rewards/margins_max": 0.003992300946265459, "rewards/margins_min": -0.0032077815849334, "rewards/margins_std": 0.0032394672743976116, "rewards/rejected": 0.006431381218135357, "step": 170 }, { "dpo_losses": 0.6927242875099182, "epoch": 0.04, "grad_norm": 8.3463894101895, "learning_rate": 2.15311004784689e-07, "logits/chosen": -2.8213436603546143, "logits/rejected": -2.7990190982818604, "logps/chosen": -304.97705078125, "logps/rejected": -295.3390197753906, "loss": 0.693, "positive_losses": 0.00250663748010993, "rewards/accuracies": 0.5625, "rewards/chosen": 0.007058930583298206, "rewards/margins": 0.000850336451549083, "rewards/margins_max": 0.005234990268945694, "rewards/margins_min": -0.0033441055566072464, "rewards/margins_std": 0.0037757386453449726, "rewards/rejected": 0.006208593957126141, "step": 180 }, { "dpo_losses": 0.6928598284721375, "epoch": 0.05, "grad_norm": 6.045310278818408, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.834990978240967, "logits/rejected": -2.817713975906372, "logps/chosen": -222.3096466064453, "logps/rejected": -183.65440368652344, "loss": 0.6939, "positive_losses": 0.002579116728156805, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.006674450822174549, "rewards/margins": 0.0005782871739938855, "rewards/margins_max": 0.0049315630458295345, "rewards/margins_min": -0.0030284584499895573, "rewards/margins_std": 0.0036005149595439434, "rewards/rejected": 0.00609616469591856, "step": 190 }, { "dpo_losses": 0.6921594142913818, "epoch": 0.05, "grad_norm": 4.304560110765371, "learning_rate": 2.3923444976076555e-07, "logits/chosen": -2.8015878200531006, "logits/rejected": -2.7639403343200684, "logps/chosen": -261.92041015625, "logps/rejected": -225.3970184326172, "loss": 0.6922, "positive_losses": 0.0007431030389852822, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.008427979424595833, "rewards/margins": 0.001981571316719055, "rewards/margins_max": 0.006373540963977575, "rewards/margins_min": -0.0018613046268001199, "rewards/margins_std": 0.0037357197143137455, "rewards/rejected": 0.00644640764221549, "step": 200 }, { "epoch": 0.05, "eval_dpo_losses": 0.6926060318946838, "eval_logits/chosen": -2.80019474029541, "eval_logits/rejected": -2.7649641036987305, "eval_logps/chosen": -283.6357116699219, "eval_logps/rejected": -265.1424865722656, "eval_loss": 0.6929615139961243, "eval_positive_losses": 0.0035429534036666155, "eval_rewards/accuracies": 0.5874999761581421, "eval_rewards/chosen": 0.008196190930902958, "eval_rewards/margins": 0.0010884931543841958, "eval_rewards/margins_max": 0.008235710673034191, "eval_rewards/margins_min": -0.005624544341117144, "eval_rewards/margins_std": 0.004597168415784836, "eval_rewards/rejected": 0.007107697892934084, "eval_runtime": 858.4805, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 200 }, { "dpo_losses": 0.6929161548614502, "epoch": 0.05, "grad_norm": 9.951336902152228, "learning_rate": 2.511961722488038e-07, "logits/chosen": -2.840200901031494, "logits/rejected": -2.7939860820770264, "logps/chosen": -284.8526916503906, "logps/rejected": -250.66262817382812, "loss": 0.6933, "positive_losses": 0.0025413513649255037, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.008003338240087032, "rewards/margins": 0.0004666981694754213, "rewards/margins_max": 0.004781220108270645, "rewards/margins_min": -0.004328415263444185, "rewards/margins_std": 0.003994586877524853, "rewards/rejected": 0.007536640856415033, "step": 210 }, { "dpo_losses": 0.6925213932991028, "epoch": 0.05, "grad_norm": 1.445147200745907, "learning_rate": 2.631578947368421e-07, "logits/chosen": -2.85284423828125, "logits/rejected": -2.807483196258545, "logps/chosen": -256.46673583984375, "logps/rejected": -236.06106567382812, "loss": 0.6923, "positive_losses": 0.0004325866757426411, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.009008128196001053, "rewards/margins": 0.0012573556741699576, "rewards/margins_max": 0.00646287202835083, "rewards/margins_min": -0.0033540751319378614, "rewards/margins_std": 0.0044835954904556274, "rewards/rejected": 0.007750772871077061, "step": 220 }, { "dpo_losses": 0.6917552351951599, "epoch": 0.06, "grad_norm": 1.864822997420008, "learning_rate": 2.7511961722488034e-07, "logits/chosen": -2.8087480068206787, "logits/rejected": -2.7911715507507324, "logps/chosen": -275.11199951171875, "logps/rejected": -252.17568969726562, "loss": 0.6924, "positive_losses": 0.0018815994262695312, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01019415631890297, "rewards/margins": 0.002796754939481616, "rewards/margins_max": 0.010607192292809486, "rewards/margins_min": -0.003470221534371376, "rewards/margins_std": 0.00617735181003809, "rewards/rejected": 0.007397400680929422, "step": 230 }, { "dpo_losses": 0.692538857460022, "epoch": 0.06, "grad_norm": 7.663905710838887, "learning_rate": 2.8708133971291866e-07, "logits/chosen": -2.8603463172912598, "logits/rejected": -2.8074562549591064, "logps/chosen": -254.876220703125, "logps/rejected": -234.79287719726562, "loss": 0.693, "positive_losses": 0.006529617123305798, "rewards/accuracies": 0.5625, "rewards/chosen": 0.009815944358706474, "rewards/margins": 0.0012237620539963245, "rewards/margins_max": 0.006623173598200083, "rewards/margins_min": -0.003580584656447172, "rewards/margins_std": 0.004588158335536718, "rewards/rejected": 0.008592181839048862, "step": 240 }, { "dpo_losses": 0.692375898361206, "epoch": 0.06, "grad_norm": 1.8199007280754185, "learning_rate": 2.990430622009569e-07, "logits/chosen": -2.747978448867798, "logits/rejected": -2.7321763038635254, "logps/chosen": -280.9356384277344, "logps/rejected": -289.2428283691406, "loss": 0.6933, "positive_losses": 0.0060562132857739925, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.010558054782450199, "rewards/margins": 0.0015577174490317702, "rewards/margins_max": 0.010480500757694244, "rewards/margins_min": -0.0060094427317380905, "rewards/margins_std": 0.007235884666442871, "rewards/rejected": 0.00900033675134182, "step": 250 }, { "dpo_losses": 0.6913677453994751, "epoch": 0.06, "grad_norm": 7.0584807985912965, "learning_rate": 3.110047846889952e-07, "logits/chosen": -2.770611524581909, "logits/rejected": -2.822542190551758, "logps/chosen": -256.5426025390625, "logps/rejected": -274.1138000488281, "loss": 0.692, "positive_losses": 0.0, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.011835113167762756, "rewards/margins": 0.003578018397092819, "rewards/margins_max": 0.011623856611549854, "rewards/margins_min": -0.003952269442379475, "rewards/margins_std": 0.0070669567212462425, "rewards/rejected": 0.008257093839347363, "step": 260 }, { "dpo_losses": 0.6914106607437134, "epoch": 0.06, "grad_norm": 1.7855100051655108, "learning_rate": 3.229665071770335e-07, "logits/chosen": -2.8921444416046143, "logits/rejected": -2.8222174644470215, "logps/chosen": -322.5043640136719, "logps/rejected": -234.33712768554688, "loss": 0.6921, "positive_losses": 0.00957412738353014, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01291549950838089, "rewards/margins": 0.0034954436123371124, "rewards/margins_max": 0.013346338644623756, "rewards/margins_min": -0.00521578686311841, "rewards/margins_std": 0.008145746774971485, "rewards/rejected": 0.009420055896043777, "step": 270 }, { "dpo_losses": 0.6919598579406738, "epoch": 0.07, "grad_norm": 8.712712648442846, "learning_rate": 3.3492822966507177e-07, "logits/chosen": -2.8344123363494873, "logits/rejected": -2.8515541553497314, "logps/chosen": -252.5563201904297, "logps/rejected": -245.0447998046875, "loss": 0.6923, "positive_losses": 0.00439376849681139, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.013906596228480339, "rewards/margins": 0.002396385185420513, "rewards/margins_max": 0.011133002117276192, "rewards/margins_min": -0.0054922280833125114, "rewards/margins_std": 0.007561185862869024, "rewards/rejected": 0.011510210111737251, "step": 280 }, { "dpo_losses": 0.6908942461013794, "epoch": 0.07, "grad_norm": 1.7339181384218492, "learning_rate": 3.4688995215311004e-07, "logits/chosen": -2.7778828144073486, "logits/rejected": -2.7556509971618652, "logps/chosen": -264.2030944824219, "logps/rejected": -220.837890625, "loss": 0.6917, "positive_losses": 0.00359764089807868, "rewards/accuracies": 0.6875, "rewards/chosen": 0.015479792840778828, "rewards/margins": 0.004530029837042093, "rewards/margins_max": 0.013279837556183338, "rewards/margins_min": -0.003813706338405609, "rewards/margins_std": 0.007667989935725927, "rewards/rejected": 0.010949762538075447, "step": 290 }, { "dpo_losses": 0.6920832395553589, "epoch": 0.07, "grad_norm": 1.8228695309312417, "learning_rate": 3.588516746411483e-07, "logits/chosen": -2.866220474243164, "logits/rejected": -2.8580663204193115, "logps/chosen": -249.51025390625, "logps/rejected": -238.7553253173828, "loss": 0.692, "positive_losses": 0.011281967163085938, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.01783960685133934, "rewards/margins": 0.0021499062422662973, "rewards/margins_max": 0.010942873544991016, "rewards/margins_min": -0.006986564956605434, "rewards/margins_std": 0.008049877360463142, "rewards/rejected": 0.015689700841903687, "step": 300 }, { "epoch": 0.07, "eval_dpo_losses": 0.6914030313491821, "eval_logits/chosen": -2.801194667816162, "eval_logits/rejected": -2.766218662261963, "eval_logps/chosen": -282.559814453125, "eval_logps/rejected": -264.30963134765625, "eval_loss": 0.6920657753944397, "eval_positive_losses": 0.005152564961463213, "eval_rewards/accuracies": 0.6175000071525574, "eval_rewards/chosen": 0.018955236300826073, "eval_rewards/margins": 0.0035189627669751644, "eval_rewards/margins_max": 0.019450800493359566, "eval_rewards/margins_min": -0.010327541269361973, "eval_rewards/margins_std": 0.009941726922988892, "eval_rewards/rejected": 0.015436273999512196, "eval_runtime": 858.5279, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 300 }, { "dpo_losses": 0.6913835406303406, "epoch": 0.07, "grad_norm": 1.5792925453173863, "learning_rate": 3.7081339712918656e-07, "logits/chosen": -2.871166706085205, "logits/rejected": -2.8037307262420654, "logps/chosen": -254.8651580810547, "logps/rejected": -196.95025634765625, "loss": 0.6917, "positive_losses": 0.008401107974350452, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.018644209951162338, "rewards/margins": 0.0035485588014125824, "rewards/margins_max": 0.013468483462929726, "rewards/margins_min": -0.004681491293013096, "rewards/margins_std": 0.008153611794114113, "rewards/rejected": 0.015095651149749756, "step": 310 }, { "dpo_losses": 0.6917188167572021, "epoch": 0.08, "grad_norm": 2.5872904301052784, "learning_rate": 3.827751196172249e-07, "logits/chosen": -2.8775100708007812, "logits/rejected": -2.8599514961242676, "logps/chosen": -274.1808166503906, "logps/rejected": -351.721435546875, "loss": 0.6928, "positive_losses": 0.01246490515768528, "rewards/accuracies": 0.625, "rewards/chosen": 0.019983936101198196, "rewards/margins": 0.002893428085371852, "rewards/margins_max": 0.01582186482846737, "rewards/margins_min": -0.009652620181441307, "rewards/margins_std": 0.011317083612084389, "rewards/rejected": 0.017090508714318275, "step": 320 }, { "dpo_losses": 0.6903044581413269, "epoch": 0.08, "grad_norm": 1.795119315195841, "learning_rate": 3.9473684210526315e-07, "logits/chosen": -2.8860158920288086, "logits/rejected": -2.8370862007141113, "logps/chosen": -330.9441833496094, "logps/rejected": -261.54547119140625, "loss": 0.6911, "positive_losses": 0.0, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.024029020220041275, "rewards/margins": 0.005735460203140974, "rewards/margins_max": 0.018820906057953835, "rewards/margins_min": -0.005071660038083792, "rewards/margins_std": 0.010576510801911354, "rewards/rejected": 0.018293561413884163, "step": 330 }, { "dpo_losses": 0.6906482577323914, "epoch": 0.08, "grad_norm": 1.8899025285402453, "learning_rate": 4.066985645933014e-07, "logits/chosen": -2.8804211616516113, "logits/rejected": -2.8621511459350586, "logps/chosen": -321.13092041015625, "logps/rejected": -263.317138671875, "loss": 0.6919, "positive_losses": 0.012451171875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.02267013117671013, "rewards/margins": 0.005038048606365919, "rewards/margins_max": 0.017500299960374832, "rewards/margins_min": -0.007574019487947226, "rewards/margins_std": 0.011318190023303032, "rewards/rejected": 0.01763208582997322, "step": 340 }, { "dpo_losses": 0.6905657052993774, "epoch": 0.08, "grad_norm": 2.1474119755886645, "learning_rate": 4.1866028708133973e-07, "logits/chosen": -2.8015341758728027, "logits/rejected": -2.749926805496216, "logps/chosen": -262.4505920410156, "logps/rejected": -213.163818359375, "loss": 0.6918, "positive_losses": 0.017441939562559128, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.023383531719446182, "rewards/margins": 0.005236311815679073, "rewards/margins_max": 0.024445028975605965, "rewards/margins_min": -0.01120169647037983, "rewards/margins_std": 0.01591874472796917, "rewards/rejected": 0.018147218972444534, "step": 350 }, { "dpo_losses": 0.6903191804885864, "epoch": 0.09, "grad_norm": 7.053335810169922, "learning_rate": 4.30622009569378e-07, "logits/chosen": -2.865029811859131, "logits/rejected": -2.838165283203125, "logps/chosen": -279.75885009765625, "logps/rejected": -243.24069213867188, "loss": 0.6913, "positive_losses": 0.02119731903076172, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.024706143885850906, "rewards/margins": 0.005712629295885563, "rewards/margins_max": 0.020163383334875107, "rewards/margins_min": -0.0075416574254632, "rewards/margins_std": 0.012243092991411686, "rewards/rejected": 0.01899351179599762, "step": 360 }, { "dpo_losses": 0.6899415254592896, "epoch": 0.09, "grad_norm": 1.4397147159641202, "learning_rate": 4.425837320574162e-07, "logits/chosen": -2.907766819000244, "logits/rejected": -2.8443288803100586, "logps/chosen": -294.98577880859375, "logps/rejected": -229.28121948242188, "loss": 0.6913, "positive_losses": 0.02153339423239231, "rewards/accuracies": 0.6875, "rewards/chosen": 0.025189917534589767, "rewards/margins": 0.006474143359810114, "rewards/margins_max": 0.02172829769551754, "rewards/margins_min": -0.007557608187198639, "rewards/margins_std": 0.013027170673012733, "rewards/rejected": 0.01871577650308609, "step": 370 }, { "dpo_losses": 0.6898144483566284, "epoch": 0.09, "grad_norm": 1.9885584070357976, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.8546910285949707, "logits/rejected": -2.8014886379241943, "logps/chosen": -277.9206848144531, "logps/rejected": -222.42007446289062, "loss": 0.692, "positive_losses": 0.032598115503787994, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.02800873853266239, "rewards/margins": 0.006750666536390781, "rewards/margins_max": 0.024539444595575333, "rewards/margins_min": -0.00970520544797182, "rewards/margins_std": 0.01506971288472414, "rewards/rejected": 0.021258071064949036, "step": 380 }, { "dpo_losses": 0.6891442537307739, "epoch": 0.09, "grad_norm": 2.149639889034174, "learning_rate": 4.665071770334928e-07, "logits/chosen": -2.7597641944885254, "logits/rejected": -2.728381633758545, "logps/chosen": -304.5194396972656, "logps/rejected": -255.93246459960938, "loss": 0.6902, "positive_losses": 0.0014411925803869963, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.032246001064777374, "rewards/margins": 0.008092240430414677, "rewards/margins_max": 0.02433139458298683, "rewards/margins_min": -0.0073524946346879005, "rewards/margins_std": 0.014103399589657784, "rewards/rejected": 0.024153759703040123, "step": 390 }, { "dpo_losses": 0.690765380859375, "epoch": 0.1, "grad_norm": 1.8094563414847848, "learning_rate": 4.784688995215311e-07, "logits/chosen": -2.7343783378601074, "logits/rejected": -2.7625694274902344, "logps/chosen": -266.6732177734375, "logps/rejected": -257.3152770996094, "loss": 0.6914, "positive_losses": 0.016986846923828125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.030096907168626785, "rewards/margins": 0.004846726544201374, "rewards/margins_max": 0.022682152688503265, "rewards/margins_min": -0.011465349234640598, "rewards/margins_std": 0.015138273127377033, "rewards/rejected": 0.025250177830457687, "step": 400 }, { "epoch": 0.1, "eval_dpo_losses": 0.6896032691001892, "eval_logits/chosen": -2.7971839904785156, "eval_logits/rejected": -2.761979103088379, "eval_logps/chosen": -281.2178649902344, "eval_logps/rejected": -263.33489990234375, "eval_loss": 0.6907489895820618, "eval_positive_losses": 0.00809965468943119, "eval_rewards/accuracies": 0.6434999704360962, "eval_rewards/chosen": 0.032374657690525055, "eval_rewards/margins": 0.007191108539700508, "eval_rewards/margins_max": 0.036378104239702225, "eval_rewards/margins_min": -0.017611445859074593, "eval_rewards/margins_std": 0.018108980730175972, "eval_rewards/rejected": 0.025183551013469696, "eval_runtime": 858.6601, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 400 }, { "dpo_losses": 0.6880066394805908, "epoch": 0.1, "grad_norm": 1.826467496230296, "learning_rate": 4.904306220095694e-07, "logits/chosen": -2.8324062824249268, "logits/rejected": -2.7556018829345703, "logps/chosen": -319.3162536621094, "logps/rejected": -255.79248046875, "loss": 0.6911, "positive_losses": 0.016066264361143112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.032208532094955444, "rewards/margins": 0.0104054044932127, "rewards/margins_max": 0.030947068706154823, "rewards/margins_min": -0.008416928350925446, "rewards/margins_std": 0.017776008695364, "rewards/rejected": 0.021803127601742744, "step": 410 }, { "dpo_losses": 0.6892588138580322, "epoch": 0.1, "grad_norm": 2.124798260256618, "learning_rate": 4.999996505732917e-07, "logits/chosen": -2.8357996940612793, "logits/rejected": -2.8063788414001465, "logps/chosen": -295.0068359375, "logps/rejected": -290.5059509277344, "loss": 0.6896, "positive_losses": 0.010312652215361595, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.03617082163691521, "rewards/margins": 0.007916518487036228, "rewards/margins_max": 0.02907322719693184, "rewards/margins_min": -0.016145097091794014, "rewards/margins_std": 0.020064840093255043, "rewards/rejected": 0.028254300355911255, "step": 420 }, { "dpo_losses": 0.6890312433242798, "epoch": 0.1, "grad_norm": 2.00535676745324, "learning_rate": 4.999874207410648e-07, "logits/chosen": -2.7636029720306396, "logits/rejected": -2.7798850536346436, "logps/chosen": -250.111572265625, "logps/rejected": -258.98101806640625, "loss": 0.6887, "positive_losses": 0.00727005023509264, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03878437355160713, "rewards/margins": 0.008367359638214111, "rewards/margins_max": 0.030556131154298782, "rewards/margins_min": -0.013753412291407585, "rewards/margins_std": 0.01992672309279442, "rewards/rejected": 0.03041701577603817, "step": 430 }, { "dpo_losses": 0.6884254217147827, "epoch": 0.11, "grad_norm": 1.779243811811669, "learning_rate": 4.999577205502039e-07, "logits/chosen": -2.763629198074341, "logits/rejected": -2.746246337890625, "logps/chosen": -237.06826782226562, "logps/rejected": -218.80252075195312, "loss": 0.6884, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03996497392654419, "rewards/margins": 0.0096130995079875, "rewards/margins_max": 0.031676482409238815, "rewards/margins_min": -0.015318959951400757, "rewards/margins_std": 0.020561929792165756, "rewards/rejected": 0.030351877212524414, "step": 440 }, { "dpo_losses": 0.6882220506668091, "epoch": 0.11, "grad_norm": 1.8752449511424576, "learning_rate": 4.999105520763054e-07, "logits/chosen": -2.8381476402282715, "logits/rejected": -2.7578444480895996, "logps/chosen": -282.51678466796875, "logps/rejected": -253.62890625, "loss": 0.6907, "positive_losses": 0.026677703484892845, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.0452304445207119, "rewards/margins": 0.010024761781096458, "rewards/margins_max": 0.03506346791982651, "rewards/margins_min": -0.015951456502079964, "rewards/margins_std": 0.02260807529091835, "rewards/rejected": 0.03520568460226059, "step": 450 }, { "dpo_losses": 0.689070999622345, "epoch": 0.11, "grad_norm": 1.9778509350913305, "learning_rate": 4.998459186157357e-07, "logits/chosen": -2.851912260055542, "logits/rejected": -2.7904164791107178, "logps/chosen": -287.03656005859375, "logps/rejected": -263.02532958984375, "loss": 0.6881, "positive_losses": 0.00857467669993639, "rewards/accuracies": 0.625, "rewards/chosen": 0.047658052295446396, "rewards/margins": 0.008319910615682602, "rewards/margins_max": 0.033903222531080246, "rewards/margins_min": -0.013135654851794243, "rewards/margins_std": 0.020944178104400635, "rewards/rejected": 0.039338137954473495, "step": 460 }, { "dpo_losses": 0.687416136264801, "epoch": 0.11, "grad_norm": 1.882940137660175, "learning_rate": 4.997638246854011e-07, "logits/chosen": -2.889648914337158, "logits/rejected": -2.846312999725342, "logps/chosen": -279.9444274902344, "logps/rejected": -267.48760986328125, "loss": 0.6914, "positive_losses": 0.07253570854663849, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.05158579349517822, "rewards/margins": 0.011677240021526814, "rewards/margins_max": 0.03717050701379776, "rewards/margins_min": -0.01313592679798603, "rewards/margins_std": 0.02250213921070099, "rewards/rejected": 0.039908550679683685, "step": 470 }, { "dpo_losses": 0.6873867511749268, "epoch": 0.11, "grad_norm": 2.429125067571159, "learning_rate": 4.996642760224317e-07, "logits/chosen": -2.7297511100769043, "logits/rejected": -2.7178680896759033, "logps/chosen": -282.3246154785156, "logps/rejected": -268.9332275390625, "loss": 0.688, "positive_losses": 0.0, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.053841836750507355, "rewards/margins": 0.011840300634503365, "rewards/margins_max": 0.04894539713859558, "rewards/margins_min": -0.01861853152513504, "rewards/margins_std": 0.030089756473898888, "rewards/rejected": 0.04200153797864914, "step": 480 }, { "dpo_losses": 0.6888018250465393, "epoch": 0.12, "grad_norm": 1.7641449522983441, "learning_rate": 4.995472795837813e-07, "logits/chosen": -2.851361036300659, "logits/rejected": -2.7442686557769775, "logps/chosen": -248.9532928466797, "logps/rejected": -221.5353240966797, "loss": 0.6882, "positive_losses": 0.0037258148659020662, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.05189530923962593, "rewards/margins": 0.00890975259244442, "rewards/margins_max": 0.04398290067911148, "rewards/margins_min": -0.01847686991095543, "rewards/margins_std": 0.027901792898774147, "rewards/rejected": 0.04298555478453636, "step": 490 }, { "dpo_losses": 0.6851407289505005, "epoch": 0.12, "grad_norm": 1.7158099604616217, "learning_rate": 4.994128435457401e-07, "logits/chosen": -2.8377397060394287, "logits/rejected": -2.8017022609710693, "logps/chosen": -306.364013671875, "logps/rejected": -264.2765197753906, "loss": 0.6867, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06251838058233261, "rewards/margins": 0.016351716592907906, "rewards/margins_max": 0.04856313019990921, "rewards/margins_min": -0.015192821621894836, "rewards/margins_std": 0.028081167489290237, "rewards/rejected": 0.04616665467619896, "step": 500 }, { "epoch": 0.12, "eval_dpo_losses": 0.6867904663085938, "eval_logits/chosen": -2.793182611465454, "eval_logits/rejected": -2.758030891418457, "eval_logps/chosen": -278.6435241699219, "eval_logps/rejected": -261.3454895019531, "eval_loss": 0.6886543035507202, "eval_positive_losses": 0.012437500059604645, "eval_rewards/accuracies": 0.6359999775886536, "eval_rewards/chosen": 0.05811808630824089, "eval_rewards/margins": 0.013040341436862946, "eval_rewards/margins_max": 0.06536700576543808, "eval_rewards/margins_min": -0.031268589198589325, "eval_rewards/margins_std": 0.03234518691897392, "eval_rewards/rejected": 0.04507775232195854, "eval_runtime": 858.5099, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 500 }, { "dpo_losses": 0.683864951133728, "epoch": 0.12, "grad_norm": 11.66713829903748, "learning_rate": 4.992609773033638e-07, "logits/chosen": -2.879805564880371, "logits/rejected": -2.811321496963501, "logps/chosen": -308.86834716796875, "logps/rejected": -287.7463684082031, "loss": 0.6858, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06479277461767197, "rewards/margins": 0.01893237605690956, "rewards/margins_max": 0.05663178116083145, "rewards/margins_min": -0.017731811851263046, "rewards/margins_std": 0.03335576504468918, "rewards/rejected": 0.045860402286052704, "step": 510 }, { "dpo_losses": 0.6881086230278015, "epoch": 0.12, "grad_norm": 7.487329333976162, "learning_rate": 4.990916914698176e-07, "logits/chosen": -2.855478286743164, "logits/rejected": -2.8820157051086426, "logps/chosen": -266.88543701171875, "logps/rejected": -278.99444580078125, "loss": 0.6892, "positive_losses": 0.0196380615234375, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.05078927427530289, "rewards/margins": 0.010307502001523972, "rewards/margins_max": 0.040377646684646606, "rewards/margins_min": -0.019560763612389565, "rewards/margins_std": 0.02666412852704525, "rewards/rejected": 0.040481775999069214, "step": 520 }, { "dpo_losses": 0.6844004392623901, "epoch": 0.13, "grad_norm": 1.8550097088418327, "learning_rate": 4.989049978756335e-07, "logits/chosen": -2.84458589553833, "logits/rejected": -2.800212860107422, "logps/chosen": -257.49871826171875, "logps/rejected": -220.19772338867188, "loss": 0.6861, "positive_losses": 0.004101562313735485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06252549588680267, "rewards/margins": 0.017953645437955856, "rewards/margins_max": 0.06007077544927597, "rewards/margins_min": -0.0209768358618021, "rewards/margins_std": 0.0362507700920105, "rewards/rejected": 0.04457186535000801, "step": 530 }, { "dpo_losses": 0.6819995641708374, "epoch": 0.13, "grad_norm": 1.8233611809221315, "learning_rate": 4.987009095678842e-07, "logits/chosen": -2.8468234539031982, "logits/rejected": -2.7643303871154785, "logps/chosen": -333.0827331542969, "logps/rejected": -253.0489044189453, "loss": 0.6832, "positive_losses": 0.025171661749482155, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.06934098154306412, "rewards/margins": 0.022915765643119812, "rewards/margins_max": 0.07091207802295685, "rewards/margins_min": -0.02303961291909218, "rewards/margins_std": 0.041294898837804794, "rewards/rejected": 0.04642521217465401, "step": 540 }, { "dpo_losses": 0.6872234344482422, "epoch": 0.13, "grad_norm": 6.618492529222572, "learning_rate": 4.984794408092712e-07, "logits/chosen": -2.7524490356445312, "logits/rejected": -2.767503499984741, "logps/chosen": -225.1552734375, "logps/rejected": -237.30404663085938, "loss": 0.6882, "positive_losses": 0.0015861510764807463, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.06046484038233757, "rewards/margins": 0.012271756306290627, "rewards/margins_max": 0.06029701232910156, "rewards/margins_min": -0.024660592898726463, "rewards/margins_std": 0.037835657596588135, "rewards/rejected": 0.04819308966398239, "step": 550 }, { "dpo_losses": 0.6848558187484741, "epoch": 0.13, "grad_norm": 1.8605013729460484, "learning_rate": 4.982406070771277e-07, "logits/chosen": -2.813084363937378, "logits/rejected": -2.775376796722412, "logps/chosen": -256.5293884277344, "logps/rejected": -241.4282684326172, "loss": 0.6848, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06858629733324051, "rewards/margins": 0.017087718471884727, "rewards/margins_max": 0.06552567332983017, "rewards/margins_min": -0.023014208301901817, "rewards/margins_std": 0.0390985906124115, "rewards/rejected": 0.051498569548130035, "step": 560 }, { "dpo_losses": 0.686092734336853, "epoch": 0.14, "grad_norm": 1.9246395046011886, "learning_rate": 4.979844250623374e-07, "logits/chosen": -2.806347608566284, "logits/rejected": -2.775139331817627, "logps/chosen": -256.4619140625, "logps/rejected": -278.7142639160156, "loss": 0.6855, "positive_losses": 0.013262939639389515, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.06271354854106903, "rewards/margins": 0.014814550057053566, "rewards/margins_max": 0.07110743224620819, "rewards/margins_min": -0.032308489084243774, "rewards/margins_std": 0.046764932572841644, "rewards/rejected": 0.047898996621370316, "step": 570 }, { "dpo_losses": 0.6842057108879089, "epoch": 0.14, "grad_norm": 1.8190925668794924, "learning_rate": 4.977109126681678e-07, "logits/chosen": -2.84385347366333, "logits/rejected": -2.8018012046813965, "logps/chosen": -331.741943359375, "logps/rejected": -275.99530029296875, "loss": 0.6892, "positive_losses": 0.06627921760082245, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0674901157617569, "rewards/margins": 0.01855793222784996, "rewards/margins_max": 0.0716143399477005, "rewards/margins_min": -0.02409081533551216, "rewards/margins_std": 0.04430803656578064, "rewards/rejected": 0.048932187259197235, "step": 580 }, { "dpo_losses": 0.6826609373092651, "epoch": 0.14, "grad_norm": 7.784663788206281, "learning_rate": 4.974200890090191e-07, "logits/chosen": -2.822492837905884, "logits/rejected": -2.81071138381958, "logps/chosen": -240.4092559814453, "logps/rejected": -237.11953735351562, "loss": 0.6878, "positive_losses": 0.0, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0672970563173294, "rewards/margins": 0.0215867031365633, "rewards/margins_max": 0.06920189410448074, "rewards/margins_min": -0.017990760505199432, "rewards/margins_std": 0.03865509107708931, "rewards/rejected": 0.04571035876870155, "step": 590 }, { "dpo_losses": 0.6833235621452332, "epoch": 0.14, "grad_norm": 1.7668240956873207, "learning_rate": 4.971119744090886e-07, "logits/chosen": -2.8321261405944824, "logits/rejected": -2.7825229167938232, "logps/chosen": -258.47003173828125, "logps/rejected": -238.138427734375, "loss": 0.6903, "positive_losses": 0.026993561536073685, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0675448328256607, "rewards/margins": 0.020560389384627342, "rewards/margins_max": 0.08045180141925812, "rewards/margins_min": -0.039693959057331085, "rewards/margins_std": 0.053230851888656616, "rewards/rejected": 0.046984437853097916, "step": 600 }, { "epoch": 0.14, "eval_dpo_losses": 0.6836540102958679, "eval_logits/chosen": -2.7926366329193115, "eval_logits/rejected": -2.7575905323028564, "eval_logps/chosen": -277.4952392578125, "eval_logps/rejected": -260.8594665527344, "eval_loss": 0.6868533492088318, "eval_positive_losses": 0.021267883479595184, "eval_rewards/accuracies": 0.656499981880188, "eval_rewards/chosen": 0.06960085034370422, "eval_rewards/margins": 0.01966278627514839, "eval_rewards/margins_max": 0.09488292783498764, "eval_rewards/margins_min": -0.04335365816950798, "eval_rewards/margins_std": 0.04614293947815895, "eval_rewards/rejected": 0.04993806779384613, "eval_runtime": 858.5673, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 600 }, { "dpo_losses": 0.6829264760017395, "epoch": 0.15, "grad_norm": 2.1127574786314645, "learning_rate": 4.967865904009499e-07, "logits/chosen": -2.855046033859253, "logits/rejected": -2.823148488998413, "logps/chosen": -340.7019958496094, "logps/rejected": -261.4319152832031, "loss": 0.6873, "positive_losses": 0.031710051000118256, "rewards/accuracies": 0.625, "rewards/chosen": 0.06511639803647995, "rewards/margins": 0.021077986806631088, "rewards/margins_max": 0.06853912770748138, "rewards/margins_min": -0.015233929269015789, "rewards/margins_std": 0.03863248601555824, "rewards/rejected": 0.04403840750455856, "step": 610 }, { "dpo_losses": 0.6783616542816162, "epoch": 0.15, "grad_norm": 2.076194497502867, "learning_rate": 4.964439597240486e-07, "logits/chosen": -2.8340821266174316, "logits/rejected": -2.8066117763519287, "logps/chosen": -379.06085205078125, "logps/rejected": -289.2956237792969, "loss": 0.6838, "positive_losses": 0.05157070234417915, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.08368918299674988, "rewards/margins": 0.03066886030137539, "rewards/margins_max": 0.09039265662431717, "rewards/margins_min": -0.031603340059518814, "rewards/margins_std": 0.053332507610321045, "rewards/rejected": 0.05302032083272934, "step": 620 }, { "dpo_losses": 0.6764013767242432, "epoch": 0.15, "grad_norm": 10.305754333754289, "learning_rate": 4.960841063231124e-07, "logits/chosen": -2.815617799758911, "logits/rejected": -2.775735855102539, "logps/chosen": -364.4797668457031, "logps/rejected": -283.71734619140625, "loss": 0.6789, "positive_losses": 0.01079711876809597, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.07987459003925323, "rewards/margins": 0.0345182903110981, "rewards/margins_max": 0.0890834629535675, "rewards/margins_min": -0.02524609863758087, "rewards/margins_std": 0.05067167803645134, "rewards/rejected": 0.04535629600286484, "step": 630 }, { "dpo_losses": 0.6764641404151917, "epoch": 0.15, "grad_norm": 1.9259304275147628, "learning_rate": 4.95707055346479e-07, "logits/chosen": -2.82051944732666, "logits/rejected": -2.7409045696258545, "logps/chosen": -318.0241394042969, "logps/rejected": -244.2421112060547, "loss": 0.6798, "positive_losses": 0.0, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0764562115073204, "rewards/margins": 0.034409306943416595, "rewards/margins_max": 0.08707224577665329, "rewards/margins_min": -0.0161970853805542, "rewards/margins_std": 0.04582378640770912, "rewards/rejected": 0.04204690456390381, "step": 640 }, { "dpo_losses": 0.6828524470329285, "epoch": 0.16, "grad_norm": 9.612647911888258, "learning_rate": 4.95312833144337e-07, "logits/chosen": -2.849984645843506, "logits/rejected": -2.7774062156677246, "logps/chosen": -285.7909851074219, "logps/rejected": -247.87435913085938, "loss": 0.688, "positive_losses": 0.07914247363805771, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.06361918151378632, "rewards/margins": 0.021558113396167755, "rewards/margins_max": 0.09032431989908218, "rewards/margins_min": -0.03832520544528961, "rewards/margins_std": 0.05744075030088425, "rewards/rejected": 0.04206106811761856, "step": 650 }, { "dpo_losses": 0.6823024153709412, "epoch": 0.16, "grad_norm": 5.514750900384, "learning_rate": 4.949014672668858e-07, "logits/chosen": -2.8554444313049316, "logits/rejected": -2.833223819732666, "logps/chosen": -254.039794921875, "logps/rejected": -243.8351593017578, "loss": 0.6861, "positive_losses": 0.006196307949721813, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0682942122220993, "rewards/margins": 0.02269437164068222, "rewards/margins_max": 0.07447636872529984, "rewards/margins_min": -0.03631634637713432, "rewards/margins_std": 0.050204575061798096, "rewards/rejected": 0.045599840581417084, "step": 660 }, { "dpo_losses": 0.6764088869094849, "epoch": 0.16, "grad_norm": 12.536365558876355, "learning_rate": 4.944729864624097e-07, "logits/chosen": -2.934645414352417, "logits/rejected": -2.845205307006836, "logps/chosen": -325.27557373046875, "logps/rejected": -260.515625, "loss": 0.6822, "positive_losses": 0.011296081356704235, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.07887189090251923, "rewards/margins": 0.034532152116298676, "rewards/margins_max": 0.09326545894145966, "rewards/margins_min": -0.01700434461236, "rewards/margins_std": 0.04860921576619148, "rewards/rejected": 0.044339753687381744, "step": 670 }, { "dpo_losses": 0.6812268495559692, "epoch": 0.16, "grad_norm": 3.041135105733259, "learning_rate": 4.940274206752687e-07, "logits/chosen": -2.7797446250915527, "logits/rejected": -2.7533607482910156, "logps/chosen": -323.32989501953125, "logps/rejected": -256.44854736328125, "loss": 0.6806, "positive_losses": 0.04186515882611275, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.08234813064336777, "rewards/margins": 0.025063227862119675, "rewards/margins_max": 0.09514982998371124, "rewards/margins_min": -0.038404058665037155, "rewards/margins_std": 0.059386175125837326, "rewards/rejected": 0.057284899055957794, "step": 680 }, { "dpo_losses": 0.6822247505187988, "epoch": 0.17, "grad_norm": 7.490466607384208, "learning_rate": 4.935648010438058e-07, "logits/chosen": -2.796726703643799, "logits/rejected": -2.783893346786499, "logps/chosen": -253.38290405273438, "logps/rejected": -259.58447265625, "loss": 0.6902, "positive_losses": 0.10077762603759766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0766158252954483, "rewards/margins": 0.022741660475730896, "rewards/margins_max": 0.07919380813837051, "rewards/margins_min": -0.03171208128333092, "rewards/margins_std": 0.049976833164691925, "rewards/rejected": 0.05387415736913681, "step": 690 }, { "dpo_losses": 0.6822782754898071, "epoch": 0.17, "grad_norm": 1.7464238821195723, "learning_rate": 4.930851598981713e-07, "logits/chosen": -2.820312023162842, "logits/rejected": -2.7486772537231445, "logps/chosen": -273.87255859375, "logps/rejected": -240.78298950195312, "loss": 0.6828, "positive_losses": 0.019672393798828125, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.07670161873102188, "rewards/margins": 0.02271401882171631, "rewards/margins_max": 0.09424431622028351, "rewards/margins_min": -0.038834765553474426, "rewards/margins_std": 0.05919056013226509, "rewards/rejected": 0.053987592458724976, "step": 700 }, { "epoch": 0.17, "eval_dpo_losses": 0.6812664866447449, "eval_logits/chosen": -2.784299850463867, "eval_logits/rejected": -2.7490391731262207, "eval_logps/chosen": -276.0510559082031, "eval_logps/rejected": -259.93243408203125, "eval_loss": 0.6855286359786987, "eval_positive_losses": 0.03017713874578476, "eval_rewards/accuracies": 0.659500002861023, "eval_rewards/chosen": 0.08404278755187988, "eval_rewards/margins": 0.024834182113409042, "eval_rewards/margins_max": 0.11987937986850739, "eval_rewards/margins_min": -0.05390350520610809, "eval_rewards/margins_std": 0.05804399028420448, "eval_rewards/rejected": 0.05920860171318054, "eval_runtime": 858.6348, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 700 }, { "dpo_losses": 0.6741176843643188, "epoch": 0.17, "grad_norm": 2.0576792935275914, "learning_rate": 4.925885307580632e-07, "logits/chosen": -2.7647995948791504, "logits/rejected": -2.7052018642425537, "logps/chosen": -283.0727844238281, "logps/rejected": -238.3861846923828, "loss": 0.6807, "positive_losses": 0.020458221435546875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.09290773421525955, "rewards/margins": 0.03960338234901428, "rewards/margins_max": 0.1098104938864708, "rewards/margins_min": -0.03187655657529831, "rewards/margins_std": 0.06279050558805466, "rewards/rejected": 0.05330435559153557, "step": 710 }, { "dpo_losses": 0.6792494654655457, "epoch": 0.17, "grad_norm": 1.971246680315569, "learning_rate": 4.920749483303846e-07, "logits/chosen": -2.650237560272217, "logits/rejected": -2.6716513633728027, "logps/chosen": -266.8646545410156, "logps/rejected": -268.7168884277344, "loss": 0.6801, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08385501801967621, "rewards/margins": 0.02894839271903038, "rewards/margins_max": 0.09635502099990845, "rewards/margins_min": -0.028833651915192604, "rewards/margins_std": 0.055216819047927856, "rewards/rejected": 0.05490662530064583, "step": 720 }, { "dpo_losses": 0.679061770439148, "epoch": 0.17, "grad_norm": 8.39794011971951, "learning_rate": 4.915444485068181e-07, "logits/chosen": -2.86403489112854, "logits/rejected": -2.792564868927002, "logps/chosen": -314.6549072265625, "logps/rejected": -281.30120849609375, "loss": 0.6799, "positive_losses": 0.030918121337890625, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08858244121074677, "rewards/margins": 0.029324281960725784, "rewards/margins_max": 0.09129904955625534, "rewards/margins_min": -0.027634361758828163, "rewards/margins_std": 0.05307114124298096, "rewards/rejected": 0.059258151799440384, "step": 730 }, { "dpo_losses": 0.6792556643486023, "epoch": 0.18, "grad_norm": 5.056005008453592, "learning_rate": 4.90997068361318e-07, "logits/chosen": -2.866844654083252, "logits/rejected": -2.8213424682617188, "logps/chosen": -253.1515655517578, "logps/rejected": -252.10171508789062, "loss": 0.6776, "positive_losses": 0.0017368316184729338, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.08284234255552292, "rewards/margins": 0.028836002573370934, "rewards/margins_max": 0.09531380981206894, "rewards/margins_min": -0.027020279318094254, "rewards/margins_std": 0.055192988365888596, "rewards/rejected": 0.054006338119506836, "step": 740 }, { "dpo_losses": 0.6802335977554321, "epoch": 0.18, "grad_norm": 1.5757891297793503, "learning_rate": 4.904328461475189e-07, "logits/chosen": -2.8550000190734863, "logits/rejected": -2.8209452629089355, "logps/chosen": -277.0017395019531, "logps/rejected": -270.6773681640625, "loss": 0.6841, "positive_losses": 0.018401335924863815, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08508224040269852, "rewards/margins": 0.027111947536468506, "rewards/margins_max": 0.10018207132816315, "rewards/margins_min": -0.03255631402134895, "rewards/margins_std": 0.06011735275387764, "rewards/rejected": 0.05797029286623001, "step": 750 }, { "dpo_losses": 0.6842832565307617, "epoch": 0.18, "grad_norm": 1.6432488667495653, "learning_rate": 4.898518212960625e-07, "logits/chosen": -2.81992506980896, "logits/rejected": -2.833505392074585, "logps/chosen": -267.8069152832031, "logps/rejected": -277.53619384765625, "loss": 0.6872, "positive_losses": 0.08895711600780487, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0821099728345871, "rewards/margins": 0.018676279112696648, "rewards/margins_max": 0.08383579552173615, "rewards/margins_min": -0.04039671644568443, "rewards/margins_std": 0.054400622844696045, "rewards/rejected": 0.0634337067604065, "step": 760 }, { "dpo_losses": 0.6787993311882019, "epoch": 0.18, "grad_norm": 1.8681506621562654, "learning_rate": 4.89254034411842e-07, "logits/chosen": -2.8575997352600098, "logits/rejected": -2.785329818725586, "logps/chosen": -261.1501159667969, "logps/rejected": -264.2248840332031, "loss": 0.681, "positive_losses": 0.024294376373291016, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.09593097865581512, "rewards/margins": 0.02982424572110176, "rewards/margins_max": 0.09621700644493103, "rewards/margins_min": -0.03625832498073578, "rewards/margins_std": 0.058905910700559616, "rewards/rejected": 0.06610673666000366, "step": 770 }, { "dpo_losses": 0.6734384298324585, "epoch": 0.19, "grad_norm": 3.695882418599355, "learning_rate": 4.886395272711646e-07, "logits/chosen": -2.890214443206787, "logits/rejected": -2.8210690021514893, "logps/chosen": -300.6239318847656, "logps/rejected": -234.56192016601562, "loss": 0.6788, "positive_losses": 0.013837432488799095, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.09861142933368683, "rewards/margins": 0.04101376235485077, "rewards/margins_max": 0.11582870781421661, "rewards/margins_min": -0.028116285800933838, "rewards/margins_std": 0.06481508910655975, "rewards/rejected": 0.05759765952825546, "step": 780 }, { "dpo_losses": 0.6743272542953491, "epoch": 0.19, "grad_norm": 2.413148972580679, "learning_rate": 4.880083428188314e-07, "logits/chosen": -2.8267250061035156, "logits/rejected": -2.7915942668914795, "logps/chosen": -297.55572509765625, "logps/rejected": -247.77352905273438, "loss": 0.6795, "positive_losses": 0.04569215700030327, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.10070053488016129, "rewards/margins": 0.03933431953191757, "rewards/margins_max": 0.11786572635173798, "rewards/margins_min": -0.02329222485423088, "rewards/margins_std": 0.0648992508649826, "rewards/rejected": 0.06136621907353401, "step": 790 }, { "dpo_losses": 0.6709287166595459, "epoch": 0.19, "grad_norm": 2.913817694559649, "learning_rate": 4.873605251651373e-07, "logits/chosen": -2.837273120880127, "logits/rejected": -2.764631748199463, "logps/chosen": -296.6575012207031, "logps/rejected": -244.49575805664062, "loss": 0.6758, "positive_losses": 0.08764095604419708, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.10942758619785309, "rewards/margins": 0.046719592064619064, "rewards/margins_max": 0.14401772618293762, "rewards/margins_min": -0.03062589466571808, "rewards/margins_std": 0.07762106508016586, "rewards/rejected": 0.06270798295736313, "step": 800 }, { "epoch": 0.19, "eval_dpo_losses": 0.6790503263473511, "eval_logits/chosen": -2.7803642749786377, "eval_logits/rejected": -2.7450077533721924, "eval_logps/chosen": -274.7613220214844, "eval_logps/rejected": -259.1296081542969, "eval_loss": 0.6855273246765137, "eval_positive_losses": 0.052617669105529785, "eval_rewards/accuracies": 0.6549999713897705, "eval_rewards/chosen": 0.09694031625986099, "eval_rewards/margins": 0.029703423380851746, "eval_rewards/margins_max": 0.14226162433624268, "eval_rewards/margins_min": -0.06403277069330215, "eval_rewards/margins_std": 0.06884337216615677, "eval_rewards/rejected": 0.06723689287900925, "eval_runtime": 858.8922, "eval_samples_per_second": 4.657, "eval_steps_per_second": 0.291, "step": 800 }, { "dpo_losses": 0.6825414896011353, "epoch": 0.19, "grad_norm": 5.81118595487603, "learning_rate": 4.866961195827869e-07, "logits/chosen": -2.7995543479919434, "logits/rejected": -2.7924509048461914, "logps/chosen": -235.26998901367188, "logps/rejected": -233.256103515625, "loss": 0.6882, "positive_losses": 0.06627750396728516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.08960787206888199, "rewards/margins": 0.023004651069641113, "rewards/margins_max": 0.09804173558950424, "rewards/margins_min": -0.06027917191386223, "rewards/margins_std": 0.07087436318397522, "rewards/rejected": 0.06660322099924088, "step": 810 }, { "dpo_losses": 0.6721338033676147, "epoch": 0.2, "grad_norm": 3.4237370410909316, "learning_rate": 4.860151725037318e-07, "logits/chosen": -2.7623214721679688, "logits/rejected": -2.7498514652252197, "logps/chosen": -279.1158447265625, "logps/rejected": -255.15841674804688, "loss": 0.6706, "positive_losses": 0.009643363766372204, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0987878367304802, "rewards/margins": 0.04378185048699379, "rewards/margins_max": 0.11617567390203476, "rewards/margins_min": -0.02401834912598133, "rewards/margins_std": 0.06359706073999405, "rewards/rejected": 0.055005986243486404, "step": 820 }, { "dpo_losses": 0.6715747117996216, "epoch": 0.2, "grad_norm": 7.76565299998096, "learning_rate": 4.853177315159253e-07, "logits/chosen": -2.8777451515197754, "logits/rejected": -2.8178610801696777, "logps/chosen": -339.04840087890625, "logps/rejected": -274.03558349609375, "loss": 0.6826, "positive_losses": 0.11019305884838104, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1009032279253006, "rewards/margins": 0.045209161937236786, "rewards/margins_max": 0.12633730471134186, "rewards/margins_min": -0.029953669756650925, "rewards/margins_std": 0.06860056519508362, "rewards/rejected": 0.05569406598806381, "step": 830 }, { "dpo_losses": 0.6728617548942566, "epoch": 0.2, "grad_norm": 1.720077764727041, "learning_rate": 4.846038453599967e-07, "logits/chosen": -2.837223529815674, "logits/rejected": -2.7529520988464355, "logps/chosen": -291.3700256347656, "logps/rejected": -254.5083465576172, "loss": 0.6793, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.117365263402462, "rewards/margins": 0.042250119149684906, "rewards/margins_max": 0.11362957954406738, "rewards/margins_min": -0.023388735949993134, "rewards/margins_std": 0.06044607609510422, "rewards/rejected": 0.0751151442527771, "step": 840 }, { "dpo_losses": 0.6893151998519897, "epoch": 0.2, "grad_norm": 4.825143430485342, "learning_rate": 4.838735639258449e-07, "logits/chosen": -2.8601608276367188, "logits/rejected": -2.8401591777801514, "logps/chosen": -251.51132202148438, "logps/rejected": -266.95513916015625, "loss": 0.6859, "positive_losses": 0.1546136438846588, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.08641336858272552, "rewards/margins": 0.00905313529074192, "rewards/margins_max": 0.07953067123889923, "rewards/margins_min": -0.06482794135808945, "rewards/margins_std": 0.06349007785320282, "rewards/rejected": 0.07736023515462875, "step": 850 }, { "dpo_losses": 0.6841599941253662, "epoch": 0.21, "grad_norm": 10.455103847985823, "learning_rate": 4.831269382491519e-07, "logits/chosen": -2.805694580078125, "logits/rejected": -2.8238396644592285, "logps/chosen": -252.17391967773438, "logps/rejected": -271.09564208984375, "loss": 0.6831, "positive_losses": 0.004224872682243586, "rewards/accuracies": 0.625, "rewards/chosen": 0.10964903980493546, "rewards/margins": 0.019747644662857056, "rewards/margins_max": 0.10010750591754913, "rewards/margins_min": -0.06063016504049301, "rewards/margins_std": 0.07033602148294449, "rewards/rejected": 0.089901402592659, "step": 860 }, { "dpo_losses": 0.6822604537010193, "epoch": 0.21, "grad_norm": 1.6874951963003098, "learning_rate": 4.823640205078166e-07, "logits/chosen": -2.838261127471924, "logits/rejected": -2.8175485134124756, "logps/chosen": -229.46517944335938, "logps/rejected": -237.25131225585938, "loss": 0.6844, "positive_losses": 0.08146629482507706, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.1082630529999733, "rewards/margins": 0.02317950129508972, "rewards/margins_max": 0.09823676198720932, "rewards/margins_min": -0.049094945192337036, "rewards/margins_std": 0.06422857940196991, "rewards/rejected": 0.08508355170488358, "step": 870 }, { "dpo_losses": 0.6781774759292603, "epoch": 0.21, "grad_norm": 7.971632252955927, "learning_rate": 4.815848640183081e-07, "logits/chosen": -2.772138833999634, "logits/rejected": -2.729099750518799, "logps/chosen": -318.7154846191406, "logps/rejected": -279.6519775390625, "loss": 0.6813, "positive_losses": 0.032731153070926666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1219932809472084, "rewards/margins": 0.03179420530796051, "rewards/margins_max": 0.12229770421981812, "rewards/margins_min": -0.04193325713276863, "rewards/margins_std": 0.07462559640407562, "rewards/rejected": 0.0901990756392479, "step": 880 }, { "dpo_losses": 0.6753103137016296, "epoch": 0.21, "grad_norm": 1.8890347689351432, "learning_rate": 4.807895232319393e-07, "logits/chosen": -2.7868289947509766, "logits/rejected": -2.735607147216797, "logps/chosen": -279.2073059082031, "logps/rejected": -202.9532012939453, "loss": 0.6777, "positive_losses": 0.0, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.11181597411632538, "rewards/margins": 0.03742646425962448, "rewards/margins_max": 0.11328178644180298, "rewards/margins_min": -0.03999562934041023, "rewards/margins_std": 0.06738351285457611, "rewards/rejected": 0.07438952475786209, "step": 890 }, { "dpo_losses": 0.6749289631843567, "epoch": 0.22, "grad_norm": 6.503640549189846, "learning_rate": 4.799780537310621e-07, "logits/chosen": -2.8019609451293945, "logits/rejected": -2.7577738761901855, "logps/chosen": -309.42169189453125, "logps/rejected": -269.05584716796875, "loss": 0.6811, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11420547962188721, "rewards/margins": 0.038339123129844666, "rewards/margins_max": 0.12281061708927155, "rewards/margins_min": -0.04068036749958992, "rewards/margins_std": 0.07314107567071915, "rewards/rejected": 0.07586634904146194, "step": 900 }, { "epoch": 0.22, "eval_dpo_losses": 0.6771374344825745, "eval_logits/chosen": -2.772634267807007, "eval_logits/rejected": -2.7378244400024414, "eval_logps/chosen": -273.8141174316406, "eval_logps/rejected": -258.60400390625, "eval_loss": 0.685390830039978, "eval_positive_losses": 0.05944617837667465, "eval_rewards/accuracies": 0.6644999980926514, "eval_rewards/chosen": 0.1064121350646019, "eval_rewards/margins": 0.033919692039489746, "eval_rewards/margins_max": 0.1596282422542572, "eval_rewards/margins_min": -0.07149993628263474, "eval_rewards/margins_std": 0.07709173858165741, "eval_rewards/rejected": 0.07249243557453156, "eval_runtime": 858.1907, "eval_samples_per_second": 4.661, "eval_steps_per_second": 0.291, "step": 900 }, { "dpo_losses": 0.6706277132034302, "epoch": 0.22, "grad_norm": 9.238824227568687, "learning_rate": 4.791505122251827e-07, "logits/chosen": -2.849838972091675, "logits/rejected": -2.7760417461395264, "logps/chosen": -234.4337921142578, "logps/rejected": -210.66476440429688, "loss": 0.676, "positive_losses": 0.02234802208840847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.10051780939102173, "rewards/margins": 0.04713047668337822, "rewards/margins_max": 0.12348206341266632, "rewards/margins_min": -0.024385234341025352, "rewards/margins_std": 0.06711273640394211, "rewards/rejected": 0.05338733270764351, "step": 910 }, { "dpo_losses": 0.6776150465011597, "epoch": 0.22, "grad_norm": 1.944472411121095, "learning_rate": 4.783069565469985e-07, "logits/chosen": -2.7612035274505615, "logits/rejected": -2.7449731826782227, "logps/chosen": -273.45623779296875, "logps/rejected": -266.89410400390625, "loss": 0.6831, "positive_losses": 0.03824806213378906, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.09719910472631454, "rewards/margins": 0.03258504718542099, "rewards/margins_max": 0.11059228330850601, "rewards/margins_min": -0.043902844190597534, "rewards/margins_std": 0.06737259030342102, "rewards/rejected": 0.06461404263973236, "step": 920 }, { "dpo_losses": 0.6759111285209656, "epoch": 0.22, "grad_norm": 8.308565284095572, "learning_rate": 4.77447445648357e-07, "logits/chosen": -2.77830171585083, "logits/rejected": -2.739842176437378, "logps/chosen": -253.650390625, "logps/rejected": -211.558349609375, "loss": 0.6845, "positive_losses": 0.16712895035743713, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.10256584733724594, "rewards/margins": 0.03615504503250122, "rewards/margins_max": 0.12079312652349472, "rewards/margins_min": -0.03966181352734566, "rewards/margins_std": 0.07190297544002533, "rewards/rejected": 0.06641080230474472, "step": 930 }, { "dpo_losses": 0.6719256639480591, "epoch": 0.23, "grad_norm": 7.628548218685171, "learning_rate": 4.765720395961349e-07, "logits/chosen": -2.8013808727264404, "logits/rejected": -2.7999517917633057, "logps/chosen": -266.1254577636719, "logps/rejected": -261.2449035644531, "loss": 0.6837, "positive_losses": 0.11001205444335938, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.11776383221149445, "rewards/margins": 0.04437459260225296, "rewards/margins_max": 0.1220204085111618, "rewards/margins_min": -0.033194057643413544, "rewards/margins_std": 0.07096997648477554, "rewards/rejected": 0.07338923960924149, "step": 940 }, { "dpo_losses": 0.6771588325500488, "epoch": 0.23, "grad_norm": 1.5965960652796636, "learning_rate": 4.7568079956804144e-07, "logits/chosen": -2.859605312347412, "logits/rejected": -2.8140926361083984, "logps/chosen": -310.8310241699219, "logps/rejected": -288.5025634765625, "loss": 0.6834, "positive_losses": 0.04455604404211044, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.10955788195133209, "rewards/margins": 0.03450103849172592, "rewards/margins_max": 0.14637137949466705, "rewards/margins_min": -0.056033432483673096, "rewards/margins_std": 0.08897896111011505, "rewards/rejected": 0.07505683600902557, "step": 950 }, { "dpo_losses": 0.672426164150238, "epoch": 0.23, "grad_norm": 1.8200930352588245, "learning_rate": 4.74773787848342e-07, "logits/chosen": -2.8744921684265137, "logits/rejected": -2.811471462249756, "logps/chosen": -292.01104736328125, "logps/rejected": -237.38427734375, "loss": 0.6831, "positive_losses": 0.05643119663000107, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11575409024953842, "rewards/margins": 0.04389750584959984, "rewards/margins_max": 0.15557856857776642, "rewards/margins_min": -0.03475785255432129, "rewards/margins_std": 0.085698202252388, "rewards/rejected": 0.07185657322406769, "step": 960 }, { "dpo_losses": 0.6710332632064819, "epoch": 0.23, "grad_norm": 2.0062879422126505, "learning_rate": 4.7385106782350637e-07, "logits/chosen": -2.8053956031799316, "logits/rejected": -2.747124195098877, "logps/chosen": -308.17889404296875, "logps/rejected": -285.44268798828125, "loss": 0.6764, "positive_losses": 0.04387817531824112, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1223042830824852, "rewards/margins": 0.04687241092324257, "rewards/margins_max": 0.14555902779102325, "rewards/margins_min": -0.051617592573165894, "rewards/margins_std": 0.08880055695772171, "rewards/rejected": 0.07543185353279114, "step": 970 }, { "dpo_losses": 0.6777924299240112, "epoch": 0.23, "grad_norm": 1.9347177648603264, "learning_rate": 4.729127039777781e-07, "logits/chosen": -2.7302935123443604, "logits/rejected": -2.7068936824798584, "logps/chosen": -232.22683715820312, "logps/rejected": -216.17098999023438, "loss": 0.6845, "positive_losses": 0.07324676215648651, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.10641026496887207, "rewards/margins": 0.032620370388031006, "rewards/margins_max": 0.12086065858602524, "rewards/margins_min": -0.04100584238767624, "rewards/margins_std": 0.07166214287281036, "rewards/rejected": 0.07378989458084106, "step": 980 }, { "dpo_losses": 0.677514910697937, "epoch": 0.24, "grad_norm": 4.12945708874403, "learning_rate": 4.719587618886685e-07, "logits/chosen": -2.8368868827819824, "logits/rejected": -2.7616496086120605, "logps/chosen": -298.9474182128906, "logps/rejected": -289.38018798828125, "loss": 0.6978, "positive_losses": 0.17233982682228088, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.11195279657840729, "rewards/margins": 0.03407040983438492, "rewards/margins_max": 0.12920589745044708, "rewards/margins_min": -0.06014276668429375, "rewards/margins_std": 0.08397029340267181, "rewards/rejected": 0.07788237929344177, "step": 990 }, { "dpo_losses": 0.6773067712783813, "epoch": 0.24, "grad_norm": 7.417685906600816, "learning_rate": 4.709893082223737e-07, "logits/chosen": -2.832045793533325, "logits/rejected": -2.7823894023895264, "logps/chosen": -286.35467529296875, "logps/rejected": -272.12164306640625, "loss": 0.6803, "positive_losses": 0.0036687850952148438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12079496681690216, "rewards/margins": 0.033833663910627365, "rewards/margins_max": 0.13215182721614838, "rewards/margins_min": -0.050864316523075104, "rewards/margins_std": 0.0818435326218605, "rewards/rejected": 0.0869612991809845, "step": 1000 }, { "epoch": 0.24, "eval_dpo_losses": 0.6762186884880066, "eval_logits/chosen": -2.7634224891662598, "eval_logits/rejected": -2.7284905910491943, "eval_logps/chosen": -272.7884826660156, "eval_logps/rejected": -257.78558349609375, "eval_loss": 0.6844692230224609, "eval_positive_losses": 0.06086179241538048, "eval_rewards/accuracies": 0.6644999980926514, "eval_rewards/chosen": 0.11666864901781082, "eval_rewards/margins": 0.03599197044968605, "eval_rewards/margins_max": 0.16871315240859985, "eval_rewards/margins_min": -0.07634512335062027, "eval_rewards/margins_std": 0.0818052813410759, "eval_rewards/rejected": 0.08067668229341507, "eval_runtime": 858.2184, "eval_samples_per_second": 4.661, "eval_steps_per_second": 0.291, "step": 1000 }, { "dpo_losses": 0.6751042008399963, "epoch": 0.24, "grad_norm": 8.950416461482916, "learning_rate": 4.7000441072911554e-07, "logits/chosen": -2.766784191131592, "logits/rejected": -2.746922016143799, "logps/chosen": -241.67190551757812, "logps/rejected": -267.1072082519531, "loss": 0.6802, "positive_losses": 0.05336589738726616, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12016680091619492, "rewards/margins": 0.03801379352807999, "rewards/margins_max": 0.12579157948493958, "rewards/margins_min": -0.04440592974424362, "rewards/margins_std": 0.07636955380439758, "rewards/rejected": 0.08215299248695374, "step": 1010 }, { "dpo_losses": 0.6760284900665283, "epoch": 0.24, "grad_norm": 2.151208152208809, "learning_rate": 4.690041382384071e-07, "logits/chosen": -2.739492654800415, "logits/rejected": -2.754105567932129, "logps/chosen": -220.57510375976562, "logps/rejected": -224.20114135742188, "loss": 0.6762, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1192348375916481, "rewards/margins": 0.03604461997747421, "rewards/margins_max": 0.11405354738235474, "rewards/margins_min": -0.041035592555999756, "rewards/margins_std": 0.06971906125545502, "rewards/rejected": 0.08319021761417389, "step": 1020 }, { "dpo_losses": 0.6773991584777832, "epoch": 0.25, "grad_norm": 1.8922695857166008, "learning_rate": 4.679885606542423e-07, "logits/chosen": -2.806206464767456, "logits/rejected": -2.7961766719818115, "logps/chosen": -245.4696807861328, "logps/rejected": -245.60867309570312, "loss": 0.6788, "positive_losses": 0.0049041747115552425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1119534820318222, "rewards/margins": 0.03354022651910782, "rewards/margins_max": 0.11348341405391693, "rewards/margins_min": -0.047497041523456573, "rewards/margins_std": 0.07265783101320267, "rewards/rejected": 0.07841327041387558, "step": 1030 }, { "dpo_losses": 0.6680446863174438, "epoch": 0.25, "grad_norm": 1.9565565557653075, "learning_rate": 4.669577489502108e-07, "logits/chosen": -2.8382039070129395, "logits/rejected": -2.7731876373291016, "logps/chosen": -269.0843811035156, "logps/rejected": -256.26318359375, "loss": 0.6728, "positive_losses": 0.07880592346191406, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1247919350862503, "rewards/margins": 0.05227974057197571, "rewards/margins_max": 0.13477292656898499, "rewards/margins_min": -0.013504236936569214, "rewards/margins_std": 0.06570630520582199, "rewards/rejected": 0.072512187063694, "step": 1040 }, { "dpo_losses": 0.680415153503418, "epoch": 0.25, "grad_norm": 2.295910898918755, "learning_rate": 4.6591177516453795e-07, "logits/chosen": -2.6781487464904785, "logits/rejected": -2.701482057571411, "logps/chosen": -242.3964385986328, "logps/rejected": -235.07510375976562, "loss": 0.6775, "positive_losses": 0.08710632473230362, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0970238447189331, "rewards/margins": 0.02750151790678501, "rewards/margins_max": 0.1096399649977684, "rewards/margins_min": -0.05270353704690933, "rewards/margins_std": 0.07191523909568787, "rewards/rejected": 0.06952232122421265, "step": 1050 }, { "dpo_losses": 0.6638389229774475, "epoch": 0.25, "grad_norm": 11.897661610864574, "learning_rate": 4.6485071239505037e-07, "logits/chosen": -2.7989861965179443, "logits/rejected": -2.784994125366211, "logps/chosen": -279.7257385253906, "logps/rejected": -255.8384246826172, "loss": 0.6812, "positive_losses": 0.042810820043087006, "rewards/accuracies": 0.75, "rewards/chosen": 0.13547874987125397, "rewards/margins": 0.06184696406126022, "rewards/margins_max": 0.15981335937976837, "rewards/margins_min": -0.029845666140317917, "rewards/margins_std": 0.0870203971862793, "rewards/rejected": 0.07363177835941315, "step": 1060 }, { "dpo_losses": 0.6708803176879883, "epoch": 0.26, "grad_norm": 1.873632631805898, "learning_rate": 4.6377463479406777e-07, "logits/chosen": -2.8089914321899414, "logits/rejected": -2.756983995437622, "logps/chosen": -283.4353942871094, "logps/rejected": -245.7755584716797, "loss": 0.6726, "positive_losses": 0.006662941072136164, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.11114688962697983, "rewards/margins": 0.04698692634701729, "rewards/margins_max": 0.14130660891532898, "rewards/margins_min": -0.04065307229757309, "rewards/margins_std": 0.08042062819004059, "rewards/rejected": 0.06415997445583344, "step": 1070 }, { "dpo_losses": 0.6638616323471069, "epoch": 0.26, "grad_norm": 1.756100240275037, "learning_rate": 4.6268361756322037e-07, "logits/chosen": -2.7897324562072754, "logits/rejected": -2.725583076477051, "logps/chosen": -305.89300537109375, "logps/rejected": -263.3019104003906, "loss": 0.6739, "positive_losses": 0.073638916015625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13587689399719238, "rewards/margins": 0.06179635971784592, "rewards/margins_max": 0.15654256939888, "rewards/margins_min": -0.020060013979673386, "rewards/margins_std": 0.08047287166118622, "rewards/rejected": 0.07408054172992706, "step": 1080 }, { "dpo_losses": 0.6799504160881042, "epoch": 0.26, "grad_norm": 1.9340598421237254, "learning_rate": 4.6157773694819396e-07, "logits/chosen": -2.7950727939605713, "logits/rejected": -2.7836081981658936, "logps/chosen": -263.2037353515625, "logps/rejected": -319.5477600097656, "loss": 0.6819, "positive_losses": 0.018991852179169655, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11942502111196518, "rewards/margins": 0.028681915253400803, "rewards/margins_max": 0.12958405911922455, "rewards/margins_min": -0.061748165637254715, "rewards/margins_std": 0.08600357174873352, "rewards/rejected": 0.09074309468269348, "step": 1090 }, { "dpo_losses": 0.6719014644622803, "epoch": 0.26, "grad_norm": 1.8627249409064275, "learning_rate": 4.60457070233401e-07, "logits/chosen": -2.6873061656951904, "logits/rejected": -2.6814308166503906, "logps/chosen": -242.2404022216797, "logps/rejected": -223.6565704345703, "loss": 0.6759, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11840157210826874, "rewards/margins": 0.044380877166986465, "rewards/margins_max": 0.1252276748418808, "rewards/margins_min": -0.02278958447277546, "rewards/margins_std": 0.06606931984424591, "rewards/rejected": 0.07402069866657257, "step": 1100 }, { "epoch": 0.26, "eval_dpo_losses": 0.6749641299247742, "eval_logits/chosen": -2.767153263092041, "eval_logits/rejected": -2.7320351600646973, "eval_logps/chosen": -271.95263671875, "eval_logps/rejected": -257.2344665527344, "eval_loss": 0.6841889023780823, "eval_positive_losses": 0.06760016083717346, "eval_rewards/accuracies": 0.6610000133514404, "eval_rewards/chosen": 0.12502706050872803, "eval_rewards/margins": 0.03883914276957512, "eval_rewards/margins_max": 0.18149109184741974, "eval_rewards/margins_min": -0.08285294473171234, "eval_rewards/margins_std": 0.088114432990551, "eval_rewards/rejected": 0.0861879289150238, "eval_runtime": 858.3723, "eval_samples_per_second": 4.66, "eval_steps_per_second": 0.291, "step": 1100 }, { "dpo_losses": 0.6639162302017212, "epoch": 0.27, "grad_norm": 2.1047083343176998, "learning_rate": 4.5932169573657987e-07, "logits/chosen": -2.8587257862091064, "logits/rejected": -2.8369314670562744, "logps/chosen": -305.2558898925781, "logps/rejected": -302.9868469238281, "loss": 0.6753, "positive_losses": 0.02009124681353569, "rewards/accuracies": 0.8125, "rewards/chosen": 0.15367315709590912, "rewards/margins": 0.06176639348268509, "rewards/margins_max": 0.14987319707870483, "rewards/margins_min": -0.03467453271150589, "rewards/margins_std": 0.0812709704041481, "rewards/rejected": 0.09190677851438522, "step": 1110 }, { "dpo_losses": 0.6814800500869751, "epoch": 0.27, "grad_norm": 11.094767435188444, "learning_rate": 4.581716928033216e-07, "logits/chosen": -2.8125529289245605, "logits/rejected": -2.7963242530822754, "logps/chosen": -266.97503662109375, "logps/rejected": -273.31298828125, "loss": 0.6855, "positive_losses": 0.1746665984392166, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.1335349977016449, "rewards/margins": 0.025999641045928, "rewards/margins_max": 0.14386391639709473, "rewards/margins_min": -0.08258634060621262, "rewards/margins_std": 0.0986047238111496, "rewards/rejected": 0.10753533989191055, "step": 1120 }, { "dpo_losses": 0.6785280108451843, "epoch": 0.27, "grad_norm": 2.2360163708737533, "learning_rate": 4.5700714180152467e-07, "logits/chosen": -2.735635757446289, "logits/rejected": -2.6962106227874756, "logps/chosen": -211.38729858398438, "logps/rejected": -213.6187744140625, "loss": 0.6837, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12530580163002014, "rewards/margins": 0.03131798654794693, "rewards/margins_max": 0.11877693980932236, "rewards/margins_min": -0.049319393932819366, "rewards/margins_std": 0.07619408518075943, "rewards/rejected": 0.09398780763149261, "step": 1130 }, { "dpo_losses": 0.6719018220901489, "epoch": 0.27, "grad_norm": 5.917426942504068, "learning_rate": 4.5582812411577887e-07, "logits/chosen": -2.7792747020721436, "logits/rejected": -2.750622034072876, "logps/chosen": -265.9587707519531, "logps/rejected": -243.9398956298828, "loss": 0.6793, "positive_losses": 0.03756408765912056, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1344030201435089, "rewards/margins": 0.04468253254890442, "rewards/margins_max": 0.13286662101745605, "rewards/margins_min": -0.038429200649261475, "rewards/margins_std": 0.07810581475496292, "rewards/rejected": 0.0897204726934433, "step": 1140 }, { "dpo_losses": 0.6708990335464478, "epoch": 0.28, "grad_norm": 1.8985021760700524, "learning_rate": 4.546347221416772e-07, "logits/chosen": -2.7768023014068604, "logits/rejected": -2.741556406021118, "logps/chosen": -251.93905639648438, "logps/rejected": -238.93624877929688, "loss": 0.6784, "positive_losses": 0.14313526451587677, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12111435830593109, "rewards/margins": 0.046721406280994415, "rewards/margins_max": 0.14194932579994202, "rewards/margins_min": -0.03253781050443649, "rewards/margins_std": 0.07779671996831894, "rewards/rejected": 0.07439295947551727, "step": 1150 }, { "dpo_losses": 0.6691399812698364, "epoch": 0.28, "grad_norm": 2.352301497319073, "learning_rate": 4.534270192800581e-07, "logits/chosen": -2.7497522830963135, "logits/rejected": -2.7177019119262695, "logps/chosen": -250.85317993164062, "logps/rejected": -245.84146118164062, "loss": 0.6705, "positive_losses": 0.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1404324173927307, "rewards/margins": 0.05041662976145744, "rewards/margins_max": 0.13572020828723907, "rewards/margins_min": -0.03052157163619995, "rewards/margins_std": 0.07493475079536438, "rewards/rejected": 0.09001577645540237, "step": 1160 }, { "dpo_losses": 0.670620322227478, "epoch": 0.28, "grad_norm": 1.9661009389707538, "learning_rate": 4.5220509993117684e-07, "logits/chosen": -2.8339881896972656, "logits/rejected": -2.755382537841797, "logps/chosen": -282.88616943359375, "logps/rejected": -251.1024932861328, "loss": 0.6674, "positive_losses": 0.006706619169563055, "rewards/accuracies": 0.75, "rewards/chosen": 0.12689003348350525, "rewards/margins": 0.04794115573167801, "rewards/margins_max": 0.1437653750181198, "rewards/margins_min": -0.044886935502290726, "rewards/margins_std": 0.08534976840019226, "rewards/rejected": 0.07894886285066605, "step": 1170 }, { "dpo_losses": 0.6661661267280579, "epoch": 0.28, "grad_norm": 13.531131594097992, "learning_rate": 4.509690494888071e-07, "logits/chosen": -2.816774368286133, "logits/rejected": -2.7480838298797607, "logps/chosen": -325.19830322265625, "logps/rejected": -274.260498046875, "loss": 0.6748, "positive_losses": 0.09701671451330185, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.14096954464912415, "rewards/margins": 0.056869667023420334, "rewards/margins_max": 0.15933354198932648, "rewards/margins_min": -0.0360349602997303, "rewards/margins_std": 0.08559076488018036, "rewards/rejected": 0.08409987390041351, "step": 1180 }, { "dpo_losses": 0.671231210231781, "epoch": 0.28, "grad_norm": 1.8082032410314015, "learning_rate": 4.4971895433427356e-07, "logits/chosen": -2.773207902908325, "logits/rejected": -2.7551190853118896, "logps/chosen": -215.55224609375, "logps/rejected": -211.1637420654297, "loss": 0.6713, "positive_losses": 0.07059326022863388, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1250600963830948, "rewards/margins": 0.04678435996174812, "rewards/margins_max": 0.1535269320011139, "rewards/margins_min": -0.04599838703870773, "rewards/margins_std": 0.09063064306974411, "rewards/rejected": 0.07827572524547577, "step": 1190 }, { "dpo_losses": 0.6701631546020508, "epoch": 0.29, "grad_norm": 7.459520071510339, "learning_rate": 4.4845490183041454e-07, "logits/chosen": -2.7908058166503906, "logits/rejected": -2.7887003421783447, "logps/chosen": -293.20892333984375, "logps/rejected": -288.8800354003906, "loss": 0.6732, "positive_losses": 0.06139221042394638, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.12377583980560303, "rewards/margins": 0.04823102802038193, "rewards/margins_max": 0.14213033020496368, "rewards/margins_min": -0.03357115387916565, "rewards/margins_std": 0.07813943922519684, "rewards/rejected": 0.0755448043346405, "step": 1200 }, { "epoch": 0.29, "eval_dpo_losses": 0.6721514463424683, "eval_logits/chosen": -2.7663676738739014, "eval_logits/rejected": -2.731541395187378, "eval_logps/chosen": -272.66412353515625, "eval_logps/rejected": -258.58453369140625, "eval_loss": 0.6895638108253479, "eval_positive_losses": 0.14053063094615936, "eval_rewards/accuracies": 0.6694999933242798, "eval_rewards/chosen": 0.11791205406188965, "eval_rewards/margins": 0.04522499069571495, "eval_rewards/margins_max": 0.20755761861801147, "eval_rewards/margins_min": -0.09393501281738281, "eval_rewards/margins_std": 0.10050461441278458, "eval_rewards/rejected": 0.0726870596408844, "eval_runtime": 858.7291, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 1200 }, { "dpo_losses": 0.6618081331253052, "epoch": 0.29, "grad_norm": 6.568095991132473, "learning_rate": 4.4717698031547733e-07, "logits/chosen": -2.834045886993408, "logits/rejected": -2.761199474334717, "logps/chosen": -293.65667724609375, "logps/rejected": -254.34939575195312, "loss": 0.6702, "positive_losses": 0.047281645238399506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13549068570137024, "rewards/margins": 0.0671490728855133, "rewards/margins_max": 0.19236032664775848, "rewards/margins_min": -0.04416259005665779, "rewards/margins_std": 0.1052919402718544, "rewards/rejected": 0.06834162026643753, "step": 1210 }, { "dpo_losses": 0.6690332293510437, "epoch": 0.29, "grad_norm": 10.506988742687643, "learning_rate": 4.458852790969445e-07, "logits/chosen": -2.8311150074005127, "logits/rejected": -2.798910140991211, "logps/chosen": -258.90911865234375, "logps/rejected": -254.6076202392578, "loss": 0.6752, "positive_losses": 0.1283218413591385, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1267794519662857, "rewards/margins": 0.0508514866232872, "rewards/margins_max": 0.13388368487358093, "rewards/margins_min": -0.034406617283821106, "rewards/margins_std": 0.07491905242204666, "rewards/rejected": 0.07592795789241791, "step": 1220 }, { "dpo_losses": 0.6740410923957825, "epoch": 0.29, "grad_norm": 8.00843699927687, "learning_rate": 4.4457988844529204e-07, "logits/chosen": -2.8114848136901855, "logits/rejected": -2.7771763801574707, "logps/chosen": -237.8554229736328, "logps/rejected": -266.1767578125, "loss": 0.6877, "positive_losses": 0.08004359900951385, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.12362845242023468, "rewards/margins": 0.04161449521780014, "rewards/margins_max": 0.17277400195598602, "rewards/margins_min": -0.05664440244436264, "rewards/margins_std": 0.10385291278362274, "rewards/rejected": 0.08201394975185394, "step": 1230 }, { "dpo_losses": 0.6760381460189819, "epoch": 0.3, "grad_norm": 8.41547463414103, "learning_rate": 4.432608995876819e-07, "logits/chosen": -2.845557689666748, "logits/rejected": -2.751915693283081, "logps/chosen": -255.30517578125, "logps/rejected": -235.33712768554688, "loss": 0.6801, "positive_losses": 0.05801277235150337, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12946945428848267, "rewards/margins": 0.03667557239532471, "rewards/margins_max": 0.14467547833919525, "rewards/margins_min": -0.05187157914042473, "rewards/margins_std": 0.08809033781290054, "rewards/rejected": 0.09279386699199677, "step": 1240 }, { "dpo_losses": 0.6690986752510071, "epoch": 0.3, "grad_norm": 8.484171322227194, "learning_rate": 4.419284047015854e-07, "logits/chosen": -2.850163221359253, "logits/rejected": -2.8187754154205322, "logps/chosen": -277.6499938964844, "logps/rejected": -226.75662231445312, "loss": 0.6839, "positive_losses": 0.24359741806983948, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.13122794032096863, "rewards/margins": 0.05107776075601578, "rewards/margins_max": 0.1499982476234436, "rewards/margins_min": -0.04515678808093071, "rewards/margins_std": 0.08716264367103577, "rewards/rejected": 0.08015017956495285, "step": 1250 }, { "dpo_losses": 0.6765602231025696, "epoch": 0.3, "grad_norm": 7.420839771551402, "learning_rate": 4.4058249690834235e-07, "logits/chosen": -2.823115587234497, "logits/rejected": -2.814901351928711, "logps/chosen": -238.54782104492188, "logps/rejected": -227.30068969726562, "loss": 0.6833, "positive_losses": 0.19190159440040588, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.11947218328714371, "rewards/margins": 0.034777458757162094, "rewards/margins_max": 0.10337839275598526, "rewards/margins_min": -0.04406232759356499, "rewards/margins_std": 0.06604452431201935, "rewards/rejected": 0.08469473570585251, "step": 1260 }, { "dpo_losses": 0.6694918274879456, "epoch": 0.3, "grad_norm": 1.7552306478498743, "learning_rate": 4.39223270266653e-07, "logits/chosen": -2.8459911346435547, "logits/rejected": -2.7895731925964355, "logps/chosen": -275.7124938964844, "logps/rejected": -274.6763610839844, "loss": 0.6748, "positive_losses": 0.06654568016529083, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13414613902568817, "rewards/margins": 0.05025973170995712, "rewards/margins_max": 0.14741170406341553, "rewards/margins_min": -0.04276218265295029, "rewards/margins_std": 0.0858989804983139, "rewards/rejected": 0.08388641476631165, "step": 1270 }, { "dpo_losses": 0.6693213582038879, "epoch": 0.31, "grad_norm": 2.008821463984248, "learning_rate": 4.378508197660045e-07, "logits/chosen": -2.8668456077575684, "logits/rejected": -2.7963147163391113, "logps/chosen": -294.4699401855469, "logps/rejected": -264.2877502441406, "loss": 0.6833, "positive_losses": 0.2672693133354187, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.14761760830879211, "rewards/margins": 0.05154553800821304, "rewards/margins_max": 0.16401459276676178, "rewards/margins_min": -0.07466455549001694, "rewards/margins_std": 0.10627603530883789, "rewards/rejected": 0.09607205539941788, "step": 1280 }, { "dpo_losses": 0.663962721824646, "epoch": 0.31, "grad_norm": 1.7464308445319745, "learning_rate": 4.364652413200325e-07, "logits/chosen": -2.871854543685913, "logits/rejected": -2.808577537536621, "logps/chosen": -296.9327697753906, "logps/rejected": -246.58901977539062, "loss": 0.669, "positive_losses": 0.07231731712818146, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.1431950479745865, "rewards/margins": 0.061199724674224854, "rewards/margins_max": 0.15835480391979218, "rewards/margins_min": -0.04367520287632942, "rewards/margins_std": 0.08925414830446243, "rewards/rejected": 0.08199533075094223, "step": 1290 }, { "dpo_losses": 0.6671271324157715, "epoch": 0.31, "grad_norm": 1.6644229409006261, "learning_rate": 4.35066631759819e-07, "logits/chosen": -2.7994656562805176, "logits/rejected": -2.7949883937835693, "logps/chosen": -269.16558837890625, "logps/rejected": -252.52029418945312, "loss": 0.6748, "positive_losses": 0.0004432678106240928, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15398970246315002, "rewards/margins": 0.05528656393289566, "rewards/margins_max": 0.17570027709007263, "rewards/margins_min": -0.040178082883358, "rewards/margins_std": 0.0946335718035698, "rewards/rejected": 0.09870313107967377, "step": 1300 }, { "epoch": 0.31, "eval_dpo_losses": 0.6733620762825012, "eval_logits/chosen": -2.770850658416748, "eval_logits/rejected": -2.7357263565063477, "eval_logps/chosen": -270.5491943359375, "eval_logps/rejected": -256.1944274902344, "eval_loss": 0.683529794216156, "eval_positive_losses": 0.08761659264564514, "eval_rewards/accuracies": 0.6664999723434448, "eval_rewards/chosen": 0.13906137645244598, "eval_rewards/margins": 0.042472973465919495, "eval_rewards/margins_max": 0.19648504257202148, "eval_rewards/margins_min": -0.0896991565823555, "eval_rewards/margins_std": 0.09535077214241028, "eval_rewards/rejected": 0.09658840298652649, "eval_runtime": 858.5434, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 1300 }, { "dpo_losses": 0.6588681936264038, "epoch": 0.31, "grad_norm": 2.0366259519728978, "learning_rate": 4.3365508882712445e-07, "logits/chosen": -2.812056541442871, "logits/rejected": -2.7945075035095215, "logps/chosen": -300.86248779296875, "logps/rejected": -256.6606140136719, "loss": 0.6636, "positive_losses": 0.013783549889922142, "rewards/accuracies": 0.75, "rewards/chosen": 0.16610956192016602, "rewards/margins": 0.0733247771859169, "rewards/margins_max": 0.2020629346370697, "rewards/margins_min": -0.04017866402864456, "rewards/margins_std": 0.11099676787853241, "rewards/rejected": 0.09278479963541031, "step": 1310 }, { "dpo_losses": 0.6735138893127441, "epoch": 0.32, "grad_norm": 8.631443436407286, "learning_rate": 4.322307111675573e-07, "logits/chosen": -2.74489426612854, "logits/rejected": -2.7100300788879395, "logps/chosen": -256.45953369140625, "logps/rejected": -224.10702514648438, "loss": 0.6884, "positive_losses": 0.28397732973098755, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12349535524845123, "rewards/margins": 0.04157816991209984, "rewards/margins_max": 0.133545383810997, "rewards/margins_min": -0.03921239823102951, "rewards/margins_std": 0.07818625867366791, "rewards/rejected": 0.0819171816110611, "step": 1320 }, { "dpo_losses": 0.6647436618804932, "epoch": 0.32, "grad_norm": 1.737722311274291, "learning_rate": 4.3079359832368055e-07, "logits/chosen": -2.830124616622925, "logits/rejected": -2.736327648162842, "logps/chosen": -278.7484436035156, "logps/rejected": -215.86428833007812, "loss": 0.672, "positive_losses": 0.1938072144985199, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1436310112476349, "rewards/margins": 0.0606268048286438, "rewards/margins_max": 0.18825216591358185, "rewards/margins_min": -0.0486445352435112, "rewards/margins_std": 0.10652091354131699, "rewards/rejected": 0.08300419896841049, "step": 1330 }, { "dpo_losses": 0.6673759818077087, "epoch": 0.32, "grad_norm": 6.955844030332797, "learning_rate": 4.2934385072805467e-07, "logits/chosen": -2.7564260959625244, "logits/rejected": -2.719783306121826, "logps/chosen": -252.60745239257812, "logps/rejected": -218.45199584960938, "loss": 0.6696, "positive_losses": 0.11888504028320312, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13403327763080597, "rewards/margins": 0.05462353304028511, "rewards/margins_max": 0.15446028113365173, "rewards/margins_min": -0.04410739615559578, "rewards/margins_std": 0.08997827023267746, "rewards/rejected": 0.07940974086523056, "step": 1340 }, { "dpo_losses": 0.6668308973312378, "epoch": 0.32, "grad_norm": 2.417110555813672, "learning_rate": 4.278815696962195e-07, "logits/chosen": -2.814981460571289, "logits/rejected": -2.788083553314209, "logps/chosen": -280.0913391113281, "logps/rejected": -275.82366943359375, "loss": 0.6743, "positive_losses": 0.04302825778722763, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1407790184020996, "rewards/margins": 0.05589904263615608, "rewards/margins_max": 0.16645869612693787, "rewards/margins_min": -0.04021143168210983, "rewards/margins_std": 0.09128843992948532, "rewards/rejected": 0.08487998694181442, "step": 1350 }, { "dpo_losses": 0.6576879620552063, "epoch": 0.33, "grad_norm": 1.9666270359809261, "learning_rate": 4.264068574196129e-07, "logits/chosen": -2.745513439178467, "logits/rejected": -2.711181163787842, "logps/chosen": -278.47344970703125, "logps/rejected": -237.20498657226562, "loss": 0.67, "positive_losses": 0.026236915960907936, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1547265350818634, "rewards/margins": 0.07585780322551727, "rewards/margins_max": 0.17965057492256165, "rewards/margins_min": -0.03336961567401886, "rewards/margins_std": 0.09490080177783966, "rewards/rejected": 0.07886873185634613, "step": 1360 }, { "dpo_losses": 0.6796175837516785, "epoch": 0.33, "grad_norm": 1.7187843319325014, "learning_rate": 4.2491981695843016e-07, "logits/chosen": -2.759007215499878, "logits/rejected": -2.791151285171509, "logps/chosen": -240.149658203125, "logps/rejected": -278.3238220214844, "loss": 0.6929, "positive_losses": 0.11770925670862198, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.11884099245071411, "rewards/margins": 0.031090497970581055, "rewards/margins_max": 0.16259969770908356, "rewards/margins_min": -0.09341253340244293, "rewards/margins_std": 0.11095616966485977, "rewards/rejected": 0.08775048702955246, "step": 1370 }, { "dpo_losses": 0.6680698394775391, "epoch": 0.33, "grad_norm": 2.0756923807263727, "learning_rate": 4.2342055223442093e-07, "logits/chosen": -2.7871253490448, "logits/rejected": -2.8064613342285156, "logps/chosen": -265.6734619140625, "logps/rejected": -251.57095336914062, "loss": 0.6754, "positive_losses": 0.03847923129796982, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13982243835926056, "rewards/margins": 0.05321364849805832, "rewards/margins_max": 0.15810494124889374, "rewards/margins_min": -0.03415246680378914, "rewards/margins_std": 0.08585749566555023, "rewards/rejected": 0.08660879731178284, "step": 1380 }, { "dpo_losses": 0.6815242171287537, "epoch": 0.33, "grad_norm": 1.734321043532715, "learning_rate": 4.2190916802362687e-07, "logits/chosen": -2.842531442642212, "logits/rejected": -2.817317247390747, "logps/chosen": -238.5765838623047, "logps/rejected": -249.25088500976562, "loss": 0.6816, "positive_losses": 0.11064691841602325, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.12086455523967743, "rewards/margins": 0.02554454281926155, "rewards/margins_max": 0.12612749636173248, "rewards/margins_min": -0.06676454097032547, "rewards/margins_std": 0.08789917826652527, "rewards/rejected": 0.09532001614570618, "step": 1390 }, { "dpo_losses": 0.6675626635551453, "epoch": 0.34, "grad_norm": 7.034200227414305, "learning_rate": 4.203857699490593e-07, "logits/chosen": -2.827584743499756, "logits/rejected": -2.7623417377471924, "logps/chosen": -259.97259521484375, "logps/rejected": -215.9918975830078, "loss": 0.6872, "positive_losses": 0.18903502821922302, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1284700483083725, "rewards/margins": 0.05474201589822769, "rewards/margins_max": 0.16924390196800232, "rewards/margins_min": -0.04117489606142044, "rewards/margins_std": 0.0951467901468277, "rewards/rejected": 0.0737280398607254, "step": 1400 }, { "epoch": 0.34, "eval_dpo_losses": 0.6721086502075195, "eval_logits/chosen": -2.771934986114502, "eval_logits/rejected": -2.7366933822631836, "eval_logps/chosen": -270.5384521484375, "eval_logps/rejected": -256.4647216796875, "eval_loss": 0.6833829879760742, "eval_positive_losses": 0.09732773154973984, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.139168843626976, "eval_rewards/margins": 0.04528341069817543, "eval_rewards/margins_max": 0.2070295512676239, "eval_rewards/margins_min": -0.09304425120353699, "eval_rewards/margins_std": 0.09999353438615799, "eval_rewards/rejected": 0.09388545155525208, "eval_runtime": 858.7979, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 1400 }, { "dpo_losses": 0.6610335111618042, "epoch": 0.34, "grad_norm": 4.42486816037431, "learning_rate": 4.1885046447331816e-07, "logits/chosen": -2.745612621307373, "logits/rejected": -2.7551496028900146, "logps/chosen": -295.9497985839844, "logps/rejected": -262.0827941894531, "loss": 0.6671, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.16865387558937073, "rewards/margins": 0.06779374182224274, "rewards/margins_max": 0.17328371107578278, "rewards/margins_min": -0.025494080036878586, "rewards/margins_std": 0.09056703746318817, "rewards/rejected": 0.10086014121770859, "step": 1410 }, { "dpo_losses": 0.6701120138168335, "epoch": 0.34, "grad_norm": 7.986679670209706, "learning_rate": 4.173033588911511e-07, "logits/chosen": -2.8108277320861816, "logits/rejected": -2.7988948822021484, "logps/chosen": -302.43096923828125, "logps/rejected": -291.8939514160156, "loss": 0.6745, "positive_losses": 0.08373244851827621, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13471785187721252, "rewards/margins": 0.04961312934756279, "rewards/margins_max": 0.16116461157798767, "rewards/margins_min": -0.054306935518980026, "rewards/margins_std": 0.09493438154459, "rewards/rejected": 0.08510471135377884, "step": 1420 }, { "dpo_losses": 0.6700631976127625, "epoch": 0.34, "grad_norm": 6.560991967727589, "learning_rate": 4.157445613219559e-07, "logits/chosen": -2.7104034423828125, "logits/rejected": -2.7185914516448975, "logps/chosen": -256.75048828125, "logps/rejected": -261.38043212890625, "loss": 0.676, "positive_losses": 0.04877481609582901, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13160721957683563, "rewards/margins": 0.04932086914777756, "rewards/margins_max": 0.1584603488445282, "rewards/margins_min": -0.049920372664928436, "rewards/margins_std": 0.09329970180988312, "rewards/rejected": 0.08228635042905807, "step": 1430 }, { "dpo_losses": 0.6712281107902527, "epoch": 0.34, "grad_norm": 6.3459859581283125, "learning_rate": 4.141741807022243e-07, "logits/chosen": -2.8178117275238037, "logits/rejected": -2.7487661838531494, "logps/chosen": -290.2847900390625, "logps/rejected": -235.52682495117188, "loss": 0.6856, "positive_losses": 0.04770936816930771, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13782814145088196, "rewards/margins": 0.047919366508722305, "rewards/margins_max": 0.1658453643321991, "rewards/margins_min": -0.07419236749410629, "rewards/margins_std": 0.1060555949807167, "rewards/rejected": 0.08990879356861115, "step": 1440 }, { "dpo_losses": 0.6754899024963379, "epoch": 0.35, "grad_norm": 1.834550049210062, "learning_rate": 4.1259232677792865e-07, "logits/chosen": -2.7925171852111816, "logits/rejected": -2.787262439727783, "logps/chosen": -247.8746795654297, "logps/rejected": -226.3203582763672, "loss": 0.673, "positive_losses": 0.03153266757726669, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.12556497752666473, "rewards/margins": 0.03765324503183365, "rewards/margins_max": 0.11921225488185883, "rewards/margins_min": -0.04862559586763382, "rewards/margins_std": 0.07422569394111633, "rewards/rejected": 0.08791173994541168, "step": 1450 }, { "dpo_losses": 0.6708461046218872, "epoch": 0.35, "grad_norm": 1.8862233271650009, "learning_rate": 4.1099911009685294e-07, "logits/chosen": -2.734041690826416, "logits/rejected": -2.728245496749878, "logps/chosen": -292.1017761230469, "logps/rejected": -251.2809295654297, "loss": 0.6722, "positive_losses": 0.0021797181107103825, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13616777956485748, "rewards/margins": 0.04826116934418678, "rewards/margins_max": 0.14535865187644958, "rewards/margins_min": -0.06490767747163773, "rewards/margins_std": 0.09541989862918854, "rewards/rejected": 0.0879065990447998, "step": 1460 }, { "dpo_losses": 0.6672282218933105, "epoch": 0.35, "grad_norm": 2.2611052582040445, "learning_rate": 4.093946420008668e-07, "logits/chosen": -2.7727484703063965, "logits/rejected": -2.745819330215454, "logps/chosen": -257.0188293457031, "logps/rejected": -233.44345092773438, "loss": 0.6739, "positive_losses": 0.03880057483911514, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13319933414459229, "rewards/margins": 0.05517640709877014, "rewards/margins_max": 0.1727629005908966, "rewards/margins_min": -0.03090861439704895, "rewards/margins_std": 0.0919630378484726, "rewards/rejected": 0.07802292704582214, "step": 1470 }, { "dpo_losses": 0.6704279184341431, "epoch": 0.35, "grad_norm": 2.0924497524048484, "learning_rate": 4.0777903461814443e-07, "logits/chosen": -2.8056304454803467, "logits/rejected": -2.7812302112579346, "logps/chosen": -283.359375, "logps/rejected": -239.69003295898438, "loss": 0.6823, "positive_losses": 0.030101966112852097, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1405610740184784, "rewards/margins": 0.0491039976477623, "rewards/margins_max": 0.15657839179039001, "rewards/margins_min": -0.056843362748622894, "rewards/margins_std": 0.09549923241138458, "rewards/rejected": 0.0914570763707161, "step": 1480 }, { "dpo_losses": 0.664718747138977, "epoch": 0.36, "grad_norm": 4.182798792717198, "learning_rate": 4.061524008553285e-07, "logits/chosen": -2.735924005508423, "logits/rejected": -2.7131056785583496, "logps/chosen": -243.4227752685547, "logps/rejected": -223.5598602294922, "loss": 0.6677, "positive_losses": 0.05420370027422905, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15023869276046753, "rewards/margins": 0.06076166778802872, "rewards/margins_max": 0.1883254051208496, "rewards/margins_min": -0.05227719992399216, "rewards/margins_std": 0.10687106847763062, "rewards/rejected": 0.0894770398736, "step": 1490 }, { "dpo_losses": 0.6724478006362915, "epoch": 0.36, "grad_norm": 5.422218333870027, "learning_rate": 4.045148543896396e-07, "logits/chosen": -2.806703567504883, "logits/rejected": -2.7600746154785156, "logps/chosen": -272.77105712890625, "logps/rejected": -242.69094848632812, "loss": 0.6926, "positive_losses": 0.17005939781665802, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1296619325876236, "rewards/margins": 0.044427540153265, "rewards/margins_max": 0.16035720705986023, "rewards/margins_min": -0.06300385296344757, "rewards/margins_std": 0.0993872880935669, "rewards/rejected": 0.0852343812584877, "step": 1500 }, { "epoch": 0.36, "eval_dpo_losses": 0.6710348725318909, "eval_logits/chosen": -2.7680938243865967, "eval_logits/rejected": -2.732898712158203, "eval_logps/chosen": -270.43243408203125, "eval_logps/rejected": -256.6025695800781, "eval_loss": 0.6832990646362305, "eval_positive_losses": 0.10578416287899017, "eval_rewards/accuracies": 0.6685000061988831, "eval_rewards/chosen": 0.1402287632226944, "eval_rewards/margins": 0.047721847891807556, "eval_rewards/margins_max": 0.21653224527835846, "eval_rewards/margins_min": -0.0955621674656868, "eval_rewards/margins_std": 0.10418935865163803, "eval_rewards/rejected": 0.09250691533088684, "eval_runtime": 858.4851, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 1500 }, { "dpo_losses": 0.6771226525306702, "epoch": 0.36, "grad_norm": 1.8371655841058125, "learning_rate": 4.028665096609323e-07, "logits/chosen": -2.813671112060547, "logits/rejected": -2.8038244247436523, "logps/chosen": -293.37371826171875, "logps/rejected": -282.1329650878906, "loss": 0.6864, "positive_losses": 0.07167728245258331, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12888312339782715, "rewards/margins": 0.035157207399606705, "rewards/margins_max": 0.14005085825920105, "rewards/margins_min": -0.07154486328363419, "rewards/margins_std": 0.09201464802026749, "rewards/rejected": 0.09372591972351074, "step": 1510 }, { "dpo_losses": 0.6599644422531128, "epoch": 0.36, "grad_norm": 2.245901826039382, "learning_rate": 4.01207481863697e-07, "logits/chosen": -2.885627031326294, "logits/rejected": -2.82566499710083, "logps/chosen": -330.5918273925781, "logps/rejected": -279.207275390625, "loss": 0.6752, "positive_losses": 0.13477468490600586, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16726639866828918, "rewards/margins": 0.07052014023065567, "rewards/margins_max": 0.1890564113855362, "rewards/margins_min": -0.04209518805146217, "rewards/margins_std": 0.10368168354034424, "rewards/rejected": 0.09674624353647232, "step": 1520 }, { "dpo_losses": 0.6725361347198486, "epoch": 0.37, "grad_norm": 1.817287108429348, "learning_rate": 3.9953788693901e-07, "logits/chosen": -2.777818202972412, "logits/rejected": -2.7436556816101074, "logps/chosen": -295.2908630371094, "logps/rejected": -267.34368896484375, "loss": 0.6781, "positive_losses": 0.13644781708717346, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1444375216960907, "rewards/margins": 0.045959293842315674, "rewards/margins_max": 0.16905540227890015, "rewards/margins_min": -0.08966761082410812, "rewards/margins_std": 0.11836668103933334, "rewards/rejected": 0.09847822040319443, "step": 1530 }, { "dpo_losses": 0.6646915674209595, "epoch": 0.37, "grad_norm": 1.8806306852468382, "learning_rate": 3.978578415664306e-07, "logits/chosen": -2.714115858078003, "logits/rejected": -2.7103097438812256, "logps/chosen": -243.4694366455078, "logps/rejected": -208.46435546875, "loss": 0.6665, "positive_losses": 0.017754364758729935, "rewards/accuracies": 0.75, "rewards/chosen": 0.1538422405719757, "rewards/margins": 0.06030309200286865, "rewards/margins_max": 0.16562533378601074, "rewards/margins_min": -0.02732093259692192, "rewards/margins_std": 0.08652455359697342, "rewards/rejected": 0.09353913366794586, "step": 1540 }, { "dpo_losses": 0.6576844453811646, "epoch": 0.37, "grad_norm": 2.0687297024840383, "learning_rate": 3.9616746315584733e-07, "logits/chosen": -2.820685863494873, "logits/rejected": -2.7326419353485107, "logps/chosen": -308.12152099609375, "logps/rejected": -229.09426879882812, "loss": 0.6791, "positive_losses": 0.22606030106544495, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.15785667300224304, "rewards/margins": 0.07622842490673065, "rewards/margins_max": 0.19972564280033112, "rewards/margins_min": -0.02011972665786743, "rewards/margins_std": 0.0997951477766037, "rewards/rejected": 0.0816282406449318, "step": 1550 }, { "dpo_losses": 0.6624256372451782, "epoch": 0.37, "grad_norm": 2.1026767201946277, "learning_rate": 3.9446686983927236e-07, "logits/chosen": -2.7585856914520264, "logits/rejected": -2.739420175552368, "logps/chosen": -243.958251953125, "logps/rejected": -263.9037170410156, "loss": 0.677, "positive_losses": 0.047960661351680756, "rewards/accuracies": 0.75, "rewards/chosen": 0.13920164108276367, "rewards/margins": 0.06559185683727264, "rewards/margins_max": 0.1819840967655182, "rewards/margins_min": -0.04743064194917679, "rewards/margins_std": 0.10192713886499405, "rewards/rejected": 0.07360976934432983, "step": 1560 }, { "dpo_losses": 0.6679614782333374, "epoch": 0.38, "grad_norm": 2.489227833498125, "learning_rate": 3.927561804625863e-07, "logits/chosen": -2.7547833919525146, "logits/rejected": -2.7306020259857178, "logps/chosen": -300.3507385253906, "logps/rejected": -295.5068664550781, "loss": 0.6828, "positive_losses": 0.21662501990795135, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.14682255685329437, "rewards/margins": 0.055068932473659515, "rewards/margins_max": 0.2043771743774414, "rewards/margins_min": -0.07737723737955093, "rewards/margins_std": 0.12575773894786835, "rewards/rejected": 0.09175362437963486, "step": 1570 }, { "dpo_losses": 0.6566230654716492, "epoch": 0.38, "grad_norm": 15.23400490335709, "learning_rate": 3.910355145772323e-07, "logits/chosen": -2.7744317054748535, "logits/rejected": -2.7525362968444824, "logps/chosen": -267.0977783203125, "logps/rejected": -260.8230895996094, "loss": 0.6717, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.14902520179748535, "rewards/margins": 0.07777659595012665, "rewards/margins_max": 0.20543232560157776, "rewards/margins_min": -0.040122538805007935, "rewards/margins_std": 0.10807816684246063, "rewards/rejected": 0.0712486058473587, "step": 1580 }, { "dpo_losses": 0.6585559844970703, "epoch": 0.38, "grad_norm": 1.8967466006114662, "learning_rate": 3.893049924318613e-07, "logits/chosen": -2.7760367393493652, "logits/rejected": -2.7615256309509277, "logps/chosen": -261.13995361328125, "logps/rejected": -265.951904296875, "loss": 0.6674, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.14671626687049866, "rewards/margins": 0.07251034677028656, "rewards/margins_max": 0.16612282395362854, "rewards/margins_min": -0.015146632678806782, "rewards/margins_std": 0.08260852098464966, "rewards/rejected": 0.07420593500137329, "step": 1590 }, { "dpo_losses": 0.6616863012313843, "epoch": 0.38, "grad_norm": 42.64812766076314, "learning_rate": 3.875647349639286e-07, "logits/chosen": -2.8227038383483887, "logits/rejected": -2.7666497230529785, "logps/chosen": -274.9165954589844, "logps/rejected": -203.1664276123047, "loss": 0.6862, "positive_losses": 0.16700835525989532, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.13217999041080475, "rewards/margins": 0.06738521158695221, "rewards/margins_max": 0.1946076601743698, "rewards/margins_min": -0.057869862765073776, "rewards/margins_std": 0.11476844549179077, "rewards/rejected": 0.06479477137327194, "step": 1600 }, { "epoch": 0.38, "eval_dpo_losses": 0.6689034104347229, "eval_logits/chosen": -2.764211416244507, "eval_logits/rejected": -2.7292416095733643, "eval_logps/chosen": -271.23089599609375, "eval_logps/rejected": -257.8934631347656, "eval_loss": 0.6891427040100098, "eval_positive_losses": 0.17287535965442657, "eval_rewards/accuracies": 0.675000011920929, "eval_rewards/chosen": 0.1322442591190338, "eval_rewards/margins": 0.05264650657773018, "eval_rewards/margins_max": 0.23605671525001526, "eval_rewards/margins_min": -0.10386566072702408, "eval_rewards/margins_std": 0.11338083446025848, "eval_rewards/rejected": 0.07959775626659393, "eval_runtime": 859.1655, "eval_samples_per_second": 4.656, "eval_steps_per_second": 0.291, "step": 1600 }, { "dpo_losses": 0.6577469110488892, "epoch": 0.39, "grad_norm": 12.555259933667369, "learning_rate": 3.8581486379124185e-07, "logits/chosen": -2.8448097705841064, "logits/rejected": -2.839405059814453, "logps/chosen": -297.9596252441406, "logps/rejected": -252.9747314453125, "loss": 0.6802, "positive_losses": 0.08188953250646591, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.14745555818080902, "rewards/margins": 0.07575681805610657, "rewards/margins_max": 0.2013626992702484, "rewards/margins_min": -0.04085198789834976, "rewards/margins_std": 0.10827799141407013, "rewards/rejected": 0.07169874757528305, "step": 1610 }, { "dpo_losses": 0.6658589839935303, "epoch": 0.39, "grad_norm": 9.368976743472393, "learning_rate": 3.840555012034622e-07, "logits/chosen": -2.724377155303955, "logits/rejected": -2.6677796840667725, "logps/chosen": -239.3917236328125, "logps/rejected": -233.96603393554688, "loss": 0.6726, "positive_losses": 0.14963221549987793, "rewards/accuracies": 0.75, "rewards/chosen": 0.13070140779018402, "rewards/margins": 0.05776132270693779, "rewards/margins_max": 0.16032978892326355, "rewards/margins_min": -0.041094452142715454, "rewards/margins_std": 0.09118757396936417, "rewards/rejected": 0.07294009625911713, "step": 1620 }, { "dpo_losses": 0.665189802646637, "epoch": 0.39, "grad_norm": 9.312285760040881, "learning_rate": 3.822867701535578e-07, "logits/chosen": -2.7628848552703857, "logits/rejected": -2.7503902912139893, "logps/chosen": -257.46728515625, "logps/rejected": -227.2912139892578, "loss": 0.6782, "positive_losses": 0.10737171024084091, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13726899027824402, "rewards/margins": 0.0597599558532238, "rewards/margins_max": 0.1754257082939148, "rewards/margins_min": -0.05168009549379349, "rewards/margins_std": 0.10023969411849976, "rewards/rejected": 0.07750905305147171, "step": 1630 }, { "dpo_losses": 0.6606549024581909, "epoch": 0.39, "grad_norm": 10.376978306422957, "learning_rate": 3.805087942492112e-07, "logits/chosen": -2.7315287590026855, "logits/rejected": -2.692012310028076, "logps/chosen": -262.320068359375, "logps/rejected": -250.387939453125, "loss": 0.6767, "positive_losses": 0.22962932288646698, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.15593476593494415, "rewards/margins": 0.06938613951206207, "rewards/margins_max": 0.20441968739032745, "rewards/margins_min": -0.041383083909749985, "rewards/margins_std": 0.10869801044464111, "rewards/rejected": 0.08654861897230148, "step": 1640 }, { "dpo_losses": 0.6639494895935059, "epoch": 0.4, "grad_norm": 1.8310513671844897, "learning_rate": 3.787216977441814e-07, "logits/chosen": -2.7906785011291504, "logits/rejected": -2.748378276824951, "logps/chosen": -244.2302703857422, "logps/rejected": -260.2334899902344, "loss": 0.6704, "positive_losses": 0.023534011095762253, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15594667196273804, "rewards/margins": 0.06269647926092148, "rewards/margins_max": 0.1800825148820877, "rewards/margins_min": -0.047526925802230835, "rewards/margins_std": 0.10430476814508438, "rewards/rejected": 0.09325020015239716, "step": 1650 }, { "dpo_losses": 0.6745316982269287, "epoch": 0.4, "grad_norm": 10.450751931912977, "learning_rate": 3.7692560552961976e-07, "logits/chosen": -2.7838072776794434, "logits/rejected": -2.750624418258667, "logps/chosen": -238.0394744873047, "logps/rejected": -252.3380126953125, "loss": 0.6784, "positive_losses": 0.10873398929834366, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1311834752559662, "rewards/margins": 0.04016824811697006, "rewards/margins_max": 0.1495421826839447, "rewards/margins_min": -0.04827088862657547, "rewards/margins_std": 0.08935986459255219, "rewards/rejected": 0.09101523458957672, "step": 1660 }, { "dpo_losses": 0.6743917465209961, "epoch": 0.4, "grad_norm": 2.141166117417495, "learning_rate": 3.7512064312534276e-07, "logits/chosen": -2.7757301330566406, "logits/rejected": -2.710458993911743, "logps/chosen": -298.9239501953125, "logps/rejected": -285.231201171875, "loss": 0.6675, "positive_losses": 0.06390800327062607, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1522187888622284, "rewards/margins": 0.04318951442837715, "rewards/margins_max": 0.19562895596027374, "rewards/margins_min": -0.09502002596855164, "rewards/margins_std": 0.13035540282726288, "rewards/rejected": 0.10902925580739975, "step": 1670 }, { "dpo_losses": 0.6677789688110352, "epoch": 0.4, "grad_norm": 1.9776325571069984, "learning_rate": 3.7330693667105937e-07, "logits/chosen": -2.8675408363342285, "logits/rejected": -2.7965810298919678, "logps/chosen": -304.49237060546875, "logps/rejected": -237.98373413085938, "loss": 0.6749, "positive_losses": 0.15132789313793182, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.14851847290992737, "rewards/margins": 0.054375600069761276, "rewards/margins_max": 0.16309884190559387, "rewards/margins_min": -0.056412748992443085, "rewards/margins_std": 0.09585843980312347, "rewards/rejected": 0.09414288401603699, "step": 1680 }, { "dpo_losses": 0.6672551035881042, "epoch": 0.4, "grad_norm": 8.755492747942707, "learning_rate": 3.7148461291755626e-07, "logits/chosen": -2.7923712730407715, "logits/rejected": -2.761561393737793, "logps/chosen": -257.810302734375, "logps/rejected": -275.43280029296875, "loss": 0.6789, "positive_losses": 0.16247773170471191, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.13847076892852783, "rewards/margins": 0.055886924266815186, "rewards/margins_max": 0.16485749185085297, "rewards/margins_min": -0.04262311011552811, "rewards/margins_std": 0.0915924534201622, "rewards/rejected": 0.08258385211229324, "step": 1690 }, { "dpo_losses": 0.6623684167861938, "epoch": 0.41, "grad_norm": 2.093778012216414, "learning_rate": 3.6965379921783945e-07, "logits/chosen": -2.837184429168701, "logits/rejected": -2.8110601902008057, "logps/chosen": -277.3193664550781, "logps/rejected": -257.59228515625, "loss": 0.6779, "positive_losses": 0.08317184448242188, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15550120174884796, "rewards/margins": 0.06643487513065338, "rewards/margins_max": 0.18445202708244324, "rewards/margins_min": -0.05777024105191231, "rewards/margins_std": 0.10668377578258514, "rewards/rejected": 0.0890662893652916, "step": 1700 }, { "epoch": 0.41, "eval_dpo_losses": 0.6697779297828674, "eval_logits/chosen": -2.7657854557037354, "eval_logits/rejected": -2.7307827472686768, "eval_logps/chosen": -269.59613037109375, "eval_logps/rejected": -256.0603942871094, "eval_loss": 0.68208909034729, "eval_positive_losses": 0.09621649235486984, "eval_rewards/accuracies": 0.6704999804496765, "eval_rewards/chosen": 0.14859208464622498, "eval_rewards/margins": 0.05066324770450592, "eval_rewards/margins_max": 0.22932368516921997, "eval_rewards/margins_min": -0.10157396644353867, "eval_rewards/margins_std": 0.11035209149122238, "eval_rewards/rejected": 0.09792882949113846, "eval_runtime": 858.8839, "eval_samples_per_second": 4.657, "eval_steps_per_second": 0.291, "step": 1700 }, { "dpo_losses": 0.6682418584823608, "epoch": 0.41, "grad_norm": 2.6392404210591707, "learning_rate": 3.6781462351823455e-07, "logits/chosen": -2.7866721153259277, "logits/rejected": -2.7881574630737305, "logps/chosen": -271.00469970703125, "logps/rejected": -309.91912841796875, "loss": 0.6782, "positive_losses": 0.03138427808880806, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1514436900615692, "rewards/margins": 0.053423114120960236, "rewards/margins_max": 0.16554298996925354, "rewards/margins_min": -0.06405045092105865, "rewards/margins_std": 0.10128488391637802, "rewards/rejected": 0.09802057594060898, "step": 1710 }, { "dpo_losses": 0.6725479960441589, "epoch": 0.41, "grad_norm": 10.365443228829056, "learning_rate": 3.6596721434944513e-07, "logits/chosen": -2.838326930999756, "logits/rejected": -2.791605234146118, "logps/chosen": -269.63665771484375, "logps/rejected": -268.91497802734375, "loss": 0.6724, "positive_losses": 0.09364394843578339, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14229586720466614, "rewards/margins": 0.04432683810591698, "rewards/margins_max": 0.16160908341407776, "rewards/margins_min": -0.058353155851364136, "rewards/margins_std": 0.09746594727039337, "rewards/rejected": 0.09796904027462006, "step": 1720 }, { "dpo_losses": 0.6641412377357483, "epoch": 0.41, "grad_norm": 7.191422693999396, "learning_rate": 3.6411170081757025e-07, "logits/chosen": -2.814384937286377, "logits/rejected": -2.7830426692962646, "logps/chosen": -275.9794616699219, "logps/rejected": -248.0959014892578, "loss": 0.6776, "positive_losses": 0.13040713965892792, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1638602465391159, "rewards/margins": 0.06404414027929306, "rewards/margins_max": 0.21581995487213135, "rewards/margins_min": -0.06597750633955002, "rewards/margins_std": 0.1275712102651596, "rewards/rejected": 0.09981611371040344, "step": 1730 }, { "dpo_losses": 0.6651660203933716, "epoch": 0.42, "grad_norm": 1.932151981319121, "learning_rate": 3.622482125950821e-07, "logits/chosen": -2.8299760818481445, "logits/rejected": -2.8144009113311768, "logps/chosen": -296.02630615234375, "logps/rejected": -271.87548828125, "loss": 0.671, "positive_losses": 0.06782150268554688, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14321517944335938, "rewards/margins": 0.05941122770309448, "rewards/margins_max": 0.16575893759727478, "rewards/margins_min": -0.04967840015888214, "rewards/margins_std": 0.09421806037425995, "rewards/rejected": 0.0838039368391037, "step": 1740 }, { "dpo_losses": 0.6625608801841736, "epoch": 0.42, "grad_norm": 5.530619267423963, "learning_rate": 3.603768799117637e-07, "logits/chosen": -2.7747650146484375, "logits/rejected": -2.7502410411834717, "logps/chosen": -271.59576416015625, "logps/rejected": -249.7652130126953, "loss": 0.6721, "positive_losses": 0.05405540391802788, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15138670802116394, "rewards/margins": 0.06605236232280731, "rewards/margins_max": 0.2026832103729248, "rewards/margins_min": -0.04628317058086395, "rewards/margins_std": 0.11121700704097748, "rewards/rejected": 0.08533434569835663, "step": 1750 }, { "dpo_losses": 0.6694071292877197, "epoch": 0.42, "grad_norm": 10.709187949745244, "learning_rate": 3.584978335456078e-07, "logits/chosen": -2.746544599533081, "logits/rejected": -2.77703595161438, "logps/chosen": -259.0121154785156, "logps/rejected": -282.51849365234375, "loss": 0.667, "positive_losses": 0.04993915557861328, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14787675440311432, "rewards/margins": 0.05134459212422371, "rewards/margins_max": 0.18064099550247192, "rewards/margins_min": -0.06007281690835953, "rewards/margins_std": 0.10860881954431534, "rewards/rejected": 0.09653216600418091, "step": 1760 }, { "dpo_losses": 0.6633495092391968, "epoch": 0.42, "grad_norm": 2.01047158701566, "learning_rate": 3.5661120481367757e-07, "logits/chosen": -2.8629236221313477, "logits/rejected": -2.8274943828582764, "logps/chosen": -307.61810302734375, "logps/rejected": -272.1058349609375, "loss": 0.6712, "positive_losses": 0.0128021240234375, "rewards/accuracies": 0.75, "rewards/chosen": 0.152008518576622, "rewards/margins": 0.06390496343374252, "rewards/margins_max": 0.1893347203731537, "rewards/margins_min": -0.058609772473573685, "rewards/margins_std": 0.11036945879459381, "rewards/rejected": 0.08810355514287949, "step": 1770 }, { "dpo_losses": 0.6638668775558472, "epoch": 0.43, "grad_norm": 2.084328248559161, "learning_rate": 3.547171255629292e-07, "logits/chosen": -2.7453548908233643, "logits/rejected": -2.686885118484497, "logps/chosen": -256.10186767578125, "logps/rejected": -210.37451171875, "loss": 0.6858, "positive_losses": 0.31650906801223755, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14278405904769897, "rewards/margins": 0.06244480609893799, "rewards/margins_max": 0.17453891038894653, "rewards/margins_min": -0.03924071043729782, "rewards/margins_std": 0.09275037050247192, "rewards/rejected": 0.08033924549818039, "step": 1780 }, { "dpo_losses": 0.6740632057189941, "epoch": 0.43, "grad_norm": 10.274534310510212, "learning_rate": 3.528157281609984e-07, "logits/chosen": -2.770188808441162, "logits/rejected": -2.7739768028259277, "logps/chosen": -202.93453979492188, "logps/rejected": -190.0338592529297, "loss": 0.6987, "positive_losses": 0.2924560606479645, "rewards/accuracies": 0.75, "rewards/chosen": 0.10417057573795319, "rewards/margins": 0.040463633835315704, "rewards/margins_max": 0.1307048499584198, "rewards/margins_min": -0.055921923369169235, "rewards/margins_std": 0.0827755406498909, "rewards/rejected": 0.06370692700147629, "step": 1790 }, { "dpo_losses": 0.6643359065055847, "epoch": 0.43, "grad_norm": 11.115220475577761, "learning_rate": 3.5090714548694916e-07, "logits/chosen": -2.65936279296875, "logits/rejected": -2.6561999320983887, "logps/chosen": -314.97821044921875, "logps/rejected": -279.9248962402344, "loss": 0.6726, "positive_losses": 0.23862552642822266, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.16556710004806519, "rewards/margins": 0.06590748578310013, "rewards/margins_max": 0.20950034260749817, "rewards/margins_min": -0.07685359567403793, "rewards/margins_std": 0.12584754824638367, "rewards/rejected": 0.09965959936380386, "step": 1800 }, { "epoch": 0.43, "eval_dpo_losses": 0.6686532497406006, "eval_logits/chosen": -2.7615253925323486, "eval_logits/rejected": -2.726641893386841, "eval_logps/chosen": -269.7856750488281, "eval_logps/rejected": -256.5086975097656, "eval_loss": 0.6841806769371033, "eval_positive_losses": 0.12088748067617416, "eval_rewards/accuracies": 0.6729999780654907, "eval_rewards/chosen": 0.1466965675354004, "eval_rewards/margins": 0.05325073376297951, "eval_rewards/margins_max": 0.23796971142292023, "eval_rewards/margins_min": -0.1060108169913292, "eval_rewards/margins_std": 0.11491890251636505, "eval_rewards/rejected": 0.09344581514596939, "eval_runtime": 859.1396, "eval_samples_per_second": 4.656, "eval_steps_per_second": 0.291, "step": 1800 }, { "dpo_losses": 0.6669281125068665, "epoch": 0.43, "grad_norm": 1.9541231580281586, "learning_rate": 3.489915109219882e-07, "logits/chosen": -2.7533602714538574, "logits/rejected": -2.7101738452911377, "logps/chosen": -231.8050079345703, "logps/rejected": -213.2619171142578, "loss": 0.6737, "positive_losses": 0.07501935958862305, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.13909700512886047, "rewards/margins": 0.05627512186765671, "rewards/margins_max": 0.1636616587638855, "rewards/margins_min": -0.045592304319143295, "rewards/margins_std": 0.09598131477832794, "rewards/rejected": 0.08282189816236496, "step": 1810 }, { "dpo_losses": 0.6664931774139404, "epoch": 0.44, "grad_norm": 7.35606980666227, "learning_rate": 3.4706895834014294e-07, "logits/chosen": -2.8269903659820557, "logits/rejected": -2.8012499809265137, "logps/chosen": -277.8968200683594, "logps/rejected": -274.05059814453125, "loss": 0.6763, "positive_losses": 0.0857643112540245, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15246310830116272, "rewards/margins": 0.05840132385492325, "rewards/margins_max": 0.186055988073349, "rewards/margins_min": -0.0596766360104084, "rewards/margins_std": 0.10686671733856201, "rewards/rejected": 0.09406177699565887, "step": 1820 }, { "dpo_losses": 0.666508674621582, "epoch": 0.44, "grad_norm": 1.9119169705371055, "learning_rate": 3.451396220989064e-07, "logits/chosen": -2.848273277282715, "logits/rejected": -2.7672178745269775, "logps/chosen": -266.3080749511719, "logps/rejected": -234.0794219970703, "loss": 0.6783, "positive_losses": 0.3531467318534851, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14861007034778595, "rewards/margins": 0.05802565813064575, "rewards/margins_max": 0.18296341598033905, "rewards/margins_min": -0.05569658800959587, "rewards/margins_std": 0.10839241743087769, "rewards/rejected": 0.090584397315979, "step": 1830 }, { "dpo_losses": 0.667290210723877, "epoch": 0.44, "grad_norm": 2.117941408955304, "learning_rate": 3.43203637029847e-07, "logits/chosen": -2.836254596710205, "logits/rejected": -2.774336338043213, "logps/chosen": -322.9077453613281, "logps/rejected": -277.01373291015625, "loss": 0.6936, "positive_losses": 0.3807083070278168, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14203055202960968, "rewards/margins": 0.056296706199645996, "rewards/margins_max": 0.19748814404010773, "rewards/margins_min": -0.06254958361387253, "rewards/margins_std": 0.11538340896368027, "rewards/rejected": 0.08573383837938309, "step": 1840 }, { "dpo_losses": 0.6654022932052612, "epoch": 0.44, "grad_norm": 2.03026357320836, "learning_rate": 3.4126113842918643e-07, "logits/chosen": -2.8013951778411865, "logits/rejected": -2.753168821334839, "logps/chosen": -267.6473083496094, "logps/rejected": -240.2247772216797, "loss": 0.6784, "positive_losses": 0.08067617565393448, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15620282292366028, "rewards/margins": 0.059252046048641205, "rewards/margins_max": 0.1627664864063263, "rewards/margins_min": -0.05071081593632698, "rewards/margins_std": 0.09582805633544922, "rewards/rejected": 0.09695078432559967, "step": 1850 }, { "dpo_losses": 0.66180419921875, "epoch": 0.45, "grad_norm": 2.373563633292578, "learning_rate": 3.3931226204834397e-07, "logits/chosen": -2.855271100997925, "logits/rejected": -2.8535215854644775, "logps/chosen": -310.538818359375, "logps/rejected": -291.3670654296875, "loss": 0.6873, "positive_losses": 0.13837623596191406, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15993010997772217, "rewards/margins": 0.06751077622175217, "rewards/margins_max": 0.20825180411338806, "rewards/margins_min": -0.04970566928386688, "rewards/margins_std": 0.11694632470607758, "rewards/rejected": 0.0924193412065506, "step": 1860 }, { "dpo_losses": 0.6733115315437317, "epoch": 0.45, "grad_norm": 11.040853755944612, "learning_rate": 3.3735714408445e-07, "logits/chosen": -2.7823493480682373, "logits/rejected": -2.7949492931365967, "logps/chosen": -236.0526885986328, "logps/rejected": -247.50936889648438, "loss": 0.6772, "positive_losses": 0.08392753452062607, "rewards/accuracies": 0.625, "rewards/chosen": 0.14773313701152802, "rewards/margins": 0.04372979328036308, "rewards/margins_max": 0.1726093590259552, "rewards/margins_min": -0.0746278315782547, "rewards/margins_std": 0.11085480451583862, "rewards/rejected": 0.10400333255529404, "step": 1870 }, { "dpo_losses": 0.6606142520904541, "epoch": 0.45, "grad_norm": 9.598683235143913, "learning_rate": 3.3539592117082746e-07, "logits/chosen": -2.7616286277770996, "logits/rejected": -2.7219414710998535, "logps/chosen": -265.49920654296875, "logps/rejected": -266.499755859375, "loss": 0.6821, "positive_losses": 0.1669657677412033, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.15696823596954346, "rewards/margins": 0.06985442340373993, "rewards/margins_max": 0.18338429927825928, "rewards/margins_min": -0.04848301038146019, "rewards/margins_std": 0.10353618860244751, "rewards/rejected": 0.08711381256580353, "step": 1880 }, { "dpo_losses": 0.6640108227729797, "epoch": 0.45, "grad_norm": 12.799024687802754, "learning_rate": 3.3342873036744346e-07, "logits/chosen": -2.8153235912323, "logits/rejected": -2.7941555976867676, "logps/chosen": -284.2322082519531, "logps/rejected": -291.2121887207031, "loss": 0.6783, "positive_losses": 0.12589502334594727, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15120121836662292, "rewards/margins": 0.06243538856506348, "rewards/margins_max": 0.1900346428155899, "rewards/margins_min": -0.05409926176071167, "rewards/margins_std": 0.10962159931659698, "rewards/rejected": 0.08876581490039825, "step": 1890 }, { "dpo_losses": 0.6658226251602173, "epoch": 0.45, "grad_norm": 10.30937230623828, "learning_rate": 3.3145570915133067e-07, "logits/chosen": -2.771275043487549, "logits/rejected": -2.7099719047546387, "logps/chosen": -250.13766479492188, "logps/rejected": -251.018798828125, "loss": 0.6688, "positive_losses": 0.01625842973589897, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.14459940791130066, "rewards/margins": 0.05908440798521042, "rewards/margins_max": 0.2021196335554123, "rewards/margins_min": -0.06913208216428757, "rewards/margins_std": 0.11902841180562973, "rewards/rejected": 0.08551499992609024, "step": 1900 }, { "epoch": 0.45, "eval_dpo_losses": 0.6681045889854431, "eval_logits/chosen": -2.7651448249816895, "eval_logits/rejected": -2.729977607727051, "eval_logps/chosen": -269.6281433105469, "eval_logps/rejected": -256.4724426269531, "eval_loss": 0.6834447979927063, "eval_positive_losses": 0.12017535418272018, "eval_rewards/accuracies": 0.6744999885559082, "eval_rewards/chosen": 0.14827173948287964, "eval_rewards/margins": 0.054463401436805725, "eval_rewards/margins_max": 0.24101953208446503, "eval_rewards/margins_min": -0.10646426677703857, "eval_rewards/margins_std": 0.11624707281589508, "eval_rewards/rejected": 0.09380833804607391, "eval_runtime": 858.1037, "eval_samples_per_second": 4.661, "eval_steps_per_second": 0.291, "step": 1900 }, { "dpo_losses": 0.6654232144355774, "epoch": 0.46, "grad_norm": 8.719467953890637, "learning_rate": 3.294769954069802e-07, "logits/chosen": -2.7877037525177, "logits/rejected": -2.741056442260742, "logps/chosen": -270.0384826660156, "logps/rejected": -251.13369750976562, "loss": 0.6705, "positive_losses": 0.052925966680049896, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15026476979255676, "rewards/margins": 0.060220420360565186, "rewards/margins_max": 0.2000938355922699, "rewards/margins_min": -0.06977032124996185, "rewards/margins_std": 0.12080085277557373, "rewards/rejected": 0.09004434943199158, "step": 1910 }, { "dpo_losses": 0.6643575429916382, "epoch": 0.46, "grad_norm": 14.699164770363282, "learning_rate": 3.274927274167048e-07, "logits/chosen": -2.769256591796875, "logits/rejected": -2.762864828109741, "logps/chosen": -251.76535034179688, "logps/rejected": -241.9304656982422, "loss": 0.6807, "positive_losses": 0.103490449488163, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14817693829536438, "rewards/margins": 0.06169048696756363, "rewards/margins_max": 0.17907702922821045, "rewards/margins_min": -0.06192632392048836, "rewards/margins_std": 0.11021226644515991, "rewards/rejected": 0.08648644387722015, "step": 1920 }, { "dpo_losses": 0.6640545129776001, "epoch": 0.46, "grad_norm": 1.8654947308885383, "learning_rate": 3.2550304385097575e-07, "logits/chosen": -2.808413028717041, "logits/rejected": -2.7772860527038574, "logps/chosen": -262.5587463378906, "logps/rejected": -233.5145263671875, "loss": 0.6689, "positive_losses": 0.05932874605059624, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14676916599273682, "rewards/margins": 0.0626763105392456, "rewards/margins_max": 0.18366703391075134, "rewards/margins_min": -0.04836802929639816, "rewards/margins_std": 0.10325155407190323, "rewards/rejected": 0.0840928703546524, "step": 1930 }, { "dpo_losses": 0.6719104647636414, "epoch": 0.46, "grad_norm": 2.1445701609487933, "learning_rate": 3.235080837587314e-07, "logits/chosen": -2.8085415363311768, "logits/rejected": -2.7884840965270996, "logps/chosen": -205.3340606689453, "logps/rejected": -249.1177520751953, "loss": 0.6805, "positive_losses": 0.19883742928504944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1219167485833168, "rewards/margins": 0.04622754454612732, "rewards/margins_max": 0.18519961833953857, "rewards/margins_min": -0.06690065562725067, "rewards/margins_std": 0.11041238158941269, "rewards/rejected": 0.07568920403718948, "step": 1940 }, { "dpo_losses": 0.6646826863288879, "epoch": 0.47, "grad_norm": 9.896937087225325, "learning_rate": 3.215079865576599e-07, "logits/chosen": -2.764008045196533, "logits/rejected": -2.798621654510498, "logps/chosen": -266.9892883300781, "logps/rejected": -266.6557922363281, "loss": 0.671, "positive_losses": 0.09678039699792862, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.14836612343788147, "rewards/margins": 0.0616549476981163, "rewards/margins_max": 0.19222991168498993, "rewards/margins_min": -0.06497012078762054, "rewards/margins_std": 0.11652760207653046, "rewards/rejected": 0.08671115338802338, "step": 1950 }, { "dpo_losses": 0.661909818649292, "epoch": 0.47, "grad_norm": 2.4223165786322602, "learning_rate": 3.1950289202445594e-07, "logits/chosen": -2.7474777698516846, "logits/rejected": -2.7396328449249268, "logps/chosen": -262.4934387207031, "logps/rejected": -260.3197021484375, "loss": 0.6709, "positive_losses": 0.07884301990270615, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.14897334575653076, "rewards/margins": 0.06812804937362671, "rewards/margins_max": 0.19089296460151672, "rewards/margins_min": -0.058850545436143875, "rewards/margins_std": 0.11118094623088837, "rewards/rejected": 0.08084531128406525, "step": 1960 }, { "dpo_losses": 0.6723495721817017, "epoch": 0.47, "grad_norm": 13.303950040116728, "learning_rate": 3.174929402850528e-07, "logits/chosen": -2.881601333618164, "logits/rejected": -2.79359769821167, "logps/chosen": -265.28564453125, "logps/rejected": -251.39932250976562, "loss": 0.683, "positive_losses": 0.19449596107006073, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.1412382572889328, "rewards/margins": 0.04644555598497391, "rewards/margins_max": 0.19014857709407806, "rewards/margins_min": -0.0813937559723854, "rewards/margins_std": 0.12483637034893036, "rewards/rejected": 0.09479270875453949, "step": 1970 }, { "dpo_losses": 0.6564961671829224, "epoch": 0.47, "grad_norm": 9.62667396157676, "learning_rate": 3.15478271804829e-07, "logits/chosen": -2.7767136096954346, "logits/rejected": -2.7466657161712646, "logps/chosen": -285.6875915527344, "logps/rejected": -272.7561340332031, "loss": 0.6812, "positive_losses": 0.07430706173181534, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.17666248977184296, "rewards/margins": 0.08011461794376373, "rewards/margins_max": 0.23911993205547333, "rewards/margins_min": -0.04966924339532852, "rewards/margins_std": 0.12842869758605957, "rewards/rejected": 0.09654786437749863, "step": 1980 }, { "dpo_losses": 0.6677006483078003, "epoch": 0.48, "grad_norm": 2.079338809157917, "learning_rate": 3.1345902737879257e-07, "logits/chosen": -2.701756715774536, "logits/rejected": -2.692385673522949, "logps/chosen": -248.5734405517578, "logps/rejected": -244.9713592529297, "loss": 0.6764, "positive_losses": 0.17212390899658203, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1393079310655594, "rewards/margins": 0.055907268077135086, "rewards/margins_max": 0.19883087277412415, "rewards/margins_min": -0.08152113854885101, "rewards/margins_std": 0.1265718787908554, "rewards/rejected": 0.0834006741642952, "step": 1990 }, { "dpo_losses": 0.651429295539856, "epoch": 0.48, "grad_norm": 2.0695117481503753, "learning_rate": 3.1143534812174103e-07, "logits/chosen": -2.83211088180542, "logits/rejected": -2.7964072227478027, "logps/chosen": -301.89300537109375, "logps/rejected": -245.0783233642578, "loss": 0.6616, "positive_losses": 0.016871070489287376, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.18196313083171844, "rewards/margins": 0.08932209759950638, "rewards/margins_max": 0.2275833636522293, "rewards/margins_min": -0.064825639128685, "rewards/margins_std": 0.1288064569234848, "rewards/rejected": 0.09264104068279266, "step": 2000 }, { "epoch": 0.48, "eval_dpo_losses": 0.6681052446365356, "eval_logits/chosen": -2.768737316131592, "eval_logits/rejected": -2.7335543632507324, "eval_logps/chosen": -269.1366882324219, "eval_logps/rejected": -255.98248291015625, "eval_loss": 0.6818322539329529, "eval_positive_losses": 0.10919387638568878, "eval_rewards/accuracies": 0.671999990940094, "eval_rewards/chosen": 0.15318630635738373, "eval_rewards/margins": 0.054478373378515244, "eval_rewards/margins_max": 0.24087677896022797, "eval_rewards/margins_min": -0.10689043998718262, "eval_rewards/margins_std": 0.11637938022613525, "eval_rewards/rejected": 0.09870795160531998, "eval_runtime": 858.2331, "eval_samples_per_second": 4.661, "eval_steps_per_second": 0.291, "step": 2000 }, { "dpo_losses": 0.6561232805252075, "epoch": 0.48, "grad_norm": 2.5337877150496495, "learning_rate": 3.094073754584001e-07, "logits/chosen": -2.749743938446045, "logits/rejected": -2.7075726985931396, "logps/chosen": -265.680908203125, "logps/rejected": -240.0607452392578, "loss": 0.6698, "positive_losses": 0.04408702999353409, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16345900297164917, "rewards/margins": 0.08082669973373413, "rewards/margins_max": 0.24039249122142792, "rewards/margins_min": -0.04863990470767021, "rewards/margins_std": 0.13059893250465393, "rewards/rejected": 0.08263228833675385, "step": 2010 }, { "dpo_losses": 0.6676343083381653, "epoch": 0.48, "grad_norm": 9.143249656562297, "learning_rate": 3.0737525111353976e-07, "logits/chosen": -2.8355212211608887, "logits/rejected": -2.8075666427612305, "logps/chosen": -275.888916015625, "logps/rejected": -246.99655151367188, "loss": 0.6746, "positive_losses": 0.13828544318675995, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.14818502962589264, "rewards/margins": 0.05577893927693367, "rewards/margins_max": 0.17918959259986877, "rewards/margins_min": -0.07176439464092255, "rewards/margins_std": 0.11308679729700089, "rewards/rejected": 0.09240610152482986, "step": 2020 }, { "dpo_losses": 0.6634706258773804, "epoch": 0.49, "grad_norm": 6.911911520992108, "learning_rate": 3.053391171020702e-07, "logits/chosen": -2.781087636947632, "logits/rejected": -2.737334966659546, "logps/chosen": -286.51019287109375, "logps/rejected": -263.6956481933594, "loss": 0.6743, "positive_losses": 0.16921034455299377, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.15591704845428467, "rewards/margins": 0.06442525237798691, "rewards/margins_max": 0.1827867478132248, "rewards/margins_min": -0.07221048325300217, "rewards/margins_std": 0.1136360615491867, "rewards/rejected": 0.09149178862571716, "step": 2030 }, { "dpo_losses": 0.678429126739502, "epoch": 0.49, "grad_norm": 18.336828742453754, "learning_rate": 3.0329911571911693e-07, "logits/chosen": -2.7405848503112793, "logits/rejected": -2.7415637969970703, "logps/chosen": -239.07089233398438, "logps/rejected": -249.01516723632812, "loss": 0.6871, "positive_losses": 0.04316673427820206, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.14480189979076385, "rewards/margins": 0.03395460173487663, "rewards/margins_max": 0.17679078876972198, "rewards/margins_min": -0.08979569375514984, "rewards/margins_std": 0.11886991560459137, "rewards/rejected": 0.11084730923175812, "step": 2040 }, { "dpo_losses": 0.654004693031311, "epoch": 0.49, "grad_norm": 7.335136654236792, "learning_rate": 3.012553895300765e-07, "logits/chosen": -2.720590114593506, "logits/rejected": -2.718966245651245, "logps/chosen": -251.18527221679688, "logps/rejected": -246.2328338623047, "loss": 0.6615, "positive_losses": 0.009166908450424671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16902677714824677, "rewards/margins": 0.08436388522386551, "rewards/margins_max": 0.2201722115278244, "rewards/margins_min": -0.04557929188013077, "rewards/margins_std": 0.11971326172351837, "rewards/rejected": 0.08466287702322006, "step": 2050 }, { "dpo_losses": 0.6654559373855591, "epoch": 0.49, "grad_norm": 11.593644051697504, "learning_rate": 2.9920808136065336e-07, "logits/chosen": -2.819838047027588, "logits/rejected": -2.7719857692718506, "logps/chosen": -262.7549743652344, "logps/rejected": -259.59112548828125, "loss": 0.6823, "positive_losses": 0.03645286709070206, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16726934909820557, "rewards/margins": 0.059890687465667725, "rewards/margins_max": 0.20684847235679626, "rewards/margins_min": -0.04751700535416603, "rewards/margins_std": 0.11576877534389496, "rewards/rejected": 0.10737864673137665, "step": 2060 }, { "dpo_losses": 0.6598677635192871, "epoch": 0.5, "grad_norm": 2.7883854746878955, "learning_rate": 2.971573342868786e-07, "logits/chosen": -2.8340868949890137, "logits/rejected": -2.768871784210205, "logps/chosen": -239.88638305664062, "logps/rejected": -222.06790161132812, "loss": 0.6713, "positive_losses": 0.06543026119470596, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1681610643863678, "rewards/margins": 0.07207022607326508, "rewards/margins_max": 0.2195323258638382, "rewards/margins_min": -0.06268153339624405, "rewards/margins_std": 0.12399201095104218, "rewards/rejected": 0.09609085321426392, "step": 2070 }, { "dpo_losses": 0.6652834415435791, "epoch": 0.5, "grad_norm": 1.9671096022789196, "learning_rate": 2.9510329162511054e-07, "logits/chosen": -2.725864887237549, "logits/rejected": -2.7417259216308594, "logps/chosen": -287.4837341308594, "logps/rejected": -255.3811492919922, "loss": 0.6727, "positive_losses": 0.08337660133838654, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.15384908020496368, "rewards/margins": 0.06075746938586235, "rewards/margins_max": 0.2161407768726349, "rewards/margins_min": -0.06159939616918564, "rewards/margins_std": 0.12744982540607452, "rewards/rejected": 0.09309159219264984, "step": 2080 }, { "dpo_losses": 0.6745301485061646, "epoch": 0.5, "grad_norm": 5.14892205373413, "learning_rate": 2.930460969220202e-07, "logits/chosen": -2.8124070167541504, "logits/rejected": -2.743915557861328, "logps/chosen": -242.0961151123047, "logps/rejected": -270.41680908203125, "loss": 0.6836, "positive_losses": 0.16036872565746307, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14609447121620178, "rewards/margins": 0.04173535108566284, "rewards/margins_max": 0.17970731854438782, "rewards/margins_min": -0.0854857936501503, "rewards/margins_std": 0.11812999099493027, "rewards/rejected": 0.10435911267995834, "step": 2090 }, { "dpo_losses": 0.6631833910942078, "epoch": 0.5, "grad_norm": 1.8697396557161297, "learning_rate": 2.909858939445584e-07, "logits/chosen": -2.8140625953674316, "logits/rejected": -2.783529043197632, "logps/chosen": -247.07546997070312, "logps/rejected": -234.9035186767578, "loss": 0.6707, "positive_losses": 0.1760263442993164, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1481439769268036, "rewards/margins": 0.0662180557847023, "rewards/margins_max": 0.20918743312358856, "rewards/margins_min": -0.04514465108513832, "rewards/margins_std": 0.11512184143066406, "rewards/rejected": 0.08192592859268188, "step": 2100 }, { "epoch": 0.5, "eval_dpo_losses": 0.6684068441390991, "eval_logits/chosen": -2.7650647163391113, "eval_logits/rejected": -2.7300055027008057, "eval_logps/chosen": -268.57647705078125, "eval_logps/rejected": -255.35862731933594, "eval_loss": 0.6803558468818665, "eval_positive_losses": 0.0929500013589859, "eval_rewards/accuracies": 0.6710000038146973, "eval_rewards/chosen": 0.15878862142562866, "eval_rewards/margins": 0.053842101246118546, "eval_rewards/margins_max": 0.24048562347888947, "eval_rewards/margins_min": -0.10692078620195389, "eval_rewards/margins_std": 0.11616255342960358, "eval_rewards/rejected": 0.10494650155305862, "eval_runtime": 859.0117, "eval_samples_per_second": 4.657, "eval_steps_per_second": 0.291, "step": 2100 }, { "dpo_losses": 0.6562048196792603, "epoch": 0.51, "grad_norm": 2.065374509827389, "learning_rate": 2.8892282666990894e-07, "logits/chosen": -2.799276828765869, "logits/rejected": -2.731630563735962, "logps/chosen": -257.4571838378906, "logps/rejected": -219.5493621826172, "loss": 0.6649, "positive_losses": 0.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16865268349647522, "rewards/margins": 0.07884275913238525, "rewards/margins_max": 0.2357640266418457, "rewards/margins_min": -0.03504558652639389, "rewards/margins_std": 0.12048201262950897, "rewards/rejected": 0.08980991691350937, "step": 2110 }, { "dpo_losses": 0.6664280891418457, "epoch": 0.51, "grad_norm": 13.050884722539527, "learning_rate": 2.868570392754272e-07, "logits/chosen": -2.8316636085510254, "logits/rejected": -2.813870668411255, "logps/chosen": -308.05010986328125, "logps/rejected": -306.56903076171875, "loss": 0.6755, "positive_losses": 0.05407829210162163, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.15310320258140564, "rewards/margins": 0.05860395357012749, "rewards/margins_max": 0.17991265654563904, "rewards/margins_min": -0.0731983631849289, "rewards/margins_std": 0.11320201307535172, "rewards/rejected": 0.09449925273656845, "step": 2120 }, { "dpo_losses": 0.6682525873184204, "epoch": 0.51, "grad_norm": 10.66840856952198, "learning_rate": 2.8478867612856394e-07, "logits/chosen": -2.8220834732055664, "logits/rejected": -2.7630372047424316, "logps/chosen": -274.8631286621094, "logps/rejected": -231.75100708007812, "loss": 0.6788, "positive_losses": 0.07702980190515518, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.15383529663085938, "rewards/margins": 0.055672429502010345, "rewards/margins_max": 0.19432581961154938, "rewards/margins_min": -0.07826997339725494, "rewards/margins_std": 0.12398327887058258, "rewards/rejected": 0.09816287457942963, "step": 2130 }, { "dpo_losses": 0.6604387164115906, "epoch": 0.51, "grad_norm": 13.982272777569717, "learning_rate": 2.827178817767762e-07, "logits/chosen": -2.730165958404541, "logits/rejected": -2.7031590938568115, "logps/chosen": -283.4290466308594, "logps/rejected": -243.38986206054688, "loss": 0.6867, "positive_losses": 0.13644714653491974, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1736733764410019, "rewards/margins": 0.07175038009881973, "rewards/margins_max": 0.24043183028697968, "rewards/margins_min": -0.061847079545259476, "rewards/margins_std": 0.13533160090446472, "rewards/rejected": 0.10192300379276276, "step": 2140 }, { "dpo_losses": 0.6636389493942261, "epoch": 0.51, "grad_norm": 1.919068062231777, "learning_rate": 2.8064480093742565e-07, "logits/chosen": -2.799138307571411, "logits/rejected": -2.7889764308929443, "logps/chosen": -233.39968872070312, "logps/rejected": -231.72158813476562, "loss": 0.678, "positive_losses": 0.13253745436668396, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16054031252861023, "rewards/margins": 0.06367689371109009, "rewards/margins_max": 0.19872519373893738, "rewards/margins_min": -0.06096898764371872, "rewards/margins_std": 0.11555247008800507, "rewards/rejected": 0.09686340391635895, "step": 2150 }, { "dpo_losses": 0.662826657295227, "epoch": 0.52, "grad_norm": 1.9267929545705513, "learning_rate": 2.7856957848766497e-07, "logits/chosen": -2.783723831176758, "logits/rejected": -2.6947522163391113, "logps/chosen": -254.92532348632812, "logps/rejected": -247.78604125976562, "loss": 0.6682, "positive_losses": 0.07063408195972443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1479075849056244, "rewards/margins": 0.06691776216030121, "rewards/margins_max": 0.21124336123466492, "rewards/margins_min": -0.0641765296459198, "rewards/margins_std": 0.1270638257265091, "rewards/rejected": 0.08098982274532318, "step": 2160 }, { "dpo_losses": 0.6690847277641296, "epoch": 0.52, "grad_norm": 2.1507100241410217, "learning_rate": 2.7649235945431336e-07, "logits/chosen": -2.7601242065429688, "logits/rejected": -2.7254035472869873, "logps/chosen": -278.6953125, "logps/rejected": -314.757568359375, "loss": 0.6729, "positive_losses": 0.1383533477783203, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1510465443134308, "rewards/margins": 0.0528431236743927, "rewards/margins_max": 0.18322992324829102, "rewards/margins_min": -0.06159244105219841, "rewards/margins_std": 0.10892023146152496, "rewards/rejected": 0.09820342063903809, "step": 2170 }, { "dpo_losses": 0.6616215705871582, "epoch": 0.52, "grad_norm": 2.037431905345771, "learning_rate": 2.74413289003721e-07, "logits/chosen": -2.8064351081848145, "logits/rejected": -2.7818188667297363, "logps/chosen": -267.09405517578125, "logps/rejected": -254.98046875, "loss": 0.6719, "positive_losses": 0.14298763871192932, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15856541693210602, "rewards/margins": 0.0702642947435379, "rewards/margins_max": 0.24729447066783905, "rewards/margins_min": -0.08257587999105453, "rewards/margins_std": 0.14554773271083832, "rewards/rejected": 0.08830111473798752, "step": 2180 }, { "dpo_losses": 0.6617010831832886, "epoch": 0.52, "grad_norm": 1.837583802057403, "learning_rate": 2.7233251243162434e-07, "logits/chosen": -2.793156147003174, "logits/rejected": -2.7649288177490234, "logps/chosen": -298.74798583984375, "logps/rejected": -280.72137451171875, "loss": 0.6809, "positive_losses": 0.2457468956708908, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.15907970070838928, "rewards/margins": 0.06867168843746185, "rewards/margins_max": 0.1987360268831253, "rewards/margins_min": -0.060612570494413376, "rewards/margins_std": 0.11501216888427734, "rewards/rejected": 0.09040801227092743, "step": 2190 }, { "dpo_losses": 0.6663209199905396, "epoch": 0.53, "grad_norm": 9.13904775721691, "learning_rate": 2.7025017515299207e-07, "logits/chosen": -2.755573272705078, "logits/rejected": -2.7177062034606934, "logps/chosen": -259.05841064453125, "logps/rejected": -232.1884002685547, "loss": 0.6796, "positive_losses": 0.2961049973964691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14036241173744202, "rewards/margins": 0.05829455330967903, "rewards/margins_max": 0.19901099801063538, "rewards/margins_min": -0.0661616176366806, "rewards/margins_std": 0.12037386000156403, "rewards/rejected": 0.08206787705421448, "step": 2200 }, { "epoch": 0.53, "eval_dpo_losses": 0.6666141152381897, "eval_logits/chosen": -2.758179187774658, "eval_logits/rejected": -2.722825050354004, "eval_logps/chosen": -269.45513916015625, "eval_logps/rejected": -256.6536865234375, "eval_loss": 0.6848553419113159, "eval_positive_losses": 0.15510497987270355, "eval_rewards/accuracies": 0.6754999756813049, "eval_rewards/chosen": 0.15000176429748535, "eval_rewards/margins": 0.05800589546561241, "eval_rewards/margins_max": 0.2564653754234314, "eval_rewards/margins_min": -0.11213590204715729, "eval_rewards/margins_std": 0.12338607758283615, "eval_rewards/rejected": 0.09199586510658264, "eval_runtime": 859.3448, "eval_samples_per_second": 4.655, "eval_steps_per_second": 0.291, "step": 2200 }, { "dpo_losses": 0.6729488372802734, "epoch": 0.53, "grad_norm": 2.273927091860665, "learning_rate": 2.6816642269186275e-07, "logits/chosen": -2.7407145500183105, "logits/rejected": -2.7245407104492188, "logps/chosen": -269.4483642578125, "logps/rejected": -248.7185821533203, "loss": 0.6821, "positive_losses": 0.0772184357047081, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.14526183903217316, "rewards/margins": 0.04481197148561478, "rewards/margins_max": 0.1691262573003769, "rewards/margins_min": -0.08192785084247589, "rewards/margins_std": 0.11339626461267471, "rewards/rejected": 0.10044984519481659, "step": 2210 }, { "dpo_losses": 0.6761233806610107, "epoch": 0.53, "grad_norm": 9.260836053598029, "learning_rate": 2.660814006711748e-07, "logits/chosen": -2.7239954471588135, "logits/rejected": -2.766946792602539, "logps/chosen": -240.9567108154297, "logps/rejected": -277.82598876953125, "loss": 0.6792, "positive_losses": 0.16736125946044922, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.15078578889369965, "rewards/margins": 0.038352787494659424, "rewards/margins_max": 0.16740979254245758, "rewards/margins_min": -0.10242000967264175, "rewards/margins_std": 0.12142413854598999, "rewards/rejected": 0.11243300139904022, "step": 2220 }, { "dpo_losses": 0.655363142490387, "epoch": 0.53, "grad_norm": 7.90528302819446, "learning_rate": 2.639952548025899e-07, "logits/chosen": -2.8111348152160645, "logits/rejected": -2.7315573692321777, "logps/chosen": -283.2752990722656, "logps/rejected": -236.6343994140625, "loss": 0.6678, "positive_losses": 0.00597801199182868, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1699642837047577, "rewards/margins": 0.08252812922000885, "rewards/margins_max": 0.2596474885940552, "rewards/margins_min": -0.057215191423892975, "rewards/margins_std": 0.1433498114347458, "rewards/rejected": 0.08743616193532944, "step": 2230 }, { "dpo_losses": 0.6586174964904785, "epoch": 0.54, "grad_norm": 2.531946722148076, "learning_rate": 2.619081308763097e-07, "logits/chosen": -2.75937557220459, "logits/rejected": -2.7364113330841064, "logps/chosen": -278.1329040527344, "logps/rejected": -249.18789672851562, "loss": 0.6699, "positive_losses": 0.038974761962890625, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16530348360538483, "rewards/margins": 0.07463079690933228, "rewards/margins_max": 0.22609278559684753, "rewards/margins_min": -0.042294882237911224, "rewards/margins_std": 0.12348683178424835, "rewards/rejected": 0.09067267924547195, "step": 2240 }, { "dpo_losses": 0.6655239462852478, "epoch": 0.54, "grad_norm": 2.005527725214489, "learning_rate": 2.598201747508875e-07, "logits/chosen": -2.7801475524902344, "logits/rejected": -2.7965972423553467, "logps/chosen": -298.4169921875, "logps/rejected": -284.37908935546875, "loss": 0.6712, "positive_losses": 0.015515899285674095, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.170989528298378, "rewards/margins": 0.06047233194112778, "rewards/margins_max": 0.18134550750255585, "rewards/margins_min": -0.08089722692966461, "rewards/margins_std": 0.11501763015985489, "rewards/rejected": 0.11051716655492783, "step": 2250 }, { "dpo_losses": 0.6571565866470337, "epoch": 0.54, "grad_norm": 5.367847207738861, "learning_rate": 2.577315323430346e-07, "logits/chosen": -2.782386302947998, "logits/rejected": -2.7479090690612793, "logps/chosen": -273.3164978027344, "logps/rejected": -264.6056213378906, "loss": 0.6654, "positive_losses": 0.038141440600156784, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1651037037372589, "rewards/margins": 0.07665382325649261, "rewards/margins_max": 0.1984976828098297, "rewards/margins_min": -0.04122495651245117, "rewards/margins_std": 0.104500412940979, "rewards/rejected": 0.0884498804807663, "step": 2260 }, { "dpo_losses": 0.6561874151229858, "epoch": 0.54, "grad_norm": 8.07948583949895, "learning_rate": 2.5564234961742315e-07, "logits/chosen": -2.7953901290893555, "logits/rejected": -2.7630488872528076, "logps/chosen": -318.374267578125, "logps/rejected": -275.588623046875, "loss": 0.6781, "positive_losses": 0.16964511573314667, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.17856177687644958, "rewards/margins": 0.08041170239448547, "rewards/margins_max": 0.22601398825645447, "rewards/margins_min": -0.05626480653882027, "rewards/margins_std": 0.1315629780292511, "rewards/rejected": 0.09815007448196411, "step": 2270 }, { "dpo_losses": 0.6666969060897827, "epoch": 0.55, "grad_norm": 1.7848827728544645, "learning_rate": 2.5355277257648553e-07, "logits/chosen": -2.825878858566284, "logits/rejected": -2.77541184425354, "logps/chosen": -252.8574981689453, "logps/rejected": -242.07839965820312, "loss": 0.667, "positive_losses": 0.16885261237621307, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16006655991077423, "rewards/margins": 0.057914167642593384, "rewards/margins_max": 0.1899695098400116, "rewards/margins_min": -0.07043616473674774, "rewards/margins_std": 0.11645045131444931, "rewards/rejected": 0.10215239226818085, "step": 2280 }, { "dpo_losses": 0.6537405848503113, "epoch": 0.55, "grad_norm": 2.0642474314731523, "learning_rate": 2.514629472502108e-07, "logits/chosen": -2.752764940261841, "logits/rejected": -2.7648093700408936, "logps/chosen": -317.4488830566406, "logps/rejected": -270.2599182128906, "loss": 0.6614, "positive_losses": 0.06785888969898224, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.19106529653072357, "rewards/margins": 0.08657029271125793, "rewards/margins_max": 0.2458745688199997, "rewards/margins_min": -0.025165194645524025, "rewards/margins_std": 0.12435078620910645, "rewards/rejected": 0.10449501127004623, "step": 2290 }, { "dpo_losses": 0.6628495454788208, "epoch": 0.55, "grad_norm": 2.0962068178191027, "learning_rate": 2.4937301968593915e-07, "logits/chosen": -2.7593681812286377, "logits/rejected": -2.747328996658325, "logps/chosen": -239.71060180664062, "logps/rejected": -232.8096923828125, "loss": 0.6672, "positive_losses": 0.009763717651367188, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1690691113471985, "rewards/margins": 0.06595106422901154, "rewards/margins_max": 0.20511773228645325, "rewards/margins_min": -0.05297223851084709, "rewards/margins_std": 0.11529793590307236, "rewards/rejected": 0.10311806201934814, "step": 2300 }, { "epoch": 0.55, "eval_dpo_losses": 0.6667888760566711, "eval_logits/chosen": -2.755357265472412, "eval_logits/rejected": -2.7202939987182617, "eval_logps/chosen": -268.83660888671875, "eval_logps/rejected": -255.9975128173828, "eval_loss": 0.6829556226730347, "eval_positive_losses": 0.14040124416351318, "eval_rewards/accuracies": 0.6725000143051147, "eval_rewards/chosen": 0.15618734061717987, "eval_rewards/margins": 0.057629652321338654, "eval_rewards/margins_max": 0.2557487189769745, "eval_rewards/margins_min": -0.11137939244508743, "eval_rewards/margins_std": 0.12313403934240341, "eval_rewards/rejected": 0.09855768084526062, "eval_runtime": 858.8104, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 2300 }, { "dpo_losses": 0.661823034286499, "epoch": 0.55, "grad_norm": 12.132726979737233, "learning_rate": 2.47283135938156e-07, "logits/chosen": -2.8002371788024902, "logits/rejected": -2.774120807647705, "logps/chosen": -251.9924774169922, "logps/rejected": -220.9415283203125, "loss": 0.6818, "positive_losses": 0.15227541327476501, "rewards/accuracies": 0.75, "rewards/chosen": 0.1560712605714798, "rewards/margins": 0.06767888367176056, "rewards/margins_max": 0.2062397450208664, "rewards/margins_min": -0.0536632239818573, "rewards/margins_std": 0.11782427877187729, "rewards/rejected": 0.08839237689971924, "step": 2310 }, { "dpo_losses": 0.660243809223175, "epoch": 0.56, "grad_norm": 1.9745368076350998, "learning_rate": 2.451934420582846e-07, "logits/chosen": -2.8108372688293457, "logits/rejected": -2.7756714820861816, "logps/chosen": -254.51388549804688, "logps/rejected": -241.6333465576172, "loss": 0.6803, "positive_losses": 0.015761375427246094, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15526776015758514, "rewards/margins": 0.07098822295665741, "rewards/margins_max": 0.2065306454896927, "rewards/margins_min": -0.0456148125231266, "rewards/margins_std": 0.11496766656637192, "rewards/rejected": 0.08427950739860535, "step": 2320 }, { "dpo_losses": 0.6642245054244995, "epoch": 0.56, "grad_norm": 8.963626687538627, "learning_rate": 2.4310408408447903e-07, "logits/chosen": -2.75468373298645, "logits/rejected": -2.690964460372925, "logps/chosen": -227.16165161132812, "logps/rejected": -215.1800994873047, "loss": 0.6607, "positive_losses": 0.010651779361069202, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1660917103290558, "rewards/margins": 0.06236310675740242, "rewards/margins_max": 0.19441865384578705, "rewards/margins_min": -0.055907636880874634, "rewards/margins_std": 0.1120598167181015, "rewards/rejected": 0.10372859239578247, "step": 2330 }, { "dpo_losses": 0.6603385806083679, "epoch": 0.56, "grad_norm": 1.9357242445219511, "learning_rate": 2.41015208031419e-07, "logits/chosen": -2.819274425506592, "logits/rejected": -2.7583565711975098, "logps/chosen": -278.0947265625, "logps/rejected": -266.79315185546875, "loss": 0.6768, "positive_losses": 0.22658629715442657, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.1618601232767105, "rewards/margins": 0.07394503057003021, "rewards/margins_max": 0.24532051384449005, "rewards/margins_min": -0.08648841083049774, "rewards/margins_std": 0.14531855285167694, "rewards/rejected": 0.0879150852560997, "step": 2340 }, { "dpo_losses": 0.655372142791748, "epoch": 0.56, "grad_norm": 10.4825352335146, "learning_rate": 2.389269598801048e-07, "logits/chosen": -2.8024260997772217, "logits/rejected": -2.717745304107666, "logps/chosen": -267.5137939453125, "logps/rejected": -216.3339080810547, "loss": 0.6569, "positive_losses": 0.030373763293027878, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16606058180332184, "rewards/margins": 0.08216945081949234, "rewards/margins_max": 0.24279627203941345, "rewards/margins_min": -0.06665056943893433, "rewards/margins_std": 0.13653022050857544, "rewards/rejected": 0.0838911160826683, "step": 2350 }, { "dpo_losses": 0.6687244176864624, "epoch": 0.57, "grad_norm": 8.343785660913962, "learning_rate": 2.3683948556765624e-07, "logits/chosen": -2.7783455848693848, "logits/rejected": -2.738820791244507, "logps/chosen": -242.51071166992188, "logps/rejected": -222.2284698486328, "loss": 0.685, "positive_losses": 0.2167510986328125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1407192051410675, "rewards/margins": 0.053820062428712845, "rewards/margins_max": 0.19487479329109192, "rewards/margins_min": -0.05451763793826103, "rewards/margins_std": 0.11224918067455292, "rewards/rejected": 0.08689915388822556, "step": 2360 }, { "dpo_losses": 0.6720955967903137, "epoch": 0.57, "grad_norm": 1.8751094786153721, "learning_rate": 2.34752930977113e-07, "logits/chosen": -2.7293200492858887, "logits/rejected": -2.6929969787597656, "logps/chosen": -262.84375, "logps/rejected": -266.13946533203125, "loss": 0.679, "positive_losses": 0.16145935654640198, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16291166841983795, "rewards/margins": 0.04684619978070259, "rewards/margins_max": 0.18595808744430542, "rewards/margins_min": -0.07706108689308167, "rewards/margins_std": 0.1158452033996582, "rewards/rejected": 0.11606547981500626, "step": 2370 }, { "dpo_losses": 0.6641675233840942, "epoch": 0.57, "grad_norm": 19.824235298638413, "learning_rate": 2.3266744192724052e-07, "logits/chosen": -2.7611613273620605, "logits/rejected": -2.7416818141937256, "logps/chosen": -276.66778564453125, "logps/rejected": -257.7802734375, "loss": 0.6874, "positive_losses": 0.261518657207489, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1519637405872345, "rewards/margins": 0.06323989480733871, "rewards/margins_max": 0.21058087050914764, "rewards/margins_min": -0.08207972347736359, "rewards/margins_std": 0.12884901463985443, "rewards/rejected": 0.08872385323047638, "step": 2380 }, { "dpo_losses": 0.6683680415153503, "epoch": 0.57, "grad_norm": 4.87760757742167, "learning_rate": 2.3058316416233864e-07, "logits/chosen": -2.794218063354492, "logits/rejected": -2.779803514480591, "logps/chosen": -271.2039489746094, "logps/rejected": -248.4630889892578, "loss": 0.6695, "positive_losses": 0.15339326858520508, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.17608313262462616, "rewards/margins": 0.05618946999311447, "rewards/margins_max": 0.22427324950695038, "rewards/margins_min": -0.1097201332449913, "rewards/margins_std": 0.14653246104717255, "rewards/rejected": 0.11989367008209229, "step": 2390 }, { "dpo_losses": 0.668249249458313, "epoch": 0.57, "grad_norm": 12.751351079226982, "learning_rate": 2.2850024334205654e-07, "logits/chosen": -2.7696890830993652, "logits/rejected": -2.75766658782959, "logps/chosen": -254.71725463867188, "logps/rejected": -243.1544647216797, "loss": 0.6769, "positive_losses": 0.29547691345214844, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1424407809972763, "rewards/margins": 0.053774215281009674, "rewards/margins_max": 0.18860933184623718, "rewards/margins_min": -0.060921452939510345, "rewards/margins_std": 0.1123867779970169, "rewards/rejected": 0.08866657316684723, "step": 2400 }, { "epoch": 0.57, "eval_dpo_losses": 0.6667873859405518, "eval_logits/chosen": -2.7508089542388916, "eval_logits/rejected": -2.7159054279327393, "eval_logps/chosen": -268.4941101074219, "eval_logps/rejected": -255.659912109375, "eval_loss": 0.6818990707397461, "eval_positive_losses": 0.125191330909729, "eval_rewards/accuracies": 0.6740000247955322, "eval_rewards/chosen": 0.15961231291294098, "eval_rewards/margins": 0.05767882615327835, "eval_rewards/margins_max": 0.2565406858921051, "eval_rewards/margins_min": -0.11284005641937256, "eval_rewards/margins_std": 0.12380736321210861, "eval_rewards/rejected": 0.10193348675966263, "eval_runtime": 858.6094, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 2400 }, { "dpo_losses": 0.653143048286438, "epoch": 0.58, "grad_norm": 2.2146999305708897, "learning_rate": 2.264188250312138e-07, "logits/chosen": -2.7609143257141113, "logits/rejected": -2.6987197399139404, "logps/chosen": -272.8665466308594, "logps/rejected": -213.3135986328125, "loss": 0.6732, "positive_losses": 0.15415534377098083, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.16804520785808563, "rewards/margins": 0.08646182715892792, "rewards/margins_max": 0.229865163564682, "rewards/margins_min": -0.04814504459500313, "rewards/margins_std": 0.12566861510276794, "rewards/rejected": 0.08158338814973831, "step": 2410 }, { "dpo_losses": 0.6567984819412231, "epoch": 0.58, "grad_norm": 1.8407909117761543, "learning_rate": 2.2433905468962674e-07, "logits/chosen": -2.823650598526001, "logits/rejected": -2.796830177307129, "logps/chosen": -284.7647705078125, "logps/rejected": -241.35855102539062, "loss": 0.6685, "positive_losses": 0.028902243822813034, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16666273772716522, "rewards/margins": 0.07798048108816147, "rewards/margins_max": 0.211766317486763, "rewards/margins_min": -0.05723657086491585, "rewards/margins_std": 0.12001141160726547, "rewards/rejected": 0.08868227899074554, "step": 2420 }, { "dpo_losses": 0.6575535535812378, "epoch": 0.58, "grad_norm": 2.289626745264257, "learning_rate": 2.222610776619439e-07, "logits/chosen": -2.8279614448547363, "logits/rejected": -2.7801260948181152, "logps/chosen": -279.36480712890625, "logps/rejected": -233.5427703857422, "loss": 0.6655, "positive_losses": 0.048036955296993256, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.17082032561302185, "rewards/margins": 0.0779995247721672, "rewards/margins_max": 0.23888659477233887, "rewards/margins_min": -0.0617380365729332, "rewards/margins_std": 0.135210782289505, "rewards/rejected": 0.09282079339027405, "step": 2430 }, { "dpo_losses": 0.6530050039291382, "epoch": 0.58, "grad_norm": 1.5154182057044656, "learning_rate": 2.201850391674877e-07, "logits/chosen": -2.824777126312256, "logits/rejected": -2.771427869796753, "logps/chosen": -286.0675964355469, "logps/rejected": -233.46240234375, "loss": 0.6578, "positive_losses": 0.009254073724150658, "rewards/accuracies": 0.75, "rewards/chosen": 0.18660372495651245, "rewards/margins": 0.0876220315694809, "rewards/margins_max": 0.2265000343322754, "rewards/margins_min": -0.06819517910480499, "rewards/margins_std": 0.13173483312129974, "rewards/rejected": 0.09898170083761215, "step": 2440 }, { "dpo_losses": 0.6543854475021362, "epoch": 0.59, "grad_norm": 9.577977240015446, "learning_rate": 2.181110842901066e-07, "logits/chosen": -2.836677074432373, "logits/rejected": -2.7582130432128906, "logps/chosen": -258.3134765625, "logps/rejected": -227.0428924560547, "loss": 0.6604, "positive_losses": 0.05332932621240616, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1700987070798874, "rewards/margins": 0.08301262557506561, "rewards/margins_max": 0.219264954328537, "rewards/margins_min": -0.03777962550520897, "rewards/margins_std": 0.11611829698085785, "rewards/rejected": 0.08708608150482178, "step": 2450 }, { "dpo_losses": 0.6494167447090149, "epoch": 0.59, "grad_norm": 2.126523201247886, "learning_rate": 2.160393579680353e-07, "logits/chosen": -2.732412815093994, "logits/rejected": -2.7395479679107666, "logps/chosen": -248.01364135742188, "logps/rejected": -245.60861206054688, "loss": 0.6616, "positive_losses": 0.07407674938440323, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.15778644382953644, "rewards/margins": 0.09356477111577988, "rewards/margins_max": 0.23099227249622345, "rewards/margins_min": -0.04053952544927597, "rewards/margins_std": 0.12145686149597168, "rewards/rejected": 0.06422166526317596, "step": 2460 }, { "dpo_losses": 0.6603222489356995, "epoch": 0.59, "grad_norm": 9.262871936326716, "learning_rate": 2.1397000498376634e-07, "logits/chosen": -2.7953994274139404, "logits/rejected": -2.7104790210723877, "logps/chosen": -255.8928680419922, "logps/rejected": -242.5117950439453, "loss": 0.6545, "positive_losses": 0.0034935474395751953, "rewards/accuracies": 0.75, "rewards/chosen": 0.1603352129459381, "rewards/margins": 0.07073120772838593, "rewards/margins_max": 0.20764076709747314, "rewards/margins_min": -0.06356082856655121, "rewards/margins_std": 0.11766792833805084, "rewards/rejected": 0.08960400521755219, "step": 2470 }, { "dpo_losses": 0.659275472164154, "epoch": 0.59, "grad_norm": 1.753023192905648, "learning_rate": 2.1190316995393144e-07, "logits/chosen": -2.775113344192505, "logits/rejected": -2.6995749473571777, "logps/chosen": -244.80789184570312, "logps/rejected": -208.2993621826172, "loss": 0.6675, "positive_losses": 0.2187831848859787, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14790937304496765, "rewards/margins": 0.0731239765882492, "rewards/margins_max": 0.2214915007352829, "rewards/margins_min": -0.06255306303501129, "rewards/margins_std": 0.1280265748500824, "rewards/rejected": 0.07478538900613785, "step": 2480 }, { "dpo_losses": 0.6634795069694519, "epoch": 0.6, "grad_norm": 7.037144339265762, "learning_rate": 2.098389973191953e-07, "logits/chosen": -2.788374185562134, "logits/rejected": -2.7700562477111816, "logps/chosen": -280.07147216796875, "logps/rejected": -273.7430725097656, "loss": 0.669, "positive_losses": 0.02162933349609375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1584930419921875, "rewards/margins": 0.06381521373987198, "rewards/margins_max": 0.20791876316070557, "rewards/margins_min": -0.05560820549726486, "rewards/margins_std": 0.11947876214981079, "rewards/rejected": 0.09467782080173492, "step": 2490 }, { "dpo_losses": 0.6539306640625, "epoch": 0.6, "grad_norm": 2.537941083484269, "learning_rate": 2.0777763133416118e-07, "logits/chosen": -2.7996597290039062, "logits/rejected": -2.736781358718872, "logps/chosen": -249.8372802734375, "logps/rejected": -202.3240203857422, "loss": 0.6725, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.14725255966186523, "rewards/margins": 0.0831185132265091, "rewards/margins_max": 0.18639856576919556, "rewards/margins_min": -0.02192167565226555, "rewards/margins_std": 0.09190456569194794, "rewards/rejected": 0.06413406133651733, "step": 2500 }, { "epoch": 0.6, "eval_dpo_losses": 0.6645246148109436, "eval_logits/chosen": -2.750915765762329, "eval_logits/rejected": -2.716078519821167, "eval_logps/chosen": -269.5726623535156, "eval_logps/rejected": -257.26629638671875, "eval_loss": 0.690272331237793, "eval_positive_losses": 0.2239096611738205, "eval_rewards/accuracies": 0.6850000023841858, "eval_rewards/chosen": 0.14882688224315643, "eval_rewards/margins": 0.0629570409655571, "eval_rewards/margins_max": 0.2750893235206604, "eval_rewards/margins_min": -0.12012024968862534, "eval_rewards/margins_std": 0.13245534896850586, "eval_rewards/rejected": 0.08586984127759933, "eval_runtime": 858.9333, "eval_samples_per_second": 4.657, "eval_steps_per_second": 0.291, "step": 2500 }, { "dpo_losses": 0.6570712924003601, "epoch": 0.6, "grad_norm": 6.892540213571708, "learning_rate": 2.057192160572898e-07, "logits/chosen": -2.8030176162719727, "logits/rejected": -2.708225727081299, "logps/chosen": -253.5667724609375, "logps/rejected": -265.36260986328125, "loss": 0.6748, "positive_losses": 0.0893787369132042, "rewards/accuracies": 0.75, "rewards/chosen": 0.16482850909233093, "rewards/margins": 0.07902560383081436, "rewards/margins_max": 0.24097923934459686, "rewards/margins_min": -0.06541293859481812, "rewards/margins_std": 0.13768818974494934, "rewards/rejected": 0.08580291271209717, "step": 2510 }, { "dpo_losses": 0.6597687005996704, "epoch": 0.6, "grad_norm": 10.87930037436885, "learning_rate": 2.0366389534083185e-07, "logits/chosen": -2.791435718536377, "logits/rejected": -2.7525410652160645, "logps/chosen": -270.2708435058594, "logps/rejected": -237.8101043701172, "loss": 0.6759, "positive_losses": 0.13981685042381287, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1586829125881195, "rewards/margins": 0.07321233302354813, "rewards/margins_max": 0.24506375193595886, "rewards/margins_min": -0.05251021310687065, "rewards/margins_std": 0.12969766557216644, "rewards/rejected": 0.08547057956457138, "step": 2520 }, { "dpo_losses": 0.6528351306915283, "epoch": 0.61, "grad_norm": 5.925993682332683, "learning_rate": 2.0161181282077469e-07, "logits/chosen": -2.7651515007019043, "logits/rejected": -2.765259265899658, "logps/chosen": -226.3842010498047, "logps/rejected": -230.2400665283203, "loss": 0.661, "positive_losses": 0.03745307773351669, "rewards/accuracies": 0.8125, "rewards/chosen": 0.15903137624263763, "rewards/margins": 0.08609801530838013, "rewards/margins_max": 0.21778056025505066, "rewards/margins_min": -0.031518880277872086, "rewards/margins_std": 0.11304968595504761, "rewards/rejected": 0.07293335348367691, "step": 2530 }, { "dpo_losses": 0.6636265516281128, "epoch": 0.61, "grad_norm": 6.2337802230380746, "learning_rate": 1.9956311190680468e-07, "logits/chosen": -2.7611145973205566, "logits/rejected": -2.752441644668579, "logps/chosen": -238.9643096923828, "logps/rejected": -256.35284423828125, "loss": 0.7199, "positive_losses": 0.43493956327438354, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13593077659606934, "rewards/margins": 0.06421179324388504, "rewards/margins_max": 0.1931479275226593, "rewards/margins_min": -0.05680401995778084, "rewards/margins_std": 0.11524803936481476, "rewards/rejected": 0.07171899825334549, "step": 2540 }, { "dpo_losses": 0.6544219255447388, "epoch": 0.61, "grad_norm": 2.078128212374162, "learning_rate": 1.9751793577228455e-07, "logits/chosen": -2.8054041862487793, "logits/rejected": -2.812608003616333, "logps/chosen": -283.9731140136719, "logps/rejected": -262.6289367675781, "loss": 0.6635, "positive_losses": 0.0031076432205736637, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.16458013653755188, "rewards/margins": 0.08221413195133209, "rewards/margins_max": 0.18799148499965668, "rewards/margins_min": -0.03425773233175278, "rewards/margins_std": 0.09953446686267853, "rewards/rejected": 0.08236599713563919, "step": 2550 }, { "dpo_losses": 0.6561216115951538, "epoch": 0.61, "grad_norm": 2.085701346870496, "learning_rate": 1.9547642734424823e-07, "logits/chosen": -2.7269835472106934, "logits/rejected": -2.7549428939819336, "logps/chosen": -272.5962219238281, "logps/rejected": -262.6563415527344, "loss": 0.6694, "positive_losses": 0.09855242073535919, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16704121232032776, "rewards/margins": 0.08085381239652634, "rewards/margins_max": 0.2673949599266052, "rewards/margins_min": -0.07041596621274948, "rewards/margins_std": 0.14898711442947388, "rewards/rejected": 0.08618739247322083, "step": 2560 }, { "dpo_losses": 0.6638727784156799, "epoch": 0.62, "grad_norm": 8.94386161132097, "learning_rate": 1.9343872929341196e-07, "logits/chosen": -2.787778377532959, "logits/rejected": -2.764533758163452, "logps/chosen": -275.4049072265625, "logps/rejected": -264.90264892578125, "loss": 0.6722, "positive_losses": 0.16229248046875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17178210616111755, "rewards/margins": 0.06374069303274155, "rewards/margins_max": 0.20073875784873962, "rewards/margins_min": -0.0592566542327404, "rewards/margins_std": 0.11831053346395493, "rewards/rejected": 0.1080414205789566, "step": 2570 }, { "dpo_losses": 0.6463866829872131, "epoch": 0.62, "grad_norm": 2.177233336420491, "learning_rate": 1.9140498402420416e-07, "logits/chosen": -2.787079334259033, "logits/rejected": -2.7662193775177, "logps/chosen": -287.1393127441406, "logps/rejected": -280.51898193359375, "loss": 0.6551, "positive_losses": 0.01154251117259264, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.17731869220733643, "rewards/margins": 0.10072517395019531, "rewards/margins_max": 0.23263141512870789, "rewards/margins_min": -0.027431825175881386, "rewards/margins_std": 0.11355291306972504, "rewards/rejected": 0.07659353315830231, "step": 2580 }, { "dpo_losses": 0.673742949962616, "epoch": 0.62, "grad_norm": 2.1731822518724737, "learning_rate": 1.8937533366481308e-07, "logits/chosen": -2.7016515731811523, "logits/rejected": -2.7352073192596436, "logps/chosen": -245.2400665283203, "logps/rejected": -255.9029541015625, "loss": 0.6809, "positive_losses": 0.11197490990161896, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15035060048103333, "rewards/margins": 0.044650495052337646, "rewards/margins_max": 0.19705966114997864, "rewards/margins_min": -0.09131129086017609, "rewards/margins_std": 0.13300111889839172, "rewards/rejected": 0.10570009797811508, "step": 2590 }, { "dpo_losses": 0.6549166440963745, "epoch": 0.62, "grad_norm": 10.324305210336039, "learning_rate": 1.8734992005725463e-07, "logits/chosen": -2.628615617752075, "logits/rejected": -2.647005319595337, "logps/chosen": -277.8426818847656, "logps/rejected": -268.73052978515625, "loss": 0.6762, "positive_losses": 0.10020065307617188, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1601662039756775, "rewards/margins": 0.08263372629880905, "rewards/margins_max": 0.2190052568912506, "rewards/margins_min": -0.048892222344875336, "rewards/margins_std": 0.12265457957983017, "rewards/rejected": 0.07753246277570724, "step": 2600 }, { "epoch": 0.62, "eval_dpo_losses": 0.6655071377754211, "eval_logits/chosen": -2.7502520084381104, "eval_logits/rejected": -2.7153522968292236, "eval_logps/chosen": -268.3080749511719, "eval_logps/rejected": -255.77088928222656, "eval_loss": 0.6834039092063904, "eval_positive_losses": 0.1472235471010208, "eval_rewards/accuracies": 0.6759999990463257, "eval_rewards/chosen": 0.16147266328334808, "eval_rewards/margins": 0.06064898148179054, "eval_rewards/margins_max": 0.2671493589878082, "eval_rewards/margins_min": -0.1165996566414833, "eval_rewards/margins_std": 0.12867164611816406, "eval_rewards/rejected": 0.10082367807626724, "eval_runtime": 858.4654, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 2600 }, { "dpo_losses": 0.6597750186920166, "epoch": 0.62, "grad_norm": 2.4293846603998377, "learning_rate": 1.853288847474594e-07, "logits/chosen": -2.8134748935699463, "logits/rejected": -2.7702765464782715, "logps/chosen": -290.501953125, "logps/rejected": -260.6832580566406, "loss": 0.6715, "positive_losses": 0.1892099380493164, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1717822253704071, "rewards/margins": 0.07259855419397354, "rewards/margins_max": 0.19520805776119232, "rewards/margins_min": -0.06274349987506866, "rewards/margins_std": 0.11503554880619049, "rewards/rejected": 0.09918366372585297, "step": 2610 }, { "dpo_losses": 0.6685627698898315, "epoch": 0.63, "grad_norm": 1.7525103819317838, "learning_rate": 1.8331236897538065e-07, "logits/chosen": -2.7949442863464355, "logits/rejected": -2.7841122150421143, "logps/chosen": -262.8059997558594, "logps/rejected": -262.04962158203125, "loss": 0.6751, "positive_losses": 0.12178711593151093, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.15290528535842896, "rewards/margins": 0.05444132164120674, "rewards/margins_max": 0.20880337059497833, "rewards/margins_min": -0.07946772873401642, "rewards/margins_std": 0.12884338200092316, "rewards/rejected": 0.09846395254135132, "step": 2620 }, { "dpo_losses": 0.6574175953865051, "epoch": 0.63, "grad_norm": 1.7932863972181101, "learning_rate": 1.8130051366512447e-07, "logits/chosen": -2.8026461601257324, "logits/rejected": -2.7036142349243164, "logps/chosen": -275.385009765625, "logps/rejected": -285.0031433105469, "loss": 0.6678, "positive_losses": 0.030692482367157936, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.18601131439208984, "rewards/margins": 0.07919839769601822, "rewards/margins_max": 0.24791495501995087, "rewards/margins_min": -0.08262630552053452, "rewards/margins_std": 0.15186141431331635, "rewards/rejected": 0.10681291669607162, "step": 2630 }, { "dpo_losses": 0.6753314733505249, "epoch": 0.63, "grad_norm": 13.425589175233556, "learning_rate": 1.792934594151003e-07, "logits/chosen": -2.8474459648132324, "logits/rejected": -2.826383590698242, "logps/chosen": -250.7899169921875, "logps/rejected": -229.19515991210938, "loss": 0.6941, "positive_losses": 0.3285289704799652, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.157188281416893, "rewards/margins": 0.04010557755827904, "rewards/margins_max": 0.16838137805461884, "rewards/margins_min": -0.08384736627340317, "rewards/margins_std": 0.11231521517038345, "rewards/rejected": 0.11708269268274307, "step": 2640 }, { "dpo_losses": 0.6678264737129211, "epoch": 0.63, "grad_norm": 2.66657118825057, "learning_rate": 1.7729134648819605e-07, "logits/chosen": -2.723357915878296, "logits/rejected": -2.6746907234191895, "logps/chosen": -231.9086456298828, "logps/rejected": -221.47067260742188, "loss": 0.6809, "positive_losses": 0.16887111961841583, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.15525035560131073, "rewards/margins": 0.055454153567552567, "rewards/margins_max": 0.19224271178245544, "rewards/margins_min": -0.0811559185385704, "rewards/margins_std": 0.11972931772470474, "rewards/rejected": 0.09979619085788727, "step": 2650 }, { "dpo_losses": 0.6647213697433472, "epoch": 0.64, "grad_norm": 2.2029245704762586, "learning_rate": 1.7529431480197533e-07, "logits/chosen": -2.7750751972198486, "logits/rejected": -2.718794345855713, "logps/chosen": -258.7033386230469, "logps/rejected": -246.27542114257812, "loss": 0.6784, "positive_losses": 0.22020359337329865, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15270236134529114, "rewards/margins": 0.06201998144388199, "rewards/margins_max": 0.17691023647785187, "rewards/margins_min": -0.07669947296380997, "rewards/margins_std": 0.11549297720193863, "rewards/rejected": 0.09068240225315094, "step": 2660 }, { "dpo_losses": 0.650566816329956, "epoch": 0.64, "grad_norm": 1.9712727444630143, "learning_rate": 1.7330250391889961e-07, "logits/chosen": -2.816516160964966, "logits/rejected": -2.726344108581543, "logps/chosen": -262.97467041015625, "logps/rejected": -201.1381378173828, "loss": 0.6633, "positive_losses": 0.052294351160526276, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.1738344430923462, "rewards/margins": 0.09070488065481186, "rewards/margins_max": 0.2141878306865692, "rewards/margins_min": -0.007011578418314457, "rewards/margins_std": 0.09855584800243378, "rewards/rejected": 0.08312956243753433, "step": 2670 }, { "dpo_losses": 0.6785145401954651, "epoch": 0.64, "grad_norm": 13.520324666114657, "learning_rate": 1.713160530365747e-07, "logits/chosen": -2.847501516342163, "logits/rejected": -2.8199048042297363, "logps/chosen": -277.103759765625, "logps/rejected": -270.1268005371094, "loss": 0.6869, "positive_losses": 0.2016429901123047, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14925292134284973, "rewards/margins": 0.03510298207402229, "rewards/margins_max": 0.1887909471988678, "rewards/margins_min": -0.11368334293365479, "rewards/margins_std": 0.13686922192573547, "rewards/rejected": 0.11414994299411774, "step": 2680 }, { "dpo_losses": 0.6610275506973267, "epoch": 0.64, "grad_norm": 11.633184361710885, "learning_rate": 1.693351009780231e-07, "logits/chosen": -2.754254102706909, "logits/rejected": -2.705504894256592, "logps/chosen": -269.9366760253906, "logps/rejected": -237.3614959716797, "loss": 0.6736, "positive_losses": 0.18831901252269745, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1620502769947052, "rewards/margins": 0.0702347606420517, "rewards/margins_max": 0.21768136322498322, "rewards/margins_min": -0.08829845488071442, "rewards/margins_std": 0.13434329628944397, "rewards/rejected": 0.0918155387043953, "step": 2690 }, { "dpo_losses": 0.6716595888137817, "epoch": 0.65, "grad_norm": 2.2165204518598927, "learning_rate": 1.6735978618198215e-07, "logits/chosen": -2.820106029510498, "logits/rejected": -2.804626703262329, "logps/chosen": -234.778564453125, "logps/rejected": -271.12738037109375, "loss": 0.6867, "positive_losses": 0.30939459800720215, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.15337766706943512, "rewards/margins": 0.048464007675647736, "rewards/margins_max": 0.21193809807300568, "rewards/margins_min": -0.08744789659976959, "rewards/margins_std": 0.134555384516716, "rewards/rejected": 0.10491366684436798, "step": 2700 }, { "epoch": 0.65, "eval_dpo_losses": 0.6649074554443359, "eval_logits/chosen": -2.7554380893707275, "eval_logits/rejected": -2.720479965209961, "eval_logps/chosen": -268.40863037109375, "eval_logps/rejected": -256.0078125, "eval_loss": 0.6845869421958923, "eval_positive_losses": 0.16194257140159607, "eval_rewards/accuracies": 0.6819999814033508, "eval_rewards/chosen": 0.1604669690132141, "eval_rewards/margins": 0.06201242282986641, "eval_rewards/margins_max": 0.27081313729286194, "eval_rewards/margins_min": -0.1177782192826271, "eval_rewards/margins_std": 0.13037149608135223, "eval_rewards/rejected": 0.09845452755689621, "eval_runtime": 858.7363, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 2700 }, { "dpo_losses": 0.6524493098258972, "epoch": 0.65, "grad_norm": 9.087135662346988, "learning_rate": 1.6539024669322954e-07, "logits/chosen": -2.7725584506988525, "logits/rejected": -2.755258321762085, "logps/chosen": -270.5599365234375, "logps/rejected": -238.7698516845703, "loss": 0.6669, "positive_losses": 0.0752800926566124, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1785382181406021, "rewards/margins": 0.08856969326734543, "rewards/margins_max": 0.22904285788536072, "rewards/margins_min": -0.0418659932911396, "rewards/margins_std": 0.12301256507635117, "rewards/rejected": 0.08996850252151489, "step": 2710 }, { "dpo_losses": 0.6533845663070679, "epoch": 0.65, "grad_norm": 7.571305122294492, "learning_rate": 1.6342662015293584e-07, "logits/chosen": -2.748750686645508, "logits/rejected": -2.714124917984009, "logps/chosen": -288.3981018066406, "logps/rejected": -253.1321563720703, "loss": 0.6729, "positive_losses": 0.0, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.1680426448583603, "rewards/margins": 0.08537693321704865, "rewards/margins_max": 0.22501520812511444, "rewards/margins_min": -0.03487751632928848, "rewards/margins_std": 0.11437414586544037, "rewards/rejected": 0.08266572654247284, "step": 2720 }, { "dpo_losses": 0.6721110343933105, "epoch": 0.65, "grad_norm": 2.1244820356213263, "learning_rate": 1.6146904378904536e-07, "logits/chosen": -2.874756097793579, "logits/rejected": -2.8296453952789307, "logps/chosen": -319.38409423828125, "logps/rejected": -306.04425048828125, "loss": 0.6806, "positive_losses": 0.21563568711280823, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15064524114131927, "rewards/margins": 0.048041947185993195, "rewards/margins_max": 0.21451440453529358, "rewards/margins_min": -0.09635026752948761, "rewards/margins_std": 0.13609746098518372, "rewards/rejected": 0.10260329395532608, "step": 2730 }, { "dpo_losses": 0.6471971273422241, "epoch": 0.66, "grad_norm": 15.439963943360283, "learning_rate": 1.5951765440668635e-07, "logits/chosen": -2.817963123321533, "logits/rejected": -2.765235424041748, "logps/chosen": -290.27008056640625, "logps/rejected": -230.53457641601562, "loss": 0.6644, "positive_losses": 0.03470458835363388, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.18386444449424744, "rewards/margins": 0.10036935657262802, "rewards/margins_max": 0.297454833984375, "rewards/margins_min": -0.047707073390483856, "rewards/margins_std": 0.15264105796813965, "rewards/rejected": 0.08349509537220001, "step": 2740 }, { "dpo_losses": 0.6503263711929321, "epoch": 0.66, "grad_norm": 1.576561861822568, "learning_rate": 1.5757258837860998e-07, "logits/chosen": -2.7690486907958984, "logits/rejected": -2.7362446784973145, "logps/chosen": -263.1616516113281, "logps/rejected": -230.9752655029297, "loss": 0.6769, "positive_losses": 0.021961640566587448, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.15127944946289062, "rewards/margins": 0.09448641538619995, "rewards/margins_max": 0.26137998700141907, "rewards/margins_min": -0.043125610798597336, "rewards/margins_std": 0.1352236568927765, "rewards/rejected": 0.05679304152727127, "step": 2750 }, { "dpo_losses": 0.6629096269607544, "epoch": 0.66, "grad_norm": 5.601585143401323, "learning_rate": 1.5563398163566034e-07, "logits/chosen": -2.7755508422851562, "logits/rejected": -2.784515857696533, "logps/chosen": -243.64230346679688, "logps/rejected": -265.16436767578125, "loss": 0.6816, "positive_losses": 0.3364960253238678, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1605442464351654, "rewards/margins": 0.06648992002010345, "rewards/margins_max": 0.21271471679210663, "rewards/margins_min": -0.06558944284915924, "rewards/margins_std": 0.12276293337345123, "rewards/rejected": 0.09405432641506195, "step": 2760 }, { "dpo_losses": 0.6621657609939575, "epoch": 0.66, "grad_norm": 2.2569216122246707, "learning_rate": 1.5370196965727438e-07, "logits/chosen": -2.779705286026001, "logits/rejected": -2.7549209594726562, "logps/chosen": -253.5834197998047, "logps/rejected": -249.49179077148438, "loss": 0.6633, "positive_losses": 0.1435375213623047, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.15737812221050262, "rewards/margins": 0.06731993705034256, "rewards/margins_max": 0.21473431587219238, "rewards/margins_min": -0.05090221017599106, "rewards/margins_std": 0.12139612436294556, "rewards/rejected": 0.09005819261074066, "step": 2770 }, { "dpo_losses": 0.6633927226066589, "epoch": 0.67, "grad_norm": 7.0828510195014776, "learning_rate": 1.5177668746201454e-07, "logits/chosen": -2.744079828262329, "logits/rejected": -2.7635505199432373, "logps/chosen": -232.8404083251953, "logps/rejected": -246.7035675048828, "loss": 0.6754, "positive_losses": 0.24320650100708008, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1321360468864441, "rewards/margins": 0.06531360000371933, "rewards/margins_max": 0.22315183281898499, "rewards/margins_min": -0.07478601485490799, "rewards/margins_std": 0.12894102931022644, "rewards/rejected": 0.06682245433330536, "step": 2780 }, { "dpo_losses": 0.661398708820343, "epoch": 0.67, "grad_norm": 1.7129794264146385, "learning_rate": 1.4985826959813254e-07, "logits/chosen": -2.8454246520996094, "logits/rejected": -2.7848269939422607, "logps/chosen": -301.18707275390625, "logps/rejected": -285.4922180175781, "loss": 0.6698, "positive_losses": 0.2170257568359375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1601361781358719, "rewards/margins": 0.06998364627361298, "rewards/margins_max": 0.22184951603412628, "rewards/margins_min": -0.0861741453409195, "rewards/margins_std": 0.13536037504673004, "rewards/rejected": 0.09015252441167831, "step": 2790 }, { "dpo_losses": 0.6604946851730347, "epoch": 0.67, "grad_norm": 2.045496585683647, "learning_rate": 1.4794685013416674e-07, "logits/chosen": -2.7734482288360596, "logits/rejected": -2.730741500854492, "logps/chosen": -285.7689208984375, "logps/rejected": -260.18157958984375, "loss": 0.702, "positive_losses": 0.46672648191452026, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17319495975971222, "rewards/margins": 0.07141584157943726, "rewards/margins_max": 0.23294229805469513, "rewards/margins_min": -0.07332227379083633, "rewards/margins_std": 0.13758841156959534, "rewards/rejected": 0.10177910327911377, "step": 2800 }, { "epoch": 0.67, "eval_dpo_losses": 0.6650704741477966, "eval_logits/chosen": -2.750959634780884, "eval_logits/rejected": -2.7157351970672607, "eval_logps/chosen": -268.2217712402344, "eval_logps/rejected": -255.783203125, "eval_loss": 0.683647096157074, "eval_positive_losses": 0.15097393095493317, "eval_rewards/accuracies": 0.6815000176429749, "eval_rewards/chosen": 0.16233578324317932, "eval_rewards/margins": 0.061635181307792664, "eval_rewards/margins_max": 0.26973432302474976, "eval_rewards/margins_min": -0.11747419089078903, "eval_rewards/margins_std": 0.12986698746681213, "eval_rewards/rejected": 0.10070060938596725, "eval_runtime": 858.3859, "eval_samples_per_second": 4.66, "eval_steps_per_second": 0.291, "step": 2800 }, { "dpo_losses": 0.6606911420822144, "epoch": 0.67, "grad_norm": 7.833988027065342, "learning_rate": 1.460425626495725e-07, "logits/chosen": -2.779177188873291, "logits/rejected": -2.7455544471740723, "logps/chosen": -232.9375762939453, "logps/rejected": -222.8559112548828, "loss": 0.6728, "positive_losses": 0.06630849838256836, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16429878771305084, "rewards/margins": 0.06982554495334625, "rewards/margins_max": 0.21017050743103027, "rewards/margins_min": -0.05256318300962448, "rewards/margins_std": 0.11449243873357773, "rewards/rejected": 0.0944732278585434, "step": 2810 }, { "dpo_losses": 0.6522689461708069, "epoch": 0.68, "grad_norm": 2.331960910266662, "learning_rate": 1.4414554022538737e-07, "logits/chosen": -2.828850507736206, "logits/rejected": -2.7668731212615967, "logps/chosen": -271.706298828125, "logps/rejected": -229.43728637695312, "loss": 0.667, "positive_losses": 0.11637802422046661, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.16890832781791687, "rewards/margins": 0.08820970356464386, "rewards/margins_max": 0.20734378695487976, "rewards/margins_min": -0.019770972430706024, "rewards/margins_std": 0.10575555264949799, "rewards/rejected": 0.08069862425327301, "step": 2820 }, { "dpo_losses": 0.6641877889633179, "epoch": 0.68, "grad_norm": 10.016127361011925, "learning_rate": 1.4225591543493025e-07, "logits/chosen": -2.691718578338623, "logits/rejected": -2.67981219291687, "logps/chosen": -229.587890625, "logps/rejected": -272.73809814453125, "loss": 0.6685, "positive_losses": 0.07205601036548615, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15834155678749084, "rewards/margins": 0.06315115094184875, "rewards/margins_max": 0.19021062552928925, "rewards/margins_min": -0.07571481168270111, "rewards/margins_std": 0.11863082647323608, "rewards/rejected": 0.0951903909444809, "step": 2830 }, { "dpo_losses": 0.6632484197616577, "epoch": 0.68, "grad_norm": 2.0151148897268434, "learning_rate": 1.4037382033453698e-07, "logits/chosen": -2.786177158355713, "logits/rejected": -2.7792882919311523, "logps/chosen": -267.7104187011719, "logps/rejected": -266.3455810546875, "loss": 0.6694, "positive_losses": 0.1170806884765625, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.152303546667099, "rewards/margins": 0.06460903584957123, "rewards/margins_max": 0.19348660111427307, "rewards/margins_min": -0.05174566060304642, "rewards/margins_std": 0.10853198915719986, "rewards/rejected": 0.08769448846578598, "step": 2840 }, { "dpo_losses": 0.654760479927063, "epoch": 0.68, "grad_norm": 5.860049114867734, "learning_rate": 1.384993864543314e-07, "logits/chosen": -2.824385166168213, "logits/rejected": -2.796736717224121, "logps/chosen": -266.542724609375, "logps/rejected": -278.4881591796875, "loss": 0.6856, "positive_losses": 0.5247258543968201, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16717705130577087, "rewards/margins": 0.08463399112224579, "rewards/margins_max": 0.24452392756938934, "rewards/margins_min": -0.06163151189684868, "rewards/margins_std": 0.13722988963127136, "rewards/rejected": 0.08254307508468628, "step": 2850 }, { "dpo_losses": 0.6610405445098877, "epoch": 0.68, "grad_norm": 11.08962119195928, "learning_rate": 1.366327447890332e-07, "logits/chosen": -2.833289384841919, "logits/rejected": -2.786445140838623, "logps/chosen": -289.1764831542969, "logps/rejected": -278.06011962890625, "loss": 0.6806, "positive_losses": 0.23758526146411896, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1707238405942917, "rewards/margins": 0.06981154531240463, "rewards/margins_max": 0.21326616406440735, "rewards/margins_min": -0.04526565223932266, "rewards/margins_std": 0.11378525197505951, "rewards/rejected": 0.10091231018304825, "step": 2860 }, { "dpo_losses": 0.6578949689865112, "epoch": 0.69, "grad_norm": 8.011974432932549, "learning_rate": 1.3477402578880356e-07, "logits/chosen": -2.8602027893066406, "logits/rejected": -2.8112597465515137, "logps/chosen": -279.9748229980469, "logps/rejected": -276.86810302734375, "loss": 0.6707, "positive_losses": 0.004292297177016735, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1692042052745819, "rewards/margins": 0.07580138742923737, "rewards/margins_max": 0.18990425765514374, "rewards/margins_min": -0.05800133943557739, "rewards/margins_std": 0.11075906455516815, "rewards/rejected": 0.09340278804302216, "step": 2870 }, { "dpo_losses": 0.6489999890327454, "epoch": 0.69, "grad_norm": 2.3630275117206487, "learning_rate": 1.3292335935012854e-07, "logits/chosen": -2.813847780227661, "logits/rejected": -2.77685546875, "logps/chosen": -309.0907287597656, "logps/rejected": -264.6180114746094, "loss": 0.6797, "positive_losses": 0.03472786024212837, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1819867491722107, "rewards/margins": 0.09541799128055573, "rewards/margins_max": 0.23620197176933289, "rewards/margins_min": -0.06027257442474365, "rewards/margins_std": 0.13762632012367249, "rewards/rejected": 0.08656875044107437, "step": 2880 }, { "dpo_losses": 0.6594910621643066, "epoch": 0.69, "grad_norm": 1.9175520869694374, "learning_rate": 1.3108087480674166e-07, "logits/chosen": -2.7939932346343994, "logits/rejected": -2.7984633445739746, "logps/chosen": -297.43402099609375, "logps/rejected": -289.48724365234375, "loss": 0.6702, "positive_losses": 0.16554084420204163, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15832707285881042, "rewards/margins": 0.07349055260419846, "rewards/margins_max": 0.2264772653579712, "rewards/margins_min": -0.06588301062583923, "rewards/margins_std": 0.13147732615470886, "rewards/rejected": 0.08483649790287018, "step": 2890 }, { "dpo_losses": 0.6549899578094482, "epoch": 0.69, "grad_norm": 15.904209927959867, "learning_rate": 1.2924670092058465e-07, "logits/chosen": -2.82595157623291, "logits/rejected": -2.7797107696533203, "logps/chosen": -287.46923828125, "logps/rejected": -220.1902618408203, "loss": 0.6822, "positive_losses": 0.25711411237716675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17762033641338348, "rewards/margins": 0.08239112794399261, "rewards/margins_max": 0.22791901230812073, "rewards/margins_min": -0.031055575236678123, "rewards/margins_std": 0.11507829278707504, "rewards/rejected": 0.09522920101881027, "step": 2900 }, { "epoch": 0.69, "eval_dpo_losses": 0.6653212904930115, "eval_logits/chosen": -2.7553696632385254, "eval_logits/rejected": -2.7201449871063232, "eval_logps/chosen": -267.90948486328125, "eval_logps/rejected": -255.40750122070312, "eval_loss": 0.6817807555198669, "eval_positive_losses": 0.13121400773525238, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": 0.16545848548412323, "eval_rewards/margins": 0.06100109964609146, "eval_rewards/margins_max": 0.26687848567962646, "eval_rewards/margins_min": -0.11557187139987946, "eval_rewards/margins_std": 0.1282324343919754, "eval_rewards/rejected": 0.10445738583803177, "eval_runtime": 858.8036, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 2900 }, { "dpo_losses": 0.6643952131271362, "epoch": 0.7, "grad_norm": 10.601093279041436, "learning_rate": 1.2742096587280966e-07, "logits/chosen": -2.747917890548706, "logits/rejected": -2.698944568634033, "logps/chosen": -258.4048767089844, "logps/rejected": -226.254638671875, "loss": 0.6692, "positive_losses": 0.13101959228515625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15781119465827942, "rewards/margins": 0.06331338733434677, "rewards/margins_max": 0.20424716174602509, "rewards/margins_min": -0.06483958661556244, "rewards/margins_std": 0.12180174887180328, "rewards/rejected": 0.09449778497219086, "step": 2910 }, { "dpo_losses": 0.6580942273139954, "epoch": 0.7, "grad_norm": 2.144636196771822, "learning_rate": 1.2560379725482073e-07, "logits/chosen": -2.7960219383239746, "logits/rejected": -2.724987030029297, "logps/chosen": -274.41912841796875, "logps/rejected": -242.3802032470703, "loss": 0.6709, "positive_losses": 0.015929222106933594, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.18388552963733673, "rewards/margins": 0.07397058606147766, "rewards/margins_max": 0.18513531982898712, "rewards/margins_min": -0.020436633378267288, "rewards/margins_std": 0.09174038469791412, "rewards/rejected": 0.10991492122411728, "step": 2920 }, { "dpo_losses": 0.6571717262268066, "epoch": 0.7, "grad_norm": 4.611733043557385, "learning_rate": 1.237953220593579e-07, "logits/chosen": -2.8126466274261475, "logits/rejected": -2.7566933631896973, "logps/chosen": -288.8916320800781, "logps/rejected": -252.1869354248047, "loss": 0.6711, "positive_losses": 0.24074149131774902, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16893498599529266, "rewards/margins": 0.07975190877914429, "rewards/margins_max": 0.26337534189224243, "rewards/margins_min": -0.08362237364053726, "rewards/margins_std": 0.1537998616695404, "rewards/rejected": 0.08918308466672897, "step": 2930 }, { "dpo_losses": 0.6468284726142883, "epoch": 0.7, "grad_norm": 1.9236209041317107, "learning_rate": 1.2199566667162127e-07, "logits/chosen": -2.8290622234344482, "logits/rejected": -2.7499024868011475, "logps/chosen": -299.721923828125, "logps/rejected": -249.7019500732422, "loss": 0.6562, "positive_losses": 0.09929580986499786, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.18500009179115295, "rewards/margins": 0.09919735789299011, "rewards/margins_max": 0.24635128676891327, "rewards/margins_min": -0.028249531984329224, "rewards/margins_std": 0.12385032325983047, "rewards/rejected": 0.08580274134874344, "step": 2940 }, { "dpo_losses": 0.6482985019683838, "epoch": 0.71, "grad_norm": 1.9135335385962065, "learning_rate": 1.2020495686043924e-07, "logits/chosen": -2.785789728164673, "logits/rejected": -2.756655216217041, "logps/chosen": -283.24053955078125, "logps/rejected": -248.37136840820312, "loss": 0.6592, "positive_losses": 0.07631029933691025, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.18530094623565674, "rewards/margins": 0.09602082520723343, "rewards/margins_max": 0.22588662803173065, "rewards/margins_min": -0.035989873111248016, "rewards/margins_std": 0.11860401928424835, "rewards/rejected": 0.0892801284790039, "step": 2950 }, { "dpo_losses": 0.6583413481712341, "epoch": 0.71, "grad_norm": 10.627018994753126, "learning_rate": 1.1842331776947931e-07, "logits/chosen": -2.7828636169433594, "logits/rejected": -2.7469639778137207, "logps/chosen": -313.7954406738281, "logps/rejected": -240.46774291992188, "loss": 0.6734, "positive_losses": 0.044359780848026276, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.18144525587558746, "rewards/margins": 0.07484839111566544, "rewards/margins_max": 0.20202605426311493, "rewards/margins_min": -0.05801903456449509, "rewards/margins_std": 0.11518166214227676, "rewards/rejected": 0.10659684985876083, "step": 2960 }, { "dpo_losses": 0.6770136952400208, "epoch": 0.71, "grad_norm": 5.5382108692761385, "learning_rate": 1.1665087390850187e-07, "logits/chosen": -2.764281988143921, "logits/rejected": -2.7421581745147705, "logps/chosen": -200.07289123535156, "logps/rejected": -233.5911102294922, "loss": 0.6901, "positive_losses": 0.2501167356967926, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.13606631755828857, "rewards/margins": 0.03657250851392746, "rewards/margins_max": 0.1660461723804474, "rewards/margins_min": -0.10412979125976562, "rewards/margins_std": 0.123598612844944, "rewards/rejected": 0.09949380159378052, "step": 2970 }, { "dpo_losses": 0.6660959124565125, "epoch": 0.71, "grad_norm": 6.566283379743498, "learning_rate": 1.1488774914465918e-07, "logits/chosen": -2.7592780590057373, "logits/rejected": -2.7573442459106445, "logps/chosen": -240.81002807617188, "logps/rejected": -255.63919067382812, "loss": 0.684, "positive_losses": 0.3941512107849121, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.15164153277873993, "rewards/margins": 0.059640537947416306, "rewards/margins_max": 0.21047723293304443, "rewards/margins_min": -0.08080439269542694, "rewards/margins_std": 0.12847281992435455, "rewards/rejected": 0.09200098365545273, "step": 2980 }, { "dpo_losses": 0.657392144203186, "epoch": 0.72, "grad_norm": 7.23906646273904, "learning_rate": 1.1313406669383877e-07, "logits/chosen": -2.80047345161438, "logits/rejected": -2.7407608032226562, "logps/chosen": -305.2362060546875, "logps/rejected": -264.88677978515625, "loss": 0.6852, "positive_losses": 0.23414000868797302, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17754772305488586, "rewards/margins": 0.07927733659744263, "rewards/margins_max": 0.2645510137081146, "rewards/margins_min": -0.07542475312948227, "rewards/margins_std": 0.15335193276405334, "rewards/rejected": 0.09827037155628204, "step": 2990 }, { "dpo_losses": 0.654673159122467, "epoch": 0.72, "grad_norm": 7.018515486293925, "learning_rate": 1.1138994911205284e-07, "logits/chosen": -2.785823345184326, "logits/rejected": -2.7316832542419434, "logps/chosen": -276.02532958984375, "logps/rejected": -282.4020690917969, "loss": 0.6751, "positive_losses": 0.0956810936331749, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1865675151348114, "rewards/margins": 0.08264709264039993, "rewards/margins_max": 0.202007457613945, "rewards/margins_min": -0.04947680979967117, "rewards/margins_std": 0.10926423966884613, "rewards/rejected": 0.10392043739557266, "step": 3000 }, { "epoch": 0.72, "eval_dpo_losses": 0.6655716896057129, "eval_logits/chosen": -2.7546749114990234, "eval_logits/rejected": -2.7193422317504883, "eval_logps/chosen": -267.7156066894531, "eval_logps/rejected": -255.1547088623047, "eval_loss": 0.6809297204017639, "eval_positive_losses": 0.12347549200057983, "eval_rewards/accuracies": 0.6744999885559082, "eval_rewards/chosen": 0.16739727556705475, "eval_rewards/margins": 0.060411617159843445, "eval_rewards/margins_max": 0.2651466429233551, "eval_rewards/margins_min": -0.11444079875946045, "eval_rewards/margins_std": 0.1271669566631317, "eval_rewards/rejected": 0.10698564350605011, "eval_runtime": 859.1868, "eval_samples_per_second": 4.656, "eval_steps_per_second": 0.291, "step": 3000 }, { "dpo_losses": 0.6597100496292114, "epoch": 0.72, "grad_norm": 2.001110126436989, "learning_rate": 1.0965551828687297e-07, "logits/chosen": -2.797302722930908, "logits/rejected": -2.7398412227630615, "logps/chosen": -248.9597930908203, "logps/rejected": -276.11224365234375, "loss": 0.6646, "positive_losses": 0.01718597486615181, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.17887654900550842, "rewards/margins": 0.07441447675228119, "rewards/margins_max": 0.2463037520647049, "rewards/margins_min": -0.06546028703451157, "rewards/margins_std": 0.13987399637699127, "rewards/rejected": 0.10446204245090485, "step": 3010 }, { "dpo_losses": 0.6435840129852295, "epoch": 0.72, "grad_norm": 8.245358635370502, "learning_rate": 1.0793089542891229e-07, "logits/chosen": -2.7502543926239014, "logits/rejected": -2.709235668182373, "logps/chosen": -274.73974609375, "logps/rejected": -225.44778442382812, "loss": 0.6641, "positive_losses": 0.13151463866233826, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.20190437138080597, "rewards/margins": 0.1073942556977272, "rewards/margins_max": 0.2721753418445587, "rewards/margins_min": -0.042645473033189774, "rewards/margins_std": 0.13847844302654266, "rewards/rejected": 0.09451012313365936, "step": 3020 }, { "dpo_losses": 0.6518331170082092, "epoch": 0.73, "grad_norm": 2.0809378266695897, "learning_rate": 1.062162010633545e-07, "logits/chosen": -2.8175134658813477, "logits/rejected": -2.7701735496520996, "logps/chosen": -273.0970764160156, "logps/rejected": -227.7178497314453, "loss": 0.6695, "positive_losses": 0.022677231580018997, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.1792055070400238, "rewards/margins": 0.08813692629337311, "rewards/margins_max": 0.20955650508403778, "rewards/margins_min": -0.02520567737519741, "rewards/margins_std": 0.10836289077997208, "rewards/rejected": 0.0910685807466507, "step": 3030 }, { "dpo_losses": 0.6757034063339233, "epoch": 0.73, "grad_norm": 15.2754784042437, "learning_rate": 1.0451155502153138e-07, "logits/chosen": -2.8078954219818115, "logits/rejected": -2.7715048789978027, "logps/chosen": -270.4355773925781, "logps/rejected": -231.6807098388672, "loss": 0.682, "positive_losses": 0.1485191285610199, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.15472225844860077, "rewards/margins": 0.040431465953588486, "rewards/margins_max": 0.18580278754234314, "rewards/margins_min": -0.09027747809886932, "rewards/margins_std": 0.12469116598367691, "rewards/rejected": 0.11429079622030258, "step": 3040 }, { "dpo_losses": 0.6657760739326477, "epoch": 0.73, "grad_norm": 1.9090555526299762, "learning_rate": 1.028170764325479e-07, "logits/chosen": -2.814365863800049, "logits/rejected": -2.780466318130493, "logps/chosen": -297.16748046875, "logps/rejected": -262.21551513671875, "loss": 0.6788, "positive_losses": 0.3511480391025543, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.16765518486499786, "rewards/margins": 0.06150408461689949, "rewards/margins_max": 0.21796151995658875, "rewards/margins_min": -0.09834052622318268, "rewards/margins_std": 0.14194948971271515, "rewards/rejected": 0.10615108907222748, "step": 3050 }, { "dpo_losses": 0.6624652743339539, "epoch": 0.73, "grad_norm": 2.0990102933474075, "learning_rate": 1.0113288371495707e-07, "logits/chosen": -2.75932240486145, "logits/rejected": -2.757172107696533, "logps/chosen": -288.0514831542969, "logps/rejected": -256.4453125, "loss": 0.6723, "positive_losses": 0.1895163357257843, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16893397271633148, "rewards/margins": 0.06645186990499496, "rewards/margins_max": 0.19159531593322754, "rewards/margins_min": -0.05699806660413742, "rewards/margins_std": 0.11193323135375977, "rewards/rejected": 0.10248209536075592, "step": 3060 }, { "dpo_losses": 0.6589276790618896, "epoch": 0.74, "grad_norm": 6.011172065888057, "learning_rate": 9.945909456848434e-08, "logits/chosen": -2.7732791900634766, "logits/rejected": -2.740635871887207, "logps/chosen": -285.3351135253906, "logps/rejected": -231.7952423095703, "loss": 0.6692, "positive_losses": 0.1543678343296051, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16491903364658356, "rewards/margins": 0.07484950125217438, "rewards/margins_max": 0.2287200391292572, "rewards/margins_min": -0.05990969389677048, "rewards/margins_std": 0.12757554650306702, "rewards/rejected": 0.09006952494382858, "step": 3070 }, { "dpo_losses": 0.6531566381454468, "epoch": 0.74, "grad_norm": 2.369827661485451, "learning_rate": 9.779582596580203e-08, "logits/chosen": -2.6808223724365234, "logits/rejected": -2.676213026046753, "logps/chosen": -236.76889038085938, "logps/rejected": -233.95242309570312, "loss": 0.6692, "positive_losses": 0.15050411224365234, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.17240282893180847, "rewards/margins": 0.08699841052293777, "rewards/margins_max": 0.24397151172161102, "rewards/margins_min": -0.052401065826416016, "rewards/margins_std": 0.13221415877342224, "rewards/rejected": 0.0854044184088707, "step": 3080 }, { "dpo_losses": 0.6607550382614136, "epoch": 0.74, "grad_norm": 2.028365449214921, "learning_rate": 9.614319414435499e-08, "logits/chosen": -2.8383548259735107, "logits/rejected": -2.787454843521118, "logps/chosen": -264.7420959472656, "logps/rejected": -223.0021514892578, "loss": 0.6612, "positive_losses": 0.0984744057059288, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1703965812921524, "rewards/margins": 0.07087160646915436, "rewards/margins_max": 0.2242279052734375, "rewards/margins_min": -0.0724235326051712, "rewards/margins_std": 0.13259103894233704, "rewards/rejected": 0.09952497482299805, "step": 3090 }, { "dpo_losses": 0.6507196426391602, "epoch": 0.74, "grad_norm": 2.2012223489249645, "learning_rate": 9.450131459823688e-08, "logits/chosen": -2.8341426849365234, "logits/rejected": -2.807316541671753, "logps/chosen": -308.36175537109375, "logps/rejected": -257.3990478515625, "loss": 0.673, "positive_losses": 0.1587640792131424, "rewards/accuracies": 0.75, "rewards/chosen": 0.19114577770233154, "rewards/margins": 0.09191853553056717, "rewards/margins_max": 0.23594188690185547, "rewards/margins_min": -0.03346435725688934, "rewards/margins_std": 0.119049072265625, "rewards/rejected": 0.09922723472118378, "step": 3100 }, { "epoch": 0.74, "eval_dpo_losses": 0.6648415327072144, "eval_logits/chosen": -2.7563130855560303, "eval_logits/rejected": -2.721135139465332, "eval_logps/chosen": -268.0209655761719, "eval_logps/rejected": -255.63137817382812, "eval_loss": 0.682990312576294, "eval_positive_losses": 0.15230390429496765, "eval_rewards/accuracies": 0.6815000176429749, "eval_rewards/chosen": 0.16434340178966522, "eval_rewards/margins": 0.06212437152862549, "eval_rewards/margins_max": 0.2709335386753082, "eval_rewards/margins_min": -0.11677832901477814, "eval_rewards/margins_std": 0.13006359338760376, "eval_rewards/rejected": 0.10221902281045914, "eval_runtime": 859.2337, "eval_samples_per_second": 4.655, "eval_steps_per_second": 0.291, "step": 3100 }, { "dpo_losses": 0.6676645278930664, "epoch": 0.74, "grad_norm": 1.965889260096315, "learning_rate": 9.287030207011929e-08, "logits/chosen": -2.71087908744812, "logits/rejected": -2.7050774097442627, "logps/chosen": -262.36688232421875, "logps/rejected": -258.3808288574219, "loss": 0.6896, "positive_losses": 0.36590081453323364, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14394626021385193, "rewards/margins": 0.056908298283815384, "rewards/margins_max": 0.18129441142082214, "rewards/margins_min": -0.08656595647335052, "rewards/margins_std": 0.12000372260808945, "rewards/rejected": 0.08703794330358505, "step": 3110 }, { "dpo_losses": 0.6515650153160095, "epoch": 0.75, "grad_norm": 1.7814709041209777, "learning_rate": 9.125027054323256e-08, "logits/chosen": -2.7792680263519287, "logits/rejected": -2.7425482273101807, "logps/chosen": -305.2757263183594, "logps/rejected": -248.3201904296875, "loss": 0.6686, "positive_losses": 0.03846092149615288, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.18153542280197144, "rewards/margins": 0.0890083909034729, "rewards/margins_max": 0.23709633946418762, "rewards/margins_min": -0.04619797691702843, "rewards/margins_std": 0.12526734173297882, "rewards/rejected": 0.09252701699733734, "step": 3120 }, { "dpo_losses": 0.6614887118339539, "epoch": 0.75, "grad_norm": 11.965342382688467, "learning_rate": 8.964133323340081e-08, "logits/chosen": -2.7472243309020996, "logits/rejected": -2.6830875873565674, "logps/chosen": -217.2124786376953, "logps/rejected": -198.31399536132812, "loss": 0.6727, "positive_losses": 0.20338955521583557, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.16317789256572723, "rewards/margins": 0.0696457251906395, "rewards/margins_max": 0.20361462235450745, "rewards/margins_min": -0.06773094087839127, "rewards/margins_std": 0.12562128901481628, "rewards/rejected": 0.09353218227624893, "step": 3130 }, { "dpo_losses": 0.6547694206237793, "epoch": 0.75, "grad_norm": 1.7832066681106982, "learning_rate": 8.804360258112861e-08, "logits/chosen": -2.8714051246643066, "logits/rejected": -2.815396785736084, "logps/chosen": -265.19036865234375, "logps/rejected": -228.49331665039062, "loss": 0.6677, "positive_losses": 0.0858907699584961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17786194384098053, "rewards/margins": 0.0839531421661377, "rewards/margins_max": 0.24686464667320251, "rewards/margins_min": -0.07368039339780807, "rewards/margins_std": 0.14117637276649475, "rewards/rejected": 0.09390879422426224, "step": 3140 }, { "dpo_losses": 0.6508282423019409, "epoch": 0.75, "grad_norm": 2.255357932634391, "learning_rate": 8.645719024374446e-08, "logits/chosen": -2.8324344158172607, "logits/rejected": -2.770369529724121, "logps/chosen": -293.355712890625, "logps/rejected": -261.38018798828125, "loss": 0.6761, "positive_losses": 0.06520424038171768, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.18506333231925964, "rewards/margins": 0.09046249091625214, "rewards/margins_max": 0.2178497314453125, "rewards/margins_min": -0.031379975378513336, "rewards/margins_std": 0.1138070672750473, "rewards/rejected": 0.09460082650184631, "step": 3150 }, { "dpo_losses": 0.6555924415588379, "epoch": 0.76, "grad_norm": 2.0637931237893716, "learning_rate": 8.488220708759667e-08, "logits/chosen": -2.8385794162750244, "logits/rejected": -2.794734477996826, "logps/chosen": -300.38763427734375, "logps/rejected": -249.67520141601562, "loss": 0.678, "positive_losses": 0.3245214521884918, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16448244452476501, "rewards/margins": 0.08136534690856934, "rewards/margins_max": 0.2282770872116089, "rewards/margins_min": -0.05792509391903877, "rewards/margins_std": 0.12321660667657852, "rewards/rejected": 0.08311710506677628, "step": 3160 }, { "dpo_losses": 0.6678366661071777, "epoch": 0.76, "grad_norm": 12.625252598416719, "learning_rate": 8.331876318030585e-08, "logits/chosen": -2.7887542247772217, "logits/rejected": -2.756824493408203, "logps/chosen": -264.6676330566406, "logps/rejected": -243.1872100830078, "loss": 0.6909, "positive_losses": 0.024743270128965378, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16196869313716888, "rewards/margins": 0.05514339730143547, "rewards/margins_max": 0.18713268637657166, "rewards/margins_min": -0.08152800798416138, "rewards/margins_std": 0.11955104023218155, "rewards/rejected": 0.10682530701160431, "step": 3170 }, { "dpo_losses": 0.6596195101737976, "epoch": 0.76, "grad_norm": 1.9437352416039395, "learning_rate": 8.176696778307269e-08, "logits/chosen": -2.7708170413970947, "logits/rejected": -2.732281446456909, "logps/chosen": -281.07647705078125, "logps/rejected": -263.0594177246094, "loss": 0.6778, "positive_losses": 0.30154380202293396, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.17353418469429016, "rewards/margins": 0.07388520240783691, "rewards/margins_max": 0.22113916277885437, "rewards/margins_min": -0.07152248919010162, "rewards/margins_std": 0.12745937705039978, "rewards/rejected": 0.09964897483587265, "step": 3180 }, { "dpo_losses": 0.6572962999343872, "epoch": 0.76, "grad_norm": 2.1868814956381053, "learning_rate": 8.022692934304238e-08, "logits/chosen": -2.8038437366485596, "logits/rejected": -2.7211310863494873, "logps/chosen": -283.28985595703125, "logps/rejected": -245.0017852783203, "loss": 0.6718, "positive_losses": 0.30991917848587036, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.16886462271213531, "rewards/margins": 0.07783128321170807, "rewards/margins_max": 0.22361192107200623, "rewards/margins_min": -0.05608420819044113, "rewards/margins_std": 0.12367092072963715, "rewards/rejected": 0.09103331714868546, "step": 3190 }, { "dpo_losses": 0.66231369972229, "epoch": 0.77, "grad_norm": 7.502583376692241, "learning_rate": 7.869875548572588e-08, "logits/chosen": -2.788996696472168, "logits/rejected": -2.776718854904175, "logps/chosen": -238.0166473388672, "logps/rejected": -222.90567016601562, "loss": 0.6666, "positive_losses": 0.08055458217859268, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16978101432323456, "rewards/margins": 0.06810571253299713, "rewards/margins_max": 0.20775611698627472, "rewards/margins_min": -0.09373478591442108, "rewards/margins_std": 0.13699105381965637, "rewards/rejected": 0.10167531669139862, "step": 3200 }, { "epoch": 0.77, "eval_dpo_losses": 0.6653023362159729, "eval_logits/chosen": -2.7554423809051514, "eval_logits/rejected": -2.720195770263672, "eval_logps/chosen": -267.7304382324219, "eval_logps/rejected": -255.23439025878906, "eval_loss": 0.6818264126777649, "eval_positive_losses": 0.13811592757701874, "eval_rewards/accuracies": 0.6784999966621399, "eval_rewards/chosen": 0.1672487109899521, "eval_rewards/margins": 0.06105995178222656, "eval_rewards/margins_max": 0.26752763986587524, "eval_rewards/margins_min": -0.11574739217758179, "eval_rewards/margins_std": 0.12843702733516693, "eval_rewards/rejected": 0.10618878155946732, "eval_runtime": 858.7794, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 3200 }, { "dpo_losses": 0.6516947150230408, "epoch": 0.77, "grad_norm": 1.87124499384657, "learning_rate": 7.718255300747817e-08, "logits/chosen": -2.7647528648376465, "logits/rejected": -2.7416744232177734, "logps/chosen": -258.1357116699219, "logps/rejected": -268.9915466308594, "loss": 0.6607, "positive_losses": 0.09852103888988495, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.1894657462835312, "rewards/margins": 0.09015753120183945, "rewards/margins_max": 0.24885597825050354, "rewards/margins_min": -0.059423137456178665, "rewards/margins_std": 0.13656654953956604, "rewards/rejected": 0.09930822998285294, "step": 3210 }, { "dpo_losses": 0.6571955680847168, "epoch": 0.77, "grad_norm": 1.9948028152459722, "learning_rate": 7.567842786803502e-08, "logits/chosen": -2.7428343296051025, "logits/rejected": -2.6695494651794434, "logps/chosen": -267.6722717285156, "logps/rejected": -259.9129333496094, "loss": 0.6586, "positive_losses": 0.10708370059728622, "rewards/accuracies": 0.75, "rewards/chosen": 0.16514816880226135, "rewards/margins": 0.07745979726314545, "rewards/margins_max": 0.21773628890514374, "rewards/margins_min": -0.05754680186510086, "rewards/margins_std": 0.12350962311029434, "rewards/rejected": 0.0876883715391159, "step": 3220 }, { "dpo_losses": 0.6471143960952759, "epoch": 0.77, "grad_norm": 2.262482245781298, "learning_rate": 7.418648518310797e-08, "logits/chosen": -2.8083693981170654, "logits/rejected": -2.7582826614379883, "logps/chosen": -268.1528625488281, "logps/rejected": -208.77877807617188, "loss": 0.6756, "positive_losses": 0.07729168236255646, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.18160364031791687, "rewards/margins": 0.09957949817180634, "rewards/margins_max": 0.2602100968360901, "rewards/margins_min": -0.0481126643717289, "rewards/margins_std": 0.13958647847175598, "rewards/rejected": 0.08202414214611053, "step": 3230 }, { "dpo_losses": 0.6663921475410461, "epoch": 0.78, "grad_norm": 7.029573367808209, "learning_rate": 7.270682921703853e-08, "logits/chosen": -2.795095443725586, "logits/rejected": -2.7953317165374756, "logps/chosen": -272.14080810546875, "logps/rejected": -261.3625793457031, "loss": 0.6725, "positive_losses": 0.07107166945934296, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.16820578277111053, "rewards/margins": 0.059129320085048676, "rewards/margins_max": 0.2140970230102539, "rewards/margins_min": -0.07038327306509018, "rewards/margins_std": 0.127908393740654, "rewards/rejected": 0.10907645523548126, "step": 3240 }, { "dpo_losses": 0.6619508862495422, "epoch": 0.78, "grad_norm": 9.78013648376715, "learning_rate": 7.123956337551116e-08, "logits/chosen": -2.7573256492614746, "logits/rejected": -2.781247854232788, "logps/chosen": -260.58233642578125, "logps/rejected": -228.30691528320312, "loss": 0.7031, "positive_losses": 0.41934508085250854, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16436035931110382, "rewards/margins": 0.06838925182819366, "rewards/margins_max": 0.20600955188274384, "rewards/margins_min": -0.05949493125081062, "rewards/margins_std": 0.12117097526788712, "rewards/rejected": 0.09597107023000717, "step": 3250 }, { "dpo_losses": 0.6445857286453247, "epoch": 0.78, "grad_norm": 10.608811196359374, "learning_rate": 6.978479019832725e-08, "logits/chosen": -2.7441658973693848, "logits/rejected": -2.705112934112549, "logps/chosen": -303.57061767578125, "logps/rejected": -249.8936004638672, "loss": 0.6637, "positive_losses": 0.14329147338867188, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.18708881735801697, "rewards/margins": 0.10616060346364975, "rewards/margins_max": 0.27576643228530884, "rewards/margins_min": -0.048006825149059296, "rewards/margins_std": 0.14384113252162933, "rewards/rejected": 0.08092823624610901, "step": 3260 }, { "dpo_losses": 0.6633797287940979, "epoch": 0.78, "grad_norm": 2.030186653576009, "learning_rate": 6.83426113522389e-08, "logits/chosen": -2.7558770179748535, "logits/rejected": -2.7438063621520996, "logps/chosen": -273.0426330566406, "logps/rejected": -242.9607696533203, "loss": 0.6639, "positive_losses": 0.07001648098230362, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.18063107132911682, "rewards/margins": 0.06444104015827179, "rewards/margins_max": 0.21436145901679993, "rewards/margins_min": -0.05951087921857834, "rewards/margins_std": 0.12042567878961563, "rewards/rejected": 0.11619003117084503, "step": 3270 }, { "dpo_losses": 0.6671854257583618, "epoch": 0.79, "grad_norm": 6.306552918250457, "learning_rate": 6.691312762384396e-08, "logits/chosen": -2.8321080207824707, "logits/rejected": -2.781754732131958, "logps/chosen": -248.39242553710938, "logps/rejected": -214.80160522460938, "loss": 0.671, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15718095004558563, "rewards/margins": 0.05620662122964859, "rewards/margins_max": 0.1668313443660736, "rewards/margins_min": -0.048628367483615875, "rewards/margins_std": 0.09669866412878036, "rewards/rejected": 0.10097432136535645, "step": 3280 }, { "dpo_losses": 0.6620953679084778, "epoch": 0.79, "grad_norm": 2.402918106518768, "learning_rate": 6.54964389125428e-08, "logits/chosen": -2.7394638061523438, "logits/rejected": -2.7300403118133545, "logps/chosen": -244.8998260498047, "logps/rejected": -258.6817626953125, "loss": 0.6745, "positive_losses": 0.15004310011863708, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15411940217018127, "rewards/margins": 0.06749512255191803, "rewards/margins_max": 0.20470313727855682, "rewards/margins_min": -0.055118389427661896, "rewards/margins_std": 0.11774852126836777, "rewards/rejected": 0.08662425726652145, "step": 3290 }, { "dpo_losses": 0.6497889161109924, "epoch": 0.79, "grad_norm": 5.9691980004072915, "learning_rate": 6.409264422355642e-08, "logits/chosen": -2.8293371200561523, "logits/rejected": -2.819593667984009, "logps/chosen": -287.93634033203125, "logps/rejected": -277.17706298828125, "loss": 0.6619, "positive_losses": 0.0660938248038292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.19996556639671326, "rewards/margins": 0.09392669051885605, "rewards/margins_max": 0.24315014481544495, "rewards/margins_min": -0.045699674636125565, "rewards/margins_std": 0.12970185279846191, "rewards/rejected": 0.10603886842727661, "step": 3300 }, { "epoch": 0.79, "eval_dpo_losses": 0.6647235155105591, "eval_logits/chosen": -2.7558841705322266, "eval_logits/rejected": -2.720723867416382, "eval_logps/chosen": -267.9396057128906, "eval_logps/rejected": -255.57681274414062, "eval_loss": 0.682898998260498, "eval_positive_losses": 0.1523255556821823, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": 0.16515730321407318, "eval_rewards/margins": 0.06239260733127594, "eval_rewards/margins_max": 0.27165281772613525, "eval_rewards/margins_min": -0.11721807718276978, "eval_rewards/margins_std": 0.130425363779068, "eval_rewards/rejected": 0.10276471078395844, "eval_runtime": 859.5511, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 3300 }, { "dpo_losses": 0.6745742559432983, "epoch": 0.79, "grad_norm": 6.004081609345404, "learning_rate": 6.27018416610078e-08, "logits/chosen": -2.788038730621338, "logits/rejected": -2.7331957817077637, "logps/chosen": -233.0995330810547, "logps/rejected": -289.3941955566406, "loss": 0.6681, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.17546075582504272, "rewards/margins": 0.045064687728881836, "rewards/margins_max": 0.2105121910572052, "rewards/margins_min": -0.10920113325119019, "rewards/margins_std": 0.14138653874397278, "rewards/rejected": 0.1303960382938385, "step": 3310 }, { "dpo_losses": 0.6637552380561829, "epoch": 0.8, "grad_norm": 4.878801868504104, "learning_rate": 6.132412842106572e-08, "logits/chosen": -2.793503522872925, "logits/rejected": -2.7467846870422363, "logps/chosen": -287.07354736328125, "logps/rejected": -286.0504455566406, "loss": 0.6814, "positive_losses": 0.1265428513288498, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1680542528629303, "rewards/margins": 0.06597061455249786, "rewards/margins_max": 0.22266510128974915, "rewards/margins_min": -0.0885610431432724, "rewards/margins_std": 0.13998521864414215, "rewards/rejected": 0.10208363831043243, "step": 3320 }, { "dpo_losses": 0.6658666729927063, "epoch": 0.8, "grad_norm": 2.286510620446214, "learning_rate": 5.995960078515255e-08, "logits/chosen": -2.798668622970581, "logits/rejected": -2.7196779251098633, "logps/chosen": -270.7170715332031, "logps/rejected": -246.88095092773438, "loss": 0.6871, "positive_losses": 0.2687618136405945, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.15244479477405548, "rewards/margins": 0.060325391590595245, "rewards/margins_max": 0.20955657958984375, "rewards/margins_min": -0.0803038701415062, "rewards/margins_std": 0.13133302330970764, "rewards/rejected": 0.09211940318346024, "step": 3330 }, { "dpo_losses": 0.6605310440063477, "epoch": 0.8, "grad_norm": 2.0740728811924143, "learning_rate": 5.860835411321494e-08, "logits/chosen": -2.7353053092956543, "logits/rejected": -2.7072510719299316, "logps/chosen": -278.6685485839844, "logps/rejected": -268.1210021972656, "loss": 0.6686, "positive_losses": 0.055524446070194244, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.19139747321605682, "rewards/margins": 0.07464327663183212, "rewards/margins_max": 0.24491114914417267, "rewards/margins_min": -0.10009765625, "rewards/margins_std": 0.15474039316177368, "rewards/rejected": 0.1167541965842247, "step": 3340 }, { "dpo_losses": 0.6570427417755127, "epoch": 0.8, "grad_norm": 4.365068334370116, "learning_rate": 5.7270482837060455e-08, "logits/chosen": -2.8398659229278564, "logits/rejected": -2.8040809631347656, "logps/chosen": -284.28472900390625, "logps/rejected": -231.2967529296875, "loss": 0.6646, "positive_losses": 0.11738715320825577, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1786644607782364, "rewards/margins": 0.07911532372236252, "rewards/margins_max": 0.23751592636108398, "rewards/margins_min": -0.05232914164662361, "rewards/margins_std": 0.13107521831989288, "rewards/rejected": 0.09954912960529327, "step": 3350 }, { "dpo_losses": 0.6529143452644348, "epoch": 0.8, "grad_norm": 2.077914119719153, "learning_rate": 5.5946080453757425e-08, "logits/chosen": -2.7002339363098145, "logits/rejected": -2.6894619464874268, "logps/chosen": -283.50146484375, "logps/rejected": -255.17245483398438, "loss": 0.6607, "positive_losses": 0.025138091295957565, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.18657462298870087, "rewards/margins": 0.08805961161851883, "rewards/margins_max": 0.23779284954071045, "rewards/margins_min": -0.07201007008552551, "rewards/margins_std": 0.13704165816307068, "rewards/rejected": 0.09851501882076263, "step": 3360 }, { "dpo_losses": 0.6778891682624817, "epoch": 0.81, "grad_norm": 7.627089667471748, "learning_rate": 5.4635239519101706e-08, "logits/chosen": -2.7399086952209473, "logits/rejected": -2.770047426223755, "logps/chosen": -259.58905029296875, "logps/rejected": -283.8619689941406, "loss": 0.7118, "positive_losses": 0.30641278624534607, "rewards/accuracies": 0.5625, "rewards/chosen": 0.17575284838676453, "rewards/margins": 0.035719890147447586, "rewards/margins_max": 0.1801590621471405, "rewards/margins_min": -0.10961707681417465, "rewards/margins_std": 0.1309371292591095, "rewards/rejected": 0.14003296196460724, "step": 3370 }, { "dpo_losses": 0.650728702545166, "epoch": 0.81, "grad_norm": 14.077890341239376, "learning_rate": 5.333805164114744e-08, "logits/chosen": -2.761138439178467, "logits/rejected": -2.7056262493133545, "logps/chosen": -293.15606689453125, "logps/rejected": -272.4565124511719, "loss": 0.6526, "positive_losses": 0.026407623663544655, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.1890369951725006, "rewards/margins": 0.09052906185388565, "rewards/margins_max": 0.21877756714820862, "rewards/margins_min": -0.019002093002200127, "rewards/margins_std": 0.1078447550535202, "rewards/rejected": 0.09850792586803436, "step": 3380 }, { "dpo_losses": 0.6585914492607117, "epoch": 0.81, "grad_norm": 1.7141576276339023, "learning_rate": 5.205460747380588e-08, "logits/chosen": -2.8345866203308105, "logits/rejected": -2.8106422424316406, "logps/chosen": -250.3362579345703, "logps/rejected": -235.87380981445312, "loss": 0.676, "positive_losses": 0.05412711948156357, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16862516105175018, "rewards/margins": 0.07414807379245758, "rewards/margins_max": 0.19512033462524414, "rewards/margins_min": -0.05015040189027786, "rewards/margins_std": 0.10794240236282349, "rewards/rejected": 0.0944771021604538, "step": 3390 }, { "dpo_losses": 0.6596596837043762, "epoch": 0.81, "grad_norm": 2.182699986195155, "learning_rate": 5.0784996710509785e-08, "logits/chosen": -2.7778992652893066, "logits/rejected": -2.7633323669433594, "logps/chosen": -337.2443542480469, "logps/rejected": -317.1626892089844, "loss": 0.6752, "positive_losses": 0.030435943976044655, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.17162474989891052, "rewards/margins": 0.07284317910671234, "rewards/margins_max": 0.21011486649513245, "rewards/margins_min": -0.046968333423137665, "rewards/margins_std": 0.11391538381576538, "rewards/rejected": 0.09878159314393997, "step": 3400 }, { "epoch": 0.81, "eval_dpo_losses": 0.6646923422813416, "eval_logits/chosen": -2.754830837249756, "eval_logits/rejected": -2.719656229019165, "eval_logps/chosen": -267.9222412109375, "eval_logps/rejected": -255.56700134277344, "eval_loss": 0.6829546093940735, "eval_positive_losses": 0.1529531627893448, "eval_rewards/accuracies": 0.6804999709129333, "eval_rewards/chosen": 0.16533102095127106, "eval_rewards/margins": 0.06246813386678696, "eval_rewards/margins_max": 0.27184244990348816, "eval_rewards/margins_min": -0.11774192005395889, "eval_rewards/margins_std": 0.130555659532547, "eval_rewards/rejected": 0.1028628870844841, "eval_runtime": 859.0025, "eval_samples_per_second": 4.657, "eval_steps_per_second": 0.291, "step": 3400 }, { "dpo_losses": 0.6619130969047546, "epoch": 0.82, "grad_norm": 2.063810683432068, "learning_rate": 4.952930807794503e-08, "logits/chosen": -2.7935609817504883, "logits/rejected": -2.765453815460205, "logps/chosen": -244.87973022460938, "logps/rejected": -263.1246032714844, "loss": 0.6702, "positive_losses": 0.04901885986328125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.16113756597042084, "rewards/margins": 0.07058267295360565, "rewards/margins_max": 0.24181148409843445, "rewards/margins_min": -0.08621235191822052, "rewards/margins_std": 0.14885516464710236, "rewards/rejected": 0.09055489301681519, "step": 3410 }, { "dpo_losses": 0.6469432711601257, "epoch": 0.82, "grad_norm": 2.525466339557661, "learning_rate": 4.828762932985009e-08, "logits/chosen": -2.8164350986480713, "logits/rejected": -2.7488341331481934, "logps/chosen": -280.8096618652344, "logps/rejected": -238.9579315185547, "loss": 0.659, "positive_losses": 0.1910697966814041, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.17682726681232452, "rewards/margins": 0.09931819885969162, "rewards/margins_max": 0.23799188435077667, "rewards/margins_min": -0.04032497853040695, "rewards/margins_std": 0.12361109256744385, "rewards/rejected": 0.07750906050205231, "step": 3420 }, { "dpo_losses": 0.6753236055374146, "epoch": 0.82, "grad_norm": 19.922787969276538, "learning_rate": 4.706004724088328e-08, "logits/chosen": -2.7445521354675293, "logits/rejected": -2.6637704372406006, "logps/chosen": -282.76556396484375, "logps/rejected": -281.15814208984375, "loss": 0.6862, "positive_losses": 0.1970151662826538, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.154363751411438, "rewards/margins": 0.039429426193237305, "rewards/margins_max": 0.15408742427825928, "rewards/margins_min": -0.0703122615814209, "rewards/margins_std": 0.0997403934597969, "rewards/rejected": 0.11493434756994247, "step": 3430 }, { "dpo_losses": 0.6575914621353149, "epoch": 0.82, "grad_norm": 9.301307865183782, "learning_rate": 4.584664760055881e-08, "logits/chosen": -2.806864023208618, "logits/rejected": -2.7848546504974365, "logps/chosen": -221.9463348388672, "logps/rejected": -206.96664428710938, "loss": 0.6656, "positive_losses": 0.1111370101571083, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17141377925872803, "rewards/margins": 0.07779663801193237, "rewards/margins_max": 0.23745279014110565, "rewards/margins_min": -0.06254319846630096, "rewards/margins_std": 0.13202930986881256, "rewards/rejected": 0.09361713379621506, "step": 3440 }, { "dpo_losses": 0.6604411005973816, "epoch": 0.83, "grad_norm": 6.567660963556703, "learning_rate": 4.4647515207250934e-08, "logits/chosen": -2.8708627223968506, "logits/rejected": -2.8254528045654297, "logps/chosen": -282.6668395996094, "logps/rejected": -257.4250183105469, "loss": 0.6634, "positive_losses": 0.015377998352050781, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17459583282470703, "rewards/margins": 0.07071644067764282, "rewards/margins_max": 0.1941402405500412, "rewards/margins_min": -0.048468492925167084, "rewards/margins_std": 0.1119454950094223, "rewards/rejected": 0.10387939214706421, "step": 3450 }, { "dpo_losses": 0.6608849763870239, "epoch": 0.83, "grad_norm": 2.739689659308384, "learning_rate": 4.346273386226812e-08, "logits/chosen": -2.7588064670562744, "logits/rejected": -2.7567520141601562, "logps/chosen": -285.0992126464844, "logps/rejected": -260.522216796875, "loss": 0.6752, "positive_losses": 0.0028160095680505037, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.18937823176383972, "rewards/margins": 0.07493998855352402, "rewards/margins_max": 0.2832399308681488, "rewards/margins_min": -0.07329995930194855, "rewards/margins_std": 0.15775929391384125, "rewards/rejected": 0.1144382506608963, "step": 3460 }, { "dpo_losses": 0.658972978591919, "epoch": 0.83, "grad_norm": 12.55314325735385, "learning_rate": 4.2292386363996484e-08, "logits/chosen": -2.796144723892212, "logits/rejected": -2.7560811042785645, "logps/chosen": -286.2737121582031, "logps/rejected": -258.2828369140625, "loss": 0.6724, "positive_losses": 0.23861780762672424, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.17371580004692078, "rewards/margins": 0.07555990666151047, "rewards/margins_max": 0.24161669611930847, "rewards/margins_min": -0.08606896549463272, "rewards/margins_std": 0.14473329484462738, "rewards/rejected": 0.09815588593482971, "step": 3470 }, { "dpo_losses": 0.6670488715171814, "epoch": 0.83, "grad_norm": 14.932329759481554, "learning_rate": 4.1136554502113676e-08, "logits/chosen": -2.7665271759033203, "logits/rejected": -2.786179304122925, "logps/chosen": -255.95166015625, "logps/rejected": -282.81695556640625, "loss": 0.6872, "positive_losses": 0.29130443930625916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.16322802007198334, "rewards/margins": 0.05973082035779953, "rewards/margins_max": 0.2496376931667328, "rewards/margins_min": -0.11830757558345795, "rewards/margins_std": 0.1635482758283615, "rewards/rejected": 0.1034972071647644, "step": 3480 }, { "dpo_losses": 0.6547514796257019, "epoch": 0.84, "grad_norm": 10.001505929089348, "learning_rate": 3.999531905187256e-08, "logits/chosen": -2.801217555999756, "logits/rejected": -2.7586214542388916, "logps/chosen": -287.6464538574219, "logps/rejected": -272.67523193359375, "loss": 0.6817, "positive_losses": 0.18505516648292542, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.18473398685455322, "rewards/margins": 0.08431489765644073, "rewards/margins_max": 0.24541649222373962, "rewards/margins_min": -0.08374623954296112, "rewards/margins_std": 0.1444653570652008, "rewards/rejected": 0.10041908919811249, "step": 3490 }, { "dpo_losses": 0.6568532586097717, "epoch": 0.84, "grad_norm": 16.19139164278593, "learning_rate": 3.886875976845661e-08, "logits/chosen": -2.8865325450897217, "logits/rejected": -2.8390843868255615, "logps/chosen": -298.00909423828125, "logps/rejected": -264.50115966796875, "loss": 0.6711, "positive_losses": 0.03290538862347603, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.18251189589500427, "rewards/margins": 0.0800279974937439, "rewards/margins_max": 0.22156575322151184, "rewards/margins_min": -0.05654965713620186, "rewards/margins_std": 0.12360548973083496, "rewards/rejected": 0.10248388350009918, "step": 3500 }, { "epoch": 0.84, "eval_dpo_losses": 0.6643189787864685, "eval_logits/chosen": -2.7540204524993896, "eval_logits/rejected": -2.718820571899414, "eval_logps/chosen": -268.11956787109375, "eval_logps/rejected": -255.84933471679688, "eval_loss": 0.6840823292732239, "eval_positive_losses": 0.16633722186088562, "eval_rewards/accuracies": 0.6794999837875366, "eval_rewards/chosen": 0.1633576601743698, "eval_rewards/margins": 0.06331835687160492, "eval_rewards/margins_max": 0.2740486264228821, "eval_rewards/margins_min": -0.11830957233905792, "eval_rewards/margins_std": 0.13166756927967072, "eval_rewards/rejected": 0.1000392958521843, "eval_runtime": 858.8936, "eval_samples_per_second": 4.657, "eval_steps_per_second": 0.291, "step": 3500 }, { "dpo_losses": 0.6626294255256653, "epoch": 0.84, "grad_norm": 8.432558322120105, "learning_rate": 3.775695538140608e-08, "logits/chosen": -2.7874884605407715, "logits/rejected": -2.7325947284698486, "logps/chosen": -236.6802215576172, "logps/rejected": -202.62847900390625, "loss": 0.6742, "positive_losses": 0.16564521193504333, "rewards/accuracies": 0.625, "rewards/chosen": 0.16089025139808655, "rewards/margins": 0.06757035851478577, "rewards/margins_max": 0.23646171391010284, "rewards/margins_min": -0.08172665536403656, "rewards/margins_std": 0.14006316661834717, "rewards/rejected": 0.09331991523504257, "step": 3510 }, { "dpo_losses": 0.6668065786361694, "epoch": 0.84, "grad_norm": 2.0955575001827715, "learning_rate": 3.665998358911593e-08, "logits/chosen": -2.8080132007598877, "logits/rejected": -2.7307779788970947, "logps/chosen": -235.3622589111328, "logps/rejected": -242.8473663330078, "loss": 0.6644, "positive_losses": 0.003895378205925226, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.1626807451248169, "rewards/margins": 0.06152535229921341, "rewards/margins_max": 0.2067304104566574, "rewards/margins_min": -0.0905807763338089, "rewards/margins_std": 0.13365648686885834, "rewards/rejected": 0.10115540027618408, "step": 3520 }, { "dpo_losses": 0.6618870496749878, "epoch": 0.85, "grad_norm": 17.382889747019373, "learning_rate": 3.557792105340621e-08, "logits/chosen": -2.8555986881256104, "logits/rejected": -2.809135675430298, "logps/chosen": -265.1983642578125, "logps/rejected": -248.84963989257812, "loss": 0.684, "positive_losses": 0.14107456803321838, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1621045172214508, "rewards/margins": 0.06803744286298752, "rewards/margins_max": 0.21972200274467468, "rewards/margins_min": -0.05503816157579422, "rewards/margins_std": 0.1268104910850525, "rewards/rejected": 0.09406708180904388, "step": 3530 }, { "dpo_losses": 0.649704098701477, "epoch": 0.85, "grad_norm": 12.604373814585653, "learning_rate": 3.4510843394163966e-08, "logits/chosen": -2.788520574569702, "logits/rejected": -2.7774863243103027, "logps/chosen": -275.66522216796875, "logps/rejected": -278.797607421875, "loss": 0.6762, "positive_losses": 0.09154434502124786, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.1750548630952835, "rewards/margins": 0.09439029544591904, "rewards/margins_max": 0.25461068749427795, "rewards/margins_min": -0.05751841515302658, "rewards/margins_std": 0.13798056542873383, "rewards/rejected": 0.08066456019878387, "step": 3540 }, { "dpo_losses": 0.6588909029960632, "epoch": 0.85, "grad_norm": 20.782780870540176, "learning_rate": 3.345882518405918e-08, "logits/chosen": -2.781425952911377, "logits/rejected": -2.8115956783294678, "logps/chosen": -218.76547241210938, "logps/rejected": -238.454833984375, "loss": 0.6687, "positive_losses": 0.06881485134363174, "rewards/accuracies": 0.75, "rewards/chosen": 0.16983821988105774, "rewards/margins": 0.0750776007771492, "rewards/margins_max": 0.23094777762889862, "rewards/margins_min": -0.08202357590198517, "rewards/margins_std": 0.13859549164772034, "rewards/rejected": 0.09476064145565033, "step": 3550 }, { "dpo_losses": 0.6595170497894287, "epoch": 0.85, "grad_norm": 12.6678868425505, "learning_rate": 3.242193994333278e-08, "logits/chosen": -2.756159543991089, "logits/rejected": -2.718527317047119, "logps/chosen": -240.20303344726562, "logps/rejected": -224.70687866210938, "loss": 0.6802, "positive_losses": 0.3063264787197113, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1580447107553482, "rewards/margins": 0.07447342574596405, "rewards/margins_max": 0.2679930627346039, "rewards/margins_min": -0.07119562476873398, "rewards/margins_std": 0.15105770528316498, "rewards/rejected": 0.08357128500938416, "step": 3560 }, { "dpo_losses": 0.6530089378356934, "epoch": 0.85, "grad_norm": 7.258213783561412, "learning_rate": 3.14002601346591e-08, "logits/chosen": -2.7348289489746094, "logits/rejected": -2.7819461822509766, "logps/chosen": -275.82904052734375, "logps/rejected": -280.7950744628906, "loss": 0.6593, "positive_losses": 0.007923793978989124, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17459799349308014, "rewards/margins": 0.08610548079013824, "rewards/margins_max": 0.2145996391773224, "rewards/margins_min": -0.036080751568078995, "rewards/margins_std": 0.11529894918203354, "rewards/rejected": 0.0884925127029419, "step": 3570 }, { "dpo_losses": 0.648635745048523, "epoch": 0.86, "grad_norm": 2.233130154928028, "learning_rate": 3.039385715808121e-08, "logits/chosen": -2.799232006072998, "logits/rejected": -2.7276079654693604, "logps/chosen": -236.6031494140625, "logps/rejected": -200.833251953125, "loss": 0.6648, "positive_losses": 0.060898590832948685, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1780189722776413, "rewards/margins": 0.09477417171001434, "rewards/margins_max": 0.21943345665931702, "rewards/margins_min": -0.011336622759699821, "rewards/margins_std": 0.10491341352462769, "rewards/rejected": 0.08324481546878815, "step": 3580 }, { "dpo_losses": 0.6598198413848877, "epoch": 0.86, "grad_norm": 14.533070702469205, "learning_rate": 2.9402801346021937e-08, "logits/chosen": -2.8372209072113037, "logits/rejected": -2.7489466667175293, "logps/chosen": -312.5776672363281, "logps/rejected": -261.6145324707031, "loss": 0.6758, "positive_losses": 0.17368511855602264, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1633022129535675, "rewards/margins": 0.07370231300592422, "rewards/margins_max": 0.24105291068553925, "rewards/margins_min": -0.08412280678749084, "rewards/margins_std": 0.14122509956359863, "rewards/rejected": 0.08959989994764328, "step": 3590 }, { "dpo_losses": 0.6520019769668579, "epoch": 0.86, "grad_norm": 2.0247200101210945, "learning_rate": 2.8427161958368002e-08, "logits/chosen": -2.7505455017089844, "logits/rejected": -2.695969820022583, "logps/chosen": -279.2154846191406, "logps/rejected": -242.13021850585938, "loss": 0.669, "positive_losses": 0.24303893744945526, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.16600492596626282, "rewards/margins": 0.08897914737462997, "rewards/margins_max": 0.24220676720142365, "rewards/margins_min": -0.03847798705101013, "rewards/margins_std": 0.1246912032365799, "rewards/rejected": 0.07702575623989105, "step": 3600 }, { "epoch": 0.86, "eval_dpo_losses": 0.6641672253608704, "eval_logits/chosen": -2.753295660018921, "eval_logits/rejected": -2.7180283069610596, "eval_logps/chosen": -268.17059326171875, "eval_logps/rejected": -255.9365692138672, "eval_loss": 0.6843318939208984, "eval_positive_losses": 0.16891872882843018, "eval_rewards/accuracies": 0.6815000176429749, "eval_rewards/chosen": 0.16284741461277008, "eval_rewards/margins": 0.063680499792099, "eval_rewards/margins_max": 0.27550214529037476, "eval_rewards/margins_min": -0.11900355666875839, "eval_rewards/margins_std": 0.13232021033763885, "eval_rewards/rejected": 0.09916691482067108, "eval_runtime": 859.3513, "eval_samples_per_second": 4.655, "eval_steps_per_second": 0.291, "step": 3600 }, { "dpo_losses": 0.6588376760482788, "epoch": 0.86, "grad_norm": 6.968489432488445, "learning_rate": 2.7467007177630174e-08, "logits/chosen": -2.8288702964782715, "logits/rejected": -2.8104233741760254, "logps/chosen": -301.7097473144531, "logps/rejected": -300.7554016113281, "loss": 0.6567, "positive_losses": 0.007707786746323109, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1795564591884613, "rewards/margins": 0.07449166476726532, "rewards/margins_max": 0.21466434001922607, "rewards/margins_min": -0.05861250311136246, "rewards/margins_std": 0.1232379674911499, "rewards/rejected": 0.10506479442119598, "step": 3610 }, { "dpo_losses": 0.6570429801940918, "epoch": 0.87, "grad_norm": 15.692109787097225, "learning_rate": 2.652240410417819e-08, "logits/chosen": -2.838653564453125, "logits/rejected": -2.7633767127990723, "logps/chosen": -283.2077331542969, "logps/rejected": -239.9818572998047, "loss": 0.6766, "positive_losses": 0.09850625693798065, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1679084450006485, "rewards/margins": 0.07771757990121841, "rewards/margins_max": 0.2042878121137619, "rewards/margins_min": -0.054166387766599655, "rewards/margins_std": 0.11388619244098663, "rewards/rejected": 0.09019087255001068, "step": 3620 }, { "dpo_losses": 0.6388376951217651, "epoch": 0.87, "grad_norm": 12.224098693449243, "learning_rate": 2.5593418751551437e-08, "logits/chosen": -2.8263235092163086, "logits/rejected": -2.782092809677124, "logps/chosen": -308.6744079589844, "logps/rejected": -241.56777954101562, "loss": 0.6521, "positive_losses": 0.07272262871265411, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.19764626026153564, "rewards/margins": 0.11764277517795563, "rewards/margins_max": 0.2843915820121765, "rewards/margins_min": -0.043582603335380554, "rewards/margins_std": 0.14907525479793549, "rewards/rejected": 0.08000347763299942, "step": 3630 }, { "dpo_losses": 0.6518778800964355, "epoch": 0.87, "grad_norm": 2.4837830036122726, "learning_rate": 2.4680116041845834e-08, "logits/chosen": -2.739654779434204, "logits/rejected": -2.736675500869751, "logps/chosen": -261.0301818847656, "logps/rejected": -258.49029541015625, "loss": 0.6719, "positive_losses": 0.11439085006713867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17289122939109802, "rewards/margins": 0.08961168676614761, "rewards/margins_max": 0.2415582239627838, "rewards/margins_min": -0.05575419217348099, "rewards/margins_std": 0.1310141384601593, "rewards/rejected": 0.08327953517436981, "step": 3640 }, { "dpo_losses": 0.656845211982727, "epoch": 0.87, "grad_norm": 7.709580384152372, "learning_rate": 2.3782559801176354e-08, "logits/chosen": -2.783247232437134, "logits/rejected": -2.750171661376953, "logps/chosen": -288.55889892578125, "logps/rejected": -296.3675537109375, "loss": 0.6701, "positive_losses": 0.19040927290916443, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.17641529440879822, "rewards/margins": 0.07857222855091095, "rewards/margins_max": 0.19827166199684143, "rewards/margins_min": -0.04177452623844147, "rewards/margins_std": 0.10636365413665771, "rewards/rejected": 0.09784306585788727, "step": 3650 }, { "dpo_losses": 0.6684629321098328, "epoch": 0.88, "grad_norm": 12.984344445662005, "learning_rate": 2.290081275521688e-08, "logits/chosen": -2.7253103256225586, "logits/rejected": -2.7259607315063477, "logps/chosen": -235.94607543945312, "logps/rejected": -228.1432647705078, "loss": 0.6885, "positive_losses": 0.07912597805261612, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15082618594169617, "rewards/margins": 0.05418941378593445, "rewards/margins_max": 0.18712875247001648, "rewards/margins_min": -0.057570330798625946, "rewards/margins_std": 0.11016597598791122, "rewards/rejected": 0.09663675725460052, "step": 3660 }, { "dpo_losses": 0.6677729487419128, "epoch": 0.88, "grad_norm": 11.661620511075462, "learning_rate": 2.2034936524816388e-08, "logits/chosen": -2.7403669357299805, "logits/rejected": -2.760712146759033, "logps/chosen": -283.58770751953125, "logps/rejected": -312.1358337402344, "loss": 0.683, "positive_losses": 0.17639903724193573, "rewards/accuracies": 0.625, "rewards/chosen": 0.15898045897483826, "rewards/margins": 0.05646269768476486, "rewards/margins_max": 0.1974315196275711, "rewards/margins_min": -0.07084940373897552, "rewards/margins_std": 0.11944758892059326, "rewards/rejected": 0.1025177389383316, "step": 3670 }, { "dpo_losses": 0.6413763761520386, "epoch": 0.88, "grad_norm": 10.296938720876431, "learning_rate": 2.118499162169285e-08, "logits/chosen": -2.7922329902648926, "logits/rejected": -2.732243299484253, "logps/chosen": -342.88153076171875, "logps/rejected": -266.6952209472656, "loss": 0.6681, "positive_losses": 0.16939429938793182, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.19493433833122253, "rewards/margins": 0.11256328970193863, "rewards/margins_max": 0.27581486105918884, "rewards/margins_min": -0.039721570909023285, "rewards/margins_std": 0.14128807187080383, "rewards/rejected": 0.0823710560798645, "step": 3680 }, { "dpo_losses": 0.6510524749755859, "epoch": 0.88, "grad_norm": 1.9522404613244746, "learning_rate": 2.035103744420408e-08, "logits/chosen": -2.824552536010742, "logits/rejected": -2.7700655460357666, "logps/chosen": -334.76666259765625, "logps/rejected": -283.0545654296875, "loss": 0.6809, "positive_losses": 0.08021030575037003, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.18678848445415497, "rewards/margins": 0.09195387363433838, "rewards/margins_max": 0.27611032128334045, "rewards/margins_min": -0.059003960341215134, "rewards/margins_std": 0.15174394845962524, "rewards/rejected": 0.09483462572097778, "step": 3690 }, { "dpo_losses": 0.6479452252388, "epoch": 0.89, "grad_norm": 14.81804002905696, "learning_rate": 1.953313227319689e-08, "logits/chosen": -2.7152485847473145, "logits/rejected": -2.6622376441955566, "logps/chosen": -294.839599609375, "logps/rejected": -257.5033874511719, "loss": 0.6563, "positive_losses": 0.013370132073760033, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.19089344143867493, "rewards/margins": 0.09724991768598557, "rewards/margins_max": 0.25001612305641174, "rewards/margins_min": -0.038240429013967514, "rewards/margins_std": 0.12915584444999695, "rewards/rejected": 0.09364351630210876, "step": 3700 }, { "epoch": 0.89, "eval_dpo_losses": 0.6643303036689758, "eval_logits/chosen": -2.754045248031616, "eval_logits/rejected": -2.718864679336548, "eval_logps/chosen": -268.03582763671875, "eval_logps/rejected": -255.7627410888672, "eval_loss": 0.6835072636604309, "eval_positive_losses": 0.16018158197402954, "eval_rewards/accuracies": 0.6815000176429749, "eval_rewards/chosen": 0.16419503092765808, "eval_rewards/margins": 0.06328964233398438, "eval_rewards/margins_max": 0.2740493416786194, "eval_rewards/margins_min": -0.11819687485694885, "eval_rewards/margins_std": 0.13161331415176392, "eval_rewards/rejected": 0.1009053960442543, "eval_runtime": 858.9635, "eval_samples_per_second": 4.657, "eval_steps_per_second": 0.291, "step": 3700 }, { "dpo_losses": 0.6654092073440552, "epoch": 0.89, "grad_norm": 4.092060235408913, "learning_rate": 1.873133326793397e-08, "logits/chosen": -2.7687911987304688, "logits/rejected": -2.7348849773406982, "logps/chosen": -269.12261962890625, "logps/rejected": -258.05816650390625, "loss": 0.6907, "positive_losses": 0.2345535308122635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17091169953346252, "rewards/margins": 0.06004973500967026, "rewards/margins_max": 0.17686879634857178, "rewards/margins_min": -0.05759010836482048, "rewards/margins_std": 0.10334448516368866, "rewards/rejected": 0.11086195707321167, "step": 3710 }, { "dpo_losses": 0.6684147715568542, "epoch": 0.89, "grad_norm": 5.230490811951698, "learning_rate": 1.794569646209948e-08, "logits/chosen": -2.702066421508789, "logits/rejected": -2.670173406600952, "logps/chosen": -284.0806884765625, "logps/rejected": -246.78555297851562, "loss": 0.6885, "positive_losses": 0.3270094692707062, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16640737652778625, "rewards/margins": 0.057038843631744385, "rewards/margins_max": 0.2085561454296112, "rewards/margins_min": -0.10717018693685532, "rewards/margins_std": 0.13882891833782196, "rewards/rejected": 0.10936852544546127, "step": 3720 }, { "dpo_losses": 0.6477783918380737, "epoch": 0.89, "grad_norm": 11.82366835211396, "learning_rate": 1.7176276759883146e-08, "logits/chosen": -2.7503254413604736, "logits/rejected": -2.7477829456329346, "logps/chosen": -280.01666259765625, "logps/rejected": -246.5050811767578, "loss": 0.6711, "positive_losses": 0.22927704453468323, "rewards/accuracies": 0.75, "rewards/chosen": 0.20066359639167786, "rewards/margins": 0.10062937438488007, "rewards/margins_max": 0.2595561146736145, "rewards/margins_min": -0.04970159754157066, "rewards/margins_std": 0.13402973115444183, "rewards/rejected": 0.1000341922044754, "step": 3730 }, { "dpo_losses": 0.651672899723053, "epoch": 0.9, "grad_norm": 2.034532682106165, "learning_rate": 1.642312793214293e-08, "logits/chosen": -2.7342772483825684, "logits/rejected": -2.6859817504882812, "logps/chosen": -247.6178741455078, "logps/rejected": -276.7887268066406, "loss": 0.6613, "positive_losses": 0.04632539674639702, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.1594669073820114, "rewards/margins": 0.13311654329299927, "rewards/margins_max": 0.44950494170188904, "rewards/margins_min": -0.04881676286458969, "rewards/margins_std": 0.2342521846294403, "rewards/rejected": 0.026350397616624832, "step": 3740 }, { "dpo_losses": 0.6588581800460815, "epoch": 0.9, "grad_norm": 12.547019720133617, "learning_rate": 1.568630261264789e-08, "logits/chosen": -2.777749538421631, "logits/rejected": -2.7405974864959717, "logps/chosen": -256.0454406738281, "logps/rejected": -216.5902099609375, "loss": 0.6763, "positive_losses": 0.16543880105018616, "rewards/accuracies": 0.75, "rewards/chosen": 0.17062219977378845, "rewards/margins": 0.07456677407026291, "rewards/margins_max": 0.22886960208415985, "rewards/margins_min": -0.05939141660928726, "rewards/margins_std": 0.1285584270954132, "rewards/rejected": 0.09605542570352554, "step": 3750 }, { "dpo_losses": 0.6605504155158997, "epoch": 0.9, "grad_norm": 5.6023401547889735, "learning_rate": 1.49658522943992e-08, "logits/chosen": -2.757417917251587, "logits/rejected": -2.7130351066589355, "logps/chosen": -217.6382598876953, "logps/rejected": -240.6887664794922, "loss": 0.6666, "positive_losses": 0.2249523103237152, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17407721281051636, "rewards/margins": 0.07136063277721405, "rewards/margins_max": 0.18515238165855408, "rewards/margins_min": -0.0688382238149643, "rewards/margins_std": 0.11682508140802383, "rewards/rejected": 0.10271658003330231, "step": 3760 }, { "dpo_losses": 0.6611598134040833, "epoch": 0.9, "grad_norm": 8.505953779255107, "learning_rate": 1.4261827326032122e-08, "logits/chosen": -2.794163703918457, "logits/rejected": -2.7438387870788574, "logps/chosen": -282.5149230957031, "logps/rejected": -254.3866729736328, "loss": 0.6849, "positive_losses": 0.30809053778648376, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15620173513889313, "rewards/margins": 0.07091357558965683, "rewards/margins_max": 0.23661665618419647, "rewards/margins_min": -0.08354359120130539, "rewards/margins_std": 0.14458543062210083, "rewards/rejected": 0.0852881520986557, "step": 3770 }, { "dpo_losses": 0.6541672348976135, "epoch": 0.91, "grad_norm": 2.111750868293021, "learning_rate": 1.3574276908296906e-08, "logits/chosen": -2.7305970191955566, "logits/rejected": -2.681629180908203, "logps/chosen": -215.31201171875, "logps/rejected": -224.89285278320312, "loss": 0.6653, "positive_losses": 0.1431051790714264, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.1683400571346283, "rewards/margins": 0.08388723433017731, "rewards/margins_max": 0.21670201420783997, "rewards/margins_min": -0.03809646517038345, "rewards/margins_std": 0.11329780519008636, "rewards/rejected": 0.08445282280445099, "step": 3780 }, { "dpo_losses": 0.658660888671875, "epoch": 0.91, "grad_norm": 2.274535867484966, "learning_rate": 1.2903249090620849e-08, "logits/chosen": -2.826555013656616, "logits/rejected": -2.7483725547790527, "logps/chosen": -320.4327087402344, "logps/rejected": -273.35125732421875, "loss": 0.6679, "positive_losses": 0.17862510681152344, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.16669707000255585, "rewards/margins": 0.0756738930940628, "rewards/margins_max": 0.20173649489879608, "rewards/margins_min": -0.04780013859272003, "rewards/margins_std": 0.11229197680950165, "rewards/rejected": 0.09102317690849304, "step": 3790 }, { "dpo_losses": 0.6732086539268494, "epoch": 0.91, "grad_norm": 2.1963993518051, "learning_rate": 1.2248790767750012e-08, "logits/chosen": -2.7348084449768066, "logits/rejected": -2.742199420928955, "logps/chosen": -207.6639404296875, "logps/rejected": -237.13906860351562, "loss": 0.6811, "positive_losses": 0.23195019364356995, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.14042732119560242, "rewards/margins": 0.045624054968357086, "rewards/margins_max": 0.1778506338596344, "rewards/margins_min": -0.1026204377412796, "rewards/margins_std": 0.1265561729669571, "rewards/rejected": 0.09480325132608414, "step": 3800 }, { "epoch": 0.91, "eval_dpo_losses": 0.6646060347557068, "eval_logits/chosen": -2.7541239261627197, "eval_logits/rejected": -2.7189526557922363, "eval_logps/chosen": -267.8722229003906, "eval_logps/rejected": -255.53590393066406, "eval_loss": 0.6827938556671143, "eval_positive_losses": 0.15170931816101074, "eval_rewards/accuracies": 0.6819999814033508, "eval_rewards/chosen": 0.16583123803138733, "eval_rewards/margins": 0.06265761703252792, "eval_rewards/margins_max": 0.2720535695552826, "eval_rewards/margins_min": -0.11757001280784607, "eval_rewards/margins_std": 0.13073314726352692, "eval_rewards/rejected": 0.10317362844944, "eval_runtime": 859.5308, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 3800 }, { "dpo_losses": 0.6644261479377747, "epoch": 0.91, "grad_norm": 10.152448206705797, "learning_rate": 1.1610947676472277e-08, "logits/chosen": -2.7783429622650146, "logits/rejected": -2.7518913745880127, "logps/chosen": -270.7191467285156, "logps/rejected": -254.0736541748047, "loss": 0.6956, "positive_losses": 0.4448486268520355, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15376734733581543, "rewards/margins": 0.06310968846082687, "rewards/margins_max": 0.2015010416507721, "rewards/margins_min": -0.06341960281133652, "rewards/margins_std": 0.11845578998327255, "rewards/rejected": 0.09065763652324677, "step": 3810 }, { "dpo_losses": 0.6563535928726196, "epoch": 0.91, "grad_norm": 12.366785270622575, "learning_rate": 1.0989764392420692e-08, "logits/chosen": -2.7928905487060547, "logits/rejected": -2.739075183868408, "logps/chosen": -299.98114013671875, "logps/rejected": -282.0989685058594, "loss": 0.6825, "positive_losses": 0.4375602602958679, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16498340666294098, "rewards/margins": 0.07950299978256226, "rewards/margins_max": 0.22089791297912598, "rewards/margins_min": -0.06209545582532883, "rewards/margins_std": 0.1249159425497055, "rewards/rejected": 0.08548040688037872, "step": 3820 }, { "dpo_losses": 0.6617771983146667, "epoch": 0.92, "grad_norm": 7.835951130970198, "learning_rate": 1.0385284326958593e-08, "logits/chosen": -2.8322882652282715, "logits/rejected": -2.7414145469665527, "logps/chosen": -304.3345947265625, "logps/rejected": -266.72210693359375, "loss": 0.6876, "positive_losses": 0.15757504105567932, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1791706383228302, "rewards/margins": 0.06925542652606964, "rewards/margins_max": 0.21128897368907928, "rewards/margins_min": -0.07265281677246094, "rewards/margins_std": 0.12417922168970108, "rewards/rejected": 0.10991521179676056, "step": 3830 }, { "dpo_losses": 0.6569172143936157, "epoch": 0.92, "grad_norm": 9.31675510353147, "learning_rate": 9.797549724145731e-09, "logits/chosen": -2.8651509284973145, "logits/rejected": -2.7985455989837646, "logps/chosen": -315.66302490234375, "logps/rejected": -263.8731384277344, "loss": 0.6656, "positive_losses": 0.037329863756895065, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1870483160018921, "rewards/margins": 0.0806959941983223, "rewards/margins_max": 0.26165899634361267, "rewards/margins_min": -0.07325632870197296, "rewards/margins_std": 0.15307198464870453, "rewards/rejected": 0.1063523143529892, "step": 3840 }, { "dpo_losses": 0.6736956238746643, "epoch": 0.92, "grad_norm": 12.350049075133391, "learning_rate": 9.226601657785993e-09, "logits/chosen": -2.8000316619873047, "logits/rejected": -2.816969394683838, "logps/chosen": -248.09524536132812, "logps/rejected": -287.8347473144531, "loss": 0.6823, "positive_losses": 0.1911594420671463, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.148758202791214, "rewards/margins": 0.04497291520237923, "rewards/margins_max": 0.20772309601306915, "rewards/margins_min": -0.11044390499591827, "rewards/margins_std": 0.140190988779068, "rewards/rejected": 0.10378527641296387, "step": 3850 }, { "dpo_losses": 0.6688699722290039, "epoch": 0.92, "grad_norm": 1.9956768518743042, "learning_rate": 8.672480028556972e-09, "logits/chosen": -2.623276710510254, "logits/rejected": -2.6199355125427246, "logps/chosen": -248.938232421875, "logps/rejected": -278.8777160644531, "loss": 0.6774, "positive_losses": 0.0049002645537257195, "rewards/accuracies": 0.6875, "rewards/chosen": 0.170660138130188, "rewards/margins": 0.05233796685934067, "rewards/margins_max": 0.16301101446151733, "rewards/margins_min": -0.05267889425158501, "rewards/margins_std": 0.09617452323436737, "rewards/rejected": 0.11832215636968613, "step": 3860 }, { "dpo_losses": 0.6631342172622681, "epoch": 0.93, "grad_norm": 6.795728845531742, "learning_rate": 8.13522356122151e-09, "logits/chosen": -2.8603577613830566, "logits/rejected": -2.777244806289673, "logps/chosen": -266.6046447753906, "logps/rejected": -255.6238250732422, "loss": 0.6776, "positive_losses": 0.19300465285778046, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1604842245578766, "rewards/margins": 0.06500445306301117, "rewards/margins_max": 0.21781650185585022, "rewards/margins_min": -0.05176999419927597, "rewards/margins_std": 0.12496298551559448, "rewards/rejected": 0.09547976404428482, "step": 3870 }, { "dpo_losses": 0.6712054014205933, "epoch": 0.93, "grad_norm": 2.071618470756357, "learning_rate": 7.614869801921525e-09, "logits/chosen": -2.7937769889831543, "logits/rejected": -2.7635202407836914, "logps/chosen": -259.7538757324219, "logps/rejected": -251.8928680419922, "loss": 0.6836, "positive_losses": 0.22212085127830505, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1604996770620346, "rewards/margins": 0.04918736591935158, "rewards/margins_max": 0.20032449066638947, "rewards/margins_min": -0.08522528409957886, "rewards/margins_std": 0.1272539347410202, "rewards/rejected": 0.11131230741739273, "step": 3880 }, { "dpo_losses": 0.6633546948432922, "epoch": 0.93, "grad_norm": 15.9282678529988, "learning_rate": 7.111455115553944e-09, "logits/chosen": -2.7595419883728027, "logits/rejected": -2.7322256565093994, "logps/chosen": -245.7715301513672, "logps/rejected": -280.38720703125, "loss": 0.6655, "positive_losses": 0.02374115027487278, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17196574807167053, "rewards/margins": 0.06536030769348145, "rewards/margins_max": 0.20768776535987854, "rewards/margins_min": -0.07875025272369385, "rewards/margins_std": 0.12868280708789825, "rewards/rejected": 0.10660544782876968, "step": 3890 }, { "dpo_losses": 0.6526366472244263, "epoch": 0.93, "grad_norm": 10.474842632996218, "learning_rate": 6.6250146832294296e-09, "logits/chosen": -2.8094482421875, "logits/rejected": -2.7801709175109863, "logps/chosen": -271.47344970703125, "logps/rejected": -227.63412475585938, "loss": 0.664, "positive_losses": 0.0715000182390213, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.17943359911441803, "rewards/margins": 0.08841031044721603, "rewards/margins_max": 0.22779294848442078, "rewards/margins_min": -0.07818715274333954, "rewards/margins_std": 0.1409936547279358, "rewards/rejected": 0.09102325141429901, "step": 3900 }, { "epoch": 0.93, "eval_dpo_losses": 0.6646539568901062, "eval_logits/chosen": -2.7570688724517822, "eval_logits/rejected": -2.7221286296844482, "eval_logps/chosen": -267.8119201660156, "eval_logps/rejected": -255.46409606933594, "eval_loss": 0.6822987198829651, "eval_positive_losses": 0.14527979493141174, "eval_rewards/accuracies": 0.6779999732971191, "eval_rewards/chosen": 0.16643406450748444, "eval_rewards/margins": 0.0625423938035965, "eval_rewards/margins_max": 0.2717288136482239, "eval_rewards/margins_min": -0.11706181615591049, "eval_rewards/margins_std": 0.13045823574066162, "eval_rewards/rejected": 0.10389167815446854, "eval_runtime": 859.0569, "eval_samples_per_second": 4.656, "eval_steps_per_second": 0.291, "step": 3900 }, { "dpo_losses": 0.6642988920211792, "epoch": 0.94, "grad_norm": 9.260077683059093, "learning_rate": 6.155582499813655e-09, "logits/chosen": -2.7680790424346924, "logits/rejected": -2.712761163711548, "logps/chosen": -264.1136779785156, "logps/rejected": -264.02484130859375, "loss": 0.6992, "positive_losses": 0.07118396461009979, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1604934185743332, "rewards/margins": 0.06244243308901787, "rewards/margins_max": 0.18634401261806488, "rewards/margins_min": -0.045282356441020966, "rewards/margins_std": 0.10743094980716705, "rewards/rejected": 0.09805098921060562, "step": 3910 }, { "dpo_losses": 0.6461649537086487, "epoch": 0.94, "grad_norm": 2.2528779379825457, "learning_rate": 5.703191371551841e-09, "logits/chosen": -2.800030469894409, "logits/rejected": -2.6686244010925293, "logps/chosen": -353.1239929199219, "logps/rejected": -256.30645751953125, "loss": 0.6622, "positive_losses": 0.2535758912563324, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1879279464483261, "rewards/margins": 0.10282881557941437, "rewards/margins_max": 0.28091442584991455, "rewards/margins_min": -0.047752343118190765, "rewards/margins_std": 0.14527681469917297, "rewards/rejected": 0.08509914577007294, "step": 3920 }, { "dpo_losses": 0.6580290794372559, "epoch": 0.94, "grad_norm": 2.24489480568659, "learning_rate": 5.267872913775756e-09, "logits/chosen": -2.8585116863250732, "logits/rejected": -2.8200507164001465, "logps/chosen": -269.33892822265625, "logps/rejected": -227.3752899169922, "loss": 0.6788, "positive_losses": 0.10815658420324326, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17992644011974335, "rewards/margins": 0.07684055715799332, "rewards/margins_max": 0.2264234721660614, "rewards/margins_min": -0.05927117541432381, "rewards/margins_std": 0.13092733919620514, "rewards/rejected": 0.10308587551116943, "step": 3930 }, { "dpo_losses": 0.641032874584198, "epoch": 0.94, "grad_norm": 6.320484512685531, "learning_rate": 4.8496575486943744e-09, "logits/chosen": -2.830091714859009, "logits/rejected": -2.7235329151153564, "logps/chosen": -322.9629211425781, "logps/rejected": -255.38442993164062, "loss": 0.663, "positive_losses": 0.1953844130039215, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.19224749505519867, "rewards/margins": 0.11282501369714737, "rewards/margins_max": 0.2628289461135864, "rewards/margins_min": -0.05925637483596802, "rewards/margins_std": 0.14454469084739685, "rewards/rejected": 0.0794224888086319, "step": 3940 }, { "dpo_losses": 0.6564732789993286, "epoch": 0.95, "grad_norm": 2.0783461833068175, "learning_rate": 4.448574503268076e-09, "logits/chosen": -2.6885581016540527, "logits/rejected": -2.6577229499816895, "logps/chosen": -244.3848876953125, "logps/rejected": -250.7645263671875, "loss": 0.6561, "positive_losses": 0.004229736514389515, "rewards/accuracies": 0.75, "rewards/chosen": 0.17602810263633728, "rewards/margins": 0.07926786690950394, "rewards/margins_max": 0.2302931249141693, "rewards/margins_min": -0.04355299472808838, "rewards/margins_std": 0.12126515805721283, "rewards/rejected": 0.09676024317741394, "step": 3950 }, { "dpo_losses": 0.6451069116592407, "epoch": 0.95, "grad_norm": 13.509830992550041, "learning_rate": 4.064651807165781e-09, "logits/chosen": -2.7459397315979004, "logits/rejected": -2.7053933143615723, "logps/chosen": -236.46939086914062, "logps/rejected": -202.78945922851562, "loss": 0.6648, "positive_losses": 0.2858510911464691, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.16971875727176666, "rewards/margins": 0.10494234412908554, "rewards/margins_max": 0.2705627381801605, "rewards/margins_min": -0.021359222009778023, "rewards/margins_std": 0.1319151222705841, "rewards/rejected": 0.06477640569210052, "step": 3960 }, { "dpo_losses": 0.6555677652359009, "epoch": 0.95, "grad_norm": 2.0606348100312437, "learning_rate": 3.697916290806291e-09, "logits/chosen": -2.850156307220459, "logits/rejected": -2.7751412391662598, "logps/chosen": -273.57330322265625, "logps/rejected": -228.7772216796875, "loss": 0.6607, "positive_losses": 0.09204483032226562, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.18670892715454102, "rewards/margins": 0.0826026052236557, "rewards/margins_max": 0.23823082447052002, "rewards/margins_min": -0.05544150620698929, "rewards/margins_std": 0.12718325853347778, "rewards/rejected": 0.10410632938146591, "step": 3970 }, { "dpo_losses": 0.6486338973045349, "epoch": 0.95, "grad_norm": 2.2998849259489966, "learning_rate": 3.3483935834831e-09, "logits/chosen": -2.772996187210083, "logits/rejected": -2.7173964977264404, "logps/chosen": -277.57696533203125, "logps/rejected": -253.50369262695312, "loss": 0.6584, "positive_losses": 0.060744475573301315, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.17648367583751678, "rewards/margins": 0.09603290259838104, "rewards/margins_max": 0.2472161054611206, "rewards/margins_min": -0.029990728944540024, "rewards/margins_std": 0.12347612529993057, "rewards/rejected": 0.08045077323913574, "step": 3980 }, { "dpo_losses": 0.66400146484375, "epoch": 0.96, "grad_norm": 14.425725189438863, "learning_rate": 3.0161081115735456e-09, "logits/chosen": -2.7981061935424805, "logits/rejected": -2.7522151470184326, "logps/chosen": -294.22833251953125, "logps/rejected": -268.29217529296875, "loss": 0.6734, "positive_losses": 0.14072665572166443, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.16251571476459503, "rewards/margins": 0.0642295554280281, "rewards/margins_max": 0.23960871994495392, "rewards/margins_min": -0.06259835511445999, "rewards/margins_std": 0.1360151618719101, "rewards/rejected": 0.09828615188598633, "step": 3990 }, { "dpo_losses": 0.6546421051025391, "epoch": 0.96, "grad_norm": 11.26565886133954, "learning_rate": 2.7010830968314802e-09, "logits/chosen": -2.754178524017334, "logits/rejected": -2.7354214191436768, "logps/chosen": -251.7115478515625, "logps/rejected": -239.2911376953125, "loss": 0.6771, "positive_losses": 0.0032329559326171875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.183672234416008, "rewards/margins": 0.08287551999092102, "rewards/margins_max": 0.22468486428260803, "rewards/margins_min": -0.06261727958917618, "rewards/margins_std": 0.12918703258037567, "rewards/rejected": 0.10079671442508698, "step": 4000 }, { "epoch": 0.96, "eval_dpo_losses": 0.6646814942359924, "eval_logits/chosen": -2.7565603256225586, "eval_logits/rejected": -2.7215585708618164, "eval_logps/chosen": -267.8387756347656, "eval_logps/rejected": -255.4852294921875, "eval_loss": 0.6823775768280029, "eval_positive_losses": 0.14531666040420532, "eval_rewards/accuracies": 0.6775000095367432, "eval_rewards/chosen": 0.1661653369665146, "eval_rewards/margins": 0.06248496472835541, "eval_rewards/margins_max": 0.2715640962123871, "eval_rewards/margins_min": -0.11740686744451523, "eval_rewards/margins_std": 0.13043251633644104, "eval_rewards/rejected": 0.10368037223815918, "eval_runtime": 859.6013, "eval_samples_per_second": 4.653, "eval_steps_per_second": 0.291, "step": 4000 }, { "dpo_losses": 0.6627596020698547, "epoch": 0.96, "grad_norm": 6.760800515804467, "learning_rate": 2.4033405547646545e-09, "logits/chosen": -2.7864184379577637, "logits/rejected": -2.7516934871673584, "logps/chosen": -232.94873046875, "logps/rejected": -286.1166076660156, "loss": 0.6842, "positive_losses": 0.11080016940832138, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16040866076946259, "rewards/margins": 0.06618188321590424, "rewards/margins_max": 0.21579007804393768, "rewards/margins_min": -0.06469889730215073, "rewards/margins_std": 0.12559542059898376, "rewards/rejected": 0.09422676265239716, "step": 4010 }, { "dpo_losses": 0.6535666584968567, "epoch": 0.96, "grad_norm": 1.8509287977038207, "learning_rate": 2.122901293095919e-09, "logits/chosen": -2.767712116241455, "logits/rejected": -2.7076239585876465, "logps/chosen": -273.70745849609375, "logps/rejected": -260.16021728515625, "loss": 0.6628, "positive_losses": 0.04809379577636719, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.18115071952342987, "rewards/margins": 0.08582070469856262, "rewards/margins_max": 0.22919097542762756, "rewards/margins_min": -0.03256974369287491, "rewards/margins_std": 0.12001357972621918, "rewards/rejected": 0.09533001482486725, "step": 4020 }, { "dpo_losses": 0.668084442615509, "epoch": 0.97, "grad_norm": 1.8275784710737126, "learning_rate": 1.8597849103094143e-09, "logits/chosen": -2.7770278453826904, "logits/rejected": -2.7452876567840576, "logps/chosen": -271.2476501464844, "logps/rejected": -267.3369445800781, "loss": 0.6776, "positive_losses": 0.056734323501586914, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16092410683631897, "rewards/margins": 0.055357493460178375, "rewards/margins_max": 0.20463581383228302, "rewards/margins_min": -0.0749364048242569, "rewards/margins_std": 0.12409114837646484, "rewards/rejected": 0.10556660592556, "step": 4030 }, { "dpo_losses": 0.6623555421829224, "epoch": 0.97, "grad_norm": 2.3874125120101137, "learning_rate": 1.614009794280613e-09, "logits/chosen": -2.827165126800537, "logits/rejected": -2.774501085281372, "logps/chosen": -279.1925354003906, "logps/rejected": -270.4194030761719, "loss": 0.6762, "positive_losses": 0.3010104298591614, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1556367427110672, "rewards/margins": 0.06865683943033218, "rewards/margins_max": 0.21759963035583496, "rewards/margins_min": -0.0807686597108841, "rewards/margins_std": 0.1352541744709015, "rewards/rejected": 0.08697989583015442, "step": 4040 }, { "dpo_losses": 0.6723076105117798, "epoch": 0.97, "grad_norm": 10.628227551489028, "learning_rate": 1.3855931209914295e-09, "logits/chosen": -2.824005126953125, "logits/rejected": -2.8196158409118652, "logps/chosen": -267.6695861816406, "logps/rejected": -274.29998779296875, "loss": 0.6775, "positive_losses": 0.27635812759399414, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.15767940878868103, "rewards/margins": 0.04778493940830231, "rewards/margins_max": 0.2086886465549469, "rewards/margins_min": -0.11356700956821442, "rewards/margins_std": 0.14013658463954926, "rewards/rejected": 0.10989447683095932, "step": 4050 }, { "dpo_losses": 0.6527209281921387, "epoch": 0.97, "grad_norm": 6.9869137103445595, "learning_rate": 1.1745508533298754e-09, "logits/chosen": -2.8000526428222656, "logits/rejected": -2.732656955718994, "logps/chosen": -271.7012023925781, "logps/rejected": -230.9076385498047, "loss": 0.6759, "positive_losses": 0.12889042496681213, "rewards/accuracies": 0.75, "rewards/chosen": 0.157293900847435, "rewards/margins": 0.0865793377161026, "rewards/margins_max": 0.24143192172050476, "rewards/margins_min": -0.023911673575639725, "rewards/margins_std": 0.12149790674448013, "rewards/rejected": 0.07071457803249359, "step": 4060 }, { "dpo_losses": 0.6597913503646851, "epoch": 0.97, "grad_norm": 6.577380647928485, "learning_rate": 9.808977399744511e-10, "logits/chosen": -2.7209413051605225, "logits/rejected": -2.7068724632263184, "logps/chosen": -262.62188720703125, "logps/rejected": -238.3629150390625, "loss": 0.6838, "positive_losses": 0.22567634284496307, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18065913021564484, "rewards/margins": 0.07478559017181396, "rewards/margins_max": 0.25104469060897827, "rewards/margins_min": -0.06707103550434113, "rewards/margins_std": 0.14136120676994324, "rewards/rejected": 0.10587354749441147, "step": 4070 }, { "dpo_losses": 0.658578097820282, "epoch": 0.98, "grad_norm": 6.9337808073359, "learning_rate": 8.046473143635268e-10, "logits/chosen": -2.742370128631592, "logits/rejected": -2.7214465141296387, "logps/chosen": -267.11798095703125, "logps/rejected": -255.657958984375, "loss": 0.6877, "positive_losses": 0.22314730286598206, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1791904866695404, "rewards/margins": 0.07589339464902878, "rewards/margins_max": 0.23930883407592773, "rewards/margins_min": -0.07688675075769424, "rewards/margins_std": 0.14083437621593475, "rewards/rejected": 0.10329709202051163, "step": 4080 }, { "dpo_losses": 0.660162091255188, "epoch": 0.98, "grad_norm": 7.7691507299038065, "learning_rate": 6.458118937494317e-10, "logits/chosen": -2.7214627265930176, "logits/rejected": -2.7198398113250732, "logps/chosen": -302.90069580078125, "logps/rejected": -281.9145202636719, "loss": 0.6654, "positive_losses": 0.0422632209956646, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.17170992493629456, "rewards/margins": 0.07241971790790558, "rewards/margins_max": 0.22914473712444305, "rewards/margins_min": -0.08452726155519485, "rewards/margins_std": 0.14119994640350342, "rewards/rejected": 0.09929021447896957, "step": 4090 }, { "dpo_losses": 0.6557719111442566, "epoch": 0.98, "grad_norm": 1.9925064729133104, "learning_rate": 5.044025783377259e-10, "logits/chosen": -2.8321895599365234, "logits/rejected": -2.80350399017334, "logps/chosen": -288.643310546875, "logps/rejected": -263.55560302734375, "loss": 0.6644, "positive_losses": 0.11800841987133026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17768821120262146, "rewards/margins": 0.0832248330116272, "rewards/margins_max": 0.25097864866256714, "rewards/margins_min": -0.0663030594587326, "rewards/margins_std": 0.1430123746395111, "rewards/rejected": 0.09446339309215546, "step": 4100 }, { "epoch": 0.98, "eval_dpo_losses": 0.6646350026130676, "eval_logits/chosen": -2.754180908203125, "eval_logits/rejected": -2.7189297676086426, "eval_logps/chosen": -267.83477783203125, "eval_logps/rejected": -255.49130249023438, "eval_loss": 0.6825215220451355, "eval_positive_losses": 0.1479889303445816, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": 0.1662055402994156, "eval_rewards/margins": 0.06258578598499298, "eval_rewards/margins_max": 0.2720088064670563, "eval_rewards/margins_min": -0.1174345538020134, "eval_rewards/margins_std": 0.13053110241889954, "eval_rewards/rejected": 0.1036197692155838, "eval_runtime": 859.0641, "eval_samples_per_second": 4.656, "eval_steps_per_second": 0.291, "step": 4100 }, { "dpo_losses": 0.652340829372406, "epoch": 0.98, "grad_norm": 2.0190118531270755, "learning_rate": 3.8042925051148813e-10, "logits/chosen": -2.7282910346984863, "logits/rejected": -2.7017452716827393, "logps/chosen": -285.70880126953125, "logps/rejected": -253.1737518310547, "loss": 0.6602, "positive_losses": 0.05210762098431587, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1906423270702362, "rewards/margins": 0.08919943124055862, "rewards/margins_max": 0.25463730096817017, "rewards/margins_min": -0.04905784875154495, "rewards/margins_std": 0.13663437962532043, "rewards/rejected": 0.10144289582967758, "step": 4110 }, { "dpo_losses": 0.6575914025306702, "epoch": 0.99, "grad_norm": 2.0478949881021, "learning_rate": 2.7390057414064525e-10, "logits/chosen": -2.760251998901367, "logits/rejected": -2.74522066116333, "logps/chosen": -288.87664794921875, "logps/rejected": -256.4379577636719, "loss": 0.6774, "positive_losses": 0.10413327068090439, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.17558273673057556, "rewards/margins": 0.07716713100671768, "rewards/margins_max": 0.22382047772407532, "rewards/margins_min": -0.0731758177280426, "rewards/margins_std": 0.13102379441261292, "rewards/rejected": 0.09841560572385788, "step": 4120 }, { "dpo_losses": 0.6671404242515564, "epoch": 0.99, "grad_norm": 6.97685885253831, "learning_rate": 1.8482399397654057e-10, "logits/chosen": -2.8221592903137207, "logits/rejected": -2.7946910858154297, "logps/chosen": -286.81768798828125, "logps/rejected": -290.49053955078125, "loss": 0.6742, "positive_losses": 0.23002424836158752, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17291216552257538, "rewards/margins": 0.05662660673260689, "rewards/margins_max": 0.18632474541664124, "rewards/margins_min": -0.058633577078580856, "rewards/margins_std": 0.11294053494930267, "rewards/rejected": 0.11628556251525879, "step": 4130 }, { "dpo_losses": 0.6662750840187073, "epoch": 0.99, "grad_norm": 29.518879684237678, "learning_rate": 1.1320573513159959e-10, "logits/chosen": -2.7849316596984863, "logits/rejected": -2.735177516937256, "logps/chosen": -247.0010223388672, "logps/rejected": -225.8738555908203, "loss": 0.6818, "positive_losses": 0.319840669631958, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14727768301963806, "rewards/margins": 0.05896994471549988, "rewards/margins_max": 0.20719870924949646, "rewards/margins_min": -0.0806480273604393, "rewards/margins_std": 0.12833374738693237, "rewards/rejected": 0.08830773830413818, "step": 4140 }, { "dpo_losses": 0.6551648378372192, "epoch": 0.99, "grad_norm": 11.650614421834154, "learning_rate": 5.905080264431705e-11, "logits/chosen": -2.7605371475219727, "logits/rejected": -2.7368950843811035, "logps/chosen": -265.2921447753906, "logps/rejected": -248.7461395263672, "loss": 0.6695, "positive_losses": 0.058501433581113815, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.17077505588531494, "rewards/margins": 0.08128766715526581, "rewards/margins_max": 0.22274088859558105, "rewards/margins_min": -0.01692686602473259, "rewards/margins_std": 0.10871877521276474, "rewards/rejected": 0.08948738127946854, "step": 4150 }, { "dpo_losses": 0.6541586518287659, "epoch": 1.0, "grad_norm": 2.0570807497738364, "learning_rate": 2.2362981129508963e-11, "logits/chosen": -2.802473545074463, "logits/rejected": -2.7629497051239014, "logps/chosen": -278.388671875, "logps/rejected": -272.71258544921875, "loss": 0.6594, "positive_losses": 0.0, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.188084214925766, "rewards/margins": 0.0852595642209053, "rewards/margins_max": 0.2401733696460724, "rewards/margins_min": -0.04913000017404556, "rewards/margins_std": 0.12942180037498474, "rewards/rejected": 0.10282465070486069, "step": 4160 }, { "dpo_losses": 0.6477263569831848, "epoch": 1.0, "grad_norm": 9.832168208347847, "learning_rate": 3.144834513746364e-12, "logits/chosen": -2.807058811187744, "logits/rejected": -2.792757034301758, "logps/chosen": -302.5372619628906, "logps/rejected": -265.207763671875, "loss": 0.6636, "positive_losses": 0.21496811509132385, "rewards/accuracies": 0.8125, "rewards/chosen": 0.19391296803951263, "rewards/margins": 0.09752051532268524, "rewards/margins_max": 0.23375506699085236, "rewards/margins_min": -0.03883753716945648, "rewards/margins_std": 0.12090893089771271, "rewards/rejected": 0.09639245271682739, "step": 4170 }, { "epoch": 1.0, "step": 4176, "total_flos": 0.0, "train_loss": 0.6776896565581647, "train_runtime": 67917.559, "train_samples_per_second": 0.984, "train_steps_per_second": 0.061 } ], "logging_steps": 10, "max_steps": 4176, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }