{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 1065, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_losses": 0.6931471824645996, "epoch": 0.0, "grad_norm": 1.6032202352154772, "learning_rate": 4.672897196261682e-08, "logits/chosen": -3.0016818046569824, "logits/rejected": -2.8469698429107666, "logps/chosen": -650.2908325195312, "logps/rejected": -359.48583984375, "loss": 0.6931, "positive_losses": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "dpo_losses": 0.6927387714385986, "epoch": 0.03, "grad_norm": 14.052093588804325, "learning_rate": 4.6728971962616824e-07, "logits/chosen": -2.9367923736572266, "logits/rejected": -2.819260835647583, "logps/chosen": -254.90475463867188, "logps/rejected": -170.36068725585938, "loss": 0.6989, "positive_losses": 0.033258650451898575, "rewards/accuracies": 0.5555555820465088, "rewards/chosen": 0.0008218331495299935, "rewards/margins": 0.0008189052459783852, "rewards/margins_max": 0.0018548837397247553, "rewards/margins_min": -0.00021707323321606964, "rewards/margins_std": 0.0014650949742645025, "rewards/rejected": 2.927754849224584e-06, "step": 10 }, { "dpo_losses": 0.6923267245292664, "epoch": 0.06, "grad_norm": 1.844492423373157, "learning_rate": 9.345794392523365e-07, "logits/chosen": -2.7079358100891113, "logits/rejected": -2.7515180110931396, "logps/chosen": -306.1308898925781, "logps/rejected": -241.56021118164062, "loss": 0.6932, "positive_losses": 0.012112426571547985, "rewards/accuracies": 0.75, "rewards/chosen": 0.006594317965209484, "rewards/margins": 0.0016433143755421042, "rewards/margins_max": 0.002823440358042717, "rewards/margins_min": 0.00046318816021084785, "rewards/margins_std": 0.001668950542807579, "rewards/rejected": 0.004951003938913345, "step": 20 }, { "dpo_losses": 0.6899991631507874, "epoch": 0.08, "grad_norm": 2.1897418931727595, "learning_rate": 1.4018691588785047e-06, "logits/chosen": -2.904411792755127, "logits/rejected": -2.816619396209717, "logps/chosen": -358.5197448730469, "logps/rejected": -251.15103149414062, "loss": 0.6896, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02147643454372883, "rewards/margins": 0.006321606691926718, "rewards/margins_max": 0.01196110900491476, "rewards/margins_min": 0.000682103622239083, "rewards/margins_std": 0.007975460961461067, "rewards/rejected": 0.015154826454818249, "step": 30 }, { "dpo_losses": 0.6862105131149292, "epoch": 0.11, "grad_norm": 1.7300257406359418, "learning_rate": 1.869158878504673e-06, "logits/chosen": -2.8441336154937744, "logits/rejected": -2.7715249061584473, "logps/chosen": -327.30523681640625, "logps/rejected": -313.1446228027344, "loss": 0.6864, "positive_losses": 0.02085266076028347, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03371895104646683, "rewards/margins": 0.013976506888866425, "rewards/margins_max": 0.017818700522184372, "rewards/margins_min": 0.010134311392903328, "rewards/margins_std": 0.0054336837492883205, "rewards/rejected": 0.019742444157600403, "step": 40 }, { "dpo_losses": 0.6820067167282104, "epoch": 0.14, "grad_norm": 9.347589322785899, "learning_rate": 2.3364485981308413e-06, "logits/chosen": -2.795854091644287, "logits/rejected": -2.720963954925537, "logps/chosen": -217.7622833251953, "logps/rejected": -171.39205932617188, "loss": 0.6779, "positive_losses": 0.0, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0513346791267395, "rewards/margins": 0.022588390856981277, "rewards/margins_max": 0.03625725582242012, "rewards/margins_min": 0.008919527754187584, "rewards/margins_std": 0.019330691546201706, "rewards/rejected": 0.028746291995048523, "step": 50 }, { "dpo_losses": 0.6643597483634949, "epoch": 0.17, "grad_norm": 2.37274745731943, "learning_rate": 2.8037383177570094e-06, "logits/chosen": -2.7788054943084717, "logits/rejected": -2.710609197616577, "logps/chosen": -256.30633544921875, "logps/rejected": -233.06576538085938, "loss": 0.6666, "positive_losses": 0.0, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.08375100791454315, "rewards/margins": 0.05946110561490059, "rewards/margins_max": 0.08825884014368057, "rewards/margins_min": 0.030663389712572098, "rewards/margins_std": 0.040726132690906525, "rewards/rejected": 0.024289902299642563, "step": 60 }, { "dpo_losses": 0.6566643714904785, "epoch": 0.2, "grad_norm": 1.6634540430479345, "learning_rate": 3.2710280373831774e-06, "logits/chosen": -2.635437488555908, "logits/rejected": -2.678208351135254, "logps/chosen": -283.38287353515625, "logps/rejected": -209.6460418701172, "loss": 0.6558, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.11453696340322495, "rewards/margins": 0.07607836276292801, "rewards/margins_max": 0.13534289598464966, "rewards/margins_min": 0.01681383326649666, "rewards/margins_std": 0.08381269872188568, "rewards/rejected": 0.03845860809087753, "step": 70 }, { "dpo_losses": 0.6309095025062561, "epoch": 0.23, "grad_norm": 1.7989959804157094, "learning_rate": 3.738317757009346e-06, "logits/chosen": -2.9159035682678223, "logits/rejected": -2.8235018253326416, "logps/chosen": -335.8651123046875, "logps/rejected": -286.46331787109375, "loss": 0.6397, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.13743606209754944, "rewards/margins": 0.1305120289325714, "rewards/margins_max": 0.18503351509571075, "rewards/margins_min": 0.07599054276943207, "rewards/margins_std": 0.07710503041744232, "rewards/rejected": 0.0069240378215909, "step": 80 }, { "dpo_losses": 0.6185696721076965, "epoch": 0.25, "grad_norm": 9.307634759665634, "learning_rate": 4.205607476635514e-06, "logits/chosen": -2.6819961071014404, "logits/rejected": -2.7166359424591064, "logps/chosen": -211.7088623046875, "logps/rejected": -203.97885131835938, "loss": 0.6145, "positive_losses": 0.035182952880859375, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.15242353081703186, "rewards/margins": 0.16033907234668732, "rewards/margins_max": 0.24873778223991394, "rewards/margins_min": 0.07194037735462189, "rewards/margins_std": 0.12501463294029236, "rewards/rejected": -0.007915569469332695, "step": 90 }, { "dpo_losses": 0.6138414144515991, "epoch": 0.28, "grad_norm": 2.169680467803253, "learning_rate": 4.6728971962616825e-06, "logits/chosen": -2.783569812774658, "logits/rejected": -2.812309741973877, "logps/chosen": -288.1591796875, "logps/rejected": -341.5180969238281, "loss": 0.6275, "positive_losses": 0.8350906372070312, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.1585826575756073, "rewards/margins": 0.1714317500591278, "rewards/margins_max": 0.2515793442726135, "rewards/margins_min": 0.09128417074680328, "rewards/margins_std": 0.11334581673145294, "rewards/rejected": -0.012849109247326851, "step": 100 }, { "epoch": 0.28, "eval_dpo_losses": 0.6742300391197205, "eval_logits/chosen": -2.7527217864990234, "eval_logits/rejected": -2.71140456199646, "eval_logps/chosen": -276.58984375, "eval_logps/rejected": -254.9810333251953, "eval_loss": 0.8539575338363647, "eval_positive_losses": 1.6940749883651733, "eval_rewards/accuracies": 0.60317462682724, "eval_rewards/chosen": 0.08631354570388794, "eval_rewards/margins": 0.04429732263088226, "eval_rewards/margins_max": 0.21467885375022888, "eval_rewards/margins_min": -0.10308819264173508, "eval_rewards/margins_std": 0.14203837513923645, "eval_rewards/rejected": 0.042016226798295975, "eval_runtime": 285.3929, "eval_samples_per_second": 7.008, "eval_steps_per_second": 0.221, "step": 100 }, { "dpo_losses": 0.5535503029823303, "epoch": 0.31, "grad_norm": 2.2536984881767905, "learning_rate": 4.999879018839288e-06, "logits/chosen": -2.7111623287200928, "logits/rejected": -2.6175503730773926, "logps/chosen": -252.84732055664062, "logps/rejected": -252.4491729736328, "loss": 0.5736, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.2623223662376404, "rewards/margins": 0.32247892022132874, "rewards/margins_max": 0.4974708557128906, "rewards/margins_min": 0.14748699963092804, "rewards/margins_std": 0.24747595191001892, "rewards/rejected": -0.060156505554914474, "step": 110 }, { "dpo_losses": 0.5708788633346558, "epoch": 0.34, "grad_norm": 1.8718792057149318, "learning_rate": 4.99772856836941e-06, "logits/chosen": -2.873108148574829, "logits/rejected": -2.8189544677734375, "logps/chosen": -373.77386474609375, "logps/rejected": -337.38922119140625, "loss": 0.5727, "positive_losses": 0.22691193222999573, "rewards/accuracies": 1.0, "rewards/chosen": 0.21501663327217102, "rewards/margins": 0.27264389395713806, "rewards/margins_max": 0.36815184354782104, "rewards/margins_min": 0.17713597416877747, "rewards/margins_std": 0.13506858050823212, "rewards/rejected": -0.057627253234386444, "step": 120 }, { "dpo_losses": 0.5159657001495361, "epoch": 0.37, "grad_norm": 1.9587224479056975, "learning_rate": 4.992892309373227e-06, "logits/chosen": -2.7587242126464844, "logits/rejected": -2.689577341079712, "logps/chosen": -311.52978515625, "logps/rejected": -274.8511047363281, "loss": 0.5718, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.24945905804634094, "rewards/margins": 0.41450828313827515, "rewards/margins_max": 0.5637356638908386, "rewards/margins_min": 0.26528093218803406, "rewards/margins_std": 0.21103934943675995, "rewards/rejected": -0.1650492250919342, "step": 130 }, { "dpo_losses": 0.5120642185211182, "epoch": 0.39, "grad_norm": 35.2981995380076, "learning_rate": 4.985375442281969e-06, "logits/chosen": -2.725268602371216, "logits/rejected": -2.7174267768859863, "logps/chosen": -270.7826843261719, "logps/rejected": -248.8843536376953, "loss": 0.5953, "positive_losses": 0.0, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.25090181827545166, "rewards/margins": 0.42299699783325195, "rewards/margins_max": 0.5964112877845764, "rewards/margins_min": 0.24958273768424988, "rewards/margins_std": 0.2452448159456253, "rewards/rejected": -0.1720951795578003, "step": 140 }, { "dpo_losses": 0.4928362965583801, "epoch": 0.42, "grad_norm": 11.90624935921094, "learning_rate": 4.9751860499858175e-06, "logits/chosen": -2.72652530670166, "logits/rejected": -2.7453625202178955, "logps/chosen": -301.97021484375, "logps/rejected": -276.3653259277344, "loss": 0.5758, "positive_losses": 0.42821502685546875, "rewards/accuracies": 1.0, "rewards/chosen": 0.19235308468341827, "rewards/margins": 0.47807592153549194, "rewards/margins_max": 0.6831300854682922, "rewards/margins_min": 0.2730218172073364, "rewards/margins_std": 0.2899903357028961, "rewards/rejected": -0.2857228219509125, "step": 150 }, { "dpo_losses": 0.4960567355155945, "epoch": 0.45, "grad_norm": 15.847210447883002, "learning_rate": 4.962335089142376e-06, "logits/chosen": -2.81313157081604, "logits/rejected": -2.735961675643921, "logps/chosen": -244.3223419189453, "logps/rejected": -264.59417724609375, "loss": 0.559, "positive_losses": 0.0, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.25210094451904297, "rewards/margins": 0.46666574478149414, "rewards/margins_max": 0.6729411482810974, "rewards/margins_min": 0.26039019227027893, "rewards/margins_std": 0.2917175889015198, "rewards/rejected": -0.21456477046012878, "step": 160 }, { "dpo_losses": 0.4443618357181549, "epoch": 0.48, "grad_norm": 2.921537165567133, "learning_rate": 4.946836378394967e-06, "logits/chosen": -2.8487417697906494, "logits/rejected": -2.7233359813690186, "logps/chosen": -293.14263916015625, "logps/rejected": -265.21044921875, "loss": 0.4792, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.32291096448898315, "rewards/margins": 0.6394142508506775, "rewards/margins_max": 0.8530386686325073, "rewards/margins_min": 0.42578983306884766, "rewards/margins_std": 0.30211058259010315, "rewards/rejected": -0.3165033161640167, "step": 170 }, { "dpo_losses": 0.4956347942352295, "epoch": 0.51, "grad_norm": 23.196576832752985, "learning_rate": 4.928706583513441e-06, "logits/chosen": -2.7180655002593994, "logits/rejected": -2.674361228942871, "logps/chosen": -249.37704467773438, "logps/rejected": -410.07391357421875, "loss": 0.5836, "positive_losses": 1.074639916419983, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.17418113350868225, "rewards/margins": 0.47780531644821167, "rewards/margins_max": 0.6538316011428833, "rewards/margins_min": 0.3017791211605072, "rewards/margins_std": 0.24893875420093536, "rewards/rejected": -0.3036242425441742, "step": 180 }, { "dpo_losses": 0.4040610194206238, "epoch": 0.54, "grad_norm": 2.662026262000448, "learning_rate": 4.907965199473471e-06, "logits/chosen": -2.6723411083221436, "logits/rejected": -2.5524985790252686, "logps/chosen": -320.6319274902344, "logps/rejected": -257.9935302734375, "loss": 0.5582, "positive_losses": 1.0019195079803467, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3839383125305176, "rewards/margins": 0.7386760711669922, "rewards/margins_max": 0.9635330438613892, "rewards/margins_min": 0.51381915807724, "rewards/margins_std": 0.31799572706222534, "rewards/rejected": -0.3547378182411194, "step": 190 }, { "dpo_losses": 0.4472725987434387, "epoch": 0.56, "grad_norm": 23.173407948282012, "learning_rate": 4.884634529493591e-06, "logits/chosen": -2.8709282875061035, "logits/rejected": -2.7968573570251465, "logps/chosen": -255.41879272460938, "logps/rejected": -237.76406860351562, "loss": 0.599, "positive_losses": 0.0, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2910212576389313, "rewards/margins": 0.6229863166809082, "rewards/margins_max": 0.8043031692504883, "rewards/margins_min": 0.4416695535182953, "rewards/margins_std": 0.25642070174217224, "rewards/rejected": -0.3319651484489441, "step": 200 }, { "epoch": 0.56, "eval_dpo_losses": 0.6560041308403015, "eval_logits/chosen": -2.7841696739196777, "eval_logits/rejected": -2.738633632659912, "eval_logps/chosen": -291.0660095214844, "eval_logps/rejected": -275.99658203125, "eval_loss": 1.9206839799880981, "eval_positive_losses": 12.58076000213623, "eval_rewards/accuracies": 0.6388888955116272, "eval_rewards/chosen": -0.05844784155488014, "eval_rewards/margins": 0.10969138890504837, "eval_rewards/margins_max": 0.4903210401535034, "eval_rewards/margins_min": -0.25554272532463074, "eval_rewards/margins_std": 0.33160677552223206, "eval_rewards/rejected": -0.1681392341852188, "eval_runtime": 284.4185, "eval_samples_per_second": 7.032, "eval_steps_per_second": 0.222, "step": 200 }, { "dpo_losses": 0.45512253046035767, "epoch": 0.59, "grad_norm": 7.436654691984327, "learning_rate": 4.858739661052539e-06, "logits/chosen": -2.5205092430114746, "logits/rejected": -2.5804672241210938, "logps/chosen": -240.2886962890625, "logps/rejected": -298.3849182128906, "loss": 0.5267, "positive_losses": 0.4501487612724304, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.38874852657318115, "rewards/margins": 0.6847165822982788, "rewards/margins_max": 1.0578687191009521, "rewards/margins_min": 0.31156447529792786, "rewards/margins_std": 0.5277167558670044, "rewards/rejected": -0.29596805572509766, "step": 210 }, { "dpo_losses": 0.4267025589942932, "epoch": 0.62, "grad_norm": 12.252545150739026, "learning_rate": 4.830308438912687e-06, "logits/chosen": -2.901047945022583, "logits/rejected": -2.776557207107544, "logps/chosen": -341.5310363769531, "logps/rejected": -316.5777282714844, "loss": 0.5828, "positive_losses": 1.5781867504119873, "rewards/accuracies": 1.0, "rewards/chosen": 0.28965142369270325, "rewards/margins": 0.6728664040565491, "rewards/margins_max": 0.8954153060913086, "rewards/margins_min": 0.4503174424171448, "rewards/margins_std": 0.3147316873073578, "rewards/rejected": -0.38321495056152344, "step": 220 }, { "dpo_losses": 0.42703738808631897, "epoch": 0.65, "grad_norm": 2.1766995421765545, "learning_rate": 4.799371435178544e-06, "logits/chosen": -2.821802854537964, "logits/rejected": -2.777765989303589, "logps/chosen": -321.39501953125, "logps/rejected": -376.64483642578125, "loss": 0.5028, "positive_losses": 1.304276466369629, "rewards/accuracies": 1.0, "rewards/chosen": 0.30395936965942383, "rewards/margins": 0.7244865298271179, "rewards/margins_max": 1.1590335369110107, "rewards/margins_min": 0.2899397909641266, "rewards/margins_std": 0.6145419478416443, "rewards/rejected": -0.42052727937698364, "step": 230 }, { "dpo_losses": 0.4363502860069275, "epoch": 0.68, "grad_norm": 13.650828107929078, "learning_rate": 4.765961916422575e-06, "logits/chosen": -2.7546634674072266, "logits/rejected": -2.707695722579956, "logps/chosen": -219.1737518310547, "logps/rejected": -330.49444580078125, "loss": 0.5883, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.29220908880233765, "rewards/margins": 0.6587773561477661, "rewards/margins_max": 0.8973654508590698, "rewards/margins_min": 0.42018923163414, "rewards/margins_std": 0.33741456270217896, "rewards/rejected": -0.36656829714775085, "step": 240 }, { "dpo_losses": 0.40715283155441284, "epoch": 0.7, "grad_norm": 5.403626615804181, "learning_rate": 4.730115807913627e-06, "logits/chosen": -2.786029577255249, "logits/rejected": -2.656646490097046, "logps/chosen": -316.26605224609375, "logps/rejected": -292.4571838378906, "loss": 0.4798, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.36320579051971436, "rewards/margins": 0.7509908676147461, "rewards/margins_max": 0.927462100982666, "rewards/margins_min": 0.5745195150375366, "rewards/margins_std": 0.24956803023815155, "rewards/rejected": -0.3877849876880646, "step": 250 }, { "dpo_losses": 0.45335307717323303, "epoch": 0.73, "grad_norm": 29.921037643309493, "learning_rate": 4.691871654986485e-06, "logits/chosen": -2.8433797359466553, "logits/rejected": -2.7910611629486084, "logps/chosen": -240.71328735351562, "logps/rejected": -260.13897705078125, "loss": 0.5549, "positive_losses": 1.8418042659759521, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2628856301307678, "rewards/margins": 0.6381598711013794, "rewards/margins_max": 0.8531384468078613, "rewards/margins_min": 0.42318135499954224, "rewards/margins_std": 0.3040255904197693, "rewards/rejected": -0.37527427077293396, "step": 260 }, { "dpo_losses": 0.45805755257606506, "epoch": 0.76, "grad_norm": 3.6869955202700884, "learning_rate": 4.651270581594054e-06, "logits/chosen": -2.8275113105773926, "logits/rejected": -2.726349353790283, "logps/chosen": -264.3140869140625, "logps/rejected": -256.37506103515625, "loss": 0.5553, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.38106483221054077, "rewards/margins": 0.6026363372802734, "rewards/margins_max": 0.8345470428466797, "rewards/margins_min": 0.37072569131851196, "rewards/margins_std": 0.3279712498188019, "rewards/rejected": -0.22157149016857147, "step": 270 }, { "dpo_losses": 0.46088677644729614, "epoch": 0.79, "grad_norm": 11.384071170544201, "learning_rate": 4.6083562460867545e-06, "logits/chosen": -2.7374978065490723, "logits/rejected": -2.705930233001709, "logps/chosen": -292.6180114746094, "logps/rejected": -295.0760803222656, "loss": 0.6126, "positive_losses": 1.0429108142852783, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2991539239883423, "rewards/margins": 0.611792802810669, "rewards/margins_max": 0.8936999440193176, "rewards/margins_min": 0.32988566160202026, "rewards/margins_std": 0.39867693185806274, "rewards/rejected": -0.3126388192176819, "step": 280 }, { "dpo_losses": 0.5089690685272217, "epoch": 0.82, "grad_norm": 2.7173946115224865, "learning_rate": 4.563174794266684e-06, "logits/chosen": -2.875331163406372, "logits/rejected": -2.819256544113159, "logps/chosen": -263.9188232421875, "logps/rejected": -286.82647705078125, "loss": 0.593, "positive_losses": 1.377386450767517, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2715073823928833, "rewards/margins": 0.4865007996559143, "rewards/margins_max": 0.7703573703765869, "rewards/margins_min": 0.20264430344104767, "rewards/margins_std": 0.40143370628356934, "rewards/rejected": -0.2149934470653534, "step": 290 }, { "dpo_losses": 0.44498148560523987, "epoch": 0.85, "grad_norm": 8.551683262439843, "learning_rate": 4.5157748097670125e-06, "logits/chosen": -2.9059486389160156, "logits/rejected": -2.793186902999878, "logps/chosen": -319.2405090332031, "logps/rejected": -338.54998779296875, "loss": 0.4901, "positive_losses": 0.0022247314918786287, "rewards/accuracies": 1.0, "rewards/chosen": 0.35988515615463257, "rewards/margins": 0.6180437803268433, "rewards/margins_max": 0.8190226554870605, "rewards/margins_min": 0.41706475615501404, "rewards/margins_std": 0.2842271625995636, "rewards/rejected": -0.2581585943698883, "step": 300 }, { "epoch": 0.85, "eval_dpo_losses": 0.6506758332252502, "eval_logits/chosen": -2.7854835987091064, "eval_logits/rejected": -2.7329776287078857, "eval_logps/chosen": -303.7291564941406, "eval_logps/rejected": -289.5481872558594, "eval_loss": 2.8066518306732178, "eval_positive_losses": 22.214069366455078, "eval_rewards/accuracies": 0.6388888955116272, "eval_rewards/chosen": -0.18507955968379974, "eval_rewards/margins": 0.11857547610998154, "eval_rewards/margins_max": 0.47240880131721497, "eval_rewards/margins_min": -0.25752344727516174, "eval_rewards/margins_std": 0.32571399211883545, "eval_rewards/rejected": -0.3036550283432007, "eval_runtime": 284.7873, "eval_samples_per_second": 7.023, "eval_steps_per_second": 0.221, "step": 300 }, { "dpo_losses": 0.43647676706314087, "epoch": 0.87, "grad_norm": 2.6162448381062275, "learning_rate": 4.466207261809989e-06, "logits/chosen": -2.9903199672698975, "logits/rejected": -2.7902731895446777, "logps/chosen": -293.12274169921875, "logps/rejected": -296.422119140625, "loss": 0.6852, "positive_losses": 0.944580078125, "rewards/accuracies": 1.0, "rewards/chosen": 0.31041496992111206, "rewards/margins": 0.6650521755218506, "rewards/margins_max": 0.992100715637207, "rewards/margins_min": 0.338003545999527, "rewards/margins_std": 0.4625166058540344, "rewards/rejected": -0.35463717579841614, "step": 310 }, { "dpo_losses": 0.4618608057498932, "epoch": 0.9, "grad_norm": 10.483471531606499, "learning_rate": 4.414525450399713e-06, "logits/chosen": -2.8283543586730957, "logits/rejected": -2.7349746227264404, "logps/chosen": -286.9427185058594, "logps/rejected": -262.766845703125, "loss": 0.527, "positive_losses": 0.8719180822372437, "rewards/accuracies": 1.0, "rewards/chosen": 0.32706111669540405, "rewards/margins": 0.5900775790214539, "rewards/margins_max": 0.8372209668159485, "rewards/margins_min": 0.34293434023857117, "rewards/margins_std": 0.34951338171958923, "rewards/rejected": -0.2630165219306946, "step": 320 }, { "dpo_losses": 0.40510478615760803, "epoch": 0.93, "grad_norm": 20.600609246290738, "learning_rate": 4.360784949008615e-06, "logits/chosen": -2.9669108390808105, "logits/rejected": -2.8032517433166504, "logps/chosen": -316.91192626953125, "logps/rejected": -283.3198547363281, "loss": 0.515, "positive_losses": 0.5270363092422485, "rewards/accuracies": 1.0, "rewards/chosen": 0.45585203170776367, "rewards/margins": 0.8136453628540039, "rewards/margins_max": 1.1761964559555054, "rewards/margins_min": 0.45109423995018005, "rewards/margins_std": 0.512724757194519, "rewards/rejected": -0.35779333114624023, "step": 330 }, { "dpo_losses": 0.47730112075805664, "epoch": 0.96, "grad_norm": 2.164730368336074, "learning_rate": 4.30504354481929e-06, "logits/chosen": -2.79738450050354, "logits/rejected": -2.7073657512664795, "logps/chosen": -230.3443145751953, "logps/rejected": -234.2275390625, "loss": 0.496, "positive_losses": 2.5491890907287598, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2501987814903259, "rewards/margins": 0.5649263858795166, "rewards/margins_max": 0.820625901222229, "rewards/margins_min": 0.3092268109321594, "rewards/margins_std": 0.36161375045776367, "rewards/rejected": -0.3147276043891907, "step": 340 }, { "dpo_losses": 0.3645946681499481, "epoch": 0.99, "grad_norm": 19.210785792660445, "learning_rate": 4.247361176585904e-06, "logits/chosen": -2.791806697845459, "logits/rejected": -2.676161289215088, "logps/chosen": -352.7079162597656, "logps/rejected": -353.04425048828125, "loss": 0.584, "positive_losses": 1.5420730113983154, "rewards/accuracies": 1.0, "rewards/chosen": 0.3225085139274597, "rewards/margins": 0.8690497279167175, "rewards/margins_max": 1.1524405479431152, "rewards/margins_min": 0.5856587886810303, "rewards/margins_std": 0.4007752537727356, "rewards/rejected": -0.546541154384613, "step": 350 }, { "dpo_losses": 0.44031819701194763, "epoch": 1.01, "grad_norm": 3.4688322040336876, "learning_rate": 4.187799870182038e-06, "logits/chosen": -2.756261110305786, "logits/rejected": -2.6450822353363037, "logps/chosen": -273.16424560546875, "logps/rejected": -231.5010986328125, "loss": 0.4573, "positive_losses": 0.19403228163719177, "rewards/accuracies": 1.0, "rewards/chosen": 0.35820913314819336, "rewards/margins": 0.6459983587265015, "rewards/margins_max": 0.8495124578475952, "rewards/margins_min": 0.4424843192100525, "rewards/margins_std": 0.28781232237815857, "rewards/rejected": -0.2877892851829529, "step": 360 }, { "dpo_losses": 0.3558691143989563, "epoch": 1.04, "grad_norm": 71.7335292506231, "learning_rate": 4.1264236719042365e-06, "logits/chosen": -2.6822152137756348, "logits/rejected": -2.662559986114502, "logps/chosen": -320.59442138671875, "logps/rejected": -317.09295654296875, "loss": 0.4251, "positive_losses": 0.42721253633499146, "rewards/accuracies": 1.0, "rewards/chosen": 0.45041507482528687, "rewards/margins": 0.9831393957138062, "rewards/margins_max": 1.3708398342132568, "rewards/margins_min": 0.5954390168190002, "rewards/margins_std": 0.5482910871505737, "rewards/rejected": -0.5327242612838745, "step": 370 }, { "dpo_losses": 0.3393256664276123, "epoch": 1.07, "grad_norm": 4.337233890868313, "learning_rate": 4.063298579603001e-06, "logits/chosen": -2.7261626720428467, "logits/rejected": -2.5453438758850098, "logps/chosen": -265.1933288574219, "logps/rejected": -244.08682250976562, "loss": 0.3984, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.4042983055114746, "rewards/margins": 0.999946117401123, "rewards/margins_max": 1.2659950256347656, "rewards/margins_min": 0.7338972091674805, "rewards/margins_std": 0.3762499690055847, "rewards/rejected": -0.5956477522850037, "step": 380 }, { "dpo_losses": 0.25663647055625916, "epoch": 1.1, "grad_norm": 35.28065142871338, "learning_rate": 3.998492471715272e-06, "logits/chosen": -2.7409512996673584, "logits/rejected": -2.752206325531006, "logps/chosen": -314.38153076171875, "logps/rejected": -423.4803161621094, "loss": 0.5701, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.4437999129295349, "rewards/margins": 1.3619416952133179, "rewards/margins_max": 1.7605613470077515, "rewards/margins_min": 0.9633218050003052, "rewards/margins_std": 0.5637335181236267, "rewards/rejected": -0.9181416630744934, "step": 390 }, { "dpo_losses": 0.3513553738594055, "epoch": 1.13, "grad_norm": 4.052912034886079, "learning_rate": 3.932075034274723e-06, "logits/chosen": -2.73002552986145, "logits/rejected": -2.6879513263702393, "logps/chosen": -205.73922729492188, "logps/rejected": -290.40057373046875, "loss": 0.4414, "positive_losses": 0.4541704058647156, "rewards/accuracies": 1.0, "rewards/chosen": 0.3537348806858063, "rewards/margins": 0.9853051900863647, "rewards/margins_max": 1.2278480529785156, "rewards/margins_min": 0.7427625060081482, "rewards/margins_std": 0.34300726652145386, "rewards/rejected": -0.6315703988075256, "step": 400 }, { "epoch": 1.13, "eval_dpo_losses": 0.6385828852653503, "eval_logits/chosen": -2.71909236907959, "eval_logits/rejected": -2.670318365097046, "eval_logps/chosen": -299.07989501953125, "eval_logps/rejected": -291.5615539550781, "eval_loss": 2.6622352600097656, "eval_positive_losses": 20.927839279174805, "eval_rewards/accuracies": 0.6746031641960144, "eval_rewards/chosen": -0.13858698308467865, "eval_rewards/margins": 0.18520160019397736, "eval_rewards/margins_max": 0.6970763802528381, "eval_rewards/margins_min": -0.37488874793052673, "eval_rewards/margins_std": 0.4832788407802582, "eval_rewards/rejected": -0.323788583278656, "eval_runtime": 283.8974, "eval_samples_per_second": 7.045, "eval_steps_per_second": 0.222, "step": 400 }, { "dpo_losses": 0.38113099336624146, "epoch": 1.15, "grad_norm": 1.9724463020625589, "learning_rate": 3.864117685978339e-06, "logits/chosen": -2.816284656524658, "logits/rejected": -2.7134735584259033, "logps/chosen": -242.77761840820312, "logps/rejected": -272.8990173339844, "loss": 0.4468, "positive_losses": 4.795651912689209, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2276880443096161, "rewards/margins": 0.8958386182785034, "rewards/margins_max": 1.3816872835159302, "rewards/margins_min": 0.40999001264572144, "rewards/margins_std": 0.6870937943458557, "rewards/rejected": -0.6681506037712097, "step": 410 }, { "dpo_losses": 0.33210596442222595, "epoch": 1.18, "grad_norm": 3.4285412656501766, "learning_rate": 3.794693501389861e-06, "logits/chosen": -2.8275389671325684, "logits/rejected": -2.7307045459747314, "logps/chosen": -293.709716796875, "logps/rejected": -331.89312744140625, "loss": 0.4087, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.4260416030883789, "rewards/margins": 1.108737587928772, "rewards/margins_max": 1.603941559791565, "rewards/margins_min": 0.613533616065979, "rewards/margins_std": 0.7003240585327148, "rewards/rejected": -0.6826959848403931, "step": 420 }, { "dpo_losses": 0.34472885727882385, "epoch": 1.21, "grad_norm": 2.846847523997402, "learning_rate": 3.7238771323626822e-06, "logits/chosen": -2.7846486568450928, "logits/rejected": -2.6524085998535156, "logps/chosen": -342.40692138671875, "logps/rejected": -332.17010498046875, "loss": 0.5622, "positive_losses": 4.125036239624023, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3678087592124939, "rewards/margins": 1.078407883644104, "rewards/margins_max": 1.4884113073349, "rewards/margins_min": 0.6684045195579529, "rewards/margins_std": 0.5798323154449463, "rewards/rejected": -0.7105990648269653, "step": 430 }, { "dpo_losses": 0.32919952273368835, "epoch": 1.24, "grad_norm": 109.42335538231772, "learning_rate": 3.651744727766676e-06, "logits/chosen": -2.7272467613220215, "logits/rejected": -2.66713285446167, "logps/chosen": -210.4514617919922, "logps/rejected": -259.1316833496094, "loss": 0.4028, "positive_losses": 0.3457130491733551, "rewards/accuracies": 1.0, "rewards/chosen": 0.36259937286376953, "rewards/margins": 1.016867995262146, "rewards/margins_max": 1.3856614828109741, "rewards/margins_min": 0.6480745077133179, "rewards/margins_std": 0.5215528607368469, "rewards/rejected": -0.6542686223983765, "step": 440 }, { "dpo_losses": 0.31118613481521606, "epoch": 1.27, "grad_norm": 38.9440234553471, "learning_rate": 3.57837385160529e-06, "logits/chosen": -2.659485340118408, "logits/rejected": -2.6188011169433594, "logps/chosen": -273.7745361328125, "logps/rejected": -349.734619140625, "loss": 0.4823, "positive_losses": 2.6830811500549316, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3455764055252075, "rewards/margins": 1.166411280632019, "rewards/margins_max": 1.5491106510162354, "rewards/margins_min": 0.7837120890617371, "rewards/margins_std": 0.5412184596061707, "rewards/rejected": -0.8208349347114563, "step": 450 }, { "dpo_losses": 0.24196143448352814, "epoch": 1.3, "grad_norm": 2.8256101438878116, "learning_rate": 3.503843399610941e-06, "logits/chosen": -2.6595611572265625, "logits/rejected": -2.6660475730895996, "logps/chosen": -322.4607849121094, "logps/rejected": -492.70068359375, "loss": 0.4169, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5088067650794983, "rewards/margins": 1.4272974729537964, "rewards/margins_max": 1.8303813934326172, "rewards/margins_min": 1.0242136716842651, "rewards/margins_std": 0.5700467824935913, "rewards/rejected": -0.9184908866882324, "step": 460 }, { "dpo_losses": 0.32115817070007324, "epoch": 1.32, "grad_norm": 5.028579287398997, "learning_rate": 3.4282335144083985e-06, "logits/chosen": -2.567282199859619, "logits/rejected": -2.616426706314087, "logps/chosen": -219.5450439453125, "logps/rejected": -303.61566162109375, "loss": 0.537, "positive_losses": 2.556870937347412, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.25023719668388367, "rewards/margins": 1.0798122882843018, "rewards/margins_max": 1.3413639068603516, "rewards/margins_min": 0.8182605504989624, "rewards/margins_std": 0.36988988518714905, "rewards/rejected": -0.8295750617980957, "step": 470 }, { "dpo_losses": 0.2937307357788086, "epoch": 1.35, "grad_norm": 70.59458692509646, "learning_rate": 3.351625499337395e-06, "logits/chosen": -2.821207284927368, "logits/rejected": -2.655557155609131, "logps/chosen": -336.3492126464844, "logps/rejected": -360.6393127441406, "loss": 0.4803, "positive_losses": 4.53096866607666, "rewards/accuracies": 1.0, "rewards/chosen": 0.37952059507369995, "rewards/margins": 1.2218748331069946, "rewards/margins_max": 1.5149763822555542, "rewards/margins_min": 0.9287732243537903, "rewards/margins_std": 0.4145084023475647, "rewards/rejected": -0.8423541784286499, "step": 480 }, { "dpo_losses": 0.38940221071243286, "epoch": 1.38, "grad_norm": 2.687569639501308, "learning_rate": 3.2741017310271056e-06, "logits/chosen": -2.6762735843658447, "logits/rejected": -2.549715280532837, "logps/chosen": -201.81640625, "logps/rejected": -277.5948791503906, "loss": 0.4423, "positive_losses": 0.7856195569038391, "rewards/accuracies": 1.0, "rewards/chosen": 0.2850914001464844, "rewards/margins": 0.8952637910842896, "rewards/margins_max": 1.2249016761779785, "rewards/margins_min": 0.5656259655952454, "rewards/margins_std": 0.46617835760116577, "rewards/rejected": -0.6101723909378052, "step": 490 }, { "dpo_losses": 0.3359260559082031, "epoch": 1.41, "grad_norm": 8.068773298281624, "learning_rate": 3.195745570816532e-06, "logits/chosen": -2.582794189453125, "logits/rejected": -2.5295655727386475, "logps/chosen": -293.511962890625, "logps/rejected": -310.9229736328125, "loss": 0.4651, "positive_losses": 1.6097240447998047, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3676465153694153, "rewards/margins": 1.0873607397079468, "rewards/margins_max": 1.359550952911377, "rewards/margins_min": 0.8151704668998718, "rewards/margins_std": 0.38493508100509644, "rewards/rejected": -0.7197142243385315, "step": 500 }, { "epoch": 1.41, "eval_dpo_losses": 0.6384106874465942, "eval_logits/chosen": -2.7216532230377197, "eval_logits/rejected": -2.6714365482330322, "eval_logps/chosen": -298.51165771484375, "eval_logps/rejected": -292.0330505371094, "eval_loss": 2.6646323204040527, "eval_positive_losses": 20.608970642089844, "eval_rewards/accuracies": 0.6626983880996704, "eval_rewards/chosen": -0.13290439546108246, "eval_rewards/margins": 0.1955995112657547, "eval_rewards/margins_max": 0.7628427743911743, "eval_rewards/margins_min": -0.3882632255554199, "eval_rewards/margins_std": 0.5195400714874268, "eval_rewards/rejected": -0.32850393652915955, "eval_runtime": 285.1068, "eval_samples_per_second": 7.015, "eval_steps_per_second": 0.221, "step": 500 }, { "dpo_losses": 0.33750054240226746, "epoch": 1.44, "grad_norm": 5.205737491948956, "learning_rate": 3.116641275116018e-06, "logits/chosen": -2.409104108810425, "logits/rejected": -2.434281349182129, "logps/chosen": -200.69908142089844, "logps/rejected": -388.02001953125, "loss": 0.398, "positive_losses": 1.0130329132080078, "rewards/accuracies": 1.0, "rewards/chosen": 0.26239317655563354, "rewards/margins": 1.0276142358779907, "rewards/margins_max": 1.2639634609222412, "rewards/margins_min": 0.7912648916244507, "rewards/margins_std": 0.3342483639717102, "rewards/rejected": -0.7652209997177124, "step": 510 }, { "dpo_losses": 0.2813549041748047, "epoch": 1.46, "grad_norm": 81.57789306373847, "learning_rate": 3.0368739048062956e-06, "logits/chosen": -2.748539447784424, "logits/rejected": -2.641331672668457, "logps/chosen": -305.63671875, "logps/rejected": -331.99383544921875, "loss": 0.5374, "positive_losses": 10.878652572631836, "rewards/accuracies": 1.0, "rewards/chosen": 0.30826514959335327, "rewards/margins": 1.2961227893829346, "rewards/margins_max": 1.768711805343628, "rewards/margins_min": 0.823533833026886, "rewards/margins_std": 0.6683418154716492, "rewards/rejected": -0.9878576397895813, "step": 520 }, { "dpo_losses": 0.2712605893611908, "epoch": 1.49, "grad_norm": 75.79271498324394, "learning_rate": 2.956529233772492e-06, "logits/chosen": -2.689558744430542, "logits/rejected": -2.6852006912231445, "logps/chosen": -292.9363098144531, "logps/rejected": -357.29400634765625, "loss": 0.3968, "positive_losses": 0.0, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.4060862958431244, "rewards/margins": 1.2946475744247437, "rewards/margins_max": 1.633283019065857, "rewards/margins_min": 0.9560121297836304, "rewards/margins_std": 0.47890281677246094, "rewards/rejected": -0.8885613679885864, "step": 530 }, { "dpo_losses": 0.27980148792266846, "epoch": 1.52, "grad_norm": 56.497845904041995, "learning_rate": 2.8756936566714317e-06, "logits/chosen": -2.7521424293518066, "logits/rejected": -2.6638569831848145, "logps/chosen": -310.28753662109375, "logps/rejected": -327.8934020996094, "loss": 0.5646, "positive_losses": 1.8035399913787842, "rewards/accuracies": 1.0, "rewards/chosen": 0.4047401547431946, "rewards/margins": 1.3088172674179077, "rewards/margins_max": 1.7437057495117188, "rewards/margins_min": 0.8739286661148071, "rewards/margins_std": 0.6150254011154175, "rewards/rejected": -0.9040770530700684, "step": 540 }, { "dpo_losses": 0.30083730816841125, "epoch": 1.55, "grad_norm": 4.003119682647961, "learning_rate": 2.794454096031429e-06, "logits/chosen": -2.722224235534668, "logits/rejected": -2.6790289878845215, "logps/chosen": -281.0094299316406, "logps/rejected": -354.3661804199219, "loss": 0.387, "positive_losses": 0.4849150776863098, "rewards/accuracies": 1.0, "rewards/chosen": 0.3756260275840759, "rewards/margins": 1.2488583326339722, "rewards/margins_max": 1.8290369510650635, "rewards/margins_min": 0.668679416179657, "rewards/margins_std": 0.820496678352356, "rewards/rejected": -0.8732322454452515, "step": 550 }, { "dpo_losses": 0.2974298894405365, "epoch": 1.58, "grad_norm": 4.054485926091979, "learning_rate": 2.71289790878446e-06, "logits/chosen": -2.6345105171203613, "logits/rejected": -2.6252238750457764, "logps/chosen": -266.069580078125, "logps/rejected": -428.830322265625, "loss": 0.4149, "positive_losses": 0.9539718627929688, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.36717483401298523, "rewards/margins": 1.2800363302230835, "rewards/margins_max": 1.847495675086975, "rewards/margins_min": 0.7125769257545471, "rewards/margins_std": 0.8025087118148804, "rewards/rejected": -0.9128614664077759, "step": 560 }, { "dpo_losses": 0.2434779852628708, "epoch": 1.61, "grad_norm": 13.695790466916172, "learning_rate": 2.6311127923312156e-06, "logits/chosen": -2.7691166400909424, "logits/rejected": -2.570652723312378, "logps/chosen": -357.65362548828125, "logps/rejected": -422.05902099609375, "loss": 0.3595, "positive_losses": 1.6939789056777954, "rewards/accuracies": 1.0, "rewards/chosen": 0.35474830865859985, "rewards/margins": 1.4161592721939087, "rewards/margins_max": 1.7364327907562256, "rewards/margins_min": 1.0958856344223022, "rewards/margins_std": 0.4529353678226471, "rewards/rejected": -1.061410903930664, "step": 570 }, { "dpo_losses": 0.26757892966270447, "epoch": 1.63, "grad_norm": 45.10124515413211, "learning_rate": 2.549186690240057e-06, "logits/chosen": -2.7345547676086426, "logits/rejected": -2.6686453819274902, "logps/chosen": -254.34683227539062, "logps/rejected": -315.84857177734375, "loss": 0.5253, "positive_losses": 0.2569518983364105, "rewards/accuracies": 1.0, "rewards/chosen": 0.43774762749671936, "rewards/margins": 1.3818461894989014, "rewards/margins_max": 1.815768837928772, "rewards/margins_min": 0.9479236602783203, "rewards/margins_std": 0.6136592626571655, "rewards/rejected": -0.9440986514091492, "step": 580 }, { "dpo_losses": 0.28964871168136597, "epoch": 1.66, "grad_norm": 7.214863048583421, "learning_rate": 2.4672076976812548e-06, "logits/chosen": -2.6155965328216553, "logits/rejected": -2.465445041656494, "logps/chosen": -330.9356994628906, "logps/rejected": -382.1274719238281, "loss": 0.4009, "positive_losses": 0.20948180556297302, "rewards/accuracies": 1.0, "rewards/chosen": 0.42689600586891174, "rewards/margins": 1.3260236978530884, "rewards/margins_max": 1.9126968383789062, "rewards/margins_min": 0.7393506169319153, "rewards/margins_std": 0.8296809196472168, "rewards/rejected": -0.8991276025772095, "step": 590 }, { "dpo_losses": 0.3019997179508209, "epoch": 1.69, "grad_norm": 2.5656935168095365, "learning_rate": 2.3852639666982218e-06, "logits/chosen": -2.696664571762085, "logits/rejected": -2.6669843196868896, "logps/chosen": -210.6801300048828, "logps/rejected": -339.8411560058594, "loss": 0.5269, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.4121875762939453, "rewards/margins": 1.2055537700653076, "rewards/margins_max": 1.5584628582000732, "rewards/margins_min": 0.8526442646980286, "rewards/margins_std": 0.49908918142318726, "rewards/rejected": -0.793366014957428, "step": 600 }, { "epoch": 1.69, "eval_dpo_losses": 0.6337167024612427, "eval_logits/chosen": -2.653167724609375, "eval_logits/rejected": -2.6025989055633545, "eval_logps/chosen": -326.8940734863281, "eval_logps/rejected": -323.9284362792969, "eval_loss": 5.016211986541748, "eval_positive_losses": 46.1312141418457, "eval_rewards/accuracies": 0.6626983880996704, "eval_rewards/chosen": -0.4167284667491913, "eval_rewards/margins": 0.2307295948266983, "eval_rewards/margins_max": 0.8626330494880676, "eval_rewards/margins_min": -0.4616139829158783, "eval_rewards/margins_std": 0.5963027477264404, "eval_rewards/rejected": -0.647458016872406, "eval_runtime": 284.3544, "eval_samples_per_second": 7.033, "eval_steps_per_second": 0.222, "step": 600 }, { "dpo_losses": 0.3822602331638336, "epoch": 1.72, "grad_norm": 4.499258828055022, "learning_rate": 2.303443611417584e-06, "logits/chosen": -2.5053551197052, "logits/rejected": -2.452122449874878, "logps/chosen": -285.8536682128906, "logps/rejected": -345.1878662109375, "loss": 0.5838, "positive_losses": 7.787275791168213, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.20293152332305908, "rewards/margins": 0.9711725115776062, "rewards/margins_max": 1.5771162509918213, "rewards/margins_min": 0.36522871255874634, "rewards/margins_std": 0.8569338917732239, "rewards/rejected": -0.7682409286499023, "step": 610 }, { "dpo_losses": 0.2892194390296936, "epoch": 1.75, "grad_norm": 5.081357332154091, "learning_rate": 2.2218346133000264e-06, "logits/chosen": -2.5583109855651855, "logits/rejected": -2.4557156562805176, "logps/chosen": -241.0048370361328, "logps/rejected": -288.3791809082031, "loss": 0.4921, "positive_losses": 4.499431610107422, "rewards/accuracies": 1.0, "rewards/chosen": 0.3376957178115845, "rewards/margins": 1.2491505146026611, "rewards/margins_max": 1.6783252954483032, "rewards/margins_min": 0.8199755549430847, "rewards/margins_std": 0.606944739818573, "rewards/rejected": -0.9114546775817871, "step": 620 }, { "dpo_losses": 0.28637608885765076, "epoch": 1.77, "grad_norm": 31.375745057762174, "learning_rate": 2.140524726533792e-06, "logits/chosen": -2.611680030822754, "logits/rejected": -2.492157459259033, "logps/chosen": -342.9209899902344, "logps/rejected": -305.1431884765625, "loss": 0.381, "positive_losses": 1.477830171585083, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5035167336463928, "rewards/margins": 1.3139656782150269, "rewards/margins_max": 1.746787428855896, "rewards/margins_min": 0.8811438679695129, "rewards/margins_std": 0.6121026277542114, "rewards/rejected": -0.8104490041732788, "step": 630 }, { "dpo_losses": 0.27914196252822876, "epoch": 1.8, "grad_norm": 56.714432514737815, "learning_rate": 2.059601383672566e-06, "logits/chosen": -2.6837282180786133, "logits/rejected": -2.669649600982666, "logps/chosen": -205.0702362060547, "logps/rejected": -292.3086853027344, "loss": 0.6023, "positive_losses": 3.721278429031372, "rewards/accuracies": 1.0, "rewards/chosen": 0.2986445426940918, "rewards/margins": 1.2236577272415161, "rewards/margins_max": 1.4498487710952759, "rewards/margins_min": 0.9974665641784668, "rewards/margins_std": 0.31988245248794556, "rewards/rejected": -0.9250132441520691, "step": 640 }, { "dpo_losses": 0.3276744782924652, "epoch": 1.83, "grad_norm": 108.41037124625116, "learning_rate": 1.9791516016192214e-06, "logits/chosen": -2.7006583213806152, "logits/rejected": -2.657177686691284, "logps/chosen": -219.15249633789062, "logps/rejected": -298.5721130371094, "loss": 0.3902, "positive_losses": 0.02580871619284153, "rewards/accuracies": 1.0, "rewards/chosen": 0.2983975410461426, "rewards/margins": 1.0909839868545532, "rewards/margins_max": 1.5546290874481201, "rewards/margins_min": 0.6273389458656311, "rewards/margins_std": 0.6556931138038635, "rewards/rejected": -0.7925864458084106, "step": 650 }, { "dpo_losses": 0.37573254108428955, "epoch": 1.86, "grad_norm": 4.49512981087327, "learning_rate": 1.8992618880565039e-06, "logits/chosen": -2.4442310333251953, "logits/rejected": -2.430908679962158, "logps/chosen": -247.6465301513672, "logps/rejected": -270.6328125, "loss": 0.673, "positive_losses": 9.613517761230469, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.23618540167808533, "rewards/margins": 1.0627477169036865, "rewards/margins_max": 1.7770532369613647, "rewards/margins_min": 0.3484421372413635, "rewards/margins_std": 1.0101807117462158, "rewards/rejected": -0.8265622854232788, "step": 660 }, { "dpo_losses": 0.2606434226036072, "epoch": 1.89, "grad_norm": 16.900887081181846, "learning_rate": 1.8200181484252888e-06, "logits/chosen": -2.728989601135254, "logits/rejected": -2.65732741355896, "logps/chosen": -339.34649658203125, "logps/rejected": -414.9603576660156, "loss": 0.3802, "positive_losses": 0.0, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.46093645691871643, "rewards/margins": 1.4545660018920898, "rewards/margins_max": 1.9622220993041992, "rewards/margins_min": 0.9469099044799805, "rewards/margins_std": 0.7179341316223145, "rewards/rejected": -0.9936296343803406, "step": 670 }, { "dpo_losses": 0.22198085486888885, "epoch": 1.92, "grad_norm": 41.51526627620706, "learning_rate": 1.7415055935504234e-06, "logits/chosen": -2.705850601196289, "logits/rejected": -2.6019129753112793, "logps/chosen": -284.8177795410156, "logps/rejected": -411.76708984375, "loss": 0.4159, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.37507936358451843, "rewards/margins": 1.5223026275634766, "rewards/margins_max": 1.767469048500061, "rewards/margins_min": 1.2771363258361816, "rewards/margins_std": 0.34671759605407715, "rewards/rejected": -1.1472233533859253, "step": 680 }, { "dpo_losses": 0.3486565351486206, "epoch": 1.94, "grad_norm": 138.4896910648948, "learning_rate": 1.6638086480134954e-06, "logits/chosen": -2.577733039855957, "logits/rejected": -2.557359218597412, "logps/chosen": -144.18289184570312, "logps/rejected": -205.9375762939453, "loss": 0.4276, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3236793875694275, "rewards/margins": 1.0879504680633545, "rewards/margins_max": 1.6922643184661865, "rewards/margins_min": 0.4836367070674896, "rewards/margins_std": 0.85462886095047, "rewards/rejected": -0.7642711400985718, "step": 690 }, { "dpo_losses": 0.24665436148643494, "epoch": 1.97, "grad_norm": 22.11936611709013, "learning_rate": 1.5870108593710473e-06, "logits/chosen": -2.422232151031494, "logits/rejected": -2.351428508758545, "logps/chosen": -301.96270751953125, "logps/rejected": -312.5522766113281, "loss": 0.3513, "positive_losses": 0.03521118313074112, "rewards/accuracies": 1.0, "rewards/chosen": 0.5125211477279663, "rewards/margins": 1.4875143766403198, "rewards/margins_max": 1.8352491855621338, "rewards/margins_min": 1.139779806137085, "rewards/margins_std": 0.4917708933353424, "rewards/rejected": -0.9749932289123535, "step": 700 }, { "epoch": 1.97, "eval_dpo_losses": 0.6398608684539795, "eval_logits/chosen": -2.631686210632324, "eval_logits/rejected": -2.5807785987854004, "eval_logps/chosen": -326.29583740234375, "eval_logps/rejected": -325.2173156738281, "eval_loss": 4.895449161529541, "eval_positive_losses": 45.593257904052734, "eval_rewards/accuracies": 0.6626983880996704, "eval_rewards/chosen": -0.41074639558792114, "eval_rewards/margins": 0.24960003793239594, "eval_rewards/margins_max": 0.9743701815605164, "eval_rewards/margins_min": -0.5254129767417908, "eval_rewards/margins_std": 0.6826153993606567, "eval_rewards/rejected": -0.6603464484214783, "eval_runtime": 284.0532, "eval_samples_per_second": 7.041, "eval_steps_per_second": 0.222, "step": 700 }, { "dpo_losses": 0.32716676592826843, "epoch": 2.0, "grad_norm": 24.97077759875969, "learning_rate": 1.511194808315853e-06, "logits/chosen": -2.5247268676757812, "logits/rejected": -2.486575126647949, "logps/chosen": -229.55859375, "logps/rejected": -268.9668273925781, "loss": 0.4163, "positive_losses": 0.5168693661689758, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.28237053751945496, "rewards/margins": 1.146689772605896, "rewards/margins_max": 1.594560146331787, "rewards/margins_min": 0.6988194584846497, "rewards/margins_std": 0.6333842873573303, "rewards/rejected": -0.8643192052841187, "step": 710 }, { "dpo_losses": 0.21982404589653015, "epoch": 2.03, "grad_norm": 1.38353688722549, "learning_rate": 1.4364420198778662e-06, "logits/chosen": -2.7155685424804688, "logits/rejected": -2.609267234802246, "logps/chosen": -343.7250061035156, "logps/rejected": -450.3816833496094, "loss": 0.3634, "positive_losses": 4.519556999206543, "rewards/accuracies": 1.0, "rewards/chosen": 0.456549733877182, "rewards/margins": 1.6569738388061523, "rewards/margins_max": 2.1707637310028076, "rewards/margins_min": 1.143183946609497, "rewards/margins_std": 0.7266086935997009, "rewards/rejected": -1.200424075126648, "step": 720 }, { "dpo_losses": 0.25513142347335815, "epoch": 2.06, "grad_norm": 3.318839287140489, "learning_rate": 1.3628328757603243e-06, "logits/chosen": -2.6959056854248047, "logits/rejected": -2.5843894481658936, "logps/chosen": -267.92010498046875, "logps/rejected": -357.7880554199219, "loss": 0.2684, "positive_losses": 0.038549043238162994, "rewards/accuracies": 1.0, "rewards/chosen": 0.31957200169563293, "rewards/margins": 1.4021821022033691, "rewards/margins_max": 1.6910970211029053, "rewards/margins_min": 1.113266944885254, "rewards/margins_std": 0.4085877537727356, "rewards/rejected": -1.082610011100769, "step": 730 }, { "dpo_losses": 0.30432650446891785, "epoch": 2.08, "grad_norm": 36.95329512381558, "learning_rate": 1.2904465279052725e-06, "logits/chosen": -2.634579658508301, "logits/rejected": -2.56650710105896, "logps/chosen": -284.7083740234375, "logps/rejected": -317.93389892578125, "loss": 0.4788, "positive_losses": 3.9446158409118652, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.22743673622608185, "rewards/margins": 1.2028728723526, "rewards/margins_max": 1.6675021648406982, "rewards/margins_min": 0.7382434606552124, "rewards/margins_std": 0.6570851802825928, "rewards/rejected": -0.9754360914230347, "step": 740 }, { "dpo_losses": 0.2779385447502136, "epoch": 2.11, "grad_norm": 5.125527517570657, "learning_rate": 1.219360813381446e-06, "logits/chosen": -2.462111234664917, "logits/rejected": -2.498530387878418, "logps/chosen": -159.8828887939453, "logps/rejected": -236.5124053955078, "loss": 0.2882, "positive_losses": 0.18086472153663635, "rewards/accuracies": 1.0, "rewards/chosen": 0.3097127676010132, "rewards/margins": 1.3083655834197998, "rewards/margins_max": 1.592053771018982, "rewards/margins_min": 1.0246771574020386, "rewards/margins_std": 0.4011960029602051, "rewards/rejected": -0.9986528158187866, "step": 750 }, { "dpo_losses": 0.22775745391845703, "epoch": 2.14, "grad_norm": 18.30294185818396, "learning_rate": 1.1496521706860392e-06, "logits/chosen": -2.651033401489258, "logits/rejected": -2.537503242492676, "logps/chosen": -291.1076354980469, "logps/rejected": -382.2750244140625, "loss": 0.3201, "positive_losses": 2.4156768321990967, "rewards/accuracies": 1.0, "rewards/chosen": 0.36460763216018677, "rewards/margins": 1.5596258640289307, "rewards/margins_max": 1.959398627281189, "rewards/margins_min": 1.1598527431488037, "rewards/margins_std": 0.5653643012046814, "rewards/rejected": -1.1950181722640991, "step": 760 }, { "dpo_losses": 0.21675769984722137, "epoch": 2.17, "grad_norm": 21.74451175593295, "learning_rate": 1.0813955575503588e-06, "logits/chosen": -2.604640483856201, "logits/rejected": -2.5890743732452393, "logps/chosen": -301.3707580566406, "logps/rejected": -381.50506591796875, "loss": 0.3818, "positive_losses": 0.353890985250473, "rewards/accuracies": 1.0, "rewards/chosen": 0.4118216633796692, "rewards/margins": 1.6504104137420654, "rewards/margins_max": 2.0086562633514404, "rewards/margins_min": 1.2921648025512695, "rewards/margins_std": 0.5066360235214233, "rewards/rejected": -1.2385889291763306, "step": 770 }, { "dpo_losses": 0.24907536804676056, "epoch": 2.2, "grad_norm": 5.9813942121541706, "learning_rate": 1.0146643703377488e-06, "logits/chosen": -2.734790325164795, "logits/rejected": -2.537445306777954, "logps/chosen": -292.0768127441406, "logps/rejected": -332.3907775878906, "loss": 0.4576, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5042055249214172, "rewards/margins": 1.5501843690872192, "rewards/margins_max": 2.1402950286865234, "rewards/margins_min": 0.9600737690925598, "rewards/margins_std": 0.8345423936843872, "rewards/rejected": -1.0459789037704468, "step": 780 }, { "dpo_losses": 0.20471492409706116, "epoch": 2.23, "grad_norm": 2.116074836272933, "learning_rate": 9.495303651204496e-07, "logits/chosen": -2.611013889312744, "logits/rejected": -2.5461339950561523, "logps/chosen": -319.31951904296875, "logps/rejected": -404.64886474609375, "loss": 0.4666, "positive_losses": 0.0, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3257637321949005, "rewards/margins": 1.60427725315094, "rewards/margins_max": 1.9402239322662354, "rewards/margins_min": 1.2683299779891968, "rewards/margins_std": 0.4751007556915283, "rewards/rejected": -1.2785133123397827, "step": 790 }, { "dpo_losses": 0.2155081331729889, "epoch": 2.25, "grad_norm": 12.840834237921664, "learning_rate": 8.860635805202616e-07, "logits/chosen": -2.615548610687256, "logits/rejected": -2.5271685123443604, "logps/chosen": -304.5693054199219, "logps/rejected": -362.62225341796875, "loss": 0.2795, "positive_losses": 0.01874256134033203, "rewards/accuracies": 1.0, "rewards/chosen": 0.4078141152858734, "rewards/margins": 1.58090341091156, "rewards/margins_max": 1.9335031509399414, "rewards/margins_min": 1.2283036708831787, "rewards/margins_std": 0.49865132570266724, "rewards/rejected": -1.1730893850326538, "step": 800 }, { "epoch": 2.25, "eval_dpo_losses": 0.6266348958015442, "eval_logits/chosen": -2.604722738265991, "eval_logits/rejected": -2.554541826248169, "eval_logps/chosen": -324.4103088378906, "eval_logps/rejected": -327.570556640625, "eval_loss": 4.769333839416504, "eval_positive_losses": 43.908966064453125, "eval_rewards/accuracies": 0.682539701461792, "eval_rewards/chosen": -0.3918909430503845, "eval_rewards/margins": 0.29198840260505676, "eval_rewards/margins_max": 1.0657094717025757, "eval_rewards/margins_min": -0.5265500545501709, "eval_rewards/margins_std": 0.7165747284889221, "eval_rewards/rejected": -0.6838793158531189, "eval_runtime": 284.6208, "eval_samples_per_second": 7.027, "eval_steps_per_second": 0.221, "step": 800 }, { "dpo_losses": 0.25243309140205383, "epoch": 2.28, "grad_norm": 71.28169182787225, "learning_rate": 8.24332262395994e-07, "logits/chosen": -2.6843011379241943, "logits/rejected": -2.6510274410247803, "logps/chosen": -252.87222290039062, "logps/rejected": -349.50506591796875, "loss": 0.3457, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3779425024986267, "rewards/margins": 1.4469493627548218, "rewards/margins_max": 2.0055932998657227, "rewards/margins_min": 0.8883053660392761, "rewards/margins_std": 0.7900420427322388, "rewards/rejected": -1.0690069198608398, "step": 810 }, { "dpo_losses": 0.25832101702690125, "epoch": 2.31, "grad_norm": 245.8246008336025, "learning_rate": 7.644027904586587e-07, "logits/chosen": -2.637300968170166, "logits/rejected": -2.5708765983581543, "logps/chosen": -227.47787475585938, "logps/rejected": -322.4635925292969, "loss": 0.5117, "positive_losses": 4.7760443687438965, "rewards/accuracies": 1.0, "rewards/chosen": 0.2738664150238037, "rewards/margins": 1.4220813512802124, "rewards/margins_max": 1.896773338317871, "rewards/margins_min": 0.9473894238471985, "rewards/margins_std": 0.6713159084320068, "rewards/rejected": -1.1482150554656982, "step": 820 }, { "dpo_losses": 0.16726334393024445, "epoch": 2.34, "grad_norm": 1.7593306703555782, "learning_rate": 7.06339606893347e-07, "logits/chosen": -2.6265785694122314, "logits/rejected": -2.5026650428771973, "logps/chosen": -399.26031494140625, "logps/rejected": -387.8680419921875, "loss": 0.2112, "positive_losses": 0.06428833305835724, "rewards/accuracies": 1.0, "rewards/chosen": 0.5203009247779846, "rewards/margins": 1.8566944599151611, "rewards/margins_max": 2.1375911235809326, "rewards/margins_min": 1.5757976770401, "rewards/margins_std": 0.3972480893135071, "rewards/rejected": -1.3363934755325317, "step": 830 }, { "dpo_losses": 0.2129584103822708, "epoch": 2.37, "grad_norm": 28.81303382097675, "learning_rate": 6.502051470645149e-07, "logits/chosen": -2.721235513687134, "logits/rejected": -2.5673904418945312, "logps/chosen": -341.94073486328125, "logps/rejected": -413.451171875, "loss": 0.3816, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.34823325276374817, "rewards/margins": 1.6308362483978271, "rewards/margins_max": 1.947928786277771, "rewards/margins_min": 1.313744068145752, "rewards/margins_std": 0.4484362006187439, "rewards/rejected": -1.282603144645691, "step": 840 }, { "dpo_losses": 0.204869344830513, "epoch": 2.39, "grad_norm": 1.8754374311724713, "learning_rate": 5.960597723792194e-07, "logits/chosen": -2.610276937484741, "logits/rejected": -2.4925060272216797, "logps/chosen": -280.25665283203125, "logps/rejected": -387.3306579589844, "loss": 0.429, "positive_losses": 4.123325824737549, "rewards/accuracies": 1.0, "rewards/chosen": 0.3918009400367737, "rewards/margins": 1.684704065322876, "rewards/margins_max": 2.1113224029541016, "rewards/margins_min": 1.2580856084823608, "rewards/margins_std": 0.6033294796943665, "rewards/rejected": -1.2929030656814575, "step": 850 }, { "dpo_losses": 0.18849320709705353, "epoch": 2.42, "grad_norm": 2.415129688011, "learning_rate": 5.43961705380465e-07, "logits/chosen": -2.5959548950195312, "logits/rejected": -2.5745034217834473, "logps/chosen": -274.2839660644531, "logps/rejected": -413.91650390625, "loss": 0.3573, "positive_losses": 0.0, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.4205331802368164, "rewards/margins": 1.8768619298934937, "rewards/margins_max": 2.31927490234375, "rewards/margins_min": 1.4344491958618164, "rewards/margins_std": 0.6256662011146545, "rewards/rejected": -1.4563289880752563, "step": 860 }, { "dpo_losses": 0.24235720932483673, "epoch": 2.45, "grad_norm": 2.200547137281921, "learning_rate": 4.939669671404871e-07, "logits/chosen": -2.5770421028137207, "logits/rejected": -2.521031618118286, "logps/chosen": -251.25564575195312, "logps/rejected": -441.2269592285156, "loss": 0.4093, "positive_losses": 5.246364116668701, "rewards/accuracies": 1.0, "rewards/chosen": 0.2414274662733078, "rewards/margins": 1.4953609704971313, "rewards/margins_max": 1.9941730499267578, "rewards/margins_min": 0.9965487718582153, "rewards/margins_std": 0.7054268717765808, "rewards/rejected": -1.2539334297180176, "step": 870 }, { "dpo_losses": 0.2292724847793579, "epoch": 2.48, "grad_norm": 33.76430360961392, "learning_rate": 4.461293170212644e-07, "logits/chosen": -2.6965794563293457, "logits/rejected": -2.543576717376709, "logps/chosen": -292.703125, "logps/rejected": -368.0157775878906, "loss": 0.3654, "positive_losses": 5.510960578918457, "rewards/accuracies": 1.0, "rewards/chosen": 0.29951637983322144, "rewards/margins": 1.5260313749313354, "rewards/margins_max": 1.979828119277954, "rewards/margins_min": 1.0722346305847168, "rewards/margins_std": 0.6417653560638428, "rewards/rejected": -1.2265150547027588, "step": 880 }, { "dpo_losses": 0.15858207643032074, "epoch": 2.51, "grad_norm": 5.727775081054632, "learning_rate": 4.005001948670606e-07, "logits/chosen": -2.694242238998413, "logits/rejected": -2.595343828201294, "logps/chosen": -382.9683532714844, "logps/rejected": -468.80157470703125, "loss": 0.463, "positive_losses": 0.11419792473316193, "rewards/accuracies": 1.0, "rewards/chosen": 0.5407828092575073, "rewards/margins": 1.968488097190857, "rewards/margins_max": 2.390479564666748, "rewards/margins_min": 1.546496868133545, "rewards/margins_std": 0.5967859029769897, "rewards/rejected": -1.4277052879333496, "step": 890 }, { "dpo_losses": 0.18081924319267273, "epoch": 2.54, "grad_norm": 6.1887239729076455, "learning_rate": 3.571286656911377e-07, "logits/chosen": -2.6035306453704834, "logits/rejected": -2.4794845581054688, "logps/chosen": -310.08013916015625, "logps/rejected": -408.18426513671875, "loss": 0.3544, "positive_losses": 2.6008810997009277, "rewards/accuracies": 1.0, "rewards/chosen": 0.4102245271205902, "rewards/margins": 1.8388206958770752, "rewards/margins_max": 2.2580726146698, "rewards/margins_min": 1.4195688962936401, "rewards/margins_std": 0.5929116606712341, "rewards/rejected": -1.4285962581634521, "step": 900 }, { "epoch": 2.54, "eval_dpo_losses": 0.6314364075660706, "eval_logits/chosen": -2.586303472518921, "eval_logits/rejected": -2.535871744155884, "eval_logps/chosen": -332.5704345703125, "eval_logps/rejected": -335.68133544921875, "eval_loss": 5.36396598815918, "eval_positive_losses": 51.33633804321289, "eval_rewards/accuracies": 0.670634925365448, "eval_rewards/chosen": -0.47349241375923157, "eval_rewards/margins": 0.29149433970451355, "eval_rewards/margins_max": 1.078196406364441, "eval_rewards/margins_min": -0.5344981551170349, "eval_rewards/margins_std": 0.72893226146698, "eval_rewards/rejected": -0.7649868130683899, "eval_runtime": 284.4452, "eval_samples_per_second": 7.031, "eval_steps_per_second": 0.221, "step": 900 }, { "dpo_losses": 0.20350190997123718, "epoch": 2.56, "grad_norm": 1.9747106750644823, "learning_rate": 3.1606136691612555e-07, "logits/chosen": -2.7836008071899414, "logits/rejected": -2.5904271602630615, "logps/chosen": -345.66265869140625, "logps/rejected": -354.3844299316406, "loss": 0.2637, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5323250889778137, "rewards/margins": 1.6922286748886108, "rewards/margins_max": 2.0907249450683594, "rewards/margins_min": 1.2937328815460205, "rewards/margins_std": 0.5635584592819214, "rewards/rejected": -1.159903883934021, "step": 910 }, { "dpo_losses": 0.22471606731414795, "epoch": 2.59, "grad_norm": 11.212944207381554, "learning_rate": 2.773424582247844e-07, "logits/chosen": -2.5793869495391846, "logits/rejected": -2.4063210487365723, "logps/chosen": -291.543701171875, "logps/rejected": -320.06353759765625, "loss": 0.6166, "positive_losses": 8.062161445617676, "rewards/accuracies": 1.0, "rewards/chosen": 0.38504648208618164, "rewards/margins": 1.6606292724609375, "rewards/margins_max": 2.1240899562835693, "rewards/margins_min": 1.1971690654754639, "rewards/margins_std": 0.655431866645813, "rewards/rejected": -1.275583028793335, "step": 920 }, { "dpo_losses": 0.20711331069469452, "epoch": 2.62, "grad_norm": 190.13690585667476, "learning_rate": 2.410135740750821e-07, "logits/chosen": -2.6692299842834473, "logits/rejected": -2.5721378326416016, "logps/chosen": -300.9726257324219, "logps/rejected": -399.5599365234375, "loss": 0.4929, "positive_losses": 3.012037754058838, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3912240266799927, "rewards/margins": 1.6907918453216553, "rewards/margins_max": 2.0557963848114014, "rewards/margins_min": 1.3257873058319092, "rewards/margins_std": 0.5161946415901184, "rewards/rejected": -1.2995678186416626, "step": 930 }, { "dpo_losses": 0.3051915466785431, "epoch": 2.65, "grad_norm": 189.05899790144875, "learning_rate": 2.0711377893064182e-07, "logits/chosen": -2.639585256576538, "logits/rejected": -2.488219738006592, "logps/chosen": -312.65863037109375, "logps/rejected": -305.5802917480469, "loss": 0.4531, "positive_losses": 3.7967441082000732, "rewards/accuracies": 1.0, "rewards/chosen": 0.2920045554637909, "rewards/margins": 1.2456369400024414, "rewards/margins_max": 1.7247259616851807, "rewards/margins_min": 0.7665479183197021, "rewards/margins_std": 0.6775342226028442, "rewards/rejected": -0.9536323547363281, "step": 940 }, { "dpo_losses": 0.2704788148403168, "epoch": 2.68, "grad_norm": 15.556722893889498, "learning_rate": 1.756795252547111e-07, "logits/chosen": -2.588268518447876, "logits/rejected": -2.501624584197998, "logps/chosen": -225.04928588867188, "logps/rejected": -314.3163757324219, "loss": 0.4599, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3465590476989746, "rewards/margins": 1.4794371128082275, "rewards/margins_max": 1.9303573369979858, "rewards/margins_min": 1.0285165309906006, "rewards/margins_std": 0.6376978158950806, "rewards/rejected": -1.1328779458999634, "step": 950 }, { "dpo_losses": 0.23120097815990448, "epoch": 2.7, "grad_norm": 3.6975387738343986, "learning_rate": 1.4674461431281013e-07, "logits/chosen": -2.7935328483581543, "logits/rejected": -2.7169508934020996, "logps/chosen": -246.69778442382812, "logps/rejected": -358.2559509277344, "loss": 0.3766, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3893265724182129, "rewards/margins": 1.5332249402999878, "rewards/margins_max": 2.008496046066284, "rewards/margins_min": 1.0579537153244019, "rewards/margins_std": 0.6721349954605103, "rewards/rejected": -1.143898367881775, "step": 960 }, { "dpo_losses": 0.18317696452140808, "epoch": 2.73, "grad_norm": 25.967042428441264, "learning_rate": 1.2034015982622243e-07, "logits/chosen": -2.68410587310791, "logits/rejected": -2.5668373107910156, "logps/chosen": -320.6241760253906, "logps/rejected": -454.39849853515625, "loss": 0.3194, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.4021673798561096, "rewards/margins": 1.9114774465560913, "rewards/margins_max": 2.451068878173828, "rewards/margins_min": 1.3718855381011963, "rewards/margins_std": 0.763097882270813, "rewards/rejected": -1.5093098878860474, "step": 970 }, { "dpo_losses": 0.2419268637895584, "epoch": 2.76, "grad_norm": 19.29396011638503, "learning_rate": 9.649455451539419e-08, "logits/chosen": -2.555974006652832, "logits/rejected": -2.4670310020446777, "logps/chosen": -218.39334106445312, "logps/rejected": -300.92254638671875, "loss": 0.4254, "positive_losses": 4.289657115936279, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.25288599729537964, "rewards/margins": 1.518112301826477, "rewards/margins_max": 1.9375699758529663, "rewards/margins_min": 1.098654866218567, "rewards/margins_std": 0.5932024717330933, "rewards/rejected": -1.2652263641357422, "step": 980 }, { "dpo_losses": 0.20426790416240692, "epoch": 2.79, "grad_norm": 145.7358684722982, "learning_rate": 7.523343956923196e-08, "logits/chosen": -2.7547340393066406, "logits/rejected": -2.6413354873657227, "logps/chosen": -303.62115478515625, "logps/rejected": -412.58782958984375, "loss": 0.406, "positive_losses": 1.1116502285003662, "rewards/accuracies": 1.0, "rewards/chosen": 0.4799574315547943, "rewards/margins": 1.7528730630874634, "rewards/margins_max": 2.2721505165100098, "rewards/margins_min": 1.2335954904556274, "rewards/margins_std": 0.734369158744812, "rewards/rejected": -1.2729156017303467, "step": 990 }, { "dpo_losses": 0.2937398850917816, "epoch": 2.82, "grad_norm": 147.9672419405728, "learning_rate": 5.657967707312195e-08, "logits/chosen": -2.519782543182373, "logits/rejected": -2.54045033454895, "logps/chosen": -236.8069610595703, "logps/rejected": -393.01373291015625, "loss": 0.545, "positive_losses": 6.547940254211426, "rewards/accuracies": 1.0, "rewards/chosen": 0.24184127151966095, "rewards/margins": 1.3283250331878662, "rewards/margins_max": 1.8528366088867188, "rewards/margins_min": 0.8038133382797241, "rewards/margins_std": 0.7417714595794678, "rewards/rejected": -1.0864837169647217, "step": 1000 }, { "epoch": 2.82, "eval_dpo_losses": 0.6312186121940613, "eval_logits/chosen": -2.5872504711151123, "eval_logits/rejected": -2.5366668701171875, "eval_logps/chosen": -330.9984436035156, "eval_logps/rejected": -333.99945068359375, "eval_loss": 5.222360134124756, "eval_positive_losses": 49.98057556152344, "eval_rewards/accuracies": 0.6626983880996704, "eval_rewards/chosen": -0.4577721953392029, "eval_rewards/margins": 0.29039543867111206, "eval_rewards/margins_max": 1.0717767477035522, "eval_rewards/margins_min": -0.533184289932251, "eval_rewards/margins_std": 0.724482536315918, "eval_rewards/rejected": -0.7481676340103149, "eval_runtime": 284.8086, "eval_samples_per_second": 7.022, "eval_steps_per_second": 0.221, "step": 1000 }, { "dpo_losses": 0.2392820119857788, "epoch": 2.85, "grad_norm": 87.50201169562474, "learning_rate": 4.055332542531959e-08, "logits/chosen": -2.7165019512176514, "logits/rejected": -2.6357262134552, "logps/chosen": -229.1401824951172, "logps/rejected": -371.04571533203125, "loss": 0.5645, "positive_losses": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.34021449089050293, "rewards/margins": 1.4760607481002808, "rewards/margins_max": 1.7028331756591797, "rewards/margins_min": 1.2492884397506714, "rewards/margins_std": 0.3207046389579773, "rewards/rejected": -1.1358463764190674, "step": 1010 }, { "dpo_losses": 0.2636774182319641, "epoch": 2.87, "grad_norm": 9.243316710391014, "learning_rate": 2.7171617768147472e-08, "logits/chosen": -2.5805556774139404, "logits/rejected": -2.4946963787078857, "logps/chosen": -200.70706176757812, "logps/rejected": -348.9754638671875, "loss": 0.4736, "positive_losses": 4.167427062988281, "rewards/accuracies": 1.0, "rewards/chosen": 0.226848766207695, "rewards/margins": 1.4039170742034912, "rewards/margins_max": 1.8958208560943604, "rewards/margins_min": 0.9120131731033325, "rewards/margins_std": 0.6956570148468018, "rewards/rejected": -1.1770681142807007, "step": 1020 }, { "dpo_losses": 0.26111191511154175, "epoch": 2.9, "grad_norm": 3.7450554356463743, "learning_rate": 1.6448943457189616e-08, "logits/chosen": -2.5760231018066406, "logits/rejected": -2.58748197555542, "logps/chosen": -268.255126953125, "logps/rejected": -387.044921875, "loss": 0.2968, "positive_losses": 1.4128901958465576, "rewards/accuracies": 1.0, "rewards/chosen": 0.34100010991096497, "rewards/margins": 1.478846788406372, "rewards/margins_max": 2.0194473266601562, "rewards/margins_min": 0.9382462501525879, "rewards/margins_std": 0.7645247578620911, "rewards/rejected": -1.1378467082977295, "step": 1030 }, { "dpo_losses": 0.24359698593616486, "epoch": 2.93, "grad_norm": 187.10263101103095, "learning_rate": 8.39683258841123e-09, "logits/chosen": -2.5231451988220215, "logits/rejected": -2.408517360687256, "logps/chosen": -264.57916259765625, "logps/rejected": -332.7992248535156, "loss": 0.3754, "positive_losses": 0.07207755744457245, "rewards/accuracies": 1.0, "rewards/chosen": 0.4693472981452942, "rewards/margins": 1.5425517559051514, "rewards/margins_max": 1.9166587591171265, "rewards/margins_min": 1.1684446334838867, "rewards/margins_std": 0.52906733751297, "rewards/rejected": -1.0732043981552124, "step": 1040 }, { "dpo_losses": 0.20071451365947723, "epoch": 2.96, "grad_norm": 3.6062297906425043, "learning_rate": 3.0239435998430376e-09, "logits/chosen": -2.645131826400757, "logits/rejected": -2.5101490020751953, "logps/chosen": -282.20855712890625, "logps/rejected": -383.17950439453125, "loss": 0.3373, "positive_losses": 1.7565370798110962, "rewards/accuracies": 1.0, "rewards/chosen": 0.41964656114578247, "rewards/margins": 1.6733496189117432, "rewards/margins_max": 2.073215961456299, "rewards/margins_min": 1.2734830379486084, "rewards/margins_std": 0.5654967427253723, "rewards/rejected": -1.253702998161316, "step": 1050 }, { "dpo_losses": 0.22902190685272217, "epoch": 2.99, "grad_norm": 3.308213249224383, "learning_rate": 3.3605396115826695e-10, "logits/chosen": -2.394101142883301, "logits/rejected": -2.4773335456848145, "logps/chosen": -165.21871948242188, "logps/rejected": -327.92352294921875, "loss": 0.3158, "positive_losses": 1.1246204376220703, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.30173832178115845, "rewards/margins": 1.4864323139190674, "rewards/margins_max": 1.822080373764038, "rewards/margins_min": 1.150783896446228, "rewards/margins_std": 0.4746781885623932, "rewards/rejected": -1.1846938133239746, "step": 1060 }, { "epoch": 3.0, "step": 1065, "total_flos": 0.0, "train_loss": 0.48024289137880566, "train_runtime": 8933.1726, "train_samples_per_second": 1.907, "train_steps_per_second": 0.119 } ], "logging_steps": 10, "max_steps": 1065, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }