diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16648 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100000, + "global_step": 11868, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.212299915754001e-10, + "logits/chosen": -3.166908025741577, + "logits/rejected": -3.3487741947174072, + "logps/chosen": -546.013916015625, + "logps/rejected": -472.92132568359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 4.212299915754001e-09, + "logits/chosen": -2.9773247241973877, + "logits/rejected": -2.852890968322754, + "logps/chosen": -277.7378234863281, + "logps/rejected": -267.5666198730469, + "loss": 0.6964, + "rewards/accuracies": 0.4444444477558136, + "rewards/chosen": 0.0006573781720362604, + "rewards/margins": -0.007902717217803001, + "rewards/rejected": 0.008560094982385635, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 8.424599831508002e-09, + "logits/chosen": -2.807126998901367, + "logits/rejected": -2.7919721603393555, + "logps/chosen": -245.955078125, + "logps/rejected": -225.7766571044922, + "loss": 0.692, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.01841505989432335, + "rewards/margins": 0.023902256041765213, + "rewards/rejected": -0.0054871938191354275, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 1.2636899747262005e-08, + "logits/chosen": -2.989205837249756, + "logits/rejected": -2.9802839756011963, + "logps/chosen": -299.1269836425781, + "logps/rejected": -249.7318572998047, + "loss": 0.6929, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.002239107619971037, + "rewards/margins": 0.006709246896207333, + "rewards/rejected": -0.004470138344913721, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 1.6849199663016004e-08, + "logits/chosen": -2.9071033000946045, + "logits/rejected": -2.8376030921936035, + "logps/chosen": -300.695556640625, + "logps/rejected": -234.8794708251953, + "loss": 0.6956, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.004403972998261452, + "rewards/margins": 0.006278342567384243, + "rewards/rejected": -0.0018743708496913314, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 2.106149957877001e-08, + "logits/chosen": -2.7081246376037598, + "logits/rejected": -2.683647871017456, + "logps/chosen": -249.2618408203125, + "logps/rejected": -249.1317901611328, + "loss": 0.6885, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.019329214468598366, + "rewards/margins": 0.01521158218383789, + "rewards/rejected": 0.0041176313534379005, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 2.527379949452401e-08, + "logits/chosen": -2.8151774406433105, + "logits/rejected": -2.8212380409240723, + "logps/chosen": -186.51699829101562, + "logps/rejected": -238.936279296875, + "loss": 0.6903, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0006278610089793801, + "rewards/margins": 0.010208692401647568, + "rewards/rejected": -0.010836553759872913, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 2.948609941027801e-08, + "logits/chosen": -2.926729440689087, + "logits/rejected": -2.8498129844665527, + "logps/chosen": -298.0478210449219, + "logps/rejected": -214.31356811523438, + "loss": 0.6899, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": 0.015186784788966179, + "rewards/margins": 0.006457159761339426, + "rewards/rejected": 0.008729624561965466, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 3.369839932603201e-08, + "logits/chosen": -3.0162465572357178, + "logits/rejected": -2.957993507385254, + "logps/chosen": -293.937255859375, + "logps/rejected": -307.1500549316406, + "loss": 0.6906, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.015594196505844593, + "rewards/margins": 0.021771308034658432, + "rewards/rejected": -0.006177111063152552, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 3.791069924178601e-08, + "logits/chosen": -2.7287724018096924, + "logits/rejected": -2.6703319549560547, + "logps/chosen": -355.0783386230469, + "logps/rejected": -228.819091796875, + "loss": 0.687, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.013224788010120392, + "rewards/margins": 0.025851404294371605, + "rewards/rejected": -0.012626620009541512, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 4.212299915754002e-08, + "logits/chosen": -2.8378939628601074, + "logits/rejected": -2.860485792160034, + "logps/chosen": -337.00531005859375, + "logps/rejected": -187.0914306640625, + "loss": 0.6873, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.012190933339297771, + "rewards/margins": 0.007645601872354746, + "rewards/rejected": 0.004545331001281738, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 4.6335299073294016e-08, + "logits/chosen": -2.791350841522217, + "logits/rejected": -2.7435081005096436, + "logps/chosen": -152.36270141601562, + "logps/rejected": -189.67120361328125, + "loss": 0.6811, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0046439082361757755, + "rewards/margins": 0.01751074194908142, + "rewards/rejected": -0.012866830453276634, + "step": 110 + }, + { + "epoch": 0.03, + "learning_rate": 5.054759898904802e-08, + "logits/chosen": -2.9045522212982178, + "logits/rejected": -2.8898544311523438, + "logps/chosen": -241.04635620117188, + "logps/rejected": -231.09036254882812, + "loss": 0.681, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.030125772580504417, + "rewards/margins": 0.03214115649461746, + "rewards/rejected": -0.0020153801888227463, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 5.475989890480202e-08, + "logits/chosen": -2.863396167755127, + "logits/rejected": -2.875047206878662, + "logps/chosen": -295.2554626464844, + "logps/rejected": -316.55908203125, + "loss": 0.6819, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.06428538262844086, + "rewards/margins": 0.02827184833586216, + "rewards/rejected": 0.03601354733109474, + "step": 130 + }, + { + "epoch": 0.04, + "learning_rate": 5.897219882055602e-08, + "logits/chosen": -2.800774097442627, + "logits/rejected": -2.7655835151672363, + "logps/chosen": -227.52932739257812, + "logps/rejected": -209.88265991210938, + "loss": 0.6768, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.055633824318647385, + "rewards/margins": 0.0319707877933979, + "rewards/rejected": 0.023663034662604332, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 6.318449873631002e-08, + "logits/chosen": -2.7800867557525635, + "logits/rejected": -2.8257360458374023, + "logps/chosen": -201.40185546875, + "logps/rejected": -236.130859375, + "loss": 0.6778, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06176251918077469, + "rewards/margins": 0.037913981825113297, + "rewards/rejected": 0.023848531767725945, + "step": 150 + }, + { + "epoch": 0.04, + "learning_rate": 6.739679865206401e-08, + "logits/chosen": -2.9548959732055664, + "logits/rejected": -2.835465669631958, + "logps/chosen": -235.51101684570312, + "logps/rejected": -188.86111450195312, + "loss": 0.6604, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05679447576403618, + "rewards/margins": 0.043836116790771484, + "rewards/rejected": 0.012958364561200142, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 7.160909856781803e-08, + "logits/chosen": -2.813509464263916, + "logits/rejected": -2.895786762237549, + "logps/chosen": -317.48004150390625, + "logps/rejected": -302.57806396484375, + "loss": 0.6769, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.09258848428726196, + "rewards/margins": 0.06396313011646271, + "rewards/rejected": 0.02862536534667015, + "step": 170 + }, + { + "epoch": 0.05, + "learning_rate": 7.582139848357203e-08, + "logits/chosen": -2.7334518432617188, + "logits/rejected": -2.749671697616577, + "logps/chosen": -183.47982788085938, + "logps/rejected": -292.4325866699219, + "loss": 0.6528, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.019605696201324463, + "rewards/margins": 0.04809585586190224, + "rewards/rejected": -0.028490161523222923, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 8.003369839932602e-08, + "logits/chosen": -2.8567605018615723, + "logits/rejected": -2.8261303901672363, + "logps/chosen": -310.6612854003906, + "logps/rejected": -254.41552734375, + "loss": 0.6434, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.11722777038812637, + "rewards/margins": 0.0635734349489212, + "rewards/rejected": 0.05365434288978577, + "step": 190 + }, + { + "epoch": 0.05, + "learning_rate": 8.424599831508004e-08, + "logits/chosen": -2.7087759971618652, + "logits/rejected": -2.7274348735809326, + "logps/chosen": -306.12469482421875, + "logps/rejected": -233.0144500732422, + "loss": 0.6536, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.042596496641635895, + "rewards/margins": 0.04861391335725784, + "rewards/rejected": -0.00601741811260581, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 8.845829823083403e-08, + "logits/chosen": -2.6913976669311523, + "logits/rejected": -2.7625555992126465, + "logps/chosen": -195.1536865234375, + "logps/rejected": -288.43914794921875, + "loss": 0.6492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.08051207661628723, + "rewards/margins": 0.15957090258598328, + "rewards/rejected": -0.07905881106853485, + "step": 210 + }, + { + "epoch": 0.06, + "learning_rate": 9.267059814658803e-08, + "logits/chosen": -2.8155956268310547, + "logits/rejected": -2.858513355255127, + "logps/chosen": -269.7468566894531, + "logps/rejected": -209.1760711669922, + "loss": 0.6394, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.09490491449832916, + "rewards/margins": 0.09294173866510391, + "rewards/rejected": 0.0019631728064268827, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 9.688289806234204e-08, + "logits/chosen": -2.7522032260894775, + "logits/rejected": -2.6138253211975098, + "logps/chosen": -223.89501953125, + "logps/rejected": -219.81045532226562, + "loss": 0.6445, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.024391641840338707, + "rewards/margins": 0.02039915695786476, + "rewards/rejected": -0.04479080066084862, + "step": 230 + }, + { + "epoch": 0.06, + "learning_rate": 1.0109519797809604e-07, + "logits/chosen": -2.892430543899536, + "logits/rejected": -2.9211509227752686, + "logps/chosen": -262.0943908691406, + "logps/rejected": -300.74822998046875, + "loss": 0.64, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1830977201461792, + "rewards/margins": 0.11756487190723419, + "rewards/rejected": 0.0655328631401062, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 1.0530749789385003e-07, + "logits/chosen": -2.816969633102417, + "logits/rejected": -2.772429943084717, + "logps/chosen": -212.73501586914062, + "logps/rejected": -219.41799926757812, + "loss": 0.6396, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05148952081799507, + "rewards/margins": 0.03601338341832161, + "rewards/rejected": 0.015476112253963947, + "step": 250 + }, + { + "epoch": 0.07, + "learning_rate": 1.0951979780960404e-07, + "logits/chosen": -2.801922082901001, + "logits/rejected": -2.7320210933685303, + "logps/chosen": -311.57659912109375, + "logps/rejected": -213.0792694091797, + "loss": 0.6479, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.11519031226634979, + "rewards/margins": 0.20116600394248962, + "rewards/rejected": -0.08597570657730103, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 1.1373209772535804e-07, + "logits/chosen": -2.9025025367736816, + "logits/rejected": -2.8095669746398926, + "logps/chosen": -204.12356567382812, + "logps/rejected": -184.1841278076172, + "loss": 0.6558, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.18039628863334656, + "rewards/margins": 0.17073126137256622, + "rewards/rejected": 0.009665054269134998, + "step": 270 + }, + { + "epoch": 0.07, + "learning_rate": 1.1794439764111204e-07, + "logits/chosen": -2.873542308807373, + "logits/rejected": -2.6858971118927, + "logps/chosen": -302.70599365234375, + "logps/rejected": -284.6993103027344, + "loss": 0.6326, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.10259075462818146, + "rewards/margins": 0.242101788520813, + "rewards/rejected": -0.13951101899147034, + "step": 280 + }, + { + "epoch": 0.07, + "learning_rate": 1.2215669755686605e-07, + "logits/chosen": -2.8171753883361816, + "logits/rejected": -2.789837598800659, + "logps/chosen": -176.04864501953125, + "logps/rejected": -190.3517303466797, + "loss": 0.6455, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07305292040109634, + "rewards/margins": 0.2632436156272888, + "rewards/rejected": -0.19019068777561188, + "step": 290 + }, + { + "epoch": 0.08, + "learning_rate": 1.2636899747262003e-07, + "logits/chosen": -2.90800404548645, + "logits/rejected": -2.73028302192688, + "logps/chosen": -283.43798828125, + "logps/rejected": -231.88259887695312, + "loss": 0.5735, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.24098046123981476, + "rewards/margins": 0.5552220344543457, + "rewards/rejected": -0.31424158811569214, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 1.3058129738837404e-07, + "logits/chosen": -2.6579442024230957, + "logits/rejected": -2.7264251708984375, + "logps/chosen": -171.2255096435547, + "logps/rejected": -202.13211059570312, + "loss": 0.5957, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.09882710874080658, + "rewards/margins": 0.27224260568618774, + "rewards/rejected": -0.17341549694538116, + "step": 310 + }, + { + "epoch": 0.08, + "learning_rate": 1.3479359730412803e-07, + "logits/chosen": -2.829624891281128, + "logits/rejected": -2.885951519012451, + "logps/chosen": -294.4091796875, + "logps/rejected": -299.72235107421875, + "loss": 0.6045, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.20352359116077423, + "rewards/margins": 0.23111791908740997, + "rewards/rejected": -0.027594303712248802, + "step": 320 + }, + { + "epoch": 0.08, + "learning_rate": 1.3900589721988204e-07, + "logits/chosen": -2.950706958770752, + "logits/rejected": -2.893571138381958, + "logps/chosen": -417.59442138671875, + "logps/rejected": -337.65692138671875, + "loss": 0.6005, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.16790476441383362, + "rewards/margins": 0.23245680332183838, + "rewards/rejected": -0.06455201655626297, + "step": 330 + }, + { + "epoch": 0.09, + "learning_rate": 1.4321819713563605e-07, + "logits/chosen": -2.7175984382629395, + "logits/rejected": -2.662710189819336, + "logps/chosen": -268.44659423828125, + "logps/rejected": -274.83990478515625, + "loss": 0.6299, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.08806786686182022, + "rewards/margins": 0.3283998966217041, + "rewards/rejected": -0.24033205211162567, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 1.4743049705139004e-07, + "logits/chosen": -2.8463797569274902, + "logits/rejected": -2.8426125049591064, + "logps/chosen": -237.9515838623047, + "logps/rejected": -267.5751037597656, + "loss": 0.6076, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.33398309350013733, + "rewards/margins": 0.43801507353782654, + "rewards/rejected": -0.10403194278478622, + "step": 350 + }, + { + "epoch": 0.09, + "learning_rate": 1.5164279696714405e-07, + "logits/chosen": -2.76839542388916, + "logits/rejected": -2.7172908782958984, + "logps/chosen": -237.4521942138672, + "logps/rejected": -197.22958374023438, + "loss": 0.6356, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.15721561014652252, + "rewards/margins": 0.23134712874889374, + "rewards/rejected": -0.07413151115179062, + "step": 360 + }, + { + "epoch": 0.09, + "learning_rate": 1.5585509688289806e-07, + "logits/chosen": -2.804783821105957, + "logits/rejected": -2.7225444316864014, + "logps/chosen": -200.27352905273438, + "logps/rejected": -182.17044067382812, + "loss": 0.6186, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11609198898077011, + "rewards/margins": 0.4650752544403076, + "rewards/rejected": -0.3489832282066345, + "step": 370 + }, + { + "epoch": 0.1, + "learning_rate": 1.6006739679865205e-07, + "logits/chosen": -2.867514133453369, + "logits/rejected": -2.7255942821502686, + "logps/chosen": -240.0352020263672, + "logps/rejected": -234.1785125732422, + "loss": 0.5953, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.3244686722755432, + "rewards/margins": 0.2539460361003876, + "rewards/rejected": 0.07052260637283325, + "step": 380 + }, + { + "epoch": 0.1, + "learning_rate": 1.6427969671440606e-07, + "logits/chosen": -2.634826183319092, + "logits/rejected": -2.6665759086608887, + "logps/chosen": -243.5812530517578, + "logps/rejected": -324.9338684082031, + "loss": 0.6195, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01270450372248888, + "rewards/margins": 0.18538111448287964, + "rewards/rejected": -0.17267660796642303, + "step": 390 + }, + { + "epoch": 0.1, + "learning_rate": 1.6849199663016007e-07, + "logits/chosen": -2.812776565551758, + "logits/rejected": -2.7242472171783447, + "logps/chosen": -211.7786102294922, + "logps/rejected": -196.15328979492188, + "loss": 0.6083, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.20752961933612823, + "rewards/margins": 0.26783204078674316, + "rewards/rejected": -0.06030241772532463, + "step": 400 + }, + { + "epoch": 0.1, + "learning_rate": 1.7270429654591406e-07, + "logits/chosen": -2.7651867866516113, + "logits/rejected": -2.794961452484131, + "logps/chosen": -200.4475555419922, + "logps/rejected": -193.27154541015625, + "loss": 0.6431, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.17628836631774902, + "rewards/margins": 0.17924582958221436, + "rewards/rejected": -0.0029574513901025057, + "step": 410 + }, + { + "epoch": 0.11, + "learning_rate": 1.7691659646166807e-07, + "logits/chosen": -3.0512638092041016, + "logits/rejected": -2.8745832443237305, + "logps/chosen": -388.82977294921875, + "logps/rejected": -230.249755859375, + "loss": 0.5436, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.42595309019088745, + "rewards/margins": 0.3382980525493622, + "rewards/rejected": 0.08765505254268646, + "step": 420 + }, + { + "epoch": 0.11, + "learning_rate": 1.8112889637742208e-07, + "logits/chosen": -2.8399643898010254, + "logits/rejected": -2.7200350761413574, + "logps/chosen": -215.1141815185547, + "logps/rejected": -203.30105590820312, + "loss": 0.609, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.2471599280834198, + "rewards/margins": 0.24380986392498016, + "rewards/rejected": 0.0033500641584396362, + "step": 430 + }, + { + "epoch": 0.11, + "learning_rate": 1.8534119629317606e-07, + "logits/chosen": -2.8783884048461914, + "logits/rejected": -2.8517260551452637, + "logps/chosen": -260.9423522949219, + "logps/rejected": -225.1972198486328, + "loss": 0.6453, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18088439106941223, + "rewards/margins": 0.19021722674369812, + "rewards/rejected": -0.00933288224041462, + "step": 440 + }, + { + "epoch": 0.11, + "learning_rate": 1.8955349620893008e-07, + "logits/chosen": -2.776001214981079, + "logits/rejected": -2.8345043659210205, + "logps/chosen": -240.72265625, + "logps/rejected": -260.09356689453125, + "loss": 0.572, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.009611167944967747, + "rewards/margins": 0.2188769280910492, + "rewards/rejected": -0.20926575362682343, + "step": 450 + }, + { + "epoch": 0.12, + "learning_rate": 1.937657961246841e-07, + "logits/chosen": -2.6703200340270996, + "logits/rejected": -2.78947114944458, + "logps/chosen": -145.38352966308594, + "logps/rejected": -203.1894989013672, + "loss": 0.6171, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.023900752887129784, + "rewards/margins": 0.18232859671115875, + "rewards/rejected": -0.2062292993068695, + "step": 460 + }, + { + "epoch": 0.12, + "learning_rate": 1.9797809604043807e-07, + "logits/chosen": -2.8545408248901367, + "logits/rejected": -2.7196555137634277, + "logps/chosen": -252.4641571044922, + "logps/rejected": -168.0990753173828, + "loss": 0.5999, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4809208810329437, + "rewards/margins": 0.6154331564903259, + "rewards/rejected": -0.1345122754573822, + "step": 470 + }, + { + "epoch": 0.12, + "learning_rate": 2.0219039595619208e-07, + "logits/chosen": -2.9020743370056152, + "logits/rejected": -2.805799961090088, + "logps/chosen": -278.7673034667969, + "logps/rejected": -270.74188232421875, + "loss": 0.5801, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.48040515184402466, + "rewards/margins": 0.5795575976371765, + "rewards/rejected": -0.09915249049663544, + "step": 480 + }, + { + "epoch": 0.12, + "learning_rate": 2.064026958719461e-07, + "logits/chosen": -2.7485694885253906, + "logits/rejected": -2.668761730194092, + "logps/chosen": -173.67547607421875, + "logps/rejected": -155.75164794921875, + "loss": 0.5928, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.32208365201950073, + "rewards/margins": 0.7304231524467468, + "rewards/rejected": -0.4083394408226013, + "step": 490 + }, + { + "epoch": 0.13, + "learning_rate": 2.1061499578770005e-07, + "logits/chosen": -2.8859145641326904, + "logits/rejected": -2.747610569000244, + "logps/chosen": -401.01629638671875, + "logps/rejected": -255.532470703125, + "loss": 0.5404, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06550431251525879, + "rewards/margins": 0.473673015832901, + "rewards/rejected": -0.40816861391067505, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 2.1482729570345407e-07, + "logits/chosen": -2.8993895053863525, + "logits/rejected": -2.767453193664551, + "logps/chosen": -269.9494323730469, + "logps/rejected": -317.01702880859375, + "loss": 0.5515, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.41263261437416077, + "rewards/margins": 0.5958629250526428, + "rewards/rejected": -0.18323031067848206, + "step": 510 + }, + { + "epoch": 0.13, + "learning_rate": 2.1903959561920808e-07, + "logits/chosen": -2.8467533588409424, + "logits/rejected": -2.801426410675049, + "logps/chosen": -204.9807586669922, + "logps/rejected": -192.07164001464844, + "loss": 0.5514, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.3908511698246002, + "rewards/margins": 0.37529826164245605, + "rewards/rejected": 0.015552910976111889, + "step": 520 + }, + { + "epoch": 0.13, + "learning_rate": 2.2325189553496206e-07, + "logits/chosen": -2.7452545166015625, + "logits/rejected": -2.702454090118408, + "logps/chosen": -204.1117401123047, + "logps/rejected": -267.8322448730469, + "loss": 0.5728, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.40074747800827026, + "rewards/margins": 0.31568995118141174, + "rewards/rejected": 0.0850575715303421, + "step": 530 + }, + { + "epoch": 0.14, + "learning_rate": 2.2746419545071608e-07, + "logits/chosen": -2.7847249507904053, + "logits/rejected": -2.720637083053589, + "logps/chosen": -281.24444580078125, + "logps/rejected": -255.8726043701172, + "loss": 0.6399, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.10813522338867188, + "rewards/margins": 0.3281122148036957, + "rewards/rejected": -0.2199770212173462, + "step": 540 + }, + { + "epoch": 0.14, + "learning_rate": 2.316764953664701e-07, + "logits/chosen": -2.8527252674102783, + "logits/rejected": -2.836733818054199, + "logps/chosen": -235.00033569335938, + "logps/rejected": -273.7145080566406, + "loss": 0.5523, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.5961872935295105, + "rewards/margins": 0.9351669549942017, + "rewards/rejected": -0.33897966146469116, + "step": 550 + }, + { + "epoch": 0.14, + "learning_rate": 2.3588879528222407e-07, + "logits/chosen": -2.7602150440216064, + "logits/rejected": -2.620371103286743, + "logps/chosen": -293.63043212890625, + "logps/rejected": -320.56707763671875, + "loss": 0.5559, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.47802114486694336, + "rewards/margins": 0.22451654076576233, + "rewards/rejected": 0.2535046637058258, + "step": 560 + }, + { + "epoch": 0.14, + "learning_rate": 2.4010109519797806e-07, + "logits/chosen": -2.7937815189361572, + "logits/rejected": -2.7583813667297363, + "logps/chosen": -248.607666015625, + "logps/rejected": -213.4592742919922, + "loss": 0.5489, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4767304062843323, + "rewards/margins": 0.6132498383522034, + "rewards/rejected": -0.13651947677135468, + "step": 570 + }, + { + "epoch": 0.15, + "learning_rate": 2.443133951137321e-07, + "logits/chosen": -2.705596685409546, + "logits/rejected": -2.764143705368042, + "logps/chosen": -183.74258422851562, + "logps/rejected": -181.19253540039062, + "loss": 0.5128, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.038012512028217316, + "rewards/margins": 0.4396725594997406, + "rewards/rejected": -0.40166011452674866, + "step": 580 + }, + { + "epoch": 0.15, + "learning_rate": 2.485256950294861e-07, + "logits/chosen": -2.7596192359924316, + "logits/rejected": -2.7716755867004395, + "logps/chosen": -170.6914520263672, + "logps/rejected": -194.93226623535156, + "loss": 0.4832, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.023831719532608986, + "rewards/margins": 0.9237872362136841, + "rewards/rejected": -0.8999554514884949, + "step": 590 + }, + { + "epoch": 0.15, + "learning_rate": 2.5273799494524007e-07, + "logits/chosen": -2.7988181114196777, + "logits/rejected": -2.784140110015869, + "logps/chosen": -239.1433868408203, + "logps/rejected": -284.5286865234375, + "loss": 0.5607, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.14084307849407196, + "rewards/margins": 0.25364288687705994, + "rewards/rejected": -0.3944859504699707, + "step": 600 + }, + { + "epoch": 0.15, + "learning_rate": 2.5695029486099405e-07, + "logits/chosen": -2.7752766609191895, + "logits/rejected": -2.7815215587615967, + "logps/chosen": -264.5174560546875, + "logps/rejected": -266.3208923339844, + "loss": 0.5931, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06395702064037323, + "rewards/margins": 0.6657644510269165, + "rewards/rejected": -0.6018074154853821, + "step": 610 + }, + { + "epoch": 0.16, + "learning_rate": 2.611625947767481e-07, + "logits/chosen": -2.7037911415100098, + "logits/rejected": -2.7605366706848145, + "logps/chosen": -243.78945922851562, + "logps/rejected": -243.4246368408203, + "loss": 0.573, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.20454922318458557, + "rewards/margins": 0.645856499671936, + "rewards/rejected": -0.4413072466850281, + "step": 620 + }, + { + "epoch": 0.16, + "learning_rate": 2.653748946925021e-07, + "logits/chosen": -2.7574284076690674, + "logits/rejected": -2.7437338829040527, + "logps/chosen": -228.0672607421875, + "logps/rejected": -259.17694091796875, + "loss": 0.5244, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.3170861601829529, + "rewards/margins": 0.26493725180625916, + "rewards/rejected": 0.05214894562959671, + "step": 630 + }, + { + "epoch": 0.16, + "learning_rate": 2.6958719460825606e-07, + "logits/chosen": -2.7019240856170654, + "logits/rejected": -2.7028634548187256, + "logps/chosen": -254.992431640625, + "logps/rejected": -231.47348022460938, + "loss": 0.6187, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4512414038181305, + "rewards/margins": 0.4618772864341736, + "rewards/rejected": -0.010635855607688427, + "step": 640 + }, + { + "epoch": 0.16, + "learning_rate": 2.737994945240101e-07, + "logits/chosen": -2.835756301879883, + "logits/rejected": -2.7334399223327637, + "logps/chosen": -396.8888854980469, + "logps/rejected": -331.6492614746094, + "loss": 0.6095, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5410932302474976, + "rewards/margins": 0.8466060757637024, + "rewards/rejected": -0.305512934923172, + "step": 650 + }, + { + "epoch": 0.17, + "learning_rate": 2.780117944397641e-07, + "logits/chosen": -2.6983981132507324, + "logits/rejected": -2.595994234085083, + "logps/chosen": -259.11248779296875, + "logps/rejected": -187.53634643554688, + "loss": 0.6914, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.18554158508777618, + "rewards/margins": 0.41711145639419556, + "rewards/rejected": -0.23156991600990295, + "step": 660 + }, + { + "epoch": 0.17, + "learning_rate": 2.8222409435551807e-07, + "logits/chosen": -2.6740572452545166, + "logits/rejected": -2.573707103729248, + "logps/chosen": -223.1436767578125, + "logps/rejected": -239.71603393554688, + "loss": 0.5766, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.2260672152042389, + "rewards/margins": 0.28128767013549805, + "rewards/rejected": -0.05522041767835617, + "step": 670 + }, + { + "epoch": 0.17, + "learning_rate": 2.864363942712721e-07, + "logits/chosen": -2.5910723209381104, + "logits/rejected": -2.700523614883423, + "logps/chosen": -147.84005737304688, + "logps/rejected": -224.4444580078125, + "loss": 0.6463, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4095599055290222, + "rewards/margins": 0.5575370788574219, + "rewards/rejected": -0.9670969843864441, + "step": 680 + }, + { + "epoch": 0.17, + "learning_rate": 2.906486941870261e-07, + "logits/chosen": -2.8283638954162598, + "logits/rejected": -2.7460625171661377, + "logps/chosen": -283.7747802734375, + "logps/rejected": -201.9105224609375, + "loss": 0.5628, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2565240263938904, + "rewards/margins": 0.5982394814491272, + "rewards/rejected": -0.8547635078430176, + "step": 690 + }, + { + "epoch": 0.18, + "learning_rate": 2.948609941027801e-07, + "logits/chosen": -2.8705124855041504, + "logits/rejected": -2.8632733821868896, + "logps/chosen": -291.81622314453125, + "logps/rejected": -266.4309997558594, + "loss": 0.5825, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.031624965369701385, + "rewards/margins": 0.8366470336914062, + "rewards/rejected": -0.8682720065116882, + "step": 700 + }, + { + "epoch": 0.18, + "learning_rate": 2.990732940185341e-07, + "logits/chosen": -2.5919554233551025, + "logits/rejected": -2.614384174346924, + "logps/chosen": -242.84140014648438, + "logps/rejected": -381.2159423828125, + "loss": 0.5726, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7084521055221558, + "rewards/margins": 0.08272404968738556, + "rewards/rejected": -0.7911761999130249, + "step": 710 + }, + { + "epoch": 0.18, + "learning_rate": 3.032855939342881e-07, + "logits/chosen": -2.7366249561309814, + "logits/rejected": -2.763061761856079, + "logps/chosen": -290.13043212890625, + "logps/rejected": -257.94073486328125, + "loss": 0.5624, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.22357621788978577, + "rewards/margins": 0.9331549406051636, + "rewards/rejected": -1.156731128692627, + "step": 720 + }, + { + "epoch": 0.18, + "learning_rate": 3.074978938500421e-07, + "logits/chosen": -2.858980655670166, + "logits/rejected": -2.879626512527466, + "logps/chosen": -260.36572265625, + "logps/rejected": -331.29986572265625, + "loss": 0.5597, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.024600837379693985, + "rewards/margins": 0.9509462118148804, + "rewards/rejected": -0.9755471348762512, + "step": 730 + }, + { + "epoch": 0.19, + "learning_rate": 3.117101937657961e-07, + "logits/chosen": -2.755232572555542, + "logits/rejected": -2.788558006286621, + "logps/chosen": -271.6325378417969, + "logps/rejected": -268.0082702636719, + "loss": 0.55, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.017580661922693253, + "rewards/margins": 0.9975998997688293, + "rewards/rejected": -1.0151805877685547, + "step": 740 + }, + { + "epoch": 0.19, + "learning_rate": 3.159224936815501e-07, + "logits/chosen": -2.7869515419006348, + "logits/rejected": -2.7380776405334473, + "logps/chosen": -252.6619110107422, + "logps/rejected": -179.35830688476562, + "loss": 0.5059, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.08400382101535797, + "rewards/margins": 0.8033849596977234, + "rewards/rejected": -0.7193810343742371, + "step": 750 + }, + { + "epoch": 0.19, + "learning_rate": 3.201347935973041e-07, + "logits/chosen": -2.7246577739715576, + "logits/rejected": -2.6718568801879883, + "logps/chosen": -303.58880615234375, + "logps/rejected": -240.606201171875, + "loss": 0.6173, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2395877093076706, + "rewards/margins": 0.45436787605285645, + "rewards/rejected": -0.21478009223937988, + "step": 760 + }, + { + "epoch": 0.19, + "learning_rate": 3.2434709351305813e-07, + "logits/chosen": -2.788994550704956, + "logits/rejected": -2.5727732181549072, + "logps/chosen": -313.975341796875, + "logps/rejected": -212.37728881835938, + "loss": 0.5013, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.17403548955917358, + "rewards/margins": 0.97325199842453, + "rewards/rejected": -0.799216628074646, + "step": 770 + }, + { + "epoch": 0.2, + "learning_rate": 3.285593934288121e-07, + "logits/chosen": -2.840376615524292, + "logits/rejected": -2.8498775959014893, + "logps/chosen": -203.07363891601562, + "logps/rejected": -230.2865753173828, + "loss": 0.6018, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06935430318117142, + "rewards/margins": 0.49717801809310913, + "rewards/rejected": -0.5665322542190552, + "step": 780 + }, + { + "epoch": 0.2, + "learning_rate": 3.327716933445661e-07, + "logits/chosen": -2.7698721885681152, + "logits/rejected": -2.7035422325134277, + "logps/chosen": -287.3998718261719, + "logps/rejected": -255.6114044189453, + "loss": 0.5399, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09833178669214249, + "rewards/margins": 1.1349576711654663, + "rewards/rejected": -1.2332894802093506, + "step": 790 + }, + { + "epoch": 0.2, + "learning_rate": 3.3698399326032014e-07, + "logits/chosen": -2.7961173057556152, + "logits/rejected": -2.7884459495544434, + "logps/chosen": -214.2937774658203, + "logps/rejected": -260.4123840332031, + "loss": 0.5682, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.39419320225715637, + "rewards/margins": 1.3177748918533325, + "rewards/rejected": -1.711968183517456, + "step": 800 + }, + { + "epoch": 0.2, + "learning_rate": 3.411962931760741e-07, + "logits/chosen": -2.7424988746643066, + "logits/rejected": -2.7070367336273193, + "logps/chosen": -309.4488830566406, + "logps/rejected": -298.2324523925781, + "loss": 0.6783, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3691617548465729, + "rewards/margins": 0.6875399351119995, + "rewards/rejected": -1.0567017793655396, + "step": 810 + }, + { + "epoch": 0.21, + "learning_rate": 3.454085930918281e-07, + "logits/chosen": -2.8790063858032227, + "logits/rejected": -2.8102855682373047, + "logps/chosen": -267.56610107421875, + "logps/rejected": -255.9617919921875, + "loss": 0.5377, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4485592842102051, + "rewards/margins": 0.6301880478858948, + "rewards/rejected": -1.0787473917007446, + "step": 820 + }, + { + "epoch": 0.21, + "learning_rate": 3.4962089300758215e-07, + "logits/chosen": -2.9085679054260254, + "logits/rejected": -2.8483242988586426, + "logps/chosen": -309.87261962890625, + "logps/rejected": -326.3826599121094, + "loss": 0.5412, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.11304453760385513, + "rewards/margins": 0.7235099077224731, + "rewards/rejected": -0.6104652285575867, + "step": 830 + }, + { + "epoch": 0.21, + "learning_rate": 3.5383319292333613e-07, + "logits/chosen": -2.7523531913757324, + "logits/rejected": -2.7195019721984863, + "logps/chosen": -275.505859375, + "logps/rejected": -251.9176483154297, + "loss": 0.5109, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.01079724170267582, + "rewards/margins": 0.6783922910690308, + "rewards/rejected": -0.689189612865448, + "step": 840 + }, + { + "epoch": 0.21, + "learning_rate": 3.580454928390901e-07, + "logits/chosen": -2.5856566429138184, + "logits/rejected": -2.6075503826141357, + "logps/chosen": -180.05770874023438, + "logps/rejected": -246.65780639648438, + "loss": 0.642, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2767847180366516, + "rewards/margins": -0.03543096035718918, + "rewards/rejected": -0.24135378003120422, + "step": 850 + }, + { + "epoch": 0.22, + "learning_rate": 3.6225779275484416e-07, + "logits/chosen": -2.8647685050964355, + "logits/rejected": -2.837616205215454, + "logps/chosen": -256.5491638183594, + "logps/rejected": -266.04217529296875, + "loss": 0.5703, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10952053964138031, + "rewards/margins": 0.4287700653076172, + "rewards/rejected": -0.3192494809627533, + "step": 860 + }, + { + "epoch": 0.22, + "learning_rate": 3.6647009267059814e-07, + "logits/chosen": -2.8707778453826904, + "logits/rejected": -2.848655939102173, + "logps/chosen": -225.21530151367188, + "logps/rejected": -267.1256408691406, + "loss": 0.566, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.2901657521724701, + "rewards/margins": 0.3968164324760437, + "rewards/rejected": -0.10665065050125122, + "step": 870 + }, + { + "epoch": 0.22, + "learning_rate": 3.7068239258635213e-07, + "logits/chosen": -2.695209503173828, + "logits/rejected": -2.670459508895874, + "logps/chosen": -225.5996551513672, + "logps/rejected": -241.8999481201172, + "loss": 0.4506, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.11750245094299316, + "rewards/margins": 1.2355433702468872, + "rewards/rejected": -1.118040919303894, + "step": 880 + }, + { + "epoch": 0.22, + "learning_rate": 3.7489469250210617e-07, + "logits/chosen": -2.76595139503479, + "logits/rejected": -2.7959845066070557, + "logps/chosen": -253.39501953125, + "logps/rejected": -301.0154113769531, + "loss": 0.6257, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16110540926456451, + "rewards/margins": 0.8840128183364868, + "rewards/rejected": -1.0451180934906006, + "step": 890 + }, + { + "epoch": 0.23, + "learning_rate": 3.7910699241786015e-07, + "logits/chosen": -2.901207685470581, + "logits/rejected": -2.9004340171813965, + "logps/chosen": -318.0623474121094, + "logps/rejected": -266.1954040527344, + "loss": 0.5089, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.47898969054222107, + "rewards/margins": 1.0601681470870972, + "rewards/rejected": -0.5811785459518433, + "step": 900 + }, + { + "epoch": 0.23, + "learning_rate": 3.8331929233361414e-07, + "logits/chosen": -2.93418025970459, + "logits/rejected": -2.9847195148468018, + "logps/chosen": -290.5269470214844, + "logps/rejected": -285.1426086425781, + "loss": 0.5668, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.011163473129272461, + "rewards/margins": 0.9022845029830933, + "rewards/rejected": -0.9134479761123657, + "step": 910 + }, + { + "epoch": 0.23, + "learning_rate": 3.875315922493682e-07, + "logits/chosen": -2.695725679397583, + "logits/rejected": -2.6235289573669434, + "logps/chosen": -239.16494750976562, + "logps/rejected": -298.2274169921875, + "loss": 0.5698, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03379325941205025, + "rewards/margins": 0.6167046427726746, + "rewards/rejected": -0.6504980325698853, + "step": 920 + }, + { + "epoch": 0.24, + "learning_rate": 3.9174389216512216e-07, + "logits/chosen": -2.9124255180358887, + "logits/rejected": -2.9122350215911865, + "logps/chosen": -208.6260528564453, + "logps/rejected": -265.9713439941406, + "loss": 0.62, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.343755304813385, + "rewards/margins": 0.6561600565910339, + "rewards/rejected": -0.31240472197532654, + "step": 930 + }, + { + "epoch": 0.24, + "learning_rate": 3.9595619208087615e-07, + "logits/chosen": -2.6123409271240234, + "logits/rejected": -2.6417336463928223, + "logps/chosen": -242.59756469726562, + "logps/rejected": -194.54550170898438, + "loss": 0.612, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26655206084251404, + "rewards/margins": 0.2533474564552307, + "rewards/rejected": -0.5198994874954224, + "step": 940 + }, + { + "epoch": 0.24, + "learning_rate": 4.001684919966302e-07, + "logits/chosen": -2.965735912322998, + "logits/rejected": -2.874760389328003, + "logps/chosen": -333.0281982421875, + "logps/rejected": -354.705810546875, + "loss": 0.6405, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.08252526819705963, + "rewards/margins": 0.7334955334663391, + "rewards/rejected": -0.6509702205657959, + "step": 950 + }, + { + "epoch": 0.24, + "learning_rate": 4.0438079191238417e-07, + "logits/chosen": -2.4438750743865967, + "logits/rejected": -2.4380674362182617, + "logps/chosen": -255.1825714111328, + "logps/rejected": -220.0428466796875, + "loss": 0.5256, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.007409202866256237, + "rewards/margins": 0.8924994468688965, + "rewards/rejected": -0.885090172290802, + "step": 960 + }, + { + "epoch": 0.25, + "learning_rate": 4.0859309182813815e-07, + "logits/chosen": -2.8350677490234375, + "logits/rejected": -2.726632595062256, + "logps/chosen": -368.48760986328125, + "logps/rejected": -201.78909301757812, + "loss": 0.5834, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.32845601439476013, + "rewards/margins": 1.1812372207641602, + "rewards/rejected": -0.8527814149856567, + "step": 970 + }, + { + "epoch": 0.25, + "learning_rate": 4.128053917438922e-07, + "logits/chosen": -2.846832752227783, + "logits/rejected": -2.860927104949951, + "logps/chosen": -332.1506652832031, + "logps/rejected": -294.8549499511719, + "loss": 0.7617, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3438522219657898, + "rewards/margins": 0.39307039976119995, + "rewards/rejected": -0.7369226813316345, + "step": 980 + }, + { + "epoch": 0.25, + "learning_rate": 4.170176916596461e-07, + "logits/chosen": -2.7602732181549072, + "logits/rejected": -2.7995636463165283, + "logps/chosen": -201.95748901367188, + "logps/rejected": -303.0880432128906, + "loss": 0.4734, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.04982544854283333, + "rewards/margins": 1.2540963888168335, + "rewards/rejected": -1.3039219379425049, + "step": 990 + }, + { + "epoch": 0.25, + "learning_rate": 4.212299915754001e-07, + "logits/chosen": -2.849196195602417, + "logits/rejected": -2.8943185806274414, + "logps/chosen": -298.24432373046875, + "logps/rejected": -271.255859375, + "loss": 0.5213, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5941381454467773, + "rewards/margins": 0.5909891128540039, + "rewards/rejected": -1.1851271390914917, + "step": 1000 + }, + { + "epoch": 0.26, + "learning_rate": 4.2544229149115415e-07, + "logits/chosen": -2.801237106323242, + "logits/rejected": -2.6629276275634766, + "logps/chosen": -323.99285888671875, + "logps/rejected": -250.1830596923828, + "loss": 0.5358, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.05499999597668648, + "rewards/margins": 1.6413654088974, + "rewards/rejected": -1.6963655948638916, + "step": 1010 + }, + { + "epoch": 0.26, + "learning_rate": 4.2965459140690813e-07, + "logits/chosen": -2.8145062923431396, + "logits/rejected": -2.7096211910247803, + "logps/chosen": -281.5048828125, + "logps/rejected": -285.79962158203125, + "loss": 0.695, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3529316186904907, + "rewards/margins": -0.7123409509658813, + "rewards/rejected": -0.6405906677246094, + "step": 1020 + }, + { + "epoch": 0.26, + "learning_rate": 4.338668913226621e-07, + "logits/chosen": -2.8272197246551514, + "logits/rejected": -2.808361530303955, + "logps/chosen": -283.245361328125, + "logps/rejected": -337.6376953125, + "loss": 0.6913, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.703170657157898, + "rewards/margins": 0.26156896352767944, + "rewards/rejected": -0.9647396206855774, + "step": 1030 + }, + { + "epoch": 0.26, + "learning_rate": 4.3807919123841616e-07, + "logits/chosen": -2.8427348136901855, + "logits/rejected": -2.7732455730438232, + "logps/chosen": -212.1287078857422, + "logps/rejected": -149.4747314453125, + "loss": 1.3506, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0113106369972229, + "rewards/margins": 0.6789531111717224, + "rewards/rejected": -0.6902638077735901, + "step": 1040 + }, + { + "epoch": 0.27, + "learning_rate": 4.4229149115417014e-07, + "logits/chosen": -2.6644446849823, + "logits/rejected": -2.764239549636841, + "logps/chosen": -177.46188354492188, + "logps/rejected": -227.08425903320312, + "loss": 0.5089, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004345702938735485, + "rewards/margins": 1.017622947692871, + "rewards/rejected": -1.0132771730422974, + "step": 1050 + }, + { + "epoch": 0.27, + "learning_rate": 4.4650379106992413e-07, + "logits/chosen": -2.8153128623962402, + "logits/rejected": -2.73732852935791, + "logps/chosen": -315.84454345703125, + "logps/rejected": -352.3153381347656, + "loss": 0.5678, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8632125854492188, + "rewards/margins": 0.012102723121643066, + "rewards/rejected": -0.8753153085708618, + "step": 1060 + }, + { + "epoch": 0.27, + "learning_rate": 4.5071609098567817e-07, + "logits/chosen": -2.832590103149414, + "logits/rejected": -2.742182493209839, + "logps/chosen": -353.1194152832031, + "logps/rejected": -341.9236755371094, + "loss": 0.6138, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7696177959442139, + "rewards/margins": 0.7982410192489624, + "rewards/rejected": -1.5678590536117554, + "step": 1070 + }, + { + "epoch": 0.27, + "learning_rate": 4.5492839090143215e-07, + "logits/chosen": -2.7319958209991455, + "logits/rejected": -2.675238609313965, + "logps/chosen": -230.688232421875, + "logps/rejected": -146.16233825683594, + "loss": 0.5068, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5361072421073914, + "rewards/margins": 0.8432807922363281, + "rewards/rejected": -1.3793880939483643, + "step": 1080 + }, + { + "epoch": 0.28, + "learning_rate": 4.5914069081718614e-07, + "logits/chosen": -2.6153883934020996, + "logits/rejected": -2.7117838859558105, + "logps/chosen": -204.71461486816406, + "logps/rejected": -221.910400390625, + "loss": 0.5345, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3535842001438141, + "rewards/margins": 0.7887686491012573, + "rewards/rejected": -1.142352819442749, + "step": 1090 + }, + { + "epoch": 0.28, + "learning_rate": 4.633529907329402e-07, + "logits/chosen": -2.7112722396850586, + "logits/rejected": -2.7043750286102295, + "logps/chosen": -218.708984375, + "logps/rejected": -170.6296844482422, + "loss": 0.5437, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.26763540506362915, + "rewards/margins": 1.7196743488311768, + "rewards/rejected": -1.4520387649536133, + "step": 1100 + }, + { + "epoch": 0.28, + "learning_rate": 4.6756529064869416e-07, + "logits/chosen": -2.7523810863494873, + "logits/rejected": -2.484694719314575, + "logps/chosen": -204.053466796875, + "logps/rejected": -223.3732147216797, + "loss": 0.5025, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8717359304428101, + "rewards/margins": 1.1432167291641235, + "rewards/rejected": -2.0149526596069336, + "step": 1110 + }, + { + "epoch": 0.28, + "learning_rate": 4.7177759056444814e-07, + "logits/chosen": -2.618384838104248, + "logits/rejected": -2.608262062072754, + "logps/chosen": -232.3643341064453, + "logps/rejected": -211.66806030273438, + "loss": 0.5708, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.28576603531837463, + "rewards/margins": 0.973733127117157, + "rewards/rejected": -1.259499192237854, + "step": 1120 + }, + { + "epoch": 0.29, + "learning_rate": 4.759898904802022e-07, + "logits/chosen": -2.828278064727783, + "logits/rejected": -2.793574810028076, + "logps/chosen": -334.7503967285156, + "logps/rejected": -232.13388061523438, + "loss": 0.5265, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3884439170360565, + "rewards/margins": 0.7667890787124634, + "rewards/rejected": -1.1552331447601318, + "step": 1130 + }, + { + "epoch": 0.29, + "learning_rate": 4.802021903959561e-07, + "logits/chosen": -2.667235851287842, + "logits/rejected": -2.727304458618164, + "logps/chosen": -290.23175048828125, + "logps/rejected": -288.10882568359375, + "loss": 0.5996, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5534285306930542, + "rewards/margins": 0.9525440335273743, + "rewards/rejected": -1.5059726238250732, + "step": 1140 + }, + { + "epoch": 0.29, + "learning_rate": 4.844144903117102e-07, + "logits/chosen": -2.830899477005005, + "logits/rejected": -2.8354620933532715, + "logps/chosen": -373.57281494140625, + "logps/rejected": -359.60174560546875, + "loss": 0.6887, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5015013217926025, + "rewards/margins": 0.7062493562698364, + "rewards/rejected": -1.2077505588531494, + "step": 1150 + }, + { + "epoch": 0.29, + "learning_rate": 4.886267902274642e-07, + "logits/chosen": -2.749483585357666, + "logits/rejected": -2.6624131202697754, + "logps/chosen": -278.58392333984375, + "logps/rejected": -275.7124938964844, + "loss": 0.5975, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3455383777618408, + "rewards/margins": 1.2535759210586548, + "rewards/rejected": -0.9080374836921692, + "step": 1160 + }, + { + "epoch": 0.3, + "learning_rate": 4.928390901432181e-07, + "logits/chosen": -2.786632776260376, + "logits/rejected": -2.681096076965332, + "logps/chosen": -240.35556030273438, + "logps/rejected": -227.7943878173828, + "loss": 0.5064, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.4853554666042328, + "rewards/margins": 1.8523401021957397, + "rewards/rejected": -2.337695837020874, + "step": 1170 + }, + { + "epoch": 0.3, + "learning_rate": 4.970513900589722e-07, + "logits/chosen": -2.6930205821990967, + "logits/rejected": -2.7257983684539795, + "logps/chosen": -224.31124877929688, + "logps/rejected": -241.4190216064453, + "loss": 0.5607, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8809860944747925, + "rewards/margins": 1.269070029258728, + "rewards/rejected": -2.1500561237335205, + "step": 1180 + }, + { + "epoch": 0.3, + "learning_rate": 4.99859563711263e-07, + "logits/chosen": -2.6348814964294434, + "logits/rejected": -2.605294704437256, + "logps/chosen": -281.0661926269531, + "logps/rejected": -287.85369873046875, + "loss": 0.5648, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5088950395584106, + "rewards/margins": 0.9193344116210938, + "rewards/rejected": -1.4282294511795044, + "step": 1190 + }, + { + "epoch": 0.3, + "learning_rate": 4.993914427488063e-07, + "logits/chosen": -2.661515712738037, + "logits/rejected": -2.533756732940674, + "logps/chosen": -228.9407196044922, + "logps/rejected": -198.20849609375, + "loss": 0.6059, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.789717435836792, + "rewards/margins": 0.8907758593559265, + "rewards/rejected": -1.6804933547973633, + "step": 1200 + }, + { + "epoch": 0.31, + "learning_rate": 4.989233217863496e-07, + "logits/chosen": -2.6996188163757324, + "logits/rejected": -2.7078824043273926, + "logps/chosen": -216.7144012451172, + "logps/rejected": -271.33245849609375, + "loss": 0.7244, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5363994836807251, + "rewards/margins": 1.1410717964172363, + "rewards/rejected": -1.677471399307251, + "step": 1210 + }, + { + "epoch": 0.31, + "learning_rate": 4.984552008238928e-07, + "logits/chosen": -2.9735779762268066, + "logits/rejected": -2.8310341835021973, + "logps/chosen": -349.55645751953125, + "logps/rejected": -314.33392333984375, + "loss": 0.6247, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.46798020601272583, + "rewards/margins": 1.0479462146759033, + "rewards/rejected": -1.5159263610839844, + "step": 1220 + }, + { + "epoch": 0.31, + "learning_rate": 4.979870798614362e-07, + "logits/chosen": -2.696829319000244, + "logits/rejected": -2.7793948650360107, + "logps/chosen": -351.41314697265625, + "logps/rejected": -300.1158447265625, + "loss": 0.5116, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.781160831451416, + "rewards/margins": 0.9367626309394836, + "rewards/rejected": -1.7179237604141235, + "step": 1230 + }, + { + "epoch": 0.31, + "learning_rate": 4.975189588989795e-07, + "logits/chosen": -2.7155284881591797, + "logits/rejected": -2.8466756343841553, + "logps/chosen": -252.1944580078125, + "logps/rejected": -328.84832763671875, + "loss": 0.5823, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.43154963850975037, + "rewards/margins": 1.5160157680511475, + "rewards/rejected": -1.9475654363632202, + "step": 1240 + }, + { + "epoch": 0.32, + "learning_rate": 4.970508379365228e-07, + "logits/chosen": -2.7954907417297363, + "logits/rejected": -2.793194055557251, + "logps/chosen": -276.27459716796875, + "logps/rejected": -347.00750732421875, + "loss": 0.533, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.4929552972316742, + "rewards/margins": 1.3219962120056152, + "rewards/rejected": -1.8149516582489014, + "step": 1250 + }, + { + "epoch": 0.32, + "learning_rate": 4.965827169740661e-07, + "logits/chosen": -2.824890613555908, + "logits/rejected": -2.8709912300109863, + "logps/chosen": -275.1580810546875, + "logps/rejected": -275.6163635253906, + "loss": 0.5267, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.5243432521820068, + "rewards/margins": 0.1552998125553131, + "rewards/rejected": -1.6796430349349976, + "step": 1260 + }, + { + "epoch": 0.32, + "learning_rate": 4.961145960116093e-07, + "logits/chosen": -2.79732084274292, + "logits/rejected": -2.5819685459136963, + "logps/chosen": -276.87945556640625, + "logps/rejected": -212.22482299804688, + "loss": 0.5344, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0666576623916626, + "rewards/margins": 1.3572540283203125, + "rewards/rejected": -2.4239115715026855, + "step": 1270 + }, + { + "epoch": 0.32, + "learning_rate": 4.956464750491527e-07, + "logits/chosen": -2.5743613243103027, + "logits/rejected": -2.6122655868530273, + "logps/chosen": -293.70367431640625, + "logps/rejected": -337.3021240234375, + "loss": 0.5087, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.438387155532837, + "rewards/margins": 1.1028835773468018, + "rewards/rejected": -2.5412707328796387, + "step": 1280 + }, + { + "epoch": 0.33, + "learning_rate": 4.95178354086696e-07, + "logits/chosen": -2.6074695587158203, + "logits/rejected": -2.5442311763763428, + "logps/chosen": -235.07211303710938, + "logps/rejected": -192.266357421875, + "loss": 0.6397, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3241581916809082, + "rewards/margins": 0.9993916749954224, + "rewards/rejected": -1.3235498666763306, + "step": 1290 + }, + { + "epoch": 0.33, + "learning_rate": 4.947102331242393e-07, + "logits/chosen": -2.6536176204681396, + "logits/rejected": -2.5985217094421387, + "logps/chosen": -342.68084716796875, + "logps/rejected": -351.6435852050781, + "loss": 0.7122, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2262356281280518, + "rewards/margins": 0.3329642415046692, + "rewards/rejected": -1.5591998100280762, + "step": 1300 + }, + { + "epoch": 0.33, + "learning_rate": 4.942421121617826e-07, + "logits/chosen": -2.7266454696655273, + "logits/rejected": -2.7418060302734375, + "logps/chosen": -314.6419372558594, + "logps/rejected": -273.09930419921875, + "loss": 0.5851, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7316147089004517, + "rewards/margins": 0.9679932594299316, + "rewards/rejected": -1.6996078491210938, + "step": 1310 + }, + { + "epoch": 0.33, + "learning_rate": 4.937739911993259e-07, + "logits/chosen": -2.7935047149658203, + "logits/rejected": -2.6458325386047363, + "logps/chosen": -278.81951904296875, + "logps/rejected": -263.1274719238281, + "loss": 0.5954, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1032342910766602, + "rewards/margins": 0.8505995869636536, + "rewards/rejected": -1.9538339376449585, + "step": 1320 + }, + { + "epoch": 0.34, + "learning_rate": 4.933058702368692e-07, + "logits/chosen": -2.3314476013183594, + "logits/rejected": -2.297711133956909, + "logps/chosen": -214.3764190673828, + "logps/rejected": -262.4116516113281, + "loss": 0.6091, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2805622816085815, + "rewards/margins": 0.8711134195327759, + "rewards/rejected": -2.1516757011413574, + "step": 1330 + }, + { + "epoch": 0.34, + "learning_rate": 4.928377492744124e-07, + "logits/chosen": -2.690143346786499, + "logits/rejected": -2.599189281463623, + "logps/chosen": -229.4182891845703, + "logps/rejected": -258.7178039550781, + "loss": 0.6126, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7359424829483032, + "rewards/margins": 1.148744821548462, + "rewards/rejected": -1.8846874237060547, + "step": 1340 + }, + { + "epoch": 0.34, + "learning_rate": 4.923696283119558e-07, + "logits/chosen": -2.6452858448028564, + "logits/rejected": -2.6183056831359863, + "logps/chosen": -200.41815185546875, + "logps/rejected": -210.90628051757812, + "loss": 0.5397, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8483043909072876, + "rewards/margins": 0.6536422371864319, + "rewards/rejected": -1.5019466876983643, + "step": 1350 + }, + { + "epoch": 0.34, + "learning_rate": 4.919015073494991e-07, + "logits/chosen": -2.5736565589904785, + "logits/rejected": -2.5667662620544434, + "logps/chosen": -229.0320281982422, + "logps/rejected": -277.36669921875, + "loss": 0.6221, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.28454262018203735, + "rewards/margins": 1.7313134670257568, + "rewards/rejected": -2.0158562660217285, + "step": 1360 + }, + { + "epoch": 0.35, + "learning_rate": 4.914333863870424e-07, + "logits/chosen": -2.467679262161255, + "logits/rejected": -2.5415878295898438, + "logps/chosen": -301.71099853515625, + "logps/rejected": -286.52423095703125, + "loss": 0.6835, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8348533511161804, + "rewards/margins": 1.363982915878296, + "rewards/rejected": -2.198835849761963, + "step": 1370 + }, + { + "epoch": 0.35, + "learning_rate": 4.909652654245857e-07, + "logits/chosen": -2.5349931716918945, + "logits/rejected": -2.6447384357452393, + "logps/chosen": -316.8460388183594, + "logps/rejected": -313.21990966796875, + "loss": 0.718, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.01732412539422512, + "rewards/margins": 1.913246512413025, + "rewards/rejected": -1.9305706024169922, + "step": 1380 + }, + { + "epoch": 0.35, + "learning_rate": 4.90497144462129e-07, + "logits/chosen": -2.7121050357818604, + "logits/rejected": -2.6427199840545654, + "logps/chosen": -342.8936462402344, + "logps/rejected": -341.6715393066406, + "loss": 0.7374, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.07042744010686874, + "rewards/margins": 1.4891926050186157, + "rewards/rejected": -1.4187650680541992, + "step": 1390 + }, + { + "epoch": 0.35, + "learning_rate": 4.900290234996723e-07, + "logits/chosen": -2.7661263942718506, + "logits/rejected": -2.6738123893737793, + "logps/chosen": -425.2027893066406, + "logps/rejected": -262.91510009765625, + "loss": 0.5545, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35545283555984497, + "rewards/margins": 1.151031494140625, + "rewards/rejected": -1.5064842700958252, + "step": 1400 + }, + { + "epoch": 0.36, + "learning_rate": 4.895609025372156e-07, + "logits/chosen": -2.6749067306518555, + "logits/rejected": -2.6837830543518066, + "logps/chosen": -218.03640747070312, + "logps/rejected": -294.2855529785156, + "loss": 0.5206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3428592085838318, + "rewards/margins": 1.5088475942611694, + "rewards/rejected": -1.8517067432403564, + "step": 1410 + }, + { + "epoch": 0.36, + "learning_rate": 4.890927815747589e-07, + "logits/chosen": -2.6067261695861816, + "logits/rejected": -2.5629193782806396, + "logps/chosen": -302.8584289550781, + "logps/rejected": -244.94229125976562, + "loss": 0.7956, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.47615474462509155, + "rewards/margins": 0.5270879864692688, + "rewards/rejected": -1.0032426118850708, + "step": 1420 + }, + { + "epoch": 0.36, + "learning_rate": 4.886246606123022e-07, + "logits/chosen": -2.6083011627197266, + "logits/rejected": -2.646594524383545, + "logps/chosen": -341.74346923828125, + "logps/rejected": -243.8942108154297, + "loss": 0.644, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2975875437259674, + "rewards/margins": 1.4379489421844482, + "rewards/rejected": -1.7355365753173828, + "step": 1430 + }, + { + "epoch": 0.36, + "learning_rate": 4.881565396498455e-07, + "logits/chosen": -2.612194538116455, + "logits/rejected": -2.68499755859375, + "logps/chosen": -235.001953125, + "logps/rejected": -252.8855438232422, + "loss": 0.6004, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43834367394447327, + "rewards/margins": 0.21190723776817322, + "rewards/rejected": -0.6502509117126465, + "step": 1440 + }, + { + "epoch": 0.37, + "learning_rate": 4.876884186873888e-07, + "logits/chosen": -2.730816125869751, + "logits/rejected": -2.696262836456299, + "logps/chosen": -237.52621459960938, + "logps/rejected": -233.2257080078125, + "loss": 0.5765, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1927516907453537, + "rewards/margins": 0.9156246185302734, + "rewards/rejected": -1.1083762645721436, + "step": 1450 + }, + { + "epoch": 0.37, + "learning_rate": 4.872202977249321e-07, + "logits/chosen": -2.778622627258301, + "logits/rejected": -2.734001636505127, + "logps/chosen": -254.0236053466797, + "logps/rejected": -273.97210693359375, + "loss": 0.5525, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0060231685638428, + "rewards/margins": 0.8538422584533691, + "rewards/rejected": -1.8598655462265015, + "step": 1460 + }, + { + "epoch": 0.37, + "learning_rate": 4.867521767624753e-07, + "logits/chosen": -2.767373561859131, + "logits/rejected": -2.6322503089904785, + "logps/chosen": -336.3904113769531, + "logps/rejected": -339.51312255859375, + "loss": 0.5659, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2834639251232147, + "rewards/margins": 2.169893741607666, + "rewards/rejected": -2.4533581733703613, + "step": 1470 + }, + { + "epoch": 0.37, + "learning_rate": 4.862840558000187e-07, + "logits/chosen": -2.5647339820861816, + "logits/rejected": -2.552048921585083, + "logps/chosen": -259.8147888183594, + "logps/rejected": -360.00311279296875, + "loss": 0.5895, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1612476110458374, + "rewards/margins": 0.8322628140449524, + "rewards/rejected": -1.9935102462768555, + "step": 1480 + }, + { + "epoch": 0.38, + "learning_rate": 4.85815934837562e-07, + "logits/chosen": -2.696507692337036, + "logits/rejected": -2.722266674041748, + "logps/chosen": -251.97970581054688, + "logps/rejected": -266.2132568359375, + "loss": 0.578, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7193154692649841, + "rewards/margins": 1.0223782062530518, + "rewards/rejected": -1.7416938543319702, + "step": 1490 + }, + { + "epoch": 0.38, + "learning_rate": 4.853478138751054e-07, + "logits/chosen": -2.6109890937805176, + "logits/rejected": -2.663027286529541, + "logps/chosen": -233.97970581054688, + "logps/rejected": -182.5577392578125, + "loss": 0.5884, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3420624732971191, + "rewards/margins": 0.07519197463989258, + "rewards/rejected": -1.4172544479370117, + "step": 1500 + }, + { + "epoch": 0.38, + "learning_rate": 4.848796929126486e-07, + "logits/chosen": -2.785861015319824, + "logits/rejected": -2.8285059928894043, + "logps/chosen": -283.89300537109375, + "logps/rejected": -269.63909912109375, + "loss": 0.6041, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7470687627792358, + "rewards/margins": 1.224205732345581, + "rewards/rejected": -1.9712746143341064, + "step": 1510 + }, + { + "epoch": 0.38, + "learning_rate": 4.844115719501919e-07, + "logits/chosen": -2.7913641929626465, + "logits/rejected": -2.6347858905792236, + "logps/chosen": -304.1687927246094, + "logps/rejected": -333.74029541015625, + "loss": 0.7299, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.37677839398384094, + "rewards/margins": 1.4539364576339722, + "rewards/rejected": -1.8307149410247803, + "step": 1520 + }, + { + "epoch": 0.39, + "learning_rate": 4.839434509877352e-07, + "logits/chosen": -2.791416645050049, + "logits/rejected": -2.765383720397949, + "logps/chosen": -226.19091796875, + "logps/rejected": -252.04275512695312, + "loss": 0.6193, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5906246304512024, + "rewards/margins": 1.1396105289459229, + "rewards/rejected": -1.7302350997924805, + "step": 1530 + }, + { + "epoch": 0.39, + "learning_rate": 4.834753300252785e-07, + "logits/chosen": -2.711320400238037, + "logits/rejected": -2.6810293197631836, + "logps/chosen": -277.157470703125, + "logps/rejected": -259.8028259277344, + "loss": 0.4853, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8695998191833496, + "rewards/margins": 1.3735220432281494, + "rewards/rejected": -2.24312162399292, + "step": 1540 + }, + { + "epoch": 0.39, + "learning_rate": 4.830072090628219e-07, + "logits/chosen": -2.5173277854919434, + "logits/rejected": -2.4909121990203857, + "logps/chosen": -251.896484375, + "logps/rejected": -213.8211669921875, + "loss": 0.4497, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.2197401523590088, + "rewards/margins": 2.002092123031616, + "rewards/rejected": -0.7823519110679626, + "step": 1550 + }, + { + "epoch": 0.39, + "learning_rate": 4.825390881003651e-07, + "logits/chosen": -2.7679383754730225, + "logits/rejected": -2.7764766216278076, + "logps/chosen": -310.71722412109375, + "logps/rejected": -282.398681640625, + "loss": 0.7393, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2943713665008545, + "rewards/margins": 1.1048939228057861, + "rewards/rejected": -2.3992652893066406, + "step": 1560 + }, + { + "epoch": 0.4, + "learning_rate": 4.820709671379084e-07, + "logits/chosen": -2.8770172595977783, + "logits/rejected": -2.7384181022644043, + "logps/chosen": -291.2705383300781, + "logps/rejected": -278.2033996582031, + "loss": 0.6648, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4087117910385132, + "rewards/margins": 0.2625928521156311, + "rewards/rejected": -1.671304702758789, + "step": 1570 + }, + { + "epoch": 0.4, + "learning_rate": 4.816028461754517e-07, + "logits/chosen": -2.6546216011047363, + "logits/rejected": -2.7144055366516113, + "logps/chosen": -252.04006958007812, + "logps/rejected": -205.1694793701172, + "loss": 0.5475, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.298349142074585, + "rewards/margins": 0.6605261564254761, + "rewards/rejected": -1.958875060081482, + "step": 1580 + }, + { + "epoch": 0.4, + "learning_rate": 4.81134725212995e-07, + "logits/chosen": -2.7433269023895264, + "logits/rejected": -2.7543070316314697, + "logps/chosen": -260.1664733886719, + "logps/rejected": -256.9065856933594, + "loss": 0.6055, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9419029951095581, + "rewards/margins": 1.7040073871612549, + "rewards/rejected": -2.6459105014801025, + "step": 1590 + }, + { + "epoch": 0.4, + "learning_rate": 4.806666042505384e-07, + "logits/chosen": -2.68558931350708, + "logits/rejected": -2.5312259197235107, + "logps/chosen": -251.97323608398438, + "logps/rejected": -229.5078582763672, + "loss": 0.5146, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2642735242843628, + "rewards/margins": 1.0432544946670532, + "rewards/rejected": -2.307528257369995, + "step": 1600 + }, + { + "epoch": 0.41, + "learning_rate": 4.801984832880816e-07, + "logits/chosen": -2.603732109069824, + "logits/rejected": -2.388270854949951, + "logps/chosen": -353.0534362792969, + "logps/rejected": -289.6091003417969, + "loss": 0.5718, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.376704216003418, + "rewards/margins": 1.6749976873397827, + "rewards/rejected": -3.051701784133911, + "step": 1610 + }, + { + "epoch": 0.41, + "learning_rate": 4.797303623256249e-07, + "logits/chosen": -2.7381432056427, + "logits/rejected": -2.661796808242798, + "logps/chosen": -302.10357666015625, + "logps/rejected": -230.2789764404297, + "loss": 0.5316, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9080637693405151, + "rewards/margins": 1.5905338525772095, + "rewards/rejected": -2.4985973834991455, + "step": 1620 + }, + { + "epoch": 0.41, + "learning_rate": 4.792622413631682e-07, + "logits/chosen": -2.6611194610595703, + "logits/rejected": -2.5353498458862305, + "logps/chosen": -378.8614501953125, + "logps/rejected": -439.38787841796875, + "loss": 0.5265, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.44443148374557495, + "rewards/margins": 1.3168675899505615, + "rewards/rejected": -1.7612993717193604, + "step": 1630 + }, + { + "epoch": 0.41, + "learning_rate": 4.787941204007115e-07, + "logits/chosen": -2.631410598754883, + "logits/rejected": -2.5915353298187256, + "logps/chosen": -287.78936767578125, + "logps/rejected": -323.13653564453125, + "loss": 0.5354, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.061402678489685, + "rewards/margins": 1.3255428075790405, + "rewards/rejected": -2.3869454860687256, + "step": 1640 + }, + { + "epoch": 0.42, + "learning_rate": 4.783259994382548e-07, + "logits/chosen": -2.79594349861145, + "logits/rejected": -2.6912193298339844, + "logps/chosen": -304.9742736816406, + "logps/rejected": -246.300048828125, + "loss": 0.6284, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2038127183914185, + "rewards/margins": 1.096224069595337, + "rewards/rejected": -2.300036907196045, + "step": 1650 + }, + { + "epoch": 0.42, + "learning_rate": 4.778578784757981e-07, + "logits/chosen": -2.738156795501709, + "logits/rejected": -2.596961259841919, + "logps/chosen": -268.0915832519531, + "logps/rejected": -274.79278564453125, + "loss": 0.4787, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.030813455581665, + "rewards/margins": 0.8118458986282349, + "rewards/rejected": -1.8426593542099, + "step": 1660 + }, + { + "epoch": 0.42, + "learning_rate": 4.773897575133414e-07, + "logits/chosen": -2.63154673576355, + "logits/rejected": -2.627650737762451, + "logps/chosen": -237.63363647460938, + "logps/rejected": -223.29159545898438, + "loss": 0.575, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -1.6023433208465576, + "rewards/margins": 0.5691138505935669, + "rewards/rejected": -2.171457529067993, + "step": 1670 + }, + { + "epoch": 0.42, + "learning_rate": 4.769216365508848e-07, + "logits/chosen": -2.695577621459961, + "logits/rejected": -2.5401132106781006, + "logps/chosen": -339.2311096191406, + "logps/rejected": -262.02874755859375, + "loss": 0.7872, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6823625564575195, + "rewards/margins": 0.8626251220703125, + "rewards/rejected": -2.5449881553649902, + "step": 1680 + }, + { + "epoch": 0.43, + "learning_rate": 4.76453515588428e-07, + "logits/chosen": -2.7896368503570557, + "logits/rejected": -2.6924490928649902, + "logps/chosen": -339.7137756347656, + "logps/rejected": -307.8088073730469, + "loss": 0.5801, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.13714599609375, + "rewards/margins": 1.197256326675415, + "rewards/rejected": -2.334402322769165, + "step": 1690 + }, + { + "epoch": 0.43, + "learning_rate": 4.759853946259713e-07, + "logits/chosen": -2.591322183609009, + "logits/rejected": -2.6701719760894775, + "logps/chosen": -301.0086975097656, + "logps/rejected": -269.099609375, + "loss": 0.6423, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6230440139770508, + "rewards/margins": 0.9465805888175964, + "rewards/rejected": -2.569624662399292, + "step": 1700 + }, + { + "epoch": 0.43, + "learning_rate": 4.7551727366351465e-07, + "logits/chosen": -2.6842517852783203, + "logits/rejected": -2.5960946083068848, + "logps/chosen": -305.3388671875, + "logps/rejected": -275.5049133300781, + "loss": 0.6272, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0681524276733398, + "rewards/margins": 1.0799347162246704, + "rewards/rejected": -2.1480870246887207, + "step": 1710 + }, + { + "epoch": 0.43, + "learning_rate": 4.7504915270105794e-07, + "logits/chosen": -2.475844621658325, + "logits/rejected": -2.477785110473633, + "logps/chosen": -283.61627197265625, + "logps/rejected": -302.0332946777344, + "loss": 0.572, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9548444747924805, + "rewards/margins": 1.2471121549606323, + "rewards/rejected": -3.2019565105438232, + "step": 1720 + }, + { + "epoch": 0.44, + "learning_rate": 4.745810317386013e-07, + "logits/chosen": -2.798185348510742, + "logits/rejected": -2.6507716178894043, + "logps/chosen": -425.5433044433594, + "logps/rejected": -361.62921142578125, + "loss": 0.6259, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7486549615859985, + "rewards/margins": 1.0134116411209106, + "rewards/rejected": -2.762066602706909, + "step": 1730 + }, + { + "epoch": 0.44, + "learning_rate": 4.741129107761445e-07, + "logits/chosen": -2.7440648078918457, + "logits/rejected": -2.7234296798706055, + "logps/chosen": -310.80633544921875, + "logps/rejected": -349.42059326171875, + "loss": 0.4625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9778580665588379, + "rewards/margins": 1.1600100994110107, + "rewards/rejected": -2.1378684043884277, + "step": 1740 + }, + { + "epoch": 0.44, + "learning_rate": 4.736447898136878e-07, + "logits/chosen": -2.5742950439453125, + "logits/rejected": -2.5962517261505127, + "logps/chosen": -284.7892150878906, + "logps/rejected": -281.23394775390625, + "loss": 0.559, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4265632629394531, + "rewards/margins": 1.247928261756897, + "rewards/rejected": -2.6744914054870605, + "step": 1750 + }, + { + "epoch": 0.44, + "learning_rate": 4.7317666885123115e-07, + "logits/chosen": -2.501652240753174, + "logits/rejected": -2.4336256980895996, + "logps/chosen": -199.65084838867188, + "logps/rejected": -295.1380310058594, + "loss": 0.5376, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.718796968460083, + "rewards/margins": 0.35511070489883423, + "rewards/rejected": -2.0739076137542725, + "step": 1760 + }, + { + "epoch": 0.45, + "learning_rate": 4.7270854788877444e-07, + "logits/chosen": -2.6972432136535645, + "logits/rejected": -2.6885361671447754, + "logps/chosen": -362.0516662597656, + "logps/rejected": -301.98431396484375, + "loss": 0.5787, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8265206217765808, + "rewards/margins": 1.7504993677139282, + "rewards/rejected": -2.5770199298858643, + "step": 1770 + }, + { + "epoch": 0.45, + "learning_rate": 4.7224042692631773e-07, + "logits/chosen": -2.5660178661346436, + "logits/rejected": -2.6825168132781982, + "logps/chosen": -236.5127410888672, + "logps/rejected": -286.50506591796875, + "loss": 0.6535, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2312124967575073, + "rewards/margins": 0.5038989782333374, + "rewards/rejected": -1.7351115942001343, + "step": 1780 + }, + { + "epoch": 0.45, + "learning_rate": 4.71772305963861e-07, + "logits/chosen": -2.6352035999298096, + "logits/rejected": -2.724378824234009, + "logps/chosen": -291.7593688964844, + "logps/rejected": -318.66241455078125, + "loss": 0.6269, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1717381477355957, + "rewards/margins": 0.9231816530227661, + "rewards/rejected": -2.0949196815490723, + "step": 1790 + }, + { + "epoch": 0.46, + "learning_rate": 4.7130418500140436e-07, + "logits/chosen": -2.7564892768859863, + "logits/rejected": -2.6997904777526855, + "logps/chosen": -433.239501953125, + "logps/rejected": -370.81121826171875, + "loss": 0.6336, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5008946657180786, + "rewards/margins": 1.5569981336593628, + "rewards/rejected": -3.0578930377960205, + "step": 1800 + }, + { + "epoch": 0.46, + "learning_rate": 4.7083606403894765e-07, + "logits/chosen": -2.5221521854400635, + "logits/rejected": -2.511442184448242, + "logps/chosen": -255.2664031982422, + "logps/rejected": -250.4320526123047, + "loss": 0.7713, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6310237646102905, + "rewards/margins": 0.4643372893333435, + "rewards/rejected": -2.0953612327575684, + "step": 1810 + }, + { + "epoch": 0.46, + "learning_rate": 4.70367943076491e-07, + "logits/chosen": -2.61252760887146, + "logits/rejected": -2.590552806854248, + "logps/chosen": -184.2164306640625, + "logps/rejected": -193.83358764648438, + "loss": 0.9105, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8147584795951843, + "rewards/margins": 0.5969401597976685, + "rewards/rejected": -1.4116986989974976, + "step": 1820 + }, + { + "epoch": 0.46, + "learning_rate": 4.6989982211403423e-07, + "logits/chosen": -2.653203010559082, + "logits/rejected": -2.6058340072631836, + "logps/chosen": -335.33941650390625, + "logps/rejected": -291.458740234375, + "loss": 0.6721, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1626819372177124, + "rewards/margins": 1.0949734449386597, + "rewards/rejected": -2.257655620574951, + "step": 1830 + }, + { + "epoch": 0.47, + "learning_rate": 4.694317011515775e-07, + "logits/chosen": -2.737450361251831, + "logits/rejected": -2.6910014152526855, + "logps/chosen": -300.6209411621094, + "logps/rejected": -287.80657958984375, + "loss": 0.6583, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2600080966949463, + "rewards/margins": 1.2313051223754883, + "rewards/rejected": -2.4913132190704346, + "step": 1840 + }, + { + "epoch": 0.47, + "learning_rate": 4.6896358018912086e-07, + "logits/chosen": -2.5987954139709473, + "logits/rejected": -2.709334373474121, + "logps/chosen": -242.94735717773438, + "logps/rejected": -284.9515075683594, + "loss": 0.6904, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2675464153289795, + "rewards/margins": 0.8861404657363892, + "rewards/rejected": -2.153687000274658, + "step": 1850 + }, + { + "epoch": 0.47, + "learning_rate": 4.6849545922666415e-07, + "logits/chosen": -2.7179274559020996, + "logits/rejected": -2.660978317260742, + "logps/chosen": -259.5520935058594, + "logps/rejected": -226.71029663085938, + "loss": 0.6684, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9327359199523926, + "rewards/margins": 1.161287546157837, + "rewards/rejected": -2.0940234661102295, + "step": 1860 + }, + { + "epoch": 0.47, + "learning_rate": 4.6802733826420744e-07, + "logits/chosen": -2.618227005004883, + "logits/rejected": -2.589632272720337, + "logps/chosen": -308.71923828125, + "logps/rejected": -278.947265625, + "loss": 0.5687, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5463954210281372, + "rewards/margins": 1.3313301801681519, + "rewards/rejected": -2.877725124359131, + "step": 1870 + }, + { + "epoch": 0.48, + "learning_rate": 4.6755921730175073e-07, + "logits/chosen": -2.6452157497406006, + "logits/rejected": -2.5336766242980957, + "logps/chosen": -346.36529541015625, + "logps/rejected": -289.3600158691406, + "loss": 0.6538, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5556385517120361, + "rewards/margins": 0.8465816378593445, + "rewards/rejected": -2.4022200107574463, + "step": 1880 + }, + { + "epoch": 0.48, + "learning_rate": 4.6709109633929407e-07, + "logits/chosen": -2.7492051124572754, + "logits/rejected": -2.700204610824585, + "logps/chosen": -316.11492919921875, + "logps/rejected": -313.65338134765625, + "loss": 0.6407, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3001348972320557, + "rewards/margins": 1.444930911064148, + "rewards/rejected": -2.745065927505493, + "step": 1890 + }, + { + "epoch": 0.48, + "learning_rate": 4.6662297537683736e-07, + "logits/chosen": -2.871032238006592, + "logits/rejected": -2.772630214691162, + "logps/chosen": -233.3348846435547, + "logps/rejected": -266.38677978515625, + "loss": 0.5614, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9081518054008484, + "rewards/margins": 0.832781195640564, + "rewards/rejected": -1.7409330606460571, + "step": 1900 + }, + { + "epoch": 0.48, + "learning_rate": 4.6615485441438065e-07, + "logits/chosen": -2.6391685009002686, + "logits/rejected": -2.620103120803833, + "logps/chosen": -267.82135009765625, + "logps/rejected": -257.8954162597656, + "loss": 0.5407, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.1627651453018188, + "rewards/margins": 0.6330679059028625, + "rewards/rejected": -1.7958329916000366, + "step": 1910 + }, + { + "epoch": 0.49, + "learning_rate": 4.6568673345192394e-07, + "logits/chosen": -2.872558116912842, + "logits/rejected": -2.7963390350341797, + "logps/chosen": -368.4126892089844, + "logps/rejected": -305.3444519042969, + "loss": 0.6164, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8419774770736694, + "rewards/margins": 1.7068140506744385, + "rewards/rejected": -2.5487914085388184, + "step": 1920 + }, + { + "epoch": 0.49, + "learning_rate": 4.652186124894673e-07, + "logits/chosen": -2.8567965030670166, + "logits/rejected": -2.6726253032684326, + "logps/chosen": -317.45062255859375, + "logps/rejected": -160.01547241210938, + "loss": 0.553, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01207830011844635, + "rewards/margins": 1.3234484195709229, + "rewards/rejected": -1.335526704788208, + "step": 1930 + }, + { + "epoch": 0.49, + "learning_rate": 4.6475049152701057e-07, + "logits/chosen": -2.758852481842041, + "logits/rejected": -2.6877527236938477, + "logps/chosen": -291.3718566894531, + "logps/rejected": -226.0242919921875, + "loss": 0.4435, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6673688292503357, + "rewards/margins": 1.2510454654693604, + "rewards/rejected": -1.9184144735336304, + "step": 1940 + }, + { + "epoch": 0.49, + "learning_rate": 4.6428237056455386e-07, + "logits/chosen": -2.761021137237549, + "logits/rejected": -2.714482307434082, + "logps/chosen": -228.28213500976562, + "logps/rejected": -216.9440155029297, + "loss": 0.6122, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.011810541152954, + "rewards/margins": 0.710497260093689, + "rewards/rejected": -1.7223079204559326, + "step": 1950 + }, + { + "epoch": 0.5, + "learning_rate": 4.6381424960209715e-07, + "logits/chosen": -2.654102325439453, + "logits/rejected": -2.702723264694214, + "logps/chosen": -296.45733642578125, + "logps/rejected": -313.7186279296875, + "loss": 0.6241, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2756218910217285, + "rewards/margins": 0.5113860368728638, + "rewards/rejected": -1.7870079278945923, + "step": 1960 + }, + { + "epoch": 0.5, + "learning_rate": 4.6334612863964044e-07, + "logits/chosen": -2.782576322555542, + "logits/rejected": -2.6841907501220703, + "logps/chosen": -284.8516540527344, + "logps/rejected": -260.6100769042969, + "loss": 0.6017, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1669677197933197, + "rewards/margins": 0.5671809911727905, + "rewards/rejected": -0.7341487407684326, + "step": 1970 + }, + { + "epoch": 0.5, + "learning_rate": 4.628780076771838e-07, + "logits/chosen": -2.5050246715545654, + "logits/rejected": -2.5695414543151855, + "logps/chosen": -236.9279022216797, + "logps/rejected": -214.5349578857422, + "loss": 0.5509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2865060567855835, + "rewards/margins": 1.4661260843276978, + "rewards/rejected": -1.7526321411132812, + "step": 1980 + }, + { + "epoch": 0.5, + "learning_rate": 4.6240988671472707e-07, + "logits/chosen": -2.651034355163574, + "logits/rejected": -2.722280979156494, + "logps/chosen": -177.27679443359375, + "logps/rejected": -235.3101806640625, + "loss": 0.5908, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.22419700026512146, + "rewards/margins": 1.5160796642303467, + "rewards/rejected": -1.7402766942977905, + "step": 1990 + }, + { + "epoch": 0.51, + "learning_rate": 4.6194176575227036e-07, + "logits/chosen": -2.5649561882019043, + "logits/rejected": -2.4804835319519043, + "logps/chosen": -387.7535095214844, + "logps/rejected": -289.9999694824219, + "loss": 0.6403, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.967598557472229, + "rewards/margins": 0.8338910937309265, + "rewards/rejected": -1.8014894723892212, + "step": 2000 + }, + { + "epoch": 0.51, + "learning_rate": 4.6147364478981365e-07, + "logits/chosen": -2.6053478717803955, + "logits/rejected": -2.6039249897003174, + "logps/chosen": -218.8382110595703, + "logps/rejected": -216.4173126220703, + "loss": 0.5884, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8255708813667297, + "rewards/margins": 0.8683991432189941, + "rewards/rejected": -1.6939697265625, + "step": 2010 + }, + { + "epoch": 0.51, + "learning_rate": 4.61005523827357e-07, + "logits/chosen": -2.6284866333007812, + "logits/rejected": -2.6124508380889893, + "logps/chosen": -302.99798583984375, + "logps/rejected": -240.9373779296875, + "loss": 0.5455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18973544239997864, + "rewards/margins": 1.3795931339263916, + "rewards/rejected": -1.5693285465240479, + "step": 2020 + }, + { + "epoch": 0.51, + "learning_rate": 4.605374028649003e-07, + "logits/chosen": -2.6844563484191895, + "logits/rejected": -2.695885181427002, + "logps/chosen": -267.65289306640625, + "logps/rejected": -275.90753173828125, + "loss": 0.524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9961126446723938, + "rewards/margins": 1.043602705001831, + "rewards/rejected": -2.03971529006958, + "step": 2030 + }, + { + "epoch": 0.52, + "learning_rate": 4.600692819024436e-07, + "logits/chosen": -2.759692907333374, + "logits/rejected": -2.5767171382904053, + "logps/chosen": -397.1188049316406, + "logps/rejected": -295.84710693359375, + "loss": 0.5585, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6714985966682434, + "rewards/margins": 1.577000379562378, + "rewards/rejected": -2.2484986782073975, + "step": 2040 + }, + { + "epoch": 0.52, + "learning_rate": 4.5960116093998686e-07, + "logits/chosen": -2.511993885040283, + "logits/rejected": -2.5586748123168945, + "logps/chosen": -247.68722534179688, + "logps/rejected": -325.49945068359375, + "loss": 0.4833, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3887598514556885, + "rewards/margins": 1.375656008720398, + "rewards/rejected": -2.7644155025482178, + "step": 2050 + }, + { + "epoch": 0.52, + "learning_rate": 4.5913303997753015e-07, + "logits/chosen": -2.638223648071289, + "logits/rejected": -2.7245888710021973, + "logps/chosen": -197.88986206054688, + "logps/rejected": -289.672119140625, + "loss": 0.5621, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2302311658859253, + "rewards/margins": 1.2392613887786865, + "rewards/rejected": -2.4694924354553223, + "step": 2060 + }, + { + "epoch": 0.52, + "learning_rate": 4.586649190150735e-07, + "logits/chosen": -2.5059456825256348, + "logits/rejected": -2.5340614318847656, + "logps/chosen": -299.3819885253906, + "logps/rejected": -259.20843505859375, + "loss": 0.5224, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5727068185806274, + "rewards/margins": 1.2742458581924438, + "rewards/rejected": -2.8469526767730713, + "step": 2070 + }, + { + "epoch": 0.53, + "learning_rate": 4.581967980526168e-07, + "logits/chosen": -2.6917359828948975, + "logits/rejected": -2.831451892852783, + "logps/chosen": -224.42752075195312, + "logps/rejected": -268.67498779296875, + "loss": 0.6184, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5647084712982178, + "rewards/margins": 0.5590938925743103, + "rewards/rejected": -2.123802661895752, + "step": 2080 + }, + { + "epoch": 0.53, + "learning_rate": 4.5772867709016007e-07, + "logits/chosen": -2.5519332885742188, + "logits/rejected": -2.6108298301696777, + "logps/chosen": -205.2392578125, + "logps/rejected": -281.6006774902344, + "loss": 0.5099, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9697121381759644, + "rewards/margins": 1.5353702306747437, + "rewards/rejected": -2.505082607269287, + "step": 2090 + }, + { + "epoch": 0.53, + "learning_rate": 4.5726055612770336e-07, + "logits/chosen": -2.7195792198181152, + "logits/rejected": -2.6307480335235596, + "logps/chosen": -257.200439453125, + "logps/rejected": -234.48812866210938, + "loss": 0.5728, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9558160901069641, + "rewards/margins": 1.1950876712799072, + "rewards/rejected": -2.1509037017822266, + "step": 2100 + }, + { + "epoch": 0.53, + "learning_rate": 4.567924351652467e-07, + "logits/chosen": -2.538254737854004, + "logits/rejected": -2.5346908569335938, + "logps/chosen": -212.212646484375, + "logps/rejected": -295.8035888671875, + "loss": 0.7148, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9982824325561523, + "rewards/margins": 0.681343674659729, + "rewards/rejected": -2.679626226425171, + "step": 2110 + }, + { + "epoch": 0.54, + "learning_rate": 4.5632431420279e-07, + "logits/chosen": -2.4563584327697754, + "logits/rejected": -2.542332649230957, + "logps/chosen": -251.83627319335938, + "logps/rejected": -282.85302734375, + "loss": 0.4758, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.539489507675171, + "rewards/margins": 1.854832410812378, + "rewards/rejected": -3.394321918487549, + "step": 2120 + }, + { + "epoch": 0.54, + "learning_rate": 4.5585619324033333e-07, + "logits/chosen": -2.710216999053955, + "logits/rejected": -2.6763594150543213, + "logps/chosen": -289.79974365234375, + "logps/rejected": -273.98248291015625, + "loss": 0.5503, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3079802989959717, + "rewards/margins": 1.1398521661758423, + "rewards/rejected": -2.4478325843811035, + "step": 2130 + }, + { + "epoch": 0.54, + "learning_rate": 4.5538807227787657e-07, + "logits/chosen": -2.601003408432007, + "logits/rejected": -2.473191976547241, + "logps/chosen": -419.03216552734375, + "logps/rejected": -315.4063415527344, + "loss": 0.523, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6389806866645813, + "rewards/margins": 1.801963210105896, + "rewards/rejected": -2.440943956375122, + "step": 2140 + }, + { + "epoch": 0.54, + "learning_rate": 4.5491995131541986e-07, + "logits/chosen": -2.6831939220428467, + "logits/rejected": -2.7121834754943848, + "logps/chosen": -205.4101104736328, + "logps/rejected": -221.7682647705078, + "loss": 0.5012, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1309032440185547, + "rewards/margins": 1.0398448705673218, + "rewards/rejected": -2.170748233795166, + "step": 2150 + }, + { + "epoch": 0.55, + "learning_rate": 4.544518303529632e-07, + "logits/chosen": -2.668999671936035, + "logits/rejected": -2.5639638900756836, + "logps/chosen": -318.5812072753906, + "logps/rejected": -237.9522247314453, + "loss": 0.661, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.829673409461975, + "rewards/margins": 1.336219072341919, + "rewards/rejected": -3.1658921241760254, + "step": 2160 + }, + { + "epoch": 0.55, + "learning_rate": 4.539837093905065e-07, + "logits/chosen": -2.5789854526519775, + "logits/rejected": -2.602631092071533, + "logps/chosen": -325.195556640625, + "logps/rejected": -290.55926513671875, + "loss": 0.5443, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8139760494232178, + "rewards/margins": 0.6441632509231567, + "rewards/rejected": -3.458139419555664, + "step": 2170 + }, + { + "epoch": 0.55, + "learning_rate": 4.535155884280498e-07, + "logits/chosen": -2.5557656288146973, + "logits/rejected": -2.5971879959106445, + "logps/chosen": -281.81121826171875, + "logps/rejected": -281.9815673828125, + "loss": 0.5885, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4063057899475098, + "rewards/margins": 0.7042945623397827, + "rewards/rejected": -3.110599994659424, + "step": 2180 + }, + { + "epoch": 0.55, + "learning_rate": 4.5304746746559307e-07, + "logits/chosen": -2.807870864868164, + "logits/rejected": -2.7482285499572754, + "logps/chosen": -346.4349670410156, + "logps/rejected": -309.8635559082031, + "loss": 0.6759, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.917128324508667, + "rewards/margins": 0.6684790253639221, + "rewards/rejected": -2.5856072902679443, + "step": 2190 + }, + { + "epoch": 0.56, + "learning_rate": 4.525793465031364e-07, + "logits/chosen": -2.6755623817443848, + "logits/rejected": -2.6319668292999268, + "logps/chosen": -365.0047302246094, + "logps/rejected": -225.29183959960938, + "loss": 0.5749, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.633305311203003, + "rewards/margins": 1.0927493572235107, + "rewards/rejected": -2.7260546684265137, + "step": 2200 + }, + { + "epoch": 0.56, + "learning_rate": 4.521112255406797e-07, + "logits/chosen": -2.551112413406372, + "logits/rejected": -2.5428919792175293, + "logps/chosen": -232.1934356689453, + "logps/rejected": -289.0122375488281, + "loss": 0.8049, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.458327054977417, + "rewards/margins": 1.503689169883728, + "rewards/rejected": -2.9620163440704346, + "step": 2210 + }, + { + "epoch": 0.56, + "learning_rate": 4.51643104578223e-07, + "logits/chosen": -2.6861534118652344, + "logits/rejected": -2.6878790855407715, + "logps/chosen": -263.35772705078125, + "logps/rejected": -240.6010284423828, + "loss": 0.5831, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3718329966068268, + "rewards/margins": 1.8208471536636353, + "rewards/rejected": -2.1926798820495605, + "step": 2220 + }, + { + "epoch": 0.56, + "learning_rate": 4.511749836157663e-07, + "logits/chosen": -2.6632721424102783, + "logits/rejected": -2.5350358486175537, + "logps/chosen": -270.24200439453125, + "logps/rejected": -227.2047882080078, + "loss": 0.6363, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4736149311065674, + "rewards/margins": 1.2423803806304932, + "rewards/rejected": -2.7159953117370605, + "step": 2230 + }, + { + "epoch": 0.57, + "learning_rate": 4.507068626533096e-07, + "logits/chosen": -2.6859564781188965, + "logits/rejected": -2.616516351699829, + "logps/chosen": -337.3983459472656, + "logps/rejected": -304.076904296875, + "loss": 0.6378, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1171061992645264, + "rewards/margins": 1.7147471904754639, + "rewards/rejected": -2.8318533897399902, + "step": 2240 + }, + { + "epoch": 0.57, + "learning_rate": 4.502387416908529e-07, + "logits/chosen": -2.7045998573303223, + "logits/rejected": -2.6133079528808594, + "logps/chosen": -452.3338928222656, + "logps/rejected": -380.03009033203125, + "loss": 0.5329, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8934322595596313, + "rewards/margins": 2.0240795612335205, + "rewards/rejected": -2.9175117015838623, + "step": 2250 + }, + { + "epoch": 0.57, + "learning_rate": 4.497706207283962e-07, + "logits/chosen": -2.5197484493255615, + "logits/rejected": -2.619424819946289, + "logps/chosen": -271.44000244140625, + "logps/rejected": -364.75384521484375, + "loss": 0.5912, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.892828345298767, + "rewards/margins": 0.9958688616752625, + "rewards/rejected": -2.888697385787964, + "step": 2260 + }, + { + "epoch": 0.57, + "learning_rate": 4.493024997659395e-07, + "logits/chosen": -2.718838930130005, + "logits/rejected": -2.617269515991211, + "logps/chosen": -298.60443115234375, + "logps/rejected": -247.50552368164062, + "loss": 0.4695, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.534011960029602, + "rewards/margins": 1.599219560623169, + "rewards/rejected": -2.1332316398620605, + "step": 2270 + }, + { + "epoch": 0.58, + "learning_rate": 4.488343788034828e-07, + "logits/chosen": -2.6473498344421387, + "logits/rejected": -2.563596487045288, + "logps/chosen": -200.87728881835938, + "logps/rejected": -181.6173858642578, + "loss": 0.5965, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9050285220146179, + "rewards/margins": 1.7684764862060547, + "rewards/rejected": -2.6735050678253174, + "step": 2280 + }, + { + "epoch": 0.58, + "learning_rate": 4.483662578410261e-07, + "logits/chosen": -2.6025519371032715, + "logits/rejected": -2.5912060737609863, + "logps/chosen": -240.58151245117188, + "logps/rejected": -267.31304931640625, + "loss": 0.5615, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6238843202590942, + "rewards/margins": 2.195256471633911, + "rewards/rejected": -2.819140672683716, + "step": 2290 + }, + { + "epoch": 0.58, + "learning_rate": 4.478981368785694e-07, + "logits/chosen": -2.5795750617980957, + "logits/rejected": -2.5501458644866943, + "logps/chosen": -216.99716186523438, + "logps/rejected": -218.18185424804688, + "loss": 0.5537, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6390549540519714, + "rewards/margins": 1.9194520711898804, + "rewards/rejected": -2.5585074424743652, + "step": 2300 + }, + { + "epoch": 0.58, + "learning_rate": 4.474300159161127e-07, + "logits/chosen": -2.545380115509033, + "logits/rejected": -2.4202880859375, + "logps/chosen": -211.42227172851562, + "logps/rejected": -239.5775146484375, + "loss": 0.5645, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7160159349441528, + "rewards/margins": 0.9380934834480286, + "rewards/rejected": -1.654109239578247, + "step": 2310 + }, + { + "epoch": 0.59, + "learning_rate": 4.46961894953656e-07, + "logits/chosen": -2.40726900100708, + "logits/rejected": -2.431823253631592, + "logps/chosen": -223.0937042236328, + "logps/rejected": -196.89883422851562, + "loss": 0.6963, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6814838647842407, + "rewards/margins": 0.3216429352760315, + "rewards/rejected": -2.003127098083496, + "step": 2320 + }, + { + "epoch": 0.59, + "learning_rate": 4.4649377399119933e-07, + "logits/chosen": -2.835279703140259, + "logits/rejected": -2.740386486053467, + "logps/chosen": -321.75152587890625, + "logps/rejected": -288.8874206542969, + "loss": 0.5352, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5283945798873901, + "rewards/margins": 2.2066421508789062, + "rewards/rejected": -2.735036611557007, + "step": 2330 + }, + { + "epoch": 0.59, + "learning_rate": 4.460256530287426e-07, + "logits/chosen": -2.5671308040618896, + "logits/rejected": -2.5428531169891357, + "logps/chosen": -403.35406494140625, + "logps/rejected": -291.94573974609375, + "loss": 0.5566, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4538414478302002, + "rewards/margins": 0.6209135055541992, + "rewards/rejected": -2.0747549533843994, + "step": 2340 + }, + { + "epoch": 0.59, + "learning_rate": 4.4555753206628596e-07, + "logits/chosen": -2.6901965141296387, + "logits/rejected": -2.624437093734741, + "logps/chosen": -179.4101104736328, + "logps/rejected": -185.3762969970703, + "loss": 0.7162, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7822803258895874, + "rewards/margins": 1.7482283115386963, + "rewards/rejected": -2.530508279800415, + "step": 2350 + }, + { + "epoch": 0.6, + "learning_rate": 4.450894111038292e-07, + "logits/chosen": -2.7932639122009277, + "logits/rejected": -2.770897626876831, + "logps/chosen": -296.88995361328125, + "logps/rejected": -244.643310546875, + "loss": 0.5341, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1266002655029297, + "rewards/margins": 1.2736326456069946, + "rewards/rejected": -2.400233030319214, + "step": 2360 + }, + { + "epoch": 0.6, + "learning_rate": 4.446212901413725e-07, + "logits/chosen": -2.3377389907836914, + "logits/rejected": -2.5143706798553467, + "logps/chosen": -312.0408630371094, + "logps/rejected": -325.20440673828125, + "loss": 0.6169, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4157869815826416, + "rewards/margins": 0.7878390550613403, + "rewards/rejected": -2.2036261558532715, + "step": 2370 + }, + { + "epoch": 0.6, + "learning_rate": 4.4415316917891583e-07, + "logits/chosen": -2.5377869606018066, + "logits/rejected": -2.651761293411255, + "logps/chosen": -188.7794647216797, + "logps/rejected": -275.84033203125, + "loss": 0.6268, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4358875751495361, + "rewards/margins": 0.8894082307815552, + "rewards/rejected": -2.3252956867218018, + "step": 2380 + }, + { + "epoch": 0.6, + "learning_rate": 4.436850482164591e-07, + "logits/chosen": -2.949428081512451, + "logits/rejected": -2.8552987575531006, + "logps/chosen": -290.80780029296875, + "logps/rejected": -259.5678405761719, + "loss": 0.546, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5419038534164429, + "rewards/margins": 1.150870442390442, + "rewards/rejected": -1.6927744150161743, + "step": 2390 + }, + { + "epoch": 0.61, + "learning_rate": 4.432169272540024e-07, + "logits/chosen": -2.7359933853149414, + "logits/rejected": -2.7360599040985107, + "logps/chosen": -278.16497802734375, + "logps/rejected": -320.8052673339844, + "loss": 0.6529, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9255379438400269, + "rewards/margins": 0.9193289875984192, + "rewards/rejected": -1.8448671102523804, + "step": 2400 + }, + { + "epoch": 0.61, + "learning_rate": 4.427488062915457e-07, + "logits/chosen": -2.6792635917663574, + "logits/rejected": -2.724548816680908, + "logps/chosen": -227.6698760986328, + "logps/rejected": -268.72247314453125, + "loss": 0.6107, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18764783442020416, + "rewards/margins": 1.3458220958709717, + "rewards/rejected": -1.533469557762146, + "step": 2410 + }, + { + "epoch": 0.61, + "learning_rate": 4.4228068532908904e-07, + "logits/chosen": -2.506946086883545, + "logits/rejected": -2.5913844108581543, + "logps/chosen": -218.8012237548828, + "logps/rejected": -284.5220642089844, + "loss": 0.5946, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7076995372772217, + "rewards/margins": 1.5795643329620361, + "rewards/rejected": -3.287263870239258, + "step": 2420 + }, + { + "epoch": 0.61, + "learning_rate": 4.4181256436663233e-07, + "logits/chosen": -2.5103375911712646, + "logits/rejected": -2.548065185546875, + "logps/chosen": -248.3396453857422, + "logps/rejected": -225.8044891357422, + "loss": 0.5725, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.242130994796753, + "rewards/margins": 1.2999765872955322, + "rewards/rejected": -2.542107343673706, + "step": 2430 + }, + { + "epoch": 0.62, + "learning_rate": 4.413444434041756e-07, + "logits/chosen": -2.590561628341675, + "logits/rejected": -2.5754146575927734, + "logps/chosen": -268.98944091796875, + "logps/rejected": -290.4842529296875, + "loss": 0.6782, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8283954858779907, + "rewards/margins": 0.2944660782814026, + "rewards/rejected": -2.122861385345459, + "step": 2440 + }, + { + "epoch": 0.62, + "learning_rate": 4.408763224417189e-07, + "logits/chosen": -2.510091781616211, + "logits/rejected": -2.4711225032806396, + "logps/chosen": -236.9765167236328, + "logps/rejected": -224.4086151123047, + "loss": 0.5842, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2400747537612915, + "rewards/margins": 1.2969255447387695, + "rewards/rejected": -2.5370001792907715, + "step": 2450 + }, + { + "epoch": 0.62, + "learning_rate": 4.404082014792622e-07, + "logits/chosen": -2.5807337760925293, + "logits/rejected": -2.552704334259033, + "logps/chosen": -358.95721435546875, + "logps/rejected": -306.596435546875, + "loss": 0.5798, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.365069031715393, + "rewards/margins": 1.3823387622833252, + "rewards/rejected": -2.747407913208008, + "step": 2460 + }, + { + "epoch": 0.62, + "learning_rate": 4.3994008051680554e-07, + "logits/chosen": -2.700641632080078, + "logits/rejected": -2.6497087478637695, + "logps/chosen": -359.22210693359375, + "logps/rejected": -443.8907165527344, + "loss": 0.6075, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6151984930038452, + "rewards/margins": 1.5008728504180908, + "rewards/rejected": -3.1160712242126465, + "step": 2470 + }, + { + "epoch": 0.63, + "learning_rate": 4.3947195955434883e-07, + "logits/chosen": -2.2459988594055176, + "logits/rejected": -2.2005584239959717, + "logps/chosen": -274.14263916015625, + "logps/rejected": -352.2812194824219, + "loss": 0.6611, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5034785270690918, + "rewards/margins": 0.2487478256225586, + "rewards/rejected": -1.7522262334823608, + "step": 2480 + }, + { + "epoch": 0.63, + "learning_rate": 4.390038385918921e-07, + "logits/chosen": -2.651543140411377, + "logits/rejected": -2.6370761394500732, + "logps/chosen": -270.1546325683594, + "logps/rejected": -262.6888427734375, + "loss": 0.8213, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9153323173522949, + "rewards/margins": 0.7845653295516968, + "rewards/rejected": -1.6998974084854126, + "step": 2490 + }, + { + "epoch": 0.63, + "learning_rate": 4.385357176294354e-07, + "logits/chosen": -2.6818630695343018, + "logits/rejected": -2.501220464706421, + "logps/chosen": -290.70037841796875, + "logps/rejected": -244.542236328125, + "loss": 0.6459, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.761728048324585, + "rewards/margins": 0.9028828740119934, + "rewards/rejected": -2.6646108627319336, + "step": 2500 + }, + { + "epoch": 0.63, + "learning_rate": 4.3806759666697875e-07, + "logits/chosen": -2.6011970043182373, + "logits/rejected": -2.5796284675598145, + "logps/chosen": -267.5291442871094, + "logps/rejected": -274.05657958984375, + "loss": 0.5122, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3027671575546265, + "rewards/margins": 1.2686413526535034, + "rewards/rejected": -2.571408748626709, + "step": 2510 + }, + { + "epoch": 0.64, + "learning_rate": 4.3759947570452204e-07, + "logits/chosen": -2.5026752948760986, + "logits/rejected": -2.3501696586608887, + "logps/chosen": -218.0870819091797, + "logps/rejected": -272.39178466796875, + "loss": 0.654, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.630427896976471, + "rewards/margins": 1.6490509510040283, + "rewards/rejected": -2.2794787883758545, + "step": 2520 + }, + { + "epoch": 0.64, + "learning_rate": 4.3713135474206533e-07, + "logits/chosen": -2.3893485069274902, + "logits/rejected": -2.3126299381256104, + "logps/chosen": -315.6602478027344, + "logps/rejected": -283.17169189453125, + "loss": 0.5226, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1809231042861938, + "rewards/margins": 1.385221242904663, + "rewards/rejected": -2.5661442279815674, + "step": 2530 + }, + { + "epoch": 0.64, + "learning_rate": 4.366632337796086e-07, + "logits/chosen": -2.6477644443511963, + "logits/rejected": -2.729999542236328, + "logps/chosen": -321.12933349609375, + "logps/rejected": -333.33648681640625, + "loss": 0.7278, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9740563631057739, + "rewards/margins": 0.9034310579299927, + "rewards/rejected": -1.8774875402450562, + "step": 2540 + }, + { + "epoch": 0.64, + "learning_rate": 4.3619511281715196e-07, + "logits/chosen": -2.3891119956970215, + "logits/rejected": -2.4518914222717285, + "logps/chosen": -345.5533752441406, + "logps/rejected": -297.24609375, + "loss": 0.5454, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.45164117217063904, + "rewards/margins": 2.1465792655944824, + "rewards/rejected": -2.598220109939575, + "step": 2550 + }, + { + "epoch": 0.65, + "learning_rate": 4.3572699185469525e-07, + "logits/chosen": -2.724630117416382, + "logits/rejected": -2.5989553928375244, + "logps/chosen": -305.9207458496094, + "logps/rejected": -310.9513244628906, + "loss": 0.5299, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9471807479858398, + "rewards/margins": 1.425126314163208, + "rewards/rejected": -2.372307062149048, + "step": 2560 + }, + { + "epoch": 0.65, + "learning_rate": 4.3525887089223854e-07, + "logits/chosen": -2.5453360080718994, + "logits/rejected": -2.501199245452881, + "logps/chosen": -276.5281677246094, + "logps/rejected": -264.8980407714844, + "loss": 0.8216, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0282284021377563, + "rewards/margins": 1.2830857038497925, + "rewards/rejected": -2.311314344406128, + "step": 2570 + }, + { + "epoch": 0.65, + "learning_rate": 4.3479074992978183e-07, + "logits/chosen": -2.536196708679199, + "logits/rejected": -2.4768316745758057, + "logps/chosen": -333.0994567871094, + "logps/rejected": -244.43875122070312, + "loss": 0.5424, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5456063747406006, + "rewards/margins": 0.5869501829147339, + "rewards/rejected": -2.132556438446045, + "step": 2580 + }, + { + "epoch": 0.65, + "learning_rate": 4.343226289673251e-07, + "logits/chosen": -2.6771626472473145, + "logits/rejected": -2.6559603214263916, + "logps/chosen": -341.78936767578125, + "logps/rejected": -319.7326965332031, + "loss": 0.4778, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0155402421951294, + "rewards/margins": 1.3975855112075806, + "rewards/rejected": -2.41312575340271, + "step": 2590 + }, + { + "epoch": 0.66, + "learning_rate": 4.3385450800486846e-07, + "logits/chosen": -2.4650776386260986, + "logits/rejected": -2.4052162170410156, + "logps/chosen": -260.9008483886719, + "logps/rejected": -206.55044555664062, + "loss": 0.5196, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0318548679351807, + "rewards/margins": 2.524167060852051, + "rewards/rejected": -3.5560219287872314, + "step": 2600 + }, + { + "epoch": 0.66, + "learning_rate": 4.3338638704241175e-07, + "logits/chosen": -2.79402494430542, + "logits/rejected": -2.622647762298584, + "logps/chosen": -415.70050048828125, + "logps/rejected": -334.04962158203125, + "loss": 0.6465, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.2122654914855957, + "rewards/margins": 1.0877020359039307, + "rewards/rejected": -3.2999675273895264, + "step": 2610 + }, + { + "epoch": 0.66, + "learning_rate": 4.3291826607995504e-07, + "logits/chosen": -2.5749564170837402, + "logits/rejected": -2.6640853881835938, + "logps/chosen": -267.54583740234375, + "logps/rejected": -270.51739501953125, + "loss": 0.6652, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0356361865997314, + "rewards/margins": 1.8222503662109375, + "rewards/rejected": -2.857886791229248, + "step": 2620 + }, + { + "epoch": 0.66, + "learning_rate": 4.3245014511749833e-07, + "logits/chosen": -2.6502137184143066, + "logits/rejected": -2.5814380645751953, + "logps/chosen": -284.9609375, + "logps/rejected": -341.3908996582031, + "loss": 0.5912, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.013614535331726, + "rewards/margins": 1.0198663473129272, + "rewards/rejected": -2.0334811210632324, + "step": 2630 + }, + { + "epoch": 0.67, + "learning_rate": 4.3198202415504167e-07, + "logits/chosen": -2.5044636726379395, + "logits/rejected": -2.4996304512023926, + "logps/chosen": -197.9936065673828, + "logps/rejected": -231.295654296875, + "loss": 0.5785, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.9943052530288696, + "rewards/margins": 0.5037738680839539, + "rewards/rejected": -1.4980791807174683, + "step": 2640 + }, + { + "epoch": 0.67, + "learning_rate": 4.3151390319258496e-07, + "logits/chosen": -2.6049554347991943, + "logits/rejected": -2.5984387397766113, + "logps/chosen": -154.5291290283203, + "logps/rejected": -180.2733154296875, + "loss": 0.5901, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0797511339187622, + "rewards/margins": 1.2088053226470947, + "rewards/rejected": -2.2885565757751465, + "step": 2650 + }, + { + "epoch": 0.67, + "learning_rate": 4.310457822301282e-07, + "logits/chosen": -2.6208643913269043, + "logits/rejected": -2.6619465351104736, + "logps/chosen": -329.0693054199219, + "logps/rejected": -323.13616943359375, + "loss": 0.6039, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8519481420516968, + "rewards/margins": 1.0418014526367188, + "rewards/rejected": -2.893749713897705, + "step": 2660 + }, + { + "epoch": 0.67, + "learning_rate": 4.3057766126767154e-07, + "logits/chosen": -2.710085391998291, + "logits/rejected": -2.647268533706665, + "logps/chosen": -318.821044921875, + "logps/rejected": -331.76300048828125, + "loss": 0.5378, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5609668493270874, + "rewards/margins": 1.7650535106658936, + "rewards/rejected": -3.3260207176208496, + "step": 2670 + }, + { + "epoch": 0.68, + "learning_rate": 4.3010954030521483e-07, + "logits/chosen": -2.488053798675537, + "logits/rejected": -2.5420925617218018, + "logps/chosen": -304.56951904296875, + "logps/rejected": -276.8664245605469, + "loss": 0.5102, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8793163299560547, + "rewards/margins": 1.3756439685821533, + "rewards/rejected": -3.254960298538208, + "step": 2680 + }, + { + "epoch": 0.68, + "learning_rate": 4.2964141934275817e-07, + "logits/chosen": -2.6673781871795654, + "logits/rejected": -2.6666882038116455, + "logps/chosen": -192.8503875732422, + "logps/rejected": -213.07363891601562, + "loss": 0.5483, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7033116817474365, + "rewards/margins": 1.4978384971618652, + "rewards/rejected": -3.201150417327881, + "step": 2690 + }, + { + "epoch": 0.68, + "learning_rate": 4.2917329838030146e-07, + "logits/chosen": -2.6083192825317383, + "logits/rejected": -2.6443545818328857, + "logps/chosen": -167.23509216308594, + "logps/rejected": -206.14404296875, + "loss": 0.4846, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8999881744384766, + "rewards/margins": 1.4323008060455322, + "rewards/rejected": -3.3322887420654297, + "step": 2700 + }, + { + "epoch": 0.69, + "learning_rate": 4.2870517741784475e-07, + "logits/chosen": -2.658937931060791, + "logits/rejected": -2.673595666885376, + "logps/chosen": -341.39056396484375, + "logps/rejected": -346.138671875, + "loss": 0.5855, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.2022271156311035, + "rewards/margins": 0.9777986407279968, + "rewards/rejected": -3.180025577545166, + "step": 2710 + }, + { + "epoch": 0.69, + "learning_rate": 4.2823705645538804e-07, + "logits/chosen": -2.581702470779419, + "logits/rejected": -2.4730591773986816, + "logps/chosen": -285.0960388183594, + "logps/rejected": -292.950927734375, + "loss": 0.6057, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.297670602798462, + "rewards/margins": 1.7100436687469482, + "rewards/rejected": -3.0077145099639893, + "step": 2720 + }, + { + "epoch": 0.69, + "learning_rate": 4.277689354929314e-07, + "logits/chosen": -2.655735492706299, + "logits/rejected": -2.5294718742370605, + "logps/chosen": -340.40887451171875, + "logps/rejected": -293.7754821777344, + "loss": 0.5714, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2910304069519043, + "rewards/margins": 1.173147201538086, + "rewards/rejected": -3.4641776084899902, + "step": 2730 + }, + { + "epoch": 0.69, + "learning_rate": 4.2730081453047467e-07, + "logits/chosen": -2.3572351932525635, + "logits/rejected": -2.313126564025879, + "logps/chosen": -216.91470336914062, + "logps/rejected": -229.12130737304688, + "loss": 0.5627, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2121973037719727, + "rewards/margins": 2.0007376670837402, + "rewards/rejected": -3.212934970855713, + "step": 2740 + }, + { + "epoch": 0.7, + "learning_rate": 4.2683269356801796e-07, + "logits/chosen": -2.639946460723877, + "logits/rejected": -2.4425880908966064, + "logps/chosen": -367.25152587890625, + "logps/rejected": -230.43032836914062, + "loss": 0.5015, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3880975246429443, + "rewards/margins": 2.261265516281128, + "rewards/rejected": -3.6493630409240723, + "step": 2750 + }, + { + "epoch": 0.7, + "learning_rate": 4.2636457260556125e-07, + "logits/chosen": -2.4713833332061768, + "logits/rejected": -2.4109320640563965, + "logps/chosen": -223.4802703857422, + "logps/rejected": -208.6910858154297, + "loss": 0.6571, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5514646768569946, + "rewards/margins": 2.1207289695739746, + "rewards/rejected": -3.6721935272216797, + "step": 2760 + }, + { + "epoch": 0.7, + "learning_rate": 4.2589645164310454e-07, + "logits/chosen": -2.6536941528320312, + "logits/rejected": -2.58679461479187, + "logps/chosen": -281.8035888671875, + "logps/rejected": -220.51327514648438, + "loss": 0.6474, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.821862816810608, + "rewards/margins": 1.2211488485336304, + "rewards/rejected": -3.0430119037628174, + "step": 2770 + }, + { + "epoch": 0.7, + "learning_rate": 4.254283306806479e-07, + "logits/chosen": -2.838618516921997, + "logits/rejected": -2.7523787021636963, + "logps/chosen": -281.54168701171875, + "logps/rejected": -240.41146850585938, + "loss": 0.59, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8369686007499695, + "rewards/margins": 1.0852001905441284, + "rewards/rejected": -1.9221687316894531, + "step": 2780 + }, + { + "epoch": 0.71, + "learning_rate": 4.2496020971819117e-07, + "logits/chosen": -2.658292293548584, + "logits/rejected": -2.6537060737609863, + "logps/chosen": -325.69561767578125, + "logps/rejected": -277.89501953125, + "loss": 0.6677, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4621500968933105, + "rewards/margins": 1.1805541515350342, + "rewards/rejected": -2.6427040100097656, + "step": 2790 + }, + { + "epoch": 0.71, + "learning_rate": 4.2449208875573446e-07, + "logits/chosen": -2.772883176803589, + "logits/rejected": -2.631762981414795, + "logps/chosen": -373.33929443359375, + "logps/rejected": -327.4913635253906, + "loss": 0.5467, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8034371137619019, + "rewards/margins": 1.0818064212799072, + "rewards/rejected": -1.8852436542510986, + "step": 2800 + }, + { + "epoch": 0.71, + "learning_rate": 4.2402396779327775e-07, + "logits/chosen": -2.6546027660369873, + "logits/rejected": -2.644376039505005, + "logps/chosen": -250.03372192382812, + "logps/rejected": -314.5126953125, + "loss": 0.5764, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1385111808776855, + "rewards/margins": 1.938734769821167, + "rewards/rejected": -3.0772461891174316, + "step": 2810 + }, + { + "epoch": 0.71, + "learning_rate": 4.235558468308211e-07, + "logits/chosen": -2.407291889190674, + "logits/rejected": -2.4491875171661377, + "logps/chosen": -278.3325500488281, + "logps/rejected": -268.4944152832031, + "loss": 0.4692, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0057759284973145, + "rewards/margins": 1.8401225805282593, + "rewards/rejected": -2.845898389816284, + "step": 2820 + }, + { + "epoch": 0.72, + "learning_rate": 4.230877258683644e-07, + "logits/chosen": -2.615633487701416, + "logits/rejected": -2.541612148284912, + "logps/chosen": -250.8361053466797, + "logps/rejected": -178.80010986328125, + "loss": 0.7134, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8592203855514526, + "rewards/margins": 1.3873034715652466, + "rewards/rejected": -2.2465240955352783, + "step": 2830 + }, + { + "epoch": 0.72, + "learning_rate": 4.2261960490590767e-07, + "logits/chosen": -2.609163522720337, + "logits/rejected": -2.5728368759155273, + "logps/chosen": -301.5133361816406, + "logps/rejected": -281.16314697265625, + "loss": 0.519, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.4930236339569092, + "rewards/margins": 1.7747684717178345, + "rewards/rejected": -3.267792224884033, + "step": 2840 + }, + { + "epoch": 0.72, + "learning_rate": 4.2215148394345096e-07, + "logits/chosen": -2.752432107925415, + "logits/rejected": -2.6862025260925293, + "logps/chosen": -332.0007629394531, + "logps/rejected": -251.06582641601562, + "loss": 0.4968, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6779168844223022, + "rewards/margins": 0.8063459396362305, + "rewards/rejected": -2.484262704849243, + "step": 2850 + }, + { + "epoch": 0.72, + "learning_rate": 4.2168336298099425e-07, + "logits/chosen": -2.6456823348999023, + "logits/rejected": -2.539421558380127, + "logps/chosen": -184.9524688720703, + "logps/rejected": -195.1049346923828, + "loss": 0.5587, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0221726894378662, + "rewards/margins": 1.4521821737289429, + "rewards/rejected": -2.4743552207946777, + "step": 2860 + }, + { + "epoch": 0.73, + "learning_rate": 4.212152420185376e-07, + "logits/chosen": -2.8353781700134277, + "logits/rejected": -2.7753653526306152, + "logps/chosen": -284.77081298828125, + "logps/rejected": -305.65374755859375, + "loss": 0.5973, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.002152442932129, + "rewards/margins": 0.8888009786605835, + "rewards/rejected": -2.890953540802002, + "step": 2870 + }, + { + "epoch": 0.73, + "learning_rate": 4.207471210560808e-07, + "logits/chosen": -2.7503039836883545, + "logits/rejected": -2.548729181289673, + "logps/chosen": -317.0842590332031, + "logps/rejected": -323.949462890625, + "loss": 0.6178, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9967567920684814, + "rewards/margins": 1.4991819858551025, + "rewards/rejected": -3.495938539505005, + "step": 2880 + }, + { + "epoch": 0.73, + "learning_rate": 4.2027900009362417e-07, + "logits/chosen": -2.707176685333252, + "logits/rejected": -2.716992139816284, + "logps/chosen": -258.08221435546875, + "logps/rejected": -381.6121520996094, + "loss": 0.4701, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2269902229309082, + "rewards/margins": 3.490187406539917, + "rewards/rejected": -4.717177867889404, + "step": 2890 + }, + { + "epoch": 0.73, + "learning_rate": 4.1981087913116746e-07, + "logits/chosen": -2.7810075283050537, + "logits/rejected": -2.7391176223754883, + "logps/chosen": -258.90576171875, + "logps/rejected": -228.8199462890625, + "loss": 0.535, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5134868621826172, + "rewards/margins": 1.1353763341903687, + "rewards/rejected": -2.6488633155822754, + "step": 2900 + }, + { + "epoch": 0.74, + "learning_rate": 4.193427581687108e-07, + "logits/chosen": -2.92142915725708, + "logits/rejected": -2.7393126487731934, + "logps/chosen": -458.55621337890625, + "logps/rejected": -362.22039794921875, + "loss": 0.5891, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -2.4719746112823486, + "rewards/margins": 0.5298187732696533, + "rewards/rejected": -3.001793146133423, + "step": 2910 + }, + { + "epoch": 0.74, + "learning_rate": 4.188746372062541e-07, + "logits/chosen": -2.731666088104248, + "logits/rejected": -2.667985439300537, + "logps/chosen": -369.28753662109375, + "logps/rejected": -301.48187255859375, + "loss": 0.6573, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5690410137176514, + "rewards/margins": 0.5976849794387817, + "rewards/rejected": -2.1667261123657227, + "step": 2920 + }, + { + "epoch": 0.74, + "learning_rate": 4.184065162437974e-07, + "logits/chosen": -2.7661612033843994, + "logits/rejected": -2.759232759475708, + "logps/chosen": -351.2125549316406, + "logps/rejected": -406.3238220214844, + "loss": 0.533, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3426258563995361, + "rewards/margins": 1.159343957901001, + "rewards/rejected": -2.501969814300537, + "step": 2930 + }, + { + "epoch": 0.74, + "learning_rate": 4.1793839528134067e-07, + "logits/chosen": -2.6930956840515137, + "logits/rejected": -2.6457581520080566, + "logps/chosen": -241.09414672851562, + "logps/rejected": -242.9203643798828, + "loss": 0.5967, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6755502223968506, + "rewards/margins": 2.0493533611297607, + "rewards/rejected": -2.7249033451080322, + "step": 2940 + }, + { + "epoch": 0.75, + "learning_rate": 4.17470274318884e-07, + "logits/chosen": -2.784623622894287, + "logits/rejected": -2.6285128593444824, + "logps/chosen": -266.16241455078125, + "logps/rejected": -191.28790283203125, + "loss": 0.598, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.225970983505249, + "rewards/margins": 1.7024619579315186, + "rewards/rejected": -2.9284331798553467, + "step": 2950 + }, + { + "epoch": 0.75, + "learning_rate": 4.170021533564273e-07, + "logits/chosen": -2.810001850128174, + "logits/rejected": -2.641927480697632, + "logps/chosen": -349.1688232421875, + "logps/rejected": -250.1381378173828, + "loss": 0.6235, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3024142980575562, + "rewards/margins": 1.194219708442688, + "rewards/rejected": -2.496634006500244, + "step": 2960 + }, + { + "epoch": 0.75, + "learning_rate": 4.1653403239397053e-07, + "logits/chosen": -2.920943021774292, + "logits/rejected": -2.827078342437744, + "logps/chosen": -388.9732971191406, + "logps/rejected": -319.05474853515625, + "loss": 0.7266, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.081215500831604, + "rewards/margins": 0.7131466269493103, + "rewards/rejected": -1.7943620681762695, + "step": 2970 + }, + { + "epoch": 0.75, + "learning_rate": 4.160659114315139e-07, + "logits/chosen": -2.6681265830993652, + "logits/rejected": -2.6208529472351074, + "logps/chosen": -246.3448028564453, + "logps/rejected": -256.88446044921875, + "loss": 0.6632, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1406188011169434, + "rewards/margins": 2.058380603790283, + "rewards/rejected": -3.1989991664886475, + "step": 2980 + }, + { + "epoch": 0.76, + "learning_rate": 4.1559779046905717e-07, + "logits/chosen": -2.7833802700042725, + "logits/rejected": -2.676621437072754, + "logps/chosen": -341.5685729980469, + "logps/rejected": -268.85015869140625, + "loss": 0.5589, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4013714790344238, + "rewards/margins": 1.5592626333236694, + "rewards/rejected": -2.9606339931488037, + "step": 2990 + }, + { + "epoch": 0.76, + "learning_rate": 4.151296695066005e-07, + "logits/chosen": -2.8344216346740723, + "logits/rejected": -2.7577290534973145, + "logps/chosen": -271.50860595703125, + "logps/rejected": -270.90576171875, + "loss": 0.5541, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.7711751461029053, + "rewards/margins": 0.8914017677307129, + "rewards/rejected": -2.662576913833618, + "step": 3000 + }, + { + "epoch": 0.76, + "learning_rate": 4.146615485441438e-07, + "logits/chosen": -2.62028169631958, + "logits/rejected": -2.5463624000549316, + "logps/chosen": -329.8163146972656, + "logps/rejected": -278.805908203125, + "loss": 0.5247, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.1455769538879395, + "rewards/margins": 1.8340288400650024, + "rewards/rejected": -3.9796054363250732, + "step": 3010 + }, + { + "epoch": 0.76, + "learning_rate": 4.141934275816871e-07, + "logits/chosen": -2.6459527015686035, + "logits/rejected": -2.6695361137390137, + "logps/chosen": -280.35699462890625, + "logps/rejected": -262.6058654785156, + "loss": 0.6609, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1435942649841309, + "rewards/margins": 1.60695481300354, + "rewards/rejected": -2.750548839569092, + "step": 3020 + }, + { + "epoch": 0.77, + "learning_rate": 4.137253066192304e-07, + "logits/chosen": -2.807156562805176, + "logits/rejected": -2.7334413528442383, + "logps/chosen": -327.09344482421875, + "logps/rejected": -289.5278625488281, + "loss": 0.537, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9722150564193726, + "rewards/margins": 1.2871454954147339, + "rewards/rejected": -2.2593607902526855, + "step": 3030 + }, + { + "epoch": 0.77, + "learning_rate": 4.132571856567737e-07, + "logits/chosen": -2.73054838180542, + "logits/rejected": -2.613467216491699, + "logps/chosen": -325.3888244628906, + "logps/rejected": -293.91192626953125, + "loss": 0.6763, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6591287851333618, + "rewards/margins": 0.35142362117767334, + "rewards/rejected": -2.0105526447296143, + "step": 3040 + }, + { + "epoch": 0.77, + "learning_rate": 4.12789064694317e-07, + "logits/chosen": -2.6410369873046875, + "logits/rejected": -2.5918993949890137, + "logps/chosen": -286.21051025390625, + "logps/rejected": -279.4210205078125, + "loss": 0.6697, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2338840961456299, + "rewards/margins": 1.201923131942749, + "rewards/rejected": -2.435807704925537, + "step": 3050 + }, + { + "epoch": 0.77, + "learning_rate": 4.123209437318603e-07, + "logits/chosen": -2.6927359104156494, + "logits/rejected": -2.6920785903930664, + "logps/chosen": -403.5760498046875, + "logps/rejected": -293.9722595214844, + "loss": 0.5659, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.5857149362564087, + "rewards/margins": 0.5379332304000854, + "rewards/rejected": -2.123648166656494, + "step": 3060 + }, + { + "epoch": 0.78, + "learning_rate": 4.118528227694036e-07, + "logits/chosen": -2.538649082183838, + "logits/rejected": -2.5093741416931152, + "logps/chosen": -229.52725219726562, + "logps/rejected": -292.178466796875, + "loss": 0.5763, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.477867841720581, + "rewards/margins": 1.1251497268676758, + "rewards/rejected": -2.603017568588257, + "step": 3070 + }, + { + "epoch": 0.78, + "learning_rate": 4.113847018069469e-07, + "logits/chosen": -2.6631672382354736, + "logits/rejected": -2.600651741027832, + "logps/chosen": -307.30609130859375, + "logps/rejected": -316.18426513671875, + "loss": 0.5171, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.586888074874878, + "rewards/margins": 1.2167094945907593, + "rewards/rejected": -2.8035976886749268, + "step": 3080 + }, + { + "epoch": 0.78, + "learning_rate": 4.109165808444902e-07, + "logits/chosen": -2.6414685249328613, + "logits/rejected": -2.5432944297790527, + "logps/chosen": -282.6369934082031, + "logps/rejected": -255.5457000732422, + "loss": 0.6611, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7821651697158813, + "rewards/margins": 1.5107762813568115, + "rewards/rejected": -3.2929415702819824, + "step": 3090 + }, + { + "epoch": 0.78, + "learning_rate": 4.104484598820335e-07, + "logits/chosen": -2.742868661880493, + "logits/rejected": -2.6158576011657715, + "logps/chosen": -297.921875, + "logps/rejected": -295.4357604980469, + "loss": 0.5415, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3601291179656982, + "rewards/margins": 1.7456867694854736, + "rewards/rejected": -3.10581636428833, + "step": 3100 + }, + { + "epoch": 0.79, + "learning_rate": 4.099803389195768e-07, + "logits/chosen": -2.6755316257476807, + "logits/rejected": -2.6158745288848877, + "logps/chosen": -228.6281280517578, + "logps/rejected": -174.53689575195312, + "loss": 0.5777, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9730864763259888, + "rewards/margins": 0.5768686532974243, + "rewards/rejected": -1.549955129623413, + "step": 3110 + }, + { + "epoch": 0.79, + "learning_rate": 4.095122179571201e-07, + "logits/chosen": -2.855614185333252, + "logits/rejected": -2.75842022895813, + "logps/chosen": -370.797119140625, + "logps/rejected": -288.92547607421875, + "loss": 0.6124, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.317105293273926, + "rewards/margins": 0.5193389654159546, + "rewards/rejected": -2.836444139480591, + "step": 3120 + }, + { + "epoch": 0.79, + "learning_rate": 4.0904409699466343e-07, + "logits/chosen": -2.551896095275879, + "logits/rejected": -2.552696704864502, + "logps/chosen": -204.35296630859375, + "logps/rejected": -230.80282592773438, + "loss": 0.5603, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.911942720413208, + "rewards/margins": 1.3499393463134766, + "rewards/rejected": -3.2618820667266846, + "step": 3130 + }, + { + "epoch": 0.79, + "learning_rate": 4.085759760322067e-07, + "logits/chosen": -2.778578519821167, + "logits/rejected": -2.698376417160034, + "logps/chosen": -189.66526794433594, + "logps/rejected": -198.605712890625, + "loss": 0.5461, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7923952341079712, + "rewards/margins": 1.0926042795181274, + "rewards/rejected": -2.8849995136260986, + "step": 3140 + }, + { + "epoch": 0.8, + "learning_rate": 4.0810785506975e-07, + "logits/chosen": -2.641099691390991, + "logits/rejected": -2.5993666648864746, + "logps/chosen": -179.12158203125, + "logps/rejected": -188.12527465820312, + "loss": 0.4533, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4460694789886475, + "rewards/margins": 1.0726109743118286, + "rewards/rejected": -2.5186808109283447, + "step": 3150 + }, + { + "epoch": 0.8, + "learning_rate": 4.076397341072933e-07, + "logits/chosen": -2.62526798248291, + "logits/rejected": -2.607677936553955, + "logps/chosen": -210.54129028320312, + "logps/rejected": -272.2312927246094, + "loss": 0.581, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.301488995552063, + "rewards/margins": 0.9993396997451782, + "rewards/rejected": -2.300828695297241, + "step": 3160 + }, + { + "epoch": 0.8, + "learning_rate": 4.071716131448366e-07, + "logits/chosen": -2.711860179901123, + "logits/rejected": -2.6672210693359375, + "logps/chosen": -294.1248779296875, + "logps/rejected": -276.2283935546875, + "loss": 0.571, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4649314284324646, + "rewards/margins": 1.3288383483886719, + "rewards/rejected": -1.7937695980072021, + "step": 3170 + }, + { + "epoch": 0.8, + "learning_rate": 4.0670349218237993e-07, + "logits/chosen": -2.5365118980407715, + "logits/rejected": -2.5782217979431152, + "logps/chosen": -335.83917236328125, + "logps/rejected": -262.0423583984375, + "loss": 0.4983, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8767711520195007, + "rewards/margins": 1.6427284479141235, + "rewards/rejected": -2.5194995403289795, + "step": 3180 + }, + { + "epoch": 0.81, + "learning_rate": 4.0623537121992316e-07, + "logits/chosen": -2.634230852127075, + "logits/rejected": -2.648371934890747, + "logps/chosen": -328.01885986328125, + "logps/rejected": -252.741943359375, + "loss": 0.575, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.053867816925049, + "rewards/margins": 1.3023773431777954, + "rewards/rejected": -3.356245517730713, + "step": 3190 + }, + { + "epoch": 0.81, + "learning_rate": 4.057672502574665e-07, + "logits/chosen": -2.7491402626037598, + "logits/rejected": -2.7113699913024902, + "logps/chosen": -366.0289001464844, + "logps/rejected": -295.0087585449219, + "loss": 0.5942, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1792293787002563, + "rewards/margins": 1.7449098825454712, + "rewards/rejected": -2.9241390228271484, + "step": 3200 + }, + { + "epoch": 0.81, + "learning_rate": 4.052991292950098e-07, + "logits/chosen": -2.7164413928985596, + "logits/rejected": -2.7283341884613037, + "logps/chosen": -244.5092315673828, + "logps/rejected": -243.07958984375, + "loss": 0.5719, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5343577861785889, + "rewards/margins": 0.30130812525749207, + "rewards/rejected": -1.8356659412384033, + "step": 3210 + }, + { + "epoch": 0.81, + "learning_rate": 4.0483100833255314e-07, + "logits/chosen": -2.690765619277954, + "logits/rejected": -2.544368028640747, + "logps/chosen": -251.3297576904297, + "logps/rejected": -224.3122100830078, + "loss": 0.5907, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7500336170196533, + "rewards/margins": 1.2233902215957642, + "rewards/rejected": -2.973423719406128, + "step": 3220 + }, + { + "epoch": 0.82, + "learning_rate": 4.0436288737009643e-07, + "logits/chosen": -2.56693959236145, + "logits/rejected": -2.575343608856201, + "logps/chosen": -218.8804473876953, + "logps/rejected": -245.62911987304688, + "loss": 0.557, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9216814041137695, + "rewards/margins": 1.8012382984161377, + "rewards/rejected": -3.7229199409484863, + "step": 3230 + }, + { + "epoch": 0.82, + "learning_rate": 4.038947664076397e-07, + "logits/chosen": -2.5427839756011963, + "logits/rejected": -2.329836368560791, + "logps/chosen": -255.80868530273438, + "logps/rejected": -224.1576690673828, + "loss": 0.5536, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7845814228057861, + "rewards/margins": 1.403369665145874, + "rewards/rejected": -3.187950849533081, + "step": 3240 + }, + { + "epoch": 0.82, + "learning_rate": 4.03426645445183e-07, + "logits/chosen": -2.501692533493042, + "logits/rejected": -2.4710707664489746, + "logps/chosen": -273.8977966308594, + "logps/rejected": -273.548095703125, + "loss": 0.5992, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.00805401802063, + "rewards/margins": 1.3584315776824951, + "rewards/rejected": -3.366485595703125, + "step": 3250 + }, + { + "epoch": 0.82, + "learning_rate": 4.0295852448272635e-07, + "logits/chosen": -2.6749606132507324, + "logits/rejected": -2.640605926513672, + "logps/chosen": -208.001708984375, + "logps/rejected": -244.11300659179688, + "loss": 0.564, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.029116153717041, + "rewards/margins": 1.3018925189971924, + "rewards/rejected": -3.3310089111328125, + "step": 3260 + }, + { + "epoch": 0.83, + "learning_rate": 4.0249040352026964e-07, + "logits/chosen": -2.686748504638672, + "logits/rejected": -2.5494863986968994, + "logps/chosen": -302.6435546875, + "logps/rejected": -251.1964874267578, + "loss": 0.6419, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -2.0372931957244873, + "rewards/margins": 0.9006452560424805, + "rewards/rejected": -2.9379382133483887, + "step": 3270 + }, + { + "epoch": 0.83, + "learning_rate": 4.020222825578129e-07, + "logits/chosen": -2.7011072635650635, + "logits/rejected": -2.6204285621643066, + "logps/chosen": -292.6033630371094, + "logps/rejected": -324.64434814453125, + "loss": 0.5648, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.684535264968872, + "rewards/margins": 1.2165940999984741, + "rewards/rejected": -2.9011292457580566, + "step": 3280 + }, + { + "epoch": 0.83, + "learning_rate": 4.015541615953562e-07, + "logits/chosen": -2.535747528076172, + "logits/rejected": -2.582158088684082, + "logps/chosen": -195.5382843017578, + "logps/rejected": -261.56365966796875, + "loss": 0.6563, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.721742868423462, + "rewards/margins": 0.9087635278701782, + "rewards/rejected": -3.6305060386657715, + "step": 3290 + }, + { + "epoch": 0.83, + "learning_rate": 4.010860406328995e-07, + "logits/chosen": -2.808954954147339, + "logits/rejected": -2.696636915206909, + "logps/chosen": -361.0929260253906, + "logps/rejected": -290.98779296875, + "loss": 0.6535, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.4368767738342285, + "rewards/margins": 0.8197757601737976, + "rewards/rejected": -3.256652355194092, + "step": 3300 + }, + { + "epoch": 0.84, + "learning_rate": 4.0061791967044285e-07, + "logits/chosen": -2.429372787475586, + "logits/rejected": -2.4101641178131104, + "logps/chosen": -292.60931396484375, + "logps/rejected": -245.3379669189453, + "loss": 0.6289, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1441898345947266, + "rewards/margins": 1.6892770528793335, + "rewards/rejected": -3.8334667682647705, + "step": 3310 + }, + { + "epoch": 0.84, + "learning_rate": 4.0014979870798614e-07, + "logits/chosen": -2.67567777633667, + "logits/rejected": -2.740675687789917, + "logps/chosen": -295.6293029785156, + "logps/rejected": -416.08221435546875, + "loss": 0.6084, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9729360342025757, + "rewards/margins": 1.8700878620147705, + "rewards/rejected": -3.8430240154266357, + "step": 3320 + }, + { + "epoch": 0.84, + "learning_rate": 3.9968167774552943e-07, + "logits/chosen": -2.6133923530578613, + "logits/rejected": -2.5547168254852295, + "logps/chosen": -234.68698120117188, + "logps/rejected": -259.75048828125, + "loss": 0.4013, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.157472610473633, + "rewards/margins": 1.3976643085479736, + "rewards/rejected": -3.5551364421844482, + "step": 3330 + }, + { + "epoch": 0.84, + "learning_rate": 3.992135567830727e-07, + "logits/chosen": -2.5629830360412598, + "logits/rejected": -2.607271671295166, + "logps/chosen": -232.90493774414062, + "logps/rejected": -259.7083435058594, + "loss": 0.5546, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1839420795440674, + "rewards/margins": 2.026728391647339, + "rewards/rejected": -3.2106704711914062, + "step": 3340 + }, + { + "epoch": 0.85, + "learning_rate": 3.9874543582061606e-07, + "logits/chosen": -2.6749119758605957, + "logits/rejected": -2.4684460163116455, + "logps/chosen": -257.3249816894531, + "logps/rejected": -186.1034698486328, + "loss": 0.5217, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.893366813659668, + "rewards/margins": 1.852936029434204, + "rewards/rejected": -3.746302843093872, + "step": 3350 + }, + { + "epoch": 0.85, + "learning_rate": 3.9827731485815935e-07, + "logits/chosen": -2.6366989612579346, + "logits/rejected": -2.5729355812072754, + "logps/chosen": -281.87969970703125, + "logps/rejected": -295.0253601074219, + "loss": 0.6457, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.02162504196167, + "rewards/margins": 0.9705334901809692, + "rewards/rejected": -3.9921584129333496, + "step": 3360 + }, + { + "epoch": 0.85, + "learning_rate": 3.978091938957026e-07, + "logits/chosen": -2.4280495643615723, + "logits/rejected": -2.409846782684326, + "logps/chosen": -249.6262969970703, + "logps/rejected": -197.26663208007812, + "loss": 0.5377, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.532490849494934, + "rewards/margins": 1.3351542949676514, + "rewards/rejected": -2.867644786834717, + "step": 3370 + }, + { + "epoch": 0.85, + "learning_rate": 3.9734107293324593e-07, + "logits/chosen": -2.3564815521240234, + "logits/rejected": -2.418379545211792, + "logps/chosen": -230.7577667236328, + "logps/rejected": -210.77334594726562, + "loss": 0.5286, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.699113368988037, + "rewards/margins": 0.9843389391899109, + "rewards/rejected": -3.6834521293640137, + "step": 3380 + }, + { + "epoch": 0.86, + "learning_rate": 3.968729519707892e-07, + "logits/chosen": -2.6940178871154785, + "logits/rejected": -2.5732855796813965, + "logps/chosen": -276.3876953125, + "logps/rejected": -253.4854278564453, + "loss": 0.5725, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.361846923828125, + "rewards/margins": 0.9445775747299194, + "rewards/rejected": -2.306424617767334, + "step": 3390 + }, + { + "epoch": 0.86, + "learning_rate": 3.9640483100833256e-07, + "logits/chosen": -2.7425854206085205, + "logits/rejected": -2.730180501937866, + "logps/chosen": -398.3658142089844, + "logps/rejected": -334.22869873046875, + "loss": 0.5644, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1900866031646729, + "rewards/margins": 1.9019054174423218, + "rewards/rejected": -3.091992139816284, + "step": 3400 + }, + { + "epoch": 0.86, + "learning_rate": 3.959367100458758e-07, + "logits/chosen": -2.6171083450317383, + "logits/rejected": -2.6043202877044678, + "logps/chosen": -289.6757507324219, + "logps/rejected": -265.671630859375, + "loss": 0.5571, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2667036056518555, + "rewards/margins": 1.299369215965271, + "rewards/rejected": -2.566072702407837, + "step": 3410 + }, + { + "epoch": 0.86, + "learning_rate": 3.9546858908341914e-07, + "logits/chosen": -2.6402785778045654, + "logits/rejected": -2.543848752975464, + "logps/chosen": -292.3985290527344, + "logps/rejected": -254.4654083251953, + "loss": 0.5835, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4257738590240479, + "rewards/margins": 1.4361470937728882, + "rewards/rejected": -2.8619208335876465, + "step": 3420 + }, + { + "epoch": 0.87, + "learning_rate": 3.950004681209624e-07, + "logits/chosen": -2.537668228149414, + "logits/rejected": -2.5646309852600098, + "logps/chosen": -211.93148803710938, + "logps/rejected": -216.738037109375, + "loss": 0.5892, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6508975028991699, + "rewards/margins": 1.7442976236343384, + "rewards/rejected": -2.3951950073242188, + "step": 3430 + }, + { + "epoch": 0.87, + "learning_rate": 3.9453234715850577e-07, + "logits/chosen": -2.6993932723999023, + "logits/rejected": -2.5626072883605957, + "logps/chosen": -358.39227294921875, + "logps/rejected": -276.04742431640625, + "loss": 0.5386, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.695052981376648, + "rewards/margins": 2.2782647609710693, + "rewards/rejected": -2.9733176231384277, + "step": 3440 + }, + { + "epoch": 0.87, + "learning_rate": 3.9406422619604906e-07, + "logits/chosen": -2.7179439067840576, + "logits/rejected": -2.6138925552368164, + "logps/chosen": -378.1282958984375, + "logps/rejected": -288.8641357421875, + "loss": 0.5758, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9554533958435059, + "rewards/margins": 1.3336598873138428, + "rewards/rejected": -2.2891132831573486, + "step": 3450 + }, + { + "epoch": 0.87, + "learning_rate": 3.9359610523359235e-07, + "logits/chosen": -2.6427860260009766, + "logits/rejected": -2.5691373348236084, + "logps/chosen": -247.28494262695312, + "logps/rejected": -264.9586181640625, + "loss": 0.5398, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0383670330047607, + "rewards/margins": 1.094294786453247, + "rewards/rejected": -2.1326615810394287, + "step": 3460 + }, + { + "epoch": 0.88, + "learning_rate": 3.9312798427113564e-07, + "logits/chosen": -2.5711920261383057, + "logits/rejected": -2.5556704998016357, + "logps/chosen": -189.369384765625, + "logps/rejected": -230.4828338623047, + "loss": 0.5015, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8589425086975098, + "rewards/margins": 1.2133837938308716, + "rewards/rejected": -2.072326183319092, + "step": 3470 + }, + { + "epoch": 0.88, + "learning_rate": 3.926598633086789e-07, + "logits/chosen": -2.505418062210083, + "logits/rejected": -2.6115143299102783, + "logps/chosen": -305.3795471191406, + "logps/rejected": -264.7258605957031, + "loss": 0.6931, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -2.0123074054718018, + "rewards/margins": -0.07890516519546509, + "rewards/rejected": -1.933402419090271, + "step": 3480 + }, + { + "epoch": 0.88, + "learning_rate": 3.9219174234622227e-07, + "logits/chosen": -2.527928113937378, + "logits/rejected": -2.6276683807373047, + "logps/chosen": -282.0664367675781, + "logps/rejected": -267.37109375, + "loss": 0.5965, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4060749113559723, + "rewards/margins": 1.2708396911621094, + "rewards/rejected": -1.6769145727157593, + "step": 3490 + }, + { + "epoch": 0.88, + "learning_rate": 3.917236213837655e-07, + "logits/chosen": -2.7274959087371826, + "logits/rejected": -2.577826976776123, + "logps/chosen": -331.56549072265625, + "logps/rejected": -279.29083251953125, + "loss": 0.4947, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7532097697257996, + "rewards/margins": 1.5866296291351318, + "rewards/rejected": -2.339839220046997, + "step": 3500 + }, + { + "epoch": 0.89, + "learning_rate": 3.9125550042130885e-07, + "logits/chosen": -2.564699649810791, + "logits/rejected": -2.598123550415039, + "logps/chosen": -278.11785888671875, + "logps/rejected": -316.83502197265625, + "loss": 0.5313, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2439638376235962, + "rewards/margins": 0.5872949361801147, + "rewards/rejected": -1.831258773803711, + "step": 3510 + }, + { + "epoch": 0.89, + "learning_rate": 3.9078737945885214e-07, + "logits/chosen": -2.628638744354248, + "logits/rejected": -2.470625400543213, + "logps/chosen": -342.5315856933594, + "logps/rejected": -275.0693054199219, + "loss": 0.7085, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.970574140548706, + "rewards/margins": 1.1266072988510132, + "rewards/rejected": -3.097181797027588, + "step": 3520 + }, + { + "epoch": 0.89, + "learning_rate": 3.903192584963955e-07, + "logits/chosen": -2.676095485687256, + "logits/rejected": -2.6351985931396484, + "logps/chosen": -349.648193359375, + "logps/rejected": -269.4110412597656, + "loss": 0.6365, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7081565856933594, + "rewards/margins": 1.273766279220581, + "rewards/rejected": -2.9819226264953613, + "step": 3530 + }, + { + "epoch": 0.89, + "learning_rate": 3.8985113753393877e-07, + "logits/chosen": -2.4113409519195557, + "logits/rejected": -2.4867024421691895, + "logps/chosen": -209.7042999267578, + "logps/rejected": -235.5690155029297, + "loss": 0.5081, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2396607398986816, + "rewards/margins": 1.8427131175994873, + "rewards/rejected": -3.082373857498169, + "step": 3540 + }, + { + "epoch": 0.9, + "learning_rate": 3.8938301657148206e-07, + "logits/chosen": -2.6304538249969482, + "logits/rejected": -2.5243570804595947, + "logps/chosen": -293.2393798828125, + "logps/rejected": -311.14263916015625, + "loss": 0.5648, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9866330027580261, + "rewards/margins": 2.1932902336120605, + "rewards/rejected": -3.1799235343933105, + "step": 3550 + }, + { + "epoch": 0.9, + "learning_rate": 3.8891489560902535e-07, + "logits/chosen": -2.3965888023376465, + "logits/rejected": -2.5309622287750244, + "logps/chosen": -242.2054443359375, + "logps/rejected": -270.7049255371094, + "loss": 0.577, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3094321489334106, + "rewards/margins": 1.3607757091522217, + "rewards/rejected": -2.6702075004577637, + "step": 3560 + }, + { + "epoch": 0.9, + "learning_rate": 3.884467746465687e-07, + "logits/chosen": -2.592451333999634, + "logits/rejected": -2.6167538166046143, + "logps/chosen": -231.6024627685547, + "logps/rejected": -326.56378173828125, + "loss": 0.6839, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8817847967147827, + "rewards/margins": 2.196343183517456, + "rewards/rejected": -3.0781280994415283, + "step": 3570 + }, + { + "epoch": 0.9, + "learning_rate": 3.87978653684112e-07, + "logits/chosen": -2.794201612472534, + "logits/rejected": -2.5453829765319824, + "logps/chosen": -418.9364318847656, + "logps/rejected": -227.43875122070312, + "loss": 0.4803, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.071514368057251, + "rewards/margins": 1.1174607276916504, + "rewards/rejected": -3.1889748573303223, + "step": 3580 + }, + { + "epoch": 0.91, + "learning_rate": 3.875105327216552e-07, + "logits/chosen": -2.723081350326538, + "logits/rejected": -2.688534736633301, + "logps/chosen": -254.62332153320312, + "logps/rejected": -248.84072875976562, + "loss": 0.5947, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0840532779693604, + "rewards/margins": 1.7423394918441772, + "rewards/rejected": -2.826392650604248, + "step": 3590 + }, + { + "epoch": 0.91, + "learning_rate": 3.8704241175919856e-07, + "logits/chosen": -2.6092543601989746, + "logits/rejected": -2.518146514892578, + "logps/chosen": -291.47137451171875, + "logps/rejected": -250.5021209716797, + "loss": 0.5874, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7550883889198303, + "rewards/margins": 1.8083865642547607, + "rewards/rejected": -2.5634751319885254, + "step": 3600 + }, + { + "epoch": 0.91, + "learning_rate": 3.8657429079674185e-07, + "logits/chosen": -2.7659926414489746, + "logits/rejected": -2.703761577606201, + "logps/chosen": -304.2853088378906, + "logps/rejected": -325.69012451171875, + "loss": 0.6338, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4947483539581299, + "rewards/margins": 0.9108313322067261, + "rewards/rejected": -2.4055798053741455, + "step": 3610 + }, + { + "epoch": 0.92, + "learning_rate": 3.861061698342852e-07, + "logits/chosen": -2.6591992378234863, + "logits/rejected": -2.55127215385437, + "logps/chosen": -316.5832214355469, + "logps/rejected": -230.5525360107422, + "loss": 0.5693, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1001923084259033, + "rewards/margins": 1.9854466915130615, + "rewards/rejected": -3.085639238357544, + "step": 3620 + }, + { + "epoch": 0.92, + "learning_rate": 3.856380488718284e-07, + "logits/chosen": -2.67317795753479, + "logits/rejected": -2.627720594406128, + "logps/chosen": -236.6415557861328, + "logps/rejected": -306.64239501953125, + "loss": 0.6112, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4338676929473877, + "rewards/margins": 2.2288482189178467, + "rewards/rejected": -3.6627159118652344, + "step": 3630 + }, + { + "epoch": 0.92, + "learning_rate": 3.8516992790937177e-07, + "logits/chosen": -2.586552619934082, + "logits/rejected": -2.6333885192871094, + "logps/chosen": -260.02496337890625, + "logps/rejected": -262.3716125488281, + "loss": 0.6356, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.574707269668579, + "rewards/margins": 1.0051276683807373, + "rewards/rejected": -2.5798351764678955, + "step": 3640 + }, + { + "epoch": 0.92, + "learning_rate": 3.8470180694691506e-07, + "logits/chosen": -2.8641533851623535, + "logits/rejected": -2.7077131271362305, + "logps/chosen": -271.4937438964844, + "logps/rejected": -222.1265411376953, + "loss": 0.615, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.424896001815796, + "rewards/margins": 1.3194665908813477, + "rewards/rejected": -2.7443625926971436, + "step": 3650 + }, + { + "epoch": 0.93, + "learning_rate": 3.842336859844584e-07, + "logits/chosen": -2.6335389614105225, + "logits/rejected": -2.5206198692321777, + "logps/chosen": -303.037353515625, + "logps/rejected": -293.91668701171875, + "loss": 0.5565, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4828656911849976, + "rewards/margins": 1.1465017795562744, + "rewards/rejected": -2.6293673515319824, + "step": 3660 + }, + { + "epoch": 0.93, + "learning_rate": 3.837655650220017e-07, + "logits/chosen": -2.773890256881714, + "logits/rejected": -2.586940050125122, + "logps/chosen": -332.7080993652344, + "logps/rejected": -269.8664855957031, + "loss": 0.6619, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.16353440284729, + "rewards/margins": 1.4180161952972412, + "rewards/rejected": -3.581550121307373, + "step": 3670 + }, + { + "epoch": 0.93, + "learning_rate": 3.832974440595449e-07, + "logits/chosen": -2.72123122215271, + "logits/rejected": -2.6845922470092773, + "logps/chosen": -299.73162841796875, + "logps/rejected": -271.2549743652344, + "loss": 0.7304, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.399379253387451, + "rewards/margins": 0.6283119916915894, + "rewards/rejected": -3.02769136428833, + "step": 3680 + }, + { + "epoch": 0.93, + "learning_rate": 3.8282932309708827e-07, + "logits/chosen": -2.725205183029175, + "logits/rejected": -2.6237640380859375, + "logps/chosen": -241.66439819335938, + "logps/rejected": -225.9160614013672, + "loss": 0.5719, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.980405569076538, + "rewards/margins": 0.9876116514205933, + "rewards/rejected": -2.968017101287842, + "step": 3690 + }, + { + "epoch": 0.94, + "learning_rate": 3.8236120213463156e-07, + "logits/chosen": -2.708747386932373, + "logits/rejected": -2.711845874786377, + "logps/chosen": -246.9578857421875, + "logps/rejected": -265.8954772949219, + "loss": 0.5576, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.7569336891174316, + "rewards/margins": 0.7664319276809692, + "rewards/rejected": -3.5233657360076904, + "step": 3700 + }, + { + "epoch": 0.94, + "learning_rate": 3.818930811721749e-07, + "logits/chosen": -2.563875198364258, + "logits/rejected": -2.60050892829895, + "logps/chosen": -277.04351806640625, + "logps/rejected": -283.3923034667969, + "loss": 0.6133, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.120920419692993, + "rewards/margins": 0.8064680099487305, + "rewards/rejected": -2.9273884296417236, + "step": 3710 + }, + { + "epoch": 0.94, + "learning_rate": 3.8142496020971813e-07, + "logits/chosen": -2.6808571815490723, + "logits/rejected": -2.6824426651000977, + "logps/chosen": -266.8973388671875, + "logps/rejected": -216.49423217773438, + "loss": 0.5389, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3810316324234009, + "rewards/margins": 1.0641003847122192, + "rewards/rejected": -2.44513201713562, + "step": 3720 + }, + { + "epoch": 0.94, + "learning_rate": 3.809568392472615e-07, + "logits/chosen": -2.7239327430725098, + "logits/rejected": -2.701735496520996, + "logps/chosen": -301.28521728515625, + "logps/rejected": -270.1116943359375, + "loss": 0.5956, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.071234941482544, + "rewards/margins": 1.517927885055542, + "rewards/rejected": -2.589162826538086, + "step": 3730 + }, + { + "epoch": 0.95, + "learning_rate": 3.8048871828480477e-07, + "logits/chosen": -2.5309438705444336, + "logits/rejected": -2.4692294597625732, + "logps/chosen": -316.5550842285156, + "logps/rejected": -302.6878662109375, + "loss": 0.4316, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.82965886592865, + "rewards/margins": 1.7301852703094482, + "rewards/rejected": -3.559844493865967, + "step": 3740 + }, + { + "epoch": 0.95, + "learning_rate": 3.800205973223481e-07, + "logits/chosen": -2.6489920616149902, + "logits/rejected": -2.551776647567749, + "logps/chosen": -227.8351593017578, + "logps/rejected": -247.8078155517578, + "loss": 0.7572, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2301652431488037, + "rewards/margins": 1.3657900094985962, + "rewards/rejected": -3.5959556102752686, + "step": 3750 + }, + { + "epoch": 0.95, + "learning_rate": 3.795524763598914e-07, + "logits/chosen": -2.2900452613830566, + "logits/rejected": -2.341315746307373, + "logps/chosen": -368.0122375488281, + "logps/rejected": -338.74951171875, + "loss": 0.5997, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7872066497802734, + "rewards/margins": 3.213815689086914, + "rewards/rejected": -5.001022815704346, + "step": 3760 + }, + { + "epoch": 0.95, + "learning_rate": 3.790843553974347e-07, + "logits/chosen": -2.4849908351898193, + "logits/rejected": -2.551020860671997, + "logps/chosen": -263.210693359375, + "logps/rejected": -397.0281066894531, + "loss": 0.4803, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4165332317352295, + "rewards/margins": 1.1031792163848877, + "rewards/rejected": -3.519712448120117, + "step": 3770 + }, + { + "epoch": 0.96, + "learning_rate": 3.78616234434978e-07, + "logits/chosen": -2.51816987991333, + "logits/rejected": -2.5692131519317627, + "logps/chosen": -258.7332458496094, + "logps/rejected": -332.8305969238281, + "loss": 0.5861, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.619706392288208, + "rewards/margins": 1.3120684623718262, + "rewards/rejected": -3.9317753314971924, + "step": 3780 + }, + { + "epoch": 0.96, + "learning_rate": 3.7814811347252127e-07, + "logits/chosen": -2.289250612258911, + "logits/rejected": -2.404963970184326, + "logps/chosen": -224.2694549560547, + "logps/rejected": -220.3228759765625, + "loss": 0.6365, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.5331289768218994, + "rewards/margins": 0.5771879553794861, + "rewards/rejected": -3.110316753387451, + "step": 3790 + }, + { + "epoch": 0.96, + "learning_rate": 3.776799925100646e-07, + "logits/chosen": -2.3783960342407227, + "logits/rejected": -2.2490427494049072, + "logps/chosen": -397.93804931640625, + "logps/rejected": -365.0287780761719, + "loss": 0.7984, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.592641592025757, + "rewards/margins": -0.39536601305007935, + "rewards/rejected": -2.1972756385803223, + "step": 3800 + }, + { + "epoch": 0.96, + "learning_rate": 3.7721187154760784e-07, + "logits/chosen": -2.3933181762695312, + "logits/rejected": -2.3748421669006348, + "logps/chosen": -261.06378173828125, + "logps/rejected": -215.88504028320312, + "loss": 0.6671, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1627757549285889, + "rewards/margins": 0.8094266653060913, + "rewards/rejected": -1.9722025394439697, + "step": 3810 + }, + { + "epoch": 0.97, + "learning_rate": 3.767437505851512e-07, + "logits/chosen": -2.6702141761779785, + "logits/rejected": -2.6176254749298096, + "logps/chosen": -281.86773681640625, + "logps/rejected": -348.5569763183594, + "loss": 0.5429, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.463614821434021, + "rewards/margins": 1.5381866693496704, + "rewards/rejected": -3.0018012523651123, + "step": 3820 + }, + { + "epoch": 0.97, + "learning_rate": 3.762756296226945e-07, + "logits/chosen": -2.3716139793395996, + "logits/rejected": -2.35888671875, + "logps/chosen": -261.81512451171875, + "logps/rejected": -226.60299682617188, + "loss": 0.5125, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3739840984344482, + "rewards/margins": 1.4622911214828491, + "rewards/rejected": -2.836275339126587, + "step": 3830 + }, + { + "epoch": 0.97, + "learning_rate": 3.758075086602378e-07, + "logits/chosen": -2.617845058441162, + "logits/rejected": -2.6614153385162354, + "logps/chosen": -240.1200714111328, + "logps/rejected": -294.9786071777344, + "loss": 0.4858, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.050519347190857, + "rewards/margins": 2.296268939971924, + "rewards/rejected": -3.346787929534912, + "step": 3840 + }, + { + "epoch": 0.97, + "learning_rate": 3.7533938769778105e-07, + "logits/chosen": -2.5451509952545166, + "logits/rejected": -2.451148271560669, + "logps/chosen": -258.9147033691406, + "logps/rejected": -250.61575317382812, + "loss": 0.6505, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.864426851272583, + "rewards/margins": 0.6109026074409485, + "rewards/rejected": -2.4753293991088867, + "step": 3850 + }, + { + "epoch": 0.98, + "learning_rate": 3.748712667353244e-07, + "logits/chosen": -2.7763352394104004, + "logits/rejected": -2.7048943042755127, + "logps/chosen": -369.334716796875, + "logps/rejected": -304.45758056640625, + "loss": 0.6338, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1332387924194336, + "rewards/margins": 0.9427889585494995, + "rewards/rejected": -3.0760276317596436, + "step": 3860 + }, + { + "epoch": 0.98, + "learning_rate": 3.744031457728677e-07, + "logits/chosen": -2.4644370079040527, + "logits/rejected": -2.4173831939697266, + "logps/chosen": -319.7100524902344, + "logps/rejected": -272.42559814453125, + "loss": 0.5365, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5952672958374023, + "rewards/margins": 2.1468255519866943, + "rewards/rejected": -3.742093324661255, + "step": 3870 + }, + { + "epoch": 0.98, + "learning_rate": 3.7393502481041103e-07, + "logits/chosen": -2.7287070751190186, + "logits/rejected": -2.740267276763916, + "logps/chosen": -383.4566345214844, + "logps/rejected": -363.9472961425781, + "loss": 0.5002, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.6474063396453857, + "rewards/margins": 1.0410480499267578, + "rewards/rejected": -3.6884543895721436, + "step": 3880 + }, + { + "epoch": 0.98, + "learning_rate": 3.734669038479543e-07, + "logits/chosen": -2.6365084648132324, + "logits/rejected": -2.4641079902648926, + "logps/chosen": -363.31915283203125, + "logps/rejected": -291.7921142578125, + "loss": 0.6346, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3033270835876465, + "rewards/margins": 2.174389600753784, + "rewards/rejected": -3.4777164459228516, + "step": 3890 + }, + { + "epoch": 0.99, + "learning_rate": 3.7299878288549755e-07, + "logits/chosen": -2.745086431503296, + "logits/rejected": -2.551102876663208, + "logps/chosen": -321.5898132324219, + "logps/rejected": -275.5663146972656, + "loss": 0.5264, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8006789684295654, + "rewards/margins": 1.495180368423462, + "rewards/rejected": -4.295859336853027, + "step": 3900 + }, + { + "epoch": 0.99, + "learning_rate": 3.725306619230409e-07, + "logits/chosen": -2.579899549484253, + "logits/rejected": -2.564140796661377, + "logps/chosen": -250.631103515625, + "logps/rejected": -247.8636016845703, + "loss": 0.5685, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.100395679473877, + "rewards/margins": 2.7038025856018066, + "rewards/rejected": -3.8041980266571045, + "step": 3910 + }, + { + "epoch": 0.99, + "learning_rate": 3.720625409605842e-07, + "logits/chosen": -2.5308988094329834, + "logits/rejected": -2.525057315826416, + "logps/chosen": -297.98529052734375, + "logps/rejected": -257.6971130371094, + "loss": 0.5155, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.13560429215431213, + "rewards/margins": 2.4298863410949707, + "rewards/rejected": -2.56549072265625, + "step": 3920 + }, + { + "epoch": 0.99, + "learning_rate": 3.7159441999812753e-07, + "logits/chosen": -2.6410772800445557, + "logits/rejected": -2.5101382732391357, + "logps/chosen": -234.18124389648438, + "logps/rejected": -279.98370361328125, + "loss": 0.7214, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.2928466796875, + "rewards/margins": 0.7621484994888306, + "rewards/rejected": -3.054995059967041, + "step": 3930 + }, + { + "epoch": 1.0, + "learning_rate": 3.7112629903567076e-07, + "logits/chosen": -2.872884750366211, + "logits/rejected": -2.4980196952819824, + "logps/chosen": -281.6012878417969, + "logps/rejected": -180.0428009033203, + "loss": 0.4839, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9578030109405518, + "rewards/margins": 1.8885114192962646, + "rewards/rejected": -3.8463146686553955, + "step": 3940 + }, + { + "epoch": 1.0, + "learning_rate": 3.706581780732141e-07, + "logits/chosen": -2.7477738857269287, + "logits/rejected": -2.658379316329956, + "logps/chosen": -236.2034912109375, + "logps/rejected": -243.9905242919922, + "loss": 0.6138, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6140636205673218, + "rewards/margins": 1.226651906967163, + "rewards/rejected": -2.8407154083251953, + "step": 3950 + }, + { + "epoch": 1.0, + "learning_rate": 3.701900571107574e-07, + "logits/chosen": -2.5700225830078125, + "logits/rejected": -2.509644031524658, + "logps/chosen": -277.822265625, + "logps/rejected": -263.8984680175781, + "loss": 0.3053, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8047565221786499, + "rewards/margins": 4.11063289642334, + "rewards/rejected": -4.915389537811279, + "step": 3960 + }, + { + "epoch": 1.0, + "learning_rate": 3.6972193614830074e-07, + "logits/chosen": -2.705073118209839, + "logits/rejected": -2.610149383544922, + "logps/chosen": -239.9237060546875, + "logps/rejected": -314.971923828125, + "loss": 0.139, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.38272005319595337, + "rewards/margins": 6.647641658782959, + "rewards/rejected": -6.26492166519165, + "step": 3970 + }, + { + "epoch": 1.01, + "learning_rate": 3.6925381518584403e-07, + "logits/chosen": -2.7207839488983154, + "logits/rejected": -2.585111141204834, + "logps/chosen": -426.0726623535156, + "logps/rejected": -304.66510009765625, + "loss": 0.0802, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1401253938674927, + "rewards/margins": 7.029414176940918, + "rewards/rejected": -5.889288425445557, + "step": 3980 + }, + { + "epoch": 1.01, + "learning_rate": 3.6878569422338726e-07, + "logits/chosen": -2.681621551513672, + "logits/rejected": -2.5620551109313965, + "logps/chosen": -309.2017822265625, + "logps/rejected": -323.23931884765625, + "loss": 0.1826, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.27074792981147766, + "rewards/margins": 5.536995887756348, + "rewards/rejected": -5.807744026184082, + "step": 3990 + }, + { + "epoch": 1.01, + "learning_rate": 3.683175732609306e-07, + "logits/chosen": -2.7098963260650635, + "logits/rejected": -2.4533934593200684, + "logps/chosen": -325.30316162109375, + "logps/rejected": -380.95135498046875, + "loss": 0.0694, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.07627948373556137, + "rewards/margins": 7.224770545959473, + "rewards/rejected": -7.148490905761719, + "step": 4000 + }, + { + "epoch": 1.01, + "learning_rate": 3.678494522984739e-07, + "logits/chosen": -2.7862906455993652, + "logits/rejected": -2.6643624305725098, + "logps/chosen": -218.59738159179688, + "logps/rejected": -219.8732147216797, + "loss": 0.1955, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.7133752703666687, + "rewards/margins": 5.267045021057129, + "rewards/rejected": -4.553668975830078, + "step": 4010 + }, + { + "epoch": 1.02, + "learning_rate": 3.6738133133601724e-07, + "logits/chosen": -2.7759945392608643, + "logits/rejected": -2.7921369075775146, + "logps/chosen": -261.9883117675781, + "logps/rejected": -340.5843200683594, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1419750452041626, + "rewards/margins": 7.45736837387085, + "rewards/rejected": -6.31539249420166, + "step": 4020 + }, + { + "epoch": 1.02, + "learning_rate": 3.669132103735605e-07, + "logits/chosen": -2.646761417388916, + "logits/rejected": -2.5379443168640137, + "logps/chosen": -260.28765869140625, + "logps/rejected": -246.83193969726562, + "loss": 0.1136, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.24251198768615723, + "rewards/margins": 5.654573917388916, + "rewards/rejected": -5.897086143493652, + "step": 4030 + }, + { + "epoch": 1.02, + "learning_rate": 3.664450894111038e-07, + "logits/chosen": -2.5841357707977295, + "logits/rejected": -2.53714919090271, + "logps/chosen": -287.8470153808594, + "logps/rejected": -314.3433837890625, + "loss": 0.1137, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2378253936767578, + "rewards/margins": 7.138622283935547, + "rewards/rejected": -7.376448154449463, + "step": 4040 + }, + { + "epoch": 1.02, + "learning_rate": 3.659769684486471e-07, + "logits/chosen": -2.703209161758423, + "logits/rejected": -2.571162700653076, + "logps/chosen": -247.2056427001953, + "logps/rejected": -289.3986511230469, + "loss": 0.0958, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.07882566750049591, + "rewards/margins": 6.572531700134277, + "rewards/rejected": -6.493705749511719, + "step": 4050 + }, + { + "epoch": 1.03, + "learning_rate": 3.6550884748619045e-07, + "logits/chosen": -2.5277256965637207, + "logits/rejected": -2.565796375274658, + "logps/chosen": -261.80841064453125, + "logps/rejected": -318.0531311035156, + "loss": 0.0731, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.227341890335083, + "rewards/margins": 9.081512451171875, + "rewards/rejected": -7.854170322418213, + "step": 4060 + }, + { + "epoch": 1.03, + "learning_rate": 3.650407265237337e-07, + "logits/chosen": -2.5151851177215576, + "logits/rejected": -2.4258155822753906, + "logps/chosen": -224.4687042236328, + "logps/rejected": -311.28936767578125, + "loss": 0.1217, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.26925137639045715, + "rewards/margins": 4.458642959594727, + "rewards/rejected": -4.7278947830200195, + "step": 4070 + }, + { + "epoch": 1.03, + "learning_rate": 3.6457260556127703e-07, + "logits/chosen": -2.613365650177002, + "logits/rejected": -2.623168468475342, + "logps/chosen": -212.8934783935547, + "logps/rejected": -294.2333984375, + "loss": 0.1228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08440116047859192, + "rewards/margins": 8.041141510009766, + "rewards/rejected": -8.125543594360352, + "step": 4080 + }, + { + "epoch": 1.03, + "learning_rate": 3.641044845988203e-07, + "logits/chosen": -2.67268443107605, + "logits/rejected": -2.607306480407715, + "logps/chosen": -221.0255889892578, + "logps/rejected": -288.19696044921875, + "loss": 0.074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5305021405220032, + "rewards/margins": 7.2046942710876465, + "rewards/rejected": -6.674191474914551, + "step": 4090 + }, + { + "epoch": 1.04, + "learning_rate": 3.636363636363636e-07, + "logits/chosen": -2.359762668609619, + "logits/rejected": -2.412950038909912, + "logps/chosen": -240.56201171875, + "logps/rejected": -267.10150146484375, + "loss": 0.0971, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5591071844100952, + "rewards/margins": 5.897465705871582, + "rewards/rejected": -6.456572532653809, + "step": 4100 + }, + { + "epoch": 1.04, + "learning_rate": 3.6316824267390695e-07, + "logits/chosen": -2.675454616546631, + "logits/rejected": -2.515897274017334, + "logps/chosen": -310.7024841308594, + "logps/rejected": -344.6875305175781, + "loss": 0.0756, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8981047868728638, + "rewards/margins": 9.891563415527344, + "rewards/rejected": -8.99345874786377, + "step": 4110 + }, + { + "epoch": 1.04, + "learning_rate": 3.627001217114502e-07, + "logits/chosen": -2.698408603668213, + "logits/rejected": -2.574500322341919, + "logps/chosen": -219.25167846679688, + "logps/rejected": -214.8889923095703, + "loss": 0.1992, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.4875507354736328, + "rewards/margins": 3.8590054512023926, + "rewards/rejected": -4.346555709838867, + "step": 4120 + }, + { + "epoch": 1.04, + "learning_rate": 3.622320007489935e-07, + "logits/chosen": -2.582207202911377, + "logits/rejected": -2.5340628623962402, + "logps/chosen": -264.3891296386719, + "logps/rejected": -235.1367645263672, + "loss": 0.1439, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6553934812545776, + "rewards/margins": 6.19410514831543, + "rewards/rejected": -5.538712024688721, + "step": 4130 + }, + { + "epoch": 1.05, + "learning_rate": 3.617638797865368e-07, + "logits/chosen": -2.4984068870544434, + "logits/rejected": -2.4741299152374268, + "logps/chosen": -209.7914581298828, + "logps/rejected": -269.10748291015625, + "loss": 0.0869, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.32389122247695923, + "rewards/margins": 6.044379234313965, + "rewards/rejected": -6.368269920349121, + "step": 4140 + }, + { + "epoch": 1.05, + "learning_rate": 3.6129575882408016e-07, + "logits/chosen": -2.6042675971984863, + "logits/rejected": -2.5595932006835938, + "logps/chosen": -239.3861846923828, + "logps/rejected": -258.2563171386719, + "loss": 0.0979, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.48330157995224, + "rewards/margins": 5.9926300048828125, + "rewards/rejected": -5.5093278884887695, + "step": 4150 + }, + { + "epoch": 1.05, + "learning_rate": 3.608276378616234e-07, + "logits/chosen": -2.443714141845703, + "logits/rejected": -2.2847142219543457, + "logps/chosen": -271.3092346191406, + "logps/rejected": -215.4811248779297, + "loss": 0.0999, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6948297619819641, + "rewards/margins": 5.3410773277282715, + "rewards/rejected": -6.035906791687012, + "step": 4160 + }, + { + "epoch": 1.05, + "learning_rate": 3.6035951689916674e-07, + "logits/chosen": -2.599687099456787, + "logits/rejected": -2.506901979446411, + "logps/chosen": -190.1280059814453, + "logps/rejected": -262.118896484375, + "loss": 0.1492, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.496035099029541, + "rewards/margins": 5.447064399719238, + "rewards/rejected": -4.9510297775268555, + "step": 4170 + }, + { + "epoch": 1.06, + "learning_rate": 3.5989139593671e-07, + "logits/chosen": -2.5147697925567627, + "logits/rejected": -2.5242176055908203, + "logps/chosen": -240.4428253173828, + "logps/rejected": -321.82061767578125, + "loss": 0.1165, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9775978922843933, + "rewards/margins": 6.624411106109619, + "rewards/rejected": -5.646812915802002, + "step": 4180 + }, + { + "epoch": 1.06, + "learning_rate": 3.5942327497425337e-07, + "logits/chosen": -2.503431797027588, + "logits/rejected": -2.459929943084717, + "logps/chosen": -207.2917022705078, + "logps/rejected": -317.4582824707031, + "loss": 0.0999, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0375446118414402, + "rewards/margins": 6.4059157371521, + "rewards/rejected": -6.4434614181518555, + "step": 4190 + }, + { + "epoch": 1.06, + "learning_rate": 3.5895515401179666e-07, + "logits/chosen": -2.4857592582702637, + "logits/rejected": -2.416965961456299, + "logps/chosen": -263.7533264160156, + "logps/rejected": -320.15362548828125, + "loss": 0.1649, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4429326057434082, + "rewards/margins": 7.078021049499512, + "rewards/rejected": -5.6350884437561035, + "step": 4200 + }, + { + "epoch": 1.06, + "learning_rate": 3.584870330493399e-07, + "logits/chosen": -2.7715916633605957, + "logits/rejected": -2.681565999984741, + "logps/chosen": -331.1893615722656, + "logps/rejected": -335.10223388671875, + "loss": 0.0935, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20631642639636993, + "rewards/margins": 6.091833591461182, + "rewards/rejected": -5.885517120361328, + "step": 4210 + }, + { + "epoch": 1.07, + "learning_rate": 3.5801891208688324e-07, + "logits/chosen": -2.6815414428710938, + "logits/rejected": -2.6054797172546387, + "logps/chosen": -200.813720703125, + "logps/rejected": -217.6241912841797, + "loss": 0.1042, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9837878346443176, + "rewards/margins": 4.837677001953125, + "rewards/rejected": -5.821465015411377, + "step": 4220 + }, + { + "epoch": 1.07, + "learning_rate": 3.575507911244265e-07, + "logits/chosen": -2.59967303276062, + "logits/rejected": -2.607329845428467, + "logps/chosen": -241.1183624267578, + "logps/rejected": -331.64117431640625, + "loss": 0.1341, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5841931104660034, + "rewards/margins": 7.46688985824585, + "rewards/rejected": -6.882697105407715, + "step": 4230 + }, + { + "epoch": 1.07, + "learning_rate": 3.5708267016196987e-07, + "logits/chosen": -2.7304348945617676, + "logits/rejected": -2.553729772567749, + "logps/chosen": -252.4587860107422, + "logps/rejected": -267.918212890625, + "loss": 0.0772, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.876346230506897, + "rewards/margins": 5.990228652954102, + "rewards/rejected": -5.113882064819336, + "step": 4240 + }, + { + "epoch": 1.07, + "learning_rate": 3.566145491995131e-07, + "logits/chosen": -2.666968584060669, + "logits/rejected": -2.579244613647461, + "logps/chosen": -280.7061462402344, + "logps/rejected": -300.51776123046875, + "loss": 0.1279, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.13451166450977325, + "rewards/margins": 7.642842769622803, + "rewards/rejected": -7.7773542404174805, + "step": 4250 + }, + { + "epoch": 1.08, + "learning_rate": 3.5614642823705645e-07, + "logits/chosen": -2.742469310760498, + "logits/rejected": -2.7256650924682617, + "logps/chosen": -232.1375732421875, + "logps/rejected": -248.3732147216797, + "loss": 0.1574, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0827386379241943, + "rewards/margins": 4.552823543548584, + "rewards/rejected": -3.4700851440429688, + "step": 4260 + }, + { + "epoch": 1.08, + "learning_rate": 3.5567830727459974e-07, + "logits/chosen": -2.4802334308624268, + "logits/rejected": -2.3722081184387207, + "logps/chosen": -245.95205688476562, + "logps/rejected": -424.5274353027344, + "loss": 0.0612, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5879344940185547, + "rewards/margins": 8.373659133911133, + "rewards/rejected": -6.7857255935668945, + "step": 4270 + }, + { + "epoch": 1.08, + "learning_rate": 3.552101863121431e-07, + "logits/chosen": -2.5735154151916504, + "logits/rejected": -2.550457715988159, + "logps/chosen": -264.8874206542969, + "logps/rejected": -269.48980712890625, + "loss": 0.1089, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5131327509880066, + "rewards/margins": 7.48690128326416, + "rewards/rejected": -8.000033378601074, + "step": 4280 + }, + { + "epoch": 1.08, + "learning_rate": 3.5474206534968637e-07, + "logits/chosen": -2.6528215408325195, + "logits/rejected": -2.4966320991516113, + "logps/chosen": -270.1402587890625, + "logps/rejected": -250.8711395263672, + "loss": 0.0816, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6331374645233154, + "rewards/margins": 7.091406345367432, + "rewards/rejected": -6.458268642425537, + "step": 4290 + }, + { + "epoch": 1.09, + "learning_rate": 3.542739443872296e-07, + "logits/chosen": -2.5830702781677246, + "logits/rejected": -2.5414295196533203, + "logps/chosen": -259.9555358886719, + "logps/rejected": -311.7438049316406, + "loss": 0.0647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13916687667369843, + "rewards/margins": 5.850489616394043, + "rewards/rejected": -5.9896559715271, + "step": 4300 + }, + { + "epoch": 1.09, + "learning_rate": 3.5380582342477295e-07, + "logits/chosen": -2.585474729537964, + "logits/rejected": -2.639191150665283, + "logps/chosen": -239.0613555908203, + "logps/rejected": -289.3290100097656, + "loss": 0.1227, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19983701407909393, + "rewards/margins": 6.089356422424316, + "rewards/rejected": -6.289193153381348, + "step": 4310 + }, + { + "epoch": 1.09, + "learning_rate": 3.5333770246231624e-07, + "logits/chosen": -2.4982070922851562, + "logits/rejected": -2.427825927734375, + "logps/chosen": -320.6855163574219, + "logps/rejected": -309.69256591796875, + "loss": 0.1027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9687162637710571, + "rewards/margins": 8.043795585632324, + "rewards/rejected": -9.012511253356934, + "step": 4320 + }, + { + "epoch": 1.09, + "learning_rate": 3.528695814998596e-07, + "logits/chosen": -2.4943461418151855, + "logits/rejected": -2.4896931648254395, + "logps/chosen": -238.29458618164062, + "logps/rejected": -280.5413513183594, + "loss": 0.0592, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0275102853775024, + "rewards/margins": 6.780093193054199, + "rewards/rejected": -7.807603359222412, + "step": 4330 + }, + { + "epoch": 1.1, + "learning_rate": 3.524014605374028e-07, + "logits/chosen": -2.7054009437561035, + "logits/rejected": -2.6643614768981934, + "logps/chosen": -212.01318359375, + "logps/rejected": -324.78717041015625, + "loss": 0.0652, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3500351309776306, + "rewards/margins": 9.12161922454834, + "rewards/rejected": -9.471654891967773, + "step": 4340 + }, + { + "epoch": 1.1, + "learning_rate": 3.5193333957494616e-07, + "logits/chosen": -2.4785401821136475, + "logits/rejected": -2.505174160003662, + "logps/chosen": -207.61343383789062, + "logps/rejected": -348.1514587402344, + "loss": 0.0704, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.1490641087293625, + "rewards/margins": 8.631427764892578, + "rewards/rejected": -8.482362747192383, + "step": 4350 + }, + { + "epoch": 1.1, + "learning_rate": 3.5146521861248945e-07, + "logits/chosen": -2.54978609085083, + "logits/rejected": -2.507051944732666, + "logps/chosen": -285.1393127441406, + "logps/rejected": -300.57354736328125, + "loss": 0.0967, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9839023351669312, + "rewards/margins": 8.747843742370605, + "rewards/rejected": -6.763941764831543, + "step": 4360 + }, + { + "epoch": 1.1, + "learning_rate": 3.509970976500328e-07, + "logits/chosen": -2.670477867126465, + "logits/rejected": -2.6304116249084473, + "logps/chosen": -283.92144775390625, + "logps/rejected": -384.2460632324219, + "loss": 0.0937, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.8794275522232056, + "rewards/margins": 11.181547164916992, + "rewards/rejected": -9.302119255065918, + "step": 4370 + }, + { + "epoch": 1.11, + "learning_rate": 3.50528976687576e-07, + "logits/chosen": -2.586055040359497, + "logits/rejected": -2.5961456298828125, + "logps/chosen": -241.93124389648438, + "logps/rejected": -340.2915954589844, + "loss": 0.113, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8442662954330444, + "rewards/margins": 8.11192512512207, + "rewards/rejected": -6.26765775680542, + "step": 4380 + }, + { + "epoch": 1.11, + "learning_rate": 3.5006085572511937e-07, + "logits/chosen": -2.651855945587158, + "logits/rejected": -2.473231792449951, + "logps/chosen": -312.76507568359375, + "logps/rejected": -317.0950927734375, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09365153312683105, + "rewards/margins": 7.239400386810303, + "rewards/rejected": -7.145749092102051, + "step": 4390 + }, + { + "epoch": 1.11, + "learning_rate": 3.4959273476266266e-07, + "logits/chosen": -2.7039191722869873, + "logits/rejected": -2.6823935508728027, + "logps/chosen": -242.7147979736328, + "logps/rejected": -268.5030822753906, + "loss": 0.0781, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9241912961006165, + "rewards/margins": 4.920010089874268, + "rewards/rejected": -3.995819091796875, + "step": 4400 + }, + { + "epoch": 1.11, + "learning_rate": 3.4912461380020595e-07, + "logits/chosen": -2.694061517715454, + "logits/rejected": -2.5867414474487305, + "logps/chosen": -329.20806884765625, + "logps/rejected": -339.77227783203125, + "loss": 0.0937, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.02051043510437, + "rewards/margins": 10.045517921447754, + "rewards/rejected": -8.025007247924805, + "step": 4410 + }, + { + "epoch": 1.12, + "learning_rate": 3.486564928377493e-07, + "logits/chosen": -2.6415634155273438, + "logits/rejected": -2.554490566253662, + "logps/chosen": -249.7362060546875, + "logps/rejected": -265.8996887207031, + "loss": 0.0812, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.46163004636764526, + "rewards/margins": 5.617148399353027, + "rewards/rejected": -6.0787787437438965, + "step": 4420 + }, + { + "epoch": 1.12, + "learning_rate": 3.481883718752925e-07, + "logits/chosen": -2.595177173614502, + "logits/rejected": -2.6248831748962402, + "logps/chosen": -311.8207092285156, + "logps/rejected": -321.56243896484375, + "loss": 0.1348, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5490500330924988, + "rewards/margins": 6.183700084686279, + "rewards/rejected": -6.732749938964844, + "step": 4430 + }, + { + "epoch": 1.12, + "learning_rate": 3.4772025091283587e-07, + "logits/chosen": -2.6946587562561035, + "logits/rejected": -2.681180477142334, + "logps/chosen": -285.69488525390625, + "logps/rejected": -354.4988708496094, + "loss": 0.1563, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4733982980251312, + "rewards/margins": 7.104119777679443, + "rewards/rejected": -7.577518463134766, + "step": 4440 + }, + { + "epoch": 1.12, + "learning_rate": 3.4725212995037916e-07, + "logits/chosen": -2.6871225833892822, + "logits/rejected": -2.4688057899475098, + "logps/chosen": -252.8527069091797, + "logps/rejected": -235.73483276367188, + "loss": 0.0967, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8882217407226562, + "rewards/margins": 5.205448627471924, + "rewards/rejected": -6.093670845031738, + "step": 4450 + }, + { + "epoch": 1.13, + "learning_rate": 3.467840089879225e-07, + "logits/chosen": -2.4395699501037598, + "logits/rejected": -2.5508391857147217, + "logps/chosen": -266.2554931640625, + "logps/rejected": -329.2310485839844, + "loss": 0.1904, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0424487590789795, + "rewards/margins": 9.269670486450195, + "rewards/rejected": -8.227221488952637, + "step": 4460 + }, + { + "epoch": 1.13, + "learning_rate": 3.4631588802546573e-07, + "logits/chosen": -2.6959567070007324, + "logits/rejected": -2.6026034355163574, + "logps/chosen": -318.25616455078125, + "logps/rejected": -392.3797302246094, + "loss": 0.1572, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7935773134231567, + "rewards/margins": 7.944984436035156, + "rewards/rejected": -7.151407718658447, + "step": 4470 + }, + { + "epoch": 1.13, + "learning_rate": 3.458477670630091e-07, + "logits/chosen": -2.707530975341797, + "logits/rejected": -2.6383745670318604, + "logps/chosen": -271.60198974609375, + "logps/rejected": -291.4852600097656, + "loss": 0.078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16344894468784332, + "rewards/margins": 5.676591873168945, + "rewards/rejected": -5.84004020690918, + "step": 4480 + }, + { + "epoch": 1.13, + "learning_rate": 3.4537964610055237e-07, + "logits/chosen": -2.642083168029785, + "logits/rejected": -2.554753541946411, + "logps/chosen": -357.892578125, + "logps/rejected": -368.09686279296875, + "loss": 0.1444, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.3861753940582275, + "rewards/margins": 9.830830574035645, + "rewards/rejected": -8.44465446472168, + "step": 4490 + }, + { + "epoch": 1.14, + "learning_rate": 3.449115251380957e-07, + "logits/chosen": -2.560091972351074, + "logits/rejected": -2.6622557640075684, + "logps/chosen": -228.8826141357422, + "logps/rejected": -300.7447204589844, + "loss": 0.0706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33011412620544434, + "rewards/margins": 8.04318904876709, + "rewards/rejected": -8.37330436706543, + "step": 4500 + }, + { + "epoch": 1.14, + "learning_rate": 3.44443404175639e-07, + "logits/chosen": -2.764373302459717, + "logits/rejected": -2.7491188049316406, + "logps/chosen": -359.4165954589844, + "logps/rejected": -319.29095458984375, + "loss": 0.1212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6747967600822449, + "rewards/margins": 6.897503852844238, + "rewards/rejected": -7.572301387786865, + "step": 4510 + }, + { + "epoch": 1.14, + "learning_rate": 3.4397528321318223e-07, + "logits/chosen": -2.7152912616729736, + "logits/rejected": -2.70686674118042, + "logps/chosen": -314.1294250488281, + "logps/rejected": -342.2290954589844, + "loss": 0.0982, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29457229375839233, + "rewards/margins": 6.949118614196777, + "rewards/rejected": -7.243691444396973, + "step": 4520 + }, + { + "epoch": 1.15, + "learning_rate": 3.435071622507256e-07, + "logits/chosen": -2.5364561080932617, + "logits/rejected": -2.527350902557373, + "logps/chosen": -284.78472900390625, + "logps/rejected": -373.36517333984375, + "loss": 0.1348, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7594577074050903, + "rewards/margins": 8.074015617370605, + "rewards/rejected": -8.833474159240723, + "step": 4530 + }, + { + "epoch": 1.15, + "learning_rate": 3.4303904128826887e-07, + "logits/chosen": -2.800316333770752, + "logits/rejected": -2.845433235168457, + "logps/chosen": -252.117431640625, + "logps/rejected": -345.842529296875, + "loss": 0.1883, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.2000238597393036, + "rewards/margins": 9.164016723632812, + "rewards/rejected": -8.963993072509766, + "step": 4540 + }, + { + "epoch": 1.15, + "learning_rate": 3.425709203258122e-07, + "logits/chosen": -2.722050905227661, + "logits/rejected": -2.7154502868652344, + "logps/chosen": -240.56130981445312, + "logps/rejected": -348.01336669921875, + "loss": 0.0954, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9766684770584106, + "rewards/margins": 6.593020439147949, + "rewards/rejected": -7.5696892738342285, + "step": 4550 + }, + { + "epoch": 1.15, + "learning_rate": 3.4210279936335544e-07, + "logits/chosen": -2.747832775115967, + "logits/rejected": -2.6978516578674316, + "logps/chosen": -357.05572509765625, + "logps/rejected": -413.34906005859375, + "loss": 0.0741, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.316377639770508, + "rewards/margins": 11.051581382751465, + "rewards/rejected": -8.735204696655273, + "step": 4560 + }, + { + "epoch": 1.16, + "learning_rate": 3.416346784008988e-07, + "logits/chosen": -2.6359996795654297, + "logits/rejected": -2.667074203491211, + "logps/chosen": -311.69647216796875, + "logps/rejected": -352.11676025390625, + "loss": 0.1039, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8526498079299927, + "rewards/margins": 8.114217758178711, + "rewards/rejected": -8.966867446899414, + "step": 4570 + }, + { + "epoch": 1.16, + "learning_rate": 3.411665574384421e-07, + "logits/chosen": -2.653594970703125, + "logits/rejected": -2.585784435272217, + "logps/chosen": -350.00238037109375, + "logps/rejected": -349.8188781738281, + "loss": 0.1114, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.14014729857444763, + "rewards/margins": 7.541351318359375, + "rewards/rejected": -7.4012041091918945, + "step": 4580 + }, + { + "epoch": 1.16, + "learning_rate": 3.406984364759854e-07, + "logits/chosen": -2.7345433235168457, + "logits/rejected": -2.7317304611206055, + "logps/chosen": -285.2073974609375, + "logps/rejected": -375.93621826171875, + "loss": 0.085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3569324314594269, + "rewards/margins": 7.82189416885376, + "rewards/rejected": -8.178826332092285, + "step": 4590 + }, + { + "epoch": 1.16, + "learning_rate": 3.4023031551352865e-07, + "logits/chosen": -2.5770225524902344, + "logits/rejected": -2.5470917224884033, + "logps/chosen": -216.72360229492188, + "logps/rejected": -259.01409912109375, + "loss": 0.0678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25177958607673645, + "rewards/margins": 5.54573917388916, + "rewards/rejected": -5.797519683837891, + "step": 4600 + }, + { + "epoch": 1.17, + "learning_rate": 3.3976219455107194e-07, + "logits/chosen": -2.4865548610687256, + "logits/rejected": -2.4736034870147705, + "logps/chosen": -241.6890106201172, + "logps/rejected": -293.00665283203125, + "loss": 0.0821, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0089930295944214, + "rewards/margins": 5.638051509857178, + "rewards/rejected": -6.6470441818237305, + "step": 4610 + }, + { + "epoch": 1.17, + "learning_rate": 3.392940735886153e-07, + "logits/chosen": -2.670954704284668, + "logits/rejected": -2.6923716068267822, + "logps/chosen": -236.7271728515625, + "logps/rejected": -395.5998229980469, + "loss": 0.0744, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6401809453964233, + "rewards/margins": 11.688632011413574, + "rewards/rejected": -10.048450469970703, + "step": 4620 + }, + { + "epoch": 1.17, + "learning_rate": 3.388259526261586e-07, + "logits/chosen": -2.740015983581543, + "logits/rejected": -2.6743392944335938, + "logps/chosen": -323.6190185546875, + "logps/rejected": -393.6664733886719, + "loss": 0.1134, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9889957904815674, + "rewards/margins": 11.044259071350098, + "rewards/rejected": -9.05526351928711, + "step": 4630 + }, + { + "epoch": 1.17, + "learning_rate": 3.383578316637019e-07, + "logits/chosen": -2.745980978012085, + "logits/rejected": -2.6363847255706787, + "logps/chosen": -222.12136840820312, + "logps/rejected": -237.53622436523438, + "loss": 0.0668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.422921746969223, + "rewards/margins": 5.303702354431152, + "rewards/rejected": -5.726624011993408, + "step": 4640 + }, + { + "epoch": 1.18, + "learning_rate": 3.3788971070124515e-07, + "logits/chosen": -2.6138062477111816, + "logits/rejected": -2.5201363563537598, + "logps/chosen": -296.3515625, + "logps/rejected": -307.6025085449219, + "loss": 0.0929, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3155912458896637, + "rewards/margins": 6.940545558929443, + "rewards/rejected": -7.256136417388916, + "step": 4650 + }, + { + "epoch": 1.18, + "learning_rate": 3.374215897387885e-07, + "logits/chosen": -2.8091039657592773, + "logits/rejected": -2.78580904006958, + "logps/chosen": -320.61651611328125, + "logps/rejected": -311.21051025390625, + "loss": 0.0875, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5112084746360779, + "rewards/margins": 6.893043518066406, + "rewards/rejected": -6.381835460662842, + "step": 4660 + }, + { + "epoch": 1.18, + "learning_rate": 3.369534687763318e-07, + "logits/chosen": -2.6944401264190674, + "logits/rejected": -2.6727585792541504, + "logps/chosen": -221.2980194091797, + "logps/rejected": -306.5845947265625, + "loss": 0.0994, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.10195112228393555, + "rewards/margins": 8.119444847106934, + "rewards/rejected": -8.017494201660156, + "step": 4670 + }, + { + "epoch": 1.18, + "learning_rate": 3.3648534781387513e-07, + "logits/chosen": -2.3914220333099365, + "logits/rejected": -2.2504477500915527, + "logps/chosen": -424.81341552734375, + "logps/rejected": -369.765869140625, + "loss": 0.086, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4902187585830688, + "rewards/margins": 6.820440769195557, + "rewards/rejected": -8.310659408569336, + "step": 4680 + }, + { + "epoch": 1.19, + "learning_rate": 3.3601722685141836e-07, + "logits/chosen": -2.843759059906006, + "logits/rejected": -2.71614670753479, + "logps/chosen": -283.5601501464844, + "logps/rejected": -440.706298828125, + "loss": 0.0882, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2339136600494385, + "rewards/margins": 10.507527351379395, + "rewards/rejected": -9.273611068725586, + "step": 4690 + }, + { + "epoch": 1.19, + "learning_rate": 3.355491058889617e-07, + "logits/chosen": -2.504859447479248, + "logits/rejected": -2.501770496368408, + "logps/chosen": -257.53802490234375, + "logps/rejected": -385.2470703125, + "loss": 0.0999, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.575905442237854, + "rewards/margins": 7.368882656097412, + "rewards/rejected": -8.944788932800293, + "step": 4700 + }, + { + "epoch": 1.19, + "learning_rate": 3.35080984926505e-07, + "logits/chosen": -2.812898635864258, + "logits/rejected": -2.7729899883270264, + "logps/chosen": -301.29193115234375, + "logps/rejected": -374.1706848144531, + "loss": 0.0783, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.030539250001311302, + "rewards/margins": 7.485116481781006, + "rewards/rejected": -7.4545769691467285, + "step": 4710 + }, + { + "epoch": 1.19, + "learning_rate": 3.346128639640483e-07, + "logits/chosen": -2.7590649127960205, + "logits/rejected": -2.6491782665252686, + "logps/chosen": -356.0698547363281, + "logps/rejected": -367.9351501464844, + "loss": 0.2066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5648988485336304, + "rewards/margins": 6.791413307189941, + "rewards/rejected": -7.356311798095703, + "step": 4720 + }, + { + "epoch": 1.2, + "learning_rate": 3.3414474300159163e-07, + "logits/chosen": -2.4159865379333496, + "logits/rejected": -2.5030646324157715, + "logps/chosen": -272.33172607421875, + "logps/rejected": -328.46258544921875, + "loss": 0.1267, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2051588296890259, + "rewards/margins": 7.337274074554443, + "rewards/rejected": -8.54243278503418, + "step": 4730 + }, + { + "epoch": 1.2, + "learning_rate": 3.3367662203913486e-07, + "logits/chosen": -2.6770882606506348, + "logits/rejected": -2.4457528591156006, + "logps/chosen": -304.1426696777344, + "logps/rejected": -284.6874084472656, + "loss": 0.0785, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.562188982963562, + "rewards/margins": 8.102972030639648, + "rewards/rejected": -8.665160179138184, + "step": 4740 + }, + { + "epoch": 1.2, + "learning_rate": 3.332085010766782e-07, + "logits/chosen": -2.6974499225616455, + "logits/rejected": -2.589693784713745, + "logps/chosen": -322.5655212402344, + "logps/rejected": -326.5779113769531, + "loss": 0.1351, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.5035828948020935, + "rewards/margins": 7.923377990722656, + "rewards/rejected": -7.41979455947876, + "step": 4750 + }, + { + "epoch": 1.2, + "learning_rate": 3.327403801142215e-07, + "logits/chosen": -2.536750555038452, + "logits/rejected": -2.445263624191284, + "logps/chosen": -219.2605743408203, + "logps/rejected": -300.7672119140625, + "loss": 0.0623, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11792264133691788, + "rewards/margins": 8.396120071411133, + "rewards/rejected": -8.514042854309082, + "step": 4760 + }, + { + "epoch": 1.21, + "learning_rate": 3.3227225915176484e-07, + "logits/chosen": -2.708127498626709, + "logits/rejected": -2.5230722427368164, + "logps/chosen": -322.44451904296875, + "logps/rejected": -329.8259582519531, + "loss": 0.061, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0522180795669556, + "rewards/margins": 9.174871444702148, + "rewards/rejected": -8.12265396118164, + "step": 4770 + }, + { + "epoch": 1.21, + "learning_rate": 3.318041381893081e-07, + "logits/chosen": -2.3173861503601074, + "logits/rejected": -2.3874242305755615, + "logps/chosen": -357.01116943359375, + "logps/rejected": -348.6958312988281, + "loss": 0.0629, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6036861538887024, + "rewards/margins": 6.142181873321533, + "rewards/rejected": -6.7458672523498535, + "step": 4780 + }, + { + "epoch": 1.21, + "learning_rate": 3.313360172268514e-07, + "logits/chosen": -2.5158562660217285, + "logits/rejected": -2.5699989795684814, + "logps/chosen": -304.5142822265625, + "logps/rejected": -355.92901611328125, + "loss": 0.1261, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.19178393483161926, + "rewards/margins": 7.122007846832275, + "rewards/rejected": -7.313791751861572, + "step": 4790 + }, + { + "epoch": 1.21, + "learning_rate": 3.308678962643947e-07, + "logits/chosen": -2.7196245193481445, + "logits/rejected": -2.601762294769287, + "logps/chosen": -333.9893798828125, + "logps/rejected": -354.0044250488281, + "loss": 0.0989, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.6505855321884155, + "rewards/margins": 9.573948860168457, + "rewards/rejected": -7.923361778259277, + "step": 4800 + }, + { + "epoch": 1.22, + "learning_rate": 3.3039977530193805e-07, + "logits/chosen": -2.6281702518463135, + "logits/rejected": -2.604680299758911, + "logps/chosen": -213.1340789794922, + "logps/rejected": -235.78866577148438, + "loss": 0.0867, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.426002562046051, + "rewards/margins": 6.862097263336182, + "rewards/rejected": -7.288099765777588, + "step": 4810 + }, + { + "epoch": 1.22, + "learning_rate": 3.299316543394813e-07, + "logits/chosen": -2.5487630367279053, + "logits/rejected": -2.3977303504943848, + "logps/chosen": -308.79803466796875, + "logps/rejected": -307.55865478515625, + "loss": 0.1295, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6266343593597412, + "rewards/margins": 7.827906608581543, + "rewards/rejected": -8.454541206359863, + "step": 4820 + }, + { + "epoch": 1.22, + "learning_rate": 3.294635333770246e-07, + "logits/chosen": -2.5832231044769287, + "logits/rejected": -2.551140546798706, + "logps/chosen": -260.5826416015625, + "logps/rejected": -373.8498840332031, + "loss": 0.0825, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.6590865254402161, + "rewards/margins": 10.208707809448242, + "rewards/rejected": -9.54962158203125, + "step": 4830 + }, + { + "epoch": 1.22, + "learning_rate": 3.289954124145679e-07, + "logits/chosen": -2.5220448970794678, + "logits/rejected": -2.5066912174224854, + "logps/chosen": -262.2845764160156, + "logps/rejected": -257.2518005371094, + "loss": 0.2061, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8599599599838257, + "rewards/margins": 5.666739463806152, + "rewards/rejected": -7.526698112487793, + "step": 4840 + }, + { + "epoch": 1.23, + "learning_rate": 3.285272914521112e-07, + "logits/chosen": -2.4081637859344482, + "logits/rejected": -2.360440254211426, + "logps/chosen": -210.6362762451172, + "logps/rejected": -366.09283447265625, + "loss": 0.1449, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3536659479141235, + "rewards/margins": 6.394976615905762, + "rewards/rejected": -7.748641014099121, + "step": 4850 + }, + { + "epoch": 1.23, + "learning_rate": 3.2805917048965455e-07, + "logits/chosen": -2.679720640182495, + "logits/rejected": -2.585204839706421, + "logps/chosen": -300.98736572265625, + "logps/rejected": -262.6742858886719, + "loss": 0.0843, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.46402186155319214, + "rewards/margins": 5.079150199890137, + "rewards/rejected": -5.5431718826293945, + "step": 4860 + }, + { + "epoch": 1.23, + "learning_rate": 3.275910495271978e-07, + "logits/chosen": -2.487708330154419, + "logits/rejected": -2.5284218788146973, + "logps/chosen": -231.89138793945312, + "logps/rejected": -356.9471740722656, + "loss": 0.0977, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6061888933181763, + "rewards/margins": 8.890503883361816, + "rewards/rejected": -8.284314155578613, + "step": 4870 + }, + { + "epoch": 1.23, + "learning_rate": 3.271229285647411e-07, + "logits/chosen": -2.5985474586486816, + "logits/rejected": -2.5037901401519775, + "logps/chosen": -319.89703369140625, + "logps/rejected": -444.4794921875, + "loss": 0.1161, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.2769446074962616, + "rewards/margins": 8.904375076293945, + "rewards/rejected": -8.627429008483887, + "step": 4880 + }, + { + "epoch": 1.24, + "learning_rate": 3.266548076022844e-07, + "logits/chosen": -2.5160458087921143, + "logits/rejected": -2.3146345615386963, + "logps/chosen": -263.0496826171875, + "logps/rejected": -347.59539794921875, + "loss": 0.0951, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4355698227882385, + "rewards/margins": 8.083341598510742, + "rewards/rejected": -8.518911361694336, + "step": 4890 + }, + { + "epoch": 1.24, + "learning_rate": 3.2618668663982776e-07, + "logits/chosen": -2.3110601902008057, + "logits/rejected": -2.304619073867798, + "logps/chosen": -289.2687072753906, + "logps/rejected": -312.71832275390625, + "loss": 0.0807, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.874498724937439, + "rewards/margins": 5.567784309387207, + "rewards/rejected": -6.442282676696777, + "step": 4900 + }, + { + "epoch": 1.24, + "learning_rate": 3.25718565677371e-07, + "logits/chosen": -2.3375391960144043, + "logits/rejected": -2.280052661895752, + "logps/chosen": -351.435791015625, + "logps/rejected": -335.129150390625, + "loss": 0.1079, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4033665060997009, + "rewards/margins": 8.237372398376465, + "rewards/rejected": -7.834007263183594, + "step": 4910 + }, + { + "epoch": 1.24, + "learning_rate": 3.252504447149143e-07, + "logits/chosen": -2.599074363708496, + "logits/rejected": -2.519040107727051, + "logps/chosen": -295.12054443359375, + "logps/rejected": -278.61279296875, + "loss": 0.0756, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.35003671050071716, + "rewards/margins": 5.749134063720703, + "rewards/rejected": -6.099170207977295, + "step": 4920 + }, + { + "epoch": 1.25, + "learning_rate": 3.247823237524576e-07, + "logits/chosen": -2.363374948501587, + "logits/rejected": -2.219956159591675, + "logps/chosen": -272.5213317871094, + "logps/rejected": -356.702880859375, + "loss": 0.0551, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.160892128944397, + "rewards/margins": 6.272316932678223, + "rewards/rejected": -7.433209419250488, + "step": 4930 + }, + { + "epoch": 1.25, + "learning_rate": 3.243142027900009e-07, + "logits/chosen": -2.46087384223938, + "logits/rejected": -2.4047253131866455, + "logps/chosen": -330.67987060546875, + "logps/rejected": -315.9459533691406, + "loss": 0.0981, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.39595699310302734, + "rewards/margins": 8.342355728149414, + "rewards/rejected": -7.9463982582092285, + "step": 4940 + }, + { + "epoch": 1.25, + "learning_rate": 3.2384608182754426e-07, + "logits/chosen": -2.372825860977173, + "logits/rejected": -2.363440752029419, + "logps/chosen": -313.22259521484375, + "logps/rejected": -392.95330810546875, + "loss": 0.077, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1961669921875, + "rewards/margins": 8.941523551940918, + "rewards/rejected": -10.137690544128418, + "step": 4950 + }, + { + "epoch": 1.25, + "learning_rate": 3.233779608650875e-07, + "logits/chosen": -2.536463499069214, + "logits/rejected": -2.463451623916626, + "logps/chosen": -299.01849365234375, + "logps/rejected": -307.9565734863281, + "loss": 0.1718, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06701017916202545, + "rewards/margins": 6.308807373046875, + "rewards/rejected": -6.37581729888916, + "step": 4960 + }, + { + "epoch": 1.26, + "learning_rate": 3.2290983990263084e-07, + "logits/chosen": -2.4642741680145264, + "logits/rejected": -2.242976665496826, + "logps/chosen": -297.47772216796875, + "logps/rejected": -272.5084228515625, + "loss": 0.0671, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.45554548501968384, + "rewards/margins": 7.599942207336426, + "rewards/rejected": -7.144396781921387, + "step": 4970 + }, + { + "epoch": 1.26, + "learning_rate": 3.224417189401741e-07, + "logits/chosen": -2.6214382648468018, + "logits/rejected": -2.4945359230041504, + "logps/chosen": -313.21820068359375, + "logps/rejected": -415.0000915527344, + "loss": 0.0588, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43819770216941833, + "rewards/margins": 8.810786247253418, + "rewards/rejected": -9.248983383178711, + "step": 4980 + }, + { + "epoch": 1.26, + "learning_rate": 3.2197359797771747e-07, + "logits/chosen": -2.313692569732666, + "logits/rejected": -2.2011497020721436, + "logps/chosen": -218.2009735107422, + "logps/rejected": -247.17532348632812, + "loss": 0.111, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7616634368896484, + "rewards/margins": 4.7421064376831055, + "rewards/rejected": -6.503769874572754, + "step": 4990 + }, + { + "epoch": 1.26, + "learning_rate": 3.215054770152607e-07, + "logits/chosen": -2.337268352508545, + "logits/rejected": -2.1621477603912354, + "logps/chosen": -224.9978790283203, + "logps/rejected": -239.8637237548828, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3969880938529968, + "rewards/margins": 7.433840751647949, + "rewards/rejected": -7.8308281898498535, + "step": 5000 + }, + { + "epoch": 1.27, + "learning_rate": 3.2103735605280405e-07, + "logits/chosen": -2.4363958835601807, + "logits/rejected": -2.274914026260376, + "logps/chosen": -250.6156463623047, + "logps/rejected": -246.60397338867188, + "loss": 0.1126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40992647409439087, + "rewards/margins": 6.342892169952393, + "rewards/rejected": -6.7528181076049805, + "step": 5010 + }, + { + "epoch": 1.27, + "learning_rate": 3.2056923509034734e-07, + "logits/chosen": -2.478621482849121, + "logits/rejected": -2.4945502281188965, + "logps/chosen": -282.9931335449219, + "logps/rejected": -350.892578125, + "loss": 0.083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5662134289741516, + "rewards/margins": 9.96242904663086, + "rewards/rejected": -9.396215438842773, + "step": 5020 + }, + { + "epoch": 1.27, + "learning_rate": 3.201011141278906e-07, + "logits/chosen": -2.539477586746216, + "logits/rejected": -2.6019339561462402, + "logps/chosen": -190.00582885742188, + "logps/rejected": -270.9634704589844, + "loss": 0.0953, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5821501016616821, + "rewards/margins": 8.267257690429688, + "rewards/rejected": -8.849408149719238, + "step": 5030 + }, + { + "epoch": 1.27, + "learning_rate": 3.196329931654339e-07, + "logits/chosen": -2.580522060394287, + "logits/rejected": -2.344433069229126, + "logps/chosen": -213.58950805664062, + "logps/rejected": -231.41171264648438, + "loss": 0.043, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.2818623185157776, + "rewards/margins": 6.4354963302612305, + "rewards/rejected": -6.153634071350098, + "step": 5040 + }, + { + "epoch": 1.28, + "learning_rate": 3.191648722029772e-07, + "logits/chosen": -2.538010597229004, + "logits/rejected": -2.5986104011535645, + "logps/chosen": -334.59130859375, + "logps/rejected": -352.0889892578125, + "loss": 0.075, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7573752403259277, + "rewards/margins": 7.762686252593994, + "rewards/rejected": -7.005311489105225, + "step": 5050 + }, + { + "epoch": 1.28, + "learning_rate": 3.1869675124052055e-07, + "logits/chosen": -2.246711254119873, + "logits/rejected": -2.192532539367676, + "logps/chosen": -273.1240234375, + "logps/rejected": -297.4888610839844, + "loss": 0.0721, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6416655778884888, + "rewards/margins": 7.805489540100098, + "rewards/rejected": -8.447155952453613, + "step": 5060 + }, + { + "epoch": 1.28, + "learning_rate": 3.1822863027806384e-07, + "logits/chosen": -2.2593188285827637, + "logits/rejected": -2.304441213607788, + "logps/chosen": -214.68161010742188, + "logps/rejected": -375.0998229980469, + "loss": 0.1026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5545988082885742, + "rewards/margins": 11.266571998596191, + "rewards/rejected": -12.82116985321045, + "step": 5070 + }, + { + "epoch": 1.28, + "learning_rate": 3.177605093156072e-07, + "logits/chosen": -2.4767723083496094, + "logits/rejected": -2.3222458362579346, + "logps/chosen": -209.7731475830078, + "logps/rejected": -316.46942138671875, + "loss": 0.0994, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6914139986038208, + "rewards/margins": 6.875144958496094, + "rewards/rejected": -7.566558837890625, + "step": 5080 + }, + { + "epoch": 1.29, + "learning_rate": 3.172923883531504e-07, + "logits/chosen": -2.4502346515655518, + "logits/rejected": -2.4465484619140625, + "logps/chosen": -260.17474365234375, + "logps/rejected": -325.09649658203125, + "loss": 0.2634, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.13710087537765503, + "rewards/margins": 7.22597599029541, + "rewards/rejected": -7.363077640533447, + "step": 5090 + }, + { + "epoch": 1.29, + "learning_rate": 3.1682426739069376e-07, + "logits/chosen": -2.5645554065704346, + "logits/rejected": -2.2416329383850098, + "logps/chosen": -294.3785705566406, + "logps/rejected": -285.55218505859375, + "loss": 0.1383, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3467468321323395, + "rewards/margins": 5.447103977203369, + "rewards/rejected": -5.793850898742676, + "step": 5100 + }, + { + "epoch": 1.29, + "learning_rate": 3.1635614642823705e-07, + "logits/chosen": -2.532900333404541, + "logits/rejected": -2.584439754486084, + "logps/chosen": -228.9880828857422, + "logps/rejected": -327.6162414550781, + "loss": 0.1279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023542404174804688, + "rewards/margins": 6.46780252456665, + "rewards/rejected": -6.491345405578613, + "step": 5110 + }, + { + "epoch": 1.29, + "learning_rate": 3.1588802546578033e-07, + "logits/chosen": -2.691772937774658, + "logits/rejected": -2.6354057788848877, + "logps/chosen": -274.74908447265625, + "logps/rejected": -227.7584991455078, + "loss": 0.126, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2150676250457764, + "rewards/margins": 6.647829532623291, + "rewards/rejected": -5.432761192321777, + "step": 5120 + }, + { + "epoch": 1.3, + "learning_rate": 3.154199045033236e-07, + "logits/chosen": -2.686004161834717, + "logits/rejected": -2.56087064743042, + "logps/chosen": -267.67657470703125, + "logps/rejected": -256.8505859375, + "loss": 0.0943, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1638038456439972, + "rewards/margins": 5.643346309661865, + "rewards/rejected": -5.807150840759277, + "step": 5130 + }, + { + "epoch": 1.3, + "learning_rate": 3.149517835408669e-07, + "logits/chosen": -2.780150890350342, + "logits/rejected": -2.749056100845337, + "logps/chosen": -260.70452880859375, + "logps/rejected": -246.4949951171875, + "loss": 0.1032, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6062029600143433, + "rewards/margins": 5.213749885559082, + "rewards/rejected": -5.81995153427124, + "step": 5140 + }, + { + "epoch": 1.3, + "learning_rate": 3.1448366257841026e-07, + "logits/chosen": -2.6148600578308105, + "logits/rejected": -2.5808730125427246, + "logps/chosen": -188.8590545654297, + "logps/rejected": -271.2063903808594, + "loss": 0.109, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.014142001047730446, + "rewards/margins": 6.740876197814941, + "rewards/rejected": -6.726733207702637, + "step": 5150 + }, + { + "epoch": 1.3, + "learning_rate": 3.1401554161595354e-07, + "logits/chosen": -2.544335126876831, + "logits/rejected": -2.4183132648468018, + "logps/chosen": -299.56256103515625, + "logps/rejected": -318.59283447265625, + "loss": 0.094, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3409278392791748, + "rewards/margins": 7.766920566558838, + "rewards/rejected": -7.4259934425354, + "step": 5160 + }, + { + "epoch": 1.31, + "learning_rate": 3.135474206534969e-07, + "logits/chosen": -2.4106175899505615, + "logits/rejected": -2.454444646835327, + "logps/chosen": -252.49008178710938, + "logps/rejected": -335.98138427734375, + "loss": 0.318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3366869390010834, + "rewards/margins": 6.528241157531738, + "rewards/rejected": -6.864927768707275, + "step": 5170 + }, + { + "epoch": 1.31, + "learning_rate": 3.130792996910401e-07, + "logits/chosen": -2.7149930000305176, + "logits/rejected": -2.665344715118408, + "logps/chosen": -232.3904266357422, + "logps/rejected": -329.61187744140625, + "loss": 0.1881, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6869542002677917, + "rewards/margins": 6.880588531494141, + "rewards/rejected": -7.567543029785156, + "step": 5180 + }, + { + "epoch": 1.31, + "learning_rate": 3.1261117872858347e-07, + "logits/chosen": -2.739945411682129, + "logits/rejected": -2.588160276412964, + "logps/chosen": -382.86700439453125, + "logps/rejected": -346.7525329589844, + "loss": 0.0714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6857783794403076, + "rewards/margins": 7.36517333984375, + "rewards/rejected": -8.05095100402832, + "step": 5190 + }, + { + "epoch": 1.31, + "learning_rate": 3.1214305776612676e-07, + "logits/chosen": -2.639806032180786, + "logits/rejected": -2.6764893531799316, + "logps/chosen": -253.75650024414062, + "logps/rejected": -262.29376220703125, + "loss": 0.1138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3462933897972107, + "rewards/margins": 5.708548069000244, + "rewards/rejected": -6.054841041564941, + "step": 5200 + }, + { + "epoch": 1.32, + "learning_rate": 3.116749368036701e-07, + "logits/chosen": -2.5353024005889893, + "logits/rejected": -2.475508689880371, + "logps/chosen": -313.263916015625, + "logps/rejected": -308.0852355957031, + "loss": 0.1445, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.26782792806625366, + "rewards/margins": 8.699701309204102, + "rewards/rejected": -8.431873321533203, + "step": 5210 + }, + { + "epoch": 1.32, + "learning_rate": 3.1120681584121333e-07, + "logits/chosen": -2.51269268989563, + "logits/rejected": -2.4065051078796387, + "logps/chosen": -364.6129455566406, + "logps/rejected": -401.416748046875, + "loss": 0.083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7194173336029053, + "rewards/margins": 9.384866714477539, + "rewards/rejected": -8.665449142456055, + "step": 5220 + }, + { + "epoch": 1.32, + "learning_rate": 3.107386948787566e-07, + "logits/chosen": -2.64780855178833, + "logits/rejected": -2.580000400543213, + "logps/chosen": -239.59799194335938, + "logps/rejected": -263.75860595703125, + "loss": 0.0784, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8628127574920654, + "rewards/margins": 5.6662468910217285, + "rewards/rejected": -7.529058933258057, + "step": 5230 + }, + { + "epoch": 1.32, + "learning_rate": 3.1027057391629997e-07, + "logits/chosen": -2.7219760417938232, + "logits/rejected": -2.5608391761779785, + "logps/chosen": -235.74038696289062, + "logps/rejected": -348.9249572753906, + "loss": 0.1029, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.319959819316864, + "rewards/margins": 7.829257011413574, + "rewards/rejected": -8.14921760559082, + "step": 5240 + }, + { + "epoch": 1.33, + "learning_rate": 3.0980245295384325e-07, + "logits/chosen": -2.2276036739349365, + "logits/rejected": -2.187509536743164, + "logps/chosen": -247.76754760742188, + "logps/rejected": -262.46905517578125, + "loss": 0.1006, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9565721750259399, + "rewards/margins": 5.925984859466553, + "rewards/rejected": -6.882556915283203, + "step": 5250 + }, + { + "epoch": 1.33, + "learning_rate": 3.093343319913866e-07, + "logits/chosen": -2.4842872619628906, + "logits/rejected": -2.506471633911133, + "logps/chosen": -232.65634155273438, + "logps/rejected": -322.9957580566406, + "loss": 0.2545, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0278587341308594, + "rewards/margins": 6.862580299377441, + "rewards/rejected": -7.890439033508301, + "step": 5260 + }, + { + "epoch": 1.33, + "learning_rate": 3.0886621102892983e-07, + "logits/chosen": -2.5557355880737305, + "logits/rejected": -2.34470796585083, + "logps/chosen": -269.01129150390625, + "logps/rejected": -328.1927490234375, + "loss": 0.1072, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42379847168922424, + "rewards/margins": 7.9827117919921875, + "rewards/rejected": -7.558913230895996, + "step": 5270 + }, + { + "epoch": 1.33, + "learning_rate": 3.083980900664732e-07, + "logits/chosen": -2.5487945079803467, + "logits/rejected": -2.4814343452453613, + "logps/chosen": -300.04327392578125, + "logps/rejected": -354.25762939453125, + "loss": 0.0879, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19205832481384277, + "rewards/margins": 7.899884223937988, + "rewards/rejected": -8.09194278717041, + "step": 5280 + }, + { + "epoch": 1.34, + "learning_rate": 3.0792996910401647e-07, + "logits/chosen": -2.5001492500305176, + "logits/rejected": -2.453411102294922, + "logps/chosen": -245.8662109375, + "logps/rejected": -261.9246520996094, + "loss": 0.0955, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09629590809345245, + "rewards/margins": 7.062813758850098, + "rewards/rejected": -7.159110069274902, + "step": 5290 + }, + { + "epoch": 1.34, + "learning_rate": 3.074618481415598e-07, + "logits/chosen": -2.3594117164611816, + "logits/rejected": -2.2448248863220215, + "logps/chosen": -250.38961791992188, + "logps/rejected": -226.07034301757812, + "loss": 0.0924, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.12966462969779968, + "rewards/margins": 5.99800443649292, + "rewards/rejected": -6.127669334411621, + "step": 5300 + }, + { + "epoch": 1.34, + "learning_rate": 3.0699372717910304e-07, + "logits/chosen": -2.446681022644043, + "logits/rejected": -2.406707286834717, + "logps/chosen": -267.0587463378906, + "logps/rejected": -302.5667724609375, + "loss": 0.0784, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03188369423151016, + "rewards/margins": 8.295231819152832, + "rewards/rejected": -8.327116012573242, + "step": 5310 + }, + { + "epoch": 1.34, + "learning_rate": 3.065256062166464e-07, + "logits/chosen": -2.5380733013153076, + "logits/rejected": -2.412759780883789, + "logps/chosen": -228.8946990966797, + "logps/rejected": -260.20880126953125, + "loss": 0.1016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5675271153450012, + "rewards/margins": 6.607524871826172, + "rewards/rejected": -7.175052642822266, + "step": 5320 + }, + { + "epoch": 1.35, + "learning_rate": 3.060574852541897e-07, + "logits/chosen": -2.536606788635254, + "logits/rejected": -2.5423636436462402, + "logps/chosen": -238.0750732421875, + "logps/rejected": -361.2891845703125, + "loss": 0.0686, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.8750554919242859, + "rewards/margins": 6.947751522064209, + "rewards/rejected": -7.822807312011719, + "step": 5330 + }, + { + "epoch": 1.35, + "learning_rate": 3.0558936429173296e-07, + "logits/chosen": -2.3694663047790527, + "logits/rejected": -2.361489772796631, + "logps/chosen": -176.83859252929688, + "logps/rejected": -229.7638397216797, + "loss": 0.104, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.37298643589019775, + "rewards/margins": 4.905221939086914, + "rewards/rejected": -5.278207778930664, + "step": 5340 + }, + { + "epoch": 1.35, + "learning_rate": 3.0512124332927625e-07, + "logits/chosen": -2.567654609680176, + "logits/rejected": -2.5228219032287598, + "logps/chosen": -383.1378173828125, + "logps/rejected": -327.1722106933594, + "loss": 0.0697, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13435129821300507, + "rewards/margins": 7.013735294342041, + "rewards/rejected": -6.8793840408325195, + "step": 5350 + }, + { + "epoch": 1.35, + "learning_rate": 3.0465312236681954e-07, + "logits/chosen": -2.4223361015319824, + "logits/rejected": -2.3703174591064453, + "logps/chosen": -241.94107055664062, + "logps/rejected": -277.2035827636719, + "loss": 0.0595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31000375747680664, + "rewards/margins": 7.710813999176025, + "rewards/rejected": -8.020816802978516, + "step": 5360 + }, + { + "epoch": 1.36, + "learning_rate": 3.041850014043629e-07, + "logits/chosen": -2.4581074714660645, + "logits/rejected": -2.3692710399627686, + "logps/chosen": -312.61236572265625, + "logps/rejected": -466.7472229003906, + "loss": 0.0994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3494400978088379, + "rewards/margins": 9.886107444763184, + "rewards/rejected": -10.23554801940918, + "step": 5370 + }, + { + "epoch": 1.36, + "learning_rate": 3.037168804419062e-07, + "logits/chosen": -2.4162583351135254, + "logits/rejected": -2.347888708114624, + "logps/chosen": -233.33657836914062, + "logps/rejected": -285.4001159667969, + "loss": 0.06, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32537588477134705, + "rewards/margins": 7.4720563888549805, + "rewards/rejected": -7.797431945800781, + "step": 5380 + }, + { + "epoch": 1.36, + "learning_rate": 3.032487594794495e-07, + "logits/chosen": -2.5550405979156494, + "logits/rejected": -2.5950775146484375, + "logps/chosen": -282.7281188964844, + "logps/rejected": -376.10205078125, + "loss": 0.0927, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0375185012817383, + "rewards/margins": 9.317535400390625, + "rewards/rejected": -8.280016899108887, + "step": 5390 + }, + { + "epoch": 1.37, + "learning_rate": 3.0278063851699275e-07, + "logits/chosen": -2.3756508827209473, + "logits/rejected": -2.281033992767334, + "logps/chosen": -216.5684814453125, + "logps/rejected": -313.62030029296875, + "loss": 0.0738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8860958814620972, + "rewards/margins": 6.265264987945557, + "rewards/rejected": -7.151360511779785, + "step": 5400 + }, + { + "epoch": 1.37, + "learning_rate": 3.023125175545361e-07, + "logits/chosen": -2.414106845855713, + "logits/rejected": -2.336822509765625, + "logps/chosen": -247.4856414794922, + "logps/rejected": -265.3743591308594, + "loss": 0.1013, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.24329610168933868, + "rewards/margins": 6.677481174468994, + "rewards/rejected": -6.920777320861816, + "step": 5410 + }, + { + "epoch": 1.37, + "learning_rate": 3.018443965920794e-07, + "logits/chosen": -2.40796160697937, + "logits/rejected": -2.5906550884246826, + "logps/chosen": -311.5230407714844, + "logps/rejected": -433.317626953125, + "loss": 0.0861, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6146554946899414, + "rewards/margins": 8.28280258178711, + "rewards/rejected": -8.897459030151367, + "step": 5420 + }, + { + "epoch": 1.37, + "learning_rate": 3.013762756296227e-07, + "logits/chosen": -2.696668863296509, + "logits/rejected": -2.546956777572632, + "logps/chosen": -251.2556915283203, + "logps/rejected": -289.1623840332031, + "loss": 0.0825, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5907089710235596, + "rewards/margins": 6.483929634094238, + "rewards/rejected": -8.074637413024902, + "step": 5430 + }, + { + "epoch": 1.38, + "learning_rate": 3.0090815466716596e-07, + "logits/chosen": -2.555224895477295, + "logits/rejected": -2.459705114364624, + "logps/chosen": -312.50482177734375, + "logps/rejected": -346.53387451171875, + "loss": 0.0872, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4651520252227783, + "rewards/margins": 7.669609069824219, + "rewards/rejected": -8.134759902954102, + "step": 5440 + }, + { + "epoch": 1.38, + "learning_rate": 3.0044003370470925e-07, + "logits/chosen": -2.4516243934631348, + "logits/rejected": -2.2774457931518555, + "logps/chosen": -330.15582275390625, + "logps/rejected": -310.63946533203125, + "loss": 0.0741, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.050064899027347565, + "rewards/margins": 6.6439690589904785, + "rewards/rejected": -6.5939040184021, + "step": 5450 + }, + { + "epoch": 1.38, + "learning_rate": 2.999719127422526e-07, + "logits/chosen": -2.441774606704712, + "logits/rejected": -2.4072556495666504, + "logps/chosen": -236.97335815429688, + "logps/rejected": -306.00140380859375, + "loss": 0.1016, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8134746551513672, + "rewards/margins": 6.758217811584473, + "rewards/rejected": -8.571691513061523, + "step": 5460 + }, + { + "epoch": 1.38, + "learning_rate": 2.995037917797959e-07, + "logits/chosen": -2.4176416397094727, + "logits/rejected": -2.3949923515319824, + "logps/chosen": -280.667236328125, + "logps/rejected": -353.26507568359375, + "loss": 0.0999, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2733783721923828, + "rewards/margins": 7.296590328216553, + "rewards/rejected": -7.569968223571777, + "step": 5470 + }, + { + "epoch": 1.39, + "learning_rate": 2.9903567081733923e-07, + "logits/chosen": -2.6557836532592773, + "logits/rejected": -2.504343032836914, + "logps/chosen": -311.24371337890625, + "logps/rejected": -369.2570495605469, + "loss": 0.1336, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3242943286895752, + "rewards/margins": 7.029200077056885, + "rewards/rejected": -8.353494644165039, + "step": 5480 + }, + { + "epoch": 1.39, + "learning_rate": 2.9856754985488246e-07, + "logits/chosen": -2.8352675437927246, + "logits/rejected": -2.565420627593994, + "logps/chosen": -266.55560302734375, + "logps/rejected": -273.0650329589844, + "loss": 0.0972, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7270272970199585, + "rewards/margins": 7.272873878479004, + "rewards/rejected": -6.545845985412598, + "step": 5490 + }, + { + "epoch": 1.39, + "learning_rate": 2.980994288924258e-07, + "logits/chosen": -2.438821792602539, + "logits/rejected": -2.379103660583496, + "logps/chosen": -238.837158203125, + "logps/rejected": -324.5002136230469, + "loss": 0.0805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3304581642150879, + "rewards/margins": 7.2548418045043945, + "rewards/rejected": -7.585301399230957, + "step": 5500 + }, + { + "epoch": 1.39, + "learning_rate": 2.976313079299691e-07, + "logits/chosen": -2.37760066986084, + "logits/rejected": -2.4221079349517822, + "logps/chosen": -189.891357421875, + "logps/rejected": -291.51153564453125, + "loss": 0.1455, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0570133924484253, + "rewards/margins": 6.477774143218994, + "rewards/rejected": -7.534787178039551, + "step": 5510 + }, + { + "epoch": 1.4, + "learning_rate": 2.9716318696751244e-07, + "logits/chosen": -2.6192290782928467, + "logits/rejected": -2.759930372238159, + "logps/chosen": -193.07093811035156, + "logps/rejected": -374.53155517578125, + "loss": 0.1248, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1276962757110596, + "rewards/margins": 6.811429500579834, + "rewards/rejected": -7.939126014709473, + "step": 5520 + }, + { + "epoch": 1.4, + "learning_rate": 2.966950660050557e-07, + "logits/chosen": -2.473038911819458, + "logits/rejected": -2.470531940460205, + "logps/chosen": -208.4632568359375, + "logps/rejected": -346.54901123046875, + "loss": 0.0749, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7250261306762695, + "rewards/margins": 6.926937103271484, + "rewards/rejected": -7.651963233947754, + "step": 5530 + }, + { + "epoch": 1.4, + "learning_rate": 2.9622694504259896e-07, + "logits/chosen": -2.854395866394043, + "logits/rejected": -2.8448264598846436, + "logps/chosen": -313.43011474609375, + "logps/rejected": -380.42047119140625, + "loss": 0.0608, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4461665749549866, + "rewards/margins": 7.274533271789551, + "rewards/rejected": -6.828366279602051, + "step": 5540 + }, + { + "epoch": 1.4, + "learning_rate": 2.957588240801423e-07, + "logits/chosen": -2.5442984104156494, + "logits/rejected": -2.3909056186676025, + "logps/chosen": -340.7677001953125, + "logps/rejected": -361.9802551269531, + "loss": 0.0682, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6735048294067383, + "rewards/margins": 4.611882209777832, + "rewards/rejected": -6.28538703918457, + "step": 5550 + }, + { + "epoch": 1.41, + "learning_rate": 2.952907031176856e-07, + "logits/chosen": -2.2576093673706055, + "logits/rejected": -2.2817511558532715, + "logps/chosen": -182.24478149414062, + "logps/rejected": -332.81268310546875, + "loss": 0.054, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7802220582962036, + "rewards/margins": 7.97159481048584, + "rewards/rejected": -8.75181770324707, + "step": 5560 + }, + { + "epoch": 1.41, + "learning_rate": 2.948225821552289e-07, + "logits/chosen": -2.4663827419281006, + "logits/rejected": -2.5319385528564453, + "logps/chosen": -180.53253173828125, + "logps/rejected": -316.0972595214844, + "loss": 0.1364, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.3494766354560852, + "rewards/margins": 9.86876106262207, + "rewards/rejected": -9.51928424835205, + "step": 5570 + }, + { + "epoch": 1.41, + "learning_rate": 2.9435446119277217e-07, + "logits/chosen": -2.7857749462127686, + "logits/rejected": -2.664595127105713, + "logps/chosen": -317.0216064453125, + "logps/rejected": -350.7251892089844, + "loss": 0.0648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3388091027736664, + "rewards/margins": 8.139375686645508, + "rewards/rejected": -8.478184700012207, + "step": 5580 + }, + { + "epoch": 1.41, + "learning_rate": 2.938863402303155e-07, + "logits/chosen": -2.7515158653259277, + "logits/rejected": -2.567390203475952, + "logps/chosen": -326.83209228515625, + "logps/rejected": -353.8697204589844, + "loss": 0.0821, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1699625253677368, + "rewards/margins": 7.984126091003418, + "rewards/rejected": -9.154088973999023, + "step": 5590 + }, + { + "epoch": 1.42, + "learning_rate": 2.934182192678588e-07, + "logits/chosen": -2.676360607147217, + "logits/rejected": -2.38034987449646, + "logps/chosen": -261.83428955078125, + "logps/rejected": -288.66094970703125, + "loss": 0.1106, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.187965750694275, + "rewards/margins": 6.701111793518066, + "rewards/rejected": -7.889077186584473, + "step": 5600 + }, + { + "epoch": 1.42, + "learning_rate": 2.9295009830540215e-07, + "logits/chosen": -2.336933135986328, + "logits/rejected": -2.0914289951324463, + "logps/chosen": -319.1845703125, + "logps/rejected": -405.6087341308594, + "loss": 0.0786, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1506112813949585, + "rewards/margins": 7.663626194000244, + "rewards/rejected": -8.814237594604492, + "step": 5610 + }, + { + "epoch": 1.42, + "learning_rate": 2.924819773429454e-07, + "logits/chosen": -2.5694477558135986, + "logits/rejected": -2.3845672607421875, + "logps/chosen": -309.91021728515625, + "logps/rejected": -286.1216735839844, + "loss": 0.0766, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1635410785675049, + "rewards/margins": 6.091832160949707, + "rewards/rejected": -7.255372524261475, + "step": 5620 + }, + { + "epoch": 1.42, + "learning_rate": 2.9201385638048867e-07, + "logits/chosen": -2.5705244541168213, + "logits/rejected": -2.4809091091156006, + "logps/chosen": -285.5097961425781, + "logps/rejected": -325.88372802734375, + "loss": 0.1075, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2106363773345947, + "rewards/margins": 10.178508758544922, + "rewards/rejected": -8.967872619628906, + "step": 5630 + }, + { + "epoch": 1.43, + "learning_rate": 2.91545735418032e-07, + "logits/chosen": -2.4400763511657715, + "logits/rejected": -2.473245143890381, + "logps/chosen": -231.61257934570312, + "logps/rejected": -289.244873046875, + "loss": 0.0826, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42569226026535034, + "rewards/margins": 7.356956481933594, + "rewards/rejected": -7.782649040222168, + "step": 5640 + }, + { + "epoch": 1.43, + "learning_rate": 2.910776144555753e-07, + "logits/chosen": -2.447082757949829, + "logits/rejected": -2.4613442420959473, + "logps/chosen": -246.17288208007812, + "logps/rejected": -291.76287841796875, + "loss": 0.0777, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.5513318777084351, + "rewards/margins": 7.071081638336182, + "rewards/rejected": -6.519749641418457, + "step": 5650 + }, + { + "epoch": 1.43, + "learning_rate": 2.906094934931186e-07, + "logits/chosen": -2.573009490966797, + "logits/rejected": -2.3336191177368164, + "logps/chosen": -336.81903076171875, + "logps/rejected": -252.26974487304688, + "loss": 0.0923, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.25829944014549255, + "rewards/margins": 7.9574151039123535, + "rewards/rejected": -7.699115753173828, + "step": 5660 + }, + { + "epoch": 1.43, + "learning_rate": 2.901413725306619e-07, + "logits/chosen": -2.550891399383545, + "logits/rejected": -2.451150417327881, + "logps/chosen": -295.5380554199219, + "logps/rejected": -313.8006286621094, + "loss": 0.0791, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.783888816833496, + "rewards/margins": 5.338407516479492, + "rewards/rejected": -7.1222968101501465, + "step": 5670 + }, + { + "epoch": 1.44, + "learning_rate": 2.896732515682052e-07, + "logits/chosen": -2.3952527046203613, + "logits/rejected": -2.389531373977661, + "logps/chosen": -204.33877563476562, + "logps/rejected": -264.1290588378906, + "loss": 0.1216, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.787590742111206, + "rewards/margins": 7.53562068939209, + "rewards/rejected": -6.748030185699463, + "step": 5680 + }, + { + "epoch": 1.44, + "learning_rate": 2.892051306057485e-07, + "logits/chosen": -2.5032591819763184, + "logits/rejected": -2.518265724182129, + "logps/chosen": -296.32977294921875, + "logps/rejected": -414.72564697265625, + "loss": 0.0841, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2691758871078491, + "rewards/margins": 9.437515258789062, + "rewards/rejected": -8.168340682983398, + "step": 5690 + }, + { + "epoch": 1.44, + "learning_rate": 2.8873700964329186e-07, + "logits/chosen": -2.558290481567383, + "logits/rejected": -2.5007545948028564, + "logps/chosen": -289.505859375, + "logps/rejected": -322.5883483886719, + "loss": 0.1034, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.3201775848865509, + "rewards/margins": 7.209942817687988, + "rewards/rejected": -6.889764308929443, + "step": 5700 + }, + { + "epoch": 1.44, + "learning_rate": 2.882688886808351e-07, + "logits/chosen": -2.454603672027588, + "logits/rejected": -2.4256834983825684, + "logps/chosen": -193.55093383789062, + "logps/rejected": -257.0700988769531, + "loss": 0.1765, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.116165041923523, + "rewards/margins": 7.95557165145874, + "rewards/rejected": -6.839406490325928, + "step": 5710 + }, + { + "epoch": 1.45, + "learning_rate": 2.8780076771837844e-07, + "logits/chosen": -2.575314521789551, + "logits/rejected": -2.5462193489074707, + "logps/chosen": -220.2628936767578, + "logps/rejected": -283.34088134765625, + "loss": 0.0921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6230512857437134, + "rewards/margins": 6.939806938171387, + "rewards/rejected": -7.562857627868652, + "step": 5720 + }, + { + "epoch": 1.45, + "learning_rate": 2.873326467559217e-07, + "logits/chosen": -2.588092803955078, + "logits/rejected": -2.4266304969787598, + "logps/chosen": -218.74368286132812, + "logps/rejected": -260.84228515625, + "loss": 0.1046, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.0295507907867432, + "rewards/margins": 6.963271141052246, + "rewards/rejected": -5.933720588684082, + "step": 5730 + }, + { + "epoch": 1.45, + "learning_rate": 2.86864525793465e-07, + "logits/chosen": -2.608156681060791, + "logits/rejected": -2.5254130363464355, + "logps/chosen": -178.9147491455078, + "logps/rejected": -278.29290771484375, + "loss": 0.0738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29764580726623535, + "rewards/margins": 5.203714370727539, + "rewards/rejected": -5.5013604164123535, + "step": 5740 + }, + { + "epoch": 1.45, + "learning_rate": 2.863964048310083e-07, + "logits/chosen": -2.6046547889709473, + "logits/rejected": -2.3902244567871094, + "logps/chosen": -258.7455749511719, + "logps/rejected": -235.589599609375, + "loss": 0.0817, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6193943023681641, + "rewards/margins": 5.428850173950195, + "rewards/rejected": -6.048244476318359, + "step": 5750 + }, + { + "epoch": 1.46, + "learning_rate": 2.859282838685516e-07, + "logits/chosen": -2.4807581901550293, + "logits/rejected": -2.586901903152466, + "logps/chosen": -215.5895538330078, + "logps/rejected": -336.99664306640625, + "loss": 0.1096, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0720269680023193, + "rewards/margins": 5.663225173950195, + "rewards/rejected": -6.735252380371094, + "step": 5760 + }, + { + "epoch": 1.46, + "learning_rate": 2.8546016290609494e-07, + "logits/chosen": -2.515242338180542, + "logits/rejected": -2.443443775177002, + "logps/chosen": -224.4276885986328, + "logps/rejected": -299.26336669921875, + "loss": 0.1309, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5686905980110168, + "rewards/margins": 9.281776428222656, + "rewards/rejected": -9.850465774536133, + "step": 5770 + }, + { + "epoch": 1.46, + "learning_rate": 2.849920419436382e-07, + "logits/chosen": -2.3371036052703857, + "logits/rejected": -2.2131152153015137, + "logps/chosen": -278.50762939453125, + "logps/rejected": -278.50030517578125, + "loss": 0.0994, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.818460464477539, + "rewards/margins": 5.5670084953308105, + "rewards/rejected": -7.38546895980835, + "step": 5780 + }, + { + "epoch": 1.46, + "learning_rate": 2.845239209811815e-07, + "logits/chosen": -2.3276255130767822, + "logits/rejected": -2.200223922729492, + "logps/chosen": -199.0684814453125, + "logps/rejected": -212.3662872314453, + "loss": 0.0566, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.0024425506126135588, + "rewards/margins": 7.235228061676025, + "rewards/rejected": -7.2376708984375, + "step": 5790 + }, + { + "epoch": 1.47, + "learning_rate": 2.840558000187248e-07, + "logits/chosen": -2.686560869216919, + "logits/rejected": -2.4994020462036133, + "logps/chosen": -344.4832763671875, + "logps/rejected": -400.8691101074219, + "loss": 0.0751, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36507248878479004, + "rewards/margins": 8.268793106079102, + "rewards/rejected": -7.903720855712891, + "step": 5800 + }, + { + "epoch": 1.47, + "learning_rate": 2.8358767905626815e-07, + "logits/chosen": -2.262820243835449, + "logits/rejected": -2.2107882499694824, + "logps/chosen": -200.62533569335938, + "logps/rejected": -208.8029022216797, + "loss": 0.1287, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7861111164093018, + "rewards/margins": 5.80421781539917, + "rewards/rejected": -7.590329170227051, + "step": 5810 + }, + { + "epoch": 1.47, + "learning_rate": 2.8311955809381143e-07, + "logits/chosen": -2.5503885746002197, + "logits/rejected": -2.4902098178863525, + "logps/chosen": -298.7713928222656, + "logps/rejected": -394.04833984375, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4204397201538086, + "rewards/margins": 9.799768447875977, + "rewards/rejected": -9.379328727722168, + "step": 5820 + }, + { + "epoch": 1.47, + "learning_rate": 2.826514371313548e-07, + "logits/chosen": -2.4589803218841553, + "logits/rejected": -2.4627013206481934, + "logps/chosen": -288.97943115234375, + "logps/rejected": -333.4466857910156, + "loss": 0.0799, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8154493570327759, + "rewards/margins": 9.827569961547852, + "rewards/rejected": -9.012121200561523, + "step": 5830 + }, + { + "epoch": 1.48, + "learning_rate": 2.82183316168898e-07, + "logits/chosen": -2.542847156524658, + "logits/rejected": -2.2891502380371094, + "logps/chosen": -265.6089172363281, + "logps/rejected": -256.6798095703125, + "loss": 0.1162, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.137566328048706, + "rewards/margins": 7.110358238220215, + "rewards/rejected": -9.2479248046875, + "step": 5840 + }, + { + "epoch": 1.48, + "learning_rate": 2.817151952064413e-07, + "logits/chosen": -2.2505059242248535, + "logits/rejected": -2.116539239883423, + "logps/chosen": -225.38626098632812, + "logps/rejected": -257.52081298828125, + "loss": 0.0794, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6695985794067383, + "rewards/margins": 6.193515300750732, + "rewards/rejected": -7.8631134033203125, + "step": 5850 + }, + { + "epoch": 1.48, + "learning_rate": 2.8124707424398465e-07, + "logits/chosen": -2.6086325645446777, + "logits/rejected": -2.5266706943511963, + "logps/chosen": -244.1230010986328, + "logps/rejected": -297.35198974609375, + "loss": 0.0762, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30047744512557983, + "rewards/margins": 8.088984489440918, + "rewards/rejected": -8.389463424682617, + "step": 5860 + }, + { + "epoch": 1.48, + "learning_rate": 2.8077895328152793e-07, + "logits/chosen": -2.6594889163970947, + "logits/rejected": -2.5432751178741455, + "logps/chosen": -274.76116943359375, + "logps/rejected": -362.2596130371094, + "loss": 0.0862, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45065465569496155, + "rewards/margins": 9.219099998474121, + "rewards/rejected": -9.669755935668945, + "step": 5870 + }, + { + "epoch": 1.49, + "learning_rate": 2.803108323190712e-07, + "logits/chosen": -2.459502935409546, + "logits/rejected": -2.391540288925171, + "logps/chosen": -211.6705780029297, + "logps/rejected": -267.565185546875, + "loss": 0.1507, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5549541115760803, + "rewards/margins": 7.696126461029053, + "rewards/rejected": -8.251081466674805, + "step": 5880 + }, + { + "epoch": 1.49, + "learning_rate": 2.798427113566145e-07, + "logits/chosen": -2.5379414558410645, + "logits/rejected": -2.41731595993042, + "logps/chosen": -215.3525848388672, + "logps/rejected": -307.221435546875, + "loss": 0.128, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5225763320922852, + "rewards/margins": 6.014660835266113, + "rewards/rejected": -7.537237644195557, + "step": 5890 + }, + { + "epoch": 1.49, + "learning_rate": 2.7937459039415786e-07, + "logits/chosen": -2.419687032699585, + "logits/rejected": -2.4673666954040527, + "logps/chosen": -203.66171264648438, + "logps/rejected": -299.1502685546875, + "loss": 0.0774, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.26614704728126526, + "rewards/margins": 7.863767147064209, + "rewards/rejected": -7.597620487213135, + "step": 5900 + }, + { + "epoch": 1.49, + "learning_rate": 2.7890646943170114e-07, + "logits/chosen": -2.6347222328186035, + "logits/rejected": -2.705169439315796, + "logps/chosen": -290.8692626953125, + "logps/rejected": -386.2977600097656, + "loss": 0.1117, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.419967532157898, + "rewards/margins": 9.840895652770996, + "rewards/rejected": -8.420928955078125, + "step": 5910 + }, + { + "epoch": 1.5, + "learning_rate": 2.784383484692445e-07, + "logits/chosen": -2.620143175125122, + "logits/rejected": -2.575716495513916, + "logps/chosen": -234.12960815429688, + "logps/rejected": -333.3351745605469, + "loss": 0.0795, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7579079866409302, + "rewards/margins": 6.541896820068359, + "rewards/rejected": -7.299803733825684, + "step": 5920 + }, + { + "epoch": 1.5, + "learning_rate": 2.779702275067877e-07, + "logits/chosen": -2.6059043407440186, + "logits/rejected": -2.5090622901916504, + "logps/chosen": -333.93701171875, + "logps/rejected": -346.04095458984375, + "loss": 0.0702, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5566359162330627, + "rewards/margins": 8.704923629760742, + "rewards/rejected": -8.148286819458008, + "step": 5930 + }, + { + "epoch": 1.5, + "learning_rate": 2.77502106544331e-07, + "logits/chosen": -2.5816235542297363, + "logits/rejected": -2.607320547103882, + "logps/chosen": -202.93458557128906, + "logps/rejected": -324.5149230957031, + "loss": 0.0928, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5764981508255005, + "rewards/margins": 6.660517692565918, + "rewards/rejected": -7.237016201019287, + "step": 5940 + }, + { + "epoch": 1.5, + "learning_rate": 2.7703398558187435e-07, + "logits/chosen": -2.6109724044799805, + "logits/rejected": -2.443455219268799, + "logps/chosen": -278.3966064453125, + "logps/rejected": -274.39251708984375, + "loss": 0.1078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7179809808731079, + "rewards/margins": 6.091689109802246, + "rewards/rejected": -6.809670448303223, + "step": 5950 + }, + { + "epoch": 1.51, + "learning_rate": 2.7656586461941764e-07, + "logits/chosen": -2.5273938179016113, + "logits/rejected": -2.4414260387420654, + "logps/chosen": -255.90646362304688, + "logps/rejected": -310.47418212890625, + "loss": 0.095, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.8377459645271301, + "rewards/margins": 8.048016548156738, + "rewards/rejected": -8.885762214660645, + "step": 5960 + }, + { + "epoch": 1.51, + "learning_rate": 2.7609774365696093e-07, + "logits/chosen": -2.461449146270752, + "logits/rejected": -2.3967373371124268, + "logps/chosen": -269.59979248046875, + "logps/rejected": -303.8924560546875, + "loss": 0.0864, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5161443948745728, + "rewards/margins": 6.825022220611572, + "rewards/rejected": -7.341165065765381, + "step": 5970 + }, + { + "epoch": 1.51, + "learning_rate": 2.756296226945042e-07, + "logits/chosen": -2.6349711418151855, + "logits/rejected": -2.652991533279419, + "logps/chosen": -228.6748504638672, + "logps/rejected": -292.7082824707031, + "loss": 0.1179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43261289596557617, + "rewards/margins": 7.230783939361572, + "rewards/rejected": -7.663397312164307, + "step": 5980 + }, + { + "epoch": 1.51, + "learning_rate": 2.7516150173204757e-07, + "logits/chosen": -2.4800076484680176, + "logits/rejected": -2.265038013458252, + "logps/chosen": -218.5613555908203, + "logps/rejected": -243.19091796875, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0557198524475098, + "rewards/margins": 5.498206615447998, + "rewards/rejected": -7.55392599105835, + "step": 5990 + }, + { + "epoch": 1.52, + "learning_rate": 2.7469338076959085e-07, + "logits/chosen": -2.6435389518737793, + "logits/rejected": -2.5455713272094727, + "logps/chosen": -222.0457763671875, + "logps/rejected": -301.1404113769531, + "loss": 0.1007, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5636796951293945, + "rewards/margins": 7.443596839904785, + "rewards/rejected": -6.879917144775391, + "step": 6000 + }, + { + "epoch": 1.52, + "learning_rate": 2.7422525980713414e-07, + "logits/chosen": -2.673046588897705, + "logits/rejected": -2.3299639225006104, + "logps/chosen": -270.10064697265625, + "logps/rejected": -286.19610595703125, + "loss": 0.0637, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03729300573468208, + "rewards/margins": 8.096498489379883, + "rewards/rejected": -8.13379192352295, + "step": 6010 + }, + { + "epoch": 1.52, + "learning_rate": 2.7375713884467743e-07, + "logits/chosen": -2.5294830799102783, + "logits/rejected": -2.4918341636657715, + "logps/chosen": -271.35174560546875, + "logps/rejected": -361.06256103515625, + "loss": 0.1391, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08345876634120941, + "rewards/margins": 8.526935577392578, + "rewards/rejected": -8.443475723266602, + "step": 6020 + }, + { + "epoch": 1.52, + "learning_rate": 2.732890178822208e-07, + "logits/chosen": -2.491140127182007, + "logits/rejected": -2.444007635116577, + "logps/chosen": -255.1403350830078, + "logps/rejected": -389.2249450683594, + "loss": 0.0816, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6935482025146484, + "rewards/margins": 8.629409790039062, + "rewards/rejected": -9.322957992553711, + "step": 6030 + }, + { + "epoch": 1.53, + "learning_rate": 2.7282089691976406e-07, + "logits/chosen": -2.7475171089172363, + "logits/rejected": -2.7083027362823486, + "logps/chosen": -222.727783203125, + "logps/rejected": -315.22821044921875, + "loss": 0.1316, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.23866835236549377, + "rewards/margins": 8.23305892944336, + "rewards/rejected": -7.99439001083374, + "step": 6040 + }, + { + "epoch": 1.53, + "learning_rate": 2.7235277595730735e-07, + "logits/chosen": -2.478139877319336, + "logits/rejected": -2.435267925262451, + "logps/chosen": -312.53863525390625, + "logps/rejected": -334.06842041015625, + "loss": 0.1077, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7183864116668701, + "rewards/margins": 7.949586391448975, + "rewards/rejected": -8.66797161102295, + "step": 6050 + }, + { + "epoch": 1.53, + "learning_rate": 2.7188465499485064e-07, + "logits/chosen": -2.7338013648986816, + "logits/rejected": -2.608670473098755, + "logps/chosen": -393.45013427734375, + "logps/rejected": -444.36407470703125, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3149901032447815, + "rewards/margins": 9.215995788574219, + "rewards/rejected": -8.901005744934082, + "step": 6060 + }, + { + "epoch": 1.53, + "learning_rate": 2.7141653403239393e-07, + "logits/chosen": -2.6936416625976562, + "logits/rejected": -2.6677496433258057, + "logps/chosen": -298.617431640625, + "logps/rejected": -334.5692443847656, + "loss": 0.1159, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.571147084236145, + "rewards/margins": 7.877842903137207, + "rewards/rejected": -7.306695461273193, + "step": 6070 + }, + { + "epoch": 1.54, + "learning_rate": 2.709484130699373e-07, + "logits/chosen": -2.438176393508911, + "logits/rejected": -2.3903422355651855, + "logps/chosen": -298.58282470703125, + "logps/rejected": -266.06695556640625, + "loss": 0.0981, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.18747439980506897, + "rewards/margins": 7.7106194496154785, + "rewards/rejected": -7.523144721984863, + "step": 6080 + }, + { + "epoch": 1.54, + "learning_rate": 2.7048029210748056e-07, + "logits/chosen": -2.6650218963623047, + "logits/rejected": -2.4819769859313965, + "logps/chosen": -331.0292053222656, + "logps/rejected": -271.9941711425781, + "loss": 0.1013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5644257068634033, + "rewards/margins": 4.562926292419434, + "rewards/rejected": -6.127351760864258, + "step": 6090 + }, + { + "epoch": 1.54, + "learning_rate": 2.7001217114502385e-07, + "logits/chosen": -2.766846179962158, + "logits/rejected": -2.6548171043395996, + "logps/chosen": -249.60818481445312, + "logps/rejected": -301.04132080078125, + "loss": 0.0747, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7441790699958801, + "rewards/margins": 8.869656562805176, + "rewards/rejected": -8.12547779083252, + "step": 6100 + }, + { + "epoch": 1.54, + "learning_rate": 2.6954405018256714e-07, + "logits/chosen": -2.476426601409912, + "logits/rejected": -2.4908547401428223, + "logps/chosen": -340.268798828125, + "logps/rejected": -395.8197326660156, + "loss": 0.0972, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0655219554901123, + "rewards/margins": 6.699605464935303, + "rewards/rejected": -7.765127658843994, + "step": 6110 + }, + { + "epoch": 1.55, + "learning_rate": 2.690759292201105e-07, + "logits/chosen": -2.606290578842163, + "logits/rejected": -2.5720295906066895, + "logps/chosen": -264.5345764160156, + "logps/rejected": -287.35064697265625, + "loss": 0.122, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.07059669494628906, + "rewards/margins": 7.116732597351074, + "rewards/rejected": -7.046135902404785, + "step": 6120 + }, + { + "epoch": 1.55, + "learning_rate": 2.686078082576538e-07, + "logits/chosen": -2.607025146484375, + "logits/rejected": -2.63069486618042, + "logps/chosen": -259.15875244140625, + "logps/rejected": -405.8233337402344, + "loss": 0.0805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.288994699716568, + "rewards/margins": 8.880585670471191, + "rewards/rejected": -9.169580459594727, + "step": 6130 + }, + { + "epoch": 1.55, + "learning_rate": 2.681396872951971e-07, + "logits/chosen": -2.6519951820373535, + "logits/rejected": -2.664268970489502, + "logps/chosen": -333.8085632324219, + "logps/rejected": -403.4785461425781, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8596887588500977, + "rewards/margins": 8.478861808776855, + "rewards/rejected": -9.338550567626953, + "step": 6140 + }, + { + "epoch": 1.55, + "learning_rate": 2.6767156633274035e-07, + "logits/chosen": -2.642038583755493, + "logits/rejected": -2.377959966659546, + "logps/chosen": -240.6234130859375, + "logps/rejected": -286.7399597167969, + "loss": 0.1153, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.029271280393004417, + "rewards/margins": 7.734196662902832, + "rewards/rejected": -7.704924583435059, + "step": 6150 + }, + { + "epoch": 1.56, + "learning_rate": 2.6720344537028364e-07, + "logits/chosen": -2.6156229972839355, + "logits/rejected": -2.625122308731079, + "logps/chosen": -405.40093994140625, + "logps/rejected": -423.067626953125, + "loss": 0.0686, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3577374815940857, + "rewards/margins": 8.518488883972168, + "rewards/rejected": -8.876226425170898, + "step": 6160 + }, + { + "epoch": 1.56, + "learning_rate": 2.66735324407827e-07, + "logits/chosen": -2.3296422958374023, + "logits/rejected": -2.411426544189453, + "logps/chosen": -196.38916015625, + "logps/rejected": -244.12405395507812, + "loss": 0.1481, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3764898777008057, + "rewards/margins": 5.746121406555176, + "rewards/rejected": -7.122610569000244, + "step": 6170 + }, + { + "epoch": 1.56, + "learning_rate": 2.662672034453703e-07, + "logits/chosen": -2.6419670581817627, + "logits/rejected": -2.539872169494629, + "logps/chosen": -249.18057250976562, + "logps/rejected": -309.7854309082031, + "loss": 0.092, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.07812035083770752, + "rewards/margins": 8.424200057983398, + "rewards/rejected": -8.346078872680664, + "step": 6180 + }, + { + "epoch": 1.56, + "learning_rate": 2.6579908248291356e-07, + "logits/chosen": -2.5486984252929688, + "logits/rejected": -2.5054383277893066, + "logps/chosen": -257.5140075683594, + "logps/rejected": -296.4288024902344, + "loss": 0.0929, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3803873658180237, + "rewards/margins": 7.806270599365234, + "rewards/rejected": -8.18665885925293, + "step": 6190 + }, + { + "epoch": 1.57, + "learning_rate": 2.6533096152045685e-07, + "logits/chosen": -2.5460028648376465, + "logits/rejected": -2.613973617553711, + "logps/chosen": -348.6800537109375, + "logps/rejected": -482.733642578125, + "loss": 0.0409, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.12141378223896027, + "rewards/margins": 8.33647632598877, + "rewards/rejected": -8.457890510559082, + "step": 6200 + }, + { + "epoch": 1.57, + "learning_rate": 2.648628405580002e-07, + "logits/chosen": -2.289564847946167, + "logits/rejected": -2.3023674488067627, + "logps/chosen": -245.26651000976562, + "logps/rejected": -272.0384826660156, + "loss": 0.0793, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3192493915557861, + "rewards/margins": 7.366220951080322, + "rewards/rejected": -8.685470581054688, + "step": 6210 + }, + { + "epoch": 1.57, + "learning_rate": 2.643947195955435e-07, + "logits/chosen": -2.4759864807128906, + "logits/rejected": -2.3297553062438965, + "logps/chosen": -260.42449951171875, + "logps/rejected": -326.72003173828125, + "loss": 0.0926, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6568080186843872, + "rewards/margins": 8.137785911560059, + "rewards/rejected": -8.794594764709473, + "step": 6220 + }, + { + "epoch": 1.57, + "learning_rate": 2.639265986330868e-07, + "logits/chosen": -2.4743053913116455, + "logits/rejected": -2.4329488277435303, + "logps/chosen": -284.16986083984375, + "logps/rejected": -234.8544158935547, + "loss": 0.0916, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8003547191619873, + "rewards/margins": 4.773205757141113, + "rewards/rejected": -6.573559761047363, + "step": 6230 + }, + { + "epoch": 1.58, + "learning_rate": 2.6345847767063006e-07, + "logits/chosen": -2.6427693367004395, + "logits/rejected": -2.5126216411590576, + "logps/chosen": -278.27716064453125, + "logps/rejected": -337.318115234375, + "loss": 0.1066, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15482717752456665, + "rewards/margins": 7.757693290710449, + "rewards/rejected": -7.602866172790527, + "step": 6240 + }, + { + "epoch": 1.58, + "learning_rate": 2.6299035670817335e-07, + "logits/chosen": -2.5959510803222656, + "logits/rejected": -2.570957660675049, + "logps/chosen": -298.49114990234375, + "logps/rejected": -396.47198486328125, + "loss": 0.0724, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1863725185394287, + "rewards/margins": 6.7948126792907715, + "rewards/rejected": -7.9811835289001465, + "step": 6250 + }, + { + "epoch": 1.58, + "learning_rate": 2.625222357457167e-07, + "logits/chosen": -2.5576119422912598, + "logits/rejected": -2.371354579925537, + "logps/chosen": -241.24801635742188, + "logps/rejected": -237.8208770751953, + "loss": 0.0968, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9517194032669067, + "rewards/margins": 5.712420463562012, + "rewards/rejected": -6.664140224456787, + "step": 6260 + }, + { + "epoch": 1.58, + "learning_rate": 2.6205411478326e-07, + "logits/chosen": -2.635572671890259, + "logits/rejected": -2.7240185737609863, + "logps/chosen": -332.22381591796875, + "logps/rejected": -367.07818603515625, + "loss": 0.1099, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5952962040901184, + "rewards/margins": 7.662569999694824, + "rewards/rejected": -8.257866859436035, + "step": 6270 + }, + { + "epoch": 1.59, + "learning_rate": 2.6158599382080327e-07, + "logits/chosen": -2.5941269397735596, + "logits/rejected": -2.5424253940582275, + "logps/chosen": -269.71881103515625, + "logps/rejected": -277.7208251953125, + "loss": 0.0993, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.6491734981536865, + "rewards/margins": 8.246459007263184, + "rewards/rejected": -6.597285270690918, + "step": 6280 + }, + { + "epoch": 1.59, + "learning_rate": 2.6111787285834656e-07, + "logits/chosen": -2.282750368118286, + "logits/rejected": -2.1928257942199707, + "logps/chosen": -291.94964599609375, + "logps/rejected": -323.4311218261719, + "loss": 0.137, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7096611261367798, + "rewards/margins": 8.644102096557617, + "rewards/rejected": -7.934440612792969, + "step": 6290 + }, + { + "epoch": 1.59, + "learning_rate": 2.606497518958899e-07, + "logits/chosen": -2.480043411254883, + "logits/rejected": -2.45155930519104, + "logps/chosen": -274.54913330078125, + "logps/rejected": -396.34930419921875, + "loss": 0.0791, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7529920339584351, + "rewards/margins": 7.522666931152344, + "rewards/rejected": -8.275659561157227, + "step": 6300 + }, + { + "epoch": 1.6, + "learning_rate": 2.601816309334332e-07, + "logits/chosen": -2.3269565105438232, + "logits/rejected": -2.212367057800293, + "logps/chosen": -215.3049774169922, + "logps/rejected": -289.00567626953125, + "loss": 0.0842, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5885213613510132, + "rewards/margins": 8.83895492553711, + "rewards/rejected": -9.427475929260254, + "step": 6310 + }, + { + "epoch": 1.6, + "learning_rate": 2.597135099709765e-07, + "logits/chosen": -2.575462818145752, + "logits/rejected": -2.493577480316162, + "logps/chosen": -364.81634521484375, + "logps/rejected": -351.1142578125, + "loss": 0.1075, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5452896356582642, + "rewards/margins": 7.818143367767334, + "rewards/rejected": -7.272853851318359, + "step": 6320 + }, + { + "epoch": 1.6, + "learning_rate": 2.5924538900851977e-07, + "logits/chosen": -2.531485080718994, + "logits/rejected": -2.5142064094543457, + "logps/chosen": -234.1458282470703, + "logps/rejected": -293.10308837890625, + "loss": 0.108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.367737352848053, + "rewards/margins": 6.96529483795166, + "rewards/rejected": -7.333032131195068, + "step": 6330 + }, + { + "epoch": 1.6, + "learning_rate": 2.587772680460631e-07, + "logits/chosen": -2.566310167312622, + "logits/rejected": -2.4758496284484863, + "logps/chosen": -364.1842346191406, + "logps/rejected": -333.62384033203125, + "loss": 0.055, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9204639196395874, + "rewards/margins": 8.889691352844238, + "rewards/rejected": -7.969226837158203, + "step": 6340 + }, + { + "epoch": 1.61, + "learning_rate": 2.583091470836064e-07, + "logits/chosen": -2.4522814750671387, + "logits/rejected": -2.479931354522705, + "logps/chosen": -317.1643981933594, + "logps/rejected": -353.9148254394531, + "loss": 0.0866, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9924957156181335, + "rewards/margins": 7.705667972564697, + "rewards/rejected": -6.713172912597656, + "step": 6350 + }, + { + "epoch": 1.61, + "learning_rate": 2.578410261211497e-07, + "logits/chosen": -2.390688419342041, + "logits/rejected": -2.3841347694396973, + "logps/chosen": -225.24075317382812, + "logps/rejected": -381.5619201660156, + "loss": 0.1551, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04876537248492241, + "rewards/margins": 11.332666397094727, + "rewards/rejected": -11.283902168273926, + "step": 6360 + }, + { + "epoch": 1.61, + "learning_rate": 2.57372905158693e-07, + "logits/chosen": -2.4534378051757812, + "logits/rejected": -2.3759846687316895, + "logps/chosen": -367.37713623046875, + "logps/rejected": -324.92864990234375, + "loss": 0.1195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09103012084960938, + "rewards/margins": 8.274453163146973, + "rewards/rejected": -8.365483283996582, + "step": 6370 + }, + { + "epoch": 1.61, + "learning_rate": 2.5690478419623627e-07, + "logits/chosen": -2.593829393386841, + "logits/rejected": -2.536083698272705, + "logps/chosen": -245.0706024169922, + "logps/rejected": -372.5894775390625, + "loss": 0.0701, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8631905317306519, + "rewards/margins": 9.275471687316895, + "rewards/rejected": -10.13866138458252, + "step": 6380 + }, + { + "epoch": 1.62, + "learning_rate": 2.564366632337796e-07, + "logits/chosen": -2.532728433609009, + "logits/rejected": -2.518982410430908, + "logps/chosen": -161.19578552246094, + "logps/rejected": -306.3395080566406, + "loss": 0.0699, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0426340326666832, + "rewards/margins": 9.686996459960938, + "rewards/rejected": -9.644363403320312, + "step": 6390 + }, + { + "epoch": 1.62, + "learning_rate": 2.559685422713229e-07, + "logits/chosen": -2.5918660163879395, + "logits/rejected": -2.5031676292419434, + "logps/chosen": -298.0547790527344, + "logps/rejected": -390.0431213378906, + "loss": 0.0742, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7646089792251587, + "rewards/margins": 7.8302788734436035, + "rewards/rejected": -8.594887733459473, + "step": 6400 + }, + { + "epoch": 1.62, + "learning_rate": 2.555004213088662e-07, + "logits/chosen": -2.6251654624938965, + "logits/rejected": -2.649387836456299, + "logps/chosen": -300.34613037109375, + "logps/rejected": -409.62548828125, + "loss": 0.0819, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8415437936782837, + "rewards/margins": 10.491747856140137, + "rewards/rejected": -8.6502046585083, + "step": 6410 + }, + { + "epoch": 1.62, + "learning_rate": 2.550323003464095e-07, + "logits/chosen": -2.3623738288879395, + "logits/rejected": -2.245854139328003, + "logps/chosen": -243.4949188232422, + "logps/rejected": -234.89468383789062, + "loss": 0.0967, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.059647560119629, + "rewards/margins": 6.525529384613037, + "rewards/rejected": -8.585176467895508, + "step": 6420 + }, + { + "epoch": 1.63, + "learning_rate": 2.545641793839528e-07, + "logits/chosen": -2.6008667945861816, + "logits/rejected": -2.637789726257324, + "logps/chosen": -192.2373504638672, + "logps/rejected": -267.214111328125, + "loss": 0.1195, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7727561593055725, + "rewards/margins": 8.40786075592041, + "rewards/rejected": -7.635104179382324, + "step": 6430 + }, + { + "epoch": 1.63, + "learning_rate": 2.540960584214961e-07, + "logits/chosen": -2.643723964691162, + "logits/rejected": -2.5780606269836426, + "logps/chosen": -286.4054260253906, + "logps/rejected": -300.36834716796875, + "loss": 0.0951, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12415985763072968, + "rewards/margins": 7.626623630523682, + "rewards/rejected": -7.502463340759277, + "step": 6440 + }, + { + "epoch": 1.63, + "learning_rate": 2.5362793745903946e-07, + "logits/chosen": -2.520909309387207, + "logits/rejected": -2.569328784942627, + "logps/chosen": -220.816650390625, + "logps/rejected": -351.45928955078125, + "loss": 0.1153, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2902345359325409, + "rewards/margins": 7.306796073913574, + "rewards/rejected": -7.597031593322754, + "step": 6450 + }, + { + "epoch": 1.63, + "learning_rate": 2.531598164965827e-07, + "logits/chosen": -2.7431952953338623, + "logits/rejected": -2.5619826316833496, + "logps/chosen": -279.5010986328125, + "logps/rejected": -286.8915100097656, + "loss": 0.0826, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9281533360481262, + "rewards/margins": 6.31351375579834, + "rewards/rejected": -7.2416672706604, + "step": 6460 + }, + { + "epoch": 1.64, + "learning_rate": 2.52691695534126e-07, + "logits/chosen": -2.461831569671631, + "logits/rejected": -2.447096824645996, + "logps/chosen": -245.5779571533203, + "logps/rejected": -284.29473876953125, + "loss": 0.1038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.645194411277771, + "rewards/margins": 7.570770263671875, + "rewards/rejected": -8.215965270996094, + "step": 6470 + }, + { + "epoch": 1.64, + "learning_rate": 2.522235745716693e-07, + "logits/chosen": -2.7126336097717285, + "logits/rejected": -2.641799211502075, + "logps/chosen": -252.3530731201172, + "logps/rejected": -305.55706787109375, + "loss": 0.0437, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7650710940361023, + "rewards/margins": 7.985662937164307, + "rewards/rejected": -7.2205915451049805, + "step": 6480 + }, + { + "epoch": 1.64, + "learning_rate": 2.517554536092126e-07, + "logits/chosen": -2.561034917831421, + "logits/rejected": -2.5873913764953613, + "logps/chosen": -219.373291015625, + "logps/rejected": -346.26123046875, + "loss": 0.0884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11245546489953995, + "rewards/margins": 7.357387542724609, + "rewards/rejected": -7.469842433929443, + "step": 6490 + }, + { + "epoch": 1.64, + "learning_rate": 2.512873326467559e-07, + "logits/chosen": -2.4951984882354736, + "logits/rejected": -2.5137152671813965, + "logps/chosen": -274.0867614746094, + "logps/rejected": -345.02178955078125, + "loss": 0.1047, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.48362818360328674, + "rewards/margins": 8.117841720581055, + "rewards/rejected": -8.601469039916992, + "step": 6500 + }, + { + "epoch": 1.65, + "learning_rate": 2.508192116842992e-07, + "logits/chosen": -2.5647077560424805, + "logits/rejected": -2.4434280395507812, + "logps/chosen": -227.68240356445312, + "logps/rejected": -282.48980712890625, + "loss": 0.087, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.21963413059711456, + "rewards/margins": 7.6169867515563965, + "rewards/rejected": -7.836620330810547, + "step": 6510 + }, + { + "epoch": 1.65, + "learning_rate": 2.5035109072184253e-07, + "logits/chosen": -2.4927191734313965, + "logits/rejected": -2.355459690093994, + "logps/chosen": -237.4644775390625, + "logps/rejected": -275.3338317871094, + "loss": 0.0832, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9448397755622864, + "rewards/margins": 7.046795845031738, + "rewards/rejected": -7.991635322570801, + "step": 6520 + }, + { + "epoch": 1.65, + "learning_rate": 2.498829697593858e-07, + "logits/chosen": -2.7306220531463623, + "logits/rejected": -2.5981738567352295, + "logps/chosen": -292.41632080078125, + "logps/rejected": -398.3863830566406, + "loss": 0.0958, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7138475179672241, + "rewards/margins": 9.044022560119629, + "rewards/rejected": -9.7578706741333, + "step": 6530 + }, + { + "epoch": 1.65, + "learning_rate": 2.494148487969291e-07, + "logits/chosen": -2.5870468616485596, + "logits/rejected": -2.4440600872039795, + "logps/chosen": -307.70513916015625, + "logps/rejected": -365.44696044921875, + "loss": 0.1477, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8003414273262024, + "rewards/margins": 11.231703758239746, + "rewards/rejected": -10.431364059448242, + "step": 6540 + }, + { + "epoch": 1.66, + "learning_rate": 2.489467278344724e-07, + "logits/chosen": -2.5128679275512695, + "logits/rejected": -2.475287675857544, + "logps/chosen": -233.4687957763672, + "logps/rejected": -257.11376953125, + "loss": 0.0656, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2492097616195679, + "rewards/margins": 8.299988746643066, + "rewards/rejected": -7.050778865814209, + "step": 6550 + }, + { + "epoch": 1.66, + "learning_rate": 2.484786068720157e-07, + "logits/chosen": -2.547872543334961, + "logits/rejected": -2.586104154586792, + "logps/chosen": -264.0244140625, + "logps/rejected": -343.283935546875, + "loss": 0.0833, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.10393998771905899, + "rewards/margins": 9.039542198181152, + "rewards/rejected": -8.935602188110352, + "step": 6560 + }, + { + "epoch": 1.66, + "learning_rate": 2.4801048590955903e-07, + "logits/chosen": -2.45554518699646, + "logits/rejected": -2.4026269912719727, + "logps/chosen": -310.2799377441406, + "logps/rejected": -418.31463623046875, + "loss": 0.066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7158440351486206, + "rewards/margins": 11.024736404418945, + "rewards/rejected": -11.740580558776855, + "step": 6570 + }, + { + "epoch": 1.66, + "learning_rate": 2.475423649471023e-07, + "logits/chosen": -2.699667453765869, + "logits/rejected": -2.7050626277923584, + "logps/chosen": -288.3102722167969, + "logps/rejected": -335.96221923828125, + "loss": 0.1254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9094661474227905, + "rewards/margins": 6.560095310211182, + "rewards/rejected": -7.469560146331787, + "step": 6580 + }, + { + "epoch": 1.67, + "learning_rate": 2.470742439846456e-07, + "logits/chosen": -2.6850056648254395, + "logits/rejected": -2.447831630706787, + "logps/chosen": -289.18353271484375, + "logps/rejected": -295.9617614746094, + "loss": 0.0962, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7664966583251953, + "rewards/margins": 4.887343406677246, + "rewards/rejected": -6.6538405418396, + "step": 6590 + }, + { + "epoch": 1.67, + "learning_rate": 2.466061230221889e-07, + "logits/chosen": -2.601702928543091, + "logits/rejected": -2.488107204437256, + "logps/chosen": -287.02020263671875, + "logps/rejected": -321.2840270996094, + "loss": 0.1292, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7866836786270142, + "rewards/margins": 4.828063488006592, + "rewards/rejected": -6.614747047424316, + "step": 6600 + }, + { + "epoch": 1.67, + "learning_rate": 2.4613800205973224e-07, + "logits/chosen": -2.545788288116455, + "logits/rejected": -2.5788114070892334, + "logps/chosen": -228.93972778320312, + "logps/rejected": -291.91680908203125, + "loss": 0.0651, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0478473901748657, + "rewards/margins": 8.06389045715332, + "rewards/rejected": -9.111737251281738, + "step": 6610 + }, + { + "epoch": 1.67, + "learning_rate": 2.4566988109727553e-07, + "logits/chosen": -2.6414923667907715, + "logits/rejected": -2.5644431114196777, + "logps/chosen": -228.28369140625, + "logps/rejected": -265.0378112792969, + "loss": 0.0969, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6914323568344116, + "rewards/margins": 4.776876449584961, + "rewards/rejected": -6.468308925628662, + "step": 6620 + }, + { + "epoch": 1.68, + "learning_rate": 2.452017601348188e-07, + "logits/chosen": -2.5595173835754395, + "logits/rejected": -2.523301362991333, + "logps/chosen": -321.19122314453125, + "logps/rejected": -370.2229309082031, + "loss": 0.124, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3089625835418701, + "rewards/margins": 5.994938373565674, + "rewards/rejected": -7.303900718688965, + "step": 6630 + }, + { + "epoch": 1.68, + "learning_rate": 2.447336391723621e-07, + "logits/chosen": -2.6744046211242676, + "logits/rejected": -2.500985622406006, + "logps/chosen": -302.8641662597656, + "logps/rejected": -312.8785095214844, + "loss": 0.0595, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.256498247385025, + "rewards/margins": 8.627106666564941, + "rewards/rejected": -8.37060832977295, + "step": 6640 + }, + { + "epoch": 1.68, + "learning_rate": 2.4426551820990546e-07, + "logits/chosen": -2.772514581680298, + "logits/rejected": -2.5799484252929688, + "logps/chosen": -309.5052185058594, + "logps/rejected": -348.5119934082031, + "loss": 0.0934, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.509163498878479, + "rewards/margins": 6.6776251792907715, + "rewards/rejected": -8.186788558959961, + "step": 6650 + }, + { + "epoch": 1.68, + "learning_rate": 2.437973972474487e-07, + "logits/chosen": -2.7458884716033936, + "logits/rejected": -2.7266435623168945, + "logps/chosen": -264.4335021972656, + "logps/rejected": -250.4994354248047, + "loss": 0.0909, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5879275798797607, + "rewards/margins": 5.835958480834961, + "rewards/rejected": -7.423886775970459, + "step": 6660 + }, + { + "epoch": 1.69, + "learning_rate": 2.4332927628499203e-07, + "logits/chosen": -2.5481972694396973, + "logits/rejected": -2.609694004058838, + "logps/chosen": -274.73199462890625, + "logps/rejected": -350.5310363769531, + "loss": 0.0603, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5865224599838257, + "rewards/margins": 7.221885681152344, + "rewards/rejected": -7.808408260345459, + "step": 6670 + }, + { + "epoch": 1.69, + "learning_rate": 2.428611553225353e-07, + "logits/chosen": -2.6879124641418457, + "logits/rejected": -2.711090564727783, + "logps/chosen": -310.59771728515625, + "logps/rejected": -443.3534240722656, + "loss": 0.1009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01967158354818821, + "rewards/margins": 8.676813125610352, + "rewards/rejected": -8.69648551940918, + "step": 6680 + }, + { + "epoch": 1.69, + "learning_rate": 2.423930343600786e-07, + "logits/chosen": -2.64151668548584, + "logits/rejected": -2.7272515296936035, + "logps/chosen": -250.64956665039062, + "logps/rejected": -479.2918395996094, + "loss": 0.2103, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5805938243865967, + "rewards/margins": 9.652566909790039, + "rewards/rejected": -11.233160018920898, + "step": 6690 + }, + { + "epoch": 1.69, + "learning_rate": 2.4192491339762195e-07, + "logits/chosen": -2.4825167655944824, + "logits/rejected": -2.4715609550476074, + "logps/chosen": -286.91864013671875, + "logps/rejected": -352.4754638671875, + "loss": 0.0877, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.8185830116271973, + "rewards/margins": 6.200232028961182, + "rewards/rejected": -9.018815040588379, + "step": 6700 + }, + { + "epoch": 1.7, + "learning_rate": 2.4145679243516524e-07, + "logits/chosen": -2.6975979804992676, + "logits/rejected": -2.7159650325775146, + "logps/chosen": -239.55581665039062, + "logps/rejected": -321.30322265625, + "loss": 0.0949, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.047037959098816, + "rewards/margins": 6.473003387451172, + "rewards/rejected": -7.520041465759277, + "step": 6710 + }, + { + "epoch": 1.7, + "learning_rate": 2.4098867147270853e-07, + "logits/chosen": -2.7223079204559326, + "logits/rejected": -2.674180746078491, + "logps/chosen": -336.23590087890625, + "logps/rejected": -390.67169189453125, + "loss": 0.0824, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.45447224378585815, + "rewards/margins": 10.92662239074707, + "rewards/rejected": -10.472149848937988, + "step": 6720 + }, + { + "epoch": 1.7, + "learning_rate": 2.405205505102518e-07, + "logits/chosen": -2.695161819458008, + "logits/rejected": -2.6044206619262695, + "logps/chosen": -287.9726867675781, + "logps/rejected": -309.17095947265625, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6049206852912903, + "rewards/margins": 7.56399393081665, + "rewards/rejected": -8.168913841247559, + "step": 6730 + }, + { + "epoch": 1.7, + "learning_rate": 2.4005242954779516e-07, + "logits/chosen": -2.5688953399658203, + "logits/rejected": -2.490004062652588, + "logps/chosen": -186.72337341308594, + "logps/rejected": -260.4896545410156, + "loss": 0.1629, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2096558809280396, + "rewards/margins": 6.041118144989014, + "rewards/rejected": -7.2507734298706055, + "step": 6740 + }, + { + "epoch": 1.71, + "learning_rate": 2.3958430858533845e-07, + "logits/chosen": -2.559790849685669, + "logits/rejected": -2.5062003135681152, + "logps/chosen": -243.90713500976562, + "logps/rejected": -420.903564453125, + "loss": 0.1199, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14399679005146027, + "rewards/margins": 12.745423316955566, + "rewards/rejected": -12.60142707824707, + "step": 6750 + }, + { + "epoch": 1.71, + "learning_rate": 2.3911618762288174e-07, + "logits/chosen": -2.6574254035949707, + "logits/rejected": -2.565380811691284, + "logps/chosen": -222.82992553710938, + "logps/rejected": -308.78271484375, + "loss": 0.086, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3074750900268555, + "rewards/margins": 6.395266056060791, + "rewards/rejected": -7.7027411460876465, + "step": 6760 + }, + { + "epoch": 1.71, + "learning_rate": 2.3864806666042503e-07, + "logits/chosen": -2.383507490158081, + "logits/rejected": -2.495840311050415, + "logps/chosen": -289.81036376953125, + "logps/rejected": -347.7547302246094, + "loss": 0.0574, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4515106678009033, + "rewards/margins": 7.600310325622559, + "rewards/rejected": -9.051820755004883, + "step": 6770 + }, + { + "epoch": 1.71, + "learning_rate": 2.3817994569796835e-07, + "logits/chosen": -2.4468441009521484, + "logits/rejected": -2.4855294227600098, + "logps/chosen": -219.2752685546875, + "logps/rejected": -316.3480529785156, + "loss": 0.0904, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.7793846130371094, + "rewards/margins": 5.968520641326904, + "rewards/rejected": -9.747904777526855, + "step": 6780 + }, + { + "epoch": 1.72, + "learning_rate": 2.3771182473551166e-07, + "logits/chosen": -2.5500540733337402, + "logits/rejected": -2.5283660888671875, + "logps/chosen": -210.5634307861328, + "logps/rejected": -265.06378173828125, + "loss": 0.1372, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.719752311706543, + "rewards/margins": 7.075891017913818, + "rewards/rejected": -8.79564380645752, + "step": 6790 + }, + { + "epoch": 1.72, + "learning_rate": 2.3724370377305495e-07, + "logits/chosen": -2.453866958618164, + "logits/rejected": -2.2946243286132812, + "logps/chosen": -305.92694091796875, + "logps/rejected": -296.0342712402344, + "loss": 0.0778, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.159287691116333, + "rewards/margins": 6.641240119934082, + "rewards/rejected": -8.800528526306152, + "step": 6800 + }, + { + "epoch": 1.72, + "learning_rate": 2.3677558281059827e-07, + "logits/chosen": -2.5608479976654053, + "logits/rejected": -2.425877571105957, + "logps/chosen": -310.08770751953125, + "logps/rejected": -382.2318420410156, + "loss": 0.0921, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0401217937469482, + "rewards/margins": 7.864357948303223, + "rewards/rejected": -9.904478073120117, + "step": 6810 + }, + { + "epoch": 1.72, + "learning_rate": 2.3630746184814153e-07, + "logits/chosen": -2.493360996246338, + "logits/rejected": -2.489497661590576, + "logps/chosen": -265.77734375, + "logps/rejected": -298.32769775390625, + "loss": 0.0836, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8773454427719116, + "rewards/margins": 7.0327863693237305, + "rewards/rejected": -8.910131454467773, + "step": 6820 + }, + { + "epoch": 1.73, + "learning_rate": 2.3583934088568485e-07, + "logits/chosen": -2.4660093784332275, + "logits/rejected": -2.43588924407959, + "logps/chosen": -270.72589111328125, + "logps/rejected": -375.72332763671875, + "loss": 0.0781, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8436657786369324, + "rewards/margins": 8.026527404785156, + "rewards/rejected": -8.870193481445312, + "step": 6830 + }, + { + "epoch": 1.73, + "learning_rate": 2.3537121992322814e-07, + "logits/chosen": -2.517906665802002, + "logits/rejected": -2.3145463466644287, + "logps/chosen": -289.3856506347656, + "logps/rejected": -351.4459533691406, + "loss": 0.0948, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6315228939056396, + "rewards/margins": 7.835135459899902, + "rewards/rejected": -8.466657638549805, + "step": 6840 + }, + { + "epoch": 1.73, + "learning_rate": 2.3490309896077145e-07, + "logits/chosen": -2.646665573120117, + "logits/rejected": -2.6209945678710938, + "logps/chosen": -337.2581481933594, + "logps/rejected": -352.4207458496094, + "loss": 0.1138, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6701911687850952, + "rewards/margins": 5.531129360198975, + "rewards/rejected": -7.201320648193359, + "step": 6850 + }, + { + "epoch": 1.73, + "learning_rate": 2.3443497799831474e-07, + "logits/chosen": -2.38620924949646, + "logits/rejected": -2.3328304290771484, + "logps/chosen": -249.17434692382812, + "logps/rejected": -329.9270324707031, + "loss": 0.0697, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7047498226165771, + "rewards/margins": 8.854387283325195, + "rewards/rejected": -10.559137344360352, + "step": 6860 + }, + { + "epoch": 1.74, + "learning_rate": 2.3396685703585806e-07, + "logits/chosen": -2.4927468299865723, + "logits/rejected": -2.560324192047119, + "logps/chosen": -180.02432250976562, + "logps/rejected": -261.7504577636719, + "loss": 0.11, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4752345085144043, + "rewards/margins": 5.783717155456543, + "rewards/rejected": -8.258952140808105, + "step": 6870 + }, + { + "epoch": 1.74, + "learning_rate": 2.3349873607340137e-07, + "logits/chosen": -2.5648436546325684, + "logits/rejected": -2.3717598915100098, + "logps/chosen": -277.2408447265625, + "logps/rejected": -287.49285888671875, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4586201310157776, + "rewards/margins": 6.640373229980469, + "rewards/rejected": -7.09899377822876, + "step": 6880 + }, + { + "epoch": 1.74, + "learning_rate": 2.3303061511094466e-07, + "logits/chosen": -2.6955389976501465, + "logits/rejected": -2.5326249599456787, + "logps/chosen": -327.55389404296875, + "logps/rejected": -347.0709533691406, + "loss": 0.0627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5011809468269348, + "rewards/margins": 8.189516067504883, + "rewards/rejected": -8.690695762634277, + "step": 6890 + }, + { + "epoch": 1.74, + "learning_rate": 2.3256249414848798e-07, + "logits/chosen": -2.476616382598877, + "logits/rejected": -2.397434949874878, + "logps/chosen": -242.52053833007812, + "logps/rejected": -335.563232421875, + "loss": 0.194, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2094663381576538, + "rewards/margins": 9.379359245300293, + "rewards/rejected": -10.588825225830078, + "step": 6900 + }, + { + "epoch": 1.75, + "learning_rate": 2.3209437318603127e-07, + "logits/chosen": -2.455580234527588, + "logits/rejected": -2.6129374504089355, + "logps/chosen": -234.93679809570312, + "logps/rejected": -267.9778137207031, + "loss": 0.0831, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7409460544586182, + "rewards/margins": 6.319588661193848, + "rewards/rejected": -7.060534477233887, + "step": 6910 + }, + { + "epoch": 1.75, + "learning_rate": 2.3162625222357456e-07, + "logits/chosen": -2.249671220779419, + "logits/rejected": -2.421679735183716, + "logps/chosen": -220.7417449951172, + "logps/rejected": -348.41351318359375, + "loss": 0.1057, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4988008141517639, + "rewards/margins": 9.099369049072266, + "rewards/rejected": -9.598170280456543, + "step": 6920 + }, + { + "epoch": 1.75, + "learning_rate": 2.3115813126111785e-07, + "logits/chosen": -2.5640151500701904, + "logits/rejected": -2.563455820083618, + "logps/chosen": -228.15603637695312, + "logps/rejected": -277.2568359375, + "loss": 0.1121, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2560606002807617, + "rewards/margins": 6.815365791320801, + "rewards/rejected": -7.071427345275879, + "step": 6930 + }, + { + "epoch": 1.75, + "learning_rate": 2.3069001029866116e-07, + "logits/chosen": -2.3587443828582764, + "logits/rejected": -2.3043508529663086, + "logps/chosen": -257.75775146484375, + "logps/rejected": -232.22909545898438, + "loss": 0.104, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6903194189071655, + "rewards/margins": 5.616563320159912, + "rewards/rejected": -7.306881904602051, + "step": 6940 + }, + { + "epoch": 1.76, + "learning_rate": 2.3022188933620445e-07, + "logits/chosen": -2.5451407432556152, + "logits/rejected": -2.4094347953796387, + "logps/chosen": -291.5643005371094, + "logps/rejected": -332.65875244140625, + "loss": 0.0361, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5125554800033569, + "rewards/margins": 7.864842414855957, + "rewards/rejected": -8.377397537231445, + "step": 6950 + }, + { + "epoch": 1.76, + "learning_rate": 2.2975376837374777e-07, + "logits/chosen": -2.498363971710205, + "logits/rejected": -2.5168495178222656, + "logps/chosen": -276.8007507324219, + "logps/rejected": -270.8402099609375, + "loss": 0.0863, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0414700023829937, + "rewards/margins": 7.689317226409912, + "rewards/rejected": -7.647847652435303, + "step": 6960 + }, + { + "epoch": 1.76, + "learning_rate": 2.2928564741129106e-07, + "logits/chosen": -2.657831907272339, + "logits/rejected": -2.4639506340026855, + "logps/chosen": -412.9681091308594, + "logps/rejected": -304.3277282714844, + "loss": 0.0825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8967636227607727, + "rewards/margins": 6.674870491027832, + "rewards/rejected": -7.571633815765381, + "step": 6970 + }, + { + "epoch": 1.76, + "learning_rate": 2.2881752644883437e-07, + "logits/chosen": -2.576490640640259, + "logits/rejected": -2.5533933639526367, + "logps/chosen": -324.05462646484375, + "logps/rejected": -329.29339599609375, + "loss": 0.1431, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7502419352531433, + "rewards/margins": 7.129467964172363, + "rewards/rejected": -7.8797101974487305, + "step": 6980 + }, + { + "epoch": 1.77, + "learning_rate": 2.283494054863777e-07, + "logits/chosen": -2.5980420112609863, + "logits/rejected": -2.373439311981201, + "logps/chosen": -349.974365234375, + "logps/rejected": -433.7562561035156, + "loss": 0.0873, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7957502007484436, + "rewards/margins": 9.475018501281738, + "rewards/rejected": -8.679269790649414, + "step": 6990 + }, + { + "epoch": 1.77, + "learning_rate": 2.2788128452392098e-07, + "logits/chosen": -2.405667543411255, + "logits/rejected": -2.24853515625, + "logps/chosen": -247.5191650390625, + "logps/rejected": -276.60723876953125, + "loss": 0.0855, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1435565948486328, + "rewards/margins": 6.712456703186035, + "rewards/rejected": -7.856014251708984, + "step": 7000 + }, + { + "epoch": 1.77, + "learning_rate": 2.274131635614643e-07, + "logits/chosen": -2.345731258392334, + "logits/rejected": -2.2325825691223145, + "logps/chosen": -295.4849548339844, + "logps/rejected": -341.0472412109375, + "loss": 0.0638, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4678739011287689, + "rewards/margins": 7.9846062660217285, + "rewards/rejected": -7.516732215881348, + "step": 7010 + }, + { + "epoch": 1.77, + "learning_rate": 2.2694504259900756e-07, + "logits/chosen": -2.706740617752075, + "logits/rejected": -2.68070650100708, + "logps/chosen": -251.5078125, + "logps/rejected": -267.70477294921875, + "loss": 0.0968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2914077639579773, + "rewards/margins": 6.667860507965088, + "rewards/rejected": -6.959267616271973, + "step": 7020 + }, + { + "epoch": 1.78, + "learning_rate": 2.2647692163655087e-07, + "logits/chosen": -2.697540521621704, + "logits/rejected": -2.433387517929077, + "logps/chosen": -240.4569091796875, + "logps/rejected": -249.18798828125, + "loss": 0.0813, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3658664226531982, + "rewards/margins": 5.591639518737793, + "rewards/rejected": -6.957505702972412, + "step": 7030 + }, + { + "epoch": 1.78, + "learning_rate": 2.2600880067409416e-07, + "logits/chosen": -2.5349862575531006, + "logits/rejected": -2.370959758758545, + "logps/chosen": -289.0957336425781, + "logps/rejected": -315.78192138671875, + "loss": 0.1125, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2642221450805664, + "rewards/margins": 8.159451484680176, + "rewards/rejected": -9.423672676086426, + "step": 7040 + }, + { + "epoch": 1.78, + "learning_rate": 2.2554067971163748e-07, + "logits/chosen": -2.5603203773498535, + "logits/rejected": -2.5768723487854004, + "logps/chosen": -293.04669189453125, + "logps/rejected": -282.08416748046875, + "loss": 0.0597, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6653440594673157, + "rewards/margins": 6.230250358581543, + "rewards/rejected": -6.895594120025635, + "step": 7050 + }, + { + "epoch": 1.78, + "learning_rate": 2.2507255874918077e-07, + "logits/chosen": -2.4578702449798584, + "logits/rejected": -2.349565029144287, + "logps/chosen": -191.6605682373047, + "logps/rejected": -267.44464111328125, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3502098023891449, + "rewards/margins": 9.301736831665039, + "rewards/rejected": -9.651947975158691, + "step": 7060 + }, + { + "epoch": 1.79, + "learning_rate": 2.2460443778672408e-07, + "logits/chosen": -2.788151264190674, + "logits/rejected": -2.52620530128479, + "logps/chosen": -288.56158447265625, + "logps/rejected": -306.52960205078125, + "loss": 0.098, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.22187399864196777, + "rewards/margins": 8.754189491271973, + "rewards/rejected": -8.97606372833252, + "step": 7070 + }, + { + "epoch": 1.79, + "learning_rate": 2.2413631682426737e-07, + "logits/chosen": -2.605219602584839, + "logits/rejected": -2.574493885040283, + "logps/chosen": -331.01458740234375, + "logps/rejected": -378.16302490234375, + "loss": 0.0972, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7465782165527344, + "rewards/margins": 11.760537147521973, + "rewards/rejected": -10.013957023620605, + "step": 7080 + }, + { + "epoch": 1.79, + "learning_rate": 2.236681958618107e-07, + "logits/chosen": -2.598388195037842, + "logits/rejected": -2.4072489738464355, + "logps/chosen": -294.28790283203125, + "logps/rejected": -289.75323486328125, + "loss": 0.1413, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.3980717062950134, + "rewards/margins": 7.004315376281738, + "rewards/rejected": -7.4023871421813965, + "step": 7090 + }, + { + "epoch": 1.79, + "learning_rate": 2.23200074899354e-07, + "logits/chosen": -2.652200937271118, + "logits/rejected": -2.6423587799072266, + "logps/chosen": -267.423828125, + "logps/rejected": -380.0479431152344, + "loss": 0.08, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3439788520336151, + "rewards/margins": 7.537928104400635, + "rewards/rejected": -7.881906986236572, + "step": 7100 + }, + { + "epoch": 1.8, + "learning_rate": 2.227319539368973e-07, + "logits/chosen": -2.6082496643066406, + "logits/rejected": -2.4751124382019043, + "logps/chosen": -238.06454467773438, + "logps/rejected": -234.53146362304688, + "loss": 0.0793, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09598349034786224, + "rewards/margins": 6.674858093261719, + "rewards/rejected": -6.578874111175537, + "step": 7110 + }, + { + "epoch": 1.8, + "learning_rate": 2.2226383297444058e-07, + "logits/chosen": -2.6969308853149414, + "logits/rejected": -2.634843349456787, + "logps/chosen": -292.68414306640625, + "logps/rejected": -342.77752685546875, + "loss": 0.1412, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7074816226959229, + "rewards/margins": 7.93694543838501, + "rewards/rejected": -8.644426345825195, + "step": 7120 + }, + { + "epoch": 1.8, + "learning_rate": 2.2179571201198387e-07, + "logits/chosen": -2.823737382888794, + "logits/rejected": -2.614729404449463, + "logps/chosen": -326.89892578125, + "logps/rejected": -340.80279541015625, + "loss": 0.1154, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7879875898361206, + "rewards/margins": 8.97514533996582, + "rewards/rejected": -8.18715763092041, + "step": 7130 + }, + { + "epoch": 1.8, + "learning_rate": 2.213275910495272e-07, + "logits/chosen": -2.5659549236297607, + "logits/rejected": -2.409237861633301, + "logps/chosen": -289.281005859375, + "logps/rejected": -292.24603271484375, + "loss": 0.1, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.997495412826538, + "rewards/margins": 9.539546012878418, + "rewards/rejected": -7.542050838470459, + "step": 7140 + }, + { + "epoch": 1.81, + "learning_rate": 2.2085947008707048e-07, + "logits/chosen": -2.7370827198028564, + "logits/rejected": -2.5288028717041016, + "logps/chosen": -210.69241333007812, + "logps/rejected": -243.98434448242188, + "loss": 0.1095, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.514499306678772, + "rewards/margins": 5.825979709625244, + "rewards/rejected": -7.340478420257568, + "step": 7150 + }, + { + "epoch": 1.81, + "learning_rate": 2.203913491246138e-07, + "logits/chosen": -2.6789844036102295, + "logits/rejected": -2.664365530014038, + "logps/chosen": -300.47686767578125, + "logps/rejected": -307.5674743652344, + "loss": 0.097, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.961271047592163, + "rewards/margins": 6.479406833648682, + "rewards/rejected": -8.440677642822266, + "step": 7160 + }, + { + "epoch": 1.81, + "learning_rate": 2.1992322816215708e-07, + "logits/chosen": -2.6289384365081787, + "logits/rejected": -2.4877371788024902, + "logps/chosen": -264.2879943847656, + "logps/rejected": -271.01910400390625, + "loss": 0.0916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2111540585756302, + "rewards/margins": 7.823189735412598, + "rewards/rejected": -8.034343719482422, + "step": 7170 + }, + { + "epoch": 1.81, + "learning_rate": 2.194551071997004e-07, + "logits/chosen": -2.6112618446350098, + "logits/rejected": -2.5769357681274414, + "logps/chosen": -301.84149169921875, + "logps/rejected": -337.9112243652344, + "loss": 0.0946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.090156689286232, + "rewards/margins": 8.0753173828125, + "rewards/rejected": -8.165472984313965, + "step": 7180 + }, + { + "epoch": 1.82, + "learning_rate": 2.189869862372437e-07, + "logits/chosen": -2.807898998260498, + "logits/rejected": -2.5814788341522217, + "logps/chosen": -289.78948974609375, + "logps/rejected": -307.60565185546875, + "loss": 0.0797, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6811469793319702, + "rewards/margins": 6.347805976867676, + "rewards/rejected": -7.028953552246094, + "step": 7190 + }, + { + "epoch": 1.82, + "learning_rate": 2.18518865274787e-07, + "logits/chosen": -2.5731379985809326, + "logits/rejected": -2.39717173576355, + "logps/chosen": -255.71981811523438, + "logps/rejected": -259.46124267578125, + "loss": 0.1267, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0866820812225342, + "rewards/margins": 5.789783000946045, + "rewards/rejected": -6.87646484375, + "step": 7200 + }, + { + "epoch": 1.82, + "learning_rate": 2.1805074431233032e-07, + "logits/chosen": -2.474670886993408, + "logits/rejected": -2.4122226238250732, + "logps/chosen": -314.0117492675781, + "logps/rejected": -416.59649658203125, + "loss": 0.0809, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.323627233505249, + "rewards/margins": 11.513708114624023, + "rewards/rejected": -10.190080642700195, + "step": 7210 + }, + { + "epoch": 1.83, + "learning_rate": 2.1758262334987358e-07, + "logits/chosen": -2.6376280784606934, + "logits/rejected": -2.7935502529144287, + "logps/chosen": -304.44403076171875, + "logps/rejected": -479.2804260253906, + "loss": 0.0759, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.33253157138824463, + "rewards/margins": 10.46351432800293, + "rewards/rejected": -10.130983352661133, + "step": 7220 + }, + { + "epoch": 1.83, + "learning_rate": 2.171145023874169e-07, + "logits/chosen": -2.5788180828094482, + "logits/rejected": -2.365527391433716, + "logps/chosen": -329.9599914550781, + "logps/rejected": -326.90673828125, + "loss": 0.0777, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3725981116294861, + "rewards/margins": 6.738213539123535, + "rewards/rejected": -7.110811710357666, + "step": 7230 + }, + { + "epoch": 1.83, + "learning_rate": 2.166463814249602e-07, + "logits/chosen": -2.4922292232513428, + "logits/rejected": -2.5368895530700684, + "logps/chosen": -225.52914428710938, + "logps/rejected": -326.6394348144531, + "loss": 0.1136, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.11458321660757065, + "rewards/margins": 7.7642316818237305, + "rewards/rejected": -7.878814697265625, + "step": 7240 + }, + { + "epoch": 1.83, + "learning_rate": 2.161782604625035e-07, + "logits/chosen": -2.851933240890503, + "logits/rejected": -2.7211337089538574, + "logps/chosen": -320.7086486816406, + "logps/rejected": -369.9259033203125, + "loss": 0.0703, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8477370142936707, + "rewards/margins": 8.635969161987305, + "rewards/rejected": -7.788232326507568, + "step": 7250 + }, + { + "epoch": 1.84, + "learning_rate": 2.157101395000468e-07, + "logits/chosen": -2.5511319637298584, + "logits/rejected": -2.5334312915802, + "logps/chosen": -306.78057861328125, + "logps/rejected": -398.1654052734375, + "loss": 0.124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13714860379695892, + "rewards/margins": 10.0731201171875, + "rewards/rejected": -10.210268020629883, + "step": 7260 + }, + { + "epoch": 1.84, + "learning_rate": 2.152420185375901e-07, + "logits/chosen": -2.6786410808563232, + "logits/rejected": -2.5691006183624268, + "logps/chosen": -236.7671356201172, + "logps/rejected": -323.62188720703125, + "loss": 0.0962, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0288159847259521, + "rewards/margins": 8.96338176727295, + "rewards/rejected": -9.992198944091797, + "step": 7270 + }, + { + "epoch": 1.84, + "learning_rate": 2.147738975751334e-07, + "logits/chosen": -2.6515941619873047, + "logits/rejected": -2.3960585594177246, + "logps/chosen": -273.30029296875, + "logps/rejected": -370.7136535644531, + "loss": 0.0744, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4314087927341461, + "rewards/margins": 9.314390182495117, + "rewards/rejected": -8.88298225402832, + "step": 7280 + }, + { + "epoch": 1.84, + "learning_rate": 2.1430577661267671e-07, + "logits/chosen": -2.606426239013672, + "logits/rejected": -2.6145050525665283, + "logps/chosen": -338.6814270019531, + "logps/rejected": -360.7559814453125, + "loss": 0.1138, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5850367546081543, + "rewards/margins": 11.156438827514648, + "rewards/rejected": -9.571401596069336, + "step": 7290 + }, + { + "epoch": 1.85, + "learning_rate": 2.1383765565022e-07, + "logits/chosen": -2.4672369956970215, + "logits/rejected": -2.5930633544921875, + "logps/chosen": -213.44735717773438, + "logps/rejected": -447.8428649902344, + "loss": 0.0545, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3330296576023102, + "rewards/margins": 9.776864051818848, + "rewards/rejected": -9.443833351135254, + "step": 7300 + }, + { + "epoch": 1.85, + "learning_rate": 2.1336953468776332e-07, + "logits/chosen": -2.655038595199585, + "logits/rejected": -2.5842857360839844, + "logps/chosen": -210.86434936523438, + "logps/rejected": -270.0993957519531, + "loss": 0.1019, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6795251369476318, + "rewards/margins": 6.547571659088135, + "rewards/rejected": -7.2270965576171875, + "step": 7310 + }, + { + "epoch": 1.85, + "learning_rate": 2.1290141372530663e-07, + "logits/chosen": -2.729170083999634, + "logits/rejected": -2.6793854236602783, + "logps/chosen": -344.52044677734375, + "logps/rejected": -432.5497131347656, + "loss": 0.0438, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.254820317029953, + "rewards/margins": 9.785828590393066, + "rewards/rejected": -9.531007766723633, + "step": 7320 + }, + { + "epoch": 1.85, + "learning_rate": 2.124332927628499e-07, + "logits/chosen": -2.456355571746826, + "logits/rejected": -2.330481767654419, + "logps/chosen": -214.1356964111328, + "logps/rejected": -252.86752319335938, + "loss": 0.0637, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11180851608514786, + "rewards/margins": 7.105473518371582, + "rewards/rejected": -7.217282772064209, + "step": 7330 + }, + { + "epoch": 1.86, + "learning_rate": 2.119651718003932e-07, + "logits/chosen": -2.4621005058288574, + "logits/rejected": -2.4193661212921143, + "logps/chosen": -273.9238586425781, + "logps/rejected": -351.8313903808594, + "loss": 0.1036, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.4636752605438232, + "rewards/margins": 8.42043685913086, + "rewards/rejected": -9.884112358093262, + "step": 7340 + }, + { + "epoch": 1.86, + "learning_rate": 2.114970508379365e-07, + "logits/chosen": -2.611341714859009, + "logits/rejected": -2.4828543663024902, + "logps/chosen": -271.2765808105469, + "logps/rejected": -346.5517272949219, + "loss": 0.0514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7445017695426941, + "rewards/margins": 7.4684295654296875, + "rewards/rejected": -8.212930679321289, + "step": 7350 + }, + { + "epoch": 1.86, + "learning_rate": 2.1102892987547982e-07, + "logits/chosen": -2.58764910697937, + "logits/rejected": -2.609360694885254, + "logps/chosen": -235.40536499023438, + "logps/rejected": -340.46533203125, + "loss": 0.0732, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4518706798553467, + "rewards/margins": 7.273972988128662, + "rewards/rejected": -7.725844383239746, + "step": 7360 + }, + { + "epoch": 1.86, + "learning_rate": 2.105608089130231e-07, + "logits/chosen": -2.7084922790527344, + "logits/rejected": -2.6220972537994385, + "logps/chosen": -262.4686584472656, + "logps/rejected": -316.953125, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7665785551071167, + "rewards/margins": 8.81516170501709, + "rewards/rejected": -9.581741333007812, + "step": 7370 + }, + { + "epoch": 1.87, + "learning_rate": 2.1009268795056642e-07, + "logits/chosen": -2.3753809928894043, + "logits/rejected": -2.2589664459228516, + "logps/chosen": -272.1087951660156, + "logps/rejected": -286.30279541015625, + "loss": 0.0677, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8401474952697754, + "rewards/margins": 6.868135929107666, + "rewards/rejected": -7.708283424377441, + "step": 7380 + }, + { + "epoch": 1.87, + "learning_rate": 2.096245669881097e-07, + "logits/chosen": -2.447204113006592, + "logits/rejected": -2.477513551712036, + "logps/chosen": -187.57974243164062, + "logps/rejected": -261.90338134765625, + "loss": 0.1106, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.4475107192993164, + "rewards/margins": 6.116883754730225, + "rewards/rejected": -6.564394474029541, + "step": 7390 + }, + { + "epoch": 1.87, + "learning_rate": 2.0915644602565303e-07, + "logits/chosen": -2.64605712890625, + "logits/rejected": -2.6174445152282715, + "logps/chosen": -266.545654296875, + "logps/rejected": -323.08856201171875, + "loss": 0.0647, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3667122721672058, + "rewards/margins": 9.014110565185547, + "rewards/rejected": -8.647397994995117, + "step": 7400 + }, + { + "epoch": 1.87, + "learning_rate": 2.0868832506319632e-07, + "logits/chosen": -2.531986713409424, + "logits/rejected": -2.54695987701416, + "logps/chosen": -243.2427978515625, + "logps/rejected": -461.019287109375, + "loss": 0.0968, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1118009090423584, + "rewards/margins": 9.954936981201172, + "rewards/rejected": -10.066737174987793, + "step": 7410 + }, + { + "epoch": 1.88, + "learning_rate": 2.0822020410073963e-07, + "logits/chosen": -2.595608711242676, + "logits/rejected": -2.468226194381714, + "logps/chosen": -311.4830627441406, + "logps/rejected": -302.28363037109375, + "loss": 0.0691, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9550451040267944, + "rewards/margins": 6.499562740325928, + "rewards/rejected": -7.454607963562012, + "step": 7420 + }, + { + "epoch": 1.88, + "learning_rate": 2.0775208313828292e-07, + "logits/chosen": -2.715292453765869, + "logits/rejected": -2.664085865020752, + "logps/chosen": -254.71871948242188, + "logps/rejected": -304.4339599609375, + "loss": 0.0522, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2248115837574005, + "rewards/margins": 6.619701385498047, + "rewards/rejected": -6.844512939453125, + "step": 7430 + }, + { + "epoch": 1.88, + "learning_rate": 2.072839621758262e-07, + "logits/chosen": -2.4319987297058105, + "logits/rejected": -2.4993152618408203, + "logps/chosen": -272.27923583984375, + "logps/rejected": -266.61456298828125, + "loss": 0.0518, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.692047119140625, + "rewards/margins": 6.355321884155273, + "rewards/rejected": -8.047369003295898, + "step": 7440 + }, + { + "epoch": 1.88, + "learning_rate": 2.0681584121336953e-07, + "logits/chosen": -2.679299831390381, + "logits/rejected": -2.6855454444885254, + "logps/chosen": -248.4481201171875, + "logps/rejected": -376.2951354980469, + "loss": 0.0956, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.08112213760614395, + "rewards/margins": 7.70186710357666, + "rewards/rejected": -7.782988548278809, + "step": 7450 + }, + { + "epoch": 1.89, + "learning_rate": 2.0634772025091282e-07, + "logits/chosen": -2.3245654106140137, + "logits/rejected": -2.3124887943267822, + "logps/chosen": -237.27978515625, + "logps/rejected": -272.12652587890625, + "loss": 0.0964, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0427350997924805, + "rewards/margins": 6.740478515625, + "rewards/rejected": -8.783212661743164, + "step": 7460 + }, + { + "epoch": 1.89, + "learning_rate": 2.0587959928845613e-07, + "logits/chosen": -2.2935822010040283, + "logits/rejected": -2.4650814533233643, + "logps/chosen": -267.4075927734375, + "logps/rejected": -303.23565673828125, + "loss": 0.049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3434972763061523, + "rewards/margins": 8.509553909301758, + "rewards/rejected": -9.85305118560791, + "step": 7470 + }, + { + "epoch": 1.89, + "learning_rate": 2.0541147832599942e-07, + "logits/chosen": -2.5621564388275146, + "logits/rejected": -2.5696120262145996, + "logps/chosen": -270.3358459472656, + "logps/rejected": -301.4221496582031, + "loss": 0.1126, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2438421249389648, + "rewards/margins": 6.812567234039307, + "rewards/rejected": -8.056408882141113, + "step": 7480 + }, + { + "epoch": 1.89, + "learning_rate": 2.0494335736354274e-07, + "logits/chosen": -2.524179458618164, + "logits/rejected": -2.4801383018493652, + "logps/chosen": -293.1369934082031, + "logps/rejected": -440.0506286621094, + "loss": 0.0783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30194908380508423, + "rewards/margins": 9.120832443237305, + "rewards/rejected": -9.422780990600586, + "step": 7490 + }, + { + "epoch": 1.9, + "learning_rate": 2.0447523640108603e-07, + "logits/chosen": -2.46061635017395, + "logits/rejected": -2.450618267059326, + "logps/chosen": -308.24365234375, + "logps/rejected": -366.5410461425781, + "loss": 0.0504, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.354601502418518, + "rewards/margins": 8.541120529174805, + "rewards/rejected": -9.895721435546875, + "step": 7500 + }, + { + "epoch": 1.9, + "learning_rate": 2.0400711543862934e-07, + "logits/chosen": -2.558875560760498, + "logits/rejected": -2.4312326908111572, + "logps/chosen": -307.14453125, + "logps/rejected": -400.5588684082031, + "loss": 0.0861, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27744191884994507, + "rewards/margins": 12.654280662536621, + "rewards/rejected": -12.376840591430664, + "step": 7510 + }, + { + "epoch": 1.9, + "learning_rate": 2.0353899447617263e-07, + "logits/chosen": -2.7522482872009277, + "logits/rejected": -2.647388458251953, + "logps/chosen": -332.55987548828125, + "logps/rejected": -371.76434326171875, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23856863379478455, + "rewards/margins": 9.279437065124512, + "rewards/rejected": -9.040868759155273, + "step": 7520 + }, + { + "epoch": 1.9, + "learning_rate": 2.0307087351371592e-07, + "logits/chosen": -2.3382153511047363, + "logits/rejected": -2.2408435344696045, + "logps/chosen": -285.0376892089844, + "logps/rejected": -289.60760498046875, + "loss": 0.107, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.278595209121704, + "rewards/margins": 7.759607791900635, + "rewards/rejected": -9.038202285766602, + "step": 7530 + }, + { + "epoch": 1.91, + "learning_rate": 2.0260275255125924e-07, + "logits/chosen": -2.3935012817382812, + "logits/rejected": -2.4903364181518555, + "logps/chosen": -255.71084594726562, + "logps/rejected": -360.177978515625, + "loss": 0.108, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7504819631576538, + "rewards/margins": 7.956636905670166, + "rewards/rejected": -9.70711898803711, + "step": 7540 + }, + { + "epoch": 1.91, + "learning_rate": 2.0213463158880253e-07, + "logits/chosen": -2.4879226684570312, + "logits/rejected": -2.4252655506134033, + "logps/chosen": -328.02435302734375, + "logps/rejected": -385.58612060546875, + "loss": 0.0671, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0912948846817017, + "rewards/margins": 10.426485061645508, + "rewards/rejected": -9.33519172668457, + "step": 7550 + }, + { + "epoch": 1.91, + "learning_rate": 2.0166651062634584e-07, + "logits/chosen": -2.42702317237854, + "logits/rejected": -2.4872756004333496, + "logps/chosen": -293.220703125, + "logps/rejected": -341.46490478515625, + "loss": 0.0857, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6263461112976074, + "rewards/margins": 9.559865951538086, + "rewards/rejected": -12.186211585998535, + "step": 7560 + }, + { + "epoch": 1.91, + "learning_rate": 2.0119838966388913e-07, + "logits/chosen": -2.5302624702453613, + "logits/rejected": -2.486345052719116, + "logps/chosen": -266.01739501953125, + "logps/rejected": -352.3585510253906, + "loss": 0.0657, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9551617503166199, + "rewards/margins": 9.321630477905273, + "rewards/rejected": -10.276793479919434, + "step": 7570 + }, + { + "epoch": 1.92, + "learning_rate": 2.0073026870143245e-07, + "logits/chosen": -2.3224892616271973, + "logits/rejected": -2.189652919769287, + "logps/chosen": -279.7569274902344, + "logps/rejected": -243.7903594970703, + "loss": 0.0699, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.153075933456421, + "rewards/margins": 5.087791919708252, + "rewards/rejected": -6.240868091583252, + "step": 7580 + }, + { + "epoch": 1.92, + "learning_rate": 2.0026214773897574e-07, + "logits/chosen": -2.5566675662994385, + "logits/rejected": -2.4298269748687744, + "logps/chosen": -286.26812744140625, + "logps/rejected": -471.37152099609375, + "loss": 0.0764, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9075733423233032, + "rewards/margins": 9.651289939880371, + "rewards/rejected": -10.558862686157227, + "step": 7590 + }, + { + "epoch": 1.92, + "learning_rate": 1.9979402677651905e-07, + "logits/chosen": -2.4859347343444824, + "logits/rejected": -2.332869052886963, + "logps/chosen": -302.6961364746094, + "logps/rejected": -289.33233642578125, + "loss": 0.1151, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1341279745101929, + "rewards/margins": 8.396425247192383, + "rewards/rejected": -9.530553817749023, + "step": 7600 + }, + { + "epoch": 1.92, + "learning_rate": 1.9932590581406234e-07, + "logits/chosen": -2.49135160446167, + "logits/rejected": -2.3705971240997314, + "logps/chosen": -235.57321166992188, + "logps/rejected": -392.3863220214844, + "loss": 0.097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1869460493326187, + "rewards/margins": 8.586758613586426, + "rewards/rejected": -8.773704528808594, + "step": 7610 + }, + { + "epoch": 1.93, + "learning_rate": 1.9885778485160566e-07, + "logits/chosen": -2.483900547027588, + "logits/rejected": -2.541287660598755, + "logps/chosen": -283.54876708984375, + "logps/rejected": -357.29583740234375, + "loss": 0.0855, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6025287508964539, + "rewards/margins": 7.6768479347229, + "rewards/rejected": -8.279376029968262, + "step": 7620 + }, + { + "epoch": 1.93, + "learning_rate": 1.9838966388914892e-07, + "logits/chosen": -2.3701515197753906, + "logits/rejected": -2.5350399017333984, + "logps/chosen": -291.2542419433594, + "logps/rejected": -467.56756591796875, + "loss": 0.0716, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4438214898109436, + "rewards/margins": 11.625341415405273, + "rewards/rejected": -11.181519508361816, + "step": 7630 + }, + { + "epoch": 1.93, + "learning_rate": 1.9792154292669224e-07, + "logits/chosen": -2.595479726791382, + "logits/rejected": -2.625075578689575, + "logps/chosen": -277.4136047363281, + "logps/rejected": -371.3047180175781, + "loss": 0.1188, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5710428953170776, + "rewards/margins": 10.5836763381958, + "rewards/rejected": -10.012632369995117, + "step": 7640 + }, + { + "epoch": 1.93, + "learning_rate": 1.9745342196423555e-07, + "logits/chosen": -2.531487464904785, + "logits/rejected": -2.466464042663574, + "logps/chosen": -296.78155517578125, + "logps/rejected": -376.5704650878906, + "loss": 0.0856, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18947431445121765, + "rewards/margins": 9.353696823120117, + "rewards/rejected": -9.543170928955078, + "step": 7650 + }, + { + "epoch": 1.94, + "learning_rate": 1.9698530100177884e-07, + "logits/chosen": -2.4659383296966553, + "logits/rejected": -2.4665517807006836, + "logps/chosen": -309.4673156738281, + "logps/rejected": -407.2119140625, + "loss": 0.0671, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9733579754829407, + "rewards/margins": 7.893436431884766, + "rewards/rejected": -8.866793632507324, + "step": 7660 + }, + { + "epoch": 1.94, + "learning_rate": 1.9651718003932216e-07, + "logits/chosen": -2.1824698448181152, + "logits/rejected": -2.095674753189087, + "logps/chosen": -282.605712890625, + "logps/rejected": -280.44488525390625, + "loss": 0.0824, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8554694056510925, + "rewards/margins": 6.769730567932129, + "rewards/rejected": -7.6252007484436035, + "step": 7670 + }, + { + "epoch": 1.94, + "learning_rate": 1.9604905907686545e-07, + "logits/chosen": -2.369614839553833, + "logits/rejected": -2.2198705673217773, + "logps/chosen": -324.4106750488281, + "logps/rejected": -367.41461181640625, + "loss": 0.1118, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5607366561889648, + "rewards/margins": 7.041375160217285, + "rewards/rejected": -8.60211181640625, + "step": 7680 + }, + { + "epoch": 1.94, + "learning_rate": 1.9558093811440876e-07, + "logits/chosen": -2.494131565093994, + "logits/rejected": -2.4976108074188232, + "logps/chosen": -244.96316528320312, + "logps/rejected": -326.7379455566406, + "loss": 0.0849, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.0386927127838135, + "rewards/margins": 6.035378456115723, + "rewards/rejected": -8.074070930480957, + "step": 7690 + }, + { + "epoch": 1.95, + "learning_rate": 1.9511281715195205e-07, + "logits/chosen": -2.569685697555542, + "logits/rejected": -2.408794641494751, + "logps/chosen": -253.5305633544922, + "logps/rejected": -320.5977783203125, + "loss": 0.088, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7515738010406494, + "rewards/margins": 6.714259147644043, + "rewards/rejected": -8.46583366394043, + "step": 7700 + }, + { + "epoch": 1.95, + "learning_rate": 1.9464469618949537e-07, + "logits/chosen": -2.457524538040161, + "logits/rejected": -2.29757022857666, + "logps/chosen": -262.61212158203125, + "logps/rejected": -294.0771484375, + "loss": 0.0492, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6144840717315674, + "rewards/margins": 6.872900485992432, + "rewards/rejected": -8.487383842468262, + "step": 7710 + }, + { + "epoch": 1.95, + "learning_rate": 1.9417657522703866e-07, + "logits/chosen": -2.5344772338867188, + "logits/rejected": -2.473503589630127, + "logps/chosen": -253.647216796875, + "logps/rejected": -355.26422119140625, + "loss": 0.0902, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.004709136672317982, + "rewards/margins": 8.908960342407227, + "rewards/rejected": -8.904251098632812, + "step": 7720 + }, + { + "epoch": 1.95, + "learning_rate": 1.9370845426458197e-07, + "logits/chosen": -2.347165584564209, + "logits/rejected": -2.3553099632263184, + "logps/chosen": -199.9696502685547, + "logps/rejected": -317.84295654296875, + "loss": 0.0735, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.1516036987304688, + "rewards/margins": 6.281300067901611, + "rewards/rejected": -8.432904243469238, + "step": 7730 + }, + { + "epoch": 1.96, + "learning_rate": 1.9324033330212524e-07, + "logits/chosen": -2.6470491886138916, + "logits/rejected": -2.4362094402313232, + "logps/chosen": -248.72830200195312, + "logps/rejected": -278.7564697265625, + "loss": 0.0773, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.181691288948059, + "rewards/margins": 7.5501861572265625, + "rewards/rejected": -8.731878280639648, + "step": 7740 + }, + { + "epoch": 1.96, + "learning_rate": 1.9277221233966855e-07, + "logits/chosen": -2.4118123054504395, + "logits/rejected": -2.496093273162842, + "logps/chosen": -251.16970825195312, + "logps/rejected": -275.74749755859375, + "loss": 0.0745, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6367145776748657, + "rewards/margins": 7.31857442855835, + "rewards/rejected": -8.955288887023926, + "step": 7750 + }, + { + "epoch": 1.96, + "learning_rate": 1.9230409137721187e-07, + "logits/chosen": -2.350545883178711, + "logits/rejected": -2.2462215423583984, + "logps/chosen": -307.57269287109375, + "logps/rejected": -275.0626525878906, + "loss": 0.0645, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4821748733520508, + "rewards/margins": 9.54548454284668, + "rewards/rejected": -10.02765941619873, + "step": 7760 + }, + { + "epoch": 1.96, + "learning_rate": 1.9183597041475516e-07, + "logits/chosen": -2.6003167629241943, + "logits/rejected": -2.481722593307495, + "logps/chosen": -257.1544494628906, + "logps/rejected": -341.75543212890625, + "loss": 0.0574, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.408017873764038, + "rewards/margins": 7.46950626373291, + "rewards/rejected": -8.877523422241211, + "step": 7770 + }, + { + "epoch": 1.97, + "learning_rate": 1.9136784945229847e-07, + "logits/chosen": -2.6348369121551514, + "logits/rejected": -2.52321195602417, + "logps/chosen": -281.85699462890625, + "logps/rejected": -319.59405517578125, + "loss": 0.1264, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7298279404640198, + "rewards/margins": 8.692323684692383, + "rewards/rejected": -9.422151565551758, + "step": 7780 + }, + { + "epoch": 1.97, + "learning_rate": 1.9089972848984176e-07, + "logits/chosen": -2.6521339416503906, + "logits/rejected": -2.540344476699829, + "logps/chosen": -305.10418701171875, + "logps/rejected": -263.56634521484375, + "loss": 0.0689, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8900665044784546, + "rewards/margins": 6.8570051193237305, + "rewards/rejected": -7.747071743011475, + "step": 7790 + }, + { + "epoch": 1.97, + "learning_rate": 1.9043160752738508e-07, + "logits/chosen": -2.67622709274292, + "logits/rejected": -2.6399292945861816, + "logps/chosen": -355.0856628417969, + "logps/rejected": -337.4562683105469, + "loss": 0.0999, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6816396713256836, + "rewards/margins": 7.101809501647949, + "rewards/rejected": -7.783450126647949, + "step": 7800 + }, + { + "epoch": 1.97, + "learning_rate": 1.8996348656492837e-07, + "logits/chosen": -2.376469135284424, + "logits/rejected": -2.32849383354187, + "logps/chosen": -207.29660034179688, + "logps/rejected": -284.89007568359375, + "loss": 0.079, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5479943752288818, + "rewards/margins": 6.303579330444336, + "rewards/rejected": -7.851573944091797, + "step": 7810 + }, + { + "epoch": 1.98, + "learning_rate": 1.8949536560247168e-07, + "logits/chosen": -2.3354849815368652, + "logits/rejected": -2.3749680519104004, + "logps/chosen": -181.08316040039062, + "logps/rejected": -217.5195770263672, + "loss": 0.0984, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.27036499977111816, + "rewards/margins": 6.646331787109375, + "rewards/rejected": -6.916696071624756, + "step": 7820 + }, + { + "epoch": 1.98, + "learning_rate": 1.8902724464001497e-07, + "logits/chosen": -2.676802158355713, + "logits/rejected": -2.4680111408233643, + "logps/chosen": -322.8711853027344, + "logps/rejected": -424.1873474121094, + "loss": 0.1034, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.44950175285339355, + "rewards/margins": 8.20262336730957, + "rewards/rejected": -8.652125358581543, + "step": 7830 + }, + { + "epoch": 1.98, + "learning_rate": 1.8855912367755826e-07, + "logits/chosen": -2.494516372680664, + "logits/rejected": -2.5215835571289062, + "logps/chosen": -227.83432006835938, + "logps/rejected": -321.3638916015625, + "loss": 0.0906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.327940821647644, + "rewards/margins": 6.914625644683838, + "rewards/rejected": -8.24256706237793, + "step": 7840 + }, + { + "epoch": 1.98, + "learning_rate": 1.8809100271510155e-07, + "logits/chosen": -2.4991791248321533, + "logits/rejected": -2.3919272422790527, + "logps/chosen": -243.00161743164062, + "logps/rejected": -311.275390625, + "loss": 0.0558, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8239580392837524, + "rewards/margins": 7.247486114501953, + "rewards/rejected": -9.071443557739258, + "step": 7850 + }, + { + "epoch": 1.99, + "learning_rate": 1.8762288175264487e-07, + "logits/chosen": -2.4490151405334473, + "logits/rejected": -2.3293774127960205, + "logps/chosen": -335.91619873046875, + "logps/rejected": -364.5838928222656, + "loss": 0.1274, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4316998720169067, + "rewards/margins": 7.8372673988342285, + "rewards/rejected": -9.268967628479004, + "step": 7860 + }, + { + "epoch": 1.99, + "learning_rate": 1.8715476079018818e-07, + "logits/chosen": -2.497100591659546, + "logits/rejected": -2.379605770111084, + "logps/chosen": -226.01351928710938, + "logps/rejected": -261.7375793457031, + "loss": 0.2659, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5435857772827148, + "rewards/margins": 6.394545078277588, + "rewards/rejected": -7.9381303787231445, + "step": 7870 + }, + { + "epoch": 1.99, + "learning_rate": 1.8668663982773147e-07, + "logits/chosen": -2.4984803199768066, + "logits/rejected": -2.4084489345550537, + "logps/chosen": -278.9674377441406, + "logps/rejected": -300.3205261230469, + "loss": 0.1404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5822738409042358, + "rewards/margins": 8.082863807678223, + "rewards/rejected": -8.665136337280273, + "step": 7880 + }, + { + "epoch": 1.99, + "learning_rate": 1.862185188652748e-07, + "logits/chosen": -2.2636733055114746, + "logits/rejected": -2.1600308418273926, + "logps/chosen": -235.1582794189453, + "logps/rejected": -430.0575256347656, + "loss": 0.0936, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06023601442575455, + "rewards/margins": 12.86986255645752, + "rewards/rejected": -12.930097579956055, + "step": 7890 + }, + { + "epoch": 2.0, + "learning_rate": 1.8575039790281808e-07, + "logits/chosen": -2.718794822692871, + "logits/rejected": -2.6176838874816895, + "logps/chosen": -297.5869140625, + "logps/rejected": -413.158203125, + "loss": 0.1766, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0013257265090942, + "rewards/margins": 9.500307083129883, + "rewards/rejected": -10.501633644104004, + "step": 7900 + }, + { + "epoch": 2.0, + "learning_rate": 1.852822769403614e-07, + "logits/chosen": -2.5108580589294434, + "logits/rejected": -2.5221803188323975, + "logps/chosen": -266.2281188964844, + "logps/rejected": -489.9151306152344, + "loss": 0.051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1882187128067017, + "rewards/margins": 11.635540008544922, + "rewards/rejected": -12.823759078979492, + "step": 7910 + }, + { + "epoch": 2.0, + "learning_rate": 1.8481415597790468e-07, + "logits/chosen": -2.561251163482666, + "logits/rejected": -2.601534605026245, + "logps/chosen": -256.61260986328125, + "logps/rejected": -371.5245666503906, + "loss": 0.0257, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7856419086456299, + "rewards/margins": 9.267403602600098, + "rewards/rejected": -10.053044319152832, + "step": 7920 + }, + { + "epoch": 2.0, + "learning_rate": 1.84346035015448e-07, + "logits/chosen": -2.456209659576416, + "logits/rejected": -2.4456276893615723, + "logps/chosen": -269.7339782714844, + "logps/rejected": -317.511474609375, + "loss": 0.0271, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5396373867988586, + "rewards/margins": 9.451229095458984, + "rewards/rejected": -9.990866661071777, + "step": 7930 + }, + { + "epoch": 2.01, + "learning_rate": 1.8387791405299126e-07, + "logits/chosen": -2.3685498237609863, + "logits/rejected": -2.5117557048797607, + "logps/chosen": -263.42596435546875, + "logps/rejected": -358.99200439453125, + "loss": 0.0242, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.06902637332677841, + "rewards/margins": 11.954598426818848, + "rewards/rejected": -12.023625373840332, + "step": 7940 + }, + { + "epoch": 2.01, + "learning_rate": 1.8340979309053458e-07, + "logits/chosen": -2.643951892852783, + "logits/rejected": -2.4455976486206055, + "logps/chosen": -264.4395446777344, + "logps/rejected": -310.5169982910156, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0862385481595993, + "rewards/margins": 10.593948364257812, + "rewards/rejected": -10.68018627166748, + "step": 7950 + }, + { + "epoch": 2.01, + "learning_rate": 1.829416721280779e-07, + "logits/chosen": -2.6394400596618652, + "logits/rejected": -2.614877939224243, + "logps/chosen": -283.7574157714844, + "logps/rejected": -358.73687744140625, + "loss": 0.0389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05305292457342148, + "rewards/margins": 9.071102142333984, + "rewards/rejected": -9.12415599822998, + "step": 7960 + }, + { + "epoch": 2.01, + "learning_rate": 1.8247355116562118e-07, + "logits/chosen": -2.4935081005096436, + "logits/rejected": -2.388760805130005, + "logps/chosen": -250.9310760498047, + "logps/rejected": -325.74835205078125, + "loss": 0.0234, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.061603307723999, + "rewards/margins": 6.261511325836182, + "rewards/rejected": -7.32311487197876, + "step": 7970 + }, + { + "epoch": 2.02, + "learning_rate": 1.820054302031645e-07, + "logits/chosen": -2.652278423309326, + "logits/rejected": -2.5909059047698975, + "logps/chosen": -204.5234375, + "logps/rejected": -269.10760498046875, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3194776177406311, + "rewards/margins": 8.187994003295898, + "rewards/rejected": -8.50747299194336, + "step": 7980 + }, + { + "epoch": 2.02, + "learning_rate": 1.815373092407078e-07, + "logits/chosen": -2.5203022956848145, + "logits/rejected": -2.4558236598968506, + "logps/chosen": -259.03582763671875, + "logps/rejected": -283.21807861328125, + "loss": 0.0216, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8201421499252319, + "rewards/margins": 6.852190971374512, + "rewards/rejected": -7.672332763671875, + "step": 7990 + }, + { + "epoch": 2.02, + "learning_rate": 1.810691882782511e-07, + "logits/chosen": -2.5171782970428467, + "logits/rejected": -2.3930742740631104, + "logps/chosen": -310.53656005859375, + "logps/rejected": -249.26687622070312, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.053589820861816406, + "rewards/margins": 8.388738632202148, + "rewards/rejected": -8.335149765014648, + "step": 8000 + }, + { + "epoch": 2.02, + "learning_rate": 1.806010673157944e-07, + "logits/chosen": -2.5312087535858154, + "logits/rejected": -2.4033422470092773, + "logps/chosen": -299.86273193359375, + "logps/rejected": -359.2492370605469, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21362681686878204, + "rewards/margins": 8.105158805847168, + "rewards/rejected": -8.318785667419434, + "step": 8010 + }, + { + "epoch": 2.03, + "learning_rate": 1.801329463533377e-07, + "logits/chosen": -2.6355605125427246, + "logits/rejected": -2.521852970123291, + "logps/chosen": -234.41146850585938, + "logps/rejected": -390.18963623046875, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.062195561826229095, + "rewards/margins": 10.18429183959961, + "rewards/rejected": -10.122096061706543, + "step": 8020 + }, + { + "epoch": 2.03, + "learning_rate": 1.79664825390881e-07, + "logits/chosen": -2.543912410736084, + "logits/rejected": -2.3445167541503906, + "logps/chosen": -274.00604248046875, + "logps/rejected": -264.05810546875, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.859461784362793, + "rewards/margins": 8.486991882324219, + "rewards/rejected": -9.346452713012695, + "step": 8030 + }, + { + "epoch": 2.03, + "learning_rate": 1.791967044284243e-07, + "logits/chosen": -2.34971284866333, + "logits/rejected": -2.3091883659362793, + "logps/chosen": -189.92611694335938, + "logps/rejected": -276.96270751953125, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5734513998031616, + "rewards/margins": 7.828526496887207, + "rewards/rejected": -9.401978492736816, + "step": 8040 + }, + { + "epoch": 2.03, + "learning_rate": 1.7872858346596758e-07, + "logits/chosen": -2.5440075397491455, + "logits/rejected": -2.3190009593963623, + "logps/chosen": -291.01593017578125, + "logps/rejected": -358.1510009765625, + "loss": 0.0401, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44751954078674316, + "rewards/margins": 8.990605354309082, + "rewards/rejected": -9.438124656677246, + "step": 8050 + }, + { + "epoch": 2.04, + "learning_rate": 1.782604625035109e-07, + "logits/chosen": -2.489872455596924, + "logits/rejected": -2.409348249435425, + "logps/chosen": -235.9306640625, + "logps/rejected": -279.5631103515625, + "loss": 0.0205, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1051379442214966, + "rewards/margins": 7.312878608703613, + "rewards/rejected": -8.41801643371582, + "step": 8060 + }, + { + "epoch": 2.04, + "learning_rate": 1.777923415410542e-07, + "logits/chosen": -2.252562999725342, + "logits/rejected": -2.2914912700653076, + "logps/chosen": -222.335693359375, + "logps/rejected": -248.390380859375, + "loss": 0.0174, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3904139995574951, + "rewards/margins": 7.800166130065918, + "rewards/rejected": -9.190579414367676, + "step": 8070 + }, + { + "epoch": 2.04, + "learning_rate": 1.773242205785975e-07, + "logits/chosen": -2.641828775405884, + "logits/rejected": -2.566885232925415, + "logps/chosen": -332.9060363769531, + "logps/rejected": -376.92059326171875, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2503730356693268, + "rewards/margins": 10.505960464477539, + "rewards/rejected": -10.25558853149414, + "step": 8080 + }, + { + "epoch": 2.04, + "learning_rate": 1.768560996161408e-07, + "logits/chosen": -2.5699288845062256, + "logits/rejected": -2.359781503677368, + "logps/chosen": -283.2177734375, + "logps/rejected": -267.2395935058594, + "loss": 0.1899, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8085619807243347, + "rewards/margins": 8.801640510559082, + "rewards/rejected": -9.610200881958008, + "step": 8090 + }, + { + "epoch": 2.05, + "learning_rate": 1.763879786536841e-07, + "logits/chosen": -2.3845903873443604, + "logits/rejected": -2.3299880027770996, + "logps/chosen": -269.281005859375, + "logps/rejected": -364.01177978515625, + "loss": 0.0265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.045176975429058075, + "rewards/margins": 11.046797752380371, + "rewards/rejected": -11.001619338989258, + "step": 8100 + }, + { + "epoch": 2.05, + "learning_rate": 1.7591985769122742e-07, + "logits/chosen": -2.6512389183044434, + "logits/rejected": -2.4565510749816895, + "logps/chosen": -293.4483642578125, + "logps/rejected": -316.6645812988281, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3632938861846924, + "rewards/margins": 7.443239688873291, + "rewards/rejected": -8.806532859802246, + "step": 8110 + }, + { + "epoch": 2.05, + "learning_rate": 1.754517367287707e-07, + "logits/chosen": -2.4491851329803467, + "logits/rejected": -2.3319764137268066, + "logps/chosen": -191.01324462890625, + "logps/rejected": -290.0277404785156, + "loss": 0.0316, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18040040135383606, + "rewards/margins": 10.047754287719727, + "rewards/rejected": -10.228155136108398, + "step": 8120 + }, + { + "epoch": 2.06, + "learning_rate": 1.7498361576631402e-07, + "logits/chosen": -2.584289073944092, + "logits/rejected": -2.410949945449829, + "logps/chosen": -220.06869506835938, + "logps/rejected": -292.25885009765625, + "loss": 0.0287, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.637312889099121, + "rewards/margins": 9.337835311889648, + "rewards/rejected": -10.975146293640137, + "step": 8130 + }, + { + "epoch": 2.06, + "learning_rate": 1.745154948038573e-07, + "logits/chosen": -2.5168070793151855, + "logits/rejected": -2.3504860401153564, + "logps/chosen": -291.9444885253906, + "logps/rejected": -430.6874084472656, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9920564889907837, + "rewards/margins": 9.87191390991211, + "rewards/rejected": -11.863969802856445, + "step": 8140 + }, + { + "epoch": 2.06, + "learning_rate": 1.740473738414006e-07, + "logits/chosen": -2.504371404647827, + "logits/rejected": -2.3894200325012207, + "logps/chosen": -317.1540222167969, + "logps/rejected": -293.84375, + "loss": 0.0229, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0019736289978027, + "rewards/margins": 6.3885931968688965, + "rewards/rejected": -8.390567779541016, + "step": 8150 + }, + { + "epoch": 2.06, + "learning_rate": 1.735792528789439e-07, + "logits/chosen": -2.6090266704559326, + "logits/rejected": -2.4409031867980957, + "logps/chosen": -314.2073669433594, + "logps/rejected": -401.43157958984375, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9937704801559448, + "rewards/margins": 9.054048538208008, + "rewards/rejected": -10.047819137573242, + "step": 8160 + }, + { + "epoch": 2.07, + "learning_rate": 1.731111319164872e-07, + "logits/chosen": -2.321427345275879, + "logits/rejected": -2.2758946418762207, + "logps/chosen": -152.3300018310547, + "logps/rejected": -221.68136596679688, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7852057814598083, + "rewards/margins": 8.7341890335083, + "rewards/rejected": -9.519393920898438, + "step": 8170 + }, + { + "epoch": 2.07, + "learning_rate": 1.7264301095403052e-07, + "logits/chosen": -2.5600805282592773, + "logits/rejected": -2.400702714920044, + "logps/chosen": -327.7169494628906, + "logps/rejected": -362.20648193359375, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4652659893035889, + "rewards/margins": 10.510732650756836, + "rewards/rejected": -11.975997924804688, + "step": 8180 + }, + { + "epoch": 2.07, + "learning_rate": 1.721748899915738e-07, + "logits/chosen": -2.646230459213257, + "logits/rejected": -2.510148525238037, + "logps/chosen": -252.9543914794922, + "logps/rejected": -352.07916259765625, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06406328827142715, + "rewards/margins": 10.646974563598633, + "rewards/rejected": -10.582910537719727, + "step": 8190 + }, + { + "epoch": 2.07, + "learning_rate": 1.7170676902911713e-07, + "logits/chosen": -2.5234501361846924, + "logits/rejected": -2.3625144958496094, + "logps/chosen": -216.58914184570312, + "logps/rejected": -283.11175537109375, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7709776163101196, + "rewards/margins": 7.509818077087402, + "rewards/rejected": -8.280795097351074, + "step": 8200 + }, + { + "epoch": 2.08, + "learning_rate": 1.7123864806666042e-07, + "logits/chosen": -2.4937376976013184, + "logits/rejected": -2.333495616912842, + "logps/chosen": -267.9828796386719, + "logps/rejected": -351.60833740234375, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5620592832565308, + "rewards/margins": 10.002971649169922, + "rewards/rejected": -11.565031051635742, + "step": 8210 + }, + { + "epoch": 2.08, + "learning_rate": 1.7077052710420373e-07, + "logits/chosen": -2.5776846408843994, + "logits/rejected": -2.494020938873291, + "logps/chosen": -236.8033447265625, + "logps/rejected": -289.5408935546875, + "loss": 0.0207, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.069629192352295, + "rewards/margins": 8.876639366149902, + "rewards/rejected": -10.946269035339355, + "step": 8220 + }, + { + "epoch": 2.08, + "learning_rate": 1.7030240614174702e-07, + "logits/chosen": -2.341704845428467, + "logits/rejected": -2.1591591835021973, + "logps/chosen": -207.3115234375, + "logps/rejected": -285.88458251953125, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1234140396118164, + "rewards/margins": 11.760368347167969, + "rewards/rejected": -10.636955261230469, + "step": 8230 + }, + { + "epoch": 2.08, + "learning_rate": 1.6983428517929034e-07, + "logits/chosen": -2.479773998260498, + "logits/rejected": -2.2539517879486084, + "logps/chosen": -366.88201904296875, + "logps/rejected": -370.52618408203125, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2932839393615723, + "rewards/margins": 7.529888153076172, + "rewards/rejected": -9.823171615600586, + "step": 8240 + }, + { + "epoch": 2.09, + "learning_rate": 1.693661642168336e-07, + "logits/chosen": -2.645338296890259, + "logits/rejected": -2.3822028636932373, + "logps/chosen": -244.0205535888672, + "logps/rejected": -349.7413024902344, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49810829758644104, + "rewards/margins": 10.42314338684082, + "rewards/rejected": -9.92503547668457, + "step": 8250 + }, + { + "epoch": 2.09, + "learning_rate": 1.6889804325437692e-07, + "logits/chosen": -2.571413040161133, + "logits/rejected": -2.4334158897399902, + "logps/chosen": -335.8421325683594, + "logps/rejected": -308.39263916015625, + "loss": 0.0186, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9779975414276123, + "rewards/margins": 8.579587936401367, + "rewards/rejected": -11.557584762573242, + "step": 8260 + }, + { + "epoch": 2.09, + "learning_rate": 1.684299222919202e-07, + "logits/chosen": -2.5503220558166504, + "logits/rejected": -2.539371967315674, + "logps/chosen": -194.6260986328125, + "logps/rejected": -334.47161865234375, + "loss": 0.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26310062408447266, + "rewards/margins": 11.362150192260742, + "rewards/rejected": -11.099050521850586, + "step": 8270 + }, + { + "epoch": 2.09, + "learning_rate": 1.6796180132946352e-07, + "logits/chosen": -2.3966946601867676, + "logits/rejected": -2.3247835636138916, + "logps/chosen": -344.1413269042969, + "logps/rejected": -380.3586730957031, + "loss": 0.062, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.485003799200058, + "rewards/margins": 8.122628211975098, + "rewards/rejected": -8.60763168334961, + "step": 8280 + }, + { + "epoch": 2.1, + "learning_rate": 1.6749368036700684e-07, + "logits/chosen": -2.525015354156494, + "logits/rejected": -2.412067413330078, + "logps/chosen": -264.25482177734375, + "logps/rejected": -356.77655029296875, + "loss": 0.0736, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21086089313030243, + "rewards/margins": 9.446159362792969, + "rewards/rejected": -9.657018661499023, + "step": 8290 + }, + { + "epoch": 2.1, + "learning_rate": 1.6702555940455013e-07, + "logits/chosen": -2.270688056945801, + "logits/rejected": -2.3070790767669678, + "logps/chosen": -185.3845977783203, + "logps/rejected": -337.3004455566406, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5996869802474976, + "rewards/margins": 10.957901000976562, + "rewards/rejected": -12.557588577270508, + "step": 8300 + }, + { + "epoch": 2.1, + "learning_rate": 1.6655743844209344e-07, + "logits/chosen": -2.582287073135376, + "logits/rejected": -2.5308005809783936, + "logps/chosen": -279.72979736328125, + "logps/rejected": -383.0921325683594, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.091134786605835, + "rewards/margins": 9.544285774230957, + "rewards/rejected": -10.635419845581055, + "step": 8310 + }, + { + "epoch": 2.1, + "learning_rate": 1.6608931747963673e-07, + "logits/chosen": -2.496814727783203, + "logits/rejected": -2.4529342651367188, + "logps/chosen": -347.9647521972656, + "logps/rejected": -375.94781494140625, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0510023832321167, + "rewards/margins": 10.772096633911133, + "rewards/rejected": -10.721094131469727, + "step": 8320 + }, + { + "epoch": 2.11, + "learning_rate": 1.6562119651718005e-07, + "logits/chosen": -2.5629899501800537, + "logits/rejected": -2.434959888458252, + "logps/chosen": -246.48849487304688, + "logps/rejected": -348.2873229980469, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9217214584350586, + "rewards/margins": 10.495768547058105, + "rewards/rejected": -11.417490005493164, + "step": 8330 + }, + { + "epoch": 2.11, + "learning_rate": 1.6515307555472334e-07, + "logits/chosen": -2.3305695056915283, + "logits/rejected": -2.317716598510742, + "logps/chosen": -251.8772430419922, + "logps/rejected": -313.6505432128906, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3803676664829254, + "rewards/margins": 9.625941276550293, + "rewards/rejected": -10.006309509277344, + "step": 8340 + }, + { + "epoch": 2.11, + "learning_rate": 1.6468495459226663e-07, + "logits/chosen": -2.4659130573272705, + "logits/rejected": -2.4751675128936768, + "logps/chosen": -156.78298950195312, + "logps/rejected": -280.6018981933594, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27651506662368774, + "rewards/margins": 10.506505012512207, + "rewards/rejected": -10.78302001953125, + "step": 8350 + }, + { + "epoch": 2.11, + "learning_rate": 1.6421683362980992e-07, + "logits/chosen": -2.323483943939209, + "logits/rejected": -2.4297595024108887, + "logps/chosen": -195.44876098632812, + "logps/rejected": -443.52691650390625, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0662974119186401, + "rewards/margins": 14.12694263458252, + "rewards/rejected": -15.19324016571045, + "step": 8360 + }, + { + "epoch": 2.12, + "learning_rate": 1.6374871266735323e-07, + "logits/chosen": -2.4566922187805176, + "logits/rejected": -2.3604483604431152, + "logps/chosen": -276.23419189453125, + "logps/rejected": -290.4494323730469, + "loss": 0.0108, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7056076526641846, + "rewards/margins": 10.065206527709961, + "rewards/rejected": -11.77081298828125, + "step": 8370 + }, + { + "epoch": 2.12, + "learning_rate": 1.6328059170489652e-07, + "logits/chosen": -2.63828706741333, + "logits/rejected": -2.462395429611206, + "logps/chosen": -289.7229919433594, + "logps/rejected": -303.6163330078125, + "loss": 0.0217, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.912933349609375, + "rewards/margins": 7.971318244934082, + "rewards/rejected": -9.88425064086914, + "step": 8380 + }, + { + "epoch": 2.12, + "learning_rate": 1.6281247074243984e-07, + "logits/chosen": -2.365940809249878, + "logits/rejected": -2.3253707885742188, + "logps/chosen": -275.8111877441406, + "logps/rejected": -374.54693603515625, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3288636207580566, + "rewards/margins": 12.161454200744629, + "rewards/rejected": -13.490318298339844, + "step": 8390 + }, + { + "epoch": 2.12, + "learning_rate": 1.6234434977998315e-07, + "logits/chosen": -2.3862948417663574, + "logits/rejected": -2.3025941848754883, + "logps/chosen": -292.1995849609375, + "logps/rejected": -398.25225830078125, + "loss": 0.0518, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.936740517616272, + "rewards/margins": 10.562994003295898, + "rewards/rejected": -12.499734878540039, + "step": 8400 + }, + { + "epoch": 2.13, + "learning_rate": 1.6187622881752644e-07, + "logits/chosen": -2.454590320587158, + "logits/rejected": -2.471228837966919, + "logps/chosen": -235.47763061523438, + "logps/rejected": -368.3105163574219, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.234427452087402, + "rewards/margins": 9.330568313598633, + "rewards/rejected": -13.564994812011719, + "step": 8410 + }, + { + "epoch": 2.13, + "learning_rate": 1.6140810785506976e-07, + "logits/chosen": -2.5940306186676025, + "logits/rejected": -2.3940608501434326, + "logps/chosen": -293.6297607421875, + "logps/rejected": -333.98248291015625, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9440231323242188, + "rewards/margins": 10.160451889038086, + "rewards/rejected": -12.104475975036621, + "step": 8420 + }, + { + "epoch": 2.13, + "learning_rate": 1.6093998689261305e-07, + "logits/chosen": -2.4853529930114746, + "logits/rejected": -2.4013895988464355, + "logps/chosen": -269.6568298339844, + "logps/rejected": -433.7679748535156, + "loss": 0.019, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2059710025787354, + "rewards/margins": 12.60078239440918, + "rewards/rejected": -13.806753158569336, + "step": 8430 + }, + { + "epoch": 2.13, + "learning_rate": 1.6047186593015636e-07, + "logits/chosen": -2.5733630657196045, + "logits/rejected": -2.41157865524292, + "logps/chosen": -260.69903564453125, + "logps/rejected": -291.9221496582031, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5500335693359375, + "rewards/margins": 8.757722854614258, + "rewards/rejected": -11.307757377624512, + "step": 8440 + }, + { + "epoch": 2.14, + "learning_rate": 1.6000374496769963e-07, + "logits/chosen": -2.5668065547943115, + "logits/rejected": -2.5014026165008545, + "logps/chosen": -255.66799926757812, + "logps/rejected": -373.7508239746094, + "loss": 0.1027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7974373698234558, + "rewards/margins": 11.770620346069336, + "rewards/rejected": -12.568058013916016, + "step": 8450 + }, + { + "epoch": 2.14, + "learning_rate": 1.5953562400524294e-07, + "logits/chosen": -2.6310415267944336, + "logits/rejected": -2.5319175720214844, + "logps/chosen": -262.56201171875, + "logps/rejected": -301.62335205078125, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2064162492752075, + "rewards/margins": 10.903173446655273, + "rewards/rejected": -12.109588623046875, + "step": 8460 + }, + { + "epoch": 2.14, + "learning_rate": 1.5906750304278623e-07, + "logits/chosen": -2.5703375339508057, + "logits/rejected": -2.5990467071533203, + "logps/chosen": -272.7796630859375, + "logps/rejected": -399.44158935546875, + "loss": 0.0192, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.6385996341705322, + "rewards/margins": 10.162915229797363, + "rewards/rejected": -12.801515579223633, + "step": 8470 + }, + { + "epoch": 2.14, + "learning_rate": 1.5859938208032955e-07, + "logits/chosen": -2.532130718231201, + "logits/rejected": -2.4024813175201416, + "logps/chosen": -396.67840576171875, + "logps/rejected": -454.4134216308594, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2578914165496826, + "rewards/margins": 10.379530906677246, + "rewards/rejected": -11.637422561645508, + "step": 8480 + }, + { + "epoch": 2.15, + "learning_rate": 1.5813126111787284e-07, + "logits/chosen": -2.4982962608337402, + "logits/rejected": -2.474616289138794, + "logps/chosen": -397.2320251464844, + "logps/rejected": -552.1438598632812, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.637967348098755, + "rewards/margins": 12.508705139160156, + "rewards/rejected": -15.146673202514648, + "step": 8490 + }, + { + "epoch": 2.15, + "learning_rate": 1.5766314015541615e-07, + "logits/chosen": -2.7057583332061768, + "logits/rejected": -2.437467575073242, + "logps/chosen": -283.351318359375, + "logps/rejected": -287.83544921875, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8018553853034973, + "rewards/margins": 9.905305862426758, + "rewards/rejected": -10.707162857055664, + "step": 8500 + }, + { + "epoch": 2.15, + "learning_rate": 1.5719501919295947e-07, + "logits/chosen": -2.7296359539031982, + "logits/rejected": -2.6196982860565186, + "logps/chosen": -330.23516845703125, + "logps/rejected": -404.32794189453125, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6082562804222107, + "rewards/margins": 10.484846115112305, + "rewards/rejected": -11.093101501464844, + "step": 8510 + }, + { + "epoch": 2.15, + "learning_rate": 1.5672689823050276e-07, + "logits/chosen": -2.2502963542938232, + "logits/rejected": -2.183072566986084, + "logps/chosen": -266.4691467285156, + "logps/rejected": -300.840087890625, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8021724820137024, + "rewards/margins": 10.726180076599121, + "rewards/rejected": -11.528352737426758, + "step": 8520 + }, + { + "epoch": 2.16, + "learning_rate": 1.5625877726804607e-07, + "logits/chosen": -2.5576233863830566, + "logits/rejected": -2.457980155944824, + "logps/chosen": -325.43280029296875, + "logps/rejected": -400.6141662597656, + "loss": 0.0794, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25530198216438293, + "rewards/margins": 12.95799446105957, + "rewards/rejected": -12.702692031860352, + "step": 8530 + }, + { + "epoch": 2.16, + "learning_rate": 1.5579065630558936e-07, + "logits/chosen": -2.40440034866333, + "logits/rejected": -2.240015745162964, + "logps/chosen": -312.38092041015625, + "logps/rejected": -383.36187744140625, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0788674354553223, + "rewards/margins": 11.552621841430664, + "rewards/rejected": -13.631490707397461, + "step": 8540 + }, + { + "epoch": 2.16, + "learning_rate": 1.5532253534313268e-07, + "logits/chosen": -2.654876232147217, + "logits/rejected": -2.5984530448913574, + "logps/chosen": -276.11053466796875, + "logps/rejected": -370.93743896484375, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.086482048034668, + "rewards/margins": 10.666714668273926, + "rewards/rejected": -11.753195762634277, + "step": 8550 + }, + { + "epoch": 2.16, + "learning_rate": 1.5485441438067594e-07, + "logits/chosen": -2.6042940616607666, + "logits/rejected": -2.5303900241851807, + "logps/chosen": -276.2106018066406, + "logps/rejected": -304.36370849609375, + "loss": 0.0221, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0825324058532715, + "rewards/margins": 9.108705520629883, + "rewards/rejected": -11.191239356994629, + "step": 8560 + }, + { + "epoch": 2.17, + "learning_rate": 1.5438629341821926e-07, + "logits/chosen": -2.4339449405670166, + "logits/rejected": -2.4021949768066406, + "logps/chosen": -234.2420654296875, + "logps/rejected": -386.6480407714844, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.374173641204834, + "rewards/margins": 10.155722618103027, + "rewards/rejected": -12.529895782470703, + "step": 8570 + }, + { + "epoch": 2.17, + "learning_rate": 1.5391817245576255e-07, + "logits/chosen": -2.5122060775756836, + "logits/rejected": -2.4285624027252197, + "logps/chosen": -229.5922088623047, + "logps/rejected": -280.2808837890625, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24378149211406708, + "rewards/margins": 10.249922752380371, + "rewards/rejected": -10.493703842163086, + "step": 8580 + }, + { + "epoch": 2.17, + "learning_rate": 1.5345005149330586e-07, + "logits/chosen": -2.523165464401245, + "logits/rejected": -2.4486472606658936, + "logps/chosen": -266.5024719238281, + "logps/rejected": -344.3494873046875, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16855347156524658, + "rewards/margins": 12.045059204101562, + "rewards/rejected": -12.213613510131836, + "step": 8590 + }, + { + "epoch": 2.17, + "learning_rate": 1.5298193053084915e-07, + "logits/chosen": -2.5514864921569824, + "logits/rejected": -2.444061040878296, + "logps/chosen": -266.6001892089844, + "logps/rejected": -331.06170654296875, + "loss": 0.0176, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7335854768753052, + "rewards/margins": 9.204207420349121, + "rewards/rejected": -9.937792778015137, + "step": 8600 + }, + { + "epoch": 2.18, + "learning_rate": 1.5251380956839247e-07, + "logits/chosen": -2.603799343109131, + "logits/rejected": -2.4442601203918457, + "logps/chosen": -330.43023681640625, + "logps/rejected": -305.4593811035156, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46179574728012085, + "rewards/margins": 11.207643508911133, + "rewards/rejected": -10.745849609375, + "step": 8610 + }, + { + "epoch": 2.18, + "learning_rate": 1.5204568860593578e-07, + "logits/chosen": -2.5968942642211914, + "logits/rejected": -2.5343058109283447, + "logps/chosen": -282.2979736328125, + "logps/rejected": -334.32244873046875, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5933091640472412, + "rewards/margins": 10.09453010559082, + "rewards/rejected": -11.687838554382324, + "step": 8620 + }, + { + "epoch": 2.18, + "learning_rate": 1.5157756764347907e-07, + "logits/chosen": -2.6372528076171875, + "logits/rejected": -2.5136094093322754, + "logps/chosen": -332.3785705566406, + "logps/rejected": -405.2020263671875, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.054649591445923, + "rewards/margins": 10.335689544677734, + "rewards/rejected": -12.390339851379395, + "step": 8630 + }, + { + "epoch": 2.18, + "learning_rate": 1.511094466810224e-07, + "logits/chosen": -2.5080435276031494, + "logits/rejected": -2.451399326324463, + "logps/chosen": -210.416259765625, + "logps/rejected": -338.97479248046875, + "loss": 0.0698, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2667189836502075, + "rewards/margins": 10.377790451049805, + "rewards/rejected": -11.64450740814209, + "step": 8640 + }, + { + "epoch": 2.19, + "learning_rate": 1.5064132571856568e-07, + "logits/chosen": -2.4267754554748535, + "logits/rejected": -2.339641809463501, + "logps/chosen": -277.78851318359375, + "logps/rejected": -439.1700744628906, + "loss": 0.0304, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.432140827178955, + "rewards/margins": 11.009929656982422, + "rewards/rejected": -13.442071914672852, + "step": 8650 + }, + { + "epoch": 2.19, + "learning_rate": 1.5017320475610897e-07, + "logits/chosen": -2.4750618934631348, + "logits/rejected": -2.5541577339172363, + "logps/chosen": -273.45489501953125, + "logps/rejected": -420.8013610839844, + "loss": 0.0229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6541218757629395, + "rewards/margins": 12.435382843017578, + "rewards/rejected": -13.089506149291992, + "step": 8660 + }, + { + "epoch": 2.19, + "learning_rate": 1.4970508379365226e-07, + "logits/chosen": -2.5706381797790527, + "logits/rejected": -2.381520986557007, + "logps/chosen": -356.1487731933594, + "logps/rejected": -336.7781982421875, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6374567747116089, + "rewards/margins": 11.748361587524414, + "rewards/rejected": -12.385817527770996, + "step": 8670 + }, + { + "epoch": 2.19, + "learning_rate": 1.4923696283119557e-07, + "logits/chosen": -2.29630446434021, + "logits/rejected": -2.1913228034973145, + "logps/chosen": -283.35162353515625, + "logps/rejected": -354.83184814453125, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.598496913909912, + "rewards/margins": 9.002246856689453, + "rewards/rejected": -11.60074234008789, + "step": 8680 + }, + { + "epoch": 2.2, + "learning_rate": 1.4876884186873886e-07, + "logits/chosen": -2.284550428390503, + "logits/rejected": -2.113361358642578, + "logps/chosen": -234.25259399414062, + "logps/rejected": -332.28424072265625, + "loss": 0.0155, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.0471138954162598, + "rewards/margins": 10.27766227722168, + "rewards/rejected": -13.324775695800781, + "step": 8690 + }, + { + "epoch": 2.2, + "learning_rate": 1.4830072090628218e-07, + "logits/chosen": -2.464995861053467, + "logits/rejected": -2.440201759338379, + "logps/chosen": -295.09588623046875, + "logps/rejected": -337.61492919921875, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0475311279296875, + "rewards/margins": 10.91230583190918, + "rewards/rejected": -11.959836959838867, + "step": 8700 + }, + { + "epoch": 2.2, + "learning_rate": 1.4783259994382547e-07, + "logits/chosen": -2.500070571899414, + "logits/rejected": -2.3573880195617676, + "logps/chosen": -325.81646728515625, + "logps/rejected": -481.3152770996094, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9885588884353638, + "rewards/margins": 15.744482040405273, + "rewards/rejected": -14.755925178527832, + "step": 8710 + }, + { + "epoch": 2.2, + "learning_rate": 1.4736447898136878e-07, + "logits/chosen": -2.5980992317199707, + "logits/rejected": -2.4638357162475586, + "logps/chosen": -326.902587890625, + "logps/rejected": -382.9718933105469, + "loss": 0.0241, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2018386870622635, + "rewards/margins": 12.231120109558105, + "rewards/rejected": -12.432960510253906, + "step": 8720 + }, + { + "epoch": 2.21, + "learning_rate": 1.468963580189121e-07, + "logits/chosen": -2.485769748687744, + "logits/rejected": -2.4419784545898438, + "logps/chosen": -312.25067138671875, + "logps/rejected": -351.7271728515625, + "loss": 0.0266, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.832437515258789, + "rewards/margins": 10.075966835021973, + "rewards/rejected": -11.908404350280762, + "step": 8730 + }, + { + "epoch": 2.21, + "learning_rate": 1.4642823705645539e-07, + "logits/chosen": -2.316826105117798, + "logits/rejected": -2.4810423851013184, + "logps/chosen": -236.44052124023438, + "logps/rejected": -311.9570617675781, + "loss": 0.038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.018673300743103, + "rewards/margins": 9.659832954406738, + "rewards/rejected": -10.678506851196289, + "step": 8740 + }, + { + "epoch": 2.21, + "learning_rate": 1.459601160939987e-07, + "logits/chosen": -2.653926372528076, + "logits/rejected": -2.537010908126831, + "logps/chosen": -291.753662109375, + "logps/rejected": -322.4140319824219, + "loss": 0.0246, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0316170454025269, + "rewards/margins": 9.18066692352295, + "rewards/rejected": -10.212284088134766, + "step": 8750 + }, + { + "epoch": 2.21, + "learning_rate": 1.4549199513154197e-07, + "logits/chosen": -2.6024010181427, + "logits/rejected": -2.4828600883483887, + "logps/chosen": -405.0433044433594, + "logps/rejected": -491.9237365722656, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7376415133476257, + "rewards/margins": 12.731585502624512, + "rewards/rejected": -13.46922779083252, + "step": 8760 + }, + { + "epoch": 2.22, + "learning_rate": 1.4502387416908528e-07, + "logits/chosen": -2.42547869682312, + "logits/rejected": -2.299511671066284, + "logps/chosen": -236.38729858398438, + "logps/rejected": -299.8211364746094, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5624053478240967, + "rewards/margins": 9.061653137207031, + "rewards/rejected": -11.624059677124023, + "step": 8770 + }, + { + "epoch": 2.22, + "learning_rate": 1.4455575320662857e-07, + "logits/chosen": -2.3677194118499756, + "logits/rejected": -2.2375476360321045, + "logps/chosen": -293.98370361328125, + "logps/rejected": -419.0365295410156, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0371956825256348, + "rewards/margins": 12.501256942749023, + "rewards/rejected": -14.5384521484375, + "step": 8780 + }, + { + "epoch": 2.22, + "learning_rate": 1.4408763224417189e-07, + "logits/chosen": -2.31605863571167, + "logits/rejected": -2.36974835395813, + "logps/chosen": -233.51315307617188, + "logps/rejected": -291.78143310546875, + "loss": 0.0341, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6936259269714355, + "rewards/margins": 7.989305019378662, + "rewards/rejected": -10.682931900024414, + "step": 8790 + }, + { + "epoch": 2.22, + "learning_rate": 1.4361951128171518e-07, + "logits/chosen": -2.484557628631592, + "logits/rejected": -2.4126553535461426, + "logps/chosen": -246.5937957763672, + "logps/rejected": -323.31671142578125, + "loss": 0.0249, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5201890468597412, + "rewards/margins": 9.29806137084961, + "rewards/rejected": -10.818249702453613, + "step": 8800 + }, + { + "epoch": 2.23, + "learning_rate": 1.431513903192585e-07, + "logits/chosen": -2.5416526794433594, + "logits/rejected": -2.418910264968872, + "logps/chosen": -273.0360107421875, + "logps/rejected": -309.00787353515625, + "loss": 0.0529, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.139587163925171, + "rewards/margins": 11.749044418334961, + "rewards/rejected": -13.888630867004395, + "step": 8810 + }, + { + "epoch": 2.23, + "learning_rate": 1.4268326935680178e-07, + "logits/chosen": -2.2652955055236816, + "logits/rejected": -2.275373697280884, + "logps/chosen": -282.45245361328125, + "logps/rejected": -355.07855224609375, + "loss": 0.011, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4622039794921875, + "rewards/margins": 12.305490493774414, + "rewards/rejected": -13.767694473266602, + "step": 8820 + }, + { + "epoch": 2.23, + "learning_rate": 1.422151483943451e-07, + "logits/chosen": -2.5491325855255127, + "logits/rejected": -2.653744697570801, + "logps/chosen": -300.6777648925781, + "logps/rejected": -386.43768310546875, + "loss": 0.0253, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2964599132537842, + "rewards/margins": 12.887380599975586, + "rewards/rejected": -14.18384075164795, + "step": 8830 + }, + { + "epoch": 2.23, + "learning_rate": 1.417470274318884e-07, + "logits/chosen": -2.336543560028076, + "logits/rejected": -2.199955701828003, + "logps/chosen": -235.3468475341797, + "logps/rejected": -328.2801208496094, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0419225692749023, + "rewards/margins": 9.49148941040039, + "rewards/rejected": -11.53341293334961, + "step": 8840 + }, + { + "epoch": 2.24, + "learning_rate": 1.412789064694317e-07, + "logits/chosen": -2.620922565460205, + "logits/rejected": -2.45328688621521, + "logps/chosen": -279.2070617675781, + "logps/rejected": -362.3523254394531, + "loss": 0.0314, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4809529781341553, + "rewards/margins": 12.773890495300293, + "rewards/rejected": -14.254842758178711, + "step": 8850 + }, + { + "epoch": 2.24, + "learning_rate": 1.4081078550697502e-07, + "logits/chosen": -2.3447766304016113, + "logits/rejected": -2.2119431495666504, + "logps/chosen": -223.18124389648438, + "logps/rejected": -394.81146240234375, + "loss": 0.026, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.897385358810425, + "rewards/margins": 10.753974914550781, + "rewards/rejected": -13.651362419128418, + "step": 8860 + }, + { + "epoch": 2.24, + "learning_rate": 1.4034266454451828e-07, + "logits/chosen": -2.4083456993103027, + "logits/rejected": -2.4523205757141113, + "logps/chosen": -314.6570739746094, + "logps/rejected": -476.2359924316406, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19616852700710297, + "rewards/margins": 15.615038871765137, + "rewards/rejected": -15.418869018554688, + "step": 8870 + }, + { + "epoch": 2.24, + "learning_rate": 1.398745435820616e-07, + "logits/chosen": -2.2582082748413086, + "logits/rejected": -2.3017678260803223, + "logps/chosen": -309.89971923828125, + "logps/rejected": -369.45458984375, + "loss": 0.043, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.068044662475586, + "rewards/margins": 11.423654556274414, + "rewards/rejected": -13.49169921875, + "step": 8880 + }, + { + "epoch": 2.25, + "learning_rate": 1.3940642261960489e-07, + "logits/chosen": -2.507986068725586, + "logits/rejected": -2.3526625633239746, + "logps/chosen": -384.5350036621094, + "logps/rejected": -393.1593933105469, + "loss": 0.0225, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5077223777770996, + "rewards/margins": 10.584321975708008, + "rewards/rejected": -13.09204387664795, + "step": 8890 + }, + { + "epoch": 2.25, + "learning_rate": 1.389383016571482e-07, + "logits/chosen": -2.3579325675964355, + "logits/rejected": -2.460432291030884, + "logps/chosen": -286.2724609375, + "logps/rejected": -325.6170959472656, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48190802335739136, + "rewards/margins": 9.54693603515625, + "rewards/rejected": -10.028843879699707, + "step": 8900 + }, + { + "epoch": 2.25, + "learning_rate": 1.384701806946915e-07, + "logits/chosen": -2.3750059604644775, + "logits/rejected": -2.5095913410186768, + "logps/chosen": -215.9287109375, + "logps/rejected": -340.02880859375, + "loss": 0.0406, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.5825750827789307, + "rewards/margins": 9.099946975708008, + "rewards/rejected": -11.682523727416992, + "step": 8910 + }, + { + "epoch": 2.25, + "learning_rate": 1.380020597322348e-07, + "logits/chosen": -2.4824776649475098, + "logits/rejected": -2.3568711280822754, + "logps/chosen": -240.6768341064453, + "logps/rejected": -319.8079833984375, + "loss": 0.0102, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.2835326194763184, + "rewards/margins": 8.985818862915039, + "rewards/rejected": -12.269351959228516, + "step": 8920 + }, + { + "epoch": 2.26, + "learning_rate": 1.375339387697781e-07, + "logits/chosen": -2.248166561126709, + "logits/rejected": -2.1565823554992676, + "logps/chosen": -220.9330596923828, + "logps/rejected": -338.0259094238281, + "loss": 0.0314, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.05809593200683594, + "rewards/margins": 11.133580207824707, + "rewards/rejected": -11.191675186157227, + "step": 8930 + }, + { + "epoch": 2.26, + "learning_rate": 1.370658178073214e-07, + "logits/chosen": -2.3465335369110107, + "logits/rejected": -2.1472835540771484, + "logps/chosen": -303.6405029296875, + "logps/rejected": -448.4706115722656, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8687835931777954, + "rewards/margins": 15.166003227233887, + "rewards/rejected": -17.034786224365234, + "step": 8940 + }, + { + "epoch": 2.26, + "learning_rate": 1.3659769684486473e-07, + "logits/chosen": -2.6118006706237793, + "logits/rejected": -2.6439757347106934, + "logps/chosen": -399.91021728515625, + "logps/rejected": -482.90924072265625, + "loss": 0.0681, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12734787166118622, + "rewards/margins": 11.891292572021484, + "rewards/rejected": -11.763945579528809, + "step": 8950 + }, + { + "epoch": 2.26, + "learning_rate": 1.3612957588240802e-07, + "logits/chosen": -2.6660571098327637, + "logits/rejected": -2.5263445377349854, + "logps/chosen": -280.5657653808594, + "logps/rejected": -375.1638488769531, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1703410148620605, + "rewards/margins": 9.894917488098145, + "rewards/rejected": -12.065258026123047, + "step": 8960 + }, + { + "epoch": 2.27, + "learning_rate": 1.356614549199513e-07, + "logits/chosen": -2.3960671424865723, + "logits/rejected": -2.389824628829956, + "logps/chosen": -271.875732421875, + "logps/rejected": -421.37152099609375, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11441369354724884, + "rewards/margins": 11.73409366607666, + "rewards/rejected": -11.61967945098877, + "step": 8970 + }, + { + "epoch": 2.27, + "learning_rate": 1.351933339574946e-07, + "logits/chosen": -2.498690605163574, + "logits/rejected": -2.442638874053955, + "logps/chosen": -332.7298889160156, + "logps/rejected": -372.57525634765625, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4244102239608765, + "rewards/margins": 10.737654685974121, + "rewards/rejected": -12.162065505981445, + "step": 8980 + }, + { + "epoch": 2.27, + "learning_rate": 1.347252129950379e-07, + "logits/chosen": -2.467064619064331, + "logits/rejected": -2.436596393585205, + "logps/chosen": -338.3554992675781, + "logps/rejected": -352.9508361816406, + "loss": 0.0307, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0824882984161377, + "rewards/margins": 8.874943733215332, + "rewards/rejected": -10.95743179321289, + "step": 8990 + }, + { + "epoch": 2.28, + "learning_rate": 1.342570920325812e-07, + "logits/chosen": -2.48591685295105, + "logits/rejected": -2.327716827392578, + "logps/chosen": -225.51107788085938, + "logps/rejected": -343.0364990234375, + "loss": 0.0263, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3385100364685059, + "rewards/margins": 9.523209571838379, + "rewards/rejected": -10.861720085144043, + "step": 9000 + }, + { + "epoch": 2.28, + "learning_rate": 1.3378897107012452e-07, + "logits/chosen": -2.4900214672088623, + "logits/rejected": -2.4207189083099365, + "logps/chosen": -335.5738525390625, + "logps/rejected": -325.5471496582031, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3286423683166504, + "rewards/margins": 10.190323829650879, + "rewards/rejected": -12.51896858215332, + "step": 9010 + }, + { + "epoch": 2.28, + "learning_rate": 1.333208501076678e-07, + "logits/chosen": -2.5807812213897705, + "logits/rejected": -2.552722930908203, + "logps/chosen": -298.92108154296875, + "logps/rejected": -359.711181640625, + "loss": 0.0368, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9593040943145752, + "rewards/margins": 9.846508979797363, + "rewards/rejected": -11.805811882019043, + "step": 9020 + }, + { + "epoch": 2.28, + "learning_rate": 1.3285272914521112e-07, + "logits/chosen": -2.588824510574341, + "logits/rejected": -2.516756057739258, + "logps/chosen": -343.57745361328125, + "logps/rejected": -385.2336730957031, + "loss": 0.018, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6453635096549988, + "rewards/margins": 9.960227012634277, + "rewards/rejected": -10.6055908203125, + "step": 9030 + }, + { + "epoch": 2.29, + "learning_rate": 1.3238460818275444e-07, + "logits/chosen": -2.5509066581726074, + "logits/rejected": -2.501462459564209, + "logps/chosen": -232.3785858154297, + "logps/rejected": -356.4541931152344, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5372840762138367, + "rewards/margins": 11.531806945800781, + "rewards/rejected": -12.069090843200684, + "step": 9040 + }, + { + "epoch": 2.29, + "learning_rate": 1.3191648722029773e-07, + "logits/chosen": -2.479043483734131, + "logits/rejected": -2.459160566329956, + "logps/chosen": -336.2101135253906, + "logps/rejected": -355.56170654296875, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3328600525856018, + "rewards/margins": 12.044123649597168, + "rewards/rejected": -11.711263656616211, + "step": 9050 + }, + { + "epoch": 2.29, + "learning_rate": 1.3144836625784104e-07, + "logits/chosen": -2.6534008979797363, + "logits/rejected": -2.392120361328125, + "logps/chosen": -354.12432861328125, + "logps/rejected": -394.28521728515625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8992952108383179, + "rewards/margins": 11.788612365722656, + "rewards/rejected": -12.687907218933105, + "step": 9060 + }, + { + "epoch": 2.29, + "learning_rate": 1.309802452953843e-07, + "logits/chosen": -2.467833995819092, + "logits/rejected": -2.377655029296875, + "logps/chosen": -172.44436645507812, + "logps/rejected": -307.0425109863281, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1732361316680908, + "rewards/margins": 10.656147956848145, + "rewards/rejected": -11.829381942749023, + "step": 9070 + }, + { + "epoch": 2.3, + "learning_rate": 1.3051212433292762e-07, + "logits/chosen": -2.5307626724243164, + "logits/rejected": -2.511751174926758, + "logps/chosen": -279.1362609863281, + "logps/rejected": -463.63214111328125, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1896442025899887, + "rewards/margins": 12.709389686584473, + "rewards/rejected": -12.519744873046875, + "step": 9080 + }, + { + "epoch": 2.3, + "learning_rate": 1.300440033704709e-07, + "logits/chosen": -2.435119390487671, + "logits/rejected": -2.4567675590515137, + "logps/chosen": -257.6720275878906, + "logps/rejected": -317.12664794921875, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.369809627532959, + "rewards/margins": 9.4765625, + "rewards/rejected": -11.846372604370117, + "step": 9090 + }, + { + "epoch": 2.3, + "learning_rate": 1.2957588240801423e-07, + "logits/chosen": -2.7394285202026367, + "logits/rejected": -2.7037463188171387, + "logps/chosen": -427.0330505371094, + "logps/rejected": -442.126220703125, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.711077332496643, + "rewards/margins": 10.261374473571777, + "rewards/rejected": -11.972452163696289, + "step": 9100 + }, + { + "epoch": 2.3, + "learning_rate": 1.2910776144555752e-07, + "logits/chosen": -2.494143009185791, + "logits/rejected": -2.436272144317627, + "logps/chosen": -254.1820068359375, + "logps/rejected": -328.72955322265625, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5816042423248291, + "rewards/margins": 10.016374588012695, + "rewards/rejected": -10.597977638244629, + "step": 9110 + }, + { + "epoch": 2.31, + "learning_rate": 1.2863964048310083e-07, + "logits/chosen": -2.5131287574768066, + "logits/rejected": -2.4419400691986084, + "logps/chosen": -214.53964233398438, + "logps/rejected": -357.4662170410156, + "loss": 0.0219, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9653714299201965, + "rewards/margins": 10.164517402648926, + "rewards/rejected": -11.129888534545898, + "step": 9120 + }, + { + "epoch": 2.31, + "learning_rate": 1.2817151952064412e-07, + "logits/chosen": -2.556823492050171, + "logits/rejected": -2.44260835647583, + "logps/chosen": -343.62738037109375, + "logps/rejected": -361.4820556640625, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45240944623947144, + "rewards/margins": 9.722356796264648, + "rewards/rejected": -10.17476749420166, + "step": 9130 + }, + { + "epoch": 2.31, + "learning_rate": 1.2770339855818744e-07, + "logits/chosen": -2.4272677898406982, + "logits/rejected": -2.476644515991211, + "logps/chosen": -273.29840087890625, + "logps/rejected": -372.9749755859375, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32155928015708923, + "rewards/margins": 12.134693145751953, + "rewards/rejected": -12.456252098083496, + "step": 9140 + }, + { + "epoch": 2.31, + "learning_rate": 1.2723527759573075e-07, + "logits/chosen": -2.484036684036255, + "logits/rejected": -2.4402365684509277, + "logps/chosen": -264.90240478515625, + "logps/rejected": -367.96478271484375, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5199756026268005, + "rewards/margins": 12.971882820129395, + "rewards/rejected": -12.451906204223633, + "step": 9150 + }, + { + "epoch": 2.32, + "learning_rate": 1.2676715663327404e-07, + "logits/chosen": -2.6081578731536865, + "logits/rejected": -2.5950417518615723, + "logps/chosen": -319.1241149902344, + "logps/rejected": -425.2018127441406, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09917531162500381, + "rewards/margins": 11.424158096313477, + "rewards/rejected": -11.324983596801758, + "step": 9160 + }, + { + "epoch": 2.32, + "learning_rate": 1.2629903567081736e-07, + "logits/chosen": -2.36436128616333, + "logits/rejected": -2.354889392852783, + "logps/chosen": -259.3992919921875, + "logps/rejected": -392.87396240234375, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1882728338241577, + "rewards/margins": 11.059471130371094, + "rewards/rejected": -12.247743606567383, + "step": 9170 + }, + { + "epoch": 2.32, + "learning_rate": 1.2583091470836062e-07, + "logits/chosen": -2.565932035446167, + "logits/rejected": -2.3525166511535645, + "logps/chosen": -254.3878631591797, + "logps/rejected": -318.05963134765625, + "loss": 0.0233, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5903030633926392, + "rewards/margins": 10.55272102355957, + "rewards/rejected": -11.143023490905762, + "step": 9180 + }, + { + "epoch": 2.32, + "learning_rate": 1.2536279374590394e-07, + "logits/chosen": -2.463155508041382, + "logits/rejected": -2.339812755584717, + "logps/chosen": -225.89944458007812, + "logps/rejected": -430.6494140625, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0934032201766968, + "rewards/margins": 11.839479446411133, + "rewards/rejected": -12.932882308959961, + "step": 9190 + }, + { + "epoch": 2.33, + "learning_rate": 1.2489467278344723e-07, + "logits/chosen": -2.4740588665008545, + "logits/rejected": -2.270139455795288, + "logps/chosen": -258.7538146972656, + "logps/rejected": -514.9562377929688, + "loss": 0.0298, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0893930196762085, + "rewards/margins": 14.917033195495605, + "rewards/rejected": -16.006427764892578, + "step": 9200 + }, + { + "epoch": 2.33, + "learning_rate": 1.2442655182099054e-07, + "logits/chosen": -2.522426128387451, + "logits/rejected": -2.581494092941284, + "logps/chosen": -283.09307861328125, + "logps/rejected": -441.8668518066406, + "loss": 0.0257, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.10143113136291504, + "rewards/margins": 14.812362670898438, + "rewards/rejected": -14.913793563842773, + "step": 9210 + }, + { + "epoch": 2.33, + "learning_rate": 1.2395843085853386e-07, + "logits/chosen": -2.485147476196289, + "logits/rejected": -2.320444345474243, + "logps/chosen": -319.28997802734375, + "logps/rejected": -346.0876770019531, + "loss": 0.0212, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.241127848625183, + "rewards/margins": 10.240372657775879, + "rewards/rejected": -11.481500625610352, + "step": 9220 + }, + { + "epoch": 2.33, + "learning_rate": 1.2349030989607715e-07, + "logits/chosen": -2.55083966255188, + "logits/rejected": -2.4898033142089844, + "logps/chosen": -255.58920288085938, + "logps/rejected": -358.78155517578125, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.457207441329956, + "rewards/margins": 11.063669204711914, + "rewards/rejected": -12.520875930786133, + "step": 9230 + }, + { + "epoch": 2.34, + "learning_rate": 1.2302218893362044e-07, + "logits/chosen": -2.5158586502075195, + "logits/rejected": -2.3378074169158936, + "logps/chosen": -288.1528625488281, + "logps/rejected": -298.3981628417969, + "loss": 0.0205, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2239353656768799, + "rewards/margins": 8.507644653320312, + "rewards/rejected": -9.731579780578613, + "step": 9240 + }, + { + "epoch": 2.34, + "learning_rate": 1.2255406797116375e-07, + "logits/chosen": -2.4785284996032715, + "logits/rejected": -2.3354721069335938, + "logps/chosen": -290.882568359375, + "logps/rejected": -308.3099670410156, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4787499904632568, + "rewards/margins": 10.869338035583496, + "rewards/rejected": -12.348088264465332, + "step": 9250 + }, + { + "epoch": 2.34, + "learning_rate": 1.2208594700870704e-07, + "logits/chosen": -2.5821151733398438, + "logits/rejected": -2.450249671936035, + "logps/chosen": -309.484130859375, + "logps/rejected": -372.41534423828125, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7294740676879883, + "rewards/margins": 11.100632667541504, + "rewards/rejected": -12.830106735229492, + "step": 9260 + }, + { + "epoch": 2.34, + "learning_rate": 1.2161782604625036e-07, + "logits/chosen": -2.374976634979248, + "logits/rejected": -2.3441929817199707, + "logps/chosen": -290.5916748046875, + "logps/rejected": -347.4356384277344, + "loss": 0.0163, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.8753108978271484, + "rewards/margins": 8.887423515319824, + "rewards/rejected": -11.762734413146973, + "step": 9270 + }, + { + "epoch": 2.35, + "learning_rate": 1.2114970508379365e-07, + "logits/chosen": -2.481886386871338, + "logits/rejected": -2.4045209884643555, + "logps/chosen": -195.5873565673828, + "logps/rejected": -339.64373779296875, + "loss": 0.0252, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2074648141860962, + "rewards/margins": 9.534823417663574, + "rewards/rejected": -10.742288589477539, + "step": 9280 + }, + { + "epoch": 2.35, + "learning_rate": 1.2068158412133694e-07, + "logits/chosen": -2.5621516704559326, + "logits/rejected": -2.567854404449463, + "logps/chosen": -298.40313720703125, + "logps/rejected": -410.24267578125, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6887444257736206, + "rewards/margins": 12.210759162902832, + "rewards/rejected": -11.522014617919922, + "step": 9290 + }, + { + "epoch": 2.35, + "learning_rate": 1.2021346315888025e-07, + "logits/chosen": -2.139488697052002, + "logits/rejected": -2.2088325023651123, + "logps/chosen": -208.7119140625, + "logps/rejected": -313.7248840332031, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9525458812713623, + "rewards/margins": 8.045282363891602, + "rewards/rejected": -10.997827529907227, + "step": 9300 + }, + { + "epoch": 2.35, + "learning_rate": 1.1974534219642357e-07, + "logits/chosen": -2.704371452331543, + "logits/rejected": -2.423408269882202, + "logps/chosen": -410.0580139160156, + "logps/rejected": -451.2791442871094, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14632168412208557, + "rewards/margins": 15.756739616394043, + "rewards/rejected": -15.610417366027832, + "step": 9310 + }, + { + "epoch": 2.36, + "learning_rate": 1.1927722123396686e-07, + "logits/chosen": -2.501812696456909, + "logits/rejected": -2.5868372917175293, + "logps/chosen": -320.36383056640625, + "logps/rejected": -485.62835693359375, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1520987749099731, + "rewards/margins": 12.831914901733398, + "rewards/rejected": -13.984013557434082, + "step": 9320 + }, + { + "epoch": 2.36, + "learning_rate": 1.1880910027151016e-07, + "logits/chosen": -2.5127670764923096, + "logits/rejected": -2.3409245014190674, + "logps/chosen": -268.2634582519531, + "logps/rejected": -256.263671875, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45365089178085327, + "rewards/margins": 9.961838722229004, + "rewards/rejected": -9.508188247680664, + "step": 9330 + }, + { + "epoch": 2.36, + "learning_rate": 1.1834097930905346e-07, + "logits/chosen": -2.5624654293060303, + "logits/rejected": -2.5135717391967773, + "logps/chosen": -262.6741943359375, + "logps/rejected": -349.298095703125, + "loss": 0.0382, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.664712429046631, + "rewards/margins": 9.775838851928711, + "rewards/rejected": -12.440550804138184, + "step": 9340 + }, + { + "epoch": 2.36, + "learning_rate": 1.1787285834659675e-07, + "logits/chosen": -2.4618287086486816, + "logits/rejected": -2.3042941093444824, + "logps/chosen": -248.24282836914062, + "logps/rejected": -318.86090087890625, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3148791193962097, + "rewards/margins": 9.420307159423828, + "rewards/rejected": -9.735186576843262, + "step": 9350 + }, + { + "epoch": 2.37, + "learning_rate": 1.1740473738414005e-07, + "logits/chosen": -2.482856273651123, + "logits/rejected": -2.5394861698150635, + "logps/chosen": -361.1878967285156, + "logps/rejected": -414.9200134277344, + "loss": 0.0301, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.191426992416382, + "rewards/margins": 11.798062324523926, + "rewards/rejected": -13.98948860168457, + "step": 9360 + }, + { + "epoch": 2.37, + "learning_rate": 1.1693661642168336e-07, + "logits/chosen": -2.445554733276367, + "logits/rejected": -2.4558825492858887, + "logps/chosen": -248.78958129882812, + "logps/rejected": -354.56280517578125, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0716218948364258, + "rewards/margins": 11.133877754211426, + "rewards/rejected": -12.205499649047852, + "step": 9370 + }, + { + "epoch": 2.37, + "learning_rate": 1.1646849545922666e-07, + "logits/chosen": -2.413811445236206, + "logits/rejected": -2.4532883167266846, + "logps/chosen": -328.21826171875, + "logps/rejected": -404.242431640625, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2563796043395996, + "rewards/margins": 10.868789672851562, + "rewards/rejected": -14.12516975402832, + "step": 9380 + }, + { + "epoch": 2.37, + "learning_rate": 1.1600037449676996e-07, + "logits/chosen": -2.4219603538513184, + "logits/rejected": -2.388962507247925, + "logps/chosen": -275.37677001953125, + "logps/rejected": -308.3039245605469, + "loss": 0.024, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2332690954208374, + "rewards/margins": 9.072051048278809, + "rewards/rejected": -10.305319786071777, + "step": 9390 + }, + { + "epoch": 2.38, + "learning_rate": 1.1553225353431325e-07, + "logits/chosen": -2.4700279235839844, + "logits/rejected": -2.3462586402893066, + "logps/chosen": -204.75643920898438, + "logps/rejected": -344.16973876953125, + "loss": 0.0123, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.9443933963775635, + "rewards/margins": 10.954790115356445, + "rewards/rejected": -13.89918327331543, + "step": 9400 + }, + { + "epoch": 2.38, + "learning_rate": 1.1506413257185655e-07, + "logits/chosen": -2.33776593208313, + "logits/rejected": -2.3722383975982666, + "logps/chosen": -352.3572998046875, + "logps/rejected": -380.77789306640625, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33109062910079956, + "rewards/margins": 11.438632011413574, + "rewards/rejected": -11.10754108428955, + "step": 9410 + }, + { + "epoch": 2.38, + "learning_rate": 1.1459601160939987e-07, + "logits/chosen": -2.5390992164611816, + "logits/rejected": -2.3394479751586914, + "logps/chosen": -291.4183654785156, + "logps/rejected": -307.16278076171875, + "loss": 0.0293, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1895270347595215, + "rewards/margins": 9.163797378540039, + "rewards/rejected": -12.353324890136719, + "step": 9420 + }, + { + "epoch": 2.38, + "learning_rate": 1.1412789064694317e-07, + "logits/chosen": -2.480581045150757, + "logits/rejected": -2.452671527862549, + "logps/chosen": -257.4129638671875, + "logps/rejected": -364.5929260253906, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3671848773956299, + "rewards/margins": 12.583539962768555, + "rewards/rejected": -13.950726509094238, + "step": 9430 + }, + { + "epoch": 2.39, + "learning_rate": 1.1365976968448647e-07, + "logits/chosen": -2.4294826984405518, + "logits/rejected": -2.2800369262695312, + "logps/chosen": -264.89166259765625, + "logps/rejected": -358.344482421875, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5198668241500854, + "rewards/margins": 11.957700729370117, + "rewards/rejected": -13.477566719055176, + "step": 9440 + }, + { + "epoch": 2.39, + "learning_rate": 1.1319164872202978e-07, + "logits/chosen": -2.599461078643799, + "logits/rejected": -2.4103665351867676, + "logps/chosen": -346.68896484375, + "logps/rejected": -361.1012268066406, + "loss": 0.0247, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0498604774475098, + "rewards/margins": 11.50390338897705, + "rewards/rejected": -13.553762435913086, + "step": 9450 + }, + { + "epoch": 2.39, + "learning_rate": 1.1272352775957307e-07, + "logits/chosen": -2.304784059524536, + "logits/rejected": -2.3479790687561035, + "logps/chosen": -221.0005340576172, + "logps/rejected": -371.857666015625, + "loss": 0.0273, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.1888279914855957, + "rewards/margins": 11.113363265991211, + "rewards/rejected": -13.302189826965332, + "step": 9460 + }, + { + "epoch": 2.39, + "learning_rate": 1.1225540679711637e-07, + "logits/chosen": -2.4169933795928955, + "logits/rejected": -2.522143840789795, + "logps/chosen": -269.467529296875, + "logps/rejected": -374.2334289550781, + "loss": 0.0199, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.357435703277588, + "rewards/margins": 9.489862442016602, + "rewards/rejected": -11.847297668457031, + "step": 9470 + }, + { + "epoch": 2.4, + "learning_rate": 1.1178728583465967e-07, + "logits/chosen": -2.375798463821411, + "logits/rejected": -2.2957353591918945, + "logps/chosen": -262.4629211425781, + "logps/rejected": -425.6641540527344, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5441296100616455, + "rewards/margins": 12.342843055725098, + "rewards/rejected": -14.886972427368164, + "step": 9480 + }, + { + "epoch": 2.4, + "learning_rate": 1.1131916487220297e-07, + "logits/chosen": -2.465956211090088, + "logits/rejected": -2.360863208770752, + "logps/chosen": -239.08798217773438, + "logps/rejected": -299.3997497558594, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6827244758605957, + "rewards/margins": 10.561367988586426, + "rewards/rejected": -11.24409008026123, + "step": 9490 + }, + { + "epoch": 2.4, + "learning_rate": 1.1085104390974628e-07, + "logits/chosen": -2.694605588912964, + "logits/rejected": -2.5516393184661865, + "logps/chosen": -361.26629638671875, + "logps/rejected": -474.02587890625, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9704413414001465, + "rewards/margins": 12.27275276184082, + "rewards/rejected": -13.243192672729492, + "step": 9500 + }, + { + "epoch": 2.4, + "learning_rate": 1.1038292294728956e-07, + "logits/chosen": -2.609485387802124, + "logits/rejected": -2.6828622817993164, + "logps/chosen": -199.8148956298828, + "logps/rejected": -420.41717529296875, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8817756772041321, + "rewards/margins": 13.943887710571289, + "rewards/rejected": -14.825662612915039, + "step": 9510 + }, + { + "epoch": 2.41, + "learning_rate": 1.0991480198483287e-07, + "logits/chosen": -2.4950063228607178, + "logits/rejected": -2.3991646766662598, + "logps/chosen": -244.9822235107422, + "logps/rejected": -465.58447265625, + "loss": 0.027, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.1490936279296875, + "rewards/margins": 12.020115852355957, + "rewards/rejected": -16.169208526611328, + "step": 9520 + }, + { + "epoch": 2.41, + "learning_rate": 1.0944668102237618e-07, + "logits/chosen": -2.490548610687256, + "logits/rejected": -2.2655842304229736, + "logps/chosen": -310.09515380859375, + "logps/rejected": -302.21435546875, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3610144257545471, + "rewards/margins": 10.774946212768555, + "rewards/rejected": -11.135960578918457, + "step": 9530 + }, + { + "epoch": 2.41, + "learning_rate": 1.0897856005991949e-07, + "logits/chosen": -2.5174641609191895, + "logits/rejected": -2.4812374114990234, + "logps/chosen": -318.38665771484375, + "logps/rejected": -402.00018310546875, + "loss": 0.0143, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6079021692276, + "rewards/margins": 10.268229484558105, + "rewards/rejected": -11.87613296508789, + "step": 9540 + }, + { + "epoch": 2.41, + "learning_rate": 1.0851043909746279e-07, + "logits/chosen": -2.3226113319396973, + "logits/rejected": -2.332900285720825, + "logps/chosen": -221.42001342773438, + "logps/rejected": -340.07373046875, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.173617959022522, + "rewards/margins": 11.125650405883789, + "rewards/rejected": -12.299267768859863, + "step": 9550 + }, + { + "epoch": 2.42, + "learning_rate": 1.0804231813500608e-07, + "logits/chosen": -2.6040127277374268, + "logits/rejected": -2.653808116912842, + "logps/chosen": -227.6526336669922, + "logps/rejected": -364.04486083984375, + "loss": 0.0113, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9157921075820923, + "rewards/margins": 10.997711181640625, + "rewards/rejected": -12.913503646850586, + "step": 9560 + }, + { + "epoch": 2.42, + "learning_rate": 1.0757419717254938e-07, + "logits/chosen": -2.4118499755859375, + "logits/rejected": -2.430004596710205, + "logps/chosen": -235.00588989257812, + "logps/rejected": -390.02215576171875, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05648164823651314, + "rewards/margins": 15.429672241210938, + "rewards/rejected": -15.373188972473145, + "step": 9570 + }, + { + "epoch": 2.42, + "learning_rate": 1.0710607621009268e-07, + "logits/chosen": -2.375131607055664, + "logits/rejected": -2.3413290977478027, + "logps/chosen": -231.0093994140625, + "logps/rejected": -349.61065673828125, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6894218921661377, + "rewards/margins": 10.271553039550781, + "rewards/rejected": -12.960973739624023, + "step": 9580 + }, + { + "epoch": 2.42, + "learning_rate": 1.0663795524763599e-07, + "logits/chosen": -2.2757697105407715, + "logits/rejected": -2.280949831008911, + "logps/chosen": -175.64895629882812, + "logps/rejected": -300.08734130859375, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.390406847000122, + "rewards/margins": 10.89712142944336, + "rewards/rejected": -13.287528991699219, + "step": 9590 + }, + { + "epoch": 2.43, + "learning_rate": 1.0616983428517929e-07, + "logits/chosen": -2.48026704788208, + "logits/rejected": -2.4394729137420654, + "logps/chosen": -257.1330871582031, + "logps/rejected": -344.04376220703125, + "loss": 0.0236, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.5281569957733154, + "rewards/margins": 8.856816291809082, + "rewards/rejected": -12.38497257232666, + "step": 9600 + }, + { + "epoch": 2.43, + "learning_rate": 1.0570171332272258e-07, + "logits/chosen": -2.5033538341522217, + "logits/rejected": -2.3946590423583984, + "logps/chosen": -394.1955261230469, + "logps/rejected": -396.7699279785156, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2552104890346527, + "rewards/margins": 11.802976608276367, + "rewards/rejected": -12.058186531066895, + "step": 9610 + }, + { + "epoch": 2.43, + "learning_rate": 1.0523359236026588e-07, + "logits/chosen": -2.4539897441864014, + "logits/rejected": -2.4002108573913574, + "logps/chosen": -290.2524719238281, + "logps/rejected": -446.51708984375, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.071610927581787, + "rewards/margins": 11.670352935791016, + "rewards/rejected": -13.741963386535645, + "step": 9620 + }, + { + "epoch": 2.43, + "learning_rate": 1.0476547139780918e-07, + "logits/chosen": -2.487164258956909, + "logits/rejected": -2.5121045112609863, + "logps/chosen": -289.3143005371094, + "logps/rejected": -355.0195617675781, + "loss": 0.0161, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.06483952701091766, + "rewards/margins": 11.031146049499512, + "rewards/rejected": -11.09598445892334, + "step": 9630 + }, + { + "epoch": 2.44, + "learning_rate": 1.042973504353525e-07, + "logits/chosen": -2.5066590309143066, + "logits/rejected": -2.42234468460083, + "logps/chosen": -224.26168823242188, + "logps/rejected": -343.7847595214844, + "loss": 0.0226, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7051777839660645, + "rewards/margins": 11.361398696899414, + "rewards/rejected": -12.06657600402832, + "step": 9640 + }, + { + "epoch": 2.44, + "learning_rate": 1.038292294728958e-07, + "logits/chosen": -2.538245439529419, + "logits/rejected": -2.390660047531128, + "logps/chosen": -244.62551879882812, + "logps/rejected": -343.1199035644531, + "loss": 0.0232, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.566230058670044, + "rewards/margins": 9.027008056640625, + "rewards/rejected": -12.593238830566406, + "step": 9650 + }, + { + "epoch": 2.44, + "learning_rate": 1.0336110851043909e-07, + "logits/chosen": -2.3509981632232666, + "logits/rejected": -2.280942916870117, + "logps/chosen": -278.4637451171875, + "logps/rejected": -337.9454040527344, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.56687331199646, + "rewards/margins": 10.138782501220703, + "rewards/rejected": -11.705657005310059, + "step": 9660 + }, + { + "epoch": 2.44, + "learning_rate": 1.0289298754798239e-07, + "logits/chosen": -2.4716315269470215, + "logits/rejected": -2.550530195236206, + "logps/chosen": -304.77490234375, + "logps/rejected": -407.2782287597656, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5738002061843872, + "rewards/margins": 11.042047500610352, + "rewards/rejected": -12.615848541259766, + "step": 9670 + }, + { + "epoch": 2.45, + "learning_rate": 1.024248665855257e-07, + "logits/chosen": -2.5989766120910645, + "logits/rejected": -2.4732518196105957, + "logps/chosen": -260.64581298828125, + "logps/rejected": -375.7298583984375, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0580739974975586, + "rewards/margins": 13.893926620483398, + "rewards/rejected": -14.952000617980957, + "step": 9680 + }, + { + "epoch": 2.45, + "learning_rate": 1.01956745623069e-07, + "logits/chosen": -2.539679527282715, + "logits/rejected": -2.405225992202759, + "logps/chosen": -327.54522705078125, + "logps/rejected": -370.77813720703125, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4409523010253906, + "rewards/margins": 11.447229385375977, + "rewards/rejected": -12.88818073272705, + "step": 9690 + }, + { + "epoch": 2.45, + "learning_rate": 1.014886246606123e-07, + "logits/chosen": -2.3739826679229736, + "logits/rejected": -2.367341995239258, + "logps/chosen": -195.1979217529297, + "logps/rejected": -286.34320068359375, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5407164096832275, + "rewards/margins": 9.584169387817383, + "rewards/rejected": -11.124883651733398, + "step": 9700 + }, + { + "epoch": 2.45, + "learning_rate": 1.0102050369815559e-07, + "logits/chosen": -2.5260393619537354, + "logits/rejected": -2.2634921073913574, + "logps/chosen": -344.4964599609375, + "logps/rejected": -284.90545654296875, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.781498908996582, + "rewards/margins": 10.852533340454102, + "rewards/rejected": -12.634031295776367, + "step": 9710 + }, + { + "epoch": 2.46, + "learning_rate": 1.0055238273569889e-07, + "logits/chosen": -2.525456190109253, + "logits/rejected": -2.550560235977173, + "logps/chosen": -259.0410461425781, + "logps/rejected": -403.1743469238281, + "loss": 0.0335, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9123313426971436, + "rewards/margins": 9.142938613891602, + "rewards/rejected": -11.055269241333008, + "step": 9720 + }, + { + "epoch": 2.46, + "learning_rate": 1.000842617732422e-07, + "logits/chosen": -2.3116941452026367, + "logits/rejected": -2.2560160160064697, + "logps/chosen": -230.031005859375, + "logps/rejected": -308.4913330078125, + "loss": 0.0196, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.852482318878174, + "rewards/margins": 8.964485168457031, + "rewards/rejected": -11.816967964172363, + "step": 9730 + }, + { + "epoch": 2.46, + "learning_rate": 9.96161408107855e-08, + "logits/chosen": -2.4614522457122803, + "logits/rejected": -2.355747938156128, + "logps/chosen": -289.4332275390625, + "logps/rejected": -358.6775817871094, + "loss": 0.0115, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.407428741455078, + "rewards/margins": 10.502245903015137, + "rewards/rejected": -13.909673690795898, + "step": 9740 + }, + { + "epoch": 2.46, + "learning_rate": 9.914801984832881e-08, + "logits/chosen": -2.6264684200286865, + "logits/rejected": -2.492650032043457, + "logps/chosen": -338.1631774902344, + "logps/rejected": -362.98101806640625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27077144384384155, + "rewards/margins": 12.973017692565918, + "rewards/rejected": -12.70224666595459, + "step": 9750 + }, + { + "epoch": 2.47, + "learning_rate": 9.867989888587212e-08, + "logits/chosen": -2.56974720954895, + "logits/rejected": -2.4586358070373535, + "logps/chosen": -375.69732666015625, + "logps/rejected": -473.6441345214844, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16659900546073914, + "rewards/margins": 11.835259437561035, + "rewards/rejected": -12.001858711242676, + "step": 9760 + }, + { + "epoch": 2.47, + "learning_rate": 9.82117779234154e-08, + "logits/chosen": -2.362417697906494, + "logits/rejected": -2.4201173782348633, + "logps/chosen": -202.46035766601562, + "logps/rejected": -443.12335205078125, + "loss": 0.0191, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.9886672496795654, + "rewards/margins": 12.812800407409668, + "rewards/rejected": -15.801467895507812, + "step": 9770 + }, + { + "epoch": 2.47, + "learning_rate": 9.774365696095871e-08, + "logits/chosen": -2.5065131187438965, + "logits/rejected": -2.3762283325195312, + "logps/chosen": -254.2418975830078, + "logps/rejected": -370.89971923828125, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.246609687805176, + "rewards/margins": 11.305074691772461, + "rewards/rejected": -13.551684379577637, + "step": 9780 + }, + { + "epoch": 2.47, + "learning_rate": 9.727553599850201e-08, + "logits/chosen": -2.370347261428833, + "logits/rejected": -2.3666515350341797, + "logps/chosen": -273.8157958984375, + "logps/rejected": -380.5888977050781, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6909661293029785, + "rewards/margins": 13.555691719055176, + "rewards/rejected": -14.246658325195312, + "step": 9790 + }, + { + "epoch": 2.48, + "learning_rate": 9.680741503604531e-08, + "logits/chosen": -2.531897783279419, + "logits/rejected": -2.4267642498016357, + "logps/chosen": -265.67303466796875, + "logps/rejected": -383.75384521484375, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9569188356399536, + "rewards/margins": 11.649717330932617, + "rewards/rejected": -12.606637001037598, + "step": 9800 + }, + { + "epoch": 2.48, + "learning_rate": 9.633929407358862e-08, + "logits/chosen": -2.5569138526916504, + "logits/rejected": -2.396172046661377, + "logps/chosen": -285.2621765136719, + "logps/rejected": -310.68389892578125, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2880444526672363, + "rewards/margins": 9.889970779418945, + "rewards/rejected": -12.178014755249023, + "step": 9810 + }, + { + "epoch": 2.48, + "learning_rate": 9.58711731111319e-08, + "logits/chosen": -2.445523738861084, + "logits/rejected": -2.241865873336792, + "logps/chosen": -355.5696105957031, + "logps/rejected": -393.40972900390625, + "loss": 0.026, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.43356913328170776, + "rewards/margins": 12.471688270568848, + "rewards/rejected": -12.905256271362305, + "step": 9820 + }, + { + "epoch": 2.48, + "learning_rate": 9.540305214867521e-08, + "logits/chosen": -2.519758701324463, + "logits/rejected": -2.4892241954803467, + "logps/chosen": -238.73196411132812, + "logps/rejected": -396.1922302246094, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1257517337799072, + "rewards/margins": 12.754495620727539, + "rewards/rejected": -13.880247116088867, + "step": 9830 + }, + { + "epoch": 2.49, + "learning_rate": 9.493493118621851e-08, + "logits/chosen": -2.6485443115234375, + "logits/rejected": -2.5044517517089844, + "logps/chosen": -353.70904541015625, + "logps/rejected": -471.74151611328125, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10478146374225616, + "rewards/margins": 14.041101455688477, + "rewards/rejected": -14.145881652832031, + "step": 9840 + }, + { + "epoch": 2.49, + "learning_rate": 9.446681022376183e-08, + "logits/chosen": -2.4805495738983154, + "logits/rejected": -2.302246570587158, + "logps/chosen": -229.20370483398438, + "logps/rejected": -325.60711669921875, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.490037441253662, + "rewards/margins": 14.055638313293457, + "rewards/rejected": -12.565601348876953, + "step": 9850 + }, + { + "epoch": 2.49, + "learning_rate": 9.399868926130513e-08, + "logits/chosen": -2.7085890769958496, + "logits/rejected": -2.5546762943267822, + "logps/chosen": -302.5946960449219, + "logps/rejected": -341.9223327636719, + "loss": 0.028, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5961366295814514, + "rewards/margins": 10.983573913574219, + "rewards/rejected": -11.579710006713867, + "step": 9860 + }, + { + "epoch": 2.49, + "learning_rate": 9.353056829884842e-08, + "logits/chosen": -2.4775044918060303, + "logits/rejected": -2.400139331817627, + "logps/chosen": -312.041015625, + "logps/rejected": -324.03338623046875, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2200387716293335, + "rewards/margins": 9.557716369628906, + "rewards/rejected": -10.777755737304688, + "step": 9870 + }, + { + "epoch": 2.5, + "learning_rate": 9.306244733639172e-08, + "logits/chosen": -2.431150197982788, + "logits/rejected": -2.3163318634033203, + "logps/chosen": -235.7882080078125, + "logps/rejected": -308.95074462890625, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.804135799407959, + "rewards/margins": 8.712458610534668, + "rewards/rejected": -11.516595840454102, + "step": 9880 + }, + { + "epoch": 2.5, + "learning_rate": 9.259432637393502e-08, + "logits/chosen": -2.443704128265381, + "logits/rejected": -2.328042507171631, + "logps/chosen": -288.6375427246094, + "logps/rejected": -308.34906005859375, + "loss": 0.0332, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.7502975463867188, + "rewards/margins": 9.743748664855957, + "rewards/rejected": -12.494046211242676, + "step": 9890 + }, + { + "epoch": 2.5, + "learning_rate": 9.212620541147833e-08, + "logits/chosen": -2.547600507736206, + "logits/rejected": -2.4481041431427, + "logps/chosen": -252.47866821289062, + "logps/rejected": -333.0774841308594, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8022094964981079, + "rewards/margins": 12.065972328186035, + "rewards/rejected": -12.868181228637695, + "step": 9900 + }, + { + "epoch": 2.51, + "learning_rate": 9.165808444902163e-08, + "logits/chosen": -2.5878243446350098, + "logits/rejected": -2.607832908630371, + "logps/chosen": -299.35406494140625, + "logps/rejected": -416.83135986328125, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.320135474205017, + "rewards/margins": 12.124985694885254, + "rewards/rejected": -13.445119857788086, + "step": 9910 + }, + { + "epoch": 2.51, + "learning_rate": 9.118996348656492e-08, + "logits/chosen": -2.421605110168457, + "logits/rejected": -2.3391876220703125, + "logps/chosen": -212.12454223632812, + "logps/rejected": -311.01495361328125, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06767783313989639, + "rewards/margins": 10.119757652282715, + "rewards/rejected": -10.187434196472168, + "step": 9920 + }, + { + "epoch": 2.51, + "learning_rate": 9.072184252410822e-08, + "logits/chosen": -2.3627657890319824, + "logits/rejected": -2.311931848526001, + "logps/chosen": -230.1654510498047, + "logps/rejected": -427.72674560546875, + "loss": 0.024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9032829403877258, + "rewards/margins": 11.931960105895996, + "rewards/rejected": -12.835243225097656, + "step": 9930 + }, + { + "epoch": 2.51, + "learning_rate": 9.025372156165152e-08, + "logits/chosen": -2.4075100421905518, + "logits/rejected": -2.3419651985168457, + "logps/chosen": -276.5333251953125, + "logps/rejected": -372.3677673339844, + "loss": 0.0305, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.7151522636413574, + "rewards/margins": 10.86931324005127, + "rewards/rejected": -13.584465026855469, + "step": 9940 + }, + { + "epoch": 2.52, + "learning_rate": 8.978560059919482e-08, + "logits/chosen": -2.3619284629821777, + "logits/rejected": -2.4651925563812256, + "logps/chosen": -204.21096801757812, + "logps/rejected": -323.39202880859375, + "loss": 0.0195, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7284538745880127, + "rewards/margins": 8.96215534210205, + "rewards/rejected": -10.6906099319458, + "step": 9950 + }, + { + "epoch": 2.52, + "learning_rate": 8.931747963673814e-08, + "logits/chosen": -2.414381504058838, + "logits/rejected": -2.4175143241882324, + "logps/chosen": -192.42576599121094, + "logps/rejected": -274.3816223144531, + "loss": 0.0139, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.4563064575195312, + "rewards/margins": 8.960624694824219, + "rewards/rejected": -11.416932106018066, + "step": 9960 + }, + { + "epoch": 2.52, + "learning_rate": 8.884935867428143e-08, + "logits/chosen": -2.4522461891174316, + "logits/rejected": -2.6154751777648926, + "logps/chosen": -191.37452697753906, + "logps/rejected": -345.6150817871094, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5650936365127563, + "rewards/margins": 10.567245483398438, + "rewards/rejected": -11.13233757019043, + "step": 9970 + }, + { + "epoch": 2.52, + "learning_rate": 8.838123771182473e-08, + "logits/chosen": -2.7119133472442627, + "logits/rejected": -2.5880613327026367, + "logps/chosen": -296.63763427734375, + "logps/rejected": -398.0306091308594, + "loss": 0.0317, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7520133256912231, + "rewards/margins": 12.314008712768555, + "rewards/rejected": -11.561994552612305, + "step": 9980 + }, + { + "epoch": 2.53, + "learning_rate": 8.791311674936804e-08, + "logits/chosen": -2.6000936031341553, + "logits/rejected": -2.5926265716552734, + "logps/chosen": -289.90057373046875, + "logps/rejected": -521.1102294921875, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6040196418762207, + "rewards/margins": 15.795877456665039, + "rewards/rejected": -15.191858291625977, + "step": 9990 + }, + { + "epoch": 2.53, + "learning_rate": 8.744499578691134e-08, + "logits/chosen": -2.2788772583007812, + "logits/rejected": -2.2455339431762695, + "logps/chosen": -230.094482421875, + "logps/rejected": -294.2077331542969, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3577768802642822, + "rewards/margins": 10.781793594360352, + "rewards/rejected": -12.139569282531738, + "step": 10000 + }, + { + "epoch": 2.53, + "learning_rate": 8.697687482445464e-08, + "logits/chosen": -2.37874174118042, + "logits/rejected": -2.263432025909424, + "logps/chosen": -267.250244140625, + "logps/rejected": -287.97381591796875, + "loss": 0.0293, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6297651529312134, + "rewards/margins": 10.835100173950195, + "rewards/rejected": -12.464864730834961, + "step": 10010 + }, + { + "epoch": 2.53, + "learning_rate": 8.650875386199793e-08, + "logits/chosen": -2.319446086883545, + "logits/rejected": -2.2784368991851807, + "logps/chosen": -272.7980041503906, + "logps/rejected": -331.6700744628906, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4239766597747803, + "rewards/margins": 12.43993091583252, + "rewards/rejected": -12.01595401763916, + "step": 10020 + }, + { + "epoch": 2.54, + "learning_rate": 8.604063289954123e-08, + "logits/chosen": -2.3687281608581543, + "logits/rejected": -2.155780792236328, + "logps/chosen": -311.53936767578125, + "logps/rejected": -348.9579162597656, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45428338646888733, + "rewards/margins": 12.450576782226562, + "rewards/rejected": -12.904861450195312, + "step": 10030 + }, + { + "epoch": 2.54, + "learning_rate": 8.557251193708453e-08, + "logits/chosen": -2.408719539642334, + "logits/rejected": -2.3702263832092285, + "logps/chosen": -272.25091552734375, + "logps/rejected": -429.01751708984375, + "loss": 0.0311, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8166736364364624, + "rewards/margins": 13.995099067687988, + "rewards/rejected": -15.811772346496582, + "step": 10040 + }, + { + "epoch": 2.54, + "learning_rate": 8.510439097462784e-08, + "logits/chosen": -2.5389504432678223, + "logits/rejected": -2.3398921489715576, + "logps/chosen": -295.73358154296875, + "logps/rejected": -410.68170166015625, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.004246234893799, + "rewards/margins": 13.201797485351562, + "rewards/rejected": -15.20604419708252, + "step": 10050 + }, + { + "epoch": 2.54, + "learning_rate": 8.463627001217114e-08, + "logits/chosen": -2.0855343341827393, + "logits/rejected": -2.0940823554992676, + "logps/chosen": -206.9429473876953, + "logps/rejected": -316.58355712890625, + "loss": 0.0202, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.951176643371582, + "rewards/margins": 8.924515724182129, + "rewards/rejected": -11.875692367553711, + "step": 10060 + }, + { + "epoch": 2.55, + "learning_rate": 8.416814904971444e-08, + "logits/chosen": -2.6794869899749756, + "logits/rejected": -2.4969735145568848, + "logps/chosen": -294.0050964355469, + "logps/rejected": -331.2002258300781, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5639679431915283, + "rewards/margins": 9.706094741821289, + "rewards/rejected": -11.270063400268555, + "step": 10070 + }, + { + "epoch": 2.55, + "learning_rate": 8.370002808725774e-08, + "logits/chosen": -2.5366883277893066, + "logits/rejected": -2.519639492034912, + "logps/chosen": -227.18185424804688, + "logps/rejected": -352.42352294921875, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13427084684371948, + "rewards/margins": 14.200691223144531, + "rewards/rejected": -14.334962844848633, + "step": 10080 + }, + { + "epoch": 2.55, + "learning_rate": 8.323190712480105e-08, + "logits/chosen": -2.360050916671753, + "logits/rejected": -2.469874858856201, + "logps/chosen": -275.49981689453125, + "logps/rejected": -420.4410705566406, + "loss": 0.0234, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.37065351009368896, + "rewards/margins": 13.31440258026123, + "rewards/rejected": -13.68505573272705, + "step": 10090 + }, + { + "epoch": 2.55, + "learning_rate": 8.276378616234435e-08, + "logits/chosen": -2.5125153064727783, + "logits/rejected": -2.4173789024353027, + "logps/chosen": -170.84732055664062, + "logps/rejected": -282.93170166015625, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.286710262298584, + "rewards/margins": 9.803840637207031, + "rewards/rejected": -12.090551376342773, + "step": 10100 + }, + { + "epoch": 2.56, + "learning_rate": 8.229566519988765e-08, + "logits/chosen": -2.3780689239501953, + "logits/rejected": -2.3163952827453613, + "logps/chosen": -241.56271362304688, + "logps/rejected": -270.06591796875, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4059072732925415, + "rewards/margins": 10.28950309753418, + "rewards/rejected": -11.695409774780273, + "step": 10110 + }, + { + "epoch": 2.56, + "learning_rate": 8.182754423743094e-08, + "logits/chosen": -2.333916187286377, + "logits/rejected": -2.1758086681365967, + "logps/chosen": -327.21319580078125, + "logps/rejected": -247.1071319580078, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4194493293762207, + "rewards/margins": 8.837392807006836, + "rewards/rejected": -12.256841659545898, + "step": 10120 + }, + { + "epoch": 2.56, + "learning_rate": 8.135942327497424e-08, + "logits/chosen": -2.3765933513641357, + "logits/rejected": -2.1695456504821777, + "logps/chosen": -347.8133850097656, + "logps/rejected": -362.703369140625, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11873920261859894, + "rewards/margins": 13.10112190246582, + "rewards/rejected": -12.982383728027344, + "step": 10130 + }, + { + "epoch": 2.56, + "learning_rate": 8.089130231251755e-08, + "logits/chosen": -2.4001736640930176, + "logits/rejected": -2.3628358840942383, + "logps/chosen": -304.4840393066406, + "logps/rejected": -364.996337890625, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12035231292247772, + "rewards/margins": 13.0234375, + "rewards/rejected": -12.903085708618164, + "step": 10140 + }, + { + "epoch": 2.57, + "learning_rate": 8.042318135006085e-08, + "logits/chosen": -2.6083054542541504, + "logits/rejected": -2.470574378967285, + "logps/chosen": -307.7094421386719, + "logps/rejected": -354.67230224609375, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18828299641609192, + "rewards/margins": 12.188252449035645, + "rewards/rejected": -11.999969482421875, + "step": 10150 + }, + { + "epoch": 2.57, + "learning_rate": 7.995506038760415e-08, + "logits/chosen": -2.5575153827667236, + "logits/rejected": -2.4259867668151855, + "logps/chosen": -345.7724914550781, + "logps/rejected": -446.3836975097656, + "loss": 0.0333, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.486109495162964, + "rewards/margins": 12.340263366699219, + "rewards/rejected": -14.826370239257812, + "step": 10160 + }, + { + "epoch": 2.57, + "learning_rate": 7.948693942514744e-08, + "logits/chosen": -2.618077039718628, + "logits/rejected": -2.6674914360046387, + "logps/chosen": -302.4067077636719, + "logps/rejected": -483.14501953125, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2378050833940506, + "rewards/margins": 14.155160903930664, + "rewards/rejected": -13.917353630065918, + "step": 10170 + }, + { + "epoch": 2.57, + "learning_rate": 7.901881846269076e-08, + "logits/chosen": -2.5556085109710693, + "logits/rejected": -2.5346038341522217, + "logps/chosen": -276.97344970703125, + "logps/rejected": -341.01983642578125, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3403923511505127, + "rewards/margins": 10.849390029907227, + "rewards/rejected": -11.189783096313477, + "step": 10180 + }, + { + "epoch": 2.58, + "learning_rate": 7.855069750023406e-08, + "logits/chosen": -2.4308063983917236, + "logits/rejected": -2.3705153465270996, + "logps/chosen": -357.98162841796875, + "logps/rejected": -456.73394775390625, + "loss": 0.0252, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5117690563201904, + "rewards/margins": 11.085670471191406, + "rewards/rejected": -12.597439765930176, + "step": 10190 + }, + { + "epoch": 2.58, + "learning_rate": 7.808257653777736e-08, + "logits/chosen": -2.4045298099517822, + "logits/rejected": -2.304861545562744, + "logps/chosen": -252.59622192382812, + "logps/rejected": -342.4645080566406, + "loss": 0.0088, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.10743260383605957, + "rewards/margins": 14.025423049926758, + "rewards/rejected": -13.917988777160645, + "step": 10200 + }, + { + "epoch": 2.58, + "learning_rate": 7.761445557532067e-08, + "logits/chosen": -2.368105411529541, + "logits/rejected": -2.483786106109619, + "logps/chosen": -233.50930786132812, + "logps/rejected": -337.35333251953125, + "loss": 0.0148, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8818254470825195, + "rewards/margins": 9.318094253540039, + "rewards/rejected": -11.199919700622559, + "step": 10210 + }, + { + "epoch": 2.58, + "learning_rate": 7.714633461286397e-08, + "logits/chosen": -2.4343035221099854, + "logits/rejected": -2.398287057876587, + "logps/chosen": -286.1935119628906, + "logps/rejected": -382.47076416015625, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5427314043045044, + "rewards/margins": 11.932634353637695, + "rewards/rejected": -13.475366592407227, + "step": 10220 + }, + { + "epoch": 2.59, + "learning_rate": 7.667821365040726e-08, + "logits/chosen": -2.5406994819641113, + "logits/rejected": -2.5999538898468018, + "logps/chosen": -417.00640869140625, + "logps/rejected": -448.77294921875, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8344262838363647, + "rewards/margins": 13.575149536132812, + "rewards/rejected": -12.74072265625, + "step": 10230 + }, + { + "epoch": 2.59, + "learning_rate": 7.621009268795056e-08, + "logits/chosen": -2.561500072479248, + "logits/rejected": -2.3849411010742188, + "logps/chosen": -283.8385925292969, + "logps/rejected": -375.70819091796875, + "loss": 0.0163, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6725066900253296, + "rewards/margins": 13.281970024108887, + "rewards/rejected": -14.954477310180664, + "step": 10240 + }, + { + "epoch": 2.59, + "learning_rate": 7.574197172549386e-08, + "logits/chosen": -2.305802822113037, + "logits/rejected": -2.366856813430786, + "logps/chosen": -264.9994201660156, + "logps/rejected": -385.60260009765625, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8811006546020508, + "rewards/margins": 12.801080703735352, + "rewards/rejected": -13.682180404663086, + "step": 10250 + }, + { + "epoch": 2.59, + "learning_rate": 7.527385076303716e-08, + "logits/chosen": -2.336945056915283, + "logits/rejected": -2.31264066696167, + "logps/chosen": -275.8536376953125, + "logps/rejected": -343.1841125488281, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8622077703475952, + "rewards/margins": 11.723356246948242, + "rewards/rejected": -13.585565567016602, + "step": 10260 + }, + { + "epoch": 2.6, + "learning_rate": 7.480572980058047e-08, + "logits/chosen": -2.5478365421295166, + "logits/rejected": -2.541980266571045, + "logps/chosen": -231.7345733642578, + "logps/rejected": -281.2032470703125, + "loss": 0.0196, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7049830555915833, + "rewards/margins": 7.908007621765137, + "rewards/rejected": -8.612991333007812, + "step": 10270 + }, + { + "epoch": 2.6, + "learning_rate": 7.433760883812376e-08, + "logits/chosen": -2.669074296951294, + "logits/rejected": -2.5242648124694824, + "logps/chosen": -355.5440979003906, + "logps/rejected": -615.2041625976562, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.465815544128418, + "rewards/margins": 18.569738388061523, + "rewards/rejected": -17.103923797607422, + "step": 10280 + }, + { + "epoch": 2.6, + "learning_rate": 7.386948787566707e-08, + "logits/chosen": -2.5442676544189453, + "logits/rejected": -2.506998062133789, + "logps/chosen": -302.8976135253906, + "logps/rejected": -450.32257080078125, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4359608590602875, + "rewards/margins": 12.673746109008789, + "rewards/rejected": -13.109707832336426, + "step": 10290 + }, + { + "epoch": 2.6, + "learning_rate": 7.340136691321037e-08, + "logits/chosen": -2.494982957839966, + "logits/rejected": -2.3586652278900146, + "logps/chosen": -286.8078308105469, + "logps/rejected": -388.41607666015625, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3223403692245483, + "rewards/margins": 12.64293384552002, + "rewards/rejected": -13.9652738571167, + "step": 10300 + }, + { + "epoch": 2.61, + "learning_rate": 7.293324595075368e-08, + "logits/chosen": -2.5453314781188965, + "logits/rejected": -2.445521831512451, + "logps/chosen": -364.0186462402344, + "logps/rejected": -498.9871520996094, + "loss": 0.0272, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6890239715576172, + "rewards/margins": 12.470296859741211, + "rewards/rejected": -13.159318923950195, + "step": 10310 + }, + { + "epoch": 2.61, + "learning_rate": 7.246512498829698e-08, + "logits/chosen": -2.5487751960754395, + "logits/rejected": -2.3223040103912354, + "logps/chosen": -253.3618927001953, + "logps/rejected": -332.0083312988281, + "loss": 0.0236, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.7357704639434814, + "rewards/margins": 11.522217750549316, + "rewards/rejected": -14.257987976074219, + "step": 10320 + }, + { + "epoch": 2.61, + "learning_rate": 7.199700402584027e-08, + "logits/chosen": -2.178081512451172, + "logits/rejected": -2.2869818210601807, + "logps/chosen": -262.9320983886719, + "logps/rejected": -408.1930236816406, + "loss": 0.0277, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.4747023582458496, + "rewards/margins": 9.987930297851562, + "rewards/rejected": -13.462631225585938, + "step": 10330 + }, + { + "epoch": 2.61, + "learning_rate": 7.152888306338357e-08, + "logits/chosen": -2.255789041519165, + "logits/rejected": -2.1870675086975098, + "logps/chosen": -212.7251739501953, + "logps/rejected": -505.8531799316406, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9834052324295044, + "rewards/margins": 15.1971435546875, + "rewards/rejected": -16.1805477142334, + "step": 10340 + }, + { + "epoch": 2.62, + "learning_rate": 7.106076210092687e-08, + "logits/chosen": -2.4865643978118896, + "logits/rejected": -2.325779676437378, + "logps/chosen": -221.14950561523438, + "logps/rejected": -271.050048828125, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4804035425186157, + "rewards/margins": 9.898581504821777, + "rewards/rejected": -11.378985404968262, + "step": 10350 + }, + { + "epoch": 2.62, + "learning_rate": 7.059264113847018e-08, + "logits/chosen": -2.522327423095703, + "logits/rejected": -2.4662699699401855, + "logps/chosen": -383.5025329589844, + "logps/rejected": -463.1015625, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0164690017700195, + "rewards/margins": 12.747421264648438, + "rewards/rejected": -13.763890266418457, + "step": 10360 + }, + { + "epoch": 2.62, + "learning_rate": 7.012452017601348e-08, + "logits/chosen": -2.5477945804595947, + "logits/rejected": -2.496734619140625, + "logps/chosen": -281.07708740234375, + "logps/rejected": -404.2988586425781, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1302664279937744, + "rewards/margins": 11.660921096801758, + "rewards/rejected": -13.79118537902832, + "step": 10370 + }, + { + "epoch": 2.62, + "learning_rate": 6.965639921355677e-08, + "logits/chosen": -2.6205244064331055, + "logits/rejected": -2.451359272003174, + "logps/chosen": -301.0621032714844, + "logps/rejected": -327.03765869140625, + "loss": 0.0291, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4722011089324951, + "rewards/margins": 10.043096542358398, + "rewards/rejected": -11.515296936035156, + "step": 10380 + }, + { + "epoch": 2.63, + "learning_rate": 6.918827825110008e-08, + "logits/chosen": -2.4276673793792725, + "logits/rejected": -2.5138256549835205, + "logps/chosen": -192.591064453125, + "logps/rejected": -331.2646179199219, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2287522852420807, + "rewards/margins": 10.684351921081543, + "rewards/rejected": -10.913103103637695, + "step": 10390 + }, + { + "epoch": 2.63, + "learning_rate": 6.872015728864339e-08, + "logits/chosen": -2.5545268058776855, + "logits/rejected": -2.3641724586486816, + "logps/chosen": -262.8212890625, + "logps/rejected": -310.0434265136719, + "loss": 0.0158, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6818441152572632, + "rewards/margins": 10.600229263305664, + "rewards/rejected": -12.282073020935059, + "step": 10400 + }, + { + "epoch": 2.63, + "learning_rate": 6.825203632618669e-08, + "logits/chosen": -2.5672922134399414, + "logits/rejected": -2.389941453933716, + "logps/chosen": -191.87167358398438, + "logps/rejected": -313.29840087890625, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3071268796920776, + "rewards/margins": 10.873403549194336, + "rewards/rejected": -12.18053150177002, + "step": 10410 + }, + { + "epoch": 2.63, + "learning_rate": 6.778391536372999e-08, + "logits/chosen": -2.5447535514831543, + "logits/rejected": -2.637056589126587, + "logps/chosen": -276.07684326171875, + "logps/rejected": -379.71685791015625, + "loss": 0.0335, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1864304542541504, + "rewards/margins": 10.02700424194336, + "rewards/rejected": -11.213434219360352, + "step": 10420 + }, + { + "epoch": 2.64, + "learning_rate": 6.731579440127328e-08, + "logits/chosen": -2.62526273727417, + "logits/rejected": -2.5361416339874268, + "logps/chosen": -335.57208251953125, + "logps/rejected": -381.73590087890625, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.532575011253357, + "rewards/margins": 11.382027626037598, + "rewards/rejected": -12.914602279663086, + "step": 10430 + }, + { + "epoch": 2.64, + "learning_rate": 6.684767343881658e-08, + "logits/chosen": -2.4614133834838867, + "logits/rejected": -2.4959049224853516, + "logps/chosen": -284.344482421875, + "logps/rejected": -406.2197265625, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.527146339416504, + "rewards/margins": 12.429636001586914, + "rewards/rejected": -14.95678424835205, + "step": 10440 + }, + { + "epoch": 2.64, + "learning_rate": 6.637955247635989e-08, + "logits/chosen": -2.5398077964782715, + "logits/rejected": -2.6172759532928467, + "logps/chosen": -310.23724365234375, + "logps/rejected": -421.4280700683594, + "loss": 0.0229, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.645061731338501, + "rewards/margins": 11.452204704284668, + "rewards/rejected": -13.097265243530273, + "step": 10450 + }, + { + "epoch": 2.64, + "learning_rate": 6.591143151390319e-08, + "logits/chosen": -2.495894193649292, + "logits/rejected": -2.372702121734619, + "logps/chosen": -197.970458984375, + "logps/rejected": -267.6205139160156, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6794233322143555, + "rewards/margins": 7.389477729797363, + "rewards/rejected": -10.068901062011719, + "step": 10460 + }, + { + "epoch": 2.65, + "learning_rate": 6.544331055144649e-08, + "logits/chosen": -2.656646251678467, + "logits/rejected": -2.4655566215515137, + "logps/chosen": -312.5660705566406, + "logps/rejected": -395.36932373046875, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6871867775917053, + "rewards/margins": 12.126971244812012, + "rewards/rejected": -11.439784049987793, + "step": 10470 + }, + { + "epoch": 2.65, + "learning_rate": 6.497518958898978e-08, + "logits/chosen": -2.4221789836883545, + "logits/rejected": -2.395259380340576, + "logps/chosen": -283.78118896484375, + "logps/rejected": -339.09893798828125, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4997386932373047, + "rewards/margins": 10.91909408569336, + "rewards/rejected": -11.418832778930664, + "step": 10480 + }, + { + "epoch": 2.65, + "learning_rate": 6.450706862653308e-08, + "logits/chosen": -2.5740628242492676, + "logits/rejected": -2.4489102363586426, + "logps/chosen": -309.19097900390625, + "logps/rejected": -340.6146545410156, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41388407349586487, + "rewards/margins": 11.361598014831543, + "rewards/rejected": -11.775482177734375, + "step": 10490 + }, + { + "epoch": 2.65, + "learning_rate": 6.40389476640764e-08, + "logits/chosen": -2.3948919773101807, + "logits/rejected": -2.3630425930023193, + "logps/chosen": -253.672607421875, + "logps/rejected": -326.77642822265625, + "loss": 0.0156, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.9204368591308594, + "rewards/margins": 9.881596565246582, + "rewards/rejected": -13.802035331726074, + "step": 10500 + }, + { + "epoch": 2.66, + "learning_rate": 6.35708267016197e-08, + "logits/chosen": -2.634661912918091, + "logits/rejected": -2.3652424812316895, + "logps/chosen": -318.3630065917969, + "logps/rejected": -486.04620361328125, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4128196239471436, + "rewards/margins": 19.352506637573242, + "rewards/rejected": -16.939685821533203, + "step": 10510 + }, + { + "epoch": 2.66, + "learning_rate": 6.3102705739163e-08, + "logits/chosen": -2.475853443145752, + "logits/rejected": -2.535086154937744, + "logps/chosen": -231.3102569580078, + "logps/rejected": -308.42950439453125, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7528905868530273, + "rewards/margins": 11.808151245117188, + "rewards/rejected": -13.561042785644531, + "step": 10520 + }, + { + "epoch": 2.66, + "learning_rate": 6.26345847767063e-08, + "logits/chosen": -2.5275111198425293, + "logits/rejected": -2.401824474334717, + "logps/chosen": -286.4459533691406, + "logps/rejected": -336.7516784667969, + "loss": 0.0404, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.68312406539917, + "rewards/margins": 10.263943672180176, + "rewards/rejected": -13.947067260742188, + "step": 10530 + }, + { + "epoch": 2.66, + "learning_rate": 6.21664638142496e-08, + "logits/chosen": -2.2744011878967285, + "logits/rejected": -2.3522772789001465, + "logps/chosen": -260.71173095703125, + "logps/rejected": -473.5498962402344, + "loss": 0.0144, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.97296142578125, + "rewards/margins": 15.849021911621094, + "rewards/rejected": -16.821985244750977, + "step": 10540 + }, + { + "epoch": 2.67, + "learning_rate": 6.16983428517929e-08, + "logits/chosen": -2.4145588874816895, + "logits/rejected": -2.5145926475524902, + "logps/chosen": -253.80517578125, + "logps/rejected": -349.5704040527344, + "loss": 0.0885, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6969666481018066, + "rewards/margins": 9.819896697998047, + "rewards/rejected": -12.516862869262695, + "step": 10550 + }, + { + "epoch": 2.67, + "learning_rate": 6.12302218893362e-08, + "logits/chosen": -2.484348773956299, + "logits/rejected": -2.463733673095703, + "logps/chosen": -348.70855712890625, + "logps/rejected": -386.32208251953125, + "loss": 0.0314, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07341752201318741, + "rewards/margins": 12.499094009399414, + "rewards/rejected": -12.425674438476562, + "step": 10560 + }, + { + "epoch": 2.67, + "learning_rate": 6.07621009268795e-08, + "logits/chosen": -2.4193224906921387, + "logits/rejected": -2.3769335746765137, + "logps/chosen": -242.24105834960938, + "logps/rejected": -316.8022766113281, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3817594051361084, + "rewards/margins": 10.313627243041992, + "rewards/rejected": -11.695385932922363, + "step": 10570 + }, + { + "epoch": 2.67, + "learning_rate": 6.029397996442281e-08, + "logits/chosen": -2.511824131011963, + "logits/rejected": -2.4816884994506836, + "logps/chosen": -306.25030517578125, + "logps/rejected": -437.5084533691406, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6340714693069458, + "rewards/margins": 14.742956161499023, + "rewards/rejected": -14.108884811401367, + "step": 10580 + }, + { + "epoch": 2.68, + "learning_rate": 5.982585900196611e-08, + "logits/chosen": -2.169275999069214, + "logits/rejected": -2.2812275886535645, + "logps/chosen": -256.701416015625, + "logps/rejected": -390.88909912109375, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.529537200927734, + "rewards/margins": 11.55712604522705, + "rewards/rejected": -16.0866641998291, + "step": 10590 + }, + { + "epoch": 2.68, + "learning_rate": 5.9357738039509406e-08, + "logits/chosen": -2.545469284057617, + "logits/rejected": -2.611917734146118, + "logps/chosen": -241.7361297607422, + "logps/rejected": -361.86517333984375, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0422825813293457, + "rewards/margins": 10.897112846374512, + "rewards/rejected": -11.939393997192383, + "step": 10600 + }, + { + "epoch": 2.68, + "learning_rate": 5.888961707705271e-08, + "logits/chosen": -2.51267409324646, + "logits/rejected": -2.470567226409912, + "logps/chosen": -233.7913055419922, + "logps/rejected": -376.94189453125, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0417325496673584, + "rewards/margins": 12.69041633605957, + "rewards/rejected": -12.732148170471191, + "step": 10610 + }, + { + "epoch": 2.68, + "learning_rate": 5.8421496114596004e-08, + "logits/chosen": -2.5030910968780518, + "logits/rejected": -2.6085524559020996, + "logps/chosen": -192.46926879882812, + "logps/rejected": -350.96728515625, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.685454249382019, + "rewards/margins": 10.346867561340332, + "rewards/rejected": -11.03232192993164, + "step": 10620 + }, + { + "epoch": 2.69, + "learning_rate": 5.7953375152139307e-08, + "logits/chosen": -2.4607090950012207, + "logits/rejected": -2.535057544708252, + "logps/chosen": -340.30914306640625, + "logps/rejected": -384.41802978515625, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.945713222026825, + "rewards/margins": 11.22203254699707, + "rewards/rejected": -12.167744636535645, + "step": 10630 + }, + { + "epoch": 2.69, + "learning_rate": 5.7485254189682616e-08, + "logits/chosen": -2.472196578979492, + "logits/rejected": -2.4685492515563965, + "logps/chosen": -281.9466247558594, + "logps/rejected": -399.18682861328125, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.288421630859375, + "rewards/margins": 11.785292625427246, + "rewards/rejected": -13.073715209960938, + "step": 10640 + }, + { + "epoch": 2.69, + "learning_rate": 5.701713322722591e-08, + "logits/chosen": -2.378995418548584, + "logits/rejected": -2.4454381465911865, + "logps/chosen": -316.51458740234375, + "logps/rejected": -399.79681396484375, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.655643939971924, + "rewards/margins": 12.092453002929688, + "rewards/rejected": -14.748095512390137, + "step": 10650 + }, + { + "epoch": 2.69, + "learning_rate": 5.6549012264769214e-08, + "logits/chosen": -2.4572110176086426, + "logits/rejected": -2.284625768661499, + "logps/chosen": -237.8368682861328, + "logps/rejected": -315.8360290527344, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2194294929504395, + "rewards/margins": 11.172201156616211, + "rewards/rejected": -12.391630172729492, + "step": 10660 + }, + { + "epoch": 2.7, + "learning_rate": 5.608089130231251e-08, + "logits/chosen": -2.373015880584717, + "logits/rejected": -2.5167531967163086, + "logps/chosen": -240.7644805908203, + "logps/rejected": -398.7191162109375, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.427962064743042, + "rewards/margins": 11.068601608276367, + "rewards/rejected": -12.496563911437988, + "step": 10670 + }, + { + "epoch": 2.7, + "learning_rate": 5.561277033985581e-08, + "logits/chosen": -2.5520968437194824, + "logits/rejected": -2.472224235534668, + "logps/chosen": -297.46484375, + "logps/rejected": -420.49755859375, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14769385755062103, + "rewards/margins": 13.067815780639648, + "rewards/rejected": -12.920123100280762, + "step": 10680 + }, + { + "epoch": 2.7, + "learning_rate": 5.514464937739912e-08, + "logits/chosen": -2.552321195602417, + "logits/rejected": -2.521683692932129, + "logps/chosen": -324.34600830078125, + "logps/rejected": -350.0135803222656, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8432782888412476, + "rewards/margins": 15.473007202148438, + "rewards/rejected": -13.629728317260742, + "step": 10690 + }, + { + "epoch": 2.7, + "learning_rate": 5.467652841494242e-08, + "logits/chosen": -2.4410221576690674, + "logits/rejected": -2.261835813522339, + "logps/chosen": -206.8491668701172, + "logps/rejected": -321.8210754394531, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9713339805603027, + "rewards/margins": 10.97213077545166, + "rewards/rejected": -13.943461418151855, + "step": 10700 + }, + { + "epoch": 2.71, + "learning_rate": 5.420840745248572e-08, + "logits/chosen": -2.504394054412842, + "logits/rejected": -2.1629796028137207, + "logps/chosen": -270.15753173828125, + "logps/rejected": -349.7294921875, + "loss": 0.0234, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5076284408569336, + "rewards/margins": 10.655898094177246, + "rewards/rejected": -12.16352653503418, + "step": 10710 + }, + { + "epoch": 2.71, + "learning_rate": 5.374028649002902e-08, + "logits/chosen": -2.432774543762207, + "logits/rejected": -2.387944459915161, + "logps/chosen": -270.55169677734375, + "logps/rejected": -308.4688720703125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.218773365020752, + "rewards/margins": 11.117988586425781, + "rewards/rejected": -13.336763381958008, + "step": 10720 + }, + { + "epoch": 2.71, + "learning_rate": 5.327216552757232e-08, + "logits/chosen": -2.429544687271118, + "logits/rejected": -2.3968300819396973, + "logps/chosen": -211.1251983642578, + "logps/rejected": -322.4640808105469, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0799713134765625, + "rewards/margins": 12.16854190826416, + "rewards/rejected": -14.248514175415039, + "step": 10730 + }, + { + "epoch": 2.71, + "learning_rate": 5.280404456511563e-08, + "logits/chosen": -2.376677989959717, + "logits/rejected": -2.472378969192505, + "logps/chosen": -201.08544921875, + "logps/rejected": -444.22314453125, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6852868795394897, + "rewards/margins": 11.311137199401855, + "rewards/rejected": -12.996424674987793, + "step": 10740 + }, + { + "epoch": 2.72, + "learning_rate": 5.2335923602658924e-08, + "logits/chosen": -2.5168099403381348, + "logits/rejected": -2.4923369884490967, + "logps/chosen": -308.19244384765625, + "logps/rejected": -460.25579833984375, + "loss": 0.023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.394303560256958, + "rewards/margins": 15.313270568847656, + "rewards/rejected": -16.70757484436035, + "step": 10750 + }, + { + "epoch": 2.72, + "learning_rate": 5.186780264020223e-08, + "logits/chosen": -2.423064708709717, + "logits/rejected": -2.377765417098999, + "logps/chosen": -280.87457275390625, + "logps/rejected": -397.24615478515625, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.421639919281006, + "rewards/margins": 11.679462432861328, + "rewards/rejected": -15.101102828979492, + "step": 10760 + }, + { + "epoch": 2.72, + "learning_rate": 5.139968167774553e-08, + "logits/chosen": -2.542541742324829, + "logits/rejected": -2.5660011768341064, + "logps/chosen": -311.539306640625, + "logps/rejected": -374.19122314453125, + "loss": 0.0229, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1220595836639404, + "rewards/margins": 9.83003044128418, + "rewards/rejected": -12.9520902633667, + "step": 10770 + }, + { + "epoch": 2.72, + "learning_rate": 5.0931560715288825e-08, + "logits/chosen": -2.3945837020874023, + "logits/rejected": -2.451707601547241, + "logps/chosen": -262.0988464355469, + "logps/rejected": -347.1482238769531, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.536090850830078, + "rewards/margins": 11.240296363830566, + "rewards/rejected": -13.776387214660645, + "step": 10780 + }, + { + "epoch": 2.73, + "learning_rate": 5.046343975283213e-08, + "logits/chosen": -2.617018222808838, + "logits/rejected": -2.5935912132263184, + "logps/chosen": -279.4792175292969, + "logps/rejected": -379.79638671875, + "loss": 0.015, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.8385303020477295, + "rewards/margins": 9.282624244689941, + "rewards/rejected": -13.12115478515625, + "step": 10790 + }, + { + "epoch": 2.73, + "learning_rate": 4.999531879037543e-08, + "logits/chosen": -2.360745906829834, + "logits/rejected": -2.4422788619995117, + "logps/chosen": -244.77255249023438, + "logps/rejected": -348.60455322265625, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.752306342124939, + "rewards/margins": 12.563000679016113, + "rewards/rejected": -14.3153076171875, + "step": 10800 + }, + { + "epoch": 2.73, + "learning_rate": 4.952719782791873e-08, + "logits/chosen": -2.506380558013916, + "logits/rejected": -2.4522621631622314, + "logps/chosen": -245.85659790039062, + "logps/rejected": -313.57568359375, + "loss": 0.0214, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.0090579986572266, + "rewards/margins": 8.213888168334961, + "rewards/rejected": -11.222947120666504, + "step": 10810 + }, + { + "epoch": 2.74, + "learning_rate": 4.9059076865462035e-08, + "logits/chosen": -2.6841819286346436, + "logits/rejected": -2.5684549808502197, + "logps/chosen": -318.21685791015625, + "logps/rejected": -471.38177490234375, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1088840961456299, + "rewards/margins": 12.368844985961914, + "rewards/rejected": -13.477727890014648, + "step": 10820 + }, + { + "epoch": 2.74, + "learning_rate": 4.859095590300533e-08, + "logits/chosen": -2.425753593444824, + "logits/rejected": -2.3956360816955566, + "logps/chosen": -189.13650512695312, + "logps/rejected": -332.4715576171875, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3720785677433014, + "rewards/margins": 12.712925910949707, + "rewards/rejected": -13.085004806518555, + "step": 10830 + }, + { + "epoch": 2.74, + "learning_rate": 4.8122834940548634e-08, + "logits/chosen": -2.610264301300049, + "logits/rejected": -2.447481155395508, + "logps/chosen": -342.28509521484375, + "logps/rejected": -377.2023010253906, + "loss": 0.0113, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.163722515106201, + "rewards/margins": 11.125509262084961, + "rewards/rejected": -13.28923225402832, + "step": 10840 + }, + { + "epoch": 2.74, + "learning_rate": 4.765471397809194e-08, + "logits/chosen": -2.4246602058410645, + "logits/rejected": -2.326582670211792, + "logps/chosen": -297.70037841796875, + "logps/rejected": -320.5494079589844, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0987861379981041, + "rewards/margins": 13.165461540222168, + "rewards/rejected": -13.066675186157227, + "step": 10850 + }, + { + "epoch": 2.75, + "learning_rate": 4.718659301563524e-08, + "logits/chosen": -2.4775731563568115, + "logits/rejected": -2.311589002609253, + "logps/chosen": -388.44189453125, + "logps/rejected": -500.1787109375, + "loss": 0.0101, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.110121488571167, + "rewards/margins": 12.782126426696777, + "rewards/rejected": -15.892248153686523, + "step": 10860 + }, + { + "epoch": 2.75, + "learning_rate": 4.671847205317854e-08, + "logits/chosen": -2.391537666320801, + "logits/rejected": -2.377824306488037, + "logps/chosen": -342.33648681640625, + "logps/rejected": -438.40264892578125, + "loss": 0.0285, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7127606272697449, + "rewards/margins": 11.989851951599121, + "rewards/rejected": -12.702611923217773, + "step": 10870 + }, + { + "epoch": 2.75, + "learning_rate": 4.625035109072184e-08, + "logits/chosen": -2.449780225753784, + "logits/rejected": -2.3901445865631104, + "logps/chosen": -292.65264892578125, + "logps/rejected": -432.4183044433594, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.945425033569336, + "rewards/margins": 12.054902076721191, + "rewards/rejected": -15.000328063964844, + "step": 10880 + }, + { + "epoch": 2.75, + "learning_rate": 4.578223012826514e-08, + "logits/chosen": -2.4154934883117676, + "logits/rejected": -2.4078152179718018, + "logps/chosen": -267.01678466796875, + "logps/rejected": -375.1787109375, + "loss": 0.0268, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3427653312683105, + "rewards/margins": 11.284711837768555, + "rewards/rejected": -14.627476692199707, + "step": 10890 + }, + { + "epoch": 2.76, + "learning_rate": 4.531410916580844e-08, + "logits/chosen": -2.5893101692199707, + "logits/rejected": -2.4205315113067627, + "logps/chosen": -273.5794982910156, + "logps/rejected": -325.26898193359375, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7947503924369812, + "rewards/margins": 10.781389236450195, + "rewards/rejected": -9.986639976501465, + "step": 10900 + }, + { + "epoch": 2.76, + "learning_rate": 4.4845988203351745e-08, + "logits/chosen": -2.6362905502319336, + "logits/rejected": -2.4879088401794434, + "logps/chosen": -290.63250732421875, + "logps/rejected": -322.5184326171875, + "loss": 0.0187, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.050183653831482, + "rewards/margins": 9.369373321533203, + "rewards/rejected": -10.419557571411133, + "step": 10910 + }, + { + "epoch": 2.76, + "learning_rate": 4.437786724089505e-08, + "logits/chosen": -2.4191794395446777, + "logits/rejected": -2.46612548828125, + "logps/chosen": -319.10308837890625, + "logps/rejected": -402.46124267578125, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31841006875038147, + "rewards/margins": 11.323060035705566, + "rewards/rejected": -11.641469955444336, + "step": 10920 + }, + { + "epoch": 2.76, + "learning_rate": 4.3909746278438344e-08, + "logits/chosen": -2.5394818782806396, + "logits/rejected": -2.576970100402832, + "logps/chosen": -236.96401977539062, + "logps/rejected": -392.5608825683594, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.159064769744873, + "rewards/margins": 11.52956771850586, + "rewards/rejected": -13.688632011413574, + "step": 10930 + }, + { + "epoch": 2.77, + "learning_rate": 4.3441625315981646e-08, + "logits/chosen": -2.5915493965148926, + "logits/rejected": -2.467534303665161, + "logps/chosen": -250.9964599609375, + "logps/rejected": -292.21600341796875, + "loss": 0.0328, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5387306213378906, + "rewards/margins": 9.883870124816895, + "rewards/rejected": -11.422601699829102, + "step": 10940 + }, + { + "epoch": 2.77, + "learning_rate": 4.297350435352495e-08, + "logits/chosen": -2.506401538848877, + "logits/rejected": -2.489722728729248, + "logps/chosen": -351.91064453125, + "logps/rejected": -438.94818115234375, + "loss": 0.0162, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.900360107421875, + "rewards/margins": 14.668928146362305, + "rewards/rejected": -16.569290161132812, + "step": 10950 + }, + { + "epoch": 2.77, + "learning_rate": 4.250538339106825e-08, + "logits/chosen": -2.2875542640686035, + "logits/rejected": -2.2873737812042236, + "logps/chosen": -345.0179748535156, + "logps/rejected": -385.7926940917969, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6500468254089355, + "rewards/margins": 11.161527633666992, + "rewards/rejected": -14.811576843261719, + "step": 10960 + }, + { + "epoch": 2.77, + "learning_rate": 4.2037262428611554e-08, + "logits/chosen": -2.2688958644866943, + "logits/rejected": -2.2536237239837646, + "logps/chosen": -306.314697265625, + "logps/rejected": -414.4088439941406, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5722497701644897, + "rewards/margins": 11.489215850830078, + "rewards/rejected": -13.0614652633667, + "step": 10970 + }, + { + "epoch": 2.78, + "learning_rate": 4.156914146615485e-08, + "logits/chosen": -2.456655502319336, + "logits/rejected": -2.4888079166412354, + "logps/chosen": -206.24093627929688, + "logps/rejected": -329.0902404785156, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9091620445251465, + "rewards/margins": 9.75031566619873, + "rewards/rejected": -12.659477233886719, + "step": 10980 + }, + { + "epoch": 2.78, + "learning_rate": 4.110102050369815e-08, + "logits/chosen": -2.4782252311706543, + "logits/rejected": -2.437685966491699, + "logps/chosen": -224.4599609375, + "logps/rejected": -357.8743591308594, + "loss": 0.0249, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.37739634513855, + "rewards/margins": 12.426565170288086, + "rewards/rejected": -15.803962707519531, + "step": 10990 + }, + { + "epoch": 2.78, + "learning_rate": 4.0632899541241455e-08, + "logits/chosen": -2.588283061981201, + "logits/rejected": -2.510744094848633, + "logps/chosen": -332.6466369628906, + "logps/rejected": -408.37982177734375, + "loss": 0.0127, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.765860080718994, + "rewards/margins": 11.549457550048828, + "rewards/rejected": -14.315317153930664, + "step": 11000 + }, + { + "epoch": 2.78, + "learning_rate": 4.016477857878476e-08, + "logits/chosen": -2.4237632751464844, + "logits/rejected": -2.3809571266174316, + "logps/chosen": -360.4668273925781, + "logps/rejected": -399.2850036621094, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020248282700777054, + "rewards/margins": 12.246782302856445, + "rewards/rejected": -12.267030715942383, + "step": 11010 + }, + { + "epoch": 2.79, + "learning_rate": 3.969665761632806e-08, + "logits/chosen": -2.5303540229797363, + "logits/rejected": -2.4834442138671875, + "logps/chosen": -244.11532592773438, + "logps/rejected": -366.6121520996094, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3335506916046143, + "rewards/margins": 10.373838424682617, + "rewards/rejected": -12.707388877868652, + "step": 11020 + }, + { + "epoch": 2.79, + "learning_rate": 3.9228536653871356e-08, + "logits/chosen": -2.4338903427124023, + "logits/rejected": -2.4130215644836426, + "logps/chosen": -202.8179168701172, + "logps/rejected": -318.9550476074219, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7410482168197632, + "rewards/margins": 11.074950218200684, + "rewards/rejected": -11.815999031066895, + "step": 11030 + }, + { + "epoch": 2.79, + "learning_rate": 3.876041569141466e-08, + "logits/chosen": -2.6266818046569824, + "logits/rejected": -2.570533275604248, + "logps/chosen": -289.25604248046875, + "logps/rejected": -355.9947814941406, + "loss": 0.0115, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.634669303894043, + "rewards/margins": 10.495134353637695, + "rewards/rejected": -12.129803657531738, + "step": 11040 + }, + { + "epoch": 2.79, + "learning_rate": 3.829229472895796e-08, + "logits/chosen": -2.49495267868042, + "logits/rejected": -2.5380091667175293, + "logps/chosen": -273.33343505859375, + "logps/rejected": -460.66094970703125, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1227961778640747, + "rewards/margins": 16.76388931274414, + "rewards/rejected": -15.641095161437988, + "step": 11050 + }, + { + "epoch": 2.8, + "learning_rate": 3.782417376650126e-08, + "logits/chosen": -2.365868330001831, + "logits/rejected": -2.394813060760498, + "logps/chosen": -311.5312194824219, + "logps/rejected": -355.22900390625, + "loss": 0.0101, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.4357383251190186, + "rewards/margins": 9.920340538024902, + "rewards/rejected": -13.3560791015625, + "step": 11060 + }, + { + "epoch": 2.8, + "learning_rate": 3.7356052804044567e-08, + "logits/chosen": -2.744568347930908, + "logits/rejected": -2.6600661277770996, + "logps/chosen": -371.8135681152344, + "logps/rejected": -508.0810546875, + "loss": 0.026, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4500129222869873, + "rewards/margins": 11.732702255249023, + "rewards/rejected": -13.182714462280273, + "step": 11070 + }, + { + "epoch": 2.8, + "learning_rate": 3.688793184158787e-08, + "logits/chosen": -2.5530102252960205, + "logits/rejected": -2.4516406059265137, + "logps/chosen": -269.38934326171875, + "logps/rejected": -336.32220458984375, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0966869592666626, + "rewards/margins": 10.917011260986328, + "rewards/rejected": -12.01369857788086, + "step": 11080 + }, + { + "epoch": 2.8, + "learning_rate": 3.6419810879131165e-08, + "logits/chosen": -2.6439976692199707, + "logits/rejected": -2.5483946800231934, + "logps/chosen": -311.73126220703125, + "logps/rejected": -396.8368835449219, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8862838745117188, + "rewards/margins": 11.635208129882812, + "rewards/rejected": -14.521492004394531, + "step": 11090 + }, + { + "epoch": 2.81, + "learning_rate": 3.595168991667447e-08, + "logits/chosen": -2.478001117706299, + "logits/rejected": -2.47713303565979, + "logps/chosen": -291.5771179199219, + "logps/rejected": -327.6292419433594, + "loss": 0.027, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.647336006164551, + "rewards/margins": 10.578722953796387, + "rewards/rejected": -13.226058959960938, + "step": 11100 + }, + { + "epoch": 2.81, + "learning_rate": 3.5483568954217764e-08, + "logits/chosen": -2.380610704421997, + "logits/rejected": -2.281994342803955, + "logps/chosen": -330.2130126953125, + "logps/rejected": -382.11248779296875, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2577191889286041, + "rewards/margins": 13.458898544311523, + "rewards/rejected": -13.716618537902832, + "step": 11110 + }, + { + "epoch": 2.81, + "learning_rate": 3.501544799176107e-08, + "logits/chosen": -2.6637930870056152, + "logits/rejected": -2.6697707176208496, + "logps/chosen": -315.9070739746094, + "logps/rejected": -398.7461242675781, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1222403049468994, + "rewards/margins": 12.726017951965332, + "rewards/rejected": -13.848257064819336, + "step": 11120 + }, + { + "epoch": 2.81, + "learning_rate": 3.4547327029304375e-08, + "logits/chosen": -2.4287617206573486, + "logits/rejected": -2.3643672466278076, + "logps/chosen": -240.102294921875, + "logps/rejected": -320.3937072753906, + "loss": 0.011, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3844046592712402, + "rewards/margins": 9.640578269958496, + "rewards/rejected": -13.024984359741211, + "step": 11130 + }, + { + "epoch": 2.82, + "learning_rate": 3.407920606684767e-08, + "logits/chosen": -2.393781900405884, + "logits/rejected": -2.2155890464782715, + "logps/chosen": -266.0910949707031, + "logps/rejected": -352.65338134765625, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3181581497192383, + "rewards/margins": 13.180933952331543, + "rewards/rejected": -14.499092102050781, + "step": 11140 + }, + { + "epoch": 2.82, + "learning_rate": 3.3611085104390974e-08, + "logits/chosen": -2.5786240100860596, + "logits/rejected": -2.470693349838257, + "logps/chosen": -399.3895568847656, + "logps/rejected": -467.3301696777344, + "loss": 0.0251, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16405799984931946, + "rewards/margins": 13.329595565795898, + "rewards/rejected": -13.49365520477295, + "step": 11150 + }, + { + "epoch": 2.82, + "learning_rate": 3.314296414193427e-08, + "logits/chosen": -2.2679240703582764, + "logits/rejected": -2.252685070037842, + "logps/chosen": -187.90406799316406, + "logps/rejected": -356.1705017089844, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4501420855522156, + "rewards/margins": 13.709848403930664, + "rewards/rejected": -13.25970458984375, + "step": 11160 + }, + { + "epoch": 2.82, + "learning_rate": 3.267484317947757e-08, + "logits/chosen": -2.6549148559570312, + "logits/rejected": -2.5700325965881348, + "logps/chosen": -350.21295166015625, + "logps/rejected": -372.63189697265625, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11017012596130371, + "rewards/margins": 12.352564811706543, + "rewards/rejected": -12.242395401000977, + "step": 11170 + }, + { + "epoch": 2.83, + "learning_rate": 3.220672221702088e-08, + "logits/chosen": -2.405151844024658, + "logits/rejected": -2.482081174850464, + "logps/chosen": -214.7506866455078, + "logps/rejected": -394.0699768066406, + "loss": 0.0192, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3040611743927, + "rewards/margins": 11.162083625793457, + "rewards/rejected": -14.466143608093262, + "step": 11180 + }, + { + "epoch": 2.83, + "learning_rate": 3.173860125456418e-08, + "logits/chosen": -2.4875283241271973, + "logits/rejected": -2.5791056156158447, + "logps/chosen": -278.1722412109375, + "logps/rejected": -406.5775451660156, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8419070243835449, + "rewards/margins": 12.21147632598877, + "rewards/rejected": -13.053384780883789, + "step": 11190 + }, + { + "epoch": 2.83, + "learning_rate": 3.127048029210748e-08, + "logits/chosen": -2.462008237838745, + "logits/rejected": -2.3970558643341064, + "logps/chosen": -202.25149536132812, + "logps/rejected": -290.50592041015625, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1536483764648438, + "rewards/margins": 9.67103385925293, + "rewards/rejected": -11.824682235717773, + "step": 11200 + }, + { + "epoch": 2.83, + "learning_rate": 3.0802359329650776e-08, + "logits/chosen": -2.5316426753997803, + "logits/rejected": -2.4360640048980713, + "logps/chosen": -181.0486602783203, + "logps/rejected": -243.1605682373047, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2299704551696777, + "rewards/margins": 10.221961975097656, + "rewards/rejected": -11.451932907104492, + "step": 11210 + }, + { + "epoch": 2.84, + "learning_rate": 3.0334238367194085e-08, + "logits/chosen": -2.749342441558838, + "logits/rejected": -2.6762137413024902, + "logps/chosen": -331.9028015136719, + "logps/rejected": -490.9769592285156, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8896066546440125, + "rewards/margins": 14.539273262023926, + "rewards/rejected": -13.649667739868164, + "step": 11220 + }, + { + "epoch": 2.84, + "learning_rate": 2.986611740473738e-08, + "logits/chosen": -2.4288411140441895, + "logits/rejected": -2.336217164993286, + "logps/chosen": -238.291259765625, + "logps/rejected": -331.3802185058594, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2435986995697021, + "rewards/margins": 10.70294189453125, + "rewards/rejected": -11.946540832519531, + "step": 11230 + }, + { + "epoch": 2.84, + "learning_rate": 2.9397996442280687e-08, + "logits/chosen": -2.5155584812164307, + "logits/rejected": -2.2555930614471436, + "logps/chosen": -261.25, + "logps/rejected": -329.39801025390625, + "loss": 0.0203, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.6180357933044434, + "rewards/margins": 10.947762489318848, + "rewards/rejected": -13.56579875946045, + "step": 11240 + }, + { + "epoch": 2.84, + "learning_rate": 2.8929875479823986e-08, + "logits/chosen": -2.5518200397491455, + "logits/rejected": -2.547988176345825, + "logps/chosen": -235.08847045898438, + "logps/rejected": -342.1391296386719, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9798015356063843, + "rewards/margins": 14.8731050491333, + "rewards/rejected": -13.893304824829102, + "step": 11250 + }, + { + "epoch": 2.85, + "learning_rate": 2.8461754517367285e-08, + "logits/chosen": -2.3784825801849365, + "logits/rejected": -2.3726489543914795, + "logps/chosen": -240.8814239501953, + "logps/rejected": -376.7526550292969, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18654665350914001, + "rewards/margins": 11.669304847717285, + "rewards/rejected": -11.482756614685059, + "step": 11260 + }, + { + "epoch": 2.85, + "learning_rate": 2.7993633554910588e-08, + "logits/chosen": -2.561047077178955, + "logits/rejected": -2.493727207183838, + "logps/chosen": -254.25119018554688, + "logps/rejected": -413.8963317871094, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2032550573349, + "rewards/margins": 15.142125129699707, + "rewards/rejected": -13.938870429992676, + "step": 11270 + }, + { + "epoch": 2.85, + "learning_rate": 2.7525512592453887e-08, + "logits/chosen": -2.58831787109375, + "logits/rejected": -2.430523157119751, + "logps/chosen": -289.0402526855469, + "logps/rejected": -370.84027099609375, + "loss": 0.017, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0527355670928955, + "rewards/margins": 12.028496742248535, + "rewards/rejected": -13.081232070922852, + "step": 11280 + }, + { + "epoch": 2.85, + "learning_rate": 2.705739162999719e-08, + "logits/chosen": -2.622467279434204, + "logits/rejected": -2.708653211593628, + "logps/chosen": -349.805419921875, + "logps/rejected": -452.0779724121094, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3241063952445984, + "rewards/margins": 11.919878005981445, + "rewards/rejected": -12.24398422241211, + "step": 11290 + }, + { + "epoch": 2.86, + "learning_rate": 2.6589270667540492e-08, + "logits/chosen": -2.526979684829712, + "logits/rejected": -2.485935688018799, + "logps/chosen": -291.6176452636719, + "logps/rejected": -466.741943359375, + "loss": 0.0218, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.716256856918335, + "rewards/margins": 18.379528045654297, + "rewards/rejected": -20.095787048339844, + "step": 11300 + }, + { + "epoch": 2.86, + "learning_rate": 2.6121149705083792e-08, + "logits/chosen": -2.4464025497436523, + "logits/rejected": -2.3842031955718994, + "logps/chosen": -292.5438537597656, + "logps/rejected": -340.6904296875, + "loss": 0.0208, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9673873782157898, + "rewards/margins": 11.448567390441895, + "rewards/rejected": -12.415953636169434, + "step": 11310 + }, + { + "epoch": 2.86, + "learning_rate": 2.5653028742627094e-08, + "logits/chosen": -2.5121302604675293, + "logits/rejected": -2.506096363067627, + "logps/chosen": -249.0, + "logps/rejected": -413.91510009765625, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6290432214736938, + "rewards/margins": 10.821525573730469, + "rewards/rejected": -12.450569152832031, + "step": 11320 + }, + { + "epoch": 2.86, + "learning_rate": 2.5184907780170397e-08, + "logits/chosen": -2.5698561668395996, + "logits/rejected": -2.580622673034668, + "logps/chosen": -253.86483764648438, + "logps/rejected": -340.75372314453125, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.206702470779419, + "rewards/margins": 10.514699935913086, + "rewards/rejected": -12.721403121948242, + "step": 11330 + }, + { + "epoch": 2.87, + "learning_rate": 2.4716786817713696e-08, + "logits/chosen": -2.641425132751465, + "logits/rejected": -2.6758666038513184, + "logps/chosen": -334.5554504394531, + "logps/rejected": -419.2228088378906, + "loss": 0.025, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.09734637290239334, + "rewards/margins": 13.741844177246094, + "rewards/rejected": -13.839190483093262, + "step": 11340 + }, + { + "epoch": 2.87, + "learning_rate": 2.4248665855257e-08, + "logits/chosen": -2.596531391143799, + "logits/rejected": -2.584341526031494, + "logps/chosen": -270.8020935058594, + "logps/rejected": -418.1478576660156, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.781860113143921, + "rewards/margins": 10.423555374145508, + "rewards/rejected": -13.205415725708008, + "step": 11350 + }, + { + "epoch": 2.87, + "learning_rate": 2.3780544892800298e-08, + "logits/chosen": -2.4953293800354004, + "logits/rejected": -2.465010166168213, + "logps/chosen": -248.66635131835938, + "logps/rejected": -443.67816162109375, + "loss": 0.0388, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.2700600624084473, + "rewards/margins": 13.418362617492676, + "rewards/rejected": -15.688423156738281, + "step": 11360 + }, + { + "epoch": 2.87, + "learning_rate": 2.3312423930343597e-08, + "logits/chosen": -2.4786221981048584, + "logits/rejected": -2.429025650024414, + "logps/chosen": -261.0307312011719, + "logps/rejected": -350.3789978027344, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10001619905233383, + "rewards/margins": 14.259042739868164, + "rewards/rejected": -14.359057426452637, + "step": 11370 + }, + { + "epoch": 2.88, + "learning_rate": 2.2844302967886903e-08, + "logits/chosen": -2.468181848526001, + "logits/rejected": -2.3002982139587402, + "logps/chosen": -270.9293518066406, + "logps/rejected": -373.20037841796875, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2403790950775146, + "rewards/margins": 12.128232955932617, + "rewards/rejected": -14.368614196777344, + "step": 11380 + }, + { + "epoch": 2.88, + "learning_rate": 2.2376182005430202e-08, + "logits/chosen": -2.557896375656128, + "logits/rejected": -2.448179244995117, + "logps/chosen": -329.0253601074219, + "logps/rejected": -477.41351318359375, + "loss": 0.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.408167600631714, + "rewards/margins": 12.042343139648438, + "rewards/rejected": -14.45051097869873, + "step": 11390 + }, + { + "epoch": 2.88, + "learning_rate": 2.19080610429735e-08, + "logits/chosen": -2.376495838165283, + "logits/rejected": -2.2772092819213867, + "logps/chosen": -261.8798828125, + "logps/rejected": -300.6114807128906, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9720168113708496, + "rewards/margins": 9.81820011138916, + "rewards/rejected": -12.790217399597168, + "step": 11400 + }, + { + "epoch": 2.88, + "learning_rate": 2.1439940080516804e-08, + "logits/chosen": -2.4556946754455566, + "logits/rejected": -2.4068713188171387, + "logps/chosen": -358.0137634277344, + "logps/rejected": -476.787353515625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.019568920135498, + "rewards/margins": 18.053943634033203, + "rewards/rejected": -16.034372329711914, + "step": 11410 + }, + { + "epoch": 2.89, + "learning_rate": 2.0971819118060107e-08, + "logits/chosen": -2.536691188812256, + "logits/rejected": -2.4123189449310303, + "logps/chosen": -287.26849365234375, + "logps/rejected": -335.825439453125, + "loss": 0.0322, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.085879802703857, + "rewards/margins": 10.493057250976562, + "rewards/rejected": -14.578936576843262, + "step": 11420 + }, + { + "epoch": 2.89, + "learning_rate": 2.050369815560341e-08, + "logits/chosen": -2.3083388805389404, + "logits/rejected": -2.3589773178100586, + "logps/chosen": -225.0576171875, + "logps/rejected": -419.41583251953125, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46240463852882385, + "rewards/margins": 13.146392822265625, + "rewards/rejected": -13.608795166015625, + "step": 11430 + }, + { + "epoch": 2.89, + "learning_rate": 2.003557719314671e-08, + "logits/chosen": -2.2630105018615723, + "logits/rejected": -2.2091078758239746, + "logps/chosen": -280.4766845703125, + "logps/rejected": -324.72369384765625, + "loss": 0.0695, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.704224109649658, + "rewards/margins": 9.590178489685059, + "rewards/rejected": -12.294401168823242, + "step": 11440 + }, + { + "epoch": 2.89, + "learning_rate": 1.9567456230690008e-08, + "logits/chosen": -2.5785865783691406, + "logits/rejected": -2.4064249992370605, + "logps/chosen": -270.0608215332031, + "logps/rejected": -344.23199462890625, + "loss": 0.0189, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2489631175994873, + "rewards/margins": 9.269261360168457, + "rewards/rejected": -11.518223762512207, + "step": 11450 + }, + { + "epoch": 2.9, + "learning_rate": 1.909933526823331e-08, + "logits/chosen": -2.4120113849639893, + "logits/rejected": -2.4648735523223877, + "logps/chosen": -183.39422607421875, + "logps/rejected": -346.17572021484375, + "loss": 0.0335, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9732882976531982, + "rewards/margins": 12.807714462280273, + "rewards/rejected": -14.78100299835205, + "step": 11460 + }, + { + "epoch": 2.9, + "learning_rate": 1.8631214305776613e-08, + "logits/chosen": -2.3059537410736084, + "logits/rejected": -2.301574945449829, + "logps/chosen": -217.8255615234375, + "logps/rejected": -328.16278076171875, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0877737998962402, + "rewards/margins": 11.274978637695312, + "rewards/rejected": -13.362752914428711, + "step": 11470 + }, + { + "epoch": 2.9, + "learning_rate": 1.8163093343319912e-08, + "logits/chosen": -2.6343302726745605, + "logits/rejected": -2.536257266998291, + "logps/chosen": -309.25579833984375, + "logps/rejected": -385.6678771972656, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29305902123451233, + "rewards/margins": 11.193380355834961, + "rewards/rejected": -10.900321960449219, + "step": 11480 + }, + { + "epoch": 2.9, + "learning_rate": 1.7694972380863215e-08, + "logits/chosen": -2.252707004547119, + "logits/rejected": -2.2293648719787598, + "logps/chosen": -253.23196411132812, + "logps/rejected": -451.1348571777344, + "loss": 0.0292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5220910906791687, + "rewards/margins": 12.508794784545898, + "rewards/rejected": -13.03088665008545, + "step": 11490 + }, + { + "epoch": 2.91, + "learning_rate": 1.7226851418406514e-08, + "logits/chosen": -2.458756685256958, + "logits/rejected": -2.4679484367370605, + "logps/chosen": -298.9981689453125, + "logps/rejected": -424.33660888671875, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1835033893585205, + "rewards/margins": 10.601720809936523, + "rewards/rejected": -13.785223007202148, + "step": 11500 + }, + { + "epoch": 2.91, + "learning_rate": 1.675873045594982e-08, + "logits/chosen": -2.4793026447296143, + "logits/rejected": -2.4787607192993164, + "logps/chosen": -251.3925323486328, + "logps/rejected": -372.2521057128906, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5706532001495361, + "rewards/margins": 13.254281997680664, + "rewards/rejected": -13.824934005737305, + "step": 11510 + }, + { + "epoch": 2.91, + "learning_rate": 1.629060949349312e-08, + "logits/chosen": -2.3445253372192383, + "logits/rejected": -2.3248162269592285, + "logps/chosen": -308.88311767578125, + "logps/rejected": -379.0182189941406, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7497143149375916, + "rewards/margins": 12.753561019897461, + "rewards/rejected": -13.503274917602539, + "step": 11520 + }, + { + "epoch": 2.91, + "learning_rate": 1.5822488531036418e-08, + "logits/chosen": -2.6158080101013184, + "logits/rejected": -2.552791118621826, + "logps/chosen": -260.298583984375, + "logps/rejected": -346.90704345703125, + "loss": 0.03, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6114883422851562, + "rewards/margins": 11.58378791809082, + "rewards/rejected": -14.195277214050293, + "step": 11530 + }, + { + "epoch": 2.92, + "learning_rate": 1.535436756857972e-08, + "logits/chosen": -2.60005259513855, + "logits/rejected": -2.506302833557129, + "logps/chosen": -284.90252685546875, + "logps/rejected": -413.2223205566406, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1742764711380005, + "rewards/margins": 13.892889022827148, + "rewards/rejected": -15.067166328430176, + "step": 11540 + }, + { + "epoch": 2.92, + "learning_rate": 1.4886246606123022e-08, + "logits/chosen": -2.702465295791626, + "logits/rejected": -2.673305034637451, + "logps/chosen": -271.26397705078125, + "logps/rejected": -452.94183349609375, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5330944061279297, + "rewards/margins": 13.213106155395508, + "rewards/rejected": -15.74620246887207, + "step": 11550 + }, + { + "epoch": 2.92, + "learning_rate": 1.4418125643666323e-08, + "logits/chosen": -2.5396695137023926, + "logits/rejected": -2.485886335372925, + "logps/chosen": -330.40960693359375, + "logps/rejected": -416.08807373046875, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9634650945663452, + "rewards/margins": 10.967055320739746, + "rewards/rejected": -11.930520057678223, + "step": 11560 + }, + { + "epoch": 2.92, + "learning_rate": 1.3950004681209625e-08, + "logits/chosen": -2.4903059005737305, + "logits/rejected": -2.3227312564849854, + "logps/chosen": -357.42529296875, + "logps/rejected": -345.2140197753906, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0612937211990356, + "rewards/margins": 11.046579360961914, + "rewards/rejected": -12.107873916625977, + "step": 11570 + }, + { + "epoch": 2.93, + "learning_rate": 1.3481883718752925e-08, + "logits/chosen": -2.5871903896331787, + "logits/rejected": -2.5559890270233154, + "logps/chosen": -234.2266082763672, + "logps/rejected": -360.0385437011719, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6578261852264404, + "rewards/margins": 10.966938972473145, + "rewards/rejected": -13.624765396118164, + "step": 11580 + }, + { + "epoch": 2.93, + "learning_rate": 1.3013762756296227e-08, + "logits/chosen": -2.605954170227051, + "logits/rejected": -2.6144227981567383, + "logps/chosen": -272.157470703125, + "logps/rejected": -378.93939208984375, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1602685451507568, + "rewards/margins": 11.329082489013672, + "rewards/rejected": -12.489351272583008, + "step": 11590 + }, + { + "epoch": 2.93, + "learning_rate": 1.2545641793839528e-08, + "logits/chosen": -2.4722487926483154, + "logits/rejected": -2.380265712738037, + "logps/chosen": -335.23895263671875, + "logps/rejected": -396.14385986328125, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5032278895378113, + "rewards/margins": 14.567062377929688, + "rewards/rejected": -14.063835144042969, + "step": 11600 + }, + { + "epoch": 2.93, + "learning_rate": 1.2077520831382827e-08, + "logits/chosen": -2.3190627098083496, + "logits/rejected": -2.398951530456543, + "logps/chosen": -233.6508026123047, + "logps/rejected": -414.9471130371094, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.216247797012329, + "rewards/margins": 13.317113876342773, + "rewards/rejected": -14.533361434936523, + "step": 11610 + }, + { + "epoch": 2.94, + "learning_rate": 1.160939986892613e-08, + "logits/chosen": -2.4299168586730957, + "logits/rejected": -2.365037202835083, + "logps/chosen": -373.56597900390625, + "logps/rejected": -381.29498291015625, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5779365301132202, + "rewards/margins": 10.094257354736328, + "rewards/rejected": -11.67219352722168, + "step": 11620 + }, + { + "epoch": 2.94, + "learning_rate": 1.114127890646943e-08, + "logits/chosen": -2.496657371520996, + "logits/rejected": -2.456730365753174, + "logps/chosen": -264.4086608886719, + "logps/rejected": -353.732666015625, + "loss": 0.0111, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3022593259811401, + "rewards/margins": 11.434806823730469, + "rewards/rejected": -12.737066268920898, + "step": 11630 + }, + { + "epoch": 2.94, + "learning_rate": 1.0673157944012733e-08, + "logits/chosen": -2.5562052726745605, + "logits/rejected": -2.61944580078125, + "logps/chosen": -262.9412536621094, + "logps/rejected": -453.12969970703125, + "loss": 0.0113, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9975343942642212, + "rewards/margins": 13.116531372070312, + "rewards/rejected": -15.114065170288086, + "step": 11640 + }, + { + "epoch": 2.94, + "learning_rate": 1.0205036981556033e-08, + "logits/chosen": -2.4997823238372803, + "logits/rejected": -2.3326516151428223, + "logps/chosen": -318.5556640625, + "logps/rejected": -383.9226989746094, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2242390215396881, + "rewards/margins": 12.315638542175293, + "rewards/rejected": -12.539877891540527, + "step": 11650 + }, + { + "epoch": 2.95, + "learning_rate": 9.736916019099335e-09, + "logits/chosen": -2.6021151542663574, + "logits/rejected": -2.3831443786621094, + "logps/chosen": -246.7200469970703, + "logps/rejected": -266.804931640625, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0942278727889061, + "rewards/margins": 11.007195472717285, + "rewards/rejected": -11.101422309875488, + "step": 11660 + }, + { + "epoch": 2.95, + "learning_rate": 9.268795056642636e-09, + "logits/chosen": -2.6219844818115234, + "logits/rejected": -2.498939037322998, + "logps/chosen": -310.24639892578125, + "logps/rejected": -428.74481201171875, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2587666511535645, + "rewards/margins": 11.268743515014648, + "rewards/rejected": -13.527508735656738, + "step": 11670 + }, + { + "epoch": 2.95, + "learning_rate": 8.800674094185939e-09, + "logits/chosen": -2.4541478157043457, + "logits/rejected": -2.366241931915283, + "logps/chosen": -258.74945068359375, + "logps/rejected": -385.4808044433594, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.320427417755127, + "rewards/margins": 11.502983093261719, + "rewards/rejected": -13.823410034179688, + "step": 11680 + }, + { + "epoch": 2.96, + "learning_rate": 8.332553131729238e-09, + "logits/chosen": -2.3575940132141113, + "logits/rejected": -2.297339916229248, + "logps/chosen": -266.781005859375, + "logps/rejected": -337.98150634765625, + "loss": 0.0142, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.519464135169983, + "rewards/margins": 15.5191011428833, + "rewards/rejected": -17.038564682006836, + "step": 11690 + }, + { + "epoch": 2.96, + "learning_rate": 7.864432169272539e-09, + "logits/chosen": -2.328925371170044, + "logits/rejected": -2.376373052597046, + "logps/chosen": -187.71844482421875, + "logps/rejected": -261.5347900390625, + "loss": 0.0155, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7654637098312378, + "rewards/margins": 10.228033065795898, + "rewards/rejected": -10.993496894836426, + "step": 11700 + }, + { + "epoch": 2.96, + "learning_rate": 7.396311206815841e-09, + "logits/chosen": -2.4738736152648926, + "logits/rejected": -2.4614264965057373, + "logps/chosen": -213.5368194580078, + "logps/rejected": -259.68914794921875, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2424598932266235, + "rewards/margins": 9.811772346496582, + "rewards/rejected": -11.054232597351074, + "step": 11710 + }, + { + "epoch": 2.96, + "learning_rate": 6.928190244359142e-09, + "logits/chosen": -2.4830522537231445, + "logits/rejected": -2.361623764038086, + "logps/chosen": -292.7064514160156, + "logps/rejected": -451.8169860839844, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8870089650154114, + "rewards/margins": 14.211830139160156, + "rewards/rejected": -15.098838806152344, + "step": 11720 + }, + { + "epoch": 2.97, + "learning_rate": 6.460069281902443e-09, + "logits/chosen": -2.6505773067474365, + "logits/rejected": -2.6242470741271973, + "logps/chosen": -367.317138671875, + "logps/rejected": -419.08953857421875, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5306159853935242, + "rewards/margins": 12.968234062194824, + "rewards/rejected": -12.43761920928955, + "step": 11730 + }, + { + "epoch": 2.97, + "learning_rate": 5.991948319445744e-09, + "logits/chosen": -2.4809062480926514, + "logits/rejected": -2.439175605773926, + "logps/chosen": -190.18470764160156, + "logps/rejected": -313.22698974609375, + "loss": 0.0493, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1452090740203857, + "rewards/margins": 11.021424293518066, + "rewards/rejected": -14.166631698608398, + "step": 11740 + }, + { + "epoch": 2.97, + "learning_rate": 5.523827356989046e-09, + "logits/chosen": -2.4870567321777344, + "logits/rejected": -2.391294002532959, + "logps/chosen": -321.2856750488281, + "logps/rejected": -426.3085021972656, + "loss": 0.0057, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7549960613250732, + "rewards/margins": 15.484231948852539, + "rewards/rejected": -17.239227294921875, + "step": 11750 + }, + { + "epoch": 2.97, + "learning_rate": 5.055706394532347e-09, + "logits/chosen": -2.315706491470337, + "logits/rejected": -2.325279712677002, + "logps/chosen": -227.36172485351562, + "logps/rejected": -390.9896545410156, + "loss": 0.0161, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5550304651260376, + "rewards/margins": 12.025665283203125, + "rewards/rejected": -12.580698013305664, + "step": 11760 + }, + { + "epoch": 2.98, + "learning_rate": 4.5875854320756484e-09, + "logits/chosen": -2.507659435272217, + "logits/rejected": -2.4059300422668457, + "logps/chosen": -261.05743408203125, + "logps/rejected": -381.0530700683594, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6033679842948914, + "rewards/margins": 11.23347282409668, + "rewards/rejected": -11.83684253692627, + "step": 11770 + }, + { + "epoch": 2.98, + "learning_rate": 4.119464469618949e-09, + "logits/chosen": -2.589505672454834, + "logits/rejected": -2.5626962184906006, + "logps/chosen": -358.2600402832031, + "logps/rejected": -409.32379150390625, + "loss": 0.0172, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2080825567245483, + "rewards/margins": 12.394508361816406, + "rewards/rejected": -13.602592468261719, + "step": 11780 + }, + { + "epoch": 2.98, + "learning_rate": 3.6513435071622503e-09, + "logits/chosen": -2.5097475051879883, + "logits/rejected": -2.4207422733306885, + "logps/chosen": -289.1883544921875, + "logps/rejected": -337.92327880859375, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.319321870803833, + "rewards/margins": 9.911137580871582, + "rewards/rejected": -11.23045825958252, + "step": 11790 + }, + { + "epoch": 2.98, + "learning_rate": 3.1832225447055516e-09, + "logits/chosen": -2.3756625652313232, + "logits/rejected": -2.3374183177948, + "logps/chosen": -259.49835205078125, + "logps/rejected": -432.2469787597656, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6284923553466797, + "rewards/margins": 15.795854568481445, + "rewards/rejected": -16.424346923828125, + "step": 11800 + }, + { + "epoch": 2.99, + "learning_rate": 2.715101582248853e-09, + "logits/chosen": -2.232285737991333, + "logits/rejected": -2.2487640380859375, + "logps/chosen": -264.79022216796875, + "logps/rejected": -377.00616455078125, + "loss": 0.0285, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6382107734680176, + "rewards/margins": 11.905860900878906, + "rewards/rejected": -14.54407024383545, + "step": 11810 + }, + { + "epoch": 2.99, + "learning_rate": 2.2469806197921542e-09, + "logits/chosen": -2.4739320278167725, + "logits/rejected": -2.522197723388672, + "logps/chosen": -309.99188232421875, + "logps/rejected": -447.0345153808594, + "loss": 0.0257, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.363677740097046, + "rewards/margins": 12.926587104797363, + "rewards/rejected": -14.290265083312988, + "step": 11820 + }, + { + "epoch": 2.99, + "learning_rate": 1.7788596573354553e-09, + "logits/chosen": -2.6093578338623047, + "logits/rejected": -2.5824170112609863, + "logps/chosen": -354.9128112792969, + "logps/rejected": -400.4103088378906, + "loss": 0.1116, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.216612458229065, + "rewards/margins": 10.741962432861328, + "rewards/rejected": -11.958575248718262, + "step": 11830 + }, + { + "epoch": 2.99, + "learning_rate": 1.3107386948787567e-09, + "logits/chosen": -2.597052812576294, + "logits/rejected": -2.5173323154449463, + "logps/chosen": -339.32379150390625, + "logps/rejected": -349.84088134765625, + "loss": 0.0097, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9821268320083618, + "rewards/margins": 10.569337844848633, + "rewards/rejected": -12.551465034484863, + "step": 11840 + }, + { + "epoch": 3.0, + "learning_rate": 8.426177324220578e-10, + "logits/chosen": -2.583278179168701, + "logits/rejected": -2.4176480770111084, + "logps/chosen": -250.9386444091797, + "logps/rejected": -297.49237060546875, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9199004173278809, + "rewards/margins": 10.506296157836914, + "rewards/rejected": -11.426196098327637, + "step": 11850 + }, + { + "epoch": 3.0, + "learning_rate": 3.74496769965359e-10, + "logits/chosen": -2.468799114227295, + "logits/rejected": -2.3663735389709473, + "logps/chosen": -318.8310852050781, + "logps/rejected": -367.59356689453125, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5627554655075073, + "rewards/margins": 9.739514350891113, + "rewards/rejected": -11.302268981933594, + "step": 11860 + }, + { + "epoch": 3.0, + "step": 11868, + "total_flos": 0.0, + "train_loss": 0.2383290374584692, + "train_runtime": 16014.0387, + "train_samples_per_second": 11.855, + "train_steps_per_second": 0.741 + } + ], + "logging_steps": 10, + "max_steps": 11868, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1187, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}