diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,13 +10,13 @@ "log_history": [ { "epoch": 0.0010465724751439038, - "grad_norm": 12.735772520921401, + "grad_norm": 21.102116873134612, "learning_rate": 5.208333333333333e-09, "logits/chosen": -2.924262046813965, "logits/rejected": -2.7925047874450684, "logps/chosen": -380.8447570800781, "logps/rejected": -358.51123046875, - "loss": 0.6931, + "loss": 4.6506, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -25,1581 +25,1581 @@ }, { "epoch": 0.010465724751439037, - "grad_norm": 10.939955073523928, + "grad_norm": 15.822543074567085, "learning_rate": 5.208333333333333e-08, - "logits/chosen": -2.5958893299102783, - "logits/rejected": -2.5695536136627197, - "logps/chosen": -256.6239318847656, - "logps/rejected": -234.886962890625, - "loss": 0.6932, + "logits/chosen": -2.595761299133301, + "logits/rejected": -2.569227457046509, + "logps/chosen": -256.6064453125, + "logps/rejected": -234.93408203125, + "loss": 4.5621, "rewards/accuracies": 0.5, - "rewards/chosen": 0.00025410810485482216, - "rewards/margins": 0.00034702278207987547, - "rewards/rejected": -9.291467722505331e-05, + "rewards/chosen": 0.00042897689854726195, + "rewards/margins": 0.0009927540086209774, + "rewards/rejected": -0.0005637770518660545, "step": 10 }, { "epoch": 0.020931449502878074, - "grad_norm": 12.592291847861857, + "grad_norm": 18.010820015079055, "learning_rate": 1.0416666666666667e-07, - "logits/chosen": -2.6138570308685303, - "logits/rejected": -2.5763792991638184, - "logps/chosen": -283.1133728027344, - "logps/rejected": -282.29644775390625, - "loss": 0.693, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.00030150412931106985, - "rewards/margins": -8.710605470696464e-05, - "rewards/rejected": -0.00021439809643197805, + "logits/chosen": -2.613164186477661, + "logits/rejected": -2.5756287574768066, + "logps/chosen": -283.0158996582031, + "logps/rejected": -282.265869140625, + "loss": 4.4053, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0006733193295076489, + "rewards/margins": 0.0005819452926516533, + "rewards/rejected": 9.137402230408043e-05, "step": 20 }, { "epoch": 0.03139717425431711, - "grad_norm": 12.15555855567297, + "grad_norm": 21.44807572026145, "learning_rate": 1.5624999999999999e-07, - "logits/chosen": -2.691681385040283, - "logits/rejected": -2.6676158905029297, - "logps/chosen": -270.2156982421875, - "logps/rejected": -276.73724365234375, - "loss": 0.6923, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 0.0006642484804615378, - "rewards/margins": 0.0012884512543678284, - "rewards/rejected": -0.0006242028321139514, + "logits/chosen": -2.691143035888672, + "logits/rejected": -2.6666667461395264, + "logps/chosen": -269.9042053222656, + "logps/rejected": -276.4795837402344, + "loss": 5.105, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0037794082891196012, + "rewards/margins": 0.0018267262494191527, + "rewards/rejected": 0.0019526820397004485, "step": 30 }, { "epoch": 0.04186289900575615, - "grad_norm": 11.841392011083105, + "grad_norm": 17.302023991146115, "learning_rate": 2.0833333333333333e-07, - "logits/chosen": -2.6653332710266113, - "logits/rejected": -2.590430736541748, - "logps/chosen": -290.4707946777344, - "logps/rejected": -282.1141662597656, - "loss": 0.6902, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.00528017058968544, - "rewards/margins": 0.0057797241024672985, - "rewards/rejected": -0.0004995539784431458, + "logits/chosen": -2.6577816009521484, + "logits/rejected": -2.5818943977355957, + "logps/chosen": -288.9285888671875, + "logps/rejected": -280.9770202636719, + "loss": 4.9032, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.020702064037322998, + "rewards/margins": 0.009830506518483162, + "rewards/rejected": 0.01087155845016241, "step": 40 }, { "epoch": 0.052328623757195186, - "grad_norm": 13.24083046891695, + "grad_norm": 22.46337927130885, "learning_rate": 2.604166666666667e-07, - "logits/chosen": -2.6722922325134277, - "logits/rejected": -2.588569164276123, - "logps/chosen": -266.1282958984375, - "logps/rejected": -236.51809692382812, - "loss": 0.6859, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.014677075669169426, - "rewards/margins": 0.013995910063385963, - "rewards/rejected": 0.0006811673520132899, + "logits/chosen": -2.6507585048675537, + "logits/rejected": -2.5627222061157227, + "logps/chosen": -263.1905212402344, + "logps/rejected": -234.9305419921875, + "loss": 4.8274, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.044054824858903885, + "rewards/margins": 0.02749818004667759, + "rewards/rejected": 0.016556641086935997, "step": 50 }, { "epoch": 0.06279434850863422, - "grad_norm": 11.856942211355204, + "grad_norm": 18.98737987603255, "learning_rate": 3.1249999999999997e-07, - "logits/chosen": -2.627357244491577, - "logits/rejected": -2.591691493988037, - "logps/chosen": -299.61175537109375, - "logps/rejected": -274.43731689453125, - "loss": 0.6791, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.044125042855739594, - "rewards/margins": 0.030969683080911636, - "rewards/rejected": 0.013155360706150532, + "logits/chosen": -2.5976526737213135, + "logits/rejected": -2.5587098598480225, + "logps/chosen": -299.9574890136719, + "logps/rejected": -276.1783142089844, + "loss": 4.5279, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.040667824447155, + "rewards/margins": 0.04492232948541641, + "rewards/rejected": -0.004254504106938839, "step": 60 }, { "epoch": 0.07326007326007326, - "grad_norm": 11.929671055235316, + "grad_norm": 20.501382800234886, "learning_rate": 3.645833333333333e-07, - "logits/chosen": -2.5305747985839844, - "logits/rejected": -2.523099422454834, - "logps/chosen": -257.43914794921875, - "logps/rejected": -262.67779541015625, - "loss": 0.6702, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.021997224539518356, - "rewards/margins": 0.06655967980623245, - "rewards/rejected": -0.044562458992004395, + "logits/chosen": -2.469130039215088, + "logits/rejected": -2.452857732772827, + "logps/chosen": -265.96978759765625, + "logps/rejected": -271.6788330078125, + "loss": 4.6703, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0633089542388916, + "rewards/margins": 0.07126398384571075, + "rewards/rejected": -0.13457295298576355, "step": 70 }, { "epoch": 0.0837257980115123, - "grad_norm": 13.864057015926953, + "grad_norm": 25.49997843488533, "learning_rate": 4.1666666666666667e-07, - "logits/chosen": -2.5468287467956543, - "logits/rejected": -2.474470376968384, - "logps/chosen": -274.47784423828125, - "logps/rejected": -261.062255859375, - "loss": 0.6503, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.01564904674887657, - "rewards/margins": 0.11314449459314346, - "rewards/rejected": -0.097495436668396, + "logits/chosen": -2.4551777839660645, + "logits/rejected": -2.3624327182769775, + "logps/chosen": -285.5320739746094, + "logps/rejected": -276.4596252441406, + "loss": 4.5605, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09489366412162781, + "rewards/margins": 0.15657536685466766, + "rewards/rejected": -0.2514690160751343, "step": 80 }, { "epoch": 0.09419152276295134, - "grad_norm": 15.875862157352557, + "grad_norm": 30.61647338954573, "learning_rate": 4.6874999999999996e-07, - "logits/chosen": -2.488557815551758, - "logits/rejected": -2.4535651206970215, - "logps/chosen": -261.57366943359375, - "logps/rejected": -278.4251403808594, - "loss": 0.642, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.059757404029369354, - "rewards/margins": 0.07676272094249725, - "rewards/rejected": -0.1365201324224472, + "logits/chosen": -2.3756256103515625, + "logits/rejected": -2.332918882369995, + "logps/chosen": -277.46014404296875, + "logps/rejected": -290.0049743652344, + "loss": 4.1231, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.21862252056598663, + "rewards/margins": 0.033695660531520844, + "rewards/rejected": -0.25231820344924927, "step": 90 }, { "epoch": 0.10465724751439037, - "grad_norm": 13.065702068285603, + "grad_norm": 38.124561793065574, "learning_rate": 4.999732492681437e-07, - "logits/chosen": -2.4728801250457764, - "logits/rejected": -2.390676498413086, - "logps/chosen": -304.7428894042969, - "logps/rejected": -304.18231201171875, - "loss": 0.6267, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.0491873137652874, - "rewards/margins": 0.19260282814502716, - "rewards/rejected": -0.24179014563560486, + "logits/chosen": -2.332035779953003, + "logits/rejected": -2.2253689765930176, + "logps/chosen": -314.4341125488281, + "logps/rejected": -317.18695068359375, + "loss": 4.5854, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1460995227098465, + "rewards/margins": 0.22573721408843994, + "rewards/rejected": -0.37183672189712524, "step": 100 }, { "epoch": 0.10465724751439037, - "eval_logits/chosen": -2.4988505840301514, - "eval_logits/rejected": -2.424436569213867, - "eval_logps/chosen": -280.5349426269531, - "eval_logps/rejected": -274.64080810546875, - "eval_loss": 0.6375061273574829, - "eval_rewards/accuracies": 0.7063491940498352, - "eval_rewards/chosen": 0.014330551959574223, - "eval_rewards/margins": 0.1586705446243286, - "eval_rewards/rejected": -0.14433999359607697, - "eval_runtime": 175.7842, - "eval_samples_per_second": 11.378, - "eval_steps_per_second": 0.358, + "eval_logits/chosen": -2.2812609672546387, + "eval_logits/rejected": -2.192293167114258, + "eval_logps/chosen": -309.1551818847656, + "eval_logps/rejected": -310.1242370605469, + "eval_loss": 4.381103515625, + "eval_rewards/accuracies": 0.648809552192688, + "eval_rewards/chosen": -0.2718724012374878, + "eval_rewards/margins": 0.2273014634847641, + "eval_rewards/rejected": -0.4991738498210907, + "eval_runtime": 176.2372, + "eval_samples_per_second": 11.348, + "eval_steps_per_second": 0.357, "step": 100 }, { "epoch": 0.1151229722658294, - "grad_norm": 18.15644072773089, + "grad_norm": 47.336977780094564, "learning_rate": 4.996723692767926e-07, - "logits/chosen": -2.5023550987243652, - "logits/rejected": -2.432821273803711, - "logps/chosen": -249.1396484375, - "logps/rejected": -264.470458984375, - "loss": 0.6297, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.07685011625289917, - "rewards/margins": 0.19513027369976044, - "rewards/rejected": -0.2719804048538208, + "logits/chosen": -2.0436112880706787, + "logits/rejected": -1.9534924030303955, + "logps/chosen": -310.6973571777344, + "logps/rejected": -324.1681823730469, + "loss": 3.758, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6924275755882263, + "rewards/margins": 0.17653007805347443, + "rewards/rejected": -0.8689576387405396, "step": 110 }, { "epoch": 0.12558869701726844, - "grad_norm": 22.023309150811986, + "grad_norm": 109.43376131471078, "learning_rate": 4.990375746213598e-07, - "logits/chosen": -2.3616247177124023, - "logits/rejected": -2.281648874282837, - "logps/chosen": -285.5514831542969, - "logps/rejected": -339.8910217285156, - "loss": 0.5961, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.2297312319278717, - "rewards/margins": 0.2847149968147278, - "rewards/rejected": -0.5144462585449219, + "logits/chosen": -0.08515436947345734, + "logits/rejected": 0.34949326515197754, + "logps/chosen": -343.26495361328125, + "logps/rejected": -412.98577880859375, + "loss": 4.0333, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8068662881851196, + "rewards/margins": 0.438527911901474, + "rewards/rejected": -1.2453943490982056, "step": 120 }, { "epoch": 0.1360544217687075, - "grad_norm": 19.678835809215887, + "grad_norm": 95.04671304091885, "learning_rate": 4.980697142834314e-07, - "logits/chosen": -2.283674478530884, - "logits/rejected": -2.192523956298828, - "logps/chosen": -294.853515625, - "logps/rejected": -303.66986083984375, - "loss": 0.5883, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.1358092576265335, - "rewards/margins": 0.3562476634979248, - "rewards/rejected": -0.4920569360256195, + "logits/chosen": 0.396954745054245, + "logits/rejected": 1.0232269763946533, + "logps/chosen": -406.28521728515625, + "logps/rejected": -430.10760498046875, + "loss": 4.2005, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2501262426376343, + "rewards/margins": 0.5063079595565796, + "rewards/rejected": -1.7564342021942139, "step": 130 }, { "epoch": 0.14652014652014653, - "grad_norm": 18.045058125702276, + "grad_norm": 144.39035434160894, "learning_rate": 4.967700826904229e-07, - "logits/chosen": -2.1272222995758057, - "logits/rejected": -2.050865888595581, - "logps/chosen": -321.29254150390625, - "logps/rejected": -360.5655517578125, - "loss": 0.5733, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.3020511567592621, - "rewards/margins": 0.41385501623153687, - "rewards/rejected": -0.7159062027931213, + "logits/chosen": -0.1560676395893097, + "logits/rejected": 0.6105406880378723, + "logps/chosen": -416.2538146972656, + "logps/rejected": -463.2472229003906, + "loss": 3.7876, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2516638040542603, + "rewards/margins": 0.49105915427207947, + "rewards/rejected": -1.7427231073379517, "step": 140 }, { "epoch": 0.15698587127158556, - "grad_norm": 23.733353319304864, + "grad_norm": 125.21681673589694, "learning_rate": 4.951404179843962e-07, - "logits/chosen": -1.8601700067520142, - "logits/rejected": -1.6911884546279907, - "logps/chosen": -328.0176696777344, - "logps/rejected": -331.93365478515625, - "loss": 0.5724, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.40121254324913025, - "rewards/margins": 0.44814401865005493, - "rewards/rejected": -0.849356472492218, + "logits/chosen": 2.0407581329345703, + "logits/rejected": 2.8481547832489014, + "logps/chosen": -510.521484375, + "logps/rejected": -534.6341552734375, + "loss": 3.898, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.226250648498535, + "rewards/margins": 0.6501102447509766, + "rewards/rejected": -2.876361131668091, "step": 150 }, { "epoch": 0.1674515960230246, - "grad_norm": 28.013542721553446, + "grad_norm": 66.88313091855639, "learning_rate": 4.931828996974498e-07, - "logits/chosen": -0.7096320390701294, - "logits/rejected": -0.25428909063339233, - "logps/chosen": -341.81439208984375, - "logps/rejected": -363.79217529296875, - "loss": 0.5414, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.46001678705215454, - "rewards/margins": 0.6181891560554504, - "rewards/rejected": -1.0782058238983154, + "logits/chosen": 2.163175106048584, + "logits/rejected": 3.5420451164245605, + "logps/chosen": -585.4688720703125, + "logps/rejected": -635.2697143554688, + "loss": 3.9393, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.89656138420105, + "rewards/margins": 0.8964195251464844, + "rewards/rejected": -3.792980909347534, "step": 160 }, { "epoch": 0.17791732077446362, - "grad_norm": 23.342052779259543, + "grad_norm": 188.98325062900707, "learning_rate": 4.909001458367866e-07, - "logits/chosen": -0.7341230511665344, - "logits/rejected": -0.3530420660972595, - "logps/chosen": -352.09222412109375, - "logps/rejected": -376.5685119628906, - "loss": 0.5581, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.7362300753593445, - "rewards/margins": 0.6209839582443237, - "rewards/rejected": -1.3572142124176025, + "logits/chosen": 0.49319368600845337, + "logits/rejected": 1.3766599893569946, + "logps/chosen": -599.5331420898438, + "logps/rejected": -654.1383056640625, + "loss": 3.9922, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.210639476776123, + "rewards/margins": 0.922272801399231, + "rewards/rejected": -4.132911682128906, "step": 170 }, { "epoch": 0.18838304552590268, - "grad_norm": 28.21709060740068, + "grad_norm": 320.6202106283321, "learning_rate": 4.882952093833627e-07, - "logits/chosen": -0.876774787902832, - "logits/rejected": -0.6448796391487122, - "logps/chosen": -335.0474548339844, - "logps/rejected": -404.52728271484375, - "loss": 0.5274, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9509947896003723, - "rewards/margins": 0.6264718174934387, - "rewards/rejected": -1.5774667263031006, + "logits/chosen": 0.6820823550224304, + "logits/rejected": 1.588409185409546, + "logps/chosen": -1040.5491943359375, + "logps/rejected": -1233.1207275390625, + "loss": 3.36, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.006011009216309, + "rewards/margins": 1.8573882579803467, + "rewards/rejected": -9.86340045928955, "step": 180 }, { "epoch": 0.1988487702773417, - "grad_norm": 40.43719634141344, + "grad_norm": 157.79546381015746, "learning_rate": 4.853715742087946e-07, - "logits/chosen": -0.12615634500980377, - "logits/rejected": 0.2832840085029602, - "logps/chosen": -344.6008605957031, - "logps/rejected": -427.69354248046875, - "loss": 0.5309, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.0862140655517578, - "rewards/margins": 0.6919594407081604, - "rewards/rejected": -1.7781736850738525, + "logits/chosen": 3.3087031841278076, + "logits/rejected": 4.11985445022583, + "logps/chosen": -1690.8167724609375, + "logps/rejected": -1890.6634521484375, + "loss": 2.6799, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -14.548372268676758, + "rewards/margins": 1.8595011234283447, + "rewards/rejected": -16.407875061035156, "step": 190 }, { "epoch": 0.20931449502878074, - "grad_norm": 24.48228039656553, + "grad_norm": 178.97245767319544, "learning_rate": 4.821331504159906e-07, - "logits/chosen": -1.134807825088501, - "logits/rejected": -0.7409021258354187, - "logps/chosen": -399.9931640625, - "logps/rejected": -406.24249267578125, - "loss": 0.578, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.893362820148468, - "rewards/margins": 0.5979009866714478, - "rewards/rejected": -1.49126398563385, + "logits/chosen": 0.3337511122226715, + "logits/rejected": 1.9961885213851929, + "logps/chosen": -1578.712158203125, + "logps/rejected": -1801.65625, + "loss": 2.6464, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -12.680551528930664, + "rewards/margins": 2.764849901199341, + "rewards/rejected": -15.445402145385742, "step": 200 }, { "epoch": 0.20931449502878074, - "eval_logits/chosen": -1.7000207901000977, - "eval_logits/rejected": -1.3956148624420166, - "eval_logps/chosen": -357.6234130859375, - "eval_logps/rejected": -397.53143310546875, - "eval_loss": 0.5543879270553589, - "eval_rewards/accuracies": 0.7480158805847168, - "eval_rewards/chosen": -0.7565548419952393, - "eval_rewards/margins": 0.616691529750824, - "eval_rewards/rejected": -1.3732463121414185, - "eval_runtime": 175.6227, - "eval_samples_per_second": 11.388, - "eval_steps_per_second": 0.359, + "eval_logits/chosen": -0.35622134804725647, + "eval_logits/rejected": 0.6981890797615051, + "eval_logps/chosen": -1244.43603515625, + "eval_logps/rejected": -1423.3580322265625, + "eval_loss": 2.606262683868408, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": -9.624680519104004, + "eval_rewards/margins": 2.0068302154541016, + "eval_rewards/rejected": -11.631510734558105, + "eval_runtime": 177.3795, + "eval_samples_per_second": 11.275, + "eval_steps_per_second": 0.355, "step": 200 }, { "epoch": 0.21978021978021978, - "grad_norm": 33.0816384461788, + "grad_norm": 221.39959720400535, "learning_rate": 4.785842691097342e-07, - "logits/chosen": -1.2056543827056885, - "logits/rejected": -0.7488024830818176, - "logps/chosen": -379.0580139160156, - "logps/rejected": -430.2911682128906, - "loss": 0.5639, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.9624747037887573, - "rewards/margins": 0.6178582906723022, - "rewards/rejected": -1.5803329944610596, + "logits/chosen": 0.43124809861183167, + "logits/rejected": 1.6196168661117554, + "logps/chosen": -1394.329345703125, + "logps/rejected": -1612.8701171875, + "loss": 2.2192, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -11.115188598632812, + "rewards/margins": 2.29093337059021, + "rewards/rejected": -13.406122207641602, "step": 210 }, { "epoch": 0.2302459445316588, - "grad_norm": 33.16914134802434, + "grad_norm": 107.97254065213261, "learning_rate": 4.7472967660421603e-07, - "logits/chosen": -0.9634332656860352, - "logits/rejected": -0.056523989886045456, - "logps/chosen": -388.16888427734375, - "logps/rejected": -407.8970642089844, - "loss": 0.5222, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.0096079111099243, - "rewards/margins": 0.7075966000556946, - "rewards/rejected": -1.7172044515609741, + "logits/chosen": 0.5400440096855164, + "logits/rejected": 1.9760030508041382, + "logps/chosen": -1507.001220703125, + "logps/rejected": -1713.616455078125, + "loss": 2.018, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -12.197932243347168, + "rewards/margins": 2.5764663219451904, + "rewards/rejected": -14.774396896362305, "step": 220 }, { "epoch": 0.24071166928309787, - "grad_norm": 35.571946638008065, + "grad_norm": 217.88193736039008, "learning_rate": 4.705745280752585e-07, - "logits/chosen": -0.3167392611503601, - "logits/rejected": 0.18852970004081726, - "logps/chosen": -397.49847412109375, - "logps/rejected": -440.3253479003906, - "loss": 0.5468, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.0877156257629395, - "rewards/margins": 0.6301809549331665, - "rewards/rejected": -1.7178964614868164, + "logits/chosen": 1.4225207567214966, + "logits/rejected": 2.4756038188934326, + "logps/chosen": -1726.320068359375, + "logps/rejected": -2005.7041015625, + "loss": 1.9719, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -14.375930786132812, + "rewards/margins": 2.995753526687622, + "rewards/rejected": -17.37168312072754, "step": 230 }, { "epoch": 0.25117739403453687, - "grad_norm": 32.242793295380665, + "grad_norm": 109.77258728949327, "learning_rate": 4.6612438066572555e-07, - "logits/chosen": -0.004159653093665838, - "logits/rejected": 0.8073694109916687, - "logps/chosen": -364.86566162109375, - "logps/rejected": -433.24072265625, - "loss": 0.5162, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.9729146957397461, - "rewards/margins": 0.8177644610404968, - "rewards/rejected": -1.7906792163848877, + "logits/chosen": 2.2113587856292725, + "logits/rejected": 3.125591993331909, + "logps/chosen": -1894.770751953125, + "logps/rejected": -2110.86376953125, + "loss": 1.9847, + "rewards/accuracies": 0.59375, + "rewards/chosen": -16.27196502685547, + "rewards/margins": 2.294943332672119, + "rewards/rejected": -18.56690788269043, "step": 240 }, { "epoch": 0.2616431187859759, - "grad_norm": 29.7046569499327, + "grad_norm": 276.53415343052893, "learning_rate": 4.6138518605333664e-07, - "logits/chosen": -0.3281826674938202, - "logits/rejected": 0.29218000173568726, - "logps/chosen": -366.93865966796875, - "logps/rejected": -417.32666015625, - "loss": 0.5058, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.8915438652038574, - "rewards/margins": 0.7059476375579834, - "rewards/rejected": -1.5974915027618408, + "logits/chosen": 1.203977108001709, + "logits/rejected": 1.9225616455078125, + "logps/chosen": -1561.0047607421875, + "logps/rejected": -1763.075439453125, + "loss": 2.257, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -12.832204818725586, + "rewards/margins": 2.222775936126709, + "rewards/rejected": -15.05497932434082, "step": 250 }, { "epoch": 0.272108843537415, - "grad_norm": 26.07189300308382, + "grad_norm": 159.14963627253198, "learning_rate": 4.5636328249082514e-07, - "logits/chosen": -0.7655261158943176, - "logits/rejected": 0.05788875371217728, - "logps/chosen": -377.3932800292969, - "logps/rejected": -431.24603271484375, - "loss": 0.5333, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.8842679262161255, - "rewards/margins": 0.6056793928146362, - "rewards/rejected": -1.4899474382400513, + "logits/chosen": 1.134037733078003, + "logits/rejected": 2.1568219661712646, + "logps/chosen": -1608.8623046875, + "logps/rejected": -1763.599853515625, + "loss": 2.2606, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -13.198956489562988, + "rewards/margins": 1.6145280599594116, + "rewards/rejected": -14.813486099243164, "step": 260 }, { "epoch": 0.282574568288854, - "grad_norm": 27.48585519105196, + "grad_norm": 199.45417630865836, "learning_rate": 4.510653863290871e-07, - "logits/chosen": -0.8150612711906433, - "logits/rejected": -0.19155649840831757, - "logps/chosen": -376.1461486816406, - "logps/rejected": -399.7369689941406, - "loss": 0.5074, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.7649620175361633, - "rewards/margins": 0.6968373656272888, - "rewards/rejected": -1.4617992639541626, + "logits/chosen": 0.3547247350215912, + "logits/rejected": 1.2751286029815674, + "logps/chosen": -1781.0726318359375, + "logps/rejected": -2089.05615234375, + "loss": 1.7211, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -14.814226150512695, + "rewards/margins": 3.540767192840576, + "rewards/rejected": -18.354991912841797, "step": 270 }, { "epoch": 0.29304029304029305, - "grad_norm": 28.38677400170819, + "grad_norm": 162.5497817330968, "learning_rate": 4.4549858303465737e-07, - "logits/chosen": 0.3504285514354706, - "logits/rejected": 0.9512192606925964, - "logps/chosen": -415.9444274902344, - "logps/rejected": -479.0673828125, - "loss": 0.5416, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1970031261444092, - "rewards/margins": 0.7609006762504578, - "rewards/rejected": -1.9579038619995117, + "logits/chosen": 0.21130748093128204, + "logits/rejected": 1.2269564867019653, + "logps/chosen": -1743.0787353515625, + "logps/rejected": -2033.669921875, + "loss": 1.9445, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -14.468345642089844, + "rewards/margins": 3.0355849266052246, + "rewards/rejected": -17.50392723083496, "step": 280 }, { "epoch": 0.3035060177917321, - "grad_norm": 32.744553637286586, + "grad_norm": 307.15808847538113, "learning_rate": 4.396703177135261e-07, - "logits/chosen": -0.09853874146938324, - "logits/rejected": 0.8018854260444641, - "logps/chosen": -399.70263671875, - "logps/rejected": -441.5960388183594, - "loss": 0.5186, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.269182801246643, - "rewards/margins": 0.7766303420066833, - "rewards/rejected": -2.0458133220672607, + "logits/chosen": 0.7419403791427612, + "logits/rejected": 1.9202260971069336, + "logps/chosen": -1948.1787109375, + "logps/rejected": -2273.5205078125, + "loss": 1.9864, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -16.753948211669922, + "rewards/margins": 3.6111111640930176, + "rewards/rejected": -20.36505699157715, "step": 290 }, { "epoch": 0.3139717425431711, - "grad_norm": 36.30101198384572, + "grad_norm": 90.00202577382801, "learning_rate": 4.335883851539693e-07, - "logits/chosen": -1.0064457654953003, - "logits/rejected": -0.4348925054073334, - "logps/chosen": -373.8060302734375, - "logps/rejected": -438.51513671875, - "loss": 0.5301, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.0478389263153076, - "rewards/margins": 0.6600475907325745, - "rewards/rejected": -1.7078864574432373, + "logits/chosen": 0.30849236249923706, + "logits/rejected": 1.1072229146957397, + "logps/chosen": -1431.3275146484375, + "logps/rejected": -1653.4029541015625, + "loss": 1.9069, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -11.623054504394531, + "rewards/margins": 2.2337088584899902, + "rewards/rejected": -13.856762886047363, "step": 300 }, { "epoch": 0.3139717425431711, - "eval_logits/chosen": -0.7774365544319153, - "eval_logits/rejected": -0.07988195866346359, - "eval_logps/chosen": -377.41851806640625, - "eval_logps/rejected": -439.006103515625, - "eval_loss": 0.519775927066803, - "eval_rewards/accuracies": 0.7757936716079712, - "eval_rewards/chosen": -0.9545055627822876, - "eval_rewards/margins": 0.8334872126579285, - "eval_rewards/rejected": -1.7879927158355713, - "eval_runtime": 175.6202, - "eval_samples_per_second": 11.388, - "eval_steps_per_second": 0.359, + "eval_logits/chosen": 0.45899611711502075, + "eval_logits/rejected": 1.5569082498550415, + "eval_logps/chosen": -1266.6490478515625, + "eval_logps/rejected": -1452.7674560546875, + "eval_loss": 2.262396812438965, + "eval_rewards/accuracies": 0.6329365372657776, + "eval_rewards/chosen": -9.846811294555664, + "eval_rewards/margins": 2.0787949562072754, + "eval_rewards/rejected": -11.925606727600098, + "eval_runtime": 176.5188, + "eval_samples_per_second": 11.33, + "eval_steps_per_second": 0.357, "step": 300 }, { "epoch": 0.32443746729461015, - "grad_norm": 34.23825403742458, + "grad_norm": 177.8700388917398, "learning_rate": 4.272609194017105e-07, - "logits/chosen": -0.798404335975647, - "logits/rejected": 0.40799885988235474, - "logps/chosen": -376.5071716308594, - "logps/rejected": -409.09600830078125, - "loss": 0.5113, + "logits/chosen": 0.647371768951416, + "logits/rejected": 2.9104599952697754, + "logps/chosen": -1395.496826171875, + "logps/rejected": -1711.9573974609375, + "loss": 2.3095, "rewards/accuracies": 0.75, - "rewards/chosen": -0.9274541735649109, - "rewards/margins": 0.8360295295715332, - "rewards/rejected": -1.7634836435317993, + "rewards/chosen": -11.117349624633789, + "rewards/margins": 3.674748182296753, + "rewards/rejected": -14.792098999023438, "step": 310 }, { "epoch": 0.3349031920460492, - "grad_norm": 39.68212351207402, + "grad_norm": 180.92515200199898, "learning_rate": 4.2069638288135547e-07, - "logits/chosen": 0.02141362428665161, - "logits/rejected": 0.8004009127616882, - "logps/chosen": -371.86358642578125, - "logps/rejected": -460.26971435546875, - "loss": 0.5256, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1884684562683105, - "rewards/margins": 0.9082374572753906, - "rewards/rejected": -2.096705913543701, + "logits/chosen": 0.9543863534927368, + "logits/rejected": 1.7447538375854492, + "logps/chosen": -1926.299560546875, + "logps/rejected": -2217.88037109375, + "loss": 2.1724, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -16.73282814025879, + "rewards/margins": 2.939984083175659, + "rewards/rejected": -19.672813415527344, "step": 320 }, { "epoch": 0.3453689167974882, - "grad_norm": 37.95982237233792, + "grad_norm": 145.6894284610869, "learning_rate": 4.139035550786494e-07, - "logits/chosen": 0.24720951914787292, - "logits/rejected": 0.8929317593574524, - "logps/chosen": -380.25689697265625, - "logps/rejected": -435.5292053222656, - "loss": 0.5049, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2391283512115479, - "rewards/margins": 0.795093297958374, - "rewards/rejected": -2.034221649169922, + "logits/chosen": -0.039321091026067734, + "logits/rejected": 0.5018073320388794, + "logps/chosen": -1734.091796875, + "logps/rejected": -1908.339111328125, + "loss": 1.716, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -14.77747917175293, + "rewards/margins": 1.9848415851593018, + "rewards/rejected": -16.76232147216797, "step": 330 }, { "epoch": 0.35583464154892724, - "grad_norm": 31.44829466131635, + "grad_norm": 183.78050890033984, "learning_rate": 4.0689152079869306e-07, - "logits/chosen": 0.19869890809059143, - "logits/rejected": 0.7656279802322388, - "logps/chosen": -384.75897216796875, - "logps/rejected": -426.62274169921875, - "loss": 0.5341, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.1527179479599, - "rewards/margins": 0.7536069750785828, - "rewards/rejected": -1.9063247442245483, + "logits/chosen": -0.5724295377731323, + "logits/rejected": 0.023262571543455124, + "logps/chosen": -1660.732177734375, + "logps/rejected": -1876.1025390625, + "loss": 1.8439, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -13.912447929382324, + "rewards/margins": 2.488671064376831, + "rewards/rejected": -16.401119232177734, "step": 340 }, { "epoch": 0.3663003663003663, - "grad_norm": 33.60100508993655, + "grad_norm": 149.28700648360655, "learning_rate": 3.99669658015821e-07, - "logits/chosen": -0.23042722046375275, - "logits/rejected": 0.44881439208984375, - "logps/chosen": -416.906982421875, - "logps/rejected": -460.3355407714844, - "loss": 0.502, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2567191123962402, - "rewards/margins": 0.7857658863067627, - "rewards/rejected": -2.042484998703003, + "logits/chosen": 0.006322336383163929, + "logits/rejected": 0.6332755088806152, + "logps/chosen": -1966.5765380859375, + "logps/rejected": -2201.843505859375, + "loss": 1.6671, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.753414154052734, + "rewards/margins": 2.7041499614715576, + "rewards/rejected": -19.457565307617188, "step": 350 }, { "epoch": 0.37676609105180536, - "grad_norm": 35.86790303415799, + "grad_norm": 237.65668361495474, "learning_rate": 3.92247625331392e-07, - "logits/chosen": -0.2522183060646057, - "logits/rejected": 0.6683695316314697, - "logps/chosen": -416.01641845703125, - "logps/rejected": -479.52294921875, - "loss": 0.4805, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.3013213872909546, - "rewards/margins": 0.8662745356559753, - "rewards/rejected": -2.1675961017608643, + "logits/chosen": -0.21500203013420105, + "logits/rejected": 0.6255682110786438, + "logps/chosen": -1989.7509765625, + "logps/rejected": -2207.83349609375, + "loss": 1.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": -17.038667678833008, + "rewards/margins": 2.4120330810546875, + "rewards/rejected": -19.450698852539062, "step": 360 }, { "epoch": 0.3872318158032444, - "grad_norm": 34.95835364160841, + "grad_norm": 152.55773033990448, "learning_rate": 3.846353490562664e-07, - "logits/chosen": 0.0829664096236229, - "logits/rejected": 0.9096466898918152, - "logps/chosen": -406.54913330078125, - "logps/rejected": -456.1133728027344, - "loss": 0.5136, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.4327296018600464, - "rewards/margins": 0.7719124555587769, - "rewards/rejected": -2.2046420574188232, + "logits/chosen": -0.39199286699295044, + "logits/rejected": -0.043508779257535934, + "logps/chosen": -1889.5286865234375, + "logps/rejected": -2139.589111328125, + "loss": 1.7098, + "rewards/accuracies": 0.59375, + "rewards/chosen": -16.262523651123047, + "rewards/margins": 2.7768733501434326, + "rewards/rejected": -19.039398193359375, "step": 370 }, { "epoch": 0.3976975405546834, - "grad_norm": 33.96402149473364, + "grad_norm": 239.86422108427834, "learning_rate": 3.768430099352445e-07, - "logits/chosen": -0.14455661177635193, - "logits/rejected": 0.8097721934318542, - "logps/chosen": -388.0363464355469, - "logps/rejected": -460.75048828125, - "loss": 0.5138, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.333893060684204, - "rewards/margins": 0.8102533221244812, - "rewards/rejected": -2.14414644241333, + "logits/chosen": -0.5338395833969116, + "logits/rejected": -0.10323655605316162, + "logps/chosen": -1830.7080078125, + "logps/rejected": -2104.773681640625, + "loss": 1.786, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -15.76060962677002, + "rewards/margins": 2.8237688541412354, + "rewards/rejected": -18.58437728881836, "step": 380 }, { "epoch": 0.40816326530612246, - "grad_norm": 32.427509955053814, + "grad_norm": 137.89263121746114, "learning_rate": 3.6888102953122304e-07, - "logits/chosen": 0.7358514070510864, - "logits/rejected": 1.5359573364257812, - "logps/chosen": -425.21014404296875, - "logps/rejected": -492.5250549316406, - "loss": 0.5231, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.4359853267669678, - "rewards/margins": 0.8055831789970398, - "rewards/rejected": -2.2415683269500732, + "logits/chosen": -0.3421451449394226, + "logits/rejected": 0.2877078056335449, + "logps/chosen": -1774.384765625, + "logps/rejected": -2007.7366943359375, + "loss": 1.9274, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -14.92773151397705, + "rewards/margins": 2.465951442718506, + "rewards/rejected": -17.393680572509766, "step": 390 }, { "epoch": 0.4186289900575615, - "grad_norm": 36.13318678339082, + "grad_norm": 164.86784545063486, "learning_rate": 3.607600562872785e-07, - "logits/chosen": 0.8064459562301636, - "logits/rejected": 1.430558681488037, - "logps/chosen": -414.16839599609375, - "logps/rejected": -482.3439025878906, - "loss": 0.5386, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.4417721033096313, - "rewards/margins": 0.7046888470649719, - "rewards/rejected": -2.146461009979248, + "logits/chosen": -0.7335325479507446, + "logits/rejected": -0.33919858932495117, + "logps/chosen": -1733.375244140625, + "logps/rejected": -1963.0279541015625, + "loss": 1.6642, + "rewards/accuracies": 0.5625, + "rewards/chosen": -14.63383960723877, + "rewards/margins": 2.319460391998291, + "rewards/rejected": -16.95330047607422, "step": 400 }, { "epoch": 0.4186289900575615, - "eval_logits/chosen": 0.024975202977657318, - "eval_logits/rejected": 0.9406751394271851, - "eval_logps/chosen": -412.3045654296875, - "eval_logps/rejected": -471.00274658203125, - "eval_loss": 0.510511040687561, - "eval_rewards/accuracies": 0.77182537317276, - "eval_rewards/chosen": -1.3033660650253296, - "eval_rewards/margins": 0.8045932054519653, - "eval_rewards/rejected": -2.107959032058716, - "eval_runtime": 177.0547, - "eval_samples_per_second": 11.296, - "eval_steps_per_second": 0.356, + "eval_logits/chosen": -0.7751028537750244, + "eval_logits/rejected": -0.08748837560415268, + "eval_logps/chosen": -1731.152587890625, + "eval_logps/rejected": -2045.1492919921875, + "eval_loss": 1.6421091556549072, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": -14.491846084594727, + "eval_rewards/margins": 3.3575782775878906, + "eval_rewards/rejected": -17.849422454833984, + "eval_runtime": 176.0651, + "eval_samples_per_second": 11.359, + "eval_steps_per_second": 0.358, "step": 400 }, { "epoch": 0.4290947148090005, - "grad_norm": 26.612881775481124, + "grad_norm": 128.91689311765836, "learning_rate": 3.5249095128531856e-07, - "logits/chosen": 0.2300519496202469, - "logits/rejected": 0.9924400448799133, - "logps/chosen": -398.78729248046875, - "logps/rejected": -436.8857421875, - "loss": 0.5301, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.270517110824585, - "rewards/margins": 0.6298418045043945, - "rewards/rejected": -1.9003589153289795, + "logits/chosen": -0.10633065551519394, + "logits/rejected": 0.350477933883667, + "logps/chosen": -1862.1099853515625, + "logps/rejected": -2067.15673828125, + "loss": 1.7556, + "rewards/accuracies": 0.59375, + "rewards/chosen": -15.903741836547852, + "rewards/margins": 2.2993245124816895, + "rewards/rejected": -18.203065872192383, "step": 410 }, { "epoch": 0.43956043956043955, - "grad_norm": 39.714482709669014, + "grad_norm": 187.2282869549343, "learning_rate": 3.4408477372034736e-07, - "logits/chosen": -0.2808998227119446, - "logits/rejected": 0.9214981198310852, - "logps/chosen": -379.78924560546875, - "logps/rejected": -428.7146911621094, - "loss": 0.5265, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.9761778116226196, - "rewards/margins": 0.7907910346984863, - "rewards/rejected": -1.7669687271118164, + "logits/chosen": -0.2209610939025879, + "logits/rejected": 0.7663095593452454, + "logps/chosen": -1825.959228515625, + "logps/rejected": -2182.580810546875, + "loss": 1.8542, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -15.437875747680664, + "rewards/margins": 3.867755174636841, + "rewards/rejected": -19.30562973022461, "step": 420 }, { "epoch": 0.4500261643118786, - "grad_norm": 36.7812247576203, + "grad_norm": 150.13979068919696, "learning_rate": 3.3555276610977276e-07, - "logits/chosen": 1.0532963275909424, - "logits/rejected": 1.682213544845581, - "logps/chosen": -372.61004638671875, - "logps/rejected": -455.31524658203125, - "loss": 0.507, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.18856680393219, - "rewards/margins": 0.8627974390983582, - "rewards/rejected": -2.0513641834259033, + "logits/chosen": -1.128701090812683, + "logits/rejected": -0.5558885335922241, + "logps/chosen": -1832.6103515625, + "logps/rejected": -2176.197265625, + "loss": 1.5079, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -15.788568496704102, + "rewards/margins": 3.47161602973938, + "rewards/rejected": -19.26018714904785, "step": 430 }, { "epoch": 0.4604918890633176, - "grad_norm": 37.84672118674808, + "grad_norm": 163.41066719667168, "learning_rate": 3.269063392575352e-07, - "logits/chosen": 2.015638828277588, - "logits/rejected": 2.8407273292541504, - "logps/chosen": -472.79931640625, - "logps/rejected": -504.94561767578125, - "loss": 0.5227, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.7762467861175537, - "rewards/margins": 0.7188802361488342, - "rewards/rejected": -2.4951272010803223, + "logits/chosen": -0.6949409246444702, + "logits/rejected": -0.05746125057339668, + "logps/chosen": -1597.5341796875, + "logps/rejected": -1821.0198974609375, + "loss": 1.4868, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -13.023595809936523, + "rewards/margins": 2.632272481918335, + "rewards/rejected": -15.655868530273438, "step": 440 }, { "epoch": 0.47095761381475665, - "grad_norm": 30.609934320862045, + "grad_norm": 133.46596474594617, "learning_rate": 3.1815705699316964e-07, - "logits/chosen": 1.3560642004013062, - "logits/rejected": 2.127135753631592, - "logps/chosen": -465.49188232421875, - "logps/rejected": -571.1328125, - "loss": 0.5099, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.8310394287109375, - "rewards/margins": 1.0971113443374634, - "rewards/rejected": -2.9281508922576904, + "logits/chosen": -0.4808398187160492, + "logits/rejected": 0.3264926075935364, + "logps/chosen": -1599.6370849609375, + "logps/rejected": -1936.6884765625, + "loss": 1.5413, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.172491073608398, + "rewards/margins": 3.4112179279327393, + "rewards/rejected": -16.583707809448242, "step": 450 }, { "epoch": 0.48142333856619574, - "grad_norm": 28.715258464621037, + "grad_norm": 155.84007478164062, "learning_rate": 3.0931662070620794e-07, - "logits/chosen": 0.8564838171005249, - "logits/rejected": 1.5636537075042725, - "logps/chosen": -452.06951904296875, - "logps/rejected": -517.620361328125, - "loss": 0.5258, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.9461126327514648, - "rewards/margins": 0.7798872590065002, - "rewards/rejected": -2.725999593734741, + "logits/chosen": -0.719369113445282, + "logits/rejected": -0.06152462959289551, + "logps/chosen": -1643.2447509765625, + "logps/rejected": -1872.9976806640625, + "loss": 1.7906, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -13.85786247253418, + "rewards/margins": 2.4219117164611816, + "rewards/rejected": -16.279773712158203, "step": 460 }, { "epoch": 0.49188906331763477, - "grad_norm": 43.25596740234111, + "grad_norm": 203.3322056694353, "learning_rate": 3.003968536966078e-07, - "logits/chosen": 0.4564128816127777, - "logits/rejected": 1.1015666723251343, - "logps/chosen": -456.0911560058594, - "logps/rejected": -495.827880859375, - "loss": 0.4802, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.7230240106582642, - "rewards/margins": 0.7358741164207458, - "rewards/rejected": -2.458897829055786, + "logits/chosen": -0.4609583020210266, + "logits/rejected": -0.09374441206455231, + "logps/chosen": -1654.1614990234375, + "logps/rejected": -1845.5618896484375, + "loss": 1.7718, + "rewards/accuracies": 0.59375, + "rewards/chosen": -13.703729629516602, + "rewards/margins": 2.2525086402893066, + "rewards/rejected": -15.956239700317383, "step": 470 }, { "epoch": 0.5023547880690737, - "grad_norm": 36.20202372184976, + "grad_norm": 156.4799546194198, "learning_rate": 2.9140968536213693e-07, - "logits/chosen": 0.7777734994888306, - "logits/rejected": 1.9267253875732422, - "logps/chosen": -460.71783447265625, - "logps/rejected": -554.2379150390625, - "loss": 0.5012, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.935050368309021, - "rewards/margins": 1.0943626165390015, - "rewards/rejected": -3.0294129848480225, + "logits/chosen": -0.2353781908750534, + "logits/rejected": 0.5946909785270691, + "logps/chosen": -1859.3265380859375, + "logps/rejected": -2325.88134765625, + "loss": 1.3829, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -15.921140670776367, + "rewards/margins": 4.824706077575684, + "rewards/rejected": -20.745845794677734, "step": 480 }, { "epoch": 0.5128205128205128, - "grad_norm": 32.834417417864124, + "grad_norm": 160.19325879757844, "learning_rate": 2.823671352438608e-07, - "logits/chosen": 0.05315951257944107, - "logits/rejected": 2.070735454559326, - "logps/chosen": -481.5361328125, - "logps/rejected": -537.8575439453125, - "loss": 0.4958, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.6960468292236328, - "rewards/margins": 1.1183526515960693, - "rewards/rejected": -2.814399242401123, + "logits/chosen": -0.9654836654663086, + "logits/rejected": -0.002035105135291815, + "logps/chosen": -1637.873291015625, + "logps/rejected": -2143.010986328125, + "loss": 1.6206, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -13.259417533874512, + "rewards/margins": 5.606515407562256, + "rewards/rejected": -18.86593246459961, "step": 490 }, { "epoch": 0.5232862375719518, - "grad_norm": 37.03566337015388, + "grad_norm": 221.83952267135834, "learning_rate": 2.73281296951072e-07, - "logits/chosen": 0.2395954132080078, - "logits/rejected": 1.218072772026062, - "logps/chosen": -431.42626953125, - "logps/rejected": -487.06756591796875, - "loss": 0.4996, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.7278209924697876, - "rewards/margins": 0.868577778339386, - "rewards/rejected": -2.5963988304138184, + "logits/chosen": -0.6597784161567688, + "logits/rejected": -0.14649493992328644, + "logps/chosen": -1530.5738525390625, + "logps/rejected": -1781.8070068359375, + "loss": 1.6328, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -12.7192964553833, + "rewards/margins": 2.8244967460632324, + "rewards/rejected": -15.543792724609375, "step": 500 }, { "epoch": 0.5232862375719518, - "eval_logits/chosen": 0.005896765738725662, - "eval_logits/rejected": 1.1533339023590088, - "eval_logps/chosen": -460.39996337890625, - "eval_logps/rejected": -534.9353637695312, - "eval_loss": 0.49528467655181885, - "eval_rewards/accuracies": 0.7678571343421936, - "eval_rewards/chosen": -1.7843199968338013, - "eval_rewards/margins": 0.9629656076431274, - "eval_rewards/rejected": -2.7472856044769287, - "eval_runtime": 175.591, - "eval_samples_per_second": 11.39, - "eval_steps_per_second": 0.359, + "eval_logits/chosen": -0.6590258479118347, + "eval_logits/rejected": -0.091790109872818, + "eval_logps/chosen": -1589.3370361328125, + "eval_logps/rejected": -1890.562255859375, + "eval_loss": 1.5119922161102295, + "eval_rewards/accuracies": 0.6388888955116272, + "eval_rewards/chosen": -13.073691368103027, + "eval_rewards/margins": 3.229863166809082, + "eval_rewards/rejected": -16.303556442260742, + "eval_runtime": 177.8158, + "eval_samples_per_second": 11.248, + "eval_steps_per_second": 0.354, "step": 500 }, { "epoch": 0.533751962323391, - "grad_norm": 31.83813721910833, + "grad_norm": 187.4336485549293, "learning_rate": 2.641643219871597e-07, - "logits/chosen": -0.09106893837451935, - "logits/rejected": 1.0047601461410522, - "logps/chosen": -448.24365234375, - "logps/rejected": -547.2485961914062, - "loss": 0.4759, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.6430542469024658, - "rewards/margins": 1.1913459300994873, - "rewards/rejected": -2.834400177001953, + "logits/chosen": -0.5598984360694885, + "logits/rejected": -0.2727218270301819, + "logps/chosen": -1694.568359375, + "logps/rejected": -2086.98193359375, + "loss": 1.4069, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -14.106300354003906, + "rewards/margins": 4.125433921813965, + "rewards/rejected": -18.231733322143555, "step": 510 }, { "epoch": 0.54421768707483, - "grad_norm": 31.094283760600373, + "grad_norm": 180.24950333654212, "learning_rate": 2.550284034980507e-07, - "logits/chosen": 0.5583597421646118, - "logits/rejected": 1.752502202987671, - "logps/chosen": -487.286376953125, - "logps/rejected": -552.1371459960938, - "loss": 0.5187, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.1521472930908203, - "rewards/margins": 0.8697667121887207, - "rewards/rejected": -3.021914005279541, + "logits/chosen": -0.652435302734375, + "logits/rejected": -0.25857192277908325, + "logps/chosen": -1941.6849365234375, + "logps/rejected": -2231.46337890625, + "loss": 1.6022, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -16.69613265991211, + "rewards/margins": 3.1190426349639893, + "rewards/rejected": -19.815174102783203, "step": 520 }, { "epoch": 0.554683411826269, - "grad_norm": 46.54511021652677, + "grad_norm": 147.71519410172087, "learning_rate": 2.4588575996495794e-07, - "logits/chosen": -0.278386652469635, - "logits/rejected": 0.8032172918319702, - "logps/chosen": -446.83624267578125, - "logps/rejected": -526.9205322265625, - "loss": 0.4996, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7808927297592163, - "rewards/margins": 0.9380749464035034, - "rewards/rejected": -2.7189676761627197, + "logits/chosen": -0.6198351979255676, + "logits/rejected": -0.19036616384983063, + "logps/chosen": -1601.6470947265625, + "logps/rejected": -1820.4556884765625, + "loss": 1.5136, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -13.329002380371094, + "rewards/margins": 2.3253164291381836, + "rewards/rejected": -15.654316902160645, "step": 530 }, { "epoch": 0.565149136577708, - "grad_norm": 32.98937848769124, + "grad_norm": 146.6770433780799, "learning_rate": 2.367486188632446e-07, - "logits/chosen": -0.7310466170310974, - "logits/rejected": 0.3242616355419159, - "logps/chosen": -452.5191345214844, - "logps/rejected": -511.3818359375, - "loss": 0.513, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.6216903924942017, - "rewards/margins": 0.8862103223800659, - "rewards/rejected": -2.5079007148742676, + "logits/chosen": -0.7303057909011841, + "logits/rejected": 0.15564236044883728, + "logps/chosen": -1670.916015625, + "logps/rejected": -2011.5406494140625, + "loss": 1.5458, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.805659294128418, + "rewards/margins": 3.703829288482666, + "rewards/rejected": -17.509489059448242, "step": 540 }, { "epoch": 0.5756148613291471, - "grad_norm": 28.08745252820734, + "grad_norm": 206.94359776232758, "learning_rate": 2.276292003092593e-07, - "logits/chosen": -0.819686770439148, - "logits/rejected": 0.32188865542411804, - "logps/chosen": -443.1897888183594, - "logps/rejected": -494.9161071777344, - "loss": 0.526, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.719175100326538, - "rewards/margins": 0.9371669888496399, - "rewards/rejected": -2.656342029571533, + "logits/chosen": -0.22513580322265625, + "logits/rejected": 0.4895138740539551, + "logps/chosen": -1914.7532958984375, + "logps/rejected": -2300.30322265625, + "loss": 1.6801, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -16.434810638427734, + "rewards/margins": 4.275403022766113, + "rewards/rejected": -20.71021270751953, "step": 550 }, { "epoch": 0.5860805860805861, - "grad_norm": 34.81864414663528, + "grad_norm": 175.41735239090949, "learning_rate": 2.185397007170141e-07, - "logits/chosen": -0.32582345604896545, - "logits/rejected": 1.0776915550231934, - "logps/chosen": -462.23406982421875, - "logps/rejected": -525.7540893554688, - "loss": 0.5104, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.7941694259643555, - "rewards/margins": 0.9143456220626831, - "rewards/rejected": -2.708515167236328, + "logits/chosen": -0.1453290730714798, + "logits/rejected": 0.3121495842933655, + "logps/chosen": -1876.300537109375, + "logps/rejected": -2229.38134765625, + "loss": 1.3878, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -15.934832572937012, + "rewards/margins": 3.8099570274353027, + "rewards/rejected": -19.744789123535156, "step": 560 }, { "epoch": 0.5965463108320251, - "grad_norm": 32.06431987316443, + "grad_norm": 142.79294258337345, "learning_rate": 2.094922764865619e-07, - "logits/chosen": -0.5136072635650635, - "logits/rejected": 0.8532153367996216, - "logps/chosen": -462.86260986328125, - "logps/rejected": -532.7152099609375, - "loss": 0.4848, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.8336808681488037, - "rewards/margins": 0.8313940167427063, - "rewards/rejected": -2.6650748252868652, + "logits/chosen": -0.276650995016098, + "logits/rejected": 0.13945253193378448, + "logps/chosen": -1827.0634765625, + "logps/rejected": -2034.280517578125, + "loss": 1.4902, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -15.475687980651855, + "rewards/margins": 2.2050392627716064, + "rewards/rejected": -17.680728912353516, "step": 570 }, { "epoch": 0.6070120355834642, - "grad_norm": 38.10097117955034, + "grad_norm": 245.80968468908674, "learning_rate": 2.0049902774588797e-07, - "logits/chosen": -0.3451794385910034, - "logits/rejected": 0.9854526519775391, - "logps/chosen": -424.631103515625, - "logps/rejected": -478.0337829589844, - "loss": 0.5039, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.7129182815551758, - "rewards/margins": 0.9391329884529114, - "rewards/rejected": -2.6520514488220215, + "logits/chosen": -0.011815989390015602, + "logits/rejected": 0.42436084151268005, + "logps/chosen": -1794.5543212890625, + "logps/rejected": -2061.93310546875, + "loss": 1.4461, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -15.412150382995605, + "rewards/margins": 3.078895330429077, + "rewards/rejected": -18.491044998168945, "step": 580 }, { "epoch": 0.6174777603349032, - "grad_norm": 31.542114777873778, + "grad_norm": 175.38280547329734, "learning_rate": 1.9157198216806238e-07, - "logits/chosen": -0.5171893239021301, - "logits/rejected": 0.9677211046218872, - "logps/chosen": -454.8963928222656, - "logps/rejected": -536.4180908203125, - "loss": 0.4838, + "logits/chosen": -0.3044319152832031, + "logits/rejected": 0.3406422734260559, + "logps/chosen": -1649.8509521484375, + "logps/rejected": -2006.366455078125, + "loss": 1.5446, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.7694385051727295, - "rewards/margins": 0.9072266817092896, - "rewards/rejected": -2.6766653060913086, + "rewards/chosen": -13.71898365020752, + "rewards/margins": 3.657163143157959, + "rewards/rejected": -17.376148223876953, "step": 590 }, { "epoch": 0.6279434850863422, - "grad_norm": 29.346076446463687, + "grad_norm": 203.04339818262545, "learning_rate": 1.8272307888529274e-07, - "logits/chosen": -0.03304073214530945, - "logits/rejected": 1.3204997777938843, - "logps/chosen": -449.006103515625, - "logps/rejected": -523.1787719726562, - "loss": 0.4664, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.6648248434066772, - "rewards/margins": 0.9661164283752441, - "rewards/rejected": -2.630941390991211, + "logits/chosen": 0.16477735340595245, + "logits/rejected": 0.6171606183052063, + "logps/chosen": -1870.41015625, + "logps/rejected": -2165.638427734375, + "loss": 1.6032, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -15.878863334655762, + "rewards/margins": 3.1766743659973145, + "rewards/rejected": -19.055538177490234, "step": 600 }, { "epoch": 0.6279434850863422, - "eval_logits/chosen": -0.26891058683395386, - "eval_logits/rejected": 1.055680513381958, - "eval_logps/chosen": -450.7947692871094, - "eval_logps/rejected": -530.9400634765625, - "eval_loss": 0.49440109729766846, - "eval_rewards/accuracies": 0.7698412537574768, - "eval_rewards/chosen": -1.6882680654525757, - "eval_rewards/margins": 1.01906418800354, - "eval_rewards/rejected": -2.7073323726654053, - "eval_runtime": 176.7399, - "eval_samples_per_second": 11.316, - "eval_steps_per_second": 0.356, + "eval_logits/chosen": 0.01903720200061798, + "eval_logits/rejected": 0.6402472853660583, + "eval_logps/chosen": -2015.7071533203125, + "eval_logps/rejected": -2402.58447265625, + "eval_loss": 1.4751698970794678, + "eval_rewards/accuracies": 0.6230158805847168, + "eval_rewards/chosen": -17.33738899230957, + "eval_rewards/margins": 4.086385250091553, + "eval_rewards/rejected": -21.42377471923828, + "eval_runtime": 176.4506, + "eval_samples_per_second": 11.335, + "eval_steps_per_second": 0.357, "step": 600 }, { "epoch": 0.6384092098377813, - "grad_norm": 37.553002444239766, + "grad_norm": 184.64896406440843, "learning_rate": 1.7396415252139288e-07, - "logits/chosen": -0.38198137283325195, - "logits/rejected": 1.3065571784973145, - "logps/chosen": -473.23046875, - "logps/rejected": -518.6094360351562, - "loss": 0.4964, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.6766010522842407, - "rewards/margins": 1.062415361404419, - "rewards/rejected": -2.73901629447937, + "logits/chosen": 0.0034784465096890926, + "logits/rejected": 0.6044633388519287, + "logps/chosen": -2050.113037109375, + "logps/rejected": -2622.564453125, + "loss": 1.5229, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -17.44542694091797, + "rewards/margins": 6.333140850067139, + "rewards/rejected": -23.778566360473633, "step": 610 }, { "epoch": 0.6488749345892203, - "grad_norm": 42.193220673193395, + "grad_norm": 150.92780625161797, "learning_rate": 1.6530691736402316e-07, - "logits/chosen": -0.21174755692481995, - "logits/rejected": 0.9794837236404419, - "logps/chosen": -429.8130798339844, - "logps/rejected": -492.4026794433594, - "loss": 0.4665, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.5878998041152954, - "rewards/margins": 0.9720605611801147, - "rewards/rejected": -2.5599606037139893, + "logits/chosen": -0.05873150750994682, + "logits/rejected": 0.2572210133075714, + "logps/chosen": -1822.690185546875, + "logps/rejected": -2140.002685546875, + "loss": 1.3047, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -15.516670227050781, + "rewards/margins": 3.519291400909424, + "rewards/rejected": -19.035961151123047, "step": 620 }, { "epoch": 0.6593406593406593, - "grad_norm": 38.97702529967974, + "grad_norm": 158.62413320623054, "learning_rate": 1.5676295169786864e-07, - "logits/chosen": -0.6409791707992554, - "logits/rejected": 0.6868582367897034, - "logps/chosen": -423.3990783691406, - "logps/rejected": -487.27117919921875, - "loss": 0.4718, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.4871389865875244, - "rewards/margins": 1.025777816772461, - "rewards/rejected": -2.5129168033599854, + "logits/chosen": -0.5535549521446228, + "logits/rejected": -0.16974008083343506, + "logps/chosen": -1799.411376953125, + "logps/rejected": -2184.095458984375, + "loss": 1.4004, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -15.247261047363281, + "rewards/margins": 4.233900547027588, + "rewards/rejected": -19.481159210205078, "step": 630 }, { "epoch": 0.6698063840920984, - "grad_norm": 47.68708831419142, + "grad_norm": 174.63990723873954, "learning_rate": 1.483436823197092e-07, - "logits/chosen": -0.141208216547966, - "logits/rejected": 1.2582366466522217, - "logps/chosen": -447.75701904296875, - "logps/rejected": -507.4087829589844, - "loss": 0.4792, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.8073606491088867, - "rewards/margins": 0.9799903631210327, - "rewards/rejected": -2.787351131439209, + "logits/chosen": -0.49727511405944824, + "logits/rejected": -0.09024439752101898, + "logps/chosen": -1910.181396484375, + "logps/rejected": -2272.175537109375, + "loss": 1.2582, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -16.431602478027344, + "rewards/margins": 4.003415107727051, + "rewards/rejected": -20.43501853942871, "step": 640 }, { "epoch": 0.6802721088435374, - "grad_norm": 58.635870372435406, + "grad_norm": 212.30897956956616, "learning_rate": 1.4006036925609243e-07, - "logits/chosen": -0.6169118881225586, - "logits/rejected": 0.4020913243293762, - "logps/chosen": -419.74786376953125, - "logps/rejected": -509.29840087890625, - "loss": 0.4833, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.5206490755081177, - "rewards/margins": 0.8552547693252563, - "rewards/rejected": -2.375903606414795, + "logits/chosen": -0.5441917777061462, + "logits/rejected": -0.3759006857872009, + "logps/chosen": -1762.1038818359375, + "logps/rejected": -1993.853515625, + "loss": 1.3183, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -14.94421100616455, + "rewards/margins": 2.2772457599639893, + "rewards/rejected": -17.221454620361328, "step": 650 }, { "epoch": 0.6907378335949764, - "grad_norm": 38.14798973487795, + "grad_norm": 122.40725726992933, "learning_rate": 1.319240907040458e-07, - "logits/chosen": -0.6595967411994934, - "logits/rejected": 0.701393723487854, - "logps/chosen": -430.74365234375, - "logps/rejected": -514.9932250976562, - "loss": 0.4825, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.485804557800293, - "rewards/margins": 1.0895816087722778, - "rewards/rejected": -2.5753860473632812, + "logits/chosen": -0.578727126121521, + "logits/rejected": -0.15290720760822296, + "logps/chosen": -1786.3648681640625, + "logps/rejected": -2103.92919921875, + "loss": 1.5482, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -15.042015075683594, + "rewards/margins": 3.422727584838867, + "rewards/rejected": -18.46474266052246, "step": 660 }, { "epoch": 0.7012035583464155, - "grad_norm": 45.96703369486118, + "grad_norm": 273.40146184819037, "learning_rate": 1.239457282149695e-07, - "logits/chosen": -0.6679301261901855, - "logits/rejected": 0.42252644896507263, - "logps/chosen": -438.337158203125, - "logps/rejected": -540.0033569335938, - "loss": 0.472, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.4978899955749512, - "rewards/margins": 1.0711971521377563, - "rewards/rejected": -2.569087028503418, + "logits/chosen": -0.6542818546295166, + "logits/rejected": -0.6405806541442871, + "logps/chosen": -1718.8697509765625, + "logps/rejected": -2025.167236328125, + "loss": 1.1528, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -14.303213119506836, + "rewards/margins": 3.1175124645233154, + "rewards/rejected": -17.420726776123047, "step": 670 }, { "epoch": 0.7116692830978545, - "grad_norm": 36.89505960468928, + "grad_norm": 246.28508875936285, "learning_rate": 1.1613595214152711e-07, - "logits/chosen": -0.2877568006515503, - "logits/rejected": 0.7360206246376038, - "logps/chosen": -468.3255920410156, - "logps/rejected": -552.3143310546875, - "loss": 0.5059, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.5666911602020264, - "rewards/margins": 1.0774924755096436, - "rewards/rejected": -2.64418363571167, + "logits/chosen": -0.6755629777908325, + "logits/rejected": -0.26193898916244507, + "logps/chosen": -1862.4000244140625, + "logps/rejected": -2191.969482421875, + "loss": 1.3671, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -15.507433891296387, + "rewards/margins": 3.5333023071289062, + "rewards/rejected": -19.04073715209961, "step": 680 }, { "epoch": 0.7221350078492935, - "grad_norm": 26.931344238941527, + "grad_norm": 216.14843384209277, "learning_rate": 1.0850520736699362e-07, - "logits/chosen": -0.49702662229537964, - "logits/rejected": 0.9417774081230164, - "logps/chosen": -430.6412658691406, - "logps/rejected": -496.180419921875, - "loss": 0.4527, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.5998663902282715, - "rewards/margins": 1.0349512100219727, - "rewards/rejected": -2.634817361831665, + "logits/chosen": -0.6002136468887329, + "logits/rejected": 0.03606845811009407, + "logps/chosen": -1838.6982421875, + "logps/rejected": -2214.07470703125, + "loss": 1.3895, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -15.680435180664062, + "rewards/margins": 4.133326053619385, + "rewards/rejected": -19.813762664794922, "step": 690 }, { "epoch": 0.7326007326007326, - "grad_norm": 34.042068496483864, + "grad_norm": 162.01079027631573, "learning_rate": 1.0106369933615042e-07, - "logits/chosen": -0.22161564230918884, - "logits/rejected": 0.7004293203353882, - "logps/chosen": -416.830322265625, - "logps/rejected": -488.65106201171875, - "loss": 0.4716, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.6273488998413086, - "rewards/margins": 0.8736777305603027, - "rewards/rejected": -2.5010266304016113, + "logits/chosen": -0.7846351861953735, + "logits/rejected": -0.5915166735649109, + "logps/chosen": -1752.784423828125, + "logps/rejected": -2021.7802734375, + "loss": 1.5039, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -14.986889839172363, + "rewards/margins": 2.845428943634033, + "rewards/rejected": -17.832317352294922, "step": 700 }, { "epoch": 0.7326007326007326, - "eval_logits/chosen": -0.3817674517631531, - "eval_logits/rejected": 0.7091061472892761, - "eval_logps/chosen": -438.7652893066406, - "eval_logps/rejected": -520.2406005859375, - "eval_loss": 0.49287474155426025, - "eval_rewards/accuracies": 0.7777777910232544, - "eval_rewards/chosen": -1.5679733753204346, - "eval_rewards/margins": 1.032364010810852, - "eval_rewards/rejected": -2.600337505340576, - "eval_runtime": 175.3398, - "eval_samples_per_second": 11.406, - "eval_steps_per_second": 0.359, + "eval_logits/chosen": -0.8898468017578125, + "eval_logits/rejected": -0.4967605769634247, + "eval_logps/chosen": -1694.96240234375, + "eval_logps/rejected": -2016.4490966796875, + "eval_loss": 1.3852962255477905, + "eval_rewards/accuracies": 0.6527777910232544, + "eval_rewards/chosen": -14.129942893981934, + "eval_rewards/margins": 3.432478666305542, + "eval_rewards/rejected": -17.562421798706055, + "eval_runtime": 176.0679, + "eval_samples_per_second": 11.359, + "eval_steps_per_second": 0.358, "step": 700 }, { "epoch": 0.7430664573521716, - "grad_norm": 34.12335024104434, + "grad_norm": 177.45761000957364, "learning_rate": 9.382138040640714e-08, - "logits/chosen": -0.200923353433609, - "logits/rejected": 1.0540679693222046, - "logps/chosen": -429.7672424316406, - "logps/rejected": -506.0777282714844, - "loss": 0.5254, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.7129242420196533, - "rewards/margins": 0.9184734225273132, - "rewards/rejected": -2.6313977241516113, + "logits/chosen": -1.012629747390747, + "logits/rejected": -0.6268833875656128, + "logps/chosen": -1776.499755859375, + "logps/rejected": -2017.539794921875, + "loss": 1.5264, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -15.180249214172363, + "rewards/margins": 2.565770387649536, + "rewards/rejected": -17.74601936340332, "step": 710 }, { "epoch": 0.7535321821036107, - "grad_norm": 30.02495734311258, + "grad_norm": 140.94359920373847, "learning_rate": 8.678793653740632e-08, - "logits/chosen": -0.33009666204452515, - "logits/rejected": 0.9047529101371765, - "logps/chosen": -432.42010498046875, - "logps/rejected": -512.3065185546875, - "loss": 0.4648, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.4868285655975342, - "rewards/margins": 1.1614835262298584, - "rewards/rejected": -2.6483120918273926, + "logits/chosen": -0.9271895289421082, + "logits/rejected": -0.47789469361305237, + "logps/chosen": -1664.4437255859375, + "logps/rejected": -1977.908447265625, + "loss": 1.3295, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -13.8070650100708, + "rewards/margins": 3.4972636699676514, + "rewards/rejected": -17.3043270111084, "step": 720 }, { "epoch": 0.7639979068550498, - "grad_norm": 38.12612104973478, + "grad_norm": 190.75937551525504, "learning_rate": 7.997277433690983e-08, - "logits/chosen": -0.24449904263019562, - "logits/rejected": 0.697944164276123, - "logps/chosen": -442.3524475097656, - "logps/rejected": -493.40228271484375, - "loss": 0.4935, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.7699024677276611, - "rewards/margins": 0.7942432761192322, - "rewards/rejected": -2.564145565032959, + "logits/chosen": -0.8303499221801758, + "logits/rejected": -0.2948521077632904, + "logps/chosen": -1813.2340087890625, + "logps/rejected": -2049.240234375, + "loss": 1.4631, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -15.478715896606445, + "rewards/margins": 2.6438088417053223, + "rewards/rejected": -18.12252426147461, "step": 730 }, { "epoch": 0.7744636316064888, - "grad_norm": 34.63386515718176, + "grad_norm": 164.74206538760382, "learning_rate": 7.338500848029602e-08, - "logits/chosen": -0.09529142081737518, - "logits/rejected": 0.7665776610374451, - "logps/chosen": -415.37127685546875, - "logps/rejected": -480.74981689453125, - "loss": 0.5094, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.6250483989715576, - "rewards/margins": 0.8600455522537231, - "rewards/rejected": -2.485093832015991, + "logits/chosen": -0.6835179924964905, + "logits/rejected": -0.42263850569725037, + "logps/chosen": -1808.6490478515625, + "logps/rejected": -2096.81396484375, + "loss": 1.2242, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -15.557826042175293, + "rewards/margins": 3.0879101753234863, + "rewards/rejected": -18.645736694335938, "step": 740 }, { "epoch": 0.7849293563579278, - "grad_norm": 38.170157629518556, + "grad_norm": 135.0757551116068, "learning_rate": 6.70334495204884e-08, - "logits/chosen": -0.16666540503501892, - "logits/rejected": 0.6466744542121887, - "logps/chosen": -414.00482177734375, - "logps/rejected": -509.15478515625, - "loss": 0.4725, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.6713943481445312, - "rewards/margins": 0.9836255311965942, - "rewards/rejected": -2.655019760131836, + "logits/chosen": -0.5583680868148804, + "logits/rejected": -0.36530551314353943, + "logps/chosen": -1854.912353515625, + "logps/rejected": -2177.262451171875, + "loss": 1.3344, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -16.08046531677246, + "rewards/margins": 3.2556281089782715, + "rewards/rejected": -19.33609390258789, "step": 750 }, { "epoch": 0.7953950811093669, - "grad_norm": 32.154322102412, + "grad_norm": 158.01784405358154, "learning_rate": 6.092659210462231e-08, - "logits/chosen": -0.10375770181417465, - "logits/rejected": 0.7752261757850647, - "logps/chosen": -410.65399169921875, - "logps/rejected": -507.827880859375, - "loss": 0.5116, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.6930710077285767, - "rewards/margins": 0.9550973773002625, - "rewards/rejected": -2.6481685638427734, + "logits/chosen": -0.653573215007782, + "logits/rejected": -0.4876467287540436, + "logps/chosen": -1903.880615234375, + "logps/rejected": -2182.48291015625, + "loss": 1.4038, + "rewards/accuracies": 0.5625, + "rewards/chosen": -16.625337600708008, + "rewards/margins": 2.7693800926208496, + "rewards/rejected": -19.394718170166016, "step": 760 }, { "epoch": 0.8058608058608059, - "grad_norm": 29.7627819143098, + "grad_norm": 169.97964049682443, "learning_rate": 5.507260361320737e-08, - "logits/chosen": -0.4120585322380066, - "logits/rejected": 0.24446937441825867, - "logps/chosen": -466.15069580078125, - "logps/rejected": -568.6295776367188, - "loss": 0.4641, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.6279999017715454, - "rewards/margins": 0.8385451436042786, - "rewards/rejected": -2.4665448665618896, + "logits/chosen": -1.0366981029510498, + "logits/rejected": -0.9037246704101562, + "logps/chosen": -1879.755126953125, + "logps/rejected": -2001.697265625, + "loss": 1.286, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -15.764042854309082, + "rewards/margins": 1.033178687095642, + "rewards/rejected": -16.797222137451172, "step": 770 }, { "epoch": 0.8163265306122449, - "grad_norm": 40.436476118941954, + "grad_norm": 162.02338031146334, "learning_rate": 4.947931323697982e-08, - "logits/chosen": -0.4582279324531555, - "logits/rejected": 0.3044646382331848, - "logps/chosen": -473.2518005371094, - "logps/rejected": -511.49334716796875, - "loss": 0.5022, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.542907953262329, - "rewards/margins": 0.8963887095451355, - "rewards/rejected": -2.4392964839935303, + "logits/chosen": -1.0304605960845947, + "logits/rejected": -0.9400796890258789, + "logps/chosen": -1669.2073974609375, + "logps/rejected": -2004.525390625, + "loss": 1.5927, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -13.502462387084961, + "rewards/margins": 3.8671538829803467, + "rewards/rejected": -17.369617462158203, "step": 780 }, { "epoch": 0.826792255363684, - "grad_norm": 38.14021936834171, + "grad_norm": 140.7368428333841, "learning_rate": 4.415420150605398e-08, - "logits/chosen": -0.21675653755664825, - "logits/rejected": 1.2191559076309204, - "logps/chosen": -446.891357421875, - "logps/rejected": -514.4319458007812, - "loss": 0.5074, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.6831605434417725, - "rewards/margins": 1.0085071325302124, - "rewards/rejected": -2.6916680335998535, + "logits/chosen": -1.0811887979507446, + "logits/rejected": -0.5253428220748901, + "logps/chosen": -1726.182373046875, + "logps/rejected": -2063.27099609375, + "loss": 1.4648, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -14.476069450378418, + "rewards/margins": 3.7039875984191895, + "rewards/rejected": -18.180057525634766, "step": 790 }, { "epoch": 0.837257980115123, - "grad_norm": 33.872578527467745, + "grad_norm": 202.82775780509928, "learning_rate": 3.9104390285376374e-08, - "logits/chosen": 0.09026177227497101, - "logits/rejected": 0.9493114352226257, - "logps/chosen": -439.46429443359375, - "logps/rejected": -529.3217163085938, - "loss": 0.4741, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.7335838079452515, - "rewards/margins": 0.9264078140258789, - "rewards/rejected": -2.65999174118042, + "logits/chosen": -0.835501492023468, + "logits/rejected": -0.5900505781173706, + "logps/chosen": -1749.853759765625, + "logps/rejected": -1951.329345703125, + "loss": 1.3527, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -14.83747673034668, + "rewards/margins": 2.0425891876220703, + "rewards/rejected": -16.88006591796875, "step": 800 }, { "epoch": 0.837257980115123, - "eval_logits/chosen": -0.3940890431404114, - "eval_logits/rejected": 0.6599191427230835, - "eval_logps/chosen": -437.01190185546875, - "eval_logps/rejected": -517.6414794921875, - "eval_loss": 0.49065762758255005, - "eval_rewards/accuracies": 0.7777777910232544, - "eval_rewards/chosen": -1.5504390001296997, - "eval_rewards/margins": 1.02390718460083, - "eval_rewards/rejected": -2.5743465423583984, - "eval_runtime": 176.8304, - "eval_samples_per_second": 11.31, - "eval_steps_per_second": 0.356, + "eval_logits/chosen": -1.0374784469604492, + "eval_logits/rejected": -0.6750361919403076, + "eval_logps/chosen": -1672.130615234375, + "eval_logps/rejected": -1986.035888671875, + "eval_loss": 1.366306185722351, + "eval_rewards/accuracies": 0.6448412537574768, + "eval_rewards/chosen": -13.901623725891113, + "eval_rewards/margins": 3.3566668033599854, + "eval_rewards/rejected": -17.25829315185547, + "eval_runtime": 176.0547, + "eval_samples_per_second": 11.36, + "eval_steps_per_second": 0.358, "step": 800 }, { "epoch": 0.847723704866562, - "grad_norm": 32.1124717145771, + "grad_norm": 218.80895490878117, "learning_rate": 3.433663324986208e-08, - "logits/chosen": -0.3709171712398529, - "logits/rejected": 0.708153486251831, - "logps/chosen": -451.8567810058594, - "logps/rejected": -501.13153076171875, - "loss": 0.4866, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.606183409690857, - "rewards/margins": 0.9406149983406067, - "rewards/rejected": -2.5467987060546875, + "logits/chosen": -1.2597501277923584, + "logits/rejected": -0.7243804931640625, + "logps/chosen": -1665.3489990234375, + "logps/rejected": -2045.541259765625, + "loss": 1.4186, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -13.741106986999512, + "rewards/margins": 4.249786853790283, + "rewards/rejected": -17.990894317626953, "step": 810 }, { "epoch": 0.858189429618001, - "grad_norm": 36.42213321353283, + "grad_norm": 175.67069527310957, "learning_rate": 2.9857306851953897e-08, - "logits/chosen": -0.5197598934173584, - "logits/rejected": 0.44594240188598633, - "logps/chosen": -462.8509216308594, - "logps/rejected": -537.5787963867188, - "loss": 0.498, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.5807650089263916, - "rewards/margins": 0.9640032052993774, - "rewards/rejected": -2.5447685718536377, + "logits/chosen": -1.075448751449585, + "logits/rejected": -0.8459098935127258, + "logps/chosen": -1705.802734375, + "logps/rejected": -1971.207275390625, + "loss": 1.1819, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -14.010282516479492, + "rewards/margins": 2.8707687854766846, + "rewards/rejected": -16.881052017211914, "step": 820 }, { "epoch": 0.8686551543694401, - "grad_norm": 33.241447274140704, + "grad_norm": 150.14969837730865, "learning_rate": 2.567240179368185e-08, - "logits/chosen": -0.35766562819480896, - "logits/rejected": 0.4714294373989105, - "logps/chosen": -418.21746826171875, - "logps/rejected": -512.0104370117188, - "loss": 0.5054, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.600725531578064, - "rewards/margins": 1.0214117765426636, - "rewards/rejected": -2.6221375465393066, + "logits/chosen": -0.8211779594421387, + "logits/rejected": -0.672277569770813, + "logps/chosen": -1724.1959228515625, + "logps/rejected": -1975.289306640625, + "loss": 1.3771, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -14.66050910949707, + "rewards/margins": 2.5944151878356934, + "rewards/rejected": -17.25492286682129, "step": 830 }, { "epoch": 0.8791208791208791, - "grad_norm": 30.190104504480274, + "grad_norm": 143.51050018041488, "learning_rate": 2.1787515014630357e-08, - "logits/chosen": -0.35540395975112915, - "logits/rejected": 0.8115441203117371, - "logps/chosen": -448.0243225097656, - "logps/rejected": -531.029541015625, - "loss": 0.4616, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.6575037240982056, - "rewards/margins": 0.9920711517333984, - "rewards/rejected": -2.6495752334594727, + "logits/chosen": -0.9592329263687134, + "logits/rejected": -0.6304475665092468, + "logps/chosen": -1664.050048828125, + "logps/rejected": -2090.85107421875, + "loss": 1.1841, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -13.817761421203613, + "rewards/margins": 4.430028915405273, + "rewards/rejected": -18.247791290283203, "step": 840 }, { "epoch": 0.8895866038723181, - "grad_norm": 41.82041936139999, + "grad_norm": 160.09590738302992, "learning_rate": 1.820784220652766e-08, - "logits/chosen": -0.12738969922065735, - "logits/rejected": 1.10179603099823, - "logps/chosen": -434.4571228027344, - "logps/rejected": -497.9303283691406, - "loss": 0.455, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.5362211465835571, - "rewards/margins": 1.1085121631622314, - "rewards/rejected": -2.64473295211792, + "logits/chosen": -0.8976573944091797, + "logits/rejected": -0.619744598865509, + "logps/chosen": -1732.6185302734375, + "logps/rejected": -2009.6126708984375, + "loss": 1.3946, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -14.51783561706543, + "rewards/margins": 3.2437214851379395, + "rewards/rejected": -17.761554718017578, "step": 850 }, { "epoch": 0.9000523286237572, - "grad_norm": 34.694500021885084, + "grad_norm": 140.45079725700174, "learning_rate": 1.4938170864468636e-08, - "logits/chosen": -0.25795820355415344, - "logits/rejected": 0.9191703796386719, - "logps/chosen": -451.52752685546875, - "logps/rejected": -523.0709838867188, - "loss": 0.4787, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.6323562860488892, - "rewards/margins": 0.984820544719696, - "rewards/rejected": -2.6171765327453613, + "logits/chosen": -1.2183126211166382, + "logits/rejected": -0.7451462149620056, + "logps/chosen": -1663.8861083984375, + "logps/rejected": -2030.5501708984375, + "loss": 1.403, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -13.755941390991211, + "rewards/margins": 3.9360270500183105, + "rewards/rejected": -17.691970825195312, "step": 860 }, { "epoch": 0.9105180533751962, - "grad_norm": 34.514941887456985, + "grad_norm": 177.87764974909854, "learning_rate": 1.1982873884064465e-08, - "logits/chosen": -0.48437461256980896, - "logits/rejected": 0.7289548516273499, - "logps/chosen": -428.81390380859375, - "logps/rejected": -501.2752990722656, - "loss": 0.489, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.5704351663589478, - "rewards/margins": 0.9770514369010925, - "rewards/rejected": -2.5474865436553955, + "logits/chosen": -1.142114281654358, + "logits/rejected": -0.8570957183837891, + "logps/chosen": -1702.1165771484375, + "logps/rejected": -2053.07568359375, + "loss": 1.364, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -14.30346393585205, + "rewards/margins": 3.762028932571411, + "rewards/rejected": -18.06549072265625, "step": 870 }, { "epoch": 0.9209837781266352, - "grad_norm": 33.54810196280602, + "grad_norm": 138.3301348415624, "learning_rate": 9.345903713082304e-09, - "logits/chosen": -0.3731919825077057, - "logits/rejected": 0.4349101185798645, - "logps/chosen": -459.6202697753906, - "logps/rejected": -546.8380126953125, - "loss": 0.4823, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5548641681671143, - "rewards/margins": 0.9671527743339539, - "rewards/rejected": -2.522017002105713, + "logits/chosen": -1.0760080814361572, + "logits/rejected": -0.866096019744873, + "logps/chosen": -1735.3382568359375, + "logps/rejected": -2023.660888671875, + "loss": 1.355, + "rewards/accuracies": 0.5625, + "rewards/chosen": -14.312044143676758, + "rewards/margins": 2.978205680847168, + "rewards/rejected": -17.290246963500977, "step": 880 }, { "epoch": 0.9314495028780743, - "grad_norm": 32.531647089433314, + "grad_norm": 179.16273994251034, "learning_rate": 7.030787065396865e-09, - "logits/chosen": -0.3866179585456848, - "logits/rejected": 0.5604432225227356, - "logps/chosen": -444.09967041015625, - "logps/rejected": -537.5520629882812, - "loss": 0.4825, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.5642650127410889, - "rewards/margins": 1.0092883110046387, - "rewards/rejected": -2.5735533237457275, + "logits/chosen": -1.0234577655792236, + "logits/rejected": -0.9720734357833862, + "logps/chosen": -1736.5269775390625, + "logps/rejected": -2083.37939453125, + "loss": 1.4332, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -14.488537788391113, + "rewards/margins": 3.543290376663208, + "rewards/rejected": -18.031827926635742, "step": 890 }, { "epoch": 0.9419152276295133, - "grad_norm": 41.37080153124819, + "grad_norm": 163.4835379161221, "learning_rate": 5.04062020432286e-09, - "logits/chosen": -0.411233514547348, - "logits/rejected": 0.5156325101852417, - "logps/chosen": -434.09906005859375, - "logps/rejected": -530.5025024414062, - "loss": 0.4741, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.5370757579803467, - "rewards/margins": 0.953914999961853, - "rewards/rejected": -2.49099063873291, + "logits/chosen": -0.8189510107040405, + "logits/rejected": -0.8584410548210144, + "logps/chosen": -1706.8818359375, + "logps/rejected": -1968.8441162109375, + "loss": 1.5137, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.264904975891113, + "rewards/margins": 2.609503984451294, + "rewards/rejected": -16.874408721923828, "step": 900 }, { "epoch": 0.9419152276295133, - "eval_logits/chosen": -0.3568094074726105, - "eval_logits/rejected": 0.7152466773986816, - "eval_logps/chosen": -440.10498046875, - "eval_logps/rejected": -523.0108032226562, - "eval_loss": 0.49055662751197815, - "eval_rewards/accuracies": 0.7698412537574768, - "eval_rewards/chosen": -1.5813697576522827, - "eval_rewards/margins": 1.0466694831848145, - "eval_rewards/rejected": -2.6280391216278076, - "eval_runtime": 175.5396, - "eval_samples_per_second": 11.393, - "eval_steps_per_second": 0.359, + "eval_logits/chosen": -1.001752495765686, + "eval_logits/rejected": -0.673967182636261, + "eval_logps/chosen": -1735.9151611328125, + "eval_logps/rejected": -2073.3388671875, + "eval_loss": 1.3373700380325317, + "eval_rewards/accuracies": 0.6408730149269104, + "eval_rewards/chosen": -14.539473533630371, + "eval_rewards/margins": 3.5918467044830322, + "eval_rewards/rejected": -18.13132095336914, + "eval_runtime": 176.3334, + "eval_samples_per_second": 11.342, + "eval_steps_per_second": 0.357, "step": 900 }, { "epoch": 0.9523809523809523, - "grad_norm": 47.043240406771055, + "grad_norm": 190.32378571700949, "learning_rate": 3.3780648016376866e-09, - "logits/chosen": -0.13907718658447266, - "logits/rejected": 0.8174427151679993, - "logps/chosen": -380.21087646484375, - "logps/rejected": -457.331298828125, - "loss": 0.4873, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5678941011428833, - "rewards/margins": 0.8766323924064636, - "rewards/rejected": -2.444526433944702, + "logits/chosen": -0.9321626424789429, + "logits/rejected": -0.5902298092842102, + "logps/chosen": -1696.779296875, + "logps/rejected": -1922.1607666015625, + "loss": 1.4578, + "rewards/accuracies": 0.625, + "rewards/chosen": -14.7335786819458, + "rewards/margins": 2.3592441082000732, + "rewards/rejected": -17.092823028564453, "step": 910 }, { "epoch": 0.9628466771323915, - "grad_norm": 41.04154104780731, + "grad_norm": 183.98567167006505, "learning_rate": 2.0453443778310766e-09, - "logits/chosen": -0.3518763482570648, - "logits/rejected": 0.7049046754837036, - "logps/chosen": -444.53271484375, - "logps/rejected": -510.38812255859375, - "loss": 0.4989, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.5789575576782227, - "rewards/margins": 1.008893370628357, - "rewards/rejected": -2.587851047515869, + "logits/chosen": -1.0600922107696533, + "logits/rejected": -0.7931039929389954, + "logps/chosen": -1763.392822265625, + "logps/rejected": -2107.805419921875, + "loss": 1.3202, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -14.767558097839355, + "rewards/margins": 3.794466495513916, + "rewards/rejected": -18.562023162841797, "step": 920 }, { "epoch": 0.9733124018838305, - "grad_norm": 31.21202125429197, + "grad_norm": 181.56437725274117, "learning_rate": 1.0442413283435758e-09, - "logits/chosen": -0.5221754312515259, - "logits/rejected": 1.012073040008545, - "logps/chosen": -460.8104553222656, - "logps/rejected": -517.5181884765625, - "loss": 0.4808, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.4597631692886353, - "rewards/margins": 1.07325279712677, - "rewards/rejected": -2.5330162048339844, + "logits/chosen": -1.1890182495117188, + "logits/rejected": -0.5295430421829224, + "logps/chosen": -1729.0921630859375, + "logps/rejected": -1985.2783203125, + "loss": 1.5669, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -14.14258098602295, + "rewards/margins": 3.068037748336792, + "rewards/rejected": -17.210617065429688, "step": 930 }, { "epoch": 0.9837781266352695, - "grad_norm": 31.068531683647514, + "grad_norm": 173.28786175289625, "learning_rate": 3.760945397705828e-10, - "logits/chosen": -0.38587886095046997, - "logits/rejected": 0.7057468891143799, - "logps/chosen": -410.48834228515625, - "logps/rejected": -507.0341796875, - "loss": 0.4773, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.4730709791183472, - "rewards/margins": 1.0000593662261963, - "rewards/rejected": -2.473130464553833, + "logits/chosen": -0.856045126914978, + "logits/rejected": -0.7398639917373657, + "logps/chosen": -1713.3883056640625, + "logps/rejected": -2039.740966796875, + "loss": 1.266, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.502069473266602, + "rewards/margins": 3.2981293201446533, + "rewards/rejected": -17.800199508666992, "step": 940 }, { "epoch": 0.9942438513867086, - "grad_norm": 32.463686030627095, + "grad_norm": 188.65879146663107, "learning_rate": 4.17975992204056e-11, - "logits/chosen": -0.2789713740348816, - "logits/rejected": 0.4086712896823883, - "logps/chosen": -464.32086181640625, - "logps/rejected": -530.4679565429688, - "loss": 0.4828, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.6147760152816772, - "rewards/margins": 0.8401002883911133, - "rewards/rejected": -2.45487642288208, + "logits/chosen": -1.168084740638733, + "logits/rejected": -0.8855546116828918, + "logps/chosen": -1736.102783203125, + "logps/rejected": -1955.3255615234375, + "loss": 1.4604, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -14.33259391784668, + "rewards/margins": 2.370856761932373, + "rewards/rejected": -16.70345115661621, "step": 950 }, { "epoch": 0.9994767137624281, "step": 955, "total_flos": 0.0, - "train_loss": 0.5247097397349891, - "train_runtime": 18219.8315, - "train_samples_per_second": 3.355, - "train_steps_per_second": 0.052 + "train_loss": 2.1165736393154604, + "train_runtime": 18133.1885, + "train_samples_per_second": 3.371, + "train_steps_per_second": 0.053 } ], "logging_steps": 10,