diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23247 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999183606825047, + "eval_steps": 100, + "global_step": 1531, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000653114539962446, + "grad_norm": 6.562415939922599, + "learning_rate": 1.2987012987012988e-09, + "logits/chosen": -0.8478949069976807, + "logits/rejected": -0.799842894077301, + "logps/chosen": -449.73687744140625, + "logps/rejected": -466.4884338378906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.001306229079924892, + "grad_norm": 6.748473020918118, + "learning_rate": 2.5974025974025976e-09, + "logits/chosen": -0.8114207983016968, + "logits/rejected": -0.803877592086792, + "logps/chosen": -473.9585876464844, + "logps/rejected": -491.62945556640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0019593436198873378, + "grad_norm": 7.844471184175687, + "learning_rate": 3.8961038961038956e-09, + "logits/chosen": -0.7223482728004456, + "logits/rejected": -0.7460772395133972, + "logps/chosen": -452.0518798828125, + "logps/rejected": -520.4376831054688, + "loss": 0.6924, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0006803108262829483, + "rewards/margins": 0.000374476658180356, + "rewards/rejected": -0.0010547875426709652, + "step": 3 + }, + { + "epoch": 0.002612458159849784, + "grad_norm": 6.158505172940123, + "learning_rate": 5.194805194805195e-09, + "logits/chosen": -0.7972275614738464, + "logits/rejected": -0.7775567770004272, + "logps/chosen": -464.388916015625, + "logps/rejected": -450.9453125, + "loss": 0.6932, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.002797861350700259, + "rewards/margins": 0.0031175969634205103, + "rewards/rejected": -0.000319735670927912, + "step": 4 + }, + { + "epoch": 0.0032655726998122294, + "grad_norm": 5.924997097764666, + "learning_rate": 6.493506493506492e-09, + "logits/chosen": -0.7506823539733887, + "logits/rejected": -0.7407633066177368, + "logps/chosen": -448.05523681640625, + "logps/rejected": -451.1369934082031, + "loss": 0.6933, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0005750393029302359, + "rewards/margins": 0.0015734098851680756, + "rewards/rejected": -0.0009983705822378397, + "step": 5 + }, + { + "epoch": 0.0039186872397746755, + "grad_norm": 7.131146828068107, + "learning_rate": 7.792207792207791e-09, + "logits/chosen": -0.7589026689529419, + "logits/rejected": -0.7306044101715088, + "logps/chosen": -467.2422790527344, + "logps/rejected": -473.4179382324219, + "loss": 0.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0016507150139659643, + "rewards/margins": -0.0006250384030863643, + "rewards/rejected": 0.0022757528349757195, + "step": 6 + }, + { + "epoch": 0.0045718017797371216, + "grad_norm": 6.131111671725116, + "learning_rate": 9.09090909090909e-09, + "logits/chosen": -0.8041960597038269, + "logits/rejected": -0.774326503276825, + "logps/chosen": -538.4379272460938, + "logps/rejected": -494.1611022949219, + "loss": 0.6936, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0003780794213525951, + "rewards/margins": 0.00037171837175264955, + "rewards/rejected": -0.0007497979095205665, + "step": 7 + }, + { + "epoch": 0.005224916319699568, + "grad_norm": 6.367194577822853, + "learning_rate": 1.038961038961039e-08, + "logits/chosen": -0.7674342393875122, + "logits/rejected": -0.7461612820625305, + "logps/chosen": -502.4452819824219, + "logps/rejected": -518.5669555664062, + "loss": 0.6934, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0011391757288947701, + "rewards/margins": 0.0007867790409363806, + "rewards/rejected": -0.0019259547116234899, + "step": 8 + }, + { + "epoch": 0.005878030859662013, + "grad_norm": 6.463891351458318, + "learning_rate": 1.1688311688311687e-08, + "logits/chosen": -0.850688636302948, + "logits/rejected": -0.8408986926078796, + "logps/chosen": -448.7418212890625, + "logps/rejected": -451.31463623046875, + "loss": 0.6933, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.002334318123757839, + "rewards/margins": 0.002614659955725074, + "rewards/rejected": -0.0002803421812132001, + "step": 9 + }, + { + "epoch": 0.006531145399624459, + "grad_norm": 6.550118208844482, + "learning_rate": 1.2987012987012985e-08, + "logits/chosen": -0.8453483581542969, + "logits/rejected": -0.8150793313980103, + "logps/chosen": -507.4414367675781, + "logps/rejected": -457.73309326171875, + "loss": 0.6932, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.002283277688547969, + "rewards/margins": 0.00012453558156266809, + "rewards/rejected": 0.0021587416995316744, + "step": 10 + }, + { + "epoch": 0.007184259939586905, + "grad_norm": 6.40311214722953, + "learning_rate": 1.4285714285714284e-08, + "logits/chosen": -0.5852610468864441, + "logits/rejected": -0.5376417636871338, + "logps/chosen": -440.3765869140625, + "logps/rejected": -440.29327392578125, + "loss": 0.693, + "rewards/accuracies": 0.40625, + "rewards/chosen": 0.00031277656671591103, + "rewards/margins": -0.00043199292849749327, + "rewards/rejected": 0.0007447696407325566, + "step": 11 + }, + { + "epoch": 0.007837374479549351, + "grad_norm": 6.088349060239392, + "learning_rate": 1.5584415584415582e-08, + "logits/chosen": -0.8110294938087463, + "logits/rejected": -0.7939715385437012, + "logps/chosen": -440.26983642578125, + "logps/rejected": -439.900390625, + "loss": 0.6924, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.00481039984151721, + "rewards/margins": 0.0019957683980464935, + "rewards/rejected": 0.00281463167630136, + "step": 12 + }, + { + "epoch": 0.008490489019511797, + "grad_norm": 5.861510809151376, + "learning_rate": 1.6883116883116882e-08, + "logits/chosen": -0.9300619959831238, + "logits/rejected": -0.9246059060096741, + "logps/chosen": -537.093017578125, + "logps/rejected": -536.7426147460938, + "loss": 0.6924, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0025931550189852715, + "rewards/margins": 0.0013525104150176048, + "rewards/rejected": -0.003945665434002876, + "step": 13 + }, + { + "epoch": 0.009143603559474243, + "grad_norm": 6.593136523571926, + "learning_rate": 1.818181818181818e-08, + "logits/chosen": -0.8136120438575745, + "logits/rejected": -0.8396366834640503, + "logps/chosen": -511.1154479980469, + "logps/rejected": -519.767578125, + "loss": 0.694, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0005540727870538831, + "rewards/margins": -0.002040934283286333, + "rewards/rejected": 0.0025950074195861816, + "step": 14 + }, + { + "epoch": 0.00979671809943669, + "grad_norm": 5.9190226098756344, + "learning_rate": 1.948051948051948e-08, + "logits/chosen": -0.8912380933761597, + "logits/rejected": -0.8504163026809692, + "logps/chosen": -480.95562744140625, + "logps/rejected": -442.54010009765625, + "loss": 0.6938, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.001571483677253127, + "rewards/margins": -0.0010143854888156056, + "rewards/rejected": -0.0005570981884375215, + "step": 15 + }, + { + "epoch": 0.010449832639399135, + "grad_norm": 6.038795223633495, + "learning_rate": 2.077922077922078e-08, + "logits/chosen": -0.897087812423706, + "logits/rejected": -0.8558621406555176, + "logps/chosen": -535.7008666992188, + "logps/rejected": -501.1187438964844, + "loss": 0.6933, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0005735683371312916, + "rewards/margins": 0.00031663902336731553, + "rewards/rejected": -0.0008902073604986072, + "step": 16 + }, + { + "epoch": 0.011102947179361581, + "grad_norm": 5.647231054712397, + "learning_rate": 2.2077922077922077e-08, + "logits/chosen": -0.7808964252471924, + "logits/rejected": -0.7509868144989014, + "logps/chosen": -404.5117492675781, + "logps/rejected": -420.18280029296875, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00039049627957865596, + "rewards/margins": -0.00038562045665457845, + "rewards/rejected": 0.0007761167944408953, + "step": 17 + }, + { + "epoch": 0.011756061719324026, + "grad_norm": 6.240516998552548, + "learning_rate": 2.3376623376623374e-08, + "logits/chosen": -0.9235209822654724, + "logits/rejected": -0.8612209558486938, + "logps/chosen": -483.8492431640625, + "logps/rejected": -480.3951721191406, + "loss": 0.6934, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0007323266472667456, + "rewards/margins": 0.0006075193523429334, + "rewards/rejected": 0.00012480735313147306, + "step": 18 + }, + { + "epoch": 0.012409176259286472, + "grad_norm": 6.121429252846795, + "learning_rate": 2.4675324675324673e-08, + "logits/chosen": -0.8684954643249512, + "logits/rejected": -0.7898334264755249, + "logps/chosen": -421.7978210449219, + "logps/rejected": -404.7839660644531, + "loss": 0.6934, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0008550119819119573, + "rewards/margins": 0.000968189153354615, + "rewards/rejected": -0.0001131771132349968, + "step": 19 + }, + { + "epoch": 0.013062290799248918, + "grad_norm": 6.339785767547404, + "learning_rate": 2.597402597402597e-08, + "logits/chosen": -0.8148159980773926, + "logits/rejected": -0.8194824457168579, + "logps/chosen": -458.73248291015625, + "logps/rejected": -466.71539306640625, + "loss": 0.6934, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.00043350690975785255, + "rewards/margins": -0.002935664728283882, + "rewards/rejected": 0.002502157585695386, + "step": 20 + }, + { + "epoch": 0.013715405339211364, + "grad_norm": 5.9420982708921235, + "learning_rate": 2.727272727272727e-08, + "logits/chosen": -0.8213762640953064, + "logits/rejected": -0.7422791719436646, + "logps/chosen": -494.7904357910156, + "logps/rejected": -440.2748718261719, + "loss": 0.6933, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.005373182240873575, + "rewards/margins": -0.0054640937596559525, + "rewards/rejected": 9.091137326322496e-05, + "step": 21 + }, + { + "epoch": 0.01436851987917381, + "grad_norm": 7.466100405088569, + "learning_rate": 2.857142857142857e-08, + "logits/chosen": -0.7983863353729248, + "logits/rejected": -0.7951775789260864, + "logps/chosen": -435.16485595703125, + "logps/rejected": -436.73638916015625, + "loss": 0.6923, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0032159374095499516, + "rewards/margins": 0.0017969273030757904, + "rewards/rejected": 0.0014190103393048048, + "step": 22 + }, + { + "epoch": 0.015021634419136256, + "grad_norm": 8.73962952417513, + "learning_rate": 2.987012987012987e-08, + "logits/chosen": -0.7669256925582886, + "logits/rejected": -0.7493557929992676, + "logps/chosen": -468.0224304199219, + "logps/rejected": -496.3000793457031, + "loss": 0.6927, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0022595857735723257, + "rewards/margins": 0.0015146515797823668, + "rewards/rejected": 0.0007449342519976199, + "step": 23 + }, + { + "epoch": 0.015674748959098702, + "grad_norm": 6.272301382957476, + "learning_rate": 3.1168831168831165e-08, + "logits/chosen": -0.8932624459266663, + "logits/rejected": -0.738869845867157, + "logps/chosen": -465.3074035644531, + "logps/rejected": -401.3009948730469, + "loss": 0.6934, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0013363815378397703, + "rewards/margins": -0.0026393865700811148, + "rewards/rejected": 0.0013030050322413445, + "step": 24 + }, + { + "epoch": 0.016327863499061148, + "grad_norm": 6.417594194874059, + "learning_rate": 3.246753246753247e-08, + "logits/chosen": -0.6995882391929626, + "logits/rejected": -0.7010672688484192, + "logps/chosen": -469.766845703125, + "logps/rejected": -522.3395385742188, + "loss": 0.6933, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0015485978219658136, + "rewards/margins": -0.0024081014562398195, + "rewards/rejected": 0.0008595038088969886, + "step": 25 + }, + { + "epoch": 0.016980978039023594, + "grad_norm": 6.367600858757605, + "learning_rate": 3.3766233766233764e-08, + "logits/chosen": -0.9031739234924316, + "logits/rejected": -0.8351707458496094, + "logps/chosen": -466.1416320800781, + "logps/rejected": -425.1600646972656, + "loss": 0.6931, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0006817459943704307, + "rewards/margins": -0.0016306330217048526, + "rewards/rejected": 0.0023123789578676224, + "step": 26 + }, + { + "epoch": 0.01763409257898604, + "grad_norm": 6.118336209796513, + "learning_rate": 3.506493506493507e-08, + "logits/chosen": -0.8085826635360718, + "logits/rejected": -0.7773007750511169, + "logps/chosen": -501.96881103515625, + "logps/rejected": -469.5330810546875, + "loss": 0.6937, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.0007913708686828613, + "rewards/margins": -0.0013373160036280751, + "rewards/rejected": 0.0005459451349452138, + "step": 27 + }, + { + "epoch": 0.018287207118948486, + "grad_norm": 8.745785820519497, + "learning_rate": 3.636363636363636e-08, + "logits/chosen": -0.7346066832542419, + "logits/rejected": -0.7083489894866943, + "logps/chosen": -427.6324768066406, + "logps/rejected": -452.82135009765625, + "loss": 0.6941, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0010156225180253386, + "rewards/margins": -0.000818071304820478, + "rewards/rejected": 0.0018336940556764603, + "step": 28 + }, + { + "epoch": 0.018940321658910932, + "grad_norm": 5.868861469611495, + "learning_rate": 3.766233766233766e-08, + "logits/chosen": -0.8541309833526611, + "logits/rejected": -0.7489965558052063, + "logps/chosen": -494.1967468261719, + "logps/rejected": -451.6412048339844, + "loss": 0.6932, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.002935814904049039, + "rewards/margins": 0.0030139421578496695, + "rewards/rejected": -7.812735566403717e-05, + "step": 29 + }, + { + "epoch": 0.01959343619887338, + "grad_norm": 5.717710270817341, + "learning_rate": 3.896103896103896e-08, + "logits/chosen": -0.876485288143158, + "logits/rejected": -0.8216814398765564, + "logps/chosen": -465.61505126953125, + "logps/rejected": -459.4190673828125, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0021222876384854317, + "rewards/margins": 0.0020624399185180664, + "rewards/rejected": 5.984783638268709e-05, + "step": 30 + }, + { + "epoch": 0.020246550738835824, + "grad_norm": 6.281100624641187, + "learning_rate": 4.025974025974026e-08, + "logits/chosen": -0.8063470125198364, + "logits/rejected": -0.7959135174751282, + "logps/chosen": -491.56195068359375, + "logps/rejected": -522.5487670898438, + "loss": 0.693, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0031092308927327394, + "rewards/margins": 0.004373883828520775, + "rewards/rejected": -0.001264653168618679, + "step": 31 + }, + { + "epoch": 0.02089966527879827, + "grad_norm": 8.396501231198874, + "learning_rate": 4.155844155844156e-08, + "logits/chosen": -0.8137073516845703, + "logits/rejected": -0.7688996195793152, + "logps/chosen": -487.014892578125, + "logps/rejected": -456.2672119140625, + "loss": 0.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.9263464966788888e-05, + "rewards/margins": 0.002564668655395508, + "rewards/rejected": -0.002593932207673788, + "step": 32 + }, + { + "epoch": 0.021552779818760717, + "grad_norm": 6.330099918769276, + "learning_rate": 4.285714285714285e-08, + "logits/chosen": -0.8605762124061584, + "logits/rejected": -0.877137303352356, + "logps/chosen": -533.037841796875, + "logps/rejected": -569.402099609375, + "loss": 0.6925, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0005539129488170147, + "rewards/margins": 0.003927340265363455, + "rewards/rejected": -0.0044812532141804695, + "step": 33 + }, + { + "epoch": 0.022205894358723163, + "grad_norm": 6.7333044733098255, + "learning_rate": 4.4155844155844154e-08, + "logits/chosen": -0.8076195120811462, + "logits/rejected": -0.8017356395721436, + "logps/chosen": -468.20135498046875, + "logps/rejected": -545.4854736328125, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0006238793721422553, + "rewards/margins": 0.001298372633755207, + "rewards/rejected": -0.0019222521223127842, + "step": 34 + }, + { + "epoch": 0.02285900889868561, + "grad_norm": 6.246072765738932, + "learning_rate": 4.545454545454545e-08, + "logits/chosen": -0.9315503835678101, + "logits/rejected": -0.8312739133834839, + "logps/chosen": -506.8136291503906, + "logps/rejected": -494.281982421875, + "loss": 0.6928, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0008896064246073365, + "rewards/margins": 0.0015592812560498714, + "rewards/rejected": -0.0024488880299031734, + "step": 35 + }, + { + "epoch": 0.02351212343864805, + "grad_norm": 6.247743209477438, + "learning_rate": 4.675324675324675e-08, + "logits/chosen": -0.8887131810188293, + "logits/rejected": -0.797776460647583, + "logps/chosen": -443.1953125, + "logps/rejected": -482.0925598144531, + "loss": 0.6926, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.00042538653360679746, + "rewards/margins": 0.0009448549244552851, + "rewards/rejected": -0.0013702415162697434, + "step": 36 + }, + { + "epoch": 0.024165237978610497, + "grad_norm": 6.910073571698978, + "learning_rate": 4.805194805194805e-08, + "logits/chosen": -0.7385239601135254, + "logits/rejected": -0.7428927421569824, + "logps/chosen": -448.6010437011719, + "logps/rejected": -474.8780517578125, + "loss": 0.6936, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0011567330220714211, + "rewards/margins": 0.0007090972503647208, + "rewards/rejected": 0.00044763548066839576, + "step": 37 + }, + { + "epoch": 0.024818352518572943, + "grad_norm": 6.379108387846267, + "learning_rate": 4.9350649350649346e-08, + "logits/chosen": -0.8199461698532104, + "logits/rejected": -0.8401196002960205, + "logps/chosen": -460.55938720703125, + "logps/rejected": -528.7261962890625, + "loss": 0.693, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0014551591593772173, + "rewards/margins": -0.0002124977036146447, + "rewards/rejected": -0.00124266161583364, + "step": 38 + }, + { + "epoch": 0.02547146705853539, + "grad_norm": 8.352936998804818, + "learning_rate": 5.064935064935064e-08, + "logits/chosen": -0.8554872870445251, + "logits/rejected": -0.7549471855163574, + "logps/chosen": -521.4730834960938, + "logps/rejected": -480.829345703125, + "loss": 0.6933, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0004938221536576748, + "rewards/margins": -0.0012401770800352097, + "rewards/rejected": 0.0007463552174158394, + "step": 39 + }, + { + "epoch": 0.026124581598497836, + "grad_norm": 5.6761908104871175, + "learning_rate": 5.194805194805194e-08, + "logits/chosen": -0.816625714302063, + "logits/rejected": -0.7816446423530579, + "logps/chosen": -458.86163330078125, + "logps/rejected": -457.4176025390625, + "loss": 0.6931, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.000727543723769486, + "rewards/margins": 0.002859849948436022, + "rewards/rejected": -0.0021323063410818577, + "step": 40 + }, + { + "epoch": 0.02677769613846028, + "grad_norm": 6.57427304979193, + "learning_rate": 5.324675324675324e-08, + "logits/chosen": -0.8133624792098999, + "logits/rejected": -0.7977904677391052, + "logps/chosen": -475.57403564453125, + "logps/rejected": -483.3184509277344, + "loss": 0.6933, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0008566378382965922, + "rewards/margins": -0.0026502918917685747, + "rewards/rejected": 0.0017936539370566607, + "step": 41 + }, + { + "epoch": 0.027430810678422728, + "grad_norm": 6.268682392513088, + "learning_rate": 5.454545454545454e-08, + "logits/chosen": -0.9565770030021667, + "logits/rejected": -0.905295193195343, + "logps/chosen": -477.0942687988281, + "logps/rejected": -471.25128173828125, + "loss": 0.6933, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.001213481416925788, + "rewards/margins": -0.0002797842025756836, + "rewards/rejected": -0.0009336970397271216, + "step": 42 + }, + { + "epoch": 0.028083925218385174, + "grad_norm": 7.089768213142083, + "learning_rate": 5.584415584415584e-08, + "logits/chosen": -0.7968413829803467, + "logits/rejected": -0.7868589162826538, + "logps/chosen": -415.7538146972656, + "logps/rejected": -465.5323486328125, + "loss": 0.6921, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0018370484467595816, + "rewards/margins": 0.0022343345917761326, + "rewards/rejected": -0.0003972864360548556, + "step": 43 + }, + { + "epoch": 0.02873703975834762, + "grad_norm": 6.217079116264922, + "learning_rate": 5.714285714285714e-08, + "logits/chosen": -0.8787972927093506, + "logits/rejected": -0.8169955015182495, + "logps/chosen": -515.1199951171875, + "logps/rejected": -519.668701171875, + "loss": 0.6923, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.001179876271635294, + "rewards/margins": -0.00044661754509434104, + "rewards/rejected": -0.0007332589011639357, + "step": 44 + }, + { + "epoch": 0.029390154298310066, + "grad_norm": 6.9137881963714936, + "learning_rate": 5.8441558441558434e-08, + "logits/chosen": -0.75499027967453, + "logits/rejected": -0.6955289244651794, + "logps/chosen": -398.29498291015625, + "logps/rejected": -411.081787109375, + "loss": 0.6928, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0006995415897108614, + "rewards/margins": 0.0012732648756355047, + "rewards/rejected": -0.001972806639969349, + "step": 45 + }, + { + "epoch": 0.030043268838272512, + "grad_norm": 6.131893963737681, + "learning_rate": 5.974025974025974e-08, + "logits/chosen": -0.7929679155349731, + "logits/rejected": -0.7688208222389221, + "logps/chosen": -497.57989501953125, + "logps/rejected": -497.4010925292969, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0028578471392393112, + "rewards/margins": -0.0016406627837568521, + "rewards/rejected": -0.0012171841226518154, + "step": 46 + }, + { + "epoch": 0.030696383378234958, + "grad_norm": 6.308846265764008, + "learning_rate": 6.103896103896104e-08, + "logits/chosen": -0.7683753371238708, + "logits/rejected": -0.7528908848762512, + "logps/chosen": -499.39813232421875, + "logps/rejected": -578.732666015625, + "loss": 0.6915, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.002009766176342964, + "rewards/margins": 0.002183923963457346, + "rewards/rejected": -0.00419369013980031, + "step": 47 + }, + { + "epoch": 0.031349497918197404, + "grad_norm": 6.225507507577421, + "learning_rate": 6.233766233766233e-08, + "logits/chosen": -0.891581654548645, + "logits/rejected": -0.8392075896263123, + "logps/chosen": -556.0016479492188, + "logps/rejected": -578.8616943359375, + "loss": 0.6921, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0008191632805392146, + "rewards/margins": 0.0043422505259513855, + "rewards/rejected": -0.005161413922905922, + "step": 48 + }, + { + "epoch": 0.03200261245815985, + "grad_norm": 6.497895076049909, + "learning_rate": 6.363636363636363e-08, + "logits/chosen": -0.8468933701515198, + "logits/rejected": -0.818824291229248, + "logps/chosen": -434.587890625, + "logps/rejected": -489.9275207519531, + "loss": 0.6923, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0039938571862876415, + "rewards/margins": 0.001086747506633401, + "rewards/rejected": -0.005080604460090399, + "step": 49 + }, + { + "epoch": 0.032655726998122296, + "grad_norm": 6.161089510077531, + "learning_rate": 6.493506493506494e-08, + "logits/chosen": -0.8131488561630249, + "logits/rejected": -0.8177019357681274, + "logps/chosen": -430.72418212890625, + "logps/rejected": -458.6739501953125, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0010935901664197445, + "rewards/margins": 0.002127976156771183, + "rewards/rejected": -0.003221566788852215, + "step": 50 + }, + { + "epoch": 0.03330884153808474, + "grad_norm": 6.1640122002418325, + "learning_rate": 6.623376623376622e-08, + "logits/chosen": -0.9181683659553528, + "logits/rejected": -0.9139996767044067, + "logps/chosen": -558.840087890625, + "logps/rejected": -575.5423583984375, + "loss": 0.6931, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.006191587541252375, + "rewards/margins": -0.0004238271212670952, + "rewards/rejected": -0.005767759867012501, + "step": 51 + }, + { + "epoch": 0.03396195607804719, + "grad_norm": 7.623595975761383, + "learning_rate": 6.753246753246753e-08, + "logits/chosen": -0.9138858318328857, + "logits/rejected": -0.8854256868362427, + "logps/chosen": -449.04913330078125, + "logps/rejected": -479.0080261230469, + "loss": 0.6921, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.0006973959389142692, + "rewards/margins": 0.007185818627476692, + "rewards/rejected": -0.007883214391767979, + "step": 52 + }, + { + "epoch": 0.034615070618009634, + "grad_norm": 5.674823286453492, + "learning_rate": 6.883116883116883e-08, + "logits/chosen": -0.8322397470474243, + "logits/rejected": -0.8036131858825684, + "logps/chosen": -499.2374267578125, + "logps/rejected": -474.3294677734375, + "loss": 0.6917, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.002350029768422246, + "rewards/margins": 0.002208204008638859, + "rewards/rejected": -0.004558234475553036, + "step": 53 + }, + { + "epoch": 0.03526818515797208, + "grad_norm": 6.418518626617565, + "learning_rate": 7.012987012987013e-08, + "logits/chosen": -0.9170102477073669, + "logits/rejected": -0.9513068199157715, + "logps/chosen": -508.000244140625, + "logps/rejected": -505.0244140625, + "loss": 0.6928, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.005700020585209131, + "rewards/margins": -0.0018256568582728505, + "rewards/rejected": -0.003874363610520959, + "step": 54 + }, + { + "epoch": 0.035921299697934526, + "grad_norm": 5.982980505552759, + "learning_rate": 7.142857142857142e-08, + "logits/chosen": -0.8587192296981812, + "logits/rejected": -0.8139776587486267, + "logps/chosen": -489.4334716796875, + "logps/rejected": -458.024658203125, + "loss": 0.6923, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.002181756542995572, + "rewards/margins": 0.0015678240451961756, + "rewards/rejected": -0.0037495801225304604, + "step": 55 + }, + { + "epoch": 0.03657441423789697, + "grad_norm": 5.807776924515222, + "learning_rate": 7.272727272727273e-08, + "logits/chosen": -0.721636176109314, + "logits/rejected": -0.6972697973251343, + "logps/chosen": -414.83050537109375, + "logps/rejected": -432.6552734375, + "loss": 0.6926, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.00701476726680994, + "rewards/margins": -0.00036887641181237996, + "rewards/rejected": -0.006645892281085253, + "step": 56 + }, + { + "epoch": 0.03722752877785942, + "grad_norm": 6.888909832263999, + "learning_rate": 7.402597402597403e-08, + "logits/chosen": -0.8523138761520386, + "logits/rejected": -0.8489971160888672, + "logps/chosen": -450.8440246582031, + "logps/rejected": -456.98492431640625, + "loss": 0.6913, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0033631990663707256, + "rewards/margins": -0.00023952008632477373, + "rewards/rejected": -0.0031236789654940367, + "step": 57 + }, + { + "epoch": 0.037880643317821865, + "grad_norm": 5.989946098741673, + "learning_rate": 7.532467532467532e-08, + "logits/chosen": -0.8812742233276367, + "logits/rejected": -0.8634947538375854, + "logps/chosen": -503.89544677734375, + "logps/rejected": -511.4792175292969, + "loss": 0.6928, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.008199996314942837, + "rewards/margins": -0.0015930174849927425, + "rewards/rejected": -0.006606978829950094, + "step": 58 + }, + { + "epoch": 0.03853375785778431, + "grad_norm": 7.748259119189096, + "learning_rate": 7.662337662337662e-08, + "logits/chosen": -0.9024884104728699, + "logits/rejected": -0.8909275531768799, + "logps/chosen": -421.2478332519531, + "logps/rejected": -468.47564697265625, + "loss": 0.6918, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.008577623404562473, + "rewards/margins": 0.0016973968595266342, + "rewards/rejected": -0.010275020264089108, + "step": 59 + }, + { + "epoch": 0.03918687239774676, + "grad_norm": 6.109377238755764, + "learning_rate": 7.792207792207792e-08, + "logits/chosen": -0.8703082203865051, + "logits/rejected": -0.8328035473823547, + "logps/chosen": -481.5205383300781, + "logps/rejected": -503.9502258300781, + "loss": 0.6912, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.004758486524224281, + "rewards/margins": 0.0038297700230032206, + "rewards/rejected": -0.008588257245719433, + "step": 60 + }, + { + "epoch": 0.0398399869377092, + "grad_norm": 5.995806133025539, + "learning_rate": 7.922077922077923e-08, + "logits/chosen": -0.7556477785110474, + "logits/rejected": -0.6751359701156616, + "logps/chosen": -434.9151611328125, + "logps/rejected": -478.38226318359375, + "loss": 0.6923, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.008997991681098938, + "rewards/margins": 0.001691188896074891, + "rewards/rejected": -0.010689180344343185, + "step": 61 + }, + { + "epoch": 0.04049310147767165, + "grad_norm": 6.174429823082506, + "learning_rate": 8.051948051948052e-08, + "logits/chosen": -0.8938629031181335, + "logits/rejected": -0.8404133319854736, + "logps/chosen": -423.1520690917969, + "logps/rejected": -414.3715515136719, + "loss": 0.6922, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0060334536246955395, + "rewards/margins": -0.0003200914361514151, + "rewards/rejected": -0.005713362712413073, + "step": 62 + }, + { + "epoch": 0.041146216017634095, + "grad_norm": 6.807841842313529, + "learning_rate": 8.181818181818182e-08, + "logits/chosen": -0.8384872078895569, + "logits/rejected": -0.7748404741287231, + "logps/chosen": -509.7137145996094, + "logps/rejected": -572.6459350585938, + "loss": 0.6921, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.009534873999655247, + "rewards/margins": 0.00488923117518425, + "rewards/rejected": -0.014424105174839497, + "step": 63 + }, + { + "epoch": 0.04179933055759654, + "grad_norm": 5.919031965066986, + "learning_rate": 8.311688311688312e-08, + "logits/chosen": -0.8469180464744568, + "logits/rejected": -0.7904057502746582, + "logps/chosen": -456.8668212890625, + "logps/rejected": -472.22271728515625, + "loss": 0.6921, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.007433886174112558, + "rewards/margins": 0.0018512008246034384, + "rewards/rejected": -0.009285087697207928, + "step": 64 + }, + { + "epoch": 0.04245244509755899, + "grad_norm": 7.5278828111367, + "learning_rate": 8.441558441558441e-08, + "logits/chosen": -0.8526628017425537, + "logits/rejected": -0.8213850259780884, + "logps/chosen": -491.56292724609375, + "logps/rejected": -523.5179443359375, + "loss": 0.6908, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.009033655747771263, + "rewards/margins": 0.0045729633420705795, + "rewards/rejected": -0.013606620021164417, + "step": 65 + }, + { + "epoch": 0.04310555963752143, + "grad_norm": 5.958334774704796, + "learning_rate": 8.57142857142857e-08, + "logits/chosen": -0.788672924041748, + "logits/rejected": -0.7575796842575073, + "logps/chosen": -404.4598083496094, + "logps/rejected": -429.7571716308594, + "loss": 0.6917, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.009336566552519798, + "rewards/margins": 0.002134217880666256, + "rewards/rejected": -0.01147078350186348, + "step": 66 + }, + { + "epoch": 0.04375867417748388, + "grad_norm": 7.088944587354933, + "learning_rate": 8.7012987012987e-08, + "logits/chosen": -0.7902776002883911, + "logits/rejected": -0.7541419267654419, + "logps/chosen": -469.0950927734375, + "logps/rejected": -450.28570556640625, + "loss": 0.6904, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.008544647134840488, + "rewards/margins": 0.002165439072996378, + "rewards/rejected": -0.010710087604820728, + "step": 67 + }, + { + "epoch": 0.044411788717446325, + "grad_norm": 5.263961432989652, + "learning_rate": 8.831168831168831e-08, + "logits/chosen": -0.821535050868988, + "logits/rejected": -0.7698171138763428, + "logps/chosen": -429.2215576171875, + "logps/rejected": -410.6090393066406, + "loss": 0.6917, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0062387725338339806, + "rewards/margins": 0.004181134980171919, + "rewards/rejected": -0.010419907979667187, + "step": 68 + }, + { + "epoch": 0.04506490325740877, + "grad_norm": 7.083198290201972, + "learning_rate": 8.96103896103896e-08, + "logits/chosen": -0.8009337186813354, + "logits/rejected": -0.7727637887001038, + "logps/chosen": -487.6383972167969, + "logps/rejected": -469.3074035644531, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.010825095698237419, + "rewards/margins": 0.004274988081306219, + "rewards/rejected": -0.015100083313882351, + "step": 69 + }, + { + "epoch": 0.04571801779737122, + "grad_norm": 7.4421814888402755, + "learning_rate": 9.09090909090909e-08, + "logits/chosen": -0.8108646273612976, + "logits/rejected": -0.7533100843429565, + "logps/chosen": -430.077880859375, + "logps/rejected": -439.3136901855469, + "loss": 0.6907, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.01343483291566372, + "rewards/margins": 0.005768372677266598, + "rewards/rejected": -0.019203204661607742, + "step": 70 + }, + { + "epoch": 0.046371132337333656, + "grad_norm": 5.711367581988735, + "learning_rate": 9.22077922077922e-08, + "logits/chosen": -0.7984856367111206, + "logits/rejected": -0.692218542098999, + "logps/chosen": -420.7679443359375, + "logps/rejected": -392.79638671875, + "loss": 0.6916, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.01482747495174408, + "rewards/margins": -0.0020556068047881126, + "rewards/rejected": -0.012771867215633392, + "step": 71 + }, + { + "epoch": 0.0470242468772961, + "grad_norm": 6.490728908525714, + "learning_rate": 9.35064935064935e-08, + "logits/chosen": -0.6769509315490723, + "logits/rejected": -0.669062077999115, + "logps/chosen": -467.250244140625, + "logps/rejected": -452.87518310546875, + "loss": 0.691, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.010883898474276066, + "rewards/margins": 0.005101327784359455, + "rewards/rejected": -0.01598522625863552, + "step": 72 + }, + { + "epoch": 0.04767736141725855, + "grad_norm": 5.9084258412842425, + "learning_rate": 9.48051948051948e-08, + "logits/chosen": -0.800752580165863, + "logits/rejected": -0.809019148349762, + "logps/chosen": -480.45501708984375, + "logps/rejected": -450.3203430175781, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.020324386656284332, + "rewards/margins": -0.002547673648223281, + "rewards/rejected": -0.017776712775230408, + "step": 73 + }, + { + "epoch": 0.048330475957220995, + "grad_norm": 6.771433114214737, + "learning_rate": 9.61038961038961e-08, + "logits/chosen": -0.8106687664985657, + "logits/rejected": -0.7991058230400085, + "logps/chosen": -474.6903991699219, + "logps/rejected": -460.9072265625, + "loss": 0.6902, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.017290135845541954, + "rewards/margins": 0.004097265657037497, + "rewards/rejected": -0.02138740010559559, + "step": 74 + }, + { + "epoch": 0.04898359049718344, + "grad_norm": 6.111034389669093, + "learning_rate": 9.74025974025974e-08, + "logits/chosen": -0.972756028175354, + "logits/rejected": -0.9051934480667114, + "logps/chosen": -499.28411865234375, + "logps/rejected": -504.16778564453125, + "loss": 0.6904, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.011824184097349644, + "rewards/margins": 0.01180923543870449, + "rewards/rejected": -0.02363341674208641, + "step": 75 + }, + { + "epoch": 0.04963670503714589, + "grad_norm": 6.052193857616701, + "learning_rate": 9.870129870129869e-08, + "logits/chosen": -0.8308959603309631, + "logits/rejected": -0.7871338129043579, + "logps/chosen": -451.3121032714844, + "logps/rejected": -426.82366943359375, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.015030915848910809, + "rewards/margins": 0.0026831624563783407, + "rewards/rejected": -0.01771407760679722, + "step": 76 + }, + { + "epoch": 0.05028981957710833, + "grad_norm": 6.393654989952147, + "learning_rate": 1e-07, + "logits/chosen": -0.8764944672584534, + "logits/rejected": -0.823025643825531, + "logps/chosen": -431.4548034667969, + "logps/rejected": -456.610107421875, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01578567549586296, + "rewards/margins": 0.006993000861257315, + "rewards/rejected": -0.022778674960136414, + "step": 77 + }, + { + "epoch": 0.05094293411707078, + "grad_norm": 6.160999400670431, + "learning_rate": 1.0129870129870129e-07, + "logits/chosen": -0.8555769920349121, + "logits/rejected": -0.8144134283065796, + "logps/chosen": -466.14141845703125, + "logps/rejected": -429.2846374511719, + "loss": 0.692, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.019040152430534363, + "rewards/margins": 0.004302749410271645, + "rewards/rejected": -0.023342899978160858, + "step": 78 + }, + { + "epoch": 0.051596048657033225, + "grad_norm": 7.056169927053459, + "learning_rate": 1.0259740259740259e-07, + "logits/chosen": -0.9062113761901855, + "logits/rejected": -0.8697599172592163, + "logps/chosen": -453.70489501953125, + "logps/rejected": -428.64678955078125, + "loss": 0.6906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.017968041822314262, + "rewards/margins": 0.005362831987440586, + "rewards/rejected": -0.023330872878432274, + "step": 79 + }, + { + "epoch": 0.05224916319699567, + "grad_norm": 6.855093023185875, + "learning_rate": 1.0389610389610388e-07, + "logits/chosen": -0.8560048341751099, + "logits/rejected": -0.8402966260910034, + "logps/chosen": -530.1753540039062, + "logps/rejected": -501.68438720703125, + "loss": 0.6894, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.021542314440011978, + "rewards/margins": 0.006968836300075054, + "rewards/rejected": -0.028511153534054756, + "step": 80 + }, + { + "epoch": 0.05290227773695812, + "grad_norm": 7.095472440756087, + "learning_rate": 1.051948051948052e-07, + "logits/chosen": -0.7790235877037048, + "logits/rejected": -0.7337735295295715, + "logps/chosen": -510.4595947265625, + "logps/rejected": -553.8370361328125, + "loss": 0.6896, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.03253030404448509, + "rewards/margins": 0.009851142764091492, + "rewards/rejected": -0.042381446808576584, + "step": 81 + }, + { + "epoch": 0.05355539227692056, + "grad_norm": 5.876457899804008, + "learning_rate": 1.0649350649350648e-07, + "logits/chosen": -0.8381537199020386, + "logits/rejected": -0.8183386921882629, + "logps/chosen": -477.2951965332031, + "logps/rejected": -468.66448974609375, + "loss": 0.6922, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.022215649485588074, + "rewards/margins": 0.0009337332448922098, + "rewards/rejected": -0.023149382323026657, + "step": 82 + }, + { + "epoch": 0.05420850681688301, + "grad_norm": 8.317291551451623, + "learning_rate": 1.0779220779220779e-07, + "logits/chosen": -0.8544670343399048, + "logits/rejected": -0.8863725066184998, + "logps/chosen": -600.6954345703125, + "logps/rejected": -707.2957153320312, + "loss": 0.6885, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.039892133325338364, + "rewards/margins": 0.01902475208044052, + "rewards/rejected": -0.05891688913106918, + "step": 83 + }, + { + "epoch": 0.054861621356845455, + "grad_norm": 6.21282343438225, + "learning_rate": 1.0909090909090908e-07, + "logits/chosen": -0.8715451955795288, + "logits/rejected": -0.9091987609863281, + "logps/chosen": -466.81402587890625, + "logps/rejected": -482.4119873046875, + "loss": 0.6894, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.02122860960662365, + "rewards/margins": 0.0019442938501015306, + "rewards/rejected": -0.023172900080680847, + "step": 84 + }, + { + "epoch": 0.0555147358968079, + "grad_norm": 5.796572771273212, + "learning_rate": 1.1038961038961039e-07, + "logits/chosen": -0.8712518215179443, + "logits/rejected": -0.8297672271728516, + "logps/chosen": -540.7307739257812, + "logps/rejected": -499.5742492675781, + "loss": 0.6913, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.025148704648017883, + "rewards/margins": 0.0031996634788811207, + "rewards/rejected": -0.028348371386528015, + "step": 85 + }, + { + "epoch": 0.05616785043677035, + "grad_norm": 6.308588424445465, + "learning_rate": 1.1168831168831168e-07, + "logits/chosen": -0.8423592448234558, + "logits/rejected": -0.8116931319236755, + "logps/chosen": -547.5648193359375, + "logps/rejected": -553.7283325195312, + "loss": 0.6886, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.028540167957544327, + "rewards/margins": 0.01041246484965086, + "rewards/rejected": -0.03895263373851776, + "step": 86 + }, + { + "epoch": 0.05682096497673279, + "grad_norm": 6.03429981202279, + "learning_rate": 1.1298701298701299e-07, + "logits/chosen": -0.8049710392951965, + "logits/rejected": -0.7904379367828369, + "logps/chosen": -479.36773681640625, + "logps/rejected": -507.9740295410156, + "loss": 0.6888, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.03078952059149742, + "rewards/margins": 0.009615332819521427, + "rewards/rejected": -0.040404852479696274, + "step": 87 + }, + { + "epoch": 0.05747407951669524, + "grad_norm": 6.144437437040281, + "learning_rate": 1.1428571428571427e-07, + "logits/chosen": -0.7853479385375977, + "logits/rejected": -0.7315254211425781, + "logps/chosen": -455.5406188964844, + "logps/rejected": -487.3783264160156, + "loss": 0.6904, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.027616048231720924, + "rewards/margins": 0.006320532876998186, + "rewards/rejected": -0.033936578780412674, + "step": 88 + }, + { + "epoch": 0.058127194056657686, + "grad_norm": 5.933687262032515, + "learning_rate": 1.1558441558441558e-07, + "logits/chosen": -0.7603040933609009, + "logits/rejected": -0.7568442821502686, + "logps/chosen": -436.42315673828125, + "logps/rejected": -479.44122314453125, + "loss": 0.6905, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.028719400987029076, + "rewards/margins": 0.008343620225787163, + "rewards/rejected": -0.03706301748752594, + "step": 89 + }, + { + "epoch": 0.05878030859662013, + "grad_norm": 6.521345864177329, + "learning_rate": 1.1688311688311687e-07, + "logits/chosen": -0.7553395628929138, + "logits/rejected": -0.7436259388923645, + "logps/chosen": -538.7774047851562, + "logps/rejected": -584.1131591796875, + "loss": 0.6879, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.03026486374437809, + "rewards/margins": 0.010531079024076462, + "rewards/rejected": -0.0407959409058094, + "step": 90 + }, + { + "epoch": 0.05943342313658258, + "grad_norm": 6.090184904322082, + "learning_rate": 1.1818181818181818e-07, + "logits/chosen": -0.9382575154304504, + "logits/rejected": -0.9285158514976501, + "logps/chosen": -448.5179443359375, + "logps/rejected": -442.5186462402344, + "loss": 0.6886, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.03046746551990509, + "rewards/margins": 0.004350304137915373, + "rewards/rejected": -0.03481777012348175, + "step": 91 + }, + { + "epoch": 0.060086537676545024, + "grad_norm": 6.195989030580817, + "learning_rate": 1.1948051948051947e-07, + "logits/chosen": -0.9594197273254395, + "logits/rejected": -0.9092954397201538, + "logps/chosen": -488.979736328125, + "logps/rejected": -459.5779113769531, + "loss": 0.6891, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.031389351934194565, + "rewards/margins": 0.0023029684089124203, + "rewards/rejected": -0.03369232267141342, + "step": 92 + }, + { + "epoch": 0.06073965221650747, + "grad_norm": 6.679170721604982, + "learning_rate": 1.207792207792208e-07, + "logits/chosen": -0.8515009880065918, + "logits/rejected": -0.8676334619522095, + "logps/chosen": -440.25531005859375, + "logps/rejected": -465.7403564453125, + "loss": 0.6878, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.033117592334747314, + "rewards/margins": 0.022125840187072754, + "rewards/rejected": -0.05524343252182007, + "step": 93 + }, + { + "epoch": 0.061392766756469916, + "grad_norm": 7.826182713581807, + "learning_rate": 1.2207792207792208e-07, + "logits/chosen": -0.8225115537643433, + "logits/rejected": -0.8626826405525208, + "logps/chosen": -454.4405822753906, + "logps/rejected": -567.5311279296875, + "loss": 0.6824, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030026568099856377, + "rewards/margins": 0.03317389637231827, + "rewards/rejected": -0.0632004663348198, + "step": 94 + }, + { + "epoch": 0.06204588129643236, + "grad_norm": 6.19437116817884, + "learning_rate": 1.2337662337662337e-07, + "logits/chosen": -0.92811518907547, + "logits/rejected": -0.9229671359062195, + "logps/chosen": -429.3043212890625, + "logps/rejected": -448.91278076171875, + "loss": 0.6874, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03139014542102814, + "rewards/margins": 0.01759587787091732, + "rewards/rejected": -0.04898602515459061, + "step": 95 + }, + { + "epoch": 0.06269899583639481, + "grad_norm": 6.410822303042916, + "learning_rate": 1.2467532467532466e-07, + "logits/chosen": -0.7579972147941589, + "logits/rejected": -0.749883234500885, + "logps/chosen": -426.28485107421875, + "logps/rejected": -437.2420654296875, + "loss": 0.6868, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.028881024569272995, + "rewards/margins": 0.018690448254346848, + "rewards/rejected": -0.04757147282361984, + "step": 96 + }, + { + "epoch": 0.06335211037635725, + "grad_norm": 6.669657257596277, + "learning_rate": 1.2597402597402597e-07, + "logits/chosen": -0.8033837080001831, + "logits/rejected": -0.7326114177703857, + "logps/chosen": -448.3200378417969, + "logps/rejected": -457.5045471191406, + "loss": 0.685, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.035421933978796005, + "rewards/margins": 0.008486203849315643, + "rewards/rejected": -0.04390813410282135, + "step": 97 + }, + { + "epoch": 0.0640052249163197, + "grad_norm": 6.214717449042044, + "learning_rate": 1.2727272727272726e-07, + "logits/chosen": -0.8324880003929138, + "logits/rejected": -0.8319780230522156, + "logps/chosen": -511.32891845703125, + "logps/rejected": -537.2152099609375, + "loss": 0.6885, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.047659579664468765, + "rewards/margins": 0.014243985526263714, + "rewards/rejected": -0.06190356984734535, + "step": 98 + }, + { + "epoch": 0.06465833945628215, + "grad_norm": 6.031504782803402, + "learning_rate": 1.2857142857142858e-07, + "logits/chosen": -0.8415837287902832, + "logits/rejected": -0.817263126373291, + "logps/chosen": -524.89501953125, + "logps/rejected": -573.4385375976562, + "loss": 0.6886, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.04896525666117668, + "rewards/margins": 0.025869663804769516, + "rewards/rejected": -0.0748349204659462, + "step": 99 + }, + { + "epoch": 0.06531145399624459, + "grad_norm": 6.485913528950639, + "learning_rate": 1.2987012987012987e-07, + "logits/chosen": -0.7721492052078247, + "logits/rejected": -0.6845570802688599, + "logps/chosen": -489.182861328125, + "logps/rejected": -462.3575439453125, + "loss": 0.686, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.05609467625617981, + "rewards/margins": 0.005238112062215805, + "rewards/rejected": -0.06133279576897621, + "step": 100 + }, + { + "epoch": 0.06531145399624459, + "eval_logits/chosen": -0.7427287697792053, + "eval_logits/rejected": -0.7034040093421936, + "eval_logps/chosen": -478.8180847167969, + "eval_logps/rejected": -471.3314514160156, + "eval_loss": 0.6856335997581482, + "eval_rewards/accuracies": 0.6480000019073486, + "eval_rewards/chosen": -0.04910915717482567, + "eval_rewards/margins": 0.012519298121333122, + "eval_rewards/rejected": -0.06162845715880394, + "eval_runtime": 615.2872, + "eval_samples_per_second": 6.501, + "eval_steps_per_second": 0.406, + "step": 100 + }, + { + "epoch": 0.06596456853620704, + "grad_norm": 6.288509399205162, + "learning_rate": 1.3116883116883116e-07, + "logits/chosen": -0.8475708961486816, + "logits/rejected": -0.8312665224075317, + "logps/chosen": -409.4945068359375, + "logps/rejected": -514.4505004882812, + "loss": 0.6861, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.036619801074266434, + "rewards/margins": 0.03393262252211571, + "rewards/rejected": -0.07055243104696274, + "step": 101 + }, + { + "epoch": 0.06661768307616948, + "grad_norm": 8.43910713373062, + "learning_rate": 1.3246753246753245e-07, + "logits/chosen": -0.8536262512207031, + "logits/rejected": -0.7814661264419556, + "logps/chosen": -474.07562255859375, + "logps/rejected": -504.9327697753906, + "loss": 0.6803, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.041596654802560806, + "rewards/margins": 0.02779841423034668, + "rewards/rejected": -0.06939506530761719, + "step": 102 + }, + { + "epoch": 0.06727079761613193, + "grad_norm": 5.9455668904408405, + "learning_rate": 1.3376623376623374e-07, + "logits/chosen": -0.8607446551322937, + "logits/rejected": -0.8579328656196594, + "logps/chosen": -480.89239501953125, + "logps/rejected": -477.357177734375, + "loss": 0.6889, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04734374210238457, + "rewards/margins": 0.0077428012154996395, + "rewards/rejected": -0.05508654564619064, + "step": 103 + }, + { + "epoch": 0.06792391215609438, + "grad_norm": 7.084091083186374, + "learning_rate": 1.3506493506493506e-07, + "logits/chosen": -0.7221732139587402, + "logits/rejected": -0.7565256953239441, + "logps/chosen": -453.64556884765625, + "logps/rejected": -535.10986328125, + "loss": 0.6872, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.05151586979627609, + "rewards/margins": 0.03345106542110443, + "rewards/rejected": -0.08496693521738052, + "step": 104 + }, + { + "epoch": 0.06857702669605682, + "grad_norm": 7.518260078445051, + "learning_rate": 1.3636363636363635e-07, + "logits/chosen": -0.8677546381950378, + "logits/rejected": -0.8008454442024231, + "logps/chosen": -466.8918151855469, + "logps/rejected": -481.9544982910156, + "loss": 0.6826, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.0516490712761879, + "rewards/margins": 0.025956520810723305, + "rewards/rejected": -0.07760559022426605, + "step": 105 + }, + { + "epoch": 0.06923014123601927, + "grad_norm": 6.820450121720659, + "learning_rate": 1.3766233766233766e-07, + "logits/chosen": -0.7937129735946655, + "logits/rejected": -0.7826783657073975, + "logps/chosen": -448.1839599609375, + "logps/rejected": -473.87188720703125, + "loss": 0.6822, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0625004917383194, + "rewards/margins": 0.016829611733555794, + "rewards/rejected": -0.07933010160923004, + "step": 106 + }, + { + "epoch": 0.06988325577598171, + "grad_norm": 7.2075078702746715, + "learning_rate": 1.3896103896103895e-07, + "logits/chosen": -0.9469939470291138, + "logits/rejected": -0.9710195064544678, + "logps/chosen": -528.0358276367188, + "logps/rejected": -546.5836791992188, + "loss": 0.6826, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06540645658969879, + "rewards/margins": 0.015278171747922897, + "rewards/rejected": -0.08068463206291199, + "step": 107 + }, + { + "epoch": 0.07053637031594416, + "grad_norm": 5.859716807084932, + "learning_rate": 1.4025974025974027e-07, + "logits/chosen": -0.9260960817337036, + "logits/rejected": -0.9055756330490112, + "logps/chosen": -437.0494079589844, + "logps/rejected": -446.2071533203125, + "loss": 0.6855, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.059963203966617584, + "rewards/margins": 0.01329735480248928, + "rewards/rejected": -0.07326056808233261, + "step": 108 + }, + { + "epoch": 0.0711894848559066, + "grad_norm": 6.594713572620182, + "learning_rate": 1.4155844155844153e-07, + "logits/chosen": -0.8733989000320435, + "logits/rejected": -0.8238499164581299, + "logps/chosen": -490.5877380371094, + "logps/rejected": -517.9608764648438, + "loss": 0.6851, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.07540058344602585, + "rewards/margins": 0.01694507710635662, + "rewards/rejected": -0.09234566241502762, + "step": 109 + }, + { + "epoch": 0.07184259939586905, + "grad_norm": 5.977398778954903, + "learning_rate": 1.4285714285714285e-07, + "logits/chosen": -0.6950225830078125, + "logits/rejected": -0.7042300701141357, + "logps/chosen": -360.7035217285156, + "logps/rejected": -361.4801330566406, + "loss": 0.6849, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.05551227554678917, + "rewards/margins": 0.004719756543636322, + "rewards/rejected": -0.06023203581571579, + "step": 110 + }, + { + "epoch": 0.0724957139358315, + "grad_norm": 6.145204957324782, + "learning_rate": 1.4415584415584414e-07, + "logits/chosen": -0.7611751556396484, + "logits/rejected": -0.7577171325683594, + "logps/chosen": -493.44537353515625, + "logps/rejected": -515.5286254882812, + "loss": 0.6834, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.08799681067466736, + "rewards/margins": 0.019383134320378304, + "rewards/rejected": -0.10737993568181992, + "step": 111 + }, + { + "epoch": 0.07314882847579394, + "grad_norm": 6.213137266301709, + "learning_rate": 1.4545454545454545e-07, + "logits/chosen": -0.8211960792541504, + "logits/rejected": -0.7990384697914124, + "logps/chosen": -427.08642578125, + "logps/rejected": -450.64056396484375, + "loss": 0.6845, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0742255300283432, + "rewards/margins": 0.0058108242228627205, + "rewards/rejected": -0.08003635704517365, + "step": 112 + }, + { + "epoch": 0.07380194301575639, + "grad_norm": 6.315726030814639, + "learning_rate": 1.4675324675324674e-07, + "logits/chosen": -0.7984048128128052, + "logits/rejected": -0.7764061689376831, + "logps/chosen": -471.4085693359375, + "logps/rejected": -501.5404052734375, + "loss": 0.6777, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07434587180614471, + "rewards/margins": 0.03265540301799774, + "rewards/rejected": -0.10700127482414246, + "step": 113 + }, + { + "epoch": 0.07445505755571884, + "grad_norm": 7.263965754945047, + "learning_rate": 1.4805194805194806e-07, + "logits/chosen": -0.7634120583534241, + "logits/rejected": -0.7683815360069275, + "logps/chosen": -467.3013916015625, + "logps/rejected": -447.38470458984375, + "loss": 0.6804, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.06479562073945999, + "rewards/margins": 0.022856786847114563, + "rewards/rejected": -0.08765240758657455, + "step": 114 + }, + { + "epoch": 0.07510817209568128, + "grad_norm": 6.8685173621443125, + "learning_rate": 1.4935064935064935e-07, + "logits/chosen": -0.7747349143028259, + "logits/rejected": -0.7697080373764038, + "logps/chosen": -427.6845703125, + "logps/rejected": -428.98291015625, + "loss": 0.6787, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.07621235400438309, + "rewards/margins": 0.010828008875250816, + "rewards/rejected": -0.08704036474227905, + "step": 115 + }, + { + "epoch": 0.07576128663564373, + "grad_norm": 6.505431738842133, + "learning_rate": 1.5064935064935064e-07, + "logits/chosen": -0.9600522518157959, + "logits/rejected": -0.9311190843582153, + "logps/chosen": -488.33428955078125, + "logps/rejected": -485.6419677734375, + "loss": 0.68, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.06629984080791473, + "rewards/margins": 0.011177876964211464, + "rewards/rejected": -0.07747771590948105, + "step": 116 + }, + { + "epoch": 0.07641440117560618, + "grad_norm": 7.44390451084911, + "learning_rate": 1.5194805194805193e-07, + "logits/chosen": -0.7398961782455444, + "logits/rejected": -0.7255181670188904, + "logps/chosen": -511.2425231933594, + "logps/rejected": -504.2010803222656, + "loss": 0.6798, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07773499935865402, + "rewards/margins": 0.02082427777349949, + "rewards/rejected": -0.09855926781892776, + "step": 117 + }, + { + "epoch": 0.07706751571556862, + "grad_norm": 5.877894495278698, + "learning_rate": 1.5324675324675324e-07, + "logits/chosen": -0.8150400519371033, + "logits/rejected": -0.7478416562080383, + "logps/chosen": -445.16583251953125, + "logps/rejected": -471.5647888183594, + "loss": 0.6848, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08694851398468018, + "rewards/margins": 0.02347085252404213, + "rewards/rejected": -0.110419362783432, + "step": 118 + }, + { + "epoch": 0.07772063025553107, + "grad_norm": 6.184245731480456, + "learning_rate": 1.5454545454545453e-07, + "logits/chosen": -0.9232463836669922, + "logits/rejected": -0.8822212219238281, + "logps/chosen": -441.2488098144531, + "logps/rejected": -428.74822998046875, + "loss": 0.6788, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07863913476467133, + "rewards/margins": 0.019341323524713516, + "rewards/rejected": -0.09798046201467514, + "step": 119 + }, + { + "epoch": 0.07837374479549351, + "grad_norm": 6.059601097022327, + "learning_rate": 1.5584415584415585e-07, + "logits/chosen": -0.7953625917434692, + "logits/rejected": -0.7896191477775574, + "logps/chosen": -454.513671875, + "logps/rejected": -499.6773681640625, + "loss": 0.6858, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.09819990396499634, + "rewards/margins": 0.030399596318602562, + "rewards/rejected": -0.12859950959682465, + "step": 120 + }, + { + "epoch": 0.07902685933545596, + "grad_norm": 6.332138116216839, + "learning_rate": 1.5714285714285714e-07, + "logits/chosen": -0.8561302423477173, + "logits/rejected": -0.8522703647613525, + "logps/chosen": -545.7410888671875, + "logps/rejected": -648.7020874023438, + "loss": 0.6739, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.13791513442993164, + "rewards/margins": 0.099464550614357, + "rewards/rejected": -0.23737969994544983, + "step": 121 + }, + { + "epoch": 0.0796799738754184, + "grad_norm": 6.533955823119497, + "learning_rate": 1.5844155844155846e-07, + "logits/chosen": -0.9354885220527649, + "logits/rejected": -0.8845950961112976, + "logps/chosen": -550.8383178710938, + "logps/rejected": -526.287109375, + "loss": 0.6829, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.08527510613203049, + "rewards/margins": 0.020781541243195534, + "rewards/rejected": -0.10605664551258087, + "step": 122 + }, + { + "epoch": 0.08033308841538085, + "grad_norm": 6.228795518907741, + "learning_rate": 1.5974025974025972e-07, + "logits/chosen": -0.8380718231201172, + "logits/rejected": -0.7975863218307495, + "logps/chosen": -441.62408447265625, + "logps/rejected": -465.06524658203125, + "loss": 0.6811, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09198759496212006, + "rewards/margins": 0.05546523258090019, + "rewards/rejected": -0.14745283126831055, + "step": 123 + }, + { + "epoch": 0.0809862029553433, + "grad_norm": 6.98012118568781, + "learning_rate": 1.6103896103896104e-07, + "logits/chosen": -0.9168799519538879, + "logits/rejected": -0.8905295729637146, + "logps/chosen": -487.2784118652344, + "logps/rejected": -439.59735107421875, + "loss": 0.6846, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.07767526805400848, + "rewards/margins": 0.01843424327671528, + "rewards/rejected": -0.09610950946807861, + "step": 124 + }, + { + "epoch": 0.08163931749530574, + "grad_norm": 7.3021683779348265, + "learning_rate": 1.6233766233766232e-07, + "logits/chosen": -0.873420000076294, + "logits/rejected": -0.8396754264831543, + "logps/chosen": -549.7445678710938, + "logps/rejected": -550.0377197265625, + "loss": 0.6809, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13183246552944183, + "rewards/margins": 0.015030565671622753, + "rewards/rejected": -0.14686302840709686, + "step": 125 + }, + { + "epoch": 0.08229243203526819, + "grad_norm": 6.911212156499355, + "learning_rate": 1.6363636363636364e-07, + "logits/chosen": -0.8185966610908508, + "logits/rejected": -0.8198251724243164, + "logps/chosen": -501.91595458984375, + "logps/rejected": -536.2969970703125, + "loss": 0.6708, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1114685982465744, + "rewards/margins": 0.05052608996629715, + "rewards/rejected": -0.16199469566345215, + "step": 126 + }, + { + "epoch": 0.08294554657523064, + "grad_norm": 7.712937728537032, + "learning_rate": 1.6493506493506493e-07, + "logits/chosen": -0.8163201808929443, + "logits/rejected": -0.7817766666412354, + "logps/chosen": -549.0859985351562, + "logps/rejected": -532.6451416015625, + "loss": 0.671, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1389622688293457, + "rewards/margins": 0.05282050371170044, + "rewards/rejected": -0.19178277254104614, + "step": 127 + }, + { + "epoch": 0.08359866111519308, + "grad_norm": 6.718382214474356, + "learning_rate": 1.6623376623376625e-07, + "logits/chosen": -0.9383991360664368, + "logits/rejected": -0.9336289167404175, + "logps/chosen": -490.7161865234375, + "logps/rejected": -474.29144287109375, + "loss": 0.6741, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09263370931148529, + "rewards/margins": 0.04221198707818985, + "rewards/rejected": -0.13484567403793335, + "step": 128 + }, + { + "epoch": 0.08425177565515553, + "grad_norm": 5.961800672060285, + "learning_rate": 1.6753246753246754e-07, + "logits/chosen": -0.878481388092041, + "logits/rejected": -0.7739174365997314, + "logps/chosen": -474.29010009765625, + "logps/rejected": -467.60345458984375, + "loss": 0.6778, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15530624985694885, + "rewards/margins": 0.038683634251356125, + "rewards/rejected": -0.19398987293243408, + "step": 129 + }, + { + "epoch": 0.08490489019511797, + "grad_norm": 6.6363324619995385, + "learning_rate": 1.6883116883116883e-07, + "logits/chosen": -0.9586436748504639, + "logits/rejected": -0.9272456169128418, + "logps/chosen": -548.6116943359375, + "logps/rejected": -552.0598754882812, + "loss": 0.6822, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12150692194700241, + "rewards/margins": 0.045128632336854935, + "rewards/rejected": -0.16663555800914764, + "step": 130 + }, + { + "epoch": 0.08555800473508042, + "grad_norm": 7.156351873248016, + "learning_rate": 1.7012987012987012e-07, + "logits/chosen": -0.8477173447608948, + "logits/rejected": -0.8483027219772339, + "logps/chosen": -464.6786193847656, + "logps/rejected": -552.1746826171875, + "loss": 0.6671, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13094085454940796, + "rewards/margins": 0.07437079399824142, + "rewards/rejected": -0.2053116410970688, + "step": 131 + }, + { + "epoch": 0.08621111927504287, + "grad_norm": 11.567562751248682, + "learning_rate": 1.714285714285714e-07, + "logits/chosen": -0.810928463935852, + "logits/rejected": -0.8170727491378784, + "logps/chosen": -473.8848571777344, + "logps/rejected": -483.2086486816406, + "loss": 0.6811, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11197689175605774, + "rewards/margins": 0.042106710374355316, + "rewards/rejected": -0.15408360958099365, + "step": 132 + }, + { + "epoch": 0.08686423381500531, + "grad_norm": 6.666536354703348, + "learning_rate": 1.7272727272727272e-07, + "logits/chosen": -0.8156629204750061, + "logits/rejected": -0.7771432399749756, + "logps/chosen": -588.0108032226562, + "logps/rejected": -608.7498168945312, + "loss": 0.6779, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.20428195595741272, + "rewards/margins": 0.08440540730953217, + "rewards/rejected": -0.2886873781681061, + "step": 133 + }, + { + "epoch": 0.08751734835496776, + "grad_norm": 7.1435262883808734, + "learning_rate": 1.74025974025974e-07, + "logits/chosen": -0.774854302406311, + "logits/rejected": -0.7982902526855469, + "logps/chosen": -470.2498474121094, + "logps/rejected": -583.7744750976562, + "loss": 0.6698, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1737588495016098, + "rewards/margins": 0.1177835762500763, + "rewards/rejected": -0.2915424108505249, + "step": 134 + }, + { + "epoch": 0.0881704628949302, + "grad_norm": 6.2313339803498655, + "learning_rate": 1.7532467532467533e-07, + "logits/chosen": -0.8765559196472168, + "logits/rejected": -0.8805602192878723, + "logps/chosen": -469.5799255371094, + "logps/rejected": -518.7642822265625, + "loss": 0.6744, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.15586817264556885, + "rewards/margins": 0.038987912237644196, + "rewards/rejected": -0.19485607743263245, + "step": 135 + }, + { + "epoch": 0.08882357743489265, + "grad_norm": 6.745544786449858, + "learning_rate": 1.7662337662337662e-07, + "logits/chosen": -0.8797418475151062, + "logits/rejected": -0.8522154688835144, + "logps/chosen": -551.00146484375, + "logps/rejected": -525.474609375, + "loss": 0.6662, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.19286242127418518, + "rewards/margins": 0.05776097998023033, + "rewards/rejected": -0.2506234049797058, + "step": 136 + }, + { + "epoch": 0.0894766919748551, + "grad_norm": 7.32189969799847, + "learning_rate": 1.779220779220779e-07, + "logits/chosen": -0.9289051294326782, + "logits/rejected": -0.9343584179878235, + "logps/chosen": -466.8741455078125, + "logps/rejected": -475.151611328125, + "loss": 0.6611, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1371385008096695, + "rewards/margins": 0.011942176148295403, + "rewards/rejected": -0.14908067882061005, + "step": 137 + }, + { + "epoch": 0.09012980651481754, + "grad_norm": 6.847931231483095, + "learning_rate": 1.792207792207792e-07, + "logits/chosen": -0.8206534385681152, + "logits/rejected": -0.8144592642784119, + "logps/chosen": -501.97467041015625, + "logps/rejected": -535.951904296875, + "loss": 0.6776, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1563834846019745, + "rewards/margins": 0.054855406284332275, + "rewards/rejected": -0.21123890578746796, + "step": 138 + }, + { + "epoch": 0.09078292105477999, + "grad_norm": 6.944669833446691, + "learning_rate": 1.805194805194805e-07, + "logits/chosen": -0.8529220819473267, + "logits/rejected": -0.8208853006362915, + "logps/chosen": -459.8351135253906, + "logps/rejected": -492.44573974609375, + "loss": 0.6773, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1605282872915268, + "rewards/margins": 0.03590104728937149, + "rewards/rejected": -0.19642934203147888, + "step": 139 + }, + { + "epoch": 0.09143603559474243, + "grad_norm": 6.928164394224459, + "learning_rate": 1.818181818181818e-07, + "logits/chosen": -0.7550444006919861, + "logits/rejected": -0.7985954880714417, + "logps/chosen": -370.5205078125, + "logps/rejected": -476.3122253417969, + "loss": 0.6676, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.14772818982601166, + "rewards/margins": 0.13047771155834198, + "rewards/rejected": -0.27820590138435364, + "step": 140 + }, + { + "epoch": 0.09208915013470488, + "grad_norm": 6.968051505403654, + "learning_rate": 1.8311688311688312e-07, + "logits/chosen": -0.8749439716339111, + "logits/rejected": -0.8448376655578613, + "logps/chosen": -494.04486083984375, + "logps/rejected": -511.3379211425781, + "loss": 0.6682, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2082904875278473, + "rewards/margins": 0.04293321818113327, + "rewards/rejected": -0.25122371315956116, + "step": 141 + }, + { + "epoch": 0.09274226467466731, + "grad_norm": 6.287379820512906, + "learning_rate": 1.844155844155844e-07, + "logits/chosen": -0.8604624271392822, + "logits/rejected": -0.8241639137268066, + "logps/chosen": -480.1129455566406, + "logps/rejected": -433.39208984375, + "loss": 0.6757, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.15264149010181427, + "rewards/margins": 0.027708852663636208, + "rewards/rejected": -0.18035033345222473, + "step": 142 + }, + { + "epoch": 0.09339537921462976, + "grad_norm": 6.399592718147689, + "learning_rate": 1.8571428571428572e-07, + "logits/chosen": -0.9117648601531982, + "logits/rejected": -0.8506286144256592, + "logps/chosen": -496.1771240234375, + "logps/rejected": -476.2972106933594, + "loss": 0.6791, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.19126328825950623, + "rewards/margins": 0.022945519536733627, + "rewards/rejected": -0.21420881152153015, + "step": 143 + }, + { + "epoch": 0.0940484937545922, + "grad_norm": 6.323941767214625, + "learning_rate": 1.87012987012987e-07, + "logits/chosen": -0.9264488816261292, + "logits/rejected": -0.9081891775131226, + "logps/chosen": -481.20404052734375, + "logps/rejected": -480.6315612792969, + "loss": 0.6782, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1617911458015442, + "rewards/margins": 0.03854794800281525, + "rewards/rejected": -0.20033907890319824, + "step": 144 + }, + { + "epoch": 0.09470160829455465, + "grad_norm": 6.647653521405033, + "learning_rate": 1.883116883116883e-07, + "logits/chosen": -0.8386011123657227, + "logits/rejected": -0.7837764024734497, + "logps/chosen": -458.1123352050781, + "logps/rejected": -543.1805419921875, + "loss": 0.6695, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.21621732413768768, + "rewards/margins": 0.12856005132198334, + "rewards/rejected": -0.34477734565734863, + "step": 145 + }, + { + "epoch": 0.0953547228345171, + "grad_norm": 7.22549122734511, + "learning_rate": 1.896103896103896e-07, + "logits/chosen": -0.7894729375839233, + "logits/rejected": -0.7863112092018127, + "logps/chosen": -535.9451904296875, + "logps/rejected": -595.9136352539062, + "loss": 0.6751, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25997599959373474, + "rewards/margins": 0.13046929240226746, + "rewards/rejected": -0.3904453217983246, + "step": 146 + }, + { + "epoch": 0.09600783737447954, + "grad_norm": 6.515339257767948, + "learning_rate": 1.909090909090909e-07, + "logits/chosen": -0.8731563091278076, + "logits/rejected": -0.8720945119857788, + "logps/chosen": -479.39776611328125, + "logps/rejected": -538.8330078125, + "loss": 0.6701, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.22556887567043304, + "rewards/margins": 0.12516018748283386, + "rewards/rejected": -0.3507290482521057, + "step": 147 + }, + { + "epoch": 0.09666095191444199, + "grad_norm": 6.422766757760342, + "learning_rate": 1.922077922077922e-07, + "logits/chosen": -0.9519265294075012, + "logits/rejected": -0.9368531107902527, + "logps/chosen": -473.68560791015625, + "logps/rejected": -474.37939453125, + "loss": 0.6683, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18442487716674805, + "rewards/margins": 0.029139067977666855, + "rewards/rejected": -0.213563933968544, + "step": 148 + }, + { + "epoch": 0.09731406645440444, + "grad_norm": 6.65435406028437, + "learning_rate": 1.9350649350649352e-07, + "logits/chosen": -0.9168094396591187, + "logits/rejected": -0.9032158851623535, + "logps/chosen": -461.6015930175781, + "logps/rejected": -425.8824462890625, + "loss": 0.6689, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2183716893196106, + "rewards/margins": 0.02544441632926464, + "rewards/rejected": -0.24381610751152039, + "step": 149 + }, + { + "epoch": 0.09796718099436688, + "grad_norm": 6.9916618084619175, + "learning_rate": 1.948051948051948e-07, + "logits/chosen": -0.7993723154067993, + "logits/rejected": -0.7403357625007629, + "logps/chosen": -549.0083618164062, + "logps/rejected": -495.74847412109375, + "loss": 0.6565, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.2393510341644287, + "rewards/margins": 0.03217237442731857, + "rewards/rejected": -0.2715234160423279, + "step": 150 + }, + { + "epoch": 0.09862029553432933, + "grad_norm": 6.002844245053828, + "learning_rate": 1.961038961038961e-07, + "logits/chosen": -0.8811283707618713, + "logits/rejected": -0.8406999111175537, + "logps/chosen": -448.83868408203125, + "logps/rejected": -474.09320068359375, + "loss": 0.667, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.17506016790866852, + "rewards/margins": 0.10420884937047958, + "rewards/rejected": -0.2792690396308899, + "step": 151 + }, + { + "epoch": 0.09927341007429177, + "grad_norm": 6.898188356647436, + "learning_rate": 1.9740259740259739e-07, + "logits/chosen": -0.9522019028663635, + "logits/rejected": -0.9420698881149292, + "logps/chosen": -476.7374267578125, + "logps/rejected": -501.6094055175781, + "loss": 0.6578, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22090810537338257, + "rewards/margins": 0.062128640711307526, + "rewards/rejected": -0.2830367684364319, + "step": 152 + }, + { + "epoch": 0.09992652461425422, + "grad_norm": 6.4452295203644825, + "learning_rate": 1.987012987012987e-07, + "logits/chosen": -0.9114395380020142, + "logits/rejected": -0.897411584854126, + "logps/chosen": -495.1845397949219, + "logps/rejected": -469.0250244140625, + "loss": 0.6644, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21842332184314728, + "rewards/margins": 0.025180380791425705, + "rewards/rejected": -0.24360370635986328, + "step": 153 + }, + { + "epoch": 0.10057963915421667, + "grad_norm": 6.438642037152613, + "learning_rate": 2e-07, + "logits/chosen": -0.9732076525688171, + "logits/rejected": -0.8783437013626099, + "logps/chosen": -459.40673828125, + "logps/rejected": -439.1183776855469, + "loss": 0.6663, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2149794101715088, + "rewards/margins": 0.020638851448893547, + "rewards/rejected": -0.23561826348304749, + "step": 154 + }, + { + "epoch": 0.10123275369417911, + "grad_norm": 6.922197606721808, + "learning_rate": 1.9999973974344256e-07, + "logits/chosen": -1.0275825262069702, + "logits/rejected": -0.9289765357971191, + "logps/chosen": -447.41766357421875, + "logps/rejected": -410.023193359375, + "loss": 0.666, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2406703233718872, + "rewards/margins": 0.03814300149679184, + "rewards/rejected": -0.27881333231925964, + "step": 155 + }, + { + "epoch": 0.10188586823414156, + "grad_norm": 8.364313956052694, + "learning_rate": 1.999989589751249e-07, + "logits/chosen": -0.8802437782287598, + "logits/rejected": -0.9175126552581787, + "logps/chosen": -527.3651123046875, + "logps/rejected": -564.7503051757812, + "loss": 0.6565, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.27885937690734863, + "rewards/margins": 0.0818130373954773, + "rewards/rejected": -0.3606724143028259, + "step": 156 + }, + { + "epoch": 0.102538982774104, + "grad_norm": 6.154889997810939, + "learning_rate": 1.9999765769911105e-07, + "logits/chosen": -0.8315755724906921, + "logits/rejected": -0.7917363047599792, + "logps/chosen": -484.8338623046875, + "logps/rejected": -492.9305419921875, + "loss": 0.6668, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1891915202140808, + "rewards/margins": 0.07525278627872467, + "rewards/rejected": -0.26444435119628906, + "step": 157 + }, + { + "epoch": 0.10319209731406645, + "grad_norm": 6.516656144007839, + "learning_rate": 1.999958359221743e-07, + "logits/chosen": -0.9213133454322815, + "logits/rejected": -0.8424961566925049, + "logps/chosen": -497.5096435546875, + "logps/rejected": -483.623046875, + "loss": 0.6643, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.23822657763957977, + "rewards/margins": 0.054590702056884766, + "rewards/rejected": -0.29281729459762573, + "step": 158 + }, + { + "epoch": 0.1038452118540289, + "grad_norm": 6.620879007235329, + "learning_rate": 1.999934936537972e-07, + "logits/chosen": -0.8575921058654785, + "logits/rejected": -0.8345927000045776, + "logps/chosen": -440.8963623046875, + "logps/rejected": -478.8257141113281, + "loss": 0.6524, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2483789175748825, + "rewards/margins": 0.10234285145998001, + "rewards/rejected": -0.3507218062877655, + "step": 159 + }, + { + "epoch": 0.10449832639399134, + "grad_norm": 6.828503715764836, + "learning_rate": 1.9999063090617166e-07, + "logits/chosen": -0.9059373140335083, + "logits/rejected": -0.8238606452941895, + "logps/chosen": -526.1168212890625, + "logps/rejected": -510.9229431152344, + "loss": 0.6583, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28075307607650757, + "rewards/margins": 0.07372645288705826, + "rewards/rejected": -0.3544795513153076, + "step": 160 + }, + { + "epoch": 0.10515144093395379, + "grad_norm": 6.0655028367067505, + "learning_rate": 1.9998724769419858e-07, + "logits/chosen": -0.8036881685256958, + "logits/rejected": -0.7686895132064819, + "logps/chosen": -383.0535583496094, + "logps/rejected": -403.23736572265625, + "loss": 0.6746, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.18963344395160675, + "rewards/margins": 0.05757593736052513, + "rewards/rejected": -0.24720938503742218, + "step": 161 + }, + { + "epoch": 0.10580455547391623, + "grad_norm": 6.590865574876252, + "learning_rate": 1.9998334403548806e-07, + "logits/chosen": -0.9176549315452576, + "logits/rejected": -0.9008191823959351, + "logps/chosen": -531.5628662109375, + "logps/rejected": -509.3006591796875, + "loss": 0.676, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.27991557121276855, + "rewards/margins": 0.058448441326618195, + "rewards/rejected": -0.33836400508880615, + "step": 162 + }, + { + "epoch": 0.10645767001387868, + "grad_norm": 6.467086480102644, + "learning_rate": 1.9997891995035913e-07, + "logits/chosen": -0.9845995903015137, + "logits/rejected": -0.9540070295333862, + "logps/chosen": -553.7200927734375, + "logps/rejected": -549.807861328125, + "loss": 0.6718, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.34165340662002563, + "rewards/margins": 0.04177336394786835, + "rewards/rejected": -0.38342681527137756, + "step": 163 + }, + { + "epoch": 0.10711078455384113, + "grad_norm": 6.633851760456449, + "learning_rate": 1.9997397546183974e-07, + "logits/chosen": -0.8908398151397705, + "logits/rejected": -0.9239029884338379, + "logps/chosen": -435.40771484375, + "logps/rejected": -506.06903076171875, + "loss": 0.655, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.2753165364265442, + "rewards/margins": 0.14071771502494812, + "rewards/rejected": -0.4160342216491699, + "step": 164 + }, + { + "epoch": 0.10776389909380357, + "grad_norm": 7.504573304749105, + "learning_rate": 1.999685105956666e-07, + "logits/chosen": -0.84445720911026, + "logits/rejected": -0.846612811088562, + "logps/chosen": -481.5384521484375, + "logps/rejected": -507.06134033203125, + "loss": 0.6437, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28293365240097046, + "rewards/margins": 0.06457722187042236, + "rewards/rejected": -0.34751084446907043, + "step": 165 + }, + { + "epoch": 0.10841701363376602, + "grad_norm": 6.844588401003989, + "learning_rate": 1.9996252538028508e-07, + "logits/chosen": -0.9970215559005737, + "logits/rejected": -0.9549089670181274, + "logps/chosen": -523.70068359375, + "logps/rejected": -515.5821533203125, + "loss": 0.6466, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3379030227661133, + "rewards/margins": 0.1277962177991867, + "rewards/rejected": -0.4656992256641388, + "step": 166 + }, + { + "epoch": 0.10907012817372846, + "grad_norm": 7.236130567280733, + "learning_rate": 1.9995601984684897e-07, + "logits/chosen": -0.7774499654769897, + "logits/rejected": -0.7958889007568359, + "logps/chosen": -579.2680053710938, + "logps/rejected": -638.60791015625, + "loss": 0.6503, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.44350510835647583, + "rewards/margins": 0.1982521116733551, + "rewards/rejected": -0.6417572498321533, + "step": 167 + }, + { + "epoch": 0.10972324271369091, + "grad_norm": 6.547100915291269, + "learning_rate": 1.9994899402922046e-07, + "logits/chosen": -0.7743135690689087, + "logits/rejected": -0.7389811277389526, + "logps/chosen": -477.3546142578125, + "logps/rejected": -516.999267578125, + "loss": 0.6593, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.29847046732902527, + "rewards/margins": 0.18295258283615112, + "rewards/rejected": -0.4814230799674988, + "step": 168 + }, + { + "epoch": 0.11037635725365336, + "grad_norm": 6.580578918239811, + "learning_rate": 1.999414479639698e-07, + "logits/chosen": -0.8631864786148071, + "logits/rejected": -0.8823621273040771, + "logps/chosen": -490.77557373046875, + "logps/rejected": -499.3733825683594, + "loss": 0.6744, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.35784608125686646, + "rewards/margins": 0.03360201418399811, + "rewards/rejected": -0.39144808053970337, + "step": 169 + }, + { + "epoch": 0.1110294717936158, + "grad_norm": 7.304635089452459, + "learning_rate": 1.9993338169037532e-07, + "logits/chosen": -0.9329928755760193, + "logits/rejected": -0.8873375654220581, + "logps/chosen": -536.2473754882812, + "logps/rejected": -565.2854614257812, + "loss": 0.6412, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.35816970467567444, + "rewards/margins": 0.16970431804656982, + "rewards/rejected": -0.5278739929199219, + "step": 170 + }, + { + "epoch": 0.11168258633357825, + "grad_norm": 8.526831736459082, + "learning_rate": 1.99924795250423e-07, + "logits/chosen": -0.9018800854682922, + "logits/rejected": -0.9441125392913818, + "logps/chosen": -514.7568359375, + "logps/rejected": -537.2463989257812, + "loss": 0.6555, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.3121597468852997, + "rewards/margins": 0.09106716513633728, + "rewards/rejected": -0.4032268822193146, + "step": 171 + }, + { + "epoch": 0.1123357008735407, + "grad_norm": 6.974764550329646, + "learning_rate": 1.9991568868880638e-07, + "logits/chosen": -0.7577574849128723, + "logits/rejected": -0.7482568025588989, + "logps/chosen": -465.4934997558594, + "logps/rejected": -487.9968566894531, + "loss": 0.6518, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3646446764469147, + "rewards/margins": 0.11736685037612915, + "rewards/rejected": -0.48201149702072144, + "step": 172 + }, + { + "epoch": 0.11298881541350314, + "grad_norm": 6.435484683449247, + "learning_rate": 1.999060620529263e-07, + "logits/chosen": -0.8632010221481323, + "logits/rejected": -0.8566238880157471, + "logps/chosen": -494.66363525390625, + "logps/rejected": -519.395751953125, + "loss": 0.6526, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.22226670384407043, + "rewards/margins": 0.15008774399757385, + "rewards/rejected": -0.3723544478416443, + "step": 173 + }, + { + "epoch": 0.11364192995346559, + "grad_norm": 7.420727411793039, + "learning_rate": 1.998959153928907e-07, + "logits/chosen": -0.9504894018173218, + "logits/rejected": -0.9857321977615356, + "logps/chosen": -527.0250244140625, + "logps/rejected": -545.4429931640625, + "loss": 0.6424, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3586081862449646, + "rewards/margins": 0.08520010113716125, + "rewards/rejected": -0.44380828738212585, + "step": 174 + }, + { + "epoch": 0.11429504449342803, + "grad_norm": 6.600657284454907, + "learning_rate": 1.9988524876151422e-07, + "logits/chosen": -0.7927142381668091, + "logits/rejected": -0.7309151887893677, + "logps/chosen": -497.8948059082031, + "logps/rejected": -530.8662719726562, + "loss": 0.6508, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4121289849281311, + "rewards/margins": 0.22072520852088928, + "rewards/rejected": -0.6328542828559875, + "step": 175 + }, + { + "epoch": 0.11494815903339048, + "grad_norm": 6.711546585878465, + "learning_rate": 1.9987406221431812e-07, + "logits/chosen": -0.9204037189483643, + "logits/rejected": -0.8354212641716003, + "logps/chosen": -486.84637451171875, + "logps/rejected": -469.3777770996094, + "loss": 0.6305, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.30236339569091797, + "rewards/margins": 0.1800605207681656, + "rewards/rejected": -0.4824238419532776, + "step": 176 + }, + { + "epoch": 0.11560127357335293, + "grad_norm": 7.413380504551747, + "learning_rate": 1.9986235580952986e-07, + "logits/chosen": -0.898276686668396, + "logits/rejected": -0.8423072695732117, + "logps/chosen": -421.8368835449219, + "logps/rejected": -463.8457946777344, + "loss": 0.6556, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2767115533351898, + "rewards/margins": 0.15376242995262146, + "rewards/rejected": -0.4304739832878113, + "step": 177 + }, + { + "epoch": 0.11625438811331537, + "grad_norm": 7.4603000934120685, + "learning_rate": 1.9985012960808275e-07, + "logits/chosen": -0.9808669686317444, + "logits/rejected": -0.9150485396385193, + "logps/chosen": -546.5240478515625, + "logps/rejected": -494.89044189453125, + "loss": 0.6512, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.39803603291511536, + "rewards/margins": 0.059254515916109085, + "rewards/rejected": -0.45729053020477295, + "step": 178 + }, + { + "epoch": 0.11690750265327782, + "grad_norm": 7.101437920971849, + "learning_rate": 1.998373836736158e-07, + "logits/chosen": -0.9734604954719543, + "logits/rejected": -0.8278497457504272, + "logps/chosen": -579.0867919921875, + "logps/rejected": -503.48492431640625, + "loss": 0.6381, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.42628052830696106, + "rewards/margins": 0.023855991661548615, + "rewards/rejected": -0.4501365125179291, + "step": 179 + }, + { + "epoch": 0.11756061719324026, + "grad_norm": 6.538781837196094, + "learning_rate": 1.998241180724733e-07, + "logits/chosen": -0.9165725708007812, + "logits/rejected": -0.9196174144744873, + "logps/chosen": -464.7592468261719, + "logps/rejected": -482.8262023925781, + "loss": 0.6509, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.41940075159072876, + "rewards/margins": 0.05880502983927727, + "rewards/rejected": -0.47820577025413513, + "step": 180 + }, + { + "epoch": 0.11821373173320271, + "grad_norm": 7.321345164948039, + "learning_rate": 1.998103328737044e-07, + "logits/chosen": -1.006074070930481, + "logits/rejected": -1.001975417137146, + "logps/chosen": -491.1534423828125, + "logps/rejected": -515.4642333984375, + "loss": 0.6435, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.3396851420402527, + "rewards/margins": 0.13319769501686096, + "rewards/rejected": -0.47288286685943604, + "step": 181 + }, + { + "epoch": 0.11886684627316516, + "grad_norm": 6.894930940879784, + "learning_rate": 1.997960281490629e-07, + "logits/chosen": -0.927212119102478, + "logits/rejected": -0.9049844145774841, + "logps/chosen": -600.8464965820312, + "logps/rejected": -554.48095703125, + "loss": 0.6648, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5532928109169006, + "rewards/margins": 0.03803117573261261, + "rewards/rejected": -0.591323971748352, + "step": 182 + }, + { + "epoch": 0.1195199608131276, + "grad_norm": 7.83193928010655, + "learning_rate": 1.9978120397300673e-07, + "logits/chosen": -0.8268590569496155, + "logits/rejected": -0.7920058965682983, + "logps/chosen": -452.5147705078125, + "logps/rejected": -533.310302734375, + "loss": 0.6315, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4391025900840759, + "rewards/margins": 0.2984185218811035, + "rewards/rejected": -0.7375210523605347, + "step": 183 + }, + { + "epoch": 0.12017307535309005, + "grad_norm": 6.386679940574844, + "learning_rate": 1.9976586042269772e-07, + "logits/chosen": -0.8321893215179443, + "logits/rejected": -0.8634829521179199, + "logps/chosen": -488.01654052734375, + "logps/rejected": -579.4048461914062, + "loss": 0.6259, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.4608284533023834, + "rewards/margins": 0.266685426235199, + "rewards/rejected": -0.72751384973526, + "step": 184 + }, + { + "epoch": 0.1208261898930525, + "grad_norm": 6.8983232744264305, + "learning_rate": 1.9974999757800103e-07, + "logits/chosen": -0.832870602607727, + "logits/rejected": -0.8473939895629883, + "logps/chosen": -465.09686279296875, + "logps/rejected": -476.3861083984375, + "loss": 0.6348, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4560048282146454, + "rewards/margins": 0.16139870882034302, + "rewards/rejected": -0.6174035668373108, + "step": 185 + }, + { + "epoch": 0.12147930443301494, + "grad_norm": 7.754190109178935, + "learning_rate": 1.9973361552148487e-07, + "logits/chosen": -0.8424513339996338, + "logits/rejected": -0.8219509124755859, + "logps/chosen": -574.000732421875, + "logps/rejected": -605.0245361328125, + "loss": 0.6487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5188160538673401, + "rewards/margins": 0.3015327453613281, + "rewards/rejected": -0.8203487992286682, + "step": 186 + }, + { + "epoch": 0.12213241897297739, + "grad_norm": 6.591297590145272, + "learning_rate": 1.9971671433841998e-07, + "logits/chosen": -0.8903267979621887, + "logits/rejected": -0.8687557578086853, + "logps/chosen": -519.8242797851562, + "logps/rejected": -529.4027099609375, + "loss": 0.6555, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.463649719953537, + "rewards/margins": 0.062138475477695465, + "rewards/rejected": -0.5257881879806519, + "step": 187 + }, + { + "epoch": 0.12278553351293983, + "grad_norm": 7.4689116101629125, + "learning_rate": 1.996992941167792e-07, + "logits/chosen": -1.0059764385223389, + "logits/rejected": -0.9330079555511475, + "logps/chosen": -653.8500366210938, + "logps/rejected": -690.8359375, + "loss": 0.656, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6045035719871521, + "rewards/margins": 0.2457866370677948, + "rewards/rejected": -0.8502901792526245, + "step": 188 + }, + { + "epoch": 0.12343864805290228, + "grad_norm": 7.523661722899911, + "learning_rate": 1.996813549472371e-07, + "logits/chosen": -0.8645133376121521, + "logits/rejected": -0.8016350269317627, + "logps/chosen": -536.940673828125, + "logps/rejected": -569.5060424804688, + "loss": 0.6702, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5659014582633972, + "rewards/margins": 0.19686836004257202, + "rewards/rejected": -0.7627697587013245, + "step": 189 + }, + { + "epoch": 0.12409176259286472, + "grad_norm": 7.435260811262935, + "learning_rate": 1.9966289692316943e-07, + "logits/chosen": -0.9768977165222168, + "logits/rejected": -0.9664483666419983, + "logps/chosen": -512.6085205078125, + "logps/rejected": -521.680908203125, + "loss": 0.6227, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48955708742141724, + "rewards/margins": 0.16032075881958008, + "rewards/rejected": -0.6498778462409973, + "step": 190 + }, + { + "epoch": 0.12474487713282717, + "grad_norm": 7.410166333031568, + "learning_rate": 1.996439201406526e-07, + "logits/chosen": -0.8727903366088867, + "logits/rejected": -0.8746020793914795, + "logps/chosen": -460.9349060058594, + "logps/rejected": -550.4550170898438, + "loss": 0.6256, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.4731426239013672, + "rewards/margins": 0.357341468334198, + "rewards/rejected": -0.8304840326309204, + "step": 191 + }, + { + "epoch": 0.12539799167278962, + "grad_norm": 7.451321792890638, + "learning_rate": 1.9962442469846325e-07, + "logits/chosen": -0.9719246625900269, + "logits/rejected": -0.9369036555290222, + "logps/chosen": -501.4394226074219, + "logps/rejected": -502.3212585449219, + "loss": 0.6383, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.44892147183418274, + "rewards/margins": 0.11068614572286606, + "rewards/rejected": -0.5596076250076294, + "step": 192 + }, + { + "epoch": 0.12605110621275206, + "grad_norm": 7.35526823945006, + "learning_rate": 1.9960441069807775e-07, + "logits/chosen": -0.9953307509422302, + "logits/rejected": -1.020664930343628, + "logps/chosen": -521.7427368164062, + "logps/rejected": -562.8399658203125, + "loss": 0.6475, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.5778794288635254, + "rewards/margins": 0.2361249178647995, + "rewards/rejected": -0.8140044212341309, + "step": 193 + }, + { + "epoch": 0.1267042207527145, + "grad_norm": 7.423354130295339, + "learning_rate": 1.9958387824367153e-07, + "logits/chosen": -0.8011785745620728, + "logits/rejected": -0.7897564172744751, + "logps/chosen": -497.1465148925781, + "logps/rejected": -523.15576171875, + "loss": 0.6356, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5150357484817505, + "rewards/margins": 0.06541642546653748, + "rewards/rejected": -0.5804521441459656, + "step": 194 + }, + { + "epoch": 0.12735733529267695, + "grad_norm": 6.975653189759925, + "learning_rate": 1.9956282744211878e-07, + "logits/chosen": -0.9849708676338196, + "logits/rejected": -0.9643858671188354, + "logps/chosen": -563.0205078125, + "logps/rejected": -580.1819458007812, + "loss": 0.6415, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6102763414382935, + "rewards/margins": 0.12003323435783386, + "rewards/rejected": -0.7303095459938049, + "step": 195 + }, + { + "epoch": 0.1280104498326394, + "grad_norm": 7.2065492895384295, + "learning_rate": 1.9954125840299163e-07, + "logits/chosen": -0.9601786732673645, + "logits/rejected": -0.8700001835823059, + "logps/chosen": -471.837158203125, + "logps/rejected": -481.9418029785156, + "loss": 0.63, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.4637281894683838, + "rewards/margins": 0.13474319875240326, + "rewards/rejected": -0.5984713435173035, + "step": 196 + }, + { + "epoch": 0.12866356437260185, + "grad_norm": 6.808584697465035, + "learning_rate": 1.9951917123855978e-07, + "logits/chosen": -0.8570945262908936, + "logits/rejected": -0.8283834457397461, + "logps/chosen": -463.7060546875, + "logps/rejected": -492.38006591796875, + "loss": 0.6419, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5440891981124878, + "rewards/margins": 0.13518810272216797, + "rewards/rejected": -0.6792773008346558, + "step": 197 + }, + { + "epoch": 0.1293166789125643, + "grad_norm": 7.7662023295805405, + "learning_rate": 1.994965660637898e-07, + "logits/chosen": -0.9607435464859009, + "logits/rejected": -0.9405413866043091, + "logps/chosen": -539.56884765625, + "logps/rejected": -557.1348266601562, + "loss": 0.614, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5751670598983765, + "rewards/margins": 0.279240220785141, + "rewards/rejected": -0.8544072508811951, + "step": 198 + }, + { + "epoch": 0.12996979345252674, + "grad_norm": 6.6784323628411775, + "learning_rate": 1.994734429963446e-07, + "logits/chosen": -0.8653057813644409, + "logits/rejected": -0.8396562337875366, + "logps/chosen": -538.5821533203125, + "logps/rejected": -567.431396484375, + "loss": 0.6326, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6097425818443298, + "rewards/margins": 0.1930355578660965, + "rewards/rejected": -0.8027780652046204, + "step": 199 + }, + { + "epoch": 0.13062290799248918, + "grad_norm": 6.693517839607438, + "learning_rate": 1.994498021565828e-07, + "logits/chosen": -0.9042263031005859, + "logits/rejected": -0.9179930686950684, + "logps/chosen": -510.3880615234375, + "logps/rejected": -595.0519409179688, + "loss": 0.6218, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.5225197076797485, + "rewards/margins": 0.37714314460754395, + "rewards/rejected": -0.8996628522872925, + "step": 200 + }, + { + "epoch": 0.13062290799248918, + "eval_logits/chosen": -0.8125271797180176, + "eval_logits/rejected": -0.7770608067512512, + "eval_logps/chosen": -535.1920166015625, + "eval_logps/rejected": -542.3652954101562, + "eval_loss": 0.6276674866676331, + "eval_rewards/accuracies": 0.6959999799728394, + "eval_rewards/chosen": -0.6128481030464172, + "eval_rewards/margins": 0.15911929309368134, + "eval_rewards/rejected": -0.7719674110412598, + "eval_runtime": 619.4957, + "eval_samples_per_second": 6.457, + "eval_steps_per_second": 0.404, + "step": 200 + }, + { + "epoch": 0.13127602253245163, + "grad_norm": 7.287720671466593, + "learning_rate": 1.9942564366755805e-07, + "logits/chosen": -0.8729226589202881, + "logits/rejected": -0.9026110768318176, + "logps/chosen": -488.0004577636719, + "logps/rejected": -517.5531616210938, + "loss": 0.6221, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5494257807731628, + "rewards/margins": 0.14743518829345703, + "rewards/rejected": -0.6968609690666199, + "step": 201 + }, + { + "epoch": 0.13192913707241408, + "grad_norm": 6.635612912694771, + "learning_rate": 1.9940096765501845e-07, + "logits/chosen": -0.8704729676246643, + "logits/rejected": -0.8529733419418335, + "logps/chosen": -469.6875305175781, + "logps/rejected": -517.5676879882812, + "loss": 0.6488, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5424898862838745, + "rewards/margins": 0.19880425930023193, + "rewards/rejected": -0.7412941455841064, + "step": 202 + }, + { + "epoch": 0.13258225161237652, + "grad_norm": 6.565903567595678, + "learning_rate": 1.993757742474059e-07, + "logits/chosen": -0.8626876473426819, + "logits/rejected": -0.8197176456451416, + "logps/chosen": -515.7911376953125, + "logps/rejected": -539.673583984375, + "loss": 0.6154, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5579182505607605, + "rewards/margins": 0.15692903101444244, + "rewards/rejected": -0.7148473262786865, + "step": 203 + }, + { + "epoch": 0.13323536615233897, + "grad_norm": 6.898446368005499, + "learning_rate": 1.993500635758554e-07, + "logits/chosen": -0.8769382834434509, + "logits/rejected": -0.8424308896064758, + "logps/chosen": -547.3653564453125, + "logps/rejected": -543.1265869140625, + "loss": 0.6247, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6659564971923828, + "rewards/margins": 0.1936778426170349, + "rewards/rejected": -0.8596343994140625, + "step": 204 + }, + { + "epoch": 0.13388848069230141, + "grad_norm": 7.35385978193956, + "learning_rate": 1.9932383577419428e-07, + "logits/chosen": -0.8527657985687256, + "logits/rejected": -0.8732788562774658, + "logps/chosen": -515.5074462890625, + "logps/rejected": -542.03515625, + "loss": 0.6355, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5365366339683533, + "rewards/margins": 0.19824588298797607, + "rewards/rejected": -0.7347825169563293, + "step": 205 + }, + { + "epoch": 0.13454159523226386, + "grad_norm": 7.030648984280733, + "learning_rate": 1.992970909789418e-07, + "logits/chosen": -0.957828164100647, + "logits/rejected": -0.8857869505882263, + "logps/chosen": -509.6930236816406, + "logps/rejected": -477.85186767578125, + "loss": 0.6093, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.5899068713188171, + "rewards/margins": 0.13009968400001526, + "rewards/rejected": -0.7200065851211548, + "step": 206 + }, + { + "epoch": 0.1351947097722263, + "grad_norm": 6.950088867136282, + "learning_rate": 1.9926982932930807e-07, + "logits/chosen": -1.0200141668319702, + "logits/rejected": -1.0256128311157227, + "logps/chosen": -581.5166625976562, + "logps/rejected": -643.7260131835938, + "loss": 0.6331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6091457605361938, + "rewards/margins": 0.21974962949752808, + "rewards/rejected": -0.8288955092430115, + "step": 207 + }, + { + "epoch": 0.13584782431218875, + "grad_norm": 7.122577238645886, + "learning_rate": 1.9924205096719357e-07, + "logits/chosen": -0.8400067090988159, + "logits/rejected": -0.8682948350906372, + "logps/chosen": -515.2028198242188, + "logps/rejected": -556.894775390625, + "loss": 0.6278, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6332363486289978, + "rewards/margins": 0.19957150518894196, + "rewards/rejected": -0.8328077793121338, + "step": 208 + }, + { + "epoch": 0.1365009388521512, + "grad_norm": 6.576309079334382, + "learning_rate": 1.992137560371883e-07, + "logits/chosen": -0.9659208655357361, + "logits/rejected": -0.9228329062461853, + "logps/chosen": -523.8739624023438, + "logps/rejected": -537.5508422851562, + "loss": 0.6493, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7064003944396973, + "rewards/margins": 0.04580266401171684, + "rewards/rejected": -0.7522029280662537, + "step": 209 + }, + { + "epoch": 0.13715405339211365, + "grad_norm": 6.973323990656964, + "learning_rate": 1.991849446865711e-07, + "logits/chosen": -0.8906298875808716, + "logits/rejected": -0.8565115332603455, + "logps/chosen": -530.02197265625, + "logps/rejected": -524.1921997070312, + "loss": 0.6055, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6586757898330688, + "rewards/margins": 0.16047915816307068, + "rewards/rejected": -0.8191549181938171, + "step": 210 + }, + { + "epoch": 0.1378071679320761, + "grad_norm": 6.816468269528872, + "learning_rate": 1.991556170653088e-07, + "logits/chosen": -0.8264785408973694, + "logits/rejected": -0.7743552923202515, + "logps/chosen": -635.3679809570312, + "logps/rejected": -661.7918090820312, + "loss": 0.5945, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8539211750030518, + "rewards/margins": 0.3885781168937683, + "rewards/rejected": -1.2424993515014648, + "step": 211 + }, + { + "epoch": 0.13846028247203854, + "grad_norm": 6.929590404216663, + "learning_rate": 1.9912577332605557e-07, + "logits/chosen": -1.066705346107483, + "logits/rejected": -1.0079761743545532, + "logps/chosen": -534.2091064453125, + "logps/rejected": -533.3729858398438, + "loss": 0.6477, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.62717205286026, + "rewards/margins": 0.10539636760950089, + "rewards/rejected": -0.7325683832168579, + "step": 212 + }, + { + "epoch": 0.13911339701200098, + "grad_norm": 7.150556554578528, + "learning_rate": 1.990954136241519e-07, + "logits/chosen": -0.8896300196647644, + "logits/rejected": -0.860119104385376, + "logps/chosen": -561.0973510742188, + "logps/rejected": -588.75146484375, + "loss": 0.5929, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7131221890449524, + "rewards/margins": 0.23361392319202423, + "rewards/rejected": -0.9467360377311707, + "step": 213 + }, + { + "epoch": 0.13976651155196343, + "grad_norm": 7.459196518999915, + "learning_rate": 1.9906453811762414e-07, + "logits/chosen": -0.9893519878387451, + "logits/rejected": -0.972107470035553, + "logps/chosen": -550.1785278320312, + "logps/rejected": -562.7999877929688, + "loss": 0.6273, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7028112411499023, + "rewards/margins": 0.19713109731674194, + "rewards/rejected": -0.8999423384666443, + "step": 214 + }, + { + "epoch": 0.14041962609192588, + "grad_norm": 7.006968091209495, + "learning_rate": 1.9903314696718323e-07, + "logits/chosen": -0.9318097233772278, + "logits/rejected": -0.9505141973495483, + "logps/chosen": -491.9817810058594, + "logps/rejected": -514.4917602539062, + "loss": 0.6164, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6546629667282104, + "rewards/margins": 0.14145053923130035, + "rewards/rejected": -0.7961135506629944, + "step": 215 + }, + { + "epoch": 0.14107274063188832, + "grad_norm": 6.927856564940898, + "learning_rate": 1.990012403362243e-07, + "logits/chosen": -0.8761850595474243, + "logits/rejected": -0.8285849094390869, + "logps/chosen": -573.4921264648438, + "logps/rejected": -563.6383056640625, + "loss": 0.6095, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8476728200912476, + "rewards/margins": 0.14320024847984314, + "rewards/rejected": -0.9908730387687683, + "step": 216 + }, + { + "epoch": 0.14172585517185077, + "grad_norm": 8.058961160191211, + "learning_rate": 1.9896881839082554e-07, + "logits/chosen": -0.9572507739067078, + "logits/rejected": -0.8944281339645386, + "logps/chosen": -646.2454223632812, + "logps/rejected": -608.398681640625, + "loss": 0.5914, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9121346473693848, + "rewards/margins": 0.20039290189743042, + "rewards/rejected": -1.1125273704528809, + "step": 217 + }, + { + "epoch": 0.1423789697118132, + "grad_norm": 7.091499089560837, + "learning_rate": 1.9893588129974738e-07, + "logits/chosen": -1.0472147464752197, + "logits/rejected": -1.029089093208313, + "logps/chosen": -622.0521240234375, + "logps/rejected": -670.965576171875, + "loss": 0.604, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8638104796409607, + "rewards/margins": 0.325113445520401, + "rewards/rejected": -1.1889238357543945, + "step": 218 + }, + { + "epoch": 0.14303208425177566, + "grad_norm": 7.315033939603737, + "learning_rate": 1.9890242923443176e-07, + "logits/chosen": -0.9592210650444031, + "logits/rejected": -0.922424852848053, + "logps/chosen": -566.4813232421875, + "logps/rejected": -592.270751953125, + "loss": 0.5941, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8809952139854431, + "rewards/margins": 0.32125452160835266, + "rewards/rejected": -1.2022497653961182, + "step": 219 + }, + { + "epoch": 0.1436851987917381, + "grad_norm": 7.526883226909424, + "learning_rate": 1.9886846236900102e-07, + "logits/chosen": -0.9030224680900574, + "logits/rejected": -0.8189893364906311, + "logps/chosen": -543.3269653320312, + "logps/rejected": -526.5144653320312, + "loss": 0.6133, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9240608811378479, + "rewards/margins": 0.1997796893119812, + "rewards/rejected": -1.123840570449829, + "step": 220 + }, + { + "epoch": 0.14433831333170055, + "grad_norm": 7.141612907726172, + "learning_rate": 1.9883398088025718e-07, + "logits/chosen": -0.9949901103973389, + "logits/rejected": -0.9826943278312683, + "logps/chosen": -547.9266357421875, + "logps/rejected": -558.5771484375, + "loss": 0.6015, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8585708141326904, + "rewards/margins": 0.33034569025039673, + "rewards/rejected": -1.188916563987732, + "step": 221 + }, + { + "epoch": 0.144991427871663, + "grad_norm": 8.244844325347787, + "learning_rate": 1.987989849476809e-07, + "logits/chosen": -0.9119670391082764, + "logits/rejected": -0.8896617293357849, + "logps/chosen": -560.7728271484375, + "logps/rejected": -550.8117065429688, + "loss": 0.6158, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9433919191360474, + "rewards/margins": 0.2703930735588074, + "rewards/rejected": -1.2137850522994995, + "step": 222 + }, + { + "epoch": 0.14564454241162544, + "grad_norm": 8.841979767709514, + "learning_rate": 1.9876347475343058e-07, + "logits/chosen": -1.0196629762649536, + "logits/rejected": -1.0518569946289062, + "logps/chosen": -581.1697387695312, + "logps/rejected": -607.8892211914062, + "loss": 0.5911, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.0797791481018066, + "rewards/margins": 0.3377801179885864, + "rewards/rejected": -1.4175591468811035, + "step": 223 + }, + { + "epoch": 0.1462976569515879, + "grad_norm": 7.111657119116627, + "learning_rate": 1.9872745048234148e-07, + "logits/chosen": -0.8659726977348328, + "logits/rejected": -0.8837684392929077, + "logps/chosen": -644.4042358398438, + "logps/rejected": -690.21435546875, + "loss": 0.5954, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.214494228363037, + "rewards/margins": 0.4262455105781555, + "rewards/rejected": -1.6407395601272583, + "step": 224 + }, + { + "epoch": 0.14695077149155034, + "grad_norm": 10.605087808773934, + "learning_rate": 1.9869091232192463e-07, + "logits/chosen": -0.9252822995185852, + "logits/rejected": -0.9206048250198364, + "logps/chosen": -576.3617553710938, + "logps/rejected": -601.0115356445312, + "loss": 0.6156, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0873240232467651, + "rewards/margins": 0.27771976590156555, + "rewards/rejected": -1.3650437593460083, + "step": 225 + }, + { + "epoch": 0.14760388603151278, + "grad_norm": 9.084674235016786, + "learning_rate": 1.9865386046236595e-07, + "logits/chosen": -0.968360424041748, + "logits/rejected": -0.9349038600921631, + "logps/chosen": -588.273681640625, + "logps/rejected": -619.4586791992188, + "loss": 0.6567, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0742963552474976, + "rewards/margins": 0.31582218408584595, + "rewards/rejected": -1.3901184797286987, + "step": 226 + }, + { + "epoch": 0.14825700057147523, + "grad_norm": 7.5679079102766975, + "learning_rate": 1.9861629509652522e-07, + "logits/chosen": -0.897240161895752, + "logits/rejected": -0.8546115159988403, + "logps/chosen": -556.501708984375, + "logps/rejected": -554.8712158203125, + "loss": 0.6248, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0640370845794678, + "rewards/margins": 0.20565065741539001, + "rewards/rejected": -1.2696877717971802, + "step": 227 + }, + { + "epoch": 0.14891011511143767, + "grad_norm": 7.498591308619216, + "learning_rate": 1.985782164199351e-07, + "logits/chosen": -0.9917216300964355, + "logits/rejected": -0.9834437370300293, + "logps/chosen": -541.1337890625, + "logps/rejected": -572.7382202148438, + "loss": 0.6082, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9065245985984802, + "rewards/margins": 0.3573816418647766, + "rewards/rejected": -1.2639062404632568, + "step": 228 + }, + { + "epoch": 0.14956322965140012, + "grad_norm": 8.030664967261915, + "learning_rate": 1.9853962463080012e-07, + "logits/chosen": -1.0200045108795166, + "logits/rejected": -0.9756948351860046, + "logps/chosen": -598.68896484375, + "logps/rejected": -618.638916015625, + "loss": 0.6075, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1298401355743408, + "rewards/margins": 0.4494876265525818, + "rewards/rejected": -1.5793277025222778, + "step": 229 + }, + { + "epoch": 0.15021634419136257, + "grad_norm": 8.61826945907326, + "learning_rate": 1.9850051992999558e-07, + "logits/chosen": -0.9880169630050659, + "logits/rejected": -0.9926092624664307, + "logps/chosen": -639.80810546875, + "logps/rejected": -766.6765747070312, + "loss": 0.5953, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3067864179611206, + "rewards/margins": 0.631999135017395, + "rewards/rejected": -1.9387855529785156, + "step": 230 + }, + { + "epoch": 0.150869458731325, + "grad_norm": 7.108840046005145, + "learning_rate": 1.9846090252106657e-07, + "logits/chosen": -0.9846518635749817, + "logits/rejected": -1.0040161609649658, + "logps/chosen": -643.1053466796875, + "logps/rejected": -744.1202392578125, + "loss": 0.5933, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3677211999893188, + "rewards/margins": 0.5599484443664551, + "rewards/rejected": -1.9276697635650635, + "step": 231 + }, + { + "epoch": 0.15152257327128746, + "grad_norm": 7.40371296978151, + "learning_rate": 1.9842077261022688e-07, + "logits/chosen": -0.8514267802238464, + "logits/rejected": -0.9118195176124573, + "logps/chosen": -563.9459838867188, + "logps/rejected": -678.0374145507812, + "loss": 0.6064, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1446633338928223, + "rewards/margins": 0.520073652267456, + "rewards/rejected": -1.6647369861602783, + "step": 232 + }, + { + "epoch": 0.1521756878112499, + "grad_norm": 7.490174879543239, + "learning_rate": 1.9838013040635805e-07, + "logits/chosen": -0.9286482334136963, + "logits/rejected": -0.831351637840271, + "logps/chosen": -589.9814453125, + "logps/rejected": -574.0662231445312, + "loss": 0.5966, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.133133053779602, + "rewards/margins": 0.36746925115585327, + "rewards/rejected": -1.5006022453308105, + "step": 233 + }, + { + "epoch": 0.15282880235121235, + "grad_norm": 7.601651007846018, + "learning_rate": 1.9833897612100798e-07, + "logits/chosen": -0.9623463749885559, + "logits/rejected": -0.9710357785224915, + "logps/chosen": -609.2854614257812, + "logps/rejected": -691.9542236328125, + "loss": 0.5545, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1702455282211304, + "rewards/margins": 0.5435682535171509, + "rewards/rejected": -1.7138137817382812, + "step": 234 + }, + { + "epoch": 0.1534819168911748, + "grad_norm": 7.172747159619703, + "learning_rate": 1.982973099683902e-07, + "logits/chosen": -1.0496864318847656, + "logits/rejected": -1.0496934652328491, + "logps/chosen": -536.92333984375, + "logps/rejected": -569.0079345703125, + "loss": 0.5691, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.185002088546753, + "rewards/margins": 0.29621896147727966, + "rewards/rejected": -1.4812211990356445, + "step": 235 + }, + { + "epoch": 0.15413503143113724, + "grad_norm": 7.8826748034616845, + "learning_rate": 1.982551321653824e-07, + "logits/chosen": -1.0674985647201538, + "logits/rejected": -1.0483006238937378, + "logps/chosen": -655.8088989257812, + "logps/rejected": -660.1513671875, + "loss": 0.6004, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3943867683410645, + "rewards/margins": 0.18238916993141174, + "rewards/rejected": -1.5767759084701538, + "step": 236 + }, + { + "epoch": 0.1547881459710997, + "grad_norm": 8.758377848112476, + "learning_rate": 1.982124429315257e-07, + "logits/chosen": -0.9774172306060791, + "logits/rejected": -0.9349012970924377, + "logps/chosen": -590.2391357421875, + "logps/rejected": -650.866943359375, + "loss": 0.5894, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2001911401748657, + "rewards/margins": 0.6267807483673096, + "rewards/rejected": -1.8269720077514648, + "step": 237 + }, + { + "epoch": 0.15544126051106213, + "grad_norm": 6.943165336635335, + "learning_rate": 1.9816924248902302e-07, + "logits/chosen": -0.9072751998901367, + "logits/rejected": -0.8869557976722717, + "logps/chosen": -595.5549926757812, + "logps/rejected": -593.1593017578125, + "loss": 0.6123, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2035523653030396, + "rewards/margins": 0.13962620496749878, + "rewards/rejected": -1.343178629875183, + "step": 238 + }, + { + "epoch": 0.15609437505102458, + "grad_norm": 8.698887817875539, + "learning_rate": 1.9812553106273846e-07, + "logits/chosen": -0.9801173806190491, + "logits/rejected": -0.9282537698745728, + "logps/chosen": -623.5844116210938, + "logps/rejected": -658.0555419921875, + "loss": 0.5646, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3153058290481567, + "rewards/margins": 0.47327354550361633, + "rewards/rejected": -1.7885793447494507, + "step": 239 + }, + { + "epoch": 0.15674748959098703, + "grad_norm": 7.1608426664869755, + "learning_rate": 1.9808130888019568e-07, + "logits/chosen": -0.9460001587867737, + "logits/rejected": -0.9269174337387085, + "logps/chosen": -674.88134765625, + "logps/rejected": -838.9345092773438, + "loss": 0.5693, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4076124429702759, + "rewards/margins": 0.9736231565475464, + "rewards/rejected": -2.3812355995178223, + "step": 240 + }, + { + "epoch": 0.15740060413094947, + "grad_norm": 8.96596264367979, + "learning_rate": 1.9803657617157689e-07, + "logits/chosen": -0.9591872692108154, + "logits/rejected": -0.9243131875991821, + "logps/chosen": -737.2781372070312, + "logps/rejected": -727.5070190429688, + "loss": 0.6155, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.726030707359314, + "rewards/margins": 0.2121991366147995, + "rewards/rejected": -1.9382299184799194, + "step": 241 + }, + { + "epoch": 0.15805371867091192, + "grad_norm": 9.023241556895329, + "learning_rate": 1.979913331697218e-07, + "logits/chosen": -1.0456653833389282, + "logits/rejected": -0.9968949556350708, + "logps/chosen": -675.23974609375, + "logps/rejected": -667.6704711914062, + "loss": 0.5662, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3876211643218994, + "rewards/margins": 0.3616403341293335, + "rewards/rejected": -1.749261498451233, + "step": 242 + }, + { + "epoch": 0.15870683321087437, + "grad_norm": 7.43494665793943, + "learning_rate": 1.9794558011012607e-07, + "logits/chosen": -0.8788321614265442, + "logits/rejected": -0.8721754550933838, + "logps/chosen": -537.51904296875, + "logps/rejected": -576.8412475585938, + "loss": 0.5945, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1825263500213623, + "rewards/margins": 0.27157512307167053, + "rewards/rejected": -1.4541014432907104, + "step": 243 + }, + { + "epoch": 0.1593599477508368, + "grad_norm": 7.341841973123469, + "learning_rate": 1.9789931723094044e-07, + "logits/chosen": -1.0291013717651367, + "logits/rejected": -0.9565322399139404, + "logps/chosen": -612.6153564453125, + "logps/rejected": -646.62646484375, + "loss": 0.58, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5041933059692383, + "rewards/margins": 0.3266003727912903, + "rewards/rejected": -1.8307936191558838, + "step": 244 + }, + { + "epoch": 0.16001306229079926, + "grad_norm": 6.979064593900016, + "learning_rate": 1.9785254477296926e-07, + "logits/chosen": -0.9677096009254456, + "logits/rejected": -0.9133827686309814, + "logps/chosen": -586.70849609375, + "logps/rejected": -601.3690185546875, + "loss": 0.5816, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4074910879135132, + "rewards/margins": 0.4098448157310486, + "rewards/rejected": -1.817335844039917, + "step": 245 + }, + { + "epoch": 0.1606661768307617, + "grad_norm": 7.344334329816908, + "learning_rate": 1.978052629796693e-07, + "logits/chosen": -0.9641053080558777, + "logits/rejected": -0.9103111028671265, + "logps/chosen": -622.2855834960938, + "logps/rejected": -630.8062133789062, + "loss": 0.6178, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5755188465118408, + "rewards/margins": 0.21382541954517365, + "rewards/rejected": -1.7893444299697876, + "step": 246 + }, + { + "epoch": 0.16131929137072415, + "grad_norm": 7.615942815150963, + "learning_rate": 1.9775747209714844e-07, + "logits/chosen": -0.9888575077056885, + "logits/rejected": -0.9431308507919312, + "logps/chosen": -653.2548217773438, + "logps/rejected": -701.7452392578125, + "loss": 0.5962, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7049720287322998, + "rewards/margins": 0.26299968361854553, + "rewards/rejected": -1.9679718017578125, + "step": 247 + }, + { + "epoch": 0.1619724059106866, + "grad_norm": 7.121943628659046, + "learning_rate": 1.9770917237416458e-07, + "logits/chosen": -0.943242073059082, + "logits/rejected": -0.956061601638794, + "logps/chosen": -577.7714233398438, + "logps/rejected": -563.3513793945312, + "loss": 0.6032, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5149903297424316, + "rewards/margins": 0.162530779838562, + "rewards/rejected": -1.6775211095809937, + "step": 248 + }, + { + "epoch": 0.16262552045064904, + "grad_norm": 7.460563232555389, + "learning_rate": 1.9766036406212402e-07, + "logits/chosen": -0.8586480617523193, + "logits/rejected": -0.7898170948028564, + "logps/chosen": -577.0723876953125, + "logps/rejected": -589.3902587890625, + "loss": 0.5677, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3534388542175293, + "rewards/margins": 0.28514593839645386, + "rewards/rejected": -1.638584852218628, + "step": 249 + }, + { + "epoch": 0.1632786349906115, + "grad_norm": 8.27330500287747, + "learning_rate": 1.9761104741508055e-07, + "logits/chosen": -0.859459400177002, + "logits/rejected": -0.8584420680999756, + "logps/chosen": -660.7594604492188, + "logps/rejected": -681.0894165039062, + "loss": 0.604, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.583940029144287, + "rewards/margins": 0.39344677329063416, + "rewards/rejected": -1.9773868322372437, + "step": 250 + }, + { + "epoch": 0.16393174953057393, + "grad_norm": 7.994745849819422, + "learning_rate": 1.9756122268973368e-07, + "logits/chosen": -1.1016628742218018, + "logits/rejected": -1.0982387065887451, + "logps/chosen": -659.254638671875, + "logps/rejected": -722.5975952148438, + "loss": 0.5555, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6833679676055908, + "rewards/margins": 0.5387370586395264, + "rewards/rejected": -2.2221052646636963, + "step": 251 + }, + { + "epoch": 0.16458486407053638, + "grad_norm": 8.509706028882437, + "learning_rate": 1.9751089014542767e-07, + "logits/chosen": -1.010880947113037, + "logits/rejected": -1.0128508806228638, + "logps/chosen": -672.9129638671875, + "logps/rejected": -704.9036254882812, + "loss": 0.6262, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5938973426818848, + "rewards/margins": 0.4070626199245453, + "rewards/rejected": -2.000959873199463, + "step": 252 + }, + { + "epoch": 0.16523797861049883, + "grad_norm": 8.068629943543698, + "learning_rate": 1.9746005004415002e-07, + "logits/chosen": -0.955176055431366, + "logits/rejected": -0.9755375981330872, + "logps/chosen": -651.1551513671875, + "logps/rejected": -893.3212280273438, + "loss": 0.5303, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.529944658279419, + "rewards/margins": 1.257524013519287, + "rewards/rejected": -2.787468910217285, + "step": 253 + }, + { + "epoch": 0.16589109315046127, + "grad_norm": 10.311573848019473, + "learning_rate": 1.9740870265053011e-07, + "logits/chosen": -0.9392127990722656, + "logits/rejected": -0.9196099042892456, + "logps/chosen": -640.4437866210938, + "logps/rejected": -758.6360473632812, + "loss": 0.597, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6619254350662231, + "rewards/margins": 0.6193193793296814, + "rewards/rejected": -2.2812447547912598, + "step": 254 + }, + { + "epoch": 0.16654420769042372, + "grad_norm": 7.676564230554778, + "learning_rate": 1.9735684823183786e-07, + "logits/chosen": -1.03840970993042, + "logits/rejected": -0.9937460422515869, + "logps/chosen": -608.6917724609375, + "logps/rejected": -635.8554077148438, + "loss": 0.5846, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6807466745376587, + "rewards/margins": 0.34590983390808105, + "rewards/rejected": -2.02665638923645, + "step": 255 + }, + { + "epoch": 0.16719732223038616, + "grad_norm": 7.630789747584126, + "learning_rate": 1.9730448705798237e-07, + "logits/chosen": -0.9532262086868286, + "logits/rejected": -0.8471428155899048, + "logps/chosen": -715.9351806640625, + "logps/rejected": -676.1060180664062, + "loss": 0.5836, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.81856369972229, + "rewards/margins": 0.22772595286369324, + "rewards/rejected": -2.0462896823883057, + "step": 256 + }, + { + "epoch": 0.1678504367703486, + "grad_norm": 7.996225765356348, + "learning_rate": 1.9725161940151037e-07, + "logits/chosen": -1.0379149913787842, + "logits/rejected": -0.9661443829536438, + "logps/chosen": -589.5501098632812, + "logps/rejected": -574.7515869140625, + "loss": 0.5762, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6499119997024536, + "rewards/margins": 0.21118004620075226, + "rewards/rejected": -1.8610920906066895, + "step": 257 + }, + { + "epoch": 0.16850355131031106, + "grad_norm": 7.457105463708072, + "learning_rate": 1.9719824553760493e-07, + "logits/chosen": -0.976207435131073, + "logits/rejected": -0.939882218837738, + "logps/chosen": -630.826171875, + "logps/rejected": -711.6494750976562, + "loss": 0.5239, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.785365104675293, + "rewards/margins": 0.5453168153762817, + "rewards/rejected": -2.330681800842285, + "step": 258 + }, + { + "epoch": 0.1691566658502735, + "grad_norm": 7.10936396957504, + "learning_rate": 1.9714436574408404e-07, + "logits/chosen": -1.0234023332595825, + "logits/rejected": -1.0151737928390503, + "logps/chosen": -654.0241088867188, + "logps/rejected": -693.206298828125, + "loss": 0.5556, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.698794960975647, + "rewards/margins": 0.47481226921081543, + "rewards/rejected": -2.173607349395752, + "step": 259 + }, + { + "epoch": 0.16980978039023595, + "grad_norm": 12.120889100254757, + "learning_rate": 1.970899803013991e-07, + "logits/chosen": -0.9613388776779175, + "logits/rejected": -0.9475601315498352, + "logps/chosen": -721.533203125, + "logps/rejected": -764.3331909179688, + "loss": 0.6242, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.249241352081299, + "rewards/margins": 0.5473915338516235, + "rewards/rejected": -2.796632766723633, + "step": 260 + }, + { + "epoch": 0.1704628949301984, + "grad_norm": 8.375507399375959, + "learning_rate": 1.9703508949263343e-07, + "logits/chosen": -0.9917237758636475, + "logits/rejected": -0.974717915058136, + "logps/chosen": -657.542724609375, + "logps/rejected": -657.2991333007812, + "loss": 0.5991, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.8568816184997559, + "rewards/margins": 0.41493868827819824, + "rewards/rejected": -2.271820068359375, + "step": 261 + }, + { + "epoch": 0.17111600947016084, + "grad_norm": 8.620447456913936, + "learning_rate": 1.9697969360350095e-07, + "logits/chosen": -0.9772311449050903, + "logits/rejected": -0.9717585444450378, + "logps/chosen": -563.0853271484375, + "logps/rejected": -628.078125, + "loss": 0.5931, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.7189618349075317, + "rewards/margins": 0.3262416422367096, + "rewards/rejected": -2.045203447341919, + "step": 262 + }, + { + "epoch": 0.1717691240101233, + "grad_norm": 8.622090097281658, + "learning_rate": 1.9692379292234446e-07, + "logits/chosen": -1.1292492151260376, + "logits/rejected": -1.065726399421692, + "logps/chosen": -713.7160034179688, + "logps/rejected": -700.1644287109375, + "loss": 0.5657, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.874625563621521, + "rewards/margins": 0.375280499458313, + "rewards/rejected": -2.249905824661255, + "step": 263 + }, + { + "epoch": 0.17242223855008573, + "grad_norm": 8.32964680659536, + "learning_rate": 1.9686738774013438e-07, + "logits/chosen": -0.9963126182556152, + "logits/rejected": -0.9847568273544312, + "logps/chosen": -616.8109130859375, + "logps/rejected": -623.2798461914062, + "loss": 0.5323, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5587760210037231, + "rewards/margins": 0.21900975704193115, + "rewards/rejected": -1.7777857780456543, + "step": 264 + }, + { + "epoch": 0.17307535309004818, + "grad_norm": 6.771586579633424, + "learning_rate": 1.9681047835046707e-07, + "logits/chosen": -1.026088833808899, + "logits/rejected": -0.977704644203186, + "logps/chosen": -680.363525390625, + "logps/rejected": -708.6707153320312, + "loss": 0.5194, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.8157883882522583, + "rewards/margins": 0.44769570231437683, + "rewards/rejected": -2.263484001159668, + "step": 265 + }, + { + "epoch": 0.17372846763001062, + "grad_norm": 7.5583512580487495, + "learning_rate": 1.9675306504956338e-07, + "logits/chosen": -0.9970998167991638, + "logits/rejected": -0.9781684279441833, + "logps/chosen": -659.970947265625, + "logps/rejected": -856.7073974609375, + "loss": 0.5688, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.790527582168579, + "rewards/margins": 1.0892386436462402, + "rewards/rejected": -2.8797662258148193, + "step": 266 + }, + { + "epoch": 0.17438158216997307, + "grad_norm": 9.147249940817659, + "learning_rate": 1.9669514813626704e-07, + "logits/chosen": -0.8966047167778015, + "logits/rejected": -0.8831186890602112, + "logps/chosen": -675.2528686523438, + "logps/rejected": -670.5457763671875, + "loss": 0.5648, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.9768502712249756, + "rewards/margins": 0.2957232892513275, + "rewards/rejected": -2.272573471069336, + "step": 267 + }, + { + "epoch": 0.17503469670993552, + "grad_norm": 7.702705924911228, + "learning_rate": 1.9663672791204323e-07, + "logits/chosen": -1.0310410261154175, + "logits/rejected": -0.9511358141899109, + "logps/chosen": -653.04296875, + "logps/rejected": -671.1371459960938, + "loss": 0.5649, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9696815013885498, + "rewards/margins": 0.4211324155330658, + "rewards/rejected": -2.3908138275146484, + "step": 268 + }, + { + "epoch": 0.17568781124989796, + "grad_norm": 7.604828455223188, + "learning_rate": 1.9657780468097683e-07, + "logits/chosen": -0.9046274423599243, + "logits/rejected": -0.8803516030311584, + "logps/chosen": -581.923828125, + "logps/rejected": -623.5220947265625, + "loss": 0.5438, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5920655727386475, + "rewards/margins": 0.4273078441619873, + "rewards/rejected": -2.0193734169006348, + "step": 269 + }, + { + "epoch": 0.1763409257898604, + "grad_norm": 9.076618915096422, + "learning_rate": 1.96518378749771e-07, + "logits/chosen": -0.9741644859313965, + "logits/rejected": -0.9750419855117798, + "logps/chosen": -733.3799438476562, + "logps/rejected": -751.2376708984375, + "loss": 0.6095, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.264136552810669, + "rewards/margins": 0.37653613090515137, + "rewards/rejected": -2.6406726837158203, + "step": 270 + }, + { + "epoch": 0.17699404032982285, + "grad_norm": 9.37938779675179, + "learning_rate": 1.964584504277455e-07, + "logits/chosen": -1.0458801984786987, + "logits/rejected": -0.9988090395927429, + "logps/chosen": -671.9390258789062, + "logps/rejected": -693.88818359375, + "loss": 0.5901, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9275598526000977, + "rewards/margins": 0.33395007252693176, + "rewards/rejected": -2.261509895324707, + "step": 271 + }, + { + "epoch": 0.1776471548697853, + "grad_norm": 7.552555225756131, + "learning_rate": 1.9639802002683514e-07, + "logits/chosen": -0.946459949016571, + "logits/rejected": -0.892015278339386, + "logps/chosen": -660.0249633789062, + "logps/rejected": -649.5670166015625, + "loss": 0.5484, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8262689113616943, + "rewards/margins": 0.5144949555397034, + "rewards/rejected": -2.340763568878174, + "step": 272 + }, + { + "epoch": 0.17830026940974775, + "grad_norm": 8.342988224658326, + "learning_rate": 1.9633708786158803e-07, + "logits/chosen": -0.973565936088562, + "logits/rejected": -0.9508368968963623, + "logps/chosen": -704.4890747070312, + "logps/rejected": -710.4368896484375, + "loss": 0.5307, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.024822950363159, + "rewards/margins": 0.5465009212493896, + "rewards/rejected": -2.571324110031128, + "step": 273 + }, + { + "epoch": 0.1789533839497102, + "grad_norm": 8.120756554883108, + "learning_rate": 1.962756542491641e-07, + "logits/chosen": -0.8403066396713257, + "logits/rejected": -0.8492639660835266, + "logps/chosen": -621.307373046875, + "logps/rejected": -661.5020141601562, + "loss": 0.5477, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9450585842132568, + "rewards/margins": 0.508897066116333, + "rewards/rejected": -2.4539551734924316, + "step": 274 + }, + { + "epoch": 0.17960649848967264, + "grad_norm": 8.2187266045248, + "learning_rate": 1.962137195093334e-07, + "logits/chosen": -0.9613364934921265, + "logits/rejected": -0.938071608543396, + "logps/chosen": -661.04052734375, + "logps/rejected": -709.01171875, + "loss": 0.57, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1429219245910645, + "rewards/margins": 0.5459993481636047, + "rewards/rejected": -2.6889214515686035, + "step": 275 + }, + { + "epoch": 0.18025961302963509, + "grad_norm": 7.73778880473398, + "learning_rate": 1.9615128396447432e-07, + "logits/chosen": -0.9699392318725586, + "logits/rejected": -0.9538300633430481, + "logps/chosen": -690.2393798828125, + "logps/rejected": -724.1671142578125, + "loss": 0.5256, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.015857696533203, + "rewards/margins": 0.4910595118999481, + "rewards/rejected": -2.5069172382354736, + "step": 276 + }, + { + "epoch": 0.18091272756959753, + "grad_norm": 7.911509339012901, + "learning_rate": 1.9608834793957208e-07, + "logits/chosen": -0.9603309631347656, + "logits/rejected": -0.9850507378578186, + "logps/chosen": -765.2130126953125, + "logps/rejected": -842.7460327148438, + "loss": 0.5647, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.3112220764160156, + "rewards/margins": 0.5749973654747009, + "rewards/rejected": -2.8862195014953613, + "step": 277 + }, + { + "epoch": 0.18156584210955998, + "grad_norm": 8.967748933935127, + "learning_rate": 1.9602491176221695e-07, + "logits/chosen": -0.9512305855751038, + "logits/rejected": -0.9300289154052734, + "logps/chosen": -670.3013305664062, + "logps/rejected": -727.5516357421875, + "loss": 0.5245, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9914320707321167, + "rewards/margins": 0.5332896113395691, + "rewards/rejected": -2.524721622467041, + "step": 278 + }, + { + "epoch": 0.18221895664952242, + "grad_norm": 8.734148197725288, + "learning_rate": 1.9596097576260253e-07, + "logits/chosen": -0.9387493133544922, + "logits/rejected": -0.9077507853507996, + "logps/chosen": -765.1206665039062, + "logps/rejected": -880.9041137695312, + "loss": 0.5773, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2102487087249756, + "rewards/margins": 1.0287326574325562, + "rewards/rejected": -3.238981246948242, + "step": 279 + }, + { + "epoch": 0.18287207118948487, + "grad_norm": 8.15049441196506, + "learning_rate": 1.9589654027352411e-07, + "logits/chosen": -0.999761700630188, + "logits/rejected": -0.9837722778320312, + "logps/chosen": -709.5980224609375, + "logps/rejected": -741.883544921875, + "loss": 0.5495, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2854127883911133, + "rewards/margins": 0.4614034593105316, + "rewards/rejected": -2.746816396713257, + "step": 280 + }, + { + "epoch": 0.18352518572944732, + "grad_norm": 10.6404743180184, + "learning_rate": 1.9583160563037687e-07, + "logits/chosen": -0.7266749143600464, + "logits/rejected": -0.6888471841812134, + "logps/chosen": -661.6748657226562, + "logps/rejected": -709.5576171875, + "loss": 0.5863, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.1588385105133057, + "rewards/margins": 0.5338287353515625, + "rewards/rejected": -2.692667245864868, + "step": 281 + }, + { + "epoch": 0.18417830026940976, + "grad_norm": 9.00089638171536, + "learning_rate": 1.957661721711541e-07, + "logits/chosen": -0.8879116773605347, + "logits/rejected": -0.8716956377029419, + "logps/chosen": -693.7220458984375, + "logps/rejected": -783.8416748046875, + "loss": 0.6046, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3302977085113525, + "rewards/margins": 0.6564316749572754, + "rewards/rejected": -2.986729145050049, + "step": 282 + }, + { + "epoch": 0.18483141480937218, + "grad_norm": 8.397472630305604, + "learning_rate": 1.9570024023644555e-07, + "logits/chosen": -0.9611495137214661, + "logits/rejected": -0.9361308813095093, + "logps/chosen": -715.2149047851562, + "logps/rejected": -756.7767333984375, + "loss": 0.5468, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.1140494346618652, + "rewards/margins": 0.8066607117652893, + "rewards/rejected": -2.9207100868225098, + "step": 283 + }, + { + "epoch": 0.18548452934933463, + "grad_norm": 8.115486395877507, + "learning_rate": 1.9563381016943562e-07, + "logits/chosen": -0.9859124422073364, + "logits/rejected": -0.9830190539360046, + "logps/chosen": -775.6683959960938, + "logps/rejected": -819.667724609375, + "loss": 0.5704, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.2429869174957275, + "rewards/margins": 0.4830154776573181, + "rewards/rejected": -2.7260022163391113, + "step": 284 + }, + { + "epoch": 0.18613764388929707, + "grad_norm": 9.071986066041832, + "learning_rate": 1.9556688231590148e-07, + "logits/chosen": -0.8561621904373169, + "logits/rejected": -0.8536893129348755, + "logps/chosen": -665.8601684570312, + "logps/rejected": -814.3583374023438, + "loss": 0.5968, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.13525652885437, + "rewards/margins": 0.8098075985908508, + "rewards/rejected": -2.945064067840576, + "step": 285 + }, + { + "epoch": 0.18679075842925952, + "grad_norm": 9.258588546101418, + "learning_rate": 1.9549945702421142e-07, + "logits/chosen": -0.8817156553268433, + "logits/rejected": -0.8830188512802124, + "logps/chosen": -679.46044921875, + "logps/rejected": -752.0320434570312, + "loss": 0.537, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.2376911640167236, + "rewards/margins": 0.5545761585235596, + "rewards/rejected": -2.792267322540283, + "step": 286 + }, + { + "epoch": 0.18744387296922196, + "grad_norm": 8.056969647323232, + "learning_rate": 1.9543153464532288e-07, + "logits/chosen": -0.7749789357185364, + "logits/rejected": -0.794037401676178, + "logps/chosen": -639.8425903320312, + "logps/rejected": -680.3607788085938, + "loss": 0.5139, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8591387271881104, + "rewards/margins": 0.4303709864616394, + "rewards/rejected": -2.2895097732543945, + "step": 287 + }, + { + "epoch": 0.1880969875091844, + "grad_norm": 7.563536225052495, + "learning_rate": 1.9536311553278083e-07, + "logits/chosen": -0.9034614562988281, + "logits/rejected": -0.8050072193145752, + "logps/chosen": -599.6620483398438, + "logps/rejected": -571.1824951171875, + "loss": 0.5816, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0411486625671387, + "rewards/margins": 0.15406599640846252, + "rewards/rejected": -2.1952145099639893, + "step": 288 + }, + { + "epoch": 0.18875010204914686, + "grad_norm": 8.136613787281412, + "learning_rate": 1.9529420004271567e-07, + "logits/chosen": -0.886310338973999, + "logits/rejected": -0.9234850406646729, + "logps/chosen": -753.2724609375, + "logps/rejected": -823.5413208007812, + "loss": 0.5271, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.702235221862793, + "rewards/margins": 0.4669128954410553, + "rewards/rejected": -3.1691482067108154, + "step": 289 + }, + { + "epoch": 0.1894032165891093, + "grad_norm": 8.155433466668587, + "learning_rate": 1.952247885338415e-07, + "logits/chosen": -0.8842246532440186, + "logits/rejected": -0.9555231332778931, + "logps/chosen": -596.1199340820312, + "logps/rejected": -699.538330078125, + "loss": 0.5528, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0811939239501953, + "rewards/margins": 0.7108913660049438, + "rewards/rejected": -2.7920849323272705, + "step": 290 + }, + { + "epoch": 0.19005633112907175, + "grad_norm": 11.089638112757227, + "learning_rate": 1.9515488136745445e-07, + "logits/chosen": -0.9472813010215759, + "logits/rejected": -0.8430065512657166, + "logps/chosen": -725.784912109375, + "logps/rejected": -711.8215942382812, + "loss": 0.5915, + "rewards/accuracies": 0.46875, + "rewards/chosen": -2.388014793395996, + "rewards/margins": 0.0675196647644043, + "rewards/rejected": -2.4555346965789795, + "step": 291 + }, + { + "epoch": 0.1907094456690342, + "grad_norm": 7.820461974964933, + "learning_rate": 1.9508447890743046e-07, + "logits/chosen": -0.9438467621803284, + "logits/rejected": -0.9221634864807129, + "logps/chosen": -731.6463012695312, + "logps/rejected": -786.1243896484375, + "loss": 0.508, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3206136226654053, + "rewards/margins": 0.6845057010650635, + "rewards/rejected": -3.0051193237304688, + "step": 292 + }, + { + "epoch": 0.19136256020899664, + "grad_norm": 8.708040081311037, + "learning_rate": 1.9501358152022349e-07, + "logits/chosen": -0.926476240158081, + "logits/rejected": -0.9502461552619934, + "logps/chosen": -659.2603149414062, + "logps/rejected": -810.6748046875, + "loss": 0.5761, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1894261837005615, + "rewards/margins": 0.9734176397323608, + "rewards/rejected": -3.162843704223633, + "step": 293 + }, + { + "epoch": 0.1920156747489591, + "grad_norm": 7.970091802437382, + "learning_rate": 1.949421895748638e-07, + "logits/chosen": -0.9131462574005127, + "logits/rejected": -0.9216738939285278, + "logps/chosen": -757.0573120117188, + "logps/rejected": -837.2061767578125, + "loss": 0.561, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.317478656768799, + "rewards/margins": 0.5989239811897278, + "rewards/rejected": -2.916402578353882, + "step": 294 + }, + { + "epoch": 0.19266878928892153, + "grad_norm": 10.606953825436584, + "learning_rate": 1.9487030344295584e-07, + "logits/chosen": -0.9458974003791809, + "logits/rejected": -0.911933183670044, + "logps/chosen": -795.9569091796875, + "logps/rejected": -945.5296020507812, + "loss": 0.5062, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.4541730880737305, + "rewards/margins": 0.7940478324890137, + "rewards/rejected": -3.248220920562744, + "step": 295 + }, + { + "epoch": 0.19332190382888398, + "grad_norm": 7.8610409749854675, + "learning_rate": 1.947979234986763e-07, + "logits/chosen": -1.084223985671997, + "logits/rejected": -1.0780235528945923, + "logps/chosen": -672.0103759765625, + "logps/rejected": -736.9426879882812, + "loss": 0.4731, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2389864921569824, + "rewards/margins": 0.7914223670959473, + "rewards/rejected": -3.0304088592529297, + "step": 296 + }, + { + "epoch": 0.19397501836884642, + "grad_norm": 7.916708965496982, + "learning_rate": 1.9472505011877235e-07, + "logits/chosen": -0.839238703250885, + "logits/rejected": -0.8083611130714417, + "logps/chosen": -606.4061279296875, + "logps/rejected": -644.0174560546875, + "loss": 0.5493, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.8088489770889282, + "rewards/margins": 0.4753211736679077, + "rewards/rejected": -2.284170150756836, + "step": 297 + }, + { + "epoch": 0.19462813290880887, + "grad_norm": 7.45863666364653, + "learning_rate": 1.9465168368255945e-07, + "logits/chosen": -0.9710556268692017, + "logits/rejected": -0.9412792921066284, + "logps/chosen": -696.9168090820312, + "logps/rejected": -764.7929077148438, + "loss": 0.5276, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5390238761901855, + "rewards/margins": 0.6176880598068237, + "rewards/rejected": -3.1567115783691406, + "step": 298 + }, + { + "epoch": 0.19528124744877132, + "grad_norm": 8.63941851768535, + "learning_rate": 1.9457782457191949e-07, + "logits/chosen": -0.9380285739898682, + "logits/rejected": -0.9279670119285583, + "logps/chosen": -657.2073364257812, + "logps/rejected": -776.177490234375, + "loss": 0.5322, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1041078567504883, + "rewards/margins": 0.8005395531654358, + "rewards/rejected": -2.9046473503112793, + "step": 299 + }, + { + "epoch": 0.19593436198873376, + "grad_norm": 10.905449832753915, + "learning_rate": 1.9450347317129891e-07, + "logits/chosen": -0.8918706178665161, + "logits/rejected": -0.8934108018875122, + "logps/chosen": -648.9880981445312, + "logps/rejected": -658.7398681640625, + "loss": 0.5705, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.2292122840881348, + "rewards/margins": 0.24789035320281982, + "rewards/rejected": -2.477102518081665, + "step": 300 + }, + { + "epoch": 0.19593436198873376, + "eval_logits/chosen": -0.8229660391807556, + "eval_logits/rejected": -0.7893882989883423, + "eval_logps/chosen": -721.2881469726562, + "eval_logps/rejected": -765.6893920898438, + "eval_loss": 0.5544829368591309, + "eval_rewards/accuracies": 0.7269999980926514, + "eval_rewards/chosen": -2.473808765411377, + "eval_rewards/margins": 0.5313993096351624, + "eval_rewards/rejected": -3.0052082538604736, + "eval_runtime": 618.0515, + "eval_samples_per_second": 6.472, + "eval_steps_per_second": 0.404, + "step": 300 + }, + { + "epoch": 0.1965874765286962, + "grad_norm": 7.659026133135656, + "learning_rate": 1.9442862986770643e-07, + "logits/chosen": -0.9449422359466553, + "logits/rejected": -0.8925285935401917, + "logps/chosen": -697.9733276367188, + "logps/rejected": -778.6115112304688, + "loss": 0.508, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2272863388061523, + "rewards/margins": 0.7337895631790161, + "rewards/rejected": -2.961075782775879, + "step": 301 + }, + { + "epoch": 0.19724059106865865, + "grad_norm": 10.725925196101898, + "learning_rate": 1.943532950507113e-07, + "logits/chosen": -0.9608031511306763, + "logits/rejected": -0.9174195528030396, + "logps/chosen": -821.3086547851562, + "logps/rejected": -875.5884399414062, + "loss": 0.5612, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8072926998138428, + "rewards/margins": 0.6858442425727844, + "rewards/rejected": -3.4931368827819824, + "step": 302 + }, + { + "epoch": 0.1978937056086211, + "grad_norm": 8.209549535542726, + "learning_rate": 1.9427746911244113e-07, + "logits/chosen": -1.0311881303787231, + "logits/rejected": -0.9527757167816162, + "logps/chosen": -755.90478515625, + "logps/rejected": -760.92529296875, + "loss": 0.5497, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.6526520252227783, + "rewards/margins": 0.47115930914878845, + "rewards/rejected": -3.1238112449645996, + "step": 303 + }, + { + "epoch": 0.19854682014858355, + "grad_norm": 8.394826636042648, + "learning_rate": 1.942011524475798e-07, + "logits/chosen": -0.8262045383453369, + "logits/rejected": -0.894850492477417, + "logps/chosen": -703.4110107421875, + "logps/rejected": -817.5936279296875, + "loss": 0.4896, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3225224018096924, + "rewards/margins": 0.6646079421043396, + "rewards/rejected": -2.9871304035186768, + "step": 304 + }, + { + "epoch": 0.199199934688546, + "grad_norm": 8.107673849474413, + "learning_rate": 1.9412434545336566e-07, + "logits/chosen": -0.9343682527542114, + "logits/rejected": -0.9112167954444885, + "logps/chosen": -664.5975952148438, + "logps/rejected": -693.5718383789062, + "loss": 0.579, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.497659683227539, + "rewards/margins": 0.25632160902023315, + "rewards/rejected": -2.753981828689575, + "step": 305 + }, + { + "epoch": 0.19985304922850844, + "grad_norm": 9.022343657029202, + "learning_rate": 1.9404704852958912e-07, + "logits/chosen": -0.9269659519195557, + "logits/rejected": -0.8933895826339722, + "logps/chosen": -639.3394165039062, + "logps/rejected": -709.7356567382812, + "loss": 0.5264, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.0649757385253906, + "rewards/margins": 0.6906264424324036, + "rewards/rejected": -2.7556021213531494, + "step": 306 + }, + { + "epoch": 0.20050616376847089, + "grad_norm": 7.583513266351151, + "learning_rate": 1.9396926207859085e-07, + "logits/chosen": -0.7031735181808472, + "logits/rejected": -0.6803351640701294, + "logps/chosen": -688.1170043945312, + "logps/rejected": -824.4000854492188, + "loss": 0.5313, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5198776721954346, + "rewards/margins": 0.7780323624610901, + "rewards/rejected": -3.29790997505188, + "step": 307 + }, + { + "epoch": 0.20115927830843333, + "grad_norm": 8.379739148957032, + "learning_rate": 1.9389098650525947e-07, + "logits/chosen": -0.8177533149719238, + "logits/rejected": -0.8568516969680786, + "logps/chosen": -676.4760131835938, + "logps/rejected": -744.55126953125, + "loss": 0.509, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.2387030124664307, + "rewards/margins": 0.5821399092674255, + "rewards/rejected": -2.820842742919922, + "step": 308 + }, + { + "epoch": 0.20181239284839578, + "grad_norm": 8.173710195620405, + "learning_rate": 1.9381222221702967e-07, + "logits/chosen": -0.9144182205200195, + "logits/rejected": -0.9687687158584595, + "logps/chosen": -734.7119750976562, + "logps/rejected": -1004.3690795898438, + "loss": 0.5303, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.554551362991333, + "rewards/margins": 1.2628813982009888, + "rewards/rejected": -3.8174331188201904, + "step": 309 + }, + { + "epoch": 0.20246550738835822, + "grad_norm": 7.9552125591432485, + "learning_rate": 1.9373296962387984e-07, + "logits/chosen": -0.9191279411315918, + "logits/rejected": -0.8958581686019897, + "logps/chosen": -601.743896484375, + "logps/rejected": -661.82421875, + "loss": 0.5474, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.0206687450408936, + "rewards/margins": 0.6330419182777405, + "rewards/rejected": -2.65371036529541, + "step": 310 + }, + { + "epoch": 0.20311862192832067, + "grad_norm": 7.980624051483625, + "learning_rate": 1.9365322913833015e-07, + "logits/chosen": -0.866454005241394, + "logits/rejected": -0.7789896726608276, + "logps/chosen": -754.0518188476562, + "logps/rejected": -765.2423706054688, + "loss": 0.5546, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.443909168243408, + "rewards/margins": 0.3114027678966522, + "rewards/rejected": -2.755311965942383, + "step": 311 + }, + { + "epoch": 0.20377173646828312, + "grad_norm": 7.33860819487254, + "learning_rate": 1.935730011754403e-07, + "logits/chosen": -0.8991135358810425, + "logits/rejected": -0.9318748712539673, + "logps/chosen": -720.3719482421875, + "logps/rejected": -895.6969604492188, + "loss": 0.5255, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.2463953495025635, + "rewards/margins": 1.159752607345581, + "rewards/rejected": -3.4061479568481445, + "step": 312 + }, + { + "epoch": 0.20442485100824556, + "grad_norm": 9.418850525971324, + "learning_rate": 1.9349228615280734e-07, + "logits/chosen": -0.8775783777236938, + "logits/rejected": -0.8939123153686523, + "logps/chosen": -679.7825927734375, + "logps/rejected": -848.8208618164062, + "loss": 0.4997, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3244364261627197, + "rewards/margins": 1.083613634109497, + "rewards/rejected": -3.4080498218536377, + "step": 313 + }, + { + "epoch": 0.205077965548208, + "grad_norm": 8.778963808667314, + "learning_rate": 1.9341108449056358e-07, + "logits/chosen": -0.960693359375, + "logits/rejected": -1.0465654134750366, + "logps/chosen": -718.642333984375, + "logps/rejected": -837.5458984375, + "loss": 0.5544, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5699141025543213, + "rewards/margins": 0.5482870936393738, + "rewards/rejected": -3.118201494216919, + "step": 314 + }, + { + "epoch": 0.20573108008817045, + "grad_norm": 8.973296115737988, + "learning_rate": 1.9332939661137425e-07, + "logits/chosen": -0.9772869348526001, + "logits/rejected": -0.9910680055618286, + "logps/chosen": -700.7576904296875, + "logps/rejected": -708.429931640625, + "loss": 0.5846, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.4407153129577637, + "rewards/margins": 0.2659532427787781, + "rewards/rejected": -2.7066686153411865, + "step": 315 + }, + { + "epoch": 0.2063841946281329, + "grad_norm": 11.761985597145776, + "learning_rate": 1.9324722294043556e-07, + "logits/chosen": -0.9107195138931274, + "logits/rejected": -0.8959240317344666, + "logps/chosen": -743.5145263671875, + "logps/rejected": -857.8228149414062, + "loss": 0.5304, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.518425941467285, + "rewards/margins": 0.9631866812705994, + "rewards/rejected": -3.48161244392395, + "step": 316 + }, + { + "epoch": 0.20703730916809535, + "grad_norm": 9.491575896290264, + "learning_rate": 1.931645639054722e-07, + "logits/chosen": -0.9179579019546509, + "logits/rejected": -0.9592142105102539, + "logps/chosen": -705.681396484375, + "logps/rejected": -892.9383544921875, + "loss": 0.5062, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5186941623687744, + "rewards/margins": 1.0660593509674072, + "rewards/rejected": -3.5847535133361816, + "step": 317 + }, + { + "epoch": 0.2076904237080578, + "grad_norm": 8.630378132342063, + "learning_rate": 1.930814199367353e-07, + "logits/chosen": -0.8748406171798706, + "logits/rejected": -0.9312347769737244, + "logps/chosen": -766.6427001953125, + "logps/rejected": -819.1407470703125, + "loss": 0.5132, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.654548406600952, + "rewards/margins": 0.5859209299087524, + "rewards/rejected": -3.240469455718994, + "step": 318 + }, + { + "epoch": 0.20834353824802024, + "grad_norm": 9.313105997470586, + "learning_rate": 1.9299779146700008e-07, + "logits/chosen": -0.8464125990867615, + "logits/rejected": -0.8674778938293457, + "logps/chosen": -749.940185546875, + "logps/rejected": -874.630859375, + "loss": 0.5174, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.640587091445923, + "rewards/margins": 0.7609758973121643, + "rewards/rejected": -3.4015626907348633, + "step": 319 + }, + { + "epoch": 0.20899665278798268, + "grad_norm": 8.761272107103547, + "learning_rate": 1.9291367893156374e-07, + "logits/chosen": -0.8507086634635925, + "logits/rejected": -0.8460374474525452, + "logps/chosen": -706.5238647460938, + "logps/rejected": -763.0086669921875, + "loss": 0.5502, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.7435646057128906, + "rewards/margins": 0.4100922644138336, + "rewards/rejected": -3.1536569595336914, + "step": 320 + }, + { + "epoch": 0.20964976732794513, + "grad_norm": 8.645077439540554, + "learning_rate": 1.9282908276824305e-07, + "logits/chosen": -0.8159580230712891, + "logits/rejected": -0.8971022367477417, + "logps/chosen": -718.7769165039062, + "logps/rejected": -896.5993041992188, + "loss": 0.5023, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.4595208168029785, + "rewards/margins": 1.1103695631027222, + "rewards/rejected": -3.5698904991149902, + "step": 321 + }, + { + "epoch": 0.21030288186790758, + "grad_norm": 8.9608952505274, + "learning_rate": 1.927440034173721e-07, + "logits/chosen": -0.9348753690719604, + "logits/rejected": -0.9571008682250977, + "logps/chosen": -768.90673828125, + "logps/rejected": -839.931640625, + "loss": 0.5535, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.864488124847412, + "rewards/margins": 0.6067023873329163, + "rewards/rejected": -3.4711906909942627, + "step": 322 + }, + { + "epoch": 0.21095599640787002, + "grad_norm": 8.764928992953658, + "learning_rate": 1.9265844132180014e-07, + "logits/chosen": -0.937222957611084, + "logits/rejected": -0.9439467191696167, + "logps/chosen": -740.7075805664062, + "logps/rejected": -851.681396484375, + "loss": 0.5409, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.908351182937622, + "rewards/margins": 0.8476966023445129, + "rewards/rejected": -3.7560479640960693, + "step": 323 + }, + { + "epoch": 0.21160911094783247, + "grad_norm": 11.203248328192403, + "learning_rate": 1.9257239692688904e-07, + "logits/chosen": -0.9657084345817566, + "logits/rejected": -0.8316485285758972, + "logps/chosen": -710.1865234375, + "logps/rejected": -722.1703491210938, + "loss": 0.5703, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.541426420211792, + "rewards/margins": 0.4015898108482361, + "rewards/rejected": -2.943016290664673, + "step": 324 + }, + { + "epoch": 0.21226222548779491, + "grad_norm": 10.154285686767231, + "learning_rate": 1.9248587068051117e-07, + "logits/chosen": -0.9456658959388733, + "logits/rejected": -0.9287790060043335, + "logps/chosen": -759.39697265625, + "logps/rejected": -901.678955078125, + "loss": 0.5445, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.964029550552368, + "rewards/margins": 0.8283199071884155, + "rewards/rejected": -3.7923495769500732, + "step": 325 + }, + { + "epoch": 0.21291534002775736, + "grad_norm": 10.085226190171486, + "learning_rate": 1.92398863033047e-07, + "logits/chosen": -0.9480457901954651, + "logits/rejected": -0.8808225989341736, + "logps/chosen": -737.1611938476562, + "logps/rejected": -832.3869018554688, + "loss": 0.5297, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.4359898567199707, + "rewards/margins": 1.118359088897705, + "rewards/rejected": -3.5543487071990967, + "step": 326 + }, + { + "epoch": 0.2135684545677198, + "grad_norm": 8.037243703725244, + "learning_rate": 1.9231137443738273e-07, + "logits/chosen": -0.8650938868522644, + "logits/rejected": -0.7863737940788269, + "logps/chosen": -667.995361328125, + "logps/rejected": -802.8931274414062, + "loss": 0.4848, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.464874029159546, + "rewards/margins": 0.8920389413833618, + "rewards/rejected": -3.3569130897521973, + "step": 327 + }, + { + "epoch": 0.21422156910768225, + "grad_norm": 9.782455330939415, + "learning_rate": 1.92223405348908e-07, + "logits/chosen": -0.9213351011276245, + "logits/rejected": -0.861817479133606, + "logps/chosen": -762.0364990234375, + "logps/rejected": -799.6807250976562, + "loss": 0.5027, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.5765061378479004, + "rewards/margins": 0.7302671074867249, + "rewards/rejected": -3.3067734241485596, + "step": 328 + }, + { + "epoch": 0.2148746836476447, + "grad_norm": 8.275520691185628, + "learning_rate": 1.9213495622551346e-07, + "logits/chosen": -0.8799265027046204, + "logits/rejected": -0.9568032622337341, + "logps/chosen": -696.4993286132812, + "logps/rejected": -950.8281860351562, + "loss": 0.526, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.4880318641662598, + "rewards/margins": 1.6394169330596924, + "rewards/rejected": -4.127448558807373, + "step": 329 + }, + { + "epoch": 0.21552779818760714, + "grad_norm": 9.733494867655624, + "learning_rate": 1.9204602752758836e-07, + "logits/chosen": -0.8873504996299744, + "logits/rejected": -0.8919836282730103, + "logps/chosen": -682.7972412109375, + "logps/rejected": -753.762451171875, + "loss": 0.5733, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7004520893096924, + "rewards/margins": 0.7935549020767212, + "rewards/rejected": -3.494007110595703, + "step": 330 + }, + { + "epoch": 0.2161809127275696, + "grad_norm": 11.487422940603599, + "learning_rate": 1.9195661971801823e-07, + "logits/chosen": -0.8627737760543823, + "logits/rejected": -0.8270461559295654, + "logps/chosen": -752.065673828125, + "logps/rejected": -787.5745239257812, + "loss": 0.5432, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.530182361602783, + "rewards/margins": 0.5825455188751221, + "rewards/rejected": -3.1127278804779053, + "step": 331 + }, + { + "epoch": 0.21683402726753204, + "grad_norm": 9.536090726405337, + "learning_rate": 1.9186673326218252e-07, + "logits/chosen": -0.7737371325492859, + "logits/rejected": -0.8088192343711853, + "logps/chosen": -740.6680297851562, + "logps/rejected": -831.1947631835938, + "loss": 0.5135, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.6271743774414062, + "rewards/margins": 0.6504449844360352, + "rewards/rejected": -3.2776193618774414, + "step": 332 + }, + { + "epoch": 0.21748714180749448, + "grad_norm": 10.23713461294934, + "learning_rate": 1.9177636862795192e-07, + "logits/chosen": -0.8282246589660645, + "logits/rejected": -0.8120365142822266, + "logps/chosen": -678.1328735351562, + "logps/rejected": -745.9762573242188, + "loss": 0.5841, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.664412498474121, + "rewards/margins": 0.6008880734443665, + "rewards/rejected": -3.2653002738952637, + "step": 333 + }, + { + "epoch": 0.21814025634745693, + "grad_norm": 7.80838685357101, + "learning_rate": 1.9168552628568628e-07, + "logits/chosen": -0.917161226272583, + "logits/rejected": -1.018967628479004, + "logps/chosen": -695.7123413085938, + "logps/rejected": -829.2813720703125, + "loss": 0.5086, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2889459133148193, + "rewards/margins": 0.8900274634361267, + "rewards/rejected": -3.17897367477417, + "step": 334 + }, + { + "epoch": 0.21879337088741938, + "grad_norm": 8.958596685509493, + "learning_rate": 1.9159420670823185e-07, + "logits/chosen": -0.9185332655906677, + "logits/rejected": -0.870037317276001, + "logps/chosen": -707.0963745117188, + "logps/rejected": -741.2311401367188, + "loss": 0.5556, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3948493003845215, + "rewards/margins": 0.5061476230621338, + "rewards/rejected": -2.9009971618652344, + "step": 335 + }, + { + "epoch": 0.21944648542738182, + "grad_norm": 11.406917533980884, + "learning_rate": 1.9150241037091908e-07, + "logits/chosen": -0.9214222431182861, + "logits/rejected": -0.9454975724220276, + "logps/chosen": -763.0989379882812, + "logps/rejected": -757.4050903320312, + "loss": 0.5357, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7994232177734375, + "rewards/margins": 0.5790168642997742, + "rewards/rejected": -3.3784401416778564, + "step": 336 + }, + { + "epoch": 0.22009959996734427, + "grad_norm": 8.348133500220372, + "learning_rate": 1.9141013775155985e-07, + "logits/chosen": -0.8210875391960144, + "logits/rejected": -0.79030442237854, + "logps/chosen": -674.8720092773438, + "logps/rejected": -720.6124877929688, + "loss": 0.4822, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2248129844665527, + "rewards/margins": 0.6432361006736755, + "rewards/rejected": -2.868049144744873, + "step": 337 + }, + { + "epoch": 0.2207527145073067, + "grad_norm": 10.69630326702095, + "learning_rate": 1.913173893304453e-07, + "logits/chosen": -0.9009968638420105, + "logits/rejected": -0.9283880591392517, + "logps/chosen": -710.2175903320312, + "logps/rejected": -817.3650512695312, + "loss": 0.5991, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.6748363971710205, + "rewards/margins": 0.6548750400543213, + "rewards/rejected": -3.329711675643921, + "step": 338 + }, + { + "epoch": 0.22140582904726916, + "grad_norm": 8.33216110402213, + "learning_rate": 1.9122416559034314e-07, + "logits/chosen": -0.8454635143280029, + "logits/rejected": -0.7918909788131714, + "logps/chosen": -822.1680297851562, + "logps/rejected": -859.948486328125, + "loss": 0.5271, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.614701271057129, + "rewards/margins": 0.7649893164634705, + "rewards/rejected": -3.379690647125244, + "step": 339 + }, + { + "epoch": 0.2220589435872316, + "grad_norm": 9.685522044762283, + "learning_rate": 1.9113046701649514e-07, + "logits/chosen": -0.9304038882255554, + "logits/rejected": -0.8563180565834045, + "logps/chosen": -647.0829467773438, + "logps/rejected": -720.6229248046875, + "loss": 0.5638, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3711228370666504, + "rewards/margins": 0.4756318926811218, + "rewards/rejected": -2.846754789352417, + "step": 340 + }, + { + "epoch": 0.22271205812719405, + "grad_norm": 12.691659802253687, + "learning_rate": 1.9103629409661467e-07, + "logits/chosen": -0.8918319940567017, + "logits/rejected": -0.8558528423309326, + "logps/chosen": -666.69140625, + "logps/rejected": -701.9205932617188, + "loss": 0.586, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.631117343902588, + "rewards/margins": 0.31648769974708557, + "rewards/rejected": -2.9476051330566406, + "step": 341 + }, + { + "epoch": 0.2233651726671565, + "grad_norm": 9.139752244778244, + "learning_rate": 1.9094164732088412e-07, + "logits/chosen": -0.9762035608291626, + "logits/rejected": -0.9089761972427368, + "logps/chosen": -749.697998046875, + "logps/rejected": -739.471435546875, + "loss": 0.5427, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6595542430877686, + "rewards/margins": 0.30329763889312744, + "rewards/rejected": -2.9628520011901855, + "step": 342 + }, + { + "epoch": 0.22401828720711894, + "grad_norm": 8.791854812952089, + "learning_rate": 1.9084652718195236e-07, + "logits/chosen": -0.8667416572570801, + "logits/rejected": -0.7667337656021118, + "logps/chosen": -681.466796875, + "logps/rejected": -695.0787353515625, + "loss": 0.4833, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6133289337158203, + "rewards/margins": 0.5205738544464111, + "rewards/rejected": -3.1339030265808105, + "step": 343 + }, + { + "epoch": 0.2246714017470814, + "grad_norm": 8.391392138914927, + "learning_rate": 1.9075093417493222e-07, + "logits/chosen": -1.021234154701233, + "logits/rejected": -0.9969886541366577, + "logps/chosen": -772.0455932617188, + "logps/rejected": -815.7417602539062, + "loss": 0.5024, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6911168098449707, + "rewards/margins": 0.5029557943344116, + "rewards/rejected": -3.1940724849700928, + "step": 344 + }, + { + "epoch": 0.22532451628704384, + "grad_norm": 11.090830095089824, + "learning_rate": 1.9065486879739783e-07, + "logits/chosen": -0.868331789970398, + "logits/rejected": -0.8692890405654907, + "logps/chosen": -716.2054443359375, + "logps/rejected": -811.1107788085938, + "loss": 0.4973, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5343575477600098, + "rewards/margins": 0.8151875138282776, + "rewards/rejected": -3.3495450019836426, + "step": 345 + }, + { + "epoch": 0.22597763082700628, + "grad_norm": 7.7925319190317675, + "learning_rate": 1.9055833154938206e-07, + "logits/chosen": -0.8892742991447449, + "logits/rejected": -0.8983861207962036, + "logps/chosen": -748.330078125, + "logps/rejected": -880.4389038085938, + "loss": 0.4856, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.730029344558716, + "rewards/margins": 0.9022680521011353, + "rewards/rejected": -3.6322972774505615, + "step": 346 + }, + { + "epoch": 0.22663074536696873, + "grad_norm": 8.532957227412508, + "learning_rate": 1.9046132293337398e-07, + "logits/chosen": -0.8384896516799927, + "logits/rejected": -0.8548451066017151, + "logps/chosen": -680.845458984375, + "logps/rejected": -710.210693359375, + "loss": 0.5661, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.533750295639038, + "rewards/margins": 0.35651496052742004, + "rewards/rejected": -2.890265464782715, + "step": 347 + }, + { + "epoch": 0.22728385990693117, + "grad_norm": 8.837287762308879, + "learning_rate": 1.903638434543161e-07, + "logits/chosen": -0.9365876317024231, + "logits/rejected": -0.9628958702087402, + "logps/chosen": -703.1912841796875, + "logps/rejected": -868.1168823242188, + "loss": 0.5385, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.4941153526306152, + "rewards/margins": 0.9199743866920471, + "rewards/rejected": -3.4140899181365967, + "step": 348 + }, + { + "epoch": 0.22793697444689362, + "grad_norm": 8.36148014201041, + "learning_rate": 1.9026589361960198e-07, + "logits/chosen": -0.8639967441558838, + "logits/rejected": -0.8728487491607666, + "logps/chosen": -726.3617553710938, + "logps/rejected": -799.6815185546875, + "loss": 0.535, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.505831241607666, + "rewards/margins": 0.5206820964813232, + "rewards/rejected": -3.0265133380889893, + "step": 349 + }, + { + "epoch": 0.22859008898685607, + "grad_norm": 9.530271630865817, + "learning_rate": 1.9016747393907327e-07, + "logits/chosen": -0.9139630794525146, + "logits/rejected": -0.8439926505088806, + "logps/chosen": -815.9520263671875, + "logps/rejected": -833.1660766601562, + "loss": 0.5739, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.1119563579559326, + "rewards/margins": 0.37380701303482056, + "rewards/rejected": -3.4857633113861084, + "step": 350 + }, + { + "epoch": 0.2292432035268185, + "grad_norm": 8.867463894546207, + "learning_rate": 1.9006858492501734e-07, + "logits/chosen": -0.8531547784805298, + "logits/rejected": -0.8365933895111084, + "logps/chosen": -704.0611572265625, + "logps/rejected": -903.056640625, + "loss": 0.5359, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5138795375823975, + "rewards/margins": 1.4063800573349, + "rewards/rejected": -3.920259475708008, + "step": 351 + }, + { + "epoch": 0.22989631806678096, + "grad_norm": 11.470432910725284, + "learning_rate": 1.8996922709216454e-07, + "logits/chosen": -0.894100546836853, + "logits/rejected": -0.8663679957389832, + "logps/chosen": -734.0579833984375, + "logps/rejected": -780.0435791015625, + "loss": 0.5576, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.629918336868286, + "rewards/margins": 0.5574440956115723, + "rewards/rejected": -3.1873621940612793, + "step": 352 + }, + { + "epoch": 0.2305494326067434, + "grad_norm": 8.028334619645092, + "learning_rate": 1.8986940095768532e-07, + "logits/chosen": -0.898781955242157, + "logits/rejected": -0.8125208616256714, + "logps/chosen": -815.3742065429688, + "logps/rejected": -846.159912109375, + "loss": 0.5197, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.95741868019104, + "rewards/margins": 0.7997546792030334, + "rewards/rejected": -3.7571730613708496, + "step": 353 + }, + { + "epoch": 0.23120254714670585, + "grad_norm": 9.751558463586461, + "learning_rate": 1.8976910704118788e-07, + "logits/chosen": -0.9632163047790527, + "logits/rejected": -0.9397919178009033, + "logps/chosen": -815.559326171875, + "logps/rejected": -870.219482421875, + "loss": 0.5069, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.571608066558838, + "rewards/margins": 0.7947617173194885, + "rewards/rejected": -3.3663697242736816, + "step": 354 + }, + { + "epoch": 0.2318556616866683, + "grad_norm": 8.9770404991692, + "learning_rate": 1.8966834586471517e-07, + "logits/chosen": -0.7659550309181213, + "logits/rejected": -0.8094583749771118, + "logps/chosen": -731.0473022460938, + "logps/rejected": -909.0474853515625, + "loss": 0.4928, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.2166824340820312, + "rewards/margins": 1.0738444328308105, + "rewards/rejected": -3.290526866912842, + "step": 355 + }, + { + "epoch": 0.23250877622663074, + "grad_norm": 13.094673327461953, + "learning_rate": 1.8956711795274234e-07, + "logits/chosen": -0.915695071220398, + "logits/rejected": -0.9428746700286865, + "logps/chosen": -727.38671875, + "logps/rejected": -821.8265991210938, + "loss": 0.5928, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.389467239379883, + "rewards/margins": 0.8192275762557983, + "rewards/rejected": -3.2086949348449707, + "step": 356 + }, + { + "epoch": 0.2331618907665932, + "grad_norm": 9.652480627651235, + "learning_rate": 1.8946542383217393e-07, + "logits/chosen": -1.0150421857833862, + "logits/rejected": -0.9018900990486145, + "logps/chosen": -819.73486328125, + "logps/rejected": -817.6781616210938, + "loss": 0.5024, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8414320945739746, + "rewards/margins": 0.6176068782806396, + "rewards/rejected": -3.459038734436035, + "step": 357 + }, + { + "epoch": 0.23381500530655563, + "grad_norm": 9.434572929754303, + "learning_rate": 1.8936326403234123e-07, + "logits/chosen": -0.8316928148269653, + "logits/rejected": -0.8276723623275757, + "logps/chosen": -629.0164794921875, + "logps/rejected": -735.0913696289062, + "loss": 0.549, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.401994228363037, + "rewards/margins": 0.6702228784561157, + "rewards/rejected": -3.072216749191284, + "step": 358 + }, + { + "epoch": 0.23446811984651808, + "grad_norm": 8.192667820567904, + "learning_rate": 1.892606390849993e-07, + "logits/chosen": -0.9244518280029297, + "logits/rejected": -0.8568234443664551, + "logps/chosen": -805.7052612304688, + "logps/rejected": -841.6261596679688, + "loss": 0.496, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.8729684352874756, + "rewards/margins": 1.1182036399841309, + "rewards/rejected": -3.9911718368530273, + "step": 359 + }, + { + "epoch": 0.23512123438648053, + "grad_norm": 7.785077716699336, + "learning_rate": 1.8915754952432455e-07, + "logits/chosen": -0.9250266551971436, + "logits/rejected": -0.8511897325515747, + "logps/chosen": -683.8753662109375, + "logps/rejected": -751.814453125, + "loss": 0.5046, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.457275390625, + "rewards/margins": 0.8861963748931885, + "rewards/rejected": -3.3434715270996094, + "step": 360 + }, + { + "epoch": 0.23577434892644297, + "grad_norm": 7.374397652768221, + "learning_rate": 1.8905399588691163e-07, + "logits/chosen": -0.8517379760742188, + "logits/rejected": -0.7823293209075928, + "logps/chosen": -737.6988525390625, + "logps/rejected": -842.8553466796875, + "loss": 0.4827, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6914682388305664, + "rewards/margins": 1.1641268730163574, + "rewards/rejected": -3.855595111846924, + "step": 361 + }, + { + "epoch": 0.23642746346640542, + "grad_norm": 10.541536769336025, + "learning_rate": 1.8894997871177077e-07, + "logits/chosen": -1.0059581995010376, + "logits/rejected": -0.885239839553833, + "logps/chosen": -715.8147583007812, + "logps/rejected": -688.3126220703125, + "loss": 0.5834, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.433546781539917, + "rewards/margins": 0.3138864040374756, + "rewards/rejected": -2.7474331855773926, + "step": 362 + }, + { + "epoch": 0.23708057800636786, + "grad_norm": 10.21089003459886, + "learning_rate": 1.8884549854032504e-07, + "logits/chosen": -0.8944604396820068, + "logits/rejected": -0.7838302850723267, + "logps/chosen": -701.7962646484375, + "logps/rejected": -715.971923828125, + "loss": 0.5636, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.525576591491699, + "rewards/margins": 0.3668100833892822, + "rewards/rejected": -2.8923864364624023, + "step": 363 + }, + { + "epoch": 0.2377336925463303, + "grad_norm": 11.101824662582676, + "learning_rate": 1.8874055591640742e-07, + "logits/chosen": -0.8422518372535706, + "logits/rejected": -0.8626506328582764, + "logps/chosen": -752.20556640625, + "logps/rejected": -995.0734252929688, + "loss": 0.5241, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.914336919784546, + "rewards/margins": 1.4878953695297241, + "rewards/rejected": -4.4022321701049805, + "step": 364 + }, + { + "epoch": 0.23838680708629276, + "grad_norm": 8.509188829026256, + "learning_rate": 1.8863515138625802e-07, + "logits/chosen": -0.9586243629455566, + "logits/rejected": -0.937591552734375, + "logps/chosen": -756.9347534179688, + "logps/rejected": -820.6338500976562, + "loss": 0.5407, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6895179748535156, + "rewards/margins": 0.6859859228134155, + "rewards/rejected": -3.3755040168762207, + "step": 365 + }, + { + "epoch": 0.2390399216262552, + "grad_norm": 16.91175185837713, + "learning_rate": 1.885292854985213e-07, + "logits/chosen": -0.875924825668335, + "logits/rejected": -0.881722629070282, + "logps/chosen": -728.5872802734375, + "logps/rejected": -868.9918823242188, + "loss": 0.5496, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5441956520080566, + "rewards/margins": 0.7560574412345886, + "rewards/rejected": -3.300252914428711, + "step": 366 + }, + { + "epoch": 0.23969303616621765, + "grad_norm": 7.315683900969026, + "learning_rate": 1.8842295880424304e-07, + "logits/chosen": -0.8934835195541382, + "logits/rejected": -0.8142973184585571, + "logps/chosen": -755.492919921875, + "logps/rejected": -836.1302490234375, + "loss": 0.4819, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8435540199279785, + "rewards/margins": 0.8605523705482483, + "rewards/rejected": -3.704106330871582, + "step": 367 + }, + { + "epoch": 0.2403461507061801, + "grad_norm": 11.255355527223116, + "learning_rate": 1.8831617185686762e-07, + "logits/chosen": -0.8581317663192749, + "logits/rejected": -0.8509604930877686, + "logps/chosen": -827.1642456054688, + "logps/rejected": -935.1969604492188, + "loss": 0.4666, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.654917001724243, + "rewards/margins": 1.1863821744918823, + "rewards/rejected": -3.841298818588257, + "step": 368 + }, + { + "epoch": 0.24099926524614254, + "grad_norm": 8.041901256168211, + "learning_rate": 1.8820892521223515e-07, + "logits/chosen": -0.8912599086761475, + "logits/rejected": -0.8826814889907837, + "logps/chosen": -760.4750366210938, + "logps/rejected": -869.6399536132812, + "loss": 0.459, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5947632789611816, + "rewards/margins": 1.0988965034484863, + "rewards/rejected": -3.6936593055725098, + "step": 369 + }, + { + "epoch": 0.241652379786105, + "grad_norm": 9.210568805727371, + "learning_rate": 1.8810121942857845e-07, + "logits/chosen": -0.8365862965583801, + "logits/rejected": -0.8083174824714661, + "logps/chosen": -714.6243896484375, + "logps/rejected": -734.9310302734375, + "loss": 0.5766, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7714502811431885, + "rewards/margins": 0.40789082646369934, + "rewards/rejected": -3.1793413162231445, + "step": 370 + }, + { + "epoch": 0.24230549432606743, + "grad_norm": 8.423599734298417, + "learning_rate": 1.8799305506652025e-07, + "logits/chosen": -0.994741678237915, + "logits/rejected": -0.9545141458511353, + "logps/chosen": -809.184326171875, + "logps/rejected": -890.893310546875, + "loss": 0.5425, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0367493629455566, + "rewards/margins": 1.0203640460968018, + "rewards/rejected": -4.057113170623779, + "step": 371 + }, + { + "epoch": 0.24295860886602988, + "grad_norm": 9.652327213203613, + "learning_rate": 1.8788443268907024e-07, + "logits/chosen": -0.8552968502044678, + "logits/rejected": -0.8310869932174683, + "logps/chosen": -720.93017578125, + "logps/rejected": -783.8108520507812, + "loss": 0.5013, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.675302505493164, + "rewards/margins": 0.6832669973373413, + "rewards/rejected": -3.358569860458374, + "step": 372 + }, + { + "epoch": 0.24361172340599233, + "grad_norm": 11.703544457496719, + "learning_rate": 1.8777535286162217e-07, + "logits/chosen": -0.881072461605072, + "logits/rejected": -0.874359130859375, + "logps/chosen": -771.0457763671875, + "logps/rejected": -828.0839233398438, + "loss": 0.5119, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8744983673095703, + "rewards/margins": 0.5727369785308838, + "rewards/rejected": -3.447235345840454, + "step": 373 + }, + { + "epoch": 0.24426483794595477, + "grad_norm": 11.546985055661386, + "learning_rate": 1.8766581615195078e-07, + "logits/chosen": -0.9114188551902771, + "logits/rejected": -0.8961868286132812, + "logps/chosen": -736.6749267578125, + "logps/rejected": -839.9960327148438, + "loss": 0.5171, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.694876194000244, + "rewards/margins": 0.8417505025863647, + "rewards/rejected": -3.5366265773773193, + "step": 374 + }, + { + "epoch": 0.24491795248591722, + "grad_norm": 8.61604895408707, + "learning_rate": 1.875558231302091e-07, + "logits/chosen": -0.9092810153961182, + "logits/rejected": -0.781123697757721, + "logps/chosen": -660.8868408203125, + "logps/rejected": -716.446044921875, + "loss": 0.581, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.2286787033081055, + "rewards/margins": 0.6053875684738159, + "rewards/rejected": -2.834066390991211, + "step": 375 + }, + { + "epoch": 0.24557106702587966, + "grad_norm": 13.796271931510978, + "learning_rate": 1.8744537436892512e-07, + "logits/chosen": -0.8940467834472656, + "logits/rejected": -0.8263184428215027, + "logps/chosen": -826.9315185546875, + "logps/rejected": -844.8912963867188, + "loss": 0.4862, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1379830837249756, + "rewards/margins": 0.6378382444381714, + "rewards/rejected": -3.7758212089538574, + "step": 376 + }, + { + "epoch": 0.2462241815658421, + "grad_norm": 11.831975946336552, + "learning_rate": 1.8733447044299925e-07, + "logits/chosen": -0.7302409410476685, + "logits/rejected": -0.814877986907959, + "logps/chosen": -765.5311889648438, + "logps/rejected": -1002.50244140625, + "loss": 0.4899, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6444497108459473, + "rewards/margins": 1.4703834056854248, + "rewards/rejected": -4.114832878112793, + "step": 377 + }, + { + "epoch": 0.24687729610580456, + "grad_norm": 12.2428231194753, + "learning_rate": 1.8722311192970092e-07, + "logits/chosen": -0.8322169780731201, + "logits/rejected": -0.7421005368232727, + "logps/chosen": -788.8143310546875, + "logps/rejected": -853.354248046875, + "loss": 0.5892, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.7768325805664062, + "rewards/margins": 0.5746185183525085, + "rewards/rejected": -3.3514509201049805, + "step": 378 + }, + { + "epoch": 0.247530410645767, + "grad_norm": 9.846423022437284, + "learning_rate": 1.8711129940866575e-07, + "logits/chosen": -0.91043621301651, + "logits/rejected": -0.9167795777320862, + "logps/chosen": -828.8065185546875, + "logps/rejected": -872.3666381835938, + "loss": 0.4836, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.97462797164917, + "rewards/margins": 0.8676510453224182, + "rewards/rejected": -3.8422789573669434, + "step": 379 + }, + { + "epoch": 0.24818352518572945, + "grad_norm": 11.536895925758275, + "learning_rate": 1.8699903346189263e-07, + "logits/chosen": -0.8867179751396179, + "logits/rejected": -0.8912706971168518, + "logps/chosen": -804.9017944335938, + "logps/rejected": -852.7586059570312, + "loss": 0.5841, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9465479850769043, + "rewards/margins": 0.7642606496810913, + "rewards/rejected": -3.710808753967285, + "step": 380 + }, + { + "epoch": 0.2488366397256919, + "grad_norm": 9.03975607711823, + "learning_rate": 1.8688631467374054e-07, + "logits/chosen": -0.946536123752594, + "logits/rejected": -0.9552872180938721, + "logps/chosen": -782.4651489257812, + "logps/rejected": -899.2623291015625, + "loss": 0.5573, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9219205379486084, + "rewards/margins": 0.8479382991790771, + "rewards/rejected": -3.7698588371276855, + "step": 381 + }, + { + "epoch": 0.24948975426565434, + "grad_norm": 10.588989944794507, + "learning_rate": 1.8677314363092553e-07, + "logits/chosen": -0.9365772604942322, + "logits/rejected": -0.8860799670219421, + "logps/chosen": -748.2943725585938, + "logps/rejected": -792.3837890625, + "loss": 0.5409, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.813943862915039, + "rewards/margins": 0.5208563208580017, + "rewards/rejected": -3.3347997665405273, + "step": 382 + }, + { + "epoch": 0.25014286880561676, + "grad_norm": 9.063214190734078, + "learning_rate": 1.866595209225177e-07, + "logits/chosen": -0.7872967720031738, + "logits/rejected": -0.8394625186920166, + "logps/chosen": -805.676513671875, + "logps/rejected": -910.6593627929688, + "loss": 0.5242, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.991018533706665, + "rewards/margins": 0.9019701480865479, + "rewards/rejected": -3.892988920211792, + "step": 383 + }, + { + "epoch": 0.25079598334557923, + "grad_norm": 12.184205086121409, + "learning_rate": 1.8654544713993822e-07, + "logits/chosen": -0.8133022785186768, + "logits/rejected": -0.811676025390625, + "logps/chosen": -755.30859375, + "logps/rejected": -825.8359985351562, + "loss": 0.5428, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7623047828674316, + "rewards/margins": 0.6998733282089233, + "rewards/rejected": -3.4621777534484863, + "step": 384 + }, + { + "epoch": 0.25144909788554165, + "grad_norm": 8.545125637268963, + "learning_rate": 1.8643092287695602e-07, + "logits/chosen": -0.8532843589782715, + "logits/rejected": -0.7933013439178467, + "logps/chosen": -736.1025390625, + "logps/rejected": -808.1593017578125, + "loss": 0.4891, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.8755104541778564, + "rewards/margins": 0.8513441681861877, + "rewards/rejected": -3.7268548011779785, + "step": 385 + }, + { + "epoch": 0.2521022124255041, + "grad_norm": 12.071178659572865, + "learning_rate": 1.8631594872968496e-07, + "logits/chosen": -0.770072340965271, + "logits/rejected": -0.7640020847320557, + "logps/chosen": -683.5043334960938, + "logps/rejected": -767.7935180664062, + "loss": 0.4933, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2001888751983643, + "rewards/margins": 0.619857668876648, + "rewards/rejected": -2.8200464248657227, + "step": 386 + }, + { + "epoch": 0.25275532696546654, + "grad_norm": 10.703726113073369, + "learning_rate": 1.862005252965805e-07, + "logits/chosen": -0.7848135828971863, + "logits/rejected": -0.8621163964271545, + "logps/chosen": -687.1354370117188, + "logps/rejected": -819.2810668945312, + "loss": 0.5356, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.449026584625244, + "rewards/margins": 0.7430239915847778, + "rewards/rejected": -3.1920504570007324, + "step": 387 + }, + { + "epoch": 0.253408441505429, + "grad_norm": 11.674209847253039, + "learning_rate": 1.8608465317843676e-07, + "logits/chosen": -0.8625525832176208, + "logits/rejected": -0.8220775127410889, + "logps/chosen": -649.7574462890625, + "logps/rejected": -685.041259765625, + "loss": 0.5312, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3073811531066895, + "rewards/margins": 0.5945886373519897, + "rewards/rejected": -2.9019696712493896, + "step": 388 + }, + { + "epoch": 0.25406155604539143, + "grad_norm": 8.583769038852582, + "learning_rate": 1.8596833297838335e-07, + "logits/chosen": -0.8622403740882874, + "logits/rejected": -0.8727516531944275, + "logps/chosen": -745.3436889648438, + "logps/rejected": -847.031982421875, + "loss": 0.5052, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6236987113952637, + "rewards/margins": 0.9856184720993042, + "rewards/rejected": -3.6093173027038574, + "step": 389 + }, + { + "epoch": 0.2547146705853539, + "grad_norm": 12.037107211910246, + "learning_rate": 1.8585156530188214e-07, + "logits/chosen": -0.8463901281356812, + "logits/rejected": -0.8325417041778564, + "logps/chosen": -732.111083984375, + "logps/rejected": -828.5350952148438, + "loss": 0.6109, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5609564781188965, + "rewards/margins": 0.821265459060669, + "rewards/rejected": -3.3822219371795654, + "step": 390 + }, + { + "epoch": 0.2553677851253163, + "grad_norm": 8.857824861638335, + "learning_rate": 1.8573435075672421e-07, + "logits/chosen": -0.8568727374076843, + "logits/rejected": -0.8396463394165039, + "logps/chosen": -761.6552734375, + "logps/rejected": -789.5634155273438, + "loss": 0.4699, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.5762624740600586, + "rewards/margins": 0.6971707344055176, + "rewards/rejected": -3.273433208465576, + "step": 391 + }, + { + "epoch": 0.2560208996652788, + "grad_norm": 10.947327596621403, + "learning_rate": 1.8561668995302665e-07, + "logits/chosen": -0.9098179936408997, + "logits/rejected": -0.9328880906105042, + "logps/chosen": -706.7294311523438, + "logps/rejected": -816.31787109375, + "loss": 0.4787, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7702081203460693, + "rewards/margins": 0.8480209112167358, + "rewards/rejected": -3.6182289123535156, + "step": 392 + }, + { + "epoch": 0.2566740142052412, + "grad_norm": 9.965810910376886, + "learning_rate": 1.8549858350322932e-07, + "logits/chosen": -0.9181755781173706, + "logits/rejected": -0.8338078856468201, + "logps/chosen": -825.3251342773438, + "logps/rejected": -857.3953857421875, + "loss": 0.5338, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0407447814941406, + "rewards/margins": 0.5176315307617188, + "rewards/rejected": -3.5583760738372803, + "step": 393 + }, + { + "epoch": 0.2573271287452037, + "grad_norm": 14.437300919389857, + "learning_rate": 1.8538003202209186e-07, + "logits/chosen": -0.7896380424499512, + "logits/rejected": -0.6721420884132385, + "logps/chosen": -718.7591552734375, + "logps/rejected": -794.776123046875, + "loss": 0.5163, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.381721258163452, + "rewards/margins": 1.0894372463226318, + "rewards/rejected": -3.471158266067505, + "step": 394 + }, + { + "epoch": 0.2579802432851661, + "grad_norm": 11.73360711692685, + "learning_rate": 1.852610361266902e-07, + "logits/chosen": -1.0355563163757324, + "logits/rejected": -0.9479397535324097, + "logps/chosen": -734.6851806640625, + "logps/rejected": -792.4885864257812, + "loss": 0.5116, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9483344554901123, + "rewards/margins": 0.9447464942932129, + "rewards/rejected": -3.893080711364746, + "step": 395 + }, + { + "epoch": 0.2586333578251286, + "grad_norm": 8.69098030663428, + "learning_rate": 1.8514159643641366e-07, + "logits/chosen": -0.7818084359169006, + "logits/rejected": -0.7556504011154175, + "logps/chosen": -737.1483764648438, + "logps/rejected": -807.0700073242188, + "loss": 0.5289, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.768253803253174, + "rewards/margins": 0.621762752532959, + "rewards/rejected": -3.3900163173675537, + "step": 396 + }, + { + "epoch": 0.259286472365091, + "grad_norm": 8.173290849151792, + "learning_rate": 1.850217135729614e-07, + "logits/chosen": -0.8113851547241211, + "logits/rejected": -0.7199157476425171, + "logps/chosen": -760.3164672851562, + "logps/rejected": -797.7694091796875, + "loss": 0.4421, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6380648612976074, + "rewards/margins": 0.7701491117477417, + "rewards/rejected": -3.4082140922546387, + "step": 397 + }, + { + "epoch": 0.2599395869050535, + "grad_norm": 9.015185202130558, + "learning_rate": 1.8490138816033953e-07, + "logits/chosen": -0.9135926961898804, + "logits/rejected": -0.9324383735656738, + "logps/chosen": -755.5042724609375, + "logps/rejected": -805.4848022460938, + "loss": 0.5094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.365133047103882, + "rewards/margins": 0.7335975766181946, + "rewards/rejected": -3.0987303256988525, + "step": 398 + }, + { + "epoch": 0.2605927014450159, + "grad_norm": 10.620101711003125, + "learning_rate": 1.8478062082485754e-07, + "logits/chosen": -0.8709986805915833, + "logits/rejected": -0.8939104080200195, + "logps/chosen": -766.480712890625, + "logps/rejected": -870.7758178710938, + "loss": 0.518, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.472724676132202, + "rewards/margins": 1.0182551145553589, + "rewards/rejected": -3.4909799098968506, + "step": 399 + }, + { + "epoch": 0.26124581598497837, + "grad_norm": 8.904432929316213, + "learning_rate": 1.8465941219512531e-07, + "logits/chosen": -0.8301805853843689, + "logits/rejected": -0.809867262840271, + "logps/chosen": -749.3978881835938, + "logps/rejected": -825.1639404296875, + "loss": 0.4606, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6916451454162598, + "rewards/margins": 0.8717960119247437, + "rewards/rejected": -3.563441038131714, + "step": 400 + }, + { + "epoch": 0.26124581598497837, + "eval_logits/chosen": -0.724721372127533, + "eval_logits/rejected": -0.6812699437141418, + "eval_logps/chosen": -741.7116088867188, + "eval_logps/rejected": -802.9892578125, + "eval_loss": 0.5081213712692261, + "eval_rewards/accuracies": 0.7559999823570251, + "eval_rewards/chosen": -2.6780450344085693, + "eval_rewards/margins": 0.7001611590385437, + "eval_rewards/rejected": -3.378206253051758, + "eval_runtime": 616.1892, + "eval_samples_per_second": 6.492, + "eval_steps_per_second": 0.406, + "step": 400 + }, + { + "epoch": 0.2618989305249408, + "grad_norm": 9.783139118919728, + "learning_rate": 1.8453776290204963e-07, + "logits/chosen": -0.9069001078605652, + "logits/rejected": -0.810458779335022, + "logps/chosen": -743.5487060546875, + "logps/rejected": -779.9725952148438, + "loss": 0.4883, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6744909286499023, + "rewards/margins": 0.6230872273445129, + "rewards/rejected": -3.2975778579711914, + "step": 401 + }, + { + "epoch": 0.26255204506490326, + "grad_norm": 12.767255063577581, + "learning_rate": 1.84415673578831e-07, + "logits/chosen": -0.851786732673645, + "logits/rejected": -0.8314003348350525, + "logps/chosen": -705.4389038085938, + "logps/rejected": -778.4620361328125, + "loss": 0.5321, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4630203247070312, + "rewards/margins": 0.6230435967445374, + "rewards/rejected": -3.086063861846924, + "step": 402 + }, + { + "epoch": 0.2632051596048657, + "grad_norm": 10.075368467499747, + "learning_rate": 1.8429314486096038e-07, + "logits/chosen": -0.8277249336242676, + "logits/rejected": -0.788428008556366, + "logps/chosen": -705.606689453125, + "logps/rejected": -752.8212280273438, + "loss": 0.5266, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.5161733627319336, + "rewards/margins": 0.48541557788848877, + "rewards/rejected": -3.001589298248291, + "step": 403 + }, + { + "epoch": 0.26385827414482815, + "grad_norm": 17.296320200401933, + "learning_rate": 1.8417017738621584e-07, + "logits/chosen": -0.9016965627670288, + "logits/rejected": -0.9234187006950378, + "logps/chosen": -815.3984985351562, + "logps/rejected": -888.422119140625, + "loss": 0.5682, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.178356885910034, + "rewards/margins": 0.6038446426391602, + "rewards/rejected": -3.7822020053863525, + "step": 404 + }, + { + "epoch": 0.26451138868479057, + "grad_norm": 11.424520481589685, + "learning_rate": 1.8404677179465918e-07, + "logits/chosen": -0.9801906943321228, + "logits/rejected": -1.028713583946228, + "logps/chosen": -837.5343017578125, + "logps/rejected": -964.4091796875, + "loss": 0.5529, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.271747589111328, + "rewards/margins": 0.5638325214385986, + "rewards/rejected": -3.8355798721313477, + "step": 405 + }, + { + "epoch": 0.26516450322475305, + "grad_norm": 9.475445804066252, + "learning_rate": 1.8392292872863267e-07, + "logits/chosen": -0.8906446099281311, + "logits/rejected": -0.9547228813171387, + "logps/chosen": -705.4547729492188, + "logps/rejected": -792.2675170898438, + "loss": 0.4856, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.604658842086792, + "rewards/margins": 0.9246249198913574, + "rewards/rejected": -3.5292837619781494, + "step": 406 + }, + { + "epoch": 0.26581761776471546, + "grad_norm": 10.536985619426973, + "learning_rate": 1.8379864883275574e-07, + "logits/chosen": -0.8332792520523071, + "logits/rejected": -0.8485990762710571, + "logps/chosen": -736.0517578125, + "logps/rejected": -838.329833984375, + "loss": 0.5022, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6248157024383545, + "rewards/margins": 0.8121111989021301, + "rewards/rejected": -3.4369266033172607, + "step": 407 + }, + { + "epoch": 0.26647073230467794, + "grad_norm": 10.424698516252935, + "learning_rate": 1.8367393275392153e-07, + "logits/chosen": -0.7887847423553467, + "logits/rejected": -0.785426139831543, + "logps/chosen": -787.7622680664062, + "logps/rejected": -902.406005859375, + "loss": 0.4796, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8995308876037598, + "rewards/margins": 0.9673147201538086, + "rewards/rejected": -3.8668456077575684, + "step": 408 + }, + { + "epoch": 0.26712384684464036, + "grad_norm": 11.643817212358961, + "learning_rate": 1.8354878114129364e-07, + "logits/chosen": -0.8421759009361267, + "logits/rejected": -0.6650703549385071, + "logps/chosen": -787.9586791992188, + "logps/rejected": -809.9829711914062, + "loss": 0.5951, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.101632595062256, + "rewards/margins": 0.4137914776802063, + "rewards/rejected": -3.5154237747192383, + "step": 409 + }, + { + "epoch": 0.26777696138460283, + "grad_norm": 9.561014123560037, + "learning_rate": 1.8342319464630255e-07, + "logits/chosen": -0.8494737148284912, + "logits/rejected": -0.79804927110672, + "logps/chosen": -699.1937255859375, + "logps/rejected": -756.2304077148438, + "loss": 0.4578, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.6116859912872314, + "rewards/margins": 0.5693447589874268, + "rewards/rejected": -3.1810309886932373, + "step": 410 + }, + { + "epoch": 0.26843007592456525, + "grad_norm": 9.478844905600644, + "learning_rate": 1.832971739226425e-07, + "logits/chosen": -0.7649250030517578, + "logits/rejected": -0.71089106798172, + "logps/chosen": -726.0282592773438, + "logps/rejected": -825.573486328125, + "loss": 0.4979, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.829139232635498, + "rewards/margins": 1.0223064422607422, + "rewards/rejected": -3.8514456748962402, + "step": 411 + }, + { + "epoch": 0.2690831904645277, + "grad_norm": 9.805450680592982, + "learning_rate": 1.8317071962626787e-07, + "logits/chosen": -0.7251761555671692, + "logits/rejected": -0.6697893738746643, + "logps/chosen": -771.7799072265625, + "logps/rejected": -803.2220458984375, + "loss": 0.5073, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.042149305343628, + "rewards/margins": 0.6425183415412903, + "rewards/rejected": -3.6846675872802734, + "step": 412 + }, + { + "epoch": 0.26973630500449014, + "grad_norm": 10.50244351453918, + "learning_rate": 1.830438324153898e-07, + "logits/chosen": -0.8642072677612305, + "logits/rejected": -0.8176953792572021, + "logps/chosen": -699.1476440429688, + "logps/rejected": -819.7857666015625, + "loss": 0.4711, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.5096065998077393, + "rewards/margins": 1.047548532485962, + "rewards/rejected": -3.557155132293701, + "step": 413 + }, + { + "epoch": 0.2703894195444526, + "grad_norm": 11.849045341650994, + "learning_rate": 1.8291651295047295e-07, + "logits/chosen": -0.9802454113960266, + "logits/rejected": -0.8233616352081299, + "logps/chosen": -824.2813110351562, + "logps/rejected": -878.6183471679688, + "loss": 0.4647, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.1092634201049805, + "rewards/margins": 1.1087751388549805, + "rewards/rejected": -4.218039035797119, + "step": 414 + }, + { + "epoch": 0.27104253408441503, + "grad_norm": 13.177473741686534, + "learning_rate": 1.8278876189423178e-07, + "logits/chosen": -0.8334654569625854, + "logits/rejected": -0.8360169529914856, + "logps/chosen": -736.35302734375, + "logps/rejected": -806.1726684570312, + "loss": 0.4901, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.874499559402466, + "rewards/margins": 0.6783380508422852, + "rewards/rejected": -3.552837371826172, + "step": 415 + }, + { + "epoch": 0.2716956486243775, + "grad_norm": 11.16394727704828, + "learning_rate": 1.826605799116273e-07, + "logits/chosen": -0.8869356513023376, + "logits/rejected": -0.8545706272125244, + "logps/chosen": -725.5807495117188, + "logps/rejected": -868.1986083984375, + "loss": 0.4506, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.486783504486084, + "rewards/margins": 1.0288233757019043, + "rewards/rejected": -3.515606641769409, + "step": 416 + }, + { + "epoch": 0.2723487631643399, + "grad_norm": 11.663692808166209, + "learning_rate": 1.8253196766986353e-07, + "logits/chosen": -0.9029356241226196, + "logits/rejected": -0.9125173091888428, + "logps/chosen": -704.21533203125, + "logps/rejected": -785.9641723632812, + "loss": 0.4987, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.766026735305786, + "rewards/margins": 0.57279372215271, + "rewards/rejected": -3.338820219039917, + "step": 417 + }, + { + "epoch": 0.2730018777043024, + "grad_norm": 8.729822521753496, + "learning_rate": 1.824029258383841e-07, + "logits/chosen": -0.7340871095657349, + "logits/rejected": -0.7211796045303345, + "logps/chosen": -796.06298828125, + "logps/rejected": -913.9825439453125, + "loss": 0.4293, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.40975284576416, + "rewards/margins": 1.0494564771652222, + "rewards/rejected": -4.459209442138672, + "step": 418 + }, + { + "epoch": 0.2736549922442648, + "grad_norm": 10.156017954308194, + "learning_rate": 1.8227345508886862e-07, + "logits/chosen": -0.8956701159477234, + "logits/rejected": -0.8679780960083008, + "logps/chosen": -777.0938110351562, + "logps/rejected": -852.8966064453125, + "loss": 0.5182, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.8934662342071533, + "rewards/margins": 0.8560830950737, + "rewards/rejected": -3.749549627304077, + "step": 419 + }, + { + "epoch": 0.2743081067842273, + "grad_norm": 8.640630705596932, + "learning_rate": 1.8214355609522934e-07, + "logits/chosen": -0.821373701095581, + "logits/rejected": -0.7505570650100708, + "logps/chosen": -764.0152587890625, + "logps/rejected": -791.8469848632812, + "loss": 0.4431, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9267923831939697, + "rewards/margins": 0.4508010745048523, + "rewards/rejected": -3.377593517303467, + "step": 420 + }, + { + "epoch": 0.2749612213241897, + "grad_norm": 13.171977563436153, + "learning_rate": 1.8201322953360756e-07, + "logits/chosen": -0.7890115976333618, + "logits/rejected": -0.7287498712539673, + "logps/chosen": -774.0021362304688, + "logps/rejected": -767.4495239257812, + "loss": 0.5292, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0860540866851807, + "rewards/margins": 0.4356900155544281, + "rewards/rejected": -3.5217440128326416, + "step": 421 + }, + { + "epoch": 0.2756143358641522, + "grad_norm": 11.91644818150085, + "learning_rate": 1.818824760823701e-07, + "logits/chosen": -0.97590571641922, + "logits/rejected": -0.7994486689567566, + "logps/chosen": -663.6700439453125, + "logps/rejected": -690.9847412109375, + "loss": 0.4976, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7485709190368652, + "rewards/margins": 0.5975130796432495, + "rewards/rejected": -3.3460841178894043, + "step": 422 + }, + { + "epoch": 0.2762674504041146, + "grad_norm": 8.62314149072794, + "learning_rate": 1.8175129642210586e-07, + "logits/chosen": -0.8519734740257263, + "logits/rejected": -0.9017986059188843, + "logps/chosen": -743.6911010742188, + "logps/rejected": -866.9241333007812, + "loss": 0.4576, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9078967571258545, + "rewards/margins": 0.8721412420272827, + "rewards/rejected": -3.780038356781006, + "step": 423 + }, + { + "epoch": 0.2769205649440771, + "grad_norm": 16.78129564723599, + "learning_rate": 1.8161969123562217e-07, + "logits/chosen": -0.8358048796653748, + "logits/rejected": -0.8505182266235352, + "logps/chosen": -754.3642578125, + "logps/rejected": -818.9282836914062, + "loss": 0.5837, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.47281813621521, + "rewards/margins": 0.578090250492096, + "rewards/rejected": -4.05090856552124, + "step": 424 + }, + { + "epoch": 0.2775736794840395, + "grad_norm": 17.991126052561494, + "learning_rate": 1.8148766120794125e-07, + "logits/chosen": -0.9127295017242432, + "logits/rejected": -0.8362393379211426, + "logps/chosen": -791.9930419921875, + "logps/rejected": -843.0182495117188, + "loss": 0.5182, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.202291488647461, + "rewards/margins": 0.5794678330421448, + "rewards/rejected": -3.78175950050354, + "step": 425 + }, + { + "epoch": 0.27822679402400197, + "grad_norm": 8.6575913642869, + "learning_rate": 1.8135520702629673e-07, + "logits/chosen": -0.8415990471839905, + "logits/rejected": -0.8752740621566772, + "logps/chosen": -771.759765625, + "logps/rejected": -900.0613403320312, + "loss": 0.4585, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.997281789779663, + "rewards/margins": 1.1854169368743896, + "rewards/rejected": -4.182698726654053, + "step": 426 + }, + { + "epoch": 0.2788799085639644, + "grad_norm": 12.81180624184605, + "learning_rate": 1.8122232938013005e-07, + "logits/chosen": -0.8370864391326904, + "logits/rejected": -0.7737575769424438, + "logps/chosen": -855.8829345703125, + "logps/rejected": -937.1058349609375, + "loss": 0.5611, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.7345356941223145, + "rewards/margins": 0.9496172070503235, + "rewards/rejected": -4.684152603149414, + "step": 427 + }, + { + "epoch": 0.27953302310392686, + "grad_norm": 9.737215085975576, + "learning_rate": 1.8108902896108668e-07, + "logits/chosen": -1.0026570558547974, + "logits/rejected": -0.9746906757354736, + "logps/chosen": -857.1223754882812, + "logps/rejected": -890.031982421875, + "loss": 0.4786, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.579835891723633, + "rewards/margins": 0.7876998782157898, + "rewards/rejected": -4.367535591125488, + "step": 428 + }, + { + "epoch": 0.2801861376438893, + "grad_norm": 14.133058490229988, + "learning_rate": 1.8095530646301287e-07, + "logits/chosen": -0.6362062096595764, + "logits/rejected": -0.6157896518707275, + "logps/chosen": -618.6807861328125, + "logps/rejected": -654.237548828125, + "loss": 0.565, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4280242919921875, + "rewards/margins": 0.5356500148773193, + "rewards/rejected": -2.9636740684509277, + "step": 429 + }, + { + "epoch": 0.28083925218385175, + "grad_norm": 10.687127431453712, + "learning_rate": 1.808211625819517e-07, + "logits/chosen": -0.8555097579956055, + "logits/rejected": -0.8589435815811157, + "logps/chosen": -838.960205078125, + "logps/rejected": -951.1204833984375, + "loss": 0.538, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.5574240684509277, + "rewards/margins": 0.9826168417930603, + "rewards/rejected": -4.540040969848633, + "step": 430 + }, + { + "epoch": 0.28149236672381417, + "grad_norm": 10.123521301197844, + "learning_rate": 1.8068659801613972e-07, + "logits/chosen": -0.8224723935127258, + "logits/rejected": -0.7616609334945679, + "logps/chosen": -730.337646484375, + "logps/rejected": -856.6796875, + "loss": 0.4953, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.277148485183716, + "rewards/margins": 1.0432851314544678, + "rewards/rejected": -4.320433616638184, + "step": 431 + }, + { + "epoch": 0.28214548126377664, + "grad_norm": 10.291835377836938, + "learning_rate": 1.805516134660031e-07, + "logits/chosen": -0.7565049529075623, + "logits/rejected": -0.760954737663269, + "logps/chosen": -889.3624877929688, + "logps/rejected": -1007.882568359375, + "loss": 0.5203, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.83842134475708, + "rewards/margins": 1.0104892253875732, + "rewards/rejected": -4.848910808563232, + "step": 432 + }, + { + "epoch": 0.28279859580373906, + "grad_norm": 12.003520975391313, + "learning_rate": 1.8041620963415415e-07, + "logits/chosen": -0.8923713564872742, + "logits/rejected": -0.9106241464614868, + "logps/chosen": -777.1085205078125, + "logps/rejected": -870.7319946289062, + "loss": 0.4846, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.4676287174224854, + "rewards/margins": 0.7948607206344604, + "rewards/rejected": -4.262489318847656, + "step": 433 + }, + { + "epoch": 0.28345171034370154, + "grad_norm": 19.10825250816021, + "learning_rate": 1.8028038722538758e-07, + "logits/chosen": -0.799106776714325, + "logits/rejected": -0.7920759916305542, + "logps/chosen": -881.0488891601562, + "logps/rejected": -954.8224487304688, + "loss": 0.4382, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.942882776260376, + "rewards/margins": 0.945711076259613, + "rewards/rejected": -4.888593673706055, + "step": 434 + }, + { + "epoch": 0.28410482488366395, + "grad_norm": 8.867140403896737, + "learning_rate": 1.8014414694667682e-07, + "logits/chosen": -0.9495159983634949, + "logits/rejected": -0.8596460223197937, + "logps/chosen": -788.0202026367188, + "logps/rejected": -844.4072265625, + "loss": 0.4697, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.2976248264312744, + "rewards/margins": 0.9562715291976929, + "rewards/rejected": -4.253896713256836, + "step": 435 + }, + { + "epoch": 0.2847579394236264, + "grad_norm": 11.495282189006529, + "learning_rate": 1.8000748950717038e-07, + "logits/chosen": -0.9166377782821655, + "logits/rejected": -0.8431532979011536, + "logps/chosen": -684.1121215820312, + "logps/rejected": -756.2694091796875, + "loss": 0.4631, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7140069007873535, + "rewards/margins": 0.9040411710739136, + "rewards/rejected": -3.6180481910705566, + "step": 436 + }, + { + "epoch": 0.28541105396358885, + "grad_norm": 10.927848876345342, + "learning_rate": 1.7987041561818816e-07, + "logits/chosen": -0.962665319442749, + "logits/rejected": -0.8971413969993591, + "logps/chosen": -834.0377807617188, + "logps/rejected": -889.9393310546875, + "loss": 0.497, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.379364013671875, + "rewards/margins": 0.7841745615005493, + "rewards/rejected": -4.163538932800293, + "step": 437 + }, + { + "epoch": 0.2860641685035513, + "grad_norm": 11.650368628524, + "learning_rate": 1.7973292599321776e-07, + "logits/chosen": -0.769073486328125, + "logits/rejected": -0.8234167695045471, + "logps/chosen": -847.42724609375, + "logps/rejected": -985.3370361328125, + "loss": 0.4925, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.9037604331970215, + "rewards/margins": 1.1833317279815674, + "rewards/rejected": -5.087091445922852, + "step": 438 + }, + { + "epoch": 0.28671728304351374, + "grad_norm": 10.739880288424589, + "learning_rate": 1.7959502134791067e-07, + "logits/chosen": -1.0507196187973022, + "logits/rejected": -0.9440267086029053, + "logps/chosen": -842.0238647460938, + "logps/rejected": -932.1236572265625, + "loss": 0.4573, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.467390537261963, + "rewards/margins": 1.0047094821929932, + "rewards/rejected": -4.472099781036377, + "step": 439 + }, + { + "epoch": 0.2873703975834762, + "grad_norm": 15.061086520968807, + "learning_rate": 1.794567024000787e-07, + "logits/chosen": -0.7447534799575806, + "logits/rejected": -0.7312377095222473, + "logps/chosen": -877.2549438476562, + "logps/rejected": -976.9591064453125, + "loss": 0.5878, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.914492607116699, + "rewards/margins": 0.9334661960601807, + "rewards/rejected": -4.847959041595459, + "step": 440 + }, + { + "epoch": 0.28802351212343863, + "grad_norm": 11.378904599041363, + "learning_rate": 1.7931796986969006e-07, + "logits/chosen": -0.8733054995536804, + "logits/rejected": -0.8079696297645569, + "logps/chosen": -897.7353515625, + "logps/rejected": -923.5629272460938, + "loss": 0.5043, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.33026385307312, + "rewards/margins": 1.067804217338562, + "rewards/rejected": -4.398067951202393, + "step": 441 + }, + { + "epoch": 0.2886766266634011, + "grad_norm": 8.524372415698174, + "learning_rate": 1.791788244788658e-07, + "logits/chosen": -0.7420032620429993, + "logits/rejected": -0.7593204379081726, + "logps/chosen": -712.6714477539062, + "logps/rejected": -809.899169921875, + "loss": 0.4781, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9074902534484863, + "rewards/margins": 0.6758562326431274, + "rewards/rejected": -3.5833466053009033, + "step": 442 + }, + { + "epoch": 0.2893297412033635, + "grad_norm": 11.012683005962536, + "learning_rate": 1.790392669518759e-07, + "logits/chosen": -0.8415663242340088, + "logits/rejected": -0.8038774728775024, + "logps/chosen": -842.3577270507812, + "logps/rejected": -907.7183837890625, + "loss": 0.4267, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.889575481414795, + "rewards/margins": 0.7025194764137268, + "rewards/rejected": -4.592095375061035, + "step": 443 + }, + { + "epoch": 0.289982855743326, + "grad_norm": 12.889931357077444, + "learning_rate": 1.7889929801513565e-07, + "logits/chosen": -0.6964072585105896, + "logits/rejected": -0.6732711791992188, + "logps/chosen": -691.6309814453125, + "logps/rejected": -762.2383422851562, + "loss": 0.5427, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.066455602645874, + "rewards/margins": 0.6046935319900513, + "rewards/rejected": -3.6711490154266357, + "step": 444 + }, + { + "epoch": 0.2906359702832884, + "grad_norm": 9.731628947128165, + "learning_rate": 1.787589183972017e-07, + "logits/chosen": -0.9657904505729675, + "logits/rejected": -0.9944557547569275, + "logps/chosen": -841.9929809570312, + "logps/rejected": -989.1988525390625, + "loss": 0.4848, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4814305305480957, + "rewards/margins": 1.1318271160125732, + "rewards/rejected": -4.61325740814209, + "step": 445 + }, + { + "epoch": 0.2912890848232509, + "grad_norm": 8.084542897402226, + "learning_rate": 1.786181288287683e-07, + "logits/chosen": -0.8106024861335754, + "logits/rejected": -0.7048735022544861, + "logps/chosen": -859.118896484375, + "logps/rejected": -954.908447265625, + "loss": 0.4515, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0667357444763184, + "rewards/margins": 1.035430669784546, + "rewards/rejected": -4.102166652679443, + "step": 446 + }, + { + "epoch": 0.2919421993632133, + "grad_norm": 14.616888201159512, + "learning_rate": 1.7847693004266367e-07, + "logits/chosen": -0.8901810050010681, + "logits/rejected": -0.7865488529205322, + "logps/chosen": -800.8839111328125, + "logps/rejected": -816.6807250976562, + "loss": 0.5162, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.978163719177246, + "rewards/margins": 0.6770980954170227, + "rewards/rejected": -3.655261516571045, + "step": 447 + }, + { + "epoch": 0.2925953139031758, + "grad_norm": 14.867760488305729, + "learning_rate": 1.7833532277384604e-07, + "logits/chosen": -0.8480335474014282, + "logits/rejected": -0.8119291663169861, + "logps/chosen": -682.2476196289062, + "logps/rejected": -718.3956909179688, + "loss": 0.5572, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.687140941619873, + "rewards/margins": 0.585093080997467, + "rewards/rejected": -3.2722342014312744, + "step": 448 + }, + { + "epoch": 0.2932484284431382, + "grad_norm": 12.346656292636492, + "learning_rate": 1.7819330775939978e-07, + "logits/chosen": -0.8786594271659851, + "logits/rejected": -0.8197569251060486, + "logps/chosen": -890.4720458984375, + "logps/rejected": -925.9500122070312, + "loss": 0.5493, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.352853775024414, + "rewards/margins": 0.9036799669265747, + "rewards/rejected": -4.256534099578857, + "step": 449 + }, + { + "epoch": 0.29390154298310067, + "grad_norm": 14.374790747191138, + "learning_rate": 1.780508857385317e-07, + "logits/chosen": -0.776336669921875, + "logits/rejected": -0.6683730483055115, + "logps/chosen": -728.320068359375, + "logps/rejected": -827.4588623046875, + "loss": 0.5504, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0526700019836426, + "rewards/margins": 1.0733567476272583, + "rewards/rejected": -4.126026630401611, + "step": 450 + }, + { + "epoch": 0.2945546575230631, + "grad_norm": 12.060785088732436, + "learning_rate": 1.7790805745256703e-07, + "logits/chosen": -0.6378570795059204, + "logits/rejected": -0.5599284768104553, + "logps/chosen": -788.5953979492188, + "logps/rejected": -829.5595092773438, + "loss": 0.4914, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.06785249710083, + "rewards/margins": 0.6147899627685547, + "rewards/rejected": -3.6826422214508057, + "step": 451 + }, + { + "epoch": 0.29520777206302556, + "grad_norm": 9.973656930649923, + "learning_rate": 1.7776482364494579e-07, + "logits/chosen": -0.7137709856033325, + "logits/rejected": -0.7110152244567871, + "logps/chosen": -865.0806884765625, + "logps/rejected": -980.2341918945312, + "loss": 0.4741, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4852802753448486, + "rewards/margins": 0.8627204895019531, + "rewards/rejected": -4.348001003265381, + "step": 452 + }, + { + "epoch": 0.295860886602988, + "grad_norm": 13.638033723344053, + "learning_rate": 1.7762118506121873e-07, + "logits/chosen": -0.8766645789146423, + "logits/rejected": -0.8333470225334167, + "logps/chosen": -756.6890869140625, + "logps/rejected": -816.2777709960938, + "loss": 0.5171, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.8761587142944336, + "rewards/margins": 0.5043472051620483, + "rewards/rejected": -3.3805060386657715, + "step": 453 + }, + { + "epoch": 0.29651400114295046, + "grad_norm": 10.692905485205573, + "learning_rate": 1.7747714244904346e-07, + "logits/chosen": -0.7567205429077148, + "logits/rejected": -0.7571355104446411, + "logps/chosen": -794.180908203125, + "logps/rejected": -876.698974609375, + "loss": 0.4673, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.093803644180298, + "rewards/margins": 0.8284895420074463, + "rewards/rejected": -3.922293186187744, + "step": 454 + }, + { + "epoch": 0.2971671156829129, + "grad_norm": 9.5645932096092, + "learning_rate": 1.7733269655818076e-07, + "logits/chosen": -0.8513656258583069, + "logits/rejected": -0.7877851724624634, + "logps/chosen": -722.7457885742188, + "logps/rejected": -867.5543212890625, + "loss": 0.5202, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.243468999862671, + "rewards/margins": 0.9194523692131042, + "rewards/rejected": -4.16292142868042, + "step": 455 + }, + { + "epoch": 0.29782023022287535, + "grad_norm": 17.81961025980672, + "learning_rate": 1.7718784814049036e-07, + "logits/chosen": -0.8771448135375977, + "logits/rejected": -0.8124470710754395, + "logps/chosen": -787.6842651367188, + "logps/rejected": -852.7453002929688, + "loss": 0.477, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.917278528213501, + "rewards/margins": 0.9118640422821045, + "rewards/rejected": -3.8291425704956055, + "step": 456 + }, + { + "epoch": 0.29847334476283777, + "grad_norm": 10.521938568244815, + "learning_rate": 1.770425979499273e-07, + "logits/chosen": -0.8637176752090454, + "logits/rejected": -0.8220848441123962, + "logps/chosen": -747.9457397460938, + "logps/rejected": -764.8699340820312, + "loss": 0.5601, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.3360836505889893, + "rewards/margins": 0.39987143874168396, + "rewards/rejected": -3.7359557151794434, + "step": 457 + }, + { + "epoch": 0.29912645930280024, + "grad_norm": 13.778767197295677, + "learning_rate": 1.7689694674253784e-07, + "logits/chosen": -0.8967228531837463, + "logits/rejected": -0.8589329719543457, + "logps/chosen": -817.915771484375, + "logps/rejected": -875.2073974609375, + "loss": 0.4576, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.1167476177215576, + "rewards/margins": 1.0523903369903564, + "rewards/rejected": -4.169137954711914, + "step": 458 + }, + { + "epoch": 0.29977957384276266, + "grad_norm": 10.659382563282882, + "learning_rate": 1.7675089527645568e-07, + "logits/chosen": -0.940302312374115, + "logits/rejected": -0.8791934251785278, + "logps/chosen": -788.9204711914062, + "logps/rejected": -813.0576782226562, + "loss": 0.4806, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.944727659225464, + "rewards/margins": 0.4880099892616272, + "rewards/rejected": -3.4327378273010254, + "step": 459 + }, + { + "epoch": 0.30043268838272513, + "grad_norm": 20.58820840229225, + "learning_rate": 1.766044443118978e-07, + "logits/chosen": -0.9948773384094238, + "logits/rejected": -0.871425211429596, + "logps/chosen": -749.33154296875, + "logps/rejected": -809.860595703125, + "loss": 0.4706, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.934537410736084, + "rewards/margins": 1.2990574836730957, + "rewards/rejected": -4.2335944175720215, + "step": 460 + }, + { + "epoch": 0.30108580292268755, + "grad_norm": 8.561878783064682, + "learning_rate": 1.764575946111607e-07, + "logits/chosen": -0.8827503323554993, + "logits/rejected": -0.8580729365348816, + "logps/chosen": -705.9119873046875, + "logps/rejected": -750.5169677734375, + "loss": 0.5087, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.4296109676361084, + "rewards/margins": 0.5230472087860107, + "rewards/rejected": -2.95265793800354, + "step": 461 + }, + { + "epoch": 0.30173891746265, + "grad_norm": 10.058893086562453, + "learning_rate": 1.7631034693861633e-07, + "logits/chosen": -0.8686078190803528, + "logits/rejected": -0.8488516807556152, + "logps/chosen": -806.1150512695312, + "logps/rejected": -868.5146484375, + "loss": 0.5019, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1653387546539307, + "rewards/margins": 0.8287073373794556, + "rewards/rejected": -3.994046211242676, + "step": 462 + }, + { + "epoch": 0.30239203200261244, + "grad_norm": 10.19039827134051, + "learning_rate": 1.7616270206070811e-07, + "logits/chosen": -0.8264525532722473, + "logits/rejected": -0.7664126753807068, + "logps/chosen": -784.3181762695312, + "logps/rejected": -865.5259399414062, + "loss": 0.4936, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1958301067352295, + "rewards/margins": 0.8845147490501404, + "rewards/rejected": -4.080345153808594, + "step": 463 + }, + { + "epoch": 0.3030451465425749, + "grad_norm": 13.006511852492745, + "learning_rate": 1.7601466074594705e-07, + "logits/chosen": -0.7952624559402466, + "logits/rejected": -0.7655043601989746, + "logps/chosen": -725.4571533203125, + "logps/rejected": -828.1234741210938, + "loss": 0.4174, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6289877891540527, + "rewards/margins": 0.9357907772064209, + "rewards/rejected": -3.5647785663604736, + "step": 464 + }, + { + "epoch": 0.30369826108253734, + "grad_norm": 10.478801051027006, + "learning_rate": 1.7586622376490755e-07, + "logits/chosen": -0.8945034742355347, + "logits/rejected": -0.8918284773826599, + "logps/chosen": -796.9361572265625, + "logps/rejected": -890.4472045898438, + "loss": 0.45, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1761415004730225, + "rewards/margins": 0.8999980688095093, + "rewards/rejected": -4.0761399269104, + "step": 465 + }, + { + "epoch": 0.3043513756224998, + "grad_norm": 9.345422496827316, + "learning_rate": 1.7571739189022363e-07, + "logits/chosen": -0.7118120193481445, + "logits/rejected": -0.6660512089729309, + "logps/chosen": -831.6751708984375, + "logps/rejected": -888.5519409179688, + "loss": 0.4966, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.2425475120544434, + "rewards/margins": 0.8526361584663391, + "rewards/rejected": -4.095183849334717, + "step": 466 + }, + { + "epoch": 0.3050044901624622, + "grad_norm": 10.501170102059676, + "learning_rate": 1.7556816589658463e-07, + "logits/chosen": -0.7806082963943481, + "logits/rejected": -0.7936585545539856, + "logps/chosen": -868.0274047851562, + "logps/rejected": -948.4763793945312, + "loss": 0.5316, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.5230109691619873, + "rewards/margins": 0.8672765493392944, + "rewards/rejected": -4.390287399291992, + "step": 467 + }, + { + "epoch": 0.3056576047024247, + "grad_norm": 16.5631467557302, + "learning_rate": 1.754185465607315e-07, + "logits/chosen": -0.8917798399925232, + "logits/rejected": -0.8434329628944397, + "logps/chosen": -786.859375, + "logps/rejected": -862.1466064453125, + "loss": 0.4638, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.0098447799682617, + "rewards/margins": 0.9299499988555908, + "rewards/rejected": -3.9397945404052734, + "step": 468 + }, + { + "epoch": 0.3063107192423871, + "grad_norm": 13.399040798349395, + "learning_rate": 1.7526853466145243e-07, + "logits/chosen": -0.7421934604644775, + "logits/rejected": -0.705778956413269, + "logps/chosen": -756.4699096679688, + "logps/rejected": -843.1365356445312, + "loss": 0.48, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0707902908325195, + "rewards/margins": 1.0485197305679321, + "rewards/rejected": -4.119309902191162, + "step": 469 + }, + { + "epoch": 0.3069638337823496, + "grad_norm": 10.847863165499646, + "learning_rate": 1.751181309795791e-07, + "logits/chosen": -0.7847117185592651, + "logits/rejected": -0.8583936095237732, + "logps/chosen": -699.90087890625, + "logps/rejected": -841.529541015625, + "loss": 0.4531, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.96787691116333, + "rewards/margins": 0.8856784105300903, + "rewards/rejected": -3.853555202484131, + "step": 470 + }, + { + "epoch": 0.307616948322312, + "grad_norm": 10.623297131564883, + "learning_rate": 1.7496733629798236e-07, + "logits/chosen": -0.7139683961868286, + "logits/rejected": -0.7501896023750305, + "logps/chosen": -771.0042114257812, + "logps/rejected": -972.8804931640625, + "loss": 0.5007, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.053046464920044, + "rewards/margins": 1.4686146974563599, + "rewards/rejected": -4.521660804748535, + "step": 471 + }, + { + "epoch": 0.3082700628622745, + "grad_norm": 9.209388737884883, + "learning_rate": 1.7481615140156833e-07, + "logits/chosen": -0.8231265544891357, + "logits/rejected": -0.8001554608345032, + "logps/chosen": -827.5318603515625, + "logps/rejected": -872.6943359375, + "loss": 0.4664, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0368971824645996, + "rewards/margins": 0.6198225021362305, + "rewards/rejected": -3.656719923019409, + "step": 472 + }, + { + "epoch": 0.3089231774022369, + "grad_norm": 9.143141888933894, + "learning_rate": 1.746645770772742e-07, + "logits/chosen": -0.7907102108001709, + "logits/rejected": -0.7144389748573303, + "logps/chosen": -691.955322265625, + "logps/rejected": -750.7633056640625, + "loss": 0.461, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7297940254211426, + "rewards/margins": 0.870061457157135, + "rewards/rejected": -3.599855422973633, + "step": 473 + }, + { + "epoch": 0.3095762919421994, + "grad_norm": 12.054264508592999, + "learning_rate": 1.7451261411406418e-07, + "logits/chosen": -0.7955212593078613, + "logits/rejected": -0.7325343489646912, + "logps/chosen": -816.0054321289062, + "logps/rejected": -856.5308837890625, + "loss": 0.4835, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.0509090423583984, + "rewards/margins": 0.724595844745636, + "rewards/rejected": -3.7755050659179688, + "step": 474 + }, + { + "epoch": 0.3102294064821618, + "grad_norm": 10.79946259326026, + "learning_rate": 1.743602633029255e-07, + "logits/chosen": -0.8787297606468201, + "logits/rejected": -0.8972852826118469, + "logps/chosen": -676.2521362304688, + "logps/rejected": -836.7784423828125, + "loss": 0.438, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7497074604034424, + "rewards/margins": 1.409981369972229, + "rewards/rejected": -4.159688949584961, + "step": 475 + }, + { + "epoch": 0.31088252102212427, + "grad_norm": 9.53436829750779, + "learning_rate": 1.7420752543686404e-07, + "logits/chosen": -0.879591166973114, + "logits/rejected": -0.8345413208007812, + "logps/chosen": -809.7759399414062, + "logps/rejected": -808.8768310546875, + "loss": 0.4768, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.2994282245635986, + "rewards/margins": 0.5916791558265686, + "rewards/rejected": -3.8911075592041016, + "step": 476 + }, + { + "epoch": 0.3115356355620867, + "grad_norm": 9.592920851797404, + "learning_rate": 1.7405440131090046e-07, + "logits/chosen": -0.9241241216659546, + "logits/rejected": -0.8796841502189636, + "logps/chosen": -859.171142578125, + "logps/rejected": -917.967041015625, + "loss": 0.4416, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2931156158447266, + "rewards/margins": 1.0294749736785889, + "rewards/rejected": -4.3225908279418945, + "step": 477 + }, + { + "epoch": 0.31218875010204916, + "grad_norm": 19.895354822148523, + "learning_rate": 1.739008917220659e-07, + "logits/chosen": -0.9288607835769653, + "logits/rejected": -0.8922178745269775, + "logps/chosen": -867.878173828125, + "logps/rejected": -910.6178588867188, + "loss": 0.5138, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.8489792346954346, + "rewards/margins": 0.71727454662323, + "rewards/rejected": -4.566253662109375, + "step": 478 + }, + { + "epoch": 0.3128418646420116, + "grad_norm": 10.578879623870158, + "learning_rate": 1.737469974693979e-07, + "logits/chosen": -0.8969875574111938, + "logits/rejected": -0.8435537219047546, + "logps/chosen": -891.6130981445312, + "logps/rejected": -1026.607177734375, + "loss": 0.4548, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6824443340301514, + "rewards/margins": 1.2574917078018188, + "rewards/rejected": -4.939936637878418, + "step": 479 + }, + { + "epoch": 0.31349497918197405, + "grad_norm": 13.352402212334288, + "learning_rate": 1.735927193539363e-07, + "logits/chosen": -0.8216950297355652, + "logits/rejected": -0.7246571183204651, + "logps/chosen": -759.7047119140625, + "logps/rejected": -856.9074096679688, + "loss": 0.4726, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.9738543033599854, + "rewards/margins": 1.1688733100891113, + "rewards/rejected": -4.142727851867676, + "step": 480 + }, + { + "epoch": 0.31414809372193647, + "grad_norm": 11.869022792008412, + "learning_rate": 1.7343805817871884e-07, + "logits/chosen": -0.8111241459846497, + "logits/rejected": -0.766447901725769, + "logps/chosen": -888.6497802734375, + "logps/rejected": -962.759765625, + "loss": 0.4559, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6871464252471924, + "rewards/margins": 1.016096830368042, + "rewards/rejected": -4.703243255615234, + "step": 481 + }, + { + "epoch": 0.31480120826189895, + "grad_norm": 14.231372028726907, + "learning_rate": 1.7328301474877723e-07, + "logits/chosen": -0.9776662588119507, + "logits/rejected": -0.9157834649085999, + "logps/chosen": -805.4437866210938, + "logps/rejected": -909.315673828125, + "loss": 0.4274, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1832499504089355, + "rewards/margins": 0.964979887008667, + "rewards/rejected": -4.148229598999023, + "step": 482 + }, + { + "epoch": 0.31545432280186136, + "grad_norm": 13.141716217042305, + "learning_rate": 1.731275898711329e-07, + "logits/chosen": -0.6212536692619324, + "logits/rejected": -0.658240795135498, + "logps/chosen": -847.568115234375, + "logps/rejected": -932.3016967773438, + "loss": 0.4936, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.397820472717285, + "rewards/margins": 0.9783123135566711, + "rewards/rejected": -4.376132488250732, + "step": 483 + }, + { + "epoch": 0.31610743734182384, + "grad_norm": 11.377521596330835, + "learning_rate": 1.7297178435479267e-07, + "logits/chosen": -0.6372016668319702, + "logits/rejected": -0.6643779873847961, + "logps/chosen": -765.1443481445312, + "logps/rejected": -921.8984985351562, + "loss": 0.4603, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6738381385803223, + "rewards/margins": 1.214465856552124, + "rewards/rejected": -4.888303756713867, + "step": 484 + }, + { + "epoch": 0.31676055188178626, + "grad_norm": 10.093475452329823, + "learning_rate": 1.7281559901074472e-07, + "logits/chosen": -0.6943804621696472, + "logits/rejected": -0.7027254104614258, + "logps/chosen": -842.590576171875, + "logps/rejected": -997.2478637695312, + "loss": 0.4715, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.864556312561035, + "rewards/margins": 1.3417086601257324, + "rewards/rejected": -5.206264972686768, + "step": 485 + }, + { + "epoch": 0.31741366642174873, + "grad_norm": 11.537418032619593, + "learning_rate": 1.7265903465195427e-07, + "logits/chosen": -0.7280626893043518, + "logits/rejected": -0.6906183958053589, + "logps/chosen": -821.1693115234375, + "logps/rejected": -924.5863037109375, + "loss": 0.4554, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.9434003829956055, + "rewards/margins": 1.0203464031219482, + "rewards/rejected": -4.963746547698975, + "step": 486 + }, + { + "epoch": 0.31806678096171115, + "grad_norm": 12.21030922364517, + "learning_rate": 1.7250209209335926e-07, + "logits/chosen": -0.8060334920883179, + "logits/rejected": -0.84015953540802, + "logps/chosen": -825.5, + "logps/rejected": -943.2487182617188, + "loss": 0.5405, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.8686962127685547, + "rewards/margins": 0.5992237329483032, + "rewards/rejected": -4.467920303344727, + "step": 487 + }, + { + "epoch": 0.3187198955016736, + "grad_norm": 11.897072585977261, + "learning_rate": 1.7234477215186636e-07, + "logits/chosen": -0.7122973799705505, + "logits/rejected": -0.7898483276367188, + "logps/chosen": -792.6326904296875, + "logps/rejected": -949.6519165039062, + "loss": 0.4476, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.5841124057769775, + "rewards/margins": 1.0407400131225586, + "rewards/rejected": -4.624852657318115, + "step": 488 + }, + { + "epoch": 0.31937301004163604, + "grad_norm": 12.129067136065453, + "learning_rate": 1.721870756463465e-07, + "logits/chosen": -0.880097508430481, + "logits/rejected": -0.7950392365455627, + "logps/chosen": -844.4869995117188, + "logps/rejected": -939.5963745117188, + "loss": 0.4325, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.066311359405518, + "rewards/margins": 1.0431554317474365, + "rewards/rejected": -5.109466552734375, + "step": 489 + }, + { + "epoch": 0.3200261245815985, + "grad_norm": 11.35584429849149, + "learning_rate": 1.7202900339763064e-07, + "logits/chosen": -0.841032087802887, + "logits/rejected": -0.799349308013916, + "logps/chosen": -821.346923828125, + "logps/rejected": -912.546630859375, + "loss": 0.4661, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.8169021606445312, + "rewards/margins": 1.0876867771148682, + "rewards/rejected": -4.90458869934082, + "step": 490 + }, + { + "epoch": 0.32067923912156093, + "grad_norm": 15.041059756588488, + "learning_rate": 1.7187055622850558e-07, + "logits/chosen": -0.930271327495575, + "logits/rejected": -0.6977147459983826, + "logps/chosen": -942.312744140625, + "logps/rejected": -976.447509765625, + "loss": 0.5137, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.337576866149902, + "rewards/margins": 0.8670297861099243, + "rewards/rejected": -5.204606533050537, + "step": 491 + }, + { + "epoch": 0.3213323536615234, + "grad_norm": 16.154803531987202, + "learning_rate": 1.717117349637096e-07, + "logits/chosen": -0.728052020072937, + "logits/rejected": -0.7592598795890808, + "logps/chosen": -822.1433715820312, + "logps/rejected": -954.33642578125, + "loss": 0.5059, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.139540672302246, + "rewards/margins": 1.180368423461914, + "rewards/rejected": -5.319908142089844, + "step": 492 + }, + { + "epoch": 0.3219854682014858, + "grad_norm": 19.037823685112883, + "learning_rate": 1.7155254042992825e-07, + "logits/chosen": -0.8321860432624817, + "logits/rejected": -0.7546296119689941, + "logps/chosen": -870.90234375, + "logps/rejected": -969.2771606445312, + "loss": 0.5401, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.6551272869110107, + "rewards/margins": 0.9787754416465759, + "rewards/rejected": -4.6339030265808105, + "step": 493 + }, + { + "epoch": 0.3226385827414483, + "grad_norm": 12.255497260985175, + "learning_rate": 1.7139297345578992e-07, + "logits/chosen": -0.8148177862167358, + "logits/rejected": -0.8120337724685669, + "logps/chosen": -818.4249267578125, + "logps/rejected": -935.2034912109375, + "loss": 0.4886, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.5292749404907227, + "rewards/margins": 1.132380723953247, + "rewards/rejected": -4.661655902862549, + "step": 494 + }, + { + "epoch": 0.3232916972814107, + "grad_norm": 12.380876583809687, + "learning_rate": 1.7123303487186163e-07, + "logits/chosen": -0.8092105984687805, + "logits/rejected": -0.802417516708374, + "logps/chosen": -872.7374267578125, + "logps/rejected": -993.07275390625, + "loss": 0.4734, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.427180290222168, + "rewards/margins": 1.040173053741455, + "rewards/rejected": -4.467353343963623, + "step": 495 + }, + { + "epoch": 0.3239448118213732, + "grad_norm": 13.011638430489379, + "learning_rate": 1.710727255106447e-07, + "logits/chosen": -0.9116150736808777, + "logits/rejected": -0.8956907987594604, + "logps/chosen": -693.85986328125, + "logps/rejected": -819.0209350585938, + "loss": 0.4843, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.104858636856079, + "rewards/margins": 1.0460671186447144, + "rewards/rejected": -4.150925159454346, + "step": 496 + }, + { + "epoch": 0.3245979263613356, + "grad_norm": 13.157305554266493, + "learning_rate": 1.7091204620657042e-07, + "logits/chosen": -0.9443058967590332, + "logits/rejected": -0.8911035060882568, + "logps/chosen": -843.8384399414062, + "logps/rejected": -942.85693359375, + "loss": 0.4831, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.061310768127441, + "rewards/margins": 1.07249116897583, + "rewards/rejected": -5.13380241394043, + "step": 497 + }, + { + "epoch": 0.3252510409012981, + "grad_norm": 13.572167660903137, + "learning_rate": 1.707509977959956e-07, + "logits/chosen": -0.8992602229118347, + "logits/rejected": -0.900657057762146, + "logps/chosen": -860.68115234375, + "logps/rejected": -970.5743408203125, + "loss": 0.4518, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.872074604034424, + "rewards/margins": 1.0880197286605835, + "rewards/rejected": -4.960093975067139, + "step": 498 + }, + { + "epoch": 0.3259041554412605, + "grad_norm": 13.585680096052258, + "learning_rate": 1.7058958111719835e-07, + "logits/chosen": -0.7684957385063171, + "logits/rejected": -0.7548912167549133, + "logps/chosen": -855.7137451171875, + "logps/rejected": -943.9931030273438, + "loss": 0.4719, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.692744731903076, + "rewards/margins": 1.0405030250549316, + "rewards/rejected": -4.733248233795166, + "step": 499 + }, + { + "epoch": 0.326557269981223, + "grad_norm": 9.503533989046367, + "learning_rate": 1.704277970103736e-07, + "logits/chosen": -0.8868258595466614, + "logits/rejected": -0.8633951544761658, + "logps/chosen": -849.567626953125, + "logps/rejected": -934.948486328125, + "loss": 0.4314, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.8034567832946777, + "rewards/margins": 0.9566541314125061, + "rewards/rejected": -4.760110855102539, + "step": 500 + }, + { + "epoch": 0.326557269981223, + "eval_logits/chosen": -0.669052243232727, + "eval_logits/rejected": -0.6189415454864502, + "eval_logps/chosen": -840.8739624023438, + "eval_logps/rejected": -925.4282836914062, + "eval_loss": 0.4787108898162842, + "eval_rewards/accuracies": 0.7630000114440918, + "eval_rewards/chosen": -3.6696672439575195, + "eval_rewards/margins": 0.9329305291175842, + "eval_rewards/rejected": -4.602597236633301, + "eval_runtime": 615.294, + "eval_samples_per_second": 6.501, + "eval_steps_per_second": 0.406, + "step": 500 + }, + { + "epoch": 0.3272103845211854, + "grad_norm": 15.147382044291335, + "learning_rate": 1.7026564631762895e-07, + "logits/chosen": -0.7561497092247009, + "logits/rejected": -0.7421559691429138, + "logps/chosen": -769.6187133789062, + "logps/rejected": -881.5009765625, + "loss": 0.5245, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.4119911193847656, + "rewards/margins": 1.0296821594238281, + "rewards/rejected": -4.441673755645752, + "step": 501 + }, + { + "epoch": 0.32786349906114787, + "grad_norm": 10.968419509869916, + "learning_rate": 1.7010312988297993e-07, + "logits/chosen": -0.8724136352539062, + "logits/rejected": -0.9291731119155884, + "logps/chosen": -879.1983032226562, + "logps/rejected": -1025.105712890625, + "loss": 0.4349, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4728522300720215, + "rewards/margins": 1.3036978244781494, + "rewards/rejected": -4.776549816131592, + "step": 502 + }, + { + "epoch": 0.3285166136011103, + "grad_norm": 10.141199307712194, + "learning_rate": 1.6994024855234592e-07, + "logits/chosen": -0.8684702515602112, + "logits/rejected": -0.8307234048843384, + "logps/chosen": -816.9854125976562, + "logps/rejected": -867.1416015625, + "loss": 0.4723, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5080981254577637, + "rewards/margins": 0.7219822406768799, + "rewards/rejected": -4.2300801277160645, + "step": 503 + }, + { + "epoch": 0.32916972814107276, + "grad_norm": 14.673164446644734, + "learning_rate": 1.6977700317354565e-07, + "logits/chosen": -0.7906508445739746, + "logits/rejected": -0.7534193992614746, + "logps/chosen": -878.6422119140625, + "logps/rejected": -905.3970947265625, + "loss": 0.4902, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.976510524749756, + "rewards/margins": 0.7006040215492249, + "rewards/rejected": -4.677114963531494, + "step": 504 + }, + { + "epoch": 0.3298228426810352, + "grad_norm": 9.7205031042585, + "learning_rate": 1.6961339459629266e-07, + "logits/chosen": -0.8221117258071899, + "logits/rejected": -0.7521359920501709, + "logps/chosen": -899.8609619140625, + "logps/rejected": -955.1153564453125, + "loss": 0.4702, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.543219566345215, + "rewards/margins": 0.982782781124115, + "rewards/rejected": -5.526001930236816, + "step": 505 + }, + { + "epoch": 0.33047595722099765, + "grad_norm": 12.414035243869511, + "learning_rate": 1.694494236721911e-07, + "logits/chosen": -1.0194346904754639, + "logits/rejected": -0.9434860944747925, + "logps/chosen": -840.7852172851562, + "logps/rejected": -910.69140625, + "loss": 0.452, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.716850996017456, + "rewards/margins": 0.9896608591079712, + "rewards/rejected": -4.706511497497559, + "step": 506 + }, + { + "epoch": 0.33112907176096007, + "grad_norm": 16.29842962578957, + "learning_rate": 1.6928509125473109e-07, + "logits/chosen": -0.9049715995788574, + "logits/rejected": -0.9679229855537415, + "logps/chosen": -842.1791381835938, + "logps/rejected": -965.1322631835938, + "loss": 0.5553, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8672585487365723, + "rewards/margins": 1.0312442779541016, + "rewards/rejected": -4.898502826690674, + "step": 507 + }, + { + "epoch": 0.33178218630092254, + "grad_norm": 11.824506292597153, + "learning_rate": 1.6912039819928444e-07, + "logits/chosen": -0.9393632411956787, + "logits/rejected": -0.8065717816352844, + "logps/chosen": -909.3609008789062, + "logps/rejected": -944.26806640625, + "loss": 0.4606, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.5138916969299316, + "rewards/margins": 1.1851271390914917, + "rewards/rejected": -4.699018955230713, + "step": 508 + }, + { + "epoch": 0.33243530084088496, + "grad_norm": 13.178303937450522, + "learning_rate": 1.6895534536310016e-07, + "logits/chosen": -0.8816686868667603, + "logits/rejected": -0.9407558441162109, + "logps/chosen": -825.6542358398438, + "logps/rejected": -1041.97900390625, + "loss": 0.5393, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.608908176422119, + "rewards/margins": 1.2655761241912842, + "rewards/rejected": -4.874484062194824, + "step": 509 + }, + { + "epoch": 0.33308841538084744, + "grad_norm": 14.045988235199069, + "learning_rate": 1.6878993360529982e-07, + "logits/chosen": -1.021334171295166, + "logits/rejected": -0.8892602920532227, + "logps/chosen": -886.7183837890625, + "logps/rejected": -941.6819458007812, + "loss": 0.4527, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.115851402282715, + "rewards/margins": 0.8323267698287964, + "rewards/rejected": -4.948177814483643, + "step": 510 + }, + { + "epoch": 0.33374152992080985, + "grad_norm": 13.698262023902034, + "learning_rate": 1.6862416378687336e-07, + "logits/chosen": -0.9176092743873596, + "logits/rejected": -0.834775447845459, + "logps/chosen": -844.972412109375, + "logps/rejected": -964.8734741210938, + "loss": 0.5024, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.549609899520874, + "rewards/margins": 1.238707423210144, + "rewards/rejected": -4.7883172035217285, + "step": 511 + }, + { + "epoch": 0.33439464446077233, + "grad_norm": 13.51360210625101, + "learning_rate": 1.684580367706744e-07, + "logits/chosen": -0.9627776145935059, + "logits/rejected": -0.9435927271842957, + "logps/chosen": -916.3948364257812, + "logps/rejected": -1032.2822265625, + "loss": 0.4764, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.037001609802246, + "rewards/margins": 1.3188434839248657, + "rewards/rejected": -5.355844497680664, + "step": 512 + }, + { + "epoch": 0.33504775900073475, + "grad_norm": 12.919673658707383, + "learning_rate": 1.6829155342141586e-07, + "logits/chosen": -0.7160369157791138, + "logits/rejected": -0.7462060451507568, + "logps/chosen": -783.5517578125, + "logps/rejected": -832.207275390625, + "loss": 0.4687, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.499671220779419, + "rewards/margins": 0.49122992157936096, + "rewards/rejected": -3.990901231765747, + "step": 513 + }, + { + "epoch": 0.3357008735406972, + "grad_norm": 16.359086592503015, + "learning_rate": 1.681247146056654e-07, + "logits/chosen": -0.9952366948127747, + "logits/rejected": -0.8325803875923157, + "logps/chosen": -982.23828125, + "logps/rejected": -1018.0582275390625, + "loss": 0.5182, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.51038122177124, + "rewards/margins": 0.5027068853378296, + "rewards/rejected": -5.013088226318359, + "step": 514 + }, + { + "epoch": 0.33635398808065964, + "grad_norm": 12.599243151717252, + "learning_rate": 1.6795752119184096e-07, + "logits/chosen": -0.7795126438140869, + "logits/rejected": -0.7431491017341614, + "logps/chosen": -834.5680541992188, + "logps/rejected": -903.7908935546875, + "loss": 0.4307, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.54948091506958, + "rewards/margins": 0.811899721622467, + "rewards/rejected": -4.3613810539245605, + "step": 515 + }, + { + "epoch": 0.3370071026206221, + "grad_norm": 23.221262136804224, + "learning_rate": 1.6778997405020616e-07, + "logits/chosen": -0.8013187646865845, + "logits/rejected": -0.7317193746566772, + "logps/chosen": -978.86865234375, + "logps/rejected": -1063.277587890625, + "loss": 0.5415, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.981942653656006, + "rewards/margins": 1.0086599588394165, + "rewards/rejected": -4.990602016448975, + "step": 516 + }, + { + "epoch": 0.33766021716058453, + "grad_norm": 10.605337134032375, + "learning_rate": 1.6762207405286586e-07, + "logits/chosen": -0.62137770652771, + "logits/rejected": -0.6129266619682312, + "logps/chosen": -876.7236328125, + "logps/rejected": -995.8892211914062, + "loss": 0.5172, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.8768973350524902, + "rewards/margins": 1.032037377357483, + "rewards/rejected": -4.908934593200684, + "step": 517 + }, + { + "epoch": 0.338313331700547, + "grad_norm": 12.079127195544842, + "learning_rate": 1.6745382207376156e-07, + "logits/chosen": -0.8155550956726074, + "logits/rejected": -0.7862538695335388, + "logps/chosen": -860.969482421875, + "logps/rejected": -986.3485717773438, + "loss": 0.5162, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.910642385482788, + "rewards/margins": 1.0282100439071655, + "rewards/rejected": -4.938852787017822, + "step": 518 + }, + { + "epoch": 0.3389664462405094, + "grad_norm": 12.099481524777941, + "learning_rate": 1.6728521898866686e-07, + "logits/chosen": -0.9201481342315674, + "logits/rejected": -0.9237188696861267, + "logps/chosen": -874.0413208007812, + "logps/rejected": -971.56494140625, + "loss": 0.4838, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.7183971405029297, + "rewards/margins": 0.771868109703064, + "rewards/rejected": -4.490265846252441, + "step": 519 + }, + { + "epoch": 0.3396195607804719, + "grad_norm": 12.606539455519421, + "learning_rate": 1.6711626567518297e-07, + "logits/chosen": -1.0150597095489502, + "logits/rejected": -1.0211997032165527, + "logps/chosen": -820.2203369140625, + "logps/rejected": -894.1910400390625, + "loss": 0.4724, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.321272134780884, + "rewards/margins": 0.7952224612236023, + "rewards/rejected": -4.116494655609131, + "step": 520 + }, + { + "epoch": 0.3402726753204343, + "grad_norm": 9.918000148345703, + "learning_rate": 1.6694696301273402e-07, + "logits/chosen": -0.7031683325767517, + "logits/rejected": -0.5711356401443481, + "logps/chosen": -777.73193359375, + "logps/rejected": -904.4061889648438, + "loss": 0.4397, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.679725170135498, + "rewards/margins": 1.2326750755310059, + "rewards/rejected": -4.912400245666504, + "step": 521 + }, + { + "epoch": 0.3409257898603968, + "grad_norm": 9.780704201122273, + "learning_rate": 1.6677731188256257e-07, + "logits/chosen": -0.9383131861686707, + "logits/rejected": -0.9517966508865356, + "logps/chosen": -805.9428100585938, + "logps/rejected": -924.341552734375, + "loss": 0.4209, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.364539384841919, + "rewards/margins": 1.0484657287597656, + "rewards/rejected": -4.4130048751831055, + "step": 522 + }, + { + "epoch": 0.3415789044003592, + "grad_norm": 11.866203216090135, + "learning_rate": 1.6660731316772502e-07, + "logits/chosen": -0.796733021736145, + "logits/rejected": -0.7914215326309204, + "logps/chosen": -848.482421875, + "logps/rejected": -931.0703125, + "loss": 0.5249, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.7563669681549072, + "rewards/margins": 0.6706068515777588, + "rewards/rejected": -4.426973819732666, + "step": 523 + }, + { + "epoch": 0.3422320189403217, + "grad_norm": 13.399215080432462, + "learning_rate": 1.6643696775308694e-07, + "logits/chosen": -0.8221777081489563, + "logits/rejected": -0.8325170874595642, + "logps/chosen": -776.5897216796875, + "logps/rejected": -887.011962890625, + "loss": 0.4454, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.084960699081421, + "rewards/margins": 1.0111218690872192, + "rewards/rejected": -4.09608268737793, + "step": 524 + }, + { + "epoch": 0.3428851334802841, + "grad_norm": 13.323830997149015, + "learning_rate": 1.662662765253186e-07, + "logits/chosen": -0.9103497266769409, + "logits/rejected": -0.9971468448638916, + "logps/chosen": -743.458740234375, + "logps/rejected": -852.3600463867188, + "loss": 0.4597, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1231021881103516, + "rewards/margins": 0.6584322452545166, + "rewards/rejected": -3.7815346717834473, + "step": 525 + }, + { + "epoch": 0.3435382480202466, + "grad_norm": 11.479976007473763, + "learning_rate": 1.6609524037289016e-07, + "logits/chosen": -0.8146867752075195, + "logits/rejected": -0.8755562901496887, + "logps/chosen": -821.8370971679688, + "logps/rejected": -998.6825561523438, + "loss": 0.3882, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.435415744781494, + "rewards/margins": 1.529471755027771, + "rewards/rejected": -4.964887619018555, + "step": 526 + }, + { + "epoch": 0.344191362560209, + "grad_norm": 16.99793649417262, + "learning_rate": 1.6592386018606735e-07, + "logits/chosen": -0.7995474934577942, + "logits/rejected": -0.8219078779220581, + "logps/chosen": -768.4069213867188, + "logps/rejected": -880.92724609375, + "loss": 0.505, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.4240992069244385, + "rewards/margins": 0.8240652680397034, + "rewards/rejected": -4.248164653778076, + "step": 527 + }, + { + "epoch": 0.34484447710017146, + "grad_norm": 22.950737519034714, + "learning_rate": 1.6575213685690636e-07, + "logits/chosen": -0.7773298025131226, + "logits/rejected": -0.6868129968643188, + "logps/chosen": -749.68798828125, + "logps/rejected": -865.9345092773438, + "loss": 0.494, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.298440456390381, + "rewards/margins": 1.106008768081665, + "rewards/rejected": -4.404449462890625, + "step": 528 + }, + { + "epoch": 0.3454975916401339, + "grad_norm": 13.084048334963885, + "learning_rate": 1.6558007127924977e-07, + "logits/chosen": -0.8282909989356995, + "logits/rejected": -0.8078904151916504, + "logps/chosen": -876.759765625, + "logps/rejected": -973.1893310546875, + "loss": 0.471, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8432815074920654, + "rewards/margins": 1.1956998109817505, + "rewards/rejected": -5.0389814376831055, + "step": 529 + }, + { + "epoch": 0.34615070618009636, + "grad_norm": 13.458405988710624, + "learning_rate": 1.6540766434872137e-07, + "logits/chosen": -0.7838751077651978, + "logits/rejected": -0.7649997472763062, + "logps/chosen": -879.0845336914062, + "logps/rejected": -1020.4249877929688, + "loss": 0.4086, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.94573712348938, + "rewards/margins": 1.4766623973846436, + "rewards/rejected": -5.422399520874023, + "step": 530 + }, + { + "epoch": 0.3468038207200588, + "grad_norm": 14.043989120826666, + "learning_rate": 1.6523491696272192e-07, + "logits/chosen": -0.9524602293968201, + "logits/rejected": -0.9211701154708862, + "logps/chosen": -890.7154541015625, + "logps/rejected": -965.8680419921875, + "loss": 0.463, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.362317085266113, + "rewards/margins": 0.8987084031105042, + "rewards/rejected": -5.261025428771973, + "step": 531 + }, + { + "epoch": 0.34745693526002125, + "grad_norm": 11.383924987842038, + "learning_rate": 1.650618300204242e-07, + "logits/chosen": -0.9213005900382996, + "logits/rejected": -0.9080460071563721, + "logps/chosen": -811.111083984375, + "logps/rejected": -971.2776489257812, + "loss": 0.4346, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.729093551635742, + "rewards/margins": 1.4446587562561035, + "rewards/rejected": -5.173752307891846, + "step": 532 + }, + { + "epoch": 0.34811004979998367, + "grad_norm": 11.180079833888518, + "learning_rate": 1.6488840442276846e-07, + "logits/chosen": -0.8584150671958923, + "logits/rejected": -0.9680982828140259, + "logps/chosen": -807.1697387695312, + "logps/rejected": -950.741455078125, + "loss": 0.4583, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.821699619293213, + "rewards/margins": 1.069732904434204, + "rewards/rejected": -4.891432762145996, + "step": 533 + }, + { + "epoch": 0.34876316433994614, + "grad_norm": 12.714767456829277, + "learning_rate": 1.6471464107245766e-07, + "logits/chosen": -1.0752252340316772, + "logits/rejected": -1.076012134552002, + "logps/chosen": -833.3370971679688, + "logps/rejected": -881.901611328125, + "loss": 0.558, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.728384256362915, + "rewards/margins": 0.7215151786804199, + "rewards/rejected": -4.449899673461914, + "step": 534 + }, + { + "epoch": 0.34941627887990856, + "grad_norm": 13.100511603907313, + "learning_rate": 1.645405408739528e-07, + "logits/chosen": -0.8571863174438477, + "logits/rejected": -0.7581591606140137, + "logps/chosen": -853.043212890625, + "logps/rejected": -943.9117431640625, + "loss": 0.4645, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.86018443107605, + "rewards/margins": 0.9599357843399048, + "rewards/rejected": -4.820120334625244, + "step": 535 + }, + { + "epoch": 0.35006939341987103, + "grad_norm": 11.192981080205437, + "learning_rate": 1.643661047334683e-07, + "logits/chosen": -0.8171364665031433, + "logits/rejected": -0.8735244274139404, + "logps/chosen": -1039.5997314453125, + "logps/rejected": -1199.798583984375, + "loss": 0.4044, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.921164512634277, + "rewards/margins": 1.3224613666534424, + "rewards/rejected": -6.243625640869141, + "step": 536 + }, + { + "epoch": 0.35072250795983345, + "grad_norm": 12.109010785072018, + "learning_rate": 1.6419133355896713e-07, + "logits/chosen": -0.9537902474403381, + "logits/rejected": -0.8713423013687134, + "logps/chosen": -931.1124877929688, + "logps/rejected": -981.86572265625, + "loss": 0.4852, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.270295143127441, + "rewards/margins": 1.0137115716934204, + "rewards/rejected": -5.284006118774414, + "step": 537 + }, + { + "epoch": 0.3513756224997959, + "grad_norm": 19.462264324301042, + "learning_rate": 1.6401622826015612e-07, + "logits/chosen": -0.9751338958740234, + "logits/rejected": -0.8156272172927856, + "logps/chosen": -881.4043579101562, + "logps/rejected": -954.9631958007812, + "loss": 0.4636, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.002389907836914, + "rewards/margins": 0.9239901304244995, + "rewards/rejected": -4.926379203796387, + "step": 538 + }, + { + "epoch": 0.35202873703975834, + "grad_norm": 17.303434296903287, + "learning_rate": 1.6384078974848142e-07, + "logits/chosen": -0.858856737613678, + "logits/rejected": -0.8381833434104919, + "logps/chosen": -914.42041015625, + "logps/rejected": -981.6102905273438, + "loss": 0.4921, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.198989391326904, + "rewards/margins": 0.9757477641105652, + "rewards/rejected": -5.174736976623535, + "step": 539 + }, + { + "epoch": 0.3526818515797208, + "grad_norm": 19.533240518764423, + "learning_rate": 1.6366501893712344e-07, + "logits/chosen": -0.9275886416435242, + "logits/rejected": -0.8129714727401733, + "logps/chosen": -907.5596313476562, + "logps/rejected": -1000.3069458007812, + "loss": 0.4501, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.100757598876953, + "rewards/margins": 1.2760422229766846, + "rewards/rejected": -5.376799583435059, + "step": 540 + }, + { + "epoch": 0.35333496611968324, + "grad_norm": 11.810356313020227, + "learning_rate": 1.6348891674099229e-07, + "logits/chosen": -0.8514102101325989, + "logits/rejected": -0.8506395816802979, + "logps/chosen": -914.7454223632812, + "logps/rejected": -980.7003173828125, + "loss": 0.429, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.303524971008301, + "rewards/margins": 0.7691353559494019, + "rewards/rejected": -5.072659969329834, + "step": 541 + }, + { + "epoch": 0.3539880806596457, + "grad_norm": 15.2440846867984, + "learning_rate": 1.6331248407672298e-07, + "logits/chosen": -0.8096214532852173, + "logits/rejected": -0.718659520149231, + "logps/chosen": -847.4376220703125, + "logps/rejected": -919.3460693359375, + "loss": 0.4332, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.8864338397979736, + "rewards/margins": 0.990598201751709, + "rewards/rejected": -4.8770318031311035, + "step": 542 + }, + { + "epoch": 0.35464119519960813, + "grad_norm": 13.212471074381327, + "learning_rate": 1.6313572186267072e-07, + "logits/chosen": -0.698846697807312, + "logits/rejected": -0.6679378151893616, + "logps/chosen": -854.2268676757812, + "logps/rejected": -1125.93310546875, + "loss": 0.4416, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.361400604248047, + "rewards/margins": 2.335489511489868, + "rewards/rejected": -6.696890354156494, + "step": 543 + }, + { + "epoch": 0.3552943097395706, + "grad_norm": 15.42268532063693, + "learning_rate": 1.62958631018906e-07, + "logits/chosen": -0.8758993148803711, + "logits/rejected": -0.8772597908973694, + "logps/chosen": -919.5452880859375, + "logps/rejected": -1081.818115234375, + "loss": 0.4518, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.379350185394287, + "rewards/margins": 1.225996971130371, + "rewards/rejected": -5.605347633361816, + "step": 544 + }, + { + "epoch": 0.355947424279533, + "grad_norm": 12.72993505285054, + "learning_rate": 1.6278121246720986e-07, + "logits/chosen": -0.733549952507019, + "logits/rejected": -0.7166369557380676, + "logps/chosen": -890.005859375, + "logps/rejected": -1006.06689453125, + "loss": 0.4022, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.903335094451904, + "rewards/margins": 1.2002079486846924, + "rewards/rejected": -6.103543281555176, + "step": 545 + }, + { + "epoch": 0.3566005388194955, + "grad_norm": 17.091713363040604, + "learning_rate": 1.6260346713106915e-07, + "logits/chosen": -0.9864925146102905, + "logits/rejected": -0.87857985496521, + "logps/chosen": -908.6575927734375, + "logps/rejected": -1028.0980224609375, + "loss": 0.4571, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8206915855407715, + "rewards/margins": 1.3013036251068115, + "rewards/rejected": -5.121994495391846, + "step": 546 + }, + { + "epoch": 0.3572536533594579, + "grad_norm": 13.27253227088685, + "learning_rate": 1.6242539593567167e-07, + "logits/chosen": -0.8658774495124817, + "logits/rejected": -0.7811622619628906, + "logps/chosen": -884.6961669921875, + "logps/rejected": -967.5526733398438, + "loss": 0.4739, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.198709487915039, + "rewards/margins": 0.9260251522064209, + "rewards/rejected": -5.124734878540039, + "step": 547 + }, + { + "epoch": 0.3579067678994204, + "grad_norm": 16.130528314621436, + "learning_rate": 1.6224699980790128e-07, + "logits/chosen": -0.8342747688293457, + "logits/rejected": -0.8980854153633118, + "logps/chosen": -939.5543212890625, + "logps/rejected": -1134.8359375, + "loss": 0.4595, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.8620781898498535, + "rewards/margins": 1.3039054870605469, + "rewards/rejected": -6.165983200073242, + "step": 548 + }, + { + "epoch": 0.3585598824393828, + "grad_norm": 15.939077041425103, + "learning_rate": 1.620682796763333e-07, + "logits/chosen": -0.8608918190002441, + "logits/rejected": -0.9147299528121948, + "logps/chosen": -849.9219970703125, + "logps/rejected": -998.6673583984375, + "loss": 0.52, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.1205244064331055, + "rewards/margins": 1.2329450845718384, + "rewards/rejected": -5.353469371795654, + "step": 549 + }, + { + "epoch": 0.3592129969793453, + "grad_norm": 10.980661099727842, + "learning_rate": 1.6188923647122945e-07, + "logits/chosen": -0.882325291633606, + "logits/rejected": -0.8491942286491394, + "logps/chosen": -932.6527709960938, + "logps/rejected": -1025.0045166015625, + "loss": 0.424, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.657686710357666, + "rewards/margins": 1.2477612495422363, + "rewards/rejected": -5.9054484367370605, + "step": 550 + }, + { + "epoch": 0.3598661115193077, + "grad_norm": 21.708736418823243, + "learning_rate": 1.6170987112453305e-07, + "logits/chosen": -0.7525062561035156, + "logits/rejected": -0.7757470607757568, + "logps/chosen": -857.36572265625, + "logps/rejected": -877.4985961914062, + "loss": 0.464, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.827180862426758, + "rewards/margins": 0.5651108622550964, + "rewards/rejected": -4.39229154586792, + "step": 551 + }, + { + "epoch": 0.36051922605927017, + "grad_norm": 16.678917268433715, + "learning_rate": 1.6153018456986428e-07, + "logits/chosen": -0.929282546043396, + "logits/rejected": -0.9308112859725952, + "logps/chosen": -797.1315307617188, + "logps/rejected": -901.5902709960938, + "loss": 0.4569, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8772454261779785, + "rewards/margins": 0.9484346508979797, + "rewards/rejected": -4.825679779052734, + "step": 552 + }, + { + "epoch": 0.3611723405992326, + "grad_norm": 11.43691597383882, + "learning_rate": 1.6135017774251518e-07, + "logits/chosen": -0.7681158781051636, + "logits/rejected": -0.7736382484436035, + "logps/chosen": -892.3456420898438, + "logps/rejected": -1131.2122802734375, + "loss": 0.3717, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.3778395652771, + "rewards/margins": 1.9359546899795532, + "rewards/rejected": -6.3137946128845215, + "step": 553 + }, + { + "epoch": 0.36182545513919506, + "grad_norm": 13.839372124571064, + "learning_rate": 1.6116985157944494e-07, + "logits/chosen": -0.7676699757575989, + "logits/rejected": -0.7218711972236633, + "logps/chosen": -903.3311157226562, + "logps/rejected": -1085.003173828125, + "loss": 0.4355, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.102015495300293, + "rewards/margins": 1.7181568145751953, + "rewards/rejected": -5.820172309875488, + "step": 554 + }, + { + "epoch": 0.3624785696791575, + "grad_norm": 26.180670172356372, + "learning_rate": 1.609892070192749e-07, + "logits/chosen": -0.9137950539588928, + "logits/rejected": -0.9119899272918701, + "logps/chosen": -804.7276611328125, + "logps/rejected": -930.9756469726562, + "loss": 0.4846, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.975982666015625, + "rewards/margins": 1.0355627536773682, + "rewards/rejected": -5.011545658111572, + "step": 555 + }, + { + "epoch": 0.36313168421911995, + "grad_norm": 14.581822298383303, + "learning_rate": 1.6080824500228366e-07, + "logits/chosen": -0.9392572045326233, + "logits/rejected": -0.8843672275543213, + "logps/chosen": -817.7110595703125, + "logps/rejected": -892.86279296875, + "loss": 0.4697, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.838777542114258, + "rewards/margins": 0.9610627293586731, + "rewards/rejected": -4.799839973449707, + "step": 556 + }, + { + "epoch": 0.3637847987590824, + "grad_norm": 27.455679946964146, + "learning_rate": 1.6062696647040224e-07, + "logits/chosen": -0.9277936816215515, + "logits/rejected": -0.8799358606338501, + "logps/chosen": -852.052978515625, + "logps/rejected": -1005.009521484375, + "loss": 0.4074, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.25396203994751, + "rewards/margins": 1.4232234954833984, + "rewards/rejected": -5.67718505859375, + "step": 557 + }, + { + "epoch": 0.36443791329904485, + "grad_norm": 19.915542369028874, + "learning_rate": 1.604453723672092e-07, + "logits/chosen": -0.7865580320358276, + "logits/rejected": -0.8904578685760498, + "logps/chosen": -826.33984375, + "logps/rejected": -1009.6666259765625, + "loss": 0.4737, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.9453487396240234, + "rewards/margins": 1.032300353050232, + "rewards/rejected": -4.977649211883545, + "step": 558 + }, + { + "epoch": 0.36509102783900726, + "grad_norm": 9.98716425060218, + "learning_rate": 1.6026346363792565e-07, + "logits/chosen": -0.9274966716766357, + "logits/rejected": -0.8684262037277222, + "logps/chosen": -877.6671142578125, + "logps/rejected": -977.3114624023438, + "loss": 0.4311, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.041975975036621, + "rewards/margins": 1.2339956760406494, + "rewards/rejected": -5.275971412658691, + "step": 559 + }, + { + "epoch": 0.36574414237896974, + "grad_norm": 19.342674323255824, + "learning_rate": 1.6008124122941037e-07, + "logits/chosen": -0.9390403628349304, + "logits/rejected": -0.9198365211486816, + "logps/chosen": -984.45556640625, + "logps/rejected": -1059.18359375, + "loss": 0.4974, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.400831699371338, + "rewards/margins": 0.9896097183227539, + "rewards/rejected": -5.390440940856934, + "step": 560 + }, + { + "epoch": 0.36639725691893216, + "grad_norm": 10.44761304304428, + "learning_rate": 1.5989870609015492e-07, + "logits/chosen": -0.7790169715881348, + "logits/rejected": -0.7415274381637573, + "logps/chosen": -809.3233642578125, + "logps/rejected": -942.257568359375, + "loss": 0.4109, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4561712741851807, + "rewards/margins": 1.2330386638641357, + "rewards/rejected": -4.689209938049316, + "step": 561 + }, + { + "epoch": 0.36705037145889463, + "grad_norm": 11.425228413665803, + "learning_rate": 1.597158591702786e-07, + "logits/chosen": -0.7430764436721802, + "logits/rejected": -0.7750738859176636, + "logps/chosen": -899.6544799804688, + "logps/rejected": -1123.5078125, + "loss": 0.3709, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.073906421661377, + "rewards/margins": 1.5739132165908813, + "rewards/rejected": -5.647819519042969, + "step": 562 + }, + { + "epoch": 0.36770348599885705, + "grad_norm": 13.93278816074745, + "learning_rate": 1.5953270142152367e-07, + "logits/chosen": -0.6818071603775024, + "logits/rejected": -0.6695794463157654, + "logps/chosen": -877.8299560546875, + "logps/rejected": -1009.041259765625, + "loss": 0.5168, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.205145835876465, + "rewards/margins": 0.9694371819496155, + "rewards/rejected": -5.174582481384277, + "step": 563 + }, + { + "epoch": 0.3683566005388195, + "grad_norm": 21.432721183984388, + "learning_rate": 1.5934923379725018e-07, + "logits/chosen": -0.7447420358657837, + "logits/rejected": -0.6917985677719116, + "logps/chosen": -836.9746704101562, + "logps/rejected": -931.629638671875, + "loss": 0.5602, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.975611925125122, + "rewards/margins": 1.0695677995681763, + "rewards/rejected": -5.045180320739746, + "step": 564 + }, + { + "epoch": 0.36900971507878194, + "grad_norm": 19.384954342788763, + "learning_rate": 1.591654572524312e-07, + "logits/chosen": -0.8880128860473633, + "logits/rejected": -0.8889778256416321, + "logps/chosen": -841.0914306640625, + "logps/rejected": -1005.8532104492188, + "loss": 0.5231, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.652195453643799, + "rewards/margins": 1.1500074863433838, + "rewards/rejected": -4.8022027015686035, + "step": 565 + }, + { + "epoch": 0.36966282961874436, + "grad_norm": 12.62412588033666, + "learning_rate": 1.5898137274364774e-07, + "logits/chosen": -0.8240070939064026, + "logits/rejected": -0.8247821927070618, + "logps/chosen": -926.2063598632812, + "logps/rejected": -1109.5235595703125, + "loss": 0.3979, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.213892459869385, + "rewards/margins": 1.5136077404022217, + "rewards/rejected": -5.7275004386901855, + "step": 566 + }, + { + "epoch": 0.37031594415870683, + "grad_norm": 14.566856813447973, + "learning_rate": 1.5879698122908382e-07, + "logits/chosen": -0.8523387908935547, + "logits/rejected": -0.8664197325706482, + "logps/chosen": -848.1416015625, + "logps/rejected": -976.9271240234375, + "loss": 0.4588, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.19114351272583, + "rewards/margins": 1.157962441444397, + "rewards/rejected": -5.3491058349609375, + "step": 567 + }, + { + "epoch": 0.37096905869866925, + "grad_norm": 12.639812881204627, + "learning_rate": 1.5861228366852145e-07, + "logits/chosen": -0.8494150042533875, + "logits/rejected": -0.8036295771598816, + "logps/chosen": -861.272705078125, + "logps/rejected": -1024.7142333984375, + "loss": 0.3876, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.1044416427612305, + "rewards/margins": 1.5496019124984741, + "rewards/rejected": -5.654043197631836, + "step": 568 + }, + { + "epoch": 0.3716221732386317, + "grad_norm": 13.43873158244553, + "learning_rate": 1.5842728102333566e-07, + "logits/chosen": -0.8592506051063538, + "logits/rejected": -0.8374245166778564, + "logps/chosen": -888.4620971679688, + "logps/rejected": -990.207763671875, + "loss": 0.467, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.955009937286377, + "rewards/margins": 0.9639407992362976, + "rewards/rejected": -4.91895055770874, + "step": 569 + }, + { + "epoch": 0.37227528777859414, + "grad_norm": 16.138561564920185, + "learning_rate": 1.5824197425648947e-07, + "logits/chosen": -0.9307665824890137, + "logits/rejected": -1.0155502557754517, + "logps/chosen": -817.697998046875, + "logps/rejected": -1008.6148681640625, + "loss": 0.482, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6846015453338623, + "rewards/margins": 1.3351447582244873, + "rewards/rejected": -5.019746780395508, + "step": 570 + }, + { + "epoch": 0.3729284023185566, + "grad_norm": 15.515416540842121, + "learning_rate": 1.580563643325289e-07, + "logits/chosen": -0.8352770209312439, + "logits/rejected": -0.8319212794303894, + "logps/chosen": -889.775146484375, + "logps/rejected": -1052.1566162109375, + "loss": 0.4446, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.425132751464844, + "rewards/margins": 1.2785844802856445, + "rewards/rejected": -5.7037177085876465, + "step": 571 + }, + { + "epoch": 0.37358151685851904, + "grad_norm": 10.82779016449659, + "learning_rate": 1.5787045221757796e-07, + "logits/chosen": -0.8251916170120239, + "logits/rejected": -0.7899847626686096, + "logps/chosen": -837.8982543945312, + "logps/rejected": -941.0586547851562, + "loss": 0.4451, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.964031219482422, + "rewards/margins": 0.9509406089782715, + "rewards/rejected": -4.914971351623535, + "step": 572 + }, + { + "epoch": 0.3742346313984815, + "grad_norm": 13.742934988865395, + "learning_rate": 1.576842388793336e-07, + "logits/chosen": -0.9121626615524292, + "logits/rejected": -0.8413507342338562, + "logps/chosen": -892.6531982421875, + "logps/rejected": -1024.99609375, + "loss": 0.4286, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.081020832061768, + "rewards/margins": 1.357172966003418, + "rewards/rejected": -5.4381937980651855, + "step": 573 + }, + { + "epoch": 0.37488774593844393, + "grad_norm": 13.772268083761096, + "learning_rate": 1.5749772528706066e-07, + "logits/chosen": -0.8252898454666138, + "logits/rejected": -0.8235541582107544, + "logps/chosen": -879.6044311523438, + "logps/rejected": -1059.9500732421875, + "loss": 0.4267, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.773669958114624, + "rewards/margins": 1.4883966445922852, + "rewards/rejected": -5.262066841125488, + "step": 574 + }, + { + "epoch": 0.3755408604784064, + "grad_norm": 15.892411969332281, + "learning_rate": 1.5731091241158683e-07, + "logits/chosen": -0.9504250288009644, + "logits/rejected": -0.8796836733818054, + "logps/chosen": -975.87548828125, + "logps/rejected": -1043.329345703125, + "loss": 0.4615, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.551794052124023, + "rewards/margins": 0.715241014957428, + "rewards/rejected": -5.267035007476807, + "step": 575 + }, + { + "epoch": 0.3761939750183688, + "grad_norm": 15.377040417284569, + "learning_rate": 1.5712380122529763e-07, + "logits/chosen": -0.8614749312400818, + "logits/rejected": -0.8348824977874756, + "logps/chosen": -857.7217407226562, + "logps/rejected": -966.0911865234375, + "loss": 0.4592, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.91888427734375, + "rewards/margins": 1.0167591571807861, + "rewards/rejected": -4.935643196105957, + "step": 576 + }, + { + "epoch": 0.3768470895583313, + "grad_norm": 12.483624917174899, + "learning_rate": 1.5693639270213135e-07, + "logits/chosen": -0.8079325556755066, + "logits/rejected": -0.8149147629737854, + "logps/chosen": -945.904541015625, + "logps/rejected": -1057.374267578125, + "loss": 0.4637, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.423439025878906, + "rewards/margins": 0.7207327485084534, + "rewards/rejected": -5.144172191619873, + "step": 577 + }, + { + "epoch": 0.3775002040982937, + "grad_norm": 21.60003125968158, + "learning_rate": 1.5674868781757393e-07, + "logits/chosen": -0.9885172247886658, + "logits/rejected": -0.9942512512207031, + "logps/chosen": -956.927001953125, + "logps/rejected": -1092.0496826171875, + "loss": 0.432, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.42249870300293, + "rewards/margins": 1.3156774044036865, + "rewards/rejected": -5.738176345825195, + "step": 578 + }, + { + "epoch": 0.3781533186382562, + "grad_norm": 17.308152308920473, + "learning_rate": 1.5656068754865386e-07, + "logits/chosen": -0.9299103617668152, + "logits/rejected": -0.8897454142570496, + "logps/chosen": -879.7733764648438, + "logps/rejected": -983.5267333984375, + "loss": 0.4288, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.471696853637695, + "rewards/margins": 1.16510009765625, + "rewards/rejected": -5.636796951293945, + "step": 579 + }, + { + "epoch": 0.3788064331782186, + "grad_norm": 15.53883891948394, + "learning_rate": 1.5637239287393724e-07, + "logits/chosen": -0.8626022934913635, + "logits/rejected": -0.9244422912597656, + "logps/chosen": -860.1614990234375, + "logps/rejected": -1011.134033203125, + "loss": 0.4128, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.882162094116211, + "rewards/margins": 1.0723750591278076, + "rewards/rejected": -4.954537391662598, + "step": 580 + }, + { + "epoch": 0.3794595477181811, + "grad_norm": 14.768077938371574, + "learning_rate": 1.5618380477352258e-07, + "logits/chosen": -0.7988356351852417, + "logits/rejected": -0.7182334661483765, + "logps/chosen": -869.6566162109375, + "logps/rejected": -993.8335571289062, + "loss": 0.4059, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.16757345199585, + "rewards/margins": 1.371941328048706, + "rewards/rejected": -5.539514541625977, + "step": 581 + }, + { + "epoch": 0.3801126622581435, + "grad_norm": 17.211114890337342, + "learning_rate": 1.5599492422903557e-07, + "logits/chosen": -0.8917167782783508, + "logits/rejected": -0.8692957758903503, + "logps/chosen": -906.6569213867188, + "logps/rejected": -1106.05126953125, + "loss": 0.5268, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.61083984375, + "rewards/margins": 1.3148231506347656, + "rewards/rejected": -5.925662517547607, + "step": 582 + }, + { + "epoch": 0.38076577679810597, + "grad_norm": 15.588565948663359, + "learning_rate": 1.5580575222362433e-07, + "logits/chosen": -0.705355703830719, + "logits/rejected": -0.722810685634613, + "logps/chosen": -926.6598510742188, + "logps/rejected": -1159.2498779296875, + "loss": 0.4284, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.331436634063721, + "rewards/margins": 1.9407066106796265, + "rewards/rejected": -6.272141933441162, + "step": 583 + }, + { + "epoch": 0.3814188913380684, + "grad_norm": 12.961352491987626, + "learning_rate": 1.556162897419539e-07, + "logits/chosen": -0.7959637641906738, + "logits/rejected": -0.7493060827255249, + "logps/chosen": -850.8128662109375, + "logps/rejected": -948.0916748046875, + "loss": 0.4445, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.118077754974365, + "rewards/margins": 1.0578035116195679, + "rewards/rejected": -5.175881862640381, + "step": 584 + }, + { + "epoch": 0.38207200587803086, + "grad_norm": 16.31279891925311, + "learning_rate": 1.5542653777020136e-07, + "logits/chosen": -0.7824969291687012, + "logits/rejected": -0.7763348817825317, + "logps/chosen": -896.5308837890625, + "logps/rejected": -1009.3015747070312, + "loss": 0.437, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.351218223571777, + "rewards/margins": 1.1964832544326782, + "rewards/rejected": -5.547701358795166, + "step": 585 + }, + { + "epoch": 0.3827251204179933, + "grad_norm": 13.918482499547776, + "learning_rate": 1.5523649729605057e-07, + "logits/chosen": -0.76658034324646, + "logits/rejected": -0.8511514663696289, + "logps/chosen": -870.3497924804688, + "logps/rejected": -1045.263916015625, + "loss": 0.4695, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.4781718254089355, + "rewards/margins": 1.291435718536377, + "rewards/rejected": -5.769607067108154, + "step": 586 + }, + { + "epoch": 0.38337823495795575, + "grad_norm": 16.569196717169596, + "learning_rate": 1.5504616930868716e-07, + "logits/chosen": -0.8880733251571655, + "logits/rejected": -0.8038618564605713, + "logps/chosen": -889.2333984375, + "logps/rejected": -1001.71240234375, + "loss": 0.4786, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.7782959938049316, + "rewards/margins": 1.6770806312561035, + "rewards/rejected": -5.455376625061035, + "step": 587 + }, + { + "epoch": 0.3840313494979182, + "grad_norm": 17.10491428925674, + "learning_rate": 1.548555547987933e-07, + "logits/chosen": -0.9725980758666992, + "logits/rejected": -0.989176869392395, + "logps/chosen": -989.8197631835938, + "logps/rejected": -1179.978759765625, + "loss": 0.4301, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.106828689575195, + "rewards/margins": 1.6448149681091309, + "rewards/rejected": -5.751643657684326, + "step": 588 + }, + { + "epoch": 0.38468446403788065, + "grad_norm": 13.922269822079555, + "learning_rate": 1.5466465475854244e-07, + "logits/chosen": -0.9255604147911072, + "logits/rejected": -0.8810290694236755, + "logps/chosen": -990.11474609375, + "logps/rejected": -1163.693115234375, + "loss": 0.4654, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.530111312866211, + "rewards/margins": 1.9725072383880615, + "rewards/rejected": -6.502618789672852, + "step": 589 + }, + { + "epoch": 0.38533757857784307, + "grad_norm": 16.843076991151882, + "learning_rate": 1.5447347018159436e-07, + "logits/chosen": -0.8135560154914856, + "logits/rejected": -0.8068975210189819, + "logps/chosen": -900.1400756835938, + "logps/rejected": -950.0714721679688, + "loss": 0.5216, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.988783836364746, + "rewards/margins": 0.508838415145874, + "rewards/rejected": -5.497622013092041, + "step": 590 + }, + { + "epoch": 0.38599069311780554, + "grad_norm": 16.55186264161944, + "learning_rate": 1.5428200206308986e-07, + "logits/chosen": -0.713176429271698, + "logits/rejected": -0.7015882134437561, + "logps/chosen": -850.7107543945312, + "logps/rejected": -964.1630249023438, + "loss": 0.4294, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.00892448425293, + "rewards/margins": 1.1998018026351929, + "rewards/rejected": -5.20872688293457, + "step": 591 + }, + { + "epoch": 0.38664380765776796, + "grad_norm": 12.843591605410206, + "learning_rate": 1.5409025139964559e-07, + "logits/chosen": -0.8123986721038818, + "logits/rejected": -0.6921762824058533, + "logps/chosen": -867.6967163085938, + "logps/rejected": -967.98193359375, + "loss": 0.4517, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.409775257110596, + "rewards/margins": 1.2882235050201416, + "rewards/rejected": -5.697999000549316, + "step": 592 + }, + { + "epoch": 0.38729692219773043, + "grad_norm": 22.22126012857482, + "learning_rate": 1.5389821918934894e-07, + "logits/chosen": -0.8245276212692261, + "logits/rejected": -0.7239007949829102, + "logps/chosen": -1058.098388671875, + "logps/rejected": -1149.383544921875, + "loss": 0.5199, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.164212703704834, + "rewards/margins": 1.3136693239212036, + "rewards/rejected": -6.477882385253906, + "step": 593 + }, + { + "epoch": 0.38795003673769285, + "grad_norm": 18.370737993966067, + "learning_rate": 1.537059064317527e-07, + "logits/chosen": -1.0091485977172852, + "logits/rejected": -1.0160248279571533, + "logps/chosen": -943.4915161132812, + "logps/rejected": -1050.677734375, + "loss": 0.4994, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.813990592956543, + "rewards/margins": 1.1202912330627441, + "rewards/rejected": -4.934281826019287, + "step": 594 + }, + { + "epoch": 0.3886031512776553, + "grad_norm": 31.88057796229377, + "learning_rate": 1.5351331412787003e-07, + "logits/chosen": -0.920621395111084, + "logits/rejected": -0.8015599846839905, + "logps/chosen": -944.1340942382812, + "logps/rejected": -1074.1396484375, + "loss": 0.4716, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.2517828941345215, + "rewards/margins": 0.9702135324478149, + "rewards/rejected": -5.221996307373047, + "step": 595 + }, + { + "epoch": 0.38925626581761774, + "grad_norm": 28.48791105586222, + "learning_rate": 1.5332044328016914e-07, + "logits/chosen": -0.9262527823448181, + "logits/rejected": -0.9940930008888245, + "logps/chosen": -852.5736083984375, + "logps/rejected": -992.1788940429688, + "loss": 0.5288, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.7524843215942383, + "rewards/margins": 1.3429107666015625, + "rewards/rejected": -5.095395088195801, + "step": 596 + }, + { + "epoch": 0.3899093803575802, + "grad_norm": 20.034683502025093, + "learning_rate": 1.53127294892568e-07, + "logits/chosen": -0.8578783869743347, + "logits/rejected": -0.8358186483383179, + "logps/chosen": -841.9410400390625, + "logps/rejected": -914.41259765625, + "loss": 0.4581, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.9182372093200684, + "rewards/margins": 0.5345401763916016, + "rewards/rejected": -4.452776908874512, + "step": 597 + }, + { + "epoch": 0.39056249489754263, + "grad_norm": 14.995040309280029, + "learning_rate": 1.529338699704294e-07, + "logits/chosen": -0.9003750681877136, + "logits/rejected": -0.7470192313194275, + "logps/chosen": -835.0977172851562, + "logps/rejected": -924.6924438476562, + "loss": 0.4515, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.865151882171631, + "rewards/margins": 1.1951385736465454, + "rewards/rejected": -5.060290813446045, + "step": 598 + }, + { + "epoch": 0.3912156094375051, + "grad_norm": 17.660145347132392, + "learning_rate": 1.527401695205554e-07, + "logits/chosen": -0.8957807421684265, + "logits/rejected": -0.8723628520965576, + "logps/chosen": -869.9386596679688, + "logps/rejected": -925.26318359375, + "loss": 0.4511, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.060848236083984, + "rewards/margins": 0.71476811170578, + "rewards/rejected": -4.775616645812988, + "step": 599 + }, + { + "epoch": 0.3918687239774675, + "grad_norm": 13.004372705457447, + "learning_rate": 1.5254619455118224e-07, + "logits/chosen": -0.7671356201171875, + "logits/rejected": -0.8021383285522461, + "logps/chosen": -845.3877563476562, + "logps/rejected": -985.086181640625, + "loss": 0.449, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.7086658477783203, + "rewards/margins": 1.1206787824630737, + "rewards/rejected": -4.829344749450684, + "step": 600 + }, + { + "epoch": 0.3918687239774675, + "eval_logits/chosen": -0.6681308150291443, + "eval_logits/rejected": -0.6156617999076843, + "eval_logps/chosen": -848.0513916015625, + "eval_logps/rejected": -945.3562622070312, + "eval_loss": 0.4532933235168457, + "eval_rewards/accuracies": 0.7820000052452087, + "eval_rewards/chosen": -3.7414422035217285, + "eval_rewards/margins": 1.0604348182678223, + "eval_rewards/rejected": -4.801877498626709, + "eval_runtime": 614.2682, + "eval_samples_per_second": 6.512, + "eval_steps_per_second": 0.407, + "step": 600 + }, + { + "epoch": 0.39252183851743, + "grad_norm": 16.493495105014954, + "learning_rate": 1.5235194607197507e-07, + "logits/chosen": -0.7642414569854736, + "logits/rejected": -0.7581309676170349, + "logps/chosen": -877.4822387695312, + "logps/rejected": -1018.4113159179688, + "loss": 0.4087, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.8799805641174316, + "rewards/margins": 1.3261351585388184, + "rewards/rejected": -5.206116199493408, + "step": 601 + }, + { + "epoch": 0.3931749530573924, + "grad_norm": 12.37107065963141, + "learning_rate": 1.521574250940227e-07, + "logits/chosen": -0.8965670466423035, + "logits/rejected": -0.8340004086494446, + "logps/chosen": -920.9857788085938, + "logps/rejected": -1078.318359375, + "loss": 0.3523, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.6839547157287598, + "rewards/margins": 1.4769634008407593, + "rewards/rejected": -5.160918235778809, + "step": 602 + }, + { + "epoch": 0.3938280675973549, + "grad_norm": 11.116216687638275, + "learning_rate": 1.5196263262983232e-07, + "logits/chosen": -0.7490851879119873, + "logits/rejected": -0.771804928779602, + "logps/chosen": -828.4022216796875, + "logps/rejected": -1004.2670288085938, + "loss": 0.4399, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.896117687225342, + "rewards/margins": 1.3466899394989014, + "rewards/rejected": -5.242807388305664, + "step": 603 + }, + { + "epoch": 0.3944811821373173, + "grad_norm": 9.967638009299678, + "learning_rate": 1.5176756969332425e-07, + "logits/chosen": -0.8834438323974609, + "logits/rejected": -0.8068808913230896, + "logps/chosen": -795.0307006835938, + "logps/rejected": -866.0194091796875, + "loss": 0.4766, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.7699403762817383, + "rewards/margins": 0.9773763418197632, + "rewards/rejected": -4.747317314147949, + "step": 604 + }, + { + "epoch": 0.3951342966772798, + "grad_norm": 26.596246532184516, + "learning_rate": 1.5157223729982668e-07, + "logits/chosen": -0.8761139512062073, + "logits/rejected": -0.8892871141433716, + "logps/chosen": -871.340576171875, + "logps/rejected": -1007.2827758789062, + "loss": 0.439, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.020564079284668, + "rewards/margins": 1.1975598335266113, + "rewards/rejected": -5.218123912811279, + "step": 605 + }, + { + "epoch": 0.3957874112172422, + "grad_norm": 14.077534407772498, + "learning_rate": 1.5137663646607032e-07, + "logits/chosen": -0.8975901007652283, + "logits/rejected": -0.8460850119590759, + "logps/chosen": -890.2839965820312, + "logps/rejected": -974.4508056640625, + "loss": 0.4732, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.964993715286255, + "rewards/margins": 0.9587504863739014, + "rewards/rejected": -4.923743724822998, + "step": 606 + }, + { + "epoch": 0.3964405257572047, + "grad_norm": 17.96334354523122, + "learning_rate": 1.511807682101832e-07, + "logits/chosen": -0.9620600938796997, + "logits/rejected": -0.8492072820663452, + "logps/chosen": -863.880859375, + "logps/rejected": -929.9119262695312, + "loss": 0.4948, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.7883801460266113, + "rewards/margins": 0.9228003621101379, + "rewards/rejected": -4.711181163787842, + "step": 607 + }, + { + "epoch": 0.3970936402971671, + "grad_norm": 11.485367827080513, + "learning_rate": 1.5098463355168523e-07, + "logits/chosen": -0.8942442536354065, + "logits/rejected": -0.8808648586273193, + "logps/chosen": -818.7298583984375, + "logps/rejected": -917.2005004882812, + "loss": 0.4495, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.760091543197632, + "rewards/margins": 0.9165372252464294, + "rewards/rejected": -4.676629066467285, + "step": 608 + }, + { + "epoch": 0.39774675483712957, + "grad_norm": 14.959869339038608, + "learning_rate": 1.5078823351148305e-07, + "logits/chosen": -0.838761568069458, + "logits/rejected": -0.7156088948249817, + "logps/chosen": -863.39453125, + "logps/rejected": -953.5706176757812, + "loss": 0.5298, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.9802536964416504, + "rewards/margins": 1.1790037155151367, + "rewards/rejected": -5.159257888793945, + "step": 609 + }, + { + "epoch": 0.398399869377092, + "grad_norm": 12.26194863361878, + "learning_rate": 1.5059156911186462e-07, + "logits/chosen": -1.0336946249008179, + "logits/rejected": -0.951801598072052, + "logps/chosen": -967.37744140625, + "logps/rejected": -1024.5635986328125, + "loss": 0.3817, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.440566539764404, + "rewards/margins": 1.1911842823028564, + "rewards/rejected": -5.631750583648682, + "step": 610 + }, + { + "epoch": 0.39905298391705446, + "grad_norm": 19.1262138772951, + "learning_rate": 1.5039464137649395e-07, + "logits/chosen": -0.9519979953765869, + "logits/rejected": -0.8136232495307922, + "logps/chosen": -923.1121826171875, + "logps/rejected": -981.0728759765625, + "loss": 0.4884, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.264883041381836, + "rewards/margins": 0.9207399487495422, + "rewards/rejected": -5.1856231689453125, + "step": 611 + }, + { + "epoch": 0.3997060984570169, + "grad_norm": 13.608525719342314, + "learning_rate": 1.5019745133040571e-07, + "logits/chosen": -0.876707911491394, + "logits/rejected": -0.8673529624938965, + "logps/chosen": -858.470947265625, + "logps/rejected": -973.949951171875, + "loss": 0.4167, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.141305446624756, + "rewards/margins": 0.995867133140564, + "rewards/rejected": -5.137172698974609, + "step": 612 + }, + { + "epoch": 0.40035921299697935, + "grad_norm": 13.474406391717167, + "learning_rate": 1.5e-07, + "logits/chosen": -0.7090507745742798, + "logits/rejected": -0.614541232585907, + "logps/chosen": -914.9686279296875, + "logps/rejected": -1066.94189453125, + "loss": 0.4323, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.5194830894470215, + "rewards/margins": 1.386902093887329, + "rewards/rejected": -5.9063849449157715, + "step": 613 + }, + { + "epoch": 0.40101232753694177, + "grad_norm": 14.172259835539906, + "learning_rate": 1.4980228841303682e-07, + "logits/chosen": -0.9724923372268677, + "logits/rejected": -0.903152346611023, + "logps/chosen": -885.5799560546875, + "logps/rejected": -995.6898803710938, + "loss": 0.4037, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.383858680725098, + "rewards/margins": 1.104832649230957, + "rewards/rejected": -5.4886908531188965, + "step": 614 + }, + { + "epoch": 0.40166544207690424, + "grad_norm": 13.5707311262356, + "learning_rate": 1.4960431759863093e-07, + "logits/chosen": -0.8835554718971252, + "logits/rejected": -0.7123602628707886, + "logps/chosen": -854.8338623046875, + "logps/rejected": -918.1431884765625, + "loss": 0.4256, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.005169868469238, + "rewards/margins": 1.493152379989624, + "rewards/rejected": -5.498322486877441, + "step": 615 + }, + { + "epoch": 0.40231855661686666, + "grad_norm": 18.247415237240777, + "learning_rate": 1.494060885872464e-07, + "logits/chosen": -1.0498430728912354, + "logits/rejected": -0.9992644190788269, + "logps/chosen": -970.97998046875, + "logps/rejected": -1049.422119140625, + "loss": 0.4306, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.565814018249512, + "rewards/margins": 1.0201760530471802, + "rewards/rejected": -5.585990905761719, + "step": 616 + }, + { + "epoch": 0.40297167115682914, + "grad_norm": 14.026453964756975, + "learning_rate": 1.4920760241069124e-07, + "logits/chosen": -0.9324491024017334, + "logits/rejected": -0.917612612247467, + "logps/chosen": -934.6063232421875, + "logps/rejected": -1033.9951171875, + "loss": 0.4023, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.982738494873047, + "rewards/margins": 0.9660321474075317, + "rewards/rejected": -5.948770999908447, + "step": 617 + }, + { + "epoch": 0.40362478569679155, + "grad_norm": 15.386386169496118, + "learning_rate": 1.49008860102112e-07, + "logits/chosen": -0.7892769575119019, + "logits/rejected": -0.7855625152587891, + "logps/chosen": -942.093017578125, + "logps/rejected": -1020.9521484375, + "loss": 0.4783, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.989546775817871, + "rewards/margins": 0.7590184211730957, + "rewards/rejected": -5.748565196990967, + "step": 618 + }, + { + "epoch": 0.40427790023675403, + "grad_norm": 20.36931108231902, + "learning_rate": 1.4880986269598847e-07, + "logits/chosen": -0.7886320352554321, + "logits/rejected": -0.770696759223938, + "logps/chosen": -1011.263671875, + "logps/rejected": -1083.3687744140625, + "loss": 0.4714, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.143796920776367, + "rewards/margins": 0.9884477853775024, + "rewards/rejected": -6.132245063781738, + "step": 619 + }, + { + "epoch": 0.40493101477671645, + "grad_norm": 17.64904479691889, + "learning_rate": 1.4861061122812828e-07, + "logits/chosen": -0.8605690002441406, + "logits/rejected": -0.910815417766571, + "logps/chosen": -962.86767578125, + "logps/rejected": -1095.0264892578125, + "loss": 0.422, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.740562438964844, + "rewards/margins": 1.1599054336547852, + "rewards/rejected": -5.900468349456787, + "step": 620 + }, + { + "epoch": 0.4055841293166789, + "grad_norm": 18.989849625404762, + "learning_rate": 1.484111067356614e-07, + "logits/chosen": -0.8971580266952515, + "logits/rejected": -0.8848168849945068, + "logps/chosen": -877.4755859375, + "logps/rejected": -953.19287109375, + "loss": 0.5492, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.335120677947998, + "rewards/margins": 0.7430107593536377, + "rewards/rejected": -5.078131675720215, + "step": 621 + }, + { + "epoch": 0.40623724385664134, + "grad_norm": 25.58374255467917, + "learning_rate": 1.4821135025703488e-07, + "logits/chosen": -0.8735688924789429, + "logits/rejected": -0.8896888494491577, + "logps/chosen": -969.5611572265625, + "logps/rejected": -1103.0755615234375, + "loss": 0.3992, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.727744102478027, + "rewards/margins": 1.2061125040054321, + "rewards/rejected": -5.933856010437012, + "step": 622 + }, + { + "epoch": 0.4068903583966038, + "grad_norm": 13.450431430906283, + "learning_rate": 1.4801134283200744e-07, + "logits/chosen": -0.82172030210495, + "logits/rejected": -0.806075930595398, + "logps/chosen": -1026.8660888671875, + "logps/rejected": -1136.55712890625, + "loss": 0.4049, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.981423854827881, + "rewards/margins": 1.1098735332489014, + "rewards/rejected": -6.091297149658203, + "step": 623 + }, + { + "epoch": 0.40754347293656623, + "grad_norm": 18.032806569030758, + "learning_rate": 1.4781108550164395e-07, + "logits/chosen": -0.8168790936470032, + "logits/rejected": -0.7250760793685913, + "logps/chosen": -916.8949584960938, + "logps/rejected": -964.107177734375, + "loss": 0.4845, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.447428226470947, + "rewards/margins": 0.5497761368751526, + "rewards/rejected": -4.997204780578613, + "step": 624 + }, + { + "epoch": 0.4081965874765287, + "grad_norm": 18.94760554081831, + "learning_rate": 1.4761057930831e-07, + "logits/chosen": -0.9571875929832458, + "logits/rejected": -0.8576517105102539, + "logps/chosen": -993.7386474609375, + "logps/rejected": -1106.987060546875, + "loss": 0.3963, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.498230457305908, + "rewards/margins": 1.3976860046386719, + "rewards/rejected": -5.895916938781738, + "step": 625 + }, + { + "epoch": 0.4088497020164911, + "grad_norm": 22.686857483265566, + "learning_rate": 1.4740982529566672e-07, + "logits/chosen": -1.0864641666412354, + "logits/rejected": -0.9092856645584106, + "logps/chosen": -960.5426635742188, + "logps/rejected": -1040.1844482421875, + "loss": 0.4542, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.545177459716797, + "rewards/margins": 1.2752573490142822, + "rewards/rejected": -5.8204345703125, + "step": 626 + }, + { + "epoch": 0.4095028165564536, + "grad_norm": 13.831886695808532, + "learning_rate": 1.4720882450866502e-07, + "logits/chosen": -0.8931451439857483, + "logits/rejected": -0.82686847448349, + "logps/chosen": -875.4305419921875, + "logps/rejected": -1009.8246459960938, + "loss": 0.3634, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.404606819152832, + "rewards/margins": 1.4391083717346191, + "rewards/rejected": -5.843715190887451, + "step": 627 + }, + { + "epoch": 0.410155931096416, + "grad_norm": 12.748214814923802, + "learning_rate": 1.470075779935404e-07, + "logits/chosen": -0.926064133644104, + "logits/rejected": -0.9559231400489807, + "logps/chosen": -878.1412963867188, + "logps/rejected": -1152.503662109375, + "loss": 0.4009, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.183788299560547, + "rewards/margins": 2.1225814819335938, + "rewards/rejected": -6.306369781494141, + "step": 628 + }, + { + "epoch": 0.4108090456363785, + "grad_norm": 41.87910147179948, + "learning_rate": 1.468060867978073e-07, + "logits/chosen": -0.9104610085487366, + "logits/rejected": -0.9001246690750122, + "logps/chosen": -1014.3042602539062, + "logps/rejected": -1192.781005859375, + "loss": 0.4064, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.662555694580078, + "rewards/margins": 1.773787021636963, + "rewards/rejected": -6.436343193054199, + "step": 629 + }, + { + "epoch": 0.4114621601763409, + "grad_norm": 14.627953520544663, + "learning_rate": 1.4660435197025388e-07, + "logits/chosen": -0.8062002062797546, + "logits/rejected": -0.854103147983551, + "logps/chosen": -936.1939086914062, + "logps/rejected": -1189.2581787109375, + "loss": 0.4233, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.81528377532959, + "rewards/margins": 1.5555047988891602, + "rewards/rejected": -6.370789051055908, + "step": 630 + }, + { + "epoch": 0.4121152747163034, + "grad_norm": 12.111758507125595, + "learning_rate": 1.4640237456093634e-07, + "logits/chosen": -0.6821016073226929, + "logits/rejected": -0.5494420528411865, + "logps/chosen": -972.8225708007812, + "logps/rejected": -1031.09814453125, + "loss": 0.4028, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.693094730377197, + "rewards/margins": 1.2289015054702759, + "rewards/rejected": -5.921996116638184, + "step": 631 + }, + { + "epoch": 0.4127683892562658, + "grad_norm": 13.52571835630328, + "learning_rate": 1.462001556211736e-07, + "logits/chosen": -1.001874327659607, + "logits/rejected": -0.975570797920227, + "logps/chosen": -979.7161865234375, + "logps/rejected": -1141.2294921875, + "loss": 0.4469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.602497577667236, + "rewards/margins": 1.546646237373352, + "rewards/rejected": -6.149144172668457, + "step": 632 + }, + { + "epoch": 0.4134215037962283, + "grad_norm": 19.019281186072746, + "learning_rate": 1.4599769620354174e-07, + "logits/chosen": -0.7710694074630737, + "logits/rejected": -0.7261401414871216, + "logps/chosen": -937.7637939453125, + "logps/rejected": -1017.58935546875, + "loss": 0.5744, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.335657119750977, + "rewards/margins": 0.7023254036903381, + "rewards/rejected": -5.03798246383667, + "step": 633 + }, + { + "epoch": 0.4140746183361907, + "grad_norm": 22.204343763079972, + "learning_rate": 1.4579499736186863e-07, + "logits/chosen": -0.6519548296928406, + "logits/rejected": -0.6551141142845154, + "logps/chosen": -1033.122802734375, + "logps/rejected": -1234.211669921875, + "loss": 0.4501, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.304633140563965, + "rewards/margins": 1.2295292615890503, + "rewards/rejected": -6.5341620445251465, + "step": 634 + }, + { + "epoch": 0.41472773287615317, + "grad_norm": 13.511485762115198, + "learning_rate": 1.4559206015122829e-07, + "logits/chosen": -0.6418792605400085, + "logits/rejected": -0.7200583219528198, + "logps/chosen": -851.169189453125, + "logps/rejected": -1106.4285888671875, + "loss": 0.4361, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.105515956878662, + "rewards/margins": 2.3324177265167236, + "rewards/rejected": -6.437933444976807, + "step": 635 + }, + { + "epoch": 0.4153808474161156, + "grad_norm": 34.33189845987089, + "learning_rate": 1.453888856279355e-07, + "logits/chosen": -0.8396845459938049, + "logits/rejected": -0.7787867188453674, + "logps/chosen": -1001.843017578125, + "logps/rejected": -1041.985107421875, + "loss": 0.5351, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.880048751831055, + "rewards/margins": 0.6726697683334351, + "rewards/rejected": -5.552718639373779, + "step": 636 + }, + { + "epoch": 0.41603396195607806, + "grad_norm": 17.526487722839356, + "learning_rate": 1.451854748495403e-07, + "logits/chosen": -0.6934026479721069, + "logits/rejected": -0.7079018950462341, + "logps/chosen": -806.213134765625, + "logps/rejected": -963.346923828125, + "loss": 0.4063, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8228399753570557, + "rewards/margins": 1.726550579071045, + "rewards/rejected": -5.54939079284668, + "step": 637 + }, + { + "epoch": 0.4166870764960405, + "grad_norm": 25.357018744901364, + "learning_rate": 1.4498182887482252e-07, + "logits/chosen": -0.8057832717895508, + "logits/rejected": -0.7694687843322754, + "logps/chosen": -928.901123046875, + "logps/rejected": -1060.5814208984375, + "loss": 0.3872, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.737601280212402, + "rewards/margins": 1.5201215744018555, + "rewards/rejected": -6.257723331451416, + "step": 638 + }, + { + "epoch": 0.41734019103600295, + "grad_norm": 19.39989514006622, + "learning_rate": 1.4477794876378612e-07, + "logits/chosen": -0.7168833017349243, + "logits/rejected": -0.7470553517341614, + "logps/chosen": -852.4853515625, + "logps/rejected": -945.4410400390625, + "loss": 0.5718, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.117666721343994, + "rewards/margins": 1.0648146867752075, + "rewards/rejected": -5.182481288909912, + "step": 639 + }, + { + "epoch": 0.41799330557596537, + "grad_norm": 13.583508801617073, + "learning_rate": 1.4457383557765383e-07, + "logits/chosen": -0.8929958343505859, + "logits/rejected": -0.885593831539154, + "logps/chosen": -831.2813720703125, + "logps/rejected": -959.333251953125, + "loss": 0.4017, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.770887613296509, + "rewards/margins": 1.2031605243682861, + "rewards/rejected": -4.974048614501953, + "step": 640 + }, + { + "epoch": 0.41864642011592784, + "grad_norm": 12.885105314816913, + "learning_rate": 1.4436949037886155e-07, + "logits/chosen": -0.8584612011909485, + "logits/rejected": -0.9563580751419067, + "logps/chosen": -973.1109619140625, + "logps/rejected": -1170.2774658203125, + "loss": 0.3817, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.968479871749878, + "rewards/margins": 1.4874968528747559, + "rewards/rejected": -5.455976963043213, + "step": 641 + }, + { + "epoch": 0.41929953465589026, + "grad_norm": 15.499673312261706, + "learning_rate": 1.4416491423105285e-07, + "logits/chosen": -0.5970382690429688, + "logits/rejected": -0.4685077667236328, + "logps/chosen": -756.71337890625, + "logps/rejected": -854.1908569335938, + "loss": 0.457, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.7428746223449707, + "rewards/margins": 1.194618582725525, + "rewards/rejected": -4.937493324279785, + "step": 642 + }, + { + "epoch": 0.41995264919585273, + "grad_norm": 15.458842820703998, + "learning_rate": 1.4396010819907338e-07, + "logits/chosen": -0.7846943140029907, + "logits/rejected": -0.790925145149231, + "logps/chosen": -823.936279296875, + "logps/rejected": -949.0782470703125, + "loss": 0.4334, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4161481857299805, + "rewards/margins": 1.5297538042068481, + "rewards/rejected": -4.945902347564697, + "step": 643 + }, + { + "epoch": 0.42060576373581515, + "grad_norm": 15.070391792337258, + "learning_rate": 1.4375507334896546e-07, + "logits/chosen": -0.8074719309806824, + "logits/rejected": -0.6977930068969727, + "logps/chosen": -939.0458984375, + "logps/rejected": -1025.165283203125, + "loss": 0.4187, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.523794174194336, + "rewards/margins": 1.1222283840179443, + "rewards/rejected": -5.646022319793701, + "step": 644 + }, + { + "epoch": 0.4212588782757776, + "grad_norm": 13.986160456147138, + "learning_rate": 1.4354981074796232e-07, + "logits/chosen": -0.8354430198669434, + "logits/rejected": -0.6734627485275269, + "logps/chosen": -740.5074462890625, + "logps/rejected": -873.5528564453125, + "loss": 0.4067, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.245814561843872, + "rewards/margins": 1.577298641204834, + "rewards/rejected": -4.823113441467285, + "step": 645 + }, + { + "epoch": 0.42191199281574004, + "grad_norm": 12.125652049098292, + "learning_rate": 1.433443214644827e-07, + "logits/chosen": -0.7603698372840881, + "logits/rejected": -0.6487007141113281, + "logps/chosen": -890.7510375976562, + "logps/rejected": -904.6582641601562, + "loss": 0.426, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.7208075523376465, + "rewards/margins": 0.801545262336731, + "rewards/rejected": -4.522353172302246, + "step": 646 + }, + { + "epoch": 0.4225651073557025, + "grad_norm": 20.464126839268978, + "learning_rate": 1.4313860656812535e-07, + "logits/chosen": -0.8351298570632935, + "logits/rejected": -0.8417803049087524, + "logps/chosen": -760.145751953125, + "logps/rejected": -926.35546875, + "loss": 0.4643, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.66671085357666, + "rewards/margins": 1.3970162868499756, + "rewards/rejected": -5.063727378845215, + "step": 647 + }, + { + "epoch": 0.42321822189566494, + "grad_norm": 14.126796385995897, + "learning_rate": 1.429326671296632e-07, + "logits/chosen": -0.7931310534477234, + "logits/rejected": -0.8309136629104614, + "logps/chosen": -806.4236450195312, + "logps/rejected": -945.98828125, + "loss": 0.4526, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.0367279052734375, + "rewards/margins": 0.9322269558906555, + "rewards/rejected": -4.968955039978027, + "step": 648 + }, + { + "epoch": 0.4238713364356274, + "grad_norm": 17.31979167681963, + "learning_rate": 1.427265042210381e-07, + "logits/chosen": -0.983731210231781, + "logits/rejected": -0.9602134227752686, + "logps/chosen": -863.263916015625, + "logps/rejected": -1010.90234375, + "loss": 0.4982, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9947147369384766, + "rewards/margins": 1.413080096244812, + "rewards/rejected": -5.407794952392578, + "step": 649 + }, + { + "epoch": 0.42452445097558983, + "grad_norm": 26.22140907720917, + "learning_rate": 1.4252011891535498e-07, + "logits/chosen": -0.8368729948997498, + "logits/rejected": -0.832813560962677, + "logps/chosen": -890.2816162109375, + "logps/rejected": -1042.296630859375, + "loss": 0.414, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.053102016448975, + "rewards/margins": 1.294475793838501, + "rewards/rejected": -5.347578048706055, + "step": 650 + }, + { + "epoch": 0.4251775655155523, + "grad_norm": 14.020114892351618, + "learning_rate": 1.4231351228687644e-07, + "logits/chosen": -0.8325139880180359, + "logits/rejected": -0.6997128129005432, + "logps/chosen": -826.017333984375, + "logps/rejected": -1002.11181640625, + "loss": 0.4405, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8723702430725098, + "rewards/margins": 1.774685263633728, + "rewards/rejected": -5.647055149078369, + "step": 651 + }, + { + "epoch": 0.4258306800555147, + "grad_norm": 12.265082335211462, + "learning_rate": 1.421066854110171e-07, + "logits/chosen": -0.9369992017745972, + "logits/rejected": -0.8721651434898376, + "logps/chosen": -889.888671875, + "logps/rejected": -1001.230224609375, + "loss": 0.4296, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.376310348510742, + "rewards/margins": 1.0269381999969482, + "rewards/rejected": -5.403248310089111, + "step": 652 + }, + { + "epoch": 0.4264837945954772, + "grad_norm": 11.090981454180124, + "learning_rate": 1.4189963936433794e-07, + "logits/chosen": -0.8955050706863403, + "logits/rejected": -0.9158356189727783, + "logps/chosen": -870.947021484375, + "logps/rejected": -961.156494140625, + "loss": 0.4353, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.03396463394165, + "rewards/margins": 0.7431548833847046, + "rewards/rejected": -4.777119159698486, + "step": 653 + }, + { + "epoch": 0.4271369091354396, + "grad_norm": 30.52888285459347, + "learning_rate": 1.4169237522454082e-07, + "logits/chosen": -0.9265910983085632, + "logits/rejected": -0.9025193452835083, + "logps/chosen": -938.2091674804688, + "logps/rejected": -1082.6446533203125, + "loss": 0.462, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.009795188903809, + "rewards/margins": 1.1789731979370117, + "rewards/rejected": -5.18876838684082, + "step": 654 + }, + { + "epoch": 0.4277900236754021, + "grad_norm": 16.525087397047585, + "learning_rate": 1.4148489407046272e-07, + "logits/chosen": -0.8427670001983643, + "logits/rejected": -0.8210782408714294, + "logps/chosen": -1001.025146484375, + "logps/rejected": -1058.7926025390625, + "loss": 0.462, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.803376197814941, + "rewards/margins": 0.7696138620376587, + "rewards/rejected": -5.5729899406433105, + "step": 655 + }, + { + "epoch": 0.4284431382153645, + "grad_norm": 14.107761532142954, + "learning_rate": 1.412771969820703e-07, + "logits/chosen": -0.7187002897262573, + "logits/rejected": -0.6458846926689148, + "logps/chosen": -889.3016967773438, + "logps/rejected": -1031.6785888671875, + "loss": 0.4785, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.112646579742432, + "rewards/margins": 1.4312293529510498, + "rewards/rejected": -5.543876647949219, + "step": 656 + }, + { + "epoch": 0.429096252755327, + "grad_norm": 21.330855551906346, + "learning_rate": 1.4106928504045414e-07, + "logits/chosen": -0.7129623889923096, + "logits/rejected": -0.7127180099487305, + "logps/chosen": -966.5421752929688, + "logps/rejected": -1104.1207275390625, + "loss": 0.4478, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.271879196166992, + "rewards/margins": 1.1747528314590454, + "rewards/rejected": -5.44663143157959, + "step": 657 + }, + { + "epoch": 0.4297493672952894, + "grad_norm": 14.415672966540916, + "learning_rate": 1.4086115932782314e-07, + "logits/chosen": -0.743895947933197, + "logits/rejected": -0.8685862421989441, + "logps/chosen": -790.4058227539062, + "logps/rejected": -1067.7724609375, + "loss": 0.4504, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.013057231903076, + "rewards/margins": 1.7089455127716064, + "rewards/rejected": -5.722002983093262, + "step": 658 + }, + { + "epoch": 0.43040248183525187, + "grad_norm": 22.65133168747345, + "learning_rate": 1.4065282092749898e-07, + "logits/chosen": -0.8140444159507751, + "logits/rejected": -0.7156209349632263, + "logps/chosen": -901.9187622070312, + "logps/rejected": -1011.9653930664062, + "loss": 0.4053, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.2266526222229, + "rewards/margins": 1.2894474267959595, + "rewards/rejected": -5.5161004066467285, + "step": 659 + }, + { + "epoch": 0.4310555963752143, + "grad_norm": 17.697468761787118, + "learning_rate": 1.404442709239103e-07, + "logits/chosen": -0.788597822189331, + "logits/rejected": -0.812627375125885, + "logps/chosen": -907.12109375, + "logps/rejected": -1006.6986694335938, + "loss": 0.3981, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.9818243980407715, + "rewards/margins": 1.407388687133789, + "rewards/rejected": -5.3892130851745605, + "step": 660 + }, + { + "epoch": 0.43170871091517676, + "grad_norm": 20.96372589073248, + "learning_rate": 1.4023551040258722e-07, + "logits/chosen": -0.8250223994255066, + "logits/rejected": -0.8321212530136108, + "logps/chosen": -907.60205078125, + "logps/rejected": -1042.2557373046875, + "loss": 0.4675, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.117773532867432, + "rewards/margins": 1.2371721267700195, + "rewards/rejected": -5.354945659637451, + "step": 661 + }, + { + "epoch": 0.4323618254551392, + "grad_norm": 20.57356587015438, + "learning_rate": 1.4002654045015573e-07, + "logits/chosen": -0.8451354503631592, + "logits/rejected": -0.8059527277946472, + "logps/chosen": -847.0043334960938, + "logps/rejected": -965.06787109375, + "loss": 0.4463, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.082802772521973, + "rewards/margins": 1.3779957294464111, + "rewards/rejected": -5.4607977867126465, + "step": 662 + }, + { + "epoch": 0.43301493999510166, + "grad_norm": 20.39338992868369, + "learning_rate": 1.3981736215433168e-07, + "logits/chosen": -0.8249819278717041, + "logits/rejected": -0.7393394708633423, + "logps/chosen": -887.72265625, + "logps/rejected": -1003.6444091796875, + "loss": 0.532, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.219474792480469, + "rewards/margins": 1.191009283065796, + "rewards/rejected": -5.4104838371276855, + "step": 663 + }, + { + "epoch": 0.4336680545350641, + "grad_norm": 29.528290842632007, + "learning_rate": 1.3960797660391568e-07, + "logits/chosen": -1.028712511062622, + "logits/rejected": -0.87873375415802, + "logps/chosen": -936.6950073242188, + "logps/rejected": -1085.1673583984375, + "loss": 0.4479, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.28346061706543, + "rewards/margins": 1.7631897926330566, + "rewards/rejected": -6.0466508865356445, + "step": 664 + }, + { + "epoch": 0.43432116907502655, + "grad_norm": 25.500391101093133, + "learning_rate": 1.393983848887869e-07, + "logits/chosen": -0.7285988330841064, + "logits/rejected": -0.785618245601654, + "logps/chosen": -908.6463623046875, + "logps/rejected": -1151.10546875, + "loss": 0.4249, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.462244510650635, + "rewards/margins": 1.578071117401123, + "rewards/rejected": -6.040315628051758, + "step": 665 + }, + { + "epoch": 0.43497428361498897, + "grad_norm": 25.96776880620349, + "learning_rate": 1.3918858809989772e-07, + "logits/chosen": -0.8820406794548035, + "logits/rejected": -0.8553460836410522, + "logps/chosen": -833.931884765625, + "logps/rejected": -943.404541015625, + "loss": 0.4652, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.73722505569458, + "rewards/margins": 1.2582281827926636, + "rewards/rejected": -4.995452880859375, + "step": 666 + }, + { + "epoch": 0.43562739815495144, + "grad_norm": 13.136623740418324, + "learning_rate": 1.3897858732926794e-07, + "logits/chosen": -0.9333454370498657, + "logits/rejected": -0.7037588953971863, + "logps/chosen": -877.7976684570312, + "logps/rejected": -931.8585815429688, + "loss": 0.3979, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.332530975341797, + "rewards/margins": 1.0355404615402222, + "rewards/rejected": -5.36807107925415, + "step": 667 + }, + { + "epoch": 0.43628051269491386, + "grad_norm": 15.93708248937252, + "learning_rate": 1.387683836699791e-07, + "logits/chosen": -0.7652785778045654, + "logits/rejected": -0.712914228439331, + "logps/chosen": -778.5343017578125, + "logps/rejected": -999.83056640625, + "loss": 0.4135, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.547429084777832, + "rewards/margins": 1.8087109327316284, + "rewards/rejected": -5.356139659881592, + "step": 668 + }, + { + "epoch": 0.43693362723487633, + "grad_norm": 24.435455038292687, + "learning_rate": 1.3855797821616888e-07, + "logits/chosen": -0.8811261653900146, + "logits/rejected": -0.8171039819717407, + "logps/chosen": -949.3507080078125, + "logps/rejected": -1035.7723388671875, + "loss": 0.4635, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.635836601257324, + "rewards/margins": 1.0779340267181396, + "rewards/rejected": -5.713770866394043, + "step": 669 + }, + { + "epoch": 0.43758674177483875, + "grad_norm": 22.738127013175763, + "learning_rate": 1.3834737206302517e-07, + "logits/chosen": -0.734673023223877, + "logits/rejected": -0.7677603363990784, + "logps/chosen": -923.2880859375, + "logps/rejected": -1033.8685302734375, + "loss": 0.485, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.669242858886719, + "rewards/margins": 1.1409337520599365, + "rewards/rejected": -5.810176849365234, + "step": 670 + }, + { + "epoch": 0.4382398563148012, + "grad_norm": 20.543237861097456, + "learning_rate": 1.3813656630678067e-07, + "logits/chosen": -0.842354953289032, + "logits/rejected": -0.7564468383789062, + "logps/chosen": -931.6202392578125, + "logps/rejected": -1077.499755859375, + "loss": 0.4288, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.472886085510254, + "rewards/margins": 1.3071494102478027, + "rewards/rejected": -5.780035495758057, + "step": 671 + }, + { + "epoch": 0.43889297085476364, + "grad_norm": 13.57877629047864, + "learning_rate": 1.3792556204470697e-07, + "logits/chosen": -0.8747435808181763, + "logits/rejected": -0.8501456379890442, + "logps/chosen": -927.4142456054688, + "logps/rejected": -1086.185791015625, + "loss": 0.4252, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.513038635253906, + "rewards/margins": 1.0269978046417236, + "rewards/rejected": -5.540036201477051, + "step": 672 + }, + { + "epoch": 0.4395460853947261, + "grad_norm": 17.356762341065927, + "learning_rate": 1.3771436037510896e-07, + "logits/chosen": -0.7796288132667542, + "logits/rejected": -0.6895512342453003, + "logps/chosen": -933.6649169921875, + "logps/rejected": -1087.0875244140625, + "loss": 0.4412, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.490601062774658, + "rewards/margins": 1.5325762033462524, + "rewards/rejected": -6.023177146911621, + "step": 673 + }, + { + "epoch": 0.44019919993468853, + "grad_norm": 13.574005853463984, + "learning_rate": 1.3750296239731897e-07, + "logits/chosen": -0.7811002731323242, + "logits/rejected": -0.851697564125061, + "logps/chosen": -782.3571166992188, + "logps/rejected": -907.9415283203125, + "loss": 0.4683, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.627669334411621, + "rewards/margins": 1.0533050298690796, + "rewards/rejected": -4.68097448348999, + "step": 674 + }, + { + "epoch": 0.440852314474651, + "grad_norm": 19.44076719795671, + "learning_rate": 1.3729136921169127e-07, + "logits/chosen": -0.8856003880500793, + "logits/rejected": -0.8591895699501038, + "logps/chosen": -834.085693359375, + "logps/rejected": -988.325927734375, + "loss": 0.465, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.8806254863739014, + "rewards/margins": 1.1521306037902832, + "rewards/rejected": -5.0327558517456055, + "step": 675 + }, + { + "epoch": 0.4415054290146134, + "grad_norm": 16.00239891382966, + "learning_rate": 1.3707958191959609e-07, + "logits/chosen": -0.8808274865150452, + "logits/rejected": -0.8277475833892822, + "logps/chosen": -858.7861938476562, + "logps/rejected": -997.075439453125, + "loss": 0.4338, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.8033220767974854, + "rewards/margins": 1.478643774986267, + "rewards/rejected": -5.281965732574463, + "step": 676 + }, + { + "epoch": 0.4421585435545759, + "grad_norm": 21.70547381761294, + "learning_rate": 1.3686760162341407e-07, + "logits/chosen": -0.8979389071464539, + "logits/rejected": -0.9083980917930603, + "logps/chosen": -903.99560546875, + "logps/rejected": -1149.4349365234375, + "loss": 0.3465, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.51162576675415, + "rewards/margins": 1.9777264595031738, + "rewards/rejected": -6.489351749420166, + "step": 677 + }, + { + "epoch": 0.4428116580945383, + "grad_norm": 14.144294296308374, + "learning_rate": 1.3665542942653045e-07, + "logits/chosen": -1.0189638137817383, + "logits/rejected": -0.9529871940612793, + "logps/chosen": -912.849609375, + "logps/rejected": -973.5966186523438, + "loss": 0.4492, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.560410022735596, + "rewards/margins": 0.8235808610916138, + "rewards/rejected": -5.38399076461792, + "step": 678 + }, + { + "epoch": 0.4434647726345008, + "grad_norm": 32.796961805908666, + "learning_rate": 1.3644306643332938e-07, + "logits/chosen": -0.6085844039916992, + "logits/rejected": -0.6985953450202942, + "logps/chosen": -815.2529907226562, + "logps/rejected": -1073.2359619140625, + "loss": 0.4895, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.089121341705322, + "rewards/margins": 1.6417101621627808, + "rewards/rejected": -5.730830669403076, + "step": 679 + }, + { + "epoch": 0.4441178871744632, + "grad_norm": 13.862963460245135, + "learning_rate": 1.3623051374918802e-07, + "logits/chosen": -0.899061918258667, + "logits/rejected": -0.8426032066345215, + "logps/chosen": -995.0509033203125, + "logps/rejected": -1092.8623046875, + "loss": 0.4237, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.662611484527588, + "rewards/margins": 1.1843613386154175, + "rewards/rejected": -5.846972465515137, + "step": 680 + }, + { + "epoch": 0.4447710017144257, + "grad_norm": 14.735642845594755, + "learning_rate": 1.3601777248047104e-07, + "logits/chosen": -0.6714251637458801, + "logits/rejected": -0.747349739074707, + "logps/chosen": -862.39697265625, + "logps/rejected": -1099.48876953125, + "loss": 0.4277, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.98651385307312, + "rewards/margins": 1.7703197002410889, + "rewards/rejected": -5.756833553314209, + "step": 681 + }, + { + "epoch": 0.4454241162543881, + "grad_norm": 17.382626342442304, + "learning_rate": 1.358048437345246e-07, + "logits/chosen": -0.8407683968544006, + "logits/rejected": -0.8594624400138855, + "logps/chosen": -924.0106201171875, + "logps/rejected": -1134.36865234375, + "loss": 0.4479, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.400675296783447, + "rewards/margins": 1.7937949895858765, + "rewards/rejected": -6.194470405578613, + "step": 682 + }, + { + "epoch": 0.4460772307943506, + "grad_norm": 14.86569909599256, + "learning_rate": 1.3559172861967076e-07, + "logits/chosen": -0.8785750865936279, + "logits/rejected": -0.8424649238586426, + "logps/chosen": -843.7998046875, + "logps/rejected": -980.86767578125, + "loss": 0.4113, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.896084785461426, + "rewards/margins": 1.296176552772522, + "rewards/rejected": -5.192261219024658, + "step": 683 + }, + { + "epoch": 0.446730345334313, + "grad_norm": 13.07522953792525, + "learning_rate": 1.3537842824520164e-07, + "logits/chosen": -0.8316384553909302, + "logits/rejected": -0.7714143991470337, + "logps/chosen": -952.236572265625, + "logps/rejected": -1082.39794921875, + "loss": 0.3925, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.617538928985596, + "rewards/margins": 1.4145046472549438, + "rewards/rejected": -6.03204345703125, + "step": 684 + }, + { + "epoch": 0.44738345987427547, + "grad_norm": 23.423333193136706, + "learning_rate": 1.3516494372137366e-07, + "logits/chosen": -0.641627311706543, + "logits/rejected": -0.5470828413963318, + "logps/chosen": -860.3857421875, + "logps/rejected": -966.0579223632812, + "loss": 0.4359, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.126357078552246, + "rewards/margins": 1.1212894916534424, + "rewards/rejected": -5.247646808624268, + "step": 685 + }, + { + "epoch": 0.4480365744142379, + "grad_norm": 13.282071369818777, + "learning_rate": 1.3495127615940178e-07, + "logits/chosen": -0.8730409741401672, + "logits/rejected": -0.7727816700935364, + "logps/chosen": -948.9930419921875, + "logps/rejected": -1041.864501953125, + "loss": 0.386, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.173754692077637, + "rewards/margins": 1.2896356582641602, + "rewards/rejected": -5.463390827178955, + "step": 686 + }, + { + "epoch": 0.44868968895420036, + "grad_norm": 14.831168635800106, + "learning_rate": 1.347374266714537e-07, + "logits/chosen": -0.6686832904815674, + "logits/rejected": -0.7571280002593994, + "logps/chosen": -874.6576538085938, + "logps/rejected": -1133.1376953125, + "loss": 0.3904, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.000096321105957, + "rewards/margins": 2.115196466445923, + "rewards/rejected": -6.115293502807617, + "step": 687 + }, + { + "epoch": 0.4493428034941628, + "grad_norm": 23.559349783811268, + "learning_rate": 1.34523396370644e-07, + "logits/chosen": -0.7094777822494507, + "logits/rejected": -0.715304434299469, + "logps/chosen": -905.0133056640625, + "logps/rejected": -1173.3590087890625, + "loss": 0.4411, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.392083644866943, + "rewards/margins": 2.4449682235717773, + "rewards/rejected": -6.837051868438721, + "step": 688 + }, + { + "epoch": 0.44999591803412525, + "grad_norm": 13.224110895762323, + "learning_rate": 1.3430918637102846e-07, + "logits/chosen": -0.9106907248497009, + "logits/rejected": -0.8780658841133118, + "logps/chosen": -859.8287353515625, + "logps/rejected": -963.4639282226562, + "loss": 0.3873, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.072977066040039, + "rewards/margins": 1.1936553716659546, + "rewards/rejected": -5.266632556915283, + "step": 689 + }, + { + "epoch": 0.45064903257408767, + "grad_norm": 16.436858644686243, + "learning_rate": 1.3409479778759828e-07, + "logits/chosen": -0.858303427696228, + "logits/rejected": -0.7822844982147217, + "logps/chosen": -976.535400390625, + "logps/rejected": -1135.9136962890625, + "loss": 0.4392, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.008065700531006, + "rewards/margins": 1.3923001289367676, + "rewards/rejected": -6.400365829467773, + "step": 690 + }, + { + "epoch": 0.45130214711405015, + "grad_norm": 13.43588871300013, + "learning_rate": 1.3388023173627412e-07, + "logits/chosen": -0.8778905272483826, + "logits/rejected": -0.853691577911377, + "logps/chosen": -942.0894165039062, + "logps/rejected": -1106.460205078125, + "loss": 0.3977, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.535030841827393, + "rewards/margins": 1.272776484489441, + "rewards/rejected": -5.807807445526123, + "step": 691 + }, + { + "epoch": 0.45195526165401256, + "grad_norm": 16.480774943029072, + "learning_rate": 1.3366548933390041e-07, + "logits/chosen": -0.9276471138000488, + "logits/rejected": -0.89919114112854, + "logps/chosen": -926.0335693359375, + "logps/rejected": -1051.7315673828125, + "loss": 0.4608, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.60793399810791, + "rewards/margins": 1.46094810962677, + "rewards/rejected": -6.068882465362549, + "step": 692 + }, + { + "epoch": 0.45260837619397504, + "grad_norm": 20.387508274609566, + "learning_rate": 1.3345057169823951e-07, + "logits/chosen": -0.7017565369606018, + "logits/rejected": -0.7618687152862549, + "logps/chosen": -863.0051879882812, + "logps/rejected": -1051.503662109375, + "loss": 0.4291, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.024776458740234, + "rewards/margins": 1.6188499927520752, + "rewards/rejected": -5.6436262130737305, + "step": 693 + }, + { + "epoch": 0.45326149073393746, + "grad_norm": 12.858851580750146, + "learning_rate": 1.3323547994796595e-07, + "logits/chosen": -0.9381989240646362, + "logits/rejected": -0.8059816360473633, + "logps/chosen": -887.5203247070312, + "logps/rejected": -969.3154907226562, + "loss": 0.4574, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.702977180480957, + "rewards/margins": 1.40816068649292, + "rewards/rejected": -5.111137390136719, + "step": 694 + }, + { + "epoch": 0.45391460527389993, + "grad_norm": 23.743275342395183, + "learning_rate": 1.3302021520266046e-07, + "logits/chosen": -0.9305901527404785, + "logits/rejected": -0.8084038496017456, + "logps/chosen": -942.6619873046875, + "logps/rejected": -1260.660400390625, + "loss": 0.4013, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.92469596862793, + "rewards/margins": 3.1635892391204834, + "rewards/rejected": -8.088285446166992, + "step": 695 + }, + { + "epoch": 0.45456771981386235, + "grad_norm": 11.805074350198156, + "learning_rate": 1.3280477858280427e-07, + "logits/chosen": -0.8544498682022095, + "logits/rejected": -0.7820972800254822, + "logps/chosen": -955.1072387695312, + "logps/rejected": -1216.007568359375, + "loss": 0.3515, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.631394863128662, + "rewards/margins": 2.7073287963867188, + "rewards/rejected": -7.3387227058410645, + "step": 696 + }, + { + "epoch": 0.4552208343538248, + "grad_norm": 17.61036279740596, + "learning_rate": 1.3258917120977328e-07, + "logits/chosen": -0.7109684944152832, + "logits/rejected": -0.7692880630493164, + "logps/chosen": -930.1781005859375, + "logps/rejected": -1101.009765625, + "loss": 0.4095, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.458493232727051, + "rewards/margins": 1.6143523454666138, + "rewards/rejected": -6.072846412658691, + "step": 697 + }, + { + "epoch": 0.45587394889378724, + "grad_norm": 22.083953625475885, + "learning_rate": 1.3237339420583212e-07, + "logits/chosen": -0.7885478734970093, + "logits/rejected": -0.7560752034187317, + "logps/chosen": -823.4185180664062, + "logps/rejected": -974.829345703125, + "loss": 0.421, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.8105380535125732, + "rewards/margins": 1.4178028106689453, + "rewards/rejected": -5.228341102600098, + "step": 698 + }, + { + "epoch": 0.4565270634337497, + "grad_norm": 24.864037856177223, + "learning_rate": 1.3215744869412835e-07, + "logits/chosen": -0.801964521408081, + "logits/rejected": -0.8227947950363159, + "logps/chosen": -896.732177734375, + "logps/rejected": -1030.0692138671875, + "loss": 0.4344, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.365248680114746, + "rewards/margins": 1.3140637874603271, + "rewards/rejected": -5.679312705993652, + "step": 699 + }, + { + "epoch": 0.45718017797371213, + "grad_norm": 26.55049991945704, + "learning_rate": 1.3194133579868672e-07, + "logits/chosen": -0.9605610370635986, + "logits/rejected": -0.9034255146980286, + "logps/chosen": -874.904296875, + "logps/rejected": -1062.235107421875, + "loss": 0.4538, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.245553493499756, + "rewards/margins": 1.9700900316238403, + "rewards/rejected": -6.215643882751465, + "step": 700 + }, + { + "epoch": 0.45718017797371213, + "eval_logits/chosen": -0.6330902576446533, + "eval_logits/rejected": -0.5789201855659485, + "eval_logps/chosen": -912.4920043945312, + "eval_logps/rejected": -1030.6561279296875, + "eval_loss": 0.43503955006599426, + "eval_rewards/accuracies": 0.7889999747276306, + "eval_rewards/chosen": -4.385847568511963, + "eval_rewards/margins": 1.2690269947052002, + "eval_rewards/rejected": -5.654874324798584, + "eval_runtime": 614.4325, + "eval_samples_per_second": 6.51, + "eval_steps_per_second": 0.407, + "step": 700 + }, + { + "epoch": 0.4578332925136746, + "grad_norm": 22.12940309127682, + "learning_rate": 1.317250566444032e-07, + "logits/chosen": -0.9155985713005066, + "logits/rejected": -0.7935531735420227, + "logps/chosen": -908.0445556640625, + "logps/rejected": -985.8743896484375, + "loss": 0.4656, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.054415225982666, + "rewards/margins": 0.8908652663230896, + "rewards/rejected": -4.945281028747559, + "step": 701 + }, + { + "epoch": 0.458486407053637, + "grad_norm": 15.953258643762798, + "learning_rate": 1.3150861235703912e-07, + "logits/chosen": -0.7887985706329346, + "logits/rejected": -0.6869586110115051, + "logps/chosen": -855.2444458007812, + "logps/rejected": -927.878173828125, + "loss": 0.4191, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.412371635437012, + "rewards/margins": 0.866736888885498, + "rewards/rejected": -5.279108047485352, + "step": 702 + }, + { + "epoch": 0.4591395215935995, + "grad_norm": 28.696366226340047, + "learning_rate": 1.3129200406321544e-07, + "logits/chosen": -0.9091500043869019, + "logits/rejected": -0.9133652448654175, + "logps/chosen": -915.0131225585938, + "logps/rejected": -1015.6453857421875, + "loss": 0.4275, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.232908248901367, + "rewards/margins": 1.2067118883132935, + "rewards/rejected": -5.439619064331055, + "step": 703 + }, + { + "epoch": 0.4597926361335619, + "grad_norm": 15.37871134771081, + "learning_rate": 1.310752328904067e-07, + "logits/chosen": -0.9574613571166992, + "logits/rejected": -0.9025395512580872, + "logps/chosen": -894.5764770507812, + "logps/rejected": -1112.0906982421875, + "loss": 0.4092, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.34193754196167, + "rewards/margins": 1.7960808277130127, + "rewards/rejected": -6.1380181312561035, + "step": 704 + }, + { + "epoch": 0.4604457506735244, + "grad_norm": 19.648239313623666, + "learning_rate": 1.3085829996693524e-07, + "logits/chosen": -0.7728855609893799, + "logits/rejected": -0.755530595779419, + "logps/chosen": -808.7198486328125, + "logps/rejected": -996.7811279296875, + "loss": 0.356, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.7491912841796875, + "rewards/margins": 1.527405023574829, + "rewards/rejected": -5.2765960693359375, + "step": 705 + }, + { + "epoch": 0.4610988652134868, + "grad_norm": 16.7988475441762, + "learning_rate": 1.3064120642196547e-07, + "logits/chosen": -0.843777596950531, + "logits/rejected": -0.9133669137954712, + "logps/chosen": -928.4371948242188, + "logps/rejected": -1059.2352294921875, + "loss": 0.4309, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.472860813140869, + "rewards/margins": 1.21001398563385, + "rewards/rejected": -5.68287467956543, + "step": 706 + }, + { + "epoch": 0.4617519797534493, + "grad_norm": 26.756288502338244, + "learning_rate": 1.304239533854977e-07, + "logits/chosen": -0.7616496682167053, + "logits/rejected": -0.7714404463768005, + "logps/chosen": -957.3233032226562, + "logps/rejected": -1242.140380859375, + "loss": 0.3533, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.836521625518799, + "rewards/margins": 2.0891270637512207, + "rewards/rejected": -6.925648212432861, + "step": 707 + }, + { + "epoch": 0.4624050942934117, + "grad_norm": 20.678633601313965, + "learning_rate": 1.3020654198836248e-07, + "logits/chosen": -0.8029762506484985, + "logits/rejected": -0.6649346947669983, + "logps/chosen": -894.7766723632812, + "logps/rejected": -961.3494262695312, + "loss": 0.3826, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.524013042449951, + "rewards/margins": 1.130009412765503, + "rewards/rejected": -5.654022216796875, + "step": 708 + }, + { + "epoch": 0.4630582088333742, + "grad_norm": 16.42456254623477, + "learning_rate": 1.2998897336221468e-07, + "logits/chosen": -0.6833517551422119, + "logits/rejected": -0.796322226524353, + "logps/chosen": -862.7698974609375, + "logps/rejected": -1060.818359375, + "loss": 0.3926, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9077136516571045, + "rewards/margins": 1.6327791213989258, + "rewards/rejected": -5.540492534637451, + "step": 709 + }, + { + "epoch": 0.4637113233733366, + "grad_norm": 23.17514164745614, + "learning_rate": 1.297712486395275e-07, + "logits/chosen": -0.9213692545890808, + "logits/rejected": -0.8998441100120544, + "logps/chosen": -959.6565551757812, + "logps/rejected": -1069.1971435546875, + "loss": 0.4986, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.9136199951171875, + "rewards/margins": 1.4247158765792847, + "rewards/rejected": -6.338335990905762, + "step": 710 + }, + { + "epoch": 0.46436443791329907, + "grad_norm": 16.192431938968557, + "learning_rate": 1.295533689535867e-07, + "logits/chosen": -0.7986994981765747, + "logits/rejected": -0.826543390750885, + "logps/chosen": -1040.4547119140625, + "logps/rejected": -1238.6219482421875, + "loss": 0.3827, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.081081867218018, + "rewards/margins": 1.767729640007019, + "rewards/rejected": -6.848812103271484, + "step": 711 + }, + { + "epoch": 0.4650175524532615, + "grad_norm": 15.779594539219438, + "learning_rate": 1.2933533543848462e-07, + "logits/chosen": -0.5566374659538269, + "logits/rejected": -0.5704978704452515, + "logps/chosen": -902.4140625, + "logps/rejected": -1187.8582763671875, + "loss": 0.4045, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.638293743133545, + "rewards/margins": 2.1582648754119873, + "rewards/rejected": -6.796557903289795, + "step": 712 + }, + { + "epoch": 0.46567066699322396, + "grad_norm": 21.262426685835567, + "learning_rate": 1.2911714922911425e-07, + "logits/chosen": -0.7912888526916504, + "logits/rejected": -0.7453445792198181, + "logps/chosen": -935.51513671875, + "logps/rejected": -1078.4232177734375, + "loss": 0.4809, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.693671226501465, + "rewards/margins": 1.4786438941955566, + "rewards/rejected": -6.172314643859863, + "step": 713 + }, + { + "epoch": 0.4663237815331864, + "grad_norm": 17.990290034456308, + "learning_rate": 1.2889881146116349e-07, + "logits/chosen": -0.7075154185295105, + "logits/rejected": -0.6138902902603149, + "logps/chosen": -848.4879760742188, + "logps/rejected": -1001.8304443359375, + "loss": 0.4373, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.287723064422607, + "rewards/margins": 1.51327383518219, + "rewards/rejected": -5.800996780395508, + "step": 714 + }, + { + "epoch": 0.46697689607314885, + "grad_norm": 20.00394069949329, + "learning_rate": 1.2868032327110903e-07, + "logits/chosen": -0.9849535226821899, + "logits/rejected": -0.8561375737190247, + "logps/chosen": -951.6922607421875, + "logps/rejected": -1065.832275390625, + "loss": 0.4215, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.533693313598633, + "rewards/margins": 1.485561728477478, + "rewards/rejected": -6.019254684448242, + "step": 715 + }, + { + "epoch": 0.46763001061311127, + "grad_norm": 18.908563866657076, + "learning_rate": 1.2846168579621054e-07, + "logits/chosen": -0.8201822638511658, + "logits/rejected": -0.7975326180458069, + "logps/chosen": -933.5443725585938, + "logps/rejected": -1024.7989501953125, + "loss": 0.4563, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.578218936920166, + "rewards/margins": 1.273808479309082, + "rewards/rejected": -5.85202693939209, + "step": 716 + }, + { + "epoch": 0.46828312515307374, + "grad_norm": 13.605006365729439, + "learning_rate": 1.2824290017450478e-07, + "logits/chosen": -0.8025465607643127, + "logits/rejected": -0.8043302893638611, + "logps/chosen": -825.5875244140625, + "logps/rejected": -965.36962890625, + "loss": 0.4272, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7525651454925537, + "rewards/margins": 1.5843777656555176, + "rewards/rejected": -5.336942672729492, + "step": 717 + }, + { + "epoch": 0.46893623969303616, + "grad_norm": 23.393158261741338, + "learning_rate": 1.2802396754479957e-07, + "logits/chosen": -0.9675735831260681, + "logits/rejected": -0.8064472079277039, + "logps/chosen": -1006.602294921875, + "logps/rejected": -1092.76904296875, + "loss": 0.4558, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.947066307067871, + "rewards/margins": 1.4163665771484375, + "rewards/rejected": -6.363432884216309, + "step": 718 + }, + { + "epoch": 0.46958935423299863, + "grad_norm": 19.761890309512406, + "learning_rate": 1.27804889046668e-07, + "logits/chosen": -0.9035289883613586, + "logits/rejected": -0.8827542066574097, + "logps/chosen": -860.1765747070312, + "logps/rejected": -967.9915771484375, + "loss": 0.3329, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.125575065612793, + "rewards/margins": 1.158940315246582, + "rewards/rejected": -5.284515380859375, + "step": 719 + }, + { + "epoch": 0.47024246877296105, + "grad_norm": 34.43053856299178, + "learning_rate": 1.2758566582044235e-07, + "logits/chosen": -0.8743783235549927, + "logits/rejected": -0.8185849189758301, + "logps/chosen": -904.7877807617188, + "logps/rejected": -1013.7088012695312, + "loss": 0.4482, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.368615627288818, + "rewards/margins": 1.3164438009262085, + "rewards/rejected": -5.685059547424316, + "step": 720 + }, + { + "epoch": 0.4708955833129235, + "grad_norm": 17.2479400408533, + "learning_rate": 1.273662990072083e-07, + "logits/chosen": -0.5357871651649475, + "logits/rejected": -0.5141369700431824, + "logps/chosen": -915.4850463867188, + "logps/rejected": -1039.861328125, + "loss": 0.452, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.497086524963379, + "rewards/margins": 1.1182925701141357, + "rewards/rejected": -5.615379333496094, + "step": 721 + }, + { + "epoch": 0.47154869785288595, + "grad_norm": 30.52690046250129, + "learning_rate": 1.2714678974879885e-07, + "logits/chosen": -0.8510643839836121, + "logits/rejected": -0.7935106754302979, + "logps/chosen": -922.787841796875, + "logps/rejected": -1031.25537109375, + "loss": 0.4458, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.576708793640137, + "rewards/margins": 1.3589658737182617, + "rewards/rejected": -5.935675144195557, + "step": 722 + }, + { + "epoch": 0.4722018123928484, + "grad_norm": 12.51511568238771, + "learning_rate": 1.2692713918778846e-07, + "logits/chosen": -0.9695079326629639, + "logits/rejected": -0.9029305577278137, + "logps/chosen": -849.284423828125, + "logps/rejected": -983.3785400390625, + "loss": 0.3653, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8184423446655273, + "rewards/margins": 1.5116703510284424, + "rewards/rejected": -5.330112457275391, + "step": 723 + }, + { + "epoch": 0.47285492693281084, + "grad_norm": 21.528089592243187, + "learning_rate": 1.2670734846748716e-07, + "logits/chosen": -0.7152475714683533, + "logits/rejected": -0.6749838590621948, + "logps/chosen": -837.8988037109375, + "logps/rejected": -995.8895263671875, + "loss": 0.4078, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.6210410594940186, + "rewards/margins": 1.5628869533538818, + "rewards/rejected": -5.183927536010742, + "step": 724 + }, + { + "epoch": 0.4735080414727733, + "grad_norm": 25.426263169248593, + "learning_rate": 1.2648741873193445e-07, + "logits/chosen": -0.9211537837982178, + "logits/rejected": -0.8691627383232117, + "logps/chosen": -927.373779296875, + "logps/rejected": -1008.0757446289062, + "loss": 0.4983, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.5448899269104, + "rewards/margins": 0.8503549695014954, + "rewards/rejected": -5.395245552062988, + "step": 725 + }, + { + "epoch": 0.47416115601273573, + "grad_norm": 13.836360831145551, + "learning_rate": 1.2626735112589345e-07, + "logits/chosen": -0.5572523474693298, + "logits/rejected": -0.577164351940155, + "logps/chosen": -775.7666625976562, + "logps/rejected": -1003.75634765625, + "loss": 0.3885, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.658046245574951, + "rewards/margins": 1.7806352376937866, + "rewards/rejected": -5.438681125640869, + "step": 726 + }, + { + "epoch": 0.4748142705526982, + "grad_norm": 19.00950665056247, + "learning_rate": 1.2604714679484488e-07, + "logits/chosen": -0.9352363348007202, + "logits/rejected": -0.9275373220443726, + "logps/chosen": -850.5507202148438, + "logps/rejected": -979.1115112304688, + "loss": 0.4245, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.22627067565918, + "rewards/margins": 1.3016259670257568, + "rewards/rejected": -5.527897357940674, + "step": 727 + }, + { + "epoch": 0.4754673850926606, + "grad_norm": 16.93055884467082, + "learning_rate": 1.2582680688498123e-07, + "logits/chosen": -0.7490636706352234, + "logits/rejected": -0.7202102541923523, + "logps/chosen": -831.2103271484375, + "logps/rejected": -961.669921875, + "loss": 0.4218, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.167855739593506, + "rewards/margins": 1.4452309608459473, + "rewards/rejected": -5.613086700439453, + "step": 728 + }, + { + "epoch": 0.4761204996326231, + "grad_norm": 16.291144939826697, + "learning_rate": 1.2560633254320057e-07, + "logits/chosen": -0.7099978923797607, + "logits/rejected": -0.9346813559532166, + "logps/chosen": -838.0257568359375, + "logps/rejected": -1143.6383056640625, + "loss": 0.409, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.113926887512207, + "rewards/margins": 1.7861765623092651, + "rewards/rejected": -5.9001030921936035, + "step": 729 + }, + { + "epoch": 0.4767736141725855, + "grad_norm": 27.67917554312354, + "learning_rate": 1.2538572491710077e-07, + "logits/chosen": -0.7133294939994812, + "logits/rejected": -0.6676253080368042, + "logps/chosen": -820.7550048828125, + "logps/rejected": -966.079345703125, + "loss": 0.4067, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.8138749599456787, + "rewards/margins": 1.4336824417114258, + "rewards/rejected": -5.247557163238525, + "step": 730 + }, + { + "epoch": 0.477426728712548, + "grad_norm": 25.703958623986306, + "learning_rate": 1.251649851549735e-07, + "logits/chosen": -0.7251565456390381, + "logits/rejected": -0.6387436985969543, + "logps/chosen": -896.2338256835938, + "logps/rejected": -1028.1199951171875, + "loss": 0.4196, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.099024295806885, + "rewards/margins": 1.6823163032531738, + "rewards/rejected": -5.781341075897217, + "step": 731 + }, + { + "epoch": 0.4780798432525104, + "grad_norm": 37.02785000532457, + "learning_rate": 1.2494411440579813e-07, + "logits/chosen": -0.9833545088768005, + "logits/rejected": -0.9329172968864441, + "logps/chosen": -935.530517578125, + "logps/rejected": -1039.677978515625, + "loss": 0.3888, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.771054744720459, + "rewards/margins": 1.2584686279296875, + "rewards/rejected": -6.0295233726501465, + "step": 732 + }, + { + "epoch": 0.4787329577924729, + "grad_norm": 25.79246674335453, + "learning_rate": 1.2472311381923588e-07, + "logits/chosen": -0.9473874568939209, + "logits/rejected": -0.9712878465652466, + "logps/chosen": -969.2967529296875, + "logps/rejected": -1192.9619140625, + "loss": 0.3645, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.9531495571136475, + "rewards/margins": 2.2883496284484863, + "rewards/rejected": -6.241498947143555, + "step": 733 + }, + { + "epoch": 0.4793860723324353, + "grad_norm": 20.659869144712403, + "learning_rate": 1.245019845456238e-07, + "logits/chosen": -0.51751708984375, + "logits/rejected": -0.42947906255722046, + "logps/chosen": -836.533203125, + "logps/rejected": -946.0396728515625, + "loss": 0.4573, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.261209487915039, + "rewards/margins": 1.200287103652954, + "rewards/rejected": -5.461496829986572, + "step": 734 + }, + { + "epoch": 0.48003918687239777, + "grad_norm": 21.72711267793662, + "learning_rate": 1.2428072773596873e-07, + "logits/chosen": -0.7008222937583923, + "logits/rejected": -0.6094821691513062, + "logps/chosen": -864.21435546875, + "logps/rejected": -954.5537109375, + "loss": 0.4959, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.461493015289307, + "rewards/margins": 0.9662419557571411, + "rewards/rejected": -5.427734375, + "step": 735 + }, + { + "epoch": 0.4806923014123602, + "grad_norm": 15.65230950114833, + "learning_rate": 1.2405934454194144e-07, + "logits/chosen": -0.899868369102478, + "logits/rejected": -0.8498630523681641, + "logps/chosen": -852.909912109375, + "logps/rejected": -932.8972778320312, + "loss": 0.4457, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.355042934417725, + "rewards/margins": 0.8693320751190186, + "rewards/rejected": -5.224374771118164, + "step": 736 + }, + { + "epoch": 0.48134541595232266, + "grad_norm": 17.108068590790964, + "learning_rate": 1.2383783611587044e-07, + "logits/chosen": -0.8432599902153015, + "logits/rejected": -0.843549907207489, + "logps/chosen": -936.5597534179688, + "logps/rejected": -1126.088134765625, + "loss": 0.4673, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.594806671142578, + "rewards/margins": 1.714801549911499, + "rewards/rejected": -6.309607982635498, + "step": 737 + }, + { + "epoch": 0.4819985304922851, + "grad_norm": 20.363289207881063, + "learning_rate": 1.2361620361073617e-07, + "logits/chosen": -1.0172386169433594, + "logits/rejected": -0.8159488439559937, + "logps/chosen": -1016.598388671875, + "logps/rejected": -1100.7061767578125, + "loss": 0.5102, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.072577476501465, + "rewards/margins": 1.0971565246582031, + "rewards/rejected": -6.169734001159668, + "step": 738 + }, + { + "epoch": 0.48265164503224756, + "grad_norm": 14.671715900291483, + "learning_rate": 1.2339444818016487e-07, + "logits/chosen": -0.6325951218605042, + "logits/rejected": -0.7422337532043457, + "logps/chosen": -930.2354736328125, + "logps/rejected": -1154.2427978515625, + "loss": 0.4045, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.799300193786621, + "rewards/margins": 1.4765405654907227, + "rewards/rejected": -6.2758402824401855, + "step": 739 + }, + { + "epoch": 0.48330475957221, + "grad_norm": 34.521623507158026, + "learning_rate": 1.2317257097842262e-07, + "logits/chosen": -0.9213210344314575, + "logits/rejected": -0.7929114103317261, + "logps/chosen": -853.6707763671875, + "logps/rejected": -988.0006103515625, + "loss": 0.5518, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.203425407409668, + "rewards/margins": 1.2507836818695068, + "rewards/rejected": -5.454209327697754, + "step": 740 + }, + { + "epoch": 0.48395787411217245, + "grad_norm": 43.15485782986652, + "learning_rate": 1.2295057316040937e-07, + "logits/chosen": -0.8311377763748169, + "logits/rejected": -0.7532038688659668, + "logps/chosen": -920.2929077148438, + "logps/rejected": -970.626953125, + "loss": 0.5301, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.7200517654418945, + "rewards/margins": 0.9582325220108032, + "rewards/rejected": -5.678284645080566, + "step": 741 + }, + { + "epoch": 0.48461098865213487, + "grad_norm": 19.92124882456644, + "learning_rate": 1.2272845588165287e-07, + "logits/chosen": -0.7741049528121948, + "logits/rejected": -0.9612672328948975, + "logps/chosen": -770.1849365234375, + "logps/rejected": -975.554931640625, + "loss": 0.4058, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.5785880088806152, + "rewards/margins": 1.356839895248413, + "rewards/rejected": -4.935428142547607, + "step": 742 + }, + { + "epoch": 0.48526410319209734, + "grad_norm": 14.002013827452883, + "learning_rate": 1.2250622029830272e-07, + "logits/chosen": -0.8265233635902405, + "logits/rejected": -0.7270675897598267, + "logps/chosen": -918.1268310546875, + "logps/rejected": -1094.3868408203125, + "loss": 0.406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.770752906799316, + "rewards/margins": 1.5041320323944092, + "rewards/rejected": -6.274885177612305, + "step": 743 + }, + { + "epoch": 0.48591721773205976, + "grad_norm": 13.769152189829311, + "learning_rate": 1.2228386756712425e-07, + "logits/chosen": -0.9900491833686829, + "logits/rejected": -0.978319525718689, + "logps/chosen": -960.6400756835938, + "logps/rejected": -1180.7225341796875, + "loss": 0.4032, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.634914398193359, + "rewards/margins": 1.8906114101409912, + "rewards/rejected": -6.5255255699157715, + "step": 744 + }, + { + "epoch": 0.48657033227202223, + "grad_norm": 17.539591551900347, + "learning_rate": 1.2206139884549258e-07, + "logits/chosen": -0.8892086744308472, + "logits/rejected": -0.9011514186859131, + "logps/chosen": -989.2440185546875, + "logps/rejected": -1094.5552978515625, + "loss": 0.4855, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.743755340576172, + "rewards/margins": 1.1824637651443481, + "rewards/rejected": -5.926219940185547, + "step": 745 + }, + { + "epoch": 0.48722344681198465, + "grad_norm": 16.1789735995074, + "learning_rate": 1.218388152913866e-07, + "logits/chosen": -0.6589198112487793, + "logits/rejected": -0.5492499470710754, + "logps/chosen": -767.7838745117188, + "logps/rejected": -982.4749145507812, + "loss": 0.4377, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.803617000579834, + "rewards/margins": 1.8227593898773193, + "rewards/rejected": -5.626376628875732, + "step": 746 + }, + { + "epoch": 0.4878765613519471, + "grad_norm": 18.517762291790852, + "learning_rate": 1.2161611806338287e-07, + "logits/chosen": -0.8017712831497192, + "logits/rejected": -0.7148299813270569, + "logps/chosen": -857.5178833007812, + "logps/rejected": -951.9879150390625, + "loss": 0.407, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.7201151847839355, + "rewards/margins": 1.41351318359375, + "rewards/rejected": -5.1336283683776855, + "step": 747 + }, + { + "epoch": 0.48852967589190954, + "grad_norm": 17.3742032384711, + "learning_rate": 1.2139330832064973e-07, + "logits/chosen": -0.933401346206665, + "logits/rejected": -0.8776005506515503, + "logps/chosen": -845.0396728515625, + "logps/rejected": -936.0737915039062, + "loss": 0.3894, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.5093374252319336, + "rewards/margins": 1.1748614311218262, + "rewards/rejected": -4.68419885635376, + "step": 748 + }, + { + "epoch": 0.489182790431872, + "grad_norm": 20.0598159118841, + "learning_rate": 1.2117038722294108e-07, + "logits/chosen": -0.8245954513549805, + "logits/rejected": -0.8754929900169373, + "logps/chosen": -804.7047729492188, + "logps/rejected": -986.4611206054688, + "loss": 0.4746, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.710789680480957, + "rewards/margins": 1.3969413042068481, + "rewards/rejected": -5.107730865478516, + "step": 749 + }, + { + "epoch": 0.48983590497183443, + "grad_norm": 31.59240574933894, + "learning_rate": 1.2094735593059044e-07, + "logits/chosen": -0.9009556174278259, + "logits/rejected": -0.8687411546707153, + "logps/chosen": -878.0067749023438, + "logps/rejected": -1024.1734619140625, + "loss": 0.3774, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.03043270111084, + "rewards/margins": 1.4627373218536377, + "rewards/rejected": -5.493169784545898, + "step": 750 + }, + { + "epoch": 0.4904890195117969, + "grad_norm": 24.791969285714874, + "learning_rate": 1.2072421560450497e-07, + "logits/chosen": -0.8229708075523376, + "logits/rejected": -0.8028020262718201, + "logps/chosen": -848.9623413085938, + "logps/rejected": -948.7738037109375, + "loss": 0.428, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.879707098007202, + "rewards/margins": 1.1067575216293335, + "rewards/rejected": -4.986464977264404, + "step": 751 + }, + { + "epoch": 0.4911421340517593, + "grad_norm": 12.606071497414911, + "learning_rate": 1.2050096740615933e-07, + "logits/chosen": -0.8053447008132935, + "logits/rejected": -0.8125733137130737, + "logps/chosen": -849.5568237304688, + "logps/rejected": -1000.6946411132812, + "loss": 0.3875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.221886157989502, + "rewards/margins": 1.2606518268585205, + "rewards/rejected": -5.482537269592285, + "step": 752 + }, + { + "epoch": 0.4917952485917218, + "grad_norm": 14.52811675410469, + "learning_rate": 1.2027761249758962e-07, + "logits/chosen": -0.7513640522956848, + "logits/rejected": -0.8120062351226807, + "logps/chosen": -892.8887329101562, + "logps/rejected": -1004.2350463867188, + "loss": 0.449, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.481434345245361, + "rewards/margins": 0.953445553779602, + "rewards/rejected": -5.434879779815674, + "step": 753 + }, + { + "epoch": 0.4924483631316842, + "grad_norm": 13.38995547174789, + "learning_rate": 1.200541520413875e-07, + "logits/chosen": -0.9067294597625732, + "logits/rejected": -0.9803101420402527, + "logps/chosen": -822.230224609375, + "logps/rejected": -969.9388427734375, + "loss": 0.3983, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.3725507259368896, + "rewards/margins": 1.4337201118469238, + "rewards/rejected": -4.806271076202393, + "step": 754 + }, + { + "epoch": 0.49310147767164664, + "grad_norm": 19.628709746070033, + "learning_rate": 1.1983058720069397e-07, + "logits/chosen": -0.769875705242157, + "logits/rejected": -0.6702077984809875, + "logps/chosen": -857.1516723632812, + "logps/rejected": -938.2532958984375, + "loss": 0.4094, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.118803977966309, + "rewards/margins": 1.0416544675827026, + "rewards/rejected": -5.160459041595459, + "step": 755 + }, + { + "epoch": 0.4937545922116091, + "grad_norm": 18.81970358020589, + "learning_rate": 1.1960691913919326e-07, + "logits/chosen": -0.7793917655944824, + "logits/rejected": -0.7675676345825195, + "logps/chosen": -877.9605102539062, + "logps/rejected": -989.585205078125, + "loss": 0.4474, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.07187032699585, + "rewards/margins": 1.2758498191833496, + "rewards/rejected": -5.347719192504883, + "step": 756 + }, + { + "epoch": 0.49440770675157153, + "grad_norm": 20.09325182686205, + "learning_rate": 1.19383149021107e-07, + "logits/chosen": -0.8387940526008606, + "logits/rejected": -0.8090571165084839, + "logps/chosen": -885.7142944335938, + "logps/rejected": -980.8731689453125, + "loss": 0.3542, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5571885108947754, + "rewards/margins": 1.6131629943847656, + "rewards/rejected": -5.170351982116699, + "step": 757 + }, + { + "epoch": 0.495060821291534, + "grad_norm": 27.816476529936935, + "learning_rate": 1.1915927801118804e-07, + "logits/chosen": -0.9003801345825195, + "logits/rejected": -0.8655527830123901, + "logps/chosen": -904.9478759765625, + "logps/rejected": -1061.704345703125, + "loss": 0.4283, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.322434902191162, + "rewards/margins": 1.379239559173584, + "rewards/rejected": -5.701674461364746, + "step": 758 + }, + { + "epoch": 0.4957139358314964, + "grad_norm": 20.61033743233212, + "learning_rate": 1.1893530727471428e-07, + "logits/chosen": -0.9334514737129211, + "logits/rejected": -0.8155126571655273, + "logps/chosen": -946.715576171875, + "logps/rejected": -1062.2564697265625, + "loss": 0.4412, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.529923915863037, + "rewards/margins": 1.363791823387146, + "rewards/rejected": -5.8937153816223145, + "step": 759 + }, + { + "epoch": 0.4963670503714589, + "grad_norm": 33.77412475553499, + "learning_rate": 1.1871123797748283e-07, + "logits/chosen": -0.6742445230484009, + "logits/rejected": -0.5909000635147095, + "logps/chosen": -949.4371337890625, + "logps/rejected": -1071.842529296875, + "loss": 0.4351, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.558448791503906, + "rewards/margins": 1.3982633352279663, + "rewards/rejected": -5.956711292266846, + "step": 760 + }, + { + "epoch": 0.4970201649114213, + "grad_norm": 20.54838651893162, + "learning_rate": 1.1848707128580375e-07, + "logits/chosen": -0.7726311087608337, + "logits/rejected": -0.775484561920166, + "logps/chosen": -927.5745239257812, + "logps/rejected": -1105.466552734375, + "loss": 0.3525, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.482267379760742, + "rewards/margins": 1.805901050567627, + "rewards/rejected": -6.288168430328369, + "step": 761 + }, + { + "epoch": 0.4976732794513838, + "grad_norm": 16.777204352831102, + "learning_rate": 1.1826280836649409e-07, + "logits/chosen": -0.7363873720169067, + "logits/rejected": -0.6501478552818298, + "logps/chosen": -897.8953857421875, + "logps/rejected": -1098.8282470703125, + "loss": 0.401, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.353043556213379, + "rewards/margins": 1.7584478855133057, + "rewards/rejected": -6.1114912033081055, + "step": 762 + }, + { + "epoch": 0.4983263939913462, + "grad_norm": 24.636393241823914, + "learning_rate": 1.180384503868717e-07, + "logits/chosen": -0.7493852376937866, + "logits/rejected": -0.723925769329071, + "logps/chosen": -953.0501098632812, + "logps/rejected": -1146.88134765625, + "loss": 0.4202, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.643695831298828, + "rewards/margins": 2.2372117042541504, + "rewards/rejected": -6.8809075355529785, + "step": 763 + }, + { + "epoch": 0.4989795085313087, + "grad_norm": 19.058482760823487, + "learning_rate": 1.1781399851474931e-07, + "logits/chosen": -0.9877965450286865, + "logits/rejected": -0.9070134162902832, + "logps/chosen": -930.2177734375, + "logps/rejected": -1031.239501953125, + "loss": 0.4575, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.326931476593018, + "rewards/margins": 1.2556382417678833, + "rewards/rejected": -5.582569599151611, + "step": 764 + }, + { + "epoch": 0.4996326230712711, + "grad_norm": 16.17557117170361, + "learning_rate": 1.175894539184284e-07, + "logits/chosen": -0.9283071160316467, + "logits/rejected": -0.8941454887390137, + "logps/chosen": -929.899169921875, + "logps/rejected": -1084.6240234375, + "loss": 0.4407, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.056011199951172, + "rewards/margins": 1.622609257698059, + "rewards/rejected": -5.678620338439941, + "step": 765 + }, + { + "epoch": 0.5002857376112335, + "grad_norm": 23.91162098054986, + "learning_rate": 1.1736481776669305e-07, + "logits/chosen": -0.8212779760360718, + "logits/rejected": -0.8001962304115295, + "logps/chosen": -878.6630859375, + "logps/rejected": -988.41845703125, + "loss": 0.4294, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.468136310577393, + "rewards/margins": 1.1635451316833496, + "rewards/rejected": -5.6316819190979, + "step": 766 + }, + { + "epoch": 0.500938852151196, + "grad_norm": 27.218732486708504, + "learning_rate": 1.171400912288038e-07, + "logits/chosen": -0.8495869636535645, + "logits/rejected": -0.702357292175293, + "logps/chosen": -831.1826171875, + "logps/rejected": -921.25390625, + "loss": 0.4513, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.789102554321289, + "rewards/margins": 1.2873955965042114, + "rewards/rejected": -5.076498508453369, + "step": 767 + }, + { + "epoch": 0.5015919666911585, + "grad_norm": 20.90437936826142, + "learning_rate": 1.169152754744918e-07, + "logits/chosen": -0.7195597290992737, + "logits/rejected": -0.6627518534660339, + "logps/chosen": -953.954345703125, + "logps/rejected": -1121.614013671875, + "loss": 0.4843, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.850507736206055, + "rewards/margins": 1.3082853555679321, + "rewards/rejected": -6.1587934494018555, + "step": 768 + }, + { + "epoch": 0.5022450812311209, + "grad_norm": 13.808361407338461, + "learning_rate": 1.1669037167395254e-07, + "logits/chosen": -0.7798606157302856, + "logits/rejected": -0.7846609354019165, + "logps/chosen": -884.9368896484375, + "logps/rejected": -1020.4482421875, + "loss": 0.3611, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.367555618286133, + "rewards/margins": 1.3237645626068115, + "rewards/rejected": -5.691320419311523, + "step": 769 + }, + { + "epoch": 0.5028981957710833, + "grad_norm": 15.626358450737365, + "learning_rate": 1.164653809978398e-07, + "logits/chosen": -0.8251121640205383, + "logits/rejected": -0.8357376456260681, + "logps/chosen": -813.7039794921875, + "logps/rejected": -1000.0850830078125, + "loss": 0.3555, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.9221858978271484, + "rewards/margins": 1.5360300540924072, + "rewards/rejected": -5.458215713500977, + "step": 770 + }, + { + "epoch": 0.5035513103110458, + "grad_norm": 16.252415595810294, + "learning_rate": 1.1624030461725956e-07, + "logits/chosen": -0.6500051021575928, + "logits/rejected": -0.8026775121688843, + "logps/chosen": -844.816162109375, + "logps/rejected": -1006.4044799804688, + "loss": 0.3957, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9311153888702393, + "rewards/margins": 1.383944034576416, + "rewards/rejected": -5.315058708190918, + "step": 771 + }, + { + "epoch": 0.5042044248510082, + "grad_norm": 22.459666056011624, + "learning_rate": 1.1601514370376389e-07, + "logits/chosen": -0.5780275464057922, + "logits/rejected": -0.5432345867156982, + "logps/chosen": -926.1326904296875, + "logps/rejected": -1077.6756591796875, + "loss": 0.3715, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.724125862121582, + "rewards/margins": 1.5845980644226074, + "rewards/rejected": -6.3087239265441895, + "step": 772 + }, + { + "epoch": 0.5048575393909707, + "grad_norm": 23.163130085012735, + "learning_rate": 1.1578989942934488e-07, + "logits/chosen": -0.9420453310012817, + "logits/rejected": -0.7490870356559753, + "logps/chosen": -1003.92822265625, + "logps/rejected": -1117.533447265625, + "loss": 0.3998, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.345033168792725, + "rewards/margins": 1.4133516550064087, + "rewards/rejected": -6.75838565826416, + "step": 773 + }, + { + "epoch": 0.5055106539309331, + "grad_norm": 21.394993393127642, + "learning_rate": 1.1556457296642847e-07, + "logits/chosen": -0.7638024091720581, + "logits/rejected": -0.85248202085495, + "logps/chosen": -988.4688720703125, + "logps/rejected": -1274.0184326171875, + "loss": 0.4004, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.5560994148254395, + "rewards/margins": 2.0841054916381836, + "rewards/rejected": -7.640205383300781, + "step": 774 + }, + { + "epoch": 0.5061637684708956, + "grad_norm": 18.81833935631505, + "learning_rate": 1.1533916548786855e-07, + "logits/chosen": -0.8116979002952576, + "logits/rejected": -0.778827965259552, + "logps/chosen": -870.8009643554688, + "logps/rejected": -1005.8507080078125, + "loss": 0.4318, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.558528423309326, + "rewards/margins": 1.3536608219146729, + "rewards/rejected": -5.912189483642578, + "step": 775 + }, + { + "epoch": 0.506816883010858, + "grad_norm": 16.198840020111252, + "learning_rate": 1.1511367816694051e-07, + "logits/chosen": -0.7797116041183472, + "logits/rejected": -0.7683050036430359, + "logps/chosen": -939.5609741210938, + "logps/rejected": -1083.218994140625, + "loss": 0.4244, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.802543640136719, + "rewards/margins": 1.5838314294815063, + "rewards/rejected": -6.386374473571777, + "step": 776 + }, + { + "epoch": 0.5074699975508205, + "grad_norm": 18.701770107685814, + "learning_rate": 1.1488811217733549e-07, + "logits/chosen": -0.977311372756958, + "logits/rejected": -0.9297751188278198, + "logps/chosen": -917.08642578125, + "logps/rejected": -1083.6258544921875, + "loss": 0.3714, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.588201999664307, + "rewards/margins": 1.63888680934906, + "rewards/rejected": -6.227088928222656, + "step": 777 + }, + { + "epoch": 0.5081231120907829, + "grad_norm": 16.1923946331088, + "learning_rate": 1.1466246869315406e-07, + "logits/chosen": -1.0482027530670166, + "logits/rejected": -0.9430612325668335, + "logps/chosen": -1009.8977661132812, + "logps/rejected": -1169.45556640625, + "loss": 0.3695, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.51522970199585, + "rewards/margins": 1.5812039375305176, + "rewards/rejected": -7.096433639526367, + "step": 778 + }, + { + "epoch": 0.5087762266307454, + "grad_norm": 18.00057572408695, + "learning_rate": 1.1443674888890007e-07, + "logits/chosen": -0.7985199093818665, + "logits/rejected": -0.7626904249191284, + "logps/chosen": -925.5953979492188, + "logps/rejected": -1146.2274169921875, + "loss": 0.3768, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.9369001388549805, + "rewards/margins": 2.3211519718170166, + "rewards/rejected": -7.258052349090576, + "step": 779 + }, + { + "epoch": 0.5094293411707078, + "grad_norm": 15.115454811528636, + "learning_rate": 1.1421095393947478e-07, + "logits/chosen": -1.0339407920837402, + "logits/rejected": -0.9772629737854004, + "logps/chosen": -945.53662109375, + "logps/rejected": -1116.2100830078125, + "loss": 0.3776, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.286327838897705, + "rewards/margins": 1.8986492156982422, + "rewards/rejected": -6.184976577758789, + "step": 780 + }, + { + "epoch": 0.5100824557106702, + "grad_norm": 20.665297297366077, + "learning_rate": 1.1398508502017046e-07, + "logits/chosen": -0.8846578001976013, + "logits/rejected": -0.850710391998291, + "logps/chosen": -868.7977294921875, + "logps/rejected": -975.1135864257812, + "loss": 0.4766, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.497041702270508, + "rewards/margins": 1.3014904260635376, + "rewards/rejected": -5.798532485961914, + "step": 781 + }, + { + "epoch": 0.5107355702506327, + "grad_norm": 18.340311788522833, + "learning_rate": 1.1375914330666449e-07, + "logits/chosen": -0.87464439868927, + "logits/rejected": -0.9072346687316895, + "logps/chosen": -1002.248291015625, + "logps/rejected": -1136.2291259765625, + "loss": 0.4377, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.940828800201416, + "rewards/margins": 1.2929599285125732, + "rewards/rejected": -6.233788013458252, + "step": 782 + }, + { + "epoch": 0.5113886847905952, + "grad_norm": 17.06781548774511, + "learning_rate": 1.1353312997501312e-07, + "logits/chosen": -0.8003891110420227, + "logits/rejected": -0.630307674407959, + "logps/chosen": -902.5821533203125, + "logps/rejected": -1034.3380126953125, + "loss": 0.4738, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.8484323024749756, + "rewards/margins": 1.414013147354126, + "rewards/rejected": -5.262445449829102, + "step": 783 + }, + { + "epoch": 0.5120417993305576, + "grad_norm": 20.70545186944479, + "learning_rate": 1.1330704620164537e-07, + "logits/chosen": -0.901751697063446, + "logits/rejected": -0.9353833198547363, + "logps/chosen": -814.7222900390625, + "logps/rejected": -914.0795288085938, + "loss": 0.4576, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.048853397369385, + "rewards/margins": 0.867429256439209, + "rewards/rejected": -4.916282653808594, + "step": 784 + }, + { + "epoch": 0.51269491387052, + "grad_norm": 19.132193339224816, + "learning_rate": 1.1308089316335694e-07, + "logits/chosen": -0.7908254265785217, + "logits/rejected": -0.7592061161994934, + "logps/chosen": -898.7520751953125, + "logps/rejected": -1104.9476318359375, + "loss": 0.3427, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.353885650634766, + "rewards/margins": 1.955952763557434, + "rewards/rejected": -6.309837818145752, + "step": 785 + }, + { + "epoch": 0.5133480284104824, + "grad_norm": 19.766383065845503, + "learning_rate": 1.1285467203730403e-07, + "logits/chosen": -0.9346864819526672, + "logits/rejected": -0.9409589171409607, + "logps/chosen": -894.0338134765625, + "logps/rejected": -956.7890014648438, + "loss": 0.3966, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.301855087280273, + "rewards/margins": 0.8575952649116516, + "rewards/rejected": -5.159451007843018, + "step": 786 + }, + { + "epoch": 0.514001142950445, + "grad_norm": 23.39755783333431, + "learning_rate": 1.1262838400099733e-07, + "logits/chosen": -0.7014452815055847, + "logits/rejected": -0.716223418712616, + "logps/chosen": -890.46142578125, + "logps/rejected": -1055.3876953125, + "loss": 0.4387, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.431001663208008, + "rewards/margins": 1.5780439376831055, + "rewards/rejected": -6.009045600891113, + "step": 787 + }, + { + "epoch": 0.5146542574904074, + "grad_norm": 21.934271494714906, + "learning_rate": 1.1240203023229568e-07, + "logits/chosen": -0.829107403755188, + "logits/rejected": -0.712059497833252, + "logps/chosen": -971.6460571289062, + "logps/rejected": -1099.4808349609375, + "loss": 0.4365, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.3892669677734375, + "rewards/margins": 1.1161062717437744, + "rewards/rejected": -5.505373477935791, + "step": 788 + }, + { + "epoch": 0.5153073720303698, + "grad_norm": 19.574727119246504, + "learning_rate": 1.121756119094002e-07, + "logits/chosen": -1.0405265092849731, + "logits/rejected": -0.9297416806221008, + "logps/chosen": -885.945556640625, + "logps/rejected": -1020.0201416015625, + "loss": 0.4007, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.058424949645996, + "rewards/margins": 1.41481351852417, + "rewards/rejected": -5.473237991333008, + "step": 789 + }, + { + "epoch": 0.5159604865703322, + "grad_norm": 18.852117751022757, + "learning_rate": 1.1194913021084789e-07, + "logits/chosen": -0.6187543272972107, + "logits/rejected": -0.7156034111976624, + "logps/chosen": -872.1130981445312, + "logps/rejected": -1123.46875, + "loss": 0.447, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.998103618621826, + "rewards/margins": 1.690637469291687, + "rewards/rejected": -5.688741683959961, + "step": 790 + }, + { + "epoch": 0.5166136011102948, + "grad_norm": 32.281580806874125, + "learning_rate": 1.1172258631550571e-07, + "logits/chosen": -0.7386258244514465, + "logits/rejected": -0.647053599357605, + "logps/chosen": -860.7993774414062, + "logps/rejected": -970.7279052734375, + "loss": 0.4532, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.5283761024475098, + "rewards/margins": 1.3366488218307495, + "rewards/rejected": -4.865025043487549, + "step": 791 + }, + { + "epoch": 0.5172667156502572, + "grad_norm": 26.745299451857544, + "learning_rate": 1.1149598140256435e-07, + "logits/chosen": -0.8861040472984314, + "logits/rejected": -0.8817246556282043, + "logps/chosen": -845.762451171875, + "logps/rejected": -937.815185546875, + "loss": 0.4173, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.8758373260498047, + "rewards/margins": 0.9351273775100708, + "rewards/rejected": -4.810964584350586, + "step": 792 + }, + { + "epoch": 0.5179198301902196, + "grad_norm": 11.575282148376893, + "learning_rate": 1.1126931665153212e-07, + "logits/chosen": -0.7514525651931763, + "logits/rejected": -0.7448426485061646, + "logps/chosen": -910.6206665039062, + "logps/rejected": -1089.5885009765625, + "loss": 0.3624, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.288670539855957, + "rewards/margins": 1.8873575925827026, + "rewards/rejected": -6.176028251647949, + "step": 793 + }, + { + "epoch": 0.518572944730182, + "grad_norm": 16.52311325074282, + "learning_rate": 1.1104259324222875e-07, + "logits/chosen": -0.9778417944908142, + "logits/rejected": -0.9302371144294739, + "logps/chosen": -965.8187255859375, + "logps/rejected": -1182.9871826171875, + "loss": 0.3429, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.435500144958496, + "rewards/margins": 2.29146671295166, + "rewards/rejected": -6.7269673347473145, + "step": 794 + }, + { + "epoch": 0.5192260592701445, + "grad_norm": 17.244378755919463, + "learning_rate": 1.1081581235477935e-07, + "logits/chosen": -0.9682803153991699, + "logits/rejected": -0.9343181848526001, + "logps/chosen": -969.009765625, + "logps/rejected": -1048.1226806640625, + "loss": 0.4333, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.81222677230835, + "rewards/margins": 1.035292148590088, + "rewards/rejected": -5.847518444061279, + "step": 795 + }, + { + "epoch": 0.519879173810107, + "grad_norm": 16.66478304693279, + "learning_rate": 1.1058897516960814e-07, + "logits/chosen": -1.0090668201446533, + "logits/rejected": -0.9655574560165405, + "logps/chosen": -897.3173217773438, + "logps/rejected": -1005.6980590820312, + "loss": 0.3305, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.381283283233643, + "rewards/margins": 1.1640375852584839, + "rewards/rejected": -5.545321464538574, + "step": 796 + }, + { + "epoch": 0.5205322883500694, + "grad_norm": 20.808507419653285, + "learning_rate": 1.1036208286743245e-07, + "logits/chosen": -0.7843279242515564, + "logits/rejected": -0.7574375867843628, + "logps/chosen": -872.26416015625, + "logps/rejected": -1070.68798828125, + "loss": 0.4145, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.312079429626465, + "rewards/margins": 1.5701541900634766, + "rewards/rejected": -5.882233619689941, + "step": 797 + }, + { + "epoch": 0.5211854028900318, + "grad_norm": 20.86240755584441, + "learning_rate": 1.1013513662925647e-07, + "logits/chosen": -0.7959500551223755, + "logits/rejected": -0.8046637773513794, + "logps/chosen": -928.501953125, + "logps/rejected": -1132.994140625, + "loss": 0.4094, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.8640923500061035, + "rewards/margins": 1.7896808385849, + "rewards/rejected": -6.653773307800293, + "step": 798 + }, + { + "epoch": 0.5218385174299943, + "grad_norm": 17.250116169588317, + "learning_rate": 1.0990813763636511e-07, + "logits/chosen": -0.9496831297874451, + "logits/rejected": -0.8449395895004272, + "logps/chosen": -855.71630859375, + "logps/rejected": -891.7100219726562, + "loss": 0.4311, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.203281879425049, + "rewards/margins": 0.8583158254623413, + "rewards/rejected": -5.061598300933838, + "step": 799 + }, + { + "epoch": 0.5224916319699567, + "grad_norm": 23.966712460917016, + "learning_rate": 1.0968108707031791e-07, + "logits/chosen": -0.9248429536819458, + "logits/rejected": -0.8824765682220459, + "logps/chosen": -1019.092041015625, + "logps/rejected": -1253.128662109375, + "loss": 0.35, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.74540376663208, + "rewards/margins": 1.9570448398590088, + "rewards/rejected": -6.70244836807251, + "step": 800 + }, + { + "epoch": 0.5224916319699567, + "eval_logits/chosen": -0.6346727013587952, + "eval_logits/rejected": -0.5778316855430603, + "eval_logps/chosen": -945.1963500976562, + "eval_logps/rejected": -1081.7843017578125, + "eval_loss": 0.418577641248703, + "eval_rewards/accuracies": 0.8009999990463257, + "eval_rewards/chosen": -4.712891578674316, + "eval_rewards/margins": 1.4532641172409058, + "eval_rewards/rejected": -6.16615629196167, + "eval_runtime": 620.2119, + "eval_samples_per_second": 6.449, + "eval_steps_per_second": 0.403, + "step": 800 + }, + { + "epoch": 0.5231447465099192, + "grad_norm": 26.85233093827759, + "learning_rate": 1.0945398611294285e-07, + "logits/chosen": -0.7998533248901367, + "logits/rejected": -0.5648460388183594, + "logps/chosen": -1003.441162109375, + "logps/rejected": -1087.8489990234375, + "loss": 0.4996, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.341720104217529, + "rewards/margins": 1.3586604595184326, + "rewards/rejected": -6.700381278991699, + "step": 801 + }, + { + "epoch": 0.5237978610498816, + "grad_norm": 17.38645527294449, + "learning_rate": 1.092268359463302e-07, + "logits/chosen": -0.7134398818016052, + "logits/rejected": -0.6134901642799377, + "logps/chosen": -907.7614135742188, + "logps/rejected": -1068.71337890625, + "loss": 0.4722, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.564774990081787, + "rewards/margins": 1.775583267211914, + "rewards/rejected": -6.340358734130859, + "step": 802 + }, + { + "epoch": 0.5244509755898441, + "grad_norm": 19.71290467024556, + "learning_rate": 1.0899963775282634e-07, + "logits/chosen": -0.8641142249107361, + "logits/rejected": -0.897304892539978, + "logps/chosen": -932.3726196289062, + "logps/rejected": -1118.64599609375, + "loss": 0.4241, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.4526777267456055, + "rewards/margins": 1.5689259767532349, + "rewards/rejected": -6.021603584289551, + "step": 803 + }, + { + "epoch": 0.5251040901298065, + "grad_norm": 18.33626618763272, + "learning_rate": 1.0877239271502772e-07, + "logits/chosen": -0.8803704977035522, + "logits/rejected": -0.6588385105133057, + "logps/chosen": -873.22021484375, + "logps/rejected": -1023.843994140625, + "loss": 0.3665, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.044747352600098, + "rewards/margins": 1.8748221397399902, + "rewards/rejected": -5.919569492340088, + "step": 804 + }, + { + "epoch": 0.5257572046697689, + "grad_norm": 17.680349628821233, + "learning_rate": 1.0854510201577449e-07, + "logits/chosen": -0.8897794485092163, + "logits/rejected": -0.9353486895561218, + "logps/chosen": -917.898193359375, + "logps/rejected": -1087.0511474609375, + "loss": 0.4349, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.276698589324951, + "rewards/margins": 1.452713131904602, + "rewards/rejected": -5.729411602020264, + "step": 805 + }, + { + "epoch": 0.5264103192097314, + "grad_norm": 21.11052512501825, + "learning_rate": 1.0831776683814464e-07, + "logits/chosen": -0.9259909987449646, + "logits/rejected": -0.8548303842544556, + "logps/chosen": -880.1243286132812, + "logps/rejected": -971.2887573242188, + "loss": 0.4331, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.141092300415039, + "rewards/margins": 1.3644092082977295, + "rewards/rejected": -5.5055012702941895, + "step": 806 + }, + { + "epoch": 0.5270634337496939, + "grad_norm": 19.97601485174499, + "learning_rate": 1.0809038836544751e-07, + "logits/chosen": -0.7640643119812012, + "logits/rejected": -0.7135207056999207, + "logps/chosen": -939.5198974609375, + "logps/rejected": -1051.8248291015625, + "loss": 0.4668, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.619152069091797, + "rewards/margins": 1.2952475547790527, + "rewards/rejected": -5.914400100708008, + "step": 807 + }, + { + "epoch": 0.5277165482896563, + "grad_norm": 23.39884997243589, + "learning_rate": 1.0786296778121786e-07, + "logits/chosen": -0.6651238203048706, + "logits/rejected": -0.6047101020812988, + "logps/chosen": -918.5106201171875, + "logps/rejected": -1037.3143310546875, + "loss": 0.3461, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.469816207885742, + "rewards/margins": 1.4443023204803467, + "rewards/rejected": -5.914118766784668, + "step": 808 + }, + { + "epoch": 0.5283696628296187, + "grad_norm": 22.14266673501406, + "learning_rate": 1.0763550626920972e-07, + "logits/chosen": -0.8417829275131226, + "logits/rejected": -0.6736387014389038, + "logps/chosen": -863.1395874023438, + "logps/rejected": -891.3818359375, + "loss": 0.4795, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.130784034729004, + "rewards/margins": 0.5567496418952942, + "rewards/rejected": -4.687533378601074, + "step": 809 + }, + { + "epoch": 0.5290227773695811, + "grad_norm": 16.303972428695925, + "learning_rate": 1.0740800501339007e-07, + "logits/chosen": -0.7094486951828003, + "logits/rejected": -0.6242579221725464, + "logps/chosen": -942.656005859375, + "logps/rejected": -1013.3216552734375, + "loss": 0.4109, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.7234578132629395, + "rewards/margins": 1.2442445755004883, + "rewards/rejected": -5.967702865600586, + "step": 810 + }, + { + "epoch": 0.5296758919095437, + "grad_norm": 14.747642988422188, + "learning_rate": 1.0718046519793276e-07, + "logits/chosen": -0.995162308216095, + "logits/rejected": -0.9179979562759399, + "logps/chosen": -830.9282836914062, + "logps/rejected": -976.679931640625, + "loss": 0.4707, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.731590986251831, + "rewards/margins": 1.6209721565246582, + "rewards/rejected": -5.35256290435791, + "step": 811 + }, + { + "epoch": 0.5303290064495061, + "grad_norm": 15.35127575032279, + "learning_rate": 1.0695288800721239e-07, + "logits/chosen": -0.7637011408805847, + "logits/rejected": -0.7296819090843201, + "logps/chosen": -851.900146484375, + "logps/rejected": -968.2127685546875, + "loss": 0.4265, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.94958758354187, + "rewards/margins": 1.1997019052505493, + "rewards/rejected": -5.149289131164551, + "step": 812 + }, + { + "epoch": 0.5309821209894685, + "grad_norm": 13.572049988213855, + "learning_rate": 1.0672527462579808e-07, + "logits/chosen": -1.0328404903411865, + "logits/rejected": -0.9988901019096375, + "logps/chosen": -1081.924560546875, + "logps/rejected": -1306.794189453125, + "loss": 0.4245, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.272036552429199, + "rewards/margins": 2.137697219848633, + "rewards/rejected": -7.409733772277832, + "step": 813 + }, + { + "epoch": 0.5316352355294309, + "grad_norm": 19.156616594792975, + "learning_rate": 1.0649762623844732e-07, + "logits/chosen": -0.9236339330673218, + "logits/rejected": -0.8460508584976196, + "logps/chosen": -954.2943115234375, + "logps/rejected": -1075.7880859375, + "loss": 0.3284, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.132830619812012, + "rewards/margins": 1.6366045475006104, + "rewards/rejected": -5.769435405731201, + "step": 814 + }, + { + "epoch": 0.5322883500693935, + "grad_norm": 23.48249212079967, + "learning_rate": 1.0626994403009984e-07, + "logits/chosen": -0.8499413728713989, + "logits/rejected": -0.8164854645729065, + "logps/chosen": -830.4996337890625, + "logps/rejected": -956.6808471679688, + "loss": 0.4009, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.115767478942871, + "rewards/margins": 1.3039559125900269, + "rewards/rejected": -5.419723033905029, + "step": 815 + }, + { + "epoch": 0.5329414646093559, + "grad_norm": 16.588524539188985, + "learning_rate": 1.0604222918587138e-07, + "logits/chosen": -0.8046386241912842, + "logits/rejected": -0.8380962610244751, + "logps/chosen": -883.6956787109375, + "logps/rejected": -1103.479248046875, + "loss": 0.3659, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.380067825317383, + "rewards/margins": 1.518940806388855, + "rewards/rejected": -5.899008750915527, + "step": 816 + }, + { + "epoch": 0.5335945791493183, + "grad_norm": 37.47838079132424, + "learning_rate": 1.0581448289104757e-07, + "logits/chosen": -0.8544302582740784, + "logits/rejected": -0.8431679010391235, + "logps/chosen": -943.7468872070312, + "logps/rejected": -1091.618896484375, + "loss": 0.4921, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.296459674835205, + "rewards/margins": 1.5792046785354614, + "rewards/rejected": -5.875664234161377, + "step": 817 + }, + { + "epoch": 0.5342476936892807, + "grad_norm": 17.35937964327749, + "learning_rate": 1.0558670633107778e-07, + "logits/chosen": -0.8644323348999023, + "logits/rejected": -0.7818232774734497, + "logps/chosen": -911.686279296875, + "logps/rejected": -1052.9893798828125, + "loss": 0.3812, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.129489898681641, + "rewards/margins": 1.676690697669983, + "rewards/rejected": -5.806180477142334, + "step": 818 + }, + { + "epoch": 0.5349008082292432, + "grad_norm": 22.2328867221953, + "learning_rate": 1.0535890069156883e-07, + "logits/chosen": -0.7671269178390503, + "logits/rejected": -0.7240158319473267, + "logps/chosen": -901.3548583984375, + "logps/rejected": -1148.04248046875, + "loss": 0.3978, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.508139133453369, + "rewards/margins": 1.8734978437423706, + "rewards/rejected": -6.381637096405029, + "step": 819 + }, + { + "epoch": 0.5355539227692057, + "grad_norm": 18.971039193243993, + "learning_rate": 1.0513106715827895e-07, + "logits/chosen": -0.9428325891494751, + "logits/rejected": -0.9358526468276978, + "logps/chosen": -1081.2198486328125, + "logps/rejected": -1319.913330078125, + "loss": 0.4345, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.017473220825195, + "rewards/margins": 1.9838080406188965, + "rewards/rejected": -8.00128173828125, + "step": 820 + }, + { + "epoch": 0.5362070373091681, + "grad_norm": 20.059049388946285, + "learning_rate": 1.0490320691711161e-07, + "logits/chosen": -0.8310408592224121, + "logits/rejected": -0.679469108581543, + "logps/chosen": -968.2921752929688, + "logps/rejected": -1083.9666748046875, + "loss": 0.4387, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.922040939331055, + "rewards/margins": 1.4484070539474487, + "rewards/rejected": -6.370449066162109, + "step": 821 + }, + { + "epoch": 0.5368601518491305, + "grad_norm": 16.817851734320218, + "learning_rate": 1.046753211541092e-07, + "logits/chosen": -0.7916433811187744, + "logits/rejected": -0.8427475094795227, + "logps/chosen": -929.83447265625, + "logps/rejected": -1193.4378662109375, + "loss": 0.3673, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.488502502441406, + "rewards/margins": 2.309746265411377, + "rewards/rejected": -6.798249244689941, + "step": 822 + }, + { + "epoch": 0.537513266389093, + "grad_norm": 17.875343156058488, + "learning_rate": 1.0444741105544703e-07, + "logits/chosen": -0.7578955888748169, + "logits/rejected": -0.6925256252288818, + "logps/chosen": -952.1182861328125, + "logps/rejected": -1144.278076171875, + "loss": 0.411, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.83486270904541, + "rewards/margins": 1.6570872068405151, + "rewards/rejected": -6.491950035095215, + "step": 823 + }, + { + "epoch": 0.5381663809290554, + "grad_norm": 18.053703232762427, + "learning_rate": 1.0421947780742703e-07, + "logits/chosen": -0.8508166670799255, + "logits/rejected": -0.7643336653709412, + "logps/chosen": -934.5198974609375, + "logps/rejected": -1070.933349609375, + "loss": 0.3955, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.615598678588867, + "rewards/margins": 1.5782887935638428, + "rewards/rejected": -6.193887233734131, + "step": 824 + }, + { + "epoch": 0.5388194954690179, + "grad_norm": 19.898275287550767, + "learning_rate": 1.0399152259647168e-07, + "logits/chosen": -0.894473671913147, + "logits/rejected": -0.8692827224731445, + "logps/chosen": -901.361328125, + "logps/rejected": -1145.5904541015625, + "loss": 0.4026, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.9130971431732178, + "rewards/margins": 2.1608455181121826, + "rewards/rejected": -6.0739426612854, + "step": 825 + }, + { + "epoch": 0.5394726100089803, + "grad_norm": 62.75164418514141, + "learning_rate": 1.0376354660911771e-07, + "logits/chosen": -0.9447248578071594, + "logits/rejected": -0.8414398431777954, + "logps/chosen": -960.39453125, + "logps/rejected": -1077.34814453125, + "loss": 0.3594, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.909175395965576, + "rewards/margins": 1.574775218963623, + "rewards/rejected": -6.483950614929199, + "step": 826 + }, + { + "epoch": 0.5401257245489428, + "grad_norm": 21.232910166545164, + "learning_rate": 1.0353555103201006e-07, + "logits/chosen": -0.8699998259544373, + "logits/rejected": -0.9011096358299255, + "logps/chosen": -966.4214477539062, + "logps/rejected": -1145.947021484375, + "loss": 0.3985, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.497706890106201, + "rewards/margins": 1.7452514171600342, + "rewards/rejected": -6.242958068847656, + "step": 827 + }, + { + "epoch": 0.5407788390889052, + "grad_norm": 25.240205454629475, + "learning_rate": 1.0330753705189561e-07, + "logits/chosen": -0.6747693419456482, + "logits/rejected": -0.7031739354133606, + "logps/chosen": -914.111083984375, + "logps/rejected": -1141.014892578125, + "loss": 0.4238, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.844748497009277, + "rewards/margins": 1.6366426944732666, + "rewards/rejected": -6.481391429901123, + "step": 828 + }, + { + "epoch": 0.5414319536288676, + "grad_norm": 18.613133740822978, + "learning_rate": 1.0307950585561704e-07, + "logits/chosen": -0.7960176467895508, + "logits/rejected": -0.5775362253189087, + "logps/chosen": -913.5531616210938, + "logps/rejected": -983.9921875, + "loss": 0.4677, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.511989593505859, + "rewards/margins": 1.0901737213134766, + "rewards/rejected": -5.602163314819336, + "step": 829 + }, + { + "epoch": 0.5420850681688301, + "grad_norm": 14.709016520023226, + "learning_rate": 1.028514586301066e-07, + "logits/chosen": -0.9919070601463318, + "logits/rejected": -0.8352288007736206, + "logps/chosen": -917.8429565429688, + "logps/rejected": -978.2846069335938, + "loss": 0.4225, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.310373306274414, + "rewards/margins": 1.0810041427612305, + "rewards/rejected": -5.391376972198486, + "step": 830 + }, + { + "epoch": 0.5427381827087926, + "grad_norm": 16.588064198046702, + "learning_rate": 1.0262339656238003e-07, + "logits/chosen": -0.7326082587242126, + "logits/rejected": -0.7825813889503479, + "logps/chosen": -828.0997314453125, + "logps/rejected": -912.7554931640625, + "loss": 0.399, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.009592056274414, + "rewards/margins": 0.8472993969917297, + "rewards/rejected": -4.856891632080078, + "step": 831 + }, + { + "epoch": 0.543391297248755, + "grad_norm": 20.963473622896114, + "learning_rate": 1.0239532083953031e-07, + "logits/chosen": -0.7908737659454346, + "logits/rejected": -0.7778758406639099, + "logps/chosen": -886.2369995117188, + "logps/rejected": -959.3201904296875, + "loss": 0.4719, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.217853546142578, + "rewards/margins": 0.9841310381889343, + "rewards/rejected": -5.201984405517578, + "step": 832 + }, + { + "epoch": 0.5440444117887174, + "grad_norm": 22.538640197716454, + "learning_rate": 1.0216723264872145e-07, + "logits/chosen": -0.633493185043335, + "logits/rejected": -0.6329817771911621, + "logps/chosen": -1038.0047607421875, + "logps/rejected": -1209.1646728515625, + "loss": 0.3923, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.912166595458984, + "rewards/margins": 1.709618330001831, + "rewards/rejected": -6.6217851638793945, + "step": 833 + }, + { + "epoch": 0.5446975263286798, + "grad_norm": 17.514305805925105, + "learning_rate": 1.0193913317718243e-07, + "logits/chosen": -0.8764938116073608, + "logits/rejected": -0.9298416376113892, + "logps/chosen": -828.791748046875, + "logps/rejected": -1008.34912109375, + "loss": 0.4711, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.27457857131958, + "rewards/margins": 1.551622986793518, + "rewards/rejected": -5.826201438903809, + "step": 834 + }, + { + "epoch": 0.5453506408686424, + "grad_norm": 14.456609289815074, + "learning_rate": 1.0171102361220092e-07, + "logits/chosen": -0.9871069192886353, + "logits/rejected": -0.9372472167015076, + "logps/chosen": -894.3827514648438, + "logps/rejected": -1032.8778076171875, + "loss": 0.4152, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.961792469024658, + "rewards/margins": 1.1698249578475952, + "rewards/rejected": -5.131617069244385, + "step": 835 + }, + { + "epoch": 0.5460037554086048, + "grad_norm": 17.467326646690836, + "learning_rate": 1.014829051411171e-07, + "logits/chosen": -0.7733380794525146, + "logits/rejected": -0.7617398500442505, + "logps/chosen": -855.3818969726562, + "logps/rejected": -965.3425903320312, + "loss": 0.4027, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.709343910217285, + "rewards/margins": 1.39662504196167, + "rewards/rejected": -5.105968952178955, + "step": 836 + }, + { + "epoch": 0.5466568699485672, + "grad_norm": 17.238889166022133, + "learning_rate": 1.0125477895131756e-07, + "logits/chosen": -0.7189229130744934, + "logits/rejected": -0.7574179172515869, + "logps/chosen": -897.20361328125, + "logps/rejected": -1149.623291015625, + "loss": 0.3897, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.643250465393066, + "rewards/margins": 1.7560412883758545, + "rewards/rejected": -6.399292469024658, + "step": 837 + }, + { + "epoch": 0.5473099844885296, + "grad_norm": 27.297747074907825, + "learning_rate": 1.0102664623022899e-07, + "logits/chosen": -0.8005959391593933, + "logits/rejected": -0.8152067065238953, + "logps/chosen": -887.7992553710938, + "logps/rejected": -983.334716796875, + "loss": 0.5078, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.507983207702637, + "rewards/margins": 1.1464667320251465, + "rewards/rejected": -5.654450416564941, + "step": 838 + }, + { + "epoch": 0.5479630990284922, + "grad_norm": 19.71863457458606, + "learning_rate": 1.0079850816531214e-07, + "logits/chosen": -0.9236647486686707, + "logits/rejected": -0.9478015899658203, + "logps/chosen": -817.2036743164062, + "logps/rejected": -914.030517578125, + "loss": 0.3951, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.605985164642334, + "rewards/margins": 1.0325517654418945, + "rewards/rejected": -4.6385369300842285, + "step": 839 + }, + { + "epoch": 0.5486162135684546, + "grad_norm": 14.089508088400418, + "learning_rate": 1.0057036594405562e-07, + "logits/chosen": -0.8286216259002686, + "logits/rejected": -0.8908025026321411, + "logps/chosen": -869.789306640625, + "logps/rejected": -1080.75537109375, + "loss": 0.4289, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.9381942749023438, + "rewards/margins": 1.8179608583450317, + "rewards/rejected": -5.756155014038086, + "step": 840 + }, + { + "epoch": 0.549269328108417, + "grad_norm": 20.274743213765152, + "learning_rate": 1.0034222075396952e-07, + "logits/chosen": -0.8915444016456604, + "logits/rejected": -0.8345527052879333, + "logps/chosen": -882.3980712890625, + "logps/rejected": -1007.1998291015625, + "loss": 0.4294, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.207620143890381, + "rewards/margins": 1.7206785678863525, + "rewards/rejected": -5.928298473358154, + "step": 841 + }, + { + "epoch": 0.5499224426483794, + "grad_norm": 13.551649707228977, + "learning_rate": 1.001140737825795e-07, + "logits/chosen": -1.013258695602417, + "logits/rejected": -0.9069733023643494, + "logps/chosen": -881.470947265625, + "logps/rejected": -1054.0950927734375, + "loss": 0.333, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.0731520652771, + "rewards/margins": 1.6127002239227295, + "rewards/rejected": -5.68585205078125, + "step": 842 + }, + { + "epoch": 0.550575557188342, + "grad_norm": 18.23035900921133, + "learning_rate": 9.988592621742053e-08, + "logits/chosen": -0.8569754958152771, + "logits/rejected": -0.7348583340644836, + "logps/chosen": -887.21044921875, + "logps/rejected": -1064.620849609375, + "loss": 0.3786, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.200411796569824, + "rewards/margins": 1.6168982982635498, + "rewards/rejected": -5.817309856414795, + "step": 843 + }, + { + "epoch": 0.5512286717283044, + "grad_norm": 25.70182139027448, + "learning_rate": 9.965777924603051e-08, + "logits/chosen": -0.907223105430603, + "logits/rejected": -0.9123849868774414, + "logps/chosen": -909.4808959960938, + "logps/rejected": -1078.512939453125, + "loss": 0.4288, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.969564437866211, + "rewards/margins": 1.5746479034423828, + "rewards/rejected": -6.544212341308594, + "step": 844 + }, + { + "epoch": 0.5518817862682668, + "grad_norm": 30.303009196449445, + "learning_rate": 9.942963405594442e-08, + "logits/chosen": -0.9817805290222168, + "logits/rejected": -0.8983163833618164, + "logps/chosen": -1022.9886474609375, + "logps/rejected": -1152.95849609375, + "loss": 0.4492, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.9740095138549805, + "rewards/margins": 1.571298360824585, + "rewards/rejected": -6.545307636260986, + "step": 845 + }, + { + "epoch": 0.5525349008082292, + "grad_norm": 31.55977335805332, + "learning_rate": 9.920149183468785e-08, + "logits/chosen": -0.7902059555053711, + "logits/rejected": -0.7529444694519043, + "logps/chosen": -986.4070434570312, + "logps/rejected": -1099.160888671875, + "loss": 0.4789, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.369345188140869, + "rewards/margins": 1.2646675109863281, + "rewards/rejected": -6.6340131759643555, + "step": 846 + }, + { + "epoch": 0.5531880153481917, + "grad_norm": 27.81516473825978, + "learning_rate": 9.897335376977101e-08, + "logits/chosen": -0.8680393099784851, + "logits/rejected": -0.8680979609489441, + "logps/chosen": -784.2938232421875, + "logps/rejected": -914.517333984375, + "loss": 0.3675, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.907463550567627, + "rewards/margins": 1.699120044708252, + "rewards/rejected": -5.606583595275879, + "step": 847 + }, + { + "epoch": 0.5538411298881541, + "grad_norm": 21.703345546628697, + "learning_rate": 9.874522104868246e-08, + "logits/chosen": -0.8262298703193665, + "logits/rejected": -0.8438596725463867, + "logps/chosen": -984.2900390625, + "logps/rejected": -1112.5946044921875, + "loss": 0.3453, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.441863536834717, + "rewards/margins": 1.5138037204742432, + "rewards/rejected": -5.955667972564697, + "step": 848 + }, + { + "epoch": 0.5544942444281166, + "grad_norm": 16.37096708792487, + "learning_rate": 9.85170948588829e-08, + "logits/chosen": -0.8089182376861572, + "logits/rejected": -0.7880966663360596, + "logps/chosen": -907.8336791992188, + "logps/rejected": -1043.2205810546875, + "loss": 0.4129, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.8086628913879395, + "rewards/margins": 1.415217638015747, + "rewards/rejected": -6.223879814147949, + "step": 849 + }, + { + "epoch": 0.555147358968079, + "grad_norm": 19.64590736722594, + "learning_rate": 9.828897638779909e-08, + "logits/chosen": -0.8265779614448547, + "logits/rejected": -0.7707447409629822, + "logps/chosen": -885.2160034179688, + "logps/rejected": -1009.6092529296875, + "loss": 0.4157, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.476744174957275, + "rewards/margins": 1.478574514389038, + "rewards/rejected": -5.955318927764893, + "step": 850 + }, + { + "epoch": 0.5558004735080415, + "grad_norm": 16.589320867103968, + "learning_rate": 9.806086682281757e-08, + "logits/chosen": -0.9139161109924316, + "logits/rejected": -0.8683348298072815, + "logps/chosen": -938.354248046875, + "logps/rejected": -1108.0430908203125, + "loss": 0.4426, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.8656134605407715, + "rewards/margins": 1.4425888061523438, + "rewards/rejected": -6.308202266693115, + "step": 851 + }, + { + "epoch": 0.5564535880480039, + "grad_norm": 29.62901282273577, + "learning_rate": 9.783276735127854e-08, + "logits/chosen": -0.8697648644447327, + "logits/rejected": -0.9227592945098877, + "logps/chosen": -899.2152099609375, + "logps/rejected": -1116.580810546875, + "loss": 0.4923, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.2547454833984375, + "rewards/margins": 1.4750183820724487, + "rewards/rejected": -5.729763984680176, + "step": 852 + }, + { + "epoch": 0.5571067025879664, + "grad_norm": 30.236006209514965, + "learning_rate": 9.760467916046971e-08, + "logits/chosen": -0.7272669076919556, + "logits/rejected": -0.7988542318344116, + "logps/chosen": -901.4462890625, + "logps/rejected": -1169.47900390625, + "loss": 0.3836, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.271294593811035, + "rewards/margins": 2.1608831882476807, + "rewards/rejected": -6.4321770668029785, + "step": 853 + }, + { + "epoch": 0.5577598171279288, + "grad_norm": 14.287731139104126, + "learning_rate": 9.737660343761998e-08, + "logits/chosen": -0.7816422581672668, + "logits/rejected": -0.7482393383979797, + "logps/chosen": -966.7650756835938, + "logps/rejected": -1164.88818359375, + "loss": 0.3908, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.111935615539551, + "rewards/margins": 1.6342039108276367, + "rewards/rejected": -6.7461395263671875, + "step": 854 + }, + { + "epoch": 0.5584129316678913, + "grad_norm": 22.423108939531772, + "learning_rate": 9.71485413698934e-08, + "logits/chosen": -0.8687413930892944, + "logits/rejected": -0.8252356052398682, + "logps/chosen": -926.5127563476562, + "logps/rejected": -1134.5728759765625, + "loss": 0.4262, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.845583438873291, + "rewards/margins": 1.908933401107788, + "rewards/rejected": -6.754517078399658, + "step": 855 + }, + { + "epoch": 0.5590660462078537, + "grad_norm": 33.852533096509646, + "learning_rate": 9.692049414438298e-08, + "logits/chosen": -0.7711067795753479, + "logits/rejected": -0.7694792747497559, + "logps/chosen": -851.6452026367188, + "logps/rejected": -1003.3096313476562, + "loss": 0.4085, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.205625057220459, + "rewards/margins": 1.1972630023956299, + "rewards/rejected": -5.402888298034668, + "step": 856 + }, + { + "epoch": 0.5597191607478161, + "grad_norm": 20.80420263521599, + "learning_rate": 9.66924629481044e-08, + "logits/chosen": -0.6256682872772217, + "logits/rejected": -0.6322759985923767, + "logps/chosen": -906.5489501953125, + "logps/rejected": -1021.0867309570312, + "loss": 0.4294, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.7905402183532715, + "rewards/margins": 1.0824882984161377, + "rewards/rejected": -5.87302827835083, + "step": 857 + }, + { + "epoch": 0.5603722752877786, + "grad_norm": 32.72902729562608, + "learning_rate": 9.646444896798995e-08, + "logits/chosen": -0.7554073333740234, + "logits/rejected": -0.7180957198143005, + "logps/chosen": -828.9449462890625, + "logps/rejected": -949.5023193359375, + "loss": 0.4937, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.3675971031188965, + "rewards/margins": 1.031946063041687, + "rewards/rejected": -5.399543762207031, + "step": 858 + }, + { + "epoch": 0.5610253898277411, + "grad_norm": 18.554744408873283, + "learning_rate": 9.623645339088228e-08, + "logits/chosen": -0.9316878318786621, + "logits/rejected": -0.744033932685852, + "logps/chosen": -991.4241333007812, + "logps/rejected": -1106.7291259765625, + "loss": 0.435, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.765175819396973, + "rewards/margins": 1.3670111894607544, + "rewards/rejected": -6.1321868896484375, + "step": 859 + }, + { + "epoch": 0.5616785043677035, + "grad_norm": 19.31098701896201, + "learning_rate": 9.600847740352833e-08, + "logits/chosen": -0.8235379457473755, + "logits/rejected": -0.8043882846832275, + "logps/chosen": -1014.3311157226562, + "logps/rejected": -1210.599853515625, + "loss": 0.4316, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.12346887588501, + "rewards/margins": 1.4316790103912354, + "rewards/rejected": -6.555147647857666, + "step": 860 + }, + { + "epoch": 0.5623316189076659, + "grad_norm": 19.831850467702388, + "learning_rate": 9.578052219257297e-08, + "logits/chosen": -0.874383807182312, + "logits/rejected": -0.8897905349731445, + "logps/chosen": -888.3280639648438, + "logps/rejected": -1030.50244140625, + "loss": 0.3993, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.6704063415527344, + "rewards/margins": 1.5759150981903076, + "rewards/rejected": -5.246321678161621, + "step": 861 + }, + { + "epoch": 0.5629847334476283, + "grad_norm": 25.583440042617287, + "learning_rate": 9.555258894455297e-08, + "logits/chosen": -0.9648239612579346, + "logits/rejected": -0.9095680713653564, + "logps/chosen": -939.7294921875, + "logps/rejected": -1098.48388671875, + "loss": 0.4398, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.5690765380859375, + "rewards/margins": 1.7343909740447998, + "rewards/rejected": -6.303468227386475, + "step": 862 + }, + { + "epoch": 0.5636378479875909, + "grad_norm": 23.956822440993626, + "learning_rate": 9.532467884589079e-08, + "logits/chosen": -0.8250060081481934, + "logits/rejected": -0.8016010522842407, + "logps/chosen": -864.0713500976562, + "logps/rejected": -1081.85986328125, + "loss": 0.4364, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.301777362823486, + "rewards/margins": 1.6754717826843262, + "rewards/rejected": -5.977249622344971, + "step": 863 + }, + { + "epoch": 0.5642909625275533, + "grad_norm": 15.98416872196485, + "learning_rate": 9.509679308288838e-08, + "logits/chosen": -0.7682183980941772, + "logits/rejected": -0.7244136929512024, + "logps/chosen": -967.8103637695312, + "logps/rejected": -1283.90380859375, + "loss": 0.3473, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.515521049499512, + "rewards/margins": 2.6907920837402344, + "rewards/rejected": -7.206313133239746, + "step": 864 + }, + { + "epoch": 0.5649440770675157, + "grad_norm": 26.277805017545777, + "learning_rate": 9.486893284172101e-08, + "logits/chosen": -0.9032926559448242, + "logits/rejected": -0.7865481376647949, + "logps/chosen": -837.1546630859375, + "logps/rejected": -1026.15185546875, + "loss": 0.4214, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.893397331237793, + "rewards/margins": 1.903401494026184, + "rewards/rejected": -5.796799182891846, + "step": 865 + }, + { + "epoch": 0.5655971916074781, + "grad_norm": 18.34731751923566, + "learning_rate": 9.464109930843119e-08, + "logits/chosen": -0.7677436470985413, + "logits/rejected": -0.7806234955787659, + "logps/chosen": -898.8002319335938, + "logps/rejected": -1071.4962158203125, + "loss": 0.4379, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.56309700012207, + "rewards/margins": 1.2050151824951172, + "rewards/rejected": -5.7681121826171875, + "step": 866 + }, + { + "epoch": 0.5662503061474407, + "grad_norm": 20.18205861838406, + "learning_rate": 9.441329366892222e-08, + "logits/chosen": -0.9723510146141052, + "logits/rejected": -0.8582167625427246, + "logps/chosen": -932.4115600585938, + "logps/rejected": -1021.81494140625, + "loss": 0.4462, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.6146979331970215, + "rewards/margins": 1.169451355934143, + "rewards/rejected": -5.784149169921875, + "step": 867 + }, + { + "epoch": 0.5669034206874031, + "grad_norm": 18.457641315699057, + "learning_rate": 9.418551710895242e-08, + "logits/chosen": -0.908024787902832, + "logits/rejected": -0.8809236288070679, + "logps/chosen": -964.94140625, + "logps/rejected": -1143.9808349609375, + "loss": 0.392, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.553819179534912, + "rewards/margins": 1.8509615659713745, + "rewards/rejected": -6.404780864715576, + "step": 868 + }, + { + "epoch": 0.5675565352273655, + "grad_norm": 14.028941998053218, + "learning_rate": 9.395777081412863e-08, + "logits/chosen": -0.9205148220062256, + "logits/rejected": -0.8955953121185303, + "logps/chosen": -1036.298828125, + "logps/rejected": -1205.27880859375, + "loss": 0.3663, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.52469539642334, + "rewards/margins": 1.6343746185302734, + "rewards/rejected": -7.159069538116455, + "step": 869 + }, + { + "epoch": 0.5682096497673279, + "grad_norm": 15.207559565633286, + "learning_rate": 9.373005596990017e-08, + "logits/chosen": -0.9882926940917969, + "logits/rejected": -0.8943915367126465, + "logps/chosen": -1000.8101196289062, + "logps/rejected": -1069.3223876953125, + "loss": 0.3789, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.538582801818848, + "rewards/margins": 1.1359279155731201, + "rewards/rejected": -5.674510955810547, + "step": 870 + }, + { + "epoch": 0.5688627643072904, + "grad_norm": 17.924430093356584, + "learning_rate": 9.350237376155267e-08, + "logits/chosen": -0.7669613361358643, + "logits/rejected": -0.7748525738716125, + "logps/chosen": -969.697265625, + "logps/rejected": -1114.1466064453125, + "loss": 0.4385, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.298352241516113, + "rewards/margins": 1.4008281230926514, + "rewards/rejected": -5.699180603027344, + "step": 871 + }, + { + "epoch": 0.5695158788472529, + "grad_norm": 15.756432200133906, + "learning_rate": 9.327472537420193e-08, + "logits/chosen": -0.9131944179534912, + "logits/rejected": -0.9345240592956543, + "logps/chosen": -950.0737915039062, + "logps/rejected": -1141.3934326171875, + "loss": 0.4235, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.154352188110352, + "rewards/margins": 1.5505449771881104, + "rewards/rejected": -5.704896926879883, + "step": 872 + }, + { + "epoch": 0.5701689933872153, + "grad_norm": 13.680882524525893, + "learning_rate": 9.30471119927876e-08, + "logits/chosen": -0.7933511734008789, + "logits/rejected": -0.758368968963623, + "logps/chosen": -911.67529296875, + "logps/rejected": -1185.8714599609375, + "loss": 0.3754, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.790996074676514, + "rewards/margins": 2.2015562057495117, + "rewards/rejected": -6.992551803588867, + "step": 873 + }, + { + "epoch": 0.5708221079271777, + "grad_norm": 18.923502793123205, + "learning_rate": 9.281953480206723e-08, + "logits/chosen": -0.7157567143440247, + "logits/rejected": -0.7758267521858215, + "logps/chosen": -849.9529418945312, + "logps/rejected": -1070.7559814453125, + "loss": 0.4422, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.099109649658203, + "rewards/margins": 1.9049654006958008, + "rewards/rejected": -6.004075050354004, + "step": 874 + }, + { + "epoch": 0.5714752224671402, + "grad_norm": 22.82247720714276, + "learning_rate": 9.259199498660993e-08, + "logits/chosen": -1.0105547904968262, + "logits/rejected": -0.8255823850631714, + "logps/chosen": -944.2193603515625, + "logps/rejected": -1074.181884765625, + "loss": 0.3415, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.413623332977295, + "rewards/margins": 1.8328722715377808, + "rewards/rejected": -6.246495246887207, + "step": 875 + }, + { + "epoch": 0.5721283370071026, + "grad_norm": 35.206675729481915, + "learning_rate": 9.236449373079026e-08, + "logits/chosen": -0.9094786643981934, + "logits/rejected": -0.778907060623169, + "logps/chosen": -933.5310668945312, + "logps/rejected": -1074.3388671875, + "loss": 0.475, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.948477745056152, + "rewards/margins": 1.522892951965332, + "rewards/rejected": -6.471371173858643, + "step": 876 + }, + { + "epoch": 0.5727814515470651, + "grad_norm": 27.421712157237085, + "learning_rate": 9.213703221878215e-08, + "logits/chosen": -0.7551379799842834, + "logits/rejected": -0.7433313131332397, + "logps/chosen": -939.6011962890625, + "logps/rejected": -1161.37548828125, + "loss": 0.3851, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.847814559936523, + "rewards/margins": 1.955463171005249, + "rewards/rejected": -6.803277969360352, + "step": 877 + }, + { + "epoch": 0.5734345660870275, + "grad_norm": 16.61141184713571, + "learning_rate": 9.190961163455253e-08, + "logits/chosen": -0.8657856583595276, + "logits/rejected": -0.8066181540489197, + "logps/chosen": -973.77490234375, + "logps/rejected": -1163.808349609375, + "loss": 0.3738, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.766665458679199, + "rewards/margins": 1.7785844802856445, + "rewards/rejected": -6.545250415802002, + "step": 878 + }, + { + "epoch": 0.57408768062699, + "grad_norm": 33.86515631869608, + "learning_rate": 9.168223316185538e-08, + "logits/chosen": -0.8339582681655884, + "logits/rejected": -0.7555098533630371, + "logps/chosen": -1008.7633056640625, + "logps/rejected": -1209.2352294921875, + "loss": 0.4745, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.217278480529785, + "rewards/margins": 1.7660671472549438, + "rewards/rejected": -6.983345985412598, + "step": 879 + }, + { + "epoch": 0.5747407951669524, + "grad_norm": 23.838839628578985, + "learning_rate": 9.14548979842255e-08, + "logits/chosen": -0.8603442311286926, + "logits/rejected": -0.7912033796310425, + "logps/chosen": -1011.2332763671875, + "logps/rejected": -1169.10205078125, + "loss": 0.4029, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.176833152770996, + "rewards/margins": 1.4864170551300049, + "rewards/rejected": -6.663249969482422, + "step": 880 + }, + { + "epoch": 0.5753939097069148, + "grad_norm": 24.08255938340959, + "learning_rate": 9.12276072849723e-08, + "logits/chosen": -0.9084462523460388, + "logits/rejected": -0.9800143241882324, + "logps/chosen": -992.2244873046875, + "logps/rejected": -1158.918701171875, + "loss": 0.3761, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.683312892913818, + "rewards/margins": 1.6894042491912842, + "rewards/rejected": -6.372716903686523, + "step": 881 + }, + { + "epoch": 0.5760470242468773, + "grad_norm": 20.741267761943313, + "learning_rate": 9.100036224717366e-08, + "logits/chosen": -0.8111915588378906, + "logits/rejected": -0.8897655010223389, + "logps/chosen": -938.0897216796875, + "logps/rejected": -1127.0625, + "loss": 0.4212, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.022097110748291, + "rewards/margins": 1.2158722877502441, + "rewards/rejected": -5.237969875335693, + "step": 882 + }, + { + "epoch": 0.5767001387868398, + "grad_norm": 17.520051364033677, + "learning_rate": 9.077316405366981e-08, + "logits/chosen": -0.8745783567428589, + "logits/rejected": -0.867775559425354, + "logps/chosen": -1060.55078125, + "logps/rejected": -1236.429443359375, + "loss": 0.3522, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.311179161071777, + "rewards/margins": 1.8504433631896973, + "rewards/rejected": -7.161622047424316, + "step": 883 + }, + { + "epoch": 0.5773532533268022, + "grad_norm": 17.560718564711973, + "learning_rate": 9.054601388705715e-08, + "logits/chosen": -0.8407562375068665, + "logits/rejected": -0.7107415199279785, + "logps/chosen": -1017.076416015625, + "logps/rejected": -1125.1346435546875, + "loss": 0.4278, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.200894832611084, + "rewards/margins": 1.2635349035263062, + "rewards/rejected": -6.464428901672363, + "step": 884 + }, + { + "epoch": 0.5780063678667646, + "grad_norm": 17.42515113495744, + "learning_rate": 9.03189129296821e-08, + "logits/chosen": -0.598199725151062, + "logits/rejected": -0.5980876684188843, + "logps/chosen": -890.3635864257812, + "logps/rejected": -1140.5980224609375, + "loss": 0.3686, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.948042869567871, + "rewards/margins": 2.095231533050537, + "rewards/rejected": -7.043274879455566, + "step": 885 + }, + { + "epoch": 0.578659482406727, + "grad_norm": 16.370071766658146, + "learning_rate": 9.00918623636349e-08, + "logits/chosen": -0.8857473134994507, + "logits/rejected": -0.7719743847846985, + "logps/chosen": -1002.3225708007812, + "logps/rejected": -1073.412353515625, + "loss": 0.4162, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.398256778717041, + "rewards/margins": 1.0608148574829102, + "rewards/rejected": -6.459071636199951, + "step": 886 + }, + { + "epoch": 0.5793125969466896, + "grad_norm": 20.019533693747015, + "learning_rate": 8.986486337074354e-08, + "logits/chosen": -0.8915666341781616, + "logits/rejected": -0.9138790965080261, + "logps/chosen": -909.7576904296875, + "logps/rejected": -976.8128051757812, + "loss": 0.476, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.364135265350342, + "rewards/margins": 0.7550020217895508, + "rewards/rejected": -5.119136810302734, + "step": 887 + }, + { + "epoch": 0.579965711486652, + "grad_norm": 18.672808344529344, + "learning_rate": 8.963791713256754e-08, + "logits/chosen": -0.9769740104675293, + "logits/rejected": -0.852337121963501, + "logps/chosen": -977.1018676757812, + "logps/rejected": -1132.941162109375, + "loss": 0.3811, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.742093086242676, + "rewards/margins": 1.554567813873291, + "rewards/rejected": -6.296660423278809, + "step": 888 + }, + { + "epoch": 0.5806188260266144, + "grad_norm": 27.233632516341583, + "learning_rate": 8.941102483039187e-08, + "logits/chosen": -0.9475686550140381, + "logits/rejected": -0.8408269882202148, + "logps/chosen": -985.3871459960938, + "logps/rejected": -1131.696533203125, + "loss": 0.3782, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.957531929016113, + "rewards/margins": 1.5096138715744019, + "rewards/rejected": -6.4671454429626465, + "step": 889 + }, + { + "epoch": 0.5812719405665768, + "grad_norm": 19.66330179420604, + "learning_rate": 8.918418764522068e-08, + "logits/chosen": -0.7873989939689636, + "logits/rejected": -0.8866724371910095, + "logps/chosen": -841.0829467773438, + "logps/rejected": -1159.834228515625, + "loss": 0.34, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.916111469268799, + "rewards/margins": 2.1629438400268555, + "rewards/rejected": -6.079055309295654, + "step": 890 + }, + { + "epoch": 0.5819250551065394, + "grad_norm": 17.46307037445428, + "learning_rate": 8.895740675777126e-08, + "logits/chosen": -1.0209726095199585, + "logits/rejected": -1.0288503170013428, + "logps/chosen": -938.6088256835938, + "logps/rejected": -1139.0146484375, + "loss": 0.3924, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.702238082885742, + "rewards/margins": 2.0684337615966797, + "rewards/rejected": -6.770671844482422, + "step": 891 + }, + { + "epoch": 0.5825781696465018, + "grad_norm": 22.798831643296747, + "learning_rate": 8.873068334846789e-08, + "logits/chosen": -0.8137722015380859, + "logits/rejected": -0.8584403395652771, + "logps/chosen": -811.7307739257812, + "logps/rejected": -1032.501953125, + "loss": 0.4058, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.759009599685669, + "rewards/margins": 1.7417285442352295, + "rewards/rejected": -5.500738143920898, + "step": 892 + }, + { + "epoch": 0.5832312841864642, + "grad_norm": 23.030536411916117, + "learning_rate": 8.850401859743565e-08, + "logits/chosen": -0.8594922423362732, + "logits/rejected": -0.8479385375976562, + "logps/chosen": -990.1287231445312, + "logps/rejected": -1060.483642578125, + "loss": 0.3786, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.541996479034424, + "rewards/margins": 1.1046607494354248, + "rewards/rejected": -5.646656513214111, + "step": 893 + }, + { + "epoch": 0.5838843987264266, + "grad_norm": 32.01001320363752, + "learning_rate": 8.82774136844943e-08, + "logits/chosen": -0.620232343673706, + "logits/rejected": -0.672800600528717, + "logps/chosen": -959.5262451171875, + "logps/rejected": -1130.987060546875, + "loss": 0.4791, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.572679042816162, + "rewards/margins": 1.3316614627838135, + "rewards/rejected": -6.9043402671813965, + "step": 894 + }, + { + "epoch": 0.5845375132663891, + "grad_norm": 17.256513174943777, + "learning_rate": 8.805086978915213e-08, + "logits/chosen": -0.8671358227729797, + "logits/rejected": -0.9369436502456665, + "logps/chosen": -885.9805297851562, + "logps/rejected": -1093.339111328125, + "loss": 0.3705, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.081684112548828, + "rewards/margins": 2.167947292327881, + "rewards/rejected": -6.249631881713867, + "step": 895 + }, + { + "epoch": 0.5851906278063516, + "grad_norm": 31.92461284768268, + "learning_rate": 8.78243880905998e-08, + "logits/chosen": -0.7277215123176575, + "logits/rejected": -0.8077613115310669, + "logps/chosen": -877.326416015625, + "logps/rejected": -1197.5010986328125, + "loss": 0.3874, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.212238311767578, + "rewards/margins": 2.691312789916992, + "rewards/rejected": -6.90355110168457, + "step": 896 + }, + { + "epoch": 0.585843742346314, + "grad_norm": 29.64600849223627, + "learning_rate": 8.75979697677043e-08, + "logits/chosen": -0.6099945902824402, + "logits/rejected": -0.6647940874099731, + "logps/chosen": -855.3746948242188, + "logps/rejected": -1087.214599609375, + "loss": 0.4806, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.427003860473633, + "rewards/margins": 1.8464815616607666, + "rewards/rejected": -6.27348518371582, + "step": 897 + }, + { + "epoch": 0.5864968568862764, + "grad_norm": 22.474704857385515, + "learning_rate": 8.737161599900265e-08, + "logits/chosen": -0.8131806254386902, + "logits/rejected": -0.715324878692627, + "logps/chosen": -1068.5166015625, + "logps/rejected": -1100.4036865234375, + "loss": 0.4832, + "rewards/accuracies": 0.65625, + "rewards/chosen": -5.5754594802856445, + "rewards/margins": 0.7880738377571106, + "rewards/rejected": -6.3635334968566895, + "step": 898 + }, + { + "epoch": 0.5871499714262389, + "grad_norm": 26.409266204419403, + "learning_rate": 8.714532796269593e-08, + "logits/chosen": -0.7550494074821472, + "logits/rejected": -0.6975759863853455, + "logps/chosen": -944.514404296875, + "logps/rejected": -1085.607421875, + "loss": 0.4551, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.10006046295166, + "rewards/margins": 1.5463776588439941, + "rewards/rejected": -6.646438121795654, + "step": 899 + }, + { + "epoch": 0.5878030859662013, + "grad_norm": 19.847329227514567, + "learning_rate": 8.69191068366431e-08, + "logits/chosen": -0.8874033689498901, + "logits/rejected": -0.8347741961479187, + "logps/chosen": -916.2412109375, + "logps/rejected": -1101.5413818359375, + "loss": 0.4153, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.8446855545043945, + "rewards/margins": 1.8519930839538574, + "rewards/rejected": -6.69667911529541, + "step": 900 + }, + { + "epoch": 0.5878030859662013, + "eval_logits/chosen": -0.6474034786224365, + "eval_logits/rejected": -0.5894980430603027, + "eval_logps/chosen": -972.2631225585938, + "eval_logps/rejected": -1118.36767578125, + "eval_loss": 0.41075676679611206, + "eval_rewards/accuracies": 0.796999990940094, + "eval_rewards/chosen": -4.983559608459473, + "eval_rewards/margins": 1.5484305620193481, + "eval_rewards/rejected": -6.531990051269531, + "eval_runtime": 615.3366, + "eval_samples_per_second": 6.501, + "eval_steps_per_second": 0.406, + "step": 900 + }, + { + "epoch": 0.5884562005061638, + "grad_norm": 14.925997798234508, + "learning_rate": 8.669295379835466e-08, + "logits/chosen": -0.8691277503967285, + "logits/rejected": -0.8496584296226501, + "logps/chosen": -1043.7869873046875, + "logps/rejected": -1212.6915283203125, + "loss": 0.34, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.405606746673584, + "rewards/margins": 1.7067140340805054, + "rewards/rejected": -7.112320899963379, + "step": 901 + }, + { + "epoch": 0.5891093150461262, + "grad_norm": 25.753626241410412, + "learning_rate": 8.646687002498691e-08, + "logits/chosen": -0.9647932052612305, + "logits/rejected": -0.9126352667808533, + "logps/chosen": -969.6204833984375, + "logps/rejected": -1194.13623046875, + "loss": 0.3428, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.46538782119751, + "rewards/margins": 2.263078212738037, + "rewards/rejected": -6.728465557098389, + "step": 902 + }, + { + "epoch": 0.5897624295860887, + "grad_norm": 17.703375309858618, + "learning_rate": 8.624085669333552e-08, + "logits/chosen": -0.8007317781448364, + "logits/rejected": -0.7886674404144287, + "logps/chosen": -835.8764038085938, + "logps/rejected": -917.6867065429688, + "loss": 0.3663, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.808894157409668, + "rewards/margins": 0.7904772162437439, + "rewards/rejected": -5.599371433258057, + "step": 903 + }, + { + "epoch": 0.5904155441260511, + "grad_norm": 23.835014759230322, + "learning_rate": 8.601491497982954e-08, + "logits/chosen": -0.9031449556350708, + "logits/rejected": -0.7340150475502014, + "logps/chosen": -1016.9129638671875, + "logps/rejected": -1168.0784912109375, + "loss": 0.4244, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.812582492828369, + "rewards/margins": 1.9187612533569336, + "rewards/rejected": -6.731344223022461, + "step": 904 + }, + { + "epoch": 0.5910686586660135, + "grad_norm": 46.41920197800403, + "learning_rate": 8.578904606052524e-08, + "logits/chosen": -0.8163060545921326, + "logits/rejected": -0.7480596899986267, + "logps/chosen": -867.8343505859375, + "logps/rejected": -1006.3372192382812, + "loss": 0.4485, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.025778770446777, + "rewards/margins": 1.435265064239502, + "rewards/rejected": -5.461042881011963, + "step": 905 + }, + { + "epoch": 0.591721773205976, + "grad_norm": 22.00085431680691, + "learning_rate": 8.556325111109993e-08, + "logits/chosen": -0.8558377027511597, + "logits/rejected": -0.7511030435562134, + "logps/chosen": -935.3119506835938, + "logps/rejected": -1052.9794921875, + "loss": 0.3499, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.992286205291748, + "rewards/margins": 1.536756157875061, + "rewards/rejected": -5.5290422439575195, + "step": 906 + }, + { + "epoch": 0.5923748877459385, + "grad_norm": 16.976980001349833, + "learning_rate": 8.533753130684596e-08, + "logits/chosen": -0.8626459240913391, + "logits/rejected": -0.8605834245681763, + "logps/chosen": -867.1327514648438, + "logps/rejected": -1003.730712890625, + "loss": 0.419, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.049777984619141, + "rewards/margins": 1.4001483917236328, + "rewards/rejected": -5.449926376342773, + "step": 907 + }, + { + "epoch": 0.5930280022859009, + "grad_norm": 20.35692870029299, + "learning_rate": 8.51118878226645e-08, + "logits/chosen": -0.8889980316162109, + "logits/rejected": -0.9109795093536377, + "logps/chosen": -891.5820922851562, + "logps/rejected": -1061.819580078125, + "loss": 0.3608, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.176605701446533, + "rewards/margins": 1.7289620637893677, + "rewards/rejected": -5.9055681228637695, + "step": 908 + }, + { + "epoch": 0.5936811168258633, + "grad_norm": 30.804138303718446, + "learning_rate": 8.488632183305945e-08, + "logits/chosen": -0.9131035804748535, + "logits/rejected": -0.8490291237831116, + "logps/chosen": -907.41015625, + "logps/rejected": -1025.7835693359375, + "loss": 0.4051, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.207481384277344, + "rewards/margins": 1.614747166633606, + "rewards/rejected": -5.822227954864502, + "step": 909 + }, + { + "epoch": 0.5943342313658257, + "grad_norm": 23.681532592721254, + "learning_rate": 8.466083451213145e-08, + "logits/chosen": -0.7877532839775085, + "logits/rejected": -0.632519006729126, + "logps/chosen": -966.0323486328125, + "logps/rejected": -1077.8875732421875, + "loss": 0.4482, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.879648685455322, + "rewards/margins": 1.4281139373779297, + "rewards/rejected": -6.307762145996094, + "step": 910 + }, + { + "epoch": 0.5949873459057883, + "grad_norm": 21.37738164632679, + "learning_rate": 8.443542703357154e-08, + "logits/chosen": -0.9703052639961243, + "logits/rejected": -0.9703279733657837, + "logps/chosen": -891.2740478515625, + "logps/rejected": -1010.065673828125, + "loss": 0.3599, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.771116018295288, + "rewards/margins": 1.7163093090057373, + "rewards/rejected": -5.487425327301025, + "step": 911 + }, + { + "epoch": 0.5956404604457507, + "grad_norm": 21.36892588322395, + "learning_rate": 8.421010057065517e-08, + "logits/chosen": -0.8273603320121765, + "logits/rejected": -0.6571228504180908, + "logps/chosen": -909.6326904296875, + "logps/rejected": -1141.2235107421875, + "loss": 0.359, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.4580841064453125, + "rewards/margins": 2.422499179840088, + "rewards/rejected": -6.880582809448242, + "step": 912 + }, + { + "epoch": 0.5962935749857131, + "grad_norm": 13.636371400634598, + "learning_rate": 8.398485629623613e-08, + "logits/chosen": -0.9461863040924072, + "logits/rejected": -0.9167419075965881, + "logps/chosen": -945.4471435546875, + "logps/rejected": -1142.0771484375, + "loss": 0.365, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.211487293243408, + "rewards/margins": 2.108157157897949, + "rewards/rejected": -6.319644451141357, + "step": 913 + }, + { + "epoch": 0.5969466895256755, + "grad_norm": 19.50909886317978, + "learning_rate": 8.375969538274046e-08, + "logits/chosen": -0.9539211988449097, + "logits/rejected": -0.881746768951416, + "logps/chosen": -859.5111083984375, + "logps/rejected": -1007.7662353515625, + "loss": 0.4434, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.13993501663208, + "rewards/margins": 1.4645112752914429, + "rewards/rejected": -5.604445457458496, + "step": 914 + }, + { + "epoch": 0.5975998040656381, + "grad_norm": 13.53092257072935, + "learning_rate": 8.35346190021602e-08, + "logits/chosen": -0.8023240566253662, + "logits/rejected": -0.6279884576797485, + "logps/chosen": -903.838623046875, + "logps/rejected": -1045.2021484375, + "loss": 0.4114, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.067113876342773, + "rewards/margins": 1.7872202396392822, + "rewards/rejected": -5.854333877563477, + "step": 915 + }, + { + "epoch": 0.5982529186056005, + "grad_norm": 32.33399522874252, + "learning_rate": 8.330962832604746e-08, + "logits/chosen": -0.7911917567253113, + "logits/rejected": -0.8615321516990662, + "logps/chosen": -974.4556884765625, + "logps/rejected": -1305.1280517578125, + "loss": 0.3299, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.3269877433776855, + "rewards/margins": 2.0870394706726074, + "rewards/rejected": -6.414027214050293, + "step": 916 + }, + { + "epoch": 0.5989060331455629, + "grad_norm": 19.159991853562595, + "learning_rate": 8.308472452550821e-08, + "logits/chosen": -0.7687807083129883, + "logits/rejected": -0.5947264432907104, + "logps/chosen": -989.5430908203125, + "logps/rejected": -1158.9034423828125, + "loss": 0.3795, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.157406806945801, + "rewards/margins": 2.0121400356292725, + "rewards/rejected": -7.169547080993652, + "step": 917 + }, + { + "epoch": 0.5995591476855253, + "grad_norm": 16.7182763656914, + "learning_rate": 8.285990877119621e-08, + "logits/chosen": -0.833372950553894, + "logits/rejected": -0.8309823274612427, + "logps/chosen": -887.657958984375, + "logps/rejected": -1025.5283203125, + "loss": 0.4022, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.440640926361084, + "rewards/margins": 1.293958067893982, + "rewards/rejected": -5.7345991134643555, + "step": 918 + }, + { + "epoch": 0.6002122622254878, + "grad_norm": 17.517071068178463, + "learning_rate": 8.263518223330696e-08, + "logits/chosen": -0.9249385595321655, + "logits/rejected": -0.8931978940963745, + "logps/chosen": -952.4149780273438, + "logps/rejected": -1138.89453125, + "loss": 0.3452, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.342203617095947, + "rewards/margins": 1.7767517566680908, + "rewards/rejected": -6.118955612182617, + "step": 919 + }, + { + "epoch": 0.6008653767654503, + "grad_norm": 14.896081107369003, + "learning_rate": 8.241054608157157e-08, + "logits/chosen": -0.7339697480201721, + "logits/rejected": -0.7770588994026184, + "logps/chosen": -865.361083984375, + "logps/rejected": -1034.2215576171875, + "loss": 0.3847, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.295354843139648, + "rewards/margins": 1.5795997381210327, + "rewards/rejected": -5.874954700469971, + "step": 920 + }, + { + "epoch": 0.6015184913054127, + "grad_norm": 22.1021955531778, + "learning_rate": 8.218600148525065e-08, + "logits/chosen": -0.6869848966598511, + "logits/rejected": -0.7026206254959106, + "logps/chosen": -830.2351684570312, + "logps/rejected": -978.03564453125, + "loss": 0.4343, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.30990743637085, + "rewards/margins": 1.3018380403518677, + "rewards/rejected": -5.611745357513428, + "step": 921 + }, + { + "epoch": 0.6021716058453751, + "grad_norm": 15.381420621498501, + "learning_rate": 8.19615496131283e-08, + "logits/chosen": -0.9402379989624023, + "logits/rejected": -0.8162367343902588, + "logps/chosen": -948.3676147460938, + "logps/rejected": -1116.4700927734375, + "loss": 0.3934, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.638570308685303, + "rewards/margins": 1.7800812721252441, + "rewards/rejected": -6.418651103973389, + "step": 922 + }, + { + "epoch": 0.6028247203853376, + "grad_norm": 18.845537370733666, + "learning_rate": 8.173719163350594e-08, + "logits/chosen": -0.8794003129005432, + "logits/rejected": -0.8930718898773193, + "logps/chosen": -948.6639404296875, + "logps/rejected": -1154.9439697265625, + "loss": 0.4172, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.331446647644043, + "rewards/margins": 1.726813554763794, + "rewards/rejected": -6.058260440826416, + "step": 923 + }, + { + "epoch": 0.6034778349253, + "grad_norm": 34.811506098824175, + "learning_rate": 8.151292871419626e-08, + "logits/chosen": -0.7502395510673523, + "logits/rejected": -0.7458022832870483, + "logps/chosen": -881.787841796875, + "logps/rejected": -1068.427734375, + "loss": 0.364, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.579074382781982, + "rewards/margins": 1.974166989326477, + "rewards/rejected": -6.553241729736328, + "step": 924 + }, + { + "epoch": 0.6041309494652625, + "grad_norm": 19.25584833056271, + "learning_rate": 8.128876202251717e-08, + "logits/chosen": -1.001344084739685, + "logits/rejected": -0.937218427658081, + "logps/chosen": -830.0806274414062, + "logps/rejected": -987.154052734375, + "loss": 0.3544, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.078459739685059, + "rewards/margins": 1.5838062763214111, + "rewards/rejected": -5.662266254425049, + "step": 925 + }, + { + "epoch": 0.6047840640052249, + "grad_norm": 17.416222678203805, + "learning_rate": 8.106469272528572e-08, + "logits/chosen": -1.023118257522583, + "logits/rejected": -0.8872838616371155, + "logps/chosen": -935.068115234375, + "logps/rejected": -1081.85595703125, + "loss": 0.3868, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.559538841247559, + "rewards/margins": 1.8961910009384155, + "rewards/rejected": -6.4557294845581055, + "step": 926 + }, + { + "epoch": 0.6054371785451874, + "grad_norm": 15.520574577955848, + "learning_rate": 8.084072198881198e-08, + "logits/chosen": -0.8037633299827576, + "logits/rejected": -0.6210764050483704, + "logps/chosen": -917.8544921875, + "logps/rejected": -1018.9097900390625, + "loss": 0.4153, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.491122245788574, + "rewards/margins": 1.4294805526733398, + "rewards/rejected": -5.920601844787598, + "step": 927 + }, + { + "epoch": 0.6060902930851498, + "grad_norm": 28.746922467088186, + "learning_rate": 8.061685097889299e-08, + "logits/chosen": -0.9267802238464355, + "logits/rejected": -0.788466215133667, + "logps/chosen": -1069.5301513671875, + "logps/rejected": -1141.6458740234375, + "loss": 0.4574, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.574659824371338, + "rewards/margins": 1.0332306623458862, + "rewards/rejected": -6.607890605926514, + "step": 928 + }, + { + "epoch": 0.6067434076251123, + "grad_norm": 14.23380808630684, + "learning_rate": 8.039308086080674e-08, + "logits/chosen": -0.9022722244262695, + "logits/rejected": -0.8536574244499207, + "logps/chosen": -1080.463623046875, + "logps/rejected": -1230.5780029296875, + "loss": 0.3791, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.848098278045654, + "rewards/margins": 1.53517746925354, + "rewards/rejected": -7.383275985717773, + "step": 929 + }, + { + "epoch": 0.6073965221650747, + "grad_norm": 17.403206785758336, + "learning_rate": 8.016941279930605e-08, + "logits/chosen": -0.919601321220398, + "logits/rejected": -0.9383385181427002, + "logps/chosen": -880.8705444335938, + "logps/rejected": -1069.1361083984375, + "loss": 0.363, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.793572425842285, + "rewards/margins": 1.5130126476287842, + "rewards/rejected": -6.306585311889648, + "step": 930 + }, + { + "epoch": 0.6080496367050372, + "grad_norm": 18.009932005684178, + "learning_rate": 7.994584795861247e-08, + "logits/chosen": -0.7678056359291077, + "logits/rejected": -0.7451735138893127, + "logps/chosen": -1034.4462890625, + "logps/rejected": -1221.044677734375, + "loss": 0.3192, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.145386219024658, + "rewards/margins": 2.1374313831329346, + "rewards/rejected": -7.282817363739014, + "step": 931 + }, + { + "epoch": 0.6087027512449996, + "grad_norm": 49.34545548945588, + "learning_rate": 7.972238750241036e-08, + "logits/chosen": -0.7207657694816589, + "logits/rejected": -0.7844063639640808, + "logps/chosen": -1049.498779296875, + "logps/rejected": -1199.9368896484375, + "loss": 0.6142, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.626737594604492, + "rewards/margins": 1.12619948387146, + "rewards/rejected": -6.752936840057373, + "step": 932 + }, + { + "epoch": 0.609355865784962, + "grad_norm": 18.136351511937786, + "learning_rate": 7.949903259384068e-08, + "logits/chosen": -0.9160058498382568, + "logits/rejected": -0.951209545135498, + "logps/chosen": -978.7283325195312, + "logps/rejected": -1158.1009521484375, + "loss": 0.4027, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.956260681152344, + "rewards/margins": 1.780358076095581, + "rewards/rejected": -6.736618995666504, + "step": 933 + }, + { + "epoch": 0.6100089803249245, + "grad_norm": 14.171668974961465, + "learning_rate": 7.927578439549506e-08, + "logits/chosen": -0.8602361083030701, + "logits/rejected": -0.8363439440727234, + "logps/chosen": -1078.1002197265625, + "logps/rejected": -1262.086669921875, + "loss": 0.3364, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.762195110321045, + "rewards/margins": 1.9282588958740234, + "rewards/rejected": -7.690454483032227, + "step": 934 + }, + { + "epoch": 0.610662094864887, + "grad_norm": 21.95332133574316, + "learning_rate": 7.905264406940959e-08, + "logits/chosen": -0.814610481262207, + "logits/rejected": -0.8668062090873718, + "logps/chosen": -996.294921875, + "logps/rejected": -1317.03955078125, + "loss": 0.4424, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.210824966430664, + "rewards/margins": 2.820859432220459, + "rewards/rejected": -8.031684875488281, + "step": 935 + }, + { + "epoch": 0.6113152094048494, + "grad_norm": 23.10823799153079, + "learning_rate": 7.882961277705895e-08, + "logits/chosen": -0.6385756134986877, + "logits/rejected": -0.6616299152374268, + "logps/chosen": -852.9818115234375, + "logps/rejected": -1010.8091430664062, + "loss": 0.4107, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.4981513023376465, + "rewards/margins": 1.3144606351852417, + "rewards/rejected": -5.8126115798950195, + "step": 936 + }, + { + "epoch": 0.6119683239448118, + "grad_norm": 13.356378969333702, + "learning_rate": 7.860669167935028e-08, + "logits/chosen": -0.6723610758781433, + "logits/rejected": -0.5586845874786377, + "logps/chosen": -911.2410278320312, + "logps/rejected": -1077.91552734375, + "loss": 0.3322, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.328841209411621, + "rewards/margins": 1.6360496282577515, + "rewards/rejected": -5.964890003204346, + "step": 937 + }, + { + "epoch": 0.6126214384847742, + "grad_norm": 17.64088440541406, + "learning_rate": 7.838388193661711e-08, + "logits/chosen": -1.0395886898040771, + "logits/rejected": -0.9543710947036743, + "logps/chosen": -994.0137329101562, + "logps/rejected": -1132.4892578125, + "loss": 0.4228, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.503838062286377, + "rewards/margins": 1.5002458095550537, + "rewards/rejected": -7.004083156585693, + "step": 938 + }, + { + "epoch": 0.6132745530247368, + "grad_norm": 41.76222226846645, + "learning_rate": 7.816118470861342e-08, + "logits/chosen": -0.7007467150688171, + "logits/rejected": -0.7458237409591675, + "logps/chosen": -859.9625244140625, + "logps/rejected": -1088.14013671875, + "loss": 0.3553, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.435876369476318, + "rewards/margins": 1.9407938718795776, + "rewards/rejected": -6.376670837402344, + "step": 939 + }, + { + "epoch": 0.6139276675646992, + "grad_norm": 16.41466968357761, + "learning_rate": 7.793860115450743e-08, + "logits/chosen": -0.8894115686416626, + "logits/rejected": -0.7774127125740051, + "logps/chosen": -998.6695556640625, + "logps/rejected": -1176.7144775390625, + "loss": 0.4081, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.242660999298096, + "rewards/margins": 1.6881160736083984, + "rewards/rejected": -6.930777072906494, + "step": 940 + }, + { + "epoch": 0.6145807821046616, + "grad_norm": 28.81321193957898, + "learning_rate": 7.771613243287573e-08, + "logits/chosen": -0.8349412083625793, + "logits/rejected": -0.6980231404304504, + "logps/chosen": -1037.77783203125, + "logps/rejected": -1159.787353515625, + "loss": 0.5375, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.490978240966797, + "rewards/margins": 1.5088858604431152, + "rewards/rejected": -6.99986457824707, + "step": 941 + }, + { + "epoch": 0.615233896644624, + "grad_norm": 15.47414082361199, + "learning_rate": 7.749377970169726e-08, + "logits/chosen": -0.7294608354568481, + "logits/rejected": -0.8300274014472961, + "logps/chosen": -881.7943725585938, + "logps/rejected": -1087.314208984375, + "loss": 0.3835, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.590490818023682, + "rewards/margins": 1.7875473499298096, + "rewards/rejected": -6.37803840637207, + "step": 942 + }, + { + "epoch": 0.6158870111845864, + "grad_norm": 19.63090785781211, + "learning_rate": 7.72715441183471e-08, + "logits/chosen": -0.8396845459938049, + "logits/rejected": -0.8975608348846436, + "logps/chosen": -935.793701171875, + "logps/rejected": -1181.52392578125, + "loss": 0.4221, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.497322082519531, + "rewards/margins": 2.063370704650879, + "rewards/rejected": -6.56069278717041, + "step": 943 + }, + { + "epoch": 0.616540125724549, + "grad_norm": 19.55448604762308, + "learning_rate": 7.704942683959061e-08, + "logits/chosen": -0.7600312829017639, + "logits/rejected": -0.6722344160079956, + "logps/chosen": -876.5004272460938, + "logps/rejected": -1015.899169921875, + "loss": 0.4024, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6736412048339844, + "rewards/margins": 1.7954384088516235, + "rewards/rejected": -5.469079494476318, + "step": 944 + }, + { + "epoch": 0.6171932402645114, + "grad_norm": 22.577034429164247, + "learning_rate": 7.682742902157742e-08, + "logits/chosen": -0.8105038404464722, + "logits/rejected": -0.8246266841888428, + "logps/chosen": -981.1633911132812, + "logps/rejected": -1142.673828125, + "loss": 0.3782, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.0762176513671875, + "rewards/margins": 1.5619633197784424, + "rewards/rejected": -6.638180732727051, + "step": 945 + }, + { + "epoch": 0.6178463548044738, + "grad_norm": 16.850655460651723, + "learning_rate": 7.660555181983517e-08, + "logits/chosen": -0.8345729112625122, + "logits/rejected": -0.8672900199890137, + "logps/chosen": -996.8236083984375, + "logps/rejected": -1235.411865234375, + "loss": 0.34, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.766516208648682, + "rewards/margins": 1.9924037456512451, + "rewards/rejected": -6.758920669555664, + "step": 946 + }, + { + "epoch": 0.6184994693444362, + "grad_norm": 24.42884479602425, + "learning_rate": 7.638379638926384e-08, + "logits/chosen": -0.9110671281814575, + "logits/rejected": -0.9710690975189209, + "logps/chosen": -972.607177734375, + "logps/rejected": -1323.8133544921875, + "loss": 0.355, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.0181565284729, + "rewards/margins": 3.4316680431365967, + "rewards/rejected": -8.449824333190918, + "step": 947 + }, + { + "epoch": 0.6191525838843988, + "grad_norm": 32.661617699447575, + "learning_rate": 7.616216388412956e-08, + "logits/chosen": -0.9173276424407959, + "logits/rejected": -0.784736692905426, + "logps/chosen": -906.2161865234375, + "logps/rejected": -1020.6433715820312, + "loss": 0.4303, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.6049885749816895, + "rewards/margins": 1.6103415489196777, + "rewards/rejected": -6.215329647064209, + "step": 948 + }, + { + "epoch": 0.6198056984243612, + "grad_norm": 22.777809128273034, + "learning_rate": 7.594065545805857e-08, + "logits/chosen": -0.6866058707237244, + "logits/rejected": -0.8154290318489075, + "logps/chosen": -895.670654296875, + "logps/rejected": -1135.50048828125, + "loss": 0.4758, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.395349979400635, + "rewards/margins": 1.9171158075332642, + "rewards/rejected": -6.312465667724609, + "step": 949 + }, + { + "epoch": 0.6204588129643236, + "grad_norm": 23.234837418589642, + "learning_rate": 7.571927226403126e-08, + "logits/chosen": -0.749639093875885, + "logits/rejected": -0.7934308052062988, + "logps/chosen": -928.8316040039062, + "logps/rejected": -1088.88525390625, + "loss": 0.3238, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.648177146911621, + "rewards/margins": 1.5695174932479858, + "rewards/rejected": -6.217695236206055, + "step": 950 + }, + { + "epoch": 0.621111927504286, + "grad_norm": 38.340388409455954, + "learning_rate": 7.549801545437621e-08, + "logits/chosen": -0.7972779870033264, + "logits/rejected": -0.8462767004966736, + "logps/chosen": -829.4715576171875, + "logps/rejected": -1028.169921875, + "loss": 0.3906, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.178791046142578, + "rewards/margins": 1.6456745862960815, + "rewards/rejected": -5.824464797973633, + "step": 951 + }, + { + "epoch": 0.6217650420442485, + "grad_norm": 17.06008923511345, + "learning_rate": 7.527688618076413e-08, + "logits/chosen": -0.8404926657676697, + "logits/rejected": -0.8594713807106018, + "logps/chosen": -826.0974731445312, + "logps/rejected": -1009.733642578125, + "loss": 0.4399, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9912056922912598, + "rewards/margins": 1.581747055053711, + "rewards/rejected": -5.5729522705078125, + "step": 952 + }, + { + "epoch": 0.622418156584211, + "grad_norm": 26.890220915174563, + "learning_rate": 7.505588559420187e-08, + "logits/chosen": -0.8127776384353638, + "logits/rejected": -0.8098663091659546, + "logps/chosen": -986.1820678710938, + "logps/rejected": -1091.328369140625, + "loss": 0.3669, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.3212432861328125, + "rewards/margins": 1.011331558227539, + "rewards/rejected": -6.332574844360352, + "step": 953 + }, + { + "epoch": 0.6230712711241734, + "grad_norm": 16.457932917397315, + "learning_rate": 7.48350148450265e-08, + "logits/chosen": -0.9597660303115845, + "logits/rejected": -0.8716091513633728, + "logps/chosen": -1003.8367919921875, + "logps/rejected": -1141.11669921875, + "loss": 0.3672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.11099910736084, + "rewards/margins": 1.655037522315979, + "rewards/rejected": -6.7660369873046875, + "step": 954 + }, + { + "epoch": 0.6237243856641358, + "grad_norm": 36.48894968020559, + "learning_rate": 7.461427508289921e-08, + "logits/chosen": -0.8966237306594849, + "logits/rejected": -0.7957350015640259, + "logps/chosen": -832.1514892578125, + "logps/rejected": -917.5252075195312, + "loss": 0.3699, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.9978244304656982, + "rewards/margins": 1.4576671123504639, + "rewards/rejected": -5.455491065979004, + "step": 955 + }, + { + "epoch": 0.6243775002040983, + "grad_norm": 19.6017580958855, + "learning_rate": 7.439366745679942e-08, + "logits/chosen": -0.8043563365936279, + "logits/rejected": -0.8075878620147705, + "logps/chosen": -941.8458251953125, + "logps/rejected": -1092.9459228515625, + "loss": 0.3917, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.635620594024658, + "rewards/margins": 1.3564602136611938, + "rewards/rejected": -5.992081165313721, + "step": 956 + }, + { + "epoch": 0.6250306147440607, + "grad_norm": 25.477482249295566, + "learning_rate": 7.417319311501879e-08, + "logits/chosen": -0.7736403942108154, + "logits/rejected": -0.7841091752052307, + "logps/chosen": -850.4437255859375, + "logps/rejected": -1102.8319091796875, + "loss": 0.4188, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.24493408203125, + "rewards/margins": 1.8479266166687012, + "rewards/rejected": -6.092860221862793, + "step": 957 + }, + { + "epoch": 0.6256837292840232, + "grad_norm": 22.907968350007085, + "learning_rate": 7.395285320515512e-08, + "logits/chosen": -0.8162537217140198, + "logits/rejected": -0.8792285919189453, + "logps/chosen": -913.6036987304688, + "logps/rejected": -1090.5911865234375, + "loss": 0.3582, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.4979424476623535, + "rewards/margins": 1.1419354677200317, + "rewards/rejected": -5.639878273010254, + "step": 958 + }, + { + "epoch": 0.6263368438239856, + "grad_norm": 31.225301502154736, + "learning_rate": 7.373264887410656e-08, + "logits/chosen": -0.6841857433319092, + "logits/rejected": -0.7074832320213318, + "logps/chosen": -835.9268798828125, + "logps/rejected": -1149.3931884765625, + "loss": 0.3721, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.8488359451293945, + "rewards/margins": 2.7709360122680664, + "rewards/rejected": -6.619771957397461, + "step": 959 + }, + { + "epoch": 0.6269899583639481, + "grad_norm": 25.565009032820456, + "learning_rate": 7.351258126806555e-08, + "logits/chosen": -0.7442363500595093, + "logits/rejected": -0.754153847694397, + "logps/chosen": -1003.4380493164062, + "logps/rejected": -1236.912841796875, + "loss": 0.3945, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.113544940948486, + "rewards/margins": 2.3746109008789062, + "rewards/rejected": -7.488155364990234, + "step": 960 + }, + { + "epoch": 0.6276430729039105, + "grad_norm": 20.213696097403353, + "learning_rate": 7.329265153251284e-08, + "logits/chosen": -0.9148849844932556, + "logits/rejected": -0.8240453600883484, + "logps/chosen": -995.921630859375, + "logps/rejected": -1149.3841552734375, + "loss": 0.3667, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.002110481262207, + "rewards/margins": 1.7089855670928955, + "rewards/rejected": -6.711096286773682, + "step": 961 + }, + { + "epoch": 0.6282961874438729, + "grad_norm": 21.830265430877628, + "learning_rate": 7.307286081221153e-08, + "logits/chosen": -0.8627299070358276, + "logits/rejected": -0.879044771194458, + "logps/chosen": -893.7263793945312, + "logps/rejected": -1017.5081787109375, + "loss": 0.3958, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.613065242767334, + "rewards/margins": 1.1169685125350952, + "rewards/rejected": -5.730034351348877, + "step": 962 + }, + { + "epoch": 0.6289493019838354, + "grad_norm": 21.34761586639017, + "learning_rate": 7.285321025120116e-08, + "logits/chosen": -1.028733253479004, + "logits/rejected": -0.9963964223861694, + "logps/chosen": -1049.7506103515625, + "logps/rejected": -1225.5938720703125, + "loss": 0.412, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.158174514770508, + "rewards/margins": 1.789759635925293, + "rewards/rejected": -6.947933673858643, + "step": 963 + }, + { + "epoch": 0.6296024165237979, + "grad_norm": 25.298717035276482, + "learning_rate": 7.263370099279171e-08, + "logits/chosen": -0.9898463487625122, + "logits/rejected": -0.9164149761199951, + "logps/chosen": -857.2523193359375, + "logps/rejected": -1091.1043701171875, + "loss": 0.4432, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.275208473205566, + "rewards/margins": 2.2709662914276123, + "rewards/rejected": -6.546175956726074, + "step": 964 + }, + { + "epoch": 0.6302555310637603, + "grad_norm": 21.913629938148368, + "learning_rate": 7.241433417955764e-08, + "logits/chosen": -0.8013566732406616, + "logits/rejected": -0.7411233186721802, + "logps/chosen": -979.8931884765625, + "logps/rejected": -1252.721923828125, + "loss": 0.3631, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.143740653991699, + "rewards/margins": 2.6635663509368896, + "rewards/rejected": -7.807306289672852, + "step": 965 + }, + { + "epoch": 0.6309086456037227, + "grad_norm": 23.61954289841313, + "learning_rate": 7.219511095333199e-08, + "logits/chosen": -0.9983224272727966, + "logits/rejected": -0.8240174651145935, + "logps/chosen": -971.7218017578125, + "logps/rejected": -1114.72509765625, + "loss": 0.4529, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.7123494148254395, + "rewards/margins": 1.8094121217727661, + "rewards/rejected": -6.521760940551758, + "step": 966 + }, + { + "epoch": 0.6315617601436851, + "grad_norm": 23.349877754997273, + "learning_rate": 7.197603245520041e-08, + "logits/chosen": -0.6674496531486511, + "logits/rejected": -0.654395341873169, + "logps/chosen": -986.275146484375, + "logps/rejected": -1218.1419677734375, + "loss": 0.4591, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.584812641143799, + "rewards/margins": 2.1112468242645264, + "rewards/rejected": -7.69605827331543, + "step": 967 + }, + { + "epoch": 0.6322148746836477, + "grad_norm": 19.46883241188175, + "learning_rate": 7.175709982549524e-08, + "logits/chosen": -0.8957158327102661, + "logits/rejected": -0.8326559066772461, + "logps/chosen": -923.7268676757812, + "logps/rejected": -1083.7965087890625, + "loss": 0.3943, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.729306221008301, + "rewards/margins": 1.6854844093322754, + "rewards/rejected": -6.414790630340576, + "step": 968 + }, + { + "epoch": 0.6328679892236101, + "grad_norm": 22.593182793628216, + "learning_rate": 7.153831420378949e-08, + "logits/chosen": -0.9131090044975281, + "logits/rejected": -0.8579813241958618, + "logps/chosen": -1036.234130859375, + "logps/rejected": -1180.869140625, + "loss": 0.3677, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.513719081878662, + "rewards/margins": 2.031284809112549, + "rewards/rejected": -6.545003414154053, + "step": 969 + }, + { + "epoch": 0.6335211037635725, + "grad_norm": 30.292860588793435, + "learning_rate": 7.1319676728891e-08, + "logits/chosen": -0.9997818470001221, + "logits/rejected": -0.9115554094314575, + "logps/chosen": -881.9102783203125, + "logps/rejected": -1037.227783203125, + "loss": 0.455, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.3664350509643555, + "rewards/margins": 1.9524322748184204, + "rewards/rejected": -6.3188676834106445, + "step": 970 + }, + { + "epoch": 0.6341742183035349, + "grad_norm": 25.7144274108187, + "learning_rate": 7.110118853883653e-08, + "logits/chosen": -0.9433892965316772, + "logits/rejected": -0.878379225730896, + "logps/chosen": -957.4013061523438, + "logps/rejected": -1089.672607421875, + "loss": 0.4026, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.62063455581665, + "rewards/margins": 1.3547346591949463, + "rewards/rejected": -5.975369453430176, + "step": 971 + }, + { + "epoch": 0.6348273328434975, + "grad_norm": 14.06727763907239, + "learning_rate": 7.088285077088576e-08, + "logits/chosen": -0.7802984714508057, + "logits/rejected": -0.7867769598960876, + "logps/chosen": -916.9561767578125, + "logps/rejected": -1012.1802978515625, + "loss": 0.3995, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.424203395843506, + "rewards/margins": 1.1160019636154175, + "rewards/rejected": -5.540205478668213, + "step": 972 + }, + { + "epoch": 0.6354804473834599, + "grad_norm": 17.405294616197406, + "learning_rate": 7.06646645615154e-08, + "logits/chosen": -0.8531831502914429, + "logits/rejected": -0.7872010469436646, + "logps/chosen": -914.7217407226562, + "logps/rejected": -1074.9007568359375, + "loss": 0.3495, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.552631855010986, + "rewards/margins": 1.3901065587997437, + "rewards/rejected": -5.942738056182861, + "step": 973 + }, + { + "epoch": 0.6361335619234223, + "grad_norm": 44.55104973891953, + "learning_rate": 7.044663104641331e-08, + "logits/chosen": -0.7868108153343201, + "logits/rejected": -0.807988166809082, + "logps/chosen": -941.859375, + "logps/rejected": -1273.0975341796875, + "loss": 0.3725, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.062593936920166, + "rewards/margins": 2.6936287879943848, + "rewards/rejected": -6.756222724914551, + "step": 974 + }, + { + "epoch": 0.6367866764633847, + "grad_norm": 14.671664046221297, + "learning_rate": 7.02287513604725e-08, + "logits/chosen": -0.7551882863044739, + "logits/rejected": -0.7513316869735718, + "logps/chosen": -807.0482177734375, + "logps/rejected": -928.9945068359375, + "loss": 0.4005, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.388478755950928, + "rewards/margins": 1.168333649635315, + "rewards/rejected": -5.556812286376953, + "step": 975 + }, + { + "epoch": 0.6374397910033472, + "grad_norm": 20.1405977104016, + "learning_rate": 7.001102663778532e-08, + "logits/chosen": -0.7532812356948853, + "logits/rejected": -0.7765889167785645, + "logps/chosen": -839.454345703125, + "logps/rejected": -991.3602294921875, + "loss": 0.3791, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9657483100891113, + "rewards/margins": 1.6194977760314941, + "rewards/rejected": -5.585245609283447, + "step": 976 + }, + { + "epoch": 0.6380929055433097, + "grad_norm": 16.410114359901613, + "learning_rate": 6.979345801163751e-08, + "logits/chosen": -0.7125265002250671, + "logits/rejected": -0.7120476365089417, + "logps/chosen": -922.1001586914062, + "logps/rejected": -1161.8427734375, + "loss": 0.2998, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.680972576141357, + "rewards/margins": 1.9333194494247437, + "rewards/rejected": -6.614292144775391, + "step": 977 + }, + { + "epoch": 0.6387460200832721, + "grad_norm": 18.172957147197614, + "learning_rate": 6.957604661450228e-08, + "logits/chosen": -1.0247219800949097, + "logits/rejected": -0.9776769280433655, + "logps/chosen": -901.0526123046875, + "logps/rejected": -1040.6275634765625, + "loss": 0.3487, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.152916431427002, + "rewards/margins": 1.5070492029190063, + "rewards/rejected": -5.659965991973877, + "step": 978 + }, + { + "epoch": 0.6393991346232345, + "grad_norm": 19.453149679054654, + "learning_rate": 6.935879357803451e-08, + "logits/chosen": -0.8311464786529541, + "logits/rejected": -0.8162705898284912, + "logps/chosen": -808.68359375, + "logps/rejected": -955.904541015625, + "loss": 0.4156, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.762153148651123, + "rewards/margins": 1.4650424718856812, + "rewards/rejected": -5.227195739746094, + "step": 979 + }, + { + "epoch": 0.640052249163197, + "grad_norm": 18.263500694059907, + "learning_rate": 6.914170003306476e-08, + "logits/chosen": -1.0470964908599854, + "logits/rejected": -0.9374945163726807, + "logps/chosen": -939.5408935546875, + "logps/rejected": -1130.5428466796875, + "loss": 0.3755, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.667804718017578, + "rewards/margins": 2.05245041847229, + "rewards/rejected": -6.720254898071289, + "step": 980 + }, + { + "epoch": 0.6407053637031594, + "grad_norm": 20.87710669272072, + "learning_rate": 6.892476710959334e-08, + "logits/chosen": -0.7399060726165771, + "logits/rejected": -0.7495934367179871, + "logps/chosen": -881.5730590820312, + "logps/rejected": -1082.3707275390625, + "loss": 0.3691, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.176002502441406, + "rewards/margins": 2.2363686561584473, + "rewards/rejected": -6.412371635437012, + "step": 981 + }, + { + "epoch": 0.6413584782431219, + "grad_norm": 23.889366608981074, + "learning_rate": 6.870799593678458e-08, + "logits/chosen": -0.8247918486595154, + "logits/rejected": -0.8469498157501221, + "logps/chosen": -935.6553344726562, + "logps/rejected": -1030.1029052734375, + "loss": 0.3968, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.512282848358154, + "rewards/margins": 1.1883516311645508, + "rewards/rejected": -5.700634956359863, + "step": 982 + }, + { + "epoch": 0.6420115927830843, + "grad_norm": 23.550902347456475, + "learning_rate": 6.849138764296087e-08, + "logits/chosen": -0.8264920711517334, + "logits/rejected": -0.8020291328430176, + "logps/chosen": -953.135498046875, + "logps/rejected": -1134.365966796875, + "loss": 0.4937, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.6887664794921875, + "rewards/margins": 1.7499239444732666, + "rewards/rejected": -6.438691139221191, + "step": 983 + }, + { + "epoch": 0.6426647073230468, + "grad_norm": 22.7648671670516, + "learning_rate": 6.827494335559682e-08, + "logits/chosen": -0.9459264278411865, + "logits/rejected": -0.9424221515655518, + "logps/chosen": -923.2305297851562, + "logps/rejected": -1065.76806640625, + "loss": 0.4346, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.051098346710205, + "rewards/margins": 1.6530250310897827, + "rewards/rejected": -6.704123497009277, + "step": 984 + }, + { + "epoch": 0.6433178218630092, + "grad_norm": 18.62264187081726, + "learning_rate": 6.805866420131329e-08, + "logits/chosen": -0.9513512253761292, + "logits/rejected": -0.9166187047958374, + "logps/chosen": -910.2726440429688, + "logps/rejected": -1088.285888671875, + "loss": 0.4295, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.5494465827941895, + "rewards/margins": 1.8071138858795166, + "rewards/rejected": -6.356560230255127, + "step": 985 + }, + { + "epoch": 0.6439709364029716, + "grad_norm": 32.6686778226223, + "learning_rate": 6.784255130587166e-08, + "logits/chosen": -0.5947114825248718, + "logits/rejected": -0.578256368637085, + "logps/chosen": -806.2866821289062, + "logps/rejected": -951.0847778320312, + "loss": 0.4845, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.234209060668945, + "rewards/margins": 1.5783112049102783, + "rewards/rejected": -5.8125200271606445, + "step": 986 + }, + { + "epoch": 0.6446240509429341, + "grad_norm": 22.385129535074565, + "learning_rate": 6.76266057941679e-08, + "logits/chosen": -0.8183972239494324, + "logits/rejected": -0.7740908265113831, + "logps/chosen": -880.7390747070312, + "logps/rejected": -1058.571044921875, + "loss": 0.3731, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.1473612785339355, + "rewards/margins": 1.603851079940796, + "rewards/rejected": -5.751212120056152, + "step": 987 + }, + { + "epoch": 0.6452771654828966, + "grad_norm": 17.849948544570488, + "learning_rate": 6.74108287902267e-08, + "logits/chosen": -0.9178858995437622, + "logits/rejected": -0.8500401377677917, + "logps/chosen": -923.59228515625, + "logps/rejected": -1101.4404296875, + "loss": 0.4397, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.897286415100098, + "rewards/margins": 1.5341837406158447, + "rewards/rejected": -6.431469917297363, + "step": 988 + }, + { + "epoch": 0.645930280022859, + "grad_norm": 20.377859242404433, + "learning_rate": 6.71952214171957e-08, + "logits/chosen": -0.6903858780860901, + "logits/rejected": -0.6488080620765686, + "logps/chosen": -924.1713256835938, + "logps/rejected": -1020.035888671875, + "loss": 0.3853, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.061565399169922, + "rewards/margins": 1.2257417440414429, + "rewards/rejected": -6.287306785583496, + "step": 989 + }, + { + "epoch": 0.6465833945628214, + "grad_norm": 21.58412096751822, + "learning_rate": 6.697978479733951e-08, + "logits/chosen": -0.9614883661270142, + "logits/rejected": -0.9037103056907654, + "logps/chosen": -965.2286987304688, + "logps/rejected": -1166.6185302734375, + "loss": 0.3806, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.743613243103027, + "rewards/margins": 1.9612386226654053, + "rewards/rejected": -6.704852104187012, + "step": 990 + }, + { + "epoch": 0.6472365091027839, + "grad_norm": 23.814036679350696, + "learning_rate": 6.676452005203405e-08, + "logits/chosen": -0.7713127136230469, + "logits/rejected": -0.9046941995620728, + "logps/chosen": -868.4608154296875, + "logps/rejected": -1109.4259033203125, + "loss": 0.3486, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.386026382446289, + "rewards/margins": 1.9416440725326538, + "rewards/rejected": -6.327670097351074, + "step": 991 + }, + { + "epoch": 0.6478896236427464, + "grad_norm": 15.283217602370875, + "learning_rate": 6.654942830176046e-08, + "logits/chosen": -0.8953443169593811, + "logits/rejected": -0.8187096118927002, + "logps/chosen": -857.76708984375, + "logps/rejected": -1050.30029296875, + "loss": 0.3335, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9873547554016113, + "rewards/margins": 1.8355342149734497, + "rewards/rejected": -5.82288932800293, + "step": 992 + }, + { + "epoch": 0.6485427381827088, + "grad_norm": 24.833610729179064, + "learning_rate": 6.63345106660996e-08, + "logits/chosen": -0.8883509039878845, + "logits/rejected": -0.7819935083389282, + "logps/chosen": -1011.8300170898438, + "logps/rejected": -1155.8167724609375, + "loss": 0.4468, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.484269618988037, + "rewards/margins": 1.5660336017608643, + "rewards/rejected": -7.0503034591674805, + "step": 993 + }, + { + "epoch": 0.6491958527226712, + "grad_norm": 18.970078980831484, + "learning_rate": 6.61197682637259e-08, + "logits/chosen": -0.9100611209869385, + "logits/rejected": -0.9906662702560425, + "logps/chosen": -935.2700805664062, + "logps/rejected": -1161.526123046875, + "loss": 0.3892, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.428016662597656, + "rewards/margins": 1.8627903461456299, + "rewards/rejected": -6.290807247161865, + "step": 994 + }, + { + "epoch": 0.6498489672626336, + "grad_norm": 28.497491815155342, + "learning_rate": 6.590520221240173e-08, + "logits/chosen": -0.8032527565956116, + "logits/rejected": -0.755669116973877, + "logps/chosen": -940.073974609375, + "logps/rejected": -1132.3983154296875, + "loss": 0.4247, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.985255241394043, + "rewards/margins": 1.579092025756836, + "rewards/rejected": -6.564347267150879, + "step": 995 + }, + { + "epoch": 0.6505020818025962, + "grad_norm": 19.59715751506617, + "learning_rate": 6.569081362897154e-08, + "logits/chosen": -0.8742798566818237, + "logits/rejected": -0.8021313548088074, + "logps/chosen": -972.17919921875, + "logps/rejected": -1167.32373046875, + "loss": 0.4422, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.003599166870117, + "rewards/margins": 1.8019216060638428, + "rewards/rejected": -6.805520534515381, + "step": 996 + }, + { + "epoch": 0.6511551963425586, + "grad_norm": 19.152498975684075, + "learning_rate": 6.547660362935602e-08, + "logits/chosen": -0.8598069548606873, + "logits/rejected": -0.8882841467857361, + "logps/chosen": -964.857666015625, + "logps/rejected": -1174.9244384765625, + "loss": 0.4068, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.544558048248291, + "rewards/margins": 1.9304155111312866, + "rewards/rejected": -6.474973678588867, + "step": 997 + }, + { + "epoch": 0.651808310882521, + "grad_norm": 20.733434029636296, + "learning_rate": 6.526257332854631e-08, + "logits/chosen": -0.815421462059021, + "logits/rejected": -0.817630410194397, + "logps/chosen": -898.12744140625, + "logps/rejected": -1068.466796875, + "loss": 0.4547, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.393612861633301, + "rewards/margins": 1.5290162563323975, + "rewards/rejected": -5.922628879547119, + "step": 998 + }, + { + "epoch": 0.6524614254224834, + "grad_norm": 39.67458708618184, + "learning_rate": 6.504872384059821e-08, + "logits/chosen": -0.986212968826294, + "logits/rejected": -0.7506334781646729, + "logps/chosen": -941.7611083984375, + "logps/rejected": -980.0782470703125, + "loss": 0.5037, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.185219764709473, + "rewards/margins": 1.1642541885375977, + "rewards/rejected": -5.34947395324707, + "step": 999 + }, + { + "epoch": 0.653114539962446, + "grad_norm": 28.590369004082522, + "learning_rate": 6.483505627862632e-08, + "logits/chosen": -0.7975764870643616, + "logits/rejected": -0.8433130383491516, + "logps/chosen": -905.9865112304688, + "logps/rejected": -1150.0699462890625, + "loss": 0.3935, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.6011528968811035, + "rewards/margins": 2.136965274810791, + "rewards/rejected": -6.7381181716918945, + "step": 1000 + }, + { + "epoch": 0.653114539962446, + "eval_logits/chosen": -0.6597719192504883, + "eval_logits/rejected": -0.6015712022781372, + "eval_logps/chosen": -916.9378662109375, + "eval_logps/rejected": -1058.8646240234375, + "eval_loss": 0.39993926882743835, + "eval_rewards/accuracies": 0.8109999895095825, + "eval_rewards/chosen": -4.430306911468506, + "eval_rewards/margins": 1.506652593612671, + "eval_rewards/rejected": -5.936959743499756, + "eval_runtime": 611.597, + "eval_samples_per_second": 6.54, + "eval_steps_per_second": 0.409, + "step": 1000 + }, + { + "epoch": 0.6537676545024084, + "grad_norm": 18.88782766889396, + "learning_rate": 6.462157175479834e-08, + "logits/chosen": -0.7251522541046143, + "logits/rejected": -0.7726633548736572, + "logps/chosen": -888.7282104492188, + "logps/rejected": -1134.8662109375, + "loss": 0.4294, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.475460052490234, + "rewards/margins": 2.2871503829956055, + "rewards/rejected": -6.76261043548584, + "step": 1001 + }, + { + "epoch": 0.6544207690423708, + "grad_norm": 17.103388735372086, + "learning_rate": 6.440827138032926e-08, + "logits/chosen": -0.8073115944862366, + "logits/rejected": -0.8252954483032227, + "logps/chosen": -954.5016479492188, + "logps/rejected": -1213.746826171875, + "loss": 0.3381, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.250032424926758, + "rewards/margins": 2.545926809310913, + "rewards/rejected": -6.79595947265625, + "step": 1002 + }, + { + "epoch": 0.6550738835823332, + "grad_norm": 20.252892967215743, + "learning_rate": 6.419515626547542e-08, + "logits/chosen": -0.9170666337013245, + "logits/rejected": -0.830635130405426, + "logps/chosen": -868.954833984375, + "logps/rejected": -1118.8985595703125, + "loss": 0.3554, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.489101886749268, + "rewards/margins": 2.7545058727264404, + "rewards/rejected": -7.243607997894287, + "step": 1003 + }, + { + "epoch": 0.6557269981222957, + "grad_norm": 21.94769150248843, + "learning_rate": 6.398222751952898e-08, + "logits/chosen": -0.97673499584198, + "logits/rejected": -0.8297156691551208, + "logps/chosen": -961.0098876953125, + "logps/rejected": -1063.3936767578125, + "loss": 0.4405, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.749734878540039, + "rewards/margins": 1.4654510021209717, + "rewards/rejected": -6.215185642242432, + "step": 1004 + }, + { + "epoch": 0.6563801126622582, + "grad_norm": 17.236789737719608, + "learning_rate": 6.376948625081197e-08, + "logits/chosen": -0.9749749302864075, + "logits/rejected": -0.8019654750823975, + "logps/chosen": -841.8432006835938, + "logps/rejected": -1004.3468017578125, + "loss": 0.3637, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.680497646331787, + "rewards/margins": 2.00964093208313, + "rewards/rejected": -5.69013786315918, + "step": 1005 + }, + { + "epoch": 0.6570332272022206, + "grad_norm": 17.819653232599666, + "learning_rate": 6.355693356667064e-08, + "logits/chosen": -0.9714920520782471, + "logits/rejected": -0.8096312284469604, + "logps/chosen": -877.8084106445312, + "logps/rejected": -976.0759887695312, + "loss": 0.3838, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.433545112609863, + "rewards/margins": 1.3440808057785034, + "rewards/rejected": -5.777626037597656, + "step": 1006 + }, + { + "epoch": 0.657686341742183, + "grad_norm": 21.829219375237, + "learning_rate": 6.334457057346955e-08, + "logits/chosen": -1.0377883911132812, + "logits/rejected": -0.8932283520698547, + "logps/chosen": -975.3641357421875, + "logps/rejected": -1123.2431640625, + "loss": 0.4111, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.581691741943359, + "rewards/margins": 2.190666437149048, + "rewards/rejected": -6.772358417510986, + "step": 1007 + }, + { + "epoch": 0.6583394562821455, + "grad_norm": 25.870601002660127, + "learning_rate": 6.313239837658595e-08, + "logits/chosen": -0.794097900390625, + "logits/rejected": -0.7714889645576477, + "logps/chosen": -843.1420288085938, + "logps/rejected": -1065.3768310546875, + "loss": 0.3803, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.312726020812988, + "rewards/margins": 1.87660813331604, + "rewards/rejected": -6.189334392547607, + "step": 1008 + }, + { + "epoch": 0.6589925708221079, + "grad_norm": 18.443080252151734, + "learning_rate": 6.292041808040392e-08, + "logits/chosen": -0.8890469074249268, + "logits/rejected": -0.8851238489151001, + "logps/chosen": -943.687255859375, + "logps/rejected": -1115.829345703125, + "loss": 0.3903, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.130290985107422, + "rewards/margins": 1.4789390563964844, + "rewards/rejected": -5.609230041503906, + "step": 1009 + }, + { + "epoch": 0.6596456853620704, + "grad_norm": 18.127254021458366, + "learning_rate": 6.270863078830875e-08, + "logits/chosen": -0.8616389036178589, + "logits/rejected": -0.8308084607124329, + "logps/chosen": -1043.859619140625, + "logps/rejected": -1196.02734375, + "loss": 0.3193, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.978635787963867, + "rewards/margins": 1.3679126501083374, + "rewards/rejected": -6.346548080444336, + "step": 1010 + }, + { + "epoch": 0.6602987999020328, + "grad_norm": 15.263520577363048, + "learning_rate": 6.249703760268102e-08, + "logits/chosen": -0.8907433748245239, + "logits/rejected": -0.7941842079162598, + "logps/chosen": -902.1382446289062, + "logps/rejected": -1041.628173828125, + "loss": 0.4291, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.760441303253174, + "rewards/margins": 1.240327000617981, + "rewards/rejected": -6.000767707824707, + "step": 1011 + }, + { + "epoch": 0.6609519144419953, + "grad_norm": 16.88956803814998, + "learning_rate": 6.228563962489105e-08, + "logits/chosen": -0.7089736461639404, + "logits/rejected": -0.8286032676696777, + "logps/chosen": -887.5546875, + "logps/rejected": -1206.897705078125, + "loss": 0.3464, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.500287055969238, + "rewards/margins": 2.4705374240875244, + "rewards/rejected": -6.970824241638184, + "step": 1012 + }, + { + "epoch": 0.6616050289819577, + "grad_norm": 20.507417708250284, + "learning_rate": 6.207443795529302e-08, + "logits/chosen": -0.9035853147506714, + "logits/rejected": -0.8546502590179443, + "logps/chosen": -930.5029296875, + "logps/rejected": -1015.2005004882812, + "loss": 0.3733, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.440670490264893, + "rewards/margins": 1.2674169540405273, + "rewards/rejected": -5.708087921142578, + "step": 1013 + }, + { + "epoch": 0.6622581435219201, + "grad_norm": 23.30558964260641, + "learning_rate": 6.186343369321936e-08, + "logits/chosen": -0.866624116897583, + "logits/rejected": -0.8739579916000366, + "logps/chosen": -961.4056396484375, + "logps/rejected": -1113.17529296875, + "loss": 0.413, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.258022308349609, + "rewards/margins": 1.6140351295471191, + "rewards/rejected": -5.8720574378967285, + "step": 1014 + }, + { + "epoch": 0.6629112580618826, + "grad_norm": 27.951902968353572, + "learning_rate": 6.165262793697485e-08, + "logits/chosen": -0.7021287083625793, + "logits/rejected": -0.7541577816009521, + "logps/chosen": -975.0057373046875, + "logps/rejected": -1180.75830078125, + "loss": 0.3428, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.649076461791992, + "rewards/margins": 1.4899348020553589, + "rewards/rejected": -6.139011383056641, + "step": 1015 + }, + { + "epoch": 0.6635643726018451, + "grad_norm": 17.65162459087189, + "learning_rate": 6.144202178383116e-08, + "logits/chosen": -0.8119497299194336, + "logits/rejected": -0.7789148688316345, + "logps/chosen": -1021.3302001953125, + "logps/rejected": -1172.5645751953125, + "loss": 0.3729, + "rewards/accuracies": 0.59375, + "rewards/chosen": -5.71751070022583, + "rewards/margins": 1.3930636644363403, + "rewards/rejected": -7.110573768615723, + "step": 1016 + }, + { + "epoch": 0.6642174871418075, + "grad_norm": 21.54542033666558, + "learning_rate": 6.12316163300209e-08, + "logits/chosen": -1.0835685729980469, + "logits/rejected": -0.9866104125976562, + "logps/chosen": -963.4041137695312, + "logps/rejected": -1078.0098876953125, + "loss": 0.387, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.319831371307373, + "rewards/margins": 1.408236026763916, + "rewards/rejected": -5.7280683517456055, + "step": 1017 + }, + { + "epoch": 0.6648706016817699, + "grad_norm": 28.278042534457644, + "learning_rate": 6.102141267073207e-08, + "logits/chosen": -0.7278181314468384, + "logits/rejected": -0.6452523469924927, + "logps/chosen": -1027.501708984375, + "logps/rejected": -1116.961181640625, + "loss": 0.4252, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.605171203613281, + "rewards/margins": 1.059931993484497, + "rewards/rejected": -6.665103435516357, + "step": 1018 + }, + { + "epoch": 0.6655237162217323, + "grad_norm": 21.588692051121384, + "learning_rate": 6.081141190010228e-08, + "logits/chosen": -0.9027915596961975, + "logits/rejected": -0.9593067169189453, + "logps/chosen": -929.5646362304688, + "logps/rejected": -1081.724365234375, + "loss": 0.3564, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.074166774749756, + "rewards/margins": 1.5641591548919678, + "rewards/rejected": -6.638326644897461, + "step": 1019 + }, + { + "epoch": 0.6661768307616949, + "grad_norm": 29.52749993277838, + "learning_rate": 6.06016151112131e-08, + "logits/chosen": -0.8214589953422546, + "logits/rejected": -0.7484840750694275, + "logps/chosen": -941.7957763671875, + "logps/rejected": -1127.7822265625, + "loss": 0.3624, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.040858745574951, + "rewards/margins": 1.8832162618637085, + "rewards/rejected": -6.924075126647949, + "step": 1020 + }, + { + "epoch": 0.6668299453016573, + "grad_norm": 35.80802512812045, + "learning_rate": 6.039202339608431e-08, + "logits/chosen": -0.9403936266899109, + "logits/rejected": -0.9254910349845886, + "logps/chosen": -966.054443359375, + "logps/rejected": -1181.616455078125, + "loss": 0.3783, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.613440036773682, + "rewards/margins": 1.987739086151123, + "rewards/rejected": -6.601179122924805, + "step": 1021 + }, + { + "epoch": 0.6674830598416197, + "grad_norm": 32.33080558442622, + "learning_rate": 6.01826378456683e-08, + "logits/chosen": -0.8391031622886658, + "logits/rejected": -0.6777211427688599, + "logps/chosen": -1084.9720458984375, + "logps/rejected": -1229.69140625, + "loss": 0.4813, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.708389759063721, + "rewards/margins": 2.226646661758423, + "rewards/rejected": -7.9350361824035645, + "step": 1022 + }, + { + "epoch": 0.6681361743815821, + "grad_norm": 26.323254452592845, + "learning_rate": 5.997345954984428e-08, + "logits/chosen": -0.8788707852363586, + "logits/rejected": -0.7895121574401855, + "logps/chosen": -980.4774169921875, + "logps/rejected": -1198.721435546875, + "loss": 0.4067, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.896998882293701, + "rewards/margins": 1.8806391954421997, + "rewards/rejected": -7.777637958526611, + "step": 1023 + }, + { + "epoch": 0.6687892889215447, + "grad_norm": 29.774523819635025, + "learning_rate": 5.976448959741274e-08, + "logits/chosen": -0.8680253624916077, + "logits/rejected": -0.8557955026626587, + "logps/chosen": -1033.0860595703125, + "logps/rejected": -1219.666748046875, + "loss": 0.4247, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.772690773010254, + "rewards/margins": 1.7076537609100342, + "rewards/rejected": -7.480345249176025, + "step": 1024 + }, + { + "epoch": 0.6694424034615071, + "grad_norm": 23.95120311809926, + "learning_rate": 5.95557290760897e-08, + "logits/chosen": -0.8246513605117798, + "logits/rejected": -0.7980694770812988, + "logps/chosen": -1030.458251953125, + "logps/rejected": -1224.46826171875, + "loss": 0.4175, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.804914474487305, + "rewards/margins": 1.9084160327911377, + "rewards/rejected": -7.71333122253418, + "step": 1025 + }, + { + "epoch": 0.6700955180014695, + "grad_norm": 15.030718046861146, + "learning_rate": 5.934717907250103e-08, + "logits/chosen": -0.9304713010787964, + "logits/rejected": -0.8270508050918579, + "logps/chosen": -1171.826416015625, + "logps/rejected": -1419.8758544921875, + "loss": 0.3204, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.460097312927246, + "rewards/margins": 2.5480339527130127, + "rewards/rejected": -9.00813102722168, + "step": 1026 + }, + { + "epoch": 0.6707486325414319, + "grad_norm": 38.89653299484151, + "learning_rate": 5.9138840672176845e-08, + "logits/chosen": -1.040481448173523, + "logits/rejected": -0.9256024360656738, + "logps/chosen": -1069.36767578125, + "logps/rejected": -1228.0535888671875, + "loss": 0.3857, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.9183573722839355, + "rewards/margins": 1.9786078929901123, + "rewards/rejected": -7.896965026855469, + "step": 1027 + }, + { + "epoch": 0.6714017470813944, + "grad_norm": 36.203980499152635, + "learning_rate": 5.893071495954587e-08, + "logits/chosen": -0.9186846017837524, + "logits/rejected": -0.6426994800567627, + "logps/chosen": -1027.7308349609375, + "logps/rejected": -1147.070068359375, + "loss": 0.4639, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.392332077026367, + "rewards/margins": 1.7910363674163818, + "rewards/rejected": -7.183367729187012, + "step": 1028 + }, + { + "epoch": 0.6720548616213569, + "grad_norm": 49.257481169429646, + "learning_rate": 5.87228030179297e-08, + "logits/chosen": -0.8563072681427002, + "logits/rejected": -0.8698632121086121, + "logps/chosen": -940.2181396484375, + "logps/rejected": -1112.7574462890625, + "loss": 0.5029, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.377842426300049, + "rewards/margins": 1.112206220626831, + "rewards/rejected": -6.490049362182617, + "step": 1029 + }, + { + "epoch": 0.6727079761613193, + "grad_norm": 27.724815910490978, + "learning_rate": 5.851510592953728e-08, + "logits/chosen": -0.705344557762146, + "logits/rejected": -0.6405034065246582, + "logps/chosen": -1008.1170654296875, + "logps/rejected": -1307.7806396484375, + "loss": 0.3799, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.663532257080078, + "rewards/margins": 2.7474920749664307, + "rewards/rejected": -8.41102409362793, + "step": 1030 + }, + { + "epoch": 0.6733610907012817, + "grad_norm": 30.54103333361302, + "learning_rate": 5.8307624775459194e-08, + "logits/chosen": -0.9086852073669434, + "logits/rejected": -0.7840572595596313, + "logps/chosen": -1092.9234619140625, + "logps/rejected": -1261.9825439453125, + "loss": 0.3802, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.860642433166504, + "rewards/margins": 1.9201451539993286, + "rewards/rejected": -7.780787467956543, + "step": 1031 + }, + { + "epoch": 0.6740142052412442, + "grad_norm": 22.99525416459896, + "learning_rate": 5.810036063566206e-08, + "logits/chosen": -0.9566446542739868, + "logits/rejected": -0.9745736718177795, + "logps/chosen": -1061.357421875, + "logps/rejected": -1276.2802734375, + "loss": 0.3843, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.4080634117126465, + "rewards/margins": 1.6003923416137695, + "rewards/rejected": -7.008456230163574, + "step": 1032 + }, + { + "epoch": 0.6746673197812066, + "grad_norm": 16.646591987887586, + "learning_rate": 5.78933145889829e-08, + "logits/chosen": -0.6335456371307373, + "logits/rejected": -0.603712260723114, + "logps/chosen": -911.3923950195312, + "logps/rejected": -1112.7017822265625, + "loss": 0.3671, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.9419169425964355, + "rewards/margins": 2.1146814823150635, + "rewards/rejected": -7.056598663330078, + "step": 1033 + }, + { + "epoch": 0.6753204343211691, + "grad_norm": 18.61320965234198, + "learning_rate": 5.768648771312354e-08, + "logits/chosen": -0.8663079738616943, + "logits/rejected": -0.9046114683151245, + "logps/chosen": -874.4501342773438, + "logps/rejected": -1093.3466796875, + "loss": 0.4778, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.530414581298828, + "rewards/margins": 1.9164561033248901, + "rewards/rejected": -6.44687032699585, + "step": 1034 + }, + { + "epoch": 0.6759735488611315, + "grad_norm": 32.866564141810386, + "learning_rate": 5.747988108464501e-08, + "logits/chosen": -0.9264869689941406, + "logits/rejected": -0.8600889444351196, + "logps/chosen": -914.56494140625, + "logps/rejected": -1075.855712890625, + "loss": 0.4244, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.675785541534424, + "rewards/margins": 1.764554500579834, + "rewards/rejected": -6.440340042114258, + "step": 1035 + }, + { + "epoch": 0.676626663401094, + "grad_norm": 22.030929440915205, + "learning_rate": 5.7273495778961934e-08, + "logits/chosen": -0.811077356338501, + "logits/rejected": -0.8401280641555786, + "logps/chosen": -895.1986083984375, + "logps/rejected": -1045.7843017578125, + "loss": 0.4521, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.402617931365967, + "rewards/margins": 1.4379081726074219, + "rewards/rejected": -5.8405256271362305, + "step": 1036 + }, + { + "epoch": 0.6772797779410564, + "grad_norm": 17.044116664966285, + "learning_rate": 5.706733287033681e-08, + "logits/chosen": -0.8876982927322388, + "logits/rejected": -0.9169207215309143, + "logps/chosen": -883.7059936523438, + "logps/rejected": -1003.1925048828125, + "loss": 0.3869, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.5824151039123535, + "rewards/margins": 1.4617775678634644, + "rewards/rejected": -6.044192790985107, + "step": 1037 + }, + { + "epoch": 0.6779328924810188, + "grad_norm": 15.396487747451959, + "learning_rate": 5.686139343187467e-08, + "logits/chosen": -0.9496694803237915, + "logits/rejected": -1.007110357284546, + "logps/chosen": -821.8455810546875, + "logps/rejected": -910.9765014648438, + "loss": 0.3915, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.7206950187683105, + "rewards/margins": 1.021874189376831, + "rewards/rejected": -4.742569446563721, + "step": 1038 + }, + { + "epoch": 0.6785860070209813, + "grad_norm": 15.439263523759438, + "learning_rate": 5.6655678535517296e-08, + "logits/chosen": -0.8559461236000061, + "logits/rejected": -0.8094202876091003, + "logps/chosen": -765.6228637695312, + "logps/rejected": -977.0548095703125, + "loss": 0.3976, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1760189533233643, + "rewards/margins": 1.9604696035385132, + "rewards/rejected": -5.136489391326904, + "step": 1039 + }, + { + "epoch": 0.6792391215609438, + "grad_norm": 23.113871047926832, + "learning_rate": 5.645018925203771e-08, + "logits/chosen": -0.8940149545669556, + "logits/rejected": -0.9034225940704346, + "logps/chosen": -877.113037109375, + "logps/rejected": -1016.8856201171875, + "loss": 0.3944, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.261010646820068, + "rewards/margins": 1.2613548040390015, + "rewards/rejected": -5.522365093231201, + "step": 1040 + }, + { + "epoch": 0.6798922361009062, + "grad_norm": 15.694033791253236, + "learning_rate": 5.6244926651034554e-08, + "logits/chosen": -0.9289844036102295, + "logits/rejected": -0.8611258268356323, + "logps/chosen": -899.3880004882812, + "logps/rejected": -1034.294677734375, + "loss": 0.4081, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.299962997436523, + "rewards/margins": 1.414807677268982, + "rewards/rejected": -5.714770793914795, + "step": 1041 + }, + { + "epoch": 0.6805453506408686, + "grad_norm": 20.104453338479633, + "learning_rate": 5.603989180092661e-08, + "logits/chosen": -1.038852334022522, + "logits/rejected": -1.0274477005004883, + "logps/chosen": -864.4334716796875, + "logps/rejected": -1042.127197265625, + "loss": 0.3334, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5643439292907715, + "rewards/margins": 2.006700277328491, + "rewards/rejected": -5.571044445037842, + "step": 1042 + }, + { + "epoch": 0.681198465180831, + "grad_norm": 20.284466958221444, + "learning_rate": 5.583508576894716e-08, + "logits/chosen": -0.9074689149856567, + "logits/rejected": -0.8942803144454956, + "logps/chosen": -788.8450927734375, + "logps/rejected": -977.1786499023438, + "loss": 0.4116, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.9018185138702393, + "rewards/margins": 1.7637240886688232, + "rewards/rejected": -5.6655426025390625, + "step": 1043 + }, + { + "epoch": 0.6818515797207936, + "grad_norm": 20.80758519782795, + "learning_rate": 5.563050962113844e-08, + "logits/chosen": -0.7185485363006592, + "logits/rejected": -0.6636132001876831, + "logps/chosen": -881.2183227539062, + "logps/rejected": -1114.7947998046875, + "loss": 0.3875, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.508395195007324, + "rewards/margins": 2.214874029159546, + "rewards/rejected": -6.723268985748291, + "step": 1044 + }, + { + "epoch": 0.682504694260756, + "grad_norm": 21.006280162797133, + "learning_rate": 5.542616442234618e-08, + "logits/chosen": -0.9581667184829712, + "logits/rejected": -0.9366329312324524, + "logps/chosen": -965.768798828125, + "logps/rejected": -1094.48095703125, + "loss": 0.3038, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.318354606628418, + "rewards/margins": 1.6102018356323242, + "rewards/rejected": -5.928555965423584, + "step": 1045 + }, + { + "epoch": 0.6831578088007184, + "grad_norm": 20.92621303297454, + "learning_rate": 5.522205123621389e-08, + "logits/chosen": -0.7880922555923462, + "logits/rejected": -0.8342847228050232, + "logps/chosen": -796.2772216796875, + "logps/rejected": -954.2313232421875, + "loss": 0.3693, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7319140434265137, + "rewards/margins": 1.4701519012451172, + "rewards/rejected": -5.202065467834473, + "step": 1046 + }, + { + "epoch": 0.6838109233406808, + "grad_norm": 28.646083147828463, + "learning_rate": 5.501817112517748e-08, + "logits/chosen": -0.8063758611679077, + "logits/rejected": -0.7389829158782959, + "logps/chosen": -827.5449829101562, + "logps/rejected": -965.8816528320312, + "loss": 0.3728, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.156528472900391, + "rewards/margins": 1.3732936382293701, + "rewards/rejected": -5.529821872711182, + "step": 1047 + }, + { + "epoch": 0.6844640378806434, + "grad_norm": 16.107664772286466, + "learning_rate": 5.4814525150459735e-08, + "logits/chosen": -0.798210859298706, + "logits/rejected": -0.7040055394172668, + "logps/chosen": -994.9788208007812, + "logps/rejected": -1129.2100830078125, + "loss": 0.4314, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.312658309936523, + "rewards/margins": 1.7292759418487549, + "rewards/rejected": -6.041934490203857, + "step": 1048 + }, + { + "epoch": 0.6851171524206058, + "grad_norm": 19.906158775570255, + "learning_rate": 5.461111437206456e-08, + "logits/chosen": -0.6621243953704834, + "logits/rejected": -0.7967365980148315, + "logps/chosen": -920.3370971679688, + "logps/rejected": -1080.441650390625, + "loss": 0.42, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.662235260009766, + "rewards/margins": 1.124133825302124, + "rewards/rejected": -5.786369800567627, + "step": 1049 + }, + { + "epoch": 0.6857702669605682, + "grad_norm": 14.893691672277162, + "learning_rate": 5.4407939848771764e-08, + "logits/chosen": -0.9783452749252319, + "logits/rejected": -0.9631463885307312, + "logps/chosen": -800.3297119140625, + "logps/rejected": -957.534912109375, + "loss": 0.3606, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.5368897914886475, + "rewards/margins": 1.668915867805481, + "rewards/rejected": -5.205805778503418, + "step": 1050 + }, + { + "epoch": 0.6864233815005306, + "grad_norm": 25.85461733379247, + "learning_rate": 5.4205002638131404e-08, + "logits/chosen": -0.9667469263076782, + "logits/rejected": -0.9314427971839905, + "logps/chosen": -910.9541625976562, + "logps/rejected": -1013.9326782226562, + "loss": 0.4132, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.142405986785889, + "rewards/margins": 0.9281677007675171, + "rewards/rejected": -5.070573329925537, + "step": 1051 + }, + { + "epoch": 0.6870764960404931, + "grad_norm": 16.778904431002125, + "learning_rate": 5.400230379645827e-08, + "logits/chosen": -0.8126803636550903, + "logits/rejected": -0.9782637357711792, + "logps/chosen": -929.3917236328125, + "logps/rejected": -1180.9573974609375, + "loss": 0.351, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.6177825927734375, + "rewards/margins": 1.6634180545806885, + "rewards/rejected": -6.281200408935547, + "step": 1052 + }, + { + "epoch": 0.6877296105804556, + "grad_norm": 20.764558520137363, + "learning_rate": 5.379984437882642e-08, + "logits/chosen": -0.7940772771835327, + "logits/rejected": -0.7913058400154114, + "logps/chosen": -897.7255249023438, + "logps/rejected": -1095.1759033203125, + "loss": 0.3669, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.532567977905273, + "rewards/margins": 1.7281697988510132, + "rewards/rejected": -6.260737419128418, + "step": 1053 + }, + { + "epoch": 0.688382725120418, + "grad_norm": 20.538231684797744, + "learning_rate": 5.3597625439063675e-08, + "logits/chosen": -0.9358534812927246, + "logits/rejected": -1.0889662504196167, + "logps/chosen": -925.3179321289062, + "logps/rejected": -1099.0260009765625, + "loss": 0.4167, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.8400797843933105, + "rewards/margins": 1.610809326171875, + "rewards/rejected": -5.450888633728027, + "step": 1054 + }, + { + "epoch": 0.6890358396603804, + "grad_norm": 31.707439136525775, + "learning_rate": 5.339564802974614e-08, + "logits/chosen": -0.8603663444519043, + "logits/rejected": -0.658951461315155, + "logps/chosen": -934.3653564453125, + "logps/rejected": -1055.4537353515625, + "loss": 0.4493, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.719765663146973, + "rewards/margins": 1.5893982648849487, + "rewards/rejected": -6.309163570404053, + "step": 1055 + }, + { + "epoch": 0.6896889542003429, + "grad_norm": 35.603302908406555, + "learning_rate": 5.319391320219271e-08, + "logits/chosen": -0.7260603308677673, + "logits/rejected": -0.5930569767951965, + "logps/chosen": -985.3331298828125, + "logps/rejected": -1116.142822265625, + "loss": 0.4828, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.90203332901001, + "rewards/margins": 1.783280372619629, + "rewards/rejected": -6.685314178466797, + "step": 1056 + }, + { + "epoch": 0.6903420687403053, + "grad_norm": 36.525210316554094, + "learning_rate": 5.2992422006459584e-08, + "logits/chosen": -1.0283126831054688, + "logits/rejected": -0.9706616401672363, + "logps/chosen": -881.817626953125, + "logps/rejected": -996.3411865234375, + "loss": 0.3508, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.343301296234131, + "rewards/margins": 0.9503412246704102, + "rewards/rejected": -5.293642997741699, + "step": 1057 + }, + { + "epoch": 0.6909951832802678, + "grad_norm": 16.444240118695234, + "learning_rate": 5.279117549133494e-08, + "logits/chosen": -0.7937179207801819, + "logits/rejected": -0.7530346512794495, + "logps/chosen": -1088.676513671875, + "logps/rejected": -1272.829833984375, + "loss": 0.4141, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.84459114074707, + "rewards/margins": 1.847930669784546, + "rewards/rejected": -7.692521572113037, + "step": 1058 + }, + { + "epoch": 0.6916482978202302, + "grad_norm": 22.26048819934266, + "learning_rate": 5.259017470433328e-08, + "logits/chosen": -0.9292958378791809, + "logits/rejected": -0.8815614581108093, + "logps/chosen": -977.33447265625, + "logps/rejected": -1077.522705078125, + "loss": 0.3846, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.464293003082275, + "rewards/margins": 1.2737520933151245, + "rewards/rejected": -5.7380452156066895, + "step": 1059 + }, + { + "epoch": 0.6923014123601927, + "grad_norm": 22.035446979342556, + "learning_rate": 5.238942069168999e-08, + "logits/chosen": -1.0402249097824097, + "logits/rejected": -0.9985241889953613, + "logps/chosen": -899.611328125, + "logps/rejected": -1045.4075927734375, + "loss": 0.417, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.677881240844727, + "rewards/margins": 1.5670441389083862, + "rewards/rejected": -6.2449259757995605, + "step": 1060 + }, + { + "epoch": 0.6929545269001551, + "grad_norm": 16.41657358283337, + "learning_rate": 5.2188914498356074e-08, + "logits/chosen": -0.7909882068634033, + "logits/rejected": -0.652152955532074, + "logps/chosen": -1015.9618530273438, + "logps/rejected": -1141.8668212890625, + "loss": 0.3874, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.259662628173828, + "rewards/margins": 1.6021357774734497, + "rewards/rejected": -6.8617987632751465, + "step": 1061 + }, + { + "epoch": 0.6936076414401176, + "grad_norm": 18.329459349173682, + "learning_rate": 5.198865716799256e-08, + "logits/chosen": -0.9934824109077454, + "logits/rejected": -0.9579141139984131, + "logps/chosen": -945.3819580078125, + "logps/rejected": -1096.711181640625, + "loss": 0.3489, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.630897045135498, + "rewards/margins": 1.4307856559753418, + "rewards/rejected": -6.06168270111084, + "step": 1062 + }, + { + "epoch": 0.69426075598008, + "grad_norm": 18.277996368135458, + "learning_rate": 5.178864974296511e-08, + "logits/chosen": -0.9666036367416382, + "logits/rejected": -0.9266764521598816, + "logps/chosen": -993.549560546875, + "logps/rejected": -1246.1953125, + "loss": 0.3568, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.596602916717529, + "rewards/margins": 2.0984785556793213, + "rewards/rejected": -6.6950812339782715, + "step": 1063 + }, + { + "epoch": 0.6949138705200425, + "grad_norm": 17.971296792300368, + "learning_rate": 5.1588893264338616e-08, + "logits/chosen": -0.8004229068756104, + "logits/rejected": -0.85038822889328, + "logps/chosen": -1064.92822265625, + "logps/rejected": -1406.717529296875, + "loss": 0.3459, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.635952949523926, + "rewards/margins": 3.123544216156006, + "rewards/rejected": -8.759496688842773, + "step": 1064 + }, + { + "epoch": 0.6955669850600049, + "grad_norm": 26.220818179258686, + "learning_rate": 5.138938877187173e-08, + "logits/chosen": -0.8688936233520508, + "logits/rejected": -0.9654818773269653, + "logps/chosen": -949.654296875, + "logps/rejected": -1131.2215576171875, + "loss": 0.4364, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.012843132019043, + "rewards/margins": 1.3016622066497803, + "rewards/rejected": -6.314505577087402, + "step": 1065 + }, + { + "epoch": 0.6962200995999673, + "grad_norm": 19.263308241664543, + "learning_rate": 5.119013730401152e-08, + "logits/chosen": -0.9235569834709167, + "logits/rejected": -0.8715468049049377, + "logps/chosen": -957.46923828125, + "logps/rejected": -1096.870849609375, + "loss": 0.3926, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.0340352058410645, + "rewards/margins": 1.6012662649154663, + "rewards/rejected": -6.63530158996582, + "step": 1066 + }, + { + "epoch": 0.6968732141399298, + "grad_norm": 35.0608624519057, + "learning_rate": 5.099113989788799e-08, + "logits/chosen": -0.7219104766845703, + "logits/rejected": -0.7232096791267395, + "logps/chosen": -897.578125, + "logps/rejected": -1049.745361328125, + "loss": 0.438, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.6848464012146, + "rewards/margins": 1.5889393091201782, + "rewards/rejected": -6.273785591125488, + "step": 1067 + }, + { + "epoch": 0.6975263286798923, + "grad_norm": 46.493238733591774, + "learning_rate": 5.0792397589308754e-08, + "logits/chosen": -0.9190981984138489, + "logits/rejected": -0.9008411169052124, + "logps/chosen": -1021.0134887695312, + "logps/rejected": -1238.074462890625, + "loss": 0.4594, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.529772758483887, + "rewards/margins": 1.920540452003479, + "rewards/rejected": -7.450313568115234, + "step": 1068 + }, + { + "epoch": 0.6981794432198547, + "grad_norm": 34.874149040915995, + "learning_rate": 5.0593911412753574e-08, + "logits/chosen": -0.8482516407966614, + "logits/rejected": -0.814669132232666, + "logps/chosen": -878.93310546875, + "logps/rejected": -1069.1351318359375, + "loss": 0.4679, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.570670127868652, + "rewards/margins": 1.5518887042999268, + "rewards/rejected": -6.122559070587158, + "step": 1069 + }, + { + "epoch": 0.6988325577598171, + "grad_norm": 25.3481107361764, + "learning_rate": 5.0395682401369045e-08, + "logits/chosen": -1.0013236999511719, + "logits/rejected": -1.0127506256103516, + "logps/chosen": -976.4231567382812, + "logps/rejected": -1167.21728515625, + "loss": 0.4275, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.2549943923950195, + "rewards/margins": 1.9067177772521973, + "rewards/rejected": -7.161712169647217, + "step": 1070 + }, + { + "epoch": 0.6994856722997795, + "grad_norm": 24.53339942422348, + "learning_rate": 5.01977115869632e-08, + "logits/chosen": -0.9320468306541443, + "logits/rejected": -0.7867690920829773, + "logps/chosen": -929.0973510742188, + "logps/rejected": -1044.9697265625, + "loss": 0.3859, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.607649803161621, + "rewards/margins": 1.5433708429336548, + "rewards/rejected": -6.1510210037231445, + "step": 1071 + }, + { + "epoch": 0.7001387868397421, + "grad_norm": 19.657165610425114, + "learning_rate": 5.000000000000002e-08, + "logits/chosen": -0.7341365814208984, + "logits/rejected": -0.7753069996833801, + "logps/chosen": -980.4578857421875, + "logps/rejected": -1232.793212890625, + "loss": 0.3473, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.089752674102783, + "rewards/margins": 1.838250756263733, + "rewards/rejected": -6.928003311157227, + "step": 1072 + }, + { + "epoch": 0.7007919013797045, + "grad_norm": 16.536478826191836, + "learning_rate": 4.980254866959428e-08, + "logits/chosen": -0.9527970552444458, + "logits/rejected": -0.7555267810821533, + "logps/chosen": -1103.2843017578125, + "logps/rejected": -1188.142822265625, + "loss": 0.3595, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.621752738952637, + "rewards/margins": 1.7319358587265015, + "rewards/rejected": -7.353688716888428, + "step": 1073 + }, + { + "epoch": 0.7014450159196669, + "grad_norm": 30.635582185007305, + "learning_rate": 4.960535862350604e-08, + "logits/chosen": -0.8805766105651855, + "logits/rejected": -0.8499529957771301, + "logps/chosen": -936.8541870117188, + "logps/rejected": -1091.2286376953125, + "loss": 0.2982, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.690476417541504, + "rewards/margins": 1.805678367614746, + "rewards/rejected": -6.49615478515625, + "step": 1074 + }, + { + "epoch": 0.7020981304596293, + "grad_norm": 22.726614424437997, + "learning_rate": 4.9408430888135366e-08, + "logits/chosen": -0.9252241253852844, + "logits/rejected": -0.9034774303436279, + "logps/chosen": -1005.1307373046875, + "logps/rejected": -1098.4501953125, + "loss": 0.3193, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.690752983093262, + "rewards/margins": 1.0792409181594849, + "rewards/rejected": -6.769993305206299, + "step": 1075 + }, + { + "epoch": 0.7027512449995919, + "grad_norm": 16.997208594932292, + "learning_rate": 4.921176648851695e-08, + "logits/chosen": -0.9160603880882263, + "logits/rejected": -0.9556408524513245, + "logps/chosen": -1001.8563232421875, + "logps/rejected": -1210.08349609375, + "loss": 0.3174, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.473414421081543, + "rewards/margins": 1.7691338062286377, + "rewards/rejected": -7.242548942565918, + "step": 1076 + }, + { + "epoch": 0.7034043595395543, + "grad_norm": 30.06553357109662, + "learning_rate": 4.9015366448314776e-08, + "logits/chosen": -0.9377183318138123, + "logits/rejected": -0.8571749329566956, + "logps/chosen": -885.6331787109375, + "logps/rejected": -1006.0621337890625, + "loss": 0.3731, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.572412967681885, + "rewards/margins": 1.4350861310958862, + "rewards/rejected": -6.007499694824219, + "step": 1077 + }, + { + "epoch": 0.7040574740795167, + "grad_norm": 32.30787877247046, + "learning_rate": 4.8819231789816804e-08, + "logits/chosen": -0.9564331769943237, + "logits/rejected": -0.9491601586341858, + "logps/chosen": -867.03369140625, + "logps/rejected": -1031.533447265625, + "loss": 0.4263, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.6438212394714355, + "rewards/margins": 1.7188313007354736, + "rewards/rejected": -6.36265230178833, + "step": 1078 + }, + { + "epoch": 0.7047105886194791, + "grad_norm": 33.27109224503714, + "learning_rate": 4.8623363533929665e-08, + "logits/chosen": -0.7700978517532349, + "logits/rejected": -0.7294084429740906, + "logps/chosen": -965.6128540039062, + "logps/rejected": -1207.4700927734375, + "loss": 0.346, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.409863471984863, + "rewards/margins": 2.164041519165039, + "rewards/rejected": -7.573904991149902, + "step": 1079 + }, + { + "epoch": 0.7053637031594416, + "grad_norm": 16.688865139759347, + "learning_rate": 4.8427762700173315e-08, + "logits/chosen": -0.7151041030883789, + "logits/rejected": -0.7404736280441284, + "logps/chosen": -1033.2183837890625, + "logps/rejected": -1254.7646484375, + "loss": 0.3362, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.383447170257568, + "rewards/margins": 1.9840342998504639, + "rewards/rejected": -7.367480754852295, + "step": 1080 + }, + { + "epoch": 0.706016817699404, + "grad_norm": 29.119132374839726, + "learning_rate": 4.823243030667575e-08, + "logits/chosen": -0.7962474226951599, + "logits/rejected": -0.7569393515586853, + "logps/chosen": -971.79345703125, + "logps/rejected": -1106.267333984375, + "loss": 0.4776, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.968940258026123, + "rewards/margins": 1.46136474609375, + "rewards/rejected": -6.430305004119873, + "step": 1081 + }, + { + "epoch": 0.7066699322393665, + "grad_norm": 33.91675103172332, + "learning_rate": 4.8037367370167734e-08, + "logits/chosen": -0.9695914387702942, + "logits/rejected": -0.8926000595092773, + "logps/chosen": -945.17138671875, + "logps/rejected": -1108.794677734375, + "loss": 0.4413, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.946935176849365, + "rewards/margins": 1.6695194244384766, + "rewards/rejected": -6.616455078125, + "step": 1082 + }, + { + "epoch": 0.7073230467793289, + "grad_norm": 27.226190897262036, + "learning_rate": 4.784257490597735e-08, + "logits/chosen": -0.6901608109474182, + "logits/rejected": -0.7227190732955933, + "logps/chosen": -892.798583984375, + "logps/rejected": -1206.8834228515625, + "loss": 0.3787, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.823614120483398, + "rewards/margins": 2.6140191555023193, + "rewards/rejected": -7.4376325607299805, + "step": 1083 + }, + { + "epoch": 0.7079761613192914, + "grad_norm": 21.147890499910353, + "learning_rate": 4.7648053928024965e-08, + "logits/chosen": -0.8689476251602173, + "logits/rejected": -0.813915491104126, + "logps/chosen": -930.1356201171875, + "logps/rejected": -1091.7408447265625, + "loss": 0.3937, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.987719535827637, + "rewards/margins": 1.701695442199707, + "rewards/rejected": -6.689414978027344, + "step": 1084 + }, + { + "epoch": 0.7086292758592538, + "grad_norm": 29.385487598664348, + "learning_rate": 4.745380544881779e-08, + "logits/chosen": -0.8477723598480225, + "logits/rejected": -0.7657088041305542, + "logps/chosen": -959.1470947265625, + "logps/rejected": -1071.5858154296875, + "loss": 0.3979, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.0760931968688965, + "rewards/margins": 1.1108086109161377, + "rewards/rejected": -6.186902046203613, + "step": 1085 + }, + { + "epoch": 0.7092823903992163, + "grad_norm": 41.50920009598862, + "learning_rate": 4.725983047944461e-08, + "logits/chosen": -0.8235193490982056, + "logits/rejected": -0.8154042959213257, + "logps/chosen": -1094.9686279296875, + "logps/rejected": -1276.4725341796875, + "loss": 0.5138, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.995870590209961, + "rewards/margins": 1.772527813911438, + "rewards/rejected": -7.768398284912109, + "step": 1086 + }, + { + "epoch": 0.7099355049391787, + "grad_norm": 22.16450576801752, + "learning_rate": 4.7066130029570596e-08, + "logits/chosen": -0.7971264123916626, + "logits/rejected": -0.7773457765579224, + "logps/chosen": -930.9798583984375, + "logps/rejected": -1154.8284912109375, + "loss": 0.353, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.19879674911499, + "rewards/margins": 1.7664600610733032, + "rewards/rejected": -6.965257167816162, + "step": 1087 + }, + { + "epoch": 0.7105886194791412, + "grad_norm": 20.94228934785079, + "learning_rate": 4.6872705107431995e-08, + "logits/chosen": -0.9495987296104431, + "logits/rejected": -1.042331576347351, + "logps/chosen": -1047.2802734375, + "logps/rejected": -1344.6365966796875, + "loss": 0.3787, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.7100958824157715, + "rewards/margins": 2.1285243034362793, + "rewards/rejected": -7.838620185852051, + "step": 1088 + }, + { + "epoch": 0.7112417340191036, + "grad_norm": 23.544647901426156, + "learning_rate": 4.6679556719830895e-08, + "logits/chosen": -0.8243415355682373, + "logits/rejected": -0.9112780690193176, + "logps/chosen": -1005.364501953125, + "logps/rejected": -1222.390869140625, + "loss": 0.3271, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.869158744812012, + "rewards/margins": 1.9440407752990723, + "rewards/rejected": -6.813199996948242, + "step": 1089 + }, + { + "epoch": 0.711894848559066, + "grad_norm": 28.57617302768549, + "learning_rate": 4.648668587212997e-08, + "logits/chosen": -0.873595118522644, + "logits/rejected": -0.9013544917106628, + "logps/chosen": -987.3074340820312, + "logps/rejected": -1221.4285888671875, + "loss": 0.364, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.284542083740234, + "rewards/margins": 2.2790379524230957, + "rewards/rejected": -7.56358003616333, + "step": 1090 + }, + { + "epoch": 0.7125479630990285, + "grad_norm": 17.85569175076103, + "learning_rate": 4.6294093568247297e-08, + "logits/chosen": -0.7971256971359253, + "logits/rejected": -0.6644490361213684, + "logps/chosen": -1045.35986328125, + "logps/rejected": -1205.6650390625, + "loss": 0.3657, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.020387172698975, + "rewards/margins": 2.104153871536255, + "rewards/rejected": -7.124541282653809, + "step": 1091 + }, + { + "epoch": 0.713201077638991, + "grad_norm": 36.66717614889544, + "learning_rate": 4.6101780810651057e-08, + "logits/chosen": -0.9672690033912659, + "logits/rejected": -0.8716130256652832, + "logps/chosen": -986.6090087890625, + "logps/rejected": -1103.75, + "loss": 0.4264, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.203864097595215, + "rewards/margins": 1.3343470096588135, + "rewards/rejected": -6.538211345672607, + "step": 1092 + }, + { + "epoch": 0.7138541921789534, + "grad_norm": 18.874466250073453, + "learning_rate": 4.590974860035439e-08, + "logits/chosen": -0.9630517959594727, + "logits/rejected": -1.0401439666748047, + "logps/chosen": -955.3115844726562, + "logps/rejected": -1162.4873046875, + "loss": 0.3307, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.960440635681152, + "rewards/margins": 1.801946759223938, + "rewards/rejected": -6.762386798858643, + "step": 1093 + }, + { + "epoch": 0.7145073067189158, + "grad_norm": 17.887843223290304, + "learning_rate": 4.571799793691013e-08, + "logits/chosen": -0.7270591259002686, + "logits/rejected": -0.7815565466880798, + "logps/chosen": -855.4141235351562, + "logps/rejected": -1117.685791015625, + "loss": 0.404, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.313096523284912, + "rewards/margins": 2.240790605545044, + "rewards/rejected": -6.553886890411377, + "step": 1094 + }, + { + "epoch": 0.7151604212588782, + "grad_norm": 42.00786174992929, + "learning_rate": 4.5526529818405636e-08, + "logits/chosen": -0.8590465784072876, + "logits/rejected": -0.8318109512329102, + "logps/chosen": -1035.328369140625, + "logps/rejected": -1167.46728515625, + "loss": 0.4439, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.72844934463501, + "rewards/margins": 1.2250757217407227, + "rewards/rejected": -6.953525543212891, + "step": 1095 + }, + { + "epoch": 0.7158135357988408, + "grad_norm": 23.651213647631792, + "learning_rate": 4.533534524145756e-08, + "logits/chosen": -0.9591556787490845, + "logits/rejected": -0.7288451194763184, + "logps/chosen": -1023.7062377929688, + "logps/rejected": -1212.59521484375, + "loss": 0.3722, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.4393205642700195, + "rewards/margins": 2.401437520980835, + "rewards/rejected": -7.840758323669434, + "step": 1096 + }, + { + "epoch": 0.7164666503388032, + "grad_norm": 21.052436894226904, + "learning_rate": 4.514444520120669e-08, + "logits/chosen": -0.7670482397079468, + "logits/rejected": -0.7710490226745605, + "logps/chosen": -998.1491088867188, + "logps/rejected": -1230.4339599609375, + "loss": 0.3542, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.109296798706055, + "rewards/margins": 2.5655345916748047, + "rewards/rejected": -7.674831390380859, + "step": 1097 + }, + { + "epoch": 0.7171197648787656, + "grad_norm": 19.91030514123549, + "learning_rate": 4.495383069131281e-08, + "logits/chosen": -0.7524601221084595, + "logits/rejected": -0.7881897687911987, + "logps/chosen": -927.6683349609375, + "logps/rejected": -1151.753662109375, + "loss": 0.3819, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.98646354675293, + "rewards/margins": 2.0308237075805664, + "rewards/rejected": -7.01728630065918, + "step": 1098 + }, + { + "epoch": 0.717772879418728, + "grad_norm": 29.438295455629994, + "learning_rate": 4.476350270394942e-08, + "logits/chosen": -0.8855903148651123, + "logits/rejected": -0.8176090121269226, + "logps/chosen": -991.46923828125, + "logps/rejected": -1106.505859375, + "loss": 0.4597, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.334303379058838, + "rewards/margins": 1.3546168804168701, + "rewards/rejected": -6.688920021057129, + "step": 1099 + }, + { + "epoch": 0.7184259939586906, + "grad_norm": 21.8407247391087, + "learning_rate": 4.457346222979864e-08, + "logits/chosen": -1.0324180126190186, + "logits/rejected": -1.028664231300354, + "logps/chosen": -1082.05615234375, + "logps/rejected": -1251.3363037109375, + "loss": 0.3205, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.213923454284668, + "rewards/margins": 1.9834064245224, + "rewards/rejected": -7.197330474853516, + "step": 1100 + }, + { + "epoch": 0.7184259939586906, + "eval_logits/chosen": -0.6451797485351562, + "eval_logits/rejected": -0.5846331715583801, + "eval_logps/chosen": -992.7451782226562, + "eval_logps/rejected": -1153.4371337890625, + "eval_loss": 0.39495497941970825, + "eval_rewards/accuracies": 0.8009999990463257, + "eval_rewards/chosen": -5.188379764556885, + "eval_rewards/margins": 1.6943055391311646, + "eval_rewards/rejected": -6.882685661315918, + "eval_runtime": 620.3885, + "eval_samples_per_second": 6.448, + "eval_steps_per_second": 0.403, + "step": 1100 + }, + { + "epoch": 0.719079108498653, + "grad_norm": 32.440008459162485, + "learning_rate": 4.4383710258046095e-08, + "logits/chosen": -0.8906784057617188, + "logits/rejected": -0.8440839052200317, + "logps/chosen": -1033.5921630859375, + "logps/rejected": -1192.6195068359375, + "loss": 0.3947, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.000400066375732, + "rewards/margins": 2.114108085632324, + "rewards/rejected": -7.114508152008057, + "step": 1101 + }, + { + "epoch": 0.7197322230386154, + "grad_norm": 30.116923265962466, + "learning_rate": 4.419424777637565e-08, + "logits/chosen": -0.9643840193748474, + "logits/rejected": -0.8965681791305542, + "logps/chosen": -887.838134765625, + "logps/rejected": -980.8176879882812, + "loss": 0.4032, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.65021276473999, + "rewards/margins": 1.3411513566970825, + "rewards/rejected": -5.991364002227783, + "step": 1102 + }, + { + "epoch": 0.7203853375785778, + "grad_norm": 23.039154118625174, + "learning_rate": 4.4005075770964396e-08, + "logits/chosen": -1.019102931022644, + "logits/rejected": -0.9605964422225952, + "logps/chosen": -1037.569091796875, + "logps/rejected": -1107.9766845703125, + "loss": 0.4649, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.0233683586120605, + "rewards/margins": 1.0656507015228271, + "rewards/rejected": -6.089019298553467, + "step": 1103 + }, + { + "epoch": 0.7210384521185403, + "grad_norm": 52.674944572170844, + "learning_rate": 4.3816195226477425e-08, + "logits/chosen": -0.8158155679702759, + "logits/rejected": -0.7879456877708435, + "logps/chosen": -913.3277587890625, + "logps/rejected": -1127.154296875, + "loss": 0.3387, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.6683149337768555, + "rewards/margins": 2.0909457206726074, + "rewards/rejected": -6.759261608123779, + "step": 1104 + }, + { + "epoch": 0.7216915666585028, + "grad_norm": 31.95750334380501, + "learning_rate": 4.362760712606277e-08, + "logits/chosen": -0.7597179412841797, + "logits/rejected": -0.7276663184165955, + "logps/chosen": -917.2421875, + "logps/rejected": -1164.9512939453125, + "loss": 0.4093, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.858676433563232, + "rewards/margins": 2.1077828407287598, + "rewards/rejected": -6.96645975112915, + "step": 1105 + }, + { + "epoch": 0.7223446811984652, + "grad_norm": 18.80716307800814, + "learning_rate": 4.3439312451346154e-08, + "logits/chosen": -0.777508556842804, + "logits/rejected": -0.7970513105392456, + "logps/chosen": -883.0430297851562, + "logps/rejected": -1154.78076171875, + "loss": 0.4108, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.606140613555908, + "rewards/margins": 2.7335846424102783, + "rewards/rejected": -7.339725017547607, + "step": 1106 + }, + { + "epoch": 0.7229977957384276, + "grad_norm": 18.132490684953048, + "learning_rate": 4.32513121824261e-08, + "logits/chosen": -0.8307619094848633, + "logits/rejected": -0.7737395763397217, + "logps/chosen": -877.05126953125, + "logps/rejected": -935.8836669921875, + "loss": 0.4277, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.600595474243164, + "rewards/margins": 0.8932433128356934, + "rewards/rejected": -5.493838787078857, + "step": 1107 + }, + { + "epoch": 0.7236509102783901, + "grad_norm": 20.319116079697935, + "learning_rate": 4.306360729786866e-08, + "logits/chosen": -0.8983240127563477, + "logits/rejected": -0.7303913235664368, + "logps/chosen": -999.090576171875, + "logps/rejected": -1133.9527587890625, + "loss": 0.4685, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.099803924560547, + "rewards/margins": 1.5410734415054321, + "rewards/rejected": -6.640877723693848, + "step": 1108 + }, + { + "epoch": 0.7243040248183525, + "grad_norm": 17.22827875356533, + "learning_rate": 4.287619877470238e-08, + "logits/chosen": -0.6570420861244202, + "logits/rejected": -0.6857782006263733, + "logps/chosen": -927.8821411132812, + "logps/rejected": -1173.153564453125, + "loss": 0.3749, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.6236114501953125, + "rewards/margins": 2.370687484741211, + "rewards/rejected": -6.994299411773682, + "step": 1109 + }, + { + "epoch": 0.724957139358315, + "grad_norm": 24.744654710477878, + "learning_rate": 4.268908758841317e-08, + "logits/chosen": -0.769822359085083, + "logits/rejected": -0.7670592069625854, + "logps/chosen": -850.655517578125, + "logps/rejected": -1075.8778076171875, + "loss": 0.3146, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.2061848640441895, + "rewards/margins": 1.7134172916412354, + "rewards/rejected": -5.919601917266846, + "step": 1110 + }, + { + "epoch": 0.7256102538982774, + "grad_norm": 24.463939391961844, + "learning_rate": 4.250227471293935e-08, + "logits/chosen": -0.7590472102165222, + "logits/rejected": -0.6803931593894958, + "logps/chosen": -939.5840454101562, + "logps/rejected": -1133.0911865234375, + "loss": 0.3605, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.415617942810059, + "rewards/margins": 1.7700562477111816, + "rewards/rejected": -7.185673713684082, + "step": 1111 + }, + { + "epoch": 0.7262633684382399, + "grad_norm": 35.63302248021802, + "learning_rate": 4.2315761120666394e-08, + "logits/chosen": -0.8123034238815308, + "logits/rejected": -0.6775568723678589, + "logps/chosen": -924.438232421875, + "logps/rejected": -1116.736328125, + "loss": 0.4079, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.730605125427246, + "rewards/margins": 2.201202869415283, + "rewards/rejected": -6.931807994842529, + "step": 1112 + }, + { + "epoch": 0.7269164829782023, + "grad_norm": 21.952209080683218, + "learning_rate": 4.212954778242203e-08, + "logits/chosen": -0.9220980405807495, + "logits/rejected": -0.8390252590179443, + "logps/chosen": -993.52197265625, + "logps/rejected": -1138.182861328125, + "loss": 0.4338, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.041605472564697, + "rewards/margins": 1.724038004875183, + "rewards/rejected": -6.76564359664917, + "step": 1113 + }, + { + "epoch": 0.7275695975181647, + "grad_norm": 46.56626196193875, + "learning_rate": 4.194363566747109e-08, + "logits/chosen": -0.7909491062164307, + "logits/rejected": -0.7935473322868347, + "logps/chosen": -889.592041015625, + "logps/rejected": -1148.26025390625, + "loss": 0.4108, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.259602069854736, + "rewards/margins": 2.401700496673584, + "rewards/rejected": -6.6613030433654785, + "step": 1114 + }, + { + "epoch": 0.7282227120581272, + "grad_norm": 23.199530145531195, + "learning_rate": 4.175802574351052e-08, + "logits/chosen": -1.1460967063903809, + "logits/rejected": -1.1267650127410889, + "logps/chosen": -1016.3873291015625, + "logps/rejected": -1232.958984375, + "loss": 0.318, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.17551326751709, + "rewards/margins": 1.9491366147994995, + "rewards/rejected": -7.124650001525879, + "step": 1115 + }, + { + "epoch": 0.7288758265980897, + "grad_norm": 24.805300670034743, + "learning_rate": 4.1572718976664366e-08, + "logits/chosen": -0.7371432781219482, + "logits/rejected": -0.7844556570053101, + "logps/chosen": -948.8936767578125, + "logps/rejected": -1193.291259765625, + "loss": 0.3402, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.742420673370361, + "rewards/margins": 2.136198043823242, + "rewards/rejected": -6.8786187171936035, + "step": 1116 + }, + { + "epoch": 0.7295289411380521, + "grad_norm": 36.62281859138192, + "learning_rate": 4.1387716331478564e-08, + "logits/chosen": -0.7828527688980103, + "logits/rejected": -0.7460812330245972, + "logps/chosen": -1003.2020874023438, + "logps/rejected": -1216.4989013671875, + "loss": 0.3146, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.179636001586914, + "rewards/margins": 1.6944818496704102, + "rewards/rejected": -6.874117851257324, + "step": 1117 + }, + { + "epoch": 0.7301820556780145, + "grad_norm": 26.940456429092567, + "learning_rate": 4.1203018770916185e-08, + "logits/chosen": -0.7151032090187073, + "logits/rejected": -0.7399024963378906, + "logps/chosen": -955.585205078125, + "logps/rejected": -1188.607421875, + "loss": 0.4193, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.988138675689697, + "rewards/margins": 1.6223087310791016, + "rewards/rejected": -6.610446929931641, + "step": 1118 + }, + { + "epoch": 0.730835170217977, + "grad_norm": 19.280842272039692, + "learning_rate": 4.101862725635227e-08, + "logits/chosen": -0.9849437475204468, + "logits/rejected": -0.8866140842437744, + "logps/chosen": -987.6910400390625, + "logps/rejected": -1077.995361328125, + "loss": 0.3846, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.67930793762207, + "rewards/margins": 1.2189520597457886, + "rewards/rejected": -5.898260116577148, + "step": 1119 + }, + { + "epoch": 0.7314882847579395, + "grad_norm": 18.069875903287098, + "learning_rate": 4.08345427475688e-08, + "logits/chosen": -0.8553774356842041, + "logits/rejected": -0.7431191802024841, + "logps/chosen": -967.6986083984375, + "logps/rejected": -1162.87060546875, + "loss": 0.371, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.205309867858887, + "rewards/margins": 2.321453332901001, + "rewards/rejected": -7.52676248550415, + "step": 1120 + }, + { + "epoch": 0.7321413992979019, + "grad_norm": 22.63376434941059, + "learning_rate": 4.065076620274983e-08, + "logits/chosen": -0.9161591529846191, + "logits/rejected": -0.9195849299430847, + "logps/chosen": -920.815673828125, + "logps/rejected": -1047.88623046875, + "loss": 0.4609, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.56045389175415, + "rewards/margins": 1.149539589881897, + "rewards/rejected": -5.709993839263916, + "step": 1121 + }, + { + "epoch": 0.7327945138378643, + "grad_norm": 20.45817826800368, + "learning_rate": 4.046729857847634e-08, + "logits/chosen": -0.8821409344673157, + "logits/rejected": -0.8005276918411255, + "logps/chosen": -881.125244140625, + "logps/rejected": -1102.503662109375, + "loss": 0.4102, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.9783294200897217, + "rewards/margins": 2.2585127353668213, + "rewards/rejected": -6.236842155456543, + "step": 1122 + }, + { + "epoch": 0.7334476283778267, + "grad_norm": 19.51675556765635, + "learning_rate": 4.0284140829721404e-08, + "logits/chosen": -0.7991642951965332, + "logits/rejected": -0.8100806474685669, + "logps/chosen": -942.4847412109375, + "logps/rejected": -1282.675048828125, + "loss": 0.3271, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.026712417602539, + "rewards/margins": 2.775904417037964, + "rewards/rejected": -7.802616596221924, + "step": 1123 + }, + { + "epoch": 0.7341007429177893, + "grad_norm": 24.3902316105061, + "learning_rate": 4.01012939098451e-08, + "logits/chosen": -0.893683910369873, + "logits/rejected": -0.7639338970184326, + "logps/chosen": -966.1260986328125, + "logps/rejected": -1116.3106689453125, + "loss": 0.4255, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.668510913848877, + "rewards/margins": 1.699383020401001, + "rewards/rejected": -6.367894172668457, + "step": 1124 + }, + { + "epoch": 0.7347538574577517, + "grad_norm": 20.875773170479594, + "learning_rate": 3.9918758770589644e-08, + "logits/chosen": -1.0481836795806885, + "logits/rejected": -0.9271065592765808, + "logps/chosen": -958.1807861328125, + "logps/rejected": -1059.3460693359375, + "loss": 0.3817, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.318079948425293, + "rewards/margins": 1.2409089803695679, + "rewards/rejected": -6.55898904800415, + "step": 1125 + }, + { + "epoch": 0.7354069719977141, + "grad_norm": 18.73461818358389, + "learning_rate": 3.973653636207437e-08, + "logits/chosen": -0.7000550031661987, + "logits/rejected": -0.7210078239440918, + "logps/chosen": -1033.514404296875, + "logps/rejected": -1344.989501953125, + "loss": 0.36, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.706725597381592, + "rewards/margins": 2.880382537841797, + "rewards/rejected": -8.58710765838623, + "step": 1126 + }, + { + "epoch": 0.7360600865376765, + "grad_norm": 58.93678887441067, + "learning_rate": 3.9554627632790815e-08, + "logits/chosen": -0.9635611772537231, + "logits/rejected": -0.7603409886360168, + "logps/chosen": -1095.10205078125, + "logps/rejected": -1271.283203125, + "loss": 0.4252, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.461233615875244, + "rewards/margins": 2.415156602859497, + "rewards/rejected": -7.87639045715332, + "step": 1127 + }, + { + "epoch": 0.736713201077639, + "grad_norm": 26.894919410737224, + "learning_rate": 3.937303352959777e-08, + "logits/chosen": -0.8913260698318481, + "logits/rejected": -0.773656964302063, + "logps/chosen": -929.6309814453125, + "logps/rejected": -1127.9503173828125, + "loss": 0.322, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.980562210083008, + "rewards/margins": 2.162158727645874, + "rewards/rejected": -7.142721176147461, + "step": 1128 + }, + { + "epoch": 0.7373663156176015, + "grad_norm": 23.622760206791476, + "learning_rate": 3.919175499771634e-08, + "logits/chosen": -0.9496240615844727, + "logits/rejected": -0.8086094260215759, + "logps/chosen": -1028.3040771484375, + "logps/rejected": -1120.0272216796875, + "loss": 0.4898, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.441816329956055, + "rewards/margins": 1.3477586507797241, + "rewards/rejected": -6.789575099945068, + "step": 1129 + }, + { + "epoch": 0.7380194301575639, + "grad_norm": 16.57012151608491, + "learning_rate": 3.901079298072509e-08, + "logits/chosen": -0.8438161611557007, + "logits/rejected": -0.7847708463668823, + "logps/chosen": -813.0806884765625, + "logps/rejected": -923.8001098632812, + "loss": 0.3786, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.117031574249268, + "rewards/margins": 1.3204278945922852, + "rewards/rejected": -5.4374589920043945, + "step": 1130 + }, + { + "epoch": 0.7386725446975263, + "grad_norm": 14.471906831289356, + "learning_rate": 3.883014842055504e-08, + "logits/chosen": -0.8245331645011902, + "logits/rejected": -0.870025098323822, + "logps/chosen": -939.5264892578125, + "logps/rejected": -1070.2132568359375, + "loss": 0.3324, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.8601274490356445, + "rewards/margins": 1.342176079750061, + "rewards/rejected": -6.202303886413574, + "step": 1131 + }, + { + "epoch": 0.7393256592374887, + "grad_norm": 35.53448905203971, + "learning_rate": 3.864982225748481e-08, + "logits/chosen": -0.8064248561859131, + "logits/rejected": -0.7723050117492676, + "logps/chosen": -897.3822021484375, + "logps/rejected": -931.1773681640625, + "loss": 0.4193, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.686911582946777, + "rewards/margins": 0.5902104377746582, + "rewards/rejected": -5.277121067047119, + "step": 1132 + }, + { + "epoch": 0.7399787737774512, + "grad_norm": 21.217440902300723, + "learning_rate": 3.8469815430135735e-08, + "logits/chosen": -1.0799496173858643, + "logits/rejected": -1.0132458209991455, + "logps/chosen": -1016.651611328125, + "logps/rejected": -1278.3968505859375, + "loss": 0.3817, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.062169551849365, + "rewards/margins": 2.540085792541504, + "rewards/rejected": -7.602255344390869, + "step": 1133 + }, + { + "epoch": 0.7406318883174137, + "grad_norm": 21.67079481182841, + "learning_rate": 3.8290128875466945e-08, + "logits/chosen": -1.034336805343628, + "logits/rejected": -0.8814944624900818, + "logps/chosen": -887.02001953125, + "logps/rejected": -1028.741455078125, + "loss": 0.3921, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.301558494567871, + "rewards/margins": 1.7627674341201782, + "rewards/rejected": -6.064326286315918, + "step": 1134 + }, + { + "epoch": 0.7412850028573761, + "grad_norm": 21.900427406794464, + "learning_rate": 3.811076352877054e-08, + "logits/chosen": -0.8100993633270264, + "logits/rejected": -0.810735821723938, + "logps/chosen": -956.65869140625, + "logps/rejected": -1134.55078125, + "loss": 0.4055, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.0694427490234375, + "rewards/margins": 1.6674208641052246, + "rewards/rejected": -6.736863613128662, + "step": 1135 + }, + { + "epoch": 0.7419381173973385, + "grad_norm": 37.11345498421912, + "learning_rate": 3.793172032366667e-08, + "logits/chosen": -0.8167673945426941, + "logits/rejected": -0.8620283007621765, + "logps/chosen": -1008.05078125, + "logps/rejected": -1179.90185546875, + "loss": 0.358, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.111788749694824, + "rewards/margins": 1.8442003726959229, + "rewards/rejected": -6.955988883972168, + "step": 1136 + }, + { + "epoch": 0.742591231937301, + "grad_norm": 16.238948093293164, + "learning_rate": 3.7753000192098695e-08, + "logits/chosen": -1.0395476818084717, + "logits/rejected": -1.0163346529006958, + "logps/chosen": -1038.7113037109375, + "logps/rejected": -1218.2423095703125, + "loss": 0.3834, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.375159740447998, + "rewards/margins": 1.8684576749801636, + "rewards/rejected": -7.243618011474609, + "step": 1137 + }, + { + "epoch": 0.7432443464772635, + "grad_norm": 23.547812190116634, + "learning_rate": 3.757460406432833e-08, + "logits/chosen": -0.7605608701705933, + "logits/rejected": -0.7161009311676025, + "logps/chosen": -990.2849731445312, + "logps/rejected": -1235.51025390625, + "loss": 0.4008, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.188079833984375, + "rewards/margins": 2.2514495849609375, + "rewards/rejected": -7.439528942108154, + "step": 1138 + }, + { + "epoch": 0.7438974610172259, + "grad_norm": 18.82174611804124, + "learning_rate": 3.739653286893088e-08, + "logits/chosen": -0.9870573878288269, + "logits/rejected": -0.7753057479858398, + "logps/chosen": -1092.725830078125, + "logps/rejected": -1257.0374755859375, + "loss": 0.3258, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.7637128829956055, + "rewards/margins": 2.28377103805542, + "rewards/rejected": -8.047484397888184, + "step": 1139 + }, + { + "epoch": 0.7445505755571883, + "grad_norm": 23.869228374915046, + "learning_rate": 3.721878753279016e-08, + "logits/chosen": -0.8627902865409851, + "logits/rejected": -0.8585209250450134, + "logps/chosen": -1046.7232666015625, + "logps/rejected": -1276.690185546875, + "loss": 0.4385, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.6731462478637695, + "rewards/margins": 1.737582802772522, + "rewards/rejected": -7.410728931427002, + "step": 1140 + }, + { + "epoch": 0.7452036900971508, + "grad_norm": 25.028791146036063, + "learning_rate": 3.704136898109402e-08, + "logits/chosen": -0.7637258768081665, + "logits/rejected": -0.8027169704437256, + "logps/chosen": -939.3970947265625, + "logps/rejected": -1249.8870849609375, + "loss": 0.2932, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.005740165710449, + "rewards/margins": 2.5859646797180176, + "rewards/rejected": -7.591704845428467, + "step": 1141 + }, + { + "epoch": 0.7458568046371132, + "grad_norm": 23.699634900623874, + "learning_rate": 3.686427813732929e-08, + "logits/chosen": -0.8129778504371643, + "logits/rejected": -0.790728747844696, + "logps/chosen": -1058.7100830078125, + "logps/rejected": -1323.9364013671875, + "loss": 0.407, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.982147216796875, + "rewards/margins": 1.7800394296646118, + "rewards/rejected": -7.7621870040893555, + "step": 1142 + }, + { + "epoch": 0.7465099191770757, + "grad_norm": 21.921561179095427, + "learning_rate": 3.6687515923277015e-08, + "logits/chosen": -0.815049409866333, + "logits/rejected": -0.7605541348457336, + "logps/chosen": -1098.3214111328125, + "logps/rejected": -1316.96728515625, + "loss": 0.4258, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.022390842437744, + "rewards/margins": 2.642651081085205, + "rewards/rejected": -8.66504192352295, + "step": 1143 + }, + { + "epoch": 0.7471630337170381, + "grad_norm": 24.36327520442801, + "learning_rate": 3.6511083259007725e-08, + "logits/chosen": -0.6395676136016846, + "logits/rejected": -0.6162480711936951, + "logps/chosen": -950.2569580078125, + "logps/rejected": -1111.405517578125, + "loss": 0.3874, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.066840648651123, + "rewards/margins": 1.5750312805175781, + "rewards/rejected": -6.641872406005859, + "step": 1144 + }, + { + "epoch": 0.7478161482570006, + "grad_norm": 41.172318989826486, + "learning_rate": 3.633498106287657e-08, + "logits/chosen": -0.9550276398658752, + "logits/rejected": -0.9688808917999268, + "logps/chosen": -955.484130859375, + "logps/rejected": -1154.53466796875, + "loss": 0.4169, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.7296648025512695, + "rewards/margins": 1.6894323825836182, + "rewards/rejected": -6.419096946716309, + "step": 1145 + }, + { + "epoch": 0.748469262796963, + "grad_norm": 22.01106598608996, + "learning_rate": 3.6159210251518566e-08, + "logits/chosen": -0.9625080823898315, + "logits/rejected": -0.9198300242424011, + "logps/chosen": -958.3787231445312, + "logps/rejected": -1220.9547119140625, + "loss": 0.3372, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.220369338989258, + "rewards/margins": 2.5000252723693848, + "rewards/rejected": -7.720395088195801, + "step": 1146 + }, + { + "epoch": 0.7491223773369254, + "grad_norm": 15.911924912335554, + "learning_rate": 3.598377173984385e-08, + "logits/chosen": -0.8641433715820312, + "logits/rejected": -0.7814199328422546, + "logps/chosen": -1110.24462890625, + "logps/rejected": -1363.054931640625, + "loss": 0.3433, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.966456413269043, + "rewards/margins": 2.4509754180908203, + "rewards/rejected": -8.417431831359863, + "step": 1147 + }, + { + "epoch": 0.7497754918768879, + "grad_norm": 19.175118613511536, + "learning_rate": 3.5808666441032876e-08, + "logits/chosen": -0.9386720657348633, + "logits/rejected": -0.9401878118515015, + "logps/chosen": -953.240478515625, + "logps/rejected": -1146.1922607421875, + "loss": 0.3883, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.1068878173828125, + "rewards/margins": 1.779557704925537, + "rewards/rejected": -6.886445045471191, + "step": 1148 + }, + { + "epoch": 0.7504286064168504, + "grad_norm": 25.57354603222973, + "learning_rate": 3.56338952665317e-08, + "logits/chosen": -0.839618444442749, + "logits/rejected": -0.8448423147201538, + "logps/chosen": -1056.17724609375, + "logps/rejected": -1308.0330810546875, + "loss": 0.3417, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.239077568054199, + "rewards/margins": 2.1591358184814453, + "rewards/rejected": -8.398212432861328, + "step": 1149 + }, + { + "epoch": 0.7510817209568128, + "grad_norm": 27.81708132565712, + "learning_rate": 3.545945912604722e-08, + "logits/chosen": -0.9696516394615173, + "logits/rejected": -0.7934356927871704, + "logps/chosen": -958.2764892578125, + "logps/rejected": -1086.7510986328125, + "loss": 0.3957, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.882455348968506, + "rewards/margins": 1.7731363773345947, + "rewards/rejected": -6.65559196472168, + "step": 1150 + }, + { + "epoch": 0.7517348354967752, + "grad_norm": 29.929763150529183, + "learning_rate": 3.5285358927542386e-08, + "logits/chosen": -0.9754225015640259, + "logits/rejected": -0.9199377298355103, + "logps/chosen": -1044.01708984375, + "logps/rejected": -1216.123291015625, + "loss": 0.4275, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.64408540725708, + "rewards/margins": 1.760486125946045, + "rewards/rejected": -7.404570579528809, + "step": 1151 + }, + { + "epoch": 0.7523879500367376, + "grad_norm": 18.403330467837083, + "learning_rate": 3.511159557723157e-08, + "logits/chosen": -0.7564758062362671, + "logits/rejected": -0.7638527154922485, + "logps/chosen": -967.927978515625, + "logps/rejected": -1212.9923095703125, + "loss": 0.3566, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.75152587890625, + "rewards/margins": 2.356855630874634, + "rewards/rejected": -7.108381271362305, + "step": 1152 + }, + { + "epoch": 0.7530410645767002, + "grad_norm": 27.316348794289706, + "learning_rate": 3.493816997957582e-08, + "logits/chosen": -0.8866490125656128, + "logits/rejected": -0.8477368950843811, + "logps/chosen": -969.8577270507812, + "logps/rejected": -1238.609130859375, + "loss": 0.4151, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.076401710510254, + "rewards/margins": 2.3330423831939697, + "rewards/rejected": -7.409444808959961, + "step": 1153 + }, + { + "epoch": 0.7536941791166626, + "grad_norm": 24.26926248538676, + "learning_rate": 3.476508303727809e-08, + "logits/chosen": -0.9898264408111572, + "logits/rejected": -0.823529839515686, + "logps/chosen": -897.8689575195312, + "logps/rejected": -1061.2685546875, + "loss": 0.4123, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.637533664703369, + "rewards/margins": 1.7195688486099243, + "rewards/rejected": -6.357102870941162, + "step": 1154 + }, + { + "epoch": 0.754347293656625, + "grad_norm": 23.535978515943466, + "learning_rate": 3.459233565127865e-08, + "logits/chosen": -0.9022988080978394, + "logits/rejected": -0.7629703283309937, + "logps/chosen": -1029.1558837890625, + "logps/rejected": -1222.035400390625, + "loss": 0.4044, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.495359420776367, + "rewards/margins": 1.9249953031539917, + "rewards/rejected": -7.420354843139648, + "step": 1155 + }, + { + "epoch": 0.7550004081965874, + "grad_norm": 22.38619660861659, + "learning_rate": 3.441992872075027e-08, + "logits/chosen": -0.8099140524864197, + "logits/rejected": -0.9074943661689758, + "logps/chosen": -926.499755859375, + "logps/rejected": -1042.8348388671875, + "loss": 0.3997, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.323889255523682, + "rewards/margins": 1.126346230506897, + "rewards/rejected": -5.450235366821289, + "step": 1156 + }, + { + "epoch": 0.75565352273655, + "grad_norm": 37.5031440678778, + "learning_rate": 3.4247863143093646e-08, + "logits/chosen": -0.7545847296714783, + "logits/rejected": -0.9003801345825195, + "logps/chosen": -1049.3184814453125, + "logps/rejected": -1330.5516357421875, + "loss": 0.4131, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.52249002456665, + "rewards/margins": 2.340816020965576, + "rewards/rejected": -7.863306045532227, + "step": 1157 + }, + { + "epoch": 0.7563066372765124, + "grad_norm": 24.9093202389082, + "learning_rate": 3.407613981393268e-08, + "logits/chosen": -0.8100746870040894, + "logits/rejected": -0.7035970091819763, + "logps/chosen": -969.5421142578125, + "logps/rejected": -1108.051513671875, + "loss": 0.4021, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.4513044357299805, + "rewards/margins": 1.1451897621154785, + "rewards/rejected": -6.596494197845459, + "step": 1158 + }, + { + "epoch": 0.7569597518164748, + "grad_norm": 33.568513626016305, + "learning_rate": 3.3904759627109826e-08, + "logits/chosen": -0.7170710563659668, + "logits/rejected": -0.7021714448928833, + "logps/chosen": -981.71728515625, + "logps/rejected": -1198.9769287109375, + "loss": 0.3767, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.262272834777832, + "rewards/margins": 1.670971393585205, + "rewards/rejected": -6.933244705200195, + "step": 1159 + }, + { + "epoch": 0.7576128663564372, + "grad_norm": 19.48507470464778, + "learning_rate": 3.373372347468141e-08, + "logits/chosen": -0.8081870079040527, + "logits/rejected": -0.8140336871147156, + "logps/chosen": -953.115478515625, + "logps/rejected": -1056.728515625, + "loss": 0.3799, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.855665683746338, + "rewards/margins": 1.1462256908416748, + "rewards/rejected": -6.001891136169434, + "step": 1160 + }, + { + "epoch": 0.7582659808963997, + "grad_norm": 31.38673322123655, + "learning_rate": 3.356303224691306e-08, + "logits/chosen": -0.9541839361190796, + "logits/rejected": -0.7014814019203186, + "logps/chosen": -1052.727783203125, + "logps/rejected": -1163.9364013671875, + "loss": 0.4678, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.942466735839844, + "rewards/margins": 1.7238149642944336, + "rewards/rejected": -7.666281700134277, + "step": 1161 + }, + { + "epoch": 0.7589190954363622, + "grad_norm": 69.4011664473962, + "learning_rate": 3.3392686832274985e-08, + "logits/chosen": -0.9889993667602539, + "logits/rejected": -0.7375394105911255, + "logps/chosen": -941.2330322265625, + "logps/rejected": -1013.1570434570312, + "loss": 0.4446, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.634012699127197, + "rewards/margins": 1.3770856857299805, + "rewards/rejected": -6.011098384857178, + "step": 1162 + }, + { + "epoch": 0.7595722099763246, + "grad_norm": 27.965895018082794, + "learning_rate": 3.3222688117437425e-08, + "logits/chosen": -0.9555126428604126, + "logits/rejected": -0.7966434955596924, + "logps/chosen": -1023.3953247070312, + "logps/rejected": -1168.2110595703125, + "loss": 0.3217, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.375278472900391, + "rewards/margins": 1.807486653327942, + "rewards/rejected": -7.182765483856201, + "step": 1163 + }, + { + "epoch": 0.760225324516287, + "grad_norm": 20.30038550754587, + "learning_rate": 3.305303698726597e-08, + "logits/chosen": -0.7721012830734253, + "logits/rejected": -0.7118543982505798, + "logps/chosen": -928.61279296875, + "logps/rejected": -1214.0880126953125, + "loss": 0.3215, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.874123573303223, + "rewards/margins": 2.648409366607666, + "rewards/rejected": -7.5225324630737305, + "step": 1164 + }, + { + "epoch": 0.7608784390562495, + "grad_norm": 33.58007082608257, + "learning_rate": 3.2883734324817025e-08, + "logits/chosen": -0.79903644323349, + "logits/rejected": -0.8045666217803955, + "logps/chosen": -1060.49267578125, + "logps/rejected": -1248.7489013671875, + "loss": 0.3897, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.418579578399658, + "rewards/margins": 1.9177119731903076, + "rewards/rejected": -7.336291313171387, + "step": 1165 + }, + { + "epoch": 0.7615315535962119, + "grad_norm": 21.581131984398414, + "learning_rate": 3.271478101133313e-08, + "logits/chosen": -0.8603457808494568, + "logits/rejected": -0.8567978739738464, + "logps/chosen": -1065.7412109375, + "logps/rejected": -1239.724853515625, + "loss": 0.3794, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.512617588043213, + "rewards/margins": 1.4748889207839966, + "rewards/rejected": -6.987505912780762, + "step": 1166 + }, + { + "epoch": 0.7621846681361744, + "grad_norm": 27.76584915305069, + "learning_rate": 3.254617792623844e-08, + "logits/chosen": -0.7932612895965576, + "logits/rejected": -0.7557364106178284, + "logps/chosen": -859.295166015625, + "logps/rejected": -1044.875732421875, + "loss": 0.3551, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.440032482147217, + "rewards/margins": 1.8427796363830566, + "rewards/rejected": -6.282812595367432, + "step": 1167 + }, + { + "epoch": 0.7628377826761368, + "grad_norm": 19.357322013989005, + "learning_rate": 3.237792594713413e-08, + "logits/chosen": -0.785632848739624, + "logits/rejected": -0.8138841390609741, + "logps/chosen": -1043.2958984375, + "logps/rejected": -1250.5078125, + "loss": 0.423, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.2632856369018555, + "rewards/margins": 1.5555360317230225, + "rewards/rejected": -6.818821907043457, + "step": 1168 + }, + { + "epoch": 0.7634908972160993, + "grad_norm": 22.411264863097227, + "learning_rate": 3.2210025949793826e-08, + "logits/chosen": -0.7790203094482422, + "logits/rejected": -0.7721933722496033, + "logps/chosen": -1029.548583984375, + "logps/rejected": -1246.845947265625, + "loss": 0.4476, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.5870537757873535, + "rewards/margins": 1.5175191164016724, + "rewards/rejected": -7.1045732498168945, + "step": 1169 + }, + { + "epoch": 0.7641440117560617, + "grad_norm": 21.765699711965283, + "learning_rate": 3.204247880815902e-08, + "logits/chosen": -0.848351001739502, + "logits/rejected": -0.8396331071853638, + "logps/chosen": -964.7103271484375, + "logps/rejected": -1139.3961181640625, + "loss": 0.3456, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.735137462615967, + "rewards/margins": 1.6721020936965942, + "rewards/rejected": -6.407238960266113, + "step": 1170 + }, + { + "epoch": 0.7647971262960241, + "grad_norm": 16.671667536802943, + "learning_rate": 3.1875285394334575e-08, + "logits/chosen": -0.8602566123008728, + "logits/rejected": -0.8754156231880188, + "logps/chosen": -929.6405029296875, + "logps/rejected": -1105.7825927734375, + "loss": 0.435, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.829796314239502, + "rewards/margins": 1.4688724279403687, + "rewards/rejected": -6.298668384552002, + "step": 1171 + }, + { + "epoch": 0.7654502408359866, + "grad_norm": 20.79622076583873, + "learning_rate": 3.1708446578584124e-08, + "logits/chosen": -0.8881270885467529, + "logits/rejected": -0.8268469572067261, + "logps/chosen": -956.0182495117188, + "logps/rejected": -1192.7308349609375, + "loss": 0.3944, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.764824390411377, + "rewards/margins": 2.306044816970825, + "rewards/rejected": -7.0708699226379395, + "step": 1172 + }, + { + "epoch": 0.7661033553759491, + "grad_norm": 25.011870924818727, + "learning_rate": 3.154196322932562e-08, + "logits/chosen": -0.7613641023635864, + "logits/rejected": -0.7869819402694702, + "logps/chosen": -884.375732421875, + "logps/rejected": -1046.363525390625, + "loss": 0.3063, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.63422155380249, + "rewards/margins": 1.5653067827224731, + "rewards/rejected": -6.199528217315674, + "step": 1173 + }, + { + "epoch": 0.7667564699159115, + "grad_norm": 20.090645894822764, + "learning_rate": 3.137583621312665e-08, + "logits/chosen": -0.6448233127593994, + "logits/rejected": -0.6775322556495667, + "logps/chosen": -937.7881469726562, + "logps/rejected": -1196.1304931640625, + "loss": 0.4101, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.74168062210083, + "rewards/margins": 2.3308541774749756, + "rewards/rejected": -7.072534561157227, + "step": 1174 + }, + { + "epoch": 0.7674095844558739, + "grad_norm": 33.3903606022228, + "learning_rate": 3.121006639470019e-08, + "logits/chosen": -0.8766074776649475, + "logits/rejected": -0.8929482698440552, + "logps/chosen": -908.8091430664062, + "logps/rejected": -1128.59619140625, + "loss": 0.4219, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.524719715118408, + "rewards/margins": 1.9517160654067993, + "rewards/rejected": -6.476435661315918, + "step": 1175 + }, + { + "epoch": 0.7680626989958363, + "grad_norm": 22.339500155579042, + "learning_rate": 3.104465463689985e-08, + "logits/chosen": -0.6528885364532471, + "logits/rejected": -0.6316456198692322, + "logps/chosen": -1036.5087890625, + "logps/rejected": -1337.04833984375, + "loss": 0.4085, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.987725257873535, + "rewards/margins": 2.9133403301239014, + "rewards/rejected": -7.901065826416016, + "step": 1176 + }, + { + "epoch": 0.7687158135357989, + "grad_norm": 22.90690018103172, + "learning_rate": 3.087960180071553e-08, + "logits/chosen": -0.8717073202133179, + "logits/rejected": -0.9405293464660645, + "logps/chosen": -862.404052734375, + "logps/rejected": -1120.0740966796875, + "loss": 0.229, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.589112758636475, + "rewards/margins": 2.3341736793518066, + "rewards/rejected": -6.923286437988281, + "step": 1177 + }, + { + "epoch": 0.7693689280757613, + "grad_norm": 26.598328462908714, + "learning_rate": 3.07149087452689e-08, + "logits/chosen": -0.8764140009880066, + "logits/rejected": -0.8363279104232788, + "logps/chosen": -982.534912109375, + "logps/rejected": -1125.4754638671875, + "loss": 0.406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.287962913513184, + "rewards/margins": 1.5670433044433594, + "rewards/rejected": -6.855005741119385, + "step": 1178 + }, + { + "epoch": 0.7700220426157237, + "grad_norm": 22.893298745929318, + "learning_rate": 3.055057632780891e-08, + "logits/chosen": -0.8786053657531738, + "logits/rejected": -0.808796763420105, + "logps/chosen": -1015.7930297851562, + "logps/rejected": -1083.675048828125, + "loss": 0.4078, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.676352024078369, + "rewards/margins": 0.9371025562286377, + "rewards/rejected": -6.613454341888428, + "step": 1179 + }, + { + "epoch": 0.7706751571556861, + "grad_norm": 22.818915591743878, + "learning_rate": 3.038660540370735e-08, + "logits/chosen": -0.8500253558158875, + "logits/rejected": -0.8378827571868896, + "logps/chosen": -977.3529052734375, + "logps/rejected": -1116.5126953125, + "loss": 0.4193, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.952162265777588, + "rewards/margins": 1.1345552206039429, + "rewards/rejected": -6.08671760559082, + "step": 1180 + }, + { + "epoch": 0.7713282716956487, + "grad_norm": 29.865635190204213, + "learning_rate": 3.022299682645436e-08, + "logits/chosen": -0.8462579250335693, + "logits/rejected": -0.8399958610534668, + "logps/chosen": -958.1721801757812, + "logps/rejected": -1113.3017578125, + "loss": 0.3925, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.745461463928223, + "rewards/margins": 1.4994910955429077, + "rewards/rejected": -6.24495267868042, + "step": 1181 + }, + { + "epoch": 0.7719813862356111, + "grad_norm": 27.458852230158666, + "learning_rate": 3.005975144765407e-08, + "logits/chosen": -0.7723562717437744, + "logits/rejected": -0.7684867978096008, + "logps/chosen": -927.39599609375, + "logps/rejected": -1054.7724609375, + "loss": 0.3763, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.110241889953613, + "rewards/margins": 1.5953783988952637, + "rewards/rejected": -6.705620765686035, + "step": 1182 + }, + { + "epoch": 0.7726345007755735, + "grad_norm": 31.359566574451833, + "learning_rate": 2.989687011702007e-08, + "logits/chosen": -0.9768270254135132, + "logits/rejected": -0.9259161949157715, + "logps/chosen": -990.6243896484375, + "logps/rejected": -1188.6944580078125, + "loss": 0.3644, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.3823652267456055, + "rewards/margins": 1.8317203521728516, + "rewards/rejected": -6.214085578918457, + "step": 1183 + }, + { + "epoch": 0.7732876153155359, + "grad_norm": 18.539542439721654, + "learning_rate": 2.9734353682371082e-08, + "logits/chosen": -0.9522316455841064, + "logits/rejected": -1.0198249816894531, + "logps/chosen": -896.4822998046875, + "logps/rejected": -1116.46142578125, + "loss": 0.3925, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.922615051269531, + "rewards/margins": 1.5149760246276855, + "rewards/rejected": -6.437590599060059, + "step": 1184 + }, + { + "epoch": 0.7739407298554984, + "grad_norm": 37.635532934840626, + "learning_rate": 2.9572202989626404e-08, + "logits/chosen": -0.6697461009025574, + "logits/rejected": -0.5747227668762207, + "logps/chosen": -1109.9716796875, + "logps/rejected": -1424.2490234375, + "loss": 0.4296, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.394173622131348, + "rewards/margins": 3.1784846782684326, + "rewards/rejected": -9.57265853881836, + "step": 1185 + }, + { + "epoch": 0.7745938443954609, + "grad_norm": 23.27278254145601, + "learning_rate": 2.941041888280168e-08, + "logits/chosen": -0.844850480556488, + "logits/rejected": -0.8204216361045837, + "logps/chosen": -928.3829956054688, + "logps/rejected": -1041.739990234375, + "loss": 0.4327, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.049342155456543, + "rewards/margins": 1.1250241994857788, + "rewards/rejected": -6.174365997314453, + "step": 1186 + }, + { + "epoch": 0.7752469589354233, + "grad_norm": 30.546092848157986, + "learning_rate": 2.9249002204004415e-08, + "logits/chosen": -1.0821895599365234, + "logits/rejected": -1.0456998348236084, + "logps/chosen": -901.41015625, + "logps/rejected": -1062.07861328125, + "loss": 0.3324, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.730764865875244, + "rewards/margins": 1.766805648803711, + "rewards/rejected": -6.497569561004639, + "step": 1187 + }, + { + "epoch": 0.7759000734753857, + "grad_norm": 14.916867634591082, + "learning_rate": 2.9087953793429586e-08, + "logits/chosen": -0.9418157935142517, + "logits/rejected": -0.8717476725578308, + "logps/chosen": -932.181640625, + "logps/rejected": -1018.2946166992188, + "loss": 0.4279, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.699738502502441, + "rewards/margins": 0.9546055793762207, + "rewards/rejected": -5.65434455871582, + "step": 1188 + }, + { + "epoch": 0.7765531880153482, + "grad_norm": 18.42805800044058, + "learning_rate": 2.8927274489355292e-08, + "logits/chosen": -0.831034243106842, + "logits/rejected": -0.7608861327171326, + "logps/chosen": -1031.4248046875, + "logps/rejected": -1184.942626953125, + "loss": 0.3381, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.825045108795166, + "rewards/margins": 1.7045456171035767, + "rewards/rejected": -6.529591083526611, + "step": 1189 + }, + { + "epoch": 0.7772063025553106, + "grad_norm": 21.658547747559748, + "learning_rate": 2.8766965128138387e-08, + "logits/chosen": -0.9315862059593201, + "logits/rejected": -0.9007086753845215, + "logps/chosen": -997.5110473632812, + "logps/rejected": -1119.478515625, + "loss": 0.3427, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.008423805236816, + "rewards/margins": 1.204540491104126, + "rewards/rejected": -6.212964057922363, + "step": 1190 + }, + { + "epoch": 0.7778594170952731, + "grad_norm": 24.267138455955347, + "learning_rate": 2.860702654421011e-08, + "logits/chosen": -0.9535077214241028, + "logits/rejected": -0.8994888067245483, + "logps/chosen": -1031.9945068359375, + "logps/rejected": -1150.012451171875, + "loss": 0.3756, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.6364946365356445, + "rewards/margins": 1.5493192672729492, + "rewards/rejected": -7.18581485748291, + "step": 1191 + }, + { + "epoch": 0.7785125316352355, + "grad_norm": 18.451314859777252, + "learning_rate": 2.8447459570071776e-08, + "logits/chosen": -0.817573070526123, + "logits/rejected": -0.7613990902900696, + "logps/chosen": -936.7318115234375, + "logps/rejected": -1135.806884765625, + "loss": 0.3049, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.091334342956543, + "rewards/margins": 1.77097749710083, + "rewards/rejected": -6.862312316894531, + "step": 1192 + }, + { + "epoch": 0.779165646175198, + "grad_norm": 33.458133028762745, + "learning_rate": 2.8288265036290405e-08, + "logits/chosen": -0.9172095060348511, + "logits/rejected": -0.7128479480743408, + "logps/chosen": -1004.5012817382812, + "logps/rejected": -1170.0657958984375, + "loss": 0.4333, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.607486724853516, + "rewards/margins": 2.5573153495788574, + "rewards/rejected": -7.164802074432373, + "step": 1193 + }, + { + "epoch": 0.7798187607151604, + "grad_norm": 19.970285171428458, + "learning_rate": 2.8129443771494432e-08, + "logits/chosen": -0.6862125396728516, + "logits/rejected": -0.7096244692802429, + "logps/chosen": -802.7031860351562, + "logps/rejected": -1065.51220703125, + "loss": 0.3785, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.070821285247803, + "rewards/margins": 2.3168816566467285, + "rewards/rejected": -6.387702941894531, + "step": 1194 + }, + { + "epoch": 0.7804718752551228, + "grad_norm": 18.73895337273363, + "learning_rate": 2.7970996602369368e-08, + "logits/chosen": -0.6776973009109497, + "logits/rejected": -0.7221867442131042, + "logps/chosen": -872.154541015625, + "logps/rejected": -1053.0361328125, + "loss": 0.3947, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.469272613525391, + "rewards/margins": 1.3249047994613647, + "rewards/rejected": -5.794177055358887, + "step": 1195 + }, + { + "epoch": 0.7811249897950853, + "grad_norm": 19.272456109855355, + "learning_rate": 2.7812924353653512e-08, + "logits/chosen": -0.8801841139793396, + "logits/rejected": -0.8910186290740967, + "logps/chosen": -958.02294921875, + "logps/rejected": -1119.1151123046875, + "loss": 0.3909, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.870482444763184, + "rewards/margins": 1.3198819160461426, + "rewards/rejected": -6.190364360809326, + "step": 1196 + }, + { + "epoch": 0.7817781043350478, + "grad_norm": 21.38229731402552, + "learning_rate": 2.765522784813363e-08, + "logits/chosen": -0.7160096168518066, + "logits/rejected": -0.7802441120147705, + "logps/chosen": -1024.5040283203125, + "logps/rejected": -1252.777099609375, + "loss": 0.4229, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.846982002258301, + "rewards/margins": 1.7771443128585815, + "rewards/rejected": -6.624126434326172, + "step": 1197 + }, + { + "epoch": 0.7824312188750102, + "grad_norm": 22.362299049982393, + "learning_rate": 2.749790790664074e-08, + "logits/chosen": -0.9027339816093445, + "logits/rejected": -0.8822283744812012, + "logps/chosen": -1182.553955078125, + "logps/rejected": -1378.359619140625, + "loss": 0.3888, + "rewards/accuracies": 0.71875, + "rewards/chosen": -7.502844333648682, + "rewards/margins": 1.8371129035949707, + "rewards/rejected": -9.339956283569336, + "step": 1198 + }, + { + "epoch": 0.7830843334149726, + "grad_norm": 20.50908302528186, + "learning_rate": 2.734096534804574e-08, + "logits/chosen": -0.9140005707740784, + "logits/rejected": -1.006885290145874, + "logps/chosen": -858.1275634765625, + "logps/rejected": -1033.56787109375, + "loss": 0.3819, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.204371452331543, + "rewards/margins": 1.4959553480148315, + "rewards/rejected": -5.700326442718506, + "step": 1199 + }, + { + "epoch": 0.783737447954935, + "grad_norm": 18.563171380526605, + "learning_rate": 2.7184400989255264e-08, + "logits/chosen": -0.8342958688735962, + "logits/rejected": -0.8678746223449707, + "logps/chosen": -949.291748046875, + "logps/rejected": -1202.0423583984375, + "loss": 0.3612, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.565825939178467, + "rewards/margins": 2.313875436782837, + "rewards/rejected": -6.879701614379883, + "step": 1200 + }, + { + "epoch": 0.783737447954935, + "eval_logits/chosen": -0.6637032628059387, + "eval_logits/rejected": -0.6046258807182312, + "eval_logps/chosen": -978.1701049804688, + "eval_logps/rejected": -1136.9619140625, + "eval_loss": 0.39009326696395874, + "eval_rewards/accuracies": 0.8040000200271606, + "eval_rewards/chosen": -5.042629241943359, + "eval_rewards/margins": 1.6753028631210327, + "eval_rewards/rejected": -6.717932224273682, + "eval_runtime": 619.0985, + "eval_samples_per_second": 6.461, + "eval_steps_per_second": 0.404, + "step": 1200 + }, + { + "epoch": 0.7843905624948976, + "grad_norm": 22.82284104896695, + "learning_rate": 2.702821564520732e-08, + "logits/chosen": -1.0284345149993896, + "logits/rejected": -0.9083595871925354, + "logps/chosen": -1072.3922119140625, + "logps/rejected": -1132.9649658203125, + "loss": 0.4203, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.476429462432861, + "rewards/margins": 1.4508377313613892, + "rewards/rejected": -6.927268028259277, + "step": 1201 + }, + { + "epoch": 0.78504367703486, + "grad_norm": 46.73465092422572, + "learning_rate": 2.6872410128867095e-08, + "logits/chosen": -0.9627257585525513, + "logits/rejected": -0.9185448288917542, + "logps/chosen": -934.824951171875, + "logps/rejected": -1114.9776611328125, + "loss": 0.4346, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.904428482055664, + "rewards/margins": 1.572401762008667, + "rewards/rejected": -6.47683048248291, + "step": 1202 + }, + { + "epoch": 0.7856967915748224, + "grad_norm": 38.65799318097108, + "learning_rate": 2.6716985251222745e-08, + "logits/chosen": -0.8291105031967163, + "logits/rejected": -0.7675511837005615, + "logps/chosen": -879.0901489257812, + "logps/rejected": -1029.8115234375, + "loss": 0.366, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.441387176513672, + "rewards/margins": 1.5486085414886475, + "rewards/rejected": -5.989995002746582, + "step": 1203 + }, + { + "epoch": 0.7863499061147848, + "grad_norm": 28.376786035425976, + "learning_rate": 2.656194182128114e-08, + "logits/chosen": -1.124685287475586, + "logits/rejected": -1.0612528324127197, + "logps/chosen": -1037.1787109375, + "logps/rejected": -1229.228759765625, + "loss": 0.3473, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.469577312469482, + "rewards/margins": 2.1631574630737305, + "rewards/rejected": -7.632734298706055, + "step": 1204 + }, + { + "epoch": 0.7870030206547474, + "grad_norm": 31.767539638419322, + "learning_rate": 2.640728064606368e-08, + "logits/chosen": -0.8752514123916626, + "logits/rejected": -0.7526658773422241, + "logps/chosen": -991.3655395507812, + "logps/rejected": -1172.45556640625, + "loss": 0.3662, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.0387468338012695, + "rewards/margins": 2.1917738914489746, + "rewards/rejected": -7.230520725250244, + "step": 1205 + }, + { + "epoch": 0.7876561351947098, + "grad_norm": 21.79342338361952, + "learning_rate": 2.6253002530602042e-08, + "logits/chosen": -1.0036921501159668, + "logits/rejected": -0.8435570001602173, + "logps/chosen": -1018.02783203125, + "logps/rejected": -1182.3656005859375, + "loss": 0.4083, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.629342555999756, + "rewards/margins": 2.0361695289611816, + "rewards/rejected": -7.6655120849609375, + "step": 1206 + }, + { + "epoch": 0.7883092497346722, + "grad_norm": 17.625956058750614, + "learning_rate": 2.60991082779341e-08, + "logits/chosen": -0.9824644327163696, + "logits/rejected": -0.9369449615478516, + "logps/chosen": -898.9086303710938, + "logps/rejected": -1089.82568359375, + "loss": 0.3814, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.3817315101623535, + "rewards/margins": 2.228430986404419, + "rewards/rejected": -6.610162734985352, + "step": 1207 + }, + { + "epoch": 0.7889623642746346, + "grad_norm": 17.052280652022436, + "learning_rate": 2.594559868909956e-08, + "logits/chosen": -0.8383811712265015, + "logits/rejected": -0.7021499276161194, + "logps/chosen": -1054.260009765625, + "logps/rejected": -1233.23486328125, + "loss": 0.3939, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.855528831481934, + "rewards/margins": 2.2140450477600098, + "rewards/rejected": -8.069574356079102, + "step": 1208 + }, + { + "epoch": 0.7896154788145971, + "grad_norm": 17.82494133782833, + "learning_rate": 2.579247456313598e-08, + "logits/chosen": -0.6913808584213257, + "logits/rejected": -0.7808651924133301, + "logps/chosen": -820.8729248046875, + "logps/rejected": -995.578857421875, + "loss": 0.327, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9144654273986816, + "rewards/margins": 1.5802310705184937, + "rewards/rejected": -5.494696617126465, + "step": 1209 + }, + { + "epoch": 0.7902685933545596, + "grad_norm": 19.827994296098602, + "learning_rate": 2.563973669707452e-08, + "logits/chosen": -0.9767338633537292, + "logits/rejected": -0.9783136248588562, + "logps/chosen": -1063.784423828125, + "logps/rejected": -1288.805419921875, + "loss": 0.3702, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.4177045822143555, + "rewards/margins": 2.281778335571289, + "rewards/rejected": -7.6994829177856445, + "step": 1210 + }, + { + "epoch": 0.790921707894522, + "grad_norm": 30.180018885086678, + "learning_rate": 2.548738588593582e-08, + "logits/chosen": -0.84820955991745, + "logits/rejected": -0.7857142090797424, + "logps/chosen": -1108.758544921875, + "logps/rejected": -1281.418701171875, + "loss": 0.3701, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.03013801574707, + "rewards/margins": 2.472670793533325, + "rewards/rejected": -7.502809524536133, + "step": 1211 + }, + { + "epoch": 0.7915748224344844, + "grad_norm": 24.355575672449053, + "learning_rate": 2.5335422922725824e-08, + "logits/chosen": -0.8779463768005371, + "logits/rejected": -0.8178101181983948, + "logps/chosen": -960.484619140625, + "logps/rejected": -1187.8353271484375, + "loss": 0.414, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.938921928405762, + "rewards/margins": 2.410421848297119, + "rewards/rejected": -7.349343299865723, + "step": 1212 + }, + { + "epoch": 0.7922279369744469, + "grad_norm": 21.158534500164034, + "learning_rate": 2.518384859843168e-08, + "logits/chosen": -0.8605954051017761, + "logits/rejected": -0.8223261833190918, + "logps/chosen": -1020.8521118164062, + "logps/rejected": -1173.8663330078125, + "loss": 0.3428, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.409945011138916, + "rewards/margins": 1.6165356636047363, + "rewards/rejected": -7.026480197906494, + "step": 1213 + }, + { + "epoch": 0.7928810515144094, + "grad_norm": 19.484177348159484, + "learning_rate": 2.5032663702017633e-08, + "logits/chosen": -0.8267495036125183, + "logits/rejected": -0.8446471095085144, + "logps/chosen": -952.152587890625, + "logps/rejected": -1204.9439697265625, + "loss": 0.3381, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.965042591094971, + "rewards/margins": 1.8775386810302734, + "rewards/rejected": -6.842580795288086, + "step": 1214 + }, + { + "epoch": 0.7935341660543718, + "grad_norm": 16.4682799053427, + "learning_rate": 2.4881869020420888e-08, + "logits/chosen": -1.0425740480422974, + "logits/rejected": -0.9511318206787109, + "logps/chosen": -985.417724609375, + "logps/rejected": -1143.3631591796875, + "loss": 0.357, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.3016676902771, + "rewards/margins": 1.9380346536636353, + "rewards/rejected": -7.2397027015686035, + "step": 1215 + }, + { + "epoch": 0.7941872805943342, + "grad_norm": 36.46933285274797, + "learning_rate": 2.4731465338547552e-08, + "logits/chosen": -1.024279236793518, + "logits/rejected": -0.9062608480453491, + "logps/chosen": -936.8526000976562, + "logps/rejected": -1150.493408203125, + "loss": 0.4068, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.91167688369751, + "rewards/margins": 2.1005947589874268, + "rewards/rejected": -7.012270927429199, + "step": 1216 + }, + { + "epoch": 0.7948403951342967, + "grad_norm": 25.529220177411503, + "learning_rate": 2.4581453439268506e-08, + "logits/chosen": -1.0040630102157593, + "logits/rejected": -0.8005591630935669, + "logps/chosen": -1072.67626953125, + "logps/rejected": -1209.8746337890625, + "loss": 0.425, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.791203022003174, + "rewards/margins": 1.998671054840088, + "rewards/rejected": -7.789875030517578, + "step": 1217 + }, + { + "epoch": 0.7954935096742591, + "grad_norm": 38.9750256048071, + "learning_rate": 2.443183410341535e-08, + "logits/chosen": -0.8872929811477661, + "logits/rejected": -0.8971114754676819, + "logps/chosen": -1029.74462890625, + "logps/rejected": -1296.3499755859375, + "loss": 0.362, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.137325286865234, + "rewards/margins": 2.4624693393707275, + "rewards/rejected": -7.599794864654541, + "step": 1218 + }, + { + "epoch": 0.7961466242142216, + "grad_norm": 23.621966508721446, + "learning_rate": 2.4282608109776404e-08, + "logits/chosen": -0.7239641547203064, + "logits/rejected": -0.8325695395469666, + "logps/chosen": -879.1217041015625, + "logps/rejected": -1209.5390625, + "loss": 0.3791, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.391090393066406, + "rewards/margins": 2.7524499893188477, + "rewards/rejected": -7.143540382385254, + "step": 1219 + }, + { + "epoch": 0.796799738754184, + "grad_norm": 24.192463988892303, + "learning_rate": 2.413377623509245e-08, + "logits/chosen": -0.8919906616210938, + "logits/rejected": -0.7484989166259766, + "logps/chosen": -952.1494140625, + "logps/rejected": -1090.5037841796875, + "loss": 0.363, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.1700439453125, + "rewards/margins": 1.6430315971374512, + "rewards/rejected": -6.813075542449951, + "step": 1220 + }, + { + "epoch": 0.7974528532941465, + "grad_norm": 24.082097860730222, + "learning_rate": 2.3985339254052962e-08, + "logits/chosen": -1.0477828979492188, + "logits/rejected": -0.9007452726364136, + "logps/chosen": -1051.1192626953125, + "logps/rejected": -1316.3472900390625, + "loss": 0.3398, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.486971855163574, + "rewards/margins": 3.2541961669921875, + "rewards/rejected": -8.741168022155762, + "step": 1221 + }, + { + "epoch": 0.7981059678341089, + "grad_norm": 18.91024586575857, + "learning_rate": 2.383729793929189e-08, + "logits/chosen": -0.9369001388549805, + "logits/rejected": -0.8223516345024109, + "logps/chosen": -1001.8585205078125, + "logps/rejected": -1124.386474609375, + "loss": 0.3213, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.99350118637085, + "rewards/margins": 1.4244037866592407, + "rewards/rejected": -6.417904853820801, + "step": 1222 + }, + { + "epoch": 0.7987590823740713, + "grad_norm": 42.05950974053666, + "learning_rate": 2.3689653061383685e-08, + "logits/chosen": -0.8296122550964355, + "logits/rejected": -0.8270793557167053, + "logps/chosen": -1003.6871948242188, + "logps/rejected": -1198.9710693359375, + "loss": 0.3992, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.332522392272949, + "rewards/margins": 1.6734325885772705, + "rewards/rejected": -7.005955219268799, + "step": 1223 + }, + { + "epoch": 0.7994121969140338, + "grad_norm": 45.58794217717948, + "learning_rate": 2.3542405388839315e-08, + "logits/chosen": -0.9384451508522034, + "logits/rejected": -0.7390685081481934, + "logps/chosen": -1071.325927734375, + "logps/rejected": -1221.4930419921875, + "loss": 0.3833, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.6582231521606445, + "rewards/margins": 1.8130872249603271, + "rewards/rejected": -8.471309661865234, + "step": 1224 + }, + { + "epoch": 0.8000653114539963, + "grad_norm": 46.362112329902644, + "learning_rate": 2.339555568810221e-08, + "logits/chosen": -0.9025658965110779, + "logits/rejected": -0.8725322484970093, + "logps/chosen": -1064.623291015625, + "logps/rejected": -1244.8243408203125, + "loss": 0.4366, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.746537685394287, + "rewards/margins": 1.7106165885925293, + "rewards/rejected": -7.457154750823975, + "step": 1225 + }, + { + "epoch": 0.8007184259939587, + "grad_norm": 27.267571141968297, + "learning_rate": 2.3249104723544333e-08, + "logits/chosen": -0.747216522693634, + "logits/rejected": -0.7036592364311218, + "logps/chosen": -1053.0712890625, + "logps/rejected": -1246.5084228515625, + "loss": 0.43, + "rewards/accuracies": 0.78125, + "rewards/chosen": -6.20672082901001, + "rewards/margins": 1.843930959701538, + "rewards/rejected": -8.050651550292969, + "step": 1226 + }, + { + "epoch": 0.8013715405339211, + "grad_norm": 31.244738281197762, + "learning_rate": 2.3103053257462145e-08, + "logits/chosen": -0.8720026612281799, + "logits/rejected": -0.776594877243042, + "logps/chosen": -916.8078002929688, + "logps/rejected": -1142.56982421875, + "loss": 0.3608, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.195310115814209, + "rewards/margins": 2.356201171875, + "rewards/rejected": -7.551511287689209, + "step": 1227 + }, + { + "epoch": 0.8020246550738835, + "grad_norm": 15.654836904900627, + "learning_rate": 2.2957402050072717e-08, + "logits/chosen": -0.98552006483078, + "logits/rejected": -0.9261568784713745, + "logps/chosen": -1043.234130859375, + "logps/rejected": -1214.549560546875, + "loss": 0.3054, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.632101058959961, + "rewards/margins": 1.8705977201461792, + "rewards/rejected": -7.5026984214782715, + "step": 1228 + }, + { + "epoch": 0.8026777696138461, + "grad_norm": 19.962514820454384, + "learning_rate": 2.2812151859509645e-08, + "logits/chosen": -0.9196304082870483, + "logits/rejected": -0.8677047491073608, + "logps/chosen": -1022.9666748046875, + "logps/rejected": -1254.571044921875, + "loss": 0.3091, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.6341094970703125, + "rewards/margins": 2.5342183113098145, + "rewards/rejected": -8.168327331542969, + "step": 1229 + }, + { + "epoch": 0.8033308841538085, + "grad_norm": 19.192635217134253, + "learning_rate": 2.2667303441819242e-08, + "logits/chosen": -0.940382719039917, + "logits/rejected": -0.9116930961608887, + "logps/chosen": -996.216064453125, + "logps/rejected": -1162.93115234375, + "loss": 0.3674, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.108640193939209, + "rewards/margins": 1.6084949970245361, + "rewards/rejected": -6.717134952545166, + "step": 1230 + }, + { + "epoch": 0.8039839986937709, + "grad_norm": 23.74258752114483, + "learning_rate": 2.252285755095652e-08, + "logits/chosen": -0.9278345704078674, + "logits/rejected": -0.9251123070716858, + "logps/chosen": -1053.6322021484375, + "logps/rejected": -1384.0828857421875, + "loss": 0.2828, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.672571182250977, + "rewards/margins": 2.669499635696411, + "rewards/rejected": -8.342071533203125, + "step": 1231 + }, + { + "epoch": 0.8046371132337333, + "grad_norm": 18.678877823425772, + "learning_rate": 2.2378814938781265e-08, + "logits/chosen": -0.8256291151046753, + "logits/rejected": -0.6893395185470581, + "logps/chosen": -1019.7078857421875, + "logps/rejected": -1199.7503662109375, + "loss": 0.382, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.491720199584961, + "rewards/margins": 1.8675512075424194, + "rewards/rejected": -7.359271049499512, + "step": 1232 + }, + { + "epoch": 0.8052902277736959, + "grad_norm": 44.8394412752877, + "learning_rate": 2.22351763550542e-08, + "logits/chosen": -0.8614879846572876, + "logits/rejected": -0.8459488749504089, + "logps/chosen": -1098.8720703125, + "logps/rejected": -1225.9066162109375, + "loss": 0.4307, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.287536144256592, + "rewards/margins": 1.3981188535690308, + "rewards/rejected": -7.685655117034912, + "step": 1233 + }, + { + "epoch": 0.8059433423136583, + "grad_norm": 23.062890847523345, + "learning_rate": 2.2091942547432952e-08, + "logits/chosen": -0.819417417049408, + "logits/rejected": -0.8785775303840637, + "logps/chosen": -936.6260986328125, + "logps/rejected": -1199.01220703125, + "loss": 0.395, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.857845306396484, + "rewards/margins": 1.9972339868545532, + "rewards/rejected": -6.855079174041748, + "step": 1234 + }, + { + "epoch": 0.8065964568536207, + "grad_norm": 25.693523378160574, + "learning_rate": 2.1949114261468304e-08, + "logits/chosen": -0.7582164406776428, + "logits/rejected": -0.7685361504554749, + "logps/chosen": -1009.5326538085938, + "logps/rejected": -1153.093017578125, + "loss": 0.3625, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.58023738861084, + "rewards/margins": 1.4440110921859741, + "rewards/rejected": -7.0242486000061035, + "step": 1235 + }, + { + "epoch": 0.8072495713935831, + "grad_norm": 25.197486248693675, + "learning_rate": 2.18066922406002e-08, + "logits/chosen": -0.8278719186782837, + "logits/rejected": -0.8821883797645569, + "logps/chosen": -939.5359497070312, + "logps/rejected": -1243.8126220703125, + "loss": 0.4497, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.773218631744385, + "rewards/margins": 2.74705171585083, + "rewards/rejected": -7.520270824432373, + "step": 1236 + }, + { + "epoch": 0.8079026859335456, + "grad_norm": 22.66095713824487, + "learning_rate": 2.1664677226153938e-08, + "logits/chosen": -0.8346998691558838, + "logits/rejected": -0.7691102623939514, + "logps/chosen": -1045.7852783203125, + "logps/rejected": -1190.573486328125, + "loss": 0.3849, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.094000816345215, + "rewards/margins": 1.7288265228271484, + "rewards/rejected": -7.822828769683838, + "step": 1237 + }, + { + "epoch": 0.8085558004735081, + "grad_norm": 25.500361485349092, + "learning_rate": 2.1523069957336303e-08, + "logits/chosen": -0.8235265612602234, + "logits/rejected": -0.89778071641922, + "logps/chosen": -1058.410888671875, + "logps/rejected": -1341.494873046875, + "loss": 0.3224, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.059966564178467, + "rewards/margins": 2.3945508003234863, + "rewards/rejected": -8.454517364501953, + "step": 1238 + }, + { + "epoch": 0.8092089150134705, + "grad_norm": 37.356694926090526, + "learning_rate": 2.1381871171231692e-08, + "logits/chosen": -0.8128297328948975, + "logits/rejected": -0.7122081518173218, + "logps/chosen": -932.2877197265625, + "logps/rejected": -1099.0528564453125, + "loss": 0.3584, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.248828887939453, + "rewards/margins": 1.8526395559310913, + "rewards/rejected": -7.101468563079834, + "step": 1239 + }, + { + "epoch": 0.8098620295534329, + "grad_norm": 22.32379852460851, + "learning_rate": 2.1241081602798317e-08, + "logits/chosen": -0.884192943572998, + "logits/rejected": -0.9082574248313904, + "logps/chosen": -1082.5439453125, + "logps/rejected": -1388.14501953125, + "loss": 0.3629, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.576789855957031, + "rewards/margins": 2.5638484954833984, + "rewards/rejected": -9.140637397766113, + "step": 1240 + }, + { + "epoch": 0.8105151440933954, + "grad_norm": 24.36231097267006, + "learning_rate": 2.1100701984864354e-08, + "logits/chosen": -0.924676775932312, + "logits/rejected": -0.9948889017105103, + "logps/chosen": -1087.4019775390625, + "logps/rejected": -1230.5699462890625, + "loss": 0.3895, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.527956962585449, + "rewards/margins": 1.3158284425735474, + "rewards/rejected": -6.843785285949707, + "step": 1241 + }, + { + "epoch": 0.8111682586333578, + "grad_norm": 29.98400675361614, + "learning_rate": 2.096073304812408e-08, + "logits/chosen": -0.8991495966911316, + "logits/rejected": -0.9502657651901245, + "logps/chosen": -1102.3123779296875, + "logps/rejected": -1379.7208251953125, + "loss": 0.3557, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.3212361335754395, + "rewards/margins": 2.6031382083892822, + "rewards/rejected": -8.924375534057617, + "step": 1242 + }, + { + "epoch": 0.8118213731733203, + "grad_norm": 19.8045035109399, + "learning_rate": 2.0821175521134203e-08, + "logits/chosen": -0.8717656135559082, + "logits/rejected": -0.8530954122543335, + "logps/chosen": -978.0860595703125, + "logps/rejected": -1278.1768798828125, + "loss": 0.3568, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.698294162750244, + "rewards/margins": 2.899409294128418, + "rewards/rejected": -7.59770393371582, + "step": 1243 + }, + { + "epoch": 0.8124744877132827, + "grad_norm": 23.529826779582308, + "learning_rate": 2.0682030130309937e-08, + "logits/chosen": -1.0024141073226929, + "logits/rejected": -0.8768361806869507, + "logps/chosen": -1081.8538818359375, + "logps/rejected": -1280.859375, + "loss": 0.3559, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.240067481994629, + "rewards/margins": 2.442638397216797, + "rewards/rejected": -7.682706356048584, + "step": 1244 + }, + { + "epoch": 0.8131276022532452, + "grad_norm": 36.84318502121178, + "learning_rate": 2.0543297599921305e-08, + "logits/chosen": -0.8412925004959106, + "logits/rejected": -0.7551301121711731, + "logps/chosen": -1126.511474609375, + "logps/rejected": -1374.8885498046875, + "loss": 0.4481, + "rewards/accuracies": 0.78125, + "rewards/chosen": -6.242119312286377, + "rewards/margins": 2.2767138481140137, + "rewards/rejected": -8.51883316040039, + "step": 1245 + }, + { + "epoch": 0.8137807167932076, + "grad_norm": 27.498227486362723, + "learning_rate": 2.0404978652089323e-08, + "logits/chosen": -0.6808075904846191, + "logits/rejected": -0.7349047660827637, + "logps/chosen": -877.9880981445312, + "logps/rejected": -1123.840087890625, + "loss": 0.3956, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.855025291442871, + "rewards/margins": 1.9713177680969238, + "rewards/rejected": -6.826342582702637, + "step": 1246 + }, + { + "epoch": 0.81443383133317, + "grad_norm": 31.33636263030433, + "learning_rate": 2.0267074006782235e-08, + "logits/chosen": -0.7306153178215027, + "logits/rejected": -0.7039578557014465, + "logps/chosen": -1020.278076171875, + "logps/rejected": -1221.2576904296875, + "loss": 0.4415, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.24103307723999, + "rewards/margins": 1.8187503814697266, + "rewards/rejected": -7.059783458709717, + "step": 1247 + }, + { + "epoch": 0.8150869458731325, + "grad_norm": 23.27240519426899, + "learning_rate": 2.0129584381811826e-08, + "logits/chosen": -0.7635297775268555, + "logits/rejected": -0.6661262512207031, + "logps/chosen": -1066.372314453125, + "logps/rejected": -1230.029052734375, + "loss": 0.3597, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.463615894317627, + "rewards/margins": 1.785343885421753, + "rewards/rejected": -8.2489595413208, + "step": 1248 + }, + { + "epoch": 0.815740060413095, + "grad_norm": 29.215895350871378, + "learning_rate": 1.9992510492829618e-08, + "logits/chosen": -0.8741496205329895, + "logits/rejected": -0.8110368251800537, + "logps/chosen": -913.3331909179688, + "logps/rejected": -1101.623046875, + "loss": 0.3678, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.769600868225098, + "rewards/margins": 1.9510536193847656, + "rewards/rejected": -6.720654487609863, + "step": 1249 + }, + { + "epoch": 0.8163931749530574, + "grad_norm": 37.00306210512274, + "learning_rate": 1.9855853053323178e-08, + "logits/chosen": -0.9076151847839355, + "logits/rejected": -0.9450336694717407, + "logps/chosen": -978.0199584960938, + "logps/rejected": -1196.162109375, + "loss": 0.3918, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.30790376663208, + "rewards/margins": 2.2430849075317383, + "rewards/rejected": -7.550989151000977, + "step": 1250 + }, + { + "epoch": 0.8170462894930198, + "grad_norm": 26.71800819631346, + "learning_rate": 1.9719612774612404e-08, + "logits/chosen": -0.9339046478271484, + "logits/rejected": -0.9458125829696655, + "logps/chosen": -1008.9201049804688, + "logps/rejected": -1295.292724609375, + "loss": 0.4199, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.112905025482178, + "rewards/margins": 2.7176475524902344, + "rewards/rejected": -7.830551624298096, + "step": 1251 + }, + { + "epoch": 0.8176994040329822, + "grad_norm": 22.65444787969263, + "learning_rate": 1.958379036584582e-08, + "logits/chosen": -0.7930296063423157, + "logits/rejected": -0.7819668054580688, + "logps/chosen": -965.6203002929688, + "logps/rejected": -1127.9246826171875, + "loss": 0.3648, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.067156791687012, + "rewards/margins": 1.8418265581130981, + "rewards/rejected": -6.908982753753662, + "step": 1252 + }, + { + "epoch": 0.8183525185729448, + "grad_norm": 35.94238776510094, + "learning_rate": 1.9448386533996897e-08, + "logits/chosen": -0.8811150789260864, + "logits/rejected": -0.8617693185806274, + "logps/chosen": -905.3536987304688, + "logps/rejected": -1050.594482421875, + "loss": 0.4079, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.579432010650635, + "rewards/margins": 1.3525540828704834, + "rewards/rejected": -5.931985855102539, + "step": 1253 + }, + { + "epoch": 0.8190056331129072, + "grad_norm": 20.114155683568644, + "learning_rate": 1.9313401983860278e-08, + "logits/chosen": -0.7634121775627136, + "logits/rejected": -0.7326263189315796, + "logps/chosen": -1029.6595458984375, + "logps/rejected": -1226.5960693359375, + "loss": 0.3304, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.899470806121826, + "rewards/margins": 1.9020302295684814, + "rewards/rejected": -6.801499843597412, + "step": 1254 + }, + { + "epoch": 0.8196587476528696, + "grad_norm": 29.389608351454772, + "learning_rate": 1.9178837418048287e-08, + "logits/chosen": -1.035788655281067, + "logits/rejected": -1.034264326095581, + "logps/chosen": -939.3095703125, + "logps/rejected": -1057.8206787109375, + "loss": 0.4131, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.060790538787842, + "rewards/margins": 1.1942658424377441, + "rewards/rejected": -6.255056381225586, + "step": 1255 + }, + { + "epoch": 0.820311862192832, + "grad_norm": 23.633542412432398, + "learning_rate": 1.9044693536987146e-08, + "logits/chosen": -0.865630567073822, + "logits/rejected": -0.8657881617546082, + "logps/chosen": -1064.190185546875, + "logps/rejected": -1238.50390625, + "loss": 0.3135, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.71279764175415, + "rewards/margins": 1.6474566459655762, + "rewards/rejected": -7.36025333404541, + "step": 1256 + }, + { + "epoch": 0.8209649767327946, + "grad_norm": 31.576320870780314, + "learning_rate": 1.8910971038913316e-08, + "logits/chosen": -0.8827898502349854, + "logits/rejected": -0.9015779495239258, + "logps/chosen": -941.9280395507812, + "logps/rejected": -1156.314208984375, + "loss": 0.4099, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.529621124267578, + "rewards/margins": 2.061371088027954, + "rewards/rejected": -6.590992450714111, + "step": 1257 + }, + { + "epoch": 0.821618091272757, + "grad_norm": 26.25520258324422, + "learning_rate": 1.877767061986997e-08, + "logits/chosen": -0.879159688949585, + "logits/rejected": -0.8486706018447876, + "logps/chosen": -1051.3345947265625, + "logps/rejected": -1229.3165283203125, + "loss": 0.2901, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.476308822631836, + "rewards/margins": 1.9429969787597656, + "rewards/rejected": -7.41930627822876, + "step": 1258 + }, + { + "epoch": 0.8222712058127194, + "grad_norm": 27.762333821264512, + "learning_rate": 1.864479297370325e-08, + "logits/chosen": -0.8901294469833374, + "logits/rejected": -0.9367501735687256, + "logps/chosen": -1139.62890625, + "logps/rejected": -1377.3262939453125, + "loss": 0.3116, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.882551670074463, + "rewards/margins": 2.453892230987549, + "rewards/rejected": -8.336442947387695, + "step": 1259 + }, + { + "epoch": 0.8229243203526818, + "grad_norm": 19.94747891616991, + "learning_rate": 1.8512338792058745e-08, + "logits/chosen": -0.89691162109375, + "logits/rejected": -0.9170194864273071, + "logps/chosen": -1008.8082275390625, + "logps/rejected": -1234.414306640625, + "loss": 0.3774, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.763920783996582, + "rewards/margins": 2.05275559425354, + "rewards/rejected": -7.816676616668701, + "step": 1260 + }, + { + "epoch": 0.8235774348926443, + "grad_norm": 28.264623159069007, + "learning_rate": 1.838030876437784e-08, + "logits/chosen": -0.8986479640007019, + "logits/rejected": -0.8794419765472412, + "logps/chosen": -1030.9847412109375, + "logps/rejected": -1238.065673828125, + "loss": 0.4215, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.154675006866455, + "rewards/margins": 1.8639512062072754, + "rewards/rejected": -7.018627166748047, + "step": 1261 + }, + { + "epoch": 0.8242305494326068, + "grad_norm": 18.41175881448945, + "learning_rate": 1.8248703577894132e-08, + "logits/chosen": -0.7217578887939453, + "logits/rejected": -0.7507769465446472, + "logps/chosen": -950.8618774414062, + "logps/rejected": -1251.9049072265625, + "loss": 0.3286, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.839773178100586, + "rewards/margins": 2.6825602054595947, + "rewards/rejected": -7.522334098815918, + "step": 1262 + }, + { + "epoch": 0.8248836639725692, + "grad_norm": 21.04106550876138, + "learning_rate": 1.8117523917629895e-08, + "logits/chosen": -0.8280400037765503, + "logits/rejected": -0.803841233253479, + "logps/chosen": -980.7047119140625, + "logps/rejected": -1142.6981201171875, + "loss": 0.4046, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.227568626403809, + "rewards/margins": 1.4354455471038818, + "rewards/rejected": -6.6630144119262695, + "step": 1263 + }, + { + "epoch": 0.8255367785125316, + "grad_norm": 16.262147010001502, + "learning_rate": 1.798677046639244e-08, + "logits/chosen": -0.761043131351471, + "logits/rejected": -0.6971719861030579, + "logps/chosen": -947.5308837890625, + "logps/rejected": -1009.4982299804688, + "loss": 0.3669, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.8415422439575195, + "rewards/margins": 0.8895235657691956, + "rewards/rejected": -5.731066703796387, + "step": 1264 + }, + { + "epoch": 0.8261898930524941, + "grad_norm": 19.058057525896476, + "learning_rate": 1.7856443904770657e-08, + "logits/chosen": -1.1435941457748413, + "logits/rejected": -1.0320429801940918, + "logps/chosen": -1002.7920532226562, + "logps/rejected": -1178.949462890625, + "loss": 0.3415, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.919374942779541, + "rewards/margins": 2.0565154552459717, + "rewards/rejected": -6.975890636444092, + "step": 1265 + }, + { + "epoch": 0.8268430075924565, + "grad_norm": 23.23707352681105, + "learning_rate": 1.772654491113138e-08, + "logits/chosen": -0.9951072335243225, + "logits/rejected": -0.8988428711891174, + "logps/chosen": -1025.6783447265625, + "logps/rejected": -1171.551513671875, + "loss": 0.3785, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.015107154846191, + "rewards/margins": 1.7816100120544434, + "rewards/rejected": -6.796716690063477, + "step": 1266 + }, + { + "epoch": 0.827496122132419, + "grad_norm": 25.86540317412067, + "learning_rate": 1.75970741616159e-08, + "logits/chosen": -0.7014292478561401, + "logits/rejected": -0.7118043899536133, + "logps/chosen": -897.89794921875, + "logps/rejected": -1087.0958251953125, + "loss": 0.4355, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.325657844543457, + "rewards/margins": 1.5333755016326904, + "rewards/rejected": -5.859033584594727, + "step": 1267 + }, + { + "epoch": 0.8281492366723814, + "grad_norm": 18.171682491599366, + "learning_rate": 1.746803233013645e-08, + "logits/chosen": -0.7887166738510132, + "logits/rejected": -0.8805134892463684, + "logps/chosen": -869.1028442382812, + "logps/rejected": -1045.51416015625, + "loss": 0.4541, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.430434226989746, + "rewards/margins": 1.542751669883728, + "rewards/rejected": -5.973185062408447, + "step": 1268 + }, + { + "epoch": 0.8288023512123439, + "grad_norm": 19.860357482891896, + "learning_rate": 1.733942008837269e-08, + "logits/chosen": -0.8767520785331726, + "logits/rejected": -0.6930796504020691, + "logps/chosen": -1000.611572265625, + "logps/rejected": -1124.2705078125, + "loss": 0.3817, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.829126358032227, + "rewards/margins": 1.4662240743637085, + "rewards/rejected": -6.295350551605225, + "step": 1269 + }, + { + "epoch": 0.8294554657523063, + "grad_norm": 15.737729321214841, + "learning_rate": 1.721123810576821e-08, + "logits/chosen": -0.9429450035095215, + "logits/rejected": -0.8901304006576538, + "logps/chosen": -980.0512084960938, + "logps/rejected": -1083.7476806640625, + "loss": 0.3119, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.375136852264404, + "rewards/margins": 1.5506840944290161, + "rewards/rejected": -5.925821304321289, + "step": 1270 + }, + { + "epoch": 0.8301085802922687, + "grad_norm": 18.42427316829238, + "learning_rate": 1.7083487049527033e-08, + "logits/chosen": -0.977911114692688, + "logits/rejected": -0.9723238348960876, + "logps/chosen": -864.9402465820312, + "logps/rejected": -978.0227661132812, + "loss": 0.3579, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.461297512054443, + "rewards/margins": 1.4185537099838257, + "rewards/rejected": -5.8798508644104, + "step": 1271 + }, + { + "epoch": 0.8307616948322312, + "grad_norm": 28.41186955631412, + "learning_rate": 1.695616758461017e-08, + "logits/chosen": -0.8489782214164734, + "logits/rejected": -0.7123806476593018, + "logps/chosen": -989.315185546875, + "logps/rejected": -1167.1005859375, + "loss": 0.4038, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.38248348236084, + "rewards/margins": 2.059591293334961, + "rewards/rejected": -7.442075729370117, + "step": 1272 + }, + { + "epoch": 0.8314148093721937, + "grad_norm": 17.295026440713308, + "learning_rate": 1.6829280373732123e-08, + "logits/chosen": -0.8142927885055542, + "logits/rejected": -0.7259283661842346, + "logps/chosen": -954.6568603515625, + "logps/rejected": -1167.76416015625, + "loss": 0.3945, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.306705474853516, + "rewards/margins": 2.2607154846191406, + "rewards/rejected": -7.56742000579834, + "step": 1273 + }, + { + "epoch": 0.8320679239121561, + "grad_norm": 19.266167582706977, + "learning_rate": 1.670282607735748e-08, + "logits/chosen": -0.7883981466293335, + "logits/rejected": -0.5904198884963989, + "logps/chosen": -1010.0977783203125, + "logps/rejected": -1187.25634765625, + "loss": 0.3784, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.103325843811035, + "rewards/margins": 2.1519057750701904, + "rewards/rejected": -7.255231857299805, + "step": 1274 + }, + { + "epoch": 0.8327210384521185, + "grad_norm": 19.640653551050946, + "learning_rate": 1.657680535369744e-08, + "logits/chosen": -0.8553881645202637, + "logits/rejected": -0.783621609210968, + "logps/chosen": -958.4749755859375, + "logps/rejected": -1111.0194091796875, + "loss": 0.3705, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.92903995513916, + "rewards/margins": 1.7265620231628418, + "rewards/rejected": -6.6556010246276855, + "step": 1275 + }, + { + "epoch": 0.833374152992081, + "grad_norm": 40.42956174656325, + "learning_rate": 1.645121885870637e-08, + "logits/chosen": -0.8692556619644165, + "logits/rejected": -0.8151346445083618, + "logps/chosen": -1030.67724609375, + "logps/rejected": -1192.6016845703125, + "loss": 0.4011, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.321305751800537, + "rewards/margins": 1.7291826009750366, + "rewards/rejected": -7.050488471984863, + "step": 1276 + }, + { + "epoch": 0.8340272675320435, + "grad_norm": 27.494527127687896, + "learning_rate": 1.6326067246078455e-08, + "logits/chosen": -0.7602800130844116, + "logits/rejected": -0.7602342367172241, + "logps/chosen": -919.4981079101562, + "logps/rejected": -1069.651611328125, + "loss": 0.3332, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.5918426513671875, + "rewards/margins": 1.435691475868225, + "rewards/rejected": -6.027534008026123, + "step": 1277 + }, + { + "epoch": 0.8346803820720059, + "grad_norm": 17.408825623095783, + "learning_rate": 1.620135116724427e-08, + "logits/chosen": -0.8369264602661133, + "logits/rejected": -0.7914663553237915, + "logps/chosen": -1004.7100219726562, + "logps/rejected": -1143.3367919921875, + "loss": 0.3466, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.365845680236816, + "rewards/margins": 1.667271375656128, + "rewards/rejected": -7.033117294311523, + "step": 1278 + }, + { + "epoch": 0.8353334966119683, + "grad_norm": 18.46052855986182, + "learning_rate": 1.607707127136734e-08, + "logits/chosen": -0.9659842848777771, + "logits/rejected": -0.9460926055908203, + "logps/chosen": -1054.6634521484375, + "logps/rejected": -1332.1539306640625, + "loss": 0.3493, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.598286151885986, + "rewards/margins": 2.5486865043640137, + "rewards/rejected": -8.14697265625, + "step": 1279 + }, + { + "epoch": 0.8359866111519307, + "grad_norm": 52.90133689155389, + "learning_rate": 1.595322820534084e-08, + "logits/chosen": -1.1675491333007812, + "logits/rejected": -0.9613858461380005, + "logps/chosen": -981.5598754882812, + "logps/rejected": -1093.9095458984375, + "loss": 0.4236, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.7773823738098145, + "rewards/margins": 1.6107470989227295, + "rewards/rejected": -6.388129234313965, + "step": 1280 + }, + { + "epoch": 0.8366397256918933, + "grad_norm": 25.872062130935507, + "learning_rate": 1.582982261378416e-08, + "logits/chosen": -0.718327522277832, + "logits/rejected": -0.7760236859321594, + "logps/chosen": -1003.913330078125, + "logps/rejected": -1260.5673828125, + "loss": 0.3669, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.025420188903809, + "rewards/margins": 2.33792781829834, + "rewards/rejected": -7.36334753036499, + "step": 1281 + }, + { + "epoch": 0.8372928402318557, + "grad_norm": 47.99050518823117, + "learning_rate": 1.5706855139039598e-08, + "logits/chosen": -0.6850143671035767, + "logits/rejected": -0.7747923135757446, + "logps/chosen": -880.8267211914062, + "logps/rejected": -1138.4072265625, + "loss": 0.3798, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.495758533477783, + "rewards/margins": 1.8549747467041016, + "rewards/rejected": -6.350733280181885, + "step": 1282 + }, + { + "epoch": 0.8379459547718181, + "grad_norm": 19.640501117831906, + "learning_rate": 1.5584326421168982e-08, + "logits/chosen": -0.9612534642219543, + "logits/rejected": -0.9289946556091309, + "logps/chosen": -1052.7708740234375, + "logps/rejected": -1209.37353515625, + "loss": 0.3761, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.764304161071777, + "rewards/margins": 1.2603614330291748, + "rewards/rejected": -7.024665832519531, + "step": 1283 + }, + { + "epoch": 0.8385990693117805, + "grad_norm": 17.854998086406365, + "learning_rate": 1.546223709795036e-08, + "logits/chosen": -0.9502691626548767, + "logits/rejected": -0.8256719708442688, + "logps/chosen": -876.907470703125, + "logps/rejected": -1027.9510498046875, + "loss": 0.3932, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.2758636474609375, + "rewards/margins": 1.5407800674438477, + "rewards/rejected": -5.816644191741943, + "step": 1284 + }, + { + "epoch": 0.839252183851743, + "grad_norm": 18.55849706082879, + "learning_rate": 1.534058780487466e-08, + "logits/chosen": -0.9169440269470215, + "logits/rejected": -0.9256635904312134, + "logps/chosen": -974.2261352539062, + "logps/rejected": -1282.3883056640625, + "loss": 0.3313, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.323151111602783, + "rewards/margins": 2.5996651649475098, + "rewards/rejected": -6.922817230224609, + "step": 1285 + }, + { + "epoch": 0.8399052983917055, + "grad_norm": 17.71886089469172, + "learning_rate": 1.5219379175142422e-08, + "logits/chosen": -0.8969942927360535, + "logits/rejected": -0.8221906423568726, + "logps/chosen": -997.0826416015625, + "logps/rejected": -1204.2249755859375, + "loss": 0.3596, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.254983901977539, + "rewards/margins": 2.350426197052002, + "rewards/rejected": -7.605410575866699, + "step": 1286 + }, + { + "epoch": 0.8405584129316679, + "grad_norm": 18.76637143478923, + "learning_rate": 1.509861183966048e-08, + "logits/chosen": -1.0163681507110596, + "logits/rejected": -0.9551371932029724, + "logps/chosen": -1005.5806884765625, + "logps/rejected": -1197.47119140625, + "loss": 0.3389, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.851705074310303, + "rewards/margins": 2.037259340286255, + "rewards/rejected": -6.8889641761779785, + "step": 1287 + }, + { + "epoch": 0.8412115274716303, + "grad_norm": 26.75304121034945, + "learning_rate": 1.49782864270386e-08, + "logits/chosen": -0.8632940649986267, + "logits/rejected": -0.8258289694786072, + "logps/chosen": -895.1854858398438, + "logps/rejected": -1046.0596923828125, + "loss": 0.3176, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.798386573791504, + "rewards/margins": 1.3754218816757202, + "rewards/rejected": -6.1738080978393555, + "step": 1288 + }, + { + "epoch": 0.8418646420115928, + "grad_norm": 24.281872431787555, + "learning_rate": 1.4858403563586364e-08, + "logits/chosen": -0.7966564893722534, + "logits/rejected": -0.781533420085907, + "logps/chosen": -933.5511474609375, + "logps/rejected": -1309.903564453125, + "loss": 0.3223, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.543218612670898, + "rewards/margins": 2.614068031311035, + "rewards/rejected": -7.157286167144775, + "step": 1289 + }, + { + "epoch": 0.8425177565515553, + "grad_norm": 18.318840074907698, + "learning_rate": 1.4738963873309796e-08, + "logits/chosen": -1.0536984205245972, + "logits/rejected": -0.9841817021369934, + "logps/chosen": -1026.2513427734375, + "logps/rejected": -1251.1527099609375, + "loss": 0.4029, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.173938751220703, + "rewards/margins": 2.2958824634552, + "rewards/rejected": -7.469821453094482, + "step": 1290 + }, + { + "epoch": 0.8431708710915177, + "grad_norm": 19.079583722272528, + "learning_rate": 1.4619967977908154e-08, + "logits/chosen": -0.920739471912384, + "logits/rejected": -0.8924881815910339, + "logps/chosen": -936.5213623046875, + "logps/rejected": -1040.7528076171875, + "loss": 0.3601, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.9877400398254395, + "rewards/margins": 1.2131364345550537, + "rewards/rejected": -6.200876235961914, + "step": 1291 + }, + { + "epoch": 0.8438239856314801, + "grad_norm": 21.171536102950306, + "learning_rate": 1.450141649677067e-08, + "logits/chosen": -0.7656310796737671, + "logits/rejected": -0.6925913095474243, + "logps/chosen": -938.579833984375, + "logps/rejected": -1122.44873046875, + "loss": 0.3767, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.016373157501221, + "rewards/margins": 1.5836703777313232, + "rewards/rejected": -6.600043296813965, + "step": 1292 + }, + { + "epoch": 0.8444771001714426, + "grad_norm": 26.59407222785762, + "learning_rate": 1.4383310046973362e-08, + "logits/chosen": -0.9900301694869995, + "logits/rejected": -0.9108420610427856, + "logps/chosen": -944.6455078125, + "logps/rejected": -1131.0509033203125, + "loss": 0.3737, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.449494361877441, + "rewards/margins": 2.0268986225128174, + "rewards/rejected": -6.476393222808838, + "step": 1293 + }, + { + "epoch": 0.845130214711405, + "grad_norm": 21.349267385371785, + "learning_rate": 1.426564924327578e-08, + "logits/chosen": -0.7166858911514282, + "logits/rejected": -0.7389101982116699, + "logps/chosen": -960.0745849609375, + "logps/rejected": -1161.590576171875, + "loss": 0.3609, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.704523086547852, + "rewards/margins": 1.972301721572876, + "rewards/rejected": -6.676824569702148, + "step": 1294 + }, + { + "epoch": 0.8457833292513675, + "grad_norm": 17.76499362390843, + "learning_rate": 1.414843469811785e-08, + "logits/chosen": -0.8625224232673645, + "logits/rejected": -0.8022060394287109, + "logps/chosen": -844.1168212890625, + "logps/rejected": -1009.7998657226562, + "loss": 0.297, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.143881320953369, + "rewards/margins": 1.9904934167861938, + "rewards/rejected": -6.134374618530273, + "step": 1295 + }, + { + "epoch": 0.8464364437913299, + "grad_norm": 27.604014088902886, + "learning_rate": 1.4031667021616644e-08, + "logits/chosen": -0.8415222764015198, + "logits/rejected": -0.833803653717041, + "logps/chosen": -940.93994140625, + "logps/rejected": -1100.514892578125, + "loss": 0.3987, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.876412391662598, + "rewards/margins": 1.3780218362808228, + "rewards/rejected": -6.254434108734131, + "step": 1296 + }, + { + "epoch": 0.8470895583312924, + "grad_norm": 30.61975390516564, + "learning_rate": 1.3915346821563234e-08, + "logits/chosen": -0.9406237602233887, + "logits/rejected": -0.9669230580329895, + "logps/chosen": -1041.82763671875, + "logps/rejected": -1264.9764404296875, + "loss": 0.4221, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.639151096343994, + "rewards/margins": 2.355329990386963, + "rewards/rejected": -7.994481563568115, + "step": 1297 + }, + { + "epoch": 0.8477426728712548, + "grad_norm": 47.86314159060928, + "learning_rate": 1.3799474703419511e-08, + "logits/chosen": -0.7521982192993164, + "logits/rejected": -0.7642690539360046, + "logps/chosen": -1003.8787231445312, + "logps/rejected": -1157.30029296875, + "loss": 0.3954, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.433172225952148, + "rewards/margins": 1.7837271690368652, + "rewards/rejected": -7.216899394989014, + "step": 1298 + }, + { + "epoch": 0.8483957874112172, + "grad_norm": 20.5044227740988, + "learning_rate": 1.3684051270315056e-08, + "logits/chosen": -0.8989154100418091, + "logits/rejected": -0.892413854598999, + "logps/chosen": -1036.949462890625, + "logps/rejected": -1166.9058837890625, + "loss": 0.3666, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.681972026824951, + "rewards/margins": 1.3601738214492798, + "rewards/rejected": -7.0421462059021, + "step": 1299 + }, + { + "epoch": 0.8490489019511797, + "grad_norm": 20.765349915051292, + "learning_rate": 1.356907712304397e-08, + "logits/chosen": -0.8609282374382019, + "logits/rejected": -0.8219651579856873, + "logps/chosen": -944.8560180664062, + "logps/rejected": -1174.7581787109375, + "loss": 0.3058, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.124340057373047, + "rewards/margins": 2.451206922531128, + "rewards/rejected": -7.575546741485596, + "step": 1300 + }, + { + "epoch": 0.8490489019511797, + "eval_logits/chosen": -0.668989896774292, + "eval_logits/rejected": -0.6087003350257874, + "eval_logps/chosen": -986.1474609375, + "eval_logps/rejected": -1149.446533203125, + "eval_loss": 0.3876606225967407, + "eval_rewards/accuracies": 0.8040000200271606, + "eval_rewards/chosen": -5.122402191162109, + "eval_rewards/margins": 1.7203764915466309, + "eval_rewards/rejected": -6.84277868270874, + "eval_runtime": 615.7139, + "eval_samples_per_second": 6.497, + "eval_steps_per_second": 0.406, + "step": 1300 + }, + { + "epoch": 0.8497020164911422, + "grad_norm": 23.618304831288146, + "learning_rate": 1.3454552860061775e-08, + "logits/chosen": -0.9938647150993347, + "logits/rejected": -0.9096130728721619, + "logps/chosen": -1019.2464599609375, + "logps/rejected": -1136.9942626953125, + "loss": 0.3821, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.018831253051758, + "rewards/margins": 1.5127394199371338, + "rewards/rejected": -6.5315704345703125, + "step": 1301 + }, + { + "epoch": 0.8503551310311046, + "grad_norm": 26.652680433445912, + "learning_rate": 1.3340479077482269e-08, + "logits/chosen": -0.6800895929336548, + "logits/rejected": -0.5056519508361816, + "logps/chosen": -1047.76904296875, + "logps/rejected": -1239.179931640625, + "loss": 0.3573, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.758296489715576, + "rewards/margins": 2.2488903999328613, + "rewards/rejected": -8.007186889648438, + "step": 1302 + }, + { + "epoch": 0.851008245571067, + "grad_norm": 17.976290131053773, + "learning_rate": 1.3226856369074469e-08, + "logits/chosen": -0.9246777892112732, + "logits/rejected": -0.8199312090873718, + "logps/chosen": -956.479736328125, + "logps/rejected": -1215.670166015625, + "loss": 0.4105, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.915981769561768, + "rewards/margins": 2.1597423553466797, + "rewards/rejected": -7.0757246017456055, + "step": 1303 + }, + { + "epoch": 0.8516613601110294, + "grad_norm": 18.773093682434446, + "learning_rate": 1.3113685326259449e-08, + "logits/chosen": -0.9623532891273499, + "logits/rejected": -0.9360837340354919, + "logps/chosen": -991.7564086914062, + "logps/rejected": -1178.57958984375, + "loss": 0.3686, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.276458740234375, + "rewards/margins": 1.8608438968658447, + "rewards/rejected": -7.137301921844482, + "step": 1304 + }, + { + "epoch": 0.852314474650992, + "grad_norm": 19.590640282587913, + "learning_rate": 1.3000966538107338e-08, + "logits/chosen": -0.9980742931365967, + "logits/rejected": -0.8249813914299011, + "logps/chosen": -965.131591796875, + "logps/rejected": -1122.595703125, + "loss": 0.3543, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.103146553039551, + "rewards/margins": 2.172144889831543, + "rewards/rejected": -7.2752909660339355, + "step": 1305 + }, + { + "epoch": 0.8529675891909544, + "grad_norm": 23.660495897995975, + "learning_rate": 1.2888700591334224e-08, + "logits/chosen": -0.8977683782577515, + "logits/rejected": -0.9450709819793701, + "logps/chosen": -920.9202270507812, + "logps/rejected": -1138.51513671875, + "loss": 0.3775, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.005034446716309, + "rewards/margins": 1.9082834720611572, + "rewards/rejected": -6.913317680358887, + "step": 1306 + }, + { + "epoch": 0.8536207037309168, + "grad_norm": 25.403831749047146, + "learning_rate": 1.2776888070299074e-08, + "logits/chosen": -0.692973256111145, + "logits/rejected": -0.6692020297050476, + "logps/chosen": -872.77392578125, + "logps/rejected": -1088.57421875, + "loss": 0.4124, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.830261707305908, + "rewards/margins": 2.1663036346435547, + "rewards/rejected": -6.996564865112305, + "step": 1307 + }, + { + "epoch": 0.8542738182708792, + "grad_norm": 36.11944114089828, + "learning_rate": 1.2665529557000721e-08, + "logits/chosen": -0.8878995776176453, + "logits/rejected": -0.8831501007080078, + "logps/chosen": -1004.2806396484375, + "logps/rejected": -1308.138916015625, + "loss": 0.3876, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.11844539642334, + "rewards/margins": 2.552079916000366, + "rewards/rejected": -7.670524597167969, + "step": 1308 + }, + { + "epoch": 0.8549269328108418, + "grad_norm": 19.374431409557353, + "learning_rate": 1.2554625631074845e-08, + "logits/chosen": -0.9898017048835754, + "logits/rejected": -0.8030941486358643, + "logps/chosen": -979.740234375, + "logps/rejected": -1153.956298828125, + "loss": 0.4099, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.388953685760498, + "rewards/margins": 2.137970447540283, + "rewards/rejected": -7.526924133300781, + "step": 1309 + }, + { + "epoch": 0.8555800473508042, + "grad_norm": 19.549188171242683, + "learning_rate": 1.2444176869790923e-08, + "logits/chosen": -0.8904908895492554, + "logits/rejected": -0.8610888719558716, + "logps/chosen": -1027.5037841796875, + "logps/rejected": -1165.888427734375, + "loss": 0.3336, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.47878885269165, + "rewards/margins": 1.3745296001434326, + "rewards/rejected": -6.853318214416504, + "step": 1310 + }, + { + "epoch": 0.8562331618907666, + "grad_norm": 32.18918359672568, + "learning_rate": 1.2334183848049218e-08, + "logits/chosen": -0.8858809471130371, + "logits/rejected": -0.9060701727867126, + "logps/chosen": -936.8377685546875, + "logps/rejected": -1113.317626953125, + "loss": 0.3508, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.633793830871582, + "rewards/margins": 1.7703123092651367, + "rewards/rejected": -6.404106140136719, + "step": 1311 + }, + { + "epoch": 0.856886276430729, + "grad_norm": 23.153513339145228, + "learning_rate": 1.2224647138377852e-08, + "logits/chosen": -0.879304826259613, + "logits/rejected": -0.860927164554596, + "logps/chosen": -909.355712890625, + "logps/rejected": -1061.8111572265625, + "loss": 0.4269, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.555379867553711, + "rewards/margins": 1.5526858568191528, + "rewards/rejected": -6.108066082000732, + "step": 1312 + }, + { + "epoch": 0.8575393909706915, + "grad_norm": 19.303528656660927, + "learning_rate": 1.2115567310929764e-08, + "logits/chosen": -1.0215394496917725, + "logits/rejected": -0.8691954612731934, + "logps/chosen": -959.844970703125, + "logps/rejected": -1066.708251953125, + "loss": 0.346, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.928271770477295, + "rewards/margins": 1.406358242034912, + "rewards/rejected": -6.334630012512207, + "step": 1313 + }, + { + "epoch": 0.858192505510654, + "grad_norm": 35.2373703782269, + "learning_rate": 1.2006944933479757e-08, + "logits/chosen": -0.93189936876297, + "logits/rejected": -0.7885631918907166, + "logps/chosen": -965.0560913085938, + "logps/rejected": -1143.0811767578125, + "loss": 0.3388, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.93606424331665, + "rewards/margins": 1.9708058834075928, + "rewards/rejected": -6.906870365142822, + "step": 1314 + }, + { + "epoch": 0.8588456200506164, + "grad_norm": 16.96897023927652, + "learning_rate": 1.1898780571421552e-08, + "logits/chosen": -0.7246195077896118, + "logits/rejected": -0.5464783906936646, + "logps/chosen": -1039.4256591796875, + "logps/rejected": -1129.1058349609375, + "loss": 0.3656, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.497856616973877, + "rewards/margins": 1.3049273490905762, + "rewards/rejected": -6.802783966064453, + "step": 1315 + }, + { + "epoch": 0.8594987345905788, + "grad_norm": 19.679916600270765, + "learning_rate": 1.1791074787764843e-08, + "logits/chosen": -1.0116046667099, + "logits/rejected": -0.7844774723052979, + "logps/chosen": -1002.8685913085938, + "logps/rejected": -1154.26904296875, + "loss": 0.3629, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.192471504211426, + "rewards/margins": 2.362677812576294, + "rewards/rejected": -7.555149555206299, + "step": 1316 + }, + { + "epoch": 0.8601518491305413, + "grad_norm": 25.296782369673497, + "learning_rate": 1.1683828143132357e-08, + "logits/chosen": -0.830581545829773, + "logits/rejected": -0.9192355275154114, + "logps/chosen": -1129.3555908203125, + "logps/rejected": -1313.74658203125, + "loss": 0.4019, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.07790994644165, + "rewards/margins": 2.141233205795288, + "rewards/rejected": -8.219143867492676, + "step": 1317 + }, + { + "epoch": 0.8608049636705037, + "grad_norm": 29.317004046508643, + "learning_rate": 1.1577041195756954e-08, + "logits/chosen": -0.9937784671783447, + "logits/rejected": -0.9926940202713013, + "logps/chosen": -1031.589111328125, + "logps/rejected": -1172.4385986328125, + "loss": 0.3605, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.357040882110596, + "rewards/margins": 1.5132580995559692, + "rewards/rejected": -6.870299816131592, + "step": 1318 + }, + { + "epoch": 0.8614580782104662, + "grad_norm": 16.67396538130868, + "learning_rate": 1.1470714501478684e-08, + "logits/chosen": -0.9257397651672363, + "logits/rejected": -0.8267370462417603, + "logps/chosen": -980.4049072265625, + "logps/rejected": -1161.0723876953125, + "loss": 0.3556, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.501986503601074, + "rewards/margins": 1.8758596181869507, + "rewards/rejected": -7.3778462409973145, + "step": 1319 + }, + { + "epoch": 0.8621111927504286, + "grad_norm": 18.03638683569069, + "learning_rate": 1.136484861374195e-08, + "logits/chosen": -0.7538772225379944, + "logits/rejected": -0.7899537086486816, + "logps/chosen": -1040.8414306640625, + "logps/rejected": -1305.1431884765625, + "loss": 0.395, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.39848518371582, + "rewards/margins": 2.092883586883545, + "rewards/rejected": -7.491368770599365, + "step": 1320 + }, + { + "epoch": 0.862764307290391, + "grad_norm": 25.436755545695352, + "learning_rate": 1.1259444083592584e-08, + "logits/chosen": -0.9210352897644043, + "logits/rejected": -0.848932147026062, + "logps/chosen": -1121.5576171875, + "logps/rejected": -1202.206787109375, + "loss": 0.3479, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.757541179656982, + "rewards/margins": 1.2138102054595947, + "rewards/rejected": -6.971351146697998, + "step": 1321 + }, + { + "epoch": 0.8634174218303535, + "grad_norm": 17.62570364052275, + "learning_rate": 1.115450145967497e-08, + "logits/chosen": -0.8840048313140869, + "logits/rejected": -0.7672009468078613, + "logps/chosen": -902.815673828125, + "logps/rejected": -1068.86279296875, + "loss": 0.3754, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.672972202301025, + "rewards/margins": 1.7041089534759521, + "rewards/rejected": -6.377080917358398, + "step": 1322 + }, + { + "epoch": 0.864070536370316, + "grad_norm": 20.05848285615795, + "learning_rate": 1.1050021288229238e-08, + "logits/chosen": -0.883508563041687, + "logits/rejected": -0.9728587865829468, + "logps/chosen": -922.3805541992188, + "logps/rejected": -1065.12158203125, + "loss": 0.3801, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.737578392028809, + "rewards/margins": 1.3487595319747925, + "rewards/rejected": -6.086338043212891, + "step": 1323 + }, + { + "epoch": 0.8647236509102784, + "grad_norm": 17.24172271225209, + "learning_rate": 1.094600411308838e-08, + "logits/chosen": -0.9118102192878723, + "logits/rejected": -0.8039119839668274, + "logps/chosen": -939.2774047851562, + "logps/rejected": -1240.79638671875, + "loss": 0.366, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.814789295196533, + "rewards/margins": 2.4467813968658447, + "rewards/rejected": -7.261570930480957, + "step": 1324 + }, + { + "epoch": 0.8653767654502408, + "grad_norm": 47.60072526382748, + "learning_rate": 1.0842450475675447e-08, + "logits/chosen": -0.8272143602371216, + "logits/rejected": -0.7395732402801514, + "logps/chosen": -982.828125, + "logps/rejected": -1292.427001953125, + "loss": 0.3915, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.002832889556885, + "rewards/margins": 2.7571911811828613, + "rewards/rejected": -7.760024070739746, + "step": 1325 + }, + { + "epoch": 0.8660298799902033, + "grad_norm": 37.03434033578114, + "learning_rate": 1.0739360915000684e-08, + "logits/chosen": -0.6964623332023621, + "logits/rejected": -0.765415370464325, + "logps/chosen": -1048.430419921875, + "logps/rejected": -1307.3927001953125, + "loss": 0.3597, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.442670822143555, + "rewards/margins": 2.116424560546875, + "rewards/rejected": -7.55909538269043, + "step": 1326 + }, + { + "epoch": 0.8666829945301657, + "grad_norm": 24.87566087436779, + "learning_rate": 1.0636735967658783e-08, + "logits/chosen": -0.7556520104408264, + "logits/rejected": -0.7704268097877502, + "logps/chosen": -928.6072998046875, + "logps/rejected": -1152.6773681640625, + "loss": 0.4114, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.012763500213623, + "rewards/margins": 2.03975510597229, + "rewards/rejected": -7.052518844604492, + "step": 1327 + }, + { + "epoch": 0.8673361090701281, + "grad_norm": 28.9166809841385, + "learning_rate": 1.053457616782606e-08, + "logits/chosen": -0.8666751384735107, + "logits/rejected": -0.79831463098526, + "logps/chosen": -938.53564453125, + "logps/rejected": -1244.32177734375, + "loss": 0.3871, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.967408180236816, + "rewards/margins": 2.6878042221069336, + "rewards/rejected": -7.65521240234375, + "step": 1328 + }, + { + "epoch": 0.8679892236100906, + "grad_norm": 18.0419834090539, + "learning_rate": 1.0432882047257662e-08, + "logits/chosen": -0.7430148720741272, + "logits/rejected": -0.725567102432251, + "logps/chosen": -994.3425903320312, + "logps/rejected": -1212.511962890625, + "loss": 0.3318, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.153656482696533, + "rewards/margins": 2.336616039276123, + "rewards/rejected": -7.49027156829834, + "step": 1329 + }, + { + "epoch": 0.8686423381500531, + "grad_norm": 26.724255517519683, + "learning_rate": 1.0331654135284828e-08, + "logits/chosen": -0.9742575883865356, + "logits/rejected": -0.9050071239471436, + "logps/chosen": -986.375732421875, + "logps/rejected": -1160.0706787109375, + "loss": 0.4229, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.533772945404053, + "rewards/margins": 1.7066081762313843, + "rewards/rejected": -7.240381717681885, + "step": 1330 + }, + { + "epoch": 0.8692954526900155, + "grad_norm": 41.620788625763744, + "learning_rate": 1.0230892958812121e-08, + "logits/chosen": -0.7415061593055725, + "logits/rejected": -0.7691062092781067, + "logps/chosen": -1047.3677978515625, + "logps/rejected": -1379.028076171875, + "loss": 0.3797, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.393304824829102, + "rewards/margins": 2.6148576736450195, + "rewards/rejected": -8.008163452148438, + "step": 1331 + }, + { + "epoch": 0.8699485672299779, + "grad_norm": 22.984334643739626, + "learning_rate": 1.0130599042314692e-08, + "logits/chosen": -0.9216421842575073, + "logits/rejected": -0.8291821479797363, + "logps/chosen": -998.3714599609375, + "logps/rejected": -1113.957763671875, + "loss": 0.3409, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.584649562835693, + "rewards/margins": 1.312453269958496, + "rewards/rejected": -6.8971028327941895, + "step": 1332 + }, + { + "epoch": 0.8706016817699403, + "grad_norm": 24.900796453353124, + "learning_rate": 1.0030772907835483e-08, + "logits/chosen": -0.908165454864502, + "logits/rejected": -0.8499115705490112, + "logps/chosen": -1064.900634765625, + "logps/rejected": -1277.716064453125, + "loss": 0.3541, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.384472370147705, + "rewards/margins": 1.7705702781677246, + "rewards/rejected": -7.155043125152588, + "step": 1333 + }, + { + "epoch": 0.8712547963099029, + "grad_norm": 27.084076031409385, + "learning_rate": 9.931415074982652e-09, + "logits/chosen": -0.7743809819221497, + "logits/rejected": -0.8256383538246155, + "logps/chosen": -994.769287109375, + "logps/rejected": -1325.2421875, + "loss": 0.3249, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.3938307762146, + "rewards/margins": 2.0958592891693115, + "rewards/rejected": -7.489689826965332, + "step": 1334 + }, + { + "epoch": 0.8719079108498653, + "grad_norm": 19.780940598640473, + "learning_rate": 9.83252606092675e-09, + "logits/chosen": -0.6853234767913818, + "logits/rejected": -0.7641116976737976, + "logps/chosen": -982.231689453125, + "logps/rejected": -1285.2666015625, + "loss": 0.322, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.414259910583496, + "rewards/margins": 2.343747138977051, + "rewards/rejected": -7.758007049560547, + "step": 1335 + }, + { + "epoch": 0.8725610253898277, + "grad_norm": 21.680307984905237, + "learning_rate": 9.734106380398022e-09, + "logits/chosen": -0.7005580067634583, + "logits/rejected": -0.6642061471939087, + "logps/chosen": -933.6368408203125, + "logps/rejected": -1211.7689208984375, + "loss": 0.3904, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.866269111633301, + "rewards/margins": 2.560360908508301, + "rewards/rejected": -7.426629543304443, + "step": 1336 + }, + { + "epoch": 0.8732141399297901, + "grad_norm": 31.102753169843872, + "learning_rate": 9.636156545683883e-09, + "logits/chosen": -1.025209665298462, + "logits/rejected": -0.9281519651412964, + "logps/chosen": -1016.0449829101562, + "logps/rejected": -1132.5885009765625, + "loss": 0.4202, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.55272912979126, + "rewards/margins": 1.6035462617874146, + "rewards/rejected": -7.156274795532227, + "step": 1337 + }, + { + "epoch": 0.8738672544697527, + "grad_norm": 17.364332861748135, + "learning_rate": 9.538677066626022e-09, + "logits/chosen": -0.7068428993225098, + "logits/rejected": -0.7502703666687012, + "logps/chosen": -974.12939453125, + "logps/rejected": -1213.2840576171875, + "loss": 0.2782, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.726061820983887, + "rewards/margins": 2.2374606132507324, + "rewards/rejected": -6.963522434234619, + "step": 1338 + }, + { + "epoch": 0.8745203690097151, + "grad_norm": 17.620559884621457, + "learning_rate": 9.441668450617923e-09, + "logits/chosen": -0.9793184995651245, + "logits/rejected": -0.9558190703392029, + "logps/chosen": -965.9219970703125, + "logps/rejected": -1112.312744140625, + "loss": 0.3764, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.9571428298950195, + "rewards/margins": 1.8213887214660645, + "rewards/rejected": -6.778531551361084, + "step": 1339 + }, + { + "epoch": 0.8751734835496775, + "grad_norm": 32.17387765620781, + "learning_rate": 9.345131202602164e-09, + "logits/chosen": -1.1392197608947754, + "logits/rejected": -0.9780033826828003, + "logps/chosen": -975.8745727539062, + "logps/rejected": -1259.0980224609375, + "loss": 0.3324, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.840333461761475, + "rewards/margins": 3.1771368980407715, + "rewards/rejected": -8.017470359802246, + "step": 1340 + }, + { + "epoch": 0.8758265980896399, + "grad_norm": 21.131034779855497, + "learning_rate": 9.249065825067758e-09, + "logits/chosen": -0.8789793848991394, + "logits/rejected": -0.8509078621864319, + "logps/chosen": -923.0077514648438, + "logps/rejected": -1089.787353515625, + "loss": 0.3321, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.57816219329834, + "rewards/margins": 1.576073408126831, + "rewards/rejected": -6.15423583984375, + "step": 1341 + }, + { + "epoch": 0.8764797126296024, + "grad_norm": 29.509029464687337, + "learning_rate": 9.153472818047625e-09, + "logits/chosen": -0.8012582659721375, + "logits/rejected": -0.8273895382881165, + "logps/chosen": -1030.0450439453125, + "logps/rejected": -1222.031494140625, + "loss": 0.495, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.729059219360352, + "rewards/margins": 1.3719710111618042, + "rewards/rejected": -7.101030349731445, + "step": 1342 + }, + { + "epoch": 0.8771328271695649, + "grad_norm": 18.846614157308878, + "learning_rate": 9.058352679115877e-09, + "logits/chosen": -1.0124015808105469, + "logits/rejected": -1.0128324031829834, + "logps/chosen": -1058.6331787109375, + "logps/rejected": -1228.419189453125, + "loss": 0.2927, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.294768333435059, + "rewards/margins": 1.7898911237716675, + "rewards/rejected": -7.084660053253174, + "step": 1343 + }, + { + "epoch": 0.8777859417095273, + "grad_norm": 24.392505614425158, + "learning_rate": 8.963705903385343e-09, + "logits/chosen": -0.7756754159927368, + "logits/rejected": -0.7199742197990417, + "logps/chosen": -916.379150390625, + "logps/rejected": -1156.7076416015625, + "loss": 0.4262, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.617804050445557, + "rewards/margins": 2.4423699378967285, + "rewards/rejected": -7.060173034667969, + "step": 1344 + }, + { + "epoch": 0.8784390562494897, + "grad_norm": 26.689071599825724, + "learning_rate": 8.869532983504857e-09, + "logits/chosen": -0.6491687297821045, + "logits/rejected": -0.6166170835494995, + "logps/chosen": -994.8203125, + "logps/rejected": -1133.805908203125, + "loss": 0.3631, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.721258163452148, + "rewards/margins": 1.3589121103286743, + "rewards/rejected": -7.080170154571533, + "step": 1345 + }, + { + "epoch": 0.8790921707894522, + "grad_norm": 20.776260248385395, + "learning_rate": 8.775834409656858e-09, + "logits/chosen": -0.8777629137039185, + "logits/rejected": -0.7253664135932922, + "logps/chosen": -1020.5516357421875, + "logps/rejected": -1153.7655029296875, + "loss": 0.3426, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.246578216552734, + "rewards/margins": 1.892094612121582, + "rewards/rejected": -7.138672828674316, + "step": 1346 + }, + { + "epoch": 0.8797452853294147, + "grad_norm": 17.81070478069546, + "learning_rate": 8.68261066955468e-09, + "logits/chosen": -0.9101294279098511, + "logits/rejected": -0.9133301973342896, + "logps/chosen": -1000.2833862304688, + "logps/rejected": -1239.5137939453125, + "loss": 0.3019, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.050441741943359, + "rewards/margins": 2.5203206539154053, + "rewards/rejected": -7.570762634277344, + "step": 1347 + }, + { + "epoch": 0.8803983998693771, + "grad_norm": 30.303361478460022, + "learning_rate": 8.589862248440139e-09, + "logits/chosen": -0.9815719723701477, + "logits/rejected": -0.9545872211456299, + "logps/chosen": -950.197021484375, + "logps/rejected": -1092.115966796875, + "loss": 0.4662, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.594487190246582, + "rewards/margins": 1.228310465812683, + "rewards/rejected": -5.8227972984313965, + "step": 1348 + }, + { + "epoch": 0.8810515144093395, + "grad_norm": 33.14583667447738, + "learning_rate": 8.497589629080925e-09, + "logits/chosen": -0.825454592704773, + "logits/rejected": -0.792005717754364, + "logps/chosen": -964.4857177734375, + "logps/rejected": -1159.8331298828125, + "loss": 0.3974, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.539005756378174, + "rewards/margins": 2.0518078804016113, + "rewards/rejected": -6.590812683105469, + "step": 1349 + }, + { + "epoch": 0.881704628949302, + "grad_norm": 31.91327189244124, + "learning_rate": 8.405793291768126e-09, + "logits/chosen": -0.7423689365386963, + "logits/rejected": -0.7918304800987244, + "logps/chosen": -929.157470703125, + "logps/rejected": -1141.266845703125, + "loss": 0.3452, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.143529891967773, + "rewards/margins": 1.6667006015777588, + "rewards/rejected": -6.810230255126953, + "step": 1350 + }, + { + "epoch": 0.8823577434892644, + "grad_norm": 14.811345823375834, + "learning_rate": 8.314473714313718e-09, + "logits/chosen": -0.89141845703125, + "logits/rejected": -0.8798700571060181, + "logps/chosen": -1057.8790283203125, + "logps/rejected": -1344.7864990234375, + "loss": 0.2692, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.591994285583496, + "rewards/margins": 2.79272198677063, + "rewards/rejected": -8.384716033935547, + "step": 1351 + }, + { + "epoch": 0.8830108580292269, + "grad_norm": 22.26665699973362, + "learning_rate": 8.223631372048068e-09, + "logits/chosen": -0.8326403498649597, + "logits/rejected": -0.8692911863327026, + "logps/chosen": -990.7032470703125, + "logps/rejected": -1251.9947509765625, + "loss": 0.3025, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.864250659942627, + "rewards/margins": 2.1364824771881104, + "rewards/rejected": -7.000733375549316, + "step": 1352 + }, + { + "epoch": 0.8836639725691893, + "grad_norm": 31.552116884148052, + "learning_rate": 8.13326673781748e-09, + "logits/chosen": -0.8896104097366333, + "logits/rejected": -0.7196336388587952, + "logps/chosen": -971.0677490234375, + "logps/rejected": -1149.3621826171875, + "loss": 0.4401, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.905688285827637, + "rewards/margins": 2.0950686931610107, + "rewards/rejected": -7.000757694244385, + "step": 1353 + }, + { + "epoch": 0.8843170871091518, + "grad_norm": 17.181228988826543, + "learning_rate": 8.043380281981738e-09, + "logits/chosen": -0.9177496433258057, + "logits/rejected": -0.7978098392486572, + "logps/chosen": -1084.5869140625, + "logps/rejected": -1308.2545166015625, + "loss": 0.3041, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.60262393951416, + "rewards/margins": 2.6088032722473145, + "rewards/rejected": -8.211426734924316, + "step": 1354 + }, + { + "epoch": 0.8849702016491142, + "grad_norm": 19.23410183751255, + "learning_rate": 7.953972472411651e-09, + "logits/chosen": -0.9430975914001465, + "logits/rejected": -0.9466529488563538, + "logps/chosen": -983.9346313476562, + "logps/rejected": -1212.6507568359375, + "loss": 0.328, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.656593322753906, + "rewards/margins": 1.8082714080810547, + "rewards/rejected": -7.464864730834961, + "step": 1355 + }, + { + "epoch": 0.8856233161890766, + "grad_norm": 21.918588042524284, + "learning_rate": 7.865043774486546e-09, + "logits/chosen": -0.853823184967041, + "logits/rejected": -0.7541569471359253, + "logps/chosen": -981.8515014648438, + "logps/rejected": -1045.3902587890625, + "loss": 0.3729, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.2487664222717285, + "rewards/margins": 1.0395838022232056, + "rewards/rejected": -6.288350582122803, + "step": 1356 + }, + { + "epoch": 0.886276430729039, + "grad_norm": 38.12880855883742, + "learning_rate": 7.776594651091994e-09, + "logits/chosen": -1.0198040008544922, + "logits/rejected": -0.9592865705490112, + "logps/chosen": -1032.7816162109375, + "logps/rejected": -1200.517578125, + "loss": 0.3652, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.793912410736084, + "rewards/margins": 1.8678948879241943, + "rewards/rejected": -7.661807060241699, + "step": 1357 + }, + { + "epoch": 0.8869295452690016, + "grad_norm": 34.33999032738849, + "learning_rate": 7.688625562617256e-09, + "logits/chosen": -0.9091510772705078, + "logits/rejected": -0.8259568214416504, + "logps/chosen": -914.729248046875, + "logps/rejected": -1010.8964233398438, + "loss": 0.4656, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.5010151863098145, + "rewards/margins": 1.2648601531982422, + "rewards/rejected": -5.765875816345215, + "step": 1358 + }, + { + "epoch": 0.887582659808964, + "grad_norm": 33.81711789536401, + "learning_rate": 7.601136966953003e-09, + "logits/chosen": -0.9259005188941956, + "logits/rejected": -0.8752458691596985, + "logps/chosen": -964.3880004882812, + "logps/rejected": -1239.1573486328125, + "loss": 0.4209, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.050426006317139, + "rewards/margins": 2.616529703140259, + "rewards/rejected": -7.666955947875977, + "step": 1359 + }, + { + "epoch": 0.8882357743489264, + "grad_norm": 24.065628566506295, + "learning_rate": 7.514129319488837e-09, + "logits/chosen": -0.8353631496429443, + "logits/rejected": -0.7178384065628052, + "logps/chosen": -931.4916381835938, + "logps/rejected": -1108.391357421875, + "loss": 0.3406, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.812725067138672, + "rewards/margins": 1.8928982019424438, + "rewards/rejected": -6.705623626708984, + "step": 1360 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 17.622810894307673, + "learning_rate": 7.427603073110966e-09, + "logits/chosen": -0.8384911417961121, + "logits/rejected": -0.7364537119865417, + "logps/chosen": -1067.7464599609375, + "logps/rejected": -1383.726318359375, + "loss": 0.2551, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.4259748458862305, + "rewards/margins": 3.5197863578796387, + "rewards/rejected": -8.945761680603027, + "step": 1361 + }, + { + "epoch": 0.8895420034288514, + "grad_norm": 24.007523464336668, + "learning_rate": 7.341558678199866e-09, + "logits/chosen": -0.8948401212692261, + "logits/rejected": -0.7564103603363037, + "logps/chosen": -1035.5931396484375, + "logps/rejected": -1149.049560546875, + "loss": 0.3504, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.795162200927734, + "rewards/margins": 1.4507133960723877, + "rewards/rejected": -7.245875358581543, + "step": 1362 + }, + { + "epoch": 0.8901951179688138, + "grad_norm": 31.397527914915205, + "learning_rate": 7.2559965826278765e-09, + "logits/chosen": -0.9367034435272217, + "logits/rejected": -0.8936917185783386, + "logps/chosen": -965.4476318359375, + "logps/rejected": -1156.2060546875, + "loss": 0.3125, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.646866798400879, + "rewards/margins": 1.8237485885620117, + "rewards/rejected": -7.470614910125732, + "step": 1363 + }, + { + "epoch": 0.8908482325087762, + "grad_norm": 32.492551038820444, + "learning_rate": 7.170917231756957e-09, + "logits/chosen": -0.7623252868652344, + "logits/rejected": -0.699958086013794, + "logps/chosen": -1050.454345703125, + "logps/rejected": -1148.5802001953125, + "loss": 0.4612, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.705660343170166, + "rewards/margins": 1.5220611095428467, + "rewards/rejected": -7.227721214294434, + "step": 1364 + }, + { + "epoch": 0.8915013470487386, + "grad_norm": 22.366295992357657, + "learning_rate": 7.0863210684362514e-09, + "logits/chosen": -0.750536322593689, + "logits/rejected": -0.6350942254066467, + "logps/chosen": -1094.57421875, + "logps/rejected": -1305.535400390625, + "loss": 0.3831, + "rewards/accuracies": 0.71875, + "rewards/chosen": -6.483131408691406, + "rewards/margins": 2.0676817893981934, + "rewards/rejected": -8.550813674926758, + "step": 1365 + }, + { + "epoch": 0.8921544615887012, + "grad_norm": 20.289457072065797, + "learning_rate": 7.002208532999931e-09, + "logits/chosen": -0.9724587202072144, + "logits/rejected": -0.9823508262634277, + "logps/chosen": -1013.5618896484375, + "logps/rejected": -1186.172119140625, + "loss": 0.355, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.142480850219727, + "rewards/margins": 1.8304882049560547, + "rewards/rejected": -6.972969055175781, + "step": 1366 + }, + { + "epoch": 0.8928075761286636, + "grad_norm": 51.43639219938111, + "learning_rate": 6.918580063264723e-09, + "logits/chosen": -0.948373556137085, + "logits/rejected": -0.855690598487854, + "logps/chosen": -1088.060302734375, + "logps/rejected": -1273.875732421875, + "loss": 0.4293, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.645302772521973, + "rewards/margins": 2.207998514175415, + "rewards/rejected": -7.853301048278809, + "step": 1367 + }, + { + "epoch": 0.893460690668626, + "grad_norm": 23.319891219088692, + "learning_rate": 6.835436094527802e-09, + "logits/chosen": -0.8042424917221069, + "logits/rejected": -0.7829912304878235, + "logps/chosen": -1005.8231201171875, + "logps/rejected": -1181.2496337890625, + "loss": 0.3815, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.598033428192139, + "rewards/margins": 1.9655909538269043, + "rewards/rejected": -7.563624858856201, + "step": 1368 + }, + { + "epoch": 0.8941138052085884, + "grad_norm": 21.36102394772282, + "learning_rate": 6.75277705956443e-09, + "logits/chosen": -0.6407432556152344, + "logits/rejected": -0.764612078666687, + "logps/chosen": -1037.61474609375, + "logps/rejected": -1353.3521728515625, + "loss": 0.3952, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.1136603355407715, + "rewards/margins": 2.811863899230957, + "rewards/rejected": -7.925524711608887, + "step": 1369 + }, + { + "epoch": 0.8947669197485509, + "grad_norm": 28.704910489557932, + "learning_rate": 6.670603388625729e-09, + "logits/chosen": -0.9759268760681152, + "logits/rejected": -1.0759167671203613, + "logps/chosen": -1123.990966796875, + "logps/rejected": -1394.31591796875, + "loss": 0.4414, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.98316764831543, + "rewards/margins": 1.978651523590088, + "rewards/rejected": -7.961818695068359, + "step": 1370 + }, + { + "epoch": 0.8954200342885134, + "grad_norm": 37.37125749087838, + "learning_rate": 6.588915509436422e-09, + "logits/chosen": -0.7405807375907898, + "logits/rejected": -0.6690051555633545, + "logps/chosen": -972.5208129882812, + "logps/rejected": -1203.4818115234375, + "loss": 0.3805, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.685582160949707, + "rewards/margins": 1.8616018295288086, + "rewards/rejected": -7.547183990478516, + "step": 1371 + }, + { + "epoch": 0.8960731488284758, + "grad_norm": 20.307666787607005, + "learning_rate": 6.507713847192642e-09, + "logits/chosen": -0.8488802909851074, + "logits/rejected": -0.8470434546470642, + "logps/chosen": -920.7410278320312, + "logps/rejected": -1137.112060546875, + "loss": 0.2597, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.060013771057129, + "rewards/margins": 2.1794750690460205, + "rewards/rejected": -7.239489555358887, + "step": 1372 + }, + { + "epoch": 0.8967262633684382, + "grad_norm": 43.88372556931514, + "learning_rate": 6.4269988245596705e-09, + "logits/chosen": -1.0249723196029663, + "logits/rejected": -1.007939338684082, + "logps/chosen": -1000.8651123046875, + "logps/rejected": -1186.3309326171875, + "loss": 0.3355, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.083978176116943, + "rewards/margins": 1.7574626207351685, + "rewards/rejected": -6.8414411544799805, + "step": 1373 + }, + { + "epoch": 0.8973793779084007, + "grad_norm": 24.61215223955591, + "learning_rate": 6.3467708616698255e-09, + "logits/chosen": -1.024997591972351, + "logits/rejected": -0.9208765625953674, + "logps/chosen": -1024.623779296875, + "logps/rejected": -1188.1405029296875, + "loss": 0.3648, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.875184535980225, + "rewards/margins": 1.9895105361938477, + "rewards/rejected": -6.864694595336914, + "step": 1374 + }, + { + "epoch": 0.8980324924483631, + "grad_norm": 23.059001195470955, + "learning_rate": 6.267030376120152e-09, + "logits/chosen": -0.9334471821784973, + "logits/rejected": -0.8829833269119263, + "logps/chosen": -958.0858154296875, + "logps/rejected": -1172.60546875, + "loss": 0.3587, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.882957458496094, + "rewards/margins": 1.8839558362960815, + "rewards/rejected": -6.766912937164307, + "step": 1375 + }, + { + "epoch": 0.8986856069883256, + "grad_norm": 31.968479083377638, + "learning_rate": 6.187777782970338e-09, + "logits/chosen": -1.0105090141296387, + "logits/rejected": -0.9062252640724182, + "logps/chosen": -1128.7545166015625, + "logps/rejected": -1256.02490234375, + "loss": 0.4209, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.249847888946533, + "rewards/margins": 1.6132521629333496, + "rewards/rejected": -7.863100528717041, + "step": 1376 + }, + { + "epoch": 0.899338721528288, + "grad_norm": 20.840264483151362, + "learning_rate": 6.109013494740522e-09, + "logits/chosen": -0.8968515992164612, + "logits/rejected": -0.8042609691619873, + "logps/chosen": -956.0653686523438, + "logps/rejected": -1098.1114501953125, + "loss": 0.3802, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.395586013793945, + "rewards/margins": 1.7047362327575684, + "rewards/rejected": -7.1003217697143555, + "step": 1377 + }, + { + "epoch": 0.8999918360682505, + "grad_norm": 22.924320591500365, + "learning_rate": 6.030737921409168e-09, + "logits/chosen": -0.9722306132316589, + "logits/rejected": -0.8865904808044434, + "logps/chosen": -942.35791015625, + "logps/rejected": -1074.9583740234375, + "loss": 0.3486, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.057037353515625, + "rewards/margins": 1.712819218635559, + "rewards/rejected": -6.769855976104736, + "step": 1378 + }, + { + "epoch": 0.9006449506082129, + "grad_norm": 25.553546509337146, + "learning_rate": 5.952951470410872e-09, + "logits/chosen": -0.848723292350769, + "logits/rejected": -0.8915231227874756, + "logps/chosen": -1017.4171142578125, + "logps/rejected": -1173.70166015625, + "loss": 0.3365, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.365781307220459, + "rewards/margins": 1.5210890769958496, + "rewards/rejected": -6.886870861053467, + "step": 1379 + }, + { + "epoch": 0.9012980651481753, + "grad_norm": 25.539039082715245, + "learning_rate": 5.875654546634334e-09, + "logits/chosen": -1.0224645137786865, + "logits/rejected": -0.9910560846328735, + "logps/chosen": -890.1279907226562, + "logps/rejected": -1026.0372314453125, + "loss": 0.4851, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.696094989776611, + "rewards/margins": 1.3432387113571167, + "rewards/rejected": -6.039333820343018, + "step": 1380 + }, + { + "epoch": 0.9019511796881378, + "grad_norm": 31.968150161581548, + "learning_rate": 5.798847552420183e-09, + "logits/chosen": -0.7949383854866028, + "logits/rejected": -0.8956403732299805, + "logps/chosen": -938.8556518554688, + "logps/rejected": -1220.24658203125, + "loss": 0.35, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.045302391052246, + "rewards/margins": 2.233097553253174, + "rewards/rejected": -7.27839994430542, + "step": 1381 + }, + { + "epoch": 0.9026042942281003, + "grad_norm": 24.45518586693916, + "learning_rate": 5.722530887558874e-09, + "logits/chosen": -0.8497747182846069, + "logits/rejected": -0.7980440258979797, + "logps/chosen": -941.43310546875, + "logps/rejected": -1097.9815673828125, + "loss": 0.3532, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.3820109367370605, + "rewards/margins": 1.8423418998718262, + "rewards/rejected": -7.224352836608887, + "step": 1382 + }, + { + "epoch": 0.9032574087680627, + "grad_norm": 22.862968219213002, + "learning_rate": 5.646704949288683e-09, + "logits/chosen": -0.9405361413955688, + "logits/rejected": -0.8489739298820496, + "logps/chosen": -985.428955078125, + "logps/rejected": -1097.078857421875, + "loss": 0.4181, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.368486404418945, + "rewards/margins": 1.47150456905365, + "rewards/rejected": -6.839991569519043, + "step": 1383 + }, + { + "epoch": 0.9039105233080251, + "grad_norm": 15.920740655238594, + "learning_rate": 5.571370132293552e-09, + "logits/chosen": -0.9731810092926025, + "logits/rejected": -0.9640232920646667, + "logps/chosen": -991.5859375, + "logps/rejected": -1153.059326171875, + "loss": 0.3359, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.1485514640808105, + "rewards/margins": 1.600454330444336, + "rewards/rejected": -6.749006271362305, + "step": 1384 + }, + { + "epoch": 0.9045636378479875, + "grad_norm": 40.67819573629027, + "learning_rate": 5.496526828701075e-09, + "logits/chosen": -0.9371462464332581, + "logits/rejected": -0.9148290753364563, + "logps/chosen": -1122.163818359375, + "logps/rejected": -1289.2926025390625, + "loss": 0.4809, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.275271892547607, + "rewards/margins": 1.8808785676956177, + "rewards/rejected": -8.156150817871094, + "step": 1385 + }, + { + "epoch": 0.9052167523879501, + "grad_norm": 25.740903240918104, + "learning_rate": 5.4221754280804774e-09, + "logits/chosen": -0.9231817722320557, + "logits/rejected": -0.8849901556968689, + "logps/chosen": -1015.8035888671875, + "logps/rejected": -1216.7371826171875, + "loss": 0.3014, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.52568244934082, + "rewards/margins": 1.806875467300415, + "rewards/rejected": -7.3325581550598145, + "step": 1386 + }, + { + "epoch": 0.9058698669279125, + "grad_norm": 32.25704450398711, + "learning_rate": 5.348316317440549e-09, + "logits/chosen": -0.8346178531646729, + "logits/rejected": -0.8496906757354736, + "logps/chosen": -1041.9498291015625, + "logps/rejected": -1215.4976806640625, + "loss": 0.4455, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.537223815917969, + "rewards/margins": 1.5639135837554932, + "rewards/rejected": -7.101138114929199, + "step": 1387 + }, + { + "epoch": 0.9065229814678749, + "grad_norm": 22.898299911399935, + "learning_rate": 5.274949881227641e-09, + "logits/chosen": -0.8780845403671265, + "logits/rejected": -0.8350746631622314, + "logps/chosen": -926.8447875976562, + "logps/rejected": -1088.592529296875, + "loss": 0.3918, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.732271194458008, + "rewards/margins": 1.4746408462524414, + "rewards/rejected": -6.206912040710449, + "step": 1388 + }, + { + "epoch": 0.9071760960078373, + "grad_norm": 25.38924080412644, + "learning_rate": 5.202076501323694e-09, + "logits/chosen": -0.9332727789878845, + "logits/rejected": -0.876400887966156, + "logps/chosen": -1018.9508056640625, + "logps/rejected": -1312.7470703125, + "loss": 0.4043, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.826478004455566, + "rewards/margins": 2.748919725418091, + "rewards/rejected": -8.575397491455078, + "step": 1389 + }, + { + "epoch": 0.9078292105477999, + "grad_norm": 28.770762632476412, + "learning_rate": 5.129696557044172e-09, + "logits/chosen": -0.9943676590919495, + "logits/rejected": -0.8872275948524475, + "logps/chosen": -1113.85205078125, + "logps/rejected": -1228.1134033203125, + "loss": 0.4223, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.0906147956848145, + "rewards/margins": 1.2218999862670898, + "rewards/rejected": -7.312514781951904, + "step": 1390 + }, + { + "epoch": 0.9084823250877623, + "grad_norm": 23.730110125114983, + "learning_rate": 5.057810425136189e-09, + "logits/chosen": -1.0045166015625, + "logits/rejected": -0.9036121964454651, + "logps/chosen": -982.4205322265625, + "logps/rejected": -1079.776123046875, + "loss": 0.3588, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.33574104309082, + "rewards/margins": 1.3380316495895386, + "rewards/rejected": -6.673772811889648, + "step": 1391 + }, + { + "epoch": 0.9091354396277247, + "grad_norm": 27.63473440877699, + "learning_rate": 4.98641847977651e-09, + "logits/chosen": -0.7540165781974792, + "logits/rejected": -0.7030526399612427, + "logps/chosen": -1042.4677734375, + "logps/rejected": -1347.1019287109375, + "loss": 0.3761, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.64056396484375, + "rewards/margins": 2.689081907272339, + "rewards/rejected": -8.329645156860352, + "step": 1392 + }, + { + "epoch": 0.9097885541676871, + "grad_norm": 20.262425063605342, + "learning_rate": 4.915521092569552e-09, + "logits/chosen": -0.8860914707183838, + "logits/rejected": -0.943866491317749, + "logps/chosen": -1029.2822265625, + "logps/rejected": -1265.9954833984375, + "loss": 0.3605, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.087381839752197, + "rewards/margins": 2.124335527420044, + "rewards/rejected": -8.21171760559082, + "step": 1393 + }, + { + "epoch": 0.9104416687076496, + "grad_norm": 25.627440100114217, + "learning_rate": 4.845118632545531e-09, + "logits/chosen": -0.9025149345397949, + "logits/rejected": -0.9109005928039551, + "logps/chosen": -976.8741455078125, + "logps/rejected": -1393.4610595703125, + "loss": 0.352, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.313702583312988, + "rewards/margins": 3.813965082168579, + "rewards/rejected": -9.127667427062988, + "step": 1394 + }, + { + "epoch": 0.9110947832476121, + "grad_norm": 22.8158973952252, + "learning_rate": 4.775211466158469e-09, + "logits/chosen": -0.9520535469055176, + "logits/rejected": -0.926558792591095, + "logps/chosen": -991.7243041992188, + "logps/rejected": -1153.231201171875, + "loss": 0.3798, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.2474493980407715, + "rewards/margins": 2.104310989379883, + "rewards/rejected": -7.351759910583496, + "step": 1395 + }, + { + "epoch": 0.9117478977875745, + "grad_norm": 31.595434065769368, + "learning_rate": 4.705799957284351e-09, + "logits/chosen": -0.9866628050804138, + "logits/rejected": -0.9612003564834595, + "logps/chosen": -898.5245361328125, + "logps/rejected": -1169.009521484375, + "loss": 0.399, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.514897346496582, + "rewards/margins": 2.6222004890441895, + "rewards/rejected": -7.1370978355407715, + "step": 1396 + }, + { + "epoch": 0.9124010123275369, + "grad_norm": 26.555330502651298, + "learning_rate": 4.636884467219171e-09, + "logits/chosen": -1.0796997547149658, + "logits/rejected": -0.954846203327179, + "logps/chosen": -1147.51611328125, + "logps/rejected": -1267.271240234375, + "loss": 0.4135, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.350531578063965, + "rewards/margins": 1.6353040933609009, + "rewards/rejected": -6.985836505889893, + "step": 1397 + }, + { + "epoch": 0.9130541268674994, + "grad_norm": 21.158609964601236, + "learning_rate": 4.568465354677087e-09, + "logits/chosen": -0.9235592484474182, + "logits/rejected": -0.8905725479125977, + "logps/chosen": -957.9666137695312, + "logps/rejected": -1054.5474853515625, + "loss": 0.3576, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.137127876281738, + "rewards/margins": 1.1683361530303955, + "rewards/rejected": -6.305464267730713, + "step": 1398 + }, + { + "epoch": 0.9137072414074618, + "grad_norm": 18.97231329613263, + "learning_rate": 4.500542975788579e-09, + "logits/chosen": -0.8841540217399597, + "logits/rejected": -0.9338924288749695, + "logps/chosen": -904.0048828125, + "logps/rejected": -1080.072021484375, + "loss": 0.2991, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.445211410522461, + "rewards/margins": 1.6414602994918823, + "rewards/rejected": -6.086671352386475, + "step": 1399 + }, + { + "epoch": 0.9143603559474243, + "grad_norm": 19.43570834743748, + "learning_rate": 4.433117684098508e-09, + "logits/chosen": -0.9700881242752075, + "logits/rejected": -0.8894848823547363, + "logps/chosen": -919.09375, + "logps/rejected": -1196.0753173828125, + "loss": 0.3467, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.059053897857666, + "rewards/margins": 2.5139126777648926, + "rewards/rejected": -7.572966575622559, + "step": 1400 + }, + { + "epoch": 0.9143603559474243, + "eval_logits/chosen": -0.6672475337982178, + "eval_logits/rejected": -0.6070554256439209, + "eval_logps/chosen": -997.260986328125, + "eval_logps/rejected": -1163.262939453125, + "eval_loss": 0.38706615567207336, + "eval_rewards/accuracies": 0.8090000152587891, + "eval_rewards/chosen": -5.233537673950195, + "eval_rewards/margins": 1.7474075555801392, + "eval_rewards/rejected": -6.980944633483887, + "eval_runtime": 616.0403, + "eval_samples_per_second": 6.493, + "eval_steps_per_second": 0.406, + "step": 1400 + }, + { + "epoch": 0.9150134704873867, + "grad_norm": 21.623147531365383, + "learning_rate": 4.366189830564393e-09, + "logits/chosen": -0.8763200640678406, + "logits/rejected": -0.7712318897247314, + "logps/chosen": -994.0523071289062, + "logps/rejected": -1056.1307373046875, + "loss": 0.4043, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.576797008514404, + "rewards/margins": 0.9099751710891724, + "rewards/rejected": -6.486772060394287, + "step": 1401 + }, + { + "epoch": 0.9156665850273492, + "grad_norm": 23.79888391402557, + "learning_rate": 4.299759763554456e-09, + "logits/chosen": -0.9876704216003418, + "logits/rejected": -0.9826672077178955, + "logps/chosen": -1038.6239013671875, + "logps/rejected": -1214.2568359375, + "loss": 0.379, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.297689437866211, + "rewards/margins": 1.9423954486846924, + "rewards/rejected": -7.240084171295166, + "step": 1402 + }, + { + "epoch": 0.9163196995673116, + "grad_norm": 21.82743599285643, + "learning_rate": 4.233827828845915e-09, + "logits/chosen": -0.7085953950881958, + "logits/rejected": -0.8077543377876282, + "logps/chosen": -873.767578125, + "logps/rejected": -1173.175048828125, + "loss": 0.3597, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.473335266113281, + "rewards/margins": 2.4106528759002686, + "rewards/rejected": -6.8839874267578125, + "step": 1403 + }, + { + "epoch": 0.916972814107274, + "grad_norm": 21.412720186872857, + "learning_rate": 4.1683943696231515e-09, + "logits/chosen": -0.8199781775474548, + "logits/rejected": -0.8375627994537354, + "logps/chosen": -1003.9866333007812, + "logps/rejected": -1262.1685791015625, + "loss": 0.3588, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.443426132202148, + "rewards/margins": 1.931605339050293, + "rewards/rejected": -7.375031471252441, + "step": 1404 + }, + { + "epoch": 0.9176259286472365, + "grad_norm": 35.589071157888114, + "learning_rate": 4.103459726475889e-09, + "logits/chosen": -0.6660647988319397, + "logits/rejected": -0.8292348384857178, + "logps/chosen": -933.234375, + "logps/rejected": -1223.6353759765625, + "loss": 0.4142, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.21882438659668, + "rewards/margins": 1.956418752670288, + "rewards/rejected": -7.175242900848389, + "step": 1405 + }, + { + "epoch": 0.918279043187199, + "grad_norm": 52.59743146250588, + "learning_rate": 4.03902423739747e-09, + "logits/chosen": -1.0262573957443237, + "logits/rejected": -1.0548584461212158, + "logps/chosen": -885.991455078125, + "logps/rejected": -1079.2177734375, + "loss": 0.3407, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.75430154800415, + "rewards/margins": 1.5191203355789185, + "rewards/rejected": -6.2734222412109375, + "step": 1406 + }, + { + "epoch": 0.9189321577271614, + "grad_norm": 20.398243030951686, + "learning_rate": 3.975088237783064e-09, + "logits/chosen": -0.9631592035293579, + "logits/rejected": -0.8363844156265259, + "logps/chosen": -1049.6431884765625, + "logps/rejected": -1225.7779541015625, + "loss": 0.3676, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.047906398773193, + "rewards/margins": 2.2165191173553467, + "rewards/rejected": -7.264426231384277, + "step": 1407 + }, + { + "epoch": 0.9195852722671238, + "grad_norm": 28.609017006751863, + "learning_rate": 3.911652060427928e-09, + "logits/chosen": -0.8288122415542603, + "logits/rejected": -0.7530689239501953, + "logps/chosen": -998.0337524414062, + "logps/rejected": -1154.56884765625, + "loss": 0.4154, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.994250774383545, + "rewards/margins": 1.7526618242263794, + "rewards/rejected": -6.746912956237793, + "step": 1408 + }, + { + "epoch": 0.9202383868070863, + "grad_norm": 19.446007209573214, + "learning_rate": 3.848716035525678e-09, + "logits/chosen": -0.6308072805404663, + "logits/rejected": -0.697937548160553, + "logps/chosen": -899.0272827148438, + "logps/rejected": -1132.28857421875, + "loss": 0.4446, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.762282371520996, + "rewards/margins": 1.923186182975769, + "rewards/rejected": -6.6854681968688965, + "step": 1409 + }, + { + "epoch": 0.9208915013470488, + "grad_norm": 33.9684949616552, + "learning_rate": 3.786280490666604e-09, + "logits/chosen": -0.9285703897476196, + "logits/rejected": -0.9438356161117554, + "logps/chosen": -1024.6214599609375, + "logps/rejected": -1281.627197265625, + "loss": 0.2981, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.685998916625977, + "rewards/margins": 2.824934244155884, + "rewards/rejected": -8.510933876037598, + "step": 1410 + }, + { + "epoch": 0.9215446158870112, + "grad_norm": 17.31313517775045, + "learning_rate": 3.7243457508358778e-09, + "logits/chosen": -0.6968719959259033, + "logits/rejected": -0.6861792206764221, + "logps/chosen": -897.244384765625, + "logps/rejected": -1154.441162109375, + "loss": 0.349, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.37484884262085, + "rewards/margins": 2.1692397594451904, + "rewards/rejected": -6.544088840484619, + "step": 1411 + }, + { + "epoch": 0.9221977304269736, + "grad_norm": 21.26908971504018, + "learning_rate": 3.6629121384119664e-09, + "logits/chosen": -0.9373629689216614, + "logits/rejected": -0.8761916160583496, + "logps/chosen": -969.7335205078125, + "logps/rejected": -1193.5028076171875, + "loss": 0.3883, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.746880054473877, + "rewards/margins": 2.257174015045166, + "rewards/rejected": -7.004053592681885, + "step": 1412 + }, + { + "epoch": 0.922850844966936, + "grad_norm": 26.66535621429327, + "learning_rate": 3.6019799731648704e-09, + "logits/chosen": -0.9955894947052002, + "logits/rejected": -0.8084239959716797, + "logps/chosen": -931.5289306640625, + "logps/rejected": -1104.51806640625, + "loss": 0.3852, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.929518699645996, + "rewards/margins": 2.009385108947754, + "rewards/rejected": -6.93890380859375, + "step": 1413 + }, + { + "epoch": 0.9235039595068986, + "grad_norm": 25.588965364341004, + "learning_rate": 3.5415495722544874e-09, + "logits/chosen": -0.9307757616043091, + "logits/rejected": -0.7715475559234619, + "logps/chosen": -1015.3209228515625, + "logps/rejected": -1181.02978515625, + "loss": 0.3766, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.992738246917725, + "rewards/margins": 2.280240058898926, + "rewards/rejected": -7.272977828979492, + "step": 1414 + }, + { + "epoch": 0.924157074046861, + "grad_norm": 21.254439169004378, + "learning_rate": 3.4816212502289944e-09, + "logits/chosen": -0.9563465714454651, + "logits/rejected": -0.9163129329681396, + "logps/chosen": -1033.4365234375, + "logps/rejected": -1225.145751953125, + "loss": 0.3757, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.287700176239014, + "rewards/margins": 2.000286102294922, + "rewards/rejected": -7.287986755371094, + "step": 1415 + }, + { + "epoch": 0.9248101885868234, + "grad_norm": 25.934486653060304, + "learning_rate": 3.422195319023169e-09, + "logits/chosen": -0.7184072136878967, + "logits/rejected": -0.6855770945549011, + "logps/chosen": -935.3275146484375, + "logps/rejected": -1191.0899658203125, + "loss": 0.3761, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.7941179275512695, + "rewards/margins": 2.6774706840515137, + "rewards/rejected": -7.471588611602783, + "step": 1416 + }, + { + "epoch": 0.9254633031267858, + "grad_norm": 39.367015906718585, + "learning_rate": 3.363272087956759e-09, + "logits/chosen": -0.9661139845848083, + "logits/rejected": -0.8155698776245117, + "logps/chosen": -987.3919677734375, + "logps/rejected": -1184.998291015625, + "loss": 0.4115, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.236489295959473, + "rewards/margins": 2.2704925537109375, + "rewards/rejected": -7.506982803344727, + "step": 1417 + }, + { + "epoch": 0.9261164176667483, + "grad_norm": 16.74180053580894, + "learning_rate": 3.304851863732938e-09, + "logits/chosen": -0.814327597618103, + "logits/rejected": -0.8117744326591492, + "logps/chosen": -920.962890625, + "logps/rejected": -1163.671875, + "loss": 0.3848, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.750037670135498, + "rewards/margins": 1.8664512634277344, + "rewards/rejected": -6.616489410400391, + "step": 1418 + }, + { + "epoch": 0.9267695322067108, + "grad_norm": 24.932758940158795, + "learning_rate": 3.2469349504366083e-09, + "logits/chosen": -0.8866950869560242, + "logits/rejected": -0.8952164649963379, + "logps/chosen": -1020.4898681640625, + "logps/rejected": -1310.095947265625, + "loss": 0.3745, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.152378082275391, + "rewards/margins": 2.8529391288757324, + "rewards/rejected": -8.005317687988281, + "step": 1419 + }, + { + "epoch": 0.9274226467466732, + "grad_norm": 17.473544447319867, + "learning_rate": 3.1895216495329114e-09, + "logits/chosen": -0.9697394371032715, + "logits/rejected": -0.932720959186554, + "logps/chosen": -894.943359375, + "logps/rejected": -1137.5650634765625, + "loss": 0.3651, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.790702819824219, + "rewards/margins": 1.8912099599838257, + "rewards/rejected": -6.681912422180176, + "step": 1420 + }, + { + "epoch": 0.9280757612866356, + "grad_norm": 22.83318699304187, + "learning_rate": 3.132612259865597e-09, + "logits/chosen": -0.7634729146957397, + "logits/rejected": -0.7357776165008545, + "logps/chosen": -986.0407104492188, + "logps/rejected": -1127.8607177734375, + "loss": 0.406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.239987850189209, + "rewards/margins": 1.540162444114685, + "rewards/rejected": -6.780150413513184, + "step": 1421 + }, + { + "epoch": 0.9287288758265981, + "grad_norm": 32.45917372041385, + "learning_rate": 3.076207077655524e-09, + "logits/chosen": -0.8629635572433472, + "logits/rejected": -0.8516905307769775, + "logps/chosen": -1002.01708984375, + "logps/rejected": -1152.742919921875, + "loss": 0.4029, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.313127517700195, + "rewards/margins": 1.448228359222412, + "rewards/rejected": -6.761356353759766, + "step": 1422 + }, + { + "epoch": 0.9293819903665606, + "grad_norm": 27.915496949424785, + "learning_rate": 3.0203063964990614e-09, + "logits/chosen": -0.8668137788772583, + "logits/rejected": -0.8934547901153564, + "logps/chosen": -950.4146728515625, + "logps/rejected": -1186.294921875, + "loss": 0.4064, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.059856414794922, + "rewards/margins": 2.1348183155059814, + "rewards/rejected": -7.194675445556641, + "step": 1423 + }, + { + "epoch": 0.930035104906523, + "grad_norm": 35.81126804895922, + "learning_rate": 2.9649105073665583e-09, + "logits/chosen": -0.7803625464439392, + "logits/rejected": -0.8244260549545288, + "logps/chosen": -877.6205444335938, + "logps/rejected": -1151.6063232421875, + "loss": 0.4328, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.741922378540039, + "rewards/margins": 2.114166021347046, + "rewards/rejected": -6.856088638305664, + "step": 1424 + }, + { + "epoch": 0.9306882194464854, + "grad_norm": 22.2918690515681, + "learning_rate": 2.9100196986009073e-09, + "logits/chosen": -0.7256823778152466, + "logits/rejected": -0.7021087408065796, + "logps/chosen": -923.400146484375, + "logps/rejected": -1084.9371337890625, + "loss": 0.3417, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.120212554931641, + "rewards/margins": 1.8833941221237183, + "rewards/rejected": -7.003606796264648, + "step": 1425 + }, + { + "epoch": 0.9313413339864479, + "grad_norm": 32.21256635361222, + "learning_rate": 2.8556342559159508e-09, + "logits/chosen": -1.0215641260147095, + "logits/rejected": -0.8117510676383972, + "logps/chosen": -985.9193115234375, + "logps/rejected": -1249.2657470703125, + "loss": 0.4066, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.611599922180176, + "rewards/margins": 2.8269217014312744, + "rewards/rejected": -7.438521862030029, + "step": 1426 + }, + { + "epoch": 0.9319944485264103, + "grad_norm": 21.118231191065334, + "learning_rate": 2.8017544623950673e-09, + "logits/chosen": -0.8426204919815063, + "logits/rejected": -0.919529378414154, + "logps/chosen": -965.6964721679688, + "logps/rejected": -1166.8994140625, + "loss": 0.3623, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.932854652404785, + "rewards/margins": 1.1300407648086548, + "rewards/rejected": -6.06289529800415, + "step": 1427 + }, + { + "epoch": 0.9326475630663728, + "grad_norm": 24.156889442754775, + "learning_rate": 2.7483805984896304e-09, + "logits/chosen": -0.8445752859115601, + "logits/rejected": -0.8731241226196289, + "logps/chosen": -1016.215576171875, + "logps/rejected": -1198.81689453125, + "loss": 0.3818, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.215708255767822, + "rewards/margins": 1.9218271970748901, + "rewards/rejected": -7.13753604888916, + "step": 1428 + }, + { + "epoch": 0.9333006776063352, + "grad_norm": 19.25063375366721, + "learning_rate": 2.6955129420176194e-09, + "logits/chosen": -0.9523279070854187, + "logits/rejected": -0.8416871428489685, + "logps/chosen": -860.4990844726562, + "logps/rejected": -992.48876953125, + "loss": 0.4255, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.198513984680176, + "rewards/margins": 1.6185945272445679, + "rewards/rejected": -5.817108631134033, + "step": 1429 + }, + { + "epoch": 0.9339537921462977, + "grad_norm": 20.874432815718333, + "learning_rate": 2.6431517681621107e-09, + "logits/chosen": -0.9927769303321838, + "logits/rejected": -0.9092074632644653, + "logps/chosen": -1038.987548828125, + "logps/rejected": -1260.34130859375, + "loss": 0.3983, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.678443431854248, + "rewards/margins": 2.3545010089874268, + "rewards/rejected": -8.032944679260254, + "step": 1430 + }, + { + "epoch": 0.9346069066862601, + "grad_norm": 15.08771013125819, + "learning_rate": 2.5912973494698785e-09, + "logits/chosen": -0.8484581708908081, + "logits/rejected": -0.7962210774421692, + "logps/chosen": -1113.666015625, + "logps/rejected": -1251.378662109375, + "loss": 0.3751, + "rewards/accuracies": 0.71875, + "rewards/chosen": -6.1646318435668945, + "rewards/margins": 1.506376028060913, + "rewards/rejected": -7.671008586883545, + "step": 1431 + }, + { + "epoch": 0.9352600212262225, + "grad_norm": 22.592908330898158, + "learning_rate": 2.5399499558499847e-09, + "logits/chosen": -0.8906291723251343, + "logits/rejected": -1.0438287258148193, + "logps/chosen": -870.3036499023438, + "logps/rejected": -1057.3095703125, + "loss": 0.4427, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.733056545257568, + "rewards/margins": 1.3122425079345703, + "rewards/rejected": -6.045300006866455, + "step": 1432 + }, + { + "epoch": 0.935913135766185, + "grad_norm": 24.33564645984773, + "learning_rate": 2.4891098545723242e-09, + "logits/chosen": -1.0513837337493896, + "logits/rejected": -0.9483792781829834, + "logps/chosen": -1050.873291015625, + "logps/rejected": -1154.373291015625, + "loss": 0.4221, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.649351596832275, + "rewards/margins": 1.523519515991211, + "rewards/rejected": -7.1728715896606445, + "step": 1433 + }, + { + "epoch": 0.9365662503061475, + "grad_norm": 23.735122709661717, + "learning_rate": 2.4387773102663157e-09, + "logits/chosen": -0.806923508644104, + "logits/rejected": -0.8132039308547974, + "logps/chosen": -977.7556762695312, + "logps/rejected": -1201.8583984375, + "loss": 0.411, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.82599401473999, + "rewards/margins": 1.9918920993804932, + "rewards/rejected": -6.8178863525390625, + "step": 1434 + }, + { + "epoch": 0.9372193648461099, + "grad_norm": 30.38805197625348, + "learning_rate": 2.388952584919457e-09, + "logits/chosen": -0.9621034264564514, + "logits/rejected": -0.9715672731399536, + "logps/chosen": -1033.365478515625, + "logps/rejected": -1146.636962890625, + "loss": 0.3824, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.168354511260986, + "rewards/margins": 1.095251202583313, + "rewards/rejected": -6.263606071472168, + "step": 1435 + }, + { + "epoch": 0.9378724793860723, + "grad_norm": 27.14035171290439, + "learning_rate": 2.33963593787595e-09, + "logits/chosen": -0.7088282704353333, + "logits/rejected": -0.5756462812423706, + "logps/chosen": -985.4832763671875, + "logps/rejected": -1134.68798828125, + "loss": 0.3882, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.14437198638916, + "rewards/margins": 1.92146635055542, + "rewards/rejected": -7.065837860107422, + "step": 1436 + }, + { + "epoch": 0.9385255939260347, + "grad_norm": 20.965417931035965, + "learning_rate": 2.2908276258354343e-09, + "logits/chosen": -0.9305647611618042, + "logits/rejected": -1.0245065689086914, + "logps/chosen": -1019.478271484375, + "logps/rejected": -1227.153564453125, + "loss": 0.3591, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.850151062011719, + "rewards/margins": 1.6930534839630127, + "rewards/rejected": -6.543204307556152, + "step": 1437 + }, + { + "epoch": 0.9391787084659973, + "grad_norm": 20.707076213674817, + "learning_rate": 2.2425279028515652e-09, + "logits/chosen": -0.9270403385162354, + "logits/rejected": -0.8247102499008179, + "logps/chosen": -1035.72119140625, + "logps/rejected": -1167.691162109375, + "loss": 0.4176, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.455660820007324, + "rewards/margins": 1.8896361589431763, + "rewards/rejected": -7.345297336578369, + "step": 1438 + }, + { + "epoch": 0.9398318230059597, + "grad_norm": 17.145480220037275, + "learning_rate": 2.194737020330728e-09, + "logits/chosen": -0.8578017950057983, + "logits/rejected": -0.6604302525520325, + "logps/chosen": -1015.2515869140625, + "logps/rejected": -1188.9595947265625, + "loss": 0.3346, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.383336067199707, + "rewards/margins": 2.303807497024536, + "rewards/rejected": -7.6871442794799805, + "step": 1439 + }, + { + "epoch": 0.9404849375459221, + "grad_norm": 18.277620474543976, + "learning_rate": 2.147455227030748e-09, + "logits/chosen": -1.0063140392303467, + "logits/rejected": -0.8611899018287659, + "logps/chosen": -1026.3895263671875, + "logps/rejected": -1207.8428955078125, + "loss": 0.3533, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.047481536865234, + "rewards/margins": 2.0378239154815674, + "rewards/rejected": -7.085305213928223, + "step": 1440 + }, + { + "epoch": 0.9411380520858845, + "grad_norm": 39.94584518067749, + "learning_rate": 2.1006827690595473e-09, + "logits/chosen": -0.9561357498168945, + "logits/rejected": -1.031121850013733, + "logps/chosen": -1034.476318359375, + "logps/rejected": -1264.8309326171875, + "loss": 0.3684, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.420408725738525, + "rewards/margins": 2.2229015827178955, + "rewards/rejected": -7.643310070037842, + "step": 1441 + }, + { + "epoch": 0.941791166625847, + "grad_norm": 33.800789226780005, + "learning_rate": 2.0544198898739263e-09, + "logits/chosen": -0.9700863361358643, + "logits/rejected": -0.870128870010376, + "logps/chosen": -1038.8701171875, + "logps/rejected": -1134.0107421875, + "loss": 0.4121, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.802235126495361, + "rewards/margins": 1.2960669994354248, + "rewards/rejected": -7.098300933837891, + "step": 1442 + }, + { + "epoch": 0.9424442811658095, + "grad_norm": 37.25432369178176, + "learning_rate": 2.008666830278216e-09, + "logits/chosen": -0.9192866086959839, + "logits/rejected": -0.8795109391212463, + "logps/chosen": -982.6119995117188, + "logps/rejected": -1067.9573974609375, + "loss": 0.3474, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.948570251464844, + "rewards/margins": 1.369868516921997, + "rewards/rejected": -6.318438529968262, + "step": 1443 + }, + { + "epoch": 0.9430973957057719, + "grad_norm": 25.570377183088176, + "learning_rate": 1.963423828423094e-09, + "logits/chosen": -0.7798184156417847, + "logits/rejected": -0.7910678386688232, + "logps/chosen": -981.431640625, + "logps/rejected": -1218.7794189453125, + "loss": 0.4121, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.284530162811279, + "rewards/margins": 2.037384271621704, + "rewards/rejected": -7.321914196014404, + "step": 1444 + }, + { + "epoch": 0.9437505102457343, + "grad_norm": 29.169742339003346, + "learning_rate": 1.9186911198043277e-09, + "logits/chosen": -0.9367291927337646, + "logits/rejected": -0.9252871870994568, + "logps/chosen": -1102.89794921875, + "logps/rejected": -1253.6875, + "loss": 0.4379, + "rewards/accuracies": 0.71875, + "rewards/chosen": -6.0728302001953125, + "rewards/margins": 1.685518503189087, + "rewards/rejected": -7.758349418640137, + "step": 1445 + }, + { + "epoch": 0.9444036247856968, + "grad_norm": 22.556085107470444, + "learning_rate": 1.8744689372615308e-09, + "logits/chosen": -0.9977632761001587, + "logits/rejected": -0.826669454574585, + "logps/chosen": -1179.10107421875, + "logps/rejected": -1295.236328125, + "loss": 0.3242, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.635008335113525, + "rewards/margins": 1.6209893226623535, + "rewards/rejected": -7.255997657775879, + "step": 1446 + }, + { + "epoch": 0.9450567393256593, + "grad_norm": 23.986548258092327, + "learning_rate": 1.8307575109769657e-09, + "logits/chosen": -0.7967054843902588, + "logits/rejected": -0.8129853010177612, + "logps/chosen": -920.0318603515625, + "logps/rejected": -1080.5286865234375, + "loss": 0.397, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.808022499084473, + "rewards/margins": 1.5115902423858643, + "rewards/rejected": -6.319612503051758, + "step": 1447 + }, + { + "epoch": 0.9457098538656217, + "grad_norm": 22.43816147534732, + "learning_rate": 1.7875570684743323e-09, + "logits/chosen": -1.020730972290039, + "logits/rejected": -0.9586259722709656, + "logps/chosen": -981.6170654296875, + "logps/rejected": -1160.238525390625, + "loss": 0.3927, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.438063621520996, + "rewards/margins": 1.8004133701324463, + "rewards/rejected": -7.238476276397705, + "step": 1448 + }, + { + "epoch": 0.9463629684055841, + "grad_norm": 41.87512216168752, + "learning_rate": 1.7448678346175915e-09, + "logits/chosen": -0.9691171050071716, + "logits/rejected": -0.854966402053833, + "logps/chosen": -895.74560546875, + "logps/rejected": -1024.83935546875, + "loss": 0.4039, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.703372478485107, + "rewards/margins": 1.5487737655639648, + "rewards/rejected": -6.252146244049072, + "step": 1449 + }, + { + "epoch": 0.9470160829455466, + "grad_norm": 20.13570473014106, + "learning_rate": 1.7026900316098214e-09, + "logits/chosen": -0.7298191785812378, + "logits/rejected": -0.7361303567886353, + "logps/chosen": -966.2626342773438, + "logps/rejected": -1121.1221923828125, + "loss": 0.4022, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.953649520874023, + "rewards/margins": 1.2899563312530518, + "rewards/rejected": -6.243605136871338, + "step": 1450 + }, + { + "epoch": 0.947669197485509, + "grad_norm": 26.69200848432792, + "learning_rate": 1.6610238789920073e-09, + "logits/chosen": -0.8612526059150696, + "logits/rejected": -0.9243814945220947, + "logps/chosen": -1065.5809326171875, + "logps/rejected": -1275.85791015625, + "loss": 0.4606, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.303916931152344, + "rewards/margins": 2.0302257537841797, + "rewards/rejected": -7.334142684936523, + "step": 1451 + }, + { + "epoch": 0.9483223120254715, + "grad_norm": 23.657796736360822, + "learning_rate": 1.6198695936419538e-09, + "logits/chosen": -0.8324539661407471, + "logits/rejected": -0.8346168994903564, + "logps/chosen": -916.5712890625, + "logps/rejected": -1095.519775390625, + "loss": 0.3319, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.682393550872803, + "rewards/margins": 1.9298332929611206, + "rewards/rejected": -6.612226963043213, + "step": 1452 + }, + { + "epoch": 0.9489754265654339, + "grad_norm": 32.52042528833027, + "learning_rate": 1.5792273897730856e-09, + "logits/chosen": -0.9859724044799805, + "logits/rejected": -0.8119895458221436, + "logps/chosen": -1032.33740234375, + "logps/rejected": -1179.5372314453125, + "loss": 0.406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.521050930023193, + "rewards/margins": 2.0502986907958984, + "rewards/rejected": -7.57135009765625, + "step": 1453 + }, + { + "epoch": 0.9496285411053964, + "grad_norm": 23.165414471220327, + "learning_rate": 1.5390974789334266e-09, + "logits/chosen": -0.8730648756027222, + "logits/rejected": -0.8281824588775635, + "logps/chosen": -1028.20166015625, + "logps/rejected": -1375.81640625, + "loss": 0.3187, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.710119724273682, + "rewards/margins": 3.208172082901001, + "rewards/rejected": -8.918291091918945, + "step": 1454 + }, + { + "epoch": 0.9502816556453588, + "grad_norm": 24.673015232487636, + "learning_rate": 1.4994800700044219e-09, + "logits/chosen": -1.045493721961975, + "logits/rejected": -1.0213874578475952, + "logps/chosen": -1014.9378662109375, + "logps/rejected": -1189.329345703125, + "loss": 0.3627, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.880398273468018, + "rewards/margins": 2.3164453506469727, + "rewards/rejected": -7.196844100952148, + "step": 1455 + }, + { + "epoch": 0.9509347701853212, + "grad_norm": 73.76277605023918, + "learning_rate": 1.4603753691998733e-09, + "logits/chosen": -0.7908294200897217, + "logits/rejected": -0.7953348755836487, + "logps/chosen": -932.0293579101562, + "logps/rejected": -1270.8121337890625, + "loss": 0.416, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.794908046722412, + "rewards/margins": 2.717374801635742, + "rewards/rejected": -7.512282848358154, + "step": 1456 + }, + { + "epoch": 0.9515878847252837, + "grad_norm": 22.972378938308747, + "learning_rate": 1.4217835800648837e-09, + "logits/chosen": -0.9420186877250671, + "logits/rejected": -0.9727803468704224, + "logps/chosen": -861.2728271484375, + "logps/rejected": -1017.69091796875, + "loss": 0.3061, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.226019859313965, + "rewards/margins": 1.59080970287323, + "rewards/rejected": -5.816829681396484, + "step": 1457 + }, + { + "epoch": 0.9522409992652462, + "grad_norm": 71.0682992055047, + "learning_rate": 1.3837049034747806e-09, + "logits/chosen": -0.9005981087684631, + "logits/rejected": -0.8757720589637756, + "logps/chosen": -918.2922973632812, + "logps/rejected": -1133.818115234375, + "loss": 0.4251, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.245543003082275, + "rewards/margins": 2.0595555305480957, + "rewards/rejected": -6.305099010467529, + "step": 1458 + }, + { + "epoch": 0.9528941138052086, + "grad_norm": 45.29358517020249, + "learning_rate": 1.3461395376340501e-09, + "logits/chosen": -0.914055585861206, + "logits/rejected": -0.7696114778518677, + "logps/chosen": -906.3142700195312, + "logps/rejected": -1021.6484375, + "loss": 0.5118, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.326722621917725, + "rewards/margins": 1.5497199296951294, + "rewards/rejected": -5.876442909240723, + "step": 1459 + }, + { + "epoch": 0.953547228345171, + "grad_norm": 54.488844427179124, + "learning_rate": 1.3090876780753712e-09, + "logits/chosen": -0.6701656579971313, + "logits/rejected": -0.6975820660591125, + "logps/chosen": -914.3140258789062, + "logps/rejected": -1174.50048828125, + "loss": 0.3897, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.02104377746582, + "rewards/margins": 2.0837643146514893, + "rewards/rejected": -7.104808330535889, + "step": 1460 + }, + { + "epoch": 0.9542003428851334, + "grad_norm": 25.299453873176645, + "learning_rate": 1.2725495176585166e-09, + "logits/chosen": -0.7396911382675171, + "logits/rejected": -0.6380915641784668, + "logps/chosen": -968.8209838867188, + "logps/rejected": -1188.0623779296875, + "loss": 0.3772, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.297964572906494, + "rewards/margins": 1.943697452545166, + "rewards/rejected": -7.241661548614502, + "step": 1461 + }, + { + "epoch": 0.954853457425096, + "grad_norm": 27.103808867302387, + "learning_rate": 1.2365252465694086e-09, + "logits/chosen": -0.863556444644928, + "logits/rejected": -0.7426696419715881, + "logps/chosen": -987.9991455078125, + "logps/rejected": -1103.204345703125, + "loss": 0.3622, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.20358943939209, + "rewards/margins": 1.4331691265106201, + "rewards/rejected": -6.636758804321289, + "step": 1462 + }, + { + "epoch": 0.9555065719650584, + "grad_norm": 17.394073074491665, + "learning_rate": 1.2010150523190988e-09, + "logits/chosen": -0.9870079159736633, + "logits/rejected": -0.9677950739860535, + "logps/chosen": -968.0743408203125, + "logps/rejected": -1198.095458984375, + "loss": 0.3549, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.881644248962402, + "rewards/margins": 2.15342378616333, + "rewards/rejected": -7.035068035125732, + "step": 1463 + }, + { + "epoch": 0.9561596865050208, + "grad_norm": 24.783037834413452, + "learning_rate": 1.1660191197428226e-09, + "logits/chosen": -0.7138622999191284, + "logits/rejected": -0.7153616547584534, + "logps/chosen": -958.835693359375, + "logps/rejected": -1190.4322509765625, + "loss": 0.327, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.410097599029541, + "rewards/margins": 1.866647481918335, + "rewards/rejected": -7.276744842529297, + "step": 1464 + }, + { + "epoch": 0.9568128010449832, + "grad_norm": 30.254292249337972, + "learning_rate": 1.13153763099898e-09, + "logits/chosen": -0.7923008799552917, + "logits/rejected": -0.8359322547912598, + "logps/chosen": -964.593994140625, + "logps/rejected": -1163.7933349609375, + "loss": 0.42, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.957150459289551, + "rewards/margins": 2.0364115238189697, + "rewards/rejected": -6.993561744689941, + "step": 1465 + }, + { + "epoch": 0.9574659155849458, + "grad_norm": 19.941049592800624, + "learning_rate": 1.0975707655682453e-09, + "logits/chosen": -0.8487153649330139, + "logits/rejected": -0.7613034844398499, + "logps/chosen": -860.0794677734375, + "logps/rejected": -1030.533203125, + "loss": 0.433, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.2322821617126465, + "rewards/margins": 1.7400192022323608, + "rewards/rejected": -5.972302436828613, + "step": 1466 + }, + { + "epoch": 0.9581190301249082, + "grad_norm": 30.31638895507804, + "learning_rate": 1.0641187002526142e-09, + "logits/chosen": -0.9801903963088989, + "logits/rejected": -0.8500902056694031, + "logps/chosen": -886.7667236328125, + "logps/rejected": -925.6876220703125, + "loss": 0.4019, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.857925891876221, + "rewards/margins": 0.8767415881156921, + "rewards/rejected": -5.734667778015137, + "step": 1467 + }, + { + "epoch": 0.9587721446648706, + "grad_norm": 42.16901233596558, + "learning_rate": 1.0311816091744697e-09, + "logits/chosen": -0.7535737752914429, + "logits/rejected": -0.7454916834831238, + "logps/chosen": -932.8473510742188, + "logps/rejected": -1178.4783935546875, + "loss": 0.3957, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.699604034423828, + "rewards/margins": 2.059380531311035, + "rewards/rejected": -6.7589850425720215, + "step": 1468 + }, + { + "epoch": 0.959425259204833, + "grad_norm": 26.651901787055372, + "learning_rate": 9.987596637756946e-10, + "logits/chosen": -0.8313369154930115, + "logits/rejected": -0.6576776504516602, + "logps/chosen": -987.6339111328125, + "logps/rejected": -1117.3695068359375, + "loss": 0.3691, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.855637073516846, + "rewards/margins": 1.547339677810669, + "rewards/rejected": -6.402976989746094, + "step": 1469 + }, + { + "epoch": 0.9600783737447955, + "grad_norm": 21.683003406272572, + "learning_rate": 9.668530328167612e-10, + "logits/chosen": -0.812420129776001, + "logits/rejected": -0.6992568373680115, + "logps/chosen": -993.4514770507812, + "logps/rejected": -1210.0225830078125, + "loss": 0.3348, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.396690845489502, + "rewards/margins": 2.229128837585449, + "rewards/rejected": -7.625819206237793, + "step": 1470 + }, + { + "epoch": 0.960731488284758, + "grad_norm": 17.28840064330659, + "learning_rate": 9.354618823758653e-10, + "logits/chosen": -1.02504301071167, + "logits/rejected": -0.8524007797241211, + "logps/chosen": -944.0267333984375, + "logps/rejected": -1072.3201904296875, + "loss": 0.3258, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.7975006103515625, + "rewards/margins": 1.4966485500335693, + "rewards/rejected": -6.294148921966553, + "step": 1471 + }, + { + "epoch": 0.9613846028247204, + "grad_norm": 19.607498707495395, + "learning_rate": 9.045863758480709e-10, + "logits/chosen": -1.0104511976242065, + "logits/rejected": -1.0129821300506592, + "logps/chosen": -1002.5841064453125, + "logps/rejected": -1276.6905517578125, + "loss": 0.3336, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.3765411376953125, + "rewards/margins": 2.1960718631744385, + "rewards/rejected": -7.572613716125488, + "step": 1472 + }, + { + "epoch": 0.9620377173646828, + "grad_norm": 34.47668846556969, + "learning_rate": 8.742266739444337e-10, + "logits/chosen": -0.9214047193527222, + "logits/rejected": -0.9010781645774841, + "logps/chosen": -976.4214477539062, + "logps/rejected": -1183.3560791015625, + "loss": 0.3637, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.587342739105225, + "rewards/margins": 1.8902616500854492, + "rewards/rejected": -7.477603912353516, + "step": 1473 + }, + { + "epoch": 0.9626908319046453, + "grad_norm": 18.40991830007408, + "learning_rate": 8.44382934691179e-10, + "logits/chosen": -0.920522153377533, + "logits/rejected": -0.8545042276382446, + "logps/chosen": -954.7621459960938, + "logps/rejected": -1119.32666015625, + "loss": 0.3603, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.054173946380615, + "rewards/margins": 1.6411688327789307, + "rewards/rejected": -6.695343017578125, + "step": 1474 + }, + { + "epoch": 0.9633439464446077, + "grad_norm": 30.063682711715693, + "learning_rate": 8.150553134289029e-10, + "logits/chosen": -0.7451218962669373, + "logits/rejected": -0.7642776966094971, + "logps/chosen": -958.5621948242188, + "logps/rejected": -1134.0743408203125, + "loss": 0.3813, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.907833099365234, + "rewards/margins": 1.4756911993026733, + "rewards/rejected": -6.383524417877197, + "step": 1475 + }, + { + "epoch": 0.9639970609845702, + "grad_norm": 28.63972492266538, + "learning_rate": 7.862439628116946e-10, + "logits/chosen": -0.9498513340950012, + "logits/rejected": -0.8719826936721802, + "logps/chosen": -1016.4282836914062, + "logps/rejected": -1184.1448974609375, + "loss": 0.361, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.389301300048828, + "rewards/margins": 2.168164014816284, + "rewards/rejected": -7.557465553283691, + "step": 1476 + }, + { + "epoch": 0.9646501755245326, + "grad_norm": 19.29656245173236, + "learning_rate": 7.579490328064264e-10, + "logits/chosen": -0.628196120262146, + "logits/rejected": -0.5795927047729492, + "logps/chosen": -942.5938720703125, + "logps/rejected": -1108.619873046875, + "loss": 0.4409, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.877640247344971, + "rewards/margins": 1.6121346950531006, + "rewards/rejected": -6.48977518081665, + "step": 1477 + }, + { + "epoch": 0.9653032900644951, + "grad_norm": 22.454212973722687, + "learning_rate": 7.301706706919208e-10, + "logits/chosen": -0.6595121026039124, + "logits/rejected": -0.7709716558456421, + "logps/chosen": -993.6766357421875, + "logps/rejected": -1450.3536376953125, + "loss": 0.4043, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.951398849487305, + "rewards/margins": 3.6712889671325684, + "rewards/rejected": -8.622688293457031, + "step": 1478 + }, + { + "epoch": 0.9659564046044575, + "grad_norm": 33.820441000688064, + "learning_rate": 7.029090210581956e-10, + "logits/chosen": -0.8978259563446045, + "logits/rejected": -0.7938851118087769, + "logps/chosen": -1084.6802978515625, + "logps/rejected": -1231.305908203125, + "loss": 0.3614, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.54904842376709, + "rewards/margins": 1.9202641248703003, + "rewards/rejected": -7.46931266784668, + "step": 1479 + }, + { + "epoch": 0.96660951914442, + "grad_norm": 29.055292815530052, + "learning_rate": 6.761642258056977e-10, + "logits/chosen": -0.8216730952262878, + "logits/rejected": -0.8812836408615112, + "logps/chosen": -909.160400390625, + "logps/rejected": -1194.9307861328125, + "loss": 0.43, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.529107093811035, + "rewards/margins": 1.9421048164367676, + "rewards/rejected": -6.471211910247803, + "step": 1480 + }, + { + "epoch": 0.9672626336843824, + "grad_norm": 28.942362395099714, + "learning_rate": 6.499364241446148e-10, + "logits/chosen": -0.8953934907913208, + "logits/rejected": -0.8497557640075684, + "logps/chosen": -954.681884765625, + "logps/rejected": -1159.572021484375, + "loss": 0.3649, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.4994611740112305, + "rewards/margins": 2.3079538345336914, + "rewards/rejected": -6.80741548538208, + "step": 1481 + }, + { + "epoch": 0.9679157482243449, + "grad_norm": 35.14445460138126, + "learning_rate": 6.242257525940875e-10, + "logits/chosen": -0.9045277833938599, + "logits/rejected": -0.8981985449790955, + "logps/chosen": -969.97998046875, + "logps/rejected": -1138.05126953125, + "loss": 0.4004, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.53133487701416, + "rewards/margins": 1.3268393278121948, + "rewards/rejected": -6.8581743240356445, + "step": 1482 + }, + { + "epoch": 0.9685688627643073, + "grad_norm": 24.924893058423713, + "learning_rate": 5.990323449815316e-10, + "logits/chosen": -0.9852606058120728, + "logits/rejected": -0.8136059045791626, + "logps/chosen": -1126.402099609375, + "logps/rejected": -1153.1251220703125, + "loss": 0.3822, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.641076564788818, + "rewards/margins": 0.9179675579071045, + "rewards/rejected": -6.559044361114502, + "step": 1483 + }, + { + "epoch": 0.9692219773042697, + "grad_norm": 22.221974478367503, + "learning_rate": 5.743563324419387e-10, + "logits/chosen": -0.9423245787620544, + "logits/rejected": -0.7548604011535645, + "logps/chosen": -1162.7821044921875, + "logps/rejected": -1321.6494140625, + "loss": 0.3694, + "rewards/accuracies": 0.78125, + "rewards/chosen": -6.515698432922363, + "rewards/margins": 2.173647403717041, + "rewards/rejected": -8.689346313476562, + "step": 1484 + }, + { + "epoch": 0.9698750918442322, + "grad_norm": 22.796985665535065, + "learning_rate": 5.501978434171883e-10, + "logits/chosen": -0.8227739930152893, + "logits/rejected": -0.6972899436950684, + "logps/chosen": -992.651123046875, + "logps/rejected": -1125.826416015625, + "loss": 0.41, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.140564441680908, + "rewards/margins": 1.821034550666809, + "rewards/rejected": -6.961598873138428, + "step": 1485 + }, + { + "epoch": 0.9705282063841947, + "grad_norm": 23.18949792218578, + "learning_rate": 5.265570036553813e-10, + "logits/chosen": -0.8643534779548645, + "logits/rejected": -0.7788809537887573, + "logps/chosen": -953.4285888671875, + "logps/rejected": -1124.8902587890625, + "loss": 0.3599, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.857746124267578, + "rewards/margins": 1.7846039533615112, + "rewards/rejected": -6.642350196838379, + "step": 1486 + }, + { + "epoch": 0.9711813209241571, + "grad_norm": 51.28508939305074, + "learning_rate": 5.034339362101958e-10, + "logits/chosen": -0.699268639087677, + "logits/rejected": -0.8176553249359131, + "logps/chosen": -902.2926025390625, + "logps/rejected": -1181.4716796875, + "loss": 0.32, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.03464412689209, + "rewards/margins": 1.879096269607544, + "rewards/rejected": -6.913740634918213, + "step": 1487 + }, + { + "epoch": 0.9718344354641195, + "grad_norm": 27.68604192606488, + "learning_rate": 4.808287614402218e-10, + "logits/chosen": -1.0526833534240723, + "logits/rejected": -0.9321053624153137, + "logps/chosen": -1100.566162109375, + "logps/rejected": -1366.1162109375, + "loss": 0.3962, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.126470565795898, + "rewards/margins": 2.3651087284088135, + "rewards/rejected": -7.491579055786133, + "step": 1488 + }, + { + "epoch": 0.9724875500040819, + "grad_norm": 21.713246091016085, + "learning_rate": 4.587415970083719e-10, + "logits/chosen": -0.7959469556808472, + "logits/rejected": -0.9242347478866577, + "logps/chosen": -991.1103515625, + "logps/rejected": -1283.2503662109375, + "loss": 0.3681, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.195345878601074, + "rewards/margins": 2.3410024642944336, + "rewards/rejected": -7.536348342895508, + "step": 1489 + }, + { + "epoch": 0.9731406645440445, + "grad_norm": 73.72354463354812, + "learning_rate": 4.3717255788121577e-10, + "logits/chosen": -1.0061514377593994, + "logits/rejected": -0.8946919441223145, + "logps/chosen": -1000.9629516601562, + "logps/rejected": -1165.5416259765625, + "loss": 0.4096, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.1635589599609375, + "rewards/margins": 1.668057918548584, + "rewards/rejected": -6.831617832183838, + "step": 1490 + }, + { + "epoch": 0.9737937790840069, + "grad_norm": 22.70064351850388, + "learning_rate": 4.161217563284469e-10, + "logits/chosen": -1.0634889602661133, + "logits/rejected": -0.9366331696510315, + "logps/chosen": -941.7094116210938, + "logps/rejected": -1145.0657958984375, + "loss": 0.2551, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.710692405700684, + "rewards/margins": 2.2830896377563477, + "rewards/rejected": -6.9937825202941895, + "step": 1491 + }, + { + "epoch": 0.9744468936239693, + "grad_norm": 39.7036696071761, + "learning_rate": 3.9558930192225004e-10, + "logits/chosen": -0.8806736469268799, + "logits/rejected": -0.9087212085723877, + "logps/chosen": -896.1585693359375, + "logps/rejected": -1149.5721435546875, + "loss": 0.3194, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.4892706871032715, + "rewards/margins": 2.5188791751861572, + "rewards/rejected": -7.00814962387085, + "step": 1492 + }, + { + "epoch": 0.9751000081639317, + "grad_norm": 23.64003160960863, + "learning_rate": 3.755753015367236e-10, + "logits/chosen": -0.9212431907653809, + "logits/rejected": -0.9223770499229431, + "logps/chosen": -961.08251953125, + "logps/rejected": -1142.14990234375, + "loss": 0.3491, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.822206497192383, + "rewards/margins": 1.5445626974105835, + "rewards/rejected": -6.366768836975098, + "step": 1493 + }, + { + "epoch": 0.9757531227038942, + "grad_norm": 26.977497967955454, + "learning_rate": 3.560798593473913e-10, + "logits/chosen": -1.052283763885498, + "logits/rejected": -0.953407347202301, + "logps/chosen": -974.070556640625, + "logps/rejected": -1134.294921875, + "loss": 0.4168, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.088522911071777, + "rewards/margins": 1.946774959564209, + "rewards/rejected": -6.0352983474731445, + "step": 1494 + }, + { + "epoch": 0.9764062372438567, + "grad_norm": 15.544717499533835, + "learning_rate": 3.371030768305583e-10, + "logits/chosen": -0.7873333096504211, + "logits/rejected": -0.8360827565193176, + "logps/chosen": -1022.1478271484375, + "logps/rejected": -1273.257568359375, + "loss": 0.3568, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.51603889465332, + "rewards/margins": 1.96681809425354, + "rewards/rejected": -7.482856750488281, + "step": 1495 + }, + { + "epoch": 0.9770593517838191, + "grad_norm": 20.11472540145716, + "learning_rate": 3.186450527628781e-10, + "logits/chosen": -0.741105318069458, + "logits/rejected": -0.804491400718689, + "logps/chosen": -1016.6484375, + "logps/rejected": -1231.481689453125, + "loss": 0.368, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.079611301422119, + "rewards/margins": 1.775844931602478, + "rewards/rejected": -6.855456352233887, + "step": 1496 + }, + { + "epoch": 0.9777124663237815, + "grad_norm": 17.63253263663225, + "learning_rate": 3.007058832207976e-10, + "logits/chosen": -0.9436084628105164, + "logits/rejected": -0.9498050212860107, + "logps/chosen": -914.9832763671875, + "logps/rejected": -1074.5772705078125, + "loss": 0.2992, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.8038554191589355, + "rewards/margins": 1.674473762512207, + "rewards/rejected": -6.478329181671143, + "step": 1497 + }, + { + "epoch": 0.978365580863744, + "grad_norm": 38.54187704615717, + "learning_rate": 2.8328566158002386e-10, + "logits/chosen": -0.9954776763916016, + "logits/rejected": -0.9750261902809143, + "logps/chosen": -1050.8953857421875, + "logps/rejected": -1183.7637939453125, + "loss": 0.4243, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.6384124755859375, + "rewards/margins": 1.5582480430603027, + "rewards/rejected": -7.196660995483398, + "step": 1498 + }, + { + "epoch": 0.9790186954037065, + "grad_norm": 21.021314956248386, + "learning_rate": 2.663844785151248e-10, + "logits/chosen": -0.9795162081718445, + "logits/rejected": -0.922295093536377, + "logps/chosen": -949.4105224609375, + "logps/rejected": -1109.667724609375, + "loss": 0.3731, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.881753444671631, + "rewards/margins": 1.8854466676712036, + "rewards/rejected": -6.767199993133545, + "step": 1499 + }, + { + "epoch": 0.9796718099436689, + "grad_norm": 21.956131521928086, + "learning_rate": 2.5000242199895163e-10, + "logits/chosen": -0.8551816344261169, + "logits/rejected": -0.8687204122543335, + "logps/chosen": -909.28271484375, + "logps/rejected": -1032.6781005859375, + "loss": 0.3197, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.83204984664917, + "rewards/margins": 1.1786363124847412, + "rewards/rejected": -6.01068639755249, + "step": 1500 + }, + { + "epoch": 0.9796718099436689, + "eval_logits/chosen": -0.6721622943878174, + "eval_logits/rejected": -0.6120479106903076, + "eval_logps/chosen": -988.9237060546875, + "eval_logps/rejected": -1153.097900390625, + "eval_loss": 0.3866689205169678, + "eval_rewards/accuracies": 0.8080000281333923, + "eval_rewards/chosen": -5.150165557861328, + "eval_rewards/margins": 1.7291275262832642, + "eval_rewards/rejected": -6.879292964935303, + "eval_runtime": 616.2237, + "eval_samples_per_second": 6.491, + "eval_steps_per_second": 0.406, + "step": 1500 + }, + { + "epoch": 0.9803249244836313, + "grad_norm": 40.25994643128802, + "learning_rate": 2.341395773022614e-10, + "logits/chosen": -0.8601680994033813, + "logits/rejected": -0.7846535444259644, + "logps/chosen": -1010.7470092773438, + "logps/rejected": -1159.432861328125, + "loss": 0.4313, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.6882429122924805, + "rewards/margins": 1.3808894157409668, + "rewards/rejected": -7.069132328033447, + "step": 1501 + }, + { + "epoch": 0.9809780390235938, + "grad_norm": 21.765081447453614, + "learning_rate": 2.1879602699325095e-10, + "logits/chosen": -0.9793115854263306, + "logits/rejected": -0.9564782977104187, + "logps/chosen": -968.4996948242188, + "logps/rejected": -1242.9364013671875, + "loss": 0.3515, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.951983451843262, + "rewards/margins": 2.656872272491455, + "rewards/rejected": -7.608855724334717, + "step": 1502 + }, + { + "epoch": 0.9816311535635562, + "grad_norm": 18.053421531219165, + "learning_rate": 2.0397185093710135e-10, + "logits/chosen": -0.8464664220809937, + "logits/rejected": -0.8488726615905762, + "logps/chosen": -921.1729736328125, + "logps/rejected": -1140.6810302734375, + "loss": 0.3571, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.196868419647217, + "rewards/margins": 1.8572230339050293, + "rewards/rejected": -7.054091453552246, + "step": 1503 + }, + { + "epoch": 0.9822842681035187, + "grad_norm": 45.38378711687764, + "learning_rate": 1.8966712629558956e-10, + "logits/chosen": -0.7857608795166016, + "logits/rejected": -0.7569239139556885, + "logps/chosen": -1089.02783203125, + "logps/rejected": -1232.945068359375, + "loss": 0.4773, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.689802646636963, + "rewards/margins": 1.7456611394882202, + "rewards/rejected": -7.435463905334473, + "step": 1504 + }, + { + "epoch": 0.9829373826434811, + "grad_norm": 33.30306187618995, + "learning_rate": 1.7588192752669983e-10, + "logits/chosen": -0.8797517418861389, + "logits/rejected": -0.9536515474319458, + "logps/chosen": -915.71240234375, + "logps/rejected": -1117.3577880859375, + "loss": 0.4076, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.108963489532471, + "rewards/margins": 1.6557157039642334, + "rewards/rejected": -6.764679431915283, + "step": 1505 + }, + { + "epoch": 0.9835904971834436, + "grad_norm": 19.205264223495828, + "learning_rate": 1.6261632638419064e-10, + "logits/chosen": -0.8795767426490784, + "logits/rejected": -0.6969910860061646, + "logps/chosen": -948.1537475585938, + "logps/rejected": -1066.2906494140625, + "loss": 0.33, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.414488792419434, + "rewards/margins": 1.6049293279647827, + "rewards/rejected": -6.019418239593506, + "step": 1506 + }, + { + "epoch": 0.984243611723406, + "grad_norm": 20.23091846177532, + "learning_rate": 1.498703919172506e-10, + "logits/chosen": -0.8670247197151184, + "logits/rejected": -0.86240154504776, + "logps/chosen": -927.6676025390625, + "logps/rejected": -1164.9281005859375, + "loss": 0.3854, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.659823417663574, + "rewards/margins": 2.259251117706299, + "rewards/rejected": -6.919074058532715, + "step": 1507 + }, + { + "epoch": 0.9848967262633684, + "grad_norm": 19.609976871290833, + "learning_rate": 1.3764419047014307e-10, + "logits/chosen": -0.9074758291244507, + "logits/rejected": -0.7539640069007874, + "logps/chosen": -1016.4730834960938, + "logps/rejected": -1181.7667236328125, + "loss": 0.3259, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.008094310760498, + "rewards/margins": 2.28283429145813, + "rewards/rejected": -7.290928840637207, + "step": 1508 + }, + { + "epoch": 0.9855498408033309, + "grad_norm": 17.954953795909553, + "learning_rate": 1.259377856818622e-10, + "logits/chosen": -0.8041942119598389, + "logits/rejected": -0.7212069630622864, + "logps/chosen": -987.7574462890625, + "logps/rejected": -1300.8795166015625, + "loss": 0.3057, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.226101398468018, + "rewards/margins": 2.5325069427490234, + "rewards/rejected": -7.758607864379883, + "step": 1509 + }, + { + "epoch": 0.9862029553432933, + "grad_norm": 35.49175552313481, + "learning_rate": 1.147512384857663e-10, + "logits/chosen": -0.8118208646774292, + "logits/rejected": -0.8001708984375, + "logps/chosen": -935.20458984375, + "logps/rejected": -1067.834716796875, + "loss": 0.3763, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.038346767425537, + "rewards/margins": 1.3084391355514526, + "rewards/rejected": -6.346785545349121, + "step": 1510 + }, + { + "epoch": 0.9868560698832558, + "grad_norm": 21.866661204153374, + "learning_rate": 1.0408460710930045e-10, + "logits/chosen": -0.6990280747413635, + "logits/rejected": -0.8841565847396851, + "logps/chosen": -953.3377685546875, + "logps/rejected": -1383.275390625, + "loss": 0.359, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.795334339141846, + "rewards/margins": 2.9013049602508545, + "rewards/rejected": -7.696639537811279, + "step": 1511 + }, + { + "epoch": 0.9875091844232182, + "grad_norm": 30.07481661009675, + "learning_rate": 9.393794707368563e-11, + "logits/chosen": -1.0245449542999268, + "logits/rejected": -0.992201566696167, + "logps/chosen": -1000.864501953125, + "logps/rejected": -1175.2371826171875, + "loss": 0.4041, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.519082069396973, + "rewards/margins": 1.6422016620635986, + "rewards/rejected": -7.161284446716309, + "step": 1512 + }, + { + "epoch": 0.9881622989631806, + "grad_norm": 17.369917763436096, + "learning_rate": 8.43113111936189e-11, + "logits/chosen": -0.7805638909339905, + "logits/rejected": -0.8056908845901489, + "logps/chosen": -908.6342163085938, + "logps/rejected": -1338.196044921875, + "loss": 0.3133, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.329880237579346, + "rewards/margins": 3.113621711730957, + "rewards/rejected": -7.4435014724731445, + "step": 1513 + }, + { + "epoch": 0.9888154135031431, + "grad_norm": 35.69657571556223, + "learning_rate": 7.520474957699585e-11, + "logits/chosen": -0.7891541123390198, + "logits/rejected": -0.8165507912635803, + "logps/chosen": -932.488037109375, + "logps/rejected": -1186.4473876953125, + "loss": 0.4759, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.790083885192871, + "rewards/margins": 2.200594425201416, + "rewards/rejected": -6.990677833557129, + "step": 1514 + }, + { + "epoch": 0.9894685280431056, + "grad_norm": 18.050312335076022, + "learning_rate": 6.661830962466641e-11, + "logits/chosen": -0.8305252194404602, + "logits/rejected": -0.8962659239768982, + "logps/chosen": -964.072021484375, + "logps/rejected": -1240.065673828125, + "loss": 0.3223, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.888925552368164, + "rewards/margins": 2.5497281551361084, + "rewards/rejected": -7.438653945922852, + "step": 1515 + }, + { + "epoch": 0.990121642583068, + "grad_norm": 17.478159809130133, + "learning_rate": 5.855203603017945e-11, + "logits/chosen": -1.1193163394927979, + "logits/rejected": -1.0706626176834106, + "logps/chosen": -1132.2928466796875, + "logps/rejected": -1322.29248046875, + "loss": 0.3302, + "rewards/accuracies": 0.8125, + "rewards/chosen": -6.359055995941162, + "rewards/margins": 1.94695246219635, + "rewards/rejected": -8.306008338928223, + "step": 1516 + }, + { + "epoch": 0.9907747571230304, + "grad_norm": 32.57656802641893, + "learning_rate": 5.10059707795496e-11, + "logits/chosen": -0.9936408400535583, + "logits/rejected": -0.9797040820121765, + "logps/chosen": -970.2620849609375, + "logps/rejected": -1146.864501953125, + "loss": 0.4084, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.244019508361816, + "rewards/margins": 1.6731963157653809, + "rewards/rejected": -6.917215824127197, + "step": 1517 + }, + { + "epoch": 0.9914278716629928, + "grad_norm": 17.499162801776325, + "learning_rate": 4.398015315103531e-11, + "logits/chosen": -0.9143285155296326, + "logits/rejected": -0.8588674068450928, + "logps/chosen": -989.8718872070312, + "logps/rejected": -1191.24072265625, + "loss": 0.3518, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.379557132720947, + "rewards/margins": 1.8680692911148071, + "rewards/rejected": -7.247626304626465, + "step": 1518 + }, + { + "epoch": 0.9920809862029554, + "grad_norm": 18.95929812206617, + "learning_rate": 3.7474619714927827e-11, + "logits/chosen": -0.8051342964172363, + "logits/rejected": -0.89119952917099, + "logps/chosen": -986.9842529296875, + "logps/rejected": -1458.8433837890625, + "loss": 0.3519, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.907092571258545, + "rewards/margins": 3.5196523666381836, + "rewards/rejected": -8.42674446105957, + "step": 1519 + }, + { + "epoch": 0.9927341007429178, + "grad_norm": 14.665306889160593, + "learning_rate": 3.148940433339575e-11, + "logits/chosen": -0.8779653310775757, + "logits/rejected": -0.7155288457870483, + "logps/chosen": -1045.592041015625, + "logps/rejected": -1308.4227294921875, + "loss": 0.3158, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.181655406951904, + "rewards/margins": 2.8715524673461914, + "rewards/rejected": -8.053207397460938, + "step": 1520 + }, + { + "epoch": 0.9933872152828802, + "grad_norm": 18.0393916964981, + "learning_rate": 2.6024538160251962e-11, + "logits/chosen": -0.9173994660377502, + "logits/rejected": -0.8805586099624634, + "logps/chosen": -879.5809326171875, + "logps/rejected": -1108.05078125, + "loss": 0.3632, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.384559154510498, + "rewards/margins": 1.4541279077529907, + "rewards/rejected": -5.838687419891357, + "step": 1521 + }, + { + "epoch": 0.9940403298228426, + "grad_norm": 29.385526588038253, + "learning_rate": 2.1080049640864738e-11, + "logits/chosen": -1.0499584674835205, + "logits/rejected": -1.0689440965652466, + "logps/chosen": -925.3338623046875, + "logps/rejected": -1172.06298828125, + "loss": 0.3628, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.823688507080078, + "rewards/margins": 2.227992296218872, + "rewards/rejected": -7.051680564880371, + "step": 1522 + }, + { + "epoch": 0.9946934443628052, + "grad_norm": 27.358046262023528, + "learning_rate": 1.665596451193574e-11, + "logits/chosen": -0.901128888130188, + "logits/rejected": -0.6968897581100464, + "logps/chosen": -931.8582763671875, + "logps/rejected": -1069.2611083984375, + "loss": 0.3809, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.321289539337158, + "rewards/margins": 2.0190200805664062, + "rewards/rejected": -6.340309143066406, + "step": 1523 + }, + { + "epoch": 0.9953465589027676, + "grad_norm": 45.310244069027156, + "learning_rate": 1.2752305801400077e-11, + "logits/chosen": -0.9536406993865967, + "logits/rejected": -0.9706865549087524, + "logps/chosen": -1034.489501953125, + "logps/rejected": -1205.436767578125, + "loss": 0.399, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.685507774353027, + "rewards/margins": 1.6740481853485107, + "rewards/rejected": -7.359556198120117, + "step": 1524 + }, + { + "epoch": 0.99599967344273, + "grad_norm": 27.900190174903972, + "learning_rate": 9.369093828326402e-12, + "logits/chosen": -0.8810493350028992, + "logits/rejected": -0.810979962348938, + "logps/chosen": -970.609130859375, + "logps/rejected": -1154.4971923828125, + "loss": 0.2934, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.879640102386475, + "rewards/margins": 1.8229658603668213, + "rewards/rejected": -6.702606201171875, + "step": 1525 + }, + { + "epoch": 0.9966527879826924, + "grad_norm": 34.411440821011766, + "learning_rate": 6.506346202772572e-12, + "logits/chosen": -0.8746610879898071, + "logits/rejected": -0.8008745908737183, + "logps/chosen": -990.9203491210938, + "logps/rejected": -1058.67333984375, + "loss": 0.4343, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.3791985511779785, + "rewards/margins": 0.6661874055862427, + "rewards/rejected": -6.045385837554932, + "step": 1526 + }, + { + "epoch": 0.9973059025226549, + "grad_norm": 31.336013670265675, + "learning_rate": 4.164077825707934e-12, + "logits/chosen": -0.9213491678237915, + "logits/rejected": -0.7777243852615356, + "logps/chosen": -913.4546508789062, + "logps/rejected": -1077.456787109375, + "loss": 0.4079, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.575063705444336, + "rewards/margins": 1.3315820693969727, + "rewards/rejected": -5.906645774841309, + "step": 1527 + }, + { + "epoch": 0.9979590170626174, + "grad_norm": 23.47986482520017, + "learning_rate": 2.3423008889467134e-12, + "logits/chosen": -0.9849967360496521, + "logits/rejected": -0.9053511619567871, + "logps/chosen": -968.4962158203125, + "logps/rejected": -1164.7457275390625, + "loss": 0.3579, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.731381416320801, + "rewards/margins": 2.134218215942383, + "rewards/rejected": -6.865600109100342, + "step": 1528 + }, + { + "epoch": 0.9986121316025798, + "grad_norm": 20.62158211293607, + "learning_rate": 1.0410248750925e-12, + "logits/chosen": -0.8682329654693604, + "logits/rejected": -0.8483768105506897, + "logps/chosen": -1060.15673828125, + "logps/rejected": -1233.65478515625, + "loss": 0.4167, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.726959228515625, + "rewards/margins": 1.6892329454421997, + "rewards/rejected": -7.416192054748535, + "step": 1529 + }, + { + "epoch": 0.9992652461425422, + "grad_norm": 23.44058985395152, + "learning_rate": 2.6025655743833196e-13, + "logits/chosen": -0.7166305184364319, + "logits/rejected": -0.8835107684135437, + "logps/chosen": -1093.7403564453125, + "logps/rejected": -1359.48291015625, + "loss": 0.3416, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.947076797485352, + "rewards/margins": 1.7625958919525146, + "rewards/rejected": -7.709671974182129, + "step": 1530 + }, + { + "epoch": 0.9999183606825047, + "grad_norm": 18.690941266109423, + "learning_rate": 0.0, + "logits/chosen": -0.8586152195930481, + "logits/rejected": -0.7749535441398621, + "logps/chosen": -904.07470703125, + "logps/rejected": -1040.50537109375, + "loss": 0.352, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.729012489318848, + "rewards/margins": 1.549518346786499, + "rewards/rejected": -6.278531074523926, + "step": 1531 + }, + { + "epoch": 0.9999183606825047, + "step": 1531, + "total_flos": 0.0, + "train_loss": 0.46421666473520107, + "train_runtime": 86694.5826, + "train_samples_per_second": 2.261, + "train_steps_per_second": 0.018 + } + ], + "logging_steps": 1, + "max_steps": 1531, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}