diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 4.0, + "epoch": 1.0, "eval_steps": 100, - "global_step": 1540, + "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -15,7 +15,7 @@ "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, - "loss": 0.5, + "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -25,2537 +25,597 @@ { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, - "logits/chosen": -1.8663586378097534, - "logits/rejected": -1.8706679344177246, - "logps/chosen": -36.9964485168457, - "logps/rejected": -33.65947723388672, - "loss": 0.4966, + "logits/chosen": -1.866492748260498, + "logits/rejected": -1.87080979347229, + "logps/chosen": -36.97657775878906, + "logps/rejected": -33.65824890136719, + "loss": 0.9236, "rewards/accuracies": 0.5277777910232544, - "rewards/chosen": 0.005075507797300816, - "rewards/margins": 0.019778331741690636, - "rewards/rejected": -0.014702823013067245, + "rewards/chosen": 0.015009618364274502, + "rewards/margins": 0.02909613959491253, + "rewards/rejected": -0.014086521230638027, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, - "logits/chosen": -1.9970680475234985, - "logits/rejected": -1.9997154474258423, - "logps/chosen": -29.64749526977539, - "logps/rejected": -29.048025131225586, - "loss": 0.5018, - "rewards/accuracies": 0.42500001192092896, - "rewards/chosen": -0.0026510744355618954, - "rewards/margins": -0.010360640473663807, - "rewards/rejected": 0.007709565572440624, + "logits/chosen": -1.9977840185165405, + "logits/rejected": -2.000425100326538, + "logps/chosen": -29.640512466430664, + "logps/rejected": -29.048751831054688, + "loss": 1.0528, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0008407801506109536, + "rewards/margins": -0.0065057664178311825, + "rewards/rejected": 0.007346546743065119, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, - "logits/chosen": -1.9203827381134033, - "logits/rejected": -1.9176925420761108, - "logps/chosen": -31.42234230041504, - "logps/rejected": -33.24127960205078, - "loss": 0.4984, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -0.003123724367469549, - "rewards/margins": 0.008287688717246056, - "rewards/rejected": -0.011411413550376892, + "logits/chosen": -1.9210313558578491, + "logits/rejected": -1.9183330535888672, + "logps/chosen": -31.377187728881836, + "logps/rejected": -33.214942932128906, + "loss": 0.976, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.019451653584837914, + "rewards/margins": 0.01769269071519375, + "rewards/rejected": 0.001758962869644165, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, - "logits/chosen": -2.018051862716675, - "logits/rejected": -2.009334087371826, - "logps/chosen": -32.55129623413086, - "logps/rejected": -32.50330352783203, - "loss": 0.4982, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.012754792347550392, - "rewards/margins": 0.008071592077612877, - "rewards/rejected": 0.004683199338614941, + "logits/chosen": -2.0173258781433105, + "logits/rejected": -2.008592128753662, + "logps/chosen": -32.55642318725586, + "logps/rejected": -32.49436569213867, + "loss": 1.0336, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.010191624984145164, + "rewards/margins": 0.001039800001308322, + "rewards/rejected": 0.009151825681328773, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, - "logits/chosen": -1.8634856939315796, - "logits/rejected": -1.8527206182479858, - "logps/chosen": -33.50724411010742, - "logps/rejected": -35.390602111816406, - "loss": 0.5016, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": 0.02511655166745186, - "rewards/margins": -0.005753959529101849, - "rewards/rejected": 0.030870508402585983, + "logits/chosen": -1.8626506328582764, + "logits/rejected": -1.8518873453140259, + "logps/chosen": -33.54867172241211, + "logps/rejected": -35.45621109008789, + "loss": 1.0318, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004401391837745905, + "rewards/margins": 0.006334079895168543, + "rewards/rejected": -0.001932688057422638, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, - "logits/chosen": -1.9438356161117554, - "logits/rejected": -1.94576096534729, - "logps/chosen": -32.481632232666016, - "logps/rejected": -33.15100860595703, - "loss": 0.4894, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.05927763134241104, - "rewards/margins": 0.049092620611190796, - "rewards/rejected": 0.010185008868575096, + "logits/chosen": -1.940718650817871, + "logits/rejected": -1.9426720142364502, + "logps/chosen": -32.53395462036133, + "logps/rejected": -33.20496368408203, + "loss": 0.9445, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03311632573604584, + "rewards/margins": 0.04990752786397934, + "rewards/rejected": -0.016791202127933502, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, - "logits/chosen": -2.0757923126220703, - "logits/rejected": -2.080766439437866, - "logps/chosen": -33.89708709716797, - "logps/rejected": -36.524818420410156, - "loss": 0.4939, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.04569761082530022, - "rewards/margins": 0.023670893162488937, - "rewards/rejected": 0.02202671766281128, + "logits/chosen": -2.0724740028381348, + "logits/rejected": -2.077458381652832, + "logps/chosen": -33.9911994934082, + "logps/rejected": -36.61388397216797, + "loss": 1.1022, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.001358801149763167, + "rewards/margins": 0.021143654361367226, + "rewards/rejected": -0.022502455860376358, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, - "logits/chosen": -1.936668038368225, - "logits/rejected": -1.9397681951522827, - "logps/chosen": -34.20936965942383, - "logps/rejected": -34.525596618652344, - "loss": 0.4817, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.10922203958034515, - "rewards/margins": 0.07581819593906403, - "rewards/rejected": 0.033403851091861725, + "logits/chosen": -1.936197280883789, + "logits/rejected": -1.9393237829208374, + "logps/chosen": -34.28167724609375, + "logps/rejected": -34.63819885253906, + "loss": 0.8186, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07306724786758423, + "rewards/margins": 0.09596274793148041, + "rewards/rejected": -0.02289549633860588, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, - "logits/chosen": -1.9462896585464478, - "logits/rejected": -1.9508006572723389, - "logps/chosen": -32.27099609375, - "logps/rejected": -32.275699615478516, - "loss": 0.4865, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.0982138067483902, - "rewards/margins": 0.05660901591181755, - "rewards/rejected": 0.04160478338599205, + "logits/chosen": -1.9451515674591064, + "logits/rejected": -1.949669599533081, + "logps/chosen": -32.39059066772461, + "logps/rejected": -32.346839904785156, + "loss": 1.0017, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.03841521218419075, + "rewards/margins": 0.03238191828131676, + "rewards/rejected": 0.006033292505890131, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, - "logits/chosen": -2.043682336807251, - "logits/rejected": -2.04168438911438, - "logps/chosen": -31.95809555053711, - "logps/rejected": -31.16133689880371, - "loss": 0.4794, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.14136961102485657, - "rewards/margins": 0.08893296122550964, - "rewards/rejected": 0.05243664234876633, + "logits/chosen": -2.042168617248535, + "logits/rejected": -2.040160655975342, + "logps/chosen": -32.12788772583008, + "logps/rejected": -31.280298233032227, + "loss": 0.8581, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.05647433549165726, + "rewards/margins": 0.06351961940526962, + "rewards/rejected": -0.007045289967209101, "step": 100 }, { "epoch": 0.26, - "eval_logits/chosen": -2.2374911308288574, - "eval_logits/rejected": -2.232652425765991, - "eval_logps/chosen": -33.869850158691406, - "eval_logps/rejected": -37.37673568725586, - "eval_loss": 0.49711015820503235, - "eval_rewards/accuracies": 0.5573089718818665, - "eval_rewards/chosen": 0.08235026895999908, - "eval_rewards/margins": 0.012408134527504444, - "eval_rewards/rejected": 0.06994213908910751, - "eval_runtime": 145.9739, - "eval_samples_per_second": 2.35, + "eval_logits/chosen": -2.237440824508667, + "eval_logits/rejected": -2.232595682144165, + "eval_logps/chosen": -34.017024993896484, + "eval_logps/rejected": -37.50282287597656, + "eval_loss": 1.0730067491531372, + "eval_rewards/accuracies": 0.5199335813522339, + "eval_rewards/chosen": 0.00876238290220499, + "eval_rewards/margins": 0.001864485559053719, + "eval_rewards/rejected": 0.0068978965282440186, + "eval_runtime": 146.0042, + "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, - "logits/chosen": -1.9995167255401611, - "logits/rejected": -1.9971492290496826, - "logps/chosen": -32.9360237121582, - "logps/rejected": -33.857337951660156, - "loss": 0.4819, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.15389741957187653, - "rewards/margins": 0.06788130104541779, - "rewards/rejected": 0.08601613342761993, + "logits/chosen": -1.9984264373779297, + "logits/rejected": -1.9960410594940186, + "logps/chosen": -33.10862350463867, + "logps/rejected": -34.00126266479492, + "loss": 1.1665, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0675957053899765, + "rewards/margins": 0.05354113504290581, + "rewards/rejected": 0.014054578728973866, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, - "logits/chosen": -2.0095458030700684, - "logits/rejected": -2.001213788986206, - "logps/chosen": -32.16087341308594, - "logps/rejected": -31.98464584350586, - "loss": 0.4876, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.142018124461174, - "rewards/margins": 0.05358927324414253, - "rewards/rejected": 0.08842884749174118, + "logits/chosen": -2.0099892616271973, + "logits/rejected": -2.001642942428589, + "logps/chosen": -32.312686920166016, + "logps/rejected": -32.10304641723633, + "loss": 0.9851, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.06610965728759766, + "rewards/margins": 0.03688037022948265, + "rewards/rejected": 0.029229288920760155, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, - "logits/chosen": -2.0384786128997803, - "logits/rejected": -2.030527114868164, - "logps/chosen": -30.18499755859375, - "logps/rejected": -31.905630111694336, - "loss": 0.4841, + "logits/chosen": -2.0365984439849854, + "logits/rejected": -2.0286123752593994, + "logps/chosen": -30.3278865814209, + "logps/rejected": -32.04685592651367, + "loss": 0.9819, "rewards/accuracies": 0.625, - "rewards/chosen": 0.14535793662071228, - "rewards/margins": 0.07167014479637146, - "rewards/rejected": 0.07368779182434082, + "rewards/chosen": 0.07391555607318878, + "rewards/margins": 0.07084138691425323, + "rewards/rejected": 0.0030741647351533175, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, - "logits/chosen": -1.9675172567367554, - "logits/rejected": -1.97771418094635, - "logps/chosen": -31.068078994750977, - "logps/rejected": -32.39047622680664, - "loss": 0.4757, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.17579205334186554, - "rewards/margins": 0.10358880460262299, - "rewards/rejected": 0.07220325618982315, + "logits/chosen": -1.9668314456939697, + "logits/rejected": -1.9770755767822266, + "logps/chosen": -31.2120304107666, + "logps/rejected": -32.57902526855469, + "loss": 0.8178, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.10381648689508438, + "rewards/margins": 0.12588787078857422, + "rewards/rejected": -0.02207140065729618, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, - "logits/chosen": -1.8799912929534912, - "logits/rejected": -1.8811372518539429, - "logps/chosen": -33.688819885253906, - "logps/rejected": -34.5561637878418, - "loss": 0.4651, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.2519363760948181, - "rewards/margins": 0.15139077603816986, - "rewards/rejected": 0.10054560005664825, + "logits/chosen": -1.8799388408660889, + "logits/rejected": -1.881087064743042, + "logps/chosen": -33.97100830078125, + "logps/rejected": -34.84876251220703, + "loss": 0.8351, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.11084038019180298, + "rewards/margins": 0.15659382939338684, + "rewards/rejected": -0.04575346037745476, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, - "logits/chosen": -1.9306682348251343, - "logits/rejected": -1.9272987842559814, - "logps/chosen": -35.7833251953125, - "logps/rejected": -32.48335266113281, - "loss": 0.4818, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.19417627155780792, - "rewards/margins": 0.07447630167007446, - "rewards/rejected": 0.11969996988773346, + "logits/chosen": -1.9326753616333008, + "logits/rejected": -1.9292488098144531, + "logps/chosen": -36.040306091308594, + "logps/rejected": -32.739051818847656, + "loss": 0.8539, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0656842365860939, + "rewards/margins": 0.07383431494235992, + "rewards/rejected": -0.008150083012878895, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, - "logits/chosen": -2.0319628715515137, - "logits/rejected": -2.0246424674987793, - "logps/chosen": -33.24143981933594, - "logps/rejected": -31.190576553344727, - "loss": 0.4536, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.2782108783721924, - "rewards/margins": 0.19757375121116638, - "rewards/rejected": 0.0806371346116066, + "logits/chosen": -2.0338892936706543, + "logits/rejected": -2.026510238647461, + "logps/chosen": -33.518821716308594, + "logps/rejected": -31.37355613708496, + "loss": 0.7141, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.13951851427555084, + "rewards/margins": 0.15037165582180023, + "rewards/rejected": -0.010853144340217113, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, - "logits/chosen": -2.039504289627075, - "logits/rejected": -2.044722080230713, - "logps/chosen": -31.97749900817871, - "logps/rejected": -32.195125579833984, - "loss": 0.4662, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.28364285826683044, - "rewards/margins": 0.1386357843875885, - "rewards/rejected": 0.14500707387924194, + "logits/chosen": -2.039998769760132, + "logits/rejected": -2.045238733291626, + "logps/chosen": -32.28400421142578, + "logps/rejected": -32.450523376464844, + "loss": 0.7652, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.13039177656173706, + "rewards/margins": 0.11308407783508301, + "rewards/rejected": 0.017307698726654053, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, - "logits/chosen": -2.0397489070892334, - "logits/rejected": -2.037031412124634, - "logps/chosen": -31.067768096923828, - "logps/rejected": -31.097219467163086, - "loss": 0.4756, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.21059219539165497, - "rewards/margins": 0.10009355843067169, - "rewards/rejected": 0.11049864441156387, + "logits/chosen": -2.0408711433410645, + "logits/rejected": -2.0380892753601074, + "logps/chosen": -31.287479400634766, + "logps/rejected": -31.33124351501465, + "loss": 0.8251, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.10073776543140411, + "rewards/margins": 0.10725078731775284, + "rewards/rejected": -0.0065130265429615974, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, - "logits/chosen": -1.910517930984497, - "logits/rejected": -1.915186882019043, - "logps/chosen": -31.059677124023438, - "logps/rejected": -32.61454391479492, - "loss": 0.4561, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.27330341935157776, - "rewards/margins": 0.1851106435060501, - "rewards/rejected": 0.08819273114204407, + "logits/chosen": -1.9110311269760132, + "logits/rejected": -1.915704369544983, + "logps/chosen": -31.336145401000977, + "logps/rejected": -32.791221618652344, + "loss": 0.899, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.13506832718849182, + "rewards/margins": 0.13521215319633484, + "rewards/rejected": -0.0001438349427189678, "step": 200 }, { "epoch": 0.52, - "eval_logits/chosen": -2.234231948852539, - "eval_logits/rejected": -2.2294156551361084, - "eval_logps/chosen": -33.776145935058594, - "eval_logps/rejected": -37.29295349121094, - "eval_loss": 0.49589911103248596, - "eval_rewards/accuracies": 0.5423588156700134, - "eval_rewards/chosen": 0.12920260429382324, - "eval_rewards/margins": 0.017370687797665596, - "eval_rewards/rejected": 0.1118319109082222, - "eval_runtime": 145.6148, - "eval_samples_per_second": 2.356, + "eval_logits/chosen": -2.2354750633239746, + "eval_logits/rejected": -2.2306265830993652, + "eval_logps/chosen": -34.03763198852539, + "eval_logps/rejected": -37.534156799316406, + "eval_loss": 1.0715795755386353, + "eval_rewards/accuracies": 0.5070598125457764, + "eval_rewards/chosen": -0.0015398082323372364, + "eval_rewards/margins": 0.007228231523185968, + "eval_rewards/rejected": -0.008768039755523205, + "eval_runtime": 145.8306, + "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, - "logits/chosen": -2.0229763984680176, - "logits/rejected": -2.0335872173309326, - "logps/chosen": -31.537296295166016, - "logps/rejected": -33.72160339355469, - "loss": 0.466, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.21700111031532288, - "rewards/margins": 0.14645527303218842, - "rewards/rejected": 0.07054580748081207, + "logits/chosen": -2.0229578018188477, + "logits/rejected": -2.033618450164795, + "logps/chosen": -31.776050567626953, + "logps/rejected": -33.90400314331055, + "loss": 0.7465, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.09762789309024811, + "rewards/margins": 0.11827856302261353, + "rewards/rejected": -0.020650675520300865, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, - "logits/chosen": -1.9160172939300537, - "logits/rejected": -1.9307291507720947, - "logps/chosen": -29.558719635009766, - "logps/rejected": -31.404027938842773, - "loss": 0.4532, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.27640971541404724, - "rewards/margins": 0.19933710992336273, - "rewards/rejected": 0.07707259804010391, + "logits/chosen": -1.9135267734527588, + "logits/rejected": -1.9282987117767334, + "logps/chosen": -29.896175384521484, + "logps/rejected": -31.5633487701416, + "loss": 0.75, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.10768184810876846, + "rewards/margins": 0.11026783287525177, + "rewards/rejected": -0.002585983369499445, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, - "logits/chosen": -1.9730017185211182, - "logits/rejected": -1.9770148992538452, - "logps/chosen": -32.84386444091797, - "logps/rejected": -31.42836570739746, - "loss": 0.4471, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.2868002951145172, - "rewards/margins": 0.22633206844329834, - "rewards/rejected": 0.06046823784708977, + "logits/chosen": -1.9711973667144775, + "logits/rejected": -1.9751732349395752, + "logps/chosen": -33.15174102783203, + "logps/rejected": -31.605077743530273, + "loss": 0.7191, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1328631192445755, + "rewards/margins": 0.1607515811920166, + "rewards/rejected": -0.027888456359505653, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, - "logits/chosen": -1.9717906713485718, - "logits/rejected": -1.950059175491333, - "logps/chosen": -33.563629150390625, - "logps/rejected": -34.86870193481445, - "loss": 0.4421, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.2950619161128998, - "rewards/margins": 0.25401392579078674, - "rewards/rejected": 0.041047997772693634, + "logits/chosen": -1.969957709312439, + "logits/rejected": -1.9480478763580322, + "logps/chosen": -33.9122200012207, + "logps/rejected": -35.02121353149414, + "loss": 0.6955, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12076646089553833, + "rewards/margins": 0.15597540140151978, + "rewards/rejected": -0.03520893678069115, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, - "logits/chosen": -2.0124945640563965, - "logits/rejected": -2.0092015266418457, - "logps/chosen": -32.43610382080078, - "logps/rejected": -35.959327697753906, - "loss": 0.4732, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.24448053538799286, - "rewards/margins": 0.11168196052312851, - "rewards/rejected": 0.13279855251312256, + "logits/chosen": -2.0103070735931396, + "logits/rejected": -2.006990909576416, + "logps/chosen": -32.72673797607422, + "logps/rejected": -36.23841094970703, + "loss": 0.8178, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.09916610270738602, + "rewards/margins": 0.10590960830450058, + "rewards/rejected": -0.006743511650711298, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, - "logits/chosen": -1.8797550201416016, - "logits/rejected": -1.8773235082626343, - "logps/chosen": -33.68698501586914, - "logps/rejected": -35.27508544921875, - "loss": 0.4682, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.25070956349372864, - "rewards/margins": 0.13594172894954681, - "rewards/rejected": 0.11476783454418182, + "logits/chosen": -1.8776795864105225, + "logits/rejected": -1.875245451927185, + "logps/chosen": -34.003971099853516, + "logps/rejected": -35.510765075683594, + "loss": 0.889, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.09221391379833221, + "rewards/margins": 0.09528535604476929, + "rewards/rejected": -0.0030714483000338078, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, - "logits/chosen": -1.8655035495758057, - "logits/rejected": -1.8629907369613647, - "logps/chosen": -33.91318893432617, - "logps/rejected": -31.576608657836914, - "loss": 0.4665, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.24220648407936096, - "rewards/margins": 0.1448442041873932, - "rewards/rejected": 0.09736229479312897, + "logits/chosen": -1.863521933555603, + "logits/rejected": -1.8610206842422485, + "logps/chosen": -34.20132827758789, + "logps/rejected": -31.76943016052246, + "loss": 0.8268, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.09813599288463593, + "rewards/margins": 0.09718601405620575, + "rewards/rejected": 0.000949984765611589, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, - "logits/chosen": -1.9690582752227783, - "logits/rejected": -1.9586395025253296, - "logps/chosen": -34.75696563720703, - "logps/rejected": -31.642765045166016, - "loss": 0.4519, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.2998362183570862, - "rewards/margins": 0.20072226226329803, - "rewards/rejected": 0.09911395609378815, + "logits/chosen": -1.9676616191864014, + "logits/rejected": -1.957082748413086, + "logps/chosen": -35.020606994628906, + "logps/rejected": -31.831247329711914, + "loss": 0.6669, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.168013796210289, + "rewards/margins": 0.16314153373241425, + "rewards/rejected": 0.00487226527184248, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, - "logits/chosen": -2.0643129348754883, - "logits/rejected": -2.049489736557007, - "logps/chosen": -30.391122817993164, - "logps/rejected": -32.35709762573242, - "loss": 0.4722, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.2665007710456848, - "rewards/margins": 0.11913253366947174, - "rewards/rejected": 0.14736825227737427, + "logits/chosen": -2.0636610984802246, + "logits/rejected": -2.0486764907836914, + "logps/chosen": -30.683029174804688, + "logps/rejected": -32.61827850341797, + "loss": 0.894, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.12054909765720367, + "rewards/margins": 0.10377003997564316, + "rewards/rejected": 0.016779040917754173, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, - "logits/chosen": -1.9364421367645264, - "logits/rejected": -1.933985710144043, - "logps/chosen": -32.0788688659668, - "logps/rejected": -30.654926300048828, - "loss": 0.4202, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.42412155866622925, - "rewards/margins": 0.34317898750305176, - "rewards/rejected": 0.08094261586666107, + "logits/chosen": -1.9357779026031494, + "logits/rejected": -1.9332023859024048, + "logps/chosen": -32.54056930541992, + "logps/rejected": -30.850332260131836, + "loss": 0.6105, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.1932690441608429, + "rewards/margins": 0.21003055572509766, + "rewards/rejected": -0.01676151715219021, "step": 300 }, { "epoch": 0.78, - "eval_logits/chosen": -2.233308792114258, - "eval_logits/rejected": -2.2285048961639404, - "eval_logps/chosen": -33.75082015991211, - "eval_logps/rejected": -37.27885055541992, - "eval_loss": 0.4944371283054352, - "eval_rewards/accuracies": 0.5307309031486511, - "eval_rewards/chosen": 0.14186599850654602, - "eval_rewards/margins": 0.022981125861406326, - "eval_rewards/rejected": 0.1188848614692688, - "eval_runtime": 145.5333, - "eval_samples_per_second": 2.357, + "eval_logits/chosen": -2.233783006668091, + "eval_logits/rejected": -2.228926658630371, + "eval_logps/chosen": -34.01414108276367, + "eval_logps/rejected": -37.53648376464844, + "eval_loss": 1.0314662456512451, + "eval_rewards/accuracies": 0.560215950012207, + "eval_rewards/chosen": 0.010205330327153206, + "eval_rewards/margins": 0.020134516060352325, + "eval_rewards/rejected": -0.009929186664521694, + "eval_runtime": 145.6316, + "eval_samples_per_second": 2.355, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, - "grad_norm": 4.5625, - "learning_rate": 4.84533120650964e-06, - "logits/chosen": -2.0702390670776367, - "logits/rejected": -2.0575358867645264, - "logps/chosen": -31.772899627685547, - "logps/rejected": -32.60271453857422, - "loss": 0.4189, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.3777456283569336, - "rewards/margins": 0.35857290029525757, - "rewards/rejected": 0.019172677770256996, + "learning_rate": 5.576113578589035e-07, + "logits/chosen": -1.9184128046035767, + "logits/rejected": -1.9151279926300049, + "logps/chosen": -31.325061798095703, + "logps/rejected": -33.77220916748047, + "loss": 0.7283, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13914386928081512, + "rewards/margins": 0.15417365729808807, + "rewards/rejected": -0.01502978801727295, "step": 310 }, { "epoch": 0.83, - "grad_norm": 5.375, - "learning_rate": 4.825108134172131e-06, - "logits/chosen": -1.9816529750823975, - "logits/rejected": -1.9730370044708252, - "logps/chosen": -31.39089012145996, - "logps/rejected": -30.13273048400879, - "loss": 0.4044, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.48204803466796875, - "rewards/margins": 0.42771729826927185, - "rewards/rejected": 0.054330699145793915, + "learning_rate": 4.229036944380913e-07, + "logits/chosen": -1.968726396560669, + "logits/rejected": -1.9564485549926758, + "logps/chosen": -34.366207122802734, + "logps/rejected": -33.61689376831055, + "loss": 0.6804, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.11493507772684097, + "rewards/margins": 0.1539594680070877, + "rewards/rejected": -0.03902440145611763, "step": 320 }, { "epoch": 0.86, - "grad_norm": 6.53125, - "learning_rate": 4.80369052967602e-06, - "logits/chosen": -1.9196773767471313, - "logits/rejected": -1.9316660165786743, - "logps/chosen": -29.447494506835938, - "logps/rejected": -33.378883361816406, - "loss": 0.3877, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.5158854722976685, - "rewards/margins": 0.50420081615448, - "rewards/rejected": 0.011684572324156761, + "learning_rate": 3.053082288996112e-07, + "logits/chosen": -2.0041136741638184, + "logits/rejected": -2.002657651901245, + "logps/chosen": -33.25464630126953, + "logps/rejected": -32.49077606201172, + "loss": 0.7724, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11557211726903915, + "rewards/margins": 0.12116815894842148, + "rewards/rejected": -0.005596047732979059, "step": 330 }, { "epoch": 0.88, - "grad_norm": 6.875, - "learning_rate": 4.781089396387968e-06, - "logits/chosen": -1.8812570571899414, - "logits/rejected": -1.8721818923950195, - "logps/chosen": -33.576210021972656, - "logps/rejected": -35.886775970458984, - "loss": 0.379, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.5613591074943542, - "rewards/margins": 0.5763841867446899, - "rewards/rejected": -0.015025007538497448, + "learning_rate": 2.0579377374915805e-07, + "logits/chosen": -2.0917208194732666, + "logits/rejected": -2.0759525299072266, + "logps/chosen": -33.83209991455078, + "logps/rejected": -33.08992004394531, + "loss": 0.7606, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17364642024040222, + "rewards/margins": 0.15958845615386963, + "rewards/rejected": 0.014057991094887257, "step": 340 }, { "epoch": 0.91, - "grad_norm": 4.3125, - "learning_rate": 4.757316345716554e-06, - "logits/chosen": -1.934380292892456, - "logits/rejected": -1.9350519180297852, - "logps/chosen": -33.19637680053711, - "logps/rejected": -33.7524299621582, - "loss": 0.3788, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.6044507026672363, - "rewards/margins": 0.5561784505844116, - "rewards/rejected": 0.04827232286334038, + "learning_rate": 1.2518018074041684e-07, + "logits/chosen": -1.9635206460952759, + "logits/rejected": -1.9625988006591797, + "logps/chosen": -32.91681671142578, + "logps/rejected": -32.46485137939453, + "loss": 0.7913, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.18264031410217285, + "rewards/margins": 0.182492196559906, + "rewards/rejected": 0.00014809667482040823, "step": 350 }, { "epoch": 0.94, - "grad_norm": 4.90625, - "learning_rate": 4.73238359114687e-06, - "logits/chosen": -2.060429334640503, - "logits/rejected": -2.0666050910949707, - "logps/chosen": -30.6839656829834, - "logps/rejected": -32.61330795288086, - "loss": 0.4096, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.44464096426963806, - "rewards/margins": 0.41715437173843384, - "rewards/rejected": 0.027486661449074745, + "learning_rate": 6.41315865106129e-08, + "logits/chosen": -1.9189882278442383, + "logits/rejected": -1.9293220043182373, + "logps/chosen": -31.9406681060791, + "logps/rejected": -35.306640625, + "loss": 0.7574, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1320098638534546, + "rewards/margins": 0.1409510374069214, + "rewards/rejected": -0.008941170759499073, "step": 360 }, { "epoch": 0.96, - "grad_norm": 5.75, - "learning_rate": 4.706303941965804e-06, - "logits/chosen": -1.9897394180297852, - "logits/rejected": -1.9893505573272705, - "logps/chosen": -32.34087371826172, - "logps/rejected": -35.93071746826172, - "loss": 0.3914, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.5520325899124146, - "rewards/margins": 0.49250274896621704, - "rewards/rejected": 0.0595298707485199, + "learning_rate": 2.3150941078050325e-08, + "logits/chosen": -2.0582680702209473, + "logits/rejected": -2.051753520965576, + "logps/chosen": -33.39839553833008, + "logps/rejected": -29.18343734741211, + "loss": 0.7663, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.12678876519203186, + "rewards/margins": 0.1052960604429245, + "rewards/rejected": 0.02149270847439766, "step": 370 }, { "epoch": 0.99, - "grad_norm": 4.46875, - "learning_rate": 4.679090796681225e-06, - "logits/chosen": -2.0219178199768066, - "logits/rejected": -2.0173287391662598, - "logps/chosen": -29.65066146850586, - "logps/rejected": -29.175548553466797, - "loss": 0.3897, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.5299680829048157, - "rewards/margins": 0.5107963681221008, - "rewards/rejected": 0.019171705469489098, + "learning_rate": 2.575864278703266e-09, + "logits/chosen": -1.9175922870635986, + "logits/rejected": -1.9197555780410767, + "logps/chosen": -33.878089904785156, + "logps/rejected": -30.871530532836914, + "loss": 0.6999, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.18047982454299927, + "rewards/margins": 0.17993128299713135, + "rewards/rejected": 0.0005485474830493331, "step": 380 }, { - "epoch": 1.01, - "grad_norm": 6.0625, - "learning_rate": 4.650758136138454e-06, - "logits/chosen": -1.7920303344726562, - "logits/rejected": -1.7984501123428345, - "logps/chosen": -31.053028106689453, - "logps/rejected": -36.2508430480957, - "loss": 0.347, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.6663440465927124, - "rewards/margins": 0.7326194643974304, - "rewards/rejected": -0.06627537310123444, - "step": 390 - }, - { - "epoch": 1.04, - "grad_norm": 4.625, - "learning_rate": 4.621320516337559e-06, - "logits/chosen": -1.9457969665527344, - "logits/rejected": -1.939706563949585, - "logps/chosen": -32.465946197509766, - "logps/rejected": -32.288917541503906, - "loss": 0.3592, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.6833757162094116, - "rewards/margins": 0.6733940243721008, - "rewards/rejected": 0.009981664828956127, - "step": 400 - }, - { - "epoch": 1.04, - "eval_logits/chosen": -2.2072908878326416, - "eval_logits/rejected": -2.202495574951172, - "eval_logps/chosen": -33.64178466796875, - "eval_logps/rejected": -37.204627990722656, - "eval_loss": 0.4900670051574707, - "eval_rewards/accuracies": 0.5768272280693054, - "eval_rewards/chosen": 0.19638201594352722, - "eval_rewards/margins": 0.04038503021001816, - "eval_rewards/rejected": 0.15599699318408966, - "eval_runtime": 146.2517, - "eval_samples_per_second": 2.345, - "eval_steps_per_second": 0.294, - "step": 400 - }, - { - "epoch": 1.06, - "grad_norm": 4.40625, - "learning_rate": 4.590793060955158e-06, - "logits/chosen": -1.9434096813201904, - "logits/rejected": -1.950661063194275, - "logps/chosen": -28.069305419921875, - "logps/rejected": -29.235126495361328, - "loss": 0.3783, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.5255349278450012, - "rewards/margins": 0.5939846038818359, - "rewards/rejected": -0.0684497132897377, - "step": 410 - }, - { - "epoch": 1.09, - "grad_norm": 4.65625, - "learning_rate": 4.559191453574582e-06, - "logits/chosen": -1.9646800756454468, - "logits/rejected": -1.9637447595596313, - "logps/chosen": -32.93536376953125, - "logps/rejected": -30.744121551513672, - "loss": 0.3823, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.6576722264289856, - "rewards/margins": 0.558210551738739, - "rewards/rejected": 0.0994616374373436, - "step": 420 - }, - { - "epoch": 1.12, - "grad_norm": 4.78125, - "learning_rate": 4.52653192962838e-06, - "logits/chosen": -1.9575519561767578, - "logits/rejected": -1.940502405166626, - "logps/chosen": -29.87259292602539, - "logps/rejected": -33.13463592529297, - "loss": 0.3661, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.552306056022644, - "rewards/margins": 0.6564798951148987, - "rewards/rejected": -0.10417388379573822, - "step": 430 - }, - { - "epoch": 1.14, - "grad_norm": 5.0625, - "learning_rate": 4.492831268057307e-06, - "logits/chosen": -1.9878675937652588, - "logits/rejected": -1.9897832870483398, - "logps/chosen": -35.01631164550781, - "logps/rejected": -34.94855499267578, - "loss": 0.3268, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.7980268597602844, - "rewards/margins": 0.8423188924789429, - "rewards/rejected": -0.044292036443948746, - "step": 440 - }, - { - "epoch": 1.17, - "grad_norm": 5.0625, - "learning_rate": 4.458106782690094e-06, - "logits/chosen": -2.063129425048828, - "logits/rejected": -2.063042163848877, - "logps/chosen": -31.267589569091797, - "logps/rejected": -33.55221176147461, - "loss": 0.3643, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.6828799843788147, - "rewards/margins": 0.6789279580116272, - "rewards/rejected": 0.003952032420784235, - "step": 450 - }, - { - "epoch": 1.19, - "grad_norm": 4.375, - "learning_rate": 4.422376313348405e-06, - "logits/chosen": -2.006115674972534, - "logits/rejected": -1.9987046718597412, - "logps/chosen": -30.81891441345215, - "logps/rejected": -36.21019744873047, - "loss": 0.3276, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.7935220003128052, - "rewards/margins": 0.8716004490852356, - "rewards/rejected": -0.07807846367359161, - "step": 460 - }, - { - "epoch": 1.22, - "grad_norm": 5.0625, - "learning_rate": 4.3856582166815696e-06, - "logits/chosen": -1.9092477560043335, - "logits/rejected": -1.9058347940444946, - "logps/chosen": -32.33415222167969, - "logps/rejected": -32.964149475097656, - "loss": 0.3364, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.7990728616714478, - "rewards/margins": 0.819208025932312, - "rewards/rejected": -0.020135188475251198, - "step": 470 - }, - { - "epoch": 1.25, - "grad_norm": 4.59375, - "learning_rate": 4.347971356735789e-06, - "logits/chosen": -2.0355165004730225, - "logits/rejected": -2.028594493865967, - "logps/chosen": -29.620845794677734, - "logps/rejected": -32.04124450683594, - "loss": 0.3704, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.6038450002670288, - "rewards/margins": 0.6277218461036682, - "rewards/rejected": -0.023876825347542763, - "step": 480 - }, - { - "epoch": 1.27, - "grad_norm": 4.53125, - "learning_rate": 4.309335095262675e-06, - "logits/chosen": -1.9820530414581299, - "logits/rejected": -1.9836938381195068, - "logps/chosen": -33.6979866027832, - "logps/rejected": -33.642757415771484, - "loss": 0.3253, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.827602744102478, - "rewards/margins": 0.8282254338264465, - "rewards/rejected": -0.0006226152181625366, - "step": 490 - }, - { - "epoch": 1.3, - "grad_norm": 4.96875, - "learning_rate": 4.269769281772082e-06, - "logits/chosen": -1.870469093322754, - "logits/rejected": -1.8681905269622803, - "logps/chosen": -31.521778106689453, - "logps/rejected": -36.603065490722656, - "loss": 0.3252, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.8252545595169067, - "rewards/margins": 0.8946241140365601, - "rewards/rejected": -0.06936953216791153, - "step": 500 - }, - { - "epoch": 1.3, - "eval_logits/chosen": -2.2057487964630127, - "eval_logits/rejected": -2.2009708881378174, - "eval_logps/chosen": -33.599735260009766, - "eval_logps/rejected": -37.15663146972656, - "eval_loss": 0.4906069040298462, - "eval_rewards/accuracies": 0.5161960124969482, - "eval_rewards/chosen": 0.21740689873695374, - "eval_rewards/margins": 0.037414077669382095, - "eval_rewards/rejected": 0.17999280989170074, - "eval_runtime": 145.9845, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 500 - }, - { - "epoch": 1.32, - "grad_norm": 3.8125, - "learning_rate": 4.22929424333435e-06, - "logits/chosen": -1.9746795892715454, - "logits/rejected": -1.979432463645935, - "logps/chosen": -31.675983428955078, - "logps/rejected": -31.606616973876953, - "loss": 0.3236, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.8906850814819336, - "rewards/margins": 0.870940089225769, - "rewards/rejected": 0.019745180383324623, - "step": 510 - }, - { - "epoch": 1.35, - "grad_norm": 4.21875, - "learning_rate": 4.1879307741372085e-06, - "logits/chosen": -2.002662181854248, - "logits/rejected": -2.0136265754699707, - "logps/chosen": -29.71444320678711, - "logps/rejected": -31.56441879272461, - "loss": 0.3246, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.8379856944084167, - "rewards/margins": 0.8663502931594849, - "rewards/rejected": -0.028364697471261024, - "step": 520 - }, - { - "epoch": 1.38, - "grad_norm": 3.53125, - "learning_rate": 4.145700124802693e-06, - "logits/chosen": -1.9277675151824951, - "logits/rejected": -1.9244375228881836, - "logps/chosen": -30.94647216796875, - "logps/rejected": -32.4344367980957, - "loss": 0.3407, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.7370210886001587, - "rewards/margins": 0.7728511095046997, - "rewards/rejected": -0.03583000972867012, - "step": 530 - }, - { - "epoch": 1.4, - "grad_norm": 4.6875, - "learning_rate": 4.102623991469562e-06, - "logits/chosen": -1.792615294456482, - "logits/rejected": -1.8018049001693726, - "logps/chosen": -30.796539306640625, - "logps/rejected": -31.958538055419922, - "loss": 0.3241, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.8864250183105469, - "rewards/margins": 0.9377009272575378, - "rewards/rejected": -0.051275890320539474, - "step": 540 - }, - { - "epoch": 1.43, - "grad_norm": 4.71875, - "learning_rate": 4.058724504646834e-06, - "logits/chosen": -1.8878982067108154, - "logits/rejected": -1.881675362586975, - "logps/chosen": -31.799880981445312, - "logps/rejected": -30.896432876586914, - "loss": 0.3238, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.9542133212089539, - "rewards/margins": 0.9067564010620117, - "rewards/rejected": 0.04745698720216751, - "step": 550 - }, - { - "epoch": 1.45, - "grad_norm": 4.34375, - "learning_rate": 4.014024217844167e-06, - "logits/chosen": -1.9775705337524414, - "logits/rejected": -1.9755971431732178, - "logps/chosen": -32.72150421142578, - "logps/rejected": -31.48871421813965, - "loss": 0.3236, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.9248467683792114, - "rewards/margins": 0.9205129742622375, - "rewards/rejected": 0.00433387141674757, - "step": 560 - }, - { - "epoch": 1.48, - "grad_norm": 5.1875, - "learning_rate": 3.968546095984911e-06, - "logits/chosen": -1.8112825155258179, - "logits/rejected": -1.8090919256210327, - "logps/chosen": -31.090688705444336, - "logps/rejected": -31.00448989868164, - "loss": 0.3337, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.8528607487678528, - "rewards/margins": 0.8539535403251648, - "rewards/rejected": -0.0010927558178082108, - "step": 570 - }, - { - "epoch": 1.51, - "grad_norm": 5.4375, - "learning_rate": 3.922313503607806e-06, - "logits/chosen": -1.9450817108154297, - "logits/rejected": -1.9417240619659424, - "logps/chosen": -29.3461971282959, - "logps/rejected": -34.87482833862305, - "loss": 0.3094, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.8792248964309692, - "rewards/margins": 1.0035580396652222, - "rewards/rejected": -0.12433312833309174, - "step": 580 - }, - { - "epoch": 1.53, - "grad_norm": 4.0, - "learning_rate": 3.875350192863368e-06, - "logits/chosen": -1.882834792137146, - "logits/rejected": -1.8864123821258545, - "logps/chosen": -28.255081176757812, - "logps/rejected": -30.703868865966797, - "loss": 0.3524, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.679975688457489, - "rewards/margins": 0.7024742364883423, - "rewards/rejected": -0.022498566657304764, - "step": 590 - }, - { - "epoch": 1.56, - "grad_norm": 4.625, - "learning_rate": 3.8276802913111436e-06, - "logits/chosen": -1.922412633895874, - "logits/rejected": -1.9222551584243774, - "logps/chosen": -30.202539443969727, - "logps/rejected": -31.248510360717773, - "loss": 0.3006, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 1.0094959735870361, - "rewards/margins": 1.0372422933578491, - "rewards/rejected": -0.027746286243200302, - "step": 600 - }, - { - "epoch": 1.56, - "eval_logits/chosen": -2.1860945224761963, - "eval_logits/rejected": -2.181339979171753, - "eval_logps/chosen": -33.55764389038086, - "eval_logps/rejected": -37.189125061035156, - "eval_loss": 0.4826602339744568, - "eval_rewards/accuracies": 0.5710132718086243, - "eval_rewards/chosen": 0.23845446109771729, - "eval_rewards/margins": 0.07470697164535522, - "eval_rewards/rejected": 0.16374748945236206, - "eval_runtime": 145.9654, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 600 - }, - { - "epoch": 1.58, - "grad_norm": 3.875, - "learning_rate": 3.7793282895240927e-06, - "logits/chosen": -1.9824330806732178, - "logits/rejected": -1.983112096786499, - "logps/chosen": -32.78623580932617, - "logps/rejected": -33.11241912841797, - "loss": 0.2882, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 1.1219536066055298, - "rewards/margins": 1.1360708475112915, - "rewards/rejected": -0.014117163605988026, - "step": 610 - }, - { - "epoch": 1.61, - "grad_norm": 3.25, - "learning_rate": 3.730319028506478e-06, - "logits/chosen": -1.945784330368042, - "logits/rejected": -1.9434850215911865, - "logps/chosen": -31.238819122314453, - "logps/rejected": -32.211891174316406, - "loss": 0.3068, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.943290114402771, - "rewards/margins": 1.0142767429351807, - "rewards/rejected": -0.0709865465760231, - "step": 620 - }, - { - "epoch": 1.64, - "grad_norm": 5.34375, - "learning_rate": 3.6806776869317074e-06, - "logits/chosen": -1.9573396444320679, - "logits/rejected": -1.9484916925430298, - "logps/chosen": -30.74532127380371, - "logps/rejected": -31.107269287109375, - "loss": 0.2995, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 1.0153981447219849, - "rewards/margins": 1.1013498306274414, - "rewards/rejected": -0.08595152944326401, - "step": 630 - }, - { - "epoch": 1.66, - "grad_norm": 4.8125, - "learning_rate": 3.6304297682067146e-06, - "logits/chosen": -1.958103895187378, - "logits/rejected": -1.9549020528793335, - "logps/chosen": -30.454784393310547, - "logps/rejected": -32.5543098449707, - "loss": 0.3143, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.8718185424804688, - "rewards/margins": 0.9392663836479187, - "rewards/rejected": -0.06744784861803055, - "step": 640 - }, - { - "epoch": 1.69, - "grad_norm": 4.4375, - "learning_rate": 3.579601087369492e-06, - "logits/chosen": -1.9650936126708984, - "logits/rejected": -1.9674503803253174, - "logps/chosen": -31.781970977783203, - "logps/rejected": -33.79515838623047, - "loss": 0.2994, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.9358679056167603, - "rewards/margins": 0.9917511940002441, - "rewards/rejected": -0.055883266031742096, - "step": 650 - }, - { - "epoch": 1.71, - "grad_norm": 6.375, - "learning_rate": 3.5282177578265295e-06, - "logits/chosen": -1.8790709972381592, - "logits/rejected": -1.8796155452728271, - "logps/chosen": -31.92360496520996, - "logps/rejected": -31.67936134338379, - "loss": 0.2894, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.011613130569458, - "rewards/margins": 1.0476658344268799, - "rewards/rejected": -0.03605276346206665, - "step": 660 - }, - { - "epoch": 1.74, - "grad_norm": 5.375, - "learning_rate": 3.476306177936961e-06, - "logits/chosen": -1.9204801321029663, - "logits/rejected": -1.9109817743301392, - "logps/chosen": -32.0572624206543, - "logps/rejected": -32.53544235229492, - "loss": 0.2934, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.972008228302002, - "rewards/margins": 1.0858339071273804, - "rewards/rejected": -0.11382582038640976, - "step": 670 - }, - { - "epoch": 1.77, - "grad_norm": 3.84375, - "learning_rate": 3.423893017450324e-06, - "logits/chosen": -1.8181785345077515, - "logits/rejected": -1.815189003944397, - "logps/chosen": -29.360342025756836, - "logps/rejected": -34.40721130371094, - "loss": 0.2925, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.0503089427947998, - "rewards/margins": 1.0946245193481445, - "rewards/rejected": -0.04431561380624771, - "step": 680 - }, - { - "epoch": 1.79, - "grad_norm": 3.453125, - "learning_rate": 3.3710052038048794e-06, - "logits/chosen": -1.8782176971435547, - "logits/rejected": -1.877497673034668, - "logps/chosen": -32.936119079589844, - "logps/rejected": -35.511497497558594, - "loss": 0.2671, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.1670840978622437, - "rewards/margins": 1.2459213733673096, - "rewards/rejected": -0.07883722335100174, - "step": 690 - }, - { - "epoch": 1.82, - "grad_norm": 3.78125, - "learning_rate": 3.3176699082935546e-06, - "logits/chosen": -1.8524360656738281, - "logits/rejected": -1.8554551601409912, - "logps/chosen": -30.489444732666016, - "logps/rejected": -35.637123107910156, - "loss": 0.3054, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.0567033290863037, - "rewards/margins": 1.054430365562439, - "rewards/rejected": 0.002273067831993103, - "step": 700 - }, - { - "epoch": 1.82, - "eval_logits/chosen": -2.1734440326690674, - "eval_logits/rejected": -2.1687140464782715, - "eval_logps/chosen": -33.580875396728516, - "eval_logps/rejected": -37.22862243652344, - "eval_loss": 0.4810311496257782, - "eval_rewards/accuracies": 0.5975913405418396, - "eval_rewards/chosen": 0.22683614492416382, - "eval_rewards/margins": 0.08283717185258865, - "eval_rewards/rejected": 0.14399898052215576, - "eval_runtime": 145.9418, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 700 - }, - { - "epoch": 1.84, - "grad_norm": 4.5625, - "learning_rate": 3.2639145321045933e-06, - "logits/chosen": -1.9587640762329102, - "logits/rejected": -1.9615100622177124, - "logps/chosen": -32.75444793701172, - "logps/rejected": -34.373538970947266, - "loss": 0.2982, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 1.0705703496932983, - "rewards/margins": 1.1222612857818604, - "rewards/rejected": -0.05169079825282097, - "step": 710 - }, - { - "epoch": 1.87, - "grad_norm": 5.9375, - "learning_rate": 3.2097666922441107e-06, - "logits/chosen": -1.8133538961410522, - "logits/rejected": -1.807440996170044, - "logps/chosen": -32.563743591308594, - "logps/rejected": -32.37199783325195, - "loss": 0.283, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.1347416639328003, - "rewards/margins": 1.144819974899292, - "rewards/rejected": -0.010078263469040394, - "step": 720 - }, - { - "epoch": 1.9, - "grad_norm": 3.28125, - "learning_rate": 3.1552542073477554e-06, - "logits/chosen": -1.9840853214263916, - "logits/rejected": -1.981187105178833, - "logps/chosen": -28.622554779052734, - "logps/rejected": -31.706497192382812, - "loss": 0.2914, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.0152161121368408, - "rewards/margins": 1.1443650722503662, - "rewards/rejected": -0.12914907932281494, - "step": 730 - }, - { - "epoch": 1.92, - "grad_norm": 3.4375, - "learning_rate": 3.100405083388799e-06, - "logits/chosen": -1.8249248266220093, - "logits/rejected": -1.824972152709961, - "logps/chosen": -31.1278076171875, - "logps/rejected": -37.60888671875, - "loss": 0.2772, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 1.1676826477050781, - "rewards/margins": 1.1582227945327759, - "rewards/rejected": 0.009459850378334522, - "step": 740 - }, - { - "epoch": 1.95, - "grad_norm": 3.125, - "learning_rate": 3.0452474992899645e-06, - "logits/chosen": -1.7074337005615234, - "logits/rejected": -1.7126433849334717, - "logps/chosen": -34.51697540283203, - "logps/rejected": -34.013771057128906, - "loss": 0.2756, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 1.2077430486679077, - "rewards/margins": 1.2284619808197021, - "rewards/rejected": -0.02071886695921421, - "step": 750 - }, - { - "epoch": 1.97, - "grad_norm": 5.3125, - "learning_rate": 2.989809792446417e-06, - "logits/chosen": -1.9103246927261353, - "logits/rejected": -1.9118106365203857, - "logps/chosen": -30.58319091796875, - "logps/rejected": -32.86848449707031, - "loss": 0.2918, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.011243462562561, - "rewards/margins": 1.055877447128296, - "rewards/rejected": -0.04463387280702591, - "step": 760 - }, - { - "epoch": 2.0, - "grad_norm": 4.3125, - "learning_rate": 2.9341204441673267e-06, - "logits/chosen": -1.8809820413589478, - "logits/rejected": -1.8801352977752686, - "logps/chosen": -30.012935638427734, - "logps/rejected": -34.877315521240234, - "loss": 0.3001, - "rewards/accuracies": 0.85833340883255, - "rewards/chosen": 1.0813367366790771, - "rewards/margins": 1.0059177875518799, - "rewards/rejected": 0.07541900128126144, - "step": 770 - }, - { - "epoch": 2.03, - "grad_norm": 3.40625, - "learning_rate": 2.878208065043501e-06, - "logits/chosen": -1.8827836513519287, - "logits/rejected": -1.8821783065795898, - "logps/chosen": -32.30527877807617, - "logps/rejected": -31.889827728271484, - "loss": 0.2485, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.1715916395187378, - "rewards/margins": 1.4067461490631104, - "rewards/rejected": -0.2351544350385666, - "step": 780 - }, - { - "epoch": 2.05, - "grad_norm": 3.53125, - "learning_rate": 2.8221013802485974e-06, - "logits/chosen": -1.913342833518982, - "logits/rejected": -1.9118541479110718, - "logps/chosen": -27.369583129882812, - "logps/rejected": -33.24831008911133, - "loss": 0.2482, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.0967981815338135, - "rewards/margins": 1.4027960300445557, - "rewards/rejected": -0.3059977889060974, - "step": 790 - }, - { - "epoch": 2.08, - "grad_norm": 3.4375, - "learning_rate": 2.76582921478147e-06, - "logits/chosen": -1.9648157358169556, - "logits/rejected": -1.961501121520996, - "logps/chosen": -29.89137840270996, - "logps/rejected": -34.684932708740234, - "loss": 0.2314, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.2679157257080078, - "rewards/margins": 1.5108659267425537, - "rewards/rejected": -0.2429502010345459, - "step": 800 - }, - { - "epoch": 2.08, - "eval_logits/chosen": -2.160287857055664, - "eval_logits/rejected": -2.1555850505828857, - "eval_logps/chosen": -33.642948150634766, - "eval_logps/rejected": -37.30831527709961, - "eval_loss": 0.4798665940761566, - "eval_rewards/accuracies": 0.574335515499115, - "eval_rewards/chosen": 0.19579996168613434, - "eval_rewards/margins": 0.09164752811193466, - "eval_rewards/rejected": 0.10415242612361908, - "eval_runtime": 145.9536, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 800 - }, - { - "epoch": 2.1, - "grad_norm": 2.515625, - "learning_rate": 2.7094204786572254e-06, - "logits/chosen": -1.7954742908477783, - "logits/rejected": -1.7877804040908813, - "logps/chosen": -31.379587173461914, - "logps/rejected": -35.11090087890625, - "loss": 0.1986, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.5383274555206299, - "rewards/margins": 1.813719391822815, - "rewards/rejected": -0.2753918766975403, - "step": 810 - }, - { - "epoch": 2.13, - "grad_norm": 3.734375, - "learning_rate": 2.6529041520546072e-06, - "logits/chosen": -1.871080994606018, - "logits/rejected": -1.881505012512207, - "logps/chosen": -32.85948944091797, - "logps/rejected": -32.76939010620117, - "loss": 0.2389, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.4181005954742432, - "rewards/margins": 1.5870771408081055, - "rewards/rejected": -0.16897639632225037, - "step": 820 - }, - { - "epoch": 2.16, - "grad_norm": 3.421875, - "learning_rate": 2.5963092704273302e-06, - "logits/chosen": -1.9215189218521118, - "logits/rejected": -1.9260494709014893, - "logps/chosen": -32.41266632080078, - "logps/rejected": -29.630962371826172, - "loss": 0.238, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.245057225227356, - "rewards/margins": 1.4264161586761475, - "rewards/rejected": -0.1813589632511139, - "step": 830 - }, - { - "epoch": 2.18, - "grad_norm": 3.34375, - "learning_rate": 2.53966490958702e-06, - "logits/chosen": -1.919118881225586, - "logits/rejected": -1.9269850254058838, - "logps/chosen": -31.995285034179688, - "logps/rejected": -30.464197158813477, - "loss": 0.2153, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3963725566864014, - "rewards/margins": 1.704620122909546, - "rewards/rejected": -0.30824753642082214, - "step": 840 - }, - { - "epoch": 2.21, - "grad_norm": 4.0625, - "learning_rate": 2.4830001707654135e-06, - "logits/chosen": -1.8508501052856445, - "logits/rejected": -1.8414716720581055, - "logps/chosen": -29.227252960205078, - "logps/rejected": -32.39653015136719, - "loss": 0.2335, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2829720973968506, - "rewards/margins": 1.4743337631225586, - "rewards/rejected": -0.19136162102222443, - "step": 850 - }, - { - "epoch": 2.23, - "grad_norm": 3.140625, - "learning_rate": 2.4263441656635054e-06, - "logits/chosen": -1.9888044595718384, - "logits/rejected": -1.9792511463165283, - "logps/chosen": -23.88228988647461, - "logps/rejected": -30.420047760009766, - "loss": 0.2477, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.1708904504776, - "rewards/margins": 1.4447450637817383, - "rewards/rejected": -0.273854523897171, - "step": 860 - }, - { - "epoch": 2.26, - "grad_norm": 3.203125, - "learning_rate": 2.3697260014953107e-06, - "logits/chosen": -1.8394625186920166, - "logits/rejected": -1.8409032821655273, - "logps/chosen": -31.589025497436523, - "logps/rejected": -30.483844757080078, - "loss": 0.2243, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3719897270202637, - "rewards/margins": 1.6226600408554077, - "rewards/rejected": -0.25067034363746643, - "step": 870 - }, - { - "epoch": 2.29, - "grad_norm": 2.984375, - "learning_rate": 2.3131747660339396e-06, - "logits/chosen": -1.8513740301132202, - "logits/rejected": -1.8522183895111084, - "logps/chosen": -29.875484466552734, - "logps/rejected": -33.62201690673828, - "loss": 0.2371, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.2568821907043457, - "rewards/margins": 1.5424292087554932, - "rewards/rejected": -0.2855471074581146, - "step": 880 - }, - { - "epoch": 2.31, - "grad_norm": 3.15625, - "learning_rate": 2.256719512667651e-06, - "logits/chosen": -1.7520290613174438, - "logits/rejected": -1.7505395412445068, - "logps/chosen": -32.88391876220703, - "logps/rejected": -36.737144470214844, - "loss": 0.2115, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3905012607574463, - "rewards/margins": 1.8204532861709595, - "rewards/rejected": -0.4299522042274475, - "step": 890 - }, - { - "epoch": 2.34, - "grad_norm": 3.296875, - "learning_rate": 2.2003892454735786e-06, - "logits/chosen": -1.9031442403793335, - "logits/rejected": -1.896388292312622, - "logps/chosen": -29.209808349609375, - "logps/rejected": -33.324440002441406, - "loss": 0.2187, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3964145183563232, - "rewards/margins": 1.679600477218628, - "rewards/rejected": -0.2831859588623047, - "step": 900 - }, - { - "epoch": 2.34, - "eval_logits/chosen": -2.1536905765533447, - "eval_logits/rejected": -2.1489667892456055, - "eval_logps/chosen": -33.63736343383789, - "eval_logps/rejected": -37.3044319152832, - "eval_loss": 0.4798855781555176, - "eval_rewards/accuracies": 0.5859634280204773, - "eval_rewards/chosen": 0.19859382510185242, - "eval_rewards/margins": 0.09250029176473618, - "eval_rewards/rejected": 0.10609354078769684, - "eval_runtime": 145.9374, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 900 - }, - { - "epoch": 2.36, - "grad_norm": 3.34375, - "learning_rate": 2.1442129043167877e-06, - "logits/chosen": -1.9416828155517578, - "logits/rejected": -1.9371055364608765, - "logps/chosen": -30.894506454467773, - "logps/rejected": -35.08334732055664, - "loss": 0.2161, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3315870761871338, - "rewards/margins": 1.6089489459991455, - "rewards/rejected": -0.2773619294166565, - "step": 910 - }, - { - "epoch": 2.39, - "grad_norm": 3.09375, - "learning_rate": 2.088219349982323e-06, - "logits/chosen": -1.8614780902862549, - "logits/rejected": -1.8665939569473267, - "logps/chosen": -32.37309646606445, - "logps/rejected": -32.93268966674805, - "loss": 0.2142, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.4877432584762573, - "rewards/margins": 1.7041339874267578, - "rewards/rejected": -0.21639057993888855, - "step": 920 - }, - { - "epoch": 2.42, - "grad_norm": 3.828125, - "learning_rate": 2.0324373493478803e-06, - "logits/chosen": -1.9591439962387085, - "logits/rejected": -1.9500499963760376, - "logps/chosen": -29.61981773376465, - "logps/rejected": -34.86129379272461, - "loss": 0.2218, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3325560092926025, - "rewards/margins": 1.541804552078247, - "rewards/rejected": -0.20924846827983856, - "step": 930 - }, - { - "epoch": 2.44, - "grad_norm": 3.4375, - "learning_rate": 1.976895560604729e-06, - "logits/chosen": -1.8852688074111938, - "logits/rejected": -1.8821607828140259, - "logps/chosen": -28.99648666381836, - "logps/rejected": -32.75856018066406, - "loss": 0.2581, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.145984411239624, - "rewards/margins": 1.2925770282745361, - "rewards/rejected": -0.1465924233198166, - "step": 940 - }, - { - "epoch": 2.47, - "grad_norm": 3.265625, - "learning_rate": 1.921622518534466e-06, - "logits/chosen": -1.8257964849472046, - "logits/rejected": -1.8332417011260986, - "logps/chosen": -30.27571678161621, - "logps/rejected": -35.836090087890625, - "loss": 0.2579, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.2976661920547485, - "rewards/margins": 1.4407445192337036, - "rewards/rejected": -0.14307832717895508, - "step": 950 - }, - { - "epoch": 2.49, - "grad_norm": 3.75, - "learning_rate": 1.8666466198491794e-06, - "logits/chosen": -1.8357635736465454, - "logits/rejected": -1.829122543334961, - "logps/chosen": -31.0333251953125, - "logps/rejected": -37.29237747192383, - "loss": 0.2128, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.397595763206482, - "rewards/margins": 1.7443815469741821, - "rewards/rejected": -0.3467857837677002, - "step": 960 - }, - { - "epoch": 2.52, - "grad_norm": 3.90625, - "learning_rate": 1.8119961086025376e-06, - "logits/chosen": -1.836656928062439, - "logits/rejected": -1.8372418880462646, - "logps/chosen": -28.430130004882812, - "logps/rejected": -33.35955047607422, - "loss": 0.2164, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.4031370878219604, - "rewards/margins": 1.5922828912734985, - "rewards/rejected": -0.18914571404457092, - "step": 970 - }, - { - "epoch": 2.55, - "grad_norm": 3.640625, - "learning_rate": 1.7576990616793139e-06, - "logits/chosen": -1.841817855834961, - "logits/rejected": -1.8529802560806274, - "logps/chosen": -30.15691566467285, - "logps/rejected": -34.552391052246094, - "loss": 0.2162, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.4598867893218994, - "rewards/margins": 1.6704761981964111, - "rewards/rejected": -0.21058940887451172, - "step": 980 - }, - { - "epoch": 2.57, - "grad_norm": 2.796875, - "learning_rate": 1.7037833743707892e-06, - "logits/chosen": -1.9104053974151611, - "logits/rejected": -1.907997488975525, - "logps/chosen": -33.329612731933594, - "logps/rejected": -32.695777893066406, - "loss": 0.2353, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.3957197666168213, - "rewards/margins": 1.5439945459365845, - "rewards/rejected": -0.14827489852905273, - "step": 990 - }, - { - "epoch": 2.6, - "grad_norm": 3.5625, - "learning_rate": 1.6502767460434588e-06, - "logits/chosen": -1.8802947998046875, - "logits/rejected": -1.8843204975128174, - "logps/chosen": -31.801488876342773, - "logps/rejected": -34.79668426513672, - "loss": 0.2382, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.330899953842163, - "rewards/margins": 1.4862186908721924, - "rewards/rejected": -0.1553185135126114, - "step": 1000 - }, - { - "epoch": 2.6, - "eval_logits/chosen": -2.1465587615966797, - "eval_logits/rejected": -2.141857147216797, - "eval_logps/chosen": -33.65878677368164, - "eval_logps/rejected": -37.34543228149414, - "eval_loss": 0.47840210795402527, - "eval_rewards/accuracies": 0.6121262311935425, - "eval_rewards/chosen": 0.18788374960422516, - "eval_rewards/margins": 0.10228801518678665, - "eval_rewards/rejected": 0.08559573441743851, - "eval_runtime": 145.8306, - "eval_samples_per_second": 2.352, - "eval_steps_per_second": 0.295, - "step": 1000 - }, - { - "epoch": 2.62, - "grad_norm": 4.875, - "learning_rate": 1.5972066659083796e-06, - "logits/chosen": -1.890098214149475, - "logits/rejected": -1.8949025869369507, - "logps/chosen": -30.22637939453125, - "logps/rejected": -32.744972229003906, - "loss": 0.2521, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 1.2870213985443115, - "rewards/margins": 1.3988797664642334, - "rewards/rejected": -0.11185808479785919, - "step": 1010 - }, - { - "epoch": 2.65, - "grad_norm": 3.328125, - "learning_rate": 1.5446003988985041e-06, - "logits/chosen": -1.8487266302108765, - "logits/rejected": -1.851427435874939, - "logps/chosen": -27.88092613220215, - "logps/rejected": -32.288612365722656, - "loss": 0.224, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2823561429977417, - "rewards/margins": 1.5770803689956665, - "rewards/rejected": -0.29472416639328003, - "step": 1020 - }, - { - "epoch": 2.68, - "grad_norm": 2.640625, - "learning_rate": 1.4924849716612211e-06, - "logits/chosen": -1.9036891460418701, - "logits/rejected": -1.8982887268066406, - "logps/chosen": -31.47231674194336, - "logps/rejected": -33.97800827026367, - "loss": 0.2386, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2583982944488525, - "rewards/margins": 1.4884042739868164, - "rewards/rejected": -0.23000593483448029, - "step": 1030 - }, - { - "epoch": 2.7, - "grad_norm": 4.0625, - "learning_rate": 1.440887158673332e-06, - "logits/chosen": -1.837662935256958, - "logits/rejected": -1.8414955139160156, - "logps/chosen": -33.099395751953125, - "logps/rejected": -34.61898422241211, - "loss": 0.2434, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.327166199684143, - "rewards/margins": 1.4268674850463867, - "rewards/rejected": -0.09970127046108246, - "step": 1040 - }, - { - "epoch": 2.73, - "grad_norm": 3.421875, - "learning_rate": 1.3898334684855647e-06, - "logits/chosen": -1.8357412815093994, - "logits/rejected": -1.848488450050354, - "logps/chosen": -29.331628799438477, - "logps/rejected": -33.424556732177734, - "loss": 0.242, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.2656288146972656, - "rewards/margins": 1.4678878784179688, - "rewards/rejected": -0.2022588551044464, - "step": 1050 - }, - { - "epoch": 2.75, - "grad_norm": 4.0, - "learning_rate": 1.3393501301037245e-06, - "logits/chosen": -1.8252437114715576, - "logits/rejected": -1.8189178705215454, - "logps/chosen": -29.45071792602539, - "logps/rejected": -33.71580505371094, - "loss": 0.2292, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3543059825897217, - "rewards/margins": 1.5532277822494507, - "rewards/rejected": -0.19892188906669617, - "step": 1060 - }, - { - "epoch": 2.78, - "grad_norm": 3.765625, - "learning_rate": 1.2894630795134454e-06, - "logits/chosen": -1.9500625133514404, - "logits/rejected": -1.9508724212646484, - "logps/chosen": -30.900920867919922, - "logps/rejected": -33.19152069091797, - "loss": 0.2268, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3230774402618408, - "rewards/margins": 1.4964913129806519, - "rewards/rejected": -0.17341403663158417, - "step": 1070 - }, - { - "epoch": 2.81, - "grad_norm": 4.25, - "learning_rate": 1.2401979463554984e-06, - "logits/chosen": -1.8827415704727173, - "logits/rejected": -1.8816791772842407, - "logps/chosen": -31.432628631591797, - "logps/rejected": -33.33588409423828, - "loss": 0.2508, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.3286012411117554, - "rewards/margins": 1.4006279706954956, - "rewards/rejected": -0.07202671468257904, - "step": 1080 - }, - { - "epoch": 2.83, - "grad_norm": 3.1875, - "learning_rate": 1.1915800407584705e-06, - "logits/chosen": -1.9063217639923096, - "logits/rejected": -1.8989269733428955, - "logps/chosen": -31.623647689819336, - "logps/rejected": -31.403568267822266, - "loss": 0.2394, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3011149168014526, - "rewards/margins": 1.4821308851242065, - "rewards/rejected": -0.18101628124713898, - "step": 1090 - }, - { - "epoch": 2.86, - "grad_norm": 2.5625, - "learning_rate": 1.1436343403356019e-06, - "logits/chosen": -1.8777185678482056, - "logits/rejected": -1.877018690109253, - "logps/chosen": -32.405357360839844, - "logps/rejected": -36.93411636352539, - "loss": 0.2041, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.506527304649353, - "rewards/margins": 1.7972309589385986, - "rewards/rejected": -0.2907036244869232, - "step": 1100 - }, - { - "epoch": 2.86, - "eval_logits/chosen": -2.143841028213501, - "eval_logits/rejected": -2.1391518115997314, - "eval_logps/chosen": -33.65277099609375, - "eval_logps/rejected": -37.339603424072266, - "eval_loss": 0.4782448410987854, - "eval_rewards/accuracies": 0.5888704061508179, - "eval_rewards/chosen": 0.19088971614837646, - "eval_rewards/margins": 0.10238084197044373, - "eval_rewards/rejected": 0.08850887417793274, - "eval_runtime": 145.947, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 1100 - }, - { - "epoch": 2.88, - "grad_norm": 3.703125, - "learning_rate": 1.0963854773524548e-06, - "logits/chosen": -1.922458291053772, - "logits/rejected": -1.92776358127594, - "logps/chosen": -32.819183349609375, - "logps/rejected": -35.94624328613281, - "loss": 0.2428, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.3239645957946777, - "rewards/margins": 1.5252944231033325, - "rewards/rejected": -0.20132985711097717, - "step": 1110 - }, - { - "epoch": 2.91, - "grad_norm": 2.890625, - "learning_rate": 1.049857726072005e-06, - "logits/chosen": -1.8954557180404663, - "logits/rejected": -1.8935827016830444, - "logps/chosen": -30.0543155670166, - "logps/rejected": -33.462257385253906, - "loss": 0.2365, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2809476852416992, - "rewards/margins": 1.5207003355026245, - "rewards/rejected": -0.23975276947021484, - "step": 1120 - }, - { - "epoch": 2.94, - "grad_norm": 2.796875, - "learning_rate": 1.0040749902836508e-06, - "logits/chosen": -1.798630952835083, - "logits/rejected": -1.801329255104065, - "logps/chosen": -27.128742218017578, - "logps/rejected": -30.9866886138916, - "loss": 0.2121, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3099478483200073, - "rewards/margins": 1.6807739734649658, - "rewards/rejected": -0.37082618474960327, - "step": 1130 - }, - { - "epoch": 2.96, - "grad_norm": 3.21875, - "learning_rate": 9.59060791022566e-07, - "logits/chosen": -1.8972008228302002, - "logits/rejected": -1.8941949605941772, - "logps/chosen": -30.391637802124023, - "logps/rejected": -32.64166259765625, - "loss": 0.2315, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.4060550928115845, - "rewards/margins": 1.4634472131729126, - "rewards/rejected": -0.057392098009586334, - "step": 1140 - }, - { - "epoch": 2.99, - "grad_norm": 3.734375, - "learning_rate": 9.148382544856885e-07, - "logits/chosen": -1.815972089767456, - "logits/rejected": -1.8097312450408936, - "logps/chosen": -25.797870635986328, - "logps/rejected": -32.730873107910156, - "loss": 0.2522, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.1700583696365356, - "rewards/margins": 1.4613462686538696, - "rewards/rejected": -0.2912878394126892, - "step": 1150 - }, - { - "epoch": 3.01, - "grad_norm": 3.125, - "learning_rate": 8.714301001505568e-07, - "logits/chosen": -1.929557204246521, - "logits/rejected": -1.927114725112915, - "logps/chosen": -30.397785186767578, - "logps/rejected": -35.74321365356445, - "loss": 0.2227, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3491405248641968, - "rewards/margins": 1.6749813556671143, - "rewards/rejected": -0.3258407711982727, - "step": 1160 - }, - { - "epoch": 3.04, - "grad_norm": 2.625, - "learning_rate": 8.288586291031025e-07, - "logits/chosen": -1.914023756980896, - "logits/rejected": -1.9119844436645508, - "logps/chosen": -28.768091201782227, - "logps/rejected": -32.97541046142578, - "loss": 0.2124, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.423156976699829, - "rewards/margins": 1.728851079940796, - "rewards/rejected": -0.3056941032409668, - "step": 1170 - }, - { - "epoch": 3.06, - "grad_norm": 3.078125, - "learning_rate": 7.871457125803897e-07, - "logits/chosen": -1.9079790115356445, - "logits/rejected": -1.8959391117095947, - "logps/chosen": -33.25437545776367, - "logps/rejected": -34.638919830322266, - "loss": 0.2167, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.3954612016677856, - "rewards/margins": 1.6791839599609375, - "rewards/rejected": -0.28372296690940857, - "step": 1180 - }, - { - "epoch": 3.09, - "grad_norm": 3.890625, - "learning_rate": 7.463127807341966e-07, - "logits/chosen": -1.8004252910614014, - "logits/rejected": -1.7953609228134155, - "logps/chosen": -32.21669387817383, - "logps/rejected": -34.557525634765625, - "loss": 0.2035, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.566497564315796, - "rewards/margins": 1.7263473272323608, - "rewards/rejected": -0.15985000133514404, - "step": 1190 - }, - { - "epoch": 3.12, - "grad_norm": 2.828125, - "learning_rate": 7.063808116212021e-07, - "logits/chosen": -1.8471992015838623, - "logits/rejected": -1.8501752614974976, - "logps/chosen": -30.067035675048828, - "logps/rejected": -31.822067260742188, - "loss": 0.2467, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.2740302085876465, - "rewards/margins": 1.4419748783111572, - "rewards/rejected": -0.1679445207118988, - "step": 1200 - }, - { - "epoch": 3.12, - "eval_logits/chosen": -2.143656015396118, - "eval_logits/rejected": -2.138974189758301, - "eval_logps/chosen": -33.64625549316406, - "eval_logps/rejected": -37.349605560302734, - "eval_loss": 0.47655606269836426, - "eval_rewards/accuracies": 0.5892857313156128, - "eval_rewards/chosen": 0.19414812326431274, - "eval_rewards/margins": 0.11064070463180542, - "eval_rewards/rejected": 0.08350743353366852, - "eval_runtime": 145.7471, - "eval_samples_per_second": 2.353, - "eval_steps_per_second": 0.295, - "step": 1200 - }, - { - "epoch": 3.14, - "grad_norm": 3.59375, - "learning_rate": 6.673703204254348e-07, - "logits/chosen": -1.8714542388916016, - "logits/rejected": -1.8671817779541016, - "logps/chosen": -28.504308700561523, - "logps/rejected": -31.4024715423584, - "loss": 0.2118, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.384680986404419, - "rewards/margins": 1.6318689584732056, - "rewards/rejected": -0.24718794226646423, - "step": 1210 - }, - { - "epoch": 3.17, - "grad_norm": 2.953125, - "learning_rate": 6.293013489185315e-07, - "logits/chosen": -1.8835033178329468, - "logits/rejected": -1.8782579898834229, - "logps/chosen": -32.33659744262695, - "logps/rejected": -35.217987060546875, - "loss": 0.2087, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.4686732292175293, - "rewards/margins": 1.7927265167236328, - "rewards/rejected": -0.3240532875061035, - "step": 1220 - }, - { - "epoch": 3.19, - "grad_norm": 3.203125, - "learning_rate": 5.921934551632086e-07, - "logits/chosen": -1.885962724685669, - "logits/rejected": -1.8728628158569336, - "logps/chosen": -30.469623565673828, - "logps/rejected": -34.679969787597656, - "loss": 0.2268, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.312195062637329, - "rewards/margins": 1.592256784439087, - "rewards/rejected": -0.28006166219711304, - "step": 1230 - }, - { - "epoch": 3.22, - "grad_norm": 2.546875, - "learning_rate": 5.560657034652405e-07, - "logits/chosen": -1.9309202432632446, - "logits/rejected": -1.9283870458602905, - "logps/chosen": -32.273033142089844, - "logps/rejected": -32.28858184814453, - "loss": 0.2216, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.379934549331665, - "rewards/margins": 1.6042922735214233, - "rewards/rejected": -0.22435779869556427, - "step": 1240 - }, - { - "epoch": 3.25, - "grad_norm": 2.5, - "learning_rate": 5.2093665457911e-07, - "logits/chosen": -1.8590129613876343, - "logits/rejected": -1.8563369512557983, - "logps/chosen": -31.4901180267334, - "logps/rejected": -35.602264404296875, - "loss": 0.1959, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4530160427093506, - "rewards/margins": 1.8555854558944702, - "rewards/rejected": -0.4025695323944092, - "step": 1250 - }, - { - "epoch": 3.27, - "grad_norm": 2.703125, - "learning_rate": 4.868243561723535e-07, - "logits/chosen": -1.9257148504257202, - "logits/rejected": -1.920405387878418, - "logps/chosen": -27.986465454101562, - "logps/rejected": -32.8531379699707, - "loss": 0.2161, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3871676921844482, - "rewards/margins": 1.622460961341858, - "rewards/rejected": -0.23529353737831116, - "step": 1260 - }, - { - "epoch": 3.3, - "grad_norm": 3.28125, - "learning_rate": 4.537463335535161e-07, - "logits/chosen": -1.9512996673583984, - "logits/rejected": -1.9561084508895874, - "logps/chosen": -29.86050033569336, - "logps/rejected": -32.16343688964844, - "loss": 0.2189, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.4252476692199707, - "rewards/margins": 1.6662845611572266, - "rewards/rejected": -0.2410367727279663, - "step": 1270 - }, - { - "epoch": 3.32, - "grad_norm": 4.125, - "learning_rate": 4.217195806684629e-07, - "logits/chosen": -1.8454103469848633, - "logits/rejected": -1.8527978658676147, - "logps/chosen": -32.066978454589844, - "logps/rejected": -33.092769622802734, - "loss": 0.2248, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.420132040977478, - "rewards/margins": 1.6673837900161743, - "rewards/rejected": -0.24725179374217987, - "step": 1280 - }, - { - "epoch": 3.35, - "grad_norm": 3.5, - "learning_rate": 3.907605513696808e-07, - "logits/chosen": -1.7517541646957397, - "logits/rejected": -1.7541660070419312, - "logps/chosen": -30.193862915039062, - "logps/rejected": -36.98627471923828, - "loss": 0.2253, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4690179824829102, - "rewards/margins": 1.6850197315216064, - "rewards/rejected": -0.2160019427537918, - "step": 1290 - }, - { - "epoch": 3.38, - "grad_norm": 1.7109375, - "learning_rate": 3.6088515096305675e-07, - "logits/chosen": -1.8056175708770752, - "logits/rejected": -1.8090769052505493, - "logps/chosen": -30.090429306030273, - "logps/rejected": -33.11661911010742, - "loss": 0.1951, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.563118815422058, - "rewards/margins": 1.8893442153930664, - "rewards/rejected": -0.32622528076171875, - "step": 1300 - }, - { - "epoch": 3.38, - "eval_logits/chosen": -2.144080638885498, - "eval_logits/rejected": -2.1394007205963135, - "eval_logps/chosen": -33.64899826049805, - "eval_logps/rejected": -37.3505973815918, - "eval_loss": 0.4767414629459381, - "eval_rewards/accuracies": 0.5888704061508179, - "eval_rewards/chosen": 0.19277696311473846, - "eval_rewards/margins": 0.10976721346378326, - "eval_rewards/rejected": 0.0830097496509552, - "eval_runtime": 145.9589, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 1300 - }, - { - "epoch": 3.4, - "grad_norm": 2.703125, - "learning_rate": 3.321087280364757e-07, - "logits/chosen": -1.914703369140625, - "logits/rejected": -1.8947664499282837, - "logps/chosen": -28.157054901123047, - "logps/rejected": -35.78913497924805, - "loss": 0.2118, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.478171467781067, - "rewards/margins": 1.7895358800888062, - "rewards/rejected": -0.3113645613193512, - "step": 1310 - }, - { - "epoch": 3.43, - "grad_norm": 2.578125, - "learning_rate": 3.044460665744284e-07, - "logits/chosen": -1.8919546604156494, - "logits/rejected": -1.89761483669281, - "logps/chosen": -29.809295654296875, - "logps/rejected": -33.12956237792969, - "loss": 0.1811, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.6122461557388306, - "rewards/margins": 1.9316511154174805, - "rewards/rejected": -0.31940507888793945, - "step": 1320 - }, - { - "epoch": 3.45, - "grad_norm": 3.328125, - "learning_rate": 2.779113783626916e-07, - "logits/chosen": -1.8657556772232056, - "logits/rejected": -1.8608640432357788, - "logps/chosen": -30.290508270263672, - "logps/rejected": -35.307830810546875, - "loss": 0.2095, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.3660006523132324, - "rewards/margins": 1.733769178390503, - "rewards/rejected": -0.3677687346935272, - "step": 1330 - }, - { - "epoch": 3.48, - "grad_norm": 2.171875, - "learning_rate": 2.5251829568697204e-07, - "logits/chosen": -1.7264010906219482, - "logits/rejected": -1.7360337972640991, - "logps/chosen": -30.811847686767578, - "logps/rejected": -31.357213973999023, - "loss": 0.2156, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.4938724040985107, - "rewards/margins": 1.6554222106933594, - "rewards/rejected": -0.16154971718788147, - "step": 1340 - }, - { - "epoch": 3.51, - "grad_norm": 2.671875, - "learning_rate": 2.2827986432927774e-07, - "logits/chosen": -1.7876417636871338, - "logits/rejected": -1.7924047708511353, - "logps/chosen": -30.8006591796875, - "logps/rejected": -33.50164031982422, - "loss": 0.1898, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.5751415491104126, - "rewards/margins": 1.7235567569732666, - "rewards/rejected": -0.14841555058956146, - "step": 1350 - }, - { - "epoch": 3.53, - "grad_norm": 3.6875, - "learning_rate": 2.0520853686560177e-07, - "logits/chosen": -1.8155629634857178, - "logits/rejected": -1.8210582733154297, - "logps/chosen": -30.890161514282227, - "logps/rejected": -35.25407028198242, - "loss": 0.2091, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4707510471343994, - "rewards/margins": 1.816253900527954, - "rewards/rejected": -0.34550291299819946, - "step": 1360 - }, - { - "epoch": 3.56, - "grad_norm": 2.234375, - "learning_rate": 1.833161662683672e-07, - "logits/chosen": -1.9086004495620728, - "logits/rejected": -1.904147744178772, - "logps/chosen": -29.909168243408203, - "logps/rejected": -31.517398834228516, - "loss": 0.2214, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3632584810256958, - "rewards/margins": 1.6660083532333374, - "rewards/rejected": -0.3027498126029968, - "step": 1370 - }, - { - "epoch": 3.58, - "grad_norm": 3.328125, - "learning_rate": 1.626139998169246e-07, - "logits/chosen": -1.8234046697616577, - "logits/rejected": -1.8253214359283447, - "logps/chosen": -27.034130096435547, - "logps/rejected": -31.275365829467773, - "loss": 0.2311, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3125860691070557, - "rewards/margins": 1.5781786441802979, - "rewards/rejected": -0.2655923664569855, - "step": 1380 - }, - { - "epoch": 3.61, - "grad_norm": 4.21875, - "learning_rate": 1.4311267331922535e-07, - "logits/chosen": -1.8180971145629883, - "logits/rejected": -1.822779655456543, - "logps/chosen": -29.434040069580078, - "logps/rejected": -33.056365966796875, - "loss": 0.246, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2927253246307373, - "rewards/margins": 1.3268754482269287, - "rewards/rejected": -0.034150008112192154, - "step": 1390 - }, - { - "epoch": 3.64, - "grad_norm": 2.1875, - "learning_rate": 1.2482220564763669e-07, - "logits/chosen": -1.8844108581542969, - "logits/rejected": -1.887677788734436, - "logps/chosen": -31.939258575439453, - "logps/rejected": -33.98000717163086, - "loss": 0.1994, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.4956570863723755, - "rewards/margins": 1.7738803625106812, - "rewards/rejected": -0.27822345495224, - "step": 1400 - }, - { - "epoch": 3.64, - "eval_logits/chosen": -2.143280506134033, - "eval_logits/rejected": -2.1386008262634277, - "eval_logps/chosen": -33.658363342285156, - "eval_logps/rejected": -37.351505279541016, - "eval_loss": 0.4777422845363617, - "eval_rewards/accuracies": 0.574335515499115, - "eval_rewards/chosen": 0.188095822930336, - "eval_rewards/margins": 0.1055372953414917, - "eval_rewards/rejected": 0.0825585201382637, - "eval_runtime": 145.9314, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 1400 - }, - { - "epoch": 3.66, - "grad_norm": 2.9375, - "learning_rate": 1.0775199359171346e-07, - "logits/chosen": -1.952696442604065, - "logits/rejected": -1.9460382461547852, - "logps/chosen": -30.8232421875, - "logps/rejected": -34.605865478515625, - "loss": 0.2196, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.4896517992019653, - "rewards/margins": 1.6703027486801147, - "rewards/rejected": -0.18065093457698822, - "step": 1410 - }, - { - "epoch": 3.69, - "grad_norm": 3.875, - "learning_rate": 9.191080703056604e-08, - "logits/chosen": -1.8402042388916016, - "logits/rejected": -1.8509689569473267, - "logps/chosen": -31.183258056640625, - "logps/rejected": -34.182586669921875, - "loss": 0.2016, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4830243587493896, - "rewards/margins": 1.7787771224975586, - "rewards/rejected": -0.2957528233528137, - "step": 1420 - }, - { - "epoch": 3.71, - "grad_norm": 3.65625, - "learning_rate": 7.730678442730539e-08, - "logits/chosen": -1.9233169555664062, - "logits/rejected": -1.9352226257324219, - "logps/chosen": -31.918426513671875, - "logps/rejected": -34.28717803955078, - "loss": 0.1961, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.571190357208252, - "rewards/margins": 1.895612120628357, - "rewards/rejected": -0.32442185282707214, - "step": 1430 - }, - { - "epoch": 3.74, - "grad_norm": 2.765625, - "learning_rate": 6.394742864787806e-08, - "logits/chosen": -1.912786841392517, - "logits/rejected": -1.914947748184204, - "logps/chosen": -30.34816551208496, - "logps/rejected": -34.61114501953125, - "loss": 0.2018, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4301092624664307, - "rewards/margins": 1.8056964874267578, - "rewards/rejected": -0.3755870759487152, - "step": 1440 - }, - { - "epoch": 3.77, - "grad_norm": 3.125, - "learning_rate": 5.183960310644748e-08, - "logits/chosen": -1.8966419696807861, - "logits/rejected": -1.8896926641464233, - "logps/chosen": -32.674983978271484, - "logps/rejected": -34.129150390625, - "loss": 0.2229, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.446670413017273, - "rewards/margins": 1.6260440349578857, - "rewards/rejected": -0.17937341332435608, - "step": 1450 - }, - { - "epoch": 3.79, - "grad_norm": 2.984375, - "learning_rate": 4.098952823928693e-08, - "logits/chosen": -1.865486741065979, - "logits/rejected": -1.8714466094970703, - "logps/chosen": -28.413650512695312, - "logps/rejected": -34.3272705078125, - "loss": 0.211, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.39976167678833, - "rewards/margins": 1.7283977270126343, - "rewards/rejected": -0.328636109828949, - "step": 1460 - }, - { - "epoch": 3.82, - "grad_norm": 2.890625, - "learning_rate": 3.1402778309014284e-08, - "logits/chosen": -1.8112798929214478, - "logits/rejected": -1.8093807697296143, - "logps/chosen": -28.011821746826172, - "logps/rejected": -30.955224990844727, - "loss": 0.2181, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.4168658256530762, - "rewards/margins": 1.5973619222640991, - "rewards/rejected": -0.18049615621566772, - "step": 1470 - }, - { - "epoch": 3.84, - "grad_norm": 3.5, - "learning_rate": 2.3084278540791427e-08, - "logits/chosen": -1.9865236282348633, - "logits/rejected": -1.981032133102417, - "logps/chosen": -32.51356887817383, - "logps/rejected": -32.532928466796875, - "loss": 0.2133, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.385846495628357, - "rewards/margins": 1.6239551305770874, - "rewards/rejected": -0.23810863494873047, - "step": 1480 - }, - { - "epoch": 3.87, - "grad_norm": 2.9375, - "learning_rate": 1.6038302591975807e-08, - "logits/chosen": -1.8781344890594482, - "logits/rejected": -1.8802881240844727, - "logps/chosen": -26.174503326416016, - "logps/rejected": -28.378719329833984, - "loss": 0.2568, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.1509389877319336, - "rewards/margins": 1.3291571140289307, - "rewards/rejected": -0.17821818590164185, - "step": 1490 - }, - { - "epoch": 3.9, - "grad_norm": 4.0625, - "learning_rate": 1.0268470356514237e-08, - "logits/chosen": -1.8871002197265625, - "logits/rejected": -1.8816699981689453, - "logps/chosen": -30.24139404296875, - "logps/rejected": -32.723289489746094, - "loss": 0.2272, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2796982526779175, - "rewards/margins": 1.6568657159805298, - "rewards/rejected": -0.3771671652793884, - "step": 1500 - }, - { - "epoch": 3.9, - "eval_logits/chosen": -2.1435675621032715, - "eval_logits/rejected": -2.138887882232666, - "eval_logps/chosen": -33.651771545410156, - "eval_logps/rejected": -37.35124969482422, - "eval_loss": 0.4769510328769684, - "eval_rewards/accuracies": 0.5921927094459534, - "eval_rewards/chosen": 0.19138793647289276, - "eval_rewards/margins": 0.10870397835969925, - "eval_rewards/rejected": 0.08268395811319351, - "eval_runtime": 145.9834, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 1500 - }, - { - "epoch": 3.92, - "grad_norm": 5.34375, - "learning_rate": 5.777746105209147e-09, - "logits/chosen": -1.8100652694702148, - "logits/rejected": -1.8142120838165283, - "logps/chosen": -31.494476318359375, - "logps/rejected": -34.76432418823242, - "loss": 0.2348, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.3382586240768433, - "rewards/margins": 1.515228509902954, - "rewards/rejected": -0.17696987092494965, - "step": 1510 - }, - { - "epoch": 3.95, - "grad_norm": 2.34375, - "learning_rate": 2.5684369628148352e-09, - "logits/chosen": -1.8688828945159912, - "logits/rejected": -1.8673012256622314, - "logps/chosen": -27.977005004882812, - "logps/rejected": -33.140926361083984, - "loss": 0.2183, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.3785879611968994, - "rewards/margins": 1.6541917324066162, - "rewards/rejected": -0.27560409903526306, - "step": 1520 - }, - { - "epoch": 3.97, - "grad_norm": 4.0, - "learning_rate": 6.421917227455999e-10, - "logits/chosen": -1.9662901163101196, - "logits/rejected": -1.9586807489395142, - "logps/chosen": -25.736225128173828, - "logps/rejected": -29.176280975341797, - "loss": 0.24, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.163358449935913, - "rewards/margins": 1.4156959056854248, - "rewards/rejected": -0.25233757495880127, - "step": 1530 - }, - { - "epoch": 4.0, - "grad_norm": 2.875, - "learning_rate": 0.0, - "logits/chosen": -1.868970513343811, - "logits/rejected": -1.859106421470642, - "logps/chosen": -30.280284881591797, - "logps/rejected": -36.0854377746582, - "loss": 0.2024, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.3929578065872192, - "rewards/margins": 1.7300901412963867, - "rewards/rejected": -0.3371322453022003, - "step": 1540 - }, - { - "epoch": 4.0, - "step": 1540, + "epoch": 1.0, + "step": 385, "total_flos": 0.0, - "train_loss": 0.2131701453939661, - "train_runtime": 10811.0627, - "train_samples_per_second": 1.139, - "train_steps_per_second": 0.142 + "train_loss": 0.8446982934877469, + "train_runtime": 3252.3399, + "train_samples_per_second": 0.947, + "train_steps_per_second": 0.118 } ], "logging_steps": 10, - "max_steps": 1540, + "max_steps": 385, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4,