{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 100, "global_step": 1540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 2500.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.866841197013855, "logits/rejected": -1.871166467666626, "logps/chosen": -36.98617172241211, "logps/rejected": -33.65531539916992, "loss": 2495.4616, "rewards/accuracies": 0.5, "rewards/chosen": 0.00020427265553735197, "rewards/margins": 0.00045667175436392426, "rewards/rejected": -0.0002523990988265723, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.997936487197876, "logits/rejected": -2.0005903244018555, "logps/chosen": -29.64678382873535, "logps/rejected": -29.045034408569336, "loss": 2502.3262, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -4.586054274113849e-05, "rewards/margins": -0.00022994528990238905, "rewards/rejected": 0.0001840847689891234, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.9207321405410767, "logits/rejected": -1.9180399179458618, "logps/chosen": -31.407222747802734, "logps/rejected": -33.223663330078125, "loss": 2498.6508, "rewards/accuracies": 0.5625, "rewards/chosen": 8.869935118127614e-05, "rewards/margins": 0.00014070476754568517, "rewards/rejected": -5.200541272643022e-05, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.0177226066589355, "logits/rejected": -2.0089757442474365, "logps/chosen": -32.58082962036133, "logps/rejected": -32.527244567871094, "loss": 2498.9926, "rewards/accuracies": 0.5, "rewards/chosen": -4.022592111141421e-05, "rewards/margins": 0.00010554380423855036, "rewards/rejected": -0.00014576970716007054, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8629518747329712, "logits/rejected": -1.8521617650985718, "logps/chosen": -33.5596923828125, "logps/rejected": -35.45528793334961, "loss": 2499.9863, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -2.213427796959877e-05, "rewards/margins": 7.289124368980993e-06, "rewards/rejected": -2.9423434170894325e-05, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9416849613189697, "logits/rejected": -1.9436241388320923, "logps/chosen": -32.546897888183594, "logps/rejected": -33.21548843383789, "loss": 2490.4672, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0005328331026248634, "rewards/margins": 0.00097393908072263, "rewards/rejected": -0.00044110597809776664, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.072330951690674, "logits/rejected": -2.0772910118103027, "logps/chosen": -34.00098419189453, "logps/rejected": -36.63383102416992, "loss": 2494.9414, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00012498130672611296, "rewards/margins": 0.0005246406653895974, "rewards/rejected": -0.0006496219430118799, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9325841665267944, "logits/rejected": -1.9357010126113892, "logps/chosen": -34.33161163330078, "logps/rejected": -34.630489349365234, "loss": 2486.8059, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0009619827615097165, "rewards/margins": 0.0013428140664473176, "rewards/rejected": -0.0003808312467299402, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9400427341461182, "logits/rejected": -1.9445598125457764, "logps/chosen": -32.36492156982422, "logps/rejected": -32.34357452392578, "loss": 2491.4584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.001024983124807477, "rewards/margins": 0.0008716614102013409, "rewards/rejected": 0.00015332190378103405, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.037466526031494, "logits/rejected": -2.0354855060577393, "logps/chosen": -32.11969757080078, "logps/rejected": -31.30398178100586, "loss": 2484.2775, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.001211356371641159, "rewards/margins": 0.0015890670474618673, "rewards/rejected": -0.00037771055940538645, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.232342481613159, "eval_logits/rejected": -2.2275006771087646, "eval_logps/chosen": -34.01866149902344, "eval_logps/rejected": -37.52037811279297, "eval_loss": 2498.15966796875, "eval_rewards/accuracies": 0.5564784407615662, "eval_rewards/chosen": 0.00015893821546342224, "eval_rewards/margins": 0.0001965187693713233, "eval_rewards/rejected": -3.7580521166091785e-05, "eval_runtime": 146.0331, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.991633415222168, "logits/rejected": -1.9892610311508179, "logps/chosen": -33.10456085205078, "logps/rejected": -34.01618194580078, "loss": 2488.0367, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0013925316743552685, "rewards/margins": 0.0012606054078787565, "rewards/rejected": 0.00013192615006119013, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.003302812576294, "logits/rejected": -1.994974136352539, "logps/chosen": -32.31616973876953, "logps/rejected": -32.14063262939453, "loss": 2489.4971, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.001287340302951634, "rewards/margins": 0.0010786365019157529, "rewards/rejected": 0.00020870394655503333, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.0306484699249268, "logits/rejected": -2.022704601287842, "logps/chosen": -30.306324005126953, "logps/rejected": -32.04903793334961, "loss": 2483.8781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0016939423512667418, "rewards/margins": 0.0016542377416044474, "rewards/rejected": 3.9704824303044006e-05, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9617973566055298, "logits/rejected": -1.9720312356948853, "logps/chosen": -31.230310440063477, "logps/rejected": -32.547096252441406, "loss": 2480.1322, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0018935112748295069, "rewards/margins": 0.0020156968384981155, "rewards/rejected": -0.00012218570918776095, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.8725357055664062, "logits/rejected": -1.8737138509750366, "logps/chosen": -33.889976501464844, "logps/rejected": -34.795631408691406, "loss": 2466.4881, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003027186496183276, "rewards/margins": 0.0034109093248844147, "rewards/rejected": -0.0003837232361547649, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.9241313934326172, "logits/rejected": -1.9207313060760498, "logps/chosen": -35.98552322387695, "logps/rejected": -32.693538665771484, "loss": 2484.5627, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0018615787848830223, "rewards/margins": 0.0015694532776251435, "rewards/rejected": 0.00029212533263489604, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.023880958557129, "logits/rejected": -2.0165772438049316, "logps/chosen": -33.457122802734375, "logps/rejected": -31.414859771728516, "loss": 2460.2227, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0034074243158102036, "rewards/margins": 0.004037545528262854, "rewards/rejected": -0.0006301216781139374, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.030813455581665, "logits/rejected": -2.0360608100891113, "logps/chosen": -32.20356750488281, "logps/rejected": -32.4092903137207, "loss": 2473.8211, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0034122199285775423, "rewards/margins": 0.0026536998338997364, "rewards/rejected": 0.0007585205021314323, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.0314080715179443, "logits/rejected": -2.0286362171173096, "logps/chosen": -31.27242088317871, "logps/rejected": -31.320995330810547, "loss": 2478.5072, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0021653182338923216, "rewards/margins": 0.0021931403316557407, "rewards/rejected": -2.7821719413623214e-05, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.9025766849517822, "logits/rejected": -1.9072151184082031, "logps/chosen": -31.255901336669922, "logps/rejected": -32.79901885986328, "loss": 2464.7859, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0035038013011217117, "rewards/margins": 0.0035846265964210033, "rewards/rejected": -8.082549902610481e-05, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.229053497314453, "eval_logits/rejected": -2.224233865737915, "eval_logps/chosen": -34.033775329589844, "eval_logps/rejected": -37.55736541748047, "eval_loss": 2496.0791015625, "eval_rewards/accuracies": 0.5544019937515259, "eval_rewards/chosen": 7.761791493976489e-06, "eval_rewards/margins": 0.00041520988452248275, "eval_rewards/rejected": -0.00040744812577031553, "eval_runtime": 145.4716, "eval_samples_per_second": 2.358, "eval_steps_per_second": 0.296, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.0153257846832275, "logits/rejected": -2.0259604454040527, "logps/chosen": -31.77630615234375, "logps/rejected": -33.9268798828125, "loss": 2474.5629, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.001949988305568695, "rewards/margins": 0.0025917969178408384, "rewards/rejected": -0.0006418084958568215, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.907790184020996, "logits/rejected": -1.9225451946258545, "logps/chosen": -29.77730941772461, "logps/rejected": -31.612323760986328, "loss": 2461.7975, "rewards/accuracies": 0.75, "rewards/chosen": 0.0033422994893044233, "rewards/margins": 0.0038837480824440718, "rewards/rejected": -0.0005414488259702921, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9650691747665405, "logits/rejected": -1.9690206050872803, "logps/chosen": -33.07447052001953, "logps/rejected": -31.645030975341797, "loss": 2457.1672, "rewards/accuracies": 0.6875, "rewards/chosen": 0.003429980482906103, "rewards/margins": 0.004387288354337215, "rewards/rejected": -0.0009573075803928077, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9625787734985352, "logits/rejected": -1.9408048391342163, "logps/chosen": -33.812347412109375, "logps/rejected": -35.121795654296875, "loss": 2449.7848, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0034140206407755613, "rewards/margins": 0.005124006420373917, "rewards/rejected": -0.0017099861288443208, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.003685712814331, "logits/rejected": -2.000408172607422, "logps/chosen": -32.71784210205078, "logps/rejected": -36.25305938720703, "loss": 2476.9369, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.002072283299639821, "rewards/margins": 0.0023536235094070435, "rewards/rejected": -0.00028134050080552697, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8708940744400024, "logits/rejected": -1.8684980869293213, "logps/chosen": -33.97399139404297, "logps/rejected": -35.522247314453125, "loss": 2477.3357, "rewards/accuracies": 0.625, "rewards/chosen": 0.0021441043354570866, "rewards/margins": 0.002320351079106331, "rewards/rejected": -0.00017624672909732908, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8561140298843384, "logits/rejected": -1.8537418842315674, "logps/chosen": -34.15688705444336, "logps/rejected": -31.835697174072266, "loss": 2470.1545, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0024070844519883394, "rewards/margins": 0.003050738014280796, "rewards/rejected": -0.0006436532130464911, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.959524154663086, "logits/rejected": -1.9490426778793335, "logps/chosen": -34.99895477294922, "logps/rejected": -31.8908634185791, "loss": 2459.8076, "rewards/accuracies": 0.6875, "rewards/chosen": 0.003576862858608365, "rewards/margins": 0.004075545351952314, "rewards/rejected": -0.0004986823769286275, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.0554285049438477, "logits/rejected": -2.0405356884002686, "logps/chosen": -30.697057723999023, "logps/rejected": -32.610191345214844, "loss": 2482.0641, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0022706831805408, "rewards/margins": 0.0018542330944910645, "rewards/rejected": 0.0004164502606727183, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.9246467351913452, "logits/rejected": -1.922141671180725, "logps/chosen": -32.302886962890625, "logps/rejected": -30.90523338317871, "loss": 2430.5529, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.0062422603368759155, "rewards/margins": 0.007126508746296167, "rewards/rejected": -0.0008842485258355737, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.22615647315979, "eval_logits/rejected": -2.2213311195373535, "eval_logps/chosen": -34.06288146972656, "eval_logps/rejected": -37.59059524536133, "eval_loss": 2495.66796875, "eval_rewards/accuracies": 0.5390365719795227, "eval_rewards/chosen": -0.0002833307080436498, "eval_rewards/margins": 0.00045641581527888775, "eval_rewards/rejected": -0.0007397464942187071, "eval_runtime": 145.8977, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 4.84533120650964e-06, "logits/chosen": -1.9104417562484741, "logits/rejected": -1.9072014093399048, "logps/chosen": -31.30303955078125, "logps/rejected": -33.819358825683594, "loss": 2463.1744, "rewards/accuracies": 0.75, "rewards/chosen": 0.0030030703637748957, "rewards/margins": 0.003775153774768114, "rewards/rejected": -0.0007720834692008793, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.825108134172131e-06, "logits/chosen": -1.9579178094863892, "logits/rejected": -1.9457557201385498, "logps/chosen": -34.26006317138672, "logps/rejected": -33.66352462768555, "loss": 2454.818, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0033601075410842896, "rewards/margins": 0.004606915637850761, "rewards/rejected": -0.0012468084460124373, "step": 320 }, { "epoch": 0.86, "learning_rate": 4.80369052967602e-06, "logits/chosen": -1.991537094116211, "logits/rejected": -1.9901418685913086, "logps/chosen": -33.10230255126953, "logps/rejected": -32.55553436279297, "loss": 2455.166, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.003834925591945648, "rewards/margins": 0.004594448953866959, "rewards/rejected": -0.0007595239439979196, "step": 330 }, { "epoch": 0.88, "learning_rate": 4.781089396387968e-06, "logits/chosen": -2.0774741172790527, "logits/rejected": -2.0618669986724854, "logps/chosen": -33.6904182434082, "logps/rejected": -33.073814392089844, "loss": 2456.2992, "rewards/accuracies": 0.625, "rewards/chosen": 0.004889755509793758, "rewards/margins": 0.004447542130947113, "rewards/rejected": 0.0004422132042236626, "step": 340 }, { "epoch": 0.91, "learning_rate": 4.757316345716554e-06, "logits/chosen": -1.9498752355575562, "logits/rejected": -1.9490633010864258, "logps/chosen": -32.76622009277344, "logps/rejected": -32.49995040893555, "loss": 2446.7852, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.00515871262177825, "rewards/margins": 0.005506747402250767, "rewards/rejected": -0.0003480348386801779, "step": 350 }, { "epoch": 0.94, "learning_rate": 4.73238359114687e-06, "logits/chosen": -1.9010334014892578, "logits/rejected": -1.91123366355896, "logps/chosen": -31.694040298461914, "logps/rejected": -35.382728576660156, "loss": 2441.1234, "rewards/accuracies": 0.75, "rewards/chosen": 0.005106499884277582, "rewards/margins": 0.006046179216355085, "rewards/rejected": -0.0009396795067004859, "step": 360 }, { "epoch": 0.96, "learning_rate": 4.706303941965804e-06, "logits/chosen": -2.036052703857422, "logits/rejected": -2.029733180999756, "logps/chosen": -33.1943473815918, "logps/rejected": -29.27004051208496, "loss": 2450.7623, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.004576197825372219, "rewards/margins": 0.0050123645924031734, "rewards/rejected": -0.00043616676703095436, "step": 370 }, { "epoch": 0.99, "learning_rate": 4.679090796681225e-06, "logits/chosen": -1.8926509618759155, "logits/rejected": -1.894890546798706, "logps/chosen": -33.61520004272461, "logps/rejected": -30.98312759399414, "loss": 2428.4018, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.006238477770239115, "rewards/margins": 0.007343468256294727, "rewards/rejected": -0.0011049896711483598, "step": 380 }, { "epoch": 1.01, "learning_rate": 4.650758136138454e-06, "logits/chosen": -1.9188745021820068, "logits/rejected": -1.9176126718521118, "logps/chosen": -33.695579528808594, "logps/rejected": -36.02911376953125, "loss": 2397.0137, "rewards/accuracies": 0.7291666865348816, "rewards/chosen": 0.006892119534313679, "rewards/margins": 0.010623215697705746, "rewards/rejected": -0.003731096163392067, "step": 390 }, { "epoch": 1.04, "learning_rate": 4.621320516337559e-06, "logits/chosen": -1.8515088558197021, "logits/rejected": -1.8431167602539062, "logps/chosen": -30.941198348999023, "logps/rejected": -36.45293426513672, "loss": 2370.302, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.008327776566147804, "rewards/margins": 0.013451090082526207, "rewards/rejected": -0.005123314447700977, "step": 400 }, { "epoch": 1.04, "eval_logits/chosen": -2.199411153793335, "eval_logits/rejected": -2.194589376449585, "eval_logps/chosen": -34.099369049072266, "eval_logps/rejected": -37.661293029785156, "eval_loss": 2492.46533203125, "eval_rewards/accuracies": 0.5622923374176025, "eval_rewards/chosen": -0.0006481813034042716, "eval_rewards/margins": 0.0007985630072653294, "eval_rewards/rejected": -0.001446744310669601, "eval_runtime": 146.2529, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.294, "step": 400 }, { "epoch": 1.06, "learning_rate": 4.590793060955158e-06, "logits/chosen": -2.0204148292541504, "logits/rejected": -2.023253917694092, "logps/chosen": -32.13569259643555, "logps/rejected": -35.30311584472656, "loss": 2361.9771, "rewards/accuracies": 0.875, "rewards/chosen": 0.00918244756758213, "rewards/margins": 0.014285160228610039, "rewards/rejected": -0.0051027145236730576, "step": 410 }, { "epoch": 1.09, "learning_rate": 4.559191453574582e-06, "logits/chosen": -1.856715202331543, "logits/rejected": -1.8553537130355835, "logps/chosen": -28.340347290039062, "logps/rejected": -32.772071838378906, "loss": 2384.5215, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.007272079586982727, "rewards/margins": 0.01181616447865963, "rewards/rejected": -0.004544084891676903, "step": 420 }, { "epoch": 1.12, "learning_rate": 4.52653192962838e-06, "logits/chosen": -1.8120425939559937, "logits/rejected": -1.8051426410675049, "logps/chosen": -33.048492431640625, "logps/rejected": -34.51493453979492, "loss": 2373.559, "rewards/accuracies": 0.9375, "rewards/chosen": 0.010255918838083744, "rewards/margins": 0.012952560558915138, "rewards/rejected": -0.0026966414880007505, "step": 430 }, { "epoch": 1.14, "learning_rate": 4.492831268057307e-06, "logits/chosen": -1.9794769287109375, "logits/rejected": -1.9743585586547852, "logps/chosen": -30.73288345336914, "logps/rejected": -32.56402587890625, "loss": 2341.4699, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.010504107922315598, "rewards/margins": 0.01646825671195984, "rewards/rejected": -0.0059641506522893906, "step": 440 }, { "epoch": 1.17, "learning_rate": 4.458106782690094e-06, "logits/chosen": -1.8598779439926147, "logits/rejected": -1.8641446828842163, "logps/chosen": -33.39701461791992, "logps/rejected": -33.232383728027344, "loss": 2329.0439, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.011188305914402008, "rewards/margins": 0.01762574352324009, "rewards/rejected": -0.006437439471483231, "step": 450 }, { "epoch": 1.19, "learning_rate": 4.422376313348405e-06, "logits/chosen": -1.8614333868026733, "logits/rejected": -1.8558467626571655, "logps/chosen": -34.22340774536133, "logps/rejected": -35.80681610107422, "loss": 2302.8248, "rewards/accuracies": 0.9375, "rewards/chosen": 0.011961170472204685, "rewards/margins": 0.020538393408060074, "rewards/rejected": -0.008577222935855389, "step": 460 }, { "epoch": 1.22, "learning_rate": 4.3856582166815696e-06, "logits/chosen": -1.8815793991088867, "logits/rejected": -1.8814213275909424, "logps/chosen": -33.06370544433594, "logps/rejected": -34.739097595214844, "loss": 2340.4611, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.011046240106225014, "rewards/margins": 0.01658240333199501, "rewards/rejected": -0.005536160431802273, "step": 470 }, { "epoch": 1.25, "learning_rate": 4.347971356735789e-06, "logits/chosen": -1.9247829914093018, "logits/rejected": -1.9061830043792725, "logps/chosen": -32.92525863647461, "logps/rejected": -33.87827682495117, "loss": 2304.1588, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.012355051003396511, "rewards/margins": 0.020438065752387047, "rewards/rejected": -0.008083016611635685, "step": 480 }, { "epoch": 1.27, "learning_rate": 4.309335095262675e-06, "logits/chosen": -1.8873250484466553, "logits/rejected": -1.886690378189087, "logps/chosen": -30.484582901000977, "logps/rejected": -31.771377563476562, "loss": 2340.8271, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.011267202906310558, "rewards/margins": 0.016520529985427856, "rewards/rejected": -0.0052533275447785854, "step": 490 }, { "epoch": 1.3, "learning_rate": 4.269769281772082e-06, "logits/chosen": -1.8447484970092773, "logits/rejected": -1.837871789932251, "logps/chosen": -31.42559242248535, "logps/rejected": -35.48058319091797, "loss": 2298.9412, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.01244533620774746, "rewards/margins": 0.02098379284143448, "rewards/rejected": -0.00853845663368702, "step": 500 }, { "epoch": 1.3, "eval_logits/chosen": -2.1343138217926025, "eval_logits/rejected": -2.12954044342041, "eval_logps/chosen": -34.28038787841797, "eval_logps/rejected": -37.88829803466797, "eval_loss": 2488.40625, "eval_rewards/accuracies": 0.5772424936294556, "eval_rewards/chosen": -0.002458348637446761, "eval_rewards/margins": 0.0012584367068484426, "eval_rewards/rejected": -0.0037167854607105255, "eval_runtime": 145.9415, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 500 }, { "epoch": 1.32, "learning_rate": 4.22929424333435e-06, "logits/chosen": -1.8362356424331665, "logits/rejected": -1.8398487567901611, "logps/chosen": -28.270023345947266, "logps/rejected": -33.78419876098633, "loss": 2323.2496, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.009048042818903923, "rewards/margins": 0.018297135829925537, "rewards/rejected": -0.009249093011021614, "step": 510 }, { "epoch": 1.35, "learning_rate": 4.1879307741372085e-06, "logits/chosen": -1.8308753967285156, "logits/rejected": -1.8415968418121338, "logps/chosen": -32.14521408081055, "logps/rejected": -31.652883529663086, "loss": 2299.3625, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.011171149089932442, "rewards/margins": 0.021183136850595474, "rewards/rejected": -0.010011989623308182, "step": 520 }, { "epoch": 1.38, "learning_rate": 4.145700124802693e-06, "logits/chosen": -1.7703996896743774, "logits/rejected": -1.7680895328521729, "logps/chosen": -30.59372329711914, "logps/rejected": -31.122241973876953, "loss": 2307.9037, "rewards/accuracies": 0.8125, "rewards/chosen": 0.010733595117926598, "rewards/margins": 0.020188378170132637, "rewards/rejected": -0.009454783983528614, "step": 530 }, { "epoch": 1.4, "learning_rate": 4.102623991469562e-06, "logits/chosen": -1.840515375137329, "logits/rejected": -1.833764672279358, "logps/chosen": -33.129478454589844, "logps/rejected": -34.03999328613281, "loss": 2296.7414, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.01103346236050129, "rewards/margins": 0.021258534863591194, "rewards/rejected": -0.010225074365735054, "step": 540 }, { "epoch": 1.43, "learning_rate": 4.058724504646834e-06, "logits/chosen": -1.8037725687026978, "logits/rejected": -1.8101667165756226, "logps/chosen": -30.930444717407227, "logps/rejected": -33.56714630126953, "loss": 2343.0631, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.008227399550378323, "rewards/margins": 0.016311541199684143, "rewards/rejected": -0.008084140717983246, "step": 550 }, { "epoch": 1.45, "learning_rate": 4.014024217844167e-06, "logits/chosen": -1.8711225986480713, "logits/rejected": -1.8482850790023804, "logps/chosen": -30.459259033203125, "logps/rejected": -33.72909927368164, "loss": 2335.1771, "rewards/accuracies": 0.875, "rewards/chosen": 0.009783074259757996, "rewards/margins": 0.01711348444223404, "rewards/rejected": -0.007330409251153469, "step": 560 }, { "epoch": 1.48, "learning_rate": 3.968546095984911e-06, "logits/chosen": -1.8007291555404663, "logits/rejected": -1.7958128452301025, "logps/chosen": -31.415090560913086, "logps/rejected": -32.90663528442383, "loss": 2330.2, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.010016413405537605, "rewards/margins": 0.01787360943853855, "rewards/rejected": -0.007857195101678371, "step": 570 }, { "epoch": 1.51, "learning_rate": 3.922313503607806e-06, "logits/chosen": -1.83207106590271, "logits/rejected": -1.8339207172393799, "logps/chosen": -33.55345153808594, "logps/rejected": -36.1082763671875, "loss": 2297.7742, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.008223430253565311, "rewards/margins": 0.02129700407385826, "rewards/rejected": -0.013073575682938099, "step": 580 }, { "epoch": 1.53, "learning_rate": 3.875350192863368e-06, "logits/chosen": -1.812063217163086, "logits/rejected": -1.811581015586853, "logps/chosen": -29.506006240844727, "logps/rejected": -32.62559127807617, "loss": 2286.0877, "rewards/accuracies": 0.875, "rewards/chosen": 0.010860500857234001, "rewards/margins": 0.022500045597553253, "rewards/rejected": -0.011639544740319252, "step": 590 }, { "epoch": 1.56, "learning_rate": 3.8276802913111436e-06, "logits/chosen": -1.8164310455322266, "logits/rejected": -1.8141977787017822, "logps/chosen": -31.94429588317871, "logps/rejected": -33.383872985839844, "loss": 2298.7582, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.010539887472987175, "rewards/margins": 0.021379008889198303, "rewards/rejected": -0.010839122347533703, "step": 600 }, { "epoch": 1.56, "eval_logits/chosen": -2.063612222671509, "eval_logits/rejected": -2.058927059173584, "eval_logps/chosen": -34.559974670410156, "eval_logps/rejected": -38.285404205322266, "eval_loss": 2477.658935546875, "eval_rewards/accuracies": 0.6121262311935425, "eval_rewards/chosen": -0.005254245828837156, "eval_rewards/margins": 0.0024335861671715975, "eval_rewards/rejected": -0.007687832228839397, "eval_runtime": 145.6811, "eval_samples_per_second": 2.354, "eval_steps_per_second": 0.295, "step": 600 }, { "epoch": 1.58, "learning_rate": 3.7793282895240927e-06, "logits/chosen": -1.847764015197754, "logits/rejected": -1.8541465997695923, "logps/chosen": -31.449474334716797, "logps/rejected": -33.3134880065918, "loss": 2305.692, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.007879074662923813, "rewards/margins": 0.02038208767771721, "rewards/rejected": -0.012503013014793396, "step": 610 }, { "epoch": 1.61, "learning_rate": 3.730319028506478e-06, "logits/chosen": -1.7965530157089233, "logits/rejected": -1.7944053411483765, "logps/chosen": -33.688297271728516, "logps/rejected": -32.105934143066406, "loss": 2292.9066, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.010675834491848946, "rewards/margins": 0.021880075335502625, "rewards/rejected": -0.011204240843653679, "step": 620 }, { "epoch": 1.64, "learning_rate": 3.6806776869317074e-06, "logits/chosen": -1.7377105951309204, "logits/rejected": -1.731245756149292, "logps/chosen": -34.34708786010742, "logps/rejected": -33.66561508178711, "loss": 2271.1943, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.011171405203640461, "rewards/margins": 0.024255482479929924, "rewards/rejected": -0.013084076344966888, "step": 630 }, { "epoch": 1.66, "learning_rate": 3.6304297682067146e-06, "logits/chosen": -1.7545562982559204, "logits/rejected": -1.7608686685562134, "logps/chosen": -33.110076904296875, "logps/rejected": -34.38447189331055, "loss": 2307.9031, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.008692274801433086, "rewards/margins": 0.02013438567519188, "rewards/rejected": -0.011442111805081367, "step": 640 }, { "epoch": 1.69, "learning_rate": 3.579601087369492e-06, "logits/chosen": -1.8283579349517822, "logits/rejected": -1.8423779010772705, "logps/chosen": -31.087310791015625, "logps/rejected": -33.21766662597656, "loss": 2311.0535, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.007651531603187323, "rewards/margins": 0.01985129900276661, "rewards/rejected": -0.012199767865240574, "step": 650 }, { "epoch": 1.71, "learning_rate": 3.5282177578265295e-06, "logits/chosen": -1.6931848526000977, "logits/rejected": -1.6900306940078735, "logps/chosen": -32.68722152709961, "logps/rejected": -36.44025421142578, "loss": 2222.7857, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.012357336468994617, "rewards/margins": 0.029757345095276833, "rewards/rejected": -0.01740000769495964, "step": 660 }, { "epoch": 1.74, "learning_rate": 3.476306177936961e-06, "logits/chosen": -1.7785106897354126, "logits/rejected": -1.7785335779190063, "logps/chosen": -30.625701904296875, "logps/rejected": -35.58719253540039, "loss": 2272.5373, "rewards/accuracies": 0.8125, "rewards/chosen": 0.006598903331905603, "rewards/margins": 0.024207040667533875, "rewards/rejected": -0.01760813593864441, "step": 670 }, { "epoch": 1.77, "learning_rate": 3.423893017450324e-06, "logits/chosen": -1.7213503122329712, "logits/rejected": -1.718073844909668, "logps/chosen": -30.16400146484375, "logps/rejected": -34.405784606933594, "loss": 2284.7229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.007083491422235966, "rewards/margins": 0.02296331152319908, "rewards/rejected": -0.01587982103228569, "step": 680 }, { "epoch": 1.79, "learning_rate": 3.3710052038048794e-06, "logits/chosen": -1.7414871454238892, "logits/rejected": -1.7416623830795288, "logps/chosen": -29.0936279296875, "logps/rejected": -32.20520782470703, "loss": 2256.3047, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.010694684460759163, "rewards/margins": 0.026041794568300247, "rewards/rejected": -0.015347110107541084, "step": 690 }, { "epoch": 1.82, "learning_rate": 3.3176699082935546e-06, "logits/chosen": -1.660274863243103, "logits/rejected": -1.663644552230835, "logps/chosen": -33.33858108520508, "logps/rejected": -33.01979064941406, "loss": 2254.2998, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.011398938484489918, "rewards/margins": 0.026949990540742874, "rewards/rejected": -0.015551051124930382, "step": 700 }, { "epoch": 1.82, "eval_logits/chosen": -2.0085763931274414, "eval_logits/rejected": -2.0039827823638916, "eval_logps/chosen": -34.992069244384766, "eval_logps/rejected": -38.73301696777344, "eval_loss": 2477.662353515625, "eval_rewards/accuracies": 0.5539867281913757, "eval_rewards/chosen": -0.009575208649039268, "eval_rewards/margins": 0.0025887340307235718, "eval_rewards/rejected": -0.01216394267976284, "eval_runtime": 145.5671, "eval_samples_per_second": 2.356, "eval_steps_per_second": 0.295, "step": 700 }, { "epoch": 1.84, "learning_rate": 3.2639145321045933e-06, "logits/chosen": -1.7369210720062256, "logits/rejected": -1.7286014556884766, "logps/chosen": -35.7460823059082, "logps/rejected": -33.445213317871094, "loss": 2282.4748, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.0076593635603785515, "rewards/margins": 0.02305331453680992, "rewards/rejected": -0.015393950045108795, "step": 710 }, { "epoch": 1.87, "learning_rate": 3.2097666922441107e-06, "logits/chosen": -1.7424733638763428, "logits/rejected": -1.7439861297607422, "logps/chosen": -35.72047424316406, "logps/rejected": -34.96687698364258, "loss": 2256.5887, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.007149122655391693, "rewards/margins": 0.026057641953229904, "rewards/rejected": -0.01890851929783821, "step": 720 }, { "epoch": 1.9, "learning_rate": 3.1552542073477554e-06, "logits/chosen": -1.7609472274780273, "logits/rejected": -1.7586179971694946, "logps/chosen": -31.435550689697266, "logps/rejected": -34.51602554321289, "loss": 2254.2758, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.011277793906629086, "rewards/margins": 0.026469092816114426, "rewards/rejected": -0.015191297046840191, "step": 730 }, { "epoch": 1.92, "learning_rate": 3.100405083388799e-06, "logits/chosen": -1.733371376991272, "logits/rejected": -1.7385714054107666, "logps/chosen": -30.687509536743164, "logps/rejected": -34.88238525390625, "loss": 2235.6635, "rewards/accuracies": 0.875, "rewards/chosen": 0.010982171632349491, "rewards/margins": 0.028257867321372032, "rewards/rejected": -0.017275694757699966, "step": 740 }, { "epoch": 1.95, "learning_rate": 3.0452474992899645e-06, "logits/chosen": -1.686703085899353, "logits/rejected": -1.68540358543396, "logps/chosen": -32.28951644897461, "logps/rejected": -36.70497512817383, "loss": 2236.9605, "rewards/accuracies": 0.8125, "rewards/chosen": 0.008354658260941505, "rewards/margins": 0.028544824570417404, "rewards/rejected": -0.020190168172121048, "step": 750 }, { "epoch": 1.97, "learning_rate": 2.989809792446417e-06, "logits/chosen": -1.5596857070922852, "logits/rejected": -1.5550428628921509, "logps/chosen": -34.99411392211914, "logps/rejected": -37.3930778503418, "loss": 2205.4355, "rewards/accuracies": 0.8125, "rewards/chosen": 0.010043250396847725, "rewards/margins": 0.0318898968398571, "rewards/rejected": -0.021846650168299675, "step": 760 }, { "epoch": 2.0, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -1.6859843730926514, "logits/rejected": -1.690326452255249, "logps/chosen": -34.527626037597656, "logps/rejected": -35.3222541809082, "loss": 2274.9828, "rewards/accuracies": 0.7916666269302368, "rewards/chosen": 0.006860324647277594, "rewards/margins": 0.024134492501616478, "rewards/rejected": -0.017274167388677597, "step": 770 }, { "epoch": 2.03, "learning_rate": 2.878208065043501e-06, "logits/chosen": -1.633776068687439, "logits/rejected": -1.632145643234253, "logps/chosen": -32.2863883972168, "logps/rejected": -37.45145797729492, "loss": 2069.3428, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.01615205779671669, "rewards/margins": 0.04662010073661804, "rewards/rejected": -0.030468037351965904, "step": 780 }, { "epoch": 2.05, "learning_rate": 2.8221013802485974e-06, "logits/chosen": -1.6747426986694336, "logits/rejected": -1.6725879907608032, "logps/chosen": -31.845199584960938, "logps/rejected": -35.52173614501953, "loss": 2131.4416, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.015305367298424244, "rewards/margins": 0.039657000452280045, "rewards/rejected": -0.024351635947823524, "step": 790 }, { "epoch": 2.08, "learning_rate": 2.76582921478147e-06, "logits/chosen": -1.597703218460083, "logits/rejected": -1.592248797416687, "logps/chosen": -33.2725830078125, "logps/rejected": -33.86598205566406, "loss": 2173.4393, "rewards/accuracies": 0.875, "rewards/chosen": 0.011104973964393139, "rewards/margins": 0.03512220084667206, "rewards/rejected": -0.024017225950956345, "step": 800 }, { "epoch": 2.08, "eval_logits/chosen": -1.970989465713501, "eval_logits/rejected": -1.9664607048034668, "eval_logps/chosen": -35.21713638305664, "eval_logps/rejected": -39.03861618041992, "eval_loss": 2470.59033203125, "eval_rewards/accuracies": 0.5568937063217163, "eval_rewards/chosen": -0.011825831606984138, "eval_rewards/margins": 0.003394143423065543, "eval_rewards/rejected": -0.015219975262880325, "eval_runtime": 145.776, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 800 }, { "epoch": 2.1, "learning_rate": 2.7094204786572254e-06, "logits/chosen": -1.6900947093963623, "logits/rejected": -1.6973447799682617, "logps/chosen": -30.748676300048828, "logps/rejected": -37.139137268066406, "loss": 2112.8461, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.01379583589732647, "rewards/margins": 0.04262876883149147, "rewards/rejected": -0.02883293107151985, "step": 810 }, { "epoch": 2.13, "learning_rate": 2.6529041520546072e-06, "logits/chosen": -1.6644665002822876, "logits/rejected": -1.666691541671753, "logps/chosen": -31.436620712280273, "logps/rejected": -35.15371322631836, "loss": 2218.1766, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.011344591155648232, "rewards/margins": 0.030583670362830162, "rewards/rejected": -0.01923907920718193, "step": 820 }, { "epoch": 2.16, "learning_rate": 2.5963092704273302e-06, "logits/chosen": -1.5569541454315186, "logits/rejected": -1.5611451864242554, "logps/chosen": -31.492889404296875, "logps/rejected": -37.86591339111328, "loss": 2126.5043, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.010985675267875195, "rewards/margins": 0.04054299369454384, "rewards/rejected": -0.02955731749534607, "step": 830 }, { "epoch": 2.18, "learning_rate": 2.53966490958702e-06, "logits/chosen": -1.6258825063705444, "logits/rejected": -1.6220991611480713, "logps/chosen": -31.927608489990234, "logps/rejected": -35.32285690307617, "loss": 2187.909, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.010410415939986706, "rewards/margins": 0.033692531287670135, "rewards/rejected": -0.023282116279006004, "step": 840 }, { "epoch": 2.21, "learning_rate": 2.4830001707654135e-06, "logits/chosen": -1.6901594400405884, "logits/rejected": -1.6924806833267212, "logps/chosen": -31.357311248779297, "logps/rejected": -38.720489501953125, "loss": 2089.2604, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.014180228114128113, "rewards/margins": 0.04476577043533325, "rewards/rejected": -0.03058554232120514, "step": 850 }, { "epoch": 2.23, "learning_rate": 2.4263441656635054e-06, "logits/chosen": -1.5084383487701416, "logits/rejected": -1.5038378238677979, "logps/chosen": -35.46610641479492, "logps/rejected": -35.76911544799805, "loss": 2161.0381, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.00815567746758461, "rewards/margins": 0.03692782670259476, "rewards/rejected": -0.02877214550971985, "step": 860 }, { "epoch": 2.26, "learning_rate": 2.3697260014953107e-06, "logits/chosen": -1.5508906841278076, "logits/rejected": -1.5508755445480347, "logps/chosen": -34.66786575317383, "logps/rejected": -37.92455291748047, "loss": 2108.2408, "rewards/accuracies": 0.875, "rewards/chosen": 0.011749391444027424, "rewards/margins": 0.042532261461019516, "rewards/rejected": -0.03078286722302437, "step": 870 }, { "epoch": 2.29, "learning_rate": 2.3131747660339396e-06, "logits/chosen": -1.5863968133926392, "logits/rejected": -1.5747534036636353, "logps/chosen": -32.88094711303711, "logps/rejected": -36.167869567871094, "loss": 2086.3512, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.012256348505616188, "rewards/margins": 0.04471471160650253, "rewards/rejected": -0.032458364963531494, "step": 880 }, { "epoch": 2.31, "learning_rate": 2.256719512667651e-06, "logits/chosen": -1.6844234466552734, "logits/rejected": -1.6891088485717773, "logps/chosen": -32.565818786621094, "logps/rejected": -36.00508499145508, "loss": 2111.5299, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.00764118880033493, "rewards/margins": 0.04332723096013069, "rewards/rejected": -0.03568603843450546, "step": 890 }, { "epoch": 2.34, "learning_rate": 2.2003892454735786e-06, "logits/chosen": -1.6079037189483643, "logits/rejected": -1.6007716655731201, "logps/chosen": -33.532958984375, "logps/rejected": -35.84773254394531, "loss": 2065.36, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.013211173936724663, "rewards/margins": 0.047667164355516434, "rewards/rejected": -0.03445599228143692, "step": 900 }, { "epoch": 2.34, "eval_logits/chosen": -1.9166266918182373, "eval_logits/rejected": -1.9121689796447754, "eval_logps/chosen": -35.597782135009766, "eval_logps/rejected": -39.466796875, "eval_loss": 2467.73291015625, "eval_rewards/accuracies": 0.5598006844520569, "eval_rewards/chosen": -0.01563231088221073, "eval_rewards/margins": 0.003869474632665515, "eval_rewards/rejected": -0.019501786679029465, "eval_runtime": 145.8623, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 900 }, { "epoch": 2.36, "learning_rate": 2.1442129043167877e-06, "logits/chosen": -1.6010173559188843, "logits/rejected": -1.6011197566986084, "logps/chosen": -30.0789794921875, "logps/rejected": -38.56249237060547, "loss": 2050.1326, "rewards/accuracies": 0.9375, "rewards/chosen": 0.010881805792450905, "rewards/margins": 0.04960983246564865, "rewards/rejected": -0.0387280248105526, "step": 910 }, { "epoch": 2.39, "learning_rate": 2.088219349982323e-06, "logits/chosen": -1.5475326776504517, "logits/rejected": -1.5394935607910156, "logps/chosen": -31.198156356811523, "logps/rejected": -37.26829147338867, "loss": 2106.8049, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.006407910026609898, "rewards/margins": 0.04337712749838829, "rewards/rejected": -0.03696921840310097, "step": 920 }, { "epoch": 2.42, "learning_rate": 2.0324373493478803e-06, "logits/chosen": -1.7085663080215454, "logits/rejected": -1.7077171802520752, "logps/chosen": -29.088947296142578, "logps/rejected": -36.169918060302734, "loss": 2121.3875, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.009080884046852589, "rewards/margins": 0.04213147610425949, "rewards/rejected": -0.03305059298872948, "step": 930 }, { "epoch": 2.44, "learning_rate": 1.976895560604729e-06, "logits/chosen": -1.5875581502914429, "logits/rejected": -1.597825527191162, "logps/chosen": -33.56743621826172, "logps/rejected": -36.87736511230469, "loss": 2078.3281, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.008576255291700363, "rewards/margins": 0.04745348542928696, "rewards/rejected": -0.038877226412296295, "step": 940 }, { "epoch": 2.47, "learning_rate": 1.921622518534466e-06, "logits/chosen": -1.6309471130371094, "logits/rejected": -1.6342302560806274, "logps/chosen": -30.122472763061523, "logps/rejected": -35.17578887939453, "loss": 2139.8102, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.005252503324300051, "rewards/margins": 0.03964962065219879, "rewards/rejected": -0.03439711779356003, "step": 950 }, { "epoch": 2.49, "learning_rate": 1.8666466198491794e-06, "logits/chosen": -1.6128339767456055, "logits/rejected": -1.6084800958633423, "logps/chosen": -33.1923713684082, "logps/rejected": -37.553993225097656, "loss": 2098.0742, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.008828431367874146, "rewards/margins": 0.04530448839068413, "rewards/rejected": -0.03647606074810028, "step": 960 }, { "epoch": 2.52, "learning_rate": 1.8119961086025376e-06, "logits/chosen": -1.532700538635254, "logits/rejected": -1.5348151922225952, "logps/chosen": -31.818435668945312, "logps/rejected": -38.84191131591797, "loss": 2077.2633, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.009943163953721523, "rewards/margins": 0.04654636234045029, "rewards/rejected": -0.03660320118069649, "step": 970 }, { "epoch": 2.55, "learning_rate": 1.7576990616793139e-06, "logits/chosen": -1.5599863529205322, "logits/rejected": -1.5538241863250732, "logps/chosen": -35.342994689941406, "logps/rejected": -40.327735900878906, "loss": 2127.4625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0006207667174749076, "rewards/margins": 0.041278596967458725, "rewards/rejected": -0.040657833218574524, "step": 980 }, { "epoch": 2.57, "learning_rate": 1.7037833743707892e-06, "logits/chosen": -1.5475926399230957, "logits/rejected": -1.5419275760650635, "logps/chosen": -30.222143173217773, "logps/rejected": -39.89192581176758, "loss": 2074.9148, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.007808062247931957, "rewards/margins": 0.046828486025333405, "rewards/rejected": -0.03902042657136917, "step": 990 }, { "epoch": 2.6, "learning_rate": 1.6502767460434588e-06, "logits/chosen": -1.5285046100616455, "logits/rejected": -1.51847505569458, "logps/chosen": -31.368820190429688, "logps/rejected": -32.55973815917969, "loss": 2196.3246, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.0037986873649060726, "rewards/margins": 0.033398739993572235, "rewards/rejected": -0.029600050300359726, "step": 1000 }, { "epoch": 2.6, "eval_logits/chosen": -1.8825738430023193, "eval_logits/rejected": -1.878185510635376, "eval_logps/chosen": -35.89006423950195, "eval_logps/rejected": -39.809085845947266, "eval_loss": 2464.3681640625, "eval_rewards/accuracies": 0.5510797500610352, "eval_rewards/chosen": -0.01855510286986828, "eval_rewards/margins": 0.004369591362774372, "eval_rewards/rejected": -0.022924695163965225, "eval_runtime": 145.9322, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 1000 }, { "epoch": 2.62, "learning_rate": 1.5972066659083796e-06, "logits/chosen": -1.6132911443710327, "logits/rejected": -1.612853765487671, "logps/chosen": -31.150531768798828, "logps/rejected": -33.747276306152344, "loss": 2132.9832, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.006736672017723322, "rewards/margins": 0.04162462800741196, "rewards/rejected": -0.03488795459270477, "step": 1010 }, { "epoch": 2.65, "learning_rate": 1.5446003988985041e-06, "logits/chosen": -1.6631847620010376, "logits/rejected": -1.663709044456482, "logps/chosen": -31.313705444335938, "logps/rejected": -34.68809127807617, "loss": 2119.0834, "rewards/accuracies": 0.8125, "rewards/chosen": 0.006298714783042669, "rewards/margins": 0.04152151942253113, "rewards/rejected": -0.03522280603647232, "step": 1020 }, { "epoch": 2.68, "learning_rate": 1.4924849716612211e-06, "logits/chosen": -1.6204774379730225, "logits/rejected": -1.6248018741607666, "logps/chosen": -31.891056060791016, "logps/rejected": -30.86574363708496, "loss": 2204.4455, "rewards/accuracies": 0.75, "rewards/chosen": 0.003890159772709012, "rewards/margins": 0.03271043300628662, "rewards/rejected": -0.028820272535085678, "step": 1030 }, { "epoch": 2.7, "learning_rate": 1.440887158673332e-06, "logits/chosen": -1.6208614110946655, "logits/rejected": -1.6127662658691406, "logps/chosen": -30.53921127319336, "logps/rejected": -37.73412322998047, "loss": 2060.6334, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.006386814173310995, "rewards/margins": 0.048725761473178864, "rewards/rejected": -0.04233894869685173, "step": 1040 }, { "epoch": 2.73, "learning_rate": 1.3898334684855647e-06, "logits/chosen": -1.572749137878418, "logits/rejected": -1.5833861827850342, "logps/chosen": -33.21635437011719, "logps/rejected": -35.97629928588867, "loss": 2125.9623, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.003367505269125104, "rewards/margins": 0.04068039730191231, "rewards/rejected": -0.03731289133429527, "step": 1050 }, { "epoch": 2.75, "learning_rate": 1.3393501301037245e-06, "logits/chosen": -1.6456438302993774, "logits/rejected": -1.6367809772491455, "logps/chosen": -32.83705139160156, "logps/rejected": -41.05707550048828, "loss": 2036.1875, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.00663726544007659, "rewards/margins": 0.05389411002397537, "rewards/rejected": -0.047256845980882645, "step": 1060 }, { "epoch": 2.78, "learning_rate": 1.2894630795134454e-06, "logits/chosen": -1.5514529943466187, "logits/rejected": -1.5537471771240234, "logps/chosen": -34.93277359008789, "logps/rejected": -36.4191780090332, "loss": 2069.123, "rewards/accuracies": 0.8125, "rewards/chosen": 0.010565127246081829, "rewards/margins": 0.04777819663286209, "rewards/rejected": -0.03721306473016739, "step": 1070 }, { "epoch": 2.81, "learning_rate": 1.2401979463554984e-06, "logits/chosen": -1.6648813486099243, "logits/rejected": -1.6647241115570068, "logps/chosen": -32.27571487426758, "logps/rejected": -38.67870330810547, "loss": 2022.6195, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.007819265127182007, "rewards/margins": 0.053449440747499466, "rewards/rejected": -0.04563017934560776, "step": 1080 }, { "epoch": 2.83, "learning_rate": 1.1915800407584705e-06, "logits/chosen": -1.6493892669677734, "logits/rejected": -1.6531116962432861, "logps/chosen": -30.350088119506836, "logps/rejected": -37.505104064941406, "loss": 2092.1893, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.005432260222733021, "rewards/margins": 0.04500500112771988, "rewards/rejected": -0.03957274183630943, "step": 1090 }, { "epoch": 2.86, "learning_rate": 1.1436343403356019e-06, "logits/chosen": -1.637351632118225, "logits/rejected": -1.642260193824768, "logps/chosen": -33.443363189697266, "logps/rejected": -33.12295913696289, "loss": 2237.6512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0012564079370349646, "rewards/margins": 0.028469255194067955, "rewards/rejected": -0.027212847024202347, "step": 1100 }, { "epoch": 2.86, "eval_logits/chosen": -1.8689898252487183, "eval_logits/rejected": -1.8646091222763062, "eval_logps/chosen": -35.997581481933594, "eval_logps/rejected": -39.92316436767578, "eval_loss": 2464.299072265625, "eval_rewards/accuracies": 0.545265793800354, "eval_rewards/chosen": -0.01963029056787491, "eval_rewards/margins": 0.004435177426785231, "eval_rewards/rejected": -0.024065470322966576, "eval_runtime": 145.934, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 1100 }, { "epoch": 2.88, "learning_rate": 1.0963854773524548e-06, "logits/chosen": -1.6270654201507568, "logits/rejected": -1.6279323101043701, "logps/chosen": -31.9213809967041, "logps/rejected": -34.26952362060547, "loss": 2115.159, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.010187694802880287, "rewards/margins": 0.042723797261714935, "rewards/rejected": -0.032536108046770096, "step": 1110 }, { "epoch": 2.91, "learning_rate": 1.049857726072005e-06, "logits/chosen": -1.481483817100525, "logits/rejected": -1.4840071201324463, "logps/chosen": -33.81633377075195, "logps/rejected": -36.6799201965332, "loss": 2111.7611, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.007486463990062475, "rewards/margins": 0.04387300834059715, "rewards/rejected": -0.03638654574751854, "step": 1120 }, { "epoch": 2.94, "learning_rate": 1.0040749902836508e-06, "logits/chosen": -1.5083402395248413, "logits/rejected": -1.5064888000488281, "logps/chosen": -30.776952743530273, "logps/rejected": -34.5029182434082, "loss": 2184.3811, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.002347666770219803, "rewards/margins": 0.03571944683790207, "rewards/rejected": -0.03337177634239197, "step": 1130 }, { "epoch": 2.96, "learning_rate": 9.59060791022566e-07, "logits/chosen": -1.641847848892212, "logits/rejected": -1.637139916419983, "logps/chosen": -31.925174713134766, "logps/rejected": -36.643123626708984, "loss": 2063.1439, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.011146209202706814, "rewards/margins": 0.04803737998008728, "rewards/rejected": -0.03689116612076759, "step": 1140 }, { "epoch": 2.99, "learning_rate": 9.148382544856885e-07, "logits/chosen": -1.5244739055633545, "logits/rejected": -1.5154194831848145, "logps/chosen": -33.11268615722656, "logps/rejected": -34.98213195800781, "loss": 2134.0221, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0033905128948390484, "rewards/margins": 0.0402878001332283, "rewards/rejected": -0.03689728304743767, "step": 1150 }, { "epoch": 3.01, "learning_rate": 8.714301001505568e-07, "logits/chosen": -1.5689246654510498, "logits/rejected": -1.569645643234253, "logps/chosen": -33.039424896240234, "logps/rejected": -34.57393264770508, "loss": 2133.5518, "rewards/accuracies": 0.8416666984558105, "rewards/chosen": 0.0064233215525746346, "rewards/margins": 0.04014817252755165, "rewards/rejected": -0.03372485190629959, "step": 1160 }, { "epoch": 3.04, "learning_rate": 8.288586291031025e-07, "logits/chosen": -1.6524006128311157, "logits/rejected": -1.6470394134521484, "logps/chosen": -33.036277770996094, "logps/rejected": -36.106807708740234, "loss": 2169.9916, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.004896899685263634, "rewards/margins": 0.03655281290411949, "rewards/rejected": -0.03165591508150101, "step": 1170 }, { "epoch": 3.06, "learning_rate": 7.871457125803897e-07, "logits/chosen": -1.5274744033813477, "logits/rejected": -1.5358660221099854, "logps/chosen": -33.17569351196289, "logps/rejected": -35.91423416137695, "loss": 2159.4746, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0015730734448879957, "rewards/margins": 0.03710102289915085, "rewards/rejected": -0.03552795201539993, "step": 1180 }, { "epoch": 3.09, "learning_rate": 7.463127807341966e-07, "logits/chosen": -1.5684562921524048, "logits/rejected": -1.5627862215042114, "logps/chosen": -31.11408042907715, "logps/rejected": -37.03162384033203, "loss": 2063.1006, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.010595379397273064, "rewards/margins": 0.04807712510228157, "rewards/rejected": -0.03748174011707306, "step": 1190 }, { "epoch": 3.12, "learning_rate": 7.063808116212021e-07, "logits/chosen": -1.5203880071640015, "logits/rejected": -1.522077202796936, "logps/chosen": -32.758827209472656, "logps/rejected": -37.34809112548828, "loss": 2032.5133, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.007316121365875006, "rewards/margins": 0.05319654941558838, "rewards/rejected": -0.045880429446697235, "step": 1200 }, { "epoch": 3.12, "eval_logits/chosen": -1.866162657737732, "eval_logits/rejected": -1.861803650856018, "eval_logps/chosen": -36.02009963989258, "eval_logps/rejected": -39.944732666015625, "eval_loss": 2464.46533203125, "eval_rewards/accuracies": 0.5598006844520569, "eval_rewards/chosen": -0.019855517894029617, "eval_rewards/margins": 0.004425638820976019, "eval_rewards/rejected": -0.024281155318021774, "eval_runtime": 145.911, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 1200 }, { "epoch": 3.14, "learning_rate": 6.673703204254348e-07, "logits/chosen": -1.466104507446289, "logits/rejected": -1.4655678272247314, "logps/chosen": -34.974365234375, "logps/rejected": -36.99479293823242, "loss": 2027.0777, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.010781234130263329, "rewards/margins": 0.05363558605313301, "rewards/rejected": -0.04285435378551483, "step": 1210 }, { "epoch": 3.17, "learning_rate": 6.293013489185315e-07, "logits/chosen": -1.6160688400268555, "logits/rejected": -1.6096382141113281, "logps/chosen": -31.019649505615234, "logps/rejected": -37.333335876464844, "loss": 2040.7531, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.006840378977358341, "rewards/margins": 0.05138836055994034, "rewards/rejected": -0.04454797878861427, "step": 1220 }, { "epoch": 3.19, "learning_rate": 5.921934551632086e-07, "logits/chosen": -1.4815315008163452, "logits/rejected": -1.4708069562911987, "logps/chosen": -33.21098327636719, "logps/rejected": -37.002418518066406, "loss": 2020.3844, "rewards/accuracies": 0.875, "rewards/chosen": 0.011746999807655811, "rewards/margins": 0.05283288285136223, "rewards/rejected": -0.041085876524448395, "step": 1230 }, { "epoch": 3.22, "learning_rate": 5.560657034652405e-07, "logits/chosen": -1.5710773468017578, "logits/rejected": -1.565071702003479, "logps/chosen": -30.515600204467773, "logps/rejected": -32.57416534423828, "loss": 2164.5504, "rewards/accuracies": 0.75, "rewards/chosen": 0.001846333616413176, "rewards/margins": 0.03891240432858467, "rewards/rejected": -0.03706606850028038, "step": 1240 }, { "epoch": 3.25, "learning_rate": 5.2093665457911e-07, "logits/chosen": -1.586578130722046, "logits/rejected": -1.5945528745651245, "logps/chosen": -34.664546966552734, "logps/rejected": -34.95591354370117, "loss": 2119.1492, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.00730844447389245, "rewards/margins": 0.04168447107076645, "rewards/rejected": -0.034376028925180435, "step": 1250 }, { "epoch": 3.27, "learning_rate": 4.868243561723535e-07, "logits/chosen": -1.5772063732147217, "logits/rejected": -1.577383041381836, "logps/chosen": -32.863037109375, "logps/rejected": -37.241397857666016, "loss": 2082.3426, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0016552206361666322, "rewards/margins": 0.045920491218566895, "rewards/rejected": -0.044265273958444595, "step": 1260 }, { "epoch": 3.3, "learning_rate": 4.537463335535161e-07, "logits/chosen": -1.5012638568878174, "logits/rejected": -1.5000605583190918, "logps/chosen": -32.05634689331055, "logps/rejected": -37.78838348388672, "loss": 2023.8406, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.010344896465539932, "rewards/margins": 0.05302266404032707, "rewards/rejected": -0.04267776757478714, "step": 1270 }, { "epoch": 3.32, "learning_rate": 4.217195806684629e-07, "logits/chosen": -1.4007158279418945, "logits/rejected": -1.3967105150222778, "logps/chosen": -34.35404586791992, "logps/rejected": -34.51953887939453, "loss": 2094.5336, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.008833990432322025, "rewards/margins": 0.04476445913314819, "rewards/rejected": -0.03593046963214874, "step": 1280 }, { "epoch": 3.35, "learning_rate": 3.907605513696808e-07, "logits/chosen": -1.5938528776168823, "logits/rejected": -1.5794765949249268, "logps/chosen": -34.033790588378906, "logps/rejected": -39.640159606933594, "loss": 2033.0867, "rewards/accuracies": 0.875, "rewards/chosen": 0.00313788210041821, "rewards/margins": 0.05188627913594246, "rewards/rejected": -0.04874839633703232, "step": 1290 }, { "epoch": 3.38, "learning_rate": 3.6088515096305675e-07, "logits/chosen": -1.5395238399505615, "logits/rejected": -1.5438177585601807, "logps/chosen": -32.82494354248047, "logps/rejected": -41.31450653076172, "loss": 1967.852, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.006926815025508404, "rewards/margins": 0.059039629995822906, "rewards/rejected": -0.05211281776428223, "step": 1300 }, { "epoch": 3.38, "eval_logits/chosen": -1.8651809692382812, "eval_logits/rejected": -1.8608282804489136, "eval_logps/chosen": -36.00510025024414, "eval_logps/rejected": -39.96323013305664, "eval_loss": 2461.20361328125, "eval_rewards/accuracies": 0.5539867281913757, "eval_rewards/chosen": -0.01970548741519451, "eval_rewards/margins": 0.004760634154081345, "eval_rewards/rejected": -0.024466121569275856, "eval_runtime": 145.7832, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 1300 }, { "epoch": 3.4, "learning_rate": 3.321087280364757e-07, "logits/chosen": -1.519902229309082, "logits/rejected": -1.520318627357483, "logps/chosen": -35.439937591552734, "logps/rejected": -41.7154426574707, "loss": 2029.7744, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.008040779270231724, "rewards/margins": 0.054354071617126465, "rewards/rejected": -0.046313293278217316, "step": 1310 }, { "epoch": 3.43, "learning_rate": 3.044460665744284e-07, "logits/chosen": -1.601548194885254, "logits/rejected": -1.6004295349121094, "logps/chosen": -31.515766143798828, "logps/rejected": -35.187618255615234, "loss": 2068.9873, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.005780898500233889, "rewards/margins": 0.04812353104352951, "rewards/rejected": -0.04234262555837631, "step": 1320 }, { "epoch": 3.45, "learning_rate": 2.779113783626916e-07, "logits/chosen": -1.521244764328003, "logits/rejected": -1.5227617025375366, "logps/chosen": -33.48499298095703, "logps/rejected": -37.73661804199219, "loss": 2050.5445, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.008892608806490898, "rewards/margins": 0.04977206513285637, "rewards/rejected": -0.04087945073843002, "step": 1330 }, { "epoch": 3.48, "learning_rate": 2.5251829568697204e-07, "logits/chosen": -1.5785353183746338, "logits/rejected": -1.577487587928772, "logps/chosen": -30.377460479736328, "logps/rejected": -35.70696258544922, "loss": 2086.1504, "rewards/accuracies": 0.875, "rewards/chosen": 0.008212093263864517, "rewards/margins": 0.04536201059818268, "rewards/rejected": -0.03714991733431816, "step": 1340 }, { "epoch": 3.51, "learning_rate": 2.2827986432927774e-07, "logits/chosen": -1.5923887491226196, "logits/rejected": -1.5776017904281616, "logps/chosen": -34.099876403808594, "logps/rejected": -41.499168395996094, "loss": 2014.8814, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.002167941303923726, "rewards/margins": 0.05451526492834091, "rewards/rejected": -0.052347324788570404, "step": 1350 }, { "epoch": 3.53, "learning_rate": 2.0520853686560177e-07, "logits/chosen": -1.5695116519927979, "logits/rejected": -1.5808777809143066, "logps/chosen": -31.036209106445312, "logps/rejected": -36.333526611328125, "loss": 2047.0217, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.010322622954845428, "rewards/margins": 0.05126044154167175, "rewards/rejected": -0.04093782603740692, "step": 1360 }, { "epoch": 3.56, "learning_rate": 1.833161662683672e-07, "logits/chosen": -1.6847254037857056, "logits/rejected": -1.6842361688613892, "logps/chosen": -30.7957763671875, "logps/rejected": -41.20909881591797, "loss": 1917.1037, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01234652940183878, "rewards/margins": 0.06612871587276459, "rewards/rejected": -0.05378218740224838, "step": 1370 }, { "epoch": 3.58, "learning_rate": 1.626139998169246e-07, "logits/chosen": -1.5517163276672363, "logits/rejected": -1.55906081199646, "logps/chosen": -33.17098617553711, "logps/rejected": -42.29136657714844, "loss": 1991.1635, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.010094953700900078, "rewards/margins": 0.05851215124130249, "rewards/rejected": -0.04841719567775726, "step": 1380 }, { "epoch": 3.61, "learning_rate": 1.4311267331922535e-07, "logits/chosen": -1.509541392326355, "logits/rejected": -1.5056698322296143, "logps/chosen": -33.6932258605957, "logps/rejected": -35.26158905029297, "loss": 2073.9217, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.010739867575466633, "rewards/margins": 0.04738181084394455, "rewards/rejected": -0.03664194419980049, "step": 1390 }, { "epoch": 3.64, "learning_rate": 1.2482220564763669e-07, "logits/chosen": -1.6483919620513916, "logits/rejected": -1.6464850902557373, "logps/chosen": -30.515304565429688, "logps/rejected": -35.92851257324219, "loss": 2084.0914, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.007812450639903545, "rewards/margins": 0.045652881264686584, "rewards/rejected": -0.03784043341875076, "step": 1400 }, { "epoch": 3.64, "eval_logits/chosen": -1.8656350374221802, "eval_logits/rejected": -1.8612688779830933, "eval_logps/chosen": -36.0242919921875, "eval_logps/rejected": -39.967525482177734, "eval_loss": 2462.612060546875, "eval_rewards/accuracies": 0.5598006844520569, "eval_rewards/chosen": -0.019897375255823135, "eval_rewards/margins": 0.00461164116859436, "eval_rewards/rejected": -0.024509014561772346, "eval_runtime": 145.8827, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 1400 }, { "epoch": 3.66, "learning_rate": 1.0775199359171346e-07, "logits/chosen": -1.603907823562622, "logits/rejected": -1.5997169017791748, "logps/chosen": -32.91996765136719, "logps/rejected": -33.04853820800781, "loss": 2124.1023, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.00722561776638031, "rewards/margins": 0.042166419327259064, "rewards/rejected": -0.034940801560878754, "step": 1410 }, { "epoch": 3.69, "learning_rate": 9.191080703056604e-08, "logits/chosen": -1.5563673973083496, "logits/rejected": -1.5573166608810425, "logps/chosen": -32.476478576660156, "logps/rejected": -38.40611267089844, "loss": 2100.1363, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.008883940055966377, "rewards/margins": 0.04370499402284622, "rewards/rejected": -0.034821052104234695, "step": 1420 }, { "epoch": 3.71, "learning_rate": 7.730678442730539e-08, "logits/chosen": -1.5083153247833252, "logits/rejected": -1.5021404027938843, "logps/chosen": -33.119529724121094, "logps/rejected": -41.40352249145508, "loss": 2006.5551, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.010222419165074825, "rewards/margins": 0.057363539934158325, "rewards/rejected": -0.047141119837760925, "step": 1430 }, { "epoch": 3.74, "learning_rate": 6.394742864787806e-08, "logits/chosen": -1.5188112258911133, "logits/rejected": -1.5131750106811523, "logps/chosen": -28.496017456054688, "logps/rejected": -35.698753356933594, "loss": 2071.1072, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.008172214962542057, "rewards/margins": 0.047853223979473114, "rewards/rejected": -0.03968100994825363, "step": 1440 }, { "epoch": 3.77, "learning_rate": 5.183960310644748e-08, "logits/chosen": -1.5658049583435059, "logits/rejected": -1.5554416179656982, "logps/chosen": -32.46862030029297, "logps/rejected": -39.8042106628418, "loss": 2087.5406, "rewards/accuracies": 0.875, "rewards/chosen": 0.00012906994379591197, "rewards/margins": 0.04531756415963173, "rewards/rejected": -0.04518849402666092, "step": 1450 }, { "epoch": 3.79, "learning_rate": 4.098952823928693e-08, "logits/chosen": -1.5264514684677124, "logits/rejected": -1.5234899520874023, "logps/chosen": -32.81951141357422, "logps/rejected": -34.28097915649414, "loss": 2147.4246, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.004902643617242575, "rewards/margins": 0.03880416229367256, "rewards/rejected": -0.03390152007341385, "step": 1460 }, { "epoch": 3.82, "learning_rate": 3.1402778309014284e-08, "logits/chosen": -1.5864768028259277, "logits/rejected": -1.592008352279663, "logps/chosen": -30.852685928344727, "logps/rejected": -36.960899353027344, "loss": 2034.7088, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.012164896354079247, "rewards/margins": 0.05214967206120491, "rewards/rejected": -0.03998477756977081, "step": 1470 }, { "epoch": 3.84, "learning_rate": 2.3084278540791427e-08, "logits/chosen": -1.5875871181488037, "logits/rejected": -1.5970607995986938, "logps/chosen": -30.86104393005371, "logps/rejected": -33.266883850097656, "loss": 2088.916, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.008342106826603413, "rewards/margins": 0.044725269079208374, "rewards/rejected": -0.03638315945863724, "step": 1480 }, { "epoch": 3.87, "learning_rate": 1.6038302591975807e-08, "logits/chosen": -1.5217053890228271, "logits/rejected": -1.5153101682662964, "logps/chosen": -33.224308013916016, "logps/rejected": -36.06281661987305, "loss": 2109.2895, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.004321801941841841, "rewards/margins": 0.042845211923122406, "rewards/rejected": -0.03852340579032898, "step": 1490 }, { "epoch": 3.9, "learning_rate": 1.0268470356514237e-08, "logits/chosen": -1.5790612697601318, "logits/rejected": -1.5761630535125732, "logps/chosen": -33.064552307128906, "logps/rejected": -37.95295333862305, "loss": 2032.7156, "rewards/accuracies": 0.875, "rewards/chosen": 0.005610200576484203, "rewards/margins": 0.05253750830888748, "rewards/rejected": -0.046927306801080704, "step": 1500 }, { "epoch": 3.9, "eval_logits/chosen": -1.8648215532302856, "eval_logits/rejected": -1.8604679107666016, "eval_logps/chosen": -36.019161224365234, "eval_logps/rejected": -39.95817184448242, "eval_loss": 2463.105712890625, "eval_rewards/accuracies": 0.565614640712738, "eval_rewards/chosen": -0.019846076145768166, "eval_rewards/margins": 0.004569429438561201, "eval_rewards/rejected": -0.024415504187345505, "eval_runtime": 145.9094, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 1500 }, { "epoch": 3.92, "learning_rate": 5.777746105209147e-09, "logits/chosen": -1.645821213722229, "logits/rejected": -1.6461843252182007, "logps/chosen": -28.770349502563477, "logps/rejected": -36.86784744262695, "loss": 2038.2139, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.010459242388606071, "rewards/margins": 0.05173317715525627, "rewards/rejected": -0.04127394035458565, "step": 1510 }, { "epoch": 3.95, "learning_rate": 2.5684369628148352e-09, "logits/chosen": -1.5073591470718384, "logits/rejected": -1.5066778659820557, "logps/chosen": -32.11809158325195, "logps/rejected": -37.76689910888672, "loss": 2069.6059, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.00923779234290123, "rewards/margins": 0.04816945642232895, "rewards/rejected": -0.03893166407942772, "step": 1520 }, { "epoch": 3.97, "learning_rate": 6.421917227455999e-10, "logits/chosen": -1.6536405086517334, "logits/rejected": -1.6510140895843506, "logps/chosen": -30.886306762695312, "logps/rejected": -35.38301467895508, "loss": 2094.1572, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0058775185607373714, "rewards/margins": 0.04487111419439316, "rewards/rejected": -0.03899358958005905, "step": 1530 }, { "epoch": 4.0, "learning_rate": 0.0, "logits/chosen": -1.638646125793457, "logits/rejected": -1.6395971775054932, "logps/chosen": -30.43972396850586, "logps/rejected": -33.02666473388672, "loss": 2186.526, "rewards/accuracies": 0.82916659116745, "rewards/chosen": 0.001802150160074234, "rewards/margins": 0.034166958183050156, "rewards/rejected": -0.03236480802297592, "step": 1540 }, { "epoch": 4.0, "step": 1540, "total_flos": 0.0, "train_loss": 1756.4536297686689, "train_runtime": 10797.265, "train_samples_per_second": 1.141, "train_steps_per_second": 0.143 } ], "logging_steps": 10, "max_steps": 1540, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }