{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994761655316919, "eval_steps": 100, "global_step": 954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010476689366160294, "grad_norm": 1.4578042030334473, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -0.36581629514694214, "logits/rejected": -0.31856024265289307, "logps/chosen": -124.0562515258789, "logps/rejected": -485.2004699707031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.010476689366160294, "grad_norm": 1.4763652086257935, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -0.7495560050010681, "logits/rejected": -0.6286604404449463, "logps/chosen": -243.79879760742188, "logps/rejected": -251.34925842285156, "loss": 0.6952, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": -0.00359285157173872, "rewards/margins": -0.011229949072003365, "rewards/rejected": 0.007637098431587219, "step": 10 }, { "epoch": 0.020953378732320588, "grad_norm": 1.6240452527999878, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.7679840326309204, "logits/rejected": -0.8156970739364624, "logps/chosen": -178.1337890625, "logps/rejected": -190.29257202148438, "loss": 0.6956, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.009887685999274254, "rewards/margins": 0.01590128242969513, "rewards/rejected": -0.006013598758727312, "step": 20 }, { "epoch": 0.03143006809848088, "grad_norm": 1.5617218017578125, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -0.8484021425247192, "logits/rejected": -0.851048469543457, "logps/chosen": -224.95425415039062, "logps/rejected": -236.6777801513672, "loss": 0.6928, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.013385577127337456, "rewards/margins": 0.01709272526204586, "rewards/rejected": -0.0037071462720632553, "step": 30 }, { "epoch": 0.041906757464641176, "grad_norm": 1.8790456056594849, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.7920663356781006, "logits/rejected": -0.8001823425292969, "logps/chosen": -242.0943145751953, "logps/rejected": -276.2791748046875, "loss": 0.6919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0071295686066150665, "rewards/margins": 0.007320129778236151, "rewards/rejected": -0.01444969791918993, "step": 40 }, { "epoch": 0.05238344683080147, "grad_norm": 1.6932095289230347, "learning_rate": 5.208333333333334e-07, "logits/chosen": -0.765317440032959, "logits/rejected": -0.7238121628761292, "logps/chosen": -172.94967651367188, "logps/rejected": -271.79119873046875, "loss": 0.6915, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.024208534508943558, "rewards/margins": 0.006762148346751928, "rewards/rejected": 0.017446385696530342, "step": 50 }, { "epoch": 0.06286013619696176, "grad_norm": 2.107041358947754, "learning_rate": 6.249999999999999e-07, "logits/chosen": -0.618078887462616, "logits/rejected": -0.6146777272224426, "logps/chosen": -166.27442932128906, "logps/rejected": -234.01748657226562, "loss": 0.6866, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.02008468471467495, "rewards/margins": 0.006810173392295837, "rewards/rejected": 0.013274513185024261, "step": 60 }, { "epoch": 0.07333682556312206, "grad_norm": 1.22433340549469, "learning_rate": 7.291666666666666e-07, "logits/chosen": -0.692817211151123, "logits/rejected": -0.8424701690673828, "logps/chosen": -199.5739288330078, "logps/rejected": -271.6493835449219, "loss": 0.6856, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.026040812954306602, "rewards/margins": -0.006783752236515284, "rewards/rejected": 0.0328245684504509, "step": 70 }, { "epoch": 0.08381351492928235, "grad_norm": 1.566157341003418, "learning_rate": 8.333333333333333e-07, "logits/chosen": -0.6816123723983765, "logits/rejected": -0.6406960487365723, "logps/chosen": -182.77035522460938, "logps/rejected": -213.7708282470703, "loss": 0.6832, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04936245083808899, "rewards/margins": 0.013176756910979748, "rewards/rejected": 0.03618568927049637, "step": 80 }, { "epoch": 0.09429020429544265, "grad_norm": 1.6193081140518188, "learning_rate": 9.374999999999999e-07, "logits/chosen": -0.5649391412734985, "logits/rejected": -0.6288346648216248, "logps/chosen": -161.33181762695312, "logps/rejected": -234.61001586914062, "loss": 0.663, "rewards/accuracies": 0.75, "rewards/chosen": 0.1065601259469986, "rewards/margins": 0.059998225420713425, "rewards/rejected": 0.046561889350414276, "step": 90 }, { "epoch": 0.10476689366160294, "grad_norm": 1.650424838066101, "learning_rate": 9.999463737538052e-07, "logits/chosen": -0.6438407301902771, "logits/rejected": -0.6645095348358154, "logps/chosen": -206.02133178710938, "logps/rejected": -222.8872528076172, "loss": 0.6553, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16100379824638367, "rewards/margins": 0.10496912896633148, "rewards/rejected": 0.05603468418121338, "step": 100 }, { "epoch": 0.10476689366160294, "eval_logits/chosen": -0.6915007829666138, "eval_logits/rejected": -0.7285872101783752, "eval_logps/chosen": -256.8047790527344, "eval_logps/rejected": -272.9460144042969, "eval_loss": 0.6206383109092712, "eval_rewards/accuracies": 0.7735294103622437, "eval_rewards/chosen": 0.2425609529018402, "eval_rewards/margins": 0.1632746458053589, "eval_rewards/rejected": 0.07928629219532013, "eval_runtime": 268.6581, "eval_samples_per_second": 10.124, "eval_steps_per_second": 1.266, "step": 100 }, { "epoch": 0.11524358302776323, "grad_norm": 2.010002374649048, "learning_rate": 9.993432105822034e-07, "logits/chosen": -0.6346007585525513, "logits/rejected": -0.5853306651115417, "logps/chosen": -197.63999938964844, "logps/rejected": -212.1625213623047, "loss": 0.6432, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14274722337722778, "rewards/margins": 0.055570416152477264, "rewards/rejected": 0.08717679232358932, "step": 110 }, { "epoch": 0.12572027239392353, "grad_norm": 1.9033344984054565, "learning_rate": 9.980706626858607e-07, "logits/chosen": -0.6291040182113647, "logits/rejected": -0.6428237557411194, "logps/chosen": -154.11695861816406, "logps/rejected": -208.1122589111328, "loss": 0.6164, "rewards/accuracies": 0.75, "rewards/chosen": 0.27889788150787354, "rewards/margins": 0.17130421102046967, "rewards/rejected": 0.10759365558624268, "step": 120 }, { "epoch": 0.1361969617600838, "grad_norm": 2.405748128890991, "learning_rate": 9.961304359538434e-07, "logits/chosen": -0.777423083782196, "logits/rejected": -0.8043774366378784, "logps/chosen": -153.481689453125, "logps/rejected": -231.5235595703125, "loss": 0.5793, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4585164189338684, "rewards/margins": 0.322237104177475, "rewards/rejected": 0.13627928495407104, "step": 130 }, { "epoch": 0.14667365112624411, "grad_norm": 1.5299396514892578, "learning_rate": 9.935251313189563e-07, "logits/chosen": -0.7013922929763794, "logits/rejected": -0.743097722530365, "logps/chosen": -180.298583984375, "logps/rejected": -223.975830078125, "loss": 0.5678, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.541010856628418, "rewards/margins": 0.32118964195251465, "rewards/rejected": 0.2198212444782257, "step": 140 }, { "epoch": 0.1571503404924044, "grad_norm": 1.8803837299346924, "learning_rate": 9.902582412711118e-07, "logits/chosen": -0.6647799611091614, "logits/rejected": -0.6827102899551392, "logps/chosen": -217.7650909423828, "logps/rejected": -252.84951782226562, "loss": 0.5741, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5824534296989441, "rewards/margins": 0.31880098581314087, "rewards/rejected": 0.2636524736881256, "step": 150 }, { "epoch": 0.1676270298585647, "grad_norm": 1.6350294351577759, "learning_rate": 9.86334145175542e-07, "logits/chosen": -0.5989512205123901, "logits/rejected": -0.5124181509017944, "logps/chosen": -170.99246215820312, "logps/rejected": -202.6414337158203, "loss": 0.5532, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.755856454372406, "rewards/margins": 0.5548487901687622, "rewards/rejected": 0.20100775361061096, "step": 160 }, { "epoch": 0.17810371922472498, "grad_norm": 1.8998695611953735, "learning_rate": 9.817581034021272e-07, "logits/chosen": -0.6117348670959473, "logits/rejected": -0.5613563656806946, "logps/chosen": -210.92129516601562, "logps/rejected": -252.5787811279297, "loss": 0.5309, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7920325994491577, "rewards/margins": 0.46702486276626587, "rewards/rejected": 0.32500773668289185, "step": 170 }, { "epoch": 0.1885804085908853, "grad_norm": 2.614863395690918, "learning_rate": 9.765362502737097e-07, "logits/chosen": -0.6537714004516602, "logits/rejected": -0.6021772623062134, "logps/chosen": -143.68064880371094, "logps/rejected": -177.00177001953125, "loss": 0.5391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7442184090614319, "rewards/margins": 0.3370700478553772, "rewards/rejected": 0.4071483612060547, "step": 180 }, { "epoch": 0.19905709795704557, "grad_norm": 1.3411009311676025, "learning_rate": 9.706755858428485e-07, "logits/chosen": -0.6349914073944092, "logits/rejected": -0.651036262512207, "logps/chosen": -182.64413452148438, "logps/rejected": -204.3733673095703, "loss": 0.5024, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9470041990280151, "rewards/margins": 0.48141175508499146, "rewards/rejected": 0.4655924439430237, "step": 190 }, { "epoch": 0.20953378732320588, "grad_norm": 1.8027269840240479, "learning_rate": 9.641839665080363e-07, "logits/chosen": -0.658828616142273, "logits/rejected": -0.6614493727684021, "logps/chosen": -193.62255859375, "logps/rejected": -240.878173828125, "loss": 0.4736, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.281610131263733, "rewards/margins": 0.7956588864326477, "rewards/rejected": 0.48595109581947327, "step": 200 }, { "epoch": 0.20953378732320588, "eval_logits/chosen": -0.6532002091407776, "eval_logits/rejected": -0.6974676847457886, "eval_logps/chosen": -236.9672393798828, "eval_logps/rejected": -266.162109375, "eval_loss": 0.4578969180583954, "eval_rewards/accuracies": 0.8352941274642944, "eval_rewards/chosen": 1.2344386577606201, "eval_rewards/margins": 0.8159562349319458, "eval_rewards/rejected": 0.4184825122356415, "eval_runtime": 261.8559, "eval_samples_per_second": 10.387, "eval_steps_per_second": 1.298, "step": 200 }, { "epoch": 0.22001047668936616, "grad_norm": 1.2251081466674805, "learning_rate": 9.570700944819582e-07, "logits/chosen": -0.5962850451469421, "logits/rejected": -0.6295059323310852, "logps/chosen": -156.5807647705078, "logps/rejected": -211.43453979492188, "loss": 0.4887, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.0516936779022217, "rewards/margins": 0.5657483339309692, "rewards/rejected": 0.4859454035758972, "step": 210 }, { "epoch": 0.23048716605552647, "grad_norm": 2.3638861179351807, "learning_rate": 9.493435061259129e-07, "logits/chosen": -0.6848911643028259, "logits/rejected": -0.7088301777839661, "logps/chosen": -153.29782104492188, "logps/rejected": -231.8919677734375, "loss": 0.4621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.2177709341049194, "rewards/margins": 0.788833737373352, "rewards/rejected": 0.42893725633621216, "step": 220 }, { "epoch": 0.24096385542168675, "grad_norm": 1.3815181255340576, "learning_rate": 9.4101455916603e-07, "logits/chosen": -0.6574471592903137, "logits/rejected": -0.6578237414360046, "logps/chosen": -185.6906280517578, "logps/rejected": -232.35885620117188, "loss": 0.4429, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.2863850593566895, "rewards/margins": 0.6995311975479126, "rewards/rejected": 0.5868538618087769, "step": 230 }, { "epoch": 0.25144054478784705, "grad_norm": 1.7935489416122437, "learning_rate": 9.320944188084241e-07, "logits/chosen": -0.4765821099281311, "logits/rejected": -0.4639780521392822, "logps/chosen": -164.8132781982422, "logps/rejected": -210.4600830078125, "loss": 0.4458, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.1567394733428955, "rewards/margins": 0.7783015966415405, "rewards/rejected": 0.3784378468990326, "step": 240 }, { "epoch": 0.26191723415400736, "grad_norm": 1.4800844192504883, "learning_rate": 9.225950427718974e-07, "logits/chosen": -0.634162425994873, "logits/rejected": -0.6297306418418884, "logps/chosen": -173.91427612304688, "logps/rejected": -200.29165649414062, "loss": 0.4509, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3209316730499268, "rewards/margins": 1.1504794359207153, "rewards/rejected": 0.17045211791992188, "step": 250 }, { "epoch": 0.2723939235201676, "grad_norm": 1.3921798467636108, "learning_rate": 9.125291652582547e-07, "logits/chosen": -0.6399877667427063, "logits/rejected": -0.6667548418045044, "logps/chosen": -167.67312622070312, "logps/rejected": -201.6714630126953, "loss": 0.462, "rewards/accuracies": 0.875, "rewards/chosen": 1.2268803119659424, "rewards/margins": 0.8962867856025696, "rewards/rejected": 0.3305933177471161, "step": 260 }, { "epoch": 0.2828706128863279, "grad_norm": 1.4028617143630981, "learning_rate": 9.019102798817195e-07, "logits/chosen": -0.6005131602287292, "logits/rejected": -0.5953696966171265, "logps/chosen": -119.4700698852539, "logps/rejected": -186.04421997070312, "loss": 0.4247, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.288698434829712, "rewards/margins": 0.703940749168396, "rewards/rejected": 0.5847576260566711, "step": 270 }, { "epoch": 0.29334730225248823, "grad_norm": 1.3027068376541138, "learning_rate": 8.90752621580335e-07, "logits/chosen": -0.7007671594619751, "logits/rejected": -0.6778917908668518, "logps/chosen": -146.1395263671875, "logps/rejected": -220.3456268310547, "loss": 0.4042, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3857734203338623, "rewards/margins": 1.2629530429840088, "rewards/rejected": 0.12282057106494904, "step": 280 }, { "epoch": 0.3038239916186485, "grad_norm": 1.3440015316009521, "learning_rate": 8.79071147533597e-07, "logits/chosen": -0.8330009579658508, "logits/rejected": -0.7537879943847656, "logps/chosen": -229.95028686523438, "logps/rejected": -292.24041748046875, "loss": 0.4378, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5092631578445435, "rewards/margins": 1.075532078742981, "rewards/rejected": 0.4337310791015625, "step": 290 }, { "epoch": 0.3143006809848088, "grad_norm": 1.3541631698608398, "learning_rate": 8.668815171119019e-07, "logits/chosen": -0.7203774452209473, "logits/rejected": -0.6443927884101868, "logps/chosen": -226.54098510742188, "logps/rejected": -266.37384033203125, "loss": 0.4158, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6005226373672485, "rewards/margins": 1.0506787300109863, "rewards/rejected": 0.549843966960907, "step": 300 }, { "epoch": 0.3143006809848088, "eval_logits/chosen": -0.6811380982398987, "eval_logits/rejected": -0.7182894945144653, "eval_logps/chosen": -229.1289825439453, "eval_logps/rejected": -265.5471496582031, "eval_loss": 0.40298667550086975, "eval_rewards/accuracies": 0.8500000238418579, "eval_rewards/chosen": 1.626351237297058, "eval_rewards/margins": 1.1771219968795776, "eval_rewards/rejected": 0.4492292106151581, "eval_runtime": 261.678, "eval_samples_per_second": 10.394, "eval_steps_per_second": 1.299, "step": 300 }, { "epoch": 0.3247773703509691, "grad_norm": 1.035560965538025, "learning_rate": 8.54200070884685e-07, "logits/chosen": -0.6642839908599854, "logits/rejected": -0.8109550476074219, "logps/chosen": -201.55926513671875, "logps/rejected": -340.20477294921875, "loss": 0.3971, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4134103059768677, "rewards/margins": 0.8630633354187012, "rewards/rejected": 0.5503469705581665, "step": 310 }, { "epoch": 0.3352540597171294, "grad_norm": 1.3532744646072388, "learning_rate": 8.410438087153911e-07, "logits/chosen": -0.7970777750015259, "logits/rejected": -0.7839875221252441, "logps/chosen": -165.8358917236328, "logps/rejected": -237.0997772216797, "loss": 0.3867, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.637261986732483, "rewards/margins": 1.2106577157974243, "rewards/rejected": 0.4266042113304138, "step": 320 }, { "epoch": 0.34573074908328966, "grad_norm": 1.0666495561599731, "learning_rate": 8.274303669726426e-07, "logits/chosen": -0.7586744427680969, "logits/rejected": -0.74485844373703, "logps/chosen": -243.2891082763672, "logps/rejected": -259.55072021484375, "loss": 0.3748, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.8532311916351318, "rewards/margins": 1.4373507499694824, "rewards/rejected": 0.4158805310726166, "step": 330 }, { "epoch": 0.35620743844944996, "grad_norm": 1.0840498208999634, "learning_rate": 8.133779948881513e-07, "logits/chosen": -0.6612862944602966, "logits/rejected": -0.6942886710166931, "logps/chosen": -118.1360855102539, "logps/rejected": -170.34249877929688, "loss": 0.3657, "rewards/accuracies": 0.875, "rewards/chosen": 1.7111928462982178, "rewards/margins": 1.4025405645370483, "rewards/rejected": 0.3086521625518799, "step": 340 }, { "epoch": 0.3666841278156103, "grad_norm": 0.950655460357666, "learning_rate": 7.989055300930704e-07, "logits/chosen": -0.6502361297607422, "logits/rejected": -0.576398491859436, "logps/chosen": -151.99462890625, "logps/rejected": -209.2263946533203, "loss": 0.3509, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8276745080947876, "rewards/margins": 1.8597872257232666, "rewards/rejected": -0.03211268037557602, "step": 350 }, { "epoch": 0.3771608171817706, "grad_norm": 1.2855123281478882, "learning_rate": 7.840323733655778e-07, "logits/chosen": -0.6127463579177856, "logits/rejected": -0.740655779838562, "logps/chosen": -121.72711181640625, "logps/rejected": -187.34786987304688, "loss": 0.4072, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.860853910446167, "rewards/margins": 1.4494211673736572, "rewards/rejected": 0.41143256425857544, "step": 360 }, { "epoch": 0.38763750654793083, "grad_norm": 1.1689319610595703, "learning_rate": 7.687784626235447e-07, "logits/chosen": -0.7798863649368286, "logits/rejected": -0.8257772326469421, "logps/chosen": -149.10655212402344, "logps/rejected": -211.77951049804688, "loss": 0.3556, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8038825988769531, "rewards/margins": 1.4737164974212646, "rewards/rejected": 0.33016616106033325, "step": 370 }, { "epoch": 0.39811419591409114, "grad_norm": 1.5824625492095947, "learning_rate": 7.531642461971514e-07, "logits/chosen": -0.8133307695388794, "logits/rejected": -0.7661614418029785, "logps/chosen": -186.41244506835938, "logps/rejected": -248.0023956298828, "loss": 0.3885, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.8458435535430908, "rewards/margins": 1.251628041267395, "rewards/rejected": 0.5942155718803406, "step": 380 }, { "epoch": 0.40859088528025145, "grad_norm": 1.0341404676437378, "learning_rate": 7.372106554172801e-07, "logits/chosen": -0.7007779479026794, "logits/rejected": -0.736503541469574, "logps/chosen": -135.133056640625, "logps/rejected": -209.7045440673828, "loss": 0.3382, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8146415948867798, "rewards/margins": 1.6613012552261353, "rewards/rejected": 0.15334045886993408, "step": 390 }, { "epoch": 0.41906757464641176, "grad_norm": 1.4435651302337646, "learning_rate": 7.209390765564318e-07, "logits/chosen": -0.6681721210479736, "logits/rejected": -0.6735861897468567, "logps/chosen": -164.41403198242188, "logps/rejected": -220.19692993164062, "loss": 0.3913, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.8460575342178345, "rewards/margins": 1.4937286376953125, "rewards/rejected": 0.35232874751091003, "step": 400 }, { "epoch": 0.41906757464641176, "eval_logits/chosen": -0.6677387356758118, "eval_logits/rejected": -0.7164434790611267, "eval_logps/chosen": -226.38108825683594, "eval_logps/rejected": -267.6443786621094, "eval_loss": 0.36976274847984314, "eval_rewards/accuracies": 0.8558823466300964, "eval_rewards/chosen": 1.7637465000152588, "eval_rewards/margins": 1.4193782806396484, "eval_rewards/rejected": 0.34436821937561035, "eval_runtime": 261.9094, "eval_samples_per_second": 10.385, "eval_steps_per_second": 1.298, "step": 400 }, { "epoch": 0.429544264012572, "grad_norm": 1.6972898244857788, "learning_rate": 7.043713221597773e-07, "logits/chosen": -0.6213727593421936, "logits/rejected": -0.5815456509590149, "logps/chosen": -251.49691772460938, "logps/rejected": -282.7269287109375, "loss": 0.3671, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.714316964149475, "rewards/margins": 1.8758585453033447, "rewards/rejected": -0.1615416705608368, "step": 410 }, { "epoch": 0.4400209533787323, "grad_norm": 1.2721658945083618, "learning_rate": 6.875296018047809e-07, "logits/chosen": -0.6413770914077759, "logits/rejected": -0.576094925403595, "logps/chosen": -165.25161743164062, "logps/rejected": -237.6639404296875, "loss": 0.4021, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9153220653533936, "rewards/margins": 1.9331645965576172, "rewards/rejected": -0.017842281609773636, "step": 420 }, { "epoch": 0.4504976427448926, "grad_norm": 1.4304931163787842, "learning_rate": 6.704364923285857e-07, "logits/chosen": -0.7602907419204712, "logits/rejected": -0.838828444480896, "logps/chosen": -151.62240600585938, "logps/rejected": -272.5216979980469, "loss": 0.3335, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.8815746307373047, "rewards/margins": 1.6266117095947266, "rewards/rejected": 0.2549629509449005, "step": 430 }, { "epoch": 0.46097433211105293, "grad_norm": 0.9915475249290466, "learning_rate": 6.531149075630796e-07, "logits/chosen": -0.7418895959854126, "logits/rejected": -0.7850595712661743, "logps/chosen": -187.44215393066406, "logps/rejected": -303.8704528808594, "loss": 0.3428, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9837749004364014, "rewards/margins": 1.6201133728027344, "rewards/rejected": 0.36366182565689087, "step": 440 }, { "epoch": 0.4714510214772132, "grad_norm": 1.2732757329940796, "learning_rate": 6.355880676182085e-07, "logits/chosen": -0.6368893384933472, "logits/rejected": -0.6567862033843994, "logps/chosen": -162.96644592285156, "logps/rejected": -234.6942138671875, "loss": 0.3344, "rewards/accuracies": 0.875, "rewards/chosen": 1.5993229150772095, "rewards/margins": 1.5878174304962158, "rewards/rejected": 0.011505508795380592, "step": 450 }, { "epoch": 0.4819277108433735, "grad_norm": 1.200920581817627, "learning_rate": 6.178794677547137e-07, "logits/chosen": -0.7497304081916809, "logits/rejected": -0.813916027545929, "logps/chosen": -184.34619140625, "logps/rejected": -325.8006896972656, "loss": 0.351, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.7533938884735107, "rewards/margins": 1.8764629364013672, "rewards/rejected": -0.12306902557611465, "step": 460 }, { "epoch": 0.4924044002095338, "grad_norm": 1.2419453859329224, "learning_rate": 6.000128468880222e-07, "logits/chosen": -0.8247785568237305, "logits/rejected": -0.7869304418563843, "logps/chosen": -157.25991821289062, "logps/rejected": -210.11380004882812, "loss": 0.3311, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7378108501434326, "rewards/margins": 1.4047176837921143, "rewards/rejected": 0.33309322595596313, "step": 470 }, { "epoch": 0.5028810895756941, "grad_norm": 1.6529850959777832, "learning_rate": 5.820121557655108e-07, "logits/chosen": -0.6859318614006042, "logits/rejected": -0.7206289172172546, "logps/chosen": -117.66719818115234, "logps/rejected": -203.75498962402344, "loss": 0.3663, "rewards/accuracies": 1.0, "rewards/chosen": 2.0003466606140137, "rewards/margins": 1.8357864618301392, "rewards/rejected": 0.16456036269664764, "step": 480 }, { "epoch": 0.5133577789418544, "grad_norm": 1.1126986742019653, "learning_rate": 5.639015248598023e-07, "logits/chosen": -0.6363809704780579, "logits/rejected": -0.6623392105102539, "logps/chosen": -194.19979858398438, "logps/rejected": -238.96615600585938, "loss": 0.3668, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.016817092895508, "rewards/margins": 1.5063263177871704, "rewards/rejected": 0.5104909539222717, "step": 490 }, { "epoch": 0.5238344683080147, "grad_norm": 1.5315709114074707, "learning_rate": 5.457052320211339e-07, "logits/chosen": -0.7844116687774658, "logits/rejected": -0.808144211769104, "logps/chosen": -151.04864501953125, "logps/rejected": -244.23928833007812, "loss": 0.3117, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.9685537815093994, "rewards/margins": 2.61504864692688, "rewards/rejected": -0.6464947462081909, "step": 500 }, { "epoch": 0.5238344683080147, "eval_logits/chosen": -0.6770224571228027, "eval_logits/rejected": -0.7171090245246887, "eval_logps/chosen": -226.59884643554688, "eval_logps/rejected": -271.1226806640625, "eval_loss": 0.34857362508773804, "eval_rewards/accuracies": 0.8705882430076599, "eval_rewards/chosen": 1.7528586387634277, "eval_rewards/margins": 1.5824049711227417, "eval_rewards/rejected": 0.17045359313488007, "eval_runtime": 261.9108, "eval_samples_per_second": 10.385, "eval_steps_per_second": 1.298, "step": 500 }, { "epoch": 0.5343111576741749, "grad_norm": 1.081206202507019, "learning_rate": 5.274476699321637e-07, "logits/chosen": -0.6271040439605713, "logits/rejected": -0.6908977627754211, "logps/chosen": -130.60336303710938, "logps/rejected": -165.01614379882812, "loss": 0.3194, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.081298351287842, "rewards/margins": 1.8207521438598633, "rewards/rejected": 0.2605462670326233, "step": 510 }, { "epoch": 0.5447878470403352, "grad_norm": 1.2145583629608154, "learning_rate": 5.091533134088387e-07, "logits/chosen": -0.6450372934341431, "logits/rejected": -0.6463335752487183, "logps/chosen": -124.65995788574219, "logps/rejected": -181.99124145507812, "loss": 0.3201, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.8719825744628906, "rewards/margins": 1.8109052181243896, "rewards/rejected": 0.06107719987630844, "step": 520 }, { "epoch": 0.5552645364064955, "grad_norm": 0.9780375957489014, "learning_rate": 4.908466865911614e-07, "logits/chosen": -0.6622918248176575, "logits/rejected": -0.6457839012145996, "logps/chosen": -223.21945190429688, "logps/rejected": -342.74493408203125, "loss": 0.3499, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.8123916387557983, "rewards/margins": 1.7622274160385132, "rewards/rejected": 0.050164125859737396, "step": 530 }, { "epoch": 0.5657412257726558, "grad_norm": 1.1513903141021729, "learning_rate": 4.7255233006783624e-07, "logits/chosen": -0.9141020774841309, "logits/rejected": -0.7296786308288574, "logps/chosen": -276.66558837890625, "logps/rejected": -294.1919250488281, "loss": 0.3157, "rewards/accuracies": 0.875, "rewards/chosen": 1.85072922706604, "rewards/margins": 1.9495325088500977, "rewards/rejected": -0.09880335628986359, "step": 540 }, { "epoch": 0.5762179151388162, "grad_norm": 1.3302452564239502, "learning_rate": 4.5429476797886617e-07, "logits/chosen": -0.7545332908630371, "logits/rejected": -0.6817395091056824, "logps/chosen": -216.02926635742188, "logps/rejected": -230.8323211669922, "loss": 0.3406, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.7389686107635498, "rewards/margins": 1.522585391998291, "rewards/rejected": 0.21638329327106476, "step": 550 }, { "epoch": 0.5866946045049765, "grad_norm": 1.1911797523498535, "learning_rate": 4.3609847514019763e-07, "logits/chosen": -0.554836094379425, "logits/rejected": -0.5605574250221252, "logps/chosen": -172.40562438964844, "logps/rejected": -245.10995483398438, "loss": 0.2997, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.066683053970337, "rewards/margins": 2.193510055541992, "rewards/rejected": -0.12682685256004333, "step": 560 }, { "epoch": 0.5971712938711368, "grad_norm": 1.8987175226211548, "learning_rate": 4.179878442344892e-07, "logits/chosen": -0.6859768629074097, "logits/rejected": -0.8431941270828247, "logps/chosen": -209.92465209960938, "logps/rejected": -352.6252746582031, "loss": 0.342, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.6043437719345093, "rewards/margins": 1.7139772176742554, "rewards/rejected": -0.10963334143161774, "step": 570 }, { "epoch": 0.607647983237297, "grad_norm": 1.6537494659423828, "learning_rate": 3.9998715311197783e-07, "logits/chosen": -0.7164223790168762, "logits/rejected": -0.7812379598617554, "logps/chosen": -123.40010833740234, "logps/rejected": -194.74520874023438, "loss": 0.3132, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.12626576423645, "rewards/margins": 2.3251867294311523, "rewards/rejected": -0.19892092049121857, "step": 580 }, { "epoch": 0.6181246726034573, "grad_norm": 1.4664175510406494, "learning_rate": 3.821205322452863e-07, "logits/chosen": -0.7272034883499146, "logits/rejected": -0.7011157274246216, "logps/chosen": -122.1158447265625, "logps/rejected": -186.95236206054688, "loss": 0.3187, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.913387656211853, "rewards/margins": 2.4133734703063965, "rewards/rejected": -0.49998578429222107, "step": 590 }, { "epoch": 0.6286013619696176, "grad_norm": 2.0541419982910156, "learning_rate": 3.6441193238179146e-07, "logits/chosen": -0.9111797213554382, "logits/rejected": -0.8080776333808899, "logps/chosen": -203.1688995361328, "logps/rejected": -258.25762939453125, "loss": 0.3219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6965348720550537, "rewards/margins": 1.915580153465271, "rewards/rejected": -0.21904516220092773, "step": 600 }, { "epoch": 0.6286013619696176, "eval_logits/chosen": -0.6708827018737793, "eval_logits/rejected": -0.7125294208526611, "eval_logps/chosen": -226.68060302734375, "eval_logps/rejected": -273.5360412597656, "eval_loss": 0.33462101221084595, "eval_rewards/accuracies": 0.8764705657958984, "eval_rewards/chosen": 1.7487696409225464, "eval_rewards/margins": 1.698984980583191, "eval_rewards/rejected": 0.049785006791353226, "eval_runtime": 261.8797, "eval_samples_per_second": 10.386, "eval_steps_per_second": 1.298, "step": 600 }, { "epoch": 0.6390780513357779, "grad_norm": 1.239606499671936, "learning_rate": 3.4688509243692034e-07, "logits/chosen": -0.7466681003570557, "logits/rejected": -0.6746761798858643, "logps/chosen": -227.64144897460938, "logps/rejected": -261.81854248046875, "loss": 0.3261, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.6498582363128662, "rewards/margins": 1.81642746925354, "rewards/rejected": -0.16656894981861115, "step": 610 }, { "epoch": 0.6495547407019382, "grad_norm": 1.214543342590332, "learning_rate": 3.295635076714144e-07, "logits/chosen": -0.6870505213737488, "logits/rejected": -0.6491986513137817, "logps/chosen": -180.24359130859375, "logps/rejected": -264.5948486328125, "loss": 0.2974, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.7575130462646484, "rewards/margins": 2.330495834350586, "rewards/rejected": -0.5729827284812927, "step": 620 }, { "epoch": 0.6600314300680985, "grad_norm": 1.2782399654388428, "learning_rate": 3.12470398195219e-07, "logits/chosen": -0.7529887557029724, "logits/rejected": -0.753415584564209, "logps/chosen": -195.1822967529297, "logps/rejected": -271.4483642578125, "loss": 0.3042, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.9735838174819946, "rewards/margins": 1.9122159481048584, "rewards/rejected": 0.06136809661984444, "step": 630 }, { "epoch": 0.6705081194342588, "grad_norm": 1.2197209596633911, "learning_rate": 2.956286778402226e-07, "logits/chosen": -0.8249009847640991, "logits/rejected": -0.8348762392997742, "logps/chosen": -202.11837768554688, "logps/rejected": -336.2778015136719, "loss": 0.2823, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.8145548105239868, "rewards/margins": 2.243466854095459, "rewards/rejected": -0.4289122521877289, "step": 640 }, { "epoch": 0.6809848088004191, "grad_norm": 1.6746981143951416, "learning_rate": 2.7906092344356826e-07, "logits/chosen": -0.649664580821991, "logits/rejected": -0.6071664094924927, "logps/chosen": -191.43821716308594, "logps/rejected": -236.2228546142578, "loss": 0.3154, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.7577661275863647, "rewards/margins": 1.6446943283081055, "rewards/rejected": 0.11307179927825928, "step": 650 }, { "epoch": 0.6914614981665793, "grad_norm": 0.9587293863296509, "learning_rate": 2.6278934458271996e-07, "logits/chosen": -0.6443291902542114, "logits/rejected": -0.6421949863433838, "logps/chosen": -192.89483642578125, "logps/rejected": -250.52487182617188, "loss": 0.2886, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7871406078338623, "rewards/margins": 1.9648243188858032, "rewards/rejected": -0.17768368124961853, "step": 660 }, { "epoch": 0.7019381875327396, "grad_norm": 1.414176106452942, "learning_rate": 2.468357538028487e-07, "logits/chosen": -0.6694966554641724, "logits/rejected": -0.7076429724693298, "logps/chosen": -119.1021957397461, "logps/rejected": -220.78994750976562, "loss": 0.327, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.0957231521606445, "rewards/margins": 2.640392780303955, "rewards/rejected": -0.5446693301200867, "step": 670 }, { "epoch": 0.7124148768988999, "grad_norm": 1.6091426610946655, "learning_rate": 2.312215373764551e-07, "logits/chosen": -0.7012540102005005, "logits/rejected": -0.6892023086547852, "logps/chosen": -218.32839965820312, "logps/rejected": -300.2800598144531, "loss": 0.3112, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7211519479751587, "rewards/margins": 1.656751275062561, "rewards/rejected": 0.06440071016550064, "step": 680 }, { "epoch": 0.7228915662650602, "grad_norm": 1.9142816066741943, "learning_rate": 2.1596762663442213e-07, "logits/chosen": -0.5409647226333618, "logits/rejected": -0.5352240800857544, "logps/chosen": -148.7902374267578, "logps/rejected": -249.7965850830078, "loss": 0.3132, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.828574776649475, "rewards/margins": 2.3894662857055664, "rewards/rejected": -0.5608914494514465, "step": 690 }, { "epoch": 0.7333682556312205, "grad_norm": 1.4163298606872559, "learning_rate": 2.0109446990692963e-07, "logits/chosen": -0.6709907650947571, "logits/rejected": -0.5361426472663879, "logps/chosen": -186.37722778320312, "logps/rejected": -187.66539001464844, "loss": 0.2924, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.7443721294403076, "rewards/margins": 2.1021335124969482, "rewards/rejected": -0.3577616810798645, "step": 700 }, { "epoch": 0.7333682556312205, "eval_logits/chosen": -0.6733396053314209, "eval_logits/rejected": -0.7103094458580017, "eval_logps/chosen": -225.75912475585938, "eval_logps/rejected": -274.4924011230469, "eval_loss": 0.3259245753288269, "eval_rewards/accuracies": 0.8823529481887817, "eval_rewards/chosen": 1.7948431968688965, "eval_rewards/margins": 1.79287588596344, "eval_rewards/rejected": 0.0019673823844641447, "eval_runtime": 261.7936, "eval_samples_per_second": 10.39, "eval_steps_per_second": 1.299, "step": 700 }, { "epoch": 0.7438449449973809, "grad_norm": 0.9435632228851318, "learning_rate": 1.8662200511184872e-07, "logits/chosen": -0.6021249890327454, "logits/rejected": -0.6577110886573792, "logps/chosen": -99.76377868652344, "logps/rejected": -208.7487335205078, "loss": 0.2974, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.204495429992676, "rewards/margins": 3.0325827598571777, "rewards/rejected": -0.8280874490737915, "step": 710 }, { "epoch": 0.7543216343635412, "grad_norm": 1.6756037473678589, "learning_rate": 1.725696330273575e-07, "logits/chosen": -0.8345254063606262, "logits/rejected": -0.7763462066650391, "logps/chosen": -210.8310089111328, "logps/rejected": -274.93072509765625, "loss": 0.3315, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.696265459060669, "rewards/margins": 1.8497207164764404, "rewards/rejected": -0.15345516800880432, "step": 720 }, { "epoch": 0.7647983237297015, "grad_norm": 1.1243698596954346, "learning_rate": 1.589561912846089e-07, "logits/chosen": -0.6270607709884644, "logits/rejected": -0.7315198183059692, "logps/chosen": -165.06517028808594, "logps/rejected": -244.2911834716797, "loss": 0.2755, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.8916425704956055, "rewards/margins": 2.20832896232605, "rewards/rejected": -0.31668657064437866, "step": 730 }, { "epoch": 0.7752750130958617, "grad_norm": 1.4865248203277588, "learning_rate": 1.4579992911531496e-07, "logits/chosen": -0.6748223304748535, "logits/rejected": -0.7190132141113281, "logps/chosen": -174.67404174804688, "logps/rejected": -226.0449676513672, "loss": 0.3301, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.9555890560150146, "rewards/margins": 2.253129482269287, "rewards/rejected": -0.29754024744033813, "step": 740 }, { "epoch": 0.785751702462022, "grad_norm": 1.5342172384262085, "learning_rate": 1.3311848288809813e-07, "logits/chosen": -0.5253860354423523, "logits/rejected": -0.5943223834037781, "logps/chosen": -111.89634704589844, "logps/rejected": -253.905029296875, "loss": 0.3232, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.180968999862671, "rewards/margins": 2.8969180583953857, "rewards/rejected": -0.715948760509491, "step": 750 }, { "epoch": 0.7962283918281823, "grad_norm": 1.143971562385559, "learning_rate": 1.209288524664029e-07, "logits/chosen": -0.5982942581176758, "logits/rejected": -0.687002420425415, "logps/chosen": -132.365478515625, "logps/rejected": -267.1639099121094, "loss": 0.3182, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.9760652780532837, "rewards/margins": 2.1792385578155518, "rewards/rejected": -0.20317308604717255, "step": 760 }, { "epoch": 0.8067050811943426, "grad_norm": 1.4600186347961426, "learning_rate": 1.0924737841966497e-07, "logits/chosen": -0.8853614926338196, "logits/rejected": -0.8439884185791016, "logps/chosen": -189.08493041992188, "logps/rejected": -221.0530242919922, "loss": 0.3246, "rewards/accuracies": 0.875, "rewards/chosen": 1.7503544092178345, "rewards/margins": 2.0065951347351074, "rewards/rejected": -0.25624093413352966, "step": 770 }, { "epoch": 0.8171817705605029, "grad_norm": 1.2022879123687744, "learning_rate": 9.808972011828054e-08, "logits/chosen": -0.719186544418335, "logits/rejected": -0.7255199551582336, "logps/chosen": -153.64651489257812, "logps/rejected": -211.08731079101562, "loss": 0.3011, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.974013328552246, "rewards/margins": 1.7627719640731812, "rewards/rejected": 0.2112412005662918, "step": 780 }, { "epoch": 0.8276584599266632, "grad_norm": 1.2907471656799316, "learning_rate": 8.747083474174527e-08, "logits/chosen": -0.6835842132568359, "logits/rejected": -0.7415401339530945, "logps/chosen": -173.19088745117188, "logps/rejected": -246.8120574951172, "loss": 0.2945, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.8839311599731445, "rewards/margins": 2.0039474964141846, "rewards/rejected": -0.12001626193523407, "step": 790 }, { "epoch": 0.8381351492928235, "grad_norm": 1.7280535697937012, "learning_rate": 7.740495722810269e-08, "logits/chosen": -0.6990944147109985, "logits/rejected": -0.593161404132843, "logps/chosen": -198.01394653320312, "logps/rejected": -272.8219909667969, "loss": 0.3287, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8608243465423584, "rewards/margins": 2.461362838745117, "rewards/rejected": -0.600538432598114, "step": 800 }, { "epoch": 0.8381351492928235, "eval_logits/chosen": -0.6609554290771484, "eval_logits/rejected": -0.7049468159675598, "eval_logps/chosen": -225.6600799560547, "eval_logps/rejected": -274.9728088378906, "eval_loss": 0.3220690190792084, "eval_rewards/accuracies": 0.8735294342041016, "eval_rewards/chosen": 1.799795150756836, "eval_rewards/margins": 1.821848750114441, "eval_rewards/rejected": -0.022053834050893784, "eval_runtime": 261.7431, "eval_samples_per_second": 10.392, "eval_steps_per_second": 1.299, "step": 800 }, { "epoch": 0.8486118386589837, "grad_norm": 2.1192266941070557, "learning_rate": 6.790558119157597e-08, "logits/chosen": -0.7713523507118225, "logits/rejected": -0.8187154531478882, "logps/chosen": -205.0841827392578, "logps/rejected": -292.3119201660156, "loss": 0.2919, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.9179985523223877, "rewards/margins": 2.320665121078491, "rewards/rejected": -0.4026666581630707, "step": 810 }, { "epoch": 0.859088528025144, "grad_norm": 2.014094829559326, "learning_rate": 5.898544083397e-08, "logits/chosen": -0.6670573949813843, "logits/rejected": -0.608993411064148, "logps/chosen": -190.70272827148438, "logps/rejected": -312.3935852050781, "loss": 0.3022, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9065654277801514, "rewards/margins": 1.8183749914169312, "rewards/rejected": 0.08819030225276947, "step": 820 }, { "epoch": 0.8695652173913043, "grad_norm": 1.7847275733947754, "learning_rate": 5.065649387408705e-08, "logits/chosen": -0.6578026413917542, "logits/rejected": -0.6870561838150024, "logps/chosen": -123.2751693725586, "logps/rejected": -218.3284149169922, "loss": 0.3253, "rewards/accuracies": 0.875, "rewards/chosen": 1.8741661310195923, "rewards/margins": 2.1019585132598877, "rewards/rejected": -0.22779226303100586, "step": 830 }, { "epoch": 0.8800419067574646, "grad_norm": 1.4194775819778442, "learning_rate": 4.292990551804171e-08, "logits/chosen": -0.6935116648674011, "logits/rejected": -0.674730658531189, "logps/chosen": -242.6377410888672, "logps/rejected": -321.9432067871094, "loss": 0.3169, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.7531979084014893, "rewards/margins": 2.842390537261963, "rewards/rejected": -1.0891928672790527, "step": 840 }, { "epoch": 0.8905185961236249, "grad_norm": 1.3208121061325073, "learning_rate": 3.581603349196371e-08, "logits/chosen": -0.7673216462135315, "logits/rejected": -0.7687219381332397, "logps/chosen": -95.88841247558594, "logps/rejected": -172.04896545410156, "loss": 0.2769, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.240773916244507, "rewards/margins": 3.069509983062744, "rewards/rejected": -0.8287358283996582, "step": 850 }, { "epoch": 0.9009952854897852, "grad_norm": 1.3934128284454346, "learning_rate": 2.9324414157151367e-08, "logits/chosen": -0.6518006324768066, "logits/rejected": -0.731589674949646, "logps/chosen": -148.0747528076172, "logps/rejected": -228.8854217529297, "loss": 0.2978, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9387842416763306, "rewards/margins": 1.903713583946228, "rewards/rejected": 0.035070739686489105, "step": 860 }, { "epoch": 0.9114719748559456, "grad_norm": 0.8581977486610413, "learning_rate": 2.3463749726290284e-08, "logits/chosen": -0.6023357510566711, "logits/rejected": -0.591888427734375, "logps/chosen": -158.55538940429688, "logps/rejected": -237.7508544921875, "loss": 0.3205, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.05496883392334, "rewards/margins": 2.3338775634765625, "rewards/rejected": -0.27890875935554504, "step": 870 }, { "epoch": 0.9219486642221059, "grad_norm": 1.8699610233306885, "learning_rate": 1.824189659787284e-08, "logits/chosen": -0.8147839307785034, "logits/rejected": -0.7728699445724487, "logps/chosen": -161.78822326660156, "logps/rejected": -192.01748657226562, "loss": 0.3507, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9544188976287842, "rewards/margins": 1.88066828250885, "rewards/rejected": 0.07375077903270721, "step": 880 }, { "epoch": 0.9324253535882661, "grad_norm": 1.350211262702942, "learning_rate": 1.3665854824458035e-08, "logits/chosen": -0.8000810742378235, "logits/rejected": -0.806304931640625, "logps/chosen": -212.8185272216797, "logps/rejected": -333.13983154296875, "loss": 0.3171, "rewards/accuracies": 0.875, "rewards/chosen": 1.7481015920639038, "rewards/margins": 1.8647775650024414, "rewards/rejected": -0.11667587608098984, "step": 890 }, { "epoch": 0.9429020429544264, "grad_norm": 1.076051950454712, "learning_rate": 9.741758728888217e-09, "logits/chosen": -0.6841428279876709, "logits/rejected": -0.6118525266647339, "logps/chosen": -190.805419921875, "logps/rejected": -257.29071044921875, "loss": 0.3149, "rewards/accuracies": 0.875, "rewards/chosen": 1.9618017673492432, "rewards/margins": 2.3374452590942383, "rewards/rejected": -0.3756433129310608, "step": 900 }, { "epoch": 0.9429020429544264, "eval_logits/chosen": -0.6615923643112183, "eval_logits/rejected": -0.7050707340240479, "eval_logps/chosen": -225.65843200683594, "eval_logps/rejected": -275.258056640625, "eval_loss": 0.321457177400589, "eval_rewards/accuracies": 0.8823529481887817, "eval_rewards/chosen": 1.7998789548873901, "eval_rewards/margins": 1.8361927270889282, "eval_rewards/rejected": -0.03631366044282913, "eval_runtime": 261.808, "eval_samples_per_second": 10.389, "eval_steps_per_second": 1.299, "step": 900 }, { "epoch": 0.9533787323205867, "grad_norm": 1.2230325937271118, "learning_rate": 6.474868681043577e-09, "logits/chosen": -0.6633517146110535, "logits/rejected": -0.6545056104660034, "logps/chosen": -186.76364135742188, "logps/rejected": -208.23318481445312, "loss": 0.3096, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.9310375452041626, "rewards/margins": 2.1193923950195312, "rewards/rejected": -0.18835484981536865, "step": 910 }, { "epoch": 0.963855421686747, "grad_norm": 1.6915018558502197, "learning_rate": 3.869564046156459e-09, "logits/chosen": -0.5889161825180054, "logits/rejected": -0.519936203956604, "logps/chosen": -124.73238372802734, "logps/rejected": -210.55435180664062, "loss": 0.3246, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.9830315113067627, "rewards/margins": 2.1443068981170654, "rewards/rejected": -0.16127553582191467, "step": 920 }, { "epoch": 0.9743321110529073, "grad_norm": 1.0057599544525146, "learning_rate": 1.929337314139412e-09, "logits/chosen": -0.6209553480148315, "logits/rejected": -0.7050691843032837, "logps/chosen": -114.675537109375, "logps/rejected": -184.9393310546875, "loss": 0.2747, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.9676446914672852, "rewards/margins": 2.062540054321289, "rewards/rejected": -0.09489502757787704, "step": 930 }, { "epoch": 0.9848088004190676, "grad_norm": 2.0049855709075928, "learning_rate": 6.567894177967325e-10, "logits/chosen": -0.5827627778053284, "logits/rejected": -0.5892088413238525, "logps/chosen": -210.23275756835938, "logps/rejected": -328.05621337890625, "loss": 0.2948, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.9841982126235962, "rewards/margins": 2.81449031829834, "rewards/rejected": -0.8302921056747437, "step": 940 }, { "epoch": 0.9952854897852279, "grad_norm": 1.0995185375213623, "learning_rate": 5.3626246194704575e-11, "logits/chosen": -0.6645965576171875, "logits/rejected": -0.6572784781455994, "logps/chosen": -212.4374542236328, "logps/rejected": -301.1936340332031, "loss": 0.3017, "rewards/accuracies": 0.875, "rewards/chosen": 1.529649019241333, "rewards/margins": 1.8092479705810547, "rewards/rejected": -0.2795989215373993, "step": 950 }, { "epoch": 0.9994761655316919, "step": 954, "total_flos": 0.0, "train_loss": 0.4010467823571379, "train_runtime": 7944.4042, "train_samples_per_second": 3.844, "train_steps_per_second": 0.12 } ], "logging_steps": 10, "max_steps": 954, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }