{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9964868029907215, "eval_steps": 800, "global_step": 2079, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014413115935501305, "grad_norm": 21.287893295288086, "learning_rate": 4.807692307692308e-10, "logits/chosen": -2.3065450191497803, "logits/rejected": -2.3093364238739014, "logps/chosen": -43.837303161621094, "logps/rejected": -48.05693054199219, "loss": 0.6927, "rewards/accuracies": 0.0625, "rewards/chosen": 9.900308214128017e-06, "rewards/margins": 0.0009647191036492586, "rewards/rejected": -0.0009548187954351306, "step": 1 }, { "epoch": 0.014413115935501306, "grad_norm": 21.111068725585938, "learning_rate": 4.807692307692308e-09, "logits/chosen": -2.3277647495269775, "logits/rejected": -2.3011653423309326, "logps/chosen": -42.82107162475586, "logps/rejected": -44.892906188964844, "loss": 0.6928, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": -0.001543219666928053, "rewards/margins": 0.0007266975590027869, "rewards/rejected": -0.002269917167723179, "step": 10 }, { "epoch": 0.02882623187100261, "grad_norm": 19.57785987854004, "learning_rate": 9.615384615384615e-09, "logits/chosen": -2.288435459136963, "logits/rejected": -2.2758889198303223, "logps/chosen": -45.44641876220703, "logps/rejected": -48.17905044555664, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.0001232023787451908, "rewards/margins": 0.0005566190229728818, "rewards/rejected": -0.00043341662967577577, "step": 20 }, { "epoch": 0.04323934780650392, "grad_norm": 26.79755401611328, "learning_rate": 1.442307692307692e-08, "logits/chosen": -2.3148436546325684, "logits/rejected": -2.3025364875793457, "logps/chosen": -46.84934616088867, "logps/rejected": -48.419532775878906, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.00020735207363031805, "rewards/margins": 0.0002374596951995045, "rewards/rejected": -0.0004448117106221616, "step": 30 }, { "epoch": 0.05765246374200522, "grad_norm": 22.711183547973633, "learning_rate": 1.923076923076923e-08, "logits/chosen": -2.347923517227173, "logits/rejected": -2.3385748863220215, "logps/chosen": -50.60676956176758, "logps/rejected": -52.752296447753906, "loss": 0.6929, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -0.00011574259406188503, "rewards/margins": 0.0005067865131422877, "rewards/rejected": -0.0006225291872397065, "step": 40 }, { "epoch": 0.07206557967750653, "grad_norm": 23.8702392578125, "learning_rate": 2.403846153846154e-08, "logits/chosen": -2.329652786254883, "logits/rejected": -2.322984218597412, "logps/chosen": -47.366519927978516, "logps/rejected": -49.952056884765625, "loss": 0.6938, "rewards/accuracies": 0.40937501192092896, "rewards/chosen": -0.00085345224943012, "rewards/margins": -0.0012199878692626953, "rewards/rejected": 0.0003665355616249144, "step": 50 }, { "epoch": 0.08647869561300783, "grad_norm": 18.53515625, "learning_rate": 2.884615384615384e-08, "logits/chosen": -2.3065052032470703, "logits/rejected": -2.2884154319763184, "logps/chosen": -46.60871124267578, "logps/rejected": -48.89433670043945, "loss": 0.6934, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -0.0008969244663603604, "rewards/margins": -0.00045310668065212667, "rewards/rejected": -0.0004438180476427078, "step": 60 }, { "epoch": 0.10089181154850914, "grad_norm": 18.022958755493164, "learning_rate": 3.365384615384615e-08, "logits/chosen": -2.3429152965545654, "logits/rejected": -2.327995777130127, "logps/chosen": -47.61640167236328, "logps/rejected": -50.77202606201172, "loss": 0.6928, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.00031301408307626843, "rewards/margins": 0.000775355554651469, "rewards/rejected": -0.0010883695213124156, "step": 70 }, { "epoch": 0.11530492748401044, "grad_norm": 18.332494735717773, "learning_rate": 3.846153846153846e-08, "logits/chosen": -2.3375937938690186, "logits/rejected": -2.311336040496826, "logps/chosen": -44.785614013671875, "logps/rejected": -48.0066032409668, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.00027114865952171385, "rewards/margins": -9.801401574804913e-06, "rewards/rejected": 0.00028095004381611943, "step": 80 }, { "epoch": 0.12971804341951176, "grad_norm": 18.671464920043945, "learning_rate": 4.326923076923077e-08, "logits/chosen": -2.3199424743652344, "logits/rejected": -2.2938828468322754, "logps/chosen": -44.98554611206055, "logps/rejected": -46.84703826904297, "loss": 0.6941, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": -0.0013123347889631987, "rewards/margins": -0.0016984030371531844, "rewards/rejected": 0.0003860682772938162, "step": 90 }, { "epoch": 0.14413115935501306, "grad_norm": 23.246063232421875, "learning_rate": 4.807692307692308e-08, "logits/chosen": -2.3865551948547363, "logits/rejected": -2.3804287910461426, "logps/chosen": -42.85012435913086, "logps/rejected": -46.24885177612305, "loss": 0.6934, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 0.0006392289651557803, "rewards/margins": -0.00039641355397179723, "rewards/rejected": 0.0010356425773352385, "step": 100 }, { "epoch": 0.15854427529051437, "grad_norm": 19.473865509033203, "learning_rate": 5.288461538461538e-08, "logits/chosen": -2.3142473697662354, "logits/rejected": -2.3037219047546387, "logps/chosen": -45.20660400390625, "logps/rejected": -47.961936950683594, "loss": 0.6937, "rewards/accuracies": 0.453125, "rewards/chosen": -0.0003208608250133693, "rewards/margins": -0.0010648202151060104, "rewards/rejected": 0.0007439593900926411, "step": 110 }, { "epoch": 0.17295739122601567, "grad_norm": 22.531965255737305, "learning_rate": 5.769230769230768e-08, "logits/chosen": -2.3439688682556152, "logits/rejected": -2.3307394981384277, "logps/chosen": -46.80133056640625, "logps/rejected": -49.72489547729492, "loss": 0.6933, "rewards/accuracies": 0.453125, "rewards/chosen": 1.1269899914623238e-05, "rewards/margins": -0.00012028318451484665, "rewards/rejected": 0.0001315530389547348, "step": 120 }, { "epoch": 0.18737050716151699, "grad_norm": 24.544965744018555, "learning_rate": 6.25e-08, "logits/chosen": -2.2930805683135986, "logits/rejected": -2.279456377029419, "logps/chosen": -49.83013916015625, "logps/rejected": -51.1182975769043, "loss": 0.6917, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 0.0026741260662674904, "rewards/margins": 0.0031126493122428656, "rewards/rejected": -0.00043852307135239244, "step": 130 }, { "epoch": 0.20178362309701828, "grad_norm": 19.973854064941406, "learning_rate": 6.73076923076923e-08, "logits/chosen": -2.3479955196380615, "logits/rejected": -2.3185691833496094, "logps/chosen": -44.58715057373047, "logps/rejected": -46.512718200683594, "loss": 0.691, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0033654593862593174, "rewards/margins": 0.0043604555539786816, "rewards/rejected": -0.0009949964005500078, "step": 140 }, { "epoch": 0.2161967390325196, "grad_norm": 20.358179092407227, "learning_rate": 7.211538461538461e-08, "logits/chosen": -2.3313281536102295, "logits/rejected": -2.307065486907959, "logps/chosen": -46.340797424316406, "logps/rejected": -48.234344482421875, "loss": 0.6919, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 0.00332791730761528, "rewards/margins": 0.002700595883652568, "rewards/rejected": 0.0006273213075473905, "step": 150 }, { "epoch": 0.2306098549680209, "grad_norm": 20.67792510986328, "learning_rate": 7.692307692307692e-08, "logits/chosen": -2.305814266204834, "logits/rejected": -2.290213108062744, "logps/chosen": -47.53135681152344, "logps/rejected": -50.77891540527344, "loss": 0.6921, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 0.005107083357870579, "rewards/margins": 0.002246940741315484, "rewards/rejected": 0.002860142383724451, "step": 160 }, { "epoch": 0.2450229709035222, "grad_norm": 20.16474723815918, "learning_rate": 8.173076923076923e-08, "logits/chosen": -2.349069356918335, "logits/rejected": -2.331510066986084, "logps/chosen": -50.888572692871094, "logps/rejected": -50.80742645263672, "loss": 0.6929, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0035659130662679672, "rewards/margins": 0.0006433400558307767, "rewards/rejected": 0.0029225728940218687, "step": 170 }, { "epoch": 0.2594360868390235, "grad_norm": 22.5780029296875, "learning_rate": 8.653846153846154e-08, "logits/chosen": -2.305760622024536, "logits/rejected": -2.28718638420105, "logps/chosen": -49.73812484741211, "logps/rejected": -52.55815887451172, "loss": 0.692, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 0.00477216811850667, "rewards/margins": 0.0024364416021853685, "rewards/rejected": 0.002335726749151945, "step": 180 }, { "epoch": 0.2738492027745248, "grad_norm": 20.311643600463867, "learning_rate": 9.134615384615383e-08, "logits/chosen": -2.3253870010375977, "logits/rejected": -2.3114845752716064, "logps/chosen": -47.62921142578125, "logps/rejected": -50.506046295166016, "loss": 0.6926, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 0.004789062775671482, "rewards/margins": 0.0012888021301478148, "rewards/rejected": 0.003500260878354311, "step": 190 }, { "epoch": 0.2882623187100261, "grad_norm": 22.185544967651367, "learning_rate": 9.615384615384616e-08, "logits/chosen": -2.351552963256836, "logits/rejected": -2.335294246673584, "logps/chosen": -46.49347686767578, "logps/rejected": -48.789695739746094, "loss": 0.6912, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 0.006301163230091333, "rewards/margins": 0.0041467128321528435, "rewards/rejected": 0.002154449699446559, "step": 200 }, { "epoch": 0.30267543464552743, "grad_norm": 19.66486167907715, "learning_rate": 9.999971806320255e-08, "logits/chosen": -2.3723862171173096, "logits/rejected": -2.3726882934570312, "logps/chosen": -46.52350616455078, "logps/rejected": -49.28281021118164, "loss": 0.6908, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00785305630415678, "rewards/margins": 0.004859209060668945, "rewards/rejected": 0.0029938467778265476, "step": 210 }, { "epoch": 0.31708855058102875, "grad_norm": 21.849241256713867, "learning_rate": 9.998985060913876e-08, "logits/chosen": -2.29600191116333, "logits/rejected": -2.280897855758667, "logps/chosen": -45.98343276977539, "logps/rejected": -47.80237579345703, "loss": 0.6908, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 0.007365316152572632, "rewards/margins": 0.0048685320653021336, "rewards/rejected": 0.0024967845529317856, "step": 220 }, { "epoch": 0.33150166651653007, "grad_norm": 29.88494300842285, "learning_rate": 9.996588949457546e-08, "logits/chosen": -2.3516383171081543, "logits/rejected": -2.3211302757263184, "logps/chosen": -53.43247604370117, "logps/rejected": -53.46599197387695, "loss": 0.6896, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.009954456239938736, "rewards/margins": 0.007440419401973486, "rewards/rejected": 0.0025140370707958937, "step": 230 }, { "epoch": 0.34591478245203133, "grad_norm": 21.79450798034668, "learning_rate": 9.992784147488017e-08, "logits/chosen": -2.3784842491149902, "logits/rejected": -2.3466084003448486, "logps/chosen": -47.536399841308594, "logps/rejected": -49.28679275512695, "loss": 0.6913, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.008210290223360062, "rewards/margins": 0.0038244971074163914, "rewards/rejected": 0.004385794512927532, "step": 240 }, { "epoch": 0.36032789838753265, "grad_norm": 19.872880935668945, "learning_rate": 9.987571727694775e-08, "logits/chosen": -2.3470616340637207, "logits/rejected": -2.322946548461914, "logps/chosen": -44.467933654785156, "logps/rejected": -47.677467346191406, "loss": 0.6888, "rewards/accuracies": 0.565625011920929, "rewards/chosen": 0.01355994027107954, "rewards/margins": 0.008894408121705055, "rewards/rejected": 0.00466553308069706, "step": 250 }, { "epoch": 0.37474101432303397, "grad_norm": 25.680984497070312, "learning_rate": 9.98095315961762e-08, "logits/chosen": -2.3500027656555176, "logits/rejected": -2.3312158584594727, "logps/chosen": -49.896331787109375, "logps/rejected": -51.985511779785156, "loss": 0.6901, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.012083685956895351, "rewards/margins": 0.0064600324258208275, "rewards/rejected": 0.005623653531074524, "step": 260 }, { "epoch": 0.3891541302585353, "grad_norm": 21.92185401916504, "learning_rate": 9.97293030923235e-08, "logits/chosen": -2.3450703620910645, "logits/rejected": -2.3156611919403076, "logps/chosen": -46.08183670043945, "logps/rejected": -47.64861297607422, "loss": 0.6882, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01599208638072014, "rewards/margins": 0.010242667980492115, "rewards/rejected": 0.005749420262873173, "step": 270 }, { "epoch": 0.40356724619403656, "grad_norm": 22.253570556640625, "learning_rate": 9.963505438424693e-08, "logits/chosen": -2.311044454574585, "logits/rejected": -2.294201612472534, "logps/chosen": -48.35077667236328, "logps/rejected": -49.112300872802734, "loss": 0.69, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.013459725305438042, "rewards/margins": 0.006622823420912027, "rewards/rejected": 0.006836901418864727, "step": 280 }, { "epoch": 0.4179803621295379, "grad_norm": 20.787418365478516, "learning_rate": 9.952681204352607e-08, "logits/chosen": -2.333068609237671, "logits/rejected": -2.299546003341675, "logps/chosen": -47.46521759033203, "logps/rejected": -49.262413024902344, "loss": 0.6867, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.019271234050393105, "rewards/margins": 0.013373048976063728, "rewards/rejected": 0.005898186471313238, "step": 290 }, { "epoch": 0.4323934780650392, "grad_norm": 23.323768615722656, "learning_rate": 9.94046065869715e-08, "logits/chosen": -2.343045234680176, "logits/rejected": -2.330648899078369, "logps/chosen": -46.3621711730957, "logps/rejected": -51.10980987548828, "loss": 0.6841, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 0.020211653783917427, "rewards/margins": 0.01878417655825615, "rewards/rejected": 0.0014274796703830361, "step": 300 }, { "epoch": 0.4468065940005405, "grad_norm": 16.884517669677734, "learning_rate": 9.926847246802116e-08, "logits/chosen": -2.325950860977173, "logits/rejected": -2.298354148864746, "logps/chosen": -46.33721160888672, "logps/rejected": -47.747737884521484, "loss": 0.6877, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 0.0174238421022892, "rewards/margins": 0.011514835990965366, "rewards/rejected": 0.005909005645662546, "step": 310 }, { "epoch": 0.4612197099360418, "grad_norm": 20.160341262817383, "learning_rate": 9.911844806702691e-08, "logits/chosen": -2.324944019317627, "logits/rejected": -2.314134120941162, "logps/chosen": -44.099708557128906, "logps/rejected": -47.27263641357422, "loss": 0.684, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.023020442575216293, "rewards/margins": 0.019213002175092697, "rewards/rejected": 0.0038074390031397343, "step": 320 }, { "epoch": 0.4756328258715431, "grad_norm": 23.56479263305664, "learning_rate": 9.895457568043387e-08, "logits/chosen": -2.349403142929077, "logits/rejected": -2.330629825592041, "logps/chosen": -46.192623138427734, "logps/rejected": -47.701568603515625, "loss": 0.6842, "rewards/accuracies": 0.578125, "rewards/chosen": 0.02301434613764286, "rewards/margins": 0.018842367455363274, "rewards/rejected": 0.004171978682279587, "step": 330 }, { "epoch": 0.4900459418070444, "grad_norm": 26.761240005493164, "learning_rate": 9.877690150885587e-08, "logits/chosen": -2.29034423828125, "logits/rejected": -2.259958028793335, "logps/chosen": -49.6630859375, "logps/rejected": -50.368370056152344, "loss": 0.6842, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.02521464228630066, "rewards/margins": 0.018750475719571114, "rewards/rejected": 0.006464164704084396, "step": 340 }, { "epoch": 0.5044590577425457, "grad_norm": 17.43931770324707, "learning_rate": 9.858547564404998e-08, "logits/chosen": -2.3370158672332764, "logits/rejected": -2.314396381378174, "logps/chosen": -47.03008270263672, "logps/rejected": -49.929176330566406, "loss": 0.6836, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.02510705217719078, "rewards/margins": 0.020147310569882393, "rewards/rejected": 0.004959740675985813, "step": 350 }, { "epoch": 0.518872173678047, "grad_norm": 21.918987274169922, "learning_rate": 9.838035205479418e-08, "logits/chosen": -2.302408218383789, "logits/rejected": -2.283946990966797, "logps/chosen": -45.20514678955078, "logps/rejected": -47.9190559387207, "loss": 0.6832, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.025177201256155968, "rewards/margins": 0.021118884906172752, "rewards/rejected": 0.004058316815644503, "step": 360 }, { "epoch": 0.5332852896135484, "grad_norm": 24.303964614868164, "learning_rate": 9.816158857167196e-08, "logits/chosen": -2.3255884647369385, "logits/rejected": -2.3051230907440186, "logps/chosen": -47.729164123535156, "logps/rejected": -48.6644401550293, "loss": 0.6856, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.02227911166846752, "rewards/margins": 0.016225317493081093, "rewards/rejected": 0.006053795572370291, "step": 370 }, { "epoch": 0.5476984055490496, "grad_norm": 24.593324661254883, "learning_rate": 9.7929246870768e-08, "logits/chosen": -2.323387384414673, "logits/rejected": -2.308987855911255, "logps/chosen": -47.08444595336914, "logps/rejected": -49.36346435546875, "loss": 0.6811, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.028125789016485214, "rewards/margins": 0.02563219703733921, "rewards/rejected": 0.0024935940746217966, "step": 380 }, { "epoch": 0.5621115214845509, "grad_norm": 21.75504493713379, "learning_rate": 9.768339245627993e-08, "logits/chosen": -2.294553518295288, "logits/rejected": -2.2829360961914062, "logps/chosen": -45.68154525756836, "logps/rejected": -48.066612243652344, "loss": 0.6796, "rewards/accuracies": 0.621874988079071, "rewards/chosen": 0.030594781041145325, "rewards/margins": 0.02912401221692562, "rewards/rejected": 0.0014707675436511636, "step": 390 }, { "epoch": 0.5765246374200522, "grad_norm": 21.862024307250977, "learning_rate": 9.742409464205059e-08, "logits/chosen": -2.3316078186035156, "logits/rejected": -2.3144497871398926, "logps/chosen": -48.248443603515625, "logps/rejected": -50.65959548950195, "loss": 0.6809, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.032895296812057495, "rewards/margins": 0.026586730033159256, "rewards/rejected": 0.006308571435511112, "step": 400 }, { "epoch": 0.5909377533555535, "grad_norm": 20.746496200561523, "learning_rate": 9.715142653202644e-08, "logits/chosen": -2.3111040592193604, "logits/rejected": -2.293923854827881, "logps/chosen": -45.711585998535156, "logps/rejected": -47.74725341796875, "loss": 0.6801, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.029646728187799454, "rewards/margins": 0.028508460149168968, "rewards/rejected": 0.001138267107307911, "step": 410 }, { "epoch": 0.6053508692910549, "grad_norm": 17.426355361938477, "learning_rate": 9.68654649996473e-08, "logits/chosen": -2.32612681388855, "logits/rejected": -2.314548969268799, "logps/chosen": -44.34481430053711, "logps/rejected": -46.076744079589844, "loss": 0.6775, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.03846784681081772, "rewards/margins": 0.0339292511343956, "rewards/rejected": 0.004538598004728556, "step": 420 }, { "epoch": 0.6197639852265562, "grad_norm": 22.193706512451172, "learning_rate": 9.656629066617335e-08, "logits/chosen": -2.312929630279541, "logits/rejected": -2.2936108112335205, "logps/chosen": -51.8912467956543, "logps/rejected": -53.661651611328125, "loss": 0.6762, "rewards/accuracies": 0.640625, "rewards/chosen": 0.034915387630462646, "rewards/margins": 0.0364970937371254, "rewards/rejected": -0.001581709599122405, "step": 430 }, { "epoch": 0.6341771011620575, "grad_norm": 20.207712173461914, "learning_rate": 9.62539878779556e-08, "logits/chosen": -2.3140978813171387, "logits/rejected": -2.3001463413238525, "logps/chosen": -46.92143630981445, "logps/rejected": -49.142173767089844, "loss": 0.6789, "rewards/accuracies": 0.578125, "rewards/chosen": 0.0315568745136261, "rewards/margins": 0.03146868199110031, "rewards/rejected": 8.819000504445285e-05, "step": 440 }, { "epoch": 0.6485902170975588, "grad_norm": 26.56516456604004, "learning_rate": 9.592864468265604e-08, "logits/chosen": -2.3407697677612305, "logits/rejected": -2.3281524181365967, "logps/chosen": -48.421573638916016, "logps/rejected": -50.97784423828125, "loss": 0.6779, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": 0.033747684210538864, "rewards/margins": 0.03344957157969475, "rewards/rejected": 0.0002981135621666908, "step": 450 }, { "epoch": 0.6630033330330601, "grad_norm": 23.340192794799805, "learning_rate": 9.559035280442441e-08, "logits/chosen": -2.299325942993164, "logits/rejected": -2.2764008045196533, "logps/chosen": -46.117820739746094, "logps/rejected": -47.99044418334961, "loss": 0.6785, "rewards/accuracies": 0.59375, "rewards/chosen": 0.03373030200600624, "rewards/margins": 0.03273017704486847, "rewards/rejected": 0.0010001230984926224, "step": 460 }, { "epoch": 0.6774164489685613, "grad_norm": 20.64352798461914, "learning_rate": 9.523920761803823e-08, "logits/chosen": -2.3671813011169434, "logits/rejected": -2.346903085708618, "logps/chosen": -50.475929260253906, "logps/rejected": -52.254722595214844, "loss": 0.6762, "rewards/accuracies": 0.596875011920929, "rewards/chosen": 0.039838653057813644, "rewards/margins": 0.036926448345184326, "rewards/rejected": 0.002912207506597042, "step": 470 }, { "epoch": 0.6918295649040627, "grad_norm": 22.629377365112305, "learning_rate": 9.487530812201383e-08, "logits/chosen": -2.3167202472686768, "logits/rejected": -2.3114185333251953, "logps/chosen": -47.46406936645508, "logps/rejected": -51.130619049072266, "loss": 0.6793, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.036858368664979935, "rewards/margins": 0.030934974551200867, "rewards/rejected": 0.005923392251133919, "step": 480 }, { "epoch": 0.706242680839564, "grad_norm": 23.737966537475586, "learning_rate": 9.449875691069571e-08, "logits/chosen": -2.3150174617767334, "logits/rejected": -2.3096041679382324, "logps/chosen": -47.43329620361328, "logps/rejected": -51.78876876831055, "loss": 0.6761, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.041950371116399765, "rewards/margins": 0.03815682604908943, "rewards/rejected": 0.003793553216382861, "step": 490 }, { "epoch": 0.7206557967750653, "grad_norm": 22.39310646057129, "learning_rate": 9.410966014533195e-08, "logits/chosen": -2.3104045391082764, "logits/rejected": -2.2926175594329834, "logps/chosen": -48.61553955078125, "logps/rejected": -50.52370834350586, "loss": 0.6772, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 0.04291251674294472, "rewards/margins": 0.03616334870457649, "rewards/rejected": 0.00674917409196496, "step": 500 }, { "epoch": 0.7350689127105666, "grad_norm": 22.58913803100586, "learning_rate": 9.37081275241442e-08, "logits/chosen": -2.308424711227417, "logits/rejected": -2.2905216217041016, "logps/chosen": -44.69788360595703, "logps/rejected": -46.66436004638672, "loss": 0.679, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": 0.03513062000274658, "rewards/margins": 0.031742557883262634, "rewards/rejected": 0.003388059791177511, "step": 510 }, { "epoch": 0.7494820286460679, "grad_norm": 21.075912475585938, "learning_rate": 9.329427225140042e-08, "logits/chosen": -2.297698974609375, "logits/rejected": -2.280587911605835, "logps/chosen": -44.73003387451172, "logps/rejected": -47.60186004638672, "loss": 0.6711, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": 0.04519854113459587, "rewards/margins": 0.04976027458906174, "rewards/rejected": -0.004561735317111015, "step": 520 }, { "epoch": 0.7638951445815693, "grad_norm": 19.522680282592773, "learning_rate": 9.286821100549906e-08, "logits/chosen": -2.307309150695801, "logits/rejected": -2.272381544113159, "logps/chosen": -44.07297897338867, "logps/rejected": -46.60788345336914, "loss": 0.6686, "rewards/accuracies": 0.59375, "rewards/chosen": 0.04921981692314148, "rewards/margins": 0.05518011003732681, "rewards/rejected": -0.005960285663604736, "step": 530 }, { "epoch": 0.7783082605170706, "grad_norm": 21.739355087280273, "learning_rate": 9.243006390607402e-08, "logits/chosen": -2.3263185024261475, "logits/rejected": -2.316568613052368, "logps/chosen": -50.72089385986328, "logps/rejected": -54.4951057434082, "loss": 0.6673, "rewards/accuracies": 0.659375011920929, "rewards/chosen": 0.05768689513206482, "rewards/margins": 0.05719071626663208, "rewards/rejected": 0.0004961833474226296, "step": 540 }, { "epoch": 0.7927213764525718, "grad_norm": 22.593128204345703, "learning_rate": 9.197995448012912e-08, "logits/chosen": -2.340376853942871, "logits/rejected": -2.319169044494629, "logps/chosen": -49.77606964111328, "logps/rejected": -52.580345153808594, "loss": 0.6677, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.042526066303253174, "rewards/margins": 0.05721992999315262, "rewards/rejected": -0.014693861827254295, "step": 550 }, { "epoch": 0.8071344923880731, "grad_norm": 19.409584045410156, "learning_rate": 9.151800962721217e-08, "logits/chosen": -2.282543182373047, "logits/rejected": -2.2616894245147705, "logps/chosen": -45.87504959106445, "logps/rejected": -47.72060775756836, "loss": 0.668, "rewards/accuracies": 0.596875011920929, "rewards/chosen": 0.04841776564717293, "rewards/margins": 0.05708543583750725, "rewards/rejected": -0.008667677640914917, "step": 560 }, { "epoch": 0.8215476083235744, "grad_norm": 23.79241371154785, "learning_rate": 9.104435958363807e-08, "logits/chosen": -2.3320469856262207, "logits/rejected": -2.321155309677124, "logps/chosen": -45.04407501220703, "logps/rejected": -47.318321228027344, "loss": 0.67, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 0.0415559858083725, "rewards/margins": 0.053853344172239304, "rewards/rejected": -0.012297360226511955, "step": 570 }, { "epoch": 0.8359607242590757, "grad_norm": 20.56794548034668, "learning_rate": 9.055913788577128e-08, "logits/chosen": -2.300914764404297, "logits/rejected": -2.283904790878296, "logps/chosen": -49.61848068237305, "logps/rejected": -51.29194259643555, "loss": 0.6718, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.03667648881673813, "rewards/margins": 0.050280071794986725, "rewards/rejected": -0.013603581115603447, "step": 580 }, { "epoch": 0.8503738401945771, "grad_norm": 26.347415924072266, "learning_rate": 9.006248133237782e-08, "logits/chosen": -2.337028980255127, "logits/rejected": -2.301743984222412, "logps/chosen": -47.668113708496094, "logps/rejected": -48.43418884277344, "loss": 0.6724, "rewards/accuracies": 0.578125, "rewards/chosen": 0.036908913403749466, "rewards/margins": 0.04973364248871803, "rewards/rejected": -0.012824726291000843, "step": 590 }, { "epoch": 0.8647869561300784, "grad_norm": 23.15882110595703, "learning_rate": 8.955452994605753e-08, "logits/chosen": -2.316641330718994, "logits/rejected": -2.285348653793335, "logps/chosen": -50.31378173828125, "logps/rejected": -52.169654846191406, "loss": 0.67, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 0.032930124551057816, "rewards/margins": 0.05345344543457031, "rewards/rejected": -0.0205233171582222, "step": 600 }, { "epoch": 0.8792000720655797, "grad_norm": 22.113082885742188, "learning_rate": 8.903542693376747e-08, "logits/chosen": -2.2881808280944824, "logits/rejected": -2.277193784713745, "logps/chosen": -44.89480209350586, "logps/rejected": -48.47951126098633, "loss": 0.6676, "rewards/accuracies": 0.590624988079071, "rewards/chosen": 0.04625421017408371, "rewards/margins": 0.06053239107131958, "rewards/rejected": -0.014278176240622997, "step": 610 }, { "epoch": 0.893613188001081, "grad_norm": 24.942516326904297, "learning_rate": 8.850531864644748e-08, "logits/chosen": -2.297983407974243, "logits/rejected": -2.265835762023926, "logps/chosen": -45.724159240722656, "logps/rejected": -48.599876403808594, "loss": 0.6627, "rewards/accuracies": 0.596875011920929, "rewards/chosen": 0.047544609755277634, "rewards/margins": 0.07158304750919342, "rewards/rejected": -0.024038437753915787, "step": 620 }, { "epoch": 0.9080263039365822, "grad_norm": 21.02975845336914, "learning_rate": 8.796435453775943e-08, "logits/chosen": -2.3123021125793457, "logits/rejected": -2.3164889812469482, "logps/chosen": -48.549461364746094, "logps/rejected": -53.92346954345703, "loss": 0.6717, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.031459733843803406, "rewards/margins": 0.051202088594436646, "rewards/rejected": -0.01974235475063324, "step": 630 }, { "epoch": 0.9224394198720836, "grad_norm": 21.564970016479492, "learning_rate": 8.741268712195164e-08, "logits/chosen": -2.325171947479248, "logits/rejected": -2.2981820106506348, "logps/chosen": -45.690956115722656, "logps/rejected": -49.10464859008789, "loss": 0.6558, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.05857594683766365, "rewards/margins": 0.08950765430927277, "rewards/rejected": -0.030931716784834862, "step": 640 }, { "epoch": 0.9368525358075849, "grad_norm": 22.0821475982666, "learning_rate": 8.685047193086053e-08, "logits/chosen": -2.336580276489258, "logits/rejected": -2.329068660736084, "logps/chosen": -46.42755126953125, "logps/rejected": -49.21259307861328, "loss": 0.672, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.034749384969472885, "rewards/margins": 0.05111519619822502, "rewards/rejected": -0.016365814954042435, "step": 650 }, { "epoch": 0.9512656517430862, "grad_norm": 19.028423309326172, "learning_rate": 8.627786747006144e-08, "logits/chosen": -2.3250503540039062, "logits/rejected": -2.3135623931884766, "logps/chosen": -44.120948791503906, "logps/rejected": -47.624244689941406, "loss": 0.6613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.049294039607048035, "rewards/margins": 0.07464224100112915, "rewards/rejected": -0.025348205119371414, "step": 660 }, { "epoch": 0.9656787676785875, "grad_norm": 22.04196548461914, "learning_rate": 8.569503517418104e-08, "logits/chosen": -2.31101131439209, "logits/rejected": -2.2869081497192383, "logps/chosen": -48.54884719848633, "logps/rejected": -51.203269958496094, "loss": 0.6638, "rewards/accuracies": 0.590624988079071, "rewards/chosen": 0.04563365504145622, "rewards/margins": 0.0732460767030716, "rewards/rejected": -0.027612417936325073, "step": 670 }, { "epoch": 0.9800918836140888, "grad_norm": 23.0523624420166, "learning_rate": 8.510213936138402e-08, "logits/chosen": -2.271580696105957, "logits/rejected": -2.249781847000122, "logps/chosen": -47.92513656616211, "logps/rejected": -49.661014556884766, "loss": 0.6593, "rewards/accuracies": 0.596875011920929, "rewards/chosen": 0.055300354957580566, "rewards/margins": 0.07859370857477188, "rewards/rejected": -0.023293355479836464, "step": 680 }, { "epoch": 0.9945049995495902, "grad_norm": 23.048105239868164, "learning_rate": 8.449934718704685e-08, "logits/chosen": -2.301927328109741, "logits/rejected": -2.2857935428619385, "logps/chosen": -45.03214645385742, "logps/rejected": -48.306583404541016, "loss": 0.6656, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 0.04563567042350769, "rewards/margins": 0.06622838228940964, "rewards/rejected": -0.02059270814061165, "step": 690 }, { "epoch": 1.0089181154850915, "grad_norm": 21.652252197265625, "learning_rate": 8.388682859663152e-08, "logits/chosen": -2.284125804901123, "logits/rejected": -2.2744266986846924, "logps/chosen": -45.62473678588867, "logps/rejected": -48.55522918701172, "loss": 0.6568, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 0.053375959396362305, "rewards/margins": 0.08940999954938889, "rewards/rejected": -0.03603404015302658, "step": 700 }, { "epoch": 1.0233312314205927, "grad_norm": 22.383621215820312, "learning_rate": 8.326475627777277e-08, "logits/chosen": -2.289461612701416, "logits/rejected": -2.2808353900909424, "logps/chosen": -46.80120849609375, "logps/rejected": -50.2525520324707, "loss": 0.6506, "rewards/accuracies": 0.609375, "rewards/chosen": 0.05878598242998123, "rewards/margins": 0.10358556360006332, "rewards/rejected": -0.04479958117008209, "step": 710 }, { "epoch": 1.037744347356094, "grad_norm": 21.419694900512695, "learning_rate": 8.26333056115922e-08, "logits/chosen": -2.3371403217315674, "logits/rejected": -2.312253475189209, "logps/chosen": -47.29637908935547, "logps/rejected": -48.51680374145508, "loss": 0.6598, "rewards/accuracies": 0.621874988079071, "rewards/chosen": 0.04448707401752472, "rewards/margins": 0.08294164389371872, "rewards/rejected": -0.038454569876194, "step": 720 }, { "epoch": 1.0521574632915953, "grad_norm": 25.013229370117188, "learning_rate": 8.1992654623253e-08, "logits/chosen": -2.3014025688171387, "logits/rejected": -2.2878174781799316, "logps/chosen": -44.94810104370117, "logps/rejected": -50.638641357421875, "loss": 0.6423, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.050335705280303955, "rewards/margins": 0.12491028010845184, "rewards/rejected": -0.07457457482814789, "step": 730 }, { "epoch": 1.0665705792270967, "grad_norm": 21.155508041381836, "learning_rate": 8.134298393176915e-08, "logits/chosen": -2.2642767429351807, "logits/rejected": -2.2350783348083496, "logps/chosen": -44.5008659362793, "logps/rejected": -48.16975021362305, "loss": 0.6497, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.045155756175518036, "rewards/margins": 0.10517505556344986, "rewards/rejected": -0.06001930311322212, "step": 740 }, { "epoch": 1.080983695162598, "grad_norm": 20.4642276763916, "learning_rate": 8.068447669908356e-08, "logits/chosen": -2.2687973976135254, "logits/rejected": -2.2399725914001465, "logps/chosen": -49.161521911621094, "logps/rejected": -51.75724411010742, "loss": 0.651, "rewards/accuracies": 0.659375011920929, "rewards/chosen": 0.048742182552814484, "rewards/margins": 0.10028008371591568, "rewards/rejected": -0.0515378937125206, "step": 750 }, { "epoch": 1.0953968110980994, "grad_norm": 22.40403938293457, "learning_rate": 8.001731857842906e-08, "logits/chosen": -2.273935317993164, "logits/rejected": -2.265725612640381, "logps/chosen": -48.28545379638672, "logps/rejected": -49.2262077331543, "loss": 0.6661, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 0.03592974692583084, "rewards/margins": 0.0714699849486351, "rewards/rejected": -0.03554024174809456, "step": 760 }, { "epoch": 1.1098099270336006, "grad_norm": 23.13152503967285, "learning_rate": 7.934169766198712e-08, "logits/chosen": -2.3057303428649902, "logits/rejected": -2.281688690185547, "logps/chosen": -46.85943603515625, "logps/rejected": -50.839698791503906, "loss": 0.6527, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 0.0495249405503273, "rewards/margins": 0.09827858954668045, "rewards/rejected": -0.04875364899635315, "step": 770 }, { "epoch": 1.1242230429691018, "grad_norm": 26.3281192779541, "learning_rate": 7.86578044278589e-08, "logits/chosen": -2.320192575454712, "logits/rejected": -2.297276735305786, "logps/chosen": -48.3865966796875, "logps/rejected": -51.7015380859375, "loss": 0.6443, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.057862721383571625, "rewards/margins": 0.11766588687896729, "rewards/rejected": -0.05980316549539566, "step": 780 }, { "epoch": 1.1386361589046032, "grad_norm": 24.804874420166016, "learning_rate": 7.796583168636375e-08, "logits/chosen": -2.3215794563293457, "logits/rejected": -2.305657148361206, "logps/chosen": -48.32183074951172, "logps/rejected": -52.143150329589844, "loss": 0.6451, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 0.04665009304881096, "rewards/margins": 0.11924920231103897, "rewards/rejected": -0.07259909808635712, "step": 790 }, { "epoch": 1.1530492748401044, "grad_norm": 18.46070671081543, "learning_rate": 7.726597452568007e-08, "logits/chosen": -2.300274610519409, "logits/rejected": -2.2758469581604004, "logps/chosen": -46.92626190185547, "logps/rejected": -50.41767501831055, "loss": 0.6505, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.04316678270697594, "rewards/margins": 0.10699989646673203, "rewards/rejected": -0.06383311003446579, "step": 800 }, { "epoch": 1.1674623907756059, "grad_norm": 25.514638900756836, "learning_rate": 7.655843025684402e-08, "logits/chosen": -2.319176435470581, "logits/rejected": -2.308589458465576, "logps/chosen": -48.81470489501953, "logps/rejected": -52.32573318481445, "loss": 0.6545, "rewards/accuracies": 0.59375, "rewards/chosen": 0.04523879662156105, "rewards/margins": 0.09590532630681992, "rewards/rejected": -0.050666533410549164, "step": 810 }, { "epoch": 1.181875506711107, "grad_norm": 19.846221923828125, "learning_rate": 7.584339835812151e-08, "logits/chosen": -2.2872939109802246, "logits/rejected": -2.2704885005950928, "logps/chosen": -48.955596923828125, "logps/rejected": -49.713871002197266, "loss": 0.6512, "rewards/accuracies": 0.609375, "rewards/chosen": 0.0468595027923584, "rewards/margins": 0.1036025881767273, "rewards/rejected": -0.0567430779337883, "step": 820 }, { "epoch": 1.1962886226466085, "grad_norm": 21.35214614868164, "learning_rate": 7.512108041876924e-08, "logits/chosen": -2.253537654876709, "logits/rejected": -2.2463467121124268, "logps/chosen": -43.18073654174805, "logps/rejected": -45.50495147705078, "loss": 0.6565, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 0.0258056428283453, "rewards/margins": 0.09474330395460129, "rewards/rejected": -0.06893765181303024, "step": 830 }, { "epoch": 1.2107017385821097, "grad_norm": 21.493972778320312, "learning_rate": 7.439168008220056e-08, "logits/chosen": -2.293834686279297, "logits/rejected": -2.276794195175171, "logps/chosen": -45.07364273071289, "logps/rejected": -49.38530731201172, "loss": 0.6437, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": 0.04228875786066055, "rewards/margins": 0.12463889271020889, "rewards/rejected": -0.08235013484954834, "step": 840 }, { "epoch": 1.225114854517611, "grad_norm": 19.528154373168945, "learning_rate": 7.365540298857215e-08, "logits/chosen": -2.2884204387664795, "logits/rejected": -2.274423837661743, "logps/chosen": -50.20172119140625, "logps/rejected": -53.54350662231445, "loss": 0.6309, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 0.06707637757062912, "rewards/margins": 0.15372148156166077, "rewards/rejected": -0.08664509654045105, "step": 850 }, { "epoch": 1.2395279704531124, "grad_norm": 18.76193618774414, "learning_rate": 7.291245671680781e-08, "logits/chosen": -2.2782771587371826, "logits/rejected": -2.2508130073547363, "logps/chosen": -42.898929595947266, "logps/rejected": -46.70447540283203, "loss": 0.6438, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.046091653406620026, "rewards/margins": 0.12635795772075653, "rewards/rejected": -0.08026628941297531, "step": 860 }, { "epoch": 1.2539410863886136, "grad_norm": 21.306549072265625, "learning_rate": 7.216305072607568e-08, "logits/chosen": -2.310192823410034, "logits/rejected": -2.2984118461608887, "logps/chosen": -47.682533264160156, "logps/rejected": -50.96216583251953, "loss": 0.6397, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.029919151216745377, "rewards/margins": 0.13648976385593414, "rewards/rejected": -0.10657060146331787, "step": 870 }, { "epoch": 1.268354202324115, "grad_norm": 23.378387451171875, "learning_rate": 7.14073962967353e-08, "logits/chosen": -2.3239712715148926, "logits/rejected": -2.2972521781921387, "logps/chosen": -51.19306564331055, "logps/rejected": -52.882469177246094, "loss": 0.6301, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.04189233481884003, "rewards/margins": 0.16474562883377075, "rewards/rejected": -0.12285330146551132, "step": 880 }, { "epoch": 1.2827673182596162, "grad_norm": 25.997648239135742, "learning_rate": 7.064570647077124e-08, "logits/chosen": -2.3059449195861816, "logits/rejected": -2.2785696983337402, "logps/chosen": -48.22880935668945, "logps/rejected": -50.62089538574219, "loss": 0.6557, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 0.02739674784243107, "rewards/margins": 0.09727232158184052, "rewards/rejected": -0.06987558305263519, "step": 890 }, { "epoch": 1.2971804341951176, "grad_norm": 25.10089111328125, "learning_rate": 6.987819599173006e-08, "logits/chosen": -2.296082019805908, "logits/rejected": -2.2839462757110596, "logps/chosen": -46.80525207519531, "logps/rejected": -51.168792724609375, "loss": 0.6608, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 0.037016816437244415, "rewards/margins": 0.09267648309469223, "rewards/rejected": -0.05565967038273811, "step": 900 }, { "epoch": 1.3115935501306188, "grad_norm": 23.599550247192383, "learning_rate": 6.910508124417765e-08, "logits/chosen": -2.268026113510132, "logits/rejected": -2.2599265575408936, "logps/chosen": -43.31938934326172, "logps/rejected": -47.721927642822266, "loss": 0.6397, "rewards/accuracies": 0.609375, "rewards/chosen": 0.03140413016080856, "rewards/margins": 0.13336870074272156, "rewards/rejected": -0.101964570581913, "step": 910 }, { "epoch": 1.32600666606612, "grad_norm": 17.291772842407227, "learning_rate": 6.832658019269373e-08, "logits/chosen": -2.2494916915893555, "logits/rejected": -2.225579261779785, "logps/chosen": -45.836544036865234, "logps/rejected": -48.53606414794922, "loss": 0.6422, "rewards/accuracies": 0.640625, "rewards/chosen": 0.022828983142971992, "rewards/margins": 0.13301518559455872, "rewards/rejected": -0.11018619686365128, "step": 920 }, { "epoch": 1.3404197820016215, "grad_norm": 21.943674087524414, "learning_rate": 6.75429123204211e-08, "logits/chosen": -2.2929205894470215, "logits/rejected": -2.2728652954101562, "logps/chosen": -46.83967208862305, "logps/rejected": -51.28687286376953, "loss": 0.6367, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": 0.023386573418974876, "rewards/margins": 0.14642710983753204, "rewards/rejected": -0.12304054200649261, "step": 930 }, { "epoch": 1.354832897937123, "grad_norm": 23.507673263549805, "learning_rate": 6.675429856718652e-08, "logits/chosen": -2.2670910358428955, "logits/rejected": -2.2336556911468506, "logps/chosen": -45.63422393798828, "logps/rejected": -48.890968322753906, "loss": 0.6405, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.033365607261657715, "rewards/margins": 0.13297542929649353, "rewards/rejected": -0.09960982948541641, "step": 940 }, { "epoch": 1.3692460138726241, "grad_norm": 18.229249954223633, "learning_rate": 6.596096126721123e-08, "logits/chosen": -2.2298777103424072, "logits/rejected": -2.223173141479492, "logps/chosen": -45.84309768676758, "logps/rejected": -50.1830940246582, "loss": 0.6405, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.03936393931508064, "rewards/margins": 0.13684986531734467, "rewards/rejected": -0.09748590737581253, "step": 950 }, { "epoch": 1.3836591298081253, "grad_norm": 19.752758026123047, "learning_rate": 6.516312408642804e-08, "logits/chosen": -2.279609203338623, "logits/rejected": -2.279059410095215, "logps/chosen": -42.76288604736328, "logps/rejected": -48.05985641479492, "loss": 0.6364, "rewards/accuracies": 0.621874988079071, "rewards/chosen": 0.02452887035906315, "rewards/margins": 0.1416589766740799, "rewards/rejected": -0.1171300858259201, "step": 960 }, { "epoch": 1.3980722457436268, "grad_norm": 23.075225830078125, "learning_rate": 6.436101195942312e-08, "logits/chosen": -2.278662919998169, "logits/rejected": -2.2638564109802246, "logps/chosen": -48.03376007080078, "logps/rejected": -50.43840789794922, "loss": 0.6507, "rewards/accuracies": 0.621874988079071, "rewards/chosen": 0.02580978348851204, "rewards/margins": 0.11886006593704224, "rewards/rejected": -0.0930502638220787, "step": 970 }, { "epoch": 1.412485361679128, "grad_norm": 23.39198875427246, "learning_rate": 6.35548510260201e-08, "logits/chosen": -2.252816677093506, "logits/rejected": -2.236603260040283, "logps/chosen": -44.25969314575195, "logps/rejected": -48.64405059814453, "loss": 0.6368, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.021795066073536873, "rewards/margins": 0.155609592795372, "rewards/rejected": -0.1338145136833191, "step": 980 }, { "epoch": 1.4268984776146292, "grad_norm": 19.332801818847656, "learning_rate": 6.274486856752442e-08, "logits/chosen": -2.2814013957977295, "logits/rejected": -2.268404483795166, "logps/chosen": -43.803199768066406, "logps/rejected": -48.746334075927734, "loss": 0.6393, "rewards/accuracies": 0.640625, "rewards/chosen": 0.0010702230501919985, "rewards/margins": 0.15793892741203308, "rewards/rejected": -0.15686871111392975, "step": 990 }, { "epoch": 1.4413115935501306, "grad_norm": 17.606380462646484, "learning_rate": 6.193129294264568e-08, "logits/chosen": -2.286543130874634, "logits/rejected": -2.2696192264556885, "logps/chosen": -43.068660736083984, "logps/rejected": -47.965354919433594, "loss": 0.6332, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 0.026370450854301453, "rewards/margins": 0.15689751505851746, "rewards/rejected": -0.130527064204216, "step": 1000 }, { "epoch": 1.455724709485632, "grad_norm": 24.786046981811523, "learning_rate": 6.111435352311653e-08, "logits/chosen": -2.2797257900238037, "logits/rejected": -2.264954090118408, "logps/chosen": -45.28757095336914, "logps/rejected": -49.00934600830078, "loss": 0.6405, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.010084993205964565, "rewards/margins": 0.14780084788799286, "rewards/rejected": -0.13771584630012512, "step": 1010 }, { "epoch": 1.4701378254211332, "grad_norm": 26.044845581054688, "learning_rate": 6.02942806290257e-08, "logits/chosen": -2.2898941040039062, "logits/rejected": -2.2842609882354736, "logps/chosen": -48.29700469970703, "logps/rejected": -52.98006057739258, "loss": 0.6511, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.018248267471790314, "rewards/margins": 0.12896670401096344, "rewards/rejected": -0.11071842908859253, "step": 1020 }, { "epoch": 1.4845509413566345, "grad_norm": 16.25486946105957, "learning_rate": 5.947130546388376e-08, "logits/chosen": -2.2709572315216064, "logits/rejected": -2.2334933280944824, "logps/chosen": -47.84028244018555, "logps/rejected": -49.67833709716797, "loss": 0.6366, "rewards/accuracies": 0.625, "rewards/chosen": 0.009173278696835041, "rewards/margins": 0.1511419117450714, "rewards/rejected": -0.14196862280368805, "step": 1030 }, { "epoch": 1.4989640572921359, "grad_norm": 21.530025482177734, "learning_rate": 5.864566004943983e-08, "logits/chosen": -2.2685012817382812, "logits/rejected": -2.240506649017334, "logps/chosen": -50.17546463012695, "logps/rejected": -53.723243713378906, "loss": 0.6373, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": 0.00729725044220686, "rewards/margins": 0.15390922129154205, "rewards/rejected": -0.14661197364330292, "step": 1040 }, { "epoch": 1.513377173227637, "grad_norm": 23.661149978637695, "learning_rate": 5.78175771602676e-08, "logits/chosen": -2.2800815105438232, "logits/rejected": -2.277493953704834, "logps/chosen": -43.04485321044922, "logps/rejected": -46.71100997924805, "loss": 0.6347, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.01757994294166565, "rewards/margins": 0.1627650111913681, "rewards/rejected": -0.14518508315086365, "step": 1050 }, { "epoch": 1.5277902891631383, "grad_norm": 23.422590255737305, "learning_rate": 5.6987290258139073e-08, "logits/chosen": -2.2339248657226562, "logits/rejected": -2.2009925842285156, "logps/chosen": -48.11365509033203, "logps/rejected": -51.516197204589844, "loss": 0.6384, "rewards/accuracies": 0.653124988079071, "rewards/chosen": 0.02754228189587593, "rewards/margins": 0.15339332818984985, "rewards/rejected": -0.12585103511810303, "step": 1060 }, { "epoch": 1.5422034050986397, "grad_norm": 27.1705379486084, "learning_rate": 5.6155033426204615e-08, "logits/chosen": -2.2623772621154785, "logits/rejected": -2.2473068237304688, "logps/chosen": -51.77325439453125, "logps/rejected": -54.96337127685547, "loss": 0.6343, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": 0.01034320704638958, "rewards/margins": 0.15684732794761658, "rewards/rejected": -0.14650413393974304, "step": 1070 }, { "epoch": 1.5566165210341412, "grad_norm": 25.371421813964844, "learning_rate": 5.532104130299771e-08, "logits/chosen": -2.26564359664917, "logits/rejected": -2.250556230545044, "logps/chosen": -51.636810302734375, "logps/rejected": -54.49198532104492, "loss": 0.634, "rewards/accuracies": 0.671875, "rewards/chosen": 0.014989291317760944, "rewards/margins": 0.16550514101982117, "rewards/rejected": -0.1505158692598343, "step": 1080 }, { "epoch": 1.5710296369696424, "grad_norm": 26.66484260559082, "learning_rate": 5.448554901628333e-08, "logits/chosen": -2.2615230083465576, "logits/rejected": -2.251753330230713, "logps/chosen": -48.62981033325195, "logps/rejected": -53.1638069152832, "loss": 0.6483, "rewards/accuracies": 0.609375, "rewards/chosen": 0.001556683098897338, "rewards/margins": 0.1258353888988495, "rewards/rejected": -0.124278724193573, "step": 1090 }, { "epoch": 1.5854427529051436, "grad_norm": 23.419185638427734, "learning_rate": 5.364879211676816e-08, "logits/chosen": -2.2811994552612305, "logits/rejected": -2.272005558013916, "logps/chosen": -48.42817687988281, "logps/rejected": -53.677734375, "loss": 0.6268, "rewards/accuracies": 0.653124988079071, "rewards/chosen": 0.036996982991695404, "rewards/margins": 0.17727255821228027, "rewards/rejected": -0.14027558267116547, "step": 1100 }, { "epoch": 1.599855868840645, "grad_norm": 25.174808502197266, "learning_rate": 5.281100651169175e-08, "logits/chosen": -2.290276527404785, "logits/rejected": -2.277024030685425, "logps/chosen": -50.967430114746094, "logps/rejected": -55.05602264404297, "loss": 0.6327, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.041182320564985275, "rewards/margins": 0.17106422781944275, "rewards/rejected": -0.12988190352916718, "step": 1110 }, { "epoch": 1.6142689847761464, "grad_norm": 20.062232971191406, "learning_rate": 5.197242839831706e-08, "logits/chosen": -2.2445547580718994, "logits/rejected": -2.235886335372925, "logps/chosen": -45.176246643066406, "logps/rejected": -50.29578399658203, "loss": 0.6329, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.004402970429509878, "rewards/margins": 0.17448854446411133, "rewards/rejected": -0.17889150977134705, "step": 1120 }, { "epoch": 1.6286821007116477, "grad_norm": 24.2050838470459, "learning_rate": 5.1133294197339274e-08, "logits/chosen": -2.2965331077575684, "logits/rejected": -2.269667387008667, "logps/chosen": -47.03449630737305, "logps/rejected": -50.98591232299805, "loss": 0.6476, "rewards/accuracies": 0.625, "rewards/chosen": 0.0063371858559548855, "rewards/margins": 0.13688282668590546, "rewards/rejected": -0.13054564595222473, "step": 1130 }, { "epoch": 1.6430952166471489, "grad_norm": 22.103654861450195, "learning_rate": 5.029384048623153e-08, "logits/chosen": -2.253809690475464, "logits/rejected": -2.233123779296875, "logps/chosen": -48.81449508666992, "logps/rejected": -52.2353515625, "loss": 0.649, "rewards/accuracies": 0.640625, "rewards/chosen": 0.0027180719189345837, "rewards/margins": 0.13031043112277985, "rewards/rejected": -0.12759235501289368, "step": 1140 }, { "epoch": 1.6575083325826503, "grad_norm": 19.620878219604492, "learning_rate": 4.9454303932546675e-08, "logits/chosen": -2.2449498176574707, "logits/rejected": -2.2211432456970215, "logps/chosen": -47.44291687011719, "logps/rejected": -48.738990783691406, "loss": 0.6305, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.014069506898522377, "rewards/margins": 0.18172375857830048, "rewards/rejected": -0.16765424609184265, "step": 1150 }, { "epoch": 1.6719214485181515, "grad_norm": 22.302276611328125, "learning_rate": 4.861492122719338e-08, "logits/chosen": -2.2866158485412598, "logits/rejected": -2.264674425125122, "logps/chosen": -49.207786560058594, "logps/rejected": -52.398353576660156, "loss": 0.6408, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 0.012931051664054394, "rewards/margins": 0.15574631094932556, "rewards/rejected": -0.1428152620792389, "step": 1160 }, { "epoch": 1.6863345644536527, "grad_norm": 23.777292251586914, "learning_rate": 4.777592901770575e-08, "logits/chosen": -2.2917380332946777, "logits/rejected": -2.276853561401367, "logps/chosen": -42.12696075439453, "logps/rejected": -46.638572692871094, "loss": 0.6227, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.00846984051167965, "rewards/margins": 0.20765790343284607, "rewards/rejected": -0.19918808341026306, "step": 1170 }, { "epoch": 1.7007476803891541, "grad_norm": 25.883188247680664, "learning_rate": 4.693756384152529e-08, "logits/chosen": -2.254065752029419, "logits/rejected": -2.2268402576446533, "logps/chosen": -49.0328483581543, "logps/rejected": -53.10075759887695, "loss": 0.6234, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.0014540397096425295, "rewards/margins": 0.19669906795024872, "rewards/rejected": -0.19815312325954437, "step": 1180 }, { "epoch": 1.7151607963246556, "grad_norm": 20.475727081298828, "learning_rate": 4.610006205931365e-08, "logits/chosen": -2.2955899238586426, "logits/rejected": -2.279146194458008, "logps/chosen": -50.25695037841797, "logps/rejected": -52.637916564941406, "loss": 0.6331, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0065743387676775455, "rewards/margins": 0.1675548255443573, "rewards/rejected": -0.1609804779291153, "step": 1190 }, { "epoch": 1.7295739122601568, "grad_norm": 24.313840866088867, "learning_rate": 4.526365978831551e-08, "logits/chosen": -2.2783608436584473, "logits/rejected": -2.2607553005218506, "logps/chosen": -47.65949249267578, "logps/rejected": -51.959983825683594, "loss": 0.631, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.00622877012938261, "rewards/margins": 0.17043979465961456, "rewards/rejected": -0.17666855454444885, "step": 1200 }, { "epoch": 1.743987028195658, "grad_norm": 20.83986473083496, "learning_rate": 4.442859283578981e-08, "logits/chosen": -2.2713427543640137, "logits/rejected": -2.252413511276245, "logps/chosen": -45.476287841796875, "logps/rejected": -49.17133331298828, "loss": 0.6303, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.020858388394117355, "rewards/margins": 0.1802762746810913, "rewards/rejected": -0.20113465189933777, "step": 1210 }, { "epoch": 1.7584001441311594, "grad_norm": 23.378725051879883, "learning_rate": 4.359509663252864e-08, "logits/chosen": -2.251262903213501, "logits/rejected": -2.2400929927825928, "logps/chosen": -46.6330451965332, "logps/rejected": -51.300270080566406, "loss": 0.64, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.014711695723235607, "rewards/margins": 0.15496547520160675, "rewards/rejected": -0.16967716813087463, "step": 1220 }, { "epoch": 1.7728132600666606, "grad_norm": 20.105392456054688, "learning_rate": 4.276340616648198e-08, "logits/chosen": -2.3081746101379395, "logits/rejected": -2.2849628925323486, "logps/chosen": -46.36747360229492, "logps/rejected": -48.48231506347656, "loss": 0.6288, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.00806148536503315, "rewards/margins": 0.18995113670825958, "rewards/rejected": -0.18188965320587158, "step": 1230 }, { "epoch": 1.7872263760021618, "grad_norm": 20.087594985961914, "learning_rate": 4.193375591650758e-08, "logits/chosen": -2.2986416816711426, "logits/rejected": -2.275165557861328, "logps/chosen": -51.384132385253906, "logps/rejected": -54.01851272583008, "loss": 0.6311, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004061401821672916, "rewards/margins": 0.17897441983222961, "rewards/rejected": -0.18303583562374115, "step": 1240 }, { "epoch": 1.8016394919376633, "grad_norm": 21.758268356323242, "learning_rate": 4.110637978626415e-08, "logits/chosen": -2.257802963256836, "logits/rejected": -2.2421045303344727, "logps/chosen": -44.12052536010742, "logps/rejected": -49.74687576293945, "loss": 0.6012, "rewards/accuracies": 0.703125, "rewards/chosen": 0.036210522055625916, "rewards/margins": 0.24105677008628845, "rewards/rejected": -0.20484623312950134, "step": 1250 }, { "epoch": 1.8160526078731647, "grad_norm": 19.154407501220703, "learning_rate": 4.0281511038266867e-08, "logits/chosen": -2.201878309249878, "logits/rejected": -2.183927059173584, "logps/chosen": -49.49979782104492, "logps/rejected": -53.208091735839844, "loss": 0.6219, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.014157836325466633, "rewards/margins": 0.20892783999443054, "rewards/rejected": -0.2230856716632843, "step": 1260 }, { "epoch": 1.830465723808666, "grad_norm": 19.655216217041016, "learning_rate": 3.9459382228123475e-08, "logits/chosen": -2.2442800998687744, "logits/rejected": -2.238391160964966, "logps/chosen": -42.37008285522461, "logps/rejected": -47.834983825683594, "loss": 0.6055, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.017191831022500992, "rewards/margins": 0.2501511871814728, "rewards/rejected": -0.2329593151807785, "step": 1270 }, { "epoch": 1.844878839744167, "grad_norm": 24.132625579833984, "learning_rate": 3.864022513896989e-08, "logits/chosen": -2.255995750427246, "logits/rejected": -2.223539113998413, "logps/chosen": -44.1163330078125, "logps/rejected": -47.280296325683594, "loss": 0.6397, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.004155662842094898, "rewards/margins": 0.1677740067243576, "rewards/rejected": -0.17192967236042023, "step": 1280 }, { "epoch": 1.8592919556796685, "grad_norm": 24.518510818481445, "learning_rate": 3.782427071612339e-08, "logits/chosen": -2.278082847595215, "logits/rejected": -2.2593398094177246, "logps/chosen": -48.02725601196289, "logps/rejected": -51.3366813659668, "loss": 0.6259, "rewards/accuracies": 0.625, "rewards/chosen": -0.005302610341459513, "rewards/margins": 0.19030724465847015, "rewards/rejected": -0.19560983777046204, "step": 1290 }, { "epoch": 1.87370507161517, "grad_norm": 23.40630531311035, "learning_rate": 3.7011749001972174e-08, "logits/chosen": -2.2769112586975098, "logits/rejected": -2.2606639862060547, "logps/chosen": -43.46620559692383, "logps/rejected": -47.172611236572266, "loss": 0.6354, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.972890019416809e-05, "rewards/margins": 0.1833050400018692, "rewards/rejected": -0.18335476517677307, "step": 1300 }, { "epoch": 1.888118187550671, "grad_norm": 22.18378448486328, "learning_rate": 3.620288907111931e-08, "logits/chosen": -2.250119686126709, "logits/rejected": -2.2260870933532715, "logps/chosen": -45.36948776245117, "logps/rejected": -48.90538787841797, "loss": 0.6175, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.0030071833170950413, "rewards/margins": 0.2247433215379715, "rewards/rejected": -0.2277504950761795, "step": 1310 }, { "epoch": 1.9025313034861724, "grad_norm": 25.240890502929688, "learning_rate": 3.539791896579978e-08, "logits/chosen": -2.2886531352996826, "logits/rejected": -2.2704544067382812, "logps/chosen": -51.9533805847168, "logps/rejected": -54.27134323120117, "loss": 0.6407, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.04531291872262955, "rewards/margins": 0.1595449447631836, "rewards/rejected": -0.20485787093639374, "step": 1320 }, { "epoch": 1.9169444194216738, "grad_norm": 20.785062789916992, "learning_rate": 3.459706563158828e-08, "logits/chosen": -2.2486228942871094, "logits/rejected": -2.2339444160461426, "logps/chosen": -51.625091552734375, "logps/rejected": -56.50333786010742, "loss": 0.6202, "rewards/accuracies": 0.671875, "rewards/chosen": -0.01646718569099903, "rewards/margins": 0.2016635239124298, "rewards/rejected": -0.2181307077407837, "step": 1330 }, { "epoch": 1.931357535357175, "grad_norm": 18.25945281982422, "learning_rate": 3.380055485341644e-08, "logits/chosen": -2.2824645042419434, "logits/rejected": -2.2728466987609863, "logps/chosen": -47.70806121826172, "logps/rejected": -52.762123107910156, "loss": 0.6396, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.014143446460366249, "rewards/margins": 0.1655816286802292, "rewards/rejected": -0.17972508072853088, "step": 1340 }, { "epoch": 1.9457706512926762, "grad_norm": 27.237255096435547, "learning_rate": 3.300861119191718e-08, "logits/chosen": -2.2605082988739014, "logits/rejected": -2.240718126296997, "logps/chosen": -50.2346076965332, "logps/rejected": -52.86682891845703, "loss": 0.6388, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.022719861939549446, "rewards/margins": 0.16368316113948822, "rewards/rejected": -0.18640300631523132, "step": 1350 }, { "epoch": 1.9601837672281777, "grad_norm": 21.566373825073242, "learning_rate": 3.2221457920114213e-08, "logits/chosen": -2.281573534011841, "logits/rejected": -2.260481595993042, "logps/chosen": -44.61939239501953, "logps/rejected": -49.09664535522461, "loss": 0.6305, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.01838817074894905, "rewards/margins": 0.18449218571186066, "rewards/rejected": -0.2028803527355194, "step": 1360 }, { "epoch": 1.974596883163679, "grad_norm": 22.27100944519043, "learning_rate": 3.143931696047454e-08, "logits/chosen": -2.26967191696167, "logits/rejected": -2.254939079284668, "logps/chosen": -44.52263259887695, "logps/rejected": -48.41568374633789, "loss": 0.6323, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.027285214513540268, "rewards/margins": 0.18556539714336395, "rewards/rejected": -0.21285061538219452, "step": 1370 }, { "epoch": 1.9890099990991803, "grad_norm": 26.420007705688477, "learning_rate": 3.066240882234186e-08, "logits/chosen": -2.2749416828155518, "logits/rejected": -2.275374412536621, "logps/chosen": -49.047096252441406, "logps/rejected": -54.0599250793457, "loss": 0.6264, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.008531761355698109, "rewards/margins": 0.18480119109153748, "rewards/rejected": -0.1933329850435257, "step": 1380 }, { "epoch": 2.0034231150346815, "grad_norm": 21.0603084564209, "learning_rate": 2.989095253976816e-08, "logits/chosen": -2.261396884918213, "logits/rejected": -2.252016305923462, "logps/chosen": -47.794715881347656, "logps/rejected": -51.5122184753418, "loss": 0.6434, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.034887559711933136, "rewards/margins": 0.15125319361686707, "rewards/rejected": -0.1861407607793808, "step": 1390 }, { "epoch": 2.017836230970183, "grad_norm": 21.241836547851562, "learning_rate": 2.912516560976146e-08, "logits/chosen": -2.236016035079956, "logits/rejected": -2.2192585468292236, "logps/chosen": -45.089847564697266, "logps/rejected": -50.58442306518555, "loss": 0.6138, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.017469603568315506, "rewards/margins": 0.23276808857917786, "rewards/rejected": -0.25023767352104187, "step": 1400 }, { "epoch": 2.0322493469056844, "grad_norm": 21.38671112060547, "learning_rate": 2.836526393096661e-08, "logits/chosen": -2.2866103649139404, "logits/rejected": -2.277172088623047, "logps/chosen": -47.35311508178711, "logps/rejected": -52.31220626831055, "loss": 0.6238, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.022761840373277664, "rewards/margins": 0.19283010065555573, "rewards/rejected": -0.2155919373035431, "step": 1410 }, { "epoch": 2.0466624628411854, "grad_norm": 21.73073959350586, "learning_rate": 2.7611461742797165e-08, "logits/chosen": -2.2647812366485596, "logits/rejected": -2.2458884716033936, "logps/chosen": -42.06100082397461, "logps/rejected": -46.326446533203125, "loss": 0.6044, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.0016145706176757812, "rewards/margins": 0.2537813186645508, "rewards/rejected": -0.2521667778491974, "step": 1420 }, { "epoch": 2.061075578776687, "grad_norm": 26.361936569213867, "learning_rate": 2.686397156503445e-08, "logits/chosen": -2.266921043395996, "logits/rejected": -2.2467474937438965, "logps/chosen": -45.512332916259766, "logps/rejected": -49.53184127807617, "loss": 0.6322, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.04077625274658203, "rewards/margins": 0.18639865517616272, "rewards/rejected": -0.22717487812042236, "step": 1430 }, { "epoch": 2.075488694712188, "grad_norm": 22.614898681640625, "learning_rate": 2.6123004137912084e-08, "logits/chosen": -2.2463908195495605, "logits/rejected": -2.2374794483184814, "logps/chosen": -44.685935974121094, "logps/rejected": -49.08884811401367, "loss": 0.6136, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 0.0009622380021028221, "rewards/margins": 0.20883917808532715, "rewards/rejected": -0.20787692070007324, "step": 1440 }, { "epoch": 2.089901810647689, "grad_norm": 24.510360717773438, "learning_rate": 2.5388768362701585e-08, "logits/chosen": -2.240734100341797, "logits/rejected": -2.229846954345703, "logps/chosen": -50.42451095581055, "logps/rejected": -54.28529739379883, "loss": 0.634, "rewards/accuracies": 0.628125011920929, "rewards/chosen": 0.0019600100349634886, "rewards/margins": 0.17666472494602203, "rewards/rejected": -0.17470474541187286, "step": 1450 }, { "epoch": 2.1043149265831906, "grad_norm": 19.324567794799805, "learning_rate": 2.466147124281703e-08, "logits/chosen": -2.314894437789917, "logits/rejected": -2.2869911193847656, "logps/chosen": -46.29553985595703, "logps/rejected": -49.53280258178711, "loss": 0.6158, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.03561275079846382, "rewards/margins": 0.2275541126728058, "rewards/rejected": -0.2631668448448181, "step": 1460 }, { "epoch": 2.118728042518692, "grad_norm": 22.356782913208008, "learning_rate": 2.3941317825454278e-08, "logits/chosen": -2.263237476348877, "logits/rejected": -2.2347958087921143, "logps/chosen": -47.52599334716797, "logps/rejected": -51.044593811035156, "loss": 0.6157, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.00275771738961339, "rewards/margins": 0.22175411880016327, "rewards/rejected": -0.22451183199882507, "step": 1470 }, { "epoch": 2.1331411584541935, "grad_norm": 28.187788009643555, "learning_rate": 2.322851114378203e-08, "logits/chosen": -2.234954595565796, "logits/rejected": -2.228529691696167, "logps/chosen": -47.602561950683594, "logps/rejected": -51.5837287902832, "loss": 0.6259, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.056993234902620316, "rewards/margins": 0.20701098442077637, "rewards/rejected": -0.2640042006969452, "step": 1480 }, { "epoch": 2.1475542743896945, "grad_norm": 21.61797523498535, "learning_rate": 2.252325215970059e-08, "logits/chosen": -2.223036050796509, "logits/rejected": -2.206120014190674, "logps/chosen": -47.96808624267578, "logps/rejected": -53.099937438964844, "loss": 0.6109, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.013963679783046246, "rewards/margins": 0.2430466115474701, "rewards/rejected": -0.2570103108882904, "step": 1490 }, { "epoch": 2.161967390325196, "grad_norm": 23.325836181640625, "learning_rate": 2.182573970718449e-08, "logits/chosen": -2.2531425952911377, "logits/rejected": -2.2373714447021484, "logps/chosen": -48.79794692993164, "logps/rejected": -53.21220016479492, "loss": 0.6085, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.012846941128373146, "rewards/margins": 0.24934491515159607, "rewards/rejected": -0.2621918320655823, "step": 1500 }, { "epoch": 2.1763805062606973, "grad_norm": 24.3940372467041, "learning_rate": 2.113617043622536e-08, "logits/chosen": -2.222809076309204, "logits/rejected": -2.2011048793792725, "logps/chosen": -45.179893493652344, "logps/rejected": -49.6671257019043, "loss": 0.6207, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.028503399342298508, "rewards/margins": 0.22944235801696777, "rewards/rejected": -0.2579457759857178, "step": 1510 }, { "epoch": 2.1907936221961988, "grad_norm": 23.14939308166504, "learning_rate": 2.045473875739001e-08, "logits/chosen": -2.262533664703369, "logits/rejected": -2.2459208965301514, "logps/chosen": -49.05790328979492, "logps/rejected": -53.393707275390625, "loss": 0.6171, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.017335880547761917, "rewards/margins": 0.2230241745710373, "rewards/rejected": -0.2403600662946701, "step": 1520 }, { "epoch": 2.2052067381316998, "grad_norm": 23.007627487182617, "learning_rate": 1.9781636787010503e-08, "logits/chosen": -2.2727057933807373, "logits/rejected": -2.252056837081909, "logps/chosen": -50.972694396972656, "logps/rejected": -55.11802291870117, "loss": 0.6016, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": 0.011264542117714882, "rewards/margins": 0.26211434602737427, "rewards/rejected": -0.25084978342056274, "step": 1530 }, { "epoch": 2.219619854067201, "grad_norm": 23.422765731811523, "learning_rate": 1.911705429302038e-08, "logits/chosen": -2.2223095893859863, "logits/rejected": -2.2044150829315186, "logps/chosen": -47.016273498535156, "logps/rejected": -49.784297943115234, "loss": 0.6307, "rewards/accuracies": 0.671875, "rewards/chosen": -0.02597871981561184, "rewards/margins": 0.17457632720470428, "rewards/rejected": -0.20055504143238068, "step": 1540 }, { "epoch": 2.2340329700027026, "grad_norm": 20.89065933227539, "learning_rate": 1.8461178641453617e-08, "logits/chosen": -2.2396929264068604, "logits/rejected": -2.2292518615722656, "logps/chosen": -43.25371551513672, "logps/rejected": -46.49152755737305, "loss": 0.6194, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.04524295777082443, "rewards/margins": 0.21859809756278992, "rewards/rejected": -0.26384103298187256, "step": 1550 }, { "epoch": 2.2484460859382036, "grad_norm": 20.526012420654297, "learning_rate": 1.781419474362017e-08, "logits/chosen": -2.2325713634490967, "logits/rejected": -2.2240681648254395, "logps/chosen": -45.1739616394043, "logps/rejected": -50.48863220214844, "loss": 0.614, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03004586696624756, "rewards/margins": 0.22514891624450684, "rewards/rejected": -0.2551947832107544, "step": 1560 }, { "epoch": 2.262859201873705, "grad_norm": 19.403095245361328, "learning_rate": 1.7176285003974033e-08, "logits/chosen": -2.2410597801208496, "logits/rejected": -2.2172932624816895, "logps/chosen": -46.04480743408203, "logps/rejected": -49.85961151123047, "loss": 0.6228, "rewards/accuracies": 0.625, "rewards/chosen": -0.009199894964694977, "rewards/margins": 0.21350452303886414, "rewards/rejected": -0.2227044403553009, "step": 1570 }, { "epoch": 2.2772723178092065, "grad_norm": 22.78799057006836, "learning_rate": 1.6547629268687786e-08, "logits/chosen": -2.271934747695923, "logits/rejected": -2.2632410526275635, "logps/chosen": -42.374305725097656, "logps/rejected": -48.10682678222656, "loss": 0.6102, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.007346804253757, "rewards/margins": 0.23881065845489502, "rewards/rejected": -0.2461574524641037, "step": 1580 }, { "epoch": 2.291685433744708, "grad_norm": 19.8133487701416, "learning_rate": 1.59284047749485e-08, "logits/chosen": -2.239629030227661, "logits/rejected": -2.217893123626709, "logps/chosen": -45.59327697753906, "logps/rejected": -50.000091552734375, "loss": 0.6122, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.005661819130182266, "rewards/margins": 0.25302523374557495, "rewards/rejected": -0.2586870789527893, "step": 1590 }, { "epoch": 2.306098549680209, "grad_norm": 26.438047409057617, "learning_rate": 1.5318786100989188e-08, "logits/chosen": -2.2031397819519043, "logits/rejected": -2.195650577545166, "logps/chosen": -50.8388671875, "logps/rejected": -54.697113037109375, "loss": 0.6324, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.06328658759593964, "rewards/margins": 0.19921903312206268, "rewards/rejected": -0.2625056207180023, "step": 1600 }, { "epoch": 2.3205116656157103, "grad_norm": 24.46269416809082, "learning_rate": 1.471894511686988e-08, "logits/chosen": -2.2040247917175293, "logits/rejected": -2.189892530441284, "logps/chosen": -48.797935485839844, "logps/rejected": -50.93223190307617, "loss": 0.6291, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.05211130529642105, "rewards/margins": 0.1955440789461136, "rewards/rejected": -0.24765542149543762, "step": 1610 }, { "epoch": 2.3349247815512117, "grad_norm": 20.400657653808594, "learning_rate": 1.4129050936022214e-08, "logits/chosen": -2.211453914642334, "logits/rejected": -2.1973557472229004, "logps/chosen": -44.81827926635742, "logps/rejected": -49.232845306396484, "loss": 0.6262, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03966861963272095, "rewards/margins": 0.20301298797130585, "rewards/rejected": -0.2426815927028656, "step": 1620 }, { "epoch": 2.3493378974867127, "grad_norm": 23.146989822387695, "learning_rate": 1.3549269867571222e-08, "logits/chosen": -2.2070627212524414, "logits/rejected": -2.2003607749938965, "logps/chosen": -50.647216796875, "logps/rejected": -54.42750930786133, "loss": 0.6403, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.026784483343362808, "rewards/margins": 0.1822723001241684, "rewards/rejected": -0.2090567797422409, "step": 1630 }, { "epoch": 2.363751013422214, "grad_norm": 22.72720718383789, "learning_rate": 1.2979765369447742e-08, "logits/chosen": -2.2865893840789795, "logits/rejected": -2.261965036392212, "logps/chosen": -46.86061477661133, "logps/rejected": -51.7364616394043, "loss": 0.616, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.037126921117305756, "rewards/margins": 0.25349533557891846, "rewards/rejected": -0.2906222939491272, "step": 1640 }, { "epoch": 2.3781641293577156, "grad_norm": 24.311439514160156, "learning_rate": 1.2420698002304608e-08, "logits/chosen": -2.2154691219329834, "logits/rejected": -2.1979966163635254, "logps/chosen": -41.86544418334961, "logps/rejected": -47.02150344848633, "loss": 0.6235, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06195966154336929, "rewards/margins": 0.23036758601665497, "rewards/rejected": -0.29232725501060486, "step": 1650 }, { "epoch": 2.392577245293217, "grad_norm": 18.456798553466797, "learning_rate": 1.1872225384249768e-08, "logits/chosen": -2.2427659034729004, "logits/rejected": -2.232330799102783, "logps/chosen": -48.07630157470703, "logps/rejected": -53.72954177856445, "loss": 0.618, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.028111791238188744, "rewards/margins": 0.22475573420524597, "rewards/rejected": -0.25286751985549927, "step": 1660 }, { "epoch": 2.406990361228718, "grad_norm": 29.3465633392334, "learning_rate": 1.1334502146408881e-08, "logits/chosen": -2.2178328037261963, "logits/rejected": -2.2086894512176514, "logps/chosen": -48.64324188232422, "logps/rejected": -52.14066696166992, "loss": 0.6293, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.05821915715932846, "rewards/margins": 0.20005354285240173, "rewards/rejected": -0.2582727074623108, "step": 1670 }, { "epoch": 2.4214034771642194, "grad_norm": 21.647228240966797, "learning_rate": 1.0807679889330163e-08, "logits/chosen": -2.290550708770752, "logits/rejected": -2.289184331893921, "logps/chosen": -48.02743148803711, "logps/rejected": -52.760520935058594, "loss": 0.6348, "rewards/accuracies": 0.65625, "rewards/chosen": -0.035547636449337006, "rewards/margins": 0.17515380680561066, "rewards/rejected": -0.21070146560668945, "step": 1680 }, { "epoch": 2.435816593099721, "grad_norm": 23.48753547668457, "learning_rate": 1.0291907140243538e-08, "logits/chosen": -2.237405300140381, "logits/rejected": -2.217073917388916, "logps/chosen": -45.324363708496094, "logps/rejected": -51.532958984375, "loss": 0.5945, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.014387303963303566, "rewards/margins": 0.2854435443878174, "rewards/rejected": -0.2998308539390564, "step": 1690 }, { "epoch": 2.450229709035222, "grad_norm": 21.15599822998047, "learning_rate": 9.787329311186249e-09, "logits/chosen": -2.2333407402038574, "logits/rejected": -2.2088422775268555, "logps/chosen": -45.39387893676758, "logps/rejected": -48.80021286010742, "loss": 0.6107, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.010507221333682537, "rewards/margins": 0.24884942173957825, "rewards/rejected": -0.25935661792755127, "step": 1700 }, { "epoch": 2.4646428249707233, "grad_norm": 23.810028076171875, "learning_rate": 9.294088658006916e-09, "logits/chosen": -2.2471401691436768, "logits/rejected": -2.2246270179748535, "logps/chosen": -47.628807067871094, "logps/rejected": -52.19799041748047, "loss": 0.6198, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.026676829904317856, "rewards/margins": 0.24438074231147766, "rewards/rejected": -0.2710575461387634, "step": 1710 }, { "epoch": 2.4790559409062247, "grad_norm": 19.998149871826172, "learning_rate": 8.812324240259094e-09, "logits/chosen": -2.2432353496551514, "logits/rejected": -2.2173349857330322, "logps/chosen": -48.27139663696289, "logps/rejected": -52.815460205078125, "loss": 0.6092, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.016811879351735115, "rewards/margins": 0.2507106065750122, "rewards/rejected": -0.2675224542617798, "step": 1720 }, { "epoch": 2.493469056841726, "grad_norm": 23.85225486755371, "learning_rate": 8.342171881996351e-09, "logits/chosen": -2.2501230239868164, "logits/rejected": -2.2334678173065186, "logps/chosen": -47.89342498779297, "logps/rejected": -51.11579132080078, "loss": 0.6234, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.053554244339466095, "rewards/margins": 0.2084587812423706, "rewards/rejected": -0.2620130181312561, "step": 1730 }, { "epoch": 2.507882172777227, "grad_norm": 22.4180908203125, "learning_rate": 7.883764133479137e-09, "logits/chosen": -2.243783473968506, "logits/rejected": -2.2160232067108154, "logps/chosen": -44.910865783691406, "logps/rejected": -50.00033187866211, "loss": 0.6146, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.03045772947371006, "rewards/margins": 0.23907272517681122, "rewards/rejected": -0.2695304751396179, "step": 1740 }, { "epoch": 2.5222952887127286, "grad_norm": 26.53935432434082, "learning_rate": 7.43723023380502e-09, "logits/chosen": -2.185051441192627, "logits/rejected": -2.1775598526000977, "logps/chosen": -47.21742630004883, "logps/rejected": -51.211883544921875, "loss": 0.6049, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.0470205657184124, "rewards/margins": 0.2449074536561966, "rewards/rejected": -0.2919279932975769, "step": 1750 }, { "epoch": 2.53670840464823, "grad_norm": 21.709890365600586, "learning_rate": 7.002696074472075e-09, "logits/chosen": -2.233487606048584, "logits/rejected": -2.21376633644104, "logps/chosen": -50.533409118652344, "logps/rejected": -53.34821319580078, "loss": 0.6136, "rewards/accuracies": 0.671875, "rewards/chosen": -0.058121807873249054, "rewards/margins": 0.2471979856491089, "rewards/rejected": -0.30531978607177734, "step": 1760 }, { "epoch": 2.551121520583731, "grad_norm": 24.80657958984375, "learning_rate": 6.580284163886369e-09, "logits/chosen": -2.237624168395996, "logits/rejected": -2.225713014602661, "logps/chosen": -51.041221618652344, "logps/rejected": -54.869285583496094, "loss": 0.6162, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.038633137941360474, "rewards/margins": 0.22483810782432556, "rewards/rejected": -0.2634713053703308, "step": 1770 }, { "epoch": 2.5655346365192324, "grad_norm": 21.370935440063477, "learning_rate": 6.1701135928230566e-09, "logits/chosen": -2.1967005729675293, "logits/rejected": -2.1760928630828857, "logps/chosen": -51.85515594482422, "logps/rejected": -55.939857482910156, "loss": 0.6185, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07306867837905884, "rewards/margins": 0.2407924234867096, "rewards/rejected": -0.31386110186576843, "step": 1780 }, { "epoch": 2.579947752454734, "grad_norm": 22.11069679260254, "learning_rate": 5.7723000008510655e-09, "logits/chosen": -2.251694440841675, "logits/rejected": -2.226647138595581, "logps/chosen": -47.68011474609375, "logps/rejected": -50.42749786376953, "loss": 0.6207, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.06913193315267563, "rewards/margins": 0.22913888096809387, "rewards/rejected": -0.2982707917690277, "step": 1790 }, { "epoch": 2.5943608683902353, "grad_norm": 22.6718692779541, "learning_rate": 5.386955543730798e-09, "logits/chosen": -2.2583675384521484, "logits/rejected": -2.234276533126831, "logps/chosen": -49.61945343017578, "logps/rejected": -55.055259704589844, "loss": 0.6101, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0628320500254631, "rewards/margins": 0.25214314460754395, "rewards/rejected": -0.31497520208358765, "step": 1800 }, { "epoch": 2.6087739843257363, "grad_norm": 22.774456024169922, "learning_rate": 5.014188861794e-09, "logits/chosen": -2.201119899749756, "logits/rejected": -2.1815037727355957, "logps/chosen": -48.0115966796875, "logps/rejected": -53.1959228515625, "loss": 0.6, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05451589822769165, "rewards/margins": 0.2779920697212219, "rewards/rejected": -0.3325079679489136, "step": 1810 }, { "epoch": 2.6231871002612377, "grad_norm": 26.43316650390625, "learning_rate": 4.654105049314744e-09, "logits/chosen": -2.257012367248535, "logits/rejected": -2.2474682331085205, "logps/chosen": -49.35502243041992, "logps/rejected": -53.5402946472168, "loss": 0.62, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.02924627996981144, "rewards/margins": 0.22886374592781067, "rewards/rejected": -0.25811004638671875, "step": 1820 }, { "epoch": 2.637600216196739, "grad_norm": 25.40489959716797, "learning_rate": 4.3068056248801496e-09, "logits/chosen": -2.2381327152252197, "logits/rejected": -2.2229650020599365, "logps/chosen": -47.8486328125, "logps/rejected": -53.079307556152344, "loss": 0.624, "rewards/accuracies": 0.640625, "rewards/chosen": -0.05802469328045845, "rewards/margins": 0.22190991044044495, "rewards/rejected": -0.2799345850944519, "step": 1830 }, { "epoch": 2.65201333213224, "grad_norm": 24.248754501342773, "learning_rate": 3.972388502769225e-09, "logits/chosen": -2.2772481441497803, "logits/rejected": -2.2594523429870605, "logps/chosen": -52.78043746948242, "logps/rejected": -56.90644454956055, "loss": 0.612, "rewards/accuracies": 0.671875, "rewards/chosen": -0.07180557399988174, "rewards/margins": 0.2366293966770172, "rewards/rejected": -0.30843502283096313, "step": 1840 }, { "epoch": 2.6664264480677415, "grad_norm": 21.849775314331055, "learning_rate": 3.650947965347817e-09, "logits/chosen": -2.2574167251586914, "logits/rejected": -2.242363452911377, "logps/chosen": -49.745262145996094, "logps/rejected": -55.80384063720703, "loss": 0.5987, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.021196410059928894, "rewards/margins": 0.2885274291038513, "rewards/rejected": -0.3097238540649414, "step": 1850 }, { "epoch": 2.680839564003243, "grad_norm": 26.125858306884766, "learning_rate": 3.342574636487583e-09, "logits/chosen": -2.2989931106567383, "logits/rejected": -2.2858481407165527, "logps/chosen": -48.27467346191406, "logps/rejected": -52.693336486816406, "loss": 0.6224, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.0547061562538147, "rewards/margins": 0.20482540130615234, "rewards/rejected": -0.25953155755996704, "step": 1860 }, { "epoch": 2.6952526799387444, "grad_norm": 21.578413009643555, "learning_rate": 3.0473554560163207e-09, "logits/chosen": -2.2350730895996094, "logits/rejected": -2.2092785835266113, "logps/chosen": -45.995094299316406, "logps/rejected": -50.268714904785156, "loss": 0.6228, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.048373352736234665, "rewards/margins": 0.21610048413276672, "rewards/rejected": -0.2644737958908081, "step": 1870 }, { "epoch": 2.709665795874246, "grad_norm": 28.808284759521484, "learning_rate": 2.7653736552070207e-09, "logits/chosen": -2.26047945022583, "logits/rejected": -2.2417685985565186, "logps/chosen": -52.168251037597656, "logps/rejected": -55.9251594543457, "loss": 0.6094, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.016536137089133263, "rewards/margins": 0.24921099841594696, "rewards/rejected": -0.2657471299171448, "step": 1880 }, { "epoch": 2.724078911809747, "grad_norm": 20.450273513793945, "learning_rate": 2.496708733312419e-09, "logits/chosen": -2.2290151119232178, "logits/rejected": -2.222752094268799, "logps/chosen": -45.20454025268555, "logps/rejected": -50.58509826660156, "loss": 0.6211, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.056550413370132446, "rewards/margins": 0.211500883102417, "rewards/rejected": -0.26805129647254944, "step": 1890 }, { "epoch": 2.7384920277452482, "grad_norm": 19.10955047607422, "learning_rate": 2.241436435151717e-09, "logits/chosen": -2.234227418899536, "logits/rejected": -2.2173571586608887, "logps/chosen": -45.842037200927734, "logps/rejected": -50.64638137817383, "loss": 0.619, "rewards/accuracies": 0.59375, "rewards/chosen": -0.01847267895936966, "rewards/margins": 0.2231343686580658, "rewards/rejected": -0.24160704016685486, "step": 1900 }, { "epoch": 2.7529051436807492, "grad_norm": 18.778671264648438, "learning_rate": 1.9996287297558866e-09, "logits/chosen": -2.2211852073669434, "logits/rejected": -2.2099175453186035, "logps/chosen": -46.6721305847168, "logps/rejected": -51.1614990234375, "loss": 0.6287, "rewards/accuracies": 0.625, "rewards/chosen": -0.056424111127853394, "rewards/margins": 0.22064605355262756, "rewards/rejected": -0.27707016468048096, "step": 1910 }, { "epoch": 2.7673182596162507, "grad_norm": 19.748558044433594, "learning_rate": 1.7713537900772957e-09, "logits/chosen": -2.2626030445098877, "logits/rejected": -2.255251407623291, "logps/chosen": -43.59907913208008, "logps/rejected": -47.853111267089844, "loss": 0.6399, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.06305160373449326, "rewards/margins": 0.17098698019981384, "rewards/rejected": -0.2340385913848877, "step": 1920 }, { "epoch": 2.781731375551752, "grad_norm": 24.59339714050293, "learning_rate": 1.5566759737697998e-09, "logits/chosen": -2.2315924167633057, "logits/rejected": -2.215991258621216, "logps/chosen": -49.84008026123047, "logps/rejected": -53.77178955078125, "loss": 0.6144, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.022936221212148666, "rewards/margins": 0.2307942807674408, "rewards/rejected": -0.2537304759025574, "step": 1930 }, { "epoch": 2.7961444914872535, "grad_norm": 22.342159271240234, "learning_rate": 1.3556558050442425e-09, "logits/chosen": -2.2505908012390137, "logits/rejected": -2.233022928237915, "logps/chosen": -48.119747161865234, "logps/rejected": -54.29258346557617, "loss": 0.609, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.041114725172519684, "rewards/margins": 0.2513982951641083, "rewards/rejected": -0.292512983083725, "step": 1940 }, { "epoch": 2.810557607422755, "grad_norm": 23.257844924926758, "learning_rate": 1.1683499576049583e-09, "logits/chosen": -2.2333426475524902, "logits/rejected": -2.222357749938965, "logps/chosen": -46.32872772216797, "logps/rejected": -51.00041961669922, "loss": 0.6181, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.05180941894650459, "rewards/margins": 0.22404730319976807, "rewards/rejected": -0.27585673332214355, "step": 1950 }, { "epoch": 2.824970723358256, "grad_norm": 20.40143585205078, "learning_rate": 9.948112386716167e-10, "logits/chosen": -2.2653684616088867, "logits/rejected": -2.2385129928588867, "logps/chosen": -52.7291259765625, "logps/rejected": -56.984962463378906, "loss": 0.6034, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.016901496797800064, "rewards/margins": 0.2864697277545929, "rewards/rejected": -0.30337125062942505, "step": 1960 }, { "epoch": 2.8393838392937574, "grad_norm": 24.3416748046875, "learning_rate": 8.350885740913416e-10, "logits/chosen": -2.199658155441284, "logits/rejected": -2.183741569519043, "logps/chosen": -48.105308532714844, "logps/rejected": -52.25703811645508, "loss": 0.6347, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.06897452473640442, "rewards/margins": 0.20502345263957977, "rewards/rejected": -0.273997962474823, "step": 1970 }, { "epoch": 2.8537969552292584, "grad_norm": 27.89311408996582, "learning_rate": 6.89226994544978e-10, "logits/chosen": -2.202312707901001, "logits/rejected": -2.1836862564086914, "logps/chosen": -50.84931564331055, "logps/rejected": -54.451866149902344, "loss": 0.63, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.0366673581302166, "rewards/margins": 0.19457104802131653, "rewards/rejected": -0.23123839497566223, "step": 1980 }, { "epoch": 2.86821007116476, "grad_norm": 23.623729705810547, "learning_rate": 5.572676228516038e-10, "logits/chosen": -2.23040509223938, "logits/rejected": -2.2104344367980957, "logps/chosen": -48.78104019165039, "logps/rejected": -54.99827194213867, "loss": 0.6092, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.02846195362508297, "rewards/margins": 0.24717013537883759, "rewards/rejected": -0.2756320834159851, "step": 1990 }, { "epoch": 2.882623187100261, "grad_norm": 23.66622543334961, "learning_rate": 4.3924766237473656e-10, "logits/chosen": -2.235327959060669, "logits/rejected": -2.2082302570343018, "logps/chosen": -47.306785583496094, "logps/rejected": -52.0076789855957, "loss": 0.5916, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.014220048673450947, "rewards/margins": 0.29147493839263916, "rewards/rejected": -0.2772548794746399, "step": 2000 }, { "epoch": 2.8970363030357626, "grad_norm": 22.549758911132812, "learning_rate": 3.35200386533574e-10, "logits/chosen": -2.2073681354522705, "logits/rejected": -2.191131114959717, "logps/chosen": -48.15802001953125, "logps/rejected": -52.05684280395508, "loss": 0.6241, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.039518579840660095, "rewards/margins": 0.22054967284202576, "rewards/rejected": -0.26006826758384705, "step": 2010 }, { "epoch": 2.911449418971264, "grad_norm": 23.719388961791992, "learning_rate": 2.4515512942220874e-10, "logits/chosen": -2.253865957260132, "logits/rejected": -2.2316102981567383, "logps/chosen": -50.0792236328125, "logps/rejected": -53.56706619262695, "loss": 0.6187, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.053325168788433075, "rewards/margins": 0.22036127746105194, "rewards/rejected": -0.2736864686012268, "step": 2020 }, { "epoch": 2.925862534906765, "grad_norm": 27.85939598083496, "learning_rate": 1.691372775394717e-10, "logits/chosen": -2.226738214492798, "logits/rejected": -2.2154603004455566, "logps/chosen": -47.156124114990234, "logps/rejected": -50.58882141113281, "loss": 0.6321, "rewards/accuracies": 0.640625, "rewards/chosen": -0.07649682462215424, "rewards/margins": 0.18964572250843048, "rewards/rejected": -0.2661425769329071, "step": 2030 }, { "epoch": 2.9402756508422665, "grad_norm": 26.678600311279297, "learning_rate": 1.0716826263165724e-10, "logits/chosen": -2.2740628719329834, "logits/rejected": -2.248230218887329, "logps/chosen": -47.3420295715332, "logps/rejected": -53.25926971435547, "loss": 0.6144, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.07511458545923233, "rewards/margins": 0.2504141330718994, "rewards/rejected": -0.32552871108055115, "step": 2040 }, { "epoch": 2.954688766777768, "grad_norm": 22.650192260742188, "learning_rate": 5.926555565031743e-11, "logits/chosen": -2.266533613204956, "logits/rejected": -2.254985809326172, "logps/chosen": -49.164058685302734, "logps/rejected": -53.79437255859375, "loss": 0.6302, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.06538953632116318, "rewards/margins": 0.20318007469177246, "rewards/rejected": -0.26856961846351624, "step": 2050 }, { "epoch": 2.969101882713269, "grad_norm": 24.89960289001465, "learning_rate": 2.544266182662458e-11, "logits/chosen": -2.2338194847106934, "logits/rejected": -2.210289239883423, "logps/chosen": -44.41590118408203, "logps/rejected": -49.89021301269531, "loss": 0.6038, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.04492425546050072, "rewards/margins": 0.25600799918174744, "rewards/rejected": -0.30093228816986084, "step": 2060 }, { "epoch": 2.9835149986487703, "grad_norm": 23.839759826660156, "learning_rate": 5.709116863872321e-12, "logits/chosen": -2.250903606414795, "logits/rejected": -2.2327446937561035, "logps/chosen": -45.14078140258789, "logps/rejected": -48.739810943603516, "loss": 0.6244, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.013980092480778694, "rewards/margins": 0.2065594643354416, "rewards/rejected": -0.22053952515125275, "step": 2070 }, { "epoch": 2.9964868029907215, "step": 2079, "total_flos": 0.0, "train_loss": 0.646036940936345, "train_runtime": 18428.4848, "train_samples_per_second": 3.614, "train_steps_per_second": 0.113 } ], "logging_steps": 10, "max_steps": 2079, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }