{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 192, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010416666666666666, "grad_norm": 16.651927947998047, "learning_rate": 2e-05, "logits/chosen": -0.780666708946228, "logits/rejected": -1.1369192600250244, "logps/chosen": -3061.823486328125, "logps/rejected": -220.772705078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.020833333333333332, "grad_norm": 13.908960342407227, "learning_rate": 4e-05, "logits/chosen": -0.8081845641136169, "logits/rejected": -1.2085561752319336, "logps/chosen": -4412.2578125, "logps/rejected": -267.08087158203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.03125, "grad_norm": 15.761119842529297, "learning_rate": 6e-05, "logits/chosen": -0.9457322359085083, "logits/rejected": -1.1844913959503174, "logps/chosen": -3298.71435546875, "logps/rejected": -223.52560424804688, "loss": 0.6763, "rewards/accuracies": 0.5, "rewards/chosen": -0.01404876634478569, "rewards/margins": -0.040423206984996796, "rewards/rejected": 0.026374435052275658, "step": 3 }, { "epoch": 0.041666666666666664, "grad_norm": 9.980835914611816, "learning_rate": 8e-05, "logits/chosen": -0.8384261131286621, "logits/rejected": -1.271468162536621, "logps/chosen": -3999.277587890625, "logps/rejected": -225.41534423828125, "loss": 0.4997, "rewards/accuracies": 1.0, "rewards/chosen": 0.3802902400493622, "rewards/margins": 0.5509151816368103, "rewards/rejected": -0.17062492668628693, "step": 4 }, { "epoch": 0.052083333333333336, "grad_norm": 6.138638973236084, "learning_rate": 0.0001, "logits/chosen": -0.8365007638931274, "logits/rejected": -1.362606406211853, "logps/chosen": -3391.94677734375, "logps/rejected": -245.4417724609375, "loss": 0.2954, "rewards/accuracies": 1.0, "rewards/chosen": 0.9162139892578125, "rewards/margins": 1.1493107080459595, "rewards/rejected": -0.23309671878814697, "step": 5 }, { "epoch": 0.0625, "grad_norm": 1.2922340631484985, "learning_rate": 0.00012, "logits/chosen": -0.8477813005447388, "logits/rejected": -1.4737606048583984, "logps/chosen": -7458.923828125, "logps/rejected": -282.75335693359375, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": 3.14288330078125, "rewards/margins": 5.705089569091797, "rewards/rejected": -2.562206268310547, "step": 6 }, { "epoch": 0.07291666666666667, "grad_norm": 0.03415762260556221, "learning_rate": 0.00014, "logits/chosen": -0.7428842186927795, "logits/rejected": -1.3752648830413818, "logps/chosen": -6296.4091796875, "logps/rejected": -276.24859619140625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 6.91751766204834, "rewards/margins": 11.728842735290527, "rewards/rejected": -4.8113250732421875, "step": 7 }, { "epoch": 0.08333333333333333, "grad_norm": 0.0005445057176984847, "learning_rate": 0.00016, "logits/chosen": -0.8007906079292297, "logits/rejected": -1.3979616165161133, "logps/chosen": -6775.5029296875, "logps/rejected": -370.7599182128906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.398236274719238, "rewards/margins": 20.943986892700195, "rewards/rejected": -9.545750617980957, "step": 8 }, { "epoch": 0.09375, "grad_norm": 0.005390194710344076, "learning_rate": 0.00018, "logits/chosen": -0.8591928482055664, "logits/rejected": -0.895159125328064, "logps/chosen": -2164.9541015625, "logps/rejected": -472.39715576171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.005324363708496, "rewards/margins": 25.770355224609375, "rewards/rejected": -20.765029907226562, "step": 9 }, { "epoch": 0.10416666666666667, "grad_norm": 3.3636655416557915e-07, "learning_rate": 0.0002, "logits/chosen": -0.8266401886940002, "logits/rejected": -0.8240926265716553, "logps/chosen": -2985.67236328125, "logps/rejected": -462.8504638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.8522109985351562, "rewards/margins": 27.728797912597656, "rewards/rejected": -23.876588821411133, "step": 10 }, { "epoch": 0.11458333333333333, "grad_norm": 5.287571491763288e-10, "learning_rate": 0.00019998510240408496, "logits/chosen": -0.9868767261505127, "logits/rejected": -0.8054253458976746, "logps/chosen": -1482.732177734375, "logps/rejected": -553.533935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.7153975963592529, "rewards/margins": 32.48991394042969, "rewards/rejected": -33.20531463623047, "step": 11 }, { "epoch": 0.125, "grad_norm": 26.858783721923828, "learning_rate": 0.00019994041405510705, "logits/chosen": -1.03706955909729, "logits/rejected": -0.9741817712783813, "logps/chosen": -4695.4384765625, "logps/rejected": -721.7908935546875, "loss": 0.0838, "rewards/accuracies": 1.0, "rewards/chosen": -9.903982162475586, "rewards/margins": 36.569175720214844, "rewards/rejected": -46.47315979003906, "step": 12 }, { "epoch": 0.13541666666666666, "grad_norm": 9.055187155659894e-10, "learning_rate": 0.0001998659482680456, "logits/chosen": -0.962079644203186, "logits/rejected": -1.3349699974060059, "logps/chosen": -9016.650390625, "logps/rejected": -793.662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.13396000862121582, "rewards/margins": 55.1308708190918, "rewards/rejected": -54.99691390991211, "step": 13 }, { "epoch": 0.14583333333333334, "grad_norm": 4.094690808642554e-09, "learning_rate": 0.0001997617272301248, "logits/chosen": -0.7595808506011963, "logits/rejected": -1.0764431953430176, "logps/chosen": -2220.482421875, "logps/rejected": -798.6175537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.7651824951171875, "rewards/margins": 52.960113525390625, "rewards/rejected": -55.72529602050781, "step": 14 }, { "epoch": 0.15625, "grad_norm": 9.434321379675481e-12, "learning_rate": 0.00019962778199420265, "logits/chosen": -0.8709142804145813, "logits/rejected": -1.1885780096054077, "logps/chosen": -4517.1044921875, "logps/rejected": -827.2481079101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.643299102783203, "rewards/margins": 46.70207977294922, "rewards/rejected": -59.34538269042969, "step": 15 }, { "epoch": 0.16666666666666666, "grad_norm": 2.213202831359747e-20, "learning_rate": 0.0001994641524695193, "logits/chosen": -0.8794220685958862, "logits/rejected": -0.8576048612594604, "logps/chosen": -2103.159912109375, "logps/rejected": -1043.563720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4151291847229004, "rewards/margins": 73.81220245361328, "rewards/rejected": -76.22732543945312, "step": 16 }, { "epoch": 0.17708333333333334, "grad_norm": 3.0938494821608487e-11, "learning_rate": 0.0001992708874098054, "logits/chosen": -0.8563255071640015, "logits/rejected": -1.1423197984695435, "logps/chosen": -5297.2294921875, "logps/rejected": -875.4127197265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.190893173217773, "rewards/margins": 58.889068603515625, "rewards/rejected": -64.0799560546875, "step": 17 }, { "epoch": 0.1875, "grad_norm": 1.6154518123142973e-19, "learning_rate": 0.00019904804439875633, "logits/chosen": -0.7584964632987976, "logits/rejected": -1.1960705518722534, "logps/chosen": -4858.4482421875, "logps/rejected": -979.1630249023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.413223743438721, "rewards/margins": 65.54483795166016, "rewards/rejected": -72.95806121826172, "step": 18 }, { "epoch": 0.19791666666666666, "grad_norm": 1.5190748665707273e-20, "learning_rate": 0.00019879568983287467, "logits/chosen": -0.9190940856933594, "logits/rejected": -1.1393502950668335, "logps/chosen": -2892.24267578125, "logps/rejected": -967.30615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.6857757568359375, "rewards/margins": 66.6280288696289, "rewards/rejected": -72.31380462646484, "step": 19 }, { "epoch": 0.20833333333333334, "grad_norm": 5.615921756608865e-20, "learning_rate": 0.0001985138989016874, "logits/chosen": -0.8962702751159668, "logits/rejected": -1.3306204080581665, "logps/chosen": -6762.5302734375, "logps/rejected": -995.3267211914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.49383544921875, "rewards/margins": 71.0030746459961, "rewards/rejected": -75.49691009521484, "step": 20 }, { "epoch": 0.21875, "grad_norm": 1.7029158998823055e-17, "learning_rate": 0.00019820275556534304, "logits/chosen": -0.8105654716491699, "logits/rejected": -1.3740750551223755, "logps/chosen": -9757.419921875, "logps/rejected": -954.4122314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.01361083984375, "rewards/margins": 78.48196411132812, "rewards/rejected": -71.46835327148438, "step": 21 }, { "epoch": 0.22916666666666666, "grad_norm": 0.0, "learning_rate": 0.00019786235252959553, "logits/chosen": -0.8967666625976562, "logits/rejected": -1.1008408069610596, "logps/chosen": -2355.2626953125, "logps/rejected": -1078.12060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.126736640930176, "rewards/margins": 69.51663970947266, "rewards/rejected": -82.64337921142578, "step": 22 }, { "epoch": 0.23958333333333334, "grad_norm": 0.0, "learning_rate": 0.00019749279121818235, "logits/chosen": -0.8905834555625916, "logits/rejected": -1.0848970413208008, "logps/chosen": -2228.7802734375, "logps/rejected": -996.7811279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.612191677093506, "rewards/margins": 73.11354064941406, "rewards/rejected": -76.7257308959961, "step": 23 }, { "epoch": 0.25, "grad_norm": 7.647393330508138e-19, "learning_rate": 0.0001970941817426052, "logits/chosen": -0.8828914165496826, "logits/rejected": -1.166571855545044, "logps/chosen": -3267.3076171875, "logps/rejected": -1007.6031494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.523654460906982, "rewards/margins": 67.767822265625, "rewards/rejected": -75.29147338867188, "step": 24 }, { "epoch": 0.2604166666666667, "grad_norm": 1.2719628243941199e-17, "learning_rate": 0.00019666664286932198, "logits/chosen": -0.9088311791419983, "logits/rejected": -1.0525307655334473, "logps/chosen": -1816.2103271484375, "logps/rejected": -972.173583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.783884048461914, "rewards/margins": 61.763031005859375, "rewards/rejected": -74.54691314697266, "step": 25 }, { "epoch": 0.2708333333333333, "grad_norm": 5.43750881430877e-17, "learning_rate": 0.00019621030198436006, "logits/chosen": -1.0071444511413574, "logits/rejected": -0.8765456676483154, "logps/chosen": -1344.3419189453125, "logps/rejected": -992.904052734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.93281364440918, "rewards/margins": 58.60974884033203, "rewards/rejected": -74.54256439208984, "step": 26 }, { "epoch": 0.28125, "grad_norm": 1.87694066018651e-12, "learning_rate": 0.0001957252950553616, "logits/chosen": -0.9608248472213745, "logits/rejected": -1.1069602966308594, "logps/chosen": -2613.587158203125, "logps/rejected": -1058.5823974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.87086296081543, "rewards/margins": 73.99343872070312, "rewards/rejected": -79.86430358886719, "step": 27 }, { "epoch": 0.2916666666666667, "grad_norm": 4.830028054873832e-19, "learning_rate": 0.00019521176659107142, "logits/chosen": -0.9120213985443115, "logits/rejected": -1.3496168851852417, "logps/chosen": -7237.09912109375, "logps/rejected": -1196.45458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.345461368560791, "rewards/margins": 95.51947784423828, "rewards/rejected": -93.17402648925781, "step": 28 }, { "epoch": 0.3020833333333333, "grad_norm": 4.793881215301673e-22, "learning_rate": 0.0001946698695982806, "logits/chosen": -0.8377887010574341, "logits/rejected": -1.4051532745361328, "logps/chosen": -4109.2421875, "logps/rejected": -1000.89111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.656280517578125, "rewards/margins": 71.58284759521484, "rewards/rejected": -77.23912811279297, "step": 29 }, { "epoch": 0.3125, "grad_norm": 6.545510560170225e-20, "learning_rate": 0.00019409976553623766, "logits/chosen": -1.0247584581375122, "logits/rejected": -1.1513103246688843, "logps/chosen": -3428.35400390625, "logps/rejected": -1014.9542236328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.2859251499176025, "rewards/margins": 76.68400573730469, "rewards/rejected": -77.96993255615234, "step": 30 }, { "epoch": 0.3229166666666667, "grad_norm": 4.597397983254012e-19, "learning_rate": 0.0001935016242685415, "logits/chosen": -0.8390676975250244, "logits/rejected": -1.2956117391586304, "logps/chosen": -9027.2763671875, "logps/rejected": -916.6038818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.76643705368042, "rewards/margins": 78.03052520751953, "rewards/rejected": -70.26408386230469, "step": 31 }, { "epoch": 0.3333333333333333, "grad_norm": 1.3613232990958984e-13, "learning_rate": 0.00019287562401253022, "logits/chosen": -0.9091357588768005, "logits/rejected": -1.3022887706756592, "logps/chosen": -4318.7275390625, "logps/rejected": -1064.7750244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.792261123657227, "rewards/margins": 69.23944854736328, "rewards/rejected": -82.03170776367188, "step": 32 }, { "epoch": 0.34375, "grad_norm": 1.1914185441577697e-15, "learning_rate": 0.00019222195128618106, "logits/chosen": -0.8301032781600952, "logits/rejected": -1.1454460620880127, "logps/chosen": -3503.22802734375, "logps/rejected": -1033.736083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.304028511047363, "rewards/margins": 86.28971099853516, "rewards/rejected": -80.98567962646484, "step": 33 }, { "epoch": 0.3541666666666667, "grad_norm": 1.2077592159173465e-17, "learning_rate": 0.00019154080085253666, "logits/chosen": -0.9025095105171204, "logits/rejected": -1.0029141902923584, "logps/chosen": -2160.233642578125, "logps/rejected": -988.8320922851562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.006901741027832, "rewards/margins": 67.99139404296875, "rewards/rejected": -74.99829864501953, "step": 34 }, { "epoch": 0.3645833333333333, "grad_norm": 0.0, "learning_rate": 0.0001908323756616754, "logits/chosen": -0.9620730876922607, "logits/rejected": -1.303593635559082, "logps/chosen": -4102.265625, "logps/rejected": -1039.8701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.7988373041152954, "rewards/margins": 78.1258544921875, "rewards/rejected": -79.92469024658203, "step": 35 }, { "epoch": 0.375, "grad_norm": 2.327858659183149e-19, "learning_rate": 0.0001900968867902419, "logits/chosen": -0.9782366156578064, "logits/rejected": -1.1345422267913818, "logps/chosen": -5296.07373046875, "logps/rejected": -1034.09130859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.9609848260879517, "rewards/margins": 76.84639739990234, "rewards/rejected": -75.88541412353516, "step": 36 }, { "epoch": 0.3854166666666667, "grad_norm": 5.993950300697435e-16, "learning_rate": 0.00018933455337855632, "logits/chosen": -0.9719871282577515, "logits/rejected": -1.2671374082565308, "logps/chosen": -3619.00830078125, "logps/rejected": -960.418212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.8363280296325684, "rewards/margins": 73.91217041015625, "rewards/rejected": -73.07584381103516, "step": 37 }, { "epoch": 0.3958333333333333, "grad_norm": 0.0, "learning_rate": 0.000188545602565321, "logits/chosen": -0.8795933127403259, "logits/rejected": -1.1734505891799927, "logps/chosen": -2961.7783203125, "logps/rejected": -1115.31494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.957999229431152, "rewards/margins": 72.54431915283203, "rewards/rejected": -87.5023193359375, "step": 38 }, { "epoch": 0.40625, "grad_norm": 9.981808812592569e-17, "learning_rate": 0.0001877302694199442, "logits/chosen": -0.9492607116699219, "logits/rejected": -1.0690467357635498, "logps/chosen": -3375.642578125, "logps/rejected": -970.4015502929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.076546669006348, "rewards/margins": 63.78575897216797, "rewards/rejected": -74.8623046875, "step": 39 }, { "epoch": 0.4166666666666667, "grad_norm": 1.677683582476988e-19, "learning_rate": 0.00018688879687250067, "logits/chosen": -0.7364388108253479, "logits/rejected": -1.2299830913543701, "logps/chosen": -4095.498046875, "logps/rejected": -1125.764892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.576887130737305, "rewards/margins": 71.2161865234375, "rewards/rejected": -86.79308319091797, "step": 40 }, { "epoch": 0.4270833333333333, "grad_norm": 0.0, "learning_rate": 0.0001860214356413501, "logits/chosen": -0.8531750440597534, "logits/rejected": -1.2205849885940552, "logps/chosen": -4229.4951171875, "logps/rejected": -1032.5693359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.7727296352386475, "rewards/margins": 79.98316192626953, "rewards/rejected": -78.2104263305664, "step": 41 }, { "epoch": 0.4375, "grad_norm": 7.346881715894921e-19, "learning_rate": 0.00018512844415843514, "logits/chosen": -0.8872064352035522, "logits/rejected": -1.347999095916748, "logps/chosen": -6776.28759765625, "logps/rejected": -1047.91064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.6724517345428467, "rewards/margins": 77.55692291259766, "rewards/rejected": -78.22937774658203, "step": 42 }, { "epoch": 0.4479166666666667, "grad_norm": 0.0, "learning_rate": 0.00018421008849228118, "logits/chosen": -0.9359738230705261, "logits/rejected": -1.132539987564087, "logps/chosen": -1939.6717529296875, "logps/rejected": -968.7603759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.06414794921875, "rewards/margins": 64.12657928466797, "rewards/rejected": -74.19072723388672, "step": 43 }, { "epoch": 0.4583333333333333, "grad_norm": 1.1837645481569055e-22, "learning_rate": 0.00018326664226872065, "logits/chosen": -1.025484561920166, "logits/rejected": -1.1521519422531128, "logps/chosen": -3046.67138671875, "logps/rejected": -1010.868896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.433685302734375, "rewards/margins": 79.09088134765625, "rewards/rejected": -76.65719604492188, "step": 44 }, { "epoch": 0.46875, "grad_norm": 1.0841111503063809e-20, "learning_rate": 0.00018229838658936564, "logits/chosen": -0.8892084956169128, "logits/rejected": -1.0169939994812012, "logps/chosen": -3305.859130859375, "logps/rejected": -1010.6842041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.042553901672363, "rewards/margins": 71.8106689453125, "rewards/rejected": -77.85322570800781, "step": 45 }, { "epoch": 0.4791666666666667, "grad_norm": 4.2357202397516193e-19, "learning_rate": 0.00018130560994785325, "logits/chosen": -0.8572372198104858, "logits/rejected": -1.3382017612457275, "logps/chosen": -3960.66796875, "logps/rejected": -1066.4102783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.808450698852539, "rewards/margins": 67.266357421875, "rewards/rejected": -82.07479858398438, "step": 46 }, { "epoch": 0.4895833333333333, "grad_norm": 0.0, "learning_rate": 0.00018028860814388827, "logits/chosen": -0.9053932428359985, "logits/rejected": -1.245862603187561, "logps/chosen": -4537.33935546875, "logps/rejected": -1066.2734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.734985828399658, "rewards/margins": 74.92082977294922, "rewards/rejected": -81.65581512451172, "step": 47 }, { "epoch": 0.5, "grad_norm": 0.0, "learning_rate": 0.00017924768419510904, "logits/chosen": -1.034196138381958, "logits/rejected": -1.2119903564453125, "logps/chosen": -4595.2041015625, "logps/rejected": -964.75146484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.3183517456054688, "rewards/margins": 72.99729919433594, "rewards/rejected": -74.3156509399414, "step": 48 }, { "epoch": 0.5104166666666666, "grad_norm": 1.567794059815996e-19, "learning_rate": 0.000178183148246803, "logits/chosen": -0.9009298086166382, "logits/rejected": -1.2393324375152588, "logps/chosen": -2687.751953125, "logps/rejected": -1079.8603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.771054267883301, "rewards/margins": 76.5450210571289, "rewards/rejected": -82.31607818603516, "step": 49 }, { "epoch": 0.5208333333333334, "grad_norm": 0.0, "learning_rate": 0.00017709531747949796, "logits/chosen": -0.7311493158340454, "logits/rejected": -0.9473284482955933, "logps/chosen": -1437.1895751953125, "logps/rejected": -1016.4439086914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1630539894104004, "rewards/margins": 73.96327209472656, "rewards/rejected": -75.1263198852539, "step": 50 }, { "epoch": 0.53125, "grad_norm": 7.311199956683466e-17, "learning_rate": 0.0001759845160144579, "logits/chosen": -0.9461476802825928, "logits/rejected": -1.2513340711593628, "logps/chosen": -9740.1708984375, "logps/rejected": -998.367431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.608297348022461, "rewards/margins": 67.94587707519531, "rewards/rejected": -76.5541763305664, "step": 51 }, { "epoch": 0.5416666666666666, "grad_norm": 0.0, "learning_rate": 0.00017485107481711012, "logits/chosen": -1.0065760612487793, "logits/rejected": -1.176657795906067, "logps/chosen": -3810.3505859375, "logps/rejected": -1045.2769775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.958648681640625, "rewards/margins": 79.65708923339844, "rewards/rejected": -80.61573791503906, "step": 52 }, { "epoch": 0.5520833333333334, "grad_norm": 0.0, "learning_rate": 0.00017369533159843369, "logits/chosen": -0.7354742288589478, "logits/rejected": -1.1330161094665527, "logps/chosen": -4675.5478515625, "logps/rejected": -1095.03857421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.42770004272461, "rewards/margins": 73.42713928222656, "rewards/rejected": -84.8548355102539, "step": 53 }, { "epoch": 0.5625, "grad_norm": 0.0, "learning_rate": 0.00017251763071433765, "logits/chosen": -0.846391499042511, "logits/rejected": -1.4115591049194336, "logps/chosen": -6438.18359375, "logps/rejected": -1123.985107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.262456893920898, "rewards/margins": 74.60464477539062, "rewards/rejected": -84.86710357666016, "step": 54 }, { "epoch": 0.5729166666666666, "grad_norm": 1.4700509215416664e-14, "learning_rate": 0.00017131832306305965, "logits/chosen": -0.960889458656311, "logits/rejected": -1.234442114830017, "logps/chosen": -3840.143310546875, "logps/rejected": -906.8041381835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.883606433868408, "rewards/margins": 67.3929443359375, "rewards/rejected": -71.27655029296875, "step": 55 }, { "epoch": 0.5833333333333334, "grad_norm": 0.0, "learning_rate": 0.00017009776598061495, "logits/chosen": -0.9445286393165588, "logits/rejected": -0.9528659582138062, "logps/chosen": -1339.93603515625, "logps/rejected": -1048.4464111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.9421162605285645, "rewards/margins": 72.82998657226562, "rewards/rejected": -78.77210998535156, "step": 56 }, { "epoch": 0.59375, "grad_norm": 0.0, "learning_rate": 0.0001688563231343277, "logits/chosen": -0.921560525894165, "logits/rejected": -1.464065432548523, "logps/chosen": -6107.14794921875, "logps/rejected": -1177.58154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.16182255744934082, "rewards/margins": 89.7105941772461, "rewards/rejected": -89.54876708984375, "step": 57 }, { "epoch": 0.6041666666666666, "grad_norm": 2.2782913188817383e-17, "learning_rate": 0.00016759436441447545, "logits/chosen": -0.9432457685470581, "logits/rejected": -0.9813876748085022, "logps/chosen": -2878.421875, "logps/rejected": -973.9136962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.945228576660156, "rewards/margins": 64.66465759277344, "rewards/rejected": -73.6098861694336, "step": 58 }, { "epoch": 0.6145833333333334, "grad_norm": 3.532171644213204e-18, "learning_rate": 0.00016631226582407952, "logits/chosen": -1.011328101158142, "logits/rejected": -1.1443777084350586, "logps/chosen": -4537.185546875, "logps/rejected": -1023.2095336914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.814834713935852, "rewards/margins": 78.44985961914062, "rewards/rejected": -77.63502502441406, "step": 59 }, { "epoch": 0.625, "grad_norm": 1.0837672078321995e-16, "learning_rate": 0.00016501040936687443, "logits/chosen": -0.7958677411079407, "logits/rejected": -1.2076987028121948, "logps/chosen": -3676.333984375, "logps/rejected": -991.25146484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.135693550109863, "rewards/margins": 61.884666442871094, "rewards/rejected": -76.0203628540039, "step": 60 }, { "epoch": 0.6354166666666666, "grad_norm": 5.757463345409198e-20, "learning_rate": 0.00016368918293348892, "logits/chosen": -0.9800617694854736, "logits/rejected": -1.1600340604782104, "logps/chosen": -3713.349853515625, "logps/rejected": -1017.238037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.153411865234375, "rewards/margins": 75.27999114990234, "rewards/rejected": -79.43340301513672, "step": 61 }, { "epoch": 0.6458333333333334, "grad_norm": 5.311725612781588e-16, "learning_rate": 0.00016234898018587337, "logits/chosen": -0.8364681005477905, "logits/rejected": -1.16648530960083, "logps/chosen": -2505.11962890625, "logps/rejected": -1125.0926513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.13332176208496, "rewards/margins": 66.983642578125, "rewards/rejected": -85.11697387695312, "step": 62 }, { "epoch": 0.65625, "grad_norm": 9.133200582579723e-18, "learning_rate": 0.00016099020044000727, "logits/chosen": -1.0037099123001099, "logits/rejected": -1.054494857788086, "logps/chosen": -2708.493408203125, "logps/rejected": -1137.5875244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.967913627624512, "rewards/margins": 81.5879898071289, "rewards/rejected": -88.555908203125, "step": 63 }, { "epoch": 0.6666666666666666, "grad_norm": 2.593275012522582e-16, "learning_rate": 0.00015961324854692254, "logits/chosen": -0.8983689546585083, "logits/rejected": -1.127020001411438, "logps/chosen": -2932.719482421875, "logps/rejected": -1062.8060302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.111117362976074, "rewards/margins": 79.9205551147461, "rewards/rejected": -82.03167724609375, "step": 64 }, { "epoch": 0.6770833333333334, "grad_norm": 0.0, "learning_rate": 0.00015821853477207708, "logits/chosen": -0.9519010186195374, "logits/rejected": -1.1288293600082397, "logps/chosen": -2201.759521484375, "logps/rejected": -898.9304809570312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.13864278793335, "rewards/margins": 63.858665466308594, "rewards/rejected": -69.997314453125, "step": 65 }, { "epoch": 0.6875, "grad_norm": 3.743392066509216e-23, "learning_rate": 0.00015680647467311557, "logits/chosen": -0.8885273337364197, "logits/rejected": -1.2975856065750122, "logps/chosen": -3025.942138671875, "logps/rejected": -998.9232177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.540674209594727, "rewards/margins": 62.277652740478516, "rewards/rejected": -76.81832885742188, "step": 66 }, { "epoch": 0.6979166666666666, "grad_norm": 1.5581050296373422e-17, "learning_rate": 0.0001553774889760533, "logits/chosen": -0.922539472579956, "logits/rejected": -1.1917293071746826, "logps/chosen": -2637.00439453125, "logps/rejected": -1103.3564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.390442848205566, "rewards/margins": 80.02389526367188, "rewards/rejected": -85.41433715820312, "step": 67 }, { "epoch": 0.7083333333333334, "grad_norm": 1.5840932732213986e-11, "learning_rate": 0.00015393200344991995, "logits/chosen": -0.9601885080337524, "logits/rejected": -1.379237174987793, "logps/chosen": -7276.7919921875, "logps/rejected": -1131.1331787109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.817126274108887, "rewards/margins": 79.90594482421875, "rewards/rejected": -85.72306823730469, "step": 68 }, { "epoch": 0.71875, "grad_norm": 3.447390015483765e-17, "learning_rate": 0.0001524704487799008, "logits/chosen": -0.9075456857681274, "logits/rejected": -1.2865498065948486, "logps/chosen": -4736.16015625, "logps/rejected": -1066.030029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.429553270339966, "rewards/margins": 80.66062927246094, "rewards/rejected": -83.0901870727539, "step": 69 }, { "epoch": 0.7291666666666666, "grad_norm": 0.0, "learning_rate": 0.0001509932604390136, "logits/chosen": -0.9576442837715149, "logits/rejected": -1.1714496612548828, "logps/chosen": -2777.48486328125, "logps/rejected": -981.0249633789062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5866700410842896, "rewards/margins": 74.16746520996094, "rewards/rejected": -74.75413513183594, "step": 70 }, { "epoch": 0.7395833333333334, "grad_norm": 0.0, "learning_rate": 0.00014950087855835815, "logits/chosen": -0.957038164138794, "logits/rejected": -1.309131383895874, "logps/chosen": -3568.40625, "logps/rejected": -1100.7939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.437561511993408, "rewards/margins": 76.48944854736328, "rewards/rejected": -83.92700958251953, "step": 71 }, { "epoch": 0.75, "grad_norm": 2.023447353270838e-18, "learning_rate": 0.00014799374779597867, "logits/chosen": -0.879957377910614, "logits/rejected": -1.1743906736373901, "logps/chosen": -2927.203369140625, "logps/rejected": -942.46728515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.356558322906494, "rewards/margins": 65.15306854248047, "rewards/rejected": -72.50962829589844, "step": 72 }, { "epoch": 0.7604166666666666, "grad_norm": 0.0, "learning_rate": 0.00014647231720437686, "logits/chosen": -0.9975137710571289, "logits/rejected": -1.096327543258667, "logps/chosen": -2194.671875, "logps/rejected": -1060.455322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.339242458343506, "rewards/margins": 73.82234191894531, "rewards/rejected": -81.16158294677734, "step": 73 }, { "epoch": 0.7708333333333334, "grad_norm": 0.0, "learning_rate": 0.00014493704009671613, "logits/chosen": -0.8398927450180054, "logits/rejected": -1.05116868019104, "logps/chosen": -2852.46875, "logps/rejected": -1053.2982177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2871978282928467, "rewards/margins": 77.74524688720703, "rewards/rejected": -81.0324478149414, "step": 74 }, { "epoch": 0.78125, "grad_norm": 2.823407507790421e-09, "learning_rate": 0.00014338837391175582, "logits/chosen": -0.8749938607215881, "logits/rejected": -1.3326060771942139, "logps/chosen": -6237.5947265625, "logps/rejected": -1008.5339965820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.8448486328125, "rewards/margins": 65.41060638427734, "rewards/rejected": -78.25544738769531, "step": 75 }, { "epoch": 0.7916666666666666, "grad_norm": 2.146854222062471e-20, "learning_rate": 0.0001418267800775565, "logits/chosen": -0.9367476105690002, "logits/rejected": -1.0628546476364136, "logps/chosen": -3435.0185546875, "logps/rejected": -1105.599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.55868911743164, "rewards/margins": 72.16712951660156, "rewards/rejected": -84.72582244873047, "step": 76 }, { "epoch": 0.8020833333333334, "grad_norm": 4.4584097976070534e-21, "learning_rate": 0.00014025272387399674, "logits/chosen": -1.013418436050415, "logits/rejected": -1.210724115371704, "logps/chosen": -4156.4990234375, "logps/rejected": -1065.85009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.16033363342285, "rewards/margins": 65.87461853027344, "rewards/rejected": -82.03495025634766, "step": 77 }, { "epoch": 0.8125, "grad_norm": 5.350105004942962e-20, "learning_rate": 0.0001386666742941419, "logits/chosen": -0.9383605718612671, "logits/rejected": -1.2753703594207764, "logps/chosen": -5303.810546875, "logps/rejected": -1006.8165893554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.698285102844238, "rewards/margins": 73.23123168945312, "rewards/rejected": -77.92951965332031, "step": 78 }, { "epoch": 0.8229166666666666, "grad_norm": 2.6276267314835005e-17, "learning_rate": 0.00013706910390450677, "logits/chosen": -0.928679347038269, "logits/rejected": -1.1945797204971313, "logps/chosen": -5853.28369140625, "logps/rejected": -996.289306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.84226131439209, "rewards/margins": 66.27059173583984, "rewards/rejected": -78.11285400390625, "step": 79 }, { "epoch": 0.8333333333333334, "grad_norm": 0.0, "learning_rate": 0.00013546048870425356, "logits/chosen": -0.90166175365448, "logits/rejected": -1.432413935661316, "logps/chosen": -5923.66845703125, "logps/rejected": -1065.8446044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.516674041748047, "rewards/margins": 72.67932891845703, "rewards/rejected": -82.19600677490234, "step": 80 }, { "epoch": 0.84375, "grad_norm": 0.0, "learning_rate": 0.00013384130798336705, "logits/chosen": -0.9168898463249207, "logits/rejected": -1.3556921482086182, "logps/chosen": -6369.79296875, "logps/rejected": -1242.0037841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.644824028015137, "rewards/margins": 84.6693344116211, "rewards/rejected": -96.31415557861328, "step": 81 }, { "epoch": 0.8541666666666666, "grad_norm": 9.279878730339463e-19, "learning_rate": 0.00013221204417984908, "logits/chosen": -0.8945307731628418, "logits/rejected": -1.1446576118469238, "logps/chosen": -4128.4306640625, "logps/rejected": -958.488037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.115054190158844, "rewards/margins": 73.7982177734375, "rewards/rejected": -73.91326904296875, "step": 82 }, { "epoch": 0.8645833333333334, "grad_norm": 2.394110249810547e-15, "learning_rate": 0.0001305731827359753, "logits/chosen": -0.9232196807861328, "logits/rejected": -1.1997020244598389, "logps/chosen": -4594.69287109375, "logps/rejected": -1065.5748291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.030016899108887, "rewards/margins": 74.59611511230469, "rewards/rejected": -81.62613677978516, "step": 83 }, { "epoch": 0.875, "grad_norm": 1.8590894193296648e-17, "learning_rate": 0.00012892521195365678, "logits/chosen": -1.0041048526763916, "logits/rejected": -1.1207554340362549, "logps/chosen": -3463.850341796875, "logps/rejected": -1104.9521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.943849563598633, "rewards/margins": 80.47566223144531, "rewards/rejected": -84.41950988769531, "step": 84 }, { "epoch": 0.8854166666666666, "grad_norm": 9.305548438880714e-21, "learning_rate": 0.00012726862284894938, "logits/chosen": -0.9365824460983276, "logits/rejected": -1.378727674484253, "logps/chosen": -5006.0732421875, "logps/rejected": -1108.4298095703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.526544570922852, "rewards/margins": 95.75658416748047, "rewards/rejected": -84.23004913330078, "step": 85 }, { "epoch": 0.8958333333333334, "grad_norm": 2.6055184079070218e-14, "learning_rate": 0.0001256039090057547, "logits/chosen": -0.8847850561141968, "logits/rejected": -1.2132179737091064, "logps/chosen": -3019.03466796875, "logps/rejected": -1116.0423583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.331069946289062, "rewards/margins": 68.22811889648438, "rewards/rejected": -85.55919647216797, "step": 86 }, { "epoch": 0.90625, "grad_norm": 1.337394415048057e-13, "learning_rate": 0.0001239315664287558, "logits/chosen": -0.9080939292907715, "logits/rejected": -1.1880122423171997, "logps/chosen": -6956.1767578125, "logps/rejected": -1072.7021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.3681154251098633, "rewards/margins": 78.97460174560547, "rewards/rejected": -81.34272003173828, "step": 87 }, { "epoch": 0.9166666666666666, "grad_norm": 1.1238145078521735e-18, "learning_rate": 0.00012225209339563145, "logits/chosen": -0.871356189250946, "logits/rejected": -1.2935049533843994, "logps/chosen": -7261.5849609375, "logps/rejected": -934.6922607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1817047595977783, "rewards/margins": 71.47894287109375, "rewards/rejected": -72.66064453125, "step": 88 }, { "epoch": 0.9270833333333334, "grad_norm": 3.7998657856768726e-21, "learning_rate": 0.00012056599030859366, "logits/chosen": -0.8709771633148193, "logits/rejected": -1.3064707517623901, "logps/chosen": -4282.35302734375, "logps/rejected": -1147.09765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.661895751953125, "rewards/margins": 80.48426818847656, "rewards/rejected": -88.14615631103516, "step": 89 }, { "epoch": 0.9375, "grad_norm": 3.9913009729067834e-17, "learning_rate": 0.00011887375954529168, "logits/chosen": -1.0120631456375122, "logits/rejected": -1.4202260971069336, "logps/chosen": -4564.931640625, "logps/rejected": -1029.2471923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 25.81535530090332, "rewards/margins": 104.50590515136719, "rewards/rejected": -78.6905517578125, "step": 90 }, { "epoch": 0.9479166666666666, "grad_norm": 4.153984713967665e-21, "learning_rate": 0.00011717590530912763, "logits/chosen": -0.8913455009460449, "logits/rejected": -1.2823923826217651, "logps/chosen": -3196.804443359375, "logps/rejected": -1069.353271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.811059951782227, "rewards/margins": 67.35678100585938, "rewards/rejected": -82.16783142089844, "step": 91 }, { "epoch": 0.9583333333333334, "grad_norm": 4.296434017478194e-15, "learning_rate": 0.00011547293347902812, "logits/chosen": -1.0650385618209839, "logits/rejected": -1.330976963043213, "logps/chosen": -5423.759765625, "logps/rejected": -1142.623779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.46921157836914, "rewards/margins": 78.08798217773438, "rewards/rejected": -89.55718994140625, "step": 92 }, { "epoch": 0.96875, "grad_norm": 6.566554551941457e-20, "learning_rate": 0.00011376535145871684, "logits/chosen": -0.9406051635742188, "logits/rejected": -1.2664682865142822, "logps/chosen": -3839.88671875, "logps/rejected": -1140.073486328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.9644775390625, "rewards/margins": 70.13662719726562, "rewards/rejected": -88.10110473632812, "step": 93 }, { "epoch": 0.9791666666666666, "grad_norm": 0.0, "learning_rate": 0.0001120536680255323, "logits/chosen": -0.9597834348678589, "logits/rejected": -1.1578893661499023, "logps/chosen": -2502.4541015625, "logps/rejected": -1016.5823974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.05165565013885498, "rewards/margins": 78.03656005859375, "rewards/rejected": -78.08821105957031, "step": 94 }, { "epoch": 0.9895833333333334, "grad_norm": 2.4043631071810274e-16, "learning_rate": 0.00011033839317883701, "logits/chosen": -0.876549482345581, "logits/rejected": -1.2577821016311646, "logps/chosen": -3624.76611328125, "logps/rejected": -1066.66015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.061856269836426, "rewards/margins": 95.77525329589844, "rewards/rejected": -82.7134017944336, "step": 95 }, { "epoch": 1.0, "grad_norm": 0.0, "learning_rate": 0.00010862003798806196, "logits/chosen": -0.9283385276794434, "logits/rejected": -1.1488637924194336, "logps/chosen": -4157.3427734375, "logps/rejected": -1044.188232421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.248306274414062, "rewards/margins": 71.64842224121094, "rewards/rejected": -81.896728515625, "step": 96 }, { "epoch": 1.0104166666666667, "grad_norm": 5.512825971716348e-20, "learning_rate": 0.00010689911444043248, "logits/chosen": -0.7668694257736206, "logits/rejected": -1.1733273267745972, "logps/chosen": -3267.61962890625, "logps/rejected": -1233.9364013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.425668716430664, "rewards/margins": 77.82489013671875, "rewards/rejected": -94.25055694580078, "step": 97 }, { "epoch": 1.0208333333333333, "grad_norm": 0.0, "learning_rate": 0.00010517613528842097, "logits/chosen": -1.0368558168411255, "logits/rejected": -1.1599735021591187, "logps/chosen": -3525.150634765625, "logps/rejected": -1166.176025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.8862044811248779, "rewards/margins": 90.17179107666016, "rewards/rejected": -89.28559112548828, "step": 98 }, { "epoch": 1.03125, "grad_norm": 0.0, "learning_rate": 0.00010345161389697082, "logits/chosen": -0.7527876496315002, "logits/rejected": -1.1752800941467285, "logps/chosen": -4636.1845703125, "logps/rejected": -945.1699829101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.4130585193634033, "rewards/margins": 77.43528747558594, "rewards/rejected": -74.02222442626953, "step": 99 }, { "epoch": 1.0416666666666667, "grad_norm": 0.0, "learning_rate": 0.00010172606409053886, "logits/chosen": -0.9287230968475342, "logits/rejected": -1.0137070417404175, "logps/chosen": -2631.903564453125, "logps/rejected": -1053.97802734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.378585815429688, "rewards/margins": 67.58805847167969, "rewards/rejected": -79.96664428710938, "step": 100 }, { "epoch": 1.0520833333333333, "grad_norm": 2.335907492570711e-16, "learning_rate": 0.0001, "logits/chosen": -0.9676191806793213, "logits/rejected": -1.2866928577423096, "logps/chosen": -3778.99462890625, "logps/rejected": -1169.9747314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.198566436767578, "rewards/margins": 72.15303802490234, "rewards/rejected": -91.35160064697266, "step": 101 }, { "epoch": 1.0625, "grad_norm": 3.0693231460494636e-17, "learning_rate": 9.827393590946116e-05, "logits/chosen": -0.8066754341125488, "logits/rejected": -1.3259482383728027, "logps/chosen": -4285.404296875, "logps/rejected": -1049.9791259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.3176026344299316, "rewards/margins": 77.5743179321289, "rewards/rejected": -80.89192199707031, "step": 102 }, { "epoch": 1.0729166666666667, "grad_norm": 7.908512019412167e-19, "learning_rate": 9.654838610302923e-05, "logits/chosen": -0.8500939607620239, "logits/rejected": -1.3845233917236328, "logps/chosen": -4154.744140625, "logps/rejected": -998.893798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.2993621826171875, "rewards/margins": 79.06625366210938, "rewards/rejected": -77.76689147949219, "step": 103 }, { "epoch": 1.0833333333333333, "grad_norm": 1.884301279524458e-19, "learning_rate": 9.482386471157904e-05, "logits/chosen": -0.803484320640564, "logits/rejected": -1.1875181198120117, "logps/chosen": -2875.72509765625, "logps/rejected": -1111.62060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.236685037612915, "rewards/margins": 81.31465148925781, "rewards/rejected": -83.55133819580078, "step": 104 }, { "epoch": 1.09375, "grad_norm": 0.0, "learning_rate": 9.31008855595675e-05, "logits/chosen": -0.9018905162811279, "logits/rejected": -1.1758604049682617, "logps/chosen": -2899.448486328125, "logps/rejected": -1177.0833740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.149624824523926, "rewards/margins": 84.1700668334961, "rewards/rejected": -90.31968688964844, "step": 105 }, { "epoch": 1.1041666666666667, "grad_norm": 4.704146143799659e-13, "learning_rate": 9.137996201193805e-05, "logits/chosen": -0.9577762484550476, "logits/rejected": -1.2123889923095703, "logps/chosen": -3999.40087890625, "logps/rejected": -1089.39453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.861725091934204, "rewards/margins": 79.9576187133789, "rewards/rejected": -83.81934356689453, "step": 106 }, { "epoch": 1.1145833333333333, "grad_norm": 3.220187770403815e-22, "learning_rate": 8.9661606821163e-05, "logits/chosen": -0.9275646209716797, "logits/rejected": -1.140000343322754, "logps/chosen": -6079.86474609375, "logps/rejected": -1045.149658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.459826946258545, "rewards/margins": 80.20182800292969, "rewards/rejected": -81.66165161132812, "step": 107 }, { "epoch": 1.125, "grad_norm": 0.0, "learning_rate": 8.79463319744677e-05, "logits/chosen": -0.9333359003067017, "logits/rejected": -1.2084262371063232, "logps/chosen": -3093.892578125, "logps/rejected": -1011.2752685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.587095260620117, "rewards/margins": 64.42304992675781, "rewards/rejected": -79.01013946533203, "step": 108 }, { "epoch": 1.1354166666666667, "grad_norm": 4.516256854982578e-16, "learning_rate": 8.62346485412832e-05, "logits/chosen": -0.9335057735443115, "logits/rejected": -1.2948254346847534, "logps/chosen": -3485.91015625, "logps/rejected": -943.7158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.117718696594238, "rewards/margins": 58.55492401123047, "rewards/rejected": -73.67264556884766, "step": 109 }, { "epoch": 1.1458333333333333, "grad_norm": 1.7443403811759252e-20, "learning_rate": 8.452706652097186e-05, "logits/chosen": -0.8658100962638855, "logits/rejected": -1.2765027284622192, "logps/chosen": -7044.2666015625, "logps/rejected": -1111.2900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.557941436767578, "rewards/margins": 70.0662841796875, "rewards/rejected": -84.62422943115234, "step": 110 }, { "epoch": 1.15625, "grad_norm": 8.975433514902403e-14, "learning_rate": 8.282409469087239e-05, "logits/chosen": -0.8567153215408325, "logits/rejected": -1.1527928113937378, "logps/chosen": -2753.281005859375, "logps/rejected": -1106.07861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.908112049102783, "rewards/margins": 77.96578979492188, "rewards/rejected": -84.8739013671875, "step": 111 }, { "epoch": 1.1666666666666667, "grad_norm": 1.8399236127730756e-13, "learning_rate": 8.112624045470835e-05, "logits/chosen": -0.8963369727134705, "logits/rejected": -1.1266080141067505, "logps/chosen": -1911.738037109375, "logps/rejected": -935.976806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.368061065673828, "rewards/margins": 60.68651580810547, "rewards/rejected": -71.05458068847656, "step": 112 }, { "epoch": 1.1770833333333333, "grad_norm": 4.411308711811606e-19, "learning_rate": 7.943400969140635e-05, "logits/chosen": -0.9802509546279907, "logits/rejected": -1.2022995948791504, "logps/chosen": -4026.29296875, "logps/rejected": -1028.704345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.705430030822754, "rewards/margins": 69.18346405029297, "rewards/rejected": -78.8888931274414, "step": 113 }, { "epoch": 1.1875, "grad_norm": 6.856654159841134e-19, "learning_rate": 7.774790660436858e-05, "logits/chosen": -0.7806614637374878, "logits/rejected": -1.2099708318710327, "logps/chosen": -3049.276123046875, "logps/rejected": -1083.7523193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.252618432044983, "rewards/margins": 82.09521484375, "rewards/rejected": -83.34783935546875, "step": 114 }, { "epoch": 1.1979166666666667, "grad_norm": 0.0, "learning_rate": 7.606843357124426e-05, "logits/chosen": -0.8244500160217285, "logits/rejected": -1.3403112888336182, "logps/chosen": -2625.35205078125, "logps/rejected": -1107.463134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.240277290344238, "rewards/margins": 75.68666076660156, "rewards/rejected": -84.92694091796875, "step": 115 }, { "epoch": 1.2083333333333333, "grad_norm": 6.782594740894701e-20, "learning_rate": 7.43960909942453e-05, "logits/chosen": -0.9496563673019409, "logits/rejected": -1.0177761316299438, "logps/chosen": -1753.7301025390625, "logps/rejected": -994.8018798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.330022811889648, "rewards/margins": 62.15534210205078, "rewards/rejected": -75.48535919189453, "step": 116 }, { "epoch": 1.21875, "grad_norm": 1.650893022233743e-17, "learning_rate": 7.273137715105063e-05, "logits/chosen": -0.8159215450286865, "logits/rejected": -1.11468505859375, "logps/chosen": -2901.86376953125, "logps/rejected": -1028.982666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.9665374755859375, "rewards/margins": 75.73015594482422, "rewards/rejected": -80.69668579101562, "step": 117 }, { "epoch": 1.2291666666666667, "grad_norm": 9.338665629292994e-18, "learning_rate": 7.107478804634325e-05, "logits/chosen": -0.9511147737503052, "logits/rejected": -1.131158471107483, "logps/chosen": -2839.02099609375, "logps/rejected": -916.0258178710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.969090938568115, "rewards/margins": 76.42955780029297, "rewards/rejected": -71.46046447753906, "step": 118 }, { "epoch": 1.2395833333333333, "grad_norm": 8.370479371910218e-23, "learning_rate": 6.942681726402473e-05, "logits/chosen": -0.945068895816803, "logits/rejected": -1.2830828428268433, "logps/chosen": -3800.778076171875, "logps/rejected": -905.9716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.0239747762680054, "rewards/margins": 69.88013458251953, "rewards/rejected": -68.85615539550781, "step": 119 }, { "epoch": 1.25, "grad_norm": 0.0, "learning_rate": 6.778795582015097e-05, "logits/chosen": -0.9145641326904297, "logits/rejected": -1.1153881549835205, "logps/chosen": -2149.349365234375, "logps/rejected": -1061.110107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.1018218994140625, "rewards/margins": 79.47099304199219, "rewards/rejected": -81.57280731201172, "step": 120 }, { "epoch": 1.2604166666666667, "grad_norm": 3.655288958431413e-17, "learning_rate": 6.615869201663296e-05, "logits/chosen": -0.8868393898010254, "logits/rejected": -1.3098794221878052, "logps/chosen": -5764.04736328125, "logps/rejected": -1124.540283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.0704987049102783, "rewards/margins": 85.03953552246094, "rewards/rejected": -86.11003112792969, "step": 121 }, { "epoch": 1.2708333333333333, "grad_norm": 5.505299111008798e-16, "learning_rate": 6.453951129574644e-05, "logits/chosen": -0.9295135140419006, "logits/rejected": -1.1804428100585938, "logps/chosen": -5864.943359375, "logps/rejected": -1078.9697265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.316208839416504, "rewards/margins": 71.87071228027344, "rewards/rejected": -82.18692016601562, "step": 122 }, { "epoch": 1.28125, "grad_norm": 0.0, "learning_rate": 6.293089609549325e-05, "logits/chosen": -0.7975695133209229, "logits/rejected": -1.3155672550201416, "logps/chosen": -3814.4150390625, "logps/rejected": -1204.483154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.016815185546875, "rewards/margins": 94.49725341796875, "rewards/rejected": -93.48043823242188, "step": 123 }, { "epoch": 1.2916666666666667, "grad_norm": 1.4148333719063387e-19, "learning_rate": 6.133332570585812e-05, "logits/chosen": -1.063108205795288, "logits/rejected": -1.2205861806869507, "logps/chosen": -2892.2685546875, "logps/rejected": -1125.844970703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.142558097839355, "rewards/margins": 72.0556411743164, "rewards/rejected": -85.19820404052734, "step": 124 }, { "epoch": 1.3020833333333333, "grad_norm": 5.8489230255285674e-21, "learning_rate": 5.9747276126003257e-05, "logits/chosen": -0.7630377411842346, "logits/rejected": -1.316693902015686, "logps/chosen": -3374.70263671875, "logps/rejected": -1142.8436279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.068140029907227, "rewards/margins": 79.92533874511719, "rewards/rejected": -87.99348449707031, "step": 125 }, { "epoch": 1.3125, "grad_norm": 1.6185688098081561e-18, "learning_rate": 5.817321992244351e-05, "logits/chosen": -0.9196380376815796, "logits/rejected": -1.2248481512069702, "logps/chosen": -3206.56591796875, "logps/rejected": -1012.30908203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.104272842407227, "rewards/margins": 65.4676284790039, "rewards/rejected": -76.5718994140625, "step": 126 }, { "epoch": 1.3229166666666667, "grad_norm": 2.5687962656379646e-21, "learning_rate": 5.6611626088244194e-05, "logits/chosen": -0.9707508087158203, "logits/rejected": -1.2447017431259155, "logps/chosen": -4089.699462890625, "logps/rejected": -1120.4154052734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.00418758392334, "rewards/margins": 76.06697082519531, "rewards/rejected": -85.07115936279297, "step": 127 }, { "epoch": 1.3333333333333333, "grad_norm": 2.0503403694456911e-22, "learning_rate": 5.506295990328385e-05, "logits/chosen": -0.9009501934051514, "logits/rejected": -1.353840708732605, "logps/chosen": -7221.36279296875, "logps/rejected": -1135.1197509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.12373661994934082, "rewards/margins": 88.40489959716797, "rewards/rejected": -88.28115844726562, "step": 128 }, { "epoch": 1.34375, "grad_norm": 1.344498075173984e-21, "learning_rate": 5.3527682795623146e-05, "logits/chosen": -0.8090143799781799, "logits/rejected": -1.0533530712127686, "logps/chosen": -2006.4832763671875, "logps/rejected": -1043.121826171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.14123249053955, "rewards/margins": 70.5691146850586, "rewards/rejected": -79.71034240722656, "step": 129 }, { "epoch": 1.3541666666666667, "grad_norm": 6.440810335695039e-21, "learning_rate": 5.200625220402139e-05, "logits/chosen": -0.9704806208610535, "logits/rejected": -1.0929075479507446, "logps/chosen": -2996.92822265625, "logps/rejected": -1068.793212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.935484886169434, "rewards/margins": 71.94236755371094, "rewards/rejected": -81.87785339355469, "step": 130 }, { "epoch": 1.3645833333333333, "grad_norm": 0.0, "learning_rate": 5.0499121441641864e-05, "logits/chosen": -0.9403969049453735, "logits/rejected": -1.3582621812820435, "logps/chosen": -8281.2607421875, "logps/rejected": -1090.318115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.1239867210388184, "rewards/margins": 84.06963348388672, "rewards/rejected": -82.94564819335938, "step": 131 }, { "epoch": 1.375, "grad_norm": 1.1185308141413086e-19, "learning_rate": 4.900673956098644e-05, "logits/chosen": -1.0328657627105713, "logits/rejected": -1.1909589767456055, "logps/chosen": -1817.351318359375, "logps/rejected": -1014.7122802734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.688718318939209, "rewards/margins": 72.41348266601562, "rewards/rejected": -78.1021957397461, "step": 132 }, { "epoch": 1.3854166666666667, "grad_norm": 3.2959890386576225e-18, "learning_rate": 4.75295512200992e-05, "logits/chosen": -0.9048435091972351, "logits/rejected": -1.2165101766586304, "logps/chosen": -3945.4169921875, "logps/rejected": -1073.447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.457312107086182, "rewards/margins": 76.04530334472656, "rewards/rejected": -81.50261688232422, "step": 133 }, { "epoch": 1.3958333333333333, "grad_norm": 0.0, "learning_rate": 4.606799655008009e-05, "logits/chosen": -0.903038740158081, "logits/rejected": -1.4270763397216797, "logps/chosen": -5543.68017578125, "logps/rejected": -1106.446533203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.1213319301605225, "rewards/margins": 83.86710357666016, "rewards/rejected": -86.98844146728516, "step": 134 }, { "epoch": 1.40625, "grad_norm": 0.0, "learning_rate": 4.462251102394669e-05, "logits/chosen": -0.8994408845901489, "logits/rejected": -1.3292877674102783, "logps/chosen": -3318.34814453125, "logps/rejected": -1010.1569213867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.89552640914917, "rewards/margins": 67.90574645996094, "rewards/rejected": -75.80128479003906, "step": 135 }, { "epoch": 1.4166666666666667, "grad_norm": 1.062754219307314e-21, "learning_rate": 4.3193525326884435e-05, "logits/chosen": -0.9541503190994263, "logits/rejected": -1.0579588413238525, "logps/chosen": -4469.5009765625, "logps/rejected": -955.7963256835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.544214248657227, "rewards/margins": 58.10245132446289, "rewards/rejected": -70.64666748046875, "step": 136 }, { "epoch": 1.4270833333333333, "grad_norm": 4.9187133763885766e-17, "learning_rate": 4.1781465227922957e-05, "logits/chosen": -1.0173542499542236, "logits/rejected": -1.2005999088287354, "logps/chosen": -2528.12060546875, "logps/rejected": -1051.4796142578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.2586699724197388, "rewards/margins": 77.34017944335938, "rewards/rejected": -78.59884643554688, "step": 137 }, { "epoch": 1.4375, "grad_norm": 0.0, "learning_rate": 4.038675145307747e-05, "logits/chosen": -0.967241644859314, "logits/rejected": -1.1873809099197388, "logps/chosen": -3900.778564453125, "logps/rejected": -1132.687744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1816892623901367, "rewards/margins": 83.33070373535156, "rewards/rejected": -84.51239013671875, "step": 138 }, { "epoch": 1.4479166666666667, "grad_norm": 2.2958917370833966e-17, "learning_rate": 3.900979955999271e-05, "logits/chosen": -0.8518582582473755, "logits/rejected": -0.9221400618553162, "logps/chosen": -1183.7716064453125, "logps/rejected": -911.7036743164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.865546226501465, "rewards/margins": 64.08380126953125, "rewards/rejected": -70.9493408203125, "step": 139 }, { "epoch": 1.4583333333333333, "grad_norm": 0.0, "learning_rate": 3.7651019814126654e-05, "logits/chosen": -0.9010441899299622, "logits/rejected": -1.190494179725647, "logps/chosen": -5493.0009765625, "logps/rejected": -1155.7314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.8245346546173096, "rewards/margins": 85.78129577636719, "rewards/rejected": -88.6058349609375, "step": 140 }, { "epoch": 1.46875, "grad_norm": 0.0, "learning_rate": 3.6310817066511105e-05, "logits/chosen": -1.0124684572219849, "logits/rejected": -1.103775978088379, "logps/chosen": -1913.3040771484375, "logps/rejected": -1010.6784057617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.425868511199951, "rewards/margins": 71.89915466308594, "rewards/rejected": -77.32502746582031, "step": 141 }, { "epoch": 1.4791666666666667, "grad_norm": 0.0, "learning_rate": 3.498959063312558e-05, "logits/chosen": -0.8120708465576172, "logits/rejected": -1.4091525077819824, "logps/chosen": -4661.7578125, "logps/rejected": -888.3680419921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.055895805358887, "rewards/margins": 62.75235366821289, "rewards/rejected": -67.8082504272461, "step": 142 }, { "epoch": 1.4895833333333333, "grad_norm": 1.9747184031200515e-15, "learning_rate": 3.36877341759205e-05, "logits/chosen": -0.8257265090942383, "logits/rejected": -1.121315836906433, "logps/chosen": -1288.9190673828125, "logps/rejected": -972.7769775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.115662097930908, "rewards/margins": 67.79209899902344, "rewards/rejected": -74.90776824951172, "step": 143 }, { "epoch": 1.5, "grad_norm": 0.0, "learning_rate": 3.2405635585524565e-05, "logits/chosen": -1.0017499923706055, "logits/rejected": -1.1351391077041626, "logps/chosen": -3693.326171875, "logps/rejected": -1070.761962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.161089897155762, "rewards/margins": 69.91401672363281, "rewards/rejected": -83.07510375976562, "step": 144 }, { "epoch": 1.5104166666666665, "grad_norm": 0.0, "learning_rate": 3.114367686567228e-05, "logits/chosen": -0.848055899143219, "logits/rejected": -1.0882829427719116, "logps/chosen": -2654.097412109375, "logps/rejected": -1041.62060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.7376070022583, "rewards/margins": 65.37726593017578, "rewards/rejected": -80.11487579345703, "step": 145 }, { "epoch": 1.5208333333333335, "grad_norm": 3.5242391416192426e-14, "learning_rate": 2.9902234019385057e-05, "logits/chosen": -0.8896538019180298, "logits/rejected": -1.258103370666504, "logps/chosen": -3455.951171875, "logps/rejected": -998.981201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.142499923706055, "rewards/margins": 64.65351104736328, "rewards/rejected": -75.79601287841797, "step": 146 }, { "epoch": 1.53125, "grad_norm": 4.5005137312183197e-13, "learning_rate": 2.8681676936940393e-05, "logits/chosen": -0.9204589128494263, "logits/rejected": -1.0349059104919434, "logps/chosen": -1558.69287109375, "logps/rejected": -1021.4735107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.636395454406738, "rewards/margins": 64.9701919555664, "rewards/rejected": -76.6065902709961, "step": 147 }, { "epoch": 1.5416666666666665, "grad_norm": 3.658716396254952e-13, "learning_rate": 2.7482369285662378e-05, "logits/chosen": -0.8321296572685242, "logits/rejected": -1.1634087562561035, "logps/chosen": -2103.5849609375, "logps/rejected": -974.1217041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -28.24515724182129, "rewards/margins": 45.907371520996094, "rewards/rejected": -74.15252685546875, "step": 148 }, { "epoch": 1.5520833333333335, "grad_norm": 3.7246280947998844e-22, "learning_rate": 2.6304668401566335e-05, "logits/chosen": -0.9244425296783447, "logits/rejected": -1.3096859455108643, "logps/chosen": -5354.45458984375, "logps/rejected": -1064.54443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.046282768249512, "rewards/margins": 76.85649871826172, "rewards/rejected": -81.90278625488281, "step": 149 }, { "epoch": 1.5625, "grad_norm": 1.2950027054019793e-20, "learning_rate": 2.514892518288988e-05, "logits/chosen": -0.8687339425086975, "logits/rejected": -1.2188998460769653, "logps/chosen": -4577.24462890625, "logps/rejected": -929.9271240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.647838115692139, "rewards/margins": 67.19004821777344, "rewards/rejected": -71.83788299560547, "step": 150 }, { "epoch": 1.5729166666666665, "grad_norm": 3.180693751106868e-17, "learning_rate": 2.401548398554213e-05, "logits/chosen": -0.8746656775474548, "logits/rejected": -1.1335382461547852, "logps/chosen": -2621.76708984375, "logps/rejected": -1069.098876953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.797000408172607, "rewards/margins": 76.91392517089844, "rewards/rejected": -81.71092987060547, "step": 151 }, { "epoch": 1.5833333333333335, "grad_norm": 0.0, "learning_rate": 2.290468252050204e-05, "logits/chosen": -0.7061495780944824, "logits/rejected": -1.2625129222869873, "logps/chosen": -5062.939453125, "logps/rejected": -1201.242919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.179224014282227, "rewards/margins": 82.08878326416016, "rewards/rejected": -93.26800537109375, "step": 152 }, { "epoch": 1.59375, "grad_norm": 0.0, "learning_rate": 2.181685175319702e-05, "logits/chosen": -0.7203177213668823, "logits/rejected": -1.3191170692443848, "logps/chosen": -3958.314453125, "logps/rejected": -1011.8955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7906357049942017, "rewards/margins": 78.65202331542969, "rewards/rejected": -77.86138916015625, "step": 153 }, { "epoch": 1.6041666666666665, "grad_norm": 0.0, "learning_rate": 2.0752315804890977e-05, "logits/chosen": -0.9195300340652466, "logits/rejected": -1.2774739265441895, "logps/chosen": -8186.68408203125, "logps/rejected": -1072.425048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.4632796049118042, "rewards/margins": 80.83226013183594, "rewards/rejected": -82.29553985595703, "step": 154 }, { "epoch": 1.6145833333333335, "grad_norm": 4.0147753867538186e-20, "learning_rate": 1.971139185611176e-05, "logits/chosen": -0.9163135886192322, "logits/rejected": -1.3071476221084595, "logps/chosen": -4530.13427734375, "logps/rejected": -1092.8966064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.696630954742432, "rewards/margins": 80.32160949707031, "rewards/rejected": -85.01823425292969, "step": 155 }, { "epoch": 1.625, "grad_norm": 1.5609576705471867e-19, "learning_rate": 1.8694390052146737e-05, "logits/chosen": -0.8604260683059692, "logits/rejected": -1.270695686340332, "logps/chosen": -3943.85595703125, "logps/rejected": -1087.4296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.993682861328125, "rewards/margins": 75.90512084960938, "rewards/rejected": -84.8988037109375, "step": 156 }, { "epoch": 1.6354166666666665, "grad_norm": 2.811402606227264e-17, "learning_rate": 1.7701613410634365e-05, "logits/chosen": -0.8890691995620728, "logits/rejected": -1.3079321384429932, "logps/chosen": -7185.2529296875, "logps/rejected": -1080.0333251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.33111572265625, "rewards/margins": 74.3923568725586, "rewards/rejected": -82.72347259521484, "step": 157 }, { "epoch": 1.6458333333333335, "grad_norm": 1.0592323188268055e-13, "learning_rate": 1.6733357731279377e-05, "logits/chosen": -0.8480991721153259, "logits/rejected": -1.1103326082229614, "logps/chosen": -3468.0048828125, "logps/rejected": -1067.985595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.42734670639038086, "rewards/margins": 82.94938659667969, "rewards/rejected": -83.3767318725586, "step": 158 }, { "epoch": 1.65625, "grad_norm": 0.0, "learning_rate": 1.5789911507718826e-05, "logits/chosen": -0.7679794430732727, "logits/rejected": -1.1826531887054443, "logps/chosen": -2692.7138671875, "logps/rejected": -1040.9814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.65127182006836, "rewards/margins": 67.80027770996094, "rewards/rejected": -81.45155334472656, "step": 159 }, { "epoch": 1.6666666666666665, "grad_norm": 2.101126106157376e-17, "learning_rate": 1.4871555841564887e-05, "logits/chosen": -0.7971072196960449, "logits/rejected": -1.2701507806777954, "logps/chosen": -2931.4375, "logps/rejected": -1022.1775512695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.8612060546875, "rewards/margins": 74.42487335205078, "rewards/rejected": -80.28607940673828, "step": 160 }, { "epoch": 1.6770833333333335, "grad_norm": 1.693179950985332e-16, "learning_rate": 1.3978564358649927e-05, "logits/chosen": -1.000674843788147, "logits/rejected": -1.1274532079696655, "logps/chosen": -2341.431640625, "logps/rejected": -1146.2425537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.8021240234375, "rewards/margins": 82.45307922363281, "rewards/rejected": -86.25519561767578, "step": 161 }, { "epoch": 1.6875, "grad_norm": 1.248795501349495e-18, "learning_rate": 1.311120312749935e-05, "logits/chosen": -0.9954484701156616, "logits/rejected": -1.0421305894851685, "logps/chosen": -1413.7686767578125, "logps/rejected": -919.8433837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.1903657913208, "rewards/margins": 60.226654052734375, "rewards/rejected": -70.4170150756836, "step": 162 }, { "epoch": 1.6979166666666665, "grad_norm": 7.506130692859752e-17, "learning_rate": 1.2269730580055805e-05, "logits/chosen": -0.7554247975349426, "logits/rejected": -1.1726787090301514, "logps/chosen": -2398.90234375, "logps/rejected": -1041.2353515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.836740493774414, "rewards/margins": 68.93004608154297, "rewards/rejected": -79.76679229736328, "step": 163 }, { "epoch": 1.7083333333333335, "grad_norm": 3.144996540161571e-16, "learning_rate": 1.1454397434679021e-05, "logits/chosen": -0.8778572082519531, "logits/rejected": -1.1429264545440674, "logps/chosen": -2819.60009765625, "logps/rejected": -1024.1851806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5772266387939453, "rewards/margins": 75.2021713256836, "rewards/rejected": -78.7793960571289, "step": 164 }, { "epoch": 1.71875, "grad_norm": 4.54889006982497e-15, "learning_rate": 1.0665446621443708e-05, "logits/chosen": -1.0242542028427124, "logits/rejected": -1.2949650287628174, "logps/chosen": -6388.978515625, "logps/rejected": -1101.523193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 23.088947296142578, "rewards/margins": 107.18841552734375, "rewards/rejected": -84.09947204589844, "step": 165 }, { "epoch": 1.7291666666666665, "grad_norm": 1.2591869387457155e-11, "learning_rate": 9.903113209758096e-06, "logits/chosen": -1.0302661657333374, "logits/rejected": -1.0212035179138184, "logps/chosen": -1567.7783203125, "logps/rejected": -1051.6270751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.1559648513793945, "rewards/margins": 72.15129089355469, "rewards/rejected": -78.30725860595703, "step": 166 }, { "epoch": 1.7395833333333335, "grad_norm": 0.0, "learning_rate": 9.1676243383246e-06, "logits/chosen": -0.9699329137802124, "logits/rejected": -1.217832088470459, "logps/chosen": -3370.20361328125, "logps/rejected": -1056.4344482421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.4962918758392334, "rewards/margins": 82.08119201660156, "rewards/rejected": -80.58489990234375, "step": 167 }, { "epoch": 1.75, "grad_norm": 6.902272136647668e-19, "learning_rate": 8.45919914746337e-06, "logits/chosen": -0.8314164876937866, "logits/rejected": -1.1248615980148315, "logps/chosen": -2003.40185546875, "logps/rejected": -976.593017578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.036044120788574, "rewards/margins": 70.12460327148438, "rewards/rejected": -75.16064453125, "step": 168 }, { "epoch": 1.7604166666666665, "grad_norm": 0.0, "learning_rate": 7.778048713818975e-06, "logits/chosen": -1.0096337795257568, "logits/rejected": -1.224050521850586, "logps/chosen": -3408.56396484375, "logps/rejected": -1120.5277099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.887763977050781, "rewards/margins": 79.56890869140625, "rewards/rejected": -85.45668029785156, "step": 169 }, { "epoch": 1.7708333333333335, "grad_norm": 0.0, "learning_rate": 7.124375987469767e-06, "logits/chosen": -0.7806891798973083, "logits/rejected": -1.4077966213226318, "logps/chosen": -4579.115234375, "logps/rejected": -1092.3642578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.048169136047363, "rewards/margins": 79.57754516601562, "rewards/rejected": -84.6257095336914, "step": 170 }, { "epoch": 1.78125, "grad_norm": 8.991780414846024e-21, "learning_rate": 6.498375731458528e-06, "logits/chosen": -0.8369664549827576, "logits/rejected": -1.2401611804962158, "logps/chosen": -4373.06640625, "logps/rejected": -1185.349365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.959744453430176, "rewards/margins": 80.5737533569336, "rewards/rejected": -91.53350067138672, "step": 171 }, { "epoch": 1.7916666666666665, "grad_norm": 3.254465541259741e-20, "learning_rate": 5.900234463762366e-06, "logits/chosen": -0.9217289090156555, "logits/rejected": -1.3291540145874023, "logps/chosen": -4704.79736328125, "logps/rejected": -1218.4345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.816030502319336, "rewards/margins": 75.90402221679688, "rewards/rejected": -94.72004699707031, "step": 172 }, { "epoch": 1.8020833333333335, "grad_norm": 6.992826570984174e-20, "learning_rate": 5.3301304017194135e-06, "logits/chosen": -0.8925175666809082, "logits/rejected": -1.3186287879943848, "logps/chosen": -5796.84375, "logps/rejected": -1010.6710815429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.582611560821533, "rewards/margins": 71.03655242919922, "rewards/rejected": -78.61917114257812, "step": 173 }, { "epoch": 1.8125, "grad_norm": 0.0, "learning_rate": 4.788233408928589e-06, "logits/chosen": -0.9071134328842163, "logits/rejected": -1.1578150987625122, "logps/chosen": -3065.755615234375, "logps/rejected": -1122.5201416015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.236775398254395, "rewards/margins": 73.01652526855469, "rewards/rejected": -87.2532958984375, "step": 174 }, { "epoch": 1.8229166666666665, "grad_norm": 5.8420251147017e-20, "learning_rate": 4.27470494463843e-06, "logits/chosen": -0.8542720079421997, "logits/rejected": -1.22062087059021, "logps/chosen": -4755.65087890625, "logps/rejected": -1087.69970703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.7637419700622559, "rewards/margins": 85.65948486328125, "rewards/rejected": -83.89574432373047, "step": 175 }, { "epoch": 1.8333333333333335, "grad_norm": 4.5275721134567166e-20, "learning_rate": 3.789698015639953e-06, "logits/chosen": -0.8436378836631775, "logits/rejected": -1.2432456016540527, "logps/chosen": -7707.376953125, "logps/rejected": -1006.131103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.132122755050659, "rewards/margins": 74.2339096069336, "rewards/rejected": -76.36602783203125, "step": 176 }, { "epoch": 1.84375, "grad_norm": 0.0, "learning_rate": 3.3333571306780497e-06, "logits/chosen": -1.1021344661712646, "logits/rejected": -1.146433711051941, "logps/chosen": -2144.27734375, "logps/rejected": -1221.460693359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.952573776245117, "rewards/margins": 81.55621337890625, "rewards/rejected": -95.5087890625, "step": 177 }, { "epoch": 1.8541666666666665, "grad_norm": 3.1447959791819358e-09, "learning_rate": 2.905818257394799e-06, "logits/chosen": -0.9445458650588989, "logits/rejected": -1.1954491138458252, "logps/chosen": -3473.337158203125, "logps/rejected": -898.44140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.790580749511719, "rewards/margins": 61.649497985839844, "rewards/rejected": -68.44007873535156, "step": 178 }, { "epoch": 1.8645833333333335, "grad_norm": 3.4308754787927946e-22, "learning_rate": 2.5072087818176382e-06, "logits/chosen": -0.9306582808494568, "logits/rejected": -1.2371917963027954, "logps/chosen": -3138.842041015625, "logps/rejected": -1054.2342529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.166943550109863, "rewards/margins": 68.14605712890625, "rewards/rejected": -81.31299591064453, "step": 179 }, { "epoch": 1.875, "grad_norm": 0.0, "learning_rate": 2.137647470404469e-06, "logits/chosen": -1.0094091892242432, "logits/rejected": -1.3734456300735474, "logps/chosen": -6787.31982421875, "logps/rejected": -909.1167602539062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.7543637752532959, "rewards/margins": 70.34402465820312, "rewards/rejected": -71.09838104248047, "step": 180 }, { "epoch": 1.8854166666666665, "grad_norm": 6.411009053864542e-20, "learning_rate": 1.797244434656975e-06, "logits/chosen": -1.0234758853912354, "logits/rejected": -1.2576205730438232, "logps/chosen": -2733.744384765625, "logps/rejected": -1061.349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.5823726654052734, "rewards/margins": 84.70896911621094, "rewards/rejected": -82.12660217285156, "step": 181 }, { "epoch": 1.8958333333333335, "grad_norm": 0.0, "learning_rate": 1.48610109831262e-06, "logits/chosen": -0.8497348427772522, "logits/rejected": -1.0572631359100342, "logps/chosen": -2718.9326171875, "logps/rejected": -961.6370849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.9914793968200684, "rewards/margins": 75.01031494140625, "rewards/rejected": -74.01884460449219, "step": 182 }, { "epoch": 1.90625, "grad_norm": 2.2827734383898875e-18, "learning_rate": 1.2043101671253554e-06, "logits/chosen": -0.9114844799041748, "logits/rejected": -1.1452364921569824, "logps/chosen": -1678.3172607421875, "logps/rejected": -1151.85400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.129800796508789, "rewards/margins": 74.070556640625, "rewards/rejected": -88.20036315917969, "step": 183 }, { "epoch": 1.9166666666666665, "grad_norm": 0.0, "learning_rate": 9.519556012436815e-07, "logits/chosen": -0.9810994267463684, "logits/rejected": -1.067641019821167, "logps/chosen": -2078.021728515625, "logps/rejected": -1072.85498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.242856979370117, "rewards/margins": 66.07984161376953, "rewards/rejected": -79.32270050048828, "step": 184 }, { "epoch": 1.9270833333333335, "grad_norm": 1.8167316177431924e-16, "learning_rate": 7.291125901946027e-07, "logits/chosen": -0.8321889042854309, "logits/rejected": -1.1509894132614136, "logps/chosen": -3819.90966796875, "logps/rejected": -1095.14306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.86672592163086, "rewards/margins": 71.09647369384766, "rewards/rejected": -81.96320343017578, "step": 185 }, { "epoch": 1.9375, "grad_norm": 1.0820013757477458e-17, "learning_rate": 5.358475304807375e-07, "logits/chosen": -1.0654444694519043, "logits/rejected": -1.2142765522003174, "logps/chosen": -7236.927734375, "logps/rejected": -949.28857421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.7587647438049316, "rewards/margins": 70.48192596435547, "rewards/rejected": -73.24069213867188, "step": 186 }, { "epoch": 1.9479166666666665, "grad_norm": 2.45197261230215e-17, "learning_rate": 3.7221800579735346e-07, "logits/chosen": -0.8213499188423157, "logits/rejected": -1.2857908010482788, "logps/chosen": -6349.48046875, "logps/rejected": -1107.80615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.317949295043945, "rewards/margins": 78.53043365478516, "rewards/rejected": -84.84838104248047, "step": 187 }, { "epoch": 1.9583333333333335, "grad_norm": 2.0503403694456911e-22, "learning_rate": 2.382727698752474e-07, "logits/chosen": -0.8970575332641602, "logits/rejected": -1.145188570022583, "logps/chosen": -2160.99365234375, "logps/rejected": -1063.6136474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.6992769241333, "rewards/margins": 72.37908935546875, "rewards/rejected": -81.07836151123047, "step": 188 }, { "epoch": 1.96875, "grad_norm": 1.2547314776381705e-16, "learning_rate": 1.340517319543877e-07, "logits/chosen": -0.8810423612594604, "logits/rejected": -1.099363088607788, "logps/chosen": -2089.138671875, "logps/rejected": -1003.574951171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.649038314819336, "rewards/margins": 64.49651336669922, "rewards/rejected": -76.14555358886719, "step": 189 }, { "epoch": 1.9791666666666665, "grad_norm": 0.0, "learning_rate": 5.958594489295921e-08, "logits/chosen": -0.9419525861740112, "logits/rejected": -1.2568933963775635, "logps/chosen": -3585.5712890625, "logps/rejected": -979.148193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.2342803478240967, "rewards/margins": 71.78946685791016, "rewards/rejected": -74.02375030517578, "step": 190 }, { "epoch": 1.9895833333333335, "grad_norm": 0.0, "learning_rate": 1.4897595915053242e-08, "logits/chosen": -0.9514296054840088, "logits/rejected": -1.1773942708969116, "logps/chosen": -3752.94091796875, "logps/rejected": -1072.39013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.917599678039551, "rewards/margins": 77.99908447265625, "rewards/rejected": -81.91668701171875, "step": 191 }, { "epoch": 2.0, "grad_norm": 5.172685426425421e-20, "learning_rate": 0.0, "logits/chosen": -0.8919022083282471, "logits/rejected": -1.0698691606521606, "logps/chosen": -1968.748046875, "logps/rejected": -921.0995483398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.440521240234375, "rewards/margins": 60.94007110595703, "rewards/rejected": -69.3805923461914, "step": 192 } ], "logging_steps": 1, "max_steps": 192, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 96, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }