{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9978094194961664, "eval_steps": 50000, "global_step": 1216, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008214676889375685, "grad_norm": 28.31659169550249, "learning_rate": 4.0983606557377046e-08, "logits/chosen": 27.184185028076172, "logits/rejected": 25.856258392333984, "logps/chosen": -244.33399963378906, "logps/rejected": -79.7464828491211, "loss": 0.6863, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.015291815623641014, "rewards/margins": 0.03226657956838608, "rewards/rejected": -0.016974765807390213, "sft_loss": 0.6384800672531128, "step": 5 }, { "epoch": 0.01642935377875137, "grad_norm": 23.954861556073595, "learning_rate": 8.196721311475409e-08, "logits/chosen": 26.661327362060547, "logits/rejected": 25.39139175415039, "logps/chosen": -207.91012573242188, "logps/rejected": -72.70105743408203, "loss": 0.6278, "rewards/accuracies": 0.786666750907898, "rewards/chosen": -0.03712935373187065, "rewards/margins": 0.1326407492160797, "rewards/rejected": -0.16977010667324066, "sft_loss": 0.6469722986221313, "step": 10 }, { "epoch": 0.024644030668127054, "grad_norm": 12.243904208611285, "learning_rate": 1.2295081967213113e-07, "logits/chosen": 27.410442352294922, "logits/rejected": 26.34275245666504, "logps/chosen": -211.9379119873047, "logps/rejected": -84.06497192382812, "loss": 0.4689, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -0.15980161726474762, "rewards/margins": 0.5716416239738464, "rewards/rejected": -0.7314431667327881, "sft_loss": 0.6353262662887573, "step": 15 }, { "epoch": 0.03285870755750274, "grad_norm": 9.287884891913714, "learning_rate": 1.6393442622950818e-07, "logits/chosen": 27.55664825439453, "logits/rejected": 26.192903518676758, "logps/chosen": -280.5326843261719, "logps/rejected": -112.344970703125, "loss": 0.3581, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -0.6804031729698181, "rewards/margins": 1.3235772848129272, "rewards/rejected": -2.003980875015259, "sft_loss": 0.7250083684921265, "step": 20 }, { "epoch": 0.04107338444687842, "grad_norm": 7.405267430642346, "learning_rate": 2.0491803278688524e-07, "logits/chosen": 26.403255462646484, "logits/rejected": 25.545373916625977, "logps/chosen": -249.10903930664062, "logps/rejected": -114.06135559082031, "loss": 0.3047, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -1.3252760171890259, "rewards/margins": 1.8669867515563965, "rewards/rejected": -3.1922624111175537, "sft_loss": 0.7239670157432556, "step": 25 }, { "epoch": 0.04928806133625411, "grad_norm": 4.80122081598555, "learning_rate": 2.4590163934426226e-07, "logits/chosen": 25.064653396606445, "logits/rejected": 24.138179779052734, "logps/chosen": -273.2266540527344, "logps/rejected": -129.16700744628906, "loss": 0.2507, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -1.8779884576797485, "rewards/margins": 2.8930463790893555, "rewards/rejected": -4.7710347175598145, "sft_loss": 0.7124671936035156, "step": 30 }, { "epoch": 0.05750273822562979, "grad_norm": 4.89445256463419, "learning_rate": 2.868852459016393e-07, "logits/chosen": 23.180265426635742, "logits/rejected": 22.45435905456543, "logps/chosen": -281.69757080078125, "logps/rejected": -149.479248046875, "loss": 0.2291, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -2.5409016609191895, "rewards/margins": 3.659693956375122, "rewards/rejected": -6.200596809387207, "sft_loss": 0.78533536195755, "step": 35 }, { "epoch": 0.06571741511500548, "grad_norm": 5.60278997158879, "learning_rate": 3.2786885245901637e-07, "logits/chosen": 21.637163162231445, "logits/rejected": 21.079978942871094, "logps/chosen": -259.0185546875, "logps/rejected": -145.42489624023438, "loss": 0.2061, "rewards/accuracies": 0.9333333373069763, "rewards/chosen": -2.746673583984375, "rewards/margins": 3.7938296794891357, "rewards/rejected": -6.54050350189209, "sft_loss": 0.7671460509300232, "step": 40 }, { "epoch": 0.07393209200438117, "grad_norm": 4.3242655896865045, "learning_rate": 3.6885245901639347e-07, "logits/chosen": 22.72602653503418, "logits/rejected": 21.949237823486328, "logps/chosen": -265.2069396972656, "logps/rejected": -151.65060424804688, "loss": 0.1803, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -3.016242265701294, "rewards/margins": 4.000359058380127, "rewards/rejected": -7.016600608825684, "sft_loss": 0.7528119683265686, "step": 45 }, { "epoch": 0.08214676889375684, "grad_norm": 4.333490981221177, "learning_rate": 4.0983606557377047e-07, "logits/chosen": 22.819128036499023, "logits/rejected": 21.5169620513916, "logps/chosen": -287.1695251464844, "logps/rejected": -152.8419647216797, "loss": 0.1609, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -3.535120725631714, "rewards/margins": 3.6887311935424805, "rewards/rejected": -7.223852157592773, "sft_loss": 0.7424061894416809, "step": 50 }, { "epoch": 0.09036144578313253, "grad_norm": 3.8031629734590995, "learning_rate": 4.508196721311475e-07, "logits/chosen": 22.994489669799805, "logits/rejected": 21.521961212158203, "logps/chosen": -276.05242919921875, "logps/rejected": -162.29678344726562, "loss": 0.1759, "rewards/accuracies": 0.9333333373069763, "rewards/chosen": -3.299473762512207, "rewards/margins": 4.333623886108398, "rewards/rejected": -7.6330976486206055, "sft_loss": 0.7973353266716003, "step": 55 }, { "epoch": 0.09857612267250822, "grad_norm": 2.9463104781172405, "learning_rate": 4.918032786885245e-07, "logits/chosen": 23.39748764038086, "logits/rejected": 22.529293060302734, "logps/chosen": -226.25714111328125, "logps/rejected": -133.46775817871094, "loss": 0.166, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -2.8591995239257812, "rewards/margins": 3.5714006423950195, "rewards/rejected": -6.430600166320801, "sft_loss": 0.7205591797828674, "step": 60 }, { "epoch": 0.10679079956188389, "grad_norm": 5.472434832813438, "learning_rate": 4.999852034151641e-07, "logits/chosen": 21.749300003051758, "logits/rejected": 21.059572219848633, "logps/chosen": -288.58892822265625, "logps/rejected": -163.43212890625, "loss": 0.1564, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -3.1936755180358887, "rewards/margins": 4.4503631591796875, "rewards/rejected": -7.644038200378418, "sft_loss": 0.8240499496459961, "step": 65 }, { "epoch": 0.11500547645125958, "grad_norm": 2.7329123191165547, "learning_rate": 4.999250952911133e-07, "logits/chosen": 23.737056732177734, "logits/rejected": 22.234569549560547, "logps/chosen": -282.01953125, "logps/rejected": -153.94252014160156, "loss": 0.1336, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -3.0082314014434814, "rewards/margins": 4.5703325271606445, "rewards/rejected": -7.5785627365112305, "sft_loss": 0.8242512345314026, "step": 70 }, { "epoch": 0.12322015334063527, "grad_norm": 48.59604948359565, "learning_rate": 4.998187619501184e-07, "logits/chosen": 23.999074935913086, "logits/rejected": 22.998958587646484, "logps/chosen": -320.11505126953125, "logps/rejected": -178.56219482421875, "loss": 0.1269, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -3.2927637100219727, "rewards/margins": 5.467880725860596, "rewards/rejected": -8.760644912719727, "sft_loss": 0.8822128772735596, "step": 75 }, { "epoch": 0.13143483023001096, "grad_norm": 34.18732885169035, "learning_rate": 4.996662230591989e-07, "logits/chosen": 21.339969635009766, "logits/rejected": 20.508026123046875, "logps/chosen": -297.17633056640625, "logps/rejected": -178.69137573242188, "loss": 0.1336, "rewards/accuracies": 0.9200000762939453, "rewards/chosen": -3.8860669136047363, "rewards/margins": 5.22913932800293, "rewards/rejected": -9.115203857421875, "sft_loss": 0.8318250179290771, "step": 80 }, { "epoch": 0.13964950711938665, "grad_norm": 3.158882002153525, "learning_rate": 4.994675068313813e-07, "logits/chosen": 21.070241928100586, "logits/rejected": 20.90212631225586, "logps/chosen": -286.97998046875, "logps/rejected": -186.49452209472656, "loss": 0.1122, "rewards/accuracies": 0.9200000762939453, "rewards/chosen": -4.223217010498047, "rewards/margins": 5.617285251617432, "rewards/rejected": -9.84050178527832, "sft_loss": 0.8361734747886658, "step": 85 }, { "epoch": 0.14786418400876233, "grad_norm": 3.2825430282007164, "learning_rate": 4.992226500204806e-07, "logits/chosen": 22.27889060974121, "logits/rejected": 21.50484275817871, "logps/chosen": -287.25439453125, "logps/rejected": -167.19528198242188, "loss": 0.1182, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -3.6078903675079346, "rewards/margins": 5.157074451446533, "rewards/rejected": -8.764965057373047, "sft_loss": 0.865871787071228, "step": 90 }, { "epoch": 0.156078860898138, "grad_norm": 4.571628146800866, "learning_rate": 4.989316979143029e-07, "logits/chosen": 21.98550796508789, "logits/rejected": 20.47240447998047, "logps/chosen": -276.35479736328125, "logps/rejected": -154.84384155273438, "loss": 0.1667, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -3.3969411849975586, "rewards/margins": 4.759787559509277, "rewards/rejected": -8.156728744506836, "sft_loss": 0.8606770038604736, "step": 95 }, { "epoch": 0.16429353778751368, "grad_norm": 5.767989035541414, "learning_rate": 4.985947043262686e-07, "logits/chosen": 20.512102127075195, "logits/rejected": 19.77199935913086, "logps/chosen": -282.6247863769531, "logps/rejected": -166.79330444335938, "loss": 0.1454, "rewards/accuracies": 0.9333333373069763, "rewards/chosen": -3.124844551086426, "rewards/margins": 5.334622383117676, "rewards/rejected": -8.459466934204102, "sft_loss": 0.8327052593231201, "step": 100 }, { "epoch": 0.17250821467688937, "grad_norm": 2.592654441763786, "learning_rate": 4.982117315854593e-07, "logits/chosen": 20.459001541137695, "logits/rejected": 20.174640655517578, "logps/chosen": -248.44091796875, "logps/rejected": -155.70086669921875, "loss": 0.1293, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -2.7995762825012207, "rewards/margins": 4.992871284484863, "rewards/rejected": -7.792448043823242, "sft_loss": 0.8232018351554871, "step": 105 }, { "epoch": 0.18072289156626506, "grad_norm": 3.061900848647029, "learning_rate": 4.977828505250903e-07, "logits/chosen": 20.415132522583008, "logits/rejected": 19.290271759033203, "logps/chosen": -247.19740295410156, "logps/rejected": -146.08677673339844, "loss": 0.1359, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -3.0541529655456543, "rewards/margins": 4.263410568237305, "rewards/rejected": -7.317563056945801, "sft_loss": 0.8086569309234619, "step": 110 }, { "epoch": 0.18893756845564075, "grad_norm": 3.8428826764124495, "learning_rate": 4.973081404694087e-07, "logits/chosen": 19.876977920532227, "logits/rejected": 19.5655574798584, "logps/chosen": -273.8357849121094, "logps/rejected": -173.55661010742188, "loss": 0.1152, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -3.349548578262329, "rewards/margins": 5.433572292327881, "rewards/rejected": -8.783121109008789, "sft_loss": 0.8695060610771179, "step": 115 }, { "epoch": 0.19715224534501644, "grad_norm": 3.5510462679770107, "learning_rate": 4.967876892190227e-07, "logits/chosen": 21.53512191772461, "logits/rejected": 19.82799530029297, "logps/chosen": -300.6681823730469, "logps/rejected": -162.72463989257812, "loss": 0.1214, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -3.324470043182373, "rewards/margins": 5.188758850097656, "rewards/rejected": -8.513228416442871, "sft_loss": 0.8628395199775696, "step": 120 }, { "epoch": 0.20536692223439212, "grad_norm": 2.7722703582939263, "learning_rate": 4.962215930346614e-07, "logits/chosen": 20.72852897644043, "logits/rejected": 19.19593048095703, "logps/chosen": -277.3075256347656, "logps/rejected": -168.2571258544922, "loss": 0.1007, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -3.5354294776916504, "rewards/margins": 5.148807048797607, "rewards/rejected": -8.684236526489258, "sft_loss": 0.8580695986747742, "step": 125 }, { "epoch": 0.21358159912376778, "grad_norm": 2.509695712976081, "learning_rate": 4.956099566193716e-07, "logits/chosen": 19.78179931640625, "logits/rejected": 18.386613845825195, "logps/chosen": -292.8381042480469, "logps/rejected": -179.89065551757812, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": -3.8737540245056152, "rewards/margins": 5.42431116104126, "rewards/rejected": -9.298065185546875, "sft_loss": 0.8839088678359985, "step": 130 }, { "epoch": 0.22179627601314347, "grad_norm": 3.4066444126067936, "learning_rate": 4.949528930991521e-07, "logits/chosen": 20.118431091308594, "logits/rejected": 18.660707473754883, "logps/chosen": -300.8720703125, "logps/rejected": -190.8058319091797, "loss": 0.1162, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.312110900878906, "rewards/margins": 6.067122936248779, "rewards/rejected": -10.379232406616211, "sft_loss": 0.8384607434272766, "step": 135 }, { "epoch": 0.23001095290251916, "grad_norm": 2.863527702169043, "learning_rate": 4.9425052400203e-07, "logits/chosen": 19.4266300201416, "logits/rejected": 18.600717544555664, "logps/chosen": -289.1473693847656, "logps/rejected": -183.9163360595703, "loss": 0.1138, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.603145122528076, "rewards/margins": 5.1031365394592285, "rewards/rejected": -9.706281661987305, "sft_loss": 0.8902355432510376, "step": 140 }, { "epoch": 0.23822562979189485, "grad_norm": 4.705307692419688, "learning_rate": 4.935029792355834e-07, "logits/chosen": 19.58378791809082, "logits/rejected": 18.723949432373047, "logps/chosen": -288.09527587890625, "logps/rejected": -178.56491088867188, "loss": 0.0997, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -3.634652853012085, "rewards/margins": 5.0857977867126465, "rewards/rejected": -8.720452308654785, "sft_loss": 0.8318749666213989, "step": 145 }, { "epoch": 0.24644030668127054, "grad_norm": 4.443077031203506, "learning_rate": 4.927103970629147e-07, "logits/chosen": 19.900896072387695, "logits/rejected": 18.748838424682617, "logps/chosen": -296.9352111816406, "logps/rejected": -197.29566955566406, "loss": 0.1249, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -5.586284160614014, "rewards/margins": 5.75706672668457, "rewards/rejected": -11.343351364135742, "sft_loss": 0.8322177529335022, "step": 150 }, { "epoch": 0.2546549835706462, "grad_norm": 3.0620385223375557, "learning_rate": 4.918729240770775e-07, "logits/chosen": 19.462993621826172, "logits/rejected": 19.144250869750977, "logps/chosen": -282.6866455078125, "logps/rejected": -189.68820190429688, "loss": 0.1166, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -5.4352288246154785, "rewards/margins": 6.065767288208008, "rewards/rejected": -11.500997543334961, "sft_loss": 0.9704034328460693, "step": 155 }, { "epoch": 0.2628696604600219, "grad_norm": 3.5256501257886717, "learning_rate": 4.909907151739633e-07, "logits/chosen": 20.455379486083984, "logits/rejected": 19.202545166015625, "logps/chosen": -305.9945373535156, "logps/rejected": -189.74273681640625, "loss": 0.1238, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -4.548050403594971, "rewards/margins": 6.132920265197754, "rewards/rejected": -10.680971145629883, "sft_loss": 0.8781507015228271, "step": 160 }, { "epoch": 0.2710843373493976, "grad_norm": 5.636178726208259, "learning_rate": 4.900639335236526e-07, "logits/chosen": 20.459400177001953, "logits/rejected": 19.524314880371094, "logps/chosen": -278.4658508300781, "logps/rejected": -170.0297088623047, "loss": 0.1276, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -3.6279122829437256, "rewards/margins": 5.202185153961182, "rewards/rejected": -8.830097198486328, "sft_loss": 0.8619104623794556, "step": 165 }, { "epoch": 0.2792990142387733, "grad_norm": 3.839926119567715, "learning_rate": 4.890927505402359e-07, "logits/chosen": 18.842321395874023, "logits/rejected": 18.251901626586914, "logps/chosen": -251.1257781982422, "logps/rejected": -161.70802307128906, "loss": 0.1045, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -3.4545812606811523, "rewards/margins": 5.017471790313721, "rewards/rejected": -8.472052574157715, "sft_loss": 0.7917510271072388, "step": 170 }, { "epoch": 0.28751369112814895, "grad_norm": 3.2492164364318836, "learning_rate": 4.880773458501089e-07, "logits/chosen": 21.866779327392578, "logits/rejected": 20.14942169189453, "logps/chosen": -265.2092590332031, "logps/rejected": -160.5437469482422, "loss": 0.0838, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -3.7316997051239014, "rewards/margins": 4.919445514678955, "rewards/rejected": -8.651144981384277, "sft_loss": 0.8359836339950562, "step": 175 }, { "epoch": 0.29572836801752467, "grad_norm": 2.829467767903507, "learning_rate": 4.870179072587498e-07, "logits/chosen": 19.415096282958984, "logits/rejected": 17.55588722229004, "logps/chosen": -264.83953857421875, "logps/rejected": -170.27407836914062, "loss": 0.1141, "rewards/accuracies": 0.9333333373069763, "rewards/chosen": -4.813459396362305, "rewards/margins": 5.231393337249756, "rewards/rejected": -10.044852256774902, "sft_loss": 0.9549927115440369, "step": 180 }, { "epoch": 0.30394304490690033, "grad_norm": 10.34631367942223, "learning_rate": 4.859146307159841e-07, "logits/chosen": 20.497596740722656, "logits/rejected": 18.766536712646484, "logps/chosen": -261.2963562011719, "logps/rejected": -182.26380920410156, "loss": 0.0915, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -5.357536792755127, "rewards/margins": 5.210677623748779, "rewards/rejected": -10.568212509155273, "sft_loss": 0.9084681868553162, "step": 185 }, { "epoch": 0.312157721796276, "grad_norm": 5.783975007709455, "learning_rate": 4.847677202797414e-07, "logits/chosen": 21.4503116607666, "logits/rejected": 20.343997955322266, "logps/chosen": -277.43756103515625, "logps/rejected": -190.31895446777344, "loss": 0.1261, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -5.206136226654053, "rewards/margins": 6.117234230041504, "rewards/rejected": -11.323369979858398, "sft_loss": 0.8104835748672485, "step": 190 }, { "epoch": 0.3203723986856517, "grad_norm": 5.842032329450428, "learning_rate": 4.835773880783144e-07, "logits/chosen": 18.739864349365234, "logits/rejected": 18.180030822753906, "logps/chosen": -279.97637939453125, "logps/rejected": -194.57496643066406, "loss": 0.0996, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.8305253982543945, "rewards/margins": 6.600159168243408, "rewards/rejected": -11.430684089660645, "sft_loss": 0.8295060396194458, "step": 195 }, { "epoch": 0.32858707557502737, "grad_norm": 3.2670595180505755, "learning_rate": 4.823438542711238e-07, "logits/chosen": 19.910261154174805, "logits/rejected": 19.292858123779297, "logps/chosen": -298.5541076660156, "logps/rejected": -194.70803833007812, "loss": 0.0875, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -4.582242488861084, "rewards/margins": 6.173951625823975, "rewards/rejected": -10.756195068359375, "sft_loss": 0.917156994342804, "step": 200 }, { "epoch": 0.3368017524644031, "grad_norm": 3.1103237361524463, "learning_rate": 4.81067347007999e-07, "logits/chosen": 21.32513999938965, "logits/rejected": 20.00907325744629, "logps/chosen": -282.4539489746094, "logps/rejected": -180.13723754882812, "loss": 0.1094, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -5.038515090942383, "rewards/margins": 5.208615303039551, "rewards/rejected": -10.247130393981934, "sft_loss": 0.9259530305862427, "step": 205 }, { "epoch": 0.34501642935377874, "grad_norm": 2.739223034780536, "learning_rate": 4.797481023869801e-07, "logits/chosen": 20.315624237060547, "logits/rejected": 19.15978240966797, "logps/chosen": -251.40025329589844, "logps/rejected": -175.2682647705078, "loss": 0.0868, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -4.576803684234619, "rewards/margins": 5.497461795806885, "rewards/rejected": -10.074265480041504, "sft_loss": 0.9133593440055847, "step": 210 }, { "epoch": 0.35323110624315446, "grad_norm": 3.832286663529339, "learning_rate": 4.783863644106502e-07, "logits/chosen": 20.822975158691406, "logits/rejected": 19.61457633972168, "logps/chosen": -272.2485656738281, "logps/rejected": -176.64450073242188, "loss": 0.084, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -4.567282676696777, "rewards/margins": 5.274582862854004, "rewards/rejected": -9.841866493225098, "sft_loss": 0.9593473672866821, "step": 215 }, { "epoch": 0.3614457831325301, "grad_norm": 2.3582346070622857, "learning_rate": 4.769823849410053e-07, "logits/chosen": 18.23250961303711, "logits/rejected": 17.80348777770996, "logps/chosen": -308.80230712890625, "logps/rejected": -218.04078674316406, "loss": 0.0717, "rewards/accuracies": 1.0, "rewards/chosen": -5.822837829589844, "rewards/margins": 7.0780110359191895, "rewards/rejected": -12.900848388671875, "sft_loss": 0.9446174502372742, "step": 220 }, { "epoch": 0.3696604600219058, "grad_norm": 2.4553351337767424, "learning_rate": 4.7553642365287127e-07, "logits/chosen": 18.841230392456055, "logits/rejected": 18.06731414794922, "logps/chosen": -290.5154724121094, "logps/rejected": -198.66468811035156, "loss": 0.1065, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -5.4337263107299805, "rewards/margins": 6.076140880584717, "rewards/rejected": -11.509869575500488, "sft_loss": 0.9967135787010193, "step": 225 }, { "epoch": 0.3778751369112815, "grad_norm": 5.091180856379443, "learning_rate": 4.7404874798587493e-07, "logits/chosen": 20.33222770690918, "logits/rejected": 19.57872772216797, "logps/chosen": -296.5139465332031, "logps/rejected": -187.86419677734375, "loss": 0.0947, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -4.502869129180908, "rewards/margins": 6.029680252075195, "rewards/rejected": -10.532547950744629, "sft_loss": 0.8854550719261169, "step": 230 }, { "epoch": 0.38608981380065716, "grad_norm": 3.7252781041155845, "learning_rate": 4.7251963309497965e-07, "logits/chosen": 19.174985885620117, "logits/rejected": 18.191041946411133, "logps/chosen": -294.4977111816406, "logps/rejected": -195.45391845703125, "loss": 0.1189, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -4.810143947601318, "rewards/margins": 5.988135814666748, "rewards/rejected": -10.79827880859375, "sft_loss": 0.9932563304901123, "step": 235 }, { "epoch": 0.39430449069003287, "grad_norm": 2.8445489541878457, "learning_rate": 4.709493617995938e-07, "logits/chosen": 19.772836685180664, "logits/rejected": 18.601234436035156, "logps/chosen": -281.09808349609375, "logps/rejected": -186.28834533691406, "loss": 0.0903, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.58309268951416, "rewards/margins": 6.093752384185791, "rewards/rejected": -10.67684555053711, "sft_loss": 0.8651002049446106, "step": 240 }, { "epoch": 0.40251916757940853, "grad_norm": 3.5938197609476856, "learning_rate": 4.6933822453126114e-07, "logits/chosen": 18.83224105834961, "logits/rejected": 18.597658157348633, "logps/chosen": -230.34445190429688, "logps/rejected": -173.43992614746094, "loss": 0.1183, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -4.852273464202881, "rewards/margins": 5.561976432800293, "rewards/rejected": -10.414249420166016, "sft_loss": 0.9817420840263367, "step": 245 }, { "epoch": 0.41073384446878425, "grad_norm": 5.781394523018812, "learning_rate": 4.676865192799443e-07, "logits/chosen": 20.996742248535156, "logits/rejected": 20.808259963989258, "logps/chosen": -305.62493896484375, "logps/rejected": -224.93048095703125, "loss": 0.0704, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -6.482853889465332, "rewards/margins": 6.840579509735107, "rewards/rejected": -13.323433876037598, "sft_loss": 0.9305270910263062, "step": 250 }, { "epoch": 0.4189485213581599, "grad_norm": 2.956667548991593, "learning_rate": 4.65994551538909e-07, "logits/chosen": 20.87143898010254, "logits/rejected": 19.100732803344727, "logps/chosen": -339.39825439453125, "logps/rejected": -216.97386169433594, "loss": 0.1042, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -6.5454792976379395, "rewards/margins": 6.870533466339111, "rewards/rejected": -13.416014671325684, "sft_loss": 1.041126012802124, "step": 255 }, { "epoch": 0.42716319824753557, "grad_norm": 4.392320655716504, "learning_rate": 4.642626342482215e-07, "logits/chosen": 19.316911697387695, "logits/rejected": 18.243391036987305, "logps/chosen": -256.42987060546875, "logps/rejected": -179.37509155273438, "loss": 0.1123, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -5.184771537780762, "rewards/margins": 5.357823848724365, "rewards/rejected": -10.542596817016602, "sft_loss": 0.9005042314529419, "step": 260 }, { "epoch": 0.4353778751369113, "grad_norm": 3.4877310264508186, "learning_rate": 4.624910877368684e-07, "logits/chosen": 18.9818115234375, "logits/rejected": 18.550493240356445, "logps/chosen": -290.06829833984375, "logps/rejected": -182.69097900390625, "loss": 0.0943, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.028794765472412, "rewards/margins": 5.486021041870117, "rewards/rejected": -9.514815330505371, "sft_loss": 0.8677726984024048, "step": 265 }, { "epoch": 0.44359255202628695, "grad_norm": 3.8262254077954485, "learning_rate": 4.606802396635098e-07, "logits/chosen": 19.717529296875, "logits/rejected": 19.07860565185547, "logps/chosen": -295.7071838378906, "logps/rejected": -187.46609497070312, "loss": 0.0909, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -4.471216678619385, "rewards/margins": 5.717488765716553, "rewards/rejected": -10.188706398010254, "sft_loss": 0.8504019379615784, "step": 270 }, { "epoch": 0.45180722891566266, "grad_norm": 5.060345147215167, "learning_rate": 4.588304249558763e-07, "logits/chosen": 18.937856674194336, "logits/rejected": 17.51219940185547, "logps/chosen": -296.1573486328125, "logps/rejected": -193.85498046875, "loss": 0.0999, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -5.353262424468994, "rewards/margins": 5.619455814361572, "rewards/rejected": -10.97271728515625, "sft_loss": 0.9665505886077881, "step": 275 }, { "epoch": 0.4600219058050383, "grad_norm": 4.00534202244361, "learning_rate": 4.569419857488228e-07, "logits/chosen": 19.323719024658203, "logits/rejected": 18.016199111938477, "logps/chosen": -321.9727783203125, "logps/rejected": -199.45265197753906, "loss": 0.1019, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -5.450780868530273, "rewards/margins": 6.008893013000488, "rewards/rejected": -11.459673881530762, "sft_loss": 0.99959397315979, "step": 280 }, { "epoch": 0.46823658269441404, "grad_norm": 3.285371678607192, "learning_rate": 4.550152713210478e-07, "logits/chosen": 20.368547439575195, "logits/rejected": 18.82790756225586, "logps/chosen": -274.7012634277344, "logps/rejected": -187.615966796875, "loss": 0.0864, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -5.436644554138184, "rewards/margins": 5.6056671142578125, "rewards/rejected": -11.042311668395996, "sft_loss": 0.9444936513900757, "step": 285 }, { "epoch": 0.4764512595837897, "grad_norm": 4.461231980649141, "learning_rate": 4.530506380304925e-07, "logits/chosen": 19.512760162353516, "logits/rejected": 17.607872009277344, "logps/chosen": -353.92181396484375, "logps/rejected": -215.2699432373047, "loss": 0.0889, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -5.762898921966553, "rewards/margins": 6.650979042053223, "rewards/rejected": -12.413877487182617, "sft_loss": 1.0174636840820312, "step": 290 }, { "epoch": 0.4846659364731654, "grad_norm": 6.886972650518142, "learning_rate": 4.510484492484301e-07, "logits/chosen": 18.66490936279297, "logits/rejected": 19.016637802124023, "logps/chosen": -309.6719055175781, "logps/rejected": -233.22732543945312, "loss": 0.091, "rewards/accuracies": 1.0, "rewards/chosen": -6.444864749908447, "rewards/margins": 7.558566093444824, "rewards/rejected": -14.003432273864746, "sft_loss": 0.9799606800079346, "step": 295 }, { "epoch": 0.4928806133625411, "grad_norm": 7.0952145291800806, "learning_rate": 4.4900907529225797e-07, "logits/chosen": 18.357818603515625, "logits/rejected": 17.093488693237305, "logps/chosen": -321.1383972167969, "logps/rejected": -212.69412231445312, "loss": 0.1166, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -6.2735819816589355, "rewards/margins": 6.8673529624938965, "rewards/rejected": -13.140933990478516, "sft_loss": 0.9731053709983826, "step": 300 }, { "epoch": 0.5010952902519168, "grad_norm": 3.035306319401663, "learning_rate": 4.46932893357005e-07, "logits/chosen": 19.986753463745117, "logits/rejected": 19.238985061645508, "logps/chosen": -317.82354736328125, "logps/rejected": -216.91896057128906, "loss": 0.0689, "rewards/accuracies": 1.0, "rewards/chosen": -5.982201099395752, "rewards/margins": 7.021622657775879, "rewards/rejected": -13.003824234008789, "sft_loss": 0.9540520310401917, "step": 305 }, { "epoch": 0.5093099671412924, "grad_norm": 2.8463069418400826, "learning_rate": 4.448202874455672e-07, "logits/chosen": 18.925642013549805, "logits/rejected": 18.447513580322266, "logps/chosen": -310.6861267089844, "logps/rejected": -205.47084045410156, "loss": 0.1225, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -5.411799430847168, "rewards/margins": 6.359774112701416, "rewards/rejected": -11.771574020385742, "sft_loss": 1.0545023679733276, "step": 310 }, { "epoch": 0.5175246440306681, "grad_norm": 3.9649140060367407, "learning_rate": 4.426716482976838e-07, "logits/chosen": 19.93076515197754, "logits/rejected": 19.637243270874023, "logps/chosen": -303.6711730957031, "logps/rejected": -199.40115356445312, "loss": 0.0913, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -4.821557521820068, "rewards/margins": 6.5209574699401855, "rewards/rejected": -11.342514991760254, "sft_loss": 0.9295309782028198, "step": 315 }, { "epoch": 0.5257393209200438, "grad_norm": 3.5060241985803304, "learning_rate": 4.4048737331766774e-07, "logits/chosen": 21.6712646484375, "logits/rejected": 19.89933204650879, "logps/chosen": -271.3023681640625, "logps/rejected": -179.4087371826172, "loss": 0.1113, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -4.785543918609619, "rewards/margins": 5.356179714202881, "rewards/rejected": -10.1417236328125, "sft_loss": 0.8473352193832397, "step": 320 }, { "epoch": 0.5339539978094195, "grad_norm": 9.641524189799103, "learning_rate": 4.3826786650090273e-07, "logits/chosen": 17.178016662597656, "logits/rejected": 17.272985458374023, "logps/chosen": -279.9751892089844, "logps/rejected": -187.63014221191406, "loss": 0.0909, "rewards/accuracies": 0.9333333373069763, "rewards/chosen": -5.042219161987305, "rewards/margins": 5.971033096313477, "rewards/rejected": -11.013254165649414, "sft_loss": 0.9807875156402588, "step": 325 }, { "epoch": 0.5421686746987951, "grad_norm": 4.069079011849237, "learning_rate": 4.3601353835912235e-07, "logits/chosen": 18.978944778442383, "logits/rejected": 18.711896896362305, "logps/chosen": -257.9627990722656, "logps/rejected": -189.10731506347656, "loss": 0.1393, "rewards/accuracies": 0.9066666960716248, "rewards/chosen": -5.991618633270264, "rewards/margins": 5.532088279724121, "rewards/rejected": -11.523706436157227, "sft_loss": 0.9458999633789062, "step": 330 }, { "epoch": 0.5503833515881709, "grad_norm": 11.459818241023113, "learning_rate": 4.337248058444831e-07, "logits/chosen": 18.476844787597656, "logits/rejected": 16.893495559692383, "logps/chosen": -352.3243408203125, "logps/rejected": -229.020751953125, "loss": 0.0884, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -6.682969093322754, "rewards/margins": 6.964285373687744, "rewards/rejected": -13.64725399017334, "sft_loss": 1.09258234500885, "step": 335 }, { "epoch": 0.5585980284775466, "grad_norm": 4.451183947920293, "learning_rate": 4.3140209227244617e-07, "logits/chosen": 19.733028411865234, "logits/rejected": 19.02443504333496, "logps/chosen": -255.39503479003906, "logps/rejected": -174.6426544189453, "loss": 0.1089, "rewards/accuracies": 0.9066667556762695, "rewards/chosen": -4.919735908508301, "rewards/margins": 5.3243584632873535, "rewards/rejected": -10.24409294128418, "sft_loss": 0.974420428276062, "step": 340 }, { "epoch": 0.5668127053669222, "grad_norm": 4.806631914628035, "learning_rate": 4.2904582724348316e-07, "logits/chosen": 18.731412887573242, "logits/rejected": 17.712743759155273, "logps/chosen": -292.1322937011719, "logps/rejected": -186.6304473876953, "loss": 0.093, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -4.8624372482299805, "rewards/margins": 5.988229751586914, "rewards/rejected": -10.850666999816895, "sft_loss": 1.1051782369613647, "step": 345 }, { "epoch": 0.5750273822562979, "grad_norm": 9.642655055260906, "learning_rate": 4.266564465636182e-07, "logits/chosen": 20.78282928466797, "logits/rejected": 19.54219627380371, "logps/chosen": -336.76849365234375, "logps/rejected": -231.12326049804688, "loss": 0.0859, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -6.264214515686035, "rewards/margins": 7.32451868057251, "rewards/rejected": -13.58873462677002, "sft_loss": 0.9469544291496277, "step": 350 }, { "epoch": 0.5832420591456736, "grad_norm": 2.9057809929137535, "learning_rate": 4.242343921638234e-07, "logits/chosen": 20.12310028076172, "logits/rejected": 18.395959854125977, "logps/chosen": -332.9981994628906, "logps/rejected": -202.02816772460938, "loss": 0.0601, "rewards/accuracies": 1.0, "rewards/chosen": -5.237385272979736, "rewards/margins": 6.5451836585998535, "rewards/rejected": -11.782567977905273, "sft_loss": 1.00226628780365, "step": 355 }, { "epoch": 0.5914567360350493, "grad_norm": 3.5441590536933605, "learning_rate": 4.2178011201828044e-07, "logits/chosen": 19.193899154663086, "logits/rejected": 17.946836471557617, "logps/chosen": -299.01483154296875, "logps/rejected": -192.25360107421875, "loss": 0.0879, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -5.201097011566162, "rewards/margins": 6.014251708984375, "rewards/rejected": -11.215349197387695, "sft_loss": 0.9916761517524719, "step": 360 }, { "epoch": 0.5996714129244249, "grad_norm": 15.329178085855654, "learning_rate": 4.1929406006152546e-07, "logits/chosen": 19.373323440551758, "logits/rejected": 19.08150291442871, "logps/chosen": -284.7733154296875, "logps/rejected": -214.26292419433594, "loss": 0.104, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -6.026785373687744, "rewards/margins": 7.561707496643066, "rewards/rejected": -13.588491439819336, "sft_loss": 1.0006964206695557, "step": 365 }, { "epoch": 0.6078860898138007, "grad_norm": 3.928331003690106, "learning_rate": 4.167766961044906e-07, "logits/chosen": 20.291215896606445, "logits/rejected": 18.595857620239258, "logps/chosen": -307.8617858886719, "logps/rejected": -212.20635986328125, "loss": 0.0842, "rewards/accuracies": 1.0, "rewards/chosen": -5.978768348693848, "rewards/margins": 6.925237655639648, "rewards/rejected": -12.904006004333496, "sft_loss": 0.898669958114624, "step": 370 }, { "epoch": 0.6161007667031764, "grad_norm": 7.9374465786346295, "learning_rate": 4.1422848574945923e-07, "logits/chosen": 19.617891311645508, "logits/rejected": 18.786251068115234, "logps/chosen": -309.9544982910156, "logps/rejected": -208.55972290039062, "loss": 0.086, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -5.407596111297607, "rewards/margins": 7.109365940093994, "rewards/rejected": -12.516963005065918, "sft_loss": 0.9856653809547424, "step": 375 }, { "epoch": 0.624315443592552, "grad_norm": 21.17776800888567, "learning_rate": 4.1164990030394985e-07, "logits/chosen": 19.750553131103516, "logits/rejected": 18.34299659729004, "logps/chosen": -315.951171875, "logps/rejected": -218.6781768798828, "loss": 0.1158, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -6.80844783782959, "rewards/margins": 6.5122971534729, "rewards/rejected": -13.320744514465332, "sft_loss": 0.9718233942985535, "step": 380 }, { "epoch": 0.6325301204819277, "grad_norm": 4.237566233767258, "learning_rate": 4.09041416693545e-07, "logits/chosen": 19.702919006347656, "logits/rejected": 18.19818115234375, "logps/chosen": -315.5003662109375, "logps/rejected": -213.6582794189453, "loss": 0.1037, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -6.986787796020508, "rewards/margins": 6.549233913421631, "rewards/rejected": -13.536023139953613, "sft_loss": 1.0769668817520142, "step": 385 }, { "epoch": 0.6407447973713034, "grad_norm": 8.046716778811138, "learning_rate": 4.064035173736804e-07, "logits/chosen": 16.724149703979492, "logits/rejected": 16.12737464904785, "logps/chosen": -316.17547607421875, "logps/rejected": -242.3610076904297, "loss": 0.098, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -7.848334312438965, "rewards/margins": 8.00571060180664, "rewards/rejected": -15.854043960571289, "sft_loss": 1.207060694694519, "step": 390 }, { "epoch": 0.6489594742606791, "grad_norm": 2.9537654489100564, "learning_rate": 4.0373669024041225e-07, "logits/chosen": 21.014055252075195, "logits/rejected": 19.712244033813477, "logps/chosen": -325.1595153808594, "logps/rejected": -232.09849548339844, "loss": 0.076, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.954678058624268, "rewards/margins": 7.2714033126831055, "rewards/rejected": -15.226082801818848, "sft_loss": 1.0326578617095947, "step": 395 }, { "epoch": 0.6571741511500547, "grad_norm": 7.381518685772535, "learning_rate": 4.010414285401776e-07, "logits/chosen": 20.833526611328125, "logits/rejected": 20.059972763061523, "logps/chosen": -301.8648681640625, "logps/rejected": -219.5161895751953, "loss": 0.0779, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.38788366317749, "rewards/margins": 7.111426830291748, "rewards/rejected": -14.499311447143555, "sft_loss": 1.039659023284912, "step": 400 }, { "epoch": 0.6653888280394304, "grad_norm": 4.428141508810293, "learning_rate": 3.9831823077856565e-07, "logits/chosen": 18.648929595947266, "logits/rejected": 18.11013412475586, "logps/chosen": -301.7897033691406, "logps/rejected": -215.20339965820312, "loss": 0.1026, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -6.600786209106445, "rewards/margins": 6.725845813751221, "rewards/rejected": -13.326631546020508, "sft_loss": 1.1041791439056396, "step": 405 }, { "epoch": 0.6736035049288062, "grad_norm": 3.720237574886871, "learning_rate": 3.95567600628115e-07, "logits/chosen": 18.711490631103516, "logits/rejected": 17.353120803833008, "logps/chosen": -288.9316711425781, "logps/rejected": -199.5088348388672, "loss": 0.0833, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -5.722890377044678, "rewards/margins": 6.0634002685546875, "rewards/rejected": -11.786290168762207, "sft_loss": 0.9382212162017822, "step": 410 }, { "epoch": 0.6818181818181818, "grad_norm": 3.5502904501789456, "learning_rate": 3.9279004683515783e-07, "logits/chosen": 19.41602897644043, "logits/rejected": 18.82366180419922, "logps/chosen": -300.6460876464844, "logps/rejected": -198.58851623535156, "loss": 0.0921, "rewards/accuracies": 1.0, "rewards/chosen": -5.369536876678467, "rewards/margins": 6.260385036468506, "rewards/rejected": -11.629922866821289, "sft_loss": 0.947778046131134, "step": 415 }, { "epoch": 0.6900328587075575, "grad_norm": 3.775437774667387, "learning_rate": 3.8998608312572234e-07, "logits/chosen": 20.035261154174805, "logits/rejected": 18.32322883605957, "logps/chosen": -321.1794738769531, "logps/rejected": -206.26197814941406, "loss": 0.0666, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -6.206702709197998, "rewards/margins": 6.016441822052002, "rewards/rejected": -12.223145484924316, "sft_loss": 0.8840410709381104, "step": 420 }, { "epoch": 0.6982475355969332, "grad_norm": 4.6471153709105355, "learning_rate": 3.8715622811051753e-07, "logits/chosen": 20.395259857177734, "logits/rejected": 18.99897003173828, "logps/chosen": -340.93096923828125, "logps/rejected": -233.05569458007812, "loss": 0.0824, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.279754638671875, "rewards/margins": 6.952295303344727, "rewards/rejected": -14.232048988342285, "sft_loss": 0.9561217427253723, "step": 425 }, { "epoch": 0.7064622124863089, "grad_norm": 5.222781909367126, "learning_rate": 3.843010051890114e-07, "logits/chosen": 17.722442626953125, "logits/rejected": 17.12670135498047, "logps/chosen": -305.9290466308594, "logps/rejected": -227.79209899902344, "loss": 0.0968, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -7.020711898803711, "rewards/margins": 7.3669562339782715, "rewards/rejected": -14.38766860961914, "sft_loss": 1.0332236289978027, "step": 430 }, { "epoch": 0.7146768893756845, "grad_norm": 8.793537356204498, "learning_rate": 3.8142094245262615e-07, "logits/chosen": 18.769723892211914, "logits/rejected": 17.301259994506836, "logps/chosen": -281.7428283691406, "logps/rejected": -198.91912841796875, "loss": 0.0907, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -6.324660301208496, "rewards/margins": 6.169273376464844, "rewards/rejected": -12.493931770324707, "sft_loss": 1.6358309984207153, "step": 435 }, { "epoch": 0.7228915662650602, "grad_norm": 3.225698996369955, "learning_rate": 3.785165725870637e-07, "logits/chosen": 18.968225479125977, "logits/rejected": 18.159814834594727, "logps/chosen": -303.6697082519531, "logps/rejected": -215.95223999023438, "loss": 0.0896, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -5.786445140838623, "rewards/margins": 6.777508735656738, "rewards/rejected": -12.563952445983887, "sft_loss": 0.9366723299026489, "step": 440 }, { "epoch": 0.731106243154436, "grad_norm": 5.69116038376307, "learning_rate": 3.7558843277378203e-07, "logits/chosen": 19.500343322753906, "logits/rejected": 18.194765090942383, "logps/chosen": -280.2967834472656, "logps/rejected": -190.11224365234375, "loss": 0.089, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -5.179433345794678, "rewards/margins": 6.132752418518066, "rewards/rejected": -11.31218433380127, "sft_loss": 0.9057294130325317, "step": 445 }, { "epoch": 0.7393209200438116, "grad_norm": 2.910912446985568, "learning_rate": 3.726370645906407e-07, "logits/chosen": 18.821012496948242, "logits/rejected": 17.809709548950195, "logps/chosen": -301.2443542480469, "logps/rejected": -197.73570251464844, "loss": 0.072, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -5.806431293487549, "rewards/margins": 6.135568141937256, "rewards/rejected": -11.942000389099121, "sft_loss": 1.1080551147460938, "step": 450 }, { "epoch": 0.7475355969331873, "grad_norm": 3.4976563659123787, "learning_rate": 3.6966301391173204e-07, "logits/chosen": 18.104902267456055, "logits/rejected": 19.11966896057129, "logps/chosen": -283.95404052734375, "logps/rejected": -216.9072265625, "loss": 0.0961, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -6.108445167541504, "rewards/margins": 7.591168403625488, "rewards/rejected": -13.699614524841309, "sft_loss": 1.0220645666122437, "step": 455 }, { "epoch": 0.755750273822563, "grad_norm": 3.376381123051291, "learning_rate": 3.6666683080641843e-07, "logits/chosen": 17.50767707824707, "logits/rejected": 16.899118423461914, "logps/chosen": -325.40252685546875, "logps/rejected": -233.790283203125, "loss": 0.0663, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.453089237213135, "rewards/margins": 7.27883243560791, "rewards/rejected": -14.73192310333252, "sft_loss": 1.0263330936431885, "step": 460 }, { "epoch": 0.7639649507119387, "grad_norm": 4.787229066445367, "learning_rate": 3.636490694375937e-07, "logits/chosen": 19.532527923583984, "logits/rejected": 18.083965301513672, "logps/chosen": -335.1581115722656, "logps/rejected": -236.63861083984375, "loss": 0.0698, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.509254455566406, "rewards/margins": 8.015713691711426, "rewards/rejected": -15.5249662399292, "sft_loss": 1.013353943824768, "step": 465 }, { "epoch": 0.7721796276013143, "grad_norm": 1.6525330438642514, "learning_rate": 3.6061028795918734e-07, "logits/chosen": 19.80712127685547, "logits/rejected": 18.532358169555664, "logps/chosen": -338.9714660644531, "logps/rejected": -240.235595703125, "loss": 0.0643, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.31692886352539, "rewards/margins": 7.37734317779541, "rewards/rejected": -15.694271087646484, "sft_loss": 1.046373963356018, "step": 470 }, { "epoch": 0.78039430449069, "grad_norm": 4.380627729663156, "learning_rate": 3.5755104841292974e-07, "logits/chosen": 18.083696365356445, "logits/rejected": 17.68763542175293, "logps/chosen": -296.51568603515625, "logps/rejected": -220.79627990722656, "loss": 0.0753, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.48270845413208, "rewards/margins": 7.01304817199707, "rewards/rejected": -14.495758056640625, "sft_loss": 1.1213568449020386, "step": 475 }, { "epoch": 0.7886089813800657, "grad_norm": 3.5838233904248202, "learning_rate": 3.544719166243998e-07, "logits/chosen": 18.444488525390625, "logits/rejected": 17.91800880432129, "logps/chosen": -323.7398681640625, "logps/rejected": -239.03846740722656, "loss": 0.0675, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.762217998504639, "rewards/margins": 7.673830032348633, "rewards/rejected": -15.43604850769043, "sft_loss": 1.0297211408615112, "step": 480 }, { "epoch": 0.7968236582694413, "grad_norm": 5.338329799579653, "learning_rate": 3.513734620983716e-07, "logits/chosen": 18.91893768310547, "logits/rejected": 17.848176956176758, "logps/chosen": -331.2665100097656, "logps/rejected": -252.34434509277344, "loss": 0.0585, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.252163887023926, "rewards/margins": 8.53612232208252, "rewards/rejected": -16.788286209106445, "sft_loss": 0.9914504289627075, "step": 485 }, { "epoch": 0.8050383351588171, "grad_norm": 10.778725390322887, "learning_rate": 3.482562579134809e-07, "logits/chosen": 17.087730407714844, "logits/rejected": 16.604755401611328, "logps/chosen": -269.3510437011719, "logps/rejected": -222.06834411621094, "loss": 0.0883, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -8.483560562133789, "rewards/margins": 6.750950813293457, "rewards/rejected": -15.23451042175293, "sft_loss": 1.1086839437484741, "step": 490 }, { "epoch": 0.8132530120481928, "grad_norm": 3.589498412964668, "learning_rate": 3.4512088061623073e-07, "logits/chosen": 20.214731216430664, "logits/rejected": 18.510662078857422, "logps/chosen": -364.5818176269531, "logps/rejected": -248.37899780273438, "loss": 0.0795, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -8.396151542663574, "rewards/margins": 7.525665283203125, "rewards/rejected": -15.9218168258667, "sft_loss": 1.0693514347076416, "step": 495 }, { "epoch": 0.8214676889375685, "grad_norm": 3.668001282748929, "learning_rate": 3.419679101143555e-07, "logits/chosen": 19.246572494506836, "logits/rejected": 18.104793548583984, "logps/chosen": -286.7825012207031, "logps/rejected": -219.62115478515625, "loss": 0.0646, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.4586005210876465, "rewards/margins": 7.050439357757568, "rewards/rejected": -14.509037971496582, "sft_loss": 1.0848183631896973, "step": 500 }, { "epoch": 0.8296823658269441, "grad_norm": 4.09686690298713, "learning_rate": 3.387979295695632e-07, "logits/chosen": 19.468666076660156, "logits/rejected": 17.898151397705078, "logps/chosen": -306.32623291015625, "logps/rejected": -223.23086547851562, "loss": 0.0854, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.60283899307251, "rewards/margins": 6.977162837982178, "rewards/rejected": -14.580002784729004, "sft_loss": 1.017836093902588, "step": 505 }, { "epoch": 0.8378970427163198, "grad_norm": 5.03743167428466, "learning_rate": 3.356115252896764e-07, "logits/chosen": 18.379446029663086, "logits/rejected": 17.511045455932617, "logps/chosen": -338.8892517089844, "logps/rejected": -236.15655517578125, "loss": 0.084, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -7.8325958251953125, "rewards/margins": 7.418299674987793, "rewards/rejected": -15.250896453857422, "sft_loss": 1.1587491035461426, "step": 510 }, { "epoch": 0.8461117196056955, "grad_norm": 1.3483241705386844, "learning_rate": 3.3240928662019043e-07, "logits/chosen": 17.142412185668945, "logits/rejected": 16.88658905029297, "logps/chosen": -323.4809265136719, "logps/rejected": -228.69459533691406, "loss": 0.076, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -7.122060298919678, "rewards/margins": 7.332087516784668, "rewards/rejected": -14.454146385192871, "sft_loss": 1.033144235610962, "step": 515 }, { "epoch": 0.8543263964950711, "grad_norm": 5.6768934408505665, "learning_rate": 3.291918058352706e-07, "logits/chosen": 18.301881790161133, "logits/rejected": 17.546480178833008, "logps/chosen": -286.16534423828125, "logps/rejected": -223.24356079101562, "loss": 0.1012, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.11279010772705, "rewards/margins": 6.034675121307373, "rewards/rejected": -14.147465705871582, "sft_loss": 1.0691564083099365, "step": 520 }, { "epoch": 0.8625410733844469, "grad_norm": 2.664959243016448, "learning_rate": 3.259596780282074e-07, "logits/chosen": 19.877466201782227, "logits/rejected": 18.95973014831543, "logps/chosen": -365.1341552734375, "logps/rejected": -256.1839294433594, "loss": 0.0634, "rewards/accuracies": 1.0, "rewards/chosen": -8.447802543640137, "rewards/margins": 8.053640365600586, "rewards/rejected": -16.501441955566406, "sft_loss": 1.2106726169586182, "step": 525 }, { "epoch": 0.8707557502738226, "grad_norm": 13.037778283402458, "learning_rate": 3.2271350100134975e-07, "logits/chosen": 19.298856735229492, "logits/rejected": 17.636470794677734, "logps/chosen": -322.1096496582031, "logps/rejected": -239.7886505126953, "loss": 0.0671, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.631354331970215, "rewards/margins": 7.214845657348633, "rewards/rejected": -15.846202850341797, "sft_loss": 1.1396478414535522, "step": 530 }, { "epoch": 0.8789704271631983, "grad_norm": 3.29678671312156, "learning_rate": 3.1945387515553843e-07, "logits/chosen": 20.68004608154297, "logits/rejected": 18.87333106994629, "logps/chosen": -342.8370056152344, "logps/rejected": -231.9772491455078, "loss": 0.0816, "rewards/accuracies": 1.0, "rewards/chosen": -7.539201259613037, "rewards/margins": 7.375776290893555, "rewards/rejected": -14.914976119995117, "sft_loss": 1.073767066001892, "step": 535 }, { "epoch": 0.8871851040525739, "grad_norm": 5.524616451485275, "learning_rate": 3.1618140337905764e-07, "logits/chosen": 19.65703582763672, "logits/rejected": 18.759868621826172, "logps/chosen": -293.4176940917969, "logps/rejected": -222.76730346679688, "loss": 0.0785, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.326995372772217, "rewards/margins": 6.951819896697998, "rewards/rejected": -14.278814315795898, "sft_loss": 1.1393436193466187, "step": 540 }, { "epoch": 0.8953997809419496, "grad_norm": 3.710469983456331, "learning_rate": 3.128966909361271e-07, "logits/chosen": 19.455686569213867, "logits/rejected": 18.928129196166992, "logps/chosen": -346.8277893066406, "logps/rejected": -249.02398681640625, "loss": 0.0577, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.179071426391602, "rewards/margins": 7.697024345397949, "rewards/rejected": -15.87609577178955, "sft_loss": 1.0804804563522339, "step": 545 }, { "epoch": 0.9036144578313253, "grad_norm": 11.048677395421713, "learning_rate": 3.096003453549549e-07, "logits/chosen": 19.185258865356445, "logits/rejected": 17.547964096069336, "logps/chosen": -356.4489440917969, "logps/rejected": -257.8424987792969, "loss": 0.0805, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.774456977844238, "rewards/margins": 8.595198631286621, "rewards/rejected": -17.36965560913086, "sft_loss": 1.0598615407943726, "step": 550 }, { "epoch": 0.911829134720701, "grad_norm": 2.5715819705074447, "learning_rate": 3.06292976315371e-07, "logits/chosen": 18.52203369140625, "logits/rejected": 17.04202651977539, "logps/chosen": -332.1985168457031, "logps/rejected": -236.9717254638672, "loss": 0.0696, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.198724746704102, "rewards/margins": 7.527222156524658, "rewards/rejected": -15.725946426391602, "sft_loss": 1.1585354804992676, "step": 555 }, { "epoch": 0.9200438116100766, "grad_norm": 5.125385527021205, "learning_rate": 3.0297519553606324e-07, "logits/chosen": 19.936016082763672, "logits/rejected": 18.45525360107422, "logps/chosen": -308.65936279296875, "logps/rejected": -223.3343048095703, "loss": 0.0879, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -7.173832893371582, "rewards/margins": 7.109025001525879, "rewards/rejected": -14.282858848571777, "sft_loss": 1.038684368133545, "step": 560 }, { "epoch": 0.9282584884994524, "grad_norm": 3.7082409068486073, "learning_rate": 2.996476166614363e-07, "logits/chosen": 19.476381301879883, "logits/rejected": 17.608320236206055, "logps/chosen": -329.3957214355469, "logps/rejected": -218.0595703125, "loss": 0.0885, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -6.555365085601807, "rewards/margins": 6.53586483001709, "rewards/rejected": -13.091230392456055, "sft_loss": 1.0041550397872925, "step": 565 }, { "epoch": 0.9364731653888281, "grad_norm": 4.785307756807971, "learning_rate": 2.963108551481142e-07, "logits/chosen": 20.289274215698242, "logits/rejected": 18.269121170043945, "logps/chosen": -354.8280029296875, "logps/rejected": -232.3466796875, "loss": 0.0877, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -6.978261470794678, "rewards/margins": 7.397289752960205, "rewards/rejected": -14.3755521774292, "sft_loss": 1.049277424812317, "step": 570 }, { "epoch": 0.9446878422782037, "grad_norm": 2.70866282954725, "learning_rate": 2.929655281511075e-07, "logits/chosen": 18.807231903076172, "logits/rejected": 17.12908363342285, "logps/chosen": -354.8686828613281, "logps/rejected": -238.53443908691406, "loss": 0.0648, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.602488994598389, "rewards/margins": 7.430182933807373, "rewards/rejected": -15.032671928405762, "sft_loss": 1.1179447174072266, "step": 575 }, { "epoch": 0.9529025191675794, "grad_norm": 1.9988385632818255, "learning_rate": 2.896122544096667e-07, "logits/chosen": 18.314443588256836, "logits/rejected": 17.118688583374023, "logps/chosen": -302.5119934082031, "logps/rejected": -227.22274780273438, "loss": 0.0808, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.768359661102295, "rewards/margins": 7.082144737243652, "rewards/rejected": -14.850504875183105, "sft_loss": 1.1123638153076172, "step": 580 }, { "epoch": 0.9611171960569551, "grad_norm": 3.2519592276620872, "learning_rate": 2.8625165413284307e-07, "logits/chosen": 18.069074630737305, "logits/rejected": 17.50326919555664, "logps/chosen": -342.6742248535156, "logps/rejected": -235.6639862060547, "loss": 0.0951, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -7.134052753448486, "rewards/margins": 7.267971515655518, "rewards/rejected": -14.402023315429688, "sft_loss": 1.0139853954315186, "step": 585 }, { "epoch": 0.9693318729463308, "grad_norm": 3.802310527872131, "learning_rate": 2.8288434888477626e-07, "logits/chosen": 20.131250381469727, "logits/rejected": 18.442293167114258, "logps/chosen": -264.7261047363281, "logps/rejected": -194.17242431640625, "loss": 0.0728, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -6.163069248199463, "rewards/margins": 5.962526798248291, "rewards/rejected": -12.125596046447754, "sft_loss": 0.9994211196899414, "step": 590 }, { "epoch": 0.9775465498357064, "grad_norm": 4.623566930411208, "learning_rate": 2.795109614697326e-07, "logits/chosen": 19.592060089111328, "logits/rejected": 18.126672744750977, "logps/chosen": -295.9714050292969, "logps/rejected": -204.66427612304688, "loss": 0.0873, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -6.2958526611328125, "rewards/margins": 6.357571601867676, "rewards/rejected": -12.653424263000488, "sft_loss": 0.9684253931045532, "step": 595 }, { "epoch": 0.9857612267250822, "grad_norm": 6.368570274437731, "learning_rate": 2.761321158169134e-07, "logits/chosen": 20.56801414489746, "logits/rejected": 19.605268478393555, "logps/chosen": -325.0837097167969, "logps/rejected": -226.78863525390625, "loss": 0.0896, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -6.953873634338379, "rewards/margins": 7.266047954559326, "rewards/rejected": -14.219923973083496, "sft_loss": 1.0414397716522217, "step": 600 }, { "epoch": 0.9939759036144579, "grad_norm": 3.750987661763106, "learning_rate": 2.727484368650553e-07, "logits/chosen": 17.04819107055664, "logits/rejected": 16.657819747924805, "logps/chosen": -309.3585205078125, "logps/rejected": -228.43280029296875, "loss": 0.0826, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.683638572692871, "rewards/margins": 7.000718116760254, "rewards/rejected": -14.684355735778809, "sft_loss": 1.13958740234375, "step": 605 }, { "epoch": 1.0021905805038336, "grad_norm": 2.4122584075980966, "learning_rate": 2.6936055044684425e-07, "logits/chosen": 18.877971649169922, "logits/rejected": 17.84745979309082, "logps/chosen": -272.0718078613281, "logps/rejected": -214.1260223388672, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": -7.77817440032959, "rewards/margins": 6.673606872558594, "rewards/rejected": -14.4517822265625, "sft_loss": 1.05906343460083, "step": 610 }, { "epoch": 1.0104052573932092, "grad_norm": 4.202536280729459, "learning_rate": 2.659690831731631e-07, "logits/chosen": 18.819133758544922, "logits/rejected": 18.853425979614258, "logps/chosen": -313.6888427734375, "logps/rejected": -247.36912536621094, "loss": 0.0479, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.692021369934082, "rewards/margins": 7.864367961883545, "rewards/rejected": -16.5563907623291, "sft_loss": 1.0393388271331787, "step": 615 }, { "epoch": 1.0186199342825848, "grad_norm": 3.9279717638704224, "learning_rate": 2.6257466231719676e-07, "logits/chosen": 16.448410034179688, "logits/rejected": 15.970653533935547, "logps/chosen": -357.7733154296875, "logps/rejected": -276.7339782714844, "loss": 0.0375, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.677971839904785, "rewards/margins": 9.011359214782715, "rewards/rejected": -18.6893310546875, "sft_loss": 1.3019551038742065, "step": 620 }, { "epoch": 1.0268346111719606, "grad_norm": 2.4098074473851, "learning_rate": 2.591779156984137e-07, "logits/chosen": 18.19355010986328, "logits/rejected": 16.887168884277344, "logps/chosen": -331.2437744140625, "logps/rejected": -270.1482849121094, "loss": 0.0655, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -10.20653247833252, "rewards/margins": 8.743158340454102, "rewards/rejected": -18.949687957763672, "sft_loss": 1.1369999647140503, "step": 625 }, { "epoch": 1.0350492880613362, "grad_norm": 2.015176030113295, "learning_rate": 2.557794715664465e-07, "logits/chosen": 18.09113311767578, "logits/rejected": 16.872072219848633, "logps/chosen": -343.4937438964844, "logps/rejected": -252.0245819091797, "loss": 0.0574, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.15774154663086, "rewards/margins": 7.730542182922363, "rewards/rejected": -16.888282775878906, "sft_loss": 1.1037859916687012, "step": 630 }, { "epoch": 1.0432639649507118, "grad_norm": 4.817911475238817, "learning_rate": 2.5237995848489417e-07, "logits/chosen": 19.07139778137207, "logits/rejected": 17.200511932373047, "logps/chosen": -335.5242614746094, "logps/rejected": -225.45147705078125, "loss": 0.0736, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -6.962551593780518, "rewards/margins": 7.362425804138184, "rewards/rejected": -14.32497787475586, "sft_loss": 1.0912320613861084, "step": 635 }, { "epoch": 1.0514786418400877, "grad_norm": 5.399497912094965, "learning_rate": 2.48980005215064e-07, "logits/chosen": 19.333988189697266, "logits/rejected": 18.970035552978516, "logps/chosen": -263.4971008300781, "logps/rejected": -204.3727569580078, "loss": 0.0747, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.033030986785889, "rewards/margins": 6.5659332275390625, "rewards/rejected": -13.598965644836426, "sft_loss": 1.2809529304504395, "step": 640 }, { "epoch": 1.0596933187294633, "grad_norm": 5.192926508245981, "learning_rate": 2.45580240599679e-07, "logits/chosen": 18.814851760864258, "logits/rejected": 19.112171173095703, "logps/chosen": -368.88470458984375, "logps/rejected": -262.5347900390625, "loss": 0.073, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -7.645462512969971, "rewards/margins": 8.707850456237793, "rewards/rejected": -16.353313446044922, "sft_loss": 1.1692014932632446, "step": 645 }, { "epoch": 1.067907995618839, "grad_norm": 2.533086454702698, "learning_rate": 2.421812934465696e-07, "logits/chosen": 20.8750057220459, "logits/rejected": 18.880239486694336, "logps/chosen": -341.56805419921875, "logps/rejected": -238.24147033691406, "loss": 0.0526, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.804737567901611, "rewards/margins": 7.901425838470459, "rewards/rejected": -15.706161499023438, "sft_loss": 1.0994611978530884, "step": 650 }, { "epoch": 1.0761226725082147, "grad_norm": 1.8972892397596355, "learning_rate": 2.3878379241237134e-07, "logits/chosen": 18.171382904052734, "logits/rejected": 17.349416732788086, "logps/chosen": -333.02862548828125, "logps/rejected": -257.3088073730469, "loss": 0.0702, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.708560943603516, "rewards/margins": 8.212488174438477, "rewards/rejected": -17.92104721069336, "sft_loss": 1.3080743551254272, "step": 655 }, { "epoch": 1.0843373493975903, "grad_norm": 7.296587489211492, "learning_rate": 2.3538836588625077e-07, "logits/chosen": 16.028032302856445, "logits/rejected": 15.767348289489746, "logps/chosen": -302.0737609863281, "logps/rejected": -251.35279846191406, "loss": 0.0709, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.596309661865234, "rewards/margins": 8.214980125427246, "rewards/rejected": -17.811288833618164, "sft_loss": 1.4402241706848145, "step": 660 }, { "epoch": 1.0925520262869661, "grad_norm": 2.7162575555609547, "learning_rate": 2.3199564187368153e-07, "logits/chosen": 18.975399017333984, "logits/rejected": 17.070423126220703, "logps/chosen": -378.0216369628906, "logps/rejected": -273.23687744140625, "loss": 0.0699, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.879487037658691, "rewards/margins": 8.538008689880371, "rewards/rejected": -18.41749382019043, "sft_loss": 1.127976655960083, "step": 665 }, { "epoch": 1.1007667031763417, "grad_norm": 10.07225580885619, "learning_rate": 2.2860624788029013e-07, "logits/chosen": 17.706756591796875, "logits/rejected": 17.792144775390625, "logps/chosen": -291.8415832519531, "logps/rejected": -239.1862030029297, "loss": 0.0756, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.226842880249023, "rewards/margins": 7.920691013336182, "rewards/rejected": -16.147533416748047, "sft_loss": 1.1893837451934814, "step": 670 }, { "epoch": 1.1089813800657173, "grad_norm": 4.017695151821307, "learning_rate": 2.2522081079579497e-07, "logits/chosen": 17.455995559692383, "logits/rejected": 17.200504302978516, "logps/chosen": -316.4137878417969, "logps/rejected": -253.0958709716797, "loss": 0.0701, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -8.296252250671387, "rewards/margins": 8.523763656616211, "rewards/rejected": -16.82001495361328, "sft_loss": 1.2573199272155762, "step": 675 }, { "epoch": 1.1171960569550932, "grad_norm": 2.9229430403917416, "learning_rate": 2.2183995677805967e-07, "logits/chosen": 17.359153747558594, "logits/rejected": 17.15728759765625, "logps/chosen": -337.8786315917969, "logps/rejected": -256.97259521484375, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": -8.227910041809082, "rewards/margins": 8.76689624786377, "rewards/rejected": -16.99480628967285, "sft_loss": 1.1089237928390503, "step": 680 }, { "epoch": 1.1254107338444688, "grad_norm": 3.6239751013897306, "learning_rate": 2.1846431113728062e-07, "logits/chosen": 17.55232048034668, "logits/rejected": 17.560558319091797, "logps/chosen": -321.5279846191406, "logps/rejected": -261.1816711425781, "loss": 0.0621, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.55235767364502, "rewards/margins": 9.196606636047363, "rewards/rejected": -17.748966217041016, "sft_loss": 1.1366941928863525, "step": 685 }, { "epoch": 1.1336254107338444, "grad_norm": 6.419284533750765, "learning_rate": 2.1509449822033205e-07, "logits/chosen": 18.428213119506836, "logits/rejected": 17.30073356628418, "logps/chosen": -371.9090576171875, "logps/rejected": -266.3111572265625, "loss": 0.0415, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.018980026245117, "rewards/margins": 8.834003448486328, "rewards/rejected": -17.852983474731445, "sft_loss": 1.149424433708191, "step": 690 }, { "epoch": 1.1418400876232202, "grad_norm": 6.07442729979061, "learning_rate": 2.1173114129528957e-07, "logits/chosen": 17.701704025268555, "logits/rejected": 17.770456314086914, "logps/chosen": -307.99798583984375, "logps/rejected": -250.03758239746094, "loss": 0.0795, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -9.118383407592773, "rewards/margins": 8.651429176330566, "rewards/rejected": -17.769811630249023, "sft_loss": 1.3337371349334717, "step": 695 }, { "epoch": 1.1500547645125958, "grad_norm": 3.7073147975104885, "learning_rate": 2.0837486243615226e-07, "logits/chosen": 19.352502822875977, "logits/rejected": 17.753198623657227, "logps/chosen": -390.12890625, "logps/rejected": -298.8519592285156, "loss": 0.0802, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -10.813799858093262, "rewards/margins": 9.796292304992676, "rewards/rejected": -20.610090255737305, "sft_loss": 1.080621600151062, "step": 700 }, { "epoch": 1.1582694414019716, "grad_norm": 3.173868712663281, "learning_rate": 2.0502628240778653e-07, "logits/chosen": 18.959392547607422, "logits/rejected": 19.250455856323242, "logps/chosen": -345.37005615234375, "logps/rejected": -278.861328125, "loss": 0.073, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.555724143981934, "rewards/margins": 9.474061012268066, "rewards/rejected": -19.02978515625, "sft_loss": 1.0641828775405884, "step": 705 }, { "epoch": 1.1664841182913472, "grad_norm": 5.00787760201323, "learning_rate": 2.0168602055111173e-07, "logits/chosen": 18.32485008239746, "logits/rejected": 17.315608978271484, "logps/chosen": -337.5779724121094, "logps/rejected": -283.7364196777344, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": -9.335200309753418, "rewards/margins": 10.991543769836426, "rewards/rejected": -20.32674217224121, "sft_loss": 1.1922962665557861, "step": 710 }, { "epoch": 1.1746987951807228, "grad_norm": 4.0968602022146845, "learning_rate": 1.9835469466854887e-07, "logits/chosen": 18.45452880859375, "logits/rejected": 16.768016815185547, "logps/chosen": -344.7873840332031, "logps/rejected": -259.2659912109375, "loss": 0.0523, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.04322624206543, "rewards/margins": 8.293071746826172, "rewards/rejected": -17.336299896240234, "sft_loss": 1.140812635421753, "step": 715 }, { "epoch": 1.1829134720700987, "grad_norm": 4.912796552705337, "learning_rate": 1.9503292090975454e-07, "logits/chosen": 19.511587142944336, "logits/rejected": 18.789897918701172, "logps/chosen": -267.87982177734375, "logps/rejected": -212.76730346679688, "loss": 0.069, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -7.6522440910339355, "rewards/margins": 6.9143452644348145, "rewards/rejected": -14.56658935546875, "sft_loss": 1.1043713092803955, "step": 720 }, { "epoch": 1.1911281489594743, "grad_norm": 3.7513682612693287, "learning_rate": 1.917213136576602e-07, "logits/chosen": 19.222810745239258, "logits/rejected": 18.952795028686523, "logps/chosen": -317.2191162109375, "logps/rejected": -242.29966735839844, "loss": 0.0429, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -7.920867919921875, "rewards/margins": 8.166583061218262, "rewards/rejected": -16.08745002746582, "sft_loss": 1.0877418518066406, "step": 725 }, { "epoch": 1.1993428258488499, "grad_norm": 5.446402649753336, "learning_rate": 1.8842048541483756e-07, "logits/chosen": 19.713359832763672, "logits/rejected": 18.361555099487305, "logps/chosen": -319.3419189453125, "logps/rejected": -238.31849670410156, "loss": 0.0688, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.05375862121582, "rewards/margins": 7.3250813484191895, "rewards/rejected": -16.378841400146484, "sft_loss": 1.220839023590088, "step": 730 }, { "epoch": 1.2075575027382257, "grad_norm": 3.597263962070461, "learning_rate": 1.8513104669021314e-07, "logits/chosen": 18.05857276916504, "logits/rejected": 17.213830947875977, "logps/chosen": -345.5421142578125, "logps/rejected": -268.8918151855469, "loss": 0.069, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.973356246948242, "rewards/margins": 8.510116577148438, "rewards/rejected": -18.48347282409668, "sft_loss": 1.1453214883804321, "step": 735 }, { "epoch": 1.2157721796276013, "grad_norm": 9.406473544966678, "learning_rate": 1.8185360588615057e-07, "logits/chosen": 19.28816795349121, "logits/rejected": 17.978492736816406, "logps/chosen": -365.7281799316406, "logps/rejected": -270.6371154785156, "loss": 0.0678, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.327936172485352, "rewards/margins": 8.730137825012207, "rewards/rejected": -18.058074951171875, "sft_loss": 1.1658827066421509, "step": 740 }, { "epoch": 1.223986856516977, "grad_norm": 3.7662686205808584, "learning_rate": 1.7858876918592232e-07, "logits/chosen": 17.326171875, "logits/rejected": 17.195329666137695, "logps/chosen": -301.6247863769531, "logps/rejected": -246.91964721679688, "loss": 0.0644, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -8.8589506149292, "rewards/margins": 8.645825386047363, "rewards/rejected": -17.50477409362793, "sft_loss": 1.1279815435409546, "step": 745 }, { "epoch": 1.2322015334063527, "grad_norm": 3.8887147488856986, "learning_rate": 1.7533714044159299e-07, "logits/chosen": 18.013948440551758, "logits/rejected": 16.35959243774414, "logps/chosen": -331.99609375, "logps/rejected": -249.0239715576172, "loss": 0.0704, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.633424758911133, "rewards/margins": 7.326272487640381, "rewards/rejected": -16.95969581604004, "sft_loss": 1.6165919303894043, "step": 750 }, { "epoch": 1.2404162102957283, "grad_norm": 2.5663067455515747, "learning_rate": 1.7209932106233264e-07, "logits/chosen": 16.976768493652344, "logits/rejected": 17.75543785095215, "logps/chosen": -336.4115905761719, "logps/rejected": -277.32012939453125, "loss": 0.0531, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.428933143615723, "rewards/margins": 9.152985572814941, "rewards/rejected": -18.58191680908203, "sft_loss": 1.1206488609313965, "step": 755 }, { "epoch": 1.248630887185104, "grad_norm": 7.904031033651109, "learning_rate": 1.688759099031824e-07, "logits/chosen": 17.9715633392334, "logits/rejected": 16.856943130493164, "logps/chosen": -358.84051513671875, "logps/rejected": -278.71514892578125, "loss": 0.0624, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -9.98255729675293, "rewards/margins": 9.00979995727539, "rewards/rejected": -18.99235725402832, "sft_loss": 1.155042290687561, "step": 760 }, { "epoch": 1.2568455640744798, "grad_norm": 4.102166977987274, "learning_rate": 1.656675031542925e-07, "logits/chosen": 19.007150650024414, "logits/rejected": 18.558509826660156, "logps/chosen": -375.6373291015625, "logps/rejected": -275.5815734863281, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": -9.460150718688965, "rewards/margins": 9.107392311096191, "rewards/rejected": -18.567543029785156, "sft_loss": 1.1842076778411865, "step": 765 }, { "epoch": 1.2650602409638554, "grad_norm": 6.075552965079597, "learning_rate": 1.6247469423065343e-07, "logits/chosen": 18.225561141967773, "logits/rejected": 16.852903366088867, "logps/chosen": -316.5226135253906, "logps/rejected": -239.98927307128906, "loss": 0.0567, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.122289657592773, "rewards/margins": 7.42488431930542, "rewards/rejected": -16.54717445373535, "sft_loss": 1.2285444736480713, "step": 770 }, { "epoch": 1.273274917853231, "grad_norm": 1.6786737917411083, "learning_rate": 1.5929807366233977e-07, "logits/chosen": 18.135303497314453, "logits/rejected": 16.776866912841797, "logps/chosen": -388.01239013671875, "logps/rejected": -284.525390625, "loss": 0.0526, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.297353744506836, "rewards/margins": 9.676100730895996, "rewards/rejected": -18.97345542907715, "sft_loss": 1.1469649076461792, "step": 775 }, { "epoch": 1.2814895947426068, "grad_norm": 2.6753220341135213, "learning_rate": 1.5613822898528794e-07, "logits/chosen": 17.79352378845215, "logits/rejected": 17.512025833129883, "logps/chosen": -331.31829833984375, "logps/rejected": -265.259765625, "loss": 0.0508, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.44349479675293, "rewards/margins": 8.841734886169434, "rewards/rejected": -18.285228729248047, "sft_loss": 1.3215175867080688, "step": 780 }, { "epoch": 1.2897042716319824, "grad_norm": 8.768934877423389, "learning_rate": 1.5299574463262794e-07, "logits/chosen": 17.9199275970459, "logits/rejected": 17.427675247192383, "logps/chosen": -371.37432861328125, "logps/rejected": -277.9888916015625, "loss": 0.0713, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.927423477172852, "rewards/margins": 9.477466583251953, "rewards/rejected": -18.404890060424805, "sft_loss": 1.1020786762237549, "step": 785 }, { "epoch": 1.297918948521358, "grad_norm": 3.060484827435086, "learning_rate": 1.4987120182658877e-07, "logits/chosen": 18.74033546447754, "logits/rejected": 19.323453903198242, "logps/chosen": -322.2007141113281, "logps/rejected": -249.55172729492188, "loss": 0.0521, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -7.6707940101623535, "rewards/margins": 8.992262840270996, "rewards/rejected": -16.663057327270508, "sft_loss": 1.0123049020767212, "step": 790 }, { "epoch": 1.3061336254107339, "grad_norm": 2.7470931382147814, "learning_rate": 1.4676517847099745e-07, "logits/chosen": 19.513704299926758, "logits/rejected": 19.02146339416504, "logps/chosen": -285.7082214355469, "logps/rejected": -215.02732849121094, "loss": 0.0717, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -6.395638465881348, "rewards/margins": 7.340782642364502, "rewards/rejected": -13.736421585083008, "sft_loss": 0.9653832316398621, "step": 795 }, { "epoch": 1.3143483023001095, "grad_norm": 6.310958305446118, "learning_rate": 1.4367824904439242e-07, "logits/chosen": 20.603116989135742, "logits/rejected": 19.180021286010742, "logps/chosen": -326.3525085449219, "logps/rejected": -239.44825744628906, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": -7.47385311126709, "rewards/margins": 7.884840488433838, "rewards/rejected": -15.358694076538086, "sft_loss": 0.995606005191803, "step": 800 }, { "epoch": 1.3225629791894853, "grad_norm": 5.691699841334073, "learning_rate": 1.4061098449376985e-07, "logits/chosen": 19.0198974609375, "logits/rejected": 17.974164962768555, "logps/chosen": -380.7705383300781, "logps/rejected": -275.5369873046875, "loss": 0.0558, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.497329711914062, "rewards/margins": 9.07253646850586, "rewards/rejected": -17.569866180419922, "sft_loss": 1.1454724073410034, "step": 805 }, { "epoch": 1.330777656078861, "grad_norm": 3.2559120936424177, "learning_rate": 1.375639521289836e-07, "logits/chosen": 18.49973487854004, "logits/rejected": 16.317462921142578, "logps/chosen": -342.45751953125, "logps/rejected": -251.50741577148438, "loss": 0.0641, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.949553489685059, "rewards/margins": 7.948643684387207, "rewards/rejected": -16.8981990814209, "sft_loss": 1.1290977001190186, "step": 810 }, { "epoch": 1.3389923329682367, "grad_norm": 5.544530645757365, "learning_rate": 1.3453771551781756e-07, "logits/chosen": 16.78171157836914, "logits/rejected": 16.938737869262695, "logps/chosen": -313.18890380859375, "logps/rejected": -255.20042419433594, "loss": 0.0633, "rewards/accuracies": 1.0, "rewards/chosen": -8.961526870727539, "rewards/margins": 8.269176483154297, "rewards/rejected": -17.23070526123047, "sft_loss": 1.2107350826263428, "step": 815 }, { "epoch": 1.3472070098576123, "grad_norm": 6.724733185846292, "learning_rate": 1.3153283438175034e-07, "logits/chosen": 16.646345138549805, "logits/rejected": 15.98381519317627, "logps/chosen": -331.96099853515625, "logps/rejected": -259.468017578125, "loss": 0.0652, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.80189037322998, "rewards/margins": 8.555667877197266, "rewards/rejected": -18.357561111450195, "sft_loss": 1.1712498664855957, "step": 820 }, { "epoch": 1.355421686746988, "grad_norm": 3.2889655668134083, "learning_rate": 1.2854986449243124e-07, "logits/chosen": 17.65423011779785, "logits/rejected": 17.50803565979004, "logps/chosen": -319.4832458496094, "logps/rejected": -263.18182373046875, "loss": 0.0445, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.653801918029785, "rewards/margins": 8.747140884399414, "rewards/rejected": -18.400943756103516, "sft_loss": 1.077059268951416, "step": 825 }, { "epoch": 1.3636363636363638, "grad_norm": 3.0357613310647613, "learning_rate": 1.2558935756888675e-07, "logits/chosen": 16.95936393737793, "logits/rejected": 16.559120178222656, "logps/chosen": -322.41357421875, "logps/rejected": -263.3858947753906, "loss": 0.0624, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.595723152160645, "rewards/margins": 9.040340423583984, "rewards/rejected": -18.636064529418945, "sft_loss": 1.1398062705993652, "step": 830 }, { "epoch": 1.3718510405257394, "grad_norm": 2.8650653830144184, "learning_rate": 1.226518611754767e-07, "logits/chosen": 19.48563003540039, "logits/rejected": 18.318378448486328, "logps/chosen": -321.63446044921875, "logps/rejected": -262.0445861816406, "loss": 0.0482, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.1325101852417, "rewards/margins": 8.926721572875977, "rewards/rejected": -18.05923080444336, "sft_loss": 1.1335971355438232, "step": 835 }, { "epoch": 1.380065717415115, "grad_norm": 6.425860846095936, "learning_rate": 1.1973791862061871e-07, "logits/chosen": 17.781869888305664, "logits/rejected": 16.827882766723633, "logps/chosen": -339.0770568847656, "logps/rejected": -266.6395568847656, "loss": 0.0703, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.701363563537598, "rewards/margins": 9.394256591796875, "rewards/rejected": -18.095619201660156, "sft_loss": 1.0754181146621704, "step": 840 }, { "epoch": 1.3882803943044908, "grad_norm": 2.9739559158072484, "learning_rate": 1.1684806885630003e-07, "logits/chosen": 18.772544860839844, "logits/rejected": 18.669767379760742, "logps/chosen": -341.22998046875, "logps/rejected": -270.3790283203125, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": -8.742754936218262, "rewards/margins": 9.263623237609863, "rewards/rejected": -18.006380081176758, "sft_loss": 1.1030330657958984, "step": 845 }, { "epoch": 1.3964950711938664, "grad_norm": 3.70632539902967, "learning_rate": 1.1398284637839486e-07, "logits/chosen": 19.672170639038086, "logits/rejected": 18.065275192260742, "logps/chosen": -303.6903076171875, "logps/rejected": -230.23797607421875, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": -8.658843040466309, "rewards/margins": 7.194061756134033, "rewards/rejected": -15.85290241241455, "sft_loss": 1.3414223194122314, "step": 850 }, { "epoch": 1.404709748083242, "grad_norm": 3.8454291780017145, "learning_rate": 1.1114278112780601e-07, "logits/chosen": 19.258955001831055, "logits/rejected": 17.727506637573242, "logps/chosen": -387.6003112792969, "logps/rejected": -285.1304016113281, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": -9.63217830657959, "rewards/margins": 9.642850875854492, "rewards/rejected": -19.27503204345703, "sft_loss": 1.0949971675872803, "step": 855 }, { "epoch": 1.4129244249726178, "grad_norm": 81.05899714514051, "learning_rate": 1.08328398392449e-07, "logits/chosen": 18.748620986938477, "logits/rejected": 17.344341278076172, "logps/chosen": -355.6446533203125, "logps/rejected": -269.2040710449219, "loss": 0.0629, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.797952651977539, "rewards/margins": 8.548418045043945, "rewards/rejected": -18.346372604370117, "sft_loss": 1.1425796747207642, "step": 860 }, { "epoch": 1.4211391018619934, "grad_norm": 4.719466773161234, "learning_rate": 1.0554021871009677e-07, "logits/chosen": 18.971662521362305, "logits/rejected": 16.849485397338867, "logps/chosen": -342.5481872558594, "logps/rejected": -268.31475830078125, "loss": 0.0464, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.3140230178833, "rewards/margins": 9.42198657989502, "rewards/rejected": -18.73600959777832, "sft_loss": 1.3192195892333984, "step": 865 }, { "epoch": 1.429353778751369, "grad_norm": 3.6243101382632688, "learning_rate": 1.0277875777210299e-07, "logits/chosen": 16.66229820251465, "logits/rejected": 15.501757621765137, "logps/chosen": -332.5435791015625, "logps/rejected": -261.5600280761719, "loss": 0.0537, "rewards/accuracies": 1.0, "rewards/chosen": -9.856383323669434, "rewards/margins": 8.77900218963623, "rewards/rejected": -18.635387420654297, "sft_loss": 1.3038902282714844, "step": 870 }, { "epoch": 1.4375684556407449, "grad_norm": 2.7537435903936074, "learning_rate": 1.0004452632802158e-07, "logits/chosen": 18.529787063598633, "logits/rejected": 17.4042911529541, "logps/chosen": -337.29791259765625, "logps/rejected": -277.3015441894531, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": -9.470120429992676, "rewards/margins": 9.908967971801758, "rewards/rejected": -19.379091262817383, "sft_loss": 1.2056940793991089, "step": 875 }, { "epoch": 1.4457831325301205, "grad_norm": 5.683629925320703, "learning_rate": 9.733803009114044e-08, "logits/chosen": 18.083925247192383, "logits/rejected": 16.712495803833008, "logps/chosen": -335.4129943847656, "logps/rejected": -262.9490966796875, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": -9.186826705932617, "rewards/margins": 8.34049129486084, "rewards/rejected": -17.527315139770508, "sft_loss": 1.1717486381530762, "step": 880 }, { "epoch": 1.453997809419496, "grad_norm": 3.283485850050913, "learning_rate": 9.465976964494682e-08, "logits/chosen": 17.851030349731445, "logits/rejected": 17.46946144104004, "logps/chosen": -280.1820373535156, "logps/rejected": -242.58209228515625, "loss": 0.0529, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.64423656463623, "rewards/margins": 8.354222297668457, "rewards/rejected": -16.998458862304688, "sft_loss": 1.281132459640503, "step": 885 }, { "epoch": 1.462212486308872, "grad_norm": 2.4606306801838347, "learning_rate": 9.201024035054053e-08, "logits/chosen": 18.96923065185547, "logits/rejected": 17.189115524291992, "logps/chosen": -295.1808166503906, "logps/rejected": -223.66261291503906, "loss": 0.0605, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.9287748336792, "rewards/margins": 7.127224445343018, "rewards/rejected": -16.055997848510742, "sft_loss": 1.340820074081421, "step": 890 }, { "epoch": 1.4704271631982475, "grad_norm": 2.471728869379881, "learning_rate": 8.938993225501495e-08, "logits/chosen": 19.450706481933594, "logits/rejected": 18.156089782714844, "logps/chosen": -343.1815490722656, "logps/rejected": -268.55767822265625, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": -8.863726615905762, "rewards/margins": 9.138747215270996, "rewards/rejected": -18.002471923828125, "sft_loss": 1.042392611503601, "step": 895 }, { "epoch": 1.4786418400876231, "grad_norm": 2.702831742745651, "learning_rate": 8.679933000081879e-08, "logits/chosen": 18.009992599487305, "logits/rejected": 17.068376541137695, "logps/chosen": -340.9268798828125, "logps/rejected": -244.87017822265625, "loss": 0.0603, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -8.484542846679688, "rewards/margins": 8.053364753723145, "rewards/rejected": -16.53791046142578, "sft_loss": 1.1751140356063843, "step": 900 }, { "epoch": 1.486856516976999, "grad_norm": 4.0381849618513135, "learning_rate": 8.423891273611855e-08, "logits/chosen": 16.985027313232422, "logits/rejected": 16.37629508972168, "logps/chosen": -300.3452453613281, "logps/rejected": -240.13729858398438, "loss": 0.0587, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.195282936096191, "rewards/margins": 8.371771812438965, "rewards/rejected": -16.567054748535156, "sft_loss": 1.1615040302276611, "step": 905 }, { "epoch": 1.4950711938663745, "grad_norm": 3.373858379077247, "learning_rate": 8.170915402617739e-08, "logits/chosen": 17.579252243041992, "logits/rejected": 16.743297576904297, "logps/chosen": -347.9422912597656, "logps/rejected": -262.06610107421875, "loss": 0.0669, "rewards/accuracies": 1.0, "rewards/chosen": -9.031808853149414, "rewards/margins": 8.512653350830078, "rewards/rejected": -17.544462203979492, "sft_loss": 1.1611533164978027, "step": 910 }, { "epoch": 1.5032858707557502, "grad_norm": 1.8601145872189842, "learning_rate": 7.921052176576643e-08, "logits/chosen": 17.42083740234375, "logits/rejected": 17.060815811157227, "logps/chosen": -306.4146728515625, "logps/rejected": -253.76126098632812, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": -9.15539264678955, "rewards/margins": 8.407408714294434, "rewards/rejected": -17.562801361083984, "sft_loss": 1.0932406187057495, "step": 915 }, { "epoch": 1.511500547645126, "grad_norm": 4.848816782812418, "learning_rate": 7.674347809262377e-08, "logits/chosen": 17.446378707885742, "logits/rejected": 17.68989372253418, "logps/chosen": -295.2680969238281, "logps/rejected": -246.88856506347656, "loss": 0.0616, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.591629028320312, "rewards/margins": 8.282797813415527, "rewards/rejected": -16.874427795410156, "sft_loss": 1.1592586040496826, "step": 920 }, { "epoch": 1.5197152245345018, "grad_norm": 3.1171315520626317, "learning_rate": 7.430847930198009e-08, "logits/chosen": 18.438880920410156, "logits/rejected": 16.9648494720459, "logps/chosen": -359.6697082519531, "logps/rejected": -265.92462158203125, "loss": 0.0599, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.22867202758789, "rewards/margins": 9.071920394897461, "rewards/rejected": -18.300594329833984, "sft_loss": 1.3760299682617188, "step": 925 }, { "epoch": 1.5279299014238772, "grad_norm": 2.946496194046417, "learning_rate": 7.190597576216384e-08, "logits/chosen": 17.264368057250977, "logits/rejected": 17.493934631347656, "logps/chosen": -341.4744873046875, "logps/rejected": -280.3497619628906, "loss": 0.0357, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.569901466369629, "rewards/margins": 9.21591854095459, "rewards/rejected": -18.785818099975586, "sft_loss": 1.1669524908065796, "step": 930 }, { "epoch": 1.536144578313253, "grad_norm": 4.598015503663298, "learning_rate": 6.953641183130224e-08, "logits/chosen": 18.845075607299805, "logits/rejected": 16.610898971557617, "logps/chosen": -364.0211181640625, "logps/rejected": -261.7635803222656, "loss": 0.055, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.992565155029297, "rewards/margins": 8.313262939453125, "rewards/rejected": -18.305830001831055, "sft_loss": 1.251989722251892, "step": 935 }, { "epoch": 1.5443592552026288, "grad_norm": 3.1309871505463733, "learning_rate": 6.720022577513507e-08, "logits/chosen": 17.22759246826172, "logits/rejected": 15.910391807556152, "logps/chosen": -371.6169738769531, "logps/rejected": -277.5339050292969, "loss": 0.0495, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -10.53276538848877, "rewards/margins": 8.91848087310791, "rewards/rejected": -19.451250076293945, "sft_loss": 1.2916030883789062, "step": 940 }, { "epoch": 1.5525739320920042, "grad_norm": 6.107763356196428, "learning_rate": 6.489784968595444e-08, "logits/chosen": 18.367130279541016, "logits/rejected": 16.745647430419922, "logps/chosen": -372.9920654296875, "logps/rejected": -294.3865966796875, "loss": 0.0797, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -10.943706512451172, "rewards/margins": 10.03380298614502, "rewards/rejected": -20.977510452270508, "sft_loss": 1.2319529056549072, "step": 945 }, { "epoch": 1.56078860898138, "grad_norm": 5.019635803171075, "learning_rate": 6.262970940268652e-08, "logits/chosen": 17.80394172668457, "logits/rejected": 17.284276962280273, "logps/chosen": -319.9245910644531, "logps/rejected": -272.44287109375, "loss": 0.0502, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -10.027565956115723, "rewards/margins": 9.023143768310547, "rewards/rejected": -19.05070686340332, "sft_loss": 1.1501847505569458, "step": 950 }, { "epoch": 1.5690032858707559, "grad_norm": 2.5718093284663017, "learning_rate": 6.039622443213008e-08, "logits/chosen": 17.769981384277344, "logits/rejected": 17.192481994628906, "logps/chosen": -338.2123718261719, "logps/rejected": -270.8006286621094, "loss": 0.0455, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -10.453187942504883, "rewards/margins": 8.49782943725586, "rewards/rejected": -18.95101547241211, "sft_loss": 1.23969304561615, "step": 955 }, { "epoch": 1.5772179627601315, "grad_norm": 2.7871237377104685, "learning_rate": 5.8197807871366e-08, "logits/chosen": 16.671855926513672, "logits/rejected": 15.88489818572998, "logps/chosen": -383.57757568359375, "logps/rejected": -302.80078125, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": -10.86944580078125, "rewards/margins": 9.499520301818848, "rewards/rejected": -20.368961334228516, "sft_loss": 2.283703565597534, "step": 960 }, { "epoch": 1.585432639649507, "grad_norm": 8.539385336051494, "learning_rate": 5.6034866331352376e-08, "logits/chosen": 17.10059928894043, "logits/rejected": 15.988658905029297, "logps/chosen": -328.95880126953125, "logps/rejected": -276.1900939941406, "loss": 0.0789, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -11.029694557189941, "rewards/margins": 9.335869789123535, "rewards/rejected": -20.365564346313477, "sft_loss": 1.1840242147445679, "step": 965 }, { "epoch": 1.593647316538883, "grad_norm": 4.957273671741837, "learning_rate": 5.390779986171934e-08, "logits/chosen": 18.002132415771484, "logits/rejected": 16.781938552856445, "logps/chosen": -365.2828674316406, "logps/rejected": -290.2236328125, "loss": 0.0487, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -10.259432792663574, "rewards/margins": 9.27906322479248, "rewards/rejected": -19.538496017456055, "sft_loss": 1.1914342641830444, "step": 970 }, { "epoch": 1.6018619934282585, "grad_norm": 4.9428875327513655, "learning_rate": 5.1817001876777314e-08, "logits/chosen": 16.411977767944336, "logits/rejected": 16.428050994873047, "logps/chosen": -325.99688720703125, "logps/rejected": -282.6103515625, "loss": 0.0437, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.880454063415527, "rewards/margins": 9.455738067626953, "rewards/rejected": -19.336191177368164, "sft_loss": 1.2319772243499756, "step": 975 }, { "epoch": 1.6100766703176341, "grad_norm": 2.957939197451581, "learning_rate": 4.9762859082752464e-08, "logits/chosen": 18.82546043395996, "logits/rejected": 17.623178482055664, "logps/chosen": -363.7794494628906, "logps/rejected": -286.4556884765625, "loss": 0.0679, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -10.052456855773926, "rewards/margins": 9.68246841430664, "rewards/rejected": -19.73492431640625, "sft_loss": 1.0845558643341064, "step": 980 }, { "epoch": 1.61829134720701, "grad_norm": 4.380929225854475, "learning_rate": 4.774575140626316e-08, "logits/chosen": 16.805063247680664, "logits/rejected": 16.495389938354492, "logps/chosen": -326.19549560546875, "logps/rejected": -266.20477294921875, "loss": 0.0416, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.56114673614502, "rewards/margins": 9.003406524658203, "rewards/rejected": -18.56455421447754, "sft_loss": 1.1449401378631592, "step": 985 }, { "epoch": 1.6265060240963856, "grad_norm": 3.1682867458550255, "learning_rate": 4.5766051924049975e-08, "logits/chosen": 19.578977584838867, "logits/rejected": 18.58966064453125, "logps/chosen": -329.7391357421875, "logps/rejected": -268.65899658203125, "loss": 0.078, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.10643482208252, "rewards/margins": 9.36843204498291, "rewards/rejected": -18.474864959716797, "sft_loss": 1.255157470703125, "step": 990 }, { "epoch": 1.6347207009857612, "grad_norm": 0.6942420370445791, "learning_rate": 4.3824126793972934e-08, "logits/chosen": 17.492353439331055, "logits/rejected": 16.303081512451172, "logps/chosen": -363.3464050292969, "logps/rejected": -277.27960205078125, "loss": 0.0653, "rewards/accuracies": 1.0, "rewards/chosen": -8.666760444641113, "rewards/margins": 10.318286895751953, "rewards/rejected": -18.98504638671875, "sft_loss": 1.1402015686035156, "step": 995 }, { "epoch": 1.642935377875137, "grad_norm": 3.2190001250289964, "learning_rate": 4.192033518728819e-08, "logits/chosen": 17.813274383544922, "logits/rejected": 16.33589744567871, "logps/chosen": -336.6969909667969, "logps/rejected": -259.0225830078125, "loss": 0.0455, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -8.958456993103027, "rewards/margins": 8.632108688354492, "rewards/rejected": -17.590564727783203, "sft_loss": 1.3451780080795288, "step": 1000 }, { "epoch": 1.6511500547645126, "grad_norm": 7.393954413499783, "learning_rate": 4.0055029222217125e-08, "logits/chosen": 17.960172653198242, "logits/rejected": 16.416893005371094, "logps/chosen": -315.2778015136719, "logps/rejected": -254.21824645996094, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": -9.643136978149414, "rewards/margins": 8.79008960723877, "rewards/rejected": -18.4332275390625, "sft_loss": 1.1033631563186646, "step": 1005 }, { "epoch": 1.6593647316538882, "grad_norm": 4.50975910962959, "learning_rate": 3.8228553898819904e-08, "logits/chosen": 19.406719207763672, "logits/rejected": 18.56427001953125, "logps/chosen": -345.2118835449219, "logps/rejected": -286.46697998046875, "loss": 0.0704, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -10.453201293945312, "rewards/margins": 9.429792404174805, "rewards/rejected": -19.882991790771484, "sft_loss": 1.1480904817581177, "step": 1010 }, { "epoch": 1.667579408543264, "grad_norm": 2.421328969282485, "learning_rate": 3.6441247035185416e-08, "logits/chosen": 18.4009952545166, "logits/rejected": 17.309907913208008, "logps/chosen": -386.3546447753906, "logps/rejected": -297.7645568847656, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": -10.208230972290039, "rewards/margins": 10.017862319946289, "rewards/rejected": -20.22609519958496, "sft_loss": 1.1494494676589966, "step": 1015 }, { "epoch": 1.6757940854326396, "grad_norm": 3.7694600773057583, "learning_rate": 3.4693439204949855e-08, "logits/chosen": 17.161108016967773, "logits/rejected": 16.785768508911133, "logps/chosen": -296.5499267578125, "logps/rejected": -256.6122741699219, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": -9.63855266571045, "rewards/margins": 8.829416275024414, "rewards/rejected": -18.467967987060547, "sft_loss": 1.2135652303695679, "step": 1020 }, { "epoch": 1.6840087623220152, "grad_norm": 7.949500193131256, "learning_rate": 3.298545367615493e-08, "logits/chosen": 18.552576065063477, "logits/rejected": 17.31821060180664, "logps/chosen": -295.35540771484375, "logps/rejected": -242.86293029785156, "loss": 0.0854, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -9.394362449645996, "rewards/margins": 7.929934501647949, "rewards/rejected": -17.324296951293945, "sft_loss": 1.2392216920852661, "step": 1025 }, { "epoch": 1.692223439211391, "grad_norm": 5.638532817532561, "learning_rate": 3.13176063514575e-08, "logits/chosen": 18.253822326660156, "logits/rejected": 17.51996421813965, "logps/chosen": -363.8645935058594, "logps/rejected": -290.9195556640625, "loss": 0.0719, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.809922218322754, "rewards/margins": 10.452852249145508, "rewards/rejected": -20.26277732849121, "sft_loss": 1.302940845489502, "step": 1030 }, { "epoch": 1.7004381161007667, "grad_norm": 4.692786337225903, "learning_rate": 2.96902057097011e-08, "logits/chosen": 18.16409683227539, "logits/rejected": 17.41961669921875, "logps/chosen": -346.486328125, "logps/rejected": -269.6810607910156, "loss": 0.0603, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.241110801696777, "rewards/margins": 9.675481796264648, "rewards/rejected": -18.916593551635742, "sft_loss": 1.3590866327285767, "step": 1035 }, { "epoch": 1.7086527929901423, "grad_norm": 5.302948433169166, "learning_rate": 2.8103552748861475e-08, "logits/chosen": 17.655193328857422, "logits/rejected": 16.845975875854492, "logps/chosen": -338.5050354003906, "logps/rejected": -274.38592529296875, "loss": 0.0456, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.701102256774902, "rewards/margins": 9.212477684020996, "rewards/rejected": -18.9135799407959, "sft_loss": 1.1541944742202759, "step": 1040 }, { "epoch": 1.716867469879518, "grad_norm": 7.958361212469992, "learning_rate": 2.65579409303745e-08, "logits/chosen": 18.777873992919922, "logits/rejected": 17.120763778686523, "logps/chosen": -387.65240478515625, "logps/rejected": -283.70098876953125, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": -9.420891761779785, "rewards/margins": 10.373385429382324, "rewards/rejected": -19.79427719116211, "sft_loss": 1.1774132251739502, "step": 1045 }, { "epoch": 1.7250821467688937, "grad_norm": 9.531573914482877, "learning_rate": 2.505365612485874e-08, "logits/chosen": 17.264955520629883, "logits/rejected": 14.986218452453613, "logps/chosen": -337.1243896484375, "logps/rejected": -248.45452880859375, "loss": 0.0714, "rewards/accuracies": 0.9333333373069763, "rewards/chosen": -9.543818473815918, "rewards/margins": 7.988076686859131, "rewards/rejected": -17.53189468383789, "sft_loss": 1.5333364009857178, "step": 1050 }, { "epoch": 1.7332968236582693, "grad_norm": 4.137336908358683, "learning_rate": 2.3590976559242275e-08, "logits/chosen": 17.53969383239746, "logits/rejected": 17.553749084472656, "logps/chosen": -310.6015930175781, "logps/rejected": -270.8495178222656, "loss": 0.0577, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.537567138671875, "rewards/margins": 8.755559921264648, "rewards/rejected": -18.29313087463379, "sft_loss": 1.243970513343811, "step": 1055 }, { "epoch": 1.7415115005476451, "grad_norm": 4.1742977406410295, "learning_rate": 2.21701727653025e-08, "logits/chosen": 17.685638427734375, "logits/rejected": 15.837899208068848, "logps/chosen": -373.8312683105469, "logps/rejected": -280.65020751953125, "loss": 0.0635, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -10.465250968933105, "rewards/margins": 9.272075653076172, "rewards/rejected": -19.73732566833496, "sft_loss": 1.230289101600647, "step": 1060 }, { "epoch": 1.749726177437021, "grad_norm": 3.5905659893317434, "learning_rate": 2.0791507529629522e-08, "logits/chosen": 17.48339080810547, "logits/rejected": 17.49197006225586, "logps/chosen": -282.5596008300781, "logps/rejected": -230.78216552734375, "loss": 0.0617, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -8.903531074523926, "rewards/margins": 7.071009635925293, "rewards/rejected": -15.974542617797852, "sft_loss": 1.1227587461471558, "step": 1065 }, { "epoch": 1.7579408543263964, "grad_norm": 13.383809225376258, "learning_rate": 1.945523584502262e-08, "logits/chosen": 19.799388885498047, "logits/rejected": 17.97065544128418, "logps/chosen": -394.8106384277344, "logps/rejected": -280.33184814453125, "loss": 0.0572, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.54643726348877, "rewards/margins": 9.38154125213623, "rewards/rejected": -18.927978515625, "sft_loss": 1.0600664615631104, "step": 1070 }, { "epoch": 1.7661555312157722, "grad_norm": 3.85892797302041, "learning_rate": 1.8161604863327072e-08, "logits/chosen": 16.801225662231445, "logits/rejected": 16.272432327270508, "logps/chosen": -315.7366943359375, "logps/rejected": -254.6013946533203, "loss": 0.0406, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.108306884765625, "rewards/margins": 8.766523361206055, "rewards/rejected": -17.874826431274414, "sft_loss": 1.1722328662872314, "step": 1075 }, { "epoch": 1.774370208105148, "grad_norm": 18.187119238883547, "learning_rate": 1.691085384972235e-08, "logits/chosen": 16.329721450805664, "logits/rejected": 15.469901084899902, "logps/chosen": -283.27142333984375, "logps/rejected": -235.70811462402344, "loss": 0.0575, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.012235641479492, "rewards/margins": 7.615647792816162, "rewards/rejected": -16.62788200378418, "sft_loss": 1.2651445865631104, "step": 1080 }, { "epoch": 1.7825848849945234, "grad_norm": 3.4104772768917426, "learning_rate": 1.570321413846845e-08, "logits/chosen": 16.62438201904297, "logits/rejected": 17.102264404296875, "logps/chosen": -312.1449279785156, "logps/rejected": -271.4256591796875, "loss": 0.0299, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.285161018371582, "rewards/margins": 9.69771957397461, "rewards/rejected": -18.982881546020508, "sft_loss": 1.2524992227554321, "step": 1085 }, { "epoch": 1.7907995618838992, "grad_norm": 4.409471745750223, "learning_rate": 1.4538909090118846e-08, "logits/chosen": 18.32159423828125, "logits/rejected": 16.56727409362793, "logps/chosen": -338.2820129394531, "logps/rejected": -250.31729125976562, "loss": 0.0607, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.47223949432373, "rewards/margins": 8.058004379272461, "rewards/rejected": -17.530242919921875, "sft_loss": 1.2121272087097168, "step": 1090 }, { "epoch": 1.799014238773275, "grad_norm": 3.827920678689443, "learning_rate": 1.3418154050208936e-08, "logits/chosen": 17.697154998779297, "logits/rejected": 16.722187042236328, "logps/chosen": -303.7814636230469, "logps/rejected": -251.51583862304688, "loss": 0.0563, "rewards/accuracies": 0.9466667175292969, "rewards/chosen": -9.34262466430664, "rewards/margins": 8.22611141204834, "rewards/rejected": -17.568735122680664, "sft_loss": 1.133971095085144, "step": 1095 }, { "epoch": 1.8072289156626506, "grad_norm": 2.3604810623566084, "learning_rate": 1.2341156309426447e-08, "logits/chosen": 16.974082946777344, "logits/rejected": 16.85063362121582, "logps/chosen": -346.72998046875, "logps/rejected": -278.26165771484375, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": -9.343656539916992, "rewards/margins": 9.743067741394043, "rewards/rejected": -19.086727142333984, "sft_loss": 1.044119954109192, "step": 1100 }, { "epoch": 1.8154435925520263, "grad_norm": 3.1591505712032237, "learning_rate": 1.130811506527149e-08, "logits/chosen": 18.949861526489258, "logits/rejected": 17.834264755249023, "logps/chosen": -389.62042236328125, "logps/rejected": -280.9618835449219, "loss": 0.0328, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.740713119506836, "rewards/margins": 9.123427391052246, "rewards/rejected": -18.8641414642334, "sft_loss": 1.1095327138900757, "step": 1105 }, { "epoch": 1.823658269441402, "grad_norm": 1.034828289458216, "learning_rate": 1.0319221385213934e-08, "logits/chosen": 17.111549377441406, "logits/rejected": 16.737529754638672, "logps/chosen": -317.83343505859375, "logps/rejected": -262.7276916503906, "loss": 0.0392, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.57455825805664, "rewards/margins": 8.217500686645508, "rewards/rejected": -17.792057037353516, "sft_loss": 1.276735782623291, "step": 1110 }, { "epoch": 1.8318729463307777, "grad_norm": 5.969957340964575, "learning_rate": 9.374658171354411e-09, "logits/chosen": 18.033985137939453, "logits/rejected": 16.812503814697266, "logps/chosen": -338.47796630859375, "logps/rejected": -267.0741271972656, "loss": 0.066, "rewards/accuracies": 0.9466666579246521, "rewards/chosen": -9.4961519241333, "rewards/margins": 9.333700180053711, "rewards/rejected": -18.829853057861328, "sft_loss": 1.2590656280517578, "step": 1115 }, { "epoch": 1.8400876232201533, "grad_norm": 3.650713168539723, "learning_rate": 8.474600126594983e-09, "logits/chosen": 18.25022315979004, "logits/rejected": 17.499792098999023, "logps/chosen": -332.2896728515625, "logps/rejected": -259.7283935546875, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": -8.764142990112305, "rewards/margins": 9.385050773620605, "rewards/rejected": -18.149192810058594, "sft_loss": 1.2872370481491089, "step": 1120 }, { "epoch": 1.8483023001095291, "grad_norm": 3.519511198290722, "learning_rate": 7.619213722327184e-09, "logits/chosen": 17.54939842224121, "logits/rejected": 16.605783462524414, "logps/chosen": -330.8531799316406, "logps/rejected": -265.7262878417969, "loss": 0.0435, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.460249900817871, "rewards/margins": 9.11566162109375, "rewards/rejected": -18.575910568237305, "sft_loss": 1.2160968780517578, "step": 1125 }, { "epoch": 1.8565169769989047, "grad_norm": 4.185481328436663, "learning_rate": 6.808657167641896e-09, "logits/chosen": 17.643688201904297, "logits/rejected": 16.674888610839844, "logps/chosen": -361.64715576171875, "logps/rejected": -284.84881591796875, "loss": 0.0669, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.835926055908203, "rewards/margins": 10.017841339111328, "rewards/rejected": -19.8537654876709, "sft_loss": 1.1817443370819092, "step": 1130 }, { "epoch": 1.8647316538882803, "grad_norm": 5.8031765508458095, "learning_rate": 6.043080380067539e-09, "logits/chosen": 16.346317291259766, "logits/rejected": 16.2381649017334, "logps/chosen": -378.3958435058594, "logps/rejected": -303.12091064453125, "loss": 0.0523, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.526629447937012, "rewards/margins": 11.506245613098145, "rewards/rejected": -21.03287696838379, "sft_loss": 1.1959699392318726, "step": 1135 }, { "epoch": 1.8729463307776562, "grad_norm": 8.21221020965199, "learning_rate": 5.322624957841998e-09, "logits/chosen": 18.25189971923828, "logits/rejected": 17.553903579711914, "logps/chosen": -341.5302734375, "logps/rejected": -272.32012939453125, "loss": 0.0601, "rewards/accuracies": 1.0, "rewards/chosen": -9.903505325317383, "rewards/margins": 8.774069786071777, "rewards/rejected": -18.677574157714844, "sft_loss": 1.110214114189148, "step": 1140 }, { "epoch": 1.8811610076670318, "grad_norm": 1.2597947012426969, "learning_rate": 4.647424153723101e-09, "logits/chosen": 18.352624893188477, "logits/rejected": 16.199268341064453, "logps/chosen": -322.0838928222656, "logps/rejected": -241.83558654785156, "loss": 0.0757, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.200777053833008, "rewards/margins": 7.505552768707275, "rewards/rejected": -16.706331253051758, "sft_loss": 1.2215784788131714, "step": 1145 }, { "epoch": 1.8893756845564074, "grad_norm": 4.292246600774085, "learning_rate": 4.0176028503425826e-09, "logits/chosen": 17.103918075561523, "logits/rejected": 16.931406021118164, "logps/chosen": -311.3345642089844, "logps/rejected": -262.2460021972656, "loss": 0.049, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.341358184814453, "rewards/margins": 9.20698070526123, "rewards/rejected": -18.548337936401367, "sft_loss": 1.2602629661560059, "step": 1150 }, { "epoch": 1.8975903614457832, "grad_norm": 3.9316412520602233, "learning_rate": 3.433277537108481e-09, "logits/chosen": 18.07435417175293, "logits/rejected": 17.416156768798828, "logps/chosen": -373.20477294921875, "logps/rejected": -284.2578430175781, "loss": 0.0481, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -10.183159828186035, "rewards/margins": 8.937155723571777, "rewards/rejected": -19.120319366455078, "sft_loss": 1.2882879972457886, "step": 1155 }, { "epoch": 1.9058050383351588, "grad_norm": 4.002247931370384, "learning_rate": 2.8945562886593944e-09, "logits/chosen": 16.535991668701172, "logits/rejected": 15.887761116027832, "logps/chosen": -281.4147033691406, "logps/rejected": -248.97198486328125, "loss": 0.0572, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.560811996459961, "rewards/margins": 8.152007102966309, "rewards/rejected": -17.712818145751953, "sft_loss": 1.1380819082260132, "step": 1160 }, { "epoch": 1.9140197152245344, "grad_norm": 6.09926805017728, "learning_rate": 2.4015387448756976e-09, "logits/chosen": 16.622642517089844, "logits/rejected": 15.969447135925293, "logps/chosen": -354.6962890625, "logps/rejected": -264.1971435546875, "loss": 0.0698, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.276373863220215, "rewards/margins": 9.219854354858398, "rewards/rejected": -18.496227264404297, "sft_loss": 1.35190749168396, "step": 1165 }, { "epoch": 1.9222343921139102, "grad_norm": 2.2008981068000257, "learning_rate": 1.954316092450281e-09, "logits/chosen": 17.52834129333496, "logits/rejected": 16.535673141479492, "logps/chosen": -338.9072570800781, "logps/rejected": -272.974853515625, "loss": 0.0483, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -10.078722953796387, "rewards/margins": 8.57765007019043, "rewards/rejected": -18.6563720703125, "sft_loss": 1.2665674686431885, "step": 1170 }, { "epoch": 1.9304490690032858, "grad_norm": 6.860255096147663, "learning_rate": 1.5529710480231272e-09, "logits/chosen": 18.16209602355957, "logits/rejected": 17.17742347717285, "logps/chosen": -305.7500305175781, "logps/rejected": -253.6704864501953, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": -9.595235824584961, "rewards/margins": 8.096555709838867, "rewards/rejected": -17.69179344177246, "sft_loss": 1.0956637859344482, "step": 1175 }, { "epoch": 1.9386637458926614, "grad_norm": 5.583742228828171, "learning_rate": 1.1975778428823524e-09, "logits/chosen": 17.371732711791992, "logits/rejected": 16.7208309173584, "logps/chosen": -353.3210754394531, "logps/rejected": -281.03302001953125, "loss": 0.0718, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -10.14670467376709, "rewards/margins": 9.053946495056152, "rewards/rejected": -19.200651168823242, "sft_loss": 1.1084918975830078, "step": 1180 }, { "epoch": 1.9468784227820373, "grad_norm": 7.482058530142954, "learning_rate": 8.882022092346064e-10, "logits/chosen": 17.550270080566406, "logits/rejected": 16.700857162475586, "logps/chosen": -347.03363037109375, "logps/rejected": -273.7410583496094, "loss": 0.0371, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.479987144470215, "rewards/margins": 9.481571197509766, "rewards/rejected": -18.961559295654297, "sft_loss": 1.2440842390060425, "step": 1185 }, { "epoch": 1.9550930996714129, "grad_norm": 2.060017516908208, "learning_rate": 6.249013680474368e-10, "logits/chosen": 17.62407875061035, "logits/rejected": 16.294307708740234, "logps/chosen": -312.6640625, "logps/rejected": -252.1957550048828, "loss": 0.042, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.584585189819336, "rewards/margins": 7.968944072723389, "rewards/rejected": -17.553531646728516, "sft_loss": 1.1957759857177734, "step": 1190 }, { "epoch": 1.9633077765607885, "grad_norm": 8.70865898012454, "learning_rate": 4.0772401846608794e-10, "logits/chosen": 18.594797134399414, "logits/rejected": 17.494707107543945, "logps/chosen": -298.91571044921875, "logps/rejected": -247.17645263671875, "loss": 0.0791, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.819672584533691, "rewards/margins": 7.458762168884277, "rewards/rejected": -17.27843475341797, "sft_loss": 1.1721436977386475, "step": 1195 }, { "epoch": 1.9715224534501643, "grad_norm": 8.143555814452077, "learning_rate": 2.367103288061223e-10, "logits/chosen": 17.42876625061035, "logits/rejected": 16.18610191345215, "logps/chosen": -311.3353271484375, "logps/rejected": -255.815673828125, "loss": 0.057, "rewards/accuracies": 1.0, "rewards/chosen": -9.853960990905762, "rewards/margins": 8.45102310180664, "rewards/rejected": -18.304983139038086, "sft_loss": 1.2459405660629272, "step": 1200 }, { "epoch": 1.9797371303395401, "grad_norm": 3.1264960747238333, "learning_rate": 1.1189192912416933e-10, "logits/chosen": 17.570209503173828, "logits/rejected": 16.410261154174805, "logps/chosen": -398.1352233886719, "logps/rejected": -293.1958923339844, "loss": 0.0365, "rewards/accuracies": 0.9866666793823242, "rewards/chosen": -9.722556114196777, "rewards/margins": 10.323168754577637, "rewards/rejected": -20.045726776123047, "sft_loss": 1.2017709016799927, "step": 1205 }, { "epoch": 1.9879518072289155, "grad_norm": 4.393395171762388, "learning_rate": 3.329190536757731e-11, "logits/chosen": 18.872753143310547, "logits/rejected": 18.477998733520508, "logps/chosen": -312.2131042480469, "logps/rejected": -260.5821228027344, "loss": 0.0535, "rewards/accuracies": 0.9733333587646484, "rewards/chosen": -9.505008697509766, "rewards/margins": 9.130599021911621, "rewards/rejected": -18.635608673095703, "sft_loss": 1.1339497566223145, "step": 1210 }, { "epoch": 1.9961664841182913, "grad_norm": 5.10635428256011, "learning_rate": 9.247951046897906e-13, "logits/chosen": 18.22831916809082, "logits/rejected": 17.80943489074707, "logps/chosen": -330.88934326171875, "logps/rejected": -262.41461181640625, "loss": 0.0471, "rewards/accuracies": 0.9600000381469727, "rewards/chosen": -9.283769607543945, "rewards/margins": 8.581202507019043, "rewards/rejected": -17.864973068237305, "sft_loss": 1.1466354131698608, "step": 1215 }, { "epoch": 1.9978094194961664, "step": 1216, "total_flos": 131893560147968.0, "train_loss": 0.08742370063708604, "train_runtime": 41055.7102, "train_samples_per_second": 1.778, "train_steps_per_second": 0.03 } ], "logging_steps": 5, "max_steps": 1216, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 131893560147968.0, "train_batch_size": 5, "trial_name": null, "trial_params": null }