diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3930 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9978094194961664, + "eval_steps": 50000, + "global_step": 1216, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008214676889375685, + "grad_norm": 28.31659169550249, + "learning_rate": 4.0983606557377046e-08, + "logits/chosen": 27.184185028076172, + "logits/rejected": 25.856258392333984, + "logps/chosen": -244.33399963378906, + "logps/rejected": -79.7464828491211, + "loss": 0.6863, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.015291815623641014, + "rewards/margins": 0.03226657956838608, + "rewards/rejected": -0.016974765807390213, + "sft_loss": 0.6384800672531128, + "step": 5 + }, + { + "epoch": 0.01642935377875137, + "grad_norm": 23.954861556073595, + "learning_rate": 8.196721311475409e-08, + "logits/chosen": 26.661327362060547, + "logits/rejected": 25.39139175415039, + "logps/chosen": -207.91012573242188, + "logps/rejected": -72.70105743408203, + "loss": 0.6278, + "rewards/accuracies": 0.786666750907898, + "rewards/chosen": -0.03712935373187065, + "rewards/margins": 0.1326407492160797, + "rewards/rejected": -0.16977010667324066, + "sft_loss": 0.6469722986221313, + "step": 10 + }, + { + "epoch": 0.024644030668127054, + "grad_norm": 12.243904208611285, + "learning_rate": 1.2295081967213113e-07, + "logits/chosen": 27.410442352294922, + "logits/rejected": 26.34275245666504, + "logps/chosen": -211.9379119873047, + "logps/rejected": -84.06497192382812, + "loss": 0.4689, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -0.15980161726474762, + "rewards/margins": 0.5716416239738464, + "rewards/rejected": -0.7314431667327881, + "sft_loss": 0.6353262662887573, + "step": 15 + }, + { + "epoch": 0.03285870755750274, + "grad_norm": 9.287884891913714, + "learning_rate": 1.6393442622950818e-07, + "logits/chosen": 27.55664825439453, + "logits/rejected": 26.192903518676758, + "logps/chosen": -280.5326843261719, + "logps/rejected": -112.344970703125, + "loss": 0.3581, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -0.6804031729698181, + "rewards/margins": 1.3235772848129272, + "rewards/rejected": -2.003980875015259, + "sft_loss": 0.7250083684921265, + "step": 20 + }, + { + "epoch": 0.04107338444687842, + "grad_norm": 7.405267430642346, + "learning_rate": 2.0491803278688524e-07, + "logits/chosen": 26.403255462646484, + "logits/rejected": 25.545373916625977, + "logps/chosen": -249.10903930664062, + "logps/rejected": -114.06135559082031, + "loss": 0.3047, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -1.3252760171890259, + "rewards/margins": 1.8669867515563965, + "rewards/rejected": -3.1922624111175537, + "sft_loss": 0.7239670157432556, + "step": 25 + }, + { + "epoch": 0.04928806133625411, + "grad_norm": 4.80122081598555, + "learning_rate": 2.4590163934426226e-07, + "logits/chosen": 25.064653396606445, + "logits/rejected": 24.138179779052734, + "logps/chosen": -273.2266540527344, + "logps/rejected": -129.16700744628906, + "loss": 0.2507, + "rewards/accuracies": 0.9333333969116211, + "rewards/chosen": -1.8779884576797485, + "rewards/margins": 2.8930463790893555, + "rewards/rejected": -4.7710347175598145, + "sft_loss": 0.7124671936035156, + "step": 30 + }, + { + "epoch": 0.05750273822562979, + "grad_norm": 4.89445256463419, + "learning_rate": 2.868852459016393e-07, + "logits/chosen": 23.180265426635742, + "logits/rejected": 22.45435905456543, + "logps/chosen": -281.69757080078125, + "logps/rejected": -149.479248046875, + "loss": 0.2291, + "rewards/accuracies": 0.9466666579246521, + "rewards/chosen": -2.5409016609191895, + "rewards/margins": 3.659693956375122, + "rewards/rejected": -6.200596809387207, + "sft_loss": 0.78533536195755, + "step": 35 + }, + { + "epoch": 0.06571741511500548, + "grad_norm": 5.60278997158879, + "learning_rate": 3.2786885245901637e-07, + "logits/chosen": 21.637163162231445, + "logits/rejected": 21.079978942871094, + "logps/chosen": -259.0185546875, + "logps/rejected": -145.42489624023438, + "loss": 0.2061, + "rewards/accuracies": 0.9333333373069763, + "rewards/chosen": -2.746673583984375, + "rewards/margins": 3.7938296794891357, + "rewards/rejected": -6.54050350189209, + "sft_loss": 0.7671460509300232, + "step": 40 + }, + { + "epoch": 0.07393209200438117, + "grad_norm": 4.3242655896865045, + "learning_rate": 3.6885245901639347e-07, + "logits/chosen": 22.72602653503418, + "logits/rejected": 21.949237823486328, + "logps/chosen": -265.2069396972656, + "logps/rejected": -151.65060424804688, + "loss": 0.1803, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -3.016242265701294, + "rewards/margins": 4.000359058380127, + "rewards/rejected": -7.016600608825684, + "sft_loss": 0.7528119683265686, + "step": 45 + }, + { + "epoch": 0.08214676889375684, + "grad_norm": 4.333490981221177, + "learning_rate": 4.0983606557377047e-07, + "logits/chosen": 22.819128036499023, + "logits/rejected": 21.5169620513916, + "logps/chosen": -287.1695251464844, + "logps/rejected": -152.8419647216797, + "loss": 0.1609, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -3.535120725631714, + "rewards/margins": 3.6887311935424805, + "rewards/rejected": -7.223852157592773, + "sft_loss": 0.7424061894416809, + "step": 50 + }, + { + "epoch": 0.09036144578313253, + "grad_norm": 3.8031629734590995, + "learning_rate": 4.508196721311475e-07, + "logits/chosen": 22.994489669799805, + "logits/rejected": 21.521961212158203, + "logps/chosen": -276.05242919921875, + "logps/rejected": -162.29678344726562, + "loss": 0.1759, + "rewards/accuracies": 0.9333333373069763, + "rewards/chosen": -3.299473762512207, + "rewards/margins": 4.333623886108398, + "rewards/rejected": -7.6330976486206055, + "sft_loss": 0.7973353266716003, + "step": 55 + }, + { + "epoch": 0.09857612267250822, + "grad_norm": 2.9463104781172405, + "learning_rate": 4.918032786885245e-07, + "logits/chosen": 23.39748764038086, + "logits/rejected": 22.529293060302734, + "logps/chosen": -226.25714111328125, + "logps/rejected": -133.46775817871094, + "loss": 0.166, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -2.8591995239257812, + "rewards/margins": 3.5714006423950195, + "rewards/rejected": -6.430600166320801, + "sft_loss": 0.7205591797828674, + "step": 60 + }, + { + "epoch": 0.10679079956188389, + "grad_norm": 5.472434832813438, + "learning_rate": 4.999852034151641e-07, + "logits/chosen": 21.749300003051758, + "logits/rejected": 21.059572219848633, + "logps/chosen": -288.58892822265625, + "logps/rejected": -163.43212890625, + "loss": 0.1564, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -3.1936755180358887, + "rewards/margins": 4.4503631591796875, + "rewards/rejected": -7.644038200378418, + "sft_loss": 0.8240499496459961, + "step": 65 + }, + { + "epoch": 0.11500547645125958, + "grad_norm": 2.7329123191165547, + "learning_rate": 4.999250952911133e-07, + "logits/chosen": 23.737056732177734, + "logits/rejected": 22.234569549560547, + "logps/chosen": -282.01953125, + "logps/rejected": -153.94252014160156, + "loss": 0.1336, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -3.0082314014434814, + "rewards/margins": 4.5703325271606445, + "rewards/rejected": -7.5785627365112305, + "sft_loss": 0.8242512345314026, + "step": 70 + }, + { + "epoch": 0.12322015334063527, + "grad_norm": 48.59604948359565, + "learning_rate": 4.998187619501184e-07, + "logits/chosen": 23.999074935913086, + "logits/rejected": 22.998958587646484, + "logps/chosen": -320.11505126953125, + "logps/rejected": -178.56219482421875, + "loss": 0.1269, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -3.2927637100219727, + "rewards/margins": 5.467880725860596, + "rewards/rejected": -8.760644912719727, + "sft_loss": 0.8822128772735596, + "step": 75 + }, + { + "epoch": 0.13143483023001096, + "grad_norm": 34.18732885169035, + "learning_rate": 4.996662230591989e-07, + "logits/chosen": 21.339969635009766, + "logits/rejected": 20.508026123046875, + "logps/chosen": -297.17633056640625, + "logps/rejected": -178.69137573242188, + "loss": 0.1336, + "rewards/accuracies": 0.9200000762939453, + "rewards/chosen": -3.8860669136047363, + "rewards/margins": 5.22913932800293, + "rewards/rejected": -9.115203857421875, + "sft_loss": 0.8318250179290771, + "step": 80 + }, + { + "epoch": 0.13964950711938665, + "grad_norm": 3.158882002153525, + "learning_rate": 4.994675068313813e-07, + "logits/chosen": 21.070241928100586, + "logits/rejected": 20.90212631225586, + "logps/chosen": -286.97998046875, + "logps/rejected": -186.49452209472656, + "loss": 0.1122, + "rewards/accuracies": 0.9200000762939453, + "rewards/chosen": -4.223217010498047, + "rewards/margins": 5.617285251617432, + "rewards/rejected": -9.84050178527832, + "sft_loss": 0.8361734747886658, + "step": 85 + }, + { + "epoch": 0.14786418400876233, + "grad_norm": 3.2825430282007164, + "learning_rate": 4.992226500204806e-07, + "logits/chosen": 22.27889060974121, + "logits/rejected": 21.50484275817871, + "logps/chosen": -287.25439453125, + "logps/rejected": -167.19528198242188, + "loss": 0.1182, + "rewards/accuracies": 0.9466666579246521, + "rewards/chosen": -3.6078903675079346, + "rewards/margins": 5.157074451446533, + "rewards/rejected": -8.764965057373047, + "sft_loss": 0.865871787071228, + "step": 90 + }, + { + "epoch": 0.156078860898138, + "grad_norm": 4.571628146800866, + "learning_rate": 4.989316979143029e-07, + "logits/chosen": 21.98550796508789, + "logits/rejected": 20.47240447998047, + "logps/chosen": -276.35479736328125, + "logps/rejected": -154.84384155273438, + "loss": 0.1667, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -3.3969411849975586, + "rewards/margins": 4.759787559509277, + "rewards/rejected": -8.156728744506836, + "sft_loss": 0.8606770038604736, + "step": 95 + }, + { + "epoch": 0.16429353778751368, + "grad_norm": 5.767989035541414, + "learning_rate": 4.985947043262686e-07, + "logits/chosen": 20.512102127075195, + "logits/rejected": 19.77199935913086, + "logps/chosen": -282.6247863769531, + "logps/rejected": -166.79330444335938, + "loss": 0.1454, + "rewards/accuracies": 0.9333333373069763, + "rewards/chosen": -3.124844551086426, + "rewards/margins": 5.334622383117676, + "rewards/rejected": -8.459466934204102, + "sft_loss": 0.8327052593231201, + "step": 100 + }, + { + "epoch": 0.17250821467688937, + "grad_norm": 2.592654441763786, + "learning_rate": 4.982117315854593e-07, + "logits/chosen": 20.459001541137695, + "logits/rejected": 20.174640655517578, + "logps/chosen": -248.44091796875, + "logps/rejected": -155.70086669921875, + "loss": 0.1293, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -2.7995762825012207, + "rewards/margins": 4.992871284484863, + "rewards/rejected": -7.792448043823242, + "sft_loss": 0.8232018351554871, + "step": 105 + }, + { + "epoch": 0.18072289156626506, + "grad_norm": 3.061900848647029, + "learning_rate": 4.977828505250903e-07, + "logits/chosen": 20.415132522583008, + "logits/rejected": 19.290271759033203, + "logps/chosen": -247.19740295410156, + "logps/rejected": -146.08677673339844, + "loss": 0.1359, + "rewards/accuracies": 0.9466666579246521, + "rewards/chosen": -3.0541529655456543, + "rewards/margins": 4.263410568237305, + "rewards/rejected": -7.317563056945801, + "sft_loss": 0.8086569309234619, + "step": 110 + }, + { + "epoch": 0.18893756845564075, + "grad_norm": 3.8428826764124495, + "learning_rate": 4.973081404694087e-07, + "logits/chosen": 19.876977920532227, + "logits/rejected": 19.5655574798584, + "logps/chosen": -273.8357849121094, + "logps/rejected": -173.55661010742188, + "loss": 0.1152, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -3.349548578262329, + "rewards/margins": 5.433572292327881, + "rewards/rejected": -8.783121109008789, + "sft_loss": 0.8695060610771179, + "step": 115 + }, + { + "epoch": 0.19715224534501644, + "grad_norm": 3.5510462679770107, + "learning_rate": 4.967876892190227e-07, + "logits/chosen": 21.53512191772461, + "logits/rejected": 19.82799530029297, + "logps/chosen": -300.6681823730469, + "logps/rejected": -162.72463989257812, + "loss": 0.1214, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -3.324470043182373, + "rewards/margins": 5.188758850097656, + "rewards/rejected": -8.513228416442871, + "sft_loss": 0.8628395199775696, + "step": 120 + }, + { + "epoch": 0.20536692223439212, + "grad_norm": 2.7722703582939263, + "learning_rate": 4.962215930346614e-07, + "logits/chosen": 20.72852897644043, + "logits/rejected": 19.19593048095703, + "logps/chosen": -277.3075256347656, + "logps/rejected": -168.2571258544922, + "loss": 0.1007, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -3.5354294776916504, + "rewards/margins": 5.148807048797607, + "rewards/rejected": -8.684236526489258, + "sft_loss": 0.8580695986747742, + "step": 125 + }, + { + "epoch": 0.21358159912376778, + "grad_norm": 2.509695712976081, + "learning_rate": 4.956099566193716e-07, + "logits/chosen": 19.78179931640625, + "logits/rejected": 18.386613845825195, + "logps/chosen": -292.8381042480469, + "logps/rejected": -179.89065551757812, + "loss": 0.0878, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8737540245056152, + "rewards/margins": 5.42431116104126, + "rewards/rejected": -9.298065185546875, + "sft_loss": 0.8839088678359985, + "step": 130 + }, + { + "epoch": 0.22179627601314347, + "grad_norm": 3.4066444126067936, + "learning_rate": 4.949528930991521e-07, + "logits/chosen": 20.118431091308594, + "logits/rejected": 18.660707473754883, + "logps/chosen": -300.8720703125, + "logps/rejected": -190.8058319091797, + "loss": 0.1162, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -4.312110900878906, + "rewards/margins": 6.067122936248779, + "rewards/rejected": -10.379232406616211, + "sft_loss": 0.8384607434272766, + "step": 135 + }, + { + "epoch": 0.23001095290251916, + "grad_norm": 2.863527702169043, + "learning_rate": 4.9425052400203e-07, + "logits/chosen": 19.4266300201416, + "logits/rejected": 18.600717544555664, + "logps/chosen": -289.1473693847656, + "logps/rejected": -183.9163360595703, + "loss": 0.1138, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -4.603145122528076, + "rewards/margins": 5.1031365394592285, + "rewards/rejected": -9.706281661987305, + "sft_loss": 0.8902355432510376, + "step": 140 + }, + { + "epoch": 0.23822562979189485, + "grad_norm": 4.705307692419688, + "learning_rate": 4.935029792355834e-07, + "logits/chosen": 19.58378791809082, + "logits/rejected": 18.723949432373047, + "logps/chosen": -288.09527587890625, + "logps/rejected": -178.56491088867188, + "loss": 0.0997, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -3.634652853012085, + "rewards/margins": 5.0857977867126465, + "rewards/rejected": -8.720452308654785, + "sft_loss": 0.8318749666213989, + "step": 145 + }, + { + "epoch": 0.24644030668127054, + "grad_norm": 4.443077031203506, + "learning_rate": 4.927103970629147e-07, + "logits/chosen": 19.900896072387695, + "logits/rejected": 18.748838424682617, + "logps/chosen": -296.9352111816406, + "logps/rejected": -197.29566955566406, + "loss": 0.1249, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -5.586284160614014, + "rewards/margins": 5.75706672668457, + "rewards/rejected": -11.343351364135742, + "sft_loss": 0.8322177529335022, + "step": 150 + }, + { + "epoch": 0.2546549835706462, + "grad_norm": 3.0620385223375557, + "learning_rate": 4.918729240770775e-07, + "logits/chosen": 19.462993621826172, + "logits/rejected": 19.144250869750977, + "logps/chosen": -282.6866455078125, + "logps/rejected": -189.68820190429688, + "loss": 0.1166, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -5.4352288246154785, + "rewards/margins": 6.065767288208008, + "rewards/rejected": -11.500997543334961, + "sft_loss": 0.9704034328460693, + "step": 155 + }, + { + "epoch": 0.2628696604600219, + "grad_norm": 3.5256501257886717, + "learning_rate": 4.909907151739633e-07, + "logits/chosen": 20.455379486083984, + "logits/rejected": 19.202545166015625, + "logps/chosen": -305.9945373535156, + "logps/rejected": -189.74273681640625, + "loss": 0.1238, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -4.548050403594971, + "rewards/margins": 6.132920265197754, + "rewards/rejected": -10.680971145629883, + "sft_loss": 0.8781507015228271, + "step": 160 + }, + { + "epoch": 0.2710843373493976, + "grad_norm": 5.636178726208259, + "learning_rate": 4.900639335236526e-07, + "logits/chosen": 20.459400177001953, + "logits/rejected": 19.524314880371094, + "logps/chosen": -278.4658508300781, + "logps/rejected": -170.0297088623047, + "loss": 0.1276, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -3.6279122829437256, + "rewards/margins": 5.202185153961182, + "rewards/rejected": -8.830097198486328, + "sft_loss": 0.8619104623794556, + "step": 165 + }, + { + "epoch": 0.2792990142387733, + "grad_norm": 3.839926119567715, + "learning_rate": 4.890927505402359e-07, + "logits/chosen": 18.842321395874023, + "logits/rejected": 18.251901626586914, + "logps/chosen": -251.1257781982422, + "logps/rejected": -161.70802307128906, + "loss": 0.1045, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -3.4545812606811523, + "rewards/margins": 5.017471790313721, + "rewards/rejected": -8.472052574157715, + "sft_loss": 0.7917510271072388, + "step": 170 + }, + { + "epoch": 0.28751369112814895, + "grad_norm": 3.2492164364318836, + "learning_rate": 4.880773458501089e-07, + "logits/chosen": 21.866779327392578, + "logits/rejected": 20.14942169189453, + "logps/chosen": -265.2092590332031, + "logps/rejected": -160.5437469482422, + "loss": 0.0838, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -3.7316997051239014, + "rewards/margins": 4.919445514678955, + "rewards/rejected": -8.651144981384277, + "sft_loss": 0.8359836339950562, + "step": 175 + }, + { + "epoch": 0.29572836801752467, + "grad_norm": 2.829467767903507, + "learning_rate": 4.870179072587498e-07, + "logits/chosen": 19.415096282958984, + "logits/rejected": 17.55588722229004, + "logps/chosen": -264.83953857421875, + "logps/rejected": -170.27407836914062, + "loss": 0.1141, + "rewards/accuracies": 0.9333333373069763, + "rewards/chosen": -4.813459396362305, + "rewards/margins": 5.231393337249756, + "rewards/rejected": -10.044852256774902, + "sft_loss": 0.9549927115440369, + "step": 180 + }, + { + "epoch": 0.30394304490690033, + "grad_norm": 10.34631367942223, + "learning_rate": 4.859146307159841e-07, + "logits/chosen": 20.497596740722656, + "logits/rejected": 18.766536712646484, + "logps/chosen": -261.2963562011719, + "logps/rejected": -182.26380920410156, + "loss": 0.0915, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -5.357536792755127, + "rewards/margins": 5.210677623748779, + "rewards/rejected": -10.568212509155273, + "sft_loss": 0.9084681868553162, + "step": 185 + }, + { + "epoch": 0.312157721796276, + "grad_norm": 5.783975007709455, + "learning_rate": 4.847677202797414e-07, + "logits/chosen": 21.4503116607666, + "logits/rejected": 20.343997955322266, + "logps/chosen": -277.43756103515625, + "logps/rejected": -190.31895446777344, + "loss": 0.1261, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -5.206136226654053, + "rewards/margins": 6.117234230041504, + "rewards/rejected": -11.323369979858398, + "sft_loss": 0.8104835748672485, + "step": 190 + }, + { + "epoch": 0.3203723986856517, + "grad_norm": 5.842032329450428, + "learning_rate": 4.835773880783144e-07, + "logits/chosen": 18.739864349365234, + "logits/rejected": 18.180030822753906, + "logps/chosen": -279.97637939453125, + "logps/rejected": -194.57496643066406, + "loss": 0.0996, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -4.8305253982543945, + "rewards/margins": 6.600159168243408, + "rewards/rejected": -11.430684089660645, + "sft_loss": 0.8295060396194458, + "step": 195 + }, + { + "epoch": 0.32858707557502737, + "grad_norm": 3.2670595180505755, + "learning_rate": 4.823438542711238e-07, + "logits/chosen": 19.910261154174805, + "logits/rejected": 19.292858123779297, + "logps/chosen": -298.5541076660156, + "logps/rejected": -194.70803833007812, + "loss": 0.0875, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -4.582242488861084, + "rewards/margins": 6.173951625823975, + "rewards/rejected": -10.756195068359375, + "sft_loss": 0.917156994342804, + "step": 200 + }, + { + "epoch": 0.3368017524644031, + "grad_norm": 3.1103237361524463, + "learning_rate": 4.81067347007999e-07, + "logits/chosen": 21.32513999938965, + "logits/rejected": 20.00907325744629, + "logps/chosen": -282.4539489746094, + "logps/rejected": -180.13723754882812, + "loss": 0.1094, + "rewards/accuracies": 0.9333333969116211, + "rewards/chosen": -5.038515090942383, + "rewards/margins": 5.208615303039551, + "rewards/rejected": -10.247130393981934, + "sft_loss": 0.9259530305862427, + "step": 205 + }, + { + "epoch": 0.34501642935377874, + "grad_norm": 2.739223034780536, + "learning_rate": 4.797481023869801e-07, + "logits/chosen": 20.315624237060547, + "logits/rejected": 19.15978240966797, + "logps/chosen": -251.40025329589844, + "logps/rejected": -175.2682647705078, + "loss": 0.0868, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -4.576803684234619, + "rewards/margins": 5.497461795806885, + "rewards/rejected": -10.074265480041504, + "sft_loss": 0.9133593440055847, + "step": 210 + }, + { + "epoch": 0.35323110624315446, + "grad_norm": 3.832286663529339, + "learning_rate": 4.783863644106502e-07, + "logits/chosen": 20.822975158691406, + "logits/rejected": 19.61457633972168, + "logps/chosen": -272.2485656738281, + "logps/rejected": -176.64450073242188, + "loss": 0.084, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -4.567282676696777, + "rewards/margins": 5.274582862854004, + "rewards/rejected": -9.841866493225098, + "sft_loss": 0.9593473672866821, + "step": 215 + }, + { + "epoch": 0.3614457831325301, + "grad_norm": 2.3582346070622857, + "learning_rate": 4.769823849410053e-07, + "logits/chosen": 18.23250961303711, + "logits/rejected": 17.80348777770996, + "logps/chosen": -308.80230712890625, + "logps/rejected": -218.04078674316406, + "loss": 0.0717, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.822837829589844, + "rewards/margins": 7.0780110359191895, + "rewards/rejected": -12.900848388671875, + "sft_loss": 0.9446174502372742, + "step": 220 + }, + { + "epoch": 0.3696604600219058, + "grad_norm": 2.4553351337767424, + "learning_rate": 4.7553642365287127e-07, + "logits/chosen": 18.841230392456055, + "logits/rejected": 18.06731414794922, + "logps/chosen": -290.5154724121094, + "logps/rejected": -198.66468811035156, + "loss": 0.1065, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -5.4337263107299805, + "rewards/margins": 6.076140880584717, + "rewards/rejected": -11.509869575500488, + "sft_loss": 0.9967135787010193, + "step": 225 + }, + { + "epoch": 0.3778751369112815, + "grad_norm": 5.091180856379443, + "learning_rate": 4.7404874798587493e-07, + "logits/chosen": 20.33222770690918, + "logits/rejected": 19.57872772216797, + "logps/chosen": -296.5139465332031, + "logps/rejected": -187.86419677734375, + "loss": 0.0947, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -4.502869129180908, + "rewards/margins": 6.029680252075195, + "rewards/rejected": -10.532547950744629, + "sft_loss": 0.8854550719261169, + "step": 230 + }, + { + "epoch": 0.38608981380065716, + "grad_norm": 3.7252781041155845, + "learning_rate": 4.7251963309497965e-07, + "logits/chosen": 19.174985885620117, + "logits/rejected": 18.191041946411133, + "logps/chosen": -294.4977111816406, + "logps/rejected": -195.45391845703125, + "loss": 0.1189, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -4.810143947601318, + "rewards/margins": 5.988135814666748, + "rewards/rejected": -10.79827880859375, + "sft_loss": 0.9932563304901123, + "step": 235 + }, + { + "epoch": 0.39430449069003287, + "grad_norm": 2.8445489541878457, + "learning_rate": 4.709493617995938e-07, + "logits/chosen": 19.772836685180664, + "logits/rejected": 18.601234436035156, + "logps/chosen": -281.09808349609375, + "logps/rejected": -186.28834533691406, + "loss": 0.0903, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -4.58309268951416, + "rewards/margins": 6.093752384185791, + "rewards/rejected": -10.67684555053711, + "sft_loss": 0.8651002049446106, + "step": 240 + }, + { + "epoch": 0.40251916757940853, + "grad_norm": 3.5938197609476856, + "learning_rate": 4.6933822453126114e-07, + "logits/chosen": 18.83224105834961, + "logits/rejected": 18.597658157348633, + "logps/chosen": -230.34445190429688, + "logps/rejected": -173.43992614746094, + "loss": 0.1183, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -4.852273464202881, + "rewards/margins": 5.561976432800293, + "rewards/rejected": -10.414249420166016, + "sft_loss": 0.9817420840263367, + "step": 245 + }, + { + "epoch": 0.41073384446878425, + "grad_norm": 5.781394523018812, + "learning_rate": 4.676865192799443e-07, + "logits/chosen": 20.996742248535156, + "logits/rejected": 20.808259963989258, + "logps/chosen": -305.62493896484375, + "logps/rejected": -224.93048095703125, + "loss": 0.0704, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -6.482853889465332, + "rewards/margins": 6.840579509735107, + "rewards/rejected": -13.323433876037598, + "sft_loss": 0.9305270910263062, + "step": 250 + }, + { + "epoch": 0.4189485213581599, + "grad_norm": 2.956667548991593, + "learning_rate": 4.65994551538909e-07, + "logits/chosen": 20.87143898010254, + "logits/rejected": 19.100732803344727, + "logps/chosen": -339.39825439453125, + "logps/rejected": -216.97386169433594, + "loss": 0.1042, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -6.5454792976379395, + "rewards/margins": 6.870533466339111, + "rewards/rejected": -13.416014671325684, + "sft_loss": 1.041126012802124, + "step": 255 + }, + { + "epoch": 0.42716319824753557, + "grad_norm": 4.392320655716504, + "learning_rate": 4.642626342482215e-07, + "logits/chosen": 19.316911697387695, + "logits/rejected": 18.243391036987305, + "logps/chosen": -256.42987060546875, + "logps/rejected": -179.37509155273438, + "loss": 0.1123, + "rewards/accuracies": 0.9466666579246521, + "rewards/chosen": -5.184771537780762, + "rewards/margins": 5.357823848724365, + "rewards/rejected": -10.542596817016602, + "sft_loss": 0.9005042314529419, + "step": 260 + }, + { + "epoch": 0.4353778751369113, + "grad_norm": 3.4877310264508186, + "learning_rate": 4.624910877368684e-07, + "logits/chosen": 18.9818115234375, + "logits/rejected": 18.550493240356445, + "logps/chosen": -290.06829833984375, + "logps/rejected": -182.69097900390625, + "loss": 0.0943, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -4.028794765472412, + "rewards/margins": 5.486021041870117, + "rewards/rejected": -9.514815330505371, + "sft_loss": 0.8677726984024048, + "step": 265 + }, + { + "epoch": 0.44359255202628695, + "grad_norm": 3.8262254077954485, + "learning_rate": 4.606802396635098e-07, + "logits/chosen": 19.717529296875, + "logits/rejected": 19.07860565185547, + "logps/chosen": -295.7071838378906, + "logps/rejected": -187.46609497070312, + "loss": 0.0909, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -4.471216678619385, + "rewards/margins": 5.717488765716553, + "rewards/rejected": -10.188706398010254, + "sft_loss": 0.8504019379615784, + "step": 270 + }, + { + "epoch": 0.45180722891566266, + "grad_norm": 5.060345147215167, + "learning_rate": 4.588304249558763e-07, + "logits/chosen": 18.937856674194336, + "logits/rejected": 17.51219940185547, + "logps/chosen": -296.1573486328125, + "logps/rejected": -193.85498046875, + "loss": 0.0999, + "rewards/accuracies": 0.9466666579246521, + "rewards/chosen": -5.353262424468994, + "rewards/margins": 5.619455814361572, + "rewards/rejected": -10.97271728515625, + "sft_loss": 0.9665505886077881, + "step": 275 + }, + { + "epoch": 0.4600219058050383, + "grad_norm": 4.00534202244361, + "learning_rate": 4.569419857488228e-07, + "logits/chosen": 19.323719024658203, + "logits/rejected": 18.016199111938477, + "logps/chosen": -321.9727783203125, + "logps/rejected": -199.45265197753906, + "loss": 0.1019, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -5.450780868530273, + "rewards/margins": 6.008893013000488, + "rewards/rejected": -11.459673881530762, + "sft_loss": 0.99959397315979, + "step": 280 + }, + { + "epoch": 0.46823658269441404, + "grad_norm": 3.285371678607192, + "learning_rate": 4.550152713210478e-07, + "logits/chosen": 20.368547439575195, + "logits/rejected": 18.82790756225586, + "logps/chosen": -274.7012634277344, + "logps/rejected": -187.615966796875, + "loss": 0.0864, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -5.436644554138184, + "rewards/margins": 5.6056671142578125, + "rewards/rejected": -11.042311668395996, + "sft_loss": 0.9444936513900757, + "step": 285 + }, + { + "epoch": 0.4764512595837897, + "grad_norm": 4.461231980649141, + "learning_rate": 4.530506380304925e-07, + "logits/chosen": 19.512760162353516, + "logits/rejected": 17.607872009277344, + "logps/chosen": -353.92181396484375, + "logps/rejected": -215.2699432373047, + "loss": 0.0889, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -5.762898921966553, + "rewards/margins": 6.650979042053223, + "rewards/rejected": -12.413877487182617, + "sft_loss": 1.0174636840820312, + "step": 290 + }, + { + "epoch": 0.4846659364731654, + "grad_norm": 6.886972650518142, + "learning_rate": 4.510484492484301e-07, + "logits/chosen": 18.66490936279297, + "logits/rejected": 19.016637802124023, + "logps/chosen": -309.6719055175781, + "logps/rejected": -233.22732543945312, + "loss": 0.091, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.444864749908447, + "rewards/margins": 7.558566093444824, + "rewards/rejected": -14.003432273864746, + "sft_loss": 0.9799606800079346, + "step": 295 + }, + { + "epoch": 0.4928806133625411, + "grad_norm": 7.0952145291800806, + "learning_rate": 4.4900907529225797e-07, + "logits/chosen": 18.357818603515625, + "logits/rejected": 17.093488693237305, + "logps/chosen": -321.1383972167969, + "logps/rejected": -212.69412231445312, + "loss": 0.1166, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -6.2735819816589355, + "rewards/margins": 6.8673529624938965, + "rewards/rejected": -13.140933990478516, + "sft_loss": 0.9731053709983826, + "step": 300 + }, + { + "epoch": 0.5010952902519168, + "grad_norm": 3.035306319401663, + "learning_rate": 4.46932893357005e-07, + "logits/chosen": 19.986753463745117, + "logits/rejected": 19.238985061645508, + "logps/chosen": -317.82354736328125, + "logps/rejected": -216.91896057128906, + "loss": 0.0689, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.982201099395752, + "rewards/margins": 7.021622657775879, + "rewards/rejected": -13.003824234008789, + "sft_loss": 0.9540520310401917, + "step": 305 + }, + { + "epoch": 0.5093099671412924, + "grad_norm": 2.8463069418400826, + "learning_rate": 4.448202874455672e-07, + "logits/chosen": 18.925642013549805, + "logits/rejected": 18.447513580322266, + "logps/chosen": -310.6861267089844, + "logps/rejected": -205.47084045410156, + "loss": 0.1225, + "rewards/accuracies": 0.9333333969116211, + "rewards/chosen": -5.411799430847168, + "rewards/margins": 6.359774112701416, + "rewards/rejected": -11.771574020385742, + "sft_loss": 1.0545023679733276, + "step": 310 + }, + { + "epoch": 0.5175246440306681, + "grad_norm": 3.9649140060367407, + "learning_rate": 4.426716482976838e-07, + "logits/chosen": 19.93076515197754, + "logits/rejected": 19.637243270874023, + "logps/chosen": -303.6711730957031, + "logps/rejected": -199.40115356445312, + "loss": 0.0913, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -4.821557521820068, + "rewards/margins": 6.5209574699401855, + "rewards/rejected": -11.342514991760254, + "sft_loss": 0.9295309782028198, + "step": 315 + }, + { + "epoch": 0.5257393209200438, + "grad_norm": 3.5060241985803304, + "learning_rate": 4.4048737331766774e-07, + "logits/chosen": 21.6712646484375, + "logits/rejected": 19.89933204650879, + "logps/chosen": -271.3023681640625, + "logps/rejected": -179.4087371826172, + "loss": 0.1113, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -4.785543918609619, + "rewards/margins": 5.356179714202881, + "rewards/rejected": -10.1417236328125, + "sft_loss": 0.8473352193832397, + "step": 320 + }, + { + "epoch": 0.5339539978094195, + "grad_norm": 9.641524189799103, + "learning_rate": 4.3826786650090273e-07, + "logits/chosen": 17.178016662597656, + "logits/rejected": 17.272985458374023, + "logps/chosen": -279.9751892089844, + "logps/rejected": -187.63014221191406, + "loss": 0.0909, + "rewards/accuracies": 0.9333333373069763, + "rewards/chosen": -5.042219161987305, + "rewards/margins": 5.971033096313477, + "rewards/rejected": -11.013254165649414, + "sft_loss": 0.9807875156402588, + "step": 325 + }, + { + "epoch": 0.5421686746987951, + "grad_norm": 4.069079011849237, + "learning_rate": 4.3601353835912235e-07, + "logits/chosen": 18.978944778442383, + "logits/rejected": 18.711896896362305, + "logps/chosen": -257.9627990722656, + "logps/rejected": -189.10731506347656, + "loss": 0.1393, + "rewards/accuracies": 0.9066666960716248, + "rewards/chosen": -5.991618633270264, + "rewards/margins": 5.532088279724121, + "rewards/rejected": -11.523706436157227, + "sft_loss": 0.9458999633789062, + "step": 330 + }, + { + "epoch": 0.5503833515881709, + "grad_norm": 11.459818241023113, + "learning_rate": 4.337248058444831e-07, + "logits/chosen": 18.476844787597656, + "logits/rejected": 16.893495559692383, + "logps/chosen": -352.3243408203125, + "logps/rejected": -229.020751953125, + "loss": 0.0884, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -6.682969093322754, + "rewards/margins": 6.964285373687744, + "rewards/rejected": -13.64725399017334, + "sft_loss": 1.09258234500885, + "step": 335 + }, + { + "epoch": 0.5585980284775466, + "grad_norm": 4.451183947920293, + "learning_rate": 4.3140209227244617e-07, + "logits/chosen": 19.733028411865234, + "logits/rejected": 19.02443504333496, + "logps/chosen": -255.39503479003906, + "logps/rejected": -174.6426544189453, + "loss": 0.1089, + "rewards/accuracies": 0.9066667556762695, + "rewards/chosen": -4.919735908508301, + "rewards/margins": 5.3243584632873535, + "rewards/rejected": -10.24409294128418, + "sft_loss": 0.974420428276062, + "step": 340 + }, + { + "epoch": 0.5668127053669222, + "grad_norm": 4.806631914628035, + "learning_rate": 4.2904582724348316e-07, + "logits/chosen": 18.731412887573242, + "logits/rejected": 17.712743759155273, + "logps/chosen": -292.1322937011719, + "logps/rejected": -186.6304473876953, + "loss": 0.093, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -4.8624372482299805, + "rewards/margins": 5.988229751586914, + "rewards/rejected": -10.850666999816895, + "sft_loss": 1.1051782369613647, + "step": 345 + }, + { + "epoch": 0.5750273822562979, + "grad_norm": 9.642655055260906, + "learning_rate": 4.266564465636182e-07, + "logits/chosen": 20.78282928466797, + "logits/rejected": 19.54219627380371, + "logps/chosen": -336.76849365234375, + "logps/rejected": -231.12326049804688, + "loss": 0.0859, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -6.264214515686035, + "rewards/margins": 7.32451868057251, + "rewards/rejected": -13.58873462677002, + "sft_loss": 0.9469544291496277, + "step": 350 + }, + { + "epoch": 0.5832420591456736, + "grad_norm": 2.9057809929137535, + "learning_rate": 4.242343921638234e-07, + "logits/chosen": 20.12310028076172, + "logits/rejected": 18.395959854125977, + "logps/chosen": -332.9981994628906, + "logps/rejected": -202.02816772460938, + "loss": 0.0601, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.237385272979736, + "rewards/margins": 6.5451836585998535, + "rewards/rejected": -11.782567977905273, + "sft_loss": 1.00226628780365, + "step": 355 + }, + { + "epoch": 0.5914567360350493, + "grad_norm": 3.5441590536933605, + "learning_rate": 4.2178011201828044e-07, + "logits/chosen": 19.193899154663086, + "logits/rejected": 17.946836471557617, + "logps/chosen": -299.01483154296875, + "logps/rejected": -192.25360107421875, + "loss": 0.0879, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -5.201097011566162, + "rewards/margins": 6.014251708984375, + "rewards/rejected": -11.215349197387695, + "sft_loss": 0.9916761517524719, + "step": 360 + }, + { + "epoch": 0.5996714129244249, + "grad_norm": 15.329178085855654, + "learning_rate": 4.1929406006152546e-07, + "logits/chosen": 19.373323440551758, + "logits/rejected": 19.08150291442871, + "logps/chosen": -284.7733154296875, + "logps/rejected": -214.26292419433594, + "loss": 0.104, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -6.026785373687744, + "rewards/margins": 7.561707496643066, + "rewards/rejected": -13.588491439819336, + "sft_loss": 1.0006964206695557, + "step": 365 + }, + { + "epoch": 0.6078860898138007, + "grad_norm": 3.928331003690106, + "learning_rate": 4.167766961044906e-07, + "logits/chosen": 20.291215896606445, + "logits/rejected": 18.595857620239258, + "logps/chosen": -307.8617858886719, + "logps/rejected": -212.20635986328125, + "loss": 0.0842, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.978768348693848, + "rewards/margins": 6.925237655639648, + "rewards/rejected": -12.904006004333496, + "sft_loss": 0.898669958114624, + "step": 370 + }, + { + "epoch": 0.6161007667031764, + "grad_norm": 7.9374465786346295, + "learning_rate": 4.1422848574945923e-07, + "logits/chosen": 19.617891311645508, + "logits/rejected": 18.786251068115234, + "logps/chosen": -309.9544982910156, + "logps/rejected": -208.55972290039062, + "loss": 0.086, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -5.407596111297607, + "rewards/margins": 7.109365940093994, + "rewards/rejected": -12.516963005065918, + "sft_loss": 0.9856653809547424, + "step": 375 + }, + { + "epoch": 0.624315443592552, + "grad_norm": 21.17776800888567, + "learning_rate": 4.1164990030394985e-07, + "logits/chosen": 19.750553131103516, + "logits/rejected": 18.34299659729004, + "logps/chosen": -315.951171875, + "logps/rejected": -218.6781768798828, + "loss": 0.1158, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -6.80844783782959, + "rewards/margins": 6.5122971534729, + "rewards/rejected": -13.320744514465332, + "sft_loss": 0.9718233942985535, + "step": 380 + }, + { + "epoch": 0.6325301204819277, + "grad_norm": 4.237566233767258, + "learning_rate": 4.09041416693545e-07, + "logits/chosen": 19.702919006347656, + "logits/rejected": 18.19818115234375, + "logps/chosen": -315.5003662109375, + "logps/rejected": -213.6582794189453, + "loss": 0.1037, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -6.986787796020508, + "rewards/margins": 6.549233913421631, + "rewards/rejected": -13.536023139953613, + "sft_loss": 1.0769668817520142, + "step": 385 + }, + { + "epoch": 0.6407447973713034, + "grad_norm": 8.046716778811138, + "learning_rate": 4.064035173736804e-07, + "logits/chosen": 16.724149703979492, + "logits/rejected": 16.12737464904785, + "logps/chosen": -316.17547607421875, + "logps/rejected": -242.3610076904297, + "loss": 0.098, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -7.848334312438965, + "rewards/margins": 8.00571060180664, + "rewards/rejected": -15.854043960571289, + "sft_loss": 1.207060694694519, + "step": 390 + }, + { + "epoch": 0.6489594742606791, + "grad_norm": 2.9537654489100564, + "learning_rate": 4.0373669024041225e-07, + "logits/chosen": 21.014055252075195, + "logits/rejected": 19.712244033813477, + "logps/chosen": -325.1595153808594, + "logps/rejected": -232.09849548339844, + "loss": 0.076, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -7.954678058624268, + "rewards/margins": 7.2714033126831055, + "rewards/rejected": -15.226082801818848, + "sft_loss": 1.0326578617095947, + "step": 395 + }, + { + "epoch": 0.6571741511500547, + "grad_norm": 7.381518685772535, + "learning_rate": 4.010414285401776e-07, + "logits/chosen": 20.833526611328125, + "logits/rejected": 20.059972763061523, + "logps/chosen": -301.8648681640625, + "logps/rejected": -219.5161895751953, + "loss": 0.0779, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -7.38788366317749, + "rewards/margins": 7.111426830291748, + "rewards/rejected": -14.499311447143555, + "sft_loss": 1.039659023284912, + "step": 400 + }, + { + "epoch": 0.6653888280394304, + "grad_norm": 4.428141508810293, + "learning_rate": 3.9831823077856565e-07, + "logits/chosen": 18.648929595947266, + "logits/rejected": 18.11013412475586, + "logps/chosen": -301.7897033691406, + "logps/rejected": -215.20339965820312, + "loss": 0.1026, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -6.600786209106445, + "rewards/margins": 6.725845813751221, + "rewards/rejected": -13.326631546020508, + "sft_loss": 1.1041791439056396, + "step": 405 + }, + { + "epoch": 0.6736035049288062, + "grad_norm": 3.720237574886871, + "learning_rate": 3.95567600628115e-07, + "logits/chosen": 18.711490631103516, + "logits/rejected": 17.353120803833008, + "logps/chosen": -288.9316711425781, + "logps/rejected": -199.5088348388672, + "loss": 0.0833, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -5.722890377044678, + "rewards/margins": 6.0634002685546875, + "rewards/rejected": -11.786290168762207, + "sft_loss": 0.9382212162017822, + "step": 410 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 3.5502904501789456, + "learning_rate": 3.9279004683515783e-07, + "logits/chosen": 19.41602897644043, + "logits/rejected": 18.82366180419922, + "logps/chosen": -300.6460876464844, + "logps/rejected": -198.58851623535156, + "loss": 0.0921, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.369536876678467, + "rewards/margins": 6.260385036468506, + "rewards/rejected": -11.629922866821289, + "sft_loss": 0.947778046131134, + "step": 415 + }, + { + "epoch": 0.6900328587075575, + "grad_norm": 3.775437774667387, + "learning_rate": 3.8998608312572234e-07, + "logits/chosen": 20.035261154174805, + "logits/rejected": 18.32322883605957, + "logps/chosen": -321.1794738769531, + "logps/rejected": -206.26197814941406, + "loss": 0.0666, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -6.206702709197998, + "rewards/margins": 6.016441822052002, + "rewards/rejected": -12.223145484924316, + "sft_loss": 0.8840410709381104, + "step": 420 + }, + { + "epoch": 0.6982475355969332, + "grad_norm": 4.6471153709105355, + "learning_rate": 3.8715622811051753e-07, + "logits/chosen": 20.395259857177734, + "logits/rejected": 18.99897003173828, + "logps/chosen": -340.93096923828125, + "logps/rejected": -233.05569458007812, + "loss": 0.0824, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -7.279754638671875, + "rewards/margins": 6.952295303344727, + "rewards/rejected": -14.232048988342285, + "sft_loss": 0.9561217427253723, + "step": 425 + }, + { + "epoch": 0.7064622124863089, + "grad_norm": 5.222781909367126, + "learning_rate": 3.843010051890114e-07, + "logits/chosen": 17.722442626953125, + "logits/rejected": 17.12670135498047, + "logps/chosen": -305.9290466308594, + "logps/rejected": -227.79209899902344, + "loss": 0.0968, + "rewards/accuracies": 0.9466666579246521, + "rewards/chosen": -7.020711898803711, + "rewards/margins": 7.3669562339782715, + "rewards/rejected": -14.38766860961914, + "sft_loss": 1.0332236289978027, + "step": 430 + }, + { + "epoch": 0.7146768893756845, + "grad_norm": 8.793537356204498, + "learning_rate": 3.8142094245262615e-07, + "logits/chosen": 18.769723892211914, + "logits/rejected": 17.301259994506836, + "logps/chosen": -281.7428283691406, + "logps/rejected": -198.91912841796875, + "loss": 0.0907, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -6.324660301208496, + "rewards/margins": 6.169273376464844, + "rewards/rejected": -12.493931770324707, + "sft_loss": 1.6358309984207153, + "step": 435 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 3.225698996369955, + "learning_rate": 3.785165725870637e-07, + "logits/chosen": 18.968225479125977, + "logits/rejected": 18.159814834594727, + "logps/chosen": -303.6697082519531, + "logps/rejected": -215.95223999023438, + "loss": 0.0896, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -5.786445140838623, + "rewards/margins": 6.777508735656738, + "rewards/rejected": -12.563952445983887, + "sft_loss": 0.9366723299026489, + "step": 440 + }, + { + "epoch": 0.731106243154436, + "grad_norm": 5.69116038376307, + "learning_rate": 3.7558843277378203e-07, + "logits/chosen": 19.500343322753906, + "logits/rejected": 18.194765090942383, + "logps/chosen": -280.2967834472656, + "logps/rejected": -190.11224365234375, + "loss": 0.089, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -5.179433345794678, + "rewards/margins": 6.132752418518066, + "rewards/rejected": -11.31218433380127, + "sft_loss": 0.9057294130325317, + "step": 445 + }, + { + "epoch": 0.7393209200438116, + "grad_norm": 2.910912446985568, + "learning_rate": 3.726370645906407e-07, + "logits/chosen": 18.821012496948242, + "logits/rejected": 17.809709548950195, + "logps/chosen": -301.2443542480469, + "logps/rejected": -197.73570251464844, + "loss": 0.072, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -5.806431293487549, + "rewards/margins": 6.135568141937256, + "rewards/rejected": -11.942000389099121, + "sft_loss": 1.1080551147460938, + "step": 450 + }, + { + "epoch": 0.7475355969331873, + "grad_norm": 3.4976563659123787, + "learning_rate": 3.6966301391173204e-07, + "logits/chosen": 18.104902267456055, + "logits/rejected": 19.11966896057129, + "logps/chosen": -283.95404052734375, + "logps/rejected": -216.9072265625, + "loss": 0.0961, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -6.108445167541504, + "rewards/margins": 7.591168403625488, + "rewards/rejected": -13.699614524841309, + "sft_loss": 1.0220645666122437, + "step": 455 + }, + { + "epoch": 0.755750273822563, + "grad_norm": 3.376381123051291, + "learning_rate": 3.6666683080641843e-07, + "logits/chosen": 17.50767707824707, + "logits/rejected": 16.899118423461914, + "logps/chosen": -325.40252685546875, + "logps/rejected": -233.790283203125, + "loss": 0.0663, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -7.453089237213135, + "rewards/margins": 7.27883243560791, + "rewards/rejected": -14.73192310333252, + "sft_loss": 1.0263330936431885, + "step": 460 + }, + { + "epoch": 0.7639649507119387, + "grad_norm": 4.787229066445367, + "learning_rate": 3.636490694375937e-07, + "logits/chosen": 19.532527923583984, + "logits/rejected": 18.083965301513672, + "logps/chosen": -335.1581115722656, + "logps/rejected": -236.63861083984375, + "loss": 0.0698, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -7.509254455566406, + "rewards/margins": 8.015713691711426, + "rewards/rejected": -15.5249662399292, + "sft_loss": 1.013353943824768, + "step": 465 + }, + { + "epoch": 0.7721796276013143, + "grad_norm": 1.6525330438642514, + "learning_rate": 3.6061028795918734e-07, + "logits/chosen": 19.80712127685547, + "logits/rejected": 18.532358169555664, + "logps/chosen": -338.9714660644531, + "logps/rejected": -240.235595703125, + "loss": 0.0643, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -8.31692886352539, + "rewards/margins": 7.37734317779541, + "rewards/rejected": -15.694271087646484, + "sft_loss": 1.046373963356018, + "step": 470 + }, + { + "epoch": 0.78039430449069, + "grad_norm": 4.380627729663156, + "learning_rate": 3.5755104841292974e-07, + "logits/chosen": 18.083696365356445, + "logits/rejected": 17.68763542175293, + "logps/chosen": -296.51568603515625, + "logps/rejected": -220.79627990722656, + "loss": 0.0753, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -7.48270845413208, + "rewards/margins": 7.01304817199707, + "rewards/rejected": -14.495758056640625, + "sft_loss": 1.1213568449020386, + "step": 475 + }, + { + "epoch": 0.7886089813800657, + "grad_norm": 3.5838233904248202, + "learning_rate": 3.544719166243998e-07, + "logits/chosen": 18.444488525390625, + "logits/rejected": 17.91800880432129, + "logps/chosen": -323.7398681640625, + "logps/rejected": -239.03846740722656, + "loss": 0.0675, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -7.762217998504639, + "rewards/margins": 7.673830032348633, + "rewards/rejected": -15.43604850769043, + "sft_loss": 1.0297211408615112, + "step": 480 + }, + { + "epoch": 0.7968236582694413, + "grad_norm": 5.338329799579653, + "learning_rate": 3.513734620983716e-07, + "logits/chosen": 18.91893768310547, + "logits/rejected": 17.848176956176758, + "logps/chosen": -331.2665100097656, + "logps/rejected": -252.34434509277344, + "loss": 0.0585, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -8.252163887023926, + "rewards/margins": 8.53612232208252, + "rewards/rejected": -16.788286209106445, + "sft_loss": 0.9914504289627075, + "step": 485 + }, + { + "epoch": 0.8050383351588171, + "grad_norm": 10.778725390322887, + "learning_rate": 3.482562579134809e-07, + "logits/chosen": 17.087730407714844, + "logits/rejected": 16.604755401611328, + "logps/chosen": -269.3510437011719, + "logps/rejected": -222.06834411621094, + "loss": 0.0883, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -8.483560562133789, + "rewards/margins": 6.750950813293457, + "rewards/rejected": -15.23451042175293, + "sft_loss": 1.1086839437484741, + "step": 490 + }, + { + "epoch": 0.8132530120481928, + "grad_norm": 3.589498412964668, + "learning_rate": 3.4512088061623073e-07, + "logits/chosen": 20.214731216430664, + "logits/rejected": 18.510662078857422, + "logps/chosen": -364.5818176269531, + "logps/rejected": -248.37899780273438, + "loss": 0.0795, + "rewards/accuracies": 0.9466666579246521, + "rewards/chosen": -8.396151542663574, + "rewards/margins": 7.525665283203125, + "rewards/rejected": -15.9218168258667, + "sft_loss": 1.0693514347076416, + "step": 495 + }, + { + "epoch": 0.8214676889375685, + "grad_norm": 3.668001282748929, + "learning_rate": 3.419679101143555e-07, + "logits/chosen": 19.246572494506836, + "logits/rejected": 18.104793548583984, + "logps/chosen": -286.7825012207031, + "logps/rejected": -219.62115478515625, + "loss": 0.0646, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.4586005210876465, + "rewards/margins": 7.050439357757568, + "rewards/rejected": -14.509037971496582, + "sft_loss": 1.0848183631896973, + "step": 500 + }, + { + "epoch": 0.8296823658269441, + "grad_norm": 4.09686690298713, + "learning_rate": 3.387979295695632e-07, + "logits/chosen": 19.468666076660156, + "logits/rejected": 17.898151397705078, + "logps/chosen": -306.32623291015625, + "logps/rejected": -223.23086547851562, + "loss": 0.0854, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -7.60283899307251, + "rewards/margins": 6.977162837982178, + "rewards/rejected": -14.580002784729004, + "sft_loss": 1.017836093902588, + "step": 505 + }, + { + "epoch": 0.8378970427163198, + "grad_norm": 5.03743167428466, + "learning_rate": 3.356115252896764e-07, + "logits/chosen": 18.379446029663086, + "logits/rejected": 17.511045455932617, + "logps/chosen": -338.8892517089844, + "logps/rejected": -236.15655517578125, + "loss": 0.084, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -7.8325958251953125, + "rewards/margins": 7.418299674987793, + "rewards/rejected": -15.250896453857422, + "sft_loss": 1.1587491035461426, + "step": 510 + }, + { + "epoch": 0.8461117196056955, + "grad_norm": 1.3483241705386844, + "learning_rate": 3.3240928662019043e-07, + "logits/chosen": 17.142412185668945, + "logits/rejected": 16.88658905029297, + "logps/chosen": -323.4809265136719, + "logps/rejected": -228.69459533691406, + "loss": 0.076, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -7.122060298919678, + "rewards/margins": 7.332087516784668, + "rewards/rejected": -14.454146385192871, + "sft_loss": 1.033144235610962, + "step": 515 + }, + { + "epoch": 0.8543263964950711, + "grad_norm": 5.6768934408505665, + "learning_rate": 3.291918058352706e-07, + "logits/chosen": 18.301881790161133, + "logits/rejected": 17.546480178833008, + "logps/chosen": -286.16534423828125, + "logps/rejected": -223.24356079101562, + "loss": 0.1012, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -8.11279010772705, + "rewards/margins": 6.034675121307373, + "rewards/rejected": -14.147465705871582, + "sft_loss": 1.0691564083099365, + "step": 520 + }, + { + "epoch": 0.8625410733844469, + "grad_norm": 2.664959243016448, + "learning_rate": 3.259596780282074e-07, + "logits/chosen": 19.877466201782227, + "logits/rejected": 18.95973014831543, + "logps/chosen": -365.1341552734375, + "logps/rejected": -256.1839294433594, + "loss": 0.0634, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.447802543640137, + "rewards/margins": 8.053640365600586, + "rewards/rejected": -16.501441955566406, + "sft_loss": 1.2106726169586182, + "step": 525 + }, + { + "epoch": 0.8707557502738226, + "grad_norm": 13.037778283402458, + "learning_rate": 3.2271350100134975e-07, + "logits/chosen": 19.298856735229492, + "logits/rejected": 17.636470794677734, + "logps/chosen": -322.1096496582031, + "logps/rejected": -239.7886505126953, + "loss": 0.0671, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -8.631354331970215, + "rewards/margins": 7.214845657348633, + "rewards/rejected": -15.846202850341797, + "sft_loss": 1.1396478414535522, + "step": 530 + }, + { + "epoch": 0.8789704271631983, + "grad_norm": 3.29678671312156, + "learning_rate": 3.1945387515553843e-07, + "logits/chosen": 20.68004608154297, + "logits/rejected": 18.87333106994629, + "logps/chosen": -342.8370056152344, + "logps/rejected": -231.9772491455078, + "loss": 0.0816, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.539201259613037, + "rewards/margins": 7.375776290893555, + "rewards/rejected": -14.914976119995117, + "sft_loss": 1.073767066001892, + "step": 535 + }, + { + "epoch": 0.8871851040525739, + "grad_norm": 5.524616451485275, + "learning_rate": 3.1618140337905764e-07, + "logits/chosen": 19.65703582763672, + "logits/rejected": 18.759868621826172, + "logps/chosen": -293.4176940917969, + "logps/rejected": -222.76730346679688, + "loss": 0.0785, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -7.326995372772217, + "rewards/margins": 6.951819896697998, + "rewards/rejected": -14.278814315795898, + "sft_loss": 1.1393436193466187, + "step": 540 + }, + { + "epoch": 0.8953997809419496, + "grad_norm": 3.710469983456331, + "learning_rate": 3.128966909361271e-07, + "logits/chosen": 19.455686569213867, + "logits/rejected": 18.928129196166992, + "logps/chosen": -346.8277893066406, + "logps/rejected": -249.02398681640625, + "loss": 0.0577, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -8.179071426391602, + "rewards/margins": 7.697024345397949, + "rewards/rejected": -15.87609577178955, + "sft_loss": 1.0804804563522339, + "step": 545 + }, + { + "epoch": 0.9036144578313253, + "grad_norm": 11.048677395421713, + "learning_rate": 3.096003453549549e-07, + "logits/chosen": 19.185258865356445, + "logits/rejected": 17.547964096069336, + "logps/chosen": -356.4489440917969, + "logps/rejected": -257.8424987792969, + "loss": 0.0805, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -8.774456977844238, + "rewards/margins": 8.595198631286621, + "rewards/rejected": -17.36965560913086, + "sft_loss": 1.0598615407943726, + "step": 550 + }, + { + "epoch": 0.911829134720701, + "grad_norm": 2.5715819705074447, + "learning_rate": 3.06292976315371e-07, + "logits/chosen": 18.52203369140625, + "logits/rejected": 17.04202651977539, + "logps/chosen": -332.1985168457031, + "logps/rejected": -236.9717254638672, + "loss": 0.0696, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -8.198724746704102, + "rewards/margins": 7.527222156524658, + "rewards/rejected": -15.725946426391602, + "sft_loss": 1.1585354804992676, + "step": 555 + }, + { + "epoch": 0.9200438116100766, + "grad_norm": 5.125385527021205, + "learning_rate": 3.0297519553606324e-07, + "logits/chosen": 19.936016082763672, + "logits/rejected": 18.45525360107422, + "logps/chosen": -308.65936279296875, + "logps/rejected": -223.3343048095703, + "loss": 0.0879, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -7.173832893371582, + "rewards/margins": 7.109025001525879, + "rewards/rejected": -14.282858848571777, + "sft_loss": 1.038684368133545, + "step": 560 + }, + { + "epoch": 0.9282584884994524, + "grad_norm": 3.7082409068486073, + "learning_rate": 2.996476166614363e-07, + "logits/chosen": 19.476381301879883, + "logits/rejected": 17.608320236206055, + "logps/chosen": -329.3957214355469, + "logps/rejected": -218.0595703125, + "loss": 0.0885, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -6.555365085601807, + "rewards/margins": 6.53586483001709, + "rewards/rejected": -13.091230392456055, + "sft_loss": 1.0041550397872925, + "step": 565 + }, + { + "epoch": 0.9364731653888281, + "grad_norm": 4.785307756807971, + "learning_rate": 2.963108551481142e-07, + "logits/chosen": 20.289274215698242, + "logits/rejected": 18.269121170043945, + "logps/chosen": -354.8280029296875, + "logps/rejected": -232.3466796875, + "loss": 0.0877, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -6.978261470794678, + "rewards/margins": 7.397289752960205, + "rewards/rejected": -14.3755521774292, + "sft_loss": 1.049277424812317, + "step": 570 + }, + { + "epoch": 0.9446878422782037, + "grad_norm": 2.70866282954725, + "learning_rate": 2.929655281511075e-07, + "logits/chosen": 18.807231903076172, + "logits/rejected": 17.12908363342285, + "logps/chosen": -354.8686828613281, + "logps/rejected": -238.53443908691406, + "loss": 0.0648, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -7.602488994598389, + "rewards/margins": 7.430182933807373, + "rewards/rejected": -15.032671928405762, + "sft_loss": 1.1179447174072266, + "step": 575 + }, + { + "epoch": 0.9529025191675794, + "grad_norm": 1.9988385632818255, + "learning_rate": 2.896122544096667e-07, + "logits/chosen": 18.314443588256836, + "logits/rejected": 17.118688583374023, + "logps/chosen": -302.5119934082031, + "logps/rejected": -227.22274780273438, + "loss": 0.0808, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -7.768359661102295, + "rewards/margins": 7.082144737243652, + "rewards/rejected": -14.850504875183105, + "sft_loss": 1.1123638153076172, + "step": 580 + }, + { + "epoch": 0.9611171960569551, + "grad_norm": 3.2519592276620872, + "learning_rate": 2.8625165413284307e-07, + "logits/chosen": 18.069074630737305, + "logits/rejected": 17.50326919555664, + "logps/chosen": -342.6742248535156, + "logps/rejected": -235.6639862060547, + "loss": 0.0951, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -7.134052753448486, + "rewards/margins": 7.267971515655518, + "rewards/rejected": -14.402023315429688, + "sft_loss": 1.0139853954315186, + "step": 585 + }, + { + "epoch": 0.9693318729463308, + "grad_norm": 3.802310527872131, + "learning_rate": 2.8288434888477626e-07, + "logits/chosen": 20.131250381469727, + "logits/rejected": 18.442293167114258, + "logps/chosen": -264.7261047363281, + "logps/rejected": -194.17242431640625, + "loss": 0.0728, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -6.163069248199463, + "rewards/margins": 5.962526798248291, + "rewards/rejected": -12.125596046447754, + "sft_loss": 0.9994211196899414, + "step": 590 + }, + { + "epoch": 0.9775465498357064, + "grad_norm": 4.623566930411208, + "learning_rate": 2.795109614697326e-07, + "logits/chosen": 19.592060089111328, + "logits/rejected": 18.126672744750977, + "logps/chosen": -295.9714050292969, + "logps/rejected": -204.66427612304688, + "loss": 0.0873, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -6.2958526611328125, + "rewards/margins": 6.357571601867676, + "rewards/rejected": -12.653424263000488, + "sft_loss": 0.9684253931045532, + "step": 595 + }, + { + "epoch": 0.9857612267250822, + "grad_norm": 6.368570274437731, + "learning_rate": 2.761321158169134e-07, + "logits/chosen": 20.56801414489746, + "logits/rejected": 19.605268478393555, + "logps/chosen": -325.0837097167969, + "logps/rejected": -226.78863525390625, + "loss": 0.0896, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -6.953873634338379, + "rewards/margins": 7.266047954559326, + "rewards/rejected": -14.219923973083496, + "sft_loss": 1.0414397716522217, + "step": 600 + }, + { + "epoch": 0.9939759036144579, + "grad_norm": 3.750987661763106, + "learning_rate": 2.727484368650553e-07, + "logits/chosen": 17.04819107055664, + "logits/rejected": 16.657819747924805, + "logps/chosen": -309.3585205078125, + "logps/rejected": -228.43280029296875, + "loss": 0.0826, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.683638572692871, + "rewards/margins": 7.000718116760254, + "rewards/rejected": -14.684355735778809, + "sft_loss": 1.13958740234375, + "step": 605 + }, + { + "epoch": 1.0021905805038336, + "grad_norm": 2.4122584075980966, + "learning_rate": 2.6936055044684425e-07, + "logits/chosen": 18.877971649169922, + "logits/rejected": 17.84745979309082, + "logps/chosen": -272.0718078613281, + "logps/rejected": -214.1260223388672, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.77817440032959, + "rewards/margins": 6.673606872558594, + "rewards/rejected": -14.4517822265625, + "sft_loss": 1.05906343460083, + "step": 610 + }, + { + "epoch": 1.0104052573932092, + "grad_norm": 4.202536280729459, + "learning_rate": 2.659690831731631e-07, + "logits/chosen": 18.819133758544922, + "logits/rejected": 18.853425979614258, + "logps/chosen": -313.6888427734375, + "logps/rejected": -247.36912536621094, + "loss": 0.0479, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -8.692021369934082, + "rewards/margins": 7.864367961883545, + "rewards/rejected": -16.5563907623291, + "sft_loss": 1.0393388271331787, + "step": 615 + }, + { + "epoch": 1.0186199342825848, + "grad_norm": 3.9279717638704224, + "learning_rate": 2.6257466231719676e-07, + "logits/chosen": 16.448410034179688, + "logits/rejected": 15.970653533935547, + "logps/chosen": -357.7733154296875, + "logps/rejected": -276.7339782714844, + "loss": 0.0375, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.677971839904785, + "rewards/margins": 9.011359214782715, + "rewards/rejected": -18.6893310546875, + "sft_loss": 1.3019551038742065, + "step": 620 + }, + { + "epoch": 1.0268346111719606, + "grad_norm": 2.4098074473851, + "learning_rate": 2.591779156984137e-07, + "logits/chosen": 18.19355010986328, + "logits/rejected": 16.887168884277344, + "logps/chosen": -331.2437744140625, + "logps/rejected": -270.1482849121094, + "loss": 0.0655, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -10.20653247833252, + "rewards/margins": 8.743158340454102, + "rewards/rejected": -18.949687957763672, + "sft_loss": 1.1369999647140503, + "step": 625 + }, + { + "epoch": 1.0350492880613362, + "grad_norm": 2.015176030113295, + "learning_rate": 2.557794715664465e-07, + "logits/chosen": 18.09113311767578, + "logits/rejected": 16.872072219848633, + "logps/chosen": -343.4937438964844, + "logps/rejected": -252.0245819091797, + "loss": 0.0574, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.15774154663086, + "rewards/margins": 7.730542182922363, + "rewards/rejected": -16.888282775878906, + "sft_loss": 1.1037859916687012, + "step": 630 + }, + { + "epoch": 1.0432639649507118, + "grad_norm": 4.817911475238817, + "learning_rate": 2.5237995848489417e-07, + "logits/chosen": 19.07139778137207, + "logits/rejected": 17.200511932373047, + "logps/chosen": -335.5242614746094, + "logps/rejected": -225.45147705078125, + "loss": 0.0736, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -6.962551593780518, + "rewards/margins": 7.362425804138184, + "rewards/rejected": -14.32497787475586, + "sft_loss": 1.0912320613861084, + "step": 635 + }, + { + "epoch": 1.0514786418400877, + "grad_norm": 5.399497912094965, + "learning_rate": 2.48980005215064e-07, + "logits/chosen": 19.333988189697266, + "logits/rejected": 18.970035552978516, + "logps/chosen": -263.4971008300781, + "logps/rejected": -204.3727569580078, + "loss": 0.0747, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -7.033030986785889, + "rewards/margins": 6.5659332275390625, + "rewards/rejected": -13.598965644836426, + "sft_loss": 1.2809529304504395, + "step": 640 + }, + { + "epoch": 1.0596933187294633, + "grad_norm": 5.192926508245981, + "learning_rate": 2.45580240599679e-07, + "logits/chosen": 18.814851760864258, + "logits/rejected": 19.112171173095703, + "logps/chosen": -368.88470458984375, + "logps/rejected": -262.5347900390625, + "loss": 0.073, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -7.645462512969971, + "rewards/margins": 8.707850456237793, + "rewards/rejected": -16.353313446044922, + "sft_loss": 1.1692014932632446, + "step": 645 + }, + { + "epoch": 1.067907995618839, + "grad_norm": 2.533086454702698, + "learning_rate": 2.421812934465696e-07, + "logits/chosen": 20.8750057220459, + "logits/rejected": 18.880239486694336, + "logps/chosen": -341.56805419921875, + "logps/rejected": -238.24147033691406, + "loss": 0.0526, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -7.804737567901611, + "rewards/margins": 7.901425838470459, + "rewards/rejected": -15.706161499023438, + "sft_loss": 1.0994611978530884, + "step": 650 + }, + { + "epoch": 1.0761226725082147, + "grad_norm": 1.8972892397596355, + "learning_rate": 2.3878379241237134e-07, + "logits/chosen": 18.171382904052734, + "logits/rejected": 17.349416732788086, + "logps/chosen": -333.02862548828125, + "logps/rejected": -257.3088073730469, + "loss": 0.0702, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -9.708560943603516, + "rewards/margins": 8.212488174438477, + "rewards/rejected": -17.92104721069336, + "sft_loss": 1.3080743551254272, + "step": 655 + }, + { + "epoch": 1.0843373493975903, + "grad_norm": 7.296587489211492, + "learning_rate": 2.3538836588625077e-07, + "logits/chosen": 16.028032302856445, + "logits/rejected": 15.767348289489746, + "logps/chosen": -302.0737609863281, + "logps/rejected": -251.35279846191406, + "loss": 0.0709, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -9.596309661865234, + "rewards/margins": 8.214980125427246, + "rewards/rejected": -17.811288833618164, + "sft_loss": 1.4402241706848145, + "step": 660 + }, + { + "epoch": 1.0925520262869661, + "grad_norm": 2.7162575555609547, + "learning_rate": 2.3199564187368153e-07, + "logits/chosen": 18.975399017333984, + "logits/rejected": 17.070423126220703, + "logps/chosen": -378.0216369628906, + "logps/rejected": -273.23687744140625, + "loss": 0.0699, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.879487037658691, + "rewards/margins": 8.538008689880371, + "rewards/rejected": -18.41749382019043, + "sft_loss": 1.127976655960083, + "step": 665 + }, + { + "epoch": 1.1007667031763417, + "grad_norm": 10.07225580885619, + "learning_rate": 2.2860624788029013e-07, + "logits/chosen": 17.706756591796875, + "logits/rejected": 17.792144775390625, + "logps/chosen": -291.8415832519531, + "logps/rejected": -239.1862030029297, + "loss": 0.0756, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.226842880249023, + "rewards/margins": 7.920691013336182, + "rewards/rejected": -16.147533416748047, + "sft_loss": 1.1893837451934814, + "step": 670 + }, + { + "epoch": 1.1089813800657173, + "grad_norm": 4.017695151821307, + "learning_rate": 2.2522081079579497e-07, + "logits/chosen": 17.455995559692383, + "logits/rejected": 17.200504302978516, + "logps/chosen": -316.4137878417969, + "logps/rejected": -253.0958709716797, + "loss": 0.0701, + "rewards/accuracies": 0.9466666579246521, + "rewards/chosen": -8.296252250671387, + "rewards/margins": 8.523763656616211, + "rewards/rejected": -16.82001495361328, + "sft_loss": 1.2573199272155762, + "step": 675 + }, + { + "epoch": 1.1171960569550932, + "grad_norm": 2.9229430403917416, + "learning_rate": 2.2183995677805967e-07, + "logits/chosen": 17.359153747558594, + "logits/rejected": 17.15728759765625, + "logps/chosen": -337.8786315917969, + "logps/rejected": -256.97259521484375, + "loss": 0.0499, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.227910041809082, + "rewards/margins": 8.76689624786377, + "rewards/rejected": -16.99480628967285, + "sft_loss": 1.1089237928390503, + "step": 680 + }, + { + "epoch": 1.1254107338444688, + "grad_norm": 3.6239751013897306, + "learning_rate": 2.1846431113728062e-07, + "logits/chosen": 17.55232048034668, + "logits/rejected": 17.560558319091797, + "logps/chosen": -321.5279846191406, + "logps/rejected": -261.1816711425781, + "loss": 0.0621, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -8.55235767364502, + "rewards/margins": 9.196606636047363, + "rewards/rejected": -17.748966217041016, + "sft_loss": 1.1366941928863525, + "step": 685 + }, + { + "epoch": 1.1336254107338444, + "grad_norm": 6.419284533750765, + "learning_rate": 2.1509449822033205e-07, + "logits/chosen": 18.428213119506836, + "logits/rejected": 17.30073356628418, + "logps/chosen": -371.9090576171875, + "logps/rejected": -266.3111572265625, + "loss": 0.0415, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.018980026245117, + "rewards/margins": 8.834003448486328, + "rewards/rejected": -17.852983474731445, + "sft_loss": 1.149424433708191, + "step": 690 + }, + { + "epoch": 1.1418400876232202, + "grad_norm": 6.07442729979061, + "learning_rate": 2.1173114129528957e-07, + "logits/chosen": 17.701704025268555, + "logits/rejected": 17.770456314086914, + "logps/chosen": -307.99798583984375, + "logps/rejected": -250.03758239746094, + "loss": 0.0795, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -9.118383407592773, + "rewards/margins": 8.651429176330566, + "rewards/rejected": -17.769811630249023, + "sft_loss": 1.3337371349334717, + "step": 695 + }, + { + "epoch": 1.1500547645125958, + "grad_norm": 3.7073147975104885, + "learning_rate": 2.0837486243615226e-07, + "logits/chosen": 19.352502822875977, + "logits/rejected": 17.753198623657227, + "logps/chosen": -390.12890625, + "logps/rejected": -298.8519592285156, + "loss": 0.0802, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -10.813799858093262, + "rewards/margins": 9.796292304992676, + "rewards/rejected": -20.610090255737305, + "sft_loss": 1.080621600151062, + "step": 700 + }, + { + "epoch": 1.1582694414019716, + "grad_norm": 3.173868712663281, + "learning_rate": 2.0502628240778653e-07, + "logits/chosen": 18.959392547607422, + "logits/rejected": 19.250455856323242, + "logps/chosen": -345.37005615234375, + "logps/rejected": -278.861328125, + "loss": 0.073, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.555724143981934, + "rewards/margins": 9.474061012268066, + "rewards/rejected": -19.02978515625, + "sft_loss": 1.0641828775405884, + "step": 705 + }, + { + "epoch": 1.1664841182913472, + "grad_norm": 5.00787760201323, + "learning_rate": 2.0168602055111173e-07, + "logits/chosen": 18.32485008239746, + "logits/rejected": 17.315608978271484, + "logps/chosen": -337.5779724121094, + "logps/rejected": -283.7364196777344, + "loss": 0.043, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.335200309753418, + "rewards/margins": 10.991543769836426, + "rewards/rejected": -20.32674217224121, + "sft_loss": 1.1922962665557861, + "step": 710 + }, + { + "epoch": 1.1746987951807228, + "grad_norm": 4.0968602022146845, + "learning_rate": 1.9835469466854887e-07, + "logits/chosen": 18.45452880859375, + "logits/rejected": 16.768016815185547, + "logps/chosen": -344.7873840332031, + "logps/rejected": -259.2659912109375, + "loss": 0.0523, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -9.04322624206543, + "rewards/margins": 8.293071746826172, + "rewards/rejected": -17.336299896240234, + "sft_loss": 1.140812635421753, + "step": 715 + }, + { + "epoch": 1.1829134720700987, + "grad_norm": 4.912796552705337, + "learning_rate": 1.9503292090975454e-07, + "logits/chosen": 19.511587142944336, + "logits/rejected": 18.789897918701172, + "logps/chosen": -267.87982177734375, + "logps/rejected": -212.76730346679688, + "loss": 0.069, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -7.6522440910339355, + "rewards/margins": 6.9143452644348145, + "rewards/rejected": -14.56658935546875, + "sft_loss": 1.1043713092803955, + "step": 720 + }, + { + "epoch": 1.1911281489594743, + "grad_norm": 3.7513682612693287, + "learning_rate": 1.917213136576602e-07, + "logits/chosen": 19.222810745239258, + "logits/rejected": 18.952795028686523, + "logps/chosen": -317.2191162109375, + "logps/rejected": -242.29966735839844, + "loss": 0.0429, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -7.920867919921875, + "rewards/margins": 8.166583061218262, + "rewards/rejected": -16.08745002746582, + "sft_loss": 1.0877418518066406, + "step": 725 + }, + { + "epoch": 1.1993428258488499, + "grad_norm": 5.446402649753336, + "learning_rate": 1.8842048541483756e-07, + "logits/chosen": 19.713359832763672, + "logits/rejected": 18.361555099487305, + "logps/chosen": -319.3419189453125, + "logps/rejected": -238.31849670410156, + "loss": 0.0688, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.05375862121582, + "rewards/margins": 7.3250813484191895, + "rewards/rejected": -16.378841400146484, + "sft_loss": 1.220839023590088, + "step": 730 + }, + { + "epoch": 1.2075575027382257, + "grad_norm": 3.597263962070461, + "learning_rate": 1.8513104669021314e-07, + "logits/chosen": 18.05857276916504, + "logits/rejected": 17.213830947875977, + "logps/chosen": -345.5421142578125, + "logps/rejected": -268.8918151855469, + "loss": 0.069, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.973356246948242, + "rewards/margins": 8.510116577148438, + "rewards/rejected": -18.48347282409668, + "sft_loss": 1.1453214883804321, + "step": 735 + }, + { + "epoch": 1.2157721796276013, + "grad_norm": 9.406473544966678, + "learning_rate": 1.8185360588615057e-07, + "logits/chosen": 19.28816795349121, + "logits/rejected": 17.978492736816406, + "logps/chosen": -365.7281799316406, + "logps/rejected": -270.6371154785156, + "loss": 0.0678, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -9.327936172485352, + "rewards/margins": 8.730137825012207, + "rewards/rejected": -18.058074951171875, + "sft_loss": 1.1658827066421509, + "step": 740 + }, + { + "epoch": 1.223986856516977, + "grad_norm": 3.7662686205808584, + "learning_rate": 1.7858876918592232e-07, + "logits/chosen": 17.326171875, + "logits/rejected": 17.195329666137695, + "logps/chosen": -301.6247863769531, + "logps/rejected": -246.91964721679688, + "loss": 0.0644, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -8.8589506149292, + "rewards/margins": 8.645825386047363, + "rewards/rejected": -17.50477409362793, + "sft_loss": 1.1279815435409546, + "step": 745 + }, + { + "epoch": 1.2322015334063527, + "grad_norm": 3.8887147488856986, + "learning_rate": 1.7533714044159299e-07, + "logits/chosen": 18.013948440551758, + "logits/rejected": 16.35959243774414, + "logps/chosen": -331.99609375, + "logps/rejected": -249.0239715576172, + "loss": 0.0704, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -9.633424758911133, + "rewards/margins": 7.326272487640381, + "rewards/rejected": -16.95969581604004, + "sft_loss": 1.6165919303894043, + "step": 750 + }, + { + "epoch": 1.2404162102957283, + "grad_norm": 2.5663067455515747, + "learning_rate": 1.7209932106233264e-07, + "logits/chosen": 16.976768493652344, + "logits/rejected": 17.75543785095215, + "logps/chosen": -336.4115905761719, + "logps/rejected": -277.32012939453125, + "loss": 0.0531, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.428933143615723, + "rewards/margins": 9.152985572814941, + "rewards/rejected": -18.58191680908203, + "sft_loss": 1.1206488609313965, + "step": 755 + }, + { + "epoch": 1.248630887185104, + "grad_norm": 7.904031033651109, + "learning_rate": 1.688759099031824e-07, + "logits/chosen": 17.9715633392334, + "logits/rejected": 16.856943130493164, + "logps/chosen": -358.84051513671875, + "logps/rejected": -278.71514892578125, + "loss": 0.0624, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -9.98255729675293, + "rewards/margins": 9.00979995727539, + "rewards/rejected": -18.99235725402832, + "sft_loss": 1.155042290687561, + "step": 760 + }, + { + "epoch": 1.2568455640744798, + "grad_norm": 4.102166977987274, + "learning_rate": 1.656675031542925e-07, + "logits/chosen": 19.007150650024414, + "logits/rejected": 18.558509826660156, + "logps/chosen": -375.6373291015625, + "logps/rejected": -275.5815734863281, + "loss": 0.0407, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.460150718688965, + "rewards/margins": 9.107392311096191, + "rewards/rejected": -18.567543029785156, + "sft_loss": 1.1842076778411865, + "step": 765 + }, + { + "epoch": 1.2650602409638554, + "grad_norm": 6.075552965079597, + "learning_rate": 1.6247469423065343e-07, + "logits/chosen": 18.225561141967773, + "logits/rejected": 16.852903366088867, + "logps/chosen": -316.5226135253906, + "logps/rejected": -239.98927307128906, + "loss": 0.0567, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.122289657592773, + "rewards/margins": 7.42488431930542, + "rewards/rejected": -16.54717445373535, + "sft_loss": 1.2285444736480713, + "step": 770 + }, + { + "epoch": 1.273274917853231, + "grad_norm": 1.6786737917411083, + "learning_rate": 1.5929807366233977e-07, + "logits/chosen": 18.135303497314453, + "logits/rejected": 16.776866912841797, + "logps/chosen": -388.01239013671875, + "logps/rejected": -284.525390625, + "loss": 0.0526, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.297353744506836, + "rewards/margins": 9.676100730895996, + "rewards/rejected": -18.97345542907715, + "sft_loss": 1.1469649076461792, + "step": 775 + }, + { + "epoch": 1.2814895947426068, + "grad_norm": 2.6753220341135213, + "learning_rate": 1.5613822898528794e-07, + "logits/chosen": 17.79352378845215, + "logits/rejected": 17.512025833129883, + "logps/chosen": -331.31829833984375, + "logps/rejected": -265.259765625, + "loss": 0.0508, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -9.44349479675293, + "rewards/margins": 8.841734886169434, + "rewards/rejected": -18.285228729248047, + "sft_loss": 1.3215175867080688, + "step": 780 + }, + { + "epoch": 1.2897042716319824, + "grad_norm": 8.768934877423389, + "learning_rate": 1.5299574463262794e-07, + "logits/chosen": 17.9199275970459, + "logits/rejected": 17.427675247192383, + "logps/chosen": -371.37432861328125, + "logps/rejected": -277.9888916015625, + "loss": 0.0713, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -8.927423477172852, + "rewards/margins": 9.477466583251953, + "rewards/rejected": -18.404890060424805, + "sft_loss": 1.1020786762237549, + "step": 785 + }, + { + "epoch": 1.297918948521358, + "grad_norm": 3.060484827435086, + "learning_rate": 1.4987120182658877e-07, + "logits/chosen": 18.74033546447754, + "logits/rejected": 19.323453903198242, + "logps/chosen": -322.2007141113281, + "logps/rejected": -249.55172729492188, + "loss": 0.0521, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -7.6707940101623535, + "rewards/margins": 8.992262840270996, + "rewards/rejected": -16.663057327270508, + "sft_loss": 1.0123049020767212, + "step": 790 + }, + { + "epoch": 1.3061336254107339, + "grad_norm": 2.7470931382147814, + "learning_rate": 1.4676517847099745e-07, + "logits/chosen": 19.513704299926758, + "logits/rejected": 19.02146339416504, + "logps/chosen": -285.7082214355469, + "logps/rejected": -215.02732849121094, + "loss": 0.0717, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -6.395638465881348, + "rewards/margins": 7.340782642364502, + "rewards/rejected": -13.736421585083008, + "sft_loss": 0.9653832316398621, + "step": 795 + }, + { + "epoch": 1.3143483023001095, + "grad_norm": 6.310958305446118, + "learning_rate": 1.4367824904439242e-07, + "logits/chosen": 20.603116989135742, + "logits/rejected": 19.180021286010742, + "logps/chosen": -326.3525085449219, + "logps/rejected": -239.44825744628906, + "loss": 0.0686, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.47385311126709, + "rewards/margins": 7.884840488433838, + "rewards/rejected": -15.358694076538086, + "sft_loss": 0.995606005191803, + "step": 800 + }, + { + "epoch": 1.3225629791894853, + "grad_norm": 5.691699841334073, + "learning_rate": 1.4061098449376985e-07, + "logits/chosen": 19.0198974609375, + "logits/rejected": 17.974164962768555, + "logps/chosen": -380.7705383300781, + "logps/rejected": -275.5369873046875, + "loss": 0.0558, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -8.497329711914062, + "rewards/margins": 9.07253646850586, + "rewards/rejected": -17.569866180419922, + "sft_loss": 1.1454724073410034, + "step": 805 + }, + { + "epoch": 1.330777656078861, + "grad_norm": 3.2559120936424177, + "learning_rate": 1.375639521289836e-07, + "logits/chosen": 18.49973487854004, + "logits/rejected": 16.317462921142578, + "logps/chosen": -342.45751953125, + "logps/rejected": -251.50741577148438, + "loss": 0.0641, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -8.949553489685059, + "rewards/margins": 7.948643684387207, + "rewards/rejected": -16.8981990814209, + "sft_loss": 1.1290977001190186, + "step": 810 + }, + { + "epoch": 1.3389923329682367, + "grad_norm": 5.544530645757365, + "learning_rate": 1.3453771551781756e-07, + "logits/chosen": 16.78171157836914, + "logits/rejected": 16.938737869262695, + "logps/chosen": -313.18890380859375, + "logps/rejected": -255.20042419433594, + "loss": 0.0633, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.961526870727539, + "rewards/margins": 8.269176483154297, + "rewards/rejected": -17.23070526123047, + "sft_loss": 1.2107350826263428, + "step": 815 + }, + { + "epoch": 1.3472070098576123, + "grad_norm": 6.724733185846292, + "learning_rate": 1.3153283438175034e-07, + "logits/chosen": 16.646345138549805, + "logits/rejected": 15.98381519317627, + "logps/chosen": -331.96099853515625, + "logps/rejected": -259.468017578125, + "loss": 0.0652, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.80189037322998, + "rewards/margins": 8.555667877197266, + "rewards/rejected": -18.357561111450195, + "sft_loss": 1.1712498664855957, + "step": 820 + }, + { + "epoch": 1.355421686746988, + "grad_norm": 3.2889655668134083, + "learning_rate": 1.2854986449243124e-07, + "logits/chosen": 17.65423011779785, + "logits/rejected": 17.50803565979004, + "logps/chosen": -319.4832458496094, + "logps/rejected": -263.18182373046875, + "loss": 0.0445, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.653801918029785, + "rewards/margins": 8.747140884399414, + "rewards/rejected": -18.400943756103516, + "sft_loss": 1.077059268951416, + "step": 825 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 3.0357613310647613, + "learning_rate": 1.2558935756888675e-07, + "logits/chosen": 16.95936393737793, + "logits/rejected": 16.559120178222656, + "logps/chosen": -322.41357421875, + "logps/rejected": -263.3858947753906, + "loss": 0.0624, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.595723152160645, + "rewards/margins": 9.040340423583984, + "rewards/rejected": -18.636064529418945, + "sft_loss": 1.1398062705993652, + "step": 830 + }, + { + "epoch": 1.3718510405257394, + "grad_norm": 2.8650653830144184, + "learning_rate": 1.226518611754767e-07, + "logits/chosen": 19.48563003540039, + "logits/rejected": 18.318378448486328, + "logps/chosen": -321.63446044921875, + "logps/rejected": -262.0445861816406, + "loss": 0.0482, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.1325101852417, + "rewards/margins": 8.926721572875977, + "rewards/rejected": -18.05923080444336, + "sft_loss": 1.1335971355438232, + "step": 835 + }, + { + "epoch": 1.380065717415115, + "grad_norm": 6.425860846095936, + "learning_rate": 1.1973791862061871e-07, + "logits/chosen": 17.781869888305664, + "logits/rejected": 16.827882766723633, + "logps/chosen": -339.0770568847656, + "logps/rejected": -266.6395568847656, + "loss": 0.0703, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -8.701363563537598, + "rewards/margins": 9.394256591796875, + "rewards/rejected": -18.095619201660156, + "sft_loss": 1.0754181146621704, + "step": 840 + }, + { + "epoch": 1.3882803943044908, + "grad_norm": 2.9739559158072484, + "learning_rate": 1.1684806885630003e-07, + "logits/chosen": 18.772544860839844, + "logits/rejected": 18.669767379760742, + "logps/chosen": -341.22998046875, + "logps/rejected": -270.3790283203125, + "loss": 0.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.742754936218262, + "rewards/margins": 9.263623237609863, + "rewards/rejected": -18.006380081176758, + "sft_loss": 1.1030330657958984, + "step": 845 + }, + { + "epoch": 1.3964950711938664, + "grad_norm": 3.70632539902967, + "learning_rate": 1.1398284637839486e-07, + "logits/chosen": 19.672170639038086, + "logits/rejected": 18.065275192260742, + "logps/chosen": -303.6903076171875, + "logps/rejected": -230.23797607421875, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.658843040466309, + "rewards/margins": 7.194061756134033, + "rewards/rejected": -15.85290241241455, + "sft_loss": 1.3414223194122314, + "step": 850 + }, + { + "epoch": 1.404709748083242, + "grad_norm": 3.8454291780017145, + "learning_rate": 1.1114278112780601e-07, + "logits/chosen": 19.258955001831055, + "logits/rejected": 17.727506637573242, + "logps/chosen": -387.6003112792969, + "logps/rejected": -285.1304016113281, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.63217830657959, + "rewards/margins": 9.642850875854492, + "rewards/rejected": -19.27503204345703, + "sft_loss": 1.0949971675872803, + "step": 855 + }, + { + "epoch": 1.4129244249726178, + "grad_norm": 81.05899714514051, + "learning_rate": 1.08328398392449e-07, + "logits/chosen": 18.748620986938477, + "logits/rejected": 17.344341278076172, + "logps/chosen": -355.6446533203125, + "logps/rejected": -269.2040710449219, + "loss": 0.0629, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.797952651977539, + "rewards/margins": 8.548418045043945, + "rewards/rejected": -18.346372604370117, + "sft_loss": 1.1425796747207642, + "step": 860 + }, + { + "epoch": 1.4211391018619934, + "grad_norm": 4.719466773161234, + "learning_rate": 1.0554021871009677e-07, + "logits/chosen": 18.971662521362305, + "logits/rejected": 16.849485397338867, + "logps/chosen": -342.5481872558594, + "logps/rejected": -268.31475830078125, + "loss": 0.0464, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -9.3140230178833, + "rewards/margins": 9.42198657989502, + "rewards/rejected": -18.73600959777832, + "sft_loss": 1.3192195892333984, + "step": 865 + }, + { + "epoch": 1.429353778751369, + "grad_norm": 3.6243101382632688, + "learning_rate": 1.0277875777210299e-07, + "logits/chosen": 16.66229820251465, + "logits/rejected": 15.501757621765137, + "logps/chosen": -332.5435791015625, + "logps/rejected": -261.5600280761719, + "loss": 0.0537, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.856383323669434, + "rewards/margins": 8.77900218963623, + "rewards/rejected": -18.635387420654297, + "sft_loss": 1.3038902282714844, + "step": 870 + }, + { + "epoch": 1.4375684556407449, + "grad_norm": 2.7537435903936074, + "learning_rate": 1.0004452632802158e-07, + "logits/chosen": 18.529787063598633, + "logits/rejected": 17.4042911529541, + "logps/chosen": -337.29791259765625, + "logps/rejected": -277.3015441894531, + "loss": 0.0514, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.470120429992676, + "rewards/margins": 9.908967971801758, + "rewards/rejected": -19.379091262817383, + "sft_loss": 1.2056940793991089, + "step": 875 + }, + { + "epoch": 1.4457831325301205, + "grad_norm": 5.683629925320703, + "learning_rate": 9.733803009114044e-08, + "logits/chosen": 18.083925247192383, + "logits/rejected": 16.712495803833008, + "logps/chosen": -335.4129943847656, + "logps/rejected": -262.9490966796875, + "loss": 0.0474, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.186826705932617, + "rewards/margins": 8.34049129486084, + "rewards/rejected": -17.527315139770508, + "sft_loss": 1.1717486381530762, + "step": 880 + }, + { + "epoch": 1.453997809419496, + "grad_norm": 3.283485850050913, + "learning_rate": 9.465976964494682e-08, + "logits/chosen": 17.851030349731445, + "logits/rejected": 17.46946144104004, + "logps/chosen": -280.1820373535156, + "logps/rejected": -242.58209228515625, + "loss": 0.0529, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -8.64423656463623, + "rewards/margins": 8.354222297668457, + "rewards/rejected": -16.998458862304688, + "sft_loss": 1.281132459640503, + "step": 885 + }, + { + "epoch": 1.462212486308872, + "grad_norm": 2.4606306801838347, + "learning_rate": 9.201024035054053e-08, + "logits/chosen": 18.96923065185547, + "logits/rejected": 17.189115524291992, + "logps/chosen": -295.1808166503906, + "logps/rejected": -223.66261291503906, + "loss": 0.0605, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -8.9287748336792, + "rewards/margins": 7.127224445343018, + "rewards/rejected": -16.055997848510742, + "sft_loss": 1.340820074081421, + "step": 890 + }, + { + "epoch": 1.4704271631982475, + "grad_norm": 2.471728869379881, + "learning_rate": 8.938993225501495e-08, + "logits/chosen": 19.450706481933594, + "logits/rejected": 18.156089782714844, + "logps/chosen": -343.1815490722656, + "logps/rejected": -268.55767822265625, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.863726615905762, + "rewards/margins": 9.138747215270996, + "rewards/rejected": -18.002471923828125, + "sft_loss": 1.042392611503601, + "step": 895 + }, + { + "epoch": 1.4786418400876231, + "grad_norm": 2.702831742745651, + "learning_rate": 8.679933000081879e-08, + "logits/chosen": 18.009992599487305, + "logits/rejected": 17.068376541137695, + "logps/chosen": -340.9268798828125, + "logps/rejected": -244.87017822265625, + "loss": 0.0603, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -8.484542846679688, + "rewards/margins": 8.053364753723145, + "rewards/rejected": -16.53791046142578, + "sft_loss": 1.1751140356063843, + "step": 900 + }, + { + "epoch": 1.486856516976999, + "grad_norm": 4.0381849618513135, + "learning_rate": 8.423891273611855e-08, + "logits/chosen": 16.985027313232422, + "logits/rejected": 16.37629508972168, + "logps/chosen": -300.3452453613281, + "logps/rejected": -240.13729858398438, + "loss": 0.0587, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -8.195282936096191, + "rewards/margins": 8.371771812438965, + "rewards/rejected": -16.567054748535156, + "sft_loss": 1.1615040302276611, + "step": 905 + }, + { + "epoch": 1.4950711938663745, + "grad_norm": 3.373858379077247, + "learning_rate": 8.170915402617739e-08, + "logits/chosen": 17.579252243041992, + "logits/rejected": 16.743297576904297, + "logps/chosen": -347.9422912597656, + "logps/rejected": -262.06610107421875, + "loss": 0.0669, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.031808853149414, + "rewards/margins": 8.512653350830078, + "rewards/rejected": -17.544462203979492, + "sft_loss": 1.1611533164978027, + "step": 910 + }, + { + "epoch": 1.5032858707557502, + "grad_norm": 1.8601145872189842, + "learning_rate": 7.921052176576643e-08, + "logits/chosen": 17.42083740234375, + "logits/rejected": 17.060815811157227, + "logps/chosen": -306.4146728515625, + "logps/rejected": -253.76126098632812, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.15539264678955, + "rewards/margins": 8.407408714294434, + "rewards/rejected": -17.562801361083984, + "sft_loss": 1.0932406187057495, + "step": 915 + }, + { + "epoch": 1.511500547645126, + "grad_norm": 4.848816782812418, + "learning_rate": 7.674347809262377e-08, + "logits/chosen": 17.446378707885742, + "logits/rejected": 17.68989372253418, + "logps/chosen": -295.2680969238281, + "logps/rejected": -246.88856506347656, + "loss": 0.0616, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -8.591629028320312, + "rewards/margins": 8.282797813415527, + "rewards/rejected": -16.874427795410156, + "sft_loss": 1.1592586040496826, + "step": 920 + }, + { + "epoch": 1.5197152245345018, + "grad_norm": 3.1171315520626317, + "learning_rate": 7.430847930198009e-08, + "logits/chosen": 18.438880920410156, + "logits/rejected": 16.9648494720459, + "logps/chosen": -359.6697082519531, + "logps/rejected": -265.92462158203125, + "loss": 0.0599, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.22867202758789, + "rewards/margins": 9.071920394897461, + "rewards/rejected": -18.300594329833984, + "sft_loss": 1.3760299682617188, + "step": 925 + }, + { + "epoch": 1.5279299014238772, + "grad_norm": 2.946496194046417, + "learning_rate": 7.190597576216384e-08, + "logits/chosen": 17.264368057250977, + "logits/rejected": 17.493934631347656, + "logps/chosen": -341.4744873046875, + "logps/rejected": -280.3497619628906, + "loss": 0.0357, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.569901466369629, + "rewards/margins": 9.21591854095459, + "rewards/rejected": -18.785818099975586, + "sft_loss": 1.1669524908065796, + "step": 930 + }, + { + "epoch": 1.536144578313253, + "grad_norm": 4.598015503663298, + "learning_rate": 6.953641183130224e-08, + "logits/chosen": 18.845075607299805, + "logits/rejected": 16.610898971557617, + "logps/chosen": -364.0211181640625, + "logps/rejected": -261.7635803222656, + "loss": 0.055, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.992565155029297, + "rewards/margins": 8.313262939453125, + "rewards/rejected": -18.305830001831055, + "sft_loss": 1.251989722251892, + "step": 935 + }, + { + "epoch": 1.5443592552026288, + "grad_norm": 3.1309871505463733, + "learning_rate": 6.720022577513507e-08, + "logits/chosen": 17.22759246826172, + "logits/rejected": 15.910391807556152, + "logps/chosen": -371.6169738769531, + "logps/rejected": -277.5339050292969, + "loss": 0.0495, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -10.53276538848877, + "rewards/margins": 8.91848087310791, + "rewards/rejected": -19.451250076293945, + "sft_loss": 1.2916030883789062, + "step": 940 + }, + { + "epoch": 1.5525739320920042, + "grad_norm": 6.107763356196428, + "learning_rate": 6.489784968595444e-08, + "logits/chosen": 18.367130279541016, + "logits/rejected": 16.745647430419922, + "logps/chosen": -372.9920654296875, + "logps/rejected": -294.3865966796875, + "loss": 0.0797, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -10.943706512451172, + "rewards/margins": 10.03380298614502, + "rewards/rejected": -20.977510452270508, + "sft_loss": 1.2319529056549072, + "step": 945 + }, + { + "epoch": 1.56078860898138, + "grad_norm": 5.019635803171075, + "learning_rate": 6.262970940268652e-08, + "logits/chosen": 17.80394172668457, + "logits/rejected": 17.284276962280273, + "logps/chosen": -319.9245910644531, + "logps/rejected": -272.44287109375, + "loss": 0.0502, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -10.027565956115723, + "rewards/margins": 9.023143768310547, + "rewards/rejected": -19.05070686340332, + "sft_loss": 1.1501847505569458, + "step": 950 + }, + { + "epoch": 1.5690032858707559, + "grad_norm": 2.5718093284663017, + "learning_rate": 6.039622443213008e-08, + "logits/chosen": 17.769981384277344, + "logits/rejected": 17.192481994628906, + "logps/chosen": -338.2123718261719, + "logps/rejected": -270.8006286621094, + "loss": 0.0455, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -10.453187942504883, + "rewards/margins": 8.49782943725586, + "rewards/rejected": -18.95101547241211, + "sft_loss": 1.23969304561615, + "step": 955 + }, + { + "epoch": 1.5772179627601315, + "grad_norm": 2.7871237377104685, + "learning_rate": 5.8197807871366e-08, + "logits/chosen": 16.671855926513672, + "logits/rejected": 15.88489818572998, + "logps/chosen": -383.57757568359375, + "logps/rejected": -302.80078125, + "loss": 0.0443, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.86944580078125, + "rewards/margins": 9.499520301818848, + "rewards/rejected": -20.368961334228516, + "sft_loss": 2.283703565597534, + "step": 960 + }, + { + "epoch": 1.585432639649507, + "grad_norm": 8.539385336051494, + "learning_rate": 5.6034866331352376e-08, + "logits/chosen": 17.10059928894043, + "logits/rejected": 15.988658905029297, + "logps/chosen": -328.95880126953125, + "logps/rejected": -276.1900939941406, + "loss": 0.0789, + "rewards/accuracies": 0.9333333969116211, + "rewards/chosen": -11.029694557189941, + "rewards/margins": 9.335869789123535, + "rewards/rejected": -20.365564346313477, + "sft_loss": 1.1840242147445679, + "step": 965 + }, + { + "epoch": 1.593647316538883, + "grad_norm": 4.957273671741837, + "learning_rate": 5.390779986171934e-08, + "logits/chosen": 18.002132415771484, + "logits/rejected": 16.781938552856445, + "logps/chosen": -365.2828674316406, + "logps/rejected": -290.2236328125, + "loss": 0.0487, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -10.259432792663574, + "rewards/margins": 9.27906322479248, + "rewards/rejected": -19.538496017456055, + "sft_loss": 1.1914342641830444, + "step": 970 + }, + { + "epoch": 1.6018619934282585, + "grad_norm": 4.9428875327513655, + "learning_rate": 5.1817001876777314e-08, + "logits/chosen": 16.411977767944336, + "logits/rejected": 16.428050994873047, + "logps/chosen": -325.99688720703125, + "logps/rejected": -282.6103515625, + "loss": 0.0437, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.880454063415527, + "rewards/margins": 9.455738067626953, + "rewards/rejected": -19.336191177368164, + "sft_loss": 1.2319772243499756, + "step": 975 + }, + { + "epoch": 1.6100766703176341, + "grad_norm": 2.957939197451581, + "learning_rate": 4.9762859082752464e-08, + "logits/chosen": 18.82546043395996, + "logits/rejected": 17.623178482055664, + "logps/chosen": -363.7794494628906, + "logps/rejected": -286.4556884765625, + "loss": 0.0679, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -10.052456855773926, + "rewards/margins": 9.68246841430664, + "rewards/rejected": -19.73492431640625, + "sft_loss": 1.0845558643341064, + "step": 980 + }, + { + "epoch": 1.61829134720701, + "grad_norm": 4.380929225854475, + "learning_rate": 4.774575140626316e-08, + "logits/chosen": 16.805063247680664, + "logits/rejected": 16.495389938354492, + "logps/chosen": -326.19549560546875, + "logps/rejected": -266.20477294921875, + "loss": 0.0416, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.56114673614502, + "rewards/margins": 9.003406524658203, + "rewards/rejected": -18.56455421447754, + "sft_loss": 1.1449401378631592, + "step": 985 + }, + { + "epoch": 1.6265060240963856, + "grad_norm": 3.1682867458550255, + "learning_rate": 4.5766051924049975e-08, + "logits/chosen": 19.578977584838867, + "logits/rejected": 18.58966064453125, + "logps/chosen": -329.7391357421875, + "logps/rejected": -268.65899658203125, + "loss": 0.078, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.10643482208252, + "rewards/margins": 9.36843204498291, + "rewards/rejected": -18.474864959716797, + "sft_loss": 1.255157470703125, + "step": 990 + }, + { + "epoch": 1.6347207009857612, + "grad_norm": 0.6942420370445791, + "learning_rate": 4.3824126793972934e-08, + "logits/chosen": 17.492353439331055, + "logits/rejected": 16.303081512451172, + "logps/chosen": -363.3464050292969, + "logps/rejected": -277.27960205078125, + "loss": 0.0653, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.666760444641113, + "rewards/margins": 10.318286895751953, + "rewards/rejected": -18.98504638671875, + "sft_loss": 1.1402015686035156, + "step": 995 + }, + { + "epoch": 1.642935377875137, + "grad_norm": 3.2190001250289964, + "learning_rate": 4.192033518728819e-08, + "logits/chosen": 17.813274383544922, + "logits/rejected": 16.33589744567871, + "logps/chosen": -336.6969909667969, + "logps/rejected": -259.0225830078125, + "loss": 0.0455, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -8.958456993103027, + "rewards/margins": 8.632108688354492, + "rewards/rejected": -17.590564727783203, + "sft_loss": 1.3451780080795288, + "step": 1000 + }, + { + "epoch": 1.6511500547645126, + "grad_norm": 7.393954413499783, + "learning_rate": 4.0055029222217125e-08, + "logits/chosen": 17.960172653198242, + "logits/rejected": 16.416893005371094, + "logps/chosen": -315.2778015136719, + "logps/rejected": -254.21824645996094, + "loss": 0.0528, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.643136978149414, + "rewards/margins": 8.79008960723877, + "rewards/rejected": -18.4332275390625, + "sft_loss": 1.1033631563186646, + "step": 1005 + }, + { + "epoch": 1.6593647316538882, + "grad_norm": 4.50975910962959, + "learning_rate": 3.8228553898819904e-08, + "logits/chosen": 19.406719207763672, + "logits/rejected": 18.56427001953125, + "logps/chosen": -345.2118835449219, + "logps/rejected": -286.46697998046875, + "loss": 0.0704, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -10.453201293945312, + "rewards/margins": 9.429792404174805, + "rewards/rejected": -19.882991790771484, + "sft_loss": 1.1480904817581177, + "step": 1010 + }, + { + "epoch": 1.667579408543264, + "grad_norm": 2.421328969282485, + "learning_rate": 3.6441247035185416e-08, + "logits/chosen": 18.4009952545166, + "logits/rejected": 17.309907913208008, + "logps/chosen": -386.3546447753906, + "logps/rejected": -297.7645568847656, + "loss": 0.0365, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.208230972290039, + "rewards/margins": 10.017862319946289, + "rewards/rejected": -20.22609519958496, + "sft_loss": 1.1494494676589966, + "step": 1015 + }, + { + "epoch": 1.6757940854326396, + "grad_norm": 3.7694600773057583, + "learning_rate": 3.4693439204949855e-08, + "logits/chosen": 17.161108016967773, + "logits/rejected": 16.785768508911133, + "logps/chosen": -296.5499267578125, + "logps/rejected": -256.6122741699219, + "loss": 0.0555, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.63855266571045, + "rewards/margins": 8.829416275024414, + "rewards/rejected": -18.467967987060547, + "sft_loss": 1.2135652303695679, + "step": 1020 + }, + { + "epoch": 1.6840087623220152, + "grad_norm": 7.949500193131256, + "learning_rate": 3.298545367615493e-08, + "logits/chosen": 18.552576065063477, + "logits/rejected": 17.31821060180664, + "logps/chosen": -295.35540771484375, + "logps/rejected": -242.86293029785156, + "loss": 0.0854, + "rewards/accuracies": 0.9466666579246521, + "rewards/chosen": -9.394362449645996, + "rewards/margins": 7.929934501647949, + "rewards/rejected": -17.324296951293945, + "sft_loss": 1.2392216920852661, + "step": 1025 + }, + { + "epoch": 1.692223439211391, + "grad_norm": 5.638532817532561, + "learning_rate": 3.13176063514575e-08, + "logits/chosen": 18.253822326660156, + "logits/rejected": 17.51996421813965, + "logps/chosen": -363.8645935058594, + "logps/rejected": -290.9195556640625, + "loss": 0.0719, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.809922218322754, + "rewards/margins": 10.452852249145508, + "rewards/rejected": -20.26277732849121, + "sft_loss": 1.302940845489502, + "step": 1030 + }, + { + "epoch": 1.7004381161007667, + "grad_norm": 4.692786337225903, + "learning_rate": 2.96902057097011e-08, + "logits/chosen": 18.16409683227539, + "logits/rejected": 17.41961669921875, + "logps/chosen": -346.486328125, + "logps/rejected": -269.6810607910156, + "loss": 0.0603, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.241110801696777, + "rewards/margins": 9.675481796264648, + "rewards/rejected": -18.916593551635742, + "sft_loss": 1.3590866327285767, + "step": 1035 + }, + { + "epoch": 1.7086527929901423, + "grad_norm": 5.302948433169166, + "learning_rate": 2.8103552748861475e-08, + "logits/chosen": 17.655193328857422, + "logits/rejected": 16.845975875854492, + "logps/chosen": -338.5050354003906, + "logps/rejected": -274.38592529296875, + "loss": 0.0456, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.701102256774902, + "rewards/margins": 9.212477684020996, + "rewards/rejected": -18.9135799407959, + "sft_loss": 1.1541944742202759, + "step": 1040 + }, + { + "epoch": 1.716867469879518, + "grad_norm": 7.958361212469992, + "learning_rate": 2.65579409303745e-08, + "logits/chosen": 18.777873992919922, + "logits/rejected": 17.120763778686523, + "logps/chosen": -387.65240478515625, + "logps/rejected": -283.70098876953125, + "loss": 0.0673, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.420891761779785, + "rewards/margins": 10.373385429382324, + "rewards/rejected": -19.79427719116211, + "sft_loss": 1.1774132251739502, + "step": 1045 + }, + { + "epoch": 1.7250821467688937, + "grad_norm": 9.531573914482877, + "learning_rate": 2.505365612485874e-08, + "logits/chosen": 17.264955520629883, + "logits/rejected": 14.986218452453613, + "logps/chosen": -337.1243896484375, + "logps/rejected": -248.45452880859375, + "loss": 0.0714, + "rewards/accuracies": 0.9333333373069763, + "rewards/chosen": -9.543818473815918, + "rewards/margins": 7.988076686859131, + "rewards/rejected": -17.53189468383789, + "sft_loss": 1.5333364009857178, + "step": 1050 + }, + { + "epoch": 1.7332968236582693, + "grad_norm": 4.137336908358683, + "learning_rate": 2.3590976559242275e-08, + "logits/chosen": 17.53969383239746, + "logits/rejected": 17.553749084472656, + "logps/chosen": -310.6015930175781, + "logps/rejected": -270.8495178222656, + "loss": 0.0577, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -9.537567138671875, + "rewards/margins": 8.755559921264648, + "rewards/rejected": -18.29313087463379, + "sft_loss": 1.243970513343811, + "step": 1055 + }, + { + "epoch": 1.7415115005476451, + "grad_norm": 4.1742977406410295, + "learning_rate": 2.21701727653025e-08, + "logits/chosen": 17.685638427734375, + "logits/rejected": 15.837899208068848, + "logps/chosen": -373.8312683105469, + "logps/rejected": -280.65020751953125, + "loss": 0.0635, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -10.465250968933105, + "rewards/margins": 9.272075653076172, + "rewards/rejected": -19.73732566833496, + "sft_loss": 1.230289101600647, + "step": 1060 + }, + { + "epoch": 1.749726177437021, + "grad_norm": 3.5905659893317434, + "learning_rate": 2.0791507529629522e-08, + "logits/chosen": 17.48339080810547, + "logits/rejected": 17.49197006225586, + "logps/chosen": -282.5596008300781, + "logps/rejected": -230.78216552734375, + "loss": 0.0617, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -8.903531074523926, + "rewards/margins": 7.071009635925293, + "rewards/rejected": -15.974542617797852, + "sft_loss": 1.1227587461471558, + "step": 1065 + }, + { + "epoch": 1.7579408543263964, + "grad_norm": 13.383809225376258, + "learning_rate": 1.945523584502262e-08, + "logits/chosen": 19.799388885498047, + "logits/rejected": 17.97065544128418, + "logps/chosen": -394.8106384277344, + "logps/rejected": -280.33184814453125, + "loss": 0.0572, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.54643726348877, + "rewards/margins": 9.38154125213623, + "rewards/rejected": -18.927978515625, + "sft_loss": 1.0600664615631104, + "step": 1070 + }, + { + "epoch": 1.7661555312157722, + "grad_norm": 3.85892797302041, + "learning_rate": 1.8161604863327072e-08, + "logits/chosen": 16.801225662231445, + "logits/rejected": 16.272432327270508, + "logps/chosen": -315.7366943359375, + "logps/rejected": -254.6013946533203, + "loss": 0.0406, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.108306884765625, + "rewards/margins": 8.766523361206055, + "rewards/rejected": -17.874826431274414, + "sft_loss": 1.1722328662872314, + "step": 1075 + }, + { + "epoch": 1.774370208105148, + "grad_norm": 18.187119238883547, + "learning_rate": 1.691085384972235e-08, + "logits/chosen": 16.329721450805664, + "logits/rejected": 15.469901084899902, + "logps/chosen": -283.27142333984375, + "logps/rejected": -235.70811462402344, + "loss": 0.0575, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.012235641479492, + "rewards/margins": 7.615647792816162, + "rewards/rejected": -16.62788200378418, + "sft_loss": 1.2651445865631104, + "step": 1080 + }, + { + "epoch": 1.7825848849945234, + "grad_norm": 3.4104772768917426, + "learning_rate": 1.570321413846845e-08, + "logits/chosen": 16.62438201904297, + "logits/rejected": 17.102264404296875, + "logps/chosen": -312.1449279785156, + "logps/rejected": -271.4256591796875, + "loss": 0.0299, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.285161018371582, + "rewards/margins": 9.69771957397461, + "rewards/rejected": -18.982881546020508, + "sft_loss": 1.2524992227554321, + "step": 1085 + }, + { + "epoch": 1.7907995618838992, + "grad_norm": 4.409471745750223, + "learning_rate": 1.4538909090118846e-08, + "logits/chosen": 18.32159423828125, + "logits/rejected": 16.56727409362793, + "logps/chosen": -338.2820129394531, + "logps/rejected": -250.31729125976562, + "loss": 0.0607, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.47223949432373, + "rewards/margins": 8.058004379272461, + "rewards/rejected": -17.530242919921875, + "sft_loss": 1.2121272087097168, + "step": 1090 + }, + { + "epoch": 1.799014238773275, + "grad_norm": 3.827920678689443, + "learning_rate": 1.3418154050208936e-08, + "logits/chosen": 17.697154998779297, + "logits/rejected": 16.722187042236328, + "logps/chosen": -303.7814636230469, + "logps/rejected": -251.51583862304688, + "loss": 0.0563, + "rewards/accuracies": 0.9466667175292969, + "rewards/chosen": -9.34262466430664, + "rewards/margins": 8.22611141204834, + "rewards/rejected": -17.568735122680664, + "sft_loss": 1.133971095085144, + "step": 1095 + }, + { + "epoch": 1.8072289156626506, + "grad_norm": 2.3604810623566084, + "learning_rate": 1.2341156309426447e-08, + "logits/chosen": 16.974082946777344, + "logits/rejected": 16.85063362121582, + "logps/chosen": -346.72998046875, + "logps/rejected": -278.26165771484375, + "loss": 0.036, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.343656539916992, + "rewards/margins": 9.743067741394043, + "rewards/rejected": -19.086727142333984, + "sft_loss": 1.044119954109192, + "step": 1100 + }, + { + "epoch": 1.8154435925520263, + "grad_norm": 3.1591505712032237, + "learning_rate": 1.130811506527149e-08, + "logits/chosen": 18.949861526489258, + "logits/rejected": 17.834264755249023, + "logps/chosen": -389.62042236328125, + "logps/rejected": -280.9618835449219, + "loss": 0.0328, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.740713119506836, + "rewards/margins": 9.123427391052246, + "rewards/rejected": -18.8641414642334, + "sft_loss": 1.1095327138900757, + "step": 1105 + }, + { + "epoch": 1.823658269441402, + "grad_norm": 1.034828289458216, + "learning_rate": 1.0319221385213934e-08, + "logits/chosen": 17.111549377441406, + "logits/rejected": 16.737529754638672, + "logps/chosen": -317.83343505859375, + "logps/rejected": -262.7276916503906, + "loss": 0.0392, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.57455825805664, + "rewards/margins": 8.217500686645508, + "rewards/rejected": -17.792057037353516, + "sft_loss": 1.276735782623291, + "step": 1110 + }, + { + "epoch": 1.8318729463307777, + "grad_norm": 5.969957340964575, + "learning_rate": 9.374658171354411e-09, + "logits/chosen": 18.033985137939453, + "logits/rejected": 16.812503814697266, + "logps/chosen": -338.47796630859375, + "logps/rejected": -267.0741271972656, + "loss": 0.066, + "rewards/accuracies": 0.9466666579246521, + "rewards/chosen": -9.4961519241333, + "rewards/margins": 9.333700180053711, + "rewards/rejected": -18.829853057861328, + "sft_loss": 1.2590656280517578, + "step": 1115 + }, + { + "epoch": 1.8400876232201533, + "grad_norm": 3.650713168539723, + "learning_rate": 8.474600126594983e-09, + "logits/chosen": 18.25022315979004, + "logits/rejected": 17.499792098999023, + "logps/chosen": -332.2896728515625, + "logps/rejected": -259.7283935546875, + "loss": 0.0479, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.764142990112305, + "rewards/margins": 9.385050773620605, + "rewards/rejected": -18.149192810058594, + "sft_loss": 1.2872370481491089, + "step": 1120 + }, + { + "epoch": 1.8483023001095291, + "grad_norm": 3.519511198290722, + "learning_rate": 7.619213722327184e-09, + "logits/chosen": 17.54939842224121, + "logits/rejected": 16.605783462524414, + "logps/chosen": -330.8531799316406, + "logps/rejected": -265.7262878417969, + "loss": 0.0435, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.460249900817871, + "rewards/margins": 9.11566162109375, + "rewards/rejected": -18.575910568237305, + "sft_loss": 1.2160968780517578, + "step": 1125 + }, + { + "epoch": 1.8565169769989047, + "grad_norm": 4.185481328436663, + "learning_rate": 6.808657167641896e-09, + "logits/chosen": 17.643688201904297, + "logits/rejected": 16.674888610839844, + "logps/chosen": -361.64715576171875, + "logps/rejected": -284.84881591796875, + "loss": 0.0669, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.835926055908203, + "rewards/margins": 10.017841339111328, + "rewards/rejected": -19.8537654876709, + "sft_loss": 1.1817443370819092, + "step": 1130 + }, + { + "epoch": 1.8647316538882803, + "grad_norm": 5.8031765508458095, + "learning_rate": 6.043080380067539e-09, + "logits/chosen": 16.346317291259766, + "logits/rejected": 16.2381649017334, + "logps/chosen": -378.3958435058594, + "logps/rejected": -303.12091064453125, + "loss": 0.0523, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.526629447937012, + "rewards/margins": 11.506245613098145, + "rewards/rejected": -21.03287696838379, + "sft_loss": 1.1959699392318726, + "step": 1135 + }, + { + "epoch": 1.8729463307776562, + "grad_norm": 8.21221020965199, + "learning_rate": 5.322624957841998e-09, + "logits/chosen": 18.25189971923828, + "logits/rejected": 17.553903579711914, + "logps/chosen": -341.5302734375, + "logps/rejected": -272.32012939453125, + "loss": 0.0601, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.903505325317383, + "rewards/margins": 8.774069786071777, + "rewards/rejected": -18.677574157714844, + "sft_loss": 1.110214114189148, + "step": 1140 + }, + { + "epoch": 1.8811610076670318, + "grad_norm": 1.2597947012426969, + "learning_rate": 4.647424153723101e-09, + "logits/chosen": 18.352624893188477, + "logits/rejected": 16.199268341064453, + "logps/chosen": -322.0838928222656, + "logps/rejected": -241.83558654785156, + "loss": 0.0757, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.200777053833008, + "rewards/margins": 7.505552768707275, + "rewards/rejected": -16.706331253051758, + "sft_loss": 1.2215784788131714, + "step": 1145 + }, + { + "epoch": 1.8893756845564074, + "grad_norm": 4.292246600774085, + "learning_rate": 4.0176028503425826e-09, + "logits/chosen": 17.103918075561523, + "logits/rejected": 16.931406021118164, + "logps/chosen": -311.3345642089844, + "logps/rejected": -262.2460021972656, + "loss": 0.049, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.341358184814453, + "rewards/margins": 9.20698070526123, + "rewards/rejected": -18.548337936401367, + "sft_loss": 1.2602629661560059, + "step": 1150 + }, + { + "epoch": 1.8975903614457832, + "grad_norm": 3.9316412520602233, + "learning_rate": 3.433277537108481e-09, + "logits/chosen": 18.07435417175293, + "logits/rejected": 17.416156768798828, + "logps/chosen": -373.20477294921875, + "logps/rejected": -284.2578430175781, + "loss": 0.0481, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -10.183159828186035, + "rewards/margins": 8.937155723571777, + "rewards/rejected": -19.120319366455078, + "sft_loss": 1.2882879972457886, + "step": 1155 + }, + { + "epoch": 1.9058050383351588, + "grad_norm": 4.002247931370384, + "learning_rate": 2.8945562886593944e-09, + "logits/chosen": 16.535991668701172, + "logits/rejected": 15.887761116027832, + "logps/chosen": -281.4147033691406, + "logps/rejected": -248.97198486328125, + "loss": 0.0572, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.560811996459961, + "rewards/margins": 8.152007102966309, + "rewards/rejected": -17.712818145751953, + "sft_loss": 1.1380819082260132, + "step": 1160 + }, + { + "epoch": 1.9140197152245344, + "grad_norm": 6.09926805017728, + "learning_rate": 2.4015387448756976e-09, + "logits/chosen": 16.622642517089844, + "logits/rejected": 15.969447135925293, + "logps/chosen": -354.6962890625, + "logps/rejected": -264.1971435546875, + "loss": 0.0698, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -9.276373863220215, + "rewards/margins": 9.219854354858398, + "rewards/rejected": -18.496227264404297, + "sft_loss": 1.35190749168396, + "step": 1165 + }, + { + "epoch": 1.9222343921139102, + "grad_norm": 2.2008981068000257, + "learning_rate": 1.954316092450281e-09, + "logits/chosen": 17.52834129333496, + "logits/rejected": 16.535673141479492, + "logps/chosen": -338.9072570800781, + "logps/rejected": -272.974853515625, + "loss": 0.0483, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -10.078722953796387, + "rewards/margins": 8.57765007019043, + "rewards/rejected": -18.6563720703125, + "sft_loss": 1.2665674686431885, + "step": 1170 + }, + { + "epoch": 1.9304490690032858, + "grad_norm": 6.860255096147663, + "learning_rate": 1.5529710480231272e-09, + "logits/chosen": 18.16209602355957, + "logits/rejected": 17.17742347717285, + "logps/chosen": -305.7500305175781, + "logps/rejected": -253.6704864501953, + "loss": 0.0453, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.595235824584961, + "rewards/margins": 8.096555709838867, + "rewards/rejected": -17.69179344177246, + "sft_loss": 1.0956637859344482, + "step": 1175 + }, + { + "epoch": 1.9386637458926614, + "grad_norm": 5.583742228828171, + "learning_rate": 1.1975778428823524e-09, + "logits/chosen": 17.371732711791992, + "logits/rejected": 16.7208309173584, + "logps/chosen": -353.3210754394531, + "logps/rejected": -281.03302001953125, + "loss": 0.0718, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -10.14670467376709, + "rewards/margins": 9.053946495056152, + "rewards/rejected": -19.200651168823242, + "sft_loss": 1.1084918975830078, + "step": 1180 + }, + { + "epoch": 1.9468784227820373, + "grad_norm": 7.482058530142954, + "learning_rate": 8.882022092346064e-10, + "logits/chosen": 17.550270080566406, + "logits/rejected": 16.700857162475586, + "logps/chosen": -347.03363037109375, + "logps/rejected": -273.7410583496094, + "loss": 0.0371, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -9.479987144470215, + "rewards/margins": 9.481571197509766, + "rewards/rejected": -18.961559295654297, + "sft_loss": 1.2440842390060425, + "step": 1185 + }, + { + "epoch": 1.9550930996714129, + "grad_norm": 2.060017516908208, + "learning_rate": 6.249013680474368e-10, + "logits/chosen": 17.62407875061035, + "logits/rejected": 16.294307708740234, + "logps/chosen": -312.6640625, + "logps/rejected": -252.1957550048828, + "loss": 0.042, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.584585189819336, + "rewards/margins": 7.968944072723389, + "rewards/rejected": -17.553531646728516, + "sft_loss": 1.1957759857177734, + "step": 1190 + }, + { + "epoch": 1.9633077765607885, + "grad_norm": 8.70865898012454, + "learning_rate": 4.0772401846608794e-10, + "logits/chosen": 18.594797134399414, + "logits/rejected": 17.494707107543945, + "logps/chosen": -298.91571044921875, + "logps/rejected": -247.17645263671875, + "loss": 0.0791, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.819672584533691, + "rewards/margins": 7.458762168884277, + "rewards/rejected": -17.27843475341797, + "sft_loss": 1.1721436977386475, + "step": 1195 + }, + { + "epoch": 1.9715224534501643, + "grad_norm": 8.143555814452077, + "learning_rate": 2.367103288061223e-10, + "logits/chosen": 17.42876625061035, + "logits/rejected": 16.18610191345215, + "logps/chosen": -311.3353271484375, + "logps/rejected": -255.815673828125, + "loss": 0.057, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.853960990905762, + "rewards/margins": 8.45102310180664, + "rewards/rejected": -18.304983139038086, + "sft_loss": 1.2459405660629272, + "step": 1200 + }, + { + "epoch": 1.9797371303395401, + "grad_norm": 3.1264960747238333, + "learning_rate": 1.1189192912416933e-10, + "logits/chosen": 17.570209503173828, + "logits/rejected": 16.410261154174805, + "logps/chosen": -398.1352233886719, + "logps/rejected": -293.1958923339844, + "loss": 0.0365, + "rewards/accuracies": 0.9866666793823242, + "rewards/chosen": -9.722556114196777, + "rewards/margins": 10.323168754577637, + "rewards/rejected": -20.045726776123047, + "sft_loss": 1.2017709016799927, + "step": 1205 + }, + { + "epoch": 1.9879518072289155, + "grad_norm": 4.393395171762388, + "learning_rate": 3.329190536757731e-11, + "logits/chosen": 18.872753143310547, + "logits/rejected": 18.477998733520508, + "logps/chosen": -312.2131042480469, + "logps/rejected": -260.5821228027344, + "loss": 0.0535, + "rewards/accuracies": 0.9733333587646484, + "rewards/chosen": -9.505008697509766, + "rewards/margins": 9.130599021911621, + "rewards/rejected": -18.635608673095703, + "sft_loss": 1.1339497566223145, + "step": 1210 + }, + { + "epoch": 1.9961664841182913, + "grad_norm": 5.10635428256011, + "learning_rate": 9.247951046897906e-13, + "logits/chosen": 18.22831916809082, + "logits/rejected": 17.80943489074707, + "logps/chosen": -330.88934326171875, + "logps/rejected": -262.41461181640625, + "loss": 0.0471, + "rewards/accuracies": 0.9600000381469727, + "rewards/chosen": -9.283769607543945, + "rewards/margins": 8.581202507019043, + "rewards/rejected": -17.864973068237305, + "sft_loss": 1.1466354131698608, + "step": 1215 + }, + { + "epoch": 1.9978094194961664, + "step": 1216, + "total_flos": 131893560147968.0, + "train_loss": 0.08742370063708604, + "train_runtime": 41055.7102, + "train_samples_per_second": 1.778, + "train_steps_per_second": 0.03 + } + ], + "logging_steps": 5, + "max_steps": 1216, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 131893560147968.0, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}