{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990762978015888, "eval_steps": 400, "global_step": 507, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001970564689943962, "grad_norm": 3.444017553990766, "learning_rate": 9.803921568627451e-09, "logits/chosen": -0.23276051878929138, "logits/rejected": -0.43208426237106323, "logps/chosen": -95.95150756835938, "logps/rejected": -103.43749237060547, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.00985282344971981, "grad_norm": 3.522148904992214, "learning_rate": 4.901960784313725e-08, "logits/chosen": -0.07696018368005753, "logits/rejected": -0.3334544003009796, "logps/chosen": -106.13592529296875, "logps/rejected": -98.82791137695312, "loss": 0.6931, "rewards/accuracies": 0.3828125, "rewards/chosen": -0.000650706235319376, "rewards/margins": 0.00010106083936989307, "rewards/rejected": -0.00075176713289693, "step": 5 }, { "epoch": 0.01970564689943962, "grad_norm": 3.219795158164747, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.06049077585339546, "logits/rejected": -0.260353147983551, "logps/chosen": -98.03811645507812, "logps/rejected": -97.61465454101562, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0005443535046651959, "rewards/margins": -0.0005626807105727494, "rewards/rejected": 1.8327215002500452e-05, "step": 10 }, { "epoch": 0.02955847034915943, "grad_norm": 3.446656700249776, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -0.12729588150978088, "logits/rejected": -0.295417845249176, "logps/chosen": -99.15818786621094, "logps/rejected": -97.37001037597656, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -7.940361683722585e-05, "rewards/margins": -0.0002843155525624752, "rewards/rejected": 0.00020491195027716458, "step": 15 }, { "epoch": 0.03941129379887924, "grad_norm": 3.2961842224501425, "learning_rate": 1.96078431372549e-07, "logits/chosen": -0.11793007701635361, "logits/rejected": -0.29867538809776306, "logps/chosen": -103.20733642578125, "logps/rejected": -97.68354797363281, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0002625386696308851, "rewards/margins": 0.0006132128764875233, "rewards/rejected": -0.0003506741486489773, "step": 20 }, { "epoch": 0.049264117248599054, "grad_norm": 3.486650848831331, "learning_rate": 2.4509803921568627e-07, "logits/chosen": -0.17689576745033264, "logits/rejected": -0.34940794110298157, "logps/chosen": -106.73783874511719, "logps/rejected": -104.61811828613281, "loss": 0.6926, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00013853741984348744, "rewards/margins": 0.001048876903951168, "rewards/rejected": -0.0009103395859710872, "step": 25 }, { "epoch": 0.05911694069831886, "grad_norm": 3.326203197649224, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.07684741169214249, "logits/rejected": -0.3008119761943817, "logps/chosen": -102.79545593261719, "logps/rejected": -101.55641174316406, "loss": 0.692, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0009806936141103506, "rewards/margins": 0.003102914895862341, "rewards/rejected": -0.0021222210489213467, "step": 30 }, { "epoch": 0.06896976414803867, "grad_norm": 3.3448990755681747, "learning_rate": 3.431372549019608e-07, "logits/chosen": -0.05913068726658821, "logits/rejected": -0.357162743806839, "logps/chosen": -105.96171569824219, "logps/rejected": -97.8105697631836, "loss": 0.6904, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 9.926412894856185e-05, "rewards/margins": 0.0057004219852387905, "rewards/rejected": -0.005601157899945974, "step": 35 }, { "epoch": 0.07882258759775848, "grad_norm": 3.507668256246831, "learning_rate": 3.92156862745098e-07, "logits/chosen": -0.08776526153087616, "logits/rejected": -0.24237962067127228, "logps/chosen": -90.3080825805664, "logps/rejected": -92.34758758544922, "loss": 0.6883, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.000393406196963042, "rewards/margins": 0.00874106865376234, "rewards/rejected": -0.009134475141763687, "step": 40 }, { "epoch": 0.0886754110474783, "grad_norm": 3.4196928203304737, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -0.10117539018392563, "logits/rejected": -0.3224117159843445, "logps/chosen": -109.0184326171875, "logps/rejected": -108.93272399902344, "loss": 0.6848, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.003931170795112848, "rewards/margins": 0.021240899339318275, "rewards/rejected": -0.025172073394060135, "step": 45 }, { "epoch": 0.09852823449719811, "grad_norm": 3.5401415663183173, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.15071377158164978, "logits/rejected": -0.3232310116291046, "logps/chosen": -104.10140228271484, "logps/rejected": -111.49967193603516, "loss": 0.6794, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.015169525519013405, "rewards/margins": 0.028989236801862717, "rewards/rejected": -0.04415876418352127, "step": 50 }, { "epoch": 0.10838105794691791, "grad_norm": 3.5990117325571944, "learning_rate": 4.999050767562379e-07, "logits/chosen": -0.08029767870903015, "logits/rejected": -0.28052276372909546, "logps/chosen": -99.97483825683594, "logps/rejected": -106.4577407836914, "loss": 0.6768, "rewards/accuracies": 0.75, "rewards/chosen": -0.052169084548950195, "rewards/margins": 0.041819095611572266, "rewards/rejected": -0.09398818016052246, "step": 55 }, { "epoch": 0.11823388139663772, "grad_norm": 3.764415157715389, "learning_rate": 4.99519574616467e-07, "logits/chosen": -0.13724075257778168, "logits/rejected": -0.307682603597641, "logps/chosen": -112.85994720458984, "logps/rejected": -114.5948715209961, "loss": 0.6671, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.08050791174173355, "rewards/margins": 0.06749774515628815, "rewards/rejected": -0.1480056643486023, "step": 60 }, { "epoch": 0.12808670484635754, "grad_norm": 4.280188058589697, "learning_rate": 4.988380179235842e-07, "logits/chosen": -0.04185126721858978, "logits/rejected": -0.2287481129169464, "logps/chosen": -116.09354400634766, "logps/rejected": -124.45157623291016, "loss": 0.6566, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.12864899635314941, "rewards/margins": 0.07999014109373093, "rewards/rejected": -0.20863911509513855, "step": 65 }, { "epoch": 0.13793952829607734, "grad_norm": 4.477584197323834, "learning_rate": 4.978612153434526e-07, "logits/chosen": -0.06294523924589157, "logits/rejected": -0.2265748679637909, "logps/chosen": -115.08953857421875, "logps/rejected": -129.55136108398438, "loss": 0.6387, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2160721719264984, "rewards/margins": 0.10703370720148087, "rewards/rejected": -0.32310590147972107, "step": 70 }, { "epoch": 0.14779235174579716, "grad_norm": 14.949806540573308, "learning_rate": 4.965903258506806e-07, "logits/chosen": -0.07772421091794968, "logits/rejected": -0.21028895676136017, "logps/chosen": -133.30584716796875, "logps/rejected": -150.30581665039062, "loss": 0.6349, "rewards/accuracies": 0.78125, "rewards/chosen": -0.30895400047302246, "rewards/margins": 0.1755586862564087, "rewards/rejected": -0.48451265692710876, "step": 75 }, { "epoch": 0.15764517519551696, "grad_norm": 4.905767972397329, "learning_rate": 4.950268573535011e-07, "logits/chosen": -0.04325466603040695, "logits/rejected": -0.2531152367591858, "logps/chosen": -144.97561645507812, "logps/rejected": -158.16122436523438, "loss": 0.6042, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.4763612151145935, "rewards/margins": 0.16494214534759521, "rewards/rejected": -0.6413034200668335, "step": 80 }, { "epoch": 0.16749799864523676, "grad_norm": 5.265890759040485, "learning_rate": 4.93172664904641e-07, "logits/chosen": -0.11157502233982086, "logits/rejected": -0.26035866141319275, "logps/chosen": -159.24551391601562, "logps/rejected": -204.39495849609375, "loss": 0.5821, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5800860524177551, "rewards/margins": 0.4450142979621887, "rewards/rejected": -1.0251003503799438, "step": 85 }, { "epoch": 0.1773508220949566, "grad_norm": 5.725906040480978, "learning_rate": 4.910299485003033e-07, "logits/chosen": -0.07629499584436417, "logits/rejected": -0.29128947854042053, "logps/chosen": -174.70559692382812, "logps/rejected": -207.3603515625, "loss": 0.5548, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7107561230659485, "rewards/margins": 0.3972470760345459, "rewards/rejected": -1.10800302028656, "step": 90 }, { "epoch": 0.1872036455446764, "grad_norm": 6.403573729253865, "learning_rate": 4.886012504698769e-07, "logits/chosen": -0.1544032096862793, "logits/rejected": -0.37451326847076416, "logps/chosen": -184.72915649414062, "logps/rejected": -229.79541015625, "loss": 0.5305, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8728511929512024, "rewards/margins": 0.4681348204612732, "rewards/rejected": -1.3409860134124756, "step": 95 }, { "epoch": 0.19705646899439622, "grad_norm": 7.412632948891829, "learning_rate": 4.858894524594652e-07, "logits/chosen": -0.15470778942108154, "logits/rejected": -0.4357427656650543, "logps/chosen": -234.9979705810547, "logps/rejected": -306.1597595214844, "loss": 0.4988, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2750121355056763, "rewards/margins": 0.8118740916252136, "rewards/rejected": -2.086885929107666, "step": 100 }, { "epoch": 0.20690929244411602, "grad_norm": 8.591568819157517, "learning_rate": 4.828977720128198e-07, "logits/chosen": -0.13351579010486603, "logits/rejected": -0.47733497619628906, "logps/chosen": -268.23260498046875, "logps/rejected": -370.4355163574219, "loss": 0.5061, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7161176204681396, "rewards/margins": 1.0792567729949951, "rewards/rejected": -2.7953743934631348, "step": 105 }, { "epoch": 0.21676211589383582, "grad_norm": 9.939985555056147, "learning_rate": 4.796297587537285e-07, "logits/chosen": -0.2192394733428955, "logits/rejected": -0.42173558473587036, "logps/chosen": -294.60870361328125, "logps/rejected": -394.9258117675781, "loss": 0.4457, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9103400707244873, "rewards/margins": 1.0444176197052002, "rewards/rejected": -2.9547572135925293, "step": 110 }, { "epoch": 0.22661493934355564, "grad_norm": 9.525042212102928, "learning_rate": 4.760892901743944e-07, "logits/chosen": -0.20480632781982422, "logits/rejected": -0.46547192335128784, "logps/chosen": -295.8061218261719, "logps/rejected": -422.6036071777344, "loss": 0.4309, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9569038152694702, "rewards/margins": 1.2766072750091553, "rewards/rejected": -3.233510971069336, "step": 115 }, { "epoch": 0.23646776279327544, "grad_norm": 12.501263690005059, "learning_rate": 4.7228056703479626e-07, "logits/chosen": -0.21756932139396667, "logits/rejected": -0.45747238397598267, "logps/chosen": -335.4225158691406, "logps/rejected": -476.47637939453125, "loss": 0.4602, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.304030179977417, "rewards/margins": 1.4334566593170166, "rewards/rejected": -3.7374866008758545, "step": 120 }, { "epoch": 0.24632058624299527, "grad_norm": 10.295571989073236, "learning_rate": 4.6820810837849535e-07, "logits/chosen": -0.24834315478801727, "logits/rejected": -0.47704511880874634, "logps/chosen": -360.4291687011719, "logps/rejected": -489.4334411621094, "loss": 0.408, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.5335352420806885, "rewards/margins": 1.3203860521316528, "rewards/rejected": -3.8539211750030518, "step": 125 }, { "epoch": 0.25617340969271507, "grad_norm": 10.797289668730418, "learning_rate": 4.63876746170797e-07, "logits/chosen": -0.2283620834350586, "logits/rejected": -0.45370951294898987, "logps/chosen": -362.5042724609375, "logps/rejected": -505.86968994140625, "loss": 0.4348, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5926971435546875, "rewards/margins": 1.4749071598052979, "rewards/rejected": -4.067604064941406, "step": 130 }, { "epoch": 0.2660262331424349, "grad_norm": 12.27688423712984, "learning_rate": 4.592916195656321e-07, "logits/chosen": -0.20323459804058075, "logits/rejected": -0.4668886065483093, "logps/chosen": -317.35345458984375, "logps/rejected": -467.3150329589844, "loss": 0.4105, "rewards/accuracies": 0.8125, "rewards/chosen": -2.133204460144043, "rewards/margins": 1.5337626934051514, "rewards/rejected": -3.6669669151306152, "step": 135 }, { "epoch": 0.27587905659215467, "grad_norm": 15.818054338660462, "learning_rate": 4.544581688079602e-07, "logits/chosen": -0.18876026570796967, "logits/rejected": -0.47572746872901917, "logps/chosen": -343.0673828125, "logps/rejected": -514.2898559570312, "loss": 0.3999, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.457926034927368, "rewards/margins": 1.7443277835845947, "rewards/rejected": -4.2022528648376465, "step": 140 }, { "epoch": 0.2857318800418745, "grad_norm": 10.354898946060759, "learning_rate": 4.493821287789272e-07, "logits/chosen": -0.2576290965080261, "logits/rejected": -0.4858337938785553, "logps/chosen": -385.8844299316406, "logps/rejected": -581.2125244140625, "loss": 0.3801, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.86683988571167, "rewards/margins": 1.994633436203003, "rewards/rejected": -4.861473083496094, "step": 145 }, { "epoch": 0.2955847034915943, "grad_norm": 14.110695862987818, "learning_rate": 4.4406952219143934e-07, "logits/chosen": -0.2310657501220703, "logits/rejected": -0.44463711977005005, "logps/chosen": -385.70623779296875, "logps/rejected": -559.1126708984375, "loss": 0.3314, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.8108325004577637, "rewards/margins": 1.7815577983856201, "rewards/rejected": -4.592390060424805, "step": 150 }, { "epoch": 0.3054375269413141, "grad_norm": 15.545527108248088, "learning_rate": 4.38526652444224e-07, "logits/chosen": -0.26962828636169434, "logits/rejected": -0.41183385252952576, "logps/chosen": -400.4664611816406, "logps/rejected": -608.4261474609375, "loss": 0.3854, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.9346907138824463, "rewards/margins": 2.0727694034576416, "rewards/rejected": -5.007460594177246, "step": 155 }, { "epoch": 0.3152903503910339, "grad_norm": 14.215413027429468, "learning_rate": 4.3276009614285824e-07, "logits/chosen": -0.22768807411193848, "logits/rejected": -0.4115552306175232, "logps/chosen": -361.54876708984375, "logps/rejected": -561.5390625, "loss": 0.3959, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.6572659015655518, "rewards/margins": 1.9841495752334595, "rewards/rejected": -4.641415596008301, "step": 160 }, { "epoch": 0.32514317384075375, "grad_norm": 15.569517834142987, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -0.20264258980751038, "logits/rejected": -0.40150555968284607, "logps/chosen": -376.89971923828125, "logps/rejected": -603.7943115234375, "loss": 0.3664, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.7810330390930176, "rewards/margins": 2.239793300628662, "rewards/rejected": -5.02082633972168, "step": 165 }, { "epoch": 0.3349959972904735, "grad_norm": 12.496148003358819, "learning_rate": 4.2058354920054043e-07, "logits/chosen": -0.28801003098487854, "logits/rejected": -0.4120853543281555, "logps/chosen": -348.6145324707031, "logps/rejected": -565.19482421875, "loss": 0.3559, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.543839454650879, "rewards/margins": 2.167828321456909, "rewards/rejected": -4.711668014526367, "step": 170 }, { "epoch": 0.34484882074019335, "grad_norm": 14.212926345332207, "learning_rate": 4.141880060119336e-07, "logits/chosen": -0.2191300094127655, "logits/rejected": -0.4566856324672699, "logps/chosen": -396.66876220703125, "logps/rejected": -567.9503784179688, "loss": 0.3842, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9396305084228516, "rewards/margins": 1.7750612497329712, "rewards/rejected": -4.714691162109375, "step": 175 }, { "epoch": 0.3547016441899132, "grad_norm": 12.696873162753882, "learning_rate": 4.0759765403198877e-07, "logits/chosen": -0.28399690985679626, "logits/rejected": -0.4555039405822754, "logps/chosen": -343.26263427734375, "logps/rejected": -577.4595336914062, "loss": 0.3109, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.4329833984375, "rewards/margins": 2.351548671722412, "rewards/rejected": -4.784532070159912, "step": 180 }, { "epoch": 0.364554467639633, "grad_norm": 15.682818507341654, "learning_rate": 4.008203127021797e-07, "logits/chosen": -0.30414339900016785, "logits/rejected": -0.49127548933029175, "logps/chosen": -401.3171081542969, "logps/rejected": -638.1029663085938, "loss": 0.3526, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.013474702835083, "rewards/margins": 2.391366720199585, "rewards/rejected": -5.404841899871826, "step": 185 }, { "epoch": 0.3744072910893528, "grad_norm": 15.137404252638685, "learning_rate": 3.9386402332652754e-07, "logits/chosen": -0.3183102011680603, "logits/rejected": -0.478738397359848, "logps/chosen": -362.82513427734375, "logps/rejected": -609.9462890625, "loss": 0.3586, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.608335018157959, "rewards/margins": 2.479790687561035, "rewards/rejected": -5.088125705718994, "step": 190 }, { "epoch": 0.3842601145390726, "grad_norm": 14.191789849116672, "learning_rate": 3.867370395306068e-07, "logits/chosen": -0.2816595137119293, "logits/rejected": -0.5143749117851257, "logps/chosen": -354.79388427734375, "logps/rejected": -573.3958740234375, "loss": 0.3561, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.5655016899108887, "rewards/margins": 2.193372964859009, "rewards/rejected": -4.758874416351318, "step": 195 }, { "epoch": 0.39411293798879243, "grad_norm": 16.13473953627841, "learning_rate": 3.794478174686328e-07, "logits/chosen": -0.30765408277511597, "logits/rejected": -0.5184319615364075, "logps/chosen": -380.7965087890625, "logps/rejected": -623.9815063476562, "loss": 0.3343, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.7764999866485596, "rewards/margins": 2.448732614517212, "rewards/rejected": -5.2252326011657715, "step": 200 }, { "epoch": 0.4039657614385122, "grad_norm": 13.799406477566153, "learning_rate": 3.720050057902495e-07, "logits/chosen": -0.28575196862220764, "logits/rejected": -0.5561090707778931, "logps/chosen": -412.14154052734375, "logps/rejected": -629.4691772460938, "loss": 0.3917, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.0631420612335205, "rewards/margins": 2.176858425140381, "rewards/rejected": -5.2400007247924805, "step": 205 }, { "epoch": 0.41381858488823203, "grad_norm": 13.376596964391368, "learning_rate": 3.644174353789204e-07, "logits/chosen": -0.3736252188682556, "logits/rejected": -0.514769434928894, "logps/chosen": -383.2430725097656, "logps/rejected": -640.6544189453125, "loss": 0.334, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.76784086227417, "rewards/margins": 2.553081512451172, "rewards/rejected": -5.320921897888184, "step": 210 }, { "epoch": 0.42367140833795186, "grad_norm": 11.451472713393272, "learning_rate": 3.566941088741009e-07, "logits/chosen": -0.3407908082008362, "logits/rejected": -0.6174032688140869, "logps/chosen": -347.8931884765625, "logps/rejected": -581.584228515625, "loss": 0.3268, "rewards/accuracies": 0.875, "rewards/chosen": -2.4030208587646484, "rewards/margins": 2.3824656009674072, "rewards/rejected": -4.785486221313477, "step": 215 }, { "epoch": 0.43352423178767163, "grad_norm": 17.46089866269522, "learning_rate": 3.488441899896217e-07, "logits/chosen": -0.3514137864112854, "logits/rejected": -0.49381551146507263, "logps/chosen": -360.62103271484375, "logps/rejected": -615.2965087890625, "loss": 0.3361, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.5678088665008545, "rewards/margins": 2.5628271102905273, "rewards/rejected": -5.130635738372803, "step": 220 }, { "epoch": 0.44337705523739146, "grad_norm": 30.418216991487416, "learning_rate": 3.408769926409574e-07, "logits/chosen": -0.32667240500450134, "logits/rejected": -0.46130138635635376, "logps/chosen": -370.0388488769531, "logps/rejected": -607.0093994140625, "loss": 0.3309, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.7761330604553223, "rewards/margins": 2.300311803817749, "rewards/rejected": -5.076444149017334, "step": 225 }, { "epoch": 0.4532298786871113, "grad_norm": 22.496163988303977, "learning_rate": 3.3280196989428263e-07, "logits/chosen": -0.23927278816699982, "logits/rejected": -0.4683164060115814, "logps/chosen": -415.4649963378906, "logps/rejected": -715.1581420898438, "loss": 0.399, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.170175790786743, "rewards/margins": 3.04463529586792, "rewards/rejected": -6.214810848236084, "step": 230 }, { "epoch": 0.46308270213683106, "grad_norm": 15.358280098952724, "learning_rate": 3.2462870275042367e-07, "logits/chosen": -0.2206590175628662, "logits/rejected": -0.4772756099700928, "logps/chosen": -437.8194274902344, "logps/rejected": -676.9967041015625, "loss": 0.3334, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.432298183441162, "rewards/margins": 2.3835928440093994, "rewards/rejected": -5.815890789031982, "step": 235 }, { "epoch": 0.4729355255865509, "grad_norm": 14.636033595582514, "learning_rate": 3.1636688877701806e-07, "logits/chosen": -0.379282683134079, "logits/rejected": -0.5302340388298035, "logps/chosen": -425.94024658203125, "logps/rejected": -697.734130859375, "loss": 0.2963, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.2339348793029785, "rewards/margins": 2.747666835784912, "rewards/rejected": -5.981600761413574, "step": 240 }, { "epoch": 0.4827883490362707, "grad_norm": 15.394330260666242, "learning_rate": 3.080263306023669e-07, "logits/chosen": -0.2971007823944092, "logits/rejected": -0.4823763370513916, "logps/chosen": -433.9507751464844, "logps/rejected": -664.0755615234375, "loss": 0.2877, "rewards/accuracies": 0.90625, "rewards/chosen": -3.3310959339141846, "rewards/margins": 2.319425344467163, "rewards/rejected": -5.650521278381348, "step": 245 }, { "epoch": 0.49264117248599054, "grad_norm": 14.169015562499252, "learning_rate": 2.996169242846328e-07, "logits/chosen": -0.3509058952331543, "logits/rejected": -0.5016965270042419, "logps/chosen": -401.8634948730469, "logps/rejected": -719.4425048828125, "loss": 0.2637, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.0134997367858887, "rewards/margins": 3.148432970046997, "rewards/rejected": -6.161932945251465, "step": 250 }, { "epoch": 0.5024939959357103, "grad_norm": 15.084142231117205, "learning_rate": 2.911486475701835e-07, "logits/chosen": -0.2690550684928894, "logits/rejected": -0.4977152943611145, "logps/chosen": -400.71649169921875, "logps/rejected": -628.1175537109375, "loss": 0.322, "rewards/accuracies": 0.84375, "rewards/chosen": -3.0442709922790527, "rewards/margins": 2.318587064743042, "rewards/rejected": -5.362858772277832, "step": 255 }, { "epoch": 0.5123468193854301, "grad_norm": 10.556248114155261, "learning_rate": 2.826315480550129e-07, "logits/chosen": -0.3703750967979431, "logits/rejected": -0.4834713041782379, "logps/chosen": -379.0237121582031, "logps/rejected": -712.1115112304688, "loss": 0.2737, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.82161283493042, "rewards/margins": 3.283346652984619, "rewards/rejected": -6.104959011077881, "step": 260 }, { "epoch": 0.52219964283515, "grad_norm": 16.00385439318665, "learning_rate": 2.740757312632854e-07, "logits/chosen": -0.2965068221092224, "logits/rejected": -0.4728400707244873, "logps/chosen": -432.16650390625, "logps/rejected": -744.01806640625, "loss": 0.2898, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.3073620796203613, "rewards/margins": 3.125457286834717, "rewards/rejected": -6.432818412780762, "step": 265 }, { "epoch": 0.5320524662848698, "grad_norm": 15.37457746102107, "learning_rate": 2.654913486571487e-07, "logits/chosen": -0.26870328187942505, "logits/rejected": -0.5099757313728333, "logps/chosen": -461.7821350097656, "logps/rejected": -804.3317260742188, "loss": 0.281, "rewards/accuracies": 0.90625, "rewards/chosen": -3.477935791015625, "rewards/margins": 3.4908695220947266, "rewards/rejected": -6.968804836273193, "step": 270 }, { "epoch": 0.5419052897345896, "grad_norm": 17.39033230160854, "learning_rate": 2.5688858559204053e-07, "logits/chosen": -0.2892334461212158, "logits/rejected": -0.5267232656478882, "logps/chosen": -488.491943359375, "logps/rejected": -848.1380615234375, "loss": 0.2953, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.8831119537353516, "rewards/margins": 3.5666236877441406, "rewards/rejected": -7.44973611831665, "step": 275 }, { "epoch": 0.5517581131843093, "grad_norm": 19.079707368930343, "learning_rate": 2.4827764923178246e-07, "logits/chosen": -0.2866331934928894, "logits/rejected": -0.5004242062568665, "logps/chosen": -503.42572021484375, "logps/rejected": -888.0465698242188, "loss": 0.2978, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.999812602996826, "rewards/margins": 3.85322904586792, "rewards/rejected": -7.853041648864746, "step": 280 }, { "epoch": 0.5616109366340292, "grad_norm": 19.106895081330208, "learning_rate": 2.3966875643779667e-07, "logits/chosen": -0.3676909804344177, "logits/rejected": -0.5896965861320496, "logps/chosen": -404.20770263671875, "logps/rejected": -752.9081420898438, "loss": 0.288, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.025970935821533, "rewards/margins": 3.454861879348755, "rewards/rejected": -6.480832576751709, "step": 285 }, { "epoch": 0.571463760083749, "grad_norm": 12.765348172490336, "learning_rate": 2.3107212164681774e-07, "logits/chosen": -0.3087966740131378, "logits/rejected": -0.5579283833503723, "logps/chosen": -426.75372314453125, "logps/rejected": -739.0400390625, "loss": 0.3286, "rewards/accuracies": 0.90625, "rewards/chosen": -3.3023364543914795, "rewards/margins": 3.1090188026428223, "rewards/rejected": -6.411355495452881, "step": 290 }, { "epoch": 0.5813165835334688, "grad_norm": 16.231816408507715, "learning_rate": 2.2249794475148019e-07, "logits/chosen": -0.4001474976539612, "logits/rejected": -0.5297967195510864, "logps/chosen": -417.8780822753906, "logps/rejected": -676.3338623046875, "loss": 0.3072, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.1984708309173584, "rewards/margins": 2.528751850128174, "rewards/rejected": -5.727222919464111, "step": 295 }, { "epoch": 0.5911694069831886, "grad_norm": 17.338695196365542, "learning_rate": 2.1395639899816332e-07, "logits/chosen": -0.4291343688964844, "logits/rejected": -0.556428849697113, "logps/chosen": -390.78582763671875, "logps/rejected": -669.5917358398438, "loss": 0.3313, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.9017252922058105, "rewards/margins": 2.7559847831726074, "rewards/rejected": -5.657710075378418, "step": 300 }, { "epoch": 0.6010222304329085, "grad_norm": 12.126664974793542, "learning_rate": 2.0545761891645177e-07, "logits/chosen": -0.37195321917533875, "logits/rejected": -0.5651625394821167, "logps/chosen": -414.18927001953125, "logps/rejected": -704.6055297851562, "loss": 0.2784, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.175144910812378, "rewards/margins": 2.9191794395446777, "rewards/rejected": -6.094325065612793, "step": 305 }, { "epoch": 0.6108750538826282, "grad_norm": 22.304119797273902, "learning_rate": 1.9701168829453305e-07, "logits/chosen": -0.3536795973777771, "logits/rejected": -0.5591118931770325, "logps/chosen": -393.33905029296875, "logps/rejected": -676.2900390625, "loss": 0.303, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.915839672088623, "rewards/margins": 2.8338570594787598, "rewards/rejected": -5.749696731567383, "step": 310 }, { "epoch": 0.620727877332348, "grad_norm": 14.597142317559607, "learning_rate": 1.886286282148002e-07, "logits/chosen": -0.36113765835762024, "logits/rejected": -0.5111299157142639, "logps/chosen": -416.2958984375, "logps/rejected": -689.5440673828125, "loss": 0.2944, "rewards/accuracies": 0.875, "rewards/chosen": -3.1453917026519775, "rewards/margins": 2.7569806575775146, "rewards/rejected": -5.90237283706665, "step": 315 }, { "epoch": 0.6305807007820678, "grad_norm": 14.66404722177273, "learning_rate": 1.8031838516385422e-07, "logits/chosen": -0.29976287484169006, "logits/rejected": -0.5753315687179565, "logps/chosen": -426.15081787109375, "logps/rejected": -668.0872802734375, "loss": 0.3077, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.2156474590301514, "rewards/margins": 2.5099997520446777, "rewards/rejected": -5.725647926330566, "step": 320 }, { "epoch": 0.6404335242317877, "grad_norm": 24.997450008790477, "learning_rate": 1.7209081923101472e-07, "logits/chosen": -0.3522949814796448, "logits/rejected": -0.571318507194519, "logps/chosen": -428.62615966796875, "logps/rejected": -739.1427001953125, "loss": 0.2803, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.299776554107666, "rewards/margins": 3.1473708152770996, "rewards/rejected": -6.447146415710449, "step": 325 }, { "epoch": 0.6502863476815075, "grad_norm": 21.077844248854888, "learning_rate": 1.639556924093404e-07, "logits/chosen": -0.3864585757255554, "logits/rejected": -0.5924688577651978, "logps/chosen": -460.22955322265625, "logps/rejected": -780.5281372070312, "loss": 0.29, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.5728142261505127, "rewards/margins": 3.2330241203308105, "rewards/rejected": -6.805838584899902, "step": 330 }, { "epoch": 0.6601391711312273, "grad_norm": 13.281944316685474, "learning_rate": 1.5592265701304114e-07, "logits/chosen": -0.31550538539886475, "logits/rejected": -0.5574745535850525, "logps/chosen": -432.3729553222656, "logps/rejected": -738.777587890625, "loss": 0.2802, "rewards/accuracies": 0.90625, "rewards/chosen": -3.270681858062744, "rewards/margins": 3.0939764976501465, "rewards/rejected": -6.364658832550049, "step": 335 }, { "epoch": 0.669991994580947, "grad_norm": 13.14308400069569, "learning_rate": 1.4800124422502334e-07, "logits/chosen": -0.35619470477104187, "logits/rejected": -0.5215466618537903, "logps/chosen": -402.02227783203125, "logps/rejected": -689.3594360351562, "loss": 0.2762, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.078434944152832, "rewards/margins": 2.8699941635131836, "rewards/rejected": -5.948429107666016, "step": 340 }, { "epoch": 0.6798448180306669, "grad_norm": 13.958916358154339, "learning_rate": 1.4020085278815743e-07, "logits/chosen": -0.348470538854599, "logits/rejected": -0.554540753364563, "logps/chosen": -393.90362548828125, "logps/rejected": -701.527099609375, "loss": 0.2845, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.926769733428955, "rewards/margins": 3.087486743927002, "rewards/rejected": -6.014256477355957, "step": 345 }, { "epoch": 0.6896976414803867, "grad_norm": 17.268899996459307, "learning_rate": 1.3253073785368545e-07, "logits/chosen": -0.37151604890823364, "logits/rejected": -0.5311695337295532, "logps/chosen": -426.25567626953125, "logps/rejected": -754.6126708984375, "loss": 0.2906, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.2909317016601562, "rewards/margins": 3.2550930976867676, "rewards/rejected": -6.546025276184082, "step": 350 }, { "epoch": 0.6995504649301065, "grad_norm": 23.463103741079866, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.29037579894065857, "logits/rejected": -0.5232888460159302, "logps/chosen": -434.14862060546875, "logps/rejected": -760.4425048828125, "loss": 0.3051, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.306781053543091, "rewards/margins": 3.264423370361328, "rewards/rejected": -6.571203708648682, "step": 355 }, { "epoch": 0.7094032883798264, "grad_norm": 15.524231108582358, "learning_rate": 1.1761757443482285e-07, "logits/chosen": -0.34106117486953735, "logits/rejected": -0.5115201473236084, "logps/chosen": -463.2628479003906, "logps/rejected": -801.8025512695312, "loss": 0.3022, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.6255767345428467, "rewards/margins": 3.3763046264648438, "rewards/rejected": -7.0018815994262695, "step": 360 }, { "epoch": 0.7192561118295462, "grad_norm": 13.880093445950616, "learning_rate": 1.1039222039359644e-07, "logits/chosen": -0.3453301787376404, "logits/rejected": -0.5254852175712585, "logps/chosen": -411.9518127441406, "logps/rejected": -712.6190795898438, "loss": 0.2746, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.0844180583953857, "rewards/margins": 3.044656991958618, "rewards/rejected": -6.129075527191162, "step": 365 }, { "epoch": 0.729108935279266, "grad_norm": 14.175670082816483, "learning_rate": 1.0333251074666608e-07, "logits/chosen": -0.3751557469367981, "logits/rejected": -0.5939264297485352, "logps/chosen": -429.10809326171875, "logps/rejected": -725.9818115234375, "loss": 0.3023, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.204840898513794, "rewards/margins": 3.0460567474365234, "rewards/rejected": -6.2508978843688965, "step": 370 }, { "epoch": 0.7389617587289857, "grad_norm": 16.474329546472436, "learning_rate": 9.644682182758304e-08, "logits/chosen": -0.35652074217796326, "logits/rejected": -0.5406717658042908, "logps/chosen": -442.0631408691406, "logps/rejected": -735.47412109375, "loss": 0.2696, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.3726630210876465, "rewards/margins": 2.9257071018218994, "rewards/rejected": -6.298369407653809, "step": 375 }, { "epoch": 0.7488145821787056, "grad_norm": 18.213192833293316, "learning_rate": 8.974332349459992e-08, "logits/chosen": -0.3362307846546173, "logits/rejected": -0.5135927796363831, "logps/chosen": -418.939453125, "logps/rejected": -711.0570068359375, "loss": 0.2748, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.2188174724578857, "rewards/margins": 2.915469169616699, "rewards/rejected": -6.134286403656006, "step": 380 }, { "epoch": 0.7586674056284254, "grad_norm": 15.38124786698222, "learning_rate": 8.322996943714672e-08, "logits/chosen": -0.3672960698604584, "logits/rejected": -0.5296192765235901, "logps/chosen": -425.13427734375, "logps/rejected": -741.996337890625, "loss": 0.2826, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.2366461753845215, "rewards/margins": 3.166731119155884, "rewards/rejected": -6.403376579284668, "step": 385 }, { "epoch": 0.7685202290781452, "grad_norm": 20.521680628902402, "learning_rate": 7.691448773879256e-08, "logits/chosen": -0.38358789682388306, "logits/rejected": -0.5248023867607117, "logps/chosen": -465.2935485839844, "logps/rejected": -843.0035400390625, "loss": 0.3038, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.6291823387145996, "rewards/margins": 3.776301145553589, "rewards/rejected": -7.405484199523926, "step": 390 }, { "epoch": 0.778373052527865, "grad_norm": 22.670752535686123, "learning_rate": 7.080437170788722e-08, "logits/chosen": -0.3909732699394226, "logits/rejected": -0.46815380454063416, "logps/chosen": -467.8148498535156, "logps/rejected": -813.1572265625, "loss": 0.2818, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.7561402320861816, "rewards/margins": 3.3577582836151123, "rewards/rejected": -7.113898277282715, "step": 395 }, { "epoch": 0.7882258759775849, "grad_norm": 21.305345425176412, "learning_rate": 6.490687098676332e-08, "logits/chosen": -0.40551847219467163, "logits/rejected": -0.5719416737556458, "logps/chosen": -435.22393798828125, "logps/rejected": -831.7904052734375, "loss": 0.2563, "rewards/accuracies": 0.90625, "rewards/chosen": -3.3291544914245605, "rewards/margins": 3.9090828895568848, "rewards/rejected": -7.2382378578186035, "step": 400 }, { "epoch": 0.7882258759775849, "eval_logits/chosen": -1.0068713426589966, "eval_logits/rejected": -0.773371696472168, "eval_logps/chosen": -503.0605773925781, "eval_logps/rejected": -703.8283081054688, "eval_loss": 0.7016588449478149, "eval_rewards/accuracies": 0.7020000219345093, "eval_rewards/chosen": -4.100825786590576, "eval_rewards/margins": 1.7457716464996338, "eval_rewards/rejected": -5.846597671508789, "eval_runtime": 224.226, "eval_samples_per_second": 8.915, "eval_steps_per_second": 1.115, "step": 400 }, { "epoch": 0.7980786994273046, "grad_norm": 16.817512778396765, "learning_rate": 5.9228982950048414e-08, "logits/chosen": -0.34627681970596313, "logits/rejected": -0.648546576499939, "logps/chosen": -444.9021911621094, "logps/rejected": -793.8383178710938, "loss": 0.3108, "rewards/accuracies": 0.90625, "rewards/chosen": -3.4342200756073, "rewards/margins": 3.5523598194122314, "rewards/rejected": -6.986579895019531, "step": 405 }, { "epoch": 0.8079315228770244, "grad_norm": 18.65876742845436, "learning_rate": 5.3777444402291345e-08, "logits/chosen": -0.3808293342590332, "logits/rejected": -0.5627844929695129, "logps/chosen": -447.5360412597656, "logps/rejected": -714.4208374023438, "loss": 0.3028, "rewards/accuracies": 0.84375, "rewards/chosen": -3.4805076122283936, "rewards/margins": 2.6875407695770264, "rewards/rejected": -6.168048858642578, "step": 410 }, { "epoch": 0.8177843463267442, "grad_norm": 21.614577011578998, "learning_rate": 4.855872358475546e-08, "logits/chosen": -0.44765299558639526, "logits/rejected": -0.5962406992912292, "logps/chosen": -454.58135986328125, "logps/rejected": -746.631103515625, "loss": 0.3112, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.5401859283447266, "rewards/margins": 2.886667251586914, "rewards/rejected": -6.426853179931641, "step": 415 }, { "epoch": 0.8276371697764641, "grad_norm": 18.508069249718975, "learning_rate": 4.357901250086107e-08, "logits/chosen": -0.3780784606933594, "logits/rejected": -0.6282440423965454, "logps/chosen": -464.607666015625, "logps/rejected": -732.405029296875, "loss": 0.2871, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.5666542053222656, "rewards/margins": 2.74709153175354, "rewards/rejected": -6.313745498657227, "step": 420 }, { "epoch": 0.8374899932261839, "grad_norm": 21.590957086592525, "learning_rate": 3.884421956938377e-08, "logits/chosen": -0.38022860884666443, "logits/rejected": -0.6045624017715454, "logps/chosen": -429.6410217285156, "logps/rejected": -730.5736083984375, "loss": 0.2923, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.25273060798645, "rewards/margins": 3.082298994064331, "rewards/rejected": -6.335029125213623, "step": 425 }, { "epoch": 0.8473428166759037, "grad_norm": 15.197391870273057, "learning_rate": 3.435996261412591e-08, "logits/chosen": -0.4843681752681732, "logits/rejected": -0.5831128358840942, "logps/chosen": -393.3730773925781, "logps/rejected": -707.8683471679688, "loss": 0.2692, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.8446130752563477, "rewards/margins": 3.154205799102783, "rewards/rejected": -5.998819351196289, "step": 430 }, { "epoch": 0.8571956401256235, "grad_norm": 15.288363962234385, "learning_rate": 3.013156219837776e-08, "logits/chosen": -0.4089592397212982, "logits/rejected": -0.5905001759529114, "logps/chosen": -391.95831298828125, "logps/rejected": -707.7792358398438, "loss": 0.2841, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.9444174766540527, "rewards/margins": 3.1579596996307373, "rewards/rejected": -6.102376937866211, "step": 435 }, { "epoch": 0.8670484635753433, "grad_norm": 17.671998119316797, "learning_rate": 2.6164035312078447e-08, "logits/chosen": -0.42121395468711853, "logits/rejected": -0.5983023047447205, "logps/chosen": -422.3511657714844, "logps/rejected": -786.9813232421875, "loss": 0.3052, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.2139759063720703, "rewards/margins": 3.617478847503662, "rewards/rejected": -6.831455230712891, "step": 440 }, { "epoch": 0.8769012870250631, "grad_norm": 15.463536800192488, "learning_rate": 2.2462089419165776e-08, "logits/chosen": -0.39272046089172363, "logits/rejected": -0.5592643618583679, "logps/chosen": -421.66619873046875, "logps/rejected": -675.9927368164062, "loss": 0.3122, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1869752407073975, "rewards/margins": 2.581495761871338, "rewards/rejected": -5.7684712409973145, "step": 445 }, { "epoch": 0.8867541104747829, "grad_norm": 17.36075063464836, "learning_rate": 1.9030116872178314e-08, "logits/chosen": -0.42874231934547424, "logits/rejected": -0.5679312944412231, "logps/chosen": -416.0025939941406, "logps/rejected": -734.6575317382812, "loss": 0.289, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1724095344543457, "rewards/margins": 3.161487102508545, "rewards/rejected": -6.333896160125732, "step": 450 }, { "epoch": 0.8966069339245027, "grad_norm": 15.642265040018296, "learning_rate": 1.5872189700736337e-08, "logits/chosen": -0.4477892518043518, "logits/rejected": -0.579742431640625, "logps/chosen": -446.33349609375, "logps/rejected": -779.8458862304688, "loss": 0.2806, "rewards/accuracies": 0.84375, "rewards/chosen": -3.4629955291748047, "rewards/margins": 3.308091640472412, "rewards/rejected": -6.771086692810059, "step": 455 }, { "epoch": 0.9064597573742226, "grad_norm": 14.160026400581446, "learning_rate": 1.2992054780085692e-08, "logits/chosen": -0.3982602059841156, "logits/rejected": -0.5977696776390076, "logps/chosen": -418.07061767578125, "logps/rejected": -713.667236328125, "loss": 0.2735, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1576714515686035, "rewards/margins": 2.992490768432617, "rewards/rejected": -6.150162696838379, "step": 460 }, { "epoch": 0.9163125808239424, "grad_norm": 19.50813975803542, "learning_rate": 1.0393129385436823e-08, "logits/chosen": -0.364255428314209, "logits/rejected": -0.6485485434532166, "logps/chosen": -426.3414001464844, "logps/rejected": -729.6509399414062, "loss": 0.2897, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.199294328689575, "rewards/margins": 3.0993099212646484, "rewards/rejected": -6.2986040115356445, "step": 465 }, { "epoch": 0.9261654042736621, "grad_norm": 19.508164166434455, "learning_rate": 8.078497137373242e-09, "logits/chosen": -0.46434831619262695, "logits/rejected": -0.5418060421943665, "logps/chosen": -456.2374572753906, "logps/rejected": -767.5300903320312, "loss": 0.2769, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.5864601135253906, "rewards/margins": 3.058790922164917, "rewards/rejected": -6.645251274108887, "step": 470 }, { "epoch": 0.936018227723382, "grad_norm": 14.71074195682132, "learning_rate": 6.0509043431410945e-09, "logits/chosen": -0.36522990465164185, "logits/rejected": -0.5609266757965088, "logps/chosen": -441.4654235839844, "logps/rejected": -724.3133544921875, "loss": 0.2906, "rewards/accuracies": 0.90625, "rewards/chosen": -3.432781934738159, "rewards/margins": 2.8102941513061523, "rewards/rejected": -6.243076324462891, "step": 475 }, { "epoch": 0.9458710511731018, "grad_norm": 15.31419630581113, "learning_rate": 4.312756738160145e-09, "logits/chosen": -0.4177488386631012, "logits/rejected": -0.6274289488792419, "logps/chosen": -437.7715759277344, "logps/rejected": -771.16552734375, "loss": 0.2431, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.4045562744140625, "rewards/margins": 3.3424010276794434, "rewards/rejected": -6.746957302093506, "step": 480 }, { "epoch": 0.9557238746228216, "grad_norm": 17.527039297327423, "learning_rate": 2.8661166316229223e-09, "logits/chosen": -0.4048340320587158, "logits/rejected": -0.5873134732246399, "logps/chosen": -422.97003173828125, "logps/rejected": -773.1441650390625, "loss": 0.2806, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.2094051837921143, "rewards/margins": 3.512725830078125, "rewards/rejected": -6.722131252288818, "step": 485 }, { "epoch": 0.9655766980725414, "grad_norm": 14.493022786248968, "learning_rate": 1.7127004595681727e-09, "logits/chosen": -0.39268267154693604, "logits/rejected": -0.5996861457824707, "logps/chosen": -428.91986083984375, "logps/rejected": -711.802001953125, "loss": 0.3532, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.3111557960510254, "rewards/margins": 2.841479539871216, "rewards/rejected": -6.152635097503662, "step": 490 }, { "epoch": 0.9754295215222613, "grad_norm": 18.440387807689753, "learning_rate": 8.538767483325383e-10, "logits/chosen": -0.4720977246761322, "logits/rejected": -0.5540111064910889, "logps/chosen": -421.6551208496094, "logps/rejected": -734.9032592773438, "loss": 0.2881, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.2461752891540527, "rewards/margins": 3.103208065032959, "rewards/rejected": -6.349383354187012, "step": 495 }, { "epoch": 0.9852823449719811, "grad_norm": 24.055730221765614, "learning_rate": 2.9066449079634404e-10, "logits/chosen": -0.3708307147026062, "logits/rejected": -0.6160975694656372, "logps/chosen": -466.91717529296875, "logps/rejected": -752.3226318359375, "loss": 0.2672, "rewards/accuracies": 0.90625, "rewards/chosen": -3.598177671432495, "rewards/margins": 2.9196102619171143, "rewards/rejected": -6.517787933349609, "step": 500 }, { "epoch": 0.9951351684217008, "grad_norm": 16.011780285680448, "learning_rate": 2.3731937350224273e-11, "logits/chosen": -0.3636520802974701, "logits/rejected": -0.6337639689445496, "logps/chosen": -420.154052734375, "logps/rejected": -775.5059814453125, "loss": 0.2825, "rewards/accuracies": 0.90625, "rewards/chosen": -3.1779532432556152, "rewards/margins": 3.574568510055542, "rewards/rejected": -6.752521514892578, "step": 505 }, { "epoch": 0.9990762978015888, "step": 507, "total_flos": 0.0, "train_loss": 0.3847499547390308, "train_runtime": 27570.709, "train_samples_per_second": 2.356, "train_steps_per_second": 0.018 } ], "logging_steps": 5, "max_steps": 507, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }