{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993240502906584, "eval_steps": 1000, "global_step": 462, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.127659574468085e-08, "logits/chosen": -2.4600229263305664, "logits/rejected": -2.442487955093384, "logps/chosen": -419.2090759277344, "logps/rejected": -388.2476501464844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/mix_margin": -1.1801719779214181e-07, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 2.127659574468085e-07, "logits/chosen": -2.3608736991882324, "logits/rejected": -2.3287594318389893, "logps/chosen": -318.4012451171875, "logps/rejected": -263.6573181152344, "loss": 0.6927, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.0007825422217138112, "rewards/confidence": -0.008881219662725925, "rewards/confidence_mean_diff": 0.008881219662725925, "rewards/confidence_moving_diff": 0.001681807334534824, "rewards/margins": 0.0013569907750934362, "rewards/mix_margin": 0.0007381609757430851, "rewards/real_percentage": 15.428571701049805, "rewards/rejected": -0.0005744485533796251, "step": 10 }, { "epoch": 0.04, "learning_rate": 4.25531914893617e-07, "logits/chosen": -2.4250235557556152, "logits/rejected": -2.4130008220672607, "logps/chosen": -301.75958251953125, "logps/rejected": -263.33489990234375, "loss": 0.689, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.012543246150016785, "rewards/confidence": -0.011264830827713013, "rewards/confidence_mean_diff": 0.011264830827713013, "rewards/confidence_moving_diff": 0.00037633898318745196, "rewards/margins": 0.007124939002096653, "rewards/mix_margin": 0.003652904648333788, "rewards/real_percentage": 7.199999809265137, "rewards/rejected": 0.005418307613581419, "step": 20 }, { "epoch": 0.06, "learning_rate": 6.382978723404255e-07, "logits/chosen": -2.3625423908233643, "logits/rejected": -2.3323898315429688, "logps/chosen": -357.43328857421875, "logps/rejected": -314.91290283203125, "loss": 0.6747, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.04857476428151131, "rewards/confidence": -0.043907828629016876, "rewards/confidence_mean_diff": 0.043907828629016876, "rewards/confidence_moving_diff": 0.007038711104542017, "rewards/margins": 0.0557299479842186, "rewards/mix_margin": 0.028763342648744583, "rewards/real_percentage": 7.0, "rewards/rejected": -0.007155182305723429, "step": 30 }, { "epoch": 0.09, "learning_rate": 8.51063829787234e-07, "logits/chosen": -2.328860282897949, "logits/rejected": -2.324850082397461, "logps/chosen": -328.1289978027344, "logps/rejected": -297.20880126953125, "loss": 0.658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05925966426730156, "rewards/confidence": -0.1791446954011917, "rewards/confidence_mean_diff": 0.1791446954011917, "rewards/confidence_moving_diff": 0.016184808686375618, "rewards/margins": 0.08581773936748505, "rewards/mix_margin": 0.04467679560184479, "rewards/real_percentage": 7.0, "rewards/rejected": -0.1450774073600769, "step": 40 }, { "epoch": 0.11, "learning_rate": 9.998710660154897e-07, "logits/chosen": -2.2127022743225098, "logits/rejected": -2.2161879539489746, "logps/chosen": -326.4324645996094, "logps/rejected": -318.18798828125, "loss": 0.6406, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.1789505034685135, "rewards/confidence": -0.23482057452201843, "rewards/confidence_mean_diff": 0.23482057452201843, "rewards/confidence_moving_diff": -9.797290113056079e-05, "rewards/margins": 0.10209941864013672, "rewards/mix_margin": 0.05361374467611313, "rewards/real_percentage": 6.199999809265137, "rewards/rejected": -0.2810499370098114, "step": 50 }, { "epoch": 0.13, "learning_rate": 9.975807556654536e-07, "logits/chosen": -2.08450984954834, "logits/rejected": -2.0751709938049316, "logps/chosen": -337.4300537109375, "logps/rejected": -329.66619873046875, "loss": 0.6214, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.40547019243240356, "rewards/confidence": -0.24584360420703888, "rewards/confidence_mean_diff": 0.24584360420703888, "rewards/confidence_moving_diff": -0.008521117269992828, "rewards/margins": 0.17091026902198792, "rewards/mix_margin": 0.09404207020998001, "rewards/real_percentage": 5.400000095367432, "rewards/rejected": -0.5763804912567139, "step": 60 }, { "epoch": 0.15, "learning_rate": 9.92440347807533e-07, "logits/chosen": -1.7078907489776611, "logits/rejected": -1.717552900314331, "logps/chosen": -378.1355895996094, "logps/rejected": -374.5966796875, "loss": 0.5925, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8885644674301147, "rewards/confidence": -0.3056587278842926, "rewards/confidence_mean_diff": 0.3056587278842926, "rewards/confidence_moving_diff": 0.03268023580312729, "rewards/margins": 0.28360408544540405, "rewards/mix_margin": 0.13858327269554138, "rewards/real_percentage": 7.800000190734863, "rewards/rejected": -1.1721686124801636, "step": 70 }, { "epoch": 0.17, "learning_rate": 9.844792862324257e-07, "logits/chosen": -1.7334696054458618, "logits/rejected": -1.7242892980575562, "logps/chosen": -395.7420959472656, "logps/rejected": -401.59197998046875, "loss": 0.5764, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.094656229019165, "rewards/confidence": -0.3173515498638153, "rewards/confidence_mean_diff": 0.3173515498638153, "rewards/confidence_moving_diff": -0.02396375872194767, "rewards/margins": 0.48389825224876404, "rewards/mix_margin": 0.21666650474071503, "rewards/real_percentage": 5.599999904632568, "rewards/rejected": -1.5785545110702515, "step": 80 }, { "epoch": 0.19, "learning_rate": 9.737431711798862e-07, "logits/chosen": -1.9971452951431274, "logits/rejected": -1.9761130809783936, "logps/chosen": -391.64276123046875, "logps/rejected": -360.64886474609375, "loss": 0.5762, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9299066662788391, "rewards/confidence": -0.339626669883728, "rewards/confidence_mean_diff": 0.339626669883728, "rewards/confidence_moving_diff": 0.0005624383920803666, "rewards/margins": 0.3824290335178375, "rewards/mix_margin": 0.22833208739757538, "rewards/real_percentage": 6.599999904632568, "rewards/rejected": -1.312335729598999, "step": 90 }, { "epoch": 0.22, "learning_rate": 9.602934981446803e-07, "logits/chosen": -2.2859737873077393, "logits/rejected": -2.2752556800842285, "logps/chosen": -375.16729736328125, "logps/rejected": -376.62249755859375, "loss": 0.5654, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9961857795715332, "rewards/confidence": -0.3247523903846741, "rewards/confidence_mean_diff": 0.3247523903846741, "rewards/confidence_moving_diff": 0.013505702838301659, "rewards/margins": 0.4200294017791748, "rewards/mix_margin": 0.20285817980766296, "rewards/real_percentage": 6.400000095367432, "rewards/rejected": -1.416215181350708, "step": 100 }, { "epoch": 0.24, "learning_rate": 9.442073056359603e-07, "logits/chosen": -2.292510986328125, "logits/rejected": -2.240940570831299, "logps/chosen": -380.4697570800781, "logps/rejected": -363.1268005371094, "loss": 0.5619, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0786364078521729, "rewards/confidence": -0.28994521498680115, "rewards/confidence_mean_diff": 0.28994521498680115, "rewards/confidence_moving_diff": -0.01229217927902937, "rewards/margins": 0.4588547348976135, "rewards/mix_margin": 0.2650415003299713, "rewards/real_percentage": 5.800000190734863, "rewards/rejected": -1.5374912023544312, "step": 110 }, { "epoch": 0.26, "learning_rate": 9.255767339076622e-07, "logits/chosen": -2.248251438140869, "logits/rejected": -2.198671340942383, "logps/chosen": -444.0166931152344, "logps/rejected": -446.957275390625, "loss": 0.5569, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5204365253448486, "rewards/confidence": -0.38544487953186035, "rewards/confidence_mean_diff": 0.38544487953186035, "rewards/confidence_moving_diff": 0.014522197656333447, "rewards/margins": 0.4478101134300232, "rewards/mix_margin": 0.22497253119945526, "rewards/real_percentage": 7.0, "rewards/rejected": -1.9682468175888062, "step": 120 }, { "epoch": 0.28, "learning_rate": 9.045084971874737e-07, "logits/chosen": -2.147189140319824, "logits/rejected": -2.145460605621338, "logps/chosen": -368.4799499511719, "logps/rejected": -378.8326416015625, "loss": 0.5498, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2452112436294556, "rewards/confidence": -0.3624282479286194, "rewards/confidence_mean_diff": 0.3624282479286194, "rewards/confidence_moving_diff": 0.014734315685927868, "rewards/margins": 0.4154941439628601, "rewards/mix_margin": 0.19120317697525024, "rewards/real_percentage": 7.800000190734863, "rewards/rejected": -1.660705327987671, "step": 130 }, { "epoch": 0.3, "learning_rate": 8.811232724274034e-07, "logits/chosen": -2.002075433731079, "logits/rejected": -2.005988597869873, "logps/chosen": -417.646728515625, "logps/rejected": -431.691162109375, "loss": 0.537, "rewards/accuracies": 0.71875, "rewards/chosen": -1.487528920173645, "rewards/confidence": -0.4086700975894928, "rewards/confidence_mean_diff": 0.4086700975894928, "rewards/confidence_moving_diff": -0.02756505273282528, "rewards/margins": 0.5225194096565247, "rewards/mix_margin": 0.2395971715450287, "rewards/real_percentage": 6.0, "rewards/rejected": -2.0100481510162354, "step": 140 }, { "epoch": 0.32, "learning_rate": 8.555550080771272e-07, "logits/chosen": -2.0403356552124023, "logits/rejected": -2.0266761779785156, "logps/chosen": -392.81732177734375, "logps/rejected": -392.935791015625, "loss": 0.5576, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3171683549880981, "rewards/confidence": -0.3989728093147278, "rewards/confidence_mean_diff": 0.3989728093147278, "rewards/confidence_moving_diff": 0.0009117841836996377, "rewards/margins": 0.5276567339897156, "rewards/mix_margin": 0.2687237858772278, "rewards/real_percentage": 6.0, "rewards/rejected": -1.8448251485824585, "step": 150 }, { "epoch": 0.35, "learning_rate": 8.279501568393994e-07, "logits/chosen": -2.0831470489501953, "logits/rejected": -2.0378174781799316, "logps/chosen": -440.0665588378906, "logps/rejected": -452.62176513671875, "loss": 0.5631, "rewards/accuracies": 0.75, "rewards/chosen": -1.973459243774414, "rewards/confidence": -0.37913259863853455, "rewards/confidence_mean_diff": 0.37913259863853455, "rewards/confidence_moving_diff": -0.007635933347046375, "rewards/margins": 0.44591131806373596, "rewards/mix_margin": 0.2109535038471222, "rewards/real_percentage": 6.400000095367432, "rewards/rejected": -2.419370412826538, "step": 160 }, { "epoch": 0.37, "learning_rate": 7.984668368022335e-07, "logits/chosen": -2.058879852294922, "logits/rejected": -2.0460941791534424, "logps/chosen": -469.419921875, "logps/rejected": -465.22857666015625, "loss": 0.5595, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6018226146697998, "rewards/confidence": -0.3371889293193817, "rewards/confidence_mean_diff": 0.3371889293193817, "rewards/confidence_moving_diff": 0.02232498861849308, "rewards/margins": 0.4381260275840759, "rewards/mix_margin": 0.2263106405735016, "rewards/real_percentage": 7.199999809265137, "rewards/rejected": -2.0399487018585205, "step": 170 }, { "epoch": 0.39, "learning_rate": 7.672739257528134e-07, "logits/chosen": -2.1524975299835205, "logits/rejected": -2.1426620483398438, "logps/chosen": -395.23260498046875, "logps/rejected": -424.2964782714844, "loss": 0.5497, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5090861320495605, "rewards/confidence": -0.5779476165771484, "rewards/confidence_mean_diff": 0.5779476165771484, "rewards/confidence_moving_diff": -0.008263492956757545, "rewards/margins": 0.48516401648521423, "rewards/mix_margin": 0.2023618519306183, "rewards/real_percentage": 6.599999904632568, "rewards/rejected": -1.9942500591278076, "step": 180 }, { "epoch": 0.41, "learning_rate": 7.345500938608219e-07, "logits/chosen": -2.2230193614959717, "logits/rejected": -2.2007393836975098, "logps/chosen": -421.42156982421875, "logps/rejected": -427.07110595703125, "loss": 0.519, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7702022790908813, "rewards/confidence": -0.30366313457489014, "rewards/confidence_mean_diff": 0.30366313457489014, "rewards/confidence_moving_diff": -0.005598017480224371, "rewards/margins": 0.6354817152023315, "rewards/mix_margin": 0.295926570892334, "rewards/real_percentage": 6.599999904632568, "rewards/rejected": -2.405683755874634, "step": 190 }, { "epoch": 0.43, "learning_rate": 7.004827802718889e-07, "logits/chosen": -2.1584935188293457, "logits/rejected": -2.1630496978759766, "logps/chosen": -462.48199462890625, "logps/rejected": -469.06536865234375, "loss": 0.5574, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9269781112670898, "rewards/confidence": -0.3838357627391815, "rewards/confidence_mean_diff": 0.3838357627391815, "rewards/confidence_moving_diff": -0.017508838325738907, "rewards/margins": 0.6354564428329468, "rewards/mix_margin": 0.31543755531311035, "rewards/real_percentage": 5.0, "rewards/rejected": -2.562434434890747, "step": 200 }, { "epoch": 0.45, "learning_rate": 6.652671194731395e-07, "logits/chosen": -2.107018232345581, "logits/rejected": -2.055001974105835, "logps/chosen": -462.31072998046875, "logps/rejected": -467.524658203125, "loss": 0.544, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9643453359603882, "rewards/confidence": -0.46903911232948303, "rewards/confidence_mean_diff": 0.46903911232948303, "rewards/confidence_moving_diff": 0.035244446247816086, "rewards/margins": 0.6127676963806152, "rewards/mix_margin": 0.2926762104034424, "rewards/real_percentage": 6.800000190734863, "rewards/rejected": -2.577113151550293, "step": 210 }, { "epoch": 0.48, "learning_rate": 6.291048235805233e-07, "logits/chosen": -2.1444382667541504, "logits/rejected": -2.1225123405456543, "logps/chosen": -446.4590759277344, "logps/rejected": -433.69573974609375, "loss": 0.5804, "rewards/accuracies": 0.75, "rewards/chosen": -1.3874393701553345, "rewards/confidence": -0.41097918152809143, "rewards/confidence_mean_diff": 0.41097918152809143, "rewards/confidence_moving_diff": -0.002999979304149747, "rewards/margins": 0.44998854398727417, "rewards/mix_margin": 0.27227410674095154, "rewards/real_percentage": 6.0, "rewards/rejected": -1.837428092956543, "step": 220 }, { "epoch": 0.5, "learning_rate": 5.922030269500808e-07, "logits/chosen": -2.0928385257720947, "logits/rejected": -2.0588412284851074, "logps/chosen": -425.83514404296875, "logps/rejected": -431.23382568359375, "loss": 0.5415, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7153390645980835, "rewards/confidence": -0.31595852971076965, "rewards/confidence_mean_diff": 0.31595852971076965, "rewards/confidence_moving_diff": -0.013669434003531933, "rewards/margins": 0.5658494234085083, "rewards/mix_margin": 0.25558221340179443, "rewards/real_percentage": 6.0, "rewards/rejected": -2.281188488006592, "step": 230 }, { "epoch": 0.52, "learning_rate": 5.547730997311105e-07, "logits/chosen": -1.9946399927139282, "logits/rejected": -1.9523468017578125, "logps/chosen": -428.6726989746094, "logps/rejected": -438.1825256347656, "loss": 0.543, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0739216804504395, "rewards/confidence": -0.345465749502182, "rewards/confidence_mean_diff": 0.345465749502182, "rewards/confidence_moving_diff": 0.006545326206833124, "rewards/margins": 0.6022626161575317, "rewards/mix_margin": 0.3206036686897278, "rewards/real_percentage": 5.400000095367432, "rewards/rejected": -2.6761844158172607, "step": 240 }, { "epoch": 0.54, "learning_rate": 5.17029437157094e-07, "logits/chosen": -2.0504579544067383, "logits/rejected": -2.0380096435546875, "logps/chosen": -455.2330017089844, "logps/rejected": -487.70477294921875, "loss": 0.5263, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9092988967895508, "rewards/confidence": -0.521675169467926, "rewards/confidence_mean_diff": 0.521675169467926, "rewards/confidence_moving_diff": 0.03468245267868042, "rewards/margins": 0.6157360672950745, "rewards/mix_margin": 0.2263031005859375, "rewards/real_percentage": 6.800000190734863, "rewards/rejected": -2.5250351428985596, "step": 250 }, { "epoch": 0.56, "learning_rate": 4.791882315092155e-07, "logits/chosen": -2.040631055831909, "logits/rejected": -2.0134525299072266, "logps/chosen": -465.2564392089844, "logps/rejected": -469.700927734375, "loss": 0.5102, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8632171154022217, "rewards/confidence": -0.4674126207828522, "rewards/confidence_mean_diff": 0.4674126207828522, "rewards/confidence_moving_diff": -0.04609960317611694, "rewards/margins": 0.6828121542930603, "rewards/mix_margin": 0.34053856134414673, "rewards/real_percentage": 6.0, "rewards/rejected": -2.546029567718506, "step": 260 }, { "epoch": 0.58, "learning_rate": 4.414662337865529e-07, "logits/chosen": -2.028301477432251, "logits/rejected": -2.0357513427734375, "logps/chosen": -431.650390625, "logps/rejected": -428.67626953125, "loss": 0.5268, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9017683267593384, "rewards/confidence": -0.36343130469322205, "rewards/confidence_mean_diff": 0.36343130469322205, "rewards/confidence_moving_diff": 0.005257821176201105, "rewards/margins": 0.6798229217529297, "rewards/mix_margin": 0.36580324172973633, "rewards/real_percentage": 6.400000095367432, "rewards/rejected": -2.5815913677215576, "step": 270 }, { "epoch": 0.61, "learning_rate": 4.04079512175984e-07, "logits/chosen": -2.102811336517334, "logits/rejected": -2.081491708755493, "logps/chosen": -445.09539794921875, "logps/rejected": -462.83721923828125, "loss": 0.5192, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.2766060829162598, "rewards/confidence": -0.33939874172210693, "rewards/confidence_mean_diff": 0.33939874172210693, "rewards/confidence_moving_diff": 0.027284782379865646, "rewards/margins": 0.4541402757167816, "rewards/mix_margin": 0.2212882786989212, "rewards/real_percentage": 7.400000095367432, "rewards/rejected": -2.730746030807495, "step": 280 }, { "epoch": 0.63, "learning_rate": 3.672422144331785e-07, "logits/chosen": -2.1012184619903564, "logits/rejected": -2.076214075088501, "logps/chosen": -435.23114013671875, "logps/rejected": -446.3954162597656, "loss": 0.5413, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6879991292953491, "rewards/confidence": -0.48170310258865356, "rewards/confidence_mean_diff": 0.48170310258865356, "rewards/confidence_moving_diff": -0.0009890676010400057, "rewards/margins": 0.6249912977218628, "rewards/mix_margin": 0.30173999071121216, "rewards/real_percentage": 6.199999809265137, "rewards/rejected": -2.312990665435791, "step": 290 }, { "epoch": 0.65, "learning_rate": 3.311653412636468e-07, "logits/chosen": -2.0913848876953125, "logits/rejected": -2.060335874557495, "logps/chosen": -439.56683349609375, "logps/rejected": -458.71905517578125, "loss": 0.5183, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9180190563201904, "rewards/confidence": -0.5002374649047852, "rewards/confidence_mean_diff": 0.5002374649047852, "rewards/confidence_moving_diff": -0.026549097150564194, "rewards/margins": 0.6147478818893433, "rewards/mix_margin": 0.26649755239486694, "rewards/real_percentage": 5.800000190734863, "rewards/rejected": -2.532766819000244, "step": 300 }, { "epoch": 0.67, "learning_rate": 2.9605553772980796e-07, "logits/chosen": -2.05464243888855, "logits/rejected": -2.0151495933532715, "logps/chosen": -452.279296875, "logps/rejected": -461.88897705078125, "loss": 0.5377, "rewards/accuracies": 0.6875, "rewards/chosen": -2.083768129348755, "rewards/confidence": -0.394141286611557, "rewards/confidence_mean_diff": 0.394141286611557, "rewards/confidence_moving_diff": 0.00520284753292799, "rewards/margins": 0.5840972661972046, "rewards/mix_margin": 0.2817578911781311, "rewards/real_percentage": 6.0, "rewards/rejected": -2.66786527633667, "step": 310 }, { "epoch": 0.69, "learning_rate": 2.6211390960678407e-07, "logits/chosen": -2.120527505874634, "logits/rejected": -2.0766115188598633, "logps/chosen": -476.22369384765625, "logps/rejected": -501.09716796875, "loss": 0.5131, "rewards/accuracies": 0.75, "rewards/chosen": -2.364851474761963, "rewards/confidence": -0.44711294770240784, "rewards/confidence_mean_diff": 0.44711294770240784, "rewards/confidence_moving_diff": 0.000249391800025478, "rewards/margins": 0.6894611716270447, "rewards/mix_margin": 0.2901640832424164, "rewards/real_percentage": 5.599999904632568, "rewards/rejected": -3.0543129444122314, "step": 320 }, { "epoch": 0.71, "learning_rate": 2.29534871466734e-07, "logits/chosen": -2.0330708026885986, "logits/rejected": -2.0034308433532715, "logps/chosen": -484.76837158203125, "logps/rejected": -491.1351013183594, "loss": 0.5358, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.210871934890747, "rewards/confidence": -0.45081382989883423, "rewards/confidence_mean_diff": 0.45081382989883423, "rewards/confidence_moving_diff": -0.0033632635604590178, "rewards/margins": 0.6040040254592896, "rewards/mix_margin": 0.27995753288269043, "rewards/real_percentage": 5.599999904632568, "rewards/rejected": -2.814876079559326, "step": 330 }, { "epoch": 0.74, "learning_rate": 1.9850503308978828e-07, "logits/chosen": -2.0655150413513184, "logits/rejected": -2.0272319316864014, "logps/chosen": -479.78338623046875, "logps/rejected": -496.1671447753906, "loss": 0.5244, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.3855416774749756, "rewards/confidence": -0.4124049246311188, "rewards/confidence_mean_diff": 0.4124049246311188, "rewards/confidence_moving_diff": 0.016355862841010094, "rewards/margins": 0.6651355624198914, "rewards/mix_margin": 0.3446359634399414, "rewards/real_percentage": 6.400000095367432, "rewards/rejected": -3.0506770610809326, "step": 340 }, { "epoch": 0.76, "learning_rate": 1.6920213058013022e-07, "logits/chosen": -1.9955543279647827, "logits/rejected": -1.9886165857315063, "logps/chosen": -456.9586486816406, "logps/rejected": -483.20550537109375, "loss": 0.5261, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.1107418537139893, "rewards/confidence": -0.5677846670150757, "rewards/confidence_mean_diff": 0.5677846670150757, "rewards/confidence_moving_diff": 0.0054519325494766235, "rewards/margins": 0.7480586171150208, "rewards/mix_margin": 0.2732301950454712, "rewards/real_percentage": 6.400000095367432, "rewards/rejected": -2.8588004112243652, "step": 350 }, { "epoch": 0.78, "learning_rate": 1.4179400830968412e-07, "logits/chosen": -2.0179452896118164, "logits/rejected": -2.005185842514038, "logps/chosen": -468.8108825683594, "logps/rejected": -483.4927673339844, "loss": 0.523, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.018651247024536, "rewards/confidence": -0.41795068979263306, "rewards/confidence_mean_diff": 0.41795068979263306, "rewards/confidence_moving_diff": -0.030550379306077957, "rewards/margins": 0.7332804799079895, "rewards/mix_margin": 0.33876484632492065, "rewards/real_percentage": 5.0, "rewards/rejected": -2.75193190574646, "step": 360 }, { "epoch": 0.8, "learning_rate": 1.1643765752075468e-07, "logits/chosen": -2.078735828399658, "logits/rejected": -2.0652003288269043, "logps/chosen": -422.59869384765625, "logps/rejected": -423.21875, "loss": 0.5111, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1171858310699463, "rewards/confidence": -0.4289500117301941, "rewards/confidence_mean_diff": 0.4289500117301941, "rewards/confidence_moving_diff": 0.009287357330322266, "rewards/margins": 0.5101779103279114, "rewards/mix_margin": 0.2586424648761749, "rewards/real_percentage": 6.800000190734863, "rewards/rejected": -2.627363681793213, "step": 370 }, { "epoch": 0.82, "learning_rate": 9.327831709440792e-08, "logits/chosen": -1.9588645696640015, "logits/rejected": -1.9197273254394531, "logps/chosen": -449.98785400390625, "logps/rejected": -502.673583984375, "loss": 0.5001, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.004117250442505, "rewards/confidence": -0.36164262890815735, "rewards/confidence_mean_diff": 0.36164262890815735, "rewards/confidence_moving_diff": -0.014251338317990303, "rewards/margins": 0.7769009470939636, "rewards/mix_margin": 0.3241749405860901, "rewards/real_percentage": 6.0, "rewards/rejected": -2.781017780303955, "step": 380 }, { "epoch": 0.84, "learning_rate": 7.244864163531162e-08, "logits/chosen": -2.064495325088501, "logits/rejected": -2.0268824100494385, "logps/chosen": -422.8828125, "logps/rejected": -445.21575927734375, "loss": 0.5537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9303261041641235, "rewards/confidence": -0.4756762981414795, "rewards/confidence_mean_diff": 0.4756762981414795, "rewards/confidence_moving_diff": -0.0012016535038128495, "rewards/margins": 0.5886819362640381, "rewards/mix_margin": 0.27653414011001587, "rewards/real_percentage": 7.0, "rewards/rejected": -2.519007921218872, "step": 390 }, { "epoch": 0.87, "learning_rate": 5.4067941638174795e-08, "logits/chosen": -2.077833414077759, "logits/rejected": -2.0256028175354004, "logps/chosen": -427.36444091796875, "logps/rejected": -447.19708251953125, "loss": 0.5163, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0830533504486084, "rewards/confidence": -0.3510534167289734, "rewards/confidence_mean_diff": 0.3510534167289734, "rewards/confidence_moving_diff": 0.025139298290014267, "rewards/margins": 0.7282416224479675, "rewards/mix_margin": 0.3104473054409027, "rewards/real_percentage": 7.0, "rewards/rejected": -2.8112950325012207, "step": 400 }, { "epoch": 0.89, "learning_rate": 3.824150008803767e-08, "logits/chosen": -2.0504865646362305, "logits/rejected": -2.014496326446533, "logps/chosen": -397.7694091796875, "logps/rejected": -405.7162170410156, "loss": 0.5226, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7523311376571655, "rewards/confidence": -0.31372588872909546, "rewards/confidence_mean_diff": 0.31372588872909546, "rewards/confidence_moving_diff": -0.014554053544998169, "rewards/margins": 0.6601498126983643, "rewards/mix_margin": 0.33540448546409607, "rewards/real_percentage": 6.0, "rewards/rejected": -2.4124810695648193, "step": 410 }, { "epoch": 0.91, "learning_rate": 2.5059969408867844e-08, "logits/chosen": -2.0050809383392334, "logits/rejected": -1.9919992685317993, "logps/chosen": -398.9284362792969, "logps/rejected": -417.0933532714844, "loss": 0.5172, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4770046472549438, "rewards/confidence": -0.3290501832962036, "rewards/confidence_mean_diff": 0.3290501832962036, "rewards/confidence_moving_diff": 0.02498219534754753, "rewards/margins": 0.6767237186431885, "rewards/mix_margin": 0.34894901514053345, "rewards/real_percentage": 6.400000095367432, "rewards/rejected": -2.1537282466888428, "step": 420 }, { "epoch": 0.93, "learning_rate": 1.4598852214685486e-08, "logits/chosen": -2.111842632293701, "logits/rejected": -2.0859241485595703, "logps/chosen": -419.46783447265625, "logps/rejected": -449.5113220214844, "loss": 0.5041, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6683928966522217, "rewards/confidence": -0.38691821694374084, "rewards/confidence_mean_diff": 0.38691821694374084, "rewards/confidence_moving_diff": -0.019811248406767845, "rewards/margins": 0.7039911150932312, "rewards/mix_margin": 0.30413728952407837, "rewards/real_percentage": 5.800000190734863, "rewards/rejected": -2.3723835945129395, "step": 430 }, { "epoch": 0.95, "learning_rate": 6.918068837427127e-09, "logits/chosen": -2.06069016456604, "logits/rejected": -2.0733203887939453, "logps/chosen": -453.7708435058594, "logps/rejected": -459.8556213378906, "loss": 0.5331, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9116617441177368, "rewards/confidence": -0.3324928283691406, "rewards/confidence_mean_diff": 0.3324928283691406, "rewards/confidence_moving_diff": 0.008450254797935486, "rewards/margins": 0.7425110936164856, "rewards/mix_margin": 0.3466404378414154, "rewards/real_percentage": 7.0, "rewards/rejected": -2.654172897338867, "step": 440 }, { "epoch": 0.97, "learning_rate": 2.0616141087114737e-09, "logits/chosen": -2.075761079788208, "logits/rejected": -2.0503153800964355, "logps/chosen": -475.33038330078125, "logps/rejected": -491.11932373046875, "loss": 0.4972, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7454744577407837, "rewards/confidence": -0.26364636421203613, "rewards/confidence_mean_diff": 0.26364636421203613, "rewards/confidence_moving_diff": -0.0049579874612390995, "rewards/margins": 0.8323339223861694, "rewards/mix_margin": 0.3889670968055725, "rewards/real_percentage": 6.0, "rewards/rejected": -2.577807903289795, "step": 450 }, { "epoch": 0.99, "learning_rate": 5.7305361427451014e-11, "logits/chosen": -2.111032724380493, "logits/rejected": -2.093623638153076, "logps/chosen": -483.09967041015625, "logps/rejected": -487.7115173339844, "loss": 0.5639, "rewards/accuracies": 0.65625, "rewards/chosen": -2.091290235519409, "rewards/confidence": -0.47068697214126587, "rewards/confidence_mean_diff": 0.47068697214126587, "rewards/confidence_moving_diff": 0.013580495491623878, "rewards/margins": 0.5157801508903503, "rewards/mix_margin": 0.26322659850120544, "rewards/real_percentage": 5.800000190734863, "rewards/rejected": -2.6070706844329834, "step": 460 }, { "epoch": 1.0, "step": 462, "total_flos": 0.0, "train_loss": 0.5550967540059771, "train_runtime": 33981.529, "train_samples_per_second": 0.871, "train_steps_per_second": 0.014 } ], "logging_steps": 10, "max_steps": 462, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }