diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8333 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 4176, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.8733329991965941, + "learning_rate": 1.1961722488038277e-09, + "logits/chosen": -2.8505566120147705, + "logits/rejected": -2.908921003341675, + "logps/chosen": -429.770751953125, + "logps/rejected": -264.9197998046875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/margins_max": 0.0, + "rewards/margins_min": 0.0, + "rewards/margins_std": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 1.817759545013798, + "learning_rate": 1.1961722488038278e-08, + "logits/chosen": -2.7373788356781006, + "logits/rejected": -2.7256851196289062, + "logps/chosen": -308.5910339355469, + "logps/rejected": -256.5116271972656, + "loss": 0.6931, + "rewards/accuracies": 0.0694444477558136, + "rewards/chosen": -8.499662362737581e-05, + "rewards/margins": -6.767747981939465e-05, + "rewards/margins_max": 0.0005438412772491574, + "rewards/margins_min": -0.0006299633532762527, + "rewards/margins_std": 0.0005042126285843551, + "rewards/rejected": -1.731912743707653e-05, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 1.6702737002599026, + "learning_rate": 2.3923444976076555e-08, + "logits/chosen": -2.7464852333068848, + "logits/rejected": -2.726733922958374, + "logps/chosen": -240.0852813720703, + "logps/rejected": -258.0418701171875, + "loss": 0.6932, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00010407304216641933, + "rewards/margins": 0.00011324265506118536, + "rewards/margins_max": 0.003207577858120203, + "rewards/margins_min": -0.0033295839093625546, + "rewards/margins_std": 0.0029281422030180693, + "rewards/rejected": -9.169587428914383e-06, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 2.107174099558945, + "learning_rate": 3.588516746411483e-08, + "logits/chosen": -2.8826613426208496, + "logits/rejected": -2.850792407989502, + "logps/chosen": -340.63238525390625, + "logps/rejected": -264.9729919433594, + "loss": 0.6933, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0002944297739304602, + "rewards/margins": 0.0002557325060479343, + "rewards/margins_max": 0.003186721820384264, + "rewards/margins_min": -0.0027455384843051434, + "rewards/margins_std": 0.002718889620155096, + "rewards/rejected": 3.86972569685895e-05, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 2.304921801020499, + "learning_rate": 4.784688995215311e-08, + "logits/chosen": -2.7977702617645264, + "logits/rejected": -2.766904354095459, + "logps/chosen": -264.3175354003906, + "logps/rejected": -238.17086791992188, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00032830884447321296, + "rewards/margins": 0.0003703173715621233, + "rewards/margins_max": 0.0032206419855356216, + "rewards/margins_min": -0.0021116649731993675, + "rewards/margins_std": 0.00244266539812088, + "rewards/rejected": -4.200851981295273e-05, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 1.7720520388580636, + "learning_rate": 5.980861244019139e-08, + "logits/chosen": -2.871934413909912, + "logits/rejected": -2.8557310104370117, + "logps/chosen": -328.1521911621094, + "logps/rejected": -322.0428771972656, + "loss": 0.6932, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.00033686935785226524, + "rewards/margins": -0.000777339213527739, + "rewards/margins_max": 0.0024222906213253736, + "rewards/margins_min": -0.00480139022693038, + "rewards/margins_std": 0.003326979000121355, + "rewards/rejected": 0.0004404698556754738, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 1.6557868453886389, + "learning_rate": 7.177033492822967e-08, + "logits/chosen": -2.84224009513855, + "logits/rejected": -2.7694106101989746, + "logps/chosen": -306.7173767089844, + "logps/rejected": -259.01873779296875, + "loss": 0.693, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 8.184978651115671e-05, + "rewards/margins": 0.00029427764820866287, + "rewards/margins_max": 0.003731258912011981, + "rewards/margins_min": -0.003034669905900955, + "rewards/margins_std": 0.003077024593949318, + "rewards/rejected": -0.00021242785442154855, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 2.9243532166685755, + "learning_rate": 8.373205741626794e-08, + "logits/chosen": -2.7519397735595703, + "logits/rejected": -2.7474653720855713, + "logps/chosen": -288.6518859863281, + "logps/rejected": -253.1888885498047, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.00029823233489878476, + "rewards/margins": 0.0004616590158548206, + "rewards/margins_max": 0.003543038619682193, + "rewards/margins_min": -0.0025465849321335554, + "rewards/margins_std": 0.0027478071860969067, + "rewards/rejected": -0.000163426753715612, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 2.6783819549394763, + "learning_rate": 9.569377990430622e-08, + "logits/chosen": -2.7066688537597656, + "logits/rejected": -2.737964630126953, + "logps/chosen": -233.67822265625, + "logps/rejected": -252.62179565429688, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.176045200030785e-05, + "rewards/margins": 0.00011192444071639329, + "rewards/margins_max": 0.003285625483840704, + "rewards/margins_min": -0.003621011506766081, + "rewards/margins_std": 0.0030982145108282566, + "rewards/rejected": -0.00013368490908760577, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 1.9682895182428097, + "learning_rate": 1.076555023923445e-07, + "logits/chosen": -2.8225607872009277, + "logits/rejected": -2.791503429412842, + "logps/chosen": -283.03143310546875, + "logps/rejected": -248.53964233398438, + "loss": 0.6931, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": 8.696295117260888e-05, + "rewards/margins": -0.0002034438803093508, + "rewards/margins_max": 0.002528123091906309, + "rewards/margins_min": -0.0029209128115326166, + "rewards/margins_std": 0.0023842283990234137, + "rewards/rejected": 0.00029040680965408683, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 1.610086968720784, + "learning_rate": 1.1961722488038278e-07, + "logits/chosen": -2.8054909706115723, + "logits/rejected": -2.797973871231079, + "logps/chosen": -300.9483642578125, + "logps/rejected": -310.73065185546875, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.00017281100735999644, + "rewards/margins": 0.00018569377425592393, + "rewards/margins_max": 0.003404767718166113, + "rewards/margins_min": -0.0027805559802800417, + "rewards/margins_std": 0.002794269472360611, + "rewards/rejected": -1.2882717783213593e-05, + "step": 100 + }, + { + "epoch": 0.02, + "eval_logits/chosen": -2.803143262863159, + "eval_logits/rejected": -2.7681620121002197, + "eval_logps/chosen": -284.4388427734375, + "eval_logps/rejected": -265.8543395996094, + "eval_loss": 0.6930259466171265, + "eval_rewards/accuracies": 0.5130000114440918, + "eval_rewards/chosen": 0.0001645983284106478, + "eval_rewards/margins": 0.00017520197434350848, + "eval_rewards/margins_max": 0.004733717534691095, + "eval_rewards/margins_min": -0.004303331486880779, + "eval_rewards/margins_std": 0.002950224094092846, + "eval_rewards/rejected": -1.0603625014482532e-05, + "eval_runtime": 859.6856, + "eval_samples_per_second": 4.653, + "eval_steps_per_second": 0.291, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.7824351734716144, + "learning_rate": 1.3157894736842104e-07, + "logits/chosen": -2.814023017883301, + "logits/rejected": -2.778928756713867, + "logps/chosen": -274.37091064453125, + "logps/rejected": -255.2414093017578, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0005407524295151234, + "rewards/margins": 0.0002355809265282005, + "rewards/margins_max": 0.004252096172422171, + "rewards/margins_min": -0.003096726257354021, + "rewards/margins_std": 0.0032623987644910812, + "rewards/rejected": 0.00030517150298692286, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 1.6289178129741724, + "learning_rate": 1.4354066985645933e-07, + "logits/chosen": -2.7997655868530273, + "logits/rejected": -2.7307863235473633, + "logps/chosen": -269.5855712890625, + "logps/rejected": -221.72903442382812, + "loss": 0.6929, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0008956709643825889, + "rewards/margins": 0.0006299163214862347, + "rewards/margins_max": 0.0045247129164636135, + "rewards/margins_min": -0.003170366631820798, + "rewards/margins_std": 0.003422073321416974, + "rewards/rejected": 0.0002657547011040151, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 2.12410380595078, + "learning_rate": 1.555023923444976e-07, + "logits/chosen": -2.8540079593658447, + "logits/rejected": -2.8056535720825195, + "logps/chosen": -318.8978271484375, + "logps/rejected": -284.8515625, + "loss": 0.6927, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0003819824196398258, + "rewards/margins": 0.0004542602109722793, + "rewards/margins_max": 0.0040441155433654785, + "rewards/margins_min": -0.0029048118740320206, + "rewards/margins_std": 0.0030683670192956924, + "rewards/rejected": -7.227776950458065e-05, + "step": 130 + }, + { + "epoch": 0.03, + "grad_norm": 2.1905846781155547, + "learning_rate": 1.6746411483253589e-07, + "logits/chosen": -2.8298187255859375, + "logits/rejected": -2.8176026344299316, + "logps/chosen": -288.49444580078125, + "logps/rejected": -253.2882080078125, + "loss": 0.693, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0008138801786117256, + "rewards/margins": 0.00031006510835140944, + "rewards/margins_max": 0.0037383928429335356, + "rewards/margins_min": -0.0036947287153452635, + "rewards/margins_std": 0.0032638120464980602, + "rewards/rejected": 0.0005038150702603161, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 1.894787831972788, + "learning_rate": 1.7942583732057415e-07, + "logits/chosen": -2.903256416320801, + "logits/rejected": -2.8293240070343018, + "logps/chosen": -322.9395446777344, + "logps/rejected": -311.4129943847656, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0002911566407419741, + "rewards/margins": 7.383768388535827e-05, + "rewards/margins_max": 0.004016853868961334, + "rewards/margins_min": -0.003995803650468588, + "rewards/margins_std": 0.0034933306742459536, + "rewards/rejected": 0.00021731902961619198, + "step": 150 + }, + { + "epoch": 0.04, + "grad_norm": 2.1224104867128992, + "learning_rate": 1.9138755980861244e-07, + "logits/chosen": -2.8328311443328857, + "logits/rejected": -2.845745086669922, + "logps/chosen": -257.3394470214844, + "logps/rejected": -248.88912963867188, + "loss": 0.6925, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0010881477501243353, + "rewards/margins": 0.0008134182426147163, + "rewards/margins_max": 0.004476240370422602, + "rewards/margins_min": -0.0026891534216701984, + "rewards/margins_std": 0.003181836334988475, + "rewards/rejected": 0.000274729507509619, + "step": 160 + }, + { + "epoch": 0.04, + "grad_norm": 2.137764237339619, + "learning_rate": 2.033492822966507e-07, + "logits/chosen": -2.781951904296875, + "logits/rejected": -2.7471015453338623, + "logps/chosen": -297.91473388671875, + "logps/rejected": -237.72140502929688, + "loss": 0.6925, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0007869511609897017, + "rewards/margins": 0.0010647265007719398, + "rewards/margins_max": 0.005684514995664358, + "rewards/margins_min": -0.003047212492674589, + "rewards/margins_std": 0.0038775629363954067, + "rewards/rejected": -0.0002777752815745771, + "step": 170 + }, + { + "epoch": 0.04, + "grad_norm": 2.366740571690379, + "learning_rate": 2.15311004784689e-07, + "logits/chosen": -2.8212523460388184, + "logits/rejected": -2.798943281173706, + "logps/chosen": -305.50994873046875, + "logps/rejected": -295.97900390625, + "loss": 0.6923, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.0017296562436968088, + "rewards/margins": 0.0019211728358641267, + "rewards/margins_max": 0.005665643606334925, + "rewards/margins_min": -0.002306095790117979, + "rewards/margins_std": 0.003562621073797345, + "rewards/rejected": -0.00019151663582306355, + "step": 180 + }, + { + "epoch": 0.05, + "grad_norm": 1.4712758361311031, + "learning_rate": 2.2727272727272726e-07, + "logits/chosen": -2.8340067863464355, + "logits/rejected": -2.816643238067627, + "logps/chosen": -222.89529418945312, + "logps/rejected": -184.3183135986328, + "loss": 0.6924, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.0008180967415682971, + "rewards/margins": 0.001361005357466638, + "rewards/margins_max": 0.00569057185202837, + "rewards/margins_min": -0.002570072654634714, + "rewards/margins_std": 0.0036501861177384853, + "rewards/rejected": -0.0005429086741060019, + "step": 190 + }, + { + "epoch": 0.05, + "grad_norm": 4.296892533700022, + "learning_rate": 2.3923444976076555e-07, + "logits/chosen": -2.7995080947875977, + "logits/rejected": -2.7616238594055176, + "logps/chosen": -262.59808349609375, + "logps/rejected": -226.07138061523438, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0016514122253283858, + "rewards/margins": 0.0019487269455567002, + "rewards/margins_max": 0.0072821988724172115, + "rewards/margins_min": -0.0027829702012240887, + "rewards/margins_std": 0.004508022218942642, + "rewards/rejected": -0.0002973148657474667, + "step": 200 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.8017399311065674, + "eval_logits/rejected": -2.7667651176452637, + "eval_logps/chosen": -284.2892150878906, + "eval_logps/rejected": -265.8525085449219, + "eval_loss": 0.6923297047615051, + "eval_rewards/accuracies": 0.621999979019165, + "eval_rewards/chosen": 0.0016612681793048978, + "eval_rewards/margins": 0.0016538287745788693, + "eval_rewards/margins_max": 0.00986100360751152, + "eval_rewards/margins_min": -0.005685736425220966, + "eval_rewards/margins_std": 0.005118357948958874, + "eval_rewards/rejected": 7.439658020302886e-06, + "eval_runtime": 859.1555, + "eval_samples_per_second": 4.656, + "eval_steps_per_second": 0.291, + "step": 200 + }, + { + "epoch": 0.05, + "grad_norm": 1.9734678506216243, + "learning_rate": 2.511961722488038e-07, + "logits/chosen": -2.841190814971924, + "logits/rejected": -2.795135498046875, + "logps/chosen": -285.4710388183594, + "logps/rejected": -251.4048309326172, + "loss": 0.6923, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0018201196799054742, + "rewards/margins": 0.001705428003333509, + "rewards/margins_max": 0.007224083878099918, + "rewards/margins_min": -0.0035212773364037275, + "rewards/margins_std": 0.0048028877936303616, + "rewards/rejected": 0.0001146918730228208, + "step": 210 + }, + { + "epoch": 0.05, + "grad_norm": 1.460897516529647, + "learning_rate": 2.631578947368421e-07, + "logits/chosen": -2.852431058883667, + "logits/rejected": -2.8073434829711914, + "logps/chosen": -257.1728820800781, + "logps/rejected": -236.8297882080078, + "loss": 0.6921, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0019468723330646753, + "rewards/margins": 0.0018834697548300028, + "rewards/margins_max": 0.008035682141780853, + "rewards/margins_min": -0.0036481625866144896, + "rewards/margins_std": 0.0051698703318834305, + "rewards/rejected": 6.34025564067997e-05, + "step": 220 + }, + { + "epoch": 0.06, + "grad_norm": 1.8694177181014378, + "learning_rate": 2.7511961722488034e-07, + "logits/chosen": -2.8080573081970215, + "logits/rejected": -2.790807008743286, + "logps/chosen": -275.8009033203125, + "logps/rejected": -252.9151611328125, + "loss": 0.6917, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.003304890124127269, + "rewards/margins": 0.0033024363219738007, + "rewards/margins_max": 0.011663327924907207, + "rewards/margins_min": -0.0038159037940204144, + "rewards/margins_std": 0.0068631889298558235, + "rewards/rejected": 2.4540815957152518e-06, + "step": 230 + }, + { + "epoch": 0.06, + "grad_norm": 1.854606733355213, + "learning_rate": 2.8708133971291866e-07, + "logits/chosen": -2.858386278152466, + "logits/rejected": -2.805567741394043, + "logps/chosen": -255.62924194335938, + "logps/rejected": -235.61962890625, + "loss": 0.6915, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.002285485388711095, + "rewards/margins": 0.0019609429873526096, + "rewards/margins_max": 0.008661621250212193, + "rewards/margins_min": -0.003229865338653326, + "rewards/margins_std": 0.005345079582184553, + "rewards/rejected": 0.0003245424304623157, + "step": 240 + }, + { + "epoch": 0.06, + "grad_norm": 1.817379860770541, + "learning_rate": 2.990430622009569e-07, + "logits/chosen": -2.746481418609619, + "logits/rejected": -2.730722188949585, + "logps/chosen": -281.7100830078125, + "logps/rejected": -290.0663146972656, + "loss": 0.6919, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.00281380582600832, + "rewards/margins": 0.0020484558772295713, + "rewards/margins_max": 0.010411800816655159, + "rewards/margins_min": -0.006185551173985004, + "rewards/margins_std": 0.0072416625916957855, + "rewards/rejected": 0.0007653498323634267, + "step": 250 + }, + { + "epoch": 0.06, + "grad_norm": 2.0023821710598284, + "learning_rate": 3.110047846889952e-07, + "logits/chosen": -2.7706775665283203, + "logits/rejected": -2.822251319885254, + "logps/chosen": -257.27923583984375, + "logps/rejected": -275.00030517578125, + "loss": 0.691, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.004468593746423721, + "rewards/margins": 0.005076944828033447, + "rewards/margins_max": 0.014329612255096436, + "rewards/margins_min": -0.0037601019721478224, + "rewards/margins_std": 0.007890553213655949, + "rewards/rejected": -0.0006083514308556914, + "step": 260 + }, + { + "epoch": 0.06, + "grad_norm": 1.7860772159155853, + "learning_rate": 3.229665071770335e-07, + "logits/chosen": -2.891803741455078, + "logits/rejected": -2.8223443031311035, + "logps/chosen": -323.32525634765625, + "logps/rejected": -235.1826171875, + "loss": 0.6912, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.004706279374659061, + "rewards/margins": 0.0037411705125123262, + "rewards/margins_max": 0.014509765431284904, + "rewards/margins_min": -0.0071106404066085815, + "rewards/margins_std": 0.009602605365216732, + "rewards/rejected": 0.0009651094442233443, + "step": 270 + }, + { + "epoch": 0.07, + "grad_norm": 1.5718271722210033, + "learning_rate": 3.3492822966507177e-07, + "logits/chosen": -2.8329367637634277, + "logits/rejected": -2.8499460220336914, + "logps/chosen": -253.4702606201172, + "logps/rejected": -246.0491943359375, + "loss": 0.6917, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004767083562910557, + "rewards/margins": 0.003300449578091502, + "rewards/margins_max": 0.012085122987627983, + "rewards/margins_min": -0.00477250013500452, + "rewards/margins_std": 0.0076704369857907295, + "rewards/rejected": 0.0014666334027424455, + "step": 280 + }, + { + "epoch": 0.07, + "grad_norm": 1.7431756882052798, + "learning_rate": 3.4688995215311004e-07, + "logits/chosen": -2.779371738433838, + "logits/rejected": -2.757108211517334, + "logps/chosen": -265.29156494140625, + "logps/rejected": -221.9234619140625, + "loss": 0.6905, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.004595404490828514, + "rewards/margins": 0.004501349292695522, + "rewards/margins_max": 0.015392111614346504, + "rewards/margins_min": -0.00713689997792244, + "rewards/margins_std": 0.010018928907811642, + "rewards/rejected": 9.405486343894154e-05, + "step": 290 + }, + { + "epoch": 0.07, + "grad_norm": 1.8152609218809446, + "learning_rate": 3.588516746411483e-07, + "logits/chosen": -2.8642630577087402, + "logits/rejected": -2.8560938835144043, + "logps/chosen": -250.71426391601562, + "logps/rejected": -240.01803588867188, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.005799327045679092, + "rewards/margins": 0.0027367686852812767, + "rewards/margins_max": 0.013944950886070728, + "rewards/margins_min": -0.009182724170386791, + "rewards/margins_std": 0.010260081849992275, + "rewards/rejected": 0.0030625583603978157, + "step": 300 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -2.7978017330169678, + "eval_logits/rejected": -2.7626500129699707, + "eval_logps/chosen": -283.78564453125, + "eval_logps/rejected": -265.6622619628906, + "eval_loss": 0.6908154487609863, + "eval_rewards/accuracies": 0.6520000100135803, + "eval_rewards/chosen": 0.006696476601064205, + "eval_rewards/margins": 0.004786263220012188, + "eval_rewards/margins_max": 0.02526562102138996, + "eval_rewards/margins_min": -0.012469511479139328, + "eval_rewards/margins_std": 0.012487462721765041, + "eval_rewards/rejected": 0.0019102133810520172, + "eval_runtime": 860.3161, + "eval_samples_per_second": 4.649, + "eval_steps_per_second": 0.291, + "step": 300 + }, + { + "epoch": 0.07, + "grad_norm": 1.5859069821282503, + "learning_rate": 3.7081339712918656e-07, + "logits/chosen": -2.8701109886169434, + "logits/rejected": -2.80281400680542, + "logps/chosen": -256.1575622558594, + "logps/rejected": -198.39547729492188, + "loss": 0.6905, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.005720221903175116, + "rewards/margins": 0.005076803732663393, + "rewards/margins_max": 0.017741765826940536, + "rewards/margins_min": -0.005554481875151396, + "rewards/margins_std": 0.010260584764182568, + "rewards/rejected": 0.000643418519757688, + "step": 310 + }, + { + "epoch": 0.08, + "grad_norm": 2.5478720556321774, + "learning_rate": 3.827751196172249e-07, + "logits/chosen": -2.8780677318573, + "logits/rejected": -2.8603920936584473, + "logps/chosen": -275.4996643066406, + "logps/rejected": -353.23223876953125, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.006795539055019617, + "rewards/margins": 0.004812855739146471, + "rewards/margins_max": 0.02092679962515831, + "rewards/margins_min": -0.010641205124557018, + "rewards/margins_std": 0.014152769930660725, + "rewards/rejected": 0.001982682617381215, + "step": 320 + }, + { + "epoch": 0.08, + "grad_norm": 1.8090933651382484, + "learning_rate": 3.9473684210526315e-07, + "logits/chosen": -2.88478422164917, + "logits/rejected": -2.836512327194214, + "logps/chosen": -332.34991455078125, + "logps/rejected": -263.26214599609375, + "loss": 0.6902, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.009971674531698227, + "rewards/margins": 0.008845487609505653, + "rewards/margins_max": 0.024542566388845444, + "rewards/margins_min": -0.005169200710952282, + "rewards/margins_std": 0.013284943997859955, + "rewards/rejected": 0.0011261856416240335, + "step": 330 + }, + { + "epoch": 0.08, + "grad_norm": 1.889588694650712, + "learning_rate": 4.066985645933014e-07, + "logits/chosen": -2.8786487579345703, + "logits/rejected": -2.8604178428649902, + "logps/chosen": -322.4375, + "logps/rejected": -265.0132141113281, + "loss": 0.6894, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.009604470804333687, + "rewards/margins": 0.008933277800679207, + "rewards/margins_max": 0.023819511756300926, + "rewards/margins_min": -0.005439485423266888, + "rewards/margins_std": 0.013345139101147652, + "rewards/rejected": 0.0006711935857310891, + "step": 340 + }, + { + "epoch": 0.08, + "grad_norm": 2.1643558675177577, + "learning_rate": 4.1866028708133973e-07, + "logits/chosen": -2.800771951675415, + "logits/rejected": -2.7496225833892822, + "logps/chosen": -264.0378723144531, + "logps/rejected": -214.9931640625, + "loss": 0.6893, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.007511107716709375, + "rewards/margins": 0.007657433860003948, + "rewards/margins_max": 0.031329743564128876, + "rewards/margins_min": -0.012957903556525707, + "rewards/margins_std": 0.020018046721816063, + "rewards/rejected": -0.00014632634702138603, + "step": 350 + }, + { + "epoch": 0.09, + "grad_norm": 2.0226633365723554, + "learning_rate": 4.30622009569378e-07, + "logits/chosen": -2.8628451824188232, + "logits/rejected": -2.8355746269226074, + "logps/chosen": -281.36749267578125, + "logps/rejected": -245.26431274414062, + "loss": 0.6884, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.008619427680969238, + "rewards/margins": 0.009861720725893974, + "rewards/margins_max": 0.026844218373298645, + "rewards/margins_min": -0.008796043694019318, + "rewards/margins_std": 0.016473382711410522, + "rewards/rejected": -0.0012422938598319888, + "step": 360 + }, + { + "epoch": 0.09, + "grad_norm": 1.4453796371125611, + "learning_rate": 4.425837320574162e-07, + "logits/chosen": -2.9047369956970215, + "logits/rejected": -2.8413548469543457, + "logps/chosen": -296.87872314453125, + "logps/rejected": -231.3509979248047, + "loss": 0.6892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.006261153612285852, + "rewards/margins": 0.008242874406278133, + "rewards/margins_max": 0.030506301671266556, + "rewards/margins_min": -0.013848531059920788, + "rewards/margins_std": 0.020168842747807503, + "rewards/rejected": -0.0019817203283309937, + "step": 370 + }, + { + "epoch": 0.09, + "grad_norm": 2.0143229692718427, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": -2.8520050048828125, + "logits/rejected": -2.7991726398468018, + "logps/chosen": -279.69146728515625, + "logps/rejected": -224.68331909179688, + "loss": 0.6887, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.010300575755536556, + "rewards/margins": 0.011675434187054634, + "rewards/margins_max": 0.03212700039148331, + "rewards/margins_min": -0.008405391126871109, + "rewards/margins_std": 0.018078230321407318, + "rewards/rejected": -0.0013748581986874342, + "step": 380 + }, + { + "epoch": 0.09, + "grad_norm": 2.173284937469911, + "learning_rate": 4.665071770334928e-07, + "logits/chosen": -2.7563838958740234, + "logits/rejected": -2.7253875732421875, + "logps/chosen": -306.5716552734375, + "logps/rejected": -258.4673767089844, + "loss": 0.6877, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.011723880656063557, + "rewards/margins": 0.012919160537421703, + "rewards/margins_max": 0.035039566457271576, + "rewards/margins_min": -0.009524760767817497, + "rewards/margins_std": 0.020119303837418556, + "rewards/rejected": -0.0011952801141887903, + "step": 390 + }, + { + "epoch": 0.1, + "grad_norm": 1.8297438667404096, + "learning_rate": 4.784688995215311e-07, + "logits/chosen": -2.731421947479248, + "logits/rejected": -2.75995135307312, + "logps/chosen": -268.7449035644531, + "logps/rejected": -259.85772705078125, + "loss": 0.6888, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.009379776194691658, + "rewards/margins": 0.009554450400173664, + "rewards/margins_max": 0.03678436204791069, + "rewards/margins_min": -0.01677670329809189, + "rewards/margins_std": 0.023921573534607887, + "rewards/rejected": -0.00017467378347646445, + "step": 400 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.7923877239227295, + "eval_logits/rejected": -2.7572779655456543, + "eval_logps/chosen": -283.4166564941406, + "eval_logps/rejected": -265.89434814453125, + "eval_loss": 0.6879981756210327, + "eval_rewards/accuracies": 0.6644999980926514, + "eval_rewards/chosen": 0.010386648587882519, + "eval_rewards/margins": 0.010797684080898762, + "eval_rewards/margins_max": 0.0544959232211113, + "eval_rewards/margins_min": -0.026414871215820312, + "eval_rewards/margins_std": 0.026797765865921974, + "eval_rewards/rejected": -0.0004110359586775303, + "eval_runtime": 860.5483, + "eval_samples_per_second": 4.648, + "eval_steps_per_second": 0.291, + "step": 400 + }, + { + "epoch": 0.1, + "grad_norm": 1.8439575337135385, + "learning_rate": 4.904306220095694e-07, + "logits/chosen": -2.8291707038879395, + "logits/rejected": -2.7534260749816895, + "logps/chosen": -321.44671630859375, + "logps/rejected": -258.337158203125, + "loss": 0.6871, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.010904048569500446, + "rewards/margins": 0.014547420665621758, + "rewards/margins_max": 0.048709701746702194, + "rewards/margins_min": -0.015160051174461842, + "rewards/margins_std": 0.028659731149673462, + "rewards/rejected": -0.003643373027443886, + "step": 410 + }, + { + "epoch": 0.1, + "grad_norm": 2.1480667071719846, + "learning_rate": 4.999996505732917e-07, + "logits/chosen": -2.8337886333465576, + "logits/rejected": -2.8044726848602295, + "logps/chosen": -297.41912841796875, + "logps/rejected": -293.13800048828125, + "loss": 0.6864, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.012047646567225456, + "rewards/margins": 0.010113747790455818, + "rewards/margins_max": 0.04316211864352226, + "rewards/margins_min": -0.02286229468882084, + "rewards/margins_std": 0.02985798381268978, + "rewards/rejected": 0.0019338976126164198, + "step": 420 + }, + { + "epoch": 0.1, + "grad_norm": 1.951871484954239, + "learning_rate": 4.999874207410648e-07, + "logits/chosen": -2.7598047256469727, + "logits/rejected": -2.7764503955841064, + "logps/chosen": -252.3936004638672, + "logps/rejected": -261.55194091796875, + "loss": 0.6858, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.015963982790708542, + "rewards/margins": 0.011255776509642601, + "rewards/margins_max": 0.044065456837415695, + "rewards/margins_min": -0.015702728182077408, + "rewards/margins_std": 0.026332881301641464, + "rewards/rejected": 0.0047082058154046535, + "step": 430 + }, + { + "epoch": 0.11, + "grad_norm": 1.815723512376768, + "learning_rate": 4.999577205502039e-07, + "logits/chosen": -2.7594494819641113, + "logits/rejected": -2.7417349815368652, + "logps/chosen": -239.5718536376953, + "logps/rejected": -221.40414428710938, + "loss": 0.6874, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.014929292723536491, + "rewards/margins": 0.01059373002499342, + "rewards/margins_max": 0.04242347553372383, + "rewards/margins_min": -0.02509412169456482, + "rewards/margins_std": 0.03010099194943905, + "rewards/rejected": 0.004335561767220497, + "step": 440 + }, + { + "epoch": 0.11, + "grad_norm": 1.887094148366367, + "learning_rate": 4.999105520763054e-07, + "logits/chosen": -2.8326034545898438, + "logits/rejected": -2.7527859210968018, + "logps/chosen": -285.027099609375, + "logps/rejected": -256.4057922363281, + "loss": 0.6866, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.020126910880208015, + "rewards/margins": 0.012689967639744282, + "rewards/margins_max": 0.0455465242266655, + "rewards/margins_min": -0.027548715472221375, + "rewards/margins_std": 0.03222181648015976, + "rewards/rejected": 0.007436943706125021, + "step": 450 + }, + { + "epoch": 0.11, + "grad_norm": 2.0064960126877613, + "learning_rate": 4.998459186157357e-07, + "logits/chosen": -2.8465819358825684, + "logits/rejected": -2.785576820373535, + "logps/chosen": -289.4577331542969, + "logps/rejected": -266.01800537109375, + "loss": 0.6842, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.023445971310138702, + "rewards/margins": 0.014034710824489594, + "rewards/margins_max": 0.05275397375226021, + "rewards/margins_min": -0.020386729389429092, + "rewards/margins_std": 0.03251287341117859, + "rewards/rejected": 0.009411259554326534, + "step": 460 + }, + { + "epoch": 0.11, + "grad_norm": 1.8984794455081915, + "learning_rate": 4.997638246854011e-07, + "logits/chosen": -2.885715961456299, + "logits/rejected": -2.8429105281829834, + "logps/chosen": -282.2930603027344, + "logps/rejected": -270.14898681640625, + "loss": 0.6861, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.028099358081817627, + "rewards/margins": 0.014804655686020851, + "rewards/margins_max": 0.05813845247030258, + "rewards/margins_min": -0.024266045540571213, + "rewards/margins_std": 0.037039484828710556, + "rewards/rejected": 0.013294701464474201, + "step": 470 + }, + { + "epoch": 0.11, + "grad_norm": 2.4713524730316134, + "learning_rate": 4.996642760224317e-07, + "logits/chosen": -2.7256481647491455, + "logits/rejected": -2.7138657569885254, + "logps/chosen": -284.6150207519531, + "logps/rejected": -271.6550598144531, + "loss": 0.685, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.03093739226460457, + "rewards/margins": 0.016154423356056213, + "rewards/margins_max": 0.06530088931322098, + "rewards/margins_min": -0.023924505338072777, + "rewards/margins_std": 0.04057624191045761, + "rewards/rejected": 0.014782967045903206, + "step": 480 + }, + { + "epoch": 0.12, + "grad_norm": 1.790829381546387, + "learning_rate": 4.995472795837813e-07, + "logits/chosen": -2.8459765911102295, + "logits/rejected": -2.7399191856384277, + "logps/chosen": -251.16159057617188, + "logps/rejected": -224.5177001953125, + "loss": 0.6826, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.02981259487569332, + "rewards/margins": 0.01665044017136097, + "rewards/margins_max": 0.06273016333580017, + "rewards/margins_min": -0.022494319826364517, + "rewards/margins_std": 0.03761152923107147, + "rewards/rejected": 0.013162153773009777, + "step": 490 + }, + { + "epoch": 0.12, + "grad_norm": 1.7599716827975578, + "learning_rate": 4.994128435457401e-07, + "logits/chosen": -2.832188129425049, + "logits/rejected": -2.7971439361572266, + "logps/chosen": -308.68634033203125, + "logps/rejected": -267.23687744140625, + "loss": 0.6827, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.039294809103012085, + "rewards/margins": 0.022731659933924675, + "rewards/margins_max": 0.06454737484455109, + "rewards/margins_min": -0.025807851925492287, + "rewards/margins_std": 0.04009108990430832, + "rewards/rejected": 0.01656315103173256, + "step": 500 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -2.787661075592041, + "eval_logits/rejected": -2.7528791427612305, + "eval_logps/chosen": -281.00518798828125, + "eval_logps/rejected": -264.4715270996094, + "eval_loss": 0.6834259629249573, + "eval_rewards/accuracies": 0.6819999814033508, + "eval_rewards/chosen": 0.034501295536756516, + "eval_rewards/margins": 0.020683957263827324, + "eval_rewards/margins_max": 0.09889663010835648, + "eval_rewards/margins_min": -0.0453532375395298, + "eval_rewards/margins_std": 0.04788992181420326, + "eval_rewards/rejected": 0.01381734013557434, + "eval_runtime": 859.8364, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 500 + }, + { + "epoch": 0.12, + "grad_norm": 1.8380357156008533, + "learning_rate": 4.992609773033638e-07, + "logits/chosen": -2.87412691116333, + "logits/rejected": -2.8063013553619385, + "logps/chosen": -311.1976623535156, + "logps/rejected": -290.9315490722656, + "loss": 0.6806, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.04149966686964035, + "rewards/margins": 0.027491098269820213, + "rewards/margins_max": 0.08390498906373978, + "rewards/margins_min": -0.026887020096182823, + "rewards/margins_std": 0.04932967200875282, + "rewards/rejected": 0.014008568599820137, + "step": 510 + }, + { + "epoch": 0.12, + "grad_norm": 1.8578490078020375, + "learning_rate": 4.990916914698176e-07, + "logits/chosen": -2.8508479595184326, + "logits/rejected": -2.8774688243865967, + "logps/chosen": -269.26678466796875, + "logps/rejected": -282.09149169921875, + "loss": 0.6833, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.026975523680448532, + "rewards/margins": 0.017464371398091316, + "rewards/margins_max": 0.06073393672704697, + "rewards/margins_min": -0.020872922614216805, + "rewards/margins_std": 0.037673480808734894, + "rewards/rejected": 0.009511154145002365, + "step": 520 + }, + { + "epoch": 0.13, + "grad_norm": 1.8975594456223, + "learning_rate": 4.989049978756335e-07, + "logits/chosen": -2.8389906883239746, + "logits/rejected": -2.795275926589966, + "logps/chosen": -259.84613037109375, + "logps/rejected": -223.6072235107422, + "loss": 0.6806, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.03905173018574715, + "rewards/margins": 0.028574619442224503, + "rewards/margins_max": 0.09143301099538803, + "rewards/margins_min": -0.03277328237891197, + "rewards/margins_std": 0.05603231117129326, + "rewards/rejected": 0.010477107018232346, + "step": 530 + }, + { + "epoch": 0.13, + "grad_norm": 1.8617322709156452, + "learning_rate": 4.987009095678842e-07, + "logits/chosen": -2.8395779132843018, + "logits/rejected": -2.7576241493225098, + "logps/chosen": -335.243896484375, + "logps/rejected": -256.72149658203125, + "loss": 0.6757, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.047729261219501495, + "rewards/margins": 0.0380299873650074, + "rewards/margins_max": 0.10880953073501587, + "rewards/margins_min": -0.027835842221975327, + "rewards/margins_std": 0.05956585332751274, + "rewards/rejected": 0.00969927478581667, + "step": 540 + }, + { + "epoch": 0.13, + "grad_norm": 1.6461317379356113, + "learning_rate": 4.984794408092712e-07, + "logits/chosen": -2.747067928314209, + "logits/rejected": -2.761674165725708, + "logps/chosen": -227.9774932861328, + "logps/rejected": -240.8350372314453, + "loss": 0.682, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03224276378750801, + "rewards/margins": 0.019359184429049492, + "rewards/margins_max": 0.08641939610242844, + "rewards/margins_min": -0.03590545803308487, + "rewards/margins_std": 0.05543201044201851, + "rewards/rejected": 0.012883573770523071, + "step": 550 + }, + { + "epoch": 0.13, + "grad_norm": 1.898962128914956, + "learning_rate": 4.982406070771277e-07, + "logits/chosen": -2.8066565990448, + "logits/rejected": -2.7697765827178955, + "logps/chosen": -258.8771667480469, + "logps/rejected": -245.0890655517578, + "loss": 0.679, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.04510858654975891, + "rewards/margins": 0.030218088999390602, + "rewards/margins_max": 0.1033395305275917, + "rewards/margins_min": -0.027472149580717087, + "rewards/margins_std": 0.05755491927266121, + "rewards/rejected": 0.014890496619045734, + "step": 560 + }, + { + "epoch": 0.14, + "grad_norm": 2.01086881886344, + "learning_rate": 4.979844250623374e-07, + "logits/chosen": -2.799595355987549, + "logits/rejected": -2.7691237926483154, + "logps/chosen": -259.0538330078125, + "logps/rejected": -282.46942138671875, + "loss": 0.6795, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03679460287094116, + "rewards/margins": 0.026447024196386337, + "rewards/margins_max": 0.11028116941452026, + "rewards/margins_min": -0.043576233088970184, + "rewards/margins_std": 0.0696285218000412, + "rewards/rejected": 0.010347576811909676, + "step": 570 + }, + { + "epoch": 0.14, + "grad_norm": 1.8857229697039908, + "learning_rate": 4.977109126681678e-07, + "logits/chosen": -2.8361918926239014, + "logits/rejected": -2.794586658477783, + "logps/chosen": -334.4454345703125, + "logps/rejected": -279.5559387207031, + "loss": 0.6809, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.04045528918504715, + "rewards/margins": 0.027129491791129112, + "rewards/margins_max": 0.11118390411138535, + "rewards/margins_min": -0.04163810983300209, + "rewards/margins_std": 0.06822942197322845, + "rewards/rejected": 0.013325795531272888, + "step": 580 + }, + { + "epoch": 0.14, + "grad_norm": 1.8402639504479013, + "learning_rate": 4.974200890090191e-07, + "logits/chosen": -2.813422441482544, + "logits/rejected": -2.8016512393951416, + "logps/chosen": -243.6787567138672, + "logps/rejected": -241.81228637695312, + "loss": 0.6782, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03460235148668289, + "rewards/margins": 0.03581953048706055, + "rewards/margins_max": 0.10852668434381485, + "rewards/margins_min": -0.022002944722771645, + "rewards/margins_std": 0.05729494243860245, + "rewards/rejected": -0.001217175624333322, + "step": 590 + }, + { + "epoch": 0.14, + "grad_norm": 1.8609566893336893, + "learning_rate": 4.971119744090886e-07, + "logits/chosen": -2.822237730026245, + "logits/rejected": -2.7726807594299316, + "logps/chosen": -262.5629577636719, + "logps/rejected": -243.2275848388672, + "loss": 0.6831, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.026615392416715622, + "rewards/margins": 0.030522268265485764, + "rewards/margins_max": 0.12365134805440903, + "rewards/margins_min": -0.059167660772800446, + "rewards/margins_std": 0.08123533427715302, + "rewards/rejected": -0.003906878177076578, + "step": 600 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -2.782672643661499, + "eval_logits/rejected": -2.747931957244873, + "eval_logps/chosen": -281.4937438964844, + "eval_logps/rejected": -266.2421875, + "eval_loss": 0.6776489615440369, + "eval_rewards/accuracies": 0.6909999847412109, + "eval_rewards/chosen": 0.02961578033864498, + "eval_rewards/margins": 0.03350492939352989, + "eval_rewards/margins_max": 0.15520799160003662, + "eval_rewards/margins_min": -0.06960177421569824, + "eval_rewards/margins_std": 0.0744817852973938, + "eval_rewards/rejected": -0.0038891462609171867, + "eval_runtime": 859.6377, + "eval_samples_per_second": 4.653, + "eval_steps_per_second": 0.291, + "step": 600 + }, + { + "epoch": 0.15, + "grad_norm": 2.2020432000907517, + "learning_rate": 4.967865904009499e-07, + "logits/chosen": -2.845512866973877, + "logits/rejected": -2.813530683517456, + "logps/chosen": -344.6908264160156, + "logps/rejected": -267.20208740234375, + "loss": 0.6747, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.025227338075637817, + "rewards/margins": 0.038891032338142395, + "rewards/margins_max": 0.1099926233291626, + "rewards/margins_min": -0.021645687520503998, + "rewards/margins_std": 0.059668660163879395, + "rewards/rejected": -0.013663697056472301, + "step": 610 + }, + { + "epoch": 0.15, + "grad_norm": 2.148480002242944, + "learning_rate": 4.964439597240486e-07, + "logits/chosen": -2.8236947059631348, + "logits/rejected": -2.796365976333618, + "logps/chosen": -382.76806640625, + "logps/rejected": -295.16961669921875, + "loss": 0.6724, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04661710932850838, + "rewards/margins": 0.05233670398592949, + "rewards/margins_max": 0.15269331634044647, + "rewards/margins_min": -0.042940981686115265, + "rewards/margins_std": 0.08610849827528, + "rewards/rejected": -0.0057195937260985374, + "step": 620 + }, + { + "epoch": 0.15, + "grad_norm": 2.043037719729389, + "learning_rate": 4.960841063231124e-07, + "logits/chosen": -2.804616928100586, + "logits/rejected": -2.765625476837158, + "logps/chosen": -367.92510986328125, + "logps/rejected": -290.01788330078125, + "loss": 0.6652, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.04542078822851181, + "rewards/margins": 0.06306995451450348, + "rewards/margins_max": 0.15209174156188965, + "rewards/margins_min": -0.030609797686338425, + "rewards/margins_std": 0.08168235421180725, + "rewards/rejected": -0.017649158835411072, + "step": 630 + }, + { + "epoch": 0.15, + "grad_norm": 1.9949981609135232, + "learning_rate": 4.95707055346479e-07, + "logits/chosen": -2.810793161392212, + "logits/rejected": -2.732025623321533, + "logps/chosen": -321.9665222167969, + "logps/rejected": -250.7497100830078, + "loss": 0.6682, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.037032295018434525, + "rewards/margins": 0.06006144359707832, + "rewards/margins_max": 0.14815713465213776, + "rewards/margins_min": -0.021324660629034042, + "rewards/margins_std": 0.07636863738298416, + "rewards/rejected": -0.02302914671599865, + "step": 640 + }, + { + "epoch": 0.16, + "grad_norm": 1.9041296000250105, + "learning_rate": 4.95312833144337e-07, + "logits/chosen": -2.838348150253296, + "logits/rejected": -2.7659010887145996, + "logps/chosen": -290.25091552734375, + "logps/rejected": -254.9677276611328, + "loss": 0.6721, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.019019629806280136, + "rewards/margins": 0.04789215326309204, + "rewards/margins_max": 0.16989126801490784, + "rewards/margins_min": -0.050513893365859985, + "rewards/margins_std": 0.10015592724084854, + "rewards/rejected": -0.028872525319457054, + "step": 650 + }, + { + "epoch": 0.16, + "grad_norm": 1.9181181622206498, + "learning_rate": 4.949014672668858e-07, + "logits/chosen": -2.8450560569763184, + "logits/rejected": -2.823151111602783, + "logps/chosen": -259.1734619140625, + "logps/rejected": -250.8106231689453, + "loss": 0.6749, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01695770025253296, + "rewards/margins": 0.04111206904053688, + "rewards/margins_max": 0.1423286646604538, + "rewards/margins_min": -0.06147942692041397, + "rewards/margins_std": 0.09176277369260788, + "rewards/rejected": -0.02415436878800392, + "step": 660 + }, + { + "epoch": 0.16, + "grad_norm": 2.240906884672249, + "learning_rate": 4.944729864624097e-07, + "logits/chosen": -2.922368049621582, + "logits/rejected": -2.8345208168029785, + "logps/chosen": -330.1218566894531, + "logps/rejected": -267.60223388671875, + "loss": 0.6686, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.030409136787056923, + "rewards/margins": 0.05693582817912102, + "rewards/margins_max": 0.15574179589748383, + "rewards/margins_min": -0.03514755517244339, + "rewards/margins_std": 0.08488789945840836, + "rewards/rejected": -0.026526689529418945, + "step": 670 + }, + { + "epoch": 0.16, + "grad_norm": 2.8553009192620267, + "learning_rate": 4.940274206752687e-07, + "logits/chosen": -2.7653040885925293, + "logits/rejected": -2.7400851249694824, + "logps/chosen": -329.28094482421875, + "logps/rejected": -264.5126953125, + "loss": 0.6664, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.022837229073047638, + "rewards/margins": 0.04619354009628296, + "rewards/margins_max": 0.17638953030109406, + "rewards/margins_min": -0.0725640207529068, + "rewards/margins_std": 0.1124802827835083, + "rewards/rejected": -0.02335631661117077, + "step": 680 + }, + { + "epoch": 0.17, + "grad_norm": 2.103590996683848, + "learning_rate": 4.935648010438058e-07, + "logits/chosen": -2.783113479614258, + "logits/rejected": -2.770662307739258, + "logps/chosen": -260.016357421875, + "logps/rejected": -268.6811828613281, + "loss": 0.675, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.01028151623904705, + "rewards/margins": 0.04737422987818718, + "rewards/margins_max": 0.16225430369377136, + "rewards/margins_min": -0.055725038051605225, + "rewards/margins_std": 0.0967501625418663, + "rewards/rejected": -0.03709270805120468, + "step": 690 + }, + { + "epoch": 0.17, + "grad_norm": 1.8356022954880307, + "learning_rate": 4.930851598981713e-07, + "logits/chosen": -2.806530237197876, + "logits/rejected": -2.736204147338867, + "logps/chosen": -281.389404296875, + "logps/rejected": -251.22439575195312, + "loss": 0.6652, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0015332363545894623, + "rewards/margins": 0.051959507167339325, + "rewards/margins_max": 0.18142695724964142, + "rewards/margins_min": -0.06150681897997856, + "rewards/margins_std": 0.10783376544713974, + "rewards/rejected": -0.05042628198862076, + "step": 700 + }, + { + "epoch": 0.17, + "eval_logits/chosen": -2.7726073265075684, + "eval_logits/rejected": -2.7381980419158936, + "eval_logps/chosen": -283.5948486328125, + "eval_logps/rejected": -270.1202392578125, + "eval_loss": 0.670020341873169, + "eval_rewards/accuracies": 0.6819999814033508, + "eval_rewards/chosen": 0.008604736067354679, + "eval_rewards/margins": 0.05127452686429024, + "eval_rewards/margins_max": 0.23500196635723114, + "eval_rewards/margins_min": -0.10567907243967056, + "eval_rewards/margins_std": 0.11278793215751648, + "eval_rewards/rejected": -0.04266979172825813, + "eval_runtime": 859.9574, + "eval_samples_per_second": 4.651, + "eval_steps_per_second": 0.291, + "step": 700 + }, + { + "epoch": 0.17, + "grad_norm": 2.1883763329067216, + "learning_rate": 4.925885307580632e-07, + "logits/chosen": -2.7491278648376465, + "logits/rejected": -2.691565752029419, + "logps/chosen": -290.1736145019531, + "logps/rejected": -249.8134765625, + "loss": 0.6684, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.021899688988924026, + "rewards/margins": 0.08286824077367783, + "rewards/margins_max": 0.22836923599243164, + "rewards/margins_min": -0.053432680666446686, + "rewards/margins_std": 0.12297092378139496, + "rewards/rejected": -0.0609685480594635, + "step": 710 + }, + { + "epoch": 0.17, + "grad_norm": 2.1714985473192288, + "learning_rate": 4.920749483303846e-07, + "logits/chosen": -2.636892080307007, + "logits/rejected": -2.6579179763793945, + "logps/chosen": -274.3876037597656, + "logps/rejected": -278.6937255859375, + "loss": 0.6688, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.008625579997897148, + "rewards/margins": 0.05348697304725647, + "rewards/margins_max": 0.18989379703998566, + "rewards/margins_min": -0.04376517981290817, + "rewards/margins_std": 0.1045072078704834, + "rewards/rejected": -0.04486139863729477, + "step": 720 + }, + { + "epoch": 0.17, + "grad_norm": 2.081086651231241, + "learning_rate": 4.915444485068181e-07, + "logits/chosen": -2.848870277404785, + "logits/rejected": -2.7781660556793213, + "logps/chosen": -321.5794677734375, + "logps/rejected": -291.2355651855469, + "loss": 0.6634, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.019336868077516556, + "rewards/margins": 0.05942217633128166, + "rewards/margins_max": 0.1821795403957367, + "rewards/margins_min": -0.05405784398317337, + "rewards/margins_std": 0.1058388501405716, + "rewards/rejected": -0.040085311979055405, + "step": 730 + }, + { + "epoch": 0.18, + "grad_norm": 2.666434476598509, + "learning_rate": 4.90997068361318e-07, + "logits/chosen": -2.852202892303467, + "logits/rejected": -2.807814359664917, + "logps/chosen": -259.76470947265625, + "logps/rejected": -261.9565124511719, + "loss": 0.6591, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.016710925847291946, + "rewards/margins": 0.0612528994679451, + "rewards/margins_max": 0.20229406654834747, + "rewards/margins_min": -0.06611864268779755, + "rewards/margins_std": 0.1207486242055893, + "rewards/rejected": -0.04454197362065315, + "step": 740 + }, + { + "epoch": 0.18, + "grad_norm": 1.786016411427176, + "learning_rate": 4.904328461475189e-07, + "logits/chosen": -2.839444398880005, + "logits/rejected": -2.8065857887268066, + "logps/chosen": -283.57659912109375, + "logps/rejected": -280.9451904296875, + "loss": 0.6699, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.01933327689766884, + "rewards/margins": 0.06404153257608414, + "rewards/margins_max": 0.2093987911939621, + "rewards/margins_min": -0.06361619383096695, + "rewards/margins_std": 0.12587139010429382, + "rewards/rejected": -0.044708251953125, + "step": 750 + }, + { + "epoch": 0.18, + "grad_norm": 1.8723215045067456, + "learning_rate": 4.898518212960625e-07, + "logits/chosen": -2.8065075874328613, + "logits/rejected": -2.8201041221618652, + "logps/chosen": -275.5202331542969, + "logps/rejected": -287.6792907714844, + "loss": 0.661, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.004976716358214617, + "rewards/margins": 0.04297412186861038, + "rewards/margins_max": 0.18559980392456055, + "rewards/margins_min": -0.09453781694173813, + "rewards/margins_std": 0.12249946594238281, + "rewards/rejected": -0.037997402250766754, + "step": 760 + }, + { + "epoch": 0.18, + "grad_norm": 2.1560222118415435, + "learning_rate": 4.89254034411842e-07, + "logits/chosen": -2.840862512588501, + "logits/rejected": -2.770017385482788, + "logps/chosen": -270.3290100097656, + "logps/rejected": -276.2815246582031, + "loss": 0.6615, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.004142249468713999, + "rewards/margins": 0.05860158056020737, + "rewards/margins_max": 0.19023478031158447, + "rewards/margins_min": -0.07205347716808319, + "rewards/margins_std": 0.1169583648443222, + "rewards/rejected": -0.054459333419799805, + "step": 770 + }, + { + "epoch": 0.19, + "grad_norm": 3.9902538536219994, + "learning_rate": 4.886395272711646e-07, + "logits/chosen": -2.8726820945739746, + "logits/rejected": -2.804377317428589, + "logps/chosen": -309.981689453125, + "logps/rejected": -248.2880401611328, + "loss": 0.6585, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.005033843219280243, + "rewards/margins": 0.08469756692647934, + "rewards/margins_max": 0.24064771831035614, + "rewards/margins_min": -0.07141076028347015, + "rewards/margins_std": 0.142036572098732, + "rewards/rejected": -0.0796637237071991, + "step": 780 + }, + { + "epoch": 0.19, + "grad_norm": 2.544732251425543, + "learning_rate": 4.880083428188314e-07, + "logits/chosen": -2.8103771209716797, + "logits/rejected": -2.7757694721221924, + "logps/chosen": -306.94622802734375, + "logps/rejected": -261.8299255371094, + "loss": 0.6511, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.006795515306293964, + "rewards/margins": 0.08599305152893066, + "rewards/margins_max": 0.26126423478126526, + "rewards/margins_min": -0.04950443655252457, + "rewards/margins_std": 0.14344367384910583, + "rewards/rejected": -0.07919753342866898, + "step": 790 + }, + { + "epoch": 0.19, + "grad_norm": 3.1077775541846533, + "learning_rate": 4.873605251651373e-07, + "logits/chosen": -2.817831039428711, + "logits/rejected": -2.746727228164673, + "logps/chosen": -306.8286437988281, + "logps/rejected": -260.79437255859375, + "loss": 0.6486, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.007715999148786068, + "rewards/margins": 0.10799429565668106, + "rewards/margins_max": 0.3326597809791565, + "rewards/margins_min": -0.08400187641382217, + "rewards/margins_std": 0.18480992317199707, + "rewards/rejected": -0.10027830302715302, + "step": 800 + }, + { + "epoch": 0.19, + "eval_logits/chosen": -2.7702386379241943, + "eval_logits/rejected": -2.736680746078491, + "eval_logps/chosen": -286.4377746582031, + "eval_logps/rejected": -275.0621643066406, + "eval_loss": 0.6614853739738464, + "eval_rewards/accuracies": 0.6804999709129333, + "eval_rewards/chosen": -0.019824357703328133, + "eval_rewards/margins": 0.07226436585187912, + "eval_rewards/margins_max": 0.3237099051475525, + "eval_rewards/margins_min": -0.14703232049942017, + "eval_rewards/margins_std": 0.15650083124637604, + "eval_rewards/rejected": -0.0920887291431427, + "eval_runtime": 859.6716, + "eval_samples_per_second": 4.653, + "eval_steps_per_second": 0.291, + "step": 800 + }, + { + "epoch": 0.19, + "grad_norm": 2.1707019519648285, + "learning_rate": 4.866961195827869e-07, + "logits/chosen": -2.783294200897217, + "logits/rejected": -2.7768232822418213, + "logps/chosen": -246.5240936279297, + "logps/rejected": -249.036865234375, + "loss": 0.6634, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02293337881565094, + "rewards/margins": 0.0682712197303772, + "rewards/margins_max": 0.20986878871917725, + "rewards/margins_min": -0.11614898592233658, + "rewards/margins_std": 0.14644506573677063, + "rewards/rejected": -0.09120459854602814, + "step": 810 + }, + { + "epoch": 0.2, + "grad_norm": 3.7724079757397404, + "learning_rate": 4.860151725037318e-07, + "logits/chosen": -2.741666316986084, + "logits/rejected": -2.730962038040161, + "logps/chosen": -289.5762023925781, + "logps/rejected": -270.65777587890625, + "loss": 0.6453, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.005815769545733929, + "rewards/margins": 0.09417177736759186, + "rewards/margins_max": 0.27101415395736694, + "rewards/margins_min": -0.053407151252031326, + "rewards/margins_std": 0.145038440823555, + "rewards/rejected": -0.09998755156993866, + "step": 820 + }, + { + "epoch": 0.2, + "grad_norm": 2.2482379795645184, + "learning_rate": 4.853177315159253e-07, + "logits/chosen": -2.857778310775757, + "logits/rejected": -2.800199031829834, + "logps/chosen": -347.98565673828125, + "logps/rejected": -289.58050537109375, + "loss": 0.6485, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.011530758813023567, + "rewards/margins": 0.11128588765859604, + "rewards/margins_max": 0.2874368131160736, + "rewards/margins_min": -0.05928174778819084, + "rewards/margins_std": 0.15356837213039398, + "rewards/rejected": -0.09975512325763702, + "step": 830 + }, + { + "epoch": 0.2, + "grad_norm": 2.125471175072589, + "learning_rate": 4.846038453599967e-07, + "logits/chosen": -2.813398599624634, + "logits/rejected": -2.7322680950164795, + "logps/chosen": -299.72760009765625, + "logps/rejected": -268.55255126953125, + "loss": 0.6597, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03378953039646149, + "rewards/margins": 0.09911631047725677, + "rewards/margins_max": 0.27514463663101196, + "rewards/margins_min": -0.07189072668552399, + "rewards/margins_std": 0.1509791612625122, + "rewards/rejected": -0.06532677263021469, + "step": 840 + }, + { + "epoch": 0.2, + "grad_norm": 2.7297000703484, + "learning_rate": 4.838735639258449e-07, + "logits/chosen": -2.839224100112915, + "logits/rejected": -2.8198981285095215, + "logps/chosen": -263.91021728515625, + "logps/rejected": -281.6410217285156, + "loss": 0.6563, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03757554665207863, + "rewards/margins": 0.03192313760519028, + "rewards/margins_max": 0.1935836672782898, + "rewards/margins_min": -0.15483063459396362, + "rewards/margins_std": 0.15361005067825317, + "rewards/rejected": -0.0694986879825592, + "step": 850 + }, + { + "epoch": 0.21, + "grad_norm": 2.322100120915896, + "learning_rate": 4.831269382491519e-07, + "logits/chosen": -2.7852022647857666, + "logits/rejected": -2.8032796382904053, + "logps/chosen": -264.82830810546875, + "logps/rejected": -287.14013671875, + "loss": 0.6561, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.016894642263650894, + "rewards/margins": 0.053649015724658966, + "rewards/margins_max": 0.28593772649765015, + "rewards/margins_min": -0.16067473590373993, + "rewards/margins_std": 0.19484171271324158, + "rewards/rejected": -0.07054366171360016, + "step": 860 + }, + { + "epoch": 0.21, + "grad_norm": 2.4021676844145188, + "learning_rate": 4.823640205078166e-07, + "logits/chosen": -2.8161494731903076, + "logits/rejected": -2.796370029449463, + "logps/chosen": -242.9220428466797, + "logps/rejected": -254.65536499023438, + "loss": 0.6614, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.026305362582206726, + "rewards/margins": 0.06265170872211456, + "rewards/margins_max": 0.23923330008983612, + "rewards/margins_min": -0.13371381163597107, + "rewards/margins_std": 0.16895791888237, + "rewards/rejected": -0.08895707130432129, + "step": 870 + }, + { + "epoch": 0.21, + "grad_norm": 2.5730950503474523, + "learning_rate": 4.815848640183081e-07, + "logits/chosen": -2.749844789505005, + "logits/rejected": -2.7095799446105957, + "logps/chosen": -328.59759521484375, + "logps/rejected": -296.31890869140625, + "loss": 0.6469, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.023172562941908836, + "rewards/margins": 0.09964267909526825, + "rewards/margins_max": 0.3635765612125397, + "rewards/margins_min": -0.10102187097072601, + "rewards/margins_std": 0.20656804740428925, + "rewards/rejected": -0.07647012174129486, + "step": 880 + }, + { + "epoch": 0.21, + "grad_norm": 2.409267861523612, + "learning_rate": 4.807895232319393e-07, + "logits/chosen": -2.766179323196411, + "logits/rejected": -2.7184956073760986, + "logps/chosen": -293.44976806640625, + "logps/rejected": -222.43508911132812, + "loss": 0.6566, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.03060835599899292, + "rewards/margins": 0.08982095867395401, + "rewards/margins_max": 0.30089056491851807, + "rewards/margins_min": -0.08933084458112717, + "rewards/margins_std": 0.17627611756324768, + "rewards/rejected": -0.12042931467294693, + "step": 890 + }, + { + "epoch": 0.22, + "grad_norm": 3.844275461891915, + "learning_rate": 4.799780537310621e-07, + "logits/chosen": -2.7792530059814453, + "logits/rejected": -2.7382729053497314, + "logps/chosen": -321.98876953125, + "logps/rejected": -290.2458190917969, + "loss": 0.6457, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.011465306393802166, + "rewards/margins": 0.12456780672073364, + "rewards/margins_max": 0.35805121064186096, + "rewards/margins_min": -0.07632803171873093, + "rewards/margins_std": 0.19223228096961975, + "rewards/rejected": -0.1360331028699875, + "step": 900 + }, + { + "epoch": 0.22, + "eval_logits/chosen": -2.7500076293945312, + "eval_logits/rejected": -2.7167701721191406, + "eval_logps/chosen": -290.4435729980469, + "eval_logps/rejected": -281.3417663574219, + "eval_loss": 0.6530823111534119, + "eval_rewards/accuracies": 0.6754999756813049, + "eval_rewards/chosen": -0.05988248437643051, + "eval_rewards/margins": 0.09500282257795334, + "eval_rewards/margins_max": 0.42163950204849243, + "eval_rewards/margins_min": -0.19472260773181915, + "eval_rewards/margins_std": 0.20590785145759583, + "eval_rewards/rejected": -0.15488530695438385, + "eval_runtime": 859.3926, + "eval_samples_per_second": 4.654, + "eval_steps_per_second": 0.291, + "step": 900 + }, + { + "epoch": 0.22, + "grad_norm": 2.382660807799636, + "learning_rate": 4.791505122251827e-07, + "logits/chosen": -2.824524164199829, + "logits/rejected": -2.7542624473571777, + "logps/chosen": -250.38687133789062, + "logps/rejected": -234.7572479248047, + "loss": 0.6397, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.05901294946670532, + "rewards/margins": 0.12852489948272705, + "rewards/margins_max": 0.30426403880119324, + "rewards/margins_min": -0.05684134364128113, + "rewards/margins_std": 0.17195206880569458, + "rewards/rejected": -0.18753783404827118, + "step": 910 + }, + { + "epoch": 0.22, + "grad_norm": 2.3424411425122367, + "learning_rate": 4.783069565469985e-07, + "logits/chosen": -2.7414608001708984, + "logits/rejected": -2.7249720096588135, + "logps/chosen": -290.5932922363281, + "logps/rejected": -290.652099609375, + "loss": 0.649, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.07417134940624237, + "rewards/margins": 0.09879481792449951, + "rewards/margins_max": 0.35133180022239685, + "rewards/margins_min": -0.12309278547763824, + "rewards/margins_std": 0.21439354121685028, + "rewards/rejected": -0.17296616733074188, + "step": 920 + }, + { + "epoch": 0.22, + "grad_norm": 2.345627604848268, + "learning_rate": 4.77447445648357e-07, + "logits/chosen": -2.751471757888794, + "logits/rejected": -2.7147216796875, + "logps/chosen": -270.4989929199219, + "logps/rejected": -233.923095703125, + "loss": 0.6507, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06592012941837311, + "rewards/margins": 0.09131678938865662, + "rewards/margins_max": 0.3055071234703064, + "rewards/margins_min": -0.1199672594666481, + "rewards/margins_std": 0.19045254588127136, + "rewards/rejected": -0.15723691880702972, + "step": 930 + }, + { + "epoch": 0.23, + "grad_norm": 2.251657087680744, + "learning_rate": 4.765720395961349e-07, + "logits/chosen": -2.779428243637085, + "logits/rejected": -2.778550386428833, + "logps/chosen": -276.707763671875, + "logps/rejected": -277.7341613769531, + "loss": 0.6564, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.01194118894636631, + "rewards/margins": 0.10344435274600983, + "rewards/margins_max": 0.3220486044883728, + "rewards/margins_min": -0.08887827396392822, + "rewards/margins_std": 0.18643055856227875, + "rewards/rejected": -0.09150315821170807, + "step": 940 + }, + { + "epoch": 0.23, + "grad_norm": 2.435889861575151, + "learning_rate": 4.7568079956804144e-07, + "logits/chosen": -2.8326854705810547, + "logits/rejected": -2.791243076324463, + "logps/chosen": -322.36151123046875, + "logps/rejected": -308.95574951171875, + "loss": 0.6436, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0057469927705824375, + "rewards/margins": 0.1237279623746872, + "rewards/margins_max": 0.41585248708724976, + "rewards/margins_min": -0.14109480381011963, + "rewards/margins_std": 0.24837708473205566, + "rewards/rejected": -0.129474937915802, + "step": 950 + }, + { + "epoch": 0.23, + "grad_norm": 2.4980689220037164, + "learning_rate": 4.74773787848342e-07, + "logits/chosen": -2.847013473510742, + "logits/rejected": -2.787186622619629, + "logps/chosen": -304.60577392578125, + "logps/rejected": -257.9570617675781, + "loss": 0.6406, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.010193193331360817, + "rewards/margins": 0.12367801368236542, + "rewards/margins_max": 0.4210747182369232, + "rewards/margins_min": -0.11235042661428452, + "rewards/margins_std": 0.2386142909526825, + "rewards/rejected": -0.1338711977005005, + "step": 960 + }, + { + "epoch": 0.23, + "grad_norm": 2.953171825542286, + "learning_rate": 4.7385106782350637e-07, + "logits/chosen": -2.7796080112457275, + "logits/rejected": -2.723635196685791, + "logps/chosen": -322.67254638671875, + "logps/rejected": -312.0019226074219, + "loss": 0.6306, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.022632339969277382, + "rewards/margins": 0.1675281822681427, + "rewards/margins_max": 0.4663251042366028, + "rewards/margins_min": -0.11809631437063217, + "rewards/margins_std": 0.25347962975502014, + "rewards/rejected": -0.19016052782535553, + "step": 970 + }, + { + "epoch": 0.23, + "grad_norm": 2.857579235403436, + "learning_rate": 4.729127039777781e-07, + "logits/chosen": -2.6996328830718994, + "logits/rejected": -2.6781938076019287, + "logps/chosen": -257.40435791015625, + "logps/rejected": -246.96444702148438, + "loss": 0.6553, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.14536455273628235, + "rewards/margins": 0.08878039568662643, + "rewards/margins_max": 0.35398316383361816, + "rewards/margins_min": -0.10649490356445312, + "rewards/margins_std": 0.20907866954803467, + "rewards/rejected": -0.23414495587348938, + "step": 980 + }, + { + "epoch": 0.24, + "grad_norm": 4.883361297284326, + "learning_rate": 4.719587618886685e-07, + "logits/chosen": -2.8046514987945557, + "logits/rejected": -2.7349839210510254, + "logps/chosen": -316.08917236328125, + "logps/rejected": -316.1483459472656, + "loss": 0.6611, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05946500971913338, + "rewards/margins": 0.13033434748649597, + "rewards/margins_max": 0.41287779808044434, + "rewards/margins_min": -0.11944649368524551, + "rewards/margins_std": 0.23220928013324738, + "rewards/rejected": -0.18979936838150024, + "step": 990 + }, + { + "epoch": 0.24, + "grad_norm": 2.5571895560851043, + "learning_rate": 4.709893082223737e-07, + "logits/chosen": -2.799964189529419, + "logits/rejected": -2.7523694038391113, + "logps/chosen": -304.8963928222656, + "logps/rejected": -297.629638671875, + "loss": 0.6356, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06462176889181137, + "rewards/margins": 0.10349669307470322, + "rewards/margins_max": 0.38461166620254517, + "rewards/margins_min": -0.16973380744457245, + "rewards/margins_std": 0.2456401288509369, + "rewards/rejected": -0.1681184470653534, + "step": 1000 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -2.7361843585968018, + "eval_logits/rejected": -2.704155683517456, + "eval_logps/chosen": -290.7086486816406, + "eval_logps/rejected": -283.9889831542969, + "eval_loss": 0.6448861360549927, + "eval_rewards/accuracies": 0.6784999966621399, + "eval_rewards/chosen": -0.0625331774353981, + "eval_rewards/margins": 0.11882392317056656, + "eval_rewards/margins_max": 0.52253258228302, + "eval_rewards/margins_min": -0.24860599637031555, + "eval_rewards/margins_std": 0.25825318694114685, + "eval_rewards/rejected": -0.18135710060596466, + "eval_runtime": 860.0164, + "eval_samples_per_second": 4.651, + "eval_steps_per_second": 0.291, + "step": 1000 + }, + { + "epoch": 0.24, + "grad_norm": 3.0310220534360233, + "learning_rate": 4.7000441072911554e-07, + "logits/chosen": -2.7325241565704346, + "logits/rejected": -2.7142276763916016, + "logps/chosen": -259.64825439453125, + "logps/rejected": -293.3815002441406, + "loss": 0.6377, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.05959668755531311, + "rewards/margins": 0.12099339067935944, + "rewards/margins_max": 0.3900481164455414, + "rewards/margins_min": -0.1284235268831253, + "rewards/margins_std": 0.23061911761760712, + "rewards/rejected": -0.18059007823467255, + "step": 1010 + }, + { + "epoch": 0.24, + "grad_norm": 3.1136410073370375, + "learning_rate": 4.690041382384071e-07, + "logits/chosen": -2.7031478881835938, + "logits/rejected": -2.720886468887329, + "logps/chosen": -234.29135131835938, + "logps/rejected": -248.6089324951172, + "loss": 0.6333, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.01792748272418976, + "rewards/margins": 0.1429601013660431, + "rewards/margins_max": 0.3770293593406677, + "rewards/margins_min": -0.058856137096881866, + "rewards/margins_std": 0.1958865374326706, + "rewards/rejected": -0.16088759899139404, + "step": 1020 + }, + { + "epoch": 0.25, + "grad_norm": 3.080625031947799, + "learning_rate": 4.679885606542423e-07, + "logits/chosen": -2.7704384326934814, + "logits/rejected": -2.7624263763427734, + "logps/chosen": -259.32757568359375, + "logps/rejected": -268.58782958984375, + "loss": 0.6351, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02662523090839386, + "rewards/margins": 0.12475328147411346, + "rewards/margins_max": 0.3847391605377197, + "rewards/margins_min": -0.10660214722156525, + "rewards/margins_std": 0.2208033800125122, + "rewards/rejected": -0.15137849748134613, + "step": 1030 + }, + { + "epoch": 0.25, + "grad_norm": 4.393026000885723, + "learning_rate": 4.669577489502108e-07, + "logits/chosen": -2.798563003540039, + "logits/rejected": -2.7397446632385254, + "logps/chosen": -283.36920166015625, + "logps/rejected": -280.3486022949219, + "loss": 0.6213, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.018056590110063553, + "rewards/margins": 0.15028563141822815, + "rewards/margins_max": 0.41389912366867065, + "rewards/margins_min": -0.0801553726196289, + "rewards/margins_std": 0.2179095447063446, + "rewards/rejected": -0.1683422327041626, + "step": 1040 + }, + { + "epoch": 0.25, + "grad_norm": 3.3263851865687215, + "learning_rate": 4.6591177516453795e-07, + "logits/chosen": -2.646252155303955, + "logits/rejected": -2.6699962615966797, + "logps/chosen": -262.1933898925781, + "logps/rejected": -261.60467529296875, + "loss": 0.6335, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.10094554722309113, + "rewards/margins": 0.09482800960540771, + "rewards/margins_max": 0.33993563055992126, + "rewards/margins_min": -0.14256446063518524, + "rewards/margins_std": 0.22054481506347656, + "rewards/rejected": -0.19577357172966003, + "step": 1050 + }, + { + "epoch": 0.25, + "grad_norm": 3.0935778329126573, + "learning_rate": 4.6485071239505037e-07, + "logits/chosen": -2.765110731124878, + "logits/rejected": -2.7562057971954346, + "logps/chosen": -293.51458740234375, + "logps/rejected": -281.2161865234375, + "loss": 0.6342, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.002409782260656357, + "rewards/margins": 0.17773596942424774, + "rewards/margins_max": 0.5016456842422485, + "rewards/margins_min": -0.13163571059703827, + "rewards/margins_std": 0.2804378569126129, + "rewards/rejected": -0.1801457554101944, + "step": 1060 + }, + { + "epoch": 0.26, + "grad_norm": 2.8692948499223663, + "learning_rate": 4.6377463479406777e-07, + "logits/chosen": -2.7676243782043457, + "logits/rejected": -2.721116304397583, + "logps/chosen": -301.32562255859375, + "logps/rejected": -275.3507385253906, + "loss": 0.6185, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06775529682636261, + "rewards/margins": 0.16383641958236694, + "rewards/margins_max": 0.4588744640350342, + "rewards/margins_min": -0.13780052959918976, + "rewards/margins_std": 0.2671576738357544, + "rewards/rejected": -0.23159170150756836, + "step": 1070 + }, + { + "epoch": 0.26, + "grad_norm": 2.8944432789642986, + "learning_rate": 4.6268361756322037e-07, + "logits/chosen": -2.7499701976776123, + "logits/rejected": -2.691213846206665, + "logps/chosen": -320.1142272949219, + "logps/rejected": -289.4026794433594, + "loss": 0.6263, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.0063354698941111565, + "rewards/margins": 0.18059185147285461, + "rewards/margins_max": 0.4770359992980957, + "rewards/margins_min": -0.0981023907661438, + "rewards/margins_std": 0.2540958523750305, + "rewards/rejected": -0.18692728877067566, + "step": 1080 + }, + { + "epoch": 0.26, + "grad_norm": 3.3092779647822512, + "learning_rate": 4.6157773694819396e-07, + "logits/chosen": -2.7550148963928223, + "logits/rejected": -2.742978572845459, + "logps/chosen": -282.58740234375, + "logps/rejected": -348.9330139160156, + "loss": 0.6443, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07441152632236481, + "rewards/margins": 0.12869814038276672, + "rewards/margins_max": 0.483846515417099, + "rewards/margins_min": -0.22396783530712128, + "rewards/margins_std": 0.3191817104816437, + "rewards/rejected": -0.20310965180397034, + "step": 1090 + }, + { + "epoch": 0.26, + "grad_norm": 3.5516894766333014, + "learning_rate": 4.60457070233401e-07, + "logits/chosen": -2.646277904510498, + "logits/rejected": -2.645847797393799, + "logps/chosen": -261.3597412109375, + "logps/rejected": -252.5585479736328, + "loss": 0.6465, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07279185205698013, + "rewards/margins": 0.14220745861530304, + "rewards/margins_max": 0.4044904112815857, + "rewards/margins_min": -0.1086801290512085, + "rewards/margins_std": 0.2287602722644806, + "rewards/rejected": -0.21499928832054138, + "step": 1100 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.730116128921509, + "eval_logits/rejected": -2.698188543319702, + "eval_logps/chosen": -287.3659362792969, + "eval_logps/rejected": -282.8690490722656, + "eval_loss": 0.637828528881073, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -0.029106074944138527, + "eval_rewards/margins": 0.14105208218097687, + "eval_rewards/margins_max": 0.6107731461524963, + "eval_rewards/margins_min": -0.2945648431777954, + "eval_rewards/margins_std": 0.303110808134079, + "eval_rewards/rejected": -0.17015816271305084, + "eval_runtime": 859.776, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 1100 + }, + { + "epoch": 0.27, + "grad_norm": 3.1790500911139357, + "learning_rate": 4.5932169573657987e-07, + "logits/chosen": -2.815812110900879, + "logits/rejected": -2.797717571258545, + "logps/chosen": -319.1174011230469, + "logps/rejected": -329.5782165527344, + "loss": 0.6299, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.015058162622153759, + "rewards/margins": 0.18906506896018982, + "rewards/margins_max": 0.4647350311279297, + "rewards/margins_min": -0.12183723598718643, + "rewards/margins_std": 0.25658389925956726, + "rewards/rejected": -0.17400690913200378, + "step": 1110 + }, + { + "epoch": 0.27, + "grad_norm": 4.171269633630187, + "learning_rate": 4.581716928033216e-07, + "logits/chosen": -2.7732093334198, + "logits/rejected": -2.7576842308044434, + "logps/chosen": -284.92010498046875, + "logps/rejected": -297.70751953125, + "loss": 0.641, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04591558128595352, + "rewards/margins": 0.09049420803785324, + "rewards/margins_max": 0.4978618025779724, + "rewards/margins_min": -0.2861872613430023, + "rewards/margins_std": 0.34444791078567505, + "rewards/rejected": -0.13640980422496796, + "step": 1120 + }, + { + "epoch": 0.27, + "grad_norm": 3.4951713495995476, + "learning_rate": 4.5700714180152467e-07, + "logits/chosen": -2.688694477081299, + "logits/rejected": -2.654418468475342, + "logps/chosen": -230.10556030273438, + "logps/rejected": -241.9662322998047, + "loss": 0.637, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.06187693029642105, + "rewards/margins": 0.12760981917381287, + "rewards/margins_max": 0.45869001746177673, + "rewards/margins_min": -0.14676091074943542, + "rewards/margins_std": 0.2812282145023346, + "rewards/rejected": -0.18948674201965332, + "step": 1130 + }, + { + "epoch": 0.27, + "grad_norm": 4.7663237512836645, + "learning_rate": 4.5582812411577887e-07, + "logits/chosen": -2.7296979427337646, + "logits/rejected": -2.7077085971832275, + "logps/chosen": -284.59747314453125, + "logps/rejected": -273.9245910644531, + "loss": 0.6417, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05198391154408455, + "rewards/margins": 0.15814228355884552, + "rewards/margins_max": 0.4948105812072754, + "rewards/margins_min": -0.1632794737815857, + "rewards/margins_std": 0.28802981972694397, + "rewards/rejected": -0.21012616157531738, + "step": 1140 + }, + { + "epoch": 0.28, + "grad_norm": 3.377083311996758, + "learning_rate": 4.546347221416772e-07, + "logits/chosen": -2.728276252746582, + "logits/rejected": -2.6977105140686035, + "logps/chosen": -266.24090576171875, + "logps/rejected": -267.21038818359375, + "loss": 0.6267, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.021903954446315765, + "rewards/margins": 0.18644428253173828, + "rewards/margins_max": 0.5090216994285583, + "rewards/margins_min": -0.06704654544591904, + "rewards/margins_std": 0.2600798010826111, + "rewards/rejected": -0.20834822952747345, + "step": 1150 + }, + { + "epoch": 0.28, + "grad_norm": 3.6845226876229042, + "learning_rate": 4.534270192800581e-07, + "logits/chosen": -2.707961082458496, + "logits/rejected": -2.677424669265747, + "logps/chosen": -262.2182922363281, + "logps/rejected": -269.26556396484375, + "loss": 0.6281, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.026781385764479637, + "rewards/margins": 0.1710069179534912, + "rewards/margins_max": 0.4876475930213928, + "rewards/margins_min": -0.13151448965072632, + "rewards/margins_std": 0.2734828591346741, + "rewards/rejected": -0.14422553777694702, + "step": 1160 + }, + { + "epoch": 0.28, + "grad_norm": 4.547609879401967, + "learning_rate": 4.5220509993117684e-07, + "logits/chosen": -2.789504289627075, + "logits/rejected": -2.7169666290283203, + "logps/chosen": -298.80975341796875, + "logps/rejected": -274.0549621582031, + "loss": 0.6339, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03234567120671272, + "rewards/margins": 0.11823008954524994, + "rewards/margins_max": 0.45920103788375854, + "rewards/margins_min": -0.26762890815734863, + "rewards/margins_std": 0.33254751563072205, + "rewards/rejected": -0.15057575702667236, + "step": 1170 + }, + { + "epoch": 0.28, + "grad_norm": 2.7231328429432455, + "learning_rate": 4.509690494888071e-07, + "logits/chosen": -2.766167402267456, + "logits/rejected": -2.7049505710601807, + "logps/chosen": -335.00714111328125, + "logps/rejected": -295.6134338378906, + "loss": 0.6278, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.04288125038146973, + "rewards/margins": 0.17231056094169617, + "rewards/margins_max": 0.47455501556396484, + "rewards/margins_min": -0.13174986839294434, + "rewards/margins_std": 0.2745053172111511, + "rewards/rejected": -0.12942931056022644, + "step": 1180 + }, + { + "epoch": 0.28, + "grad_norm": 3.9147600172988883, + "learning_rate": 4.4971895433427356e-07, + "logits/chosen": -2.726125955581665, + "logits/rejected": -2.7111525535583496, + "logps/chosen": -227.27102661132812, + "logps/rejected": -236.37942504882812, + "loss": 0.6119, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.00787246972322464, + "rewards/margins": 0.1817532330751419, + "rewards/margins_max": 0.5178753137588501, + "rewards/margins_min": -0.14762163162231445, + "rewards/margins_std": 0.3076411187648773, + "rewards/rejected": -0.17388075590133667, + "step": 1190 + }, + { + "epoch": 0.29, + "grad_norm": 5.455820024290682, + "learning_rate": 4.4845490183041454e-07, + "logits/chosen": -2.7426819801330566, + "logits/rejected": -2.7446188926696777, + "logps/chosen": -308.69561767578125, + "logps/rejected": -318.27105712890625, + "loss": 0.6121, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.03109120763838291, + "rewards/margins": 0.18727445602416992, + "rewards/margins_max": 0.45334863662719727, + "rewards/margins_min": -0.10804096609354019, + "rewards/margins_std": 0.2554578185081482, + "rewards/rejected": -0.21836566925048828, + "step": 1200 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.7208263874053955, + "eval_logits/rejected": -2.68928861618042, + "eval_logps/chosen": -291.0350341796875, + "eval_logps/rejected": -288.4626159667969, + "eval_loss": 0.6317090392112732, + "eval_rewards/accuracies": 0.6779999732971191, + "eval_rewards/chosen": -0.06579707562923431, + "eval_rewards/margins": 0.1602962464094162, + "eval_rewards/margins_max": 0.6846780776977539, + "eval_rewards/margins_min": -0.3354347348213196, + "eval_rewards/margins_std": 0.3418317437171936, + "eval_rewards/rejected": -0.2260933220386505, + "eval_runtime": 859.8431, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 1200 + }, + { + "epoch": 0.29, + "grad_norm": 3.3676733387809215, + "learning_rate": 4.4717698031547733e-07, + "logits/chosen": -2.787823438644409, + "logits/rejected": -2.7212958335876465, + "logps/chosen": -312.23675537109375, + "logps/rejected": -287.49444580078125, + "loss": 0.6022, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.050309885293245316, + "rewards/margins": 0.2127988636493683, + "rewards/margins_max": 0.6404433250427246, + "rewards/margins_min": -0.13751380145549774, + "rewards/margins_std": 0.3414255380630493, + "rewards/rejected": -0.2631087601184845, + "step": 1210 + }, + { + "epoch": 0.29, + "grad_norm": 3.938071315431553, + "learning_rate": 4.458852790969445e-07, + "logits/chosen": -2.7835850715637207, + "logits/rejected": -2.75274658203125, + "logps/chosen": -278.7928771972656, + "logps/rejected": -285.7991027832031, + "loss": 0.6139, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07205849885940552, + "rewards/margins": 0.16392816603183746, + "rewards/margins_max": 0.5056854486465454, + "rewards/margins_min": -0.16414335370063782, + "rewards/margins_std": 0.29500722885131836, + "rewards/rejected": -0.23598666489124298, + "step": 1220 + }, + { + "epoch": 0.29, + "grad_norm": 4.625542479996582, + "learning_rate": 4.4457988844529204e-07, + "logits/chosen": -2.7648215293884277, + "logits/rejected": -2.73262619972229, + "logps/chosen": -258.94415283203125, + "logps/rejected": -301.65460205078125, + "loss": 0.6296, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08725883066654205, + "rewards/margins": 0.18550553917884827, + "rewards/margins_max": 0.6380602121353149, + "rewards/margins_min": -0.20465464890003204, + "rewards/margins_std": 0.37270504236221313, + "rewards/rejected": -0.2727643549442291, + "step": 1230 + }, + { + "epoch": 0.3, + "grad_norm": 4.067322201866829, + "learning_rate": 4.432608995876819e-07, + "logits/chosen": -2.7919411659240723, + "logits/rejected": -2.7073256969451904, + "logps/chosen": -276.91094970703125, + "logps/rejected": -264.3546142578125, + "loss": 0.6487, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.08658798038959503, + "rewards/margins": 0.11079300940036774, + "rewards/margins_max": 0.5283591747283936, + "rewards/margins_min": -0.31804972887039185, + "rewards/margins_std": 0.3785925805568695, + "rewards/rejected": -0.19738095998764038, + "step": 1240 + }, + { + "epoch": 0.3, + "grad_norm": 3.6440421509131635, + "learning_rate": 4.419284047015854e-07, + "logits/chosen": -2.795815944671631, + "logits/rejected": -2.7724575996398926, + "logps/chosen": -298.9609375, + "logps/rejected": -261.5467224121094, + "loss": 0.6215, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.08188175410032272, + "rewards/margins": 0.18586918711662292, + "rewards/margins_max": 0.5499319434165955, + "rewards/margins_min": -0.1795201450586319, + "rewards/margins_std": 0.32101696729660034, + "rewards/rejected": -0.26775094866752625, + "step": 1250 + }, + { + "epoch": 0.3, + "grad_norm": 5.21366259739191, + "learning_rate": 4.4058249690834235e-07, + "logits/chosen": -2.7669315338134766, + "logits/rejected": -2.7613534927368164, + "logps/chosen": -264.0572509765625, + "logps/rejected": -264.752685546875, + "loss": 0.6176, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13562199473381042, + "rewards/margins": 0.15420284867286682, + "rewards/margins_max": 0.44718003273010254, + "rewards/margins_min": -0.14009472727775574, + "rewards/margins_std": 0.26433926820755005, + "rewards/rejected": -0.28982487320899963, + "step": 1260 + }, + { + "epoch": 0.3, + "grad_norm": 5.1530670514385015, + "learning_rate": 4.39223270266653e-07, + "logits/chosen": -2.7849080562591553, + "logits/rejected": -2.7341203689575195, + "logps/chosen": -299.35552978515625, + "logps/rejected": -314.02850341796875, + "loss": 0.6043, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10228396952152252, + "rewards/margins": 0.20735082030296326, + "rewards/margins_max": 0.5664128661155701, + "rewards/margins_min": -0.15627393126487732, + "rewards/margins_std": 0.33992764353752136, + "rewards/rejected": -0.30963483452796936, + "step": 1270 + }, + { + "epoch": 0.31, + "grad_norm": 4.552507091267408, + "learning_rate": 4.378508197660045e-07, + "logits/chosen": -2.8158326148986816, + "logits/rejected": -2.751063346862793, + "logps/chosen": -320.4525146484375, + "logps/rejected": -308.1334533691406, + "loss": 0.6183, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11220784485340118, + "rewards/margins": 0.23017704486846924, + "rewards/margins_max": 0.6422857046127319, + "rewards/margins_min": -0.21641802787780762, + "rewards/margins_std": 0.3890889883041382, + "rewards/rejected": -0.34238487482070923, + "step": 1280 + }, + { + "epoch": 0.31, + "grad_norm": 3.7868892769010043, + "learning_rate": 4.364652413200325e-07, + "logits/chosen": -2.81675124168396, + "logits/rejected": -2.758913993835449, + "logps/chosen": -323.35797119140625, + "logps/rejected": -289.88714599609375, + "loss": 0.6069, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12105697393417358, + "rewards/margins": 0.229929119348526, + "rewards/margins_max": 0.6117097735404968, + "rewards/margins_min": -0.11338132619857788, + "rewards/margins_std": 0.32332319021224976, + "rewards/rejected": -0.3509860932826996, + "step": 1290 + }, + { + "epoch": 0.31, + "grad_norm": 7.258932464550304, + "learning_rate": 4.35066631759819e-07, + "logits/chosen": -2.745797634124756, + "logits/rejected": -2.7432026863098145, + "logps/chosen": -297.58990478515625, + "logps/rejected": -297.460205078125, + "loss": 0.6113, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1302531361579895, + "rewards/margins": 0.22044309973716736, + "rewards/margins_max": 0.6038533449172974, + "rewards/margins_min": -0.1656443476676941, + "rewards/margins_std": 0.34139499068260193, + "rewards/rejected": -0.35069626569747925, + "step": 1300 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.72509765625, + "eval_logits/rejected": -2.6940572261810303, + "eval_logps/chosen": -302.64703369140625, + "eval_logps/rejected": -301.4143981933594, + "eval_loss": 0.6287034749984741, + "eval_rewards/accuracies": 0.6819999814033508, + "eval_rewards/chosen": -0.18191717565059662, + "eval_rewards/margins": 0.17369407415390015, + "eval_rewards/margins_max": 0.7286714911460876, + "eval_rewards/margins_min": -0.3415972888469696, + "eval_rewards/margins_std": 0.36210083961486816, + "eval_rewards/rejected": -0.35561126470565796, + "eval_runtime": 859.8191, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 1300 + }, + { + "epoch": 0.31, + "grad_norm": 4.240268786205948, + "learning_rate": 4.3365508882712445e-07, + "logits/chosen": -2.7560200691223145, + "logits/rejected": -2.7424092292785645, + "logps/chosen": -326.30792236328125, + "logps/rejected": -296.25640869140625, + "loss": 0.6078, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08834477514028549, + "rewards/margins": 0.2148284614086151, + "rewards/margins_max": 0.7110589146614075, + "rewards/margins_min": -0.19051234424114227, + "rewards/margins_std": 0.4097180962562561, + "rewards/rejected": -0.3031732439994812, + "step": 1310 + }, + { + "epoch": 0.32, + "grad_norm": 4.561867866097655, + "learning_rate": 4.322307111675573e-07, + "logits/chosen": -2.6840503215789795, + "logits/rejected": -2.655874013900757, + "logps/chosen": -283.03094482421875, + "logps/rejected": -266.55950927734375, + "loss": 0.6208, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1422191858291626, + "rewards/margins": 0.20038846135139465, + "rewards/margins_max": 0.6033264398574829, + "rewards/margins_min": -0.1278662532567978, + "rewards/margins_std": 0.32722723484039307, + "rewards/rejected": -0.34260767698287964, + "step": 1320 + }, + { + "epoch": 0.32, + "grad_norm": 3.4629976156007873, + "learning_rate": 4.3079359832368055e-07, + "logits/chosen": -2.772587299346924, + "logits/rejected": -2.6875176429748535, + "logps/chosen": -298.23114013671875, + "logps/rejected": -250.58224487304688, + "loss": 0.5873, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.05119676515460014, + "rewards/margins": 0.21297863125801086, + "rewards/margins_max": 0.6446580290794373, + "rewards/margins_min": -0.1635059416294098, + "rewards/margins_std": 0.3629075586795807, + "rewards/rejected": -0.2641753554344177, + "step": 1330 + }, + { + "epoch": 0.32, + "grad_norm": 5.110179032923911, + "learning_rate": 4.2934385072805467e-07, + "logits/chosen": -2.6937944889068604, + "logits/rejected": -2.6603851318359375, + "logps/chosen": -269.38800048828125, + "logps/rejected": -250.0190887451172, + "loss": 0.594, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.033771999180316925, + "rewards/margins": 0.20248901844024658, + "rewards/margins_max": 0.6091136336326599, + "rewards/margins_min": -0.19329795241355896, + "rewards/margins_std": 0.375012069940567, + "rewards/rejected": -0.2362610101699829, + "step": 1340 + }, + { + "epoch": 0.32, + "grad_norm": 4.528447830349519, + "learning_rate": 4.278815696962195e-07, + "logits/chosen": -2.7512173652648926, + "logits/rejected": -2.728698968887329, + "logps/chosen": -299.51873779296875, + "logps/rejected": -314.3564758300781, + "loss": 0.595, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.05349518731236458, + "rewards/margins": 0.24695232510566711, + "rewards/margins_max": 0.6851639151573181, + "rewards/margins_min": -0.15185198187828064, + "rewards/margins_std": 0.37347060441970825, + "rewards/rejected": -0.3004475235939026, + "step": 1350 + }, + { + "epoch": 0.33, + "grad_norm": 3.756856118552827, + "learning_rate": 4.264068574196129e-07, + "logits/chosen": -2.680025577545166, + "logits/rejected": -2.6544578075408936, + "logps/chosen": -300.19915771484375, + "logps/rejected": -276.1390686035156, + "loss": 0.6219, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06253058463335037, + "rewards/margins": 0.24794158339500427, + "rewards/margins_max": 0.6766451597213745, + "rewards/margins_min": -0.20260021090507507, + "rewards/margins_std": 0.3898962438106537, + "rewards/rejected": -0.31047219038009644, + "step": 1360 + }, + { + "epoch": 0.33, + "grad_norm": 8.004265125098687, + "learning_rate": 4.2491981695843016e-07, + "logits/chosen": -2.706125020980835, + "logits/rejected": -2.731999158859253, + "logps/chosen": -265.1845703125, + "logps/rejected": -318.8197326660156, + "loss": 0.6301, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.13150812685489655, + "rewards/margins": 0.18570031225681305, + "rewards/margins_max": 0.6965837478637695, + "rewards/margins_min": -0.30658960342407227, + "rewards/margins_std": 0.4410664439201355, + "rewards/rejected": -0.3172084093093872, + "step": 1370 + }, + { + "epoch": 0.33, + "grad_norm": 3.700065024552013, + "learning_rate": 4.2342055223442093e-07, + "logits/chosen": -2.7233641147613525, + "logits/rejected": -2.7470736503601074, + "logps/chosen": -290.7392578125, + "logps/rejected": -291.89642333984375, + "loss": 0.629, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.11083575338125229, + "rewards/margins": 0.20581002533435822, + "rewards/margins_max": 0.6339712738990784, + "rewards/margins_min": -0.2242089807987213, + "rewards/margins_std": 0.3768702745437622, + "rewards/rejected": -0.3166458010673523, + "step": 1380 + }, + { + "epoch": 0.33, + "grad_norm": 4.60283629009898, + "learning_rate": 4.2190916802362687e-07, + "logits/chosen": -2.777204990386963, + "logits/rejected": -2.7478649616241455, + "logps/chosen": -266.34307861328125, + "logps/rejected": -287.4773254394531, + "loss": 0.6293, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.15680015087127686, + "rewards/margins": 0.13014456629753113, + "rewards/margins_max": 0.5216537714004517, + "rewards/margins_min": -0.27942514419555664, + "rewards/margins_std": 0.36587920784950256, + "rewards/rejected": -0.286944717168808, + "step": 1390 + }, + { + "epoch": 0.34, + "grad_norm": 4.909187943690839, + "learning_rate": 4.203857699490593e-07, + "logits/chosen": -2.7606589794158936, + "logits/rejected": -2.703326940536499, + "logps/chosen": -290.61517333984375, + "logps/rejected": -263.78778076171875, + "loss": 0.6058, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1779560148715973, + "rewards/margins": 0.22627469897270203, + "rewards/margins_max": 0.6385394334793091, + "rewards/margins_min": -0.21331973373889923, + "rewards/margins_std": 0.38510316610336304, + "rewards/rejected": -0.4042307436466217, + "step": 1400 + }, + { + "epoch": 0.34, + "eval_logits/chosen": -2.7134692668914795, + "eval_logits/rejected": -2.6823277473449707, + "eval_logps/chosen": -297.353759765625, + "eval_logps/rejected": -297.89019775390625, + "eval_loss": 0.6233600378036499, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -0.12898415327072144, + "eval_rewards/margins": 0.1913849264383316, + "eval_rewards/margins_max": 0.7907646894454956, + "eval_rewards/margins_min": -0.3942703902721405, + "eval_rewards/margins_std": 0.3995422422885895, + "eval_rewards/rejected": -0.32036906480789185, + "eval_runtime": 859.9261, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 1400 + }, + { + "epoch": 0.34, + "grad_norm": 5.627722356903511, + "learning_rate": 4.1885046447331816e-07, + "logits/chosen": -2.676079034805298, + "logits/rejected": -2.689484119415283, + "logps/chosen": -316.91046142578125, + "logps/rejected": -301.6855163574219, + "loss": 0.5979, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.040952786803245544, + "rewards/margins": 0.2542146146297455, + "rewards/margins_max": 0.6011955738067627, + "rewards/margins_min": -0.07450132817029953, + "rewards/margins_std": 0.3058848977088928, + "rewards/rejected": -0.29516738653182983, + "step": 1410 + }, + { + "epoch": 0.34, + "grad_norm": 5.37700385067337, + "learning_rate": 4.173033588911511e-07, + "logits/chosen": -2.7434334754943848, + "logits/rejected": -2.735919952392578, + "logps/chosen": -325.684814453125, + "logps/rejected": -336.29461669921875, + "loss": 0.6186, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.0978202074766159, + "rewards/margins": 0.2610817849636078, + "rewards/margins_max": 0.7425618171691895, + "rewards/margins_min": -0.14109277725219727, + "rewards/margins_std": 0.3950235843658447, + "rewards/rejected": -0.3589020371437073, + "step": 1420 + }, + { + "epoch": 0.34, + "grad_norm": 5.042836371471277, + "learning_rate": 4.157445613219559e-07, + "logits/chosen": -2.6384220123291016, + "logits/rejected": -2.6442770957946777, + "logps/chosen": -285.9534606933594, + "logps/rejected": -309.20098876953125, + "loss": 0.5913, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.16042251884937286, + "rewards/margins": 0.2354971170425415, + "rewards/margins_max": 0.6843439340591431, + "rewards/margins_min": -0.14893712103366852, + "rewards/margins_std": 0.3668159246444702, + "rewards/rejected": -0.3959196209907532, + "step": 1430 + }, + { + "epoch": 0.34, + "grad_norm": 4.964379073516777, + "learning_rate": 4.141741807022243e-07, + "logits/chosen": -2.7422003746032715, + "logits/rejected": -2.687011241912842, + "logps/chosen": -315.2998352050781, + "logps/rejected": -277.56182861328125, + "loss": 0.6199, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.11232249438762665, + "rewards/margins": 0.21811863780021667, + "rewards/margins_max": 0.6316202878952026, + "rewards/margins_min": -0.19104191660881042, + "rewards/margins_std": 0.3665235638618469, + "rewards/rejected": -0.33044111728668213, + "step": 1440 + }, + { + "epoch": 0.35, + "grad_norm": 3.867930208036112, + "learning_rate": 4.1259232677792865e-07, + "logits/chosen": -2.719663143157959, + "logits/rejected": -2.714693069458008, + "logps/chosen": -267.74310302734375, + "logps/rejected": -259.67572021484375, + "loss": 0.6273, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07311922311782837, + "rewards/margins": 0.1725226491689682, + "rewards/margins_max": 0.5369999408721924, + "rewards/margins_min": -0.18779130280017853, + "rewards/margins_std": 0.32979249954223633, + "rewards/rejected": -0.24564187228679657, + "step": 1450 + }, + { + "epoch": 0.35, + "grad_norm": 3.126978743648959, + "learning_rate": 4.1099911009685294e-07, + "logits/chosen": -2.6542458534240723, + "logits/rejected": -2.6500654220581055, + "logps/chosen": -313.4648742675781, + "logps/rejected": -287.6639404296875, + "loss": 0.6001, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.07746293395757675, + "rewards/margins": 0.19846072793006897, + "rewards/margins_max": 0.6413771510124207, + "rewards/margins_min": -0.24117057025432587, + "rewards/margins_std": 0.39690056443214417, + "rewards/rejected": -0.27592363953590393, + "step": 1460 + }, + { + "epoch": 0.35, + "grad_norm": 8.246208475634898, + "learning_rate": 4.093946420008668e-07, + "logits/chosen": -2.6944451332092285, + "logits/rejected": -2.6753087043762207, + "logps/chosen": -280.6971740722656, + "logps/rejected": -270.0848083496094, + "loss": 0.6186, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10358381271362305, + "rewards/margins": 0.18480688333511353, + "rewards/margins_max": 0.67694491147995, + "rewards/margins_min": -0.21014046669006348, + "rewards/margins_std": 0.39692941308021545, + "rewards/rejected": -0.2883906960487366, + "step": 1470 + }, + { + "epoch": 0.35, + "grad_norm": 4.551051368391622, + "learning_rate": 4.0777903461814443e-07, + "logits/chosen": -2.7240898609161377, + "logits/rejected": -2.7123374938964844, + "logps/chosen": -304.209228515625, + "logps/rejected": -275.527099609375, + "loss": 0.6363, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06793724000453949, + "rewards/margins": 0.19897602498531342, + "rewards/margins_max": 0.6630719900131226, + "rewards/margins_min": -0.22334155440330505, + "rewards/margins_std": 0.39632511138916016, + "rewards/rejected": -0.2669132649898529, + "step": 1480 + }, + { + "epoch": 0.36, + "grad_norm": 6.694572915321387, + "learning_rate": 4.061524008553285e-07, + "logits/chosen": -2.6535632610321045, + "logits/rejected": -2.633209466934204, + "logps/chosen": -263.6488037109375, + "logps/rejected": -263.69085693359375, + "loss": 0.5912, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.0520213358104229, + "rewards/margins": 0.25981205701828003, + "rewards/margins_max": 0.6896715760231018, + "rewards/margins_min": -0.2117619812488556, + "rewards/margins_std": 0.4034877419471741, + "rewards/rejected": -0.31183338165283203, + "step": 1490 + }, + { + "epoch": 0.36, + "grad_norm": 5.031901252250718, + "learning_rate": 4.045148543896396e-07, + "logits/chosen": -2.7256431579589844, + "logits/rejected": -2.683338165283203, + "logps/chosen": -298.939208984375, + "logps/rejected": -278.4862365722656, + "loss": 0.6169, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.132019504904747, + "rewards/margins": 0.1406988799571991, + "rewards/margins_max": 0.6073684096336365, + "rewards/margins_min": -0.3294394612312317, + "rewards/margins_std": 0.41978007555007935, + "rewards/rejected": -0.2727183699607849, + "step": 1500 + }, + { + "epoch": 0.36, + "eval_logits/chosen": -2.695730686187744, + "eval_logits/rejected": -2.6648154258728027, + "eval_logps/chosen": -296.9002990722656, + "eval_logps/rejected": -298.718017578125, + "eval_loss": 0.6194451451301575, + "eval_rewards/accuracies": 0.6790000200271606, + "eval_rewards/chosen": -0.1244499683380127, + "eval_rewards/margins": 0.20419739186763763, + "eval_rewards/margins_max": 0.8340767025947571, + "eval_rewards/margins_min": -0.4094077944755554, + "eval_rewards/margins_std": 0.4196898937225342, + "eval_rewards/rejected": -0.3286473751068115, + "eval_runtime": 859.603, + "eval_samples_per_second": 4.653, + "eval_steps_per_second": 0.291, + "step": 1500 + }, + { + "epoch": 0.36, + "grad_norm": 7.5550805890009425, + "learning_rate": 4.028665096609323e-07, + "logits/chosen": -2.7359156608581543, + "logits/rejected": -2.729952096939087, + "logps/chosen": -322.425048828125, + "logps/rejected": -321.57049560546875, + "loss": 0.6208, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.16162991523742676, + "rewards/margins": 0.13901960849761963, + "rewards/margins_max": 0.5425564050674438, + "rewards/margins_min": -0.25365597009658813, + "rewards/margins_std": 0.36137858033180237, + "rewards/rejected": -0.3006495535373688, + "step": 1510 + }, + { + "epoch": 0.36, + "grad_norm": 5.143757431006366, + "learning_rate": 4.01207481863697e-07, + "logits/chosen": -2.8021790981292725, + "logits/rejected": -2.7534518241882324, + "logps/chosen": -345.64410400390625, + "logps/rejected": -318.15753173828125, + "loss": 0.593, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01674334518611431, + "rewards/margins": 0.309499591588974, + "rewards/margins_max": 0.8094781637191772, + "rewards/margins_min": -0.15684106945991516, + "rewards/margins_std": 0.43172699213027954, + "rewards/rejected": -0.29275625944137573, + "step": 1520 + }, + { + "epoch": 0.37, + "grad_norm": 10.866386773496997, + "learning_rate": 3.9953788693901e-07, + "logits/chosen": -2.6948866844177246, + "logits/rejected": -2.6677167415618896, + "logps/chosen": -323.5318908691406, + "logps/rejected": -308.58563232421875, + "loss": 0.6204, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1379726380109787, + "rewards/margins": 0.1759684681892395, + "rewards/margins_max": 0.6472272276878357, + "rewards/margins_min": -0.3483063578605652, + "rewards/margins_std": 0.4497564733028412, + "rewards/rejected": -0.3139411509037018, + "step": 1530 + }, + { + "epoch": 0.37, + "grad_norm": 4.314291761496934, + "learning_rate": 3.978578415664306e-07, + "logits/chosen": -2.6268324851989746, + "logits/rejected": -2.6289455890655518, + "logps/chosen": -273.48516845703125, + "logps/rejected": -260.3494873046875, + "loss": 0.576, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14631512761116028, + "rewards/margins": 0.27899685502052307, + "rewards/margins_max": 0.7275093197822571, + "rewards/margins_min": -0.1443461924791336, + "rewards/margins_std": 0.40121975541114807, + "rewards/rejected": -0.42531198263168335, + "step": 1540 + }, + { + "epoch": 0.37, + "grad_norm": 5.455099453390994, + "learning_rate": 3.9616746315584733e-07, + "logits/chosen": -2.735430955886841, + "logits/rejected": -2.6580283641815186, + "logps/chosen": -334.23291015625, + "logps/rejected": -278.3861999511719, + "loss": 0.5905, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10325656086206436, + "rewards/margins": 0.30803465843200684, + "rewards/margins_max": 0.7940778136253357, + "rewards/margins_min": -0.10969796031713486, + "rewards/margins_std": 0.4056004583835602, + "rewards/rejected": -0.4112912118434906, + "step": 1550 + }, + { + "epoch": 0.37, + "grad_norm": 5.315048324638009, + "learning_rate": 3.9446686983927236e-07, + "logits/chosen": -2.671513795852661, + "logits/rejected": -2.651416301727295, + "logps/chosen": -268.4781494140625, + "logps/rejected": -307.71929931640625, + "loss": 0.6002, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10599759966135025, + "rewards/margins": 0.25854843854904175, + "rewards/margins_max": 0.6484243869781494, + "rewards/margins_min": -0.18162080645561218, + "rewards/margins_std": 0.36594900488853455, + "rewards/rejected": -0.364546000957489, + "step": 1560 + }, + { + "epoch": 0.38, + "grad_norm": 7.897137241515315, + "learning_rate": 3.927561804625863e-07, + "logits/chosen": -2.6680185794830322, + "logits/rejected": -2.645139217376709, + "logps/chosen": -319.42694091796875, + "logps/rejected": -326.59466552734375, + "loss": 0.6295, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04393978416919708, + "rewards/margins": 0.1751846820116043, + "rewards/margins_max": 0.6941227912902832, + "rewards/margins_min": -0.28787922859191895, + "rewards/margins_std": 0.4296892583370209, + "rewards/rejected": -0.21912448108196259, + "step": 1570 + }, + { + "epoch": 0.38, + "grad_norm": 4.353279911743585, + "learning_rate": 3.910355145772323e-07, + "logits/chosen": -2.6887450218200684, + "logits/rejected": -2.6724624633789062, + "logps/chosen": -281.3534240722656, + "logps/rejected": -299.25909423828125, + "loss": 0.5963, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.006468605250120163, + "rewards/margins": 0.3195800483226776, + "rewards/margins_max": 0.7575172185897827, + "rewards/margins_min": -0.10346603393554688, + "rewards/margins_std": 0.39106759428977966, + "rewards/rejected": -0.31311145424842834, + "step": 1580 + }, + { + "epoch": 0.38, + "grad_norm": 5.15471131457158, + "learning_rate": 3.893049924318613e-07, + "logits/chosen": -2.6913163661956787, + "logits/rejected": -2.6768882274627686, + "logps/chosen": -276.04986572265625, + "logps/rejected": -302.8215026855469, + "loss": 0.5666, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.00238300790078938, + "rewards/margins": 0.2921072840690613, + "rewards/margins_max": 0.6786126494407654, + "rewards/margins_min": -0.09073235094547272, + "rewards/margins_std": 0.35521072149276733, + "rewards/rejected": -0.2944903075695038, + "step": 1590 + }, + { + "epoch": 0.38, + "grad_norm": 8.551269294260868, + "learning_rate": 3.875647349639286e-07, + "logits/chosen": -2.7418625354766846, + "logits/rejected": -2.694929838180542, + "logps/chosen": -296.73883056640625, + "logps/rejected": -245.3251495361328, + "loss": 0.5809, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.08604216575622559, + "rewards/margins": 0.27075013518333435, + "rewards/margins_max": 0.7026273012161255, + "rewards/margins_min": -0.14759303629398346, + "rewards/margins_std": 0.38966792821884155, + "rewards/rejected": -0.35679227113723755, + "step": 1600 + }, + { + "epoch": 0.38, + "eval_logits/chosen": -2.6852588653564453, + "eval_logits/rejected": -2.654654026031494, + "eval_logps/chosen": -295.7021484375, + "eval_logps/rejected": -298.765869140625, + "eval_loss": 0.6163187026977539, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -0.1124684140086174, + "eval_rewards/margins": 0.21665772795677185, + "eval_rewards/margins_max": 0.8822912573814392, + "eval_rewards/margins_min": -0.42427411675453186, + "eval_rewards/margins_std": 0.4398875832557678, + "eval_rewards/rejected": -0.3291260898113251, + "eval_runtime": 859.8439, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 1600 + }, + { + "epoch": 0.39, + "grad_norm": 6.465168869279501, + "learning_rate": 3.8581486379124185e-07, + "logits/chosen": -2.760385036468506, + "logits/rejected": -2.756248950958252, + "logps/chosen": -324.0615234375, + "logps/rejected": -299.63360595703125, + "loss": 0.602, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.1135634332895279, + "rewards/margins": 0.28132662177085876, + "rewards/margins_max": 0.75287926197052, + "rewards/margins_min": -0.16431859135627747, + "rewards/margins_std": 0.41888371109962463, + "rewards/rejected": -0.3948900103569031, + "step": 1610 + }, + { + "epoch": 0.39, + "grad_norm": 6.5531401234289905, + "learning_rate": 3.840555012034622e-07, + "logits/chosen": -2.6444177627563477, + "logits/rejected": -2.5895159244537354, + "logps/chosen": -265.99359130859375, + "logps/rejected": -277.5436096191406, + "loss": 0.5879, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13531748950481415, + "rewards/margins": 0.22751832008361816, + "rewards/margins_max": 0.6325951814651489, + "rewards/margins_min": -0.1143103614449501, + "rewards/margins_std": 0.34973862767219543, + "rewards/rejected": -0.3628358244895935, + "step": 1620 + }, + { + "epoch": 0.39, + "grad_norm": 5.518097666358285, + "learning_rate": 3.822867701535578e-07, + "logits/chosen": -2.6742734909057617, + "logits/rejected": -2.666698694229126, + "logps/chosen": -288.57415771484375, + "logps/rejected": -277.10028076171875, + "loss": 0.588, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17379966378211975, + "rewards/margins": 0.2467818558216095, + "rewards/margins_max": 0.7234400510787964, + "rewards/margins_min": -0.2264634668827057, + "rewards/margins_std": 0.42187896370887756, + "rewards/rejected": -0.42058151960372925, + "step": 1630 + }, + { + "epoch": 0.39, + "grad_norm": 18.82054437642546, + "learning_rate": 3.805087942492112e-07, + "logits/chosen": -2.644226312637329, + "logits/rejected": -2.61177659034729, + "logps/chosen": -289.1362609863281, + "logps/rejected": -297.9856872558594, + "loss": 0.5849, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.11222722381353378, + "rewards/margins": 0.2772018015384674, + "rewards/margins_max": 0.7367495894432068, + "rewards/margins_min": -0.14642205834388733, + "rewards/margins_std": 0.39903825521469116, + "rewards/rejected": -0.3894290328025818, + "step": 1640 + }, + { + "epoch": 0.4, + "grad_norm": 9.965038984024348, + "learning_rate": 3.787216977441814e-07, + "logits/chosen": -2.699742317199707, + "logits/rejected": -2.66479229927063, + "logps/chosen": -272.3443908691406, + "logps/rejected": -304.8819580078125, + "loss": 0.599, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.12519483268260956, + "rewards/margins": 0.2280396968126297, + "rewards/margins_max": 0.7019823789596558, + "rewards/margins_min": -0.29308685660362244, + "rewards/margins_std": 0.4494010806083679, + "rewards/rejected": -0.35323458909988403, + "step": 1650 + }, + { + "epoch": 0.4, + "grad_norm": 5.680025324160577, + "learning_rate": 3.7692560552961976e-07, + "logits/chosen": -2.693373203277588, + "logits/rejected": -2.65922474861145, + "logps/chosen": -267.5146789550781, + "logps/rejected": -293.3586120605469, + "loss": 0.6309, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16356855630874634, + "rewards/margins": 0.155622199177742, + "rewards/margins_max": 0.5817619562149048, + "rewards/margins_min": -0.26430314779281616, + "rewards/margins_std": 0.3766292929649353, + "rewards/rejected": -0.31919074058532715, + "step": 1660 + }, + { + "epoch": 0.4, + "grad_norm": 5.609989058268003, + "learning_rate": 3.7512064312534276e-07, + "logits/chosen": -2.6858413219451904, + "logits/rejected": -2.6196465492248535, + "logps/chosen": -321.91766357421875, + "logps/rejected": -328.6977233886719, + "loss": 0.5723, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07771825045347214, + "rewards/margins": 0.24791796505451202, + "rewards/margins_max": 0.837367057800293, + "rewards/margins_min": -0.2817845046520233, + "rewards/margins_std": 0.5053817629814148, + "rewards/rejected": -0.32563620805740356, + "step": 1670 + }, + { + "epoch": 0.4, + "grad_norm": 5.810323656426692, + "learning_rate": 3.7330693667105937e-07, + "logits/chosen": -2.7743630409240723, + "logits/rejected": -2.713573455810547, + "logps/chosen": -327.6851501464844, + "logps/rejected": -276.18072509765625, + "loss": 0.6031, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.08341021835803986, + "rewards/margins": 0.20441654324531555, + "rewards/margins_max": 0.6587679982185364, + "rewards/margins_min": -0.3237987160682678, + "rewards/margins_std": 0.4360232353210449, + "rewards/rejected": -0.2878267467021942, + "step": 1680 + }, + { + "epoch": 0.4, + "grad_norm": 8.753814737640813, + "learning_rate": 3.7148461291755626e-07, + "logits/chosen": -2.6960878372192383, + "logits/rejected": -2.667306900024414, + "logps/chosen": -290.73480224609375, + "logps/rejected": -329.62548828125, + "loss": 0.6176, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.19077444076538086, + "rewards/margins": 0.26856881380081177, + "rewards/margins_max": 0.6381527781486511, + "rewards/margins_min": -0.10217100381851196, + "rewards/margins_std": 0.327498197555542, + "rewards/rejected": -0.4593432545661926, + "step": 1690 + }, + { + "epoch": 0.41, + "grad_norm": 7.790281393627224, + "learning_rate": 3.6965379921783945e-07, + "logits/chosen": -2.7386374473571777, + "logits/rejected": -2.715653896331787, + "logps/chosen": -309.8696594238281, + "logps/rejected": -312.392578125, + "loss": 0.5979, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17000164091587067, + "rewards/margins": 0.2889346480369568, + "rewards/margins_max": 0.8153964281082153, + "rewards/margins_min": -0.23362848162651062, + "rewards/margins_std": 0.46885618567466736, + "rewards/rejected": -0.45893630385398865, + "step": 1700 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.677316427230835, + "eval_logits/rejected": -2.646608829498291, + "eval_logps/chosen": -305.7200622558594, + "eval_logps/rejected": -309.882080078125, + "eval_loss": 0.61611407995224, + "eval_rewards/accuracies": 0.6804999709129333, + "eval_rewards/chosen": -0.2126469910144806, + "eval_rewards/margins": 0.22764132916927338, + "eval_rewards/margins_max": 0.9152846336364746, + "eval_rewards/margins_min": -0.4468873143196106, + "eval_rewards/margins_std": 0.46239328384399414, + "eval_rewards/rejected": -0.44028833508491516, + "eval_runtime": 859.5423, + "eval_samples_per_second": 4.654, + "eval_steps_per_second": 0.291, + "step": 1700 + }, + { + "epoch": 0.41, + "grad_norm": 7.902971515243615, + "learning_rate": 3.6781462351823455e-07, + "logits/chosen": -2.692828416824341, + "logits/rejected": -2.6957290172576904, + "logps/chosen": -306.51361083984375, + "logps/rejected": -357.2091369628906, + "loss": 0.6187, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2036452293395996, + "rewards/margins": 0.17123371362686157, + "rewards/margins_max": 0.6390005946159363, + "rewards/margins_min": -0.3068637251853943, + "rewards/margins_std": 0.42158761620521545, + "rewards/rejected": -0.3748789429664612, + "step": 1710 + }, + { + "epoch": 0.41, + "grad_norm": 10.017763763451027, + "learning_rate": 3.6596721434944513e-07, + "logits/chosen": -2.7442879676818848, + "logits/rejected": -2.70235276222229, + "logps/chosen": -305.9901123046875, + "logps/rejected": -317.50653076171875, + "loss": 0.6225, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22123900055885315, + "rewards/margins": 0.16670717298984528, + "rewards/margins_max": 0.7052956819534302, + "rewards/margins_min": -0.2980489730834961, + "rewards/margins_std": 0.44336920976638794, + "rewards/rejected": -0.38794612884521484, + "step": 1720 + }, + { + "epoch": 0.41, + "grad_norm": 8.084320112999801, + "learning_rate": 3.6411170081757025e-07, + "logits/chosen": -2.7196030616760254, + "logits/rejected": -2.6944994926452637, + "logps/chosen": -303.96490478515625, + "logps/rejected": -293.9749450683594, + "loss": 0.6045, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11599443852901459, + "rewards/margins": 0.24298004806041718, + "rewards/margins_max": 0.7548770308494568, + "rewards/margins_min": -0.2850678563117981, + "rewards/margins_std": 0.472770631313324, + "rewards/rejected": -0.35897451639175415, + "step": 1730 + }, + { + "epoch": 0.42, + "grad_norm": 5.6218213743034555, + "learning_rate": 3.622482125950821e-07, + "logits/chosen": -2.7411789894104004, + "logits/rejected": -2.7309060096740723, + "logps/chosen": -330.2281188964844, + "logps/rejected": -325.1976013183594, + "loss": 0.6017, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19880354404449463, + "rewards/margins": 0.25061336159706116, + "rewards/margins_max": 0.6677745580673218, + "rewards/margins_min": -0.18377824127674103, + "rewards/margins_std": 0.3760093152523041, + "rewards/rejected": -0.4494169354438782, + "step": 1740 + }, + { + "epoch": 0.42, + "grad_norm": 6.509142388314121, + "learning_rate": 3.603768799117637e-07, + "logits/chosen": -2.687448263168335, + "logits/rejected": -2.6664681434631348, + "logps/chosen": -311.8000793457031, + "logps/rejected": -307.2388610839844, + "loss": 0.6217, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2506563365459442, + "rewards/margins": 0.2387455701828003, + "rewards/margins_max": 0.7983524799346924, + "rewards/margins_min": -0.26707369089126587, + "rewards/margins_std": 0.4749852120876312, + "rewards/rejected": -0.4894019067287445, + "step": 1750 + }, + { + "epoch": 0.42, + "grad_norm": 4.002326288879537, + "learning_rate": 3.584978335456078e-07, + "logits/chosen": -2.6560323238372803, + "logits/rejected": -2.6839094161987305, + "logps/chosen": -289.1376647949219, + "logps/rejected": -336.69024658203125, + "loss": 0.5897, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.15337905287742615, + "rewards/margins": 0.2918064296245575, + "rewards/margins_max": 0.7951470613479614, + "rewards/margins_min": -0.20761564373970032, + "rewards/margins_std": 0.4645652770996094, + "rewards/rejected": -0.4451855719089508, + "step": 1760 + }, + { + "epoch": 0.42, + "grad_norm": 6.809309185912729, + "learning_rate": 3.5661120481367757e-07, + "logits/chosen": -2.770693302154541, + "logits/rejected": -2.741415023803711, + "logps/chosen": -338.86456298828125, + "logps/rejected": -325.62506103515625, + "loss": 0.6122, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1604558527469635, + "rewards/margins": 0.2866327166557312, + "rewards/margins_max": 0.7840811610221863, + "rewards/margins_min": -0.2536107301712036, + "rewards/margins_std": 0.4627218246459961, + "rewards/rejected": -0.4470886290073395, + "step": 1770 + }, + { + "epoch": 0.43, + "grad_norm": 4.392717941861058, + "learning_rate": 3.547171255629292e-07, + "logits/chosen": -2.64208722114563, + "logits/rejected": -2.5881106853485107, + "logps/chosen": -280.07275390625, + "logps/rejected": -261.87738037109375, + "loss": 0.568, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.09692516177892685, + "rewards/margins": 0.3377645015716553, + "rewards/margins_max": 0.7657067179679871, + "rewards/margins_min": -0.13990791141986847, + "rewards/margins_std": 0.3938554525375366, + "rewards/rejected": -0.4346896708011627, + "step": 1780 + }, + { + "epoch": 0.43, + "grad_norm": 6.009492041627867, + "learning_rate": 3.528157281609984e-07, + "logits/chosen": -2.6666150093078613, + "logits/rejected": -2.6736645698547363, + "logps/chosen": -234.13223266601562, + "logps/rejected": -236.21127319335938, + "loss": 0.6364, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.20780625939369202, + "rewards/margins": 0.19026105105876923, + "rewards/margins_max": 0.673338770866394, + "rewards/margins_min": -0.265020489692688, + "rewards/margins_std": 0.41245418787002563, + "rewards/rejected": -0.39806729555130005, + "step": 1790 + }, + { + "epoch": 0.43, + "grad_norm": 5.409885132534405, + "learning_rate": 3.5090714548694916e-07, + "logits/chosen": -2.5533435344696045, + "logits/rejected": -2.557516574859619, + "logps/chosen": -346.5416259765625, + "logps/rejected": -326.44036865234375, + "loss": 0.6034, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.15006712079048157, + "rewards/margins": 0.21542784571647644, + "rewards/margins_max": 0.715800940990448, + "rewards/margins_min": -0.31234538555145264, + "rewards/margins_std": 0.468539297580719, + "rewards/rejected": -0.365494966506958, + "step": 1800 + }, + { + "epoch": 0.43, + "eval_logits/chosen": -2.6671693325042725, + "eval_logits/rejected": -2.636549949645996, + "eval_logps/chosen": -300.97119140625, + "eval_logps/rejected": -305.9888610839844, + "eval_loss": 0.6123643517494202, + "eval_rewards/accuracies": 0.6804999709129333, + "eval_rewards/chosen": -0.1651587039232254, + "eval_rewards/margins": 0.23619771003723145, + "eval_rewards/margins_max": 0.9409818649291992, + "eval_rewards/margins_min": -0.45071518421173096, + "eval_rewards/margins_std": 0.4725970923900604, + "eval_rewards/rejected": -0.40135645866394043, + "eval_runtime": 859.8464, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 1800 + }, + { + "epoch": 0.43, + "grad_norm": 3.8143147386331404, + "learning_rate": 3.489915109219882e-07, + "logits/chosen": -2.643347978591919, + "logits/rejected": -2.603933811187744, + "logps/chosen": -268.88848876953125, + "logps/rejected": -270.46148681640625, + "loss": 0.5921, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2317378968000412, + "rewards/margins": 0.2574358284473419, + "rewards/margins_max": 0.6683996915817261, + "rewards/margins_min": -0.17655006051063538, + "rewards/margins_std": 0.3807302713394165, + "rewards/rejected": -0.4891737103462219, + "step": 1810 + }, + { + "epoch": 0.44, + "grad_norm": 7.259545455028369, + "learning_rate": 3.4706895834014294e-07, + "logits/chosen": -2.7250919342041016, + "logits/rejected": -2.6996560096740723, + "logps/chosen": -313.46527099609375, + "logps/rejected": -324.8810119628906, + "loss": 0.6077, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.20322100818157196, + "rewards/margins": 0.21102142333984375, + "rewards/margins_max": 0.6702043414115906, + "rewards/margins_min": -0.25847309827804565, + "rewards/margins_std": 0.41045159101486206, + "rewards/rejected": -0.4142424166202545, + "step": 1820 + }, + { + "epoch": 0.44, + "grad_norm": 9.341131041045113, + "learning_rate": 3.451396220989064e-07, + "logits/chosen": -2.740797519683838, + "logits/rejected": -2.670044183731079, + "logps/chosen": -295.61309814453125, + "logps/rejected": -281.2939758300781, + "loss": 0.5669, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.14443984627723694, + "rewards/margins": 0.23712129890918732, + "rewards/margins_max": 0.760201632976532, + "rewards/margins_min": -0.28850245475769043, + "rewards/margins_std": 0.48045676946640015, + "rewards/rejected": -0.38156113028526306, + "step": 1830 + }, + { + "epoch": 0.44, + "grad_norm": 8.45587869239201, + "learning_rate": 3.43203637029847e-07, + "logits/chosen": -2.7226665019989014, + "logits/rejected": -2.6695210933685303, + "logps/chosen": -357.6163024902344, + "logps/rejected": -322.4307556152344, + "loss": 0.6314, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.20505478978157043, + "rewards/margins": 0.16338197886943817, + "rewards/margins_max": 0.6402610540390015, + "rewards/margins_min": -0.3119858205318451, + "rewards/margins_std": 0.4293789267539978, + "rewards/rejected": -0.3684367537498474, + "step": 1840 + }, + { + "epoch": 0.44, + "grad_norm": 5.549724352065707, + "learning_rate": 3.4126113842918643e-07, + "logits/chosen": -2.695279598236084, + "logits/rejected": -2.6542508602142334, + "logps/chosen": -289.8641052246094, + "logps/rejected": -282.6558532714844, + "loss": 0.59, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.06596537679433823, + "rewards/margins": 0.26139482855796814, + "rewards/margins_max": 0.7130376696586609, + "rewards/margins_min": -0.16647562384605408, + "rewards/margins_std": 0.39078274369239807, + "rewards/rejected": -0.32736021280288696, + "step": 1850 + }, + { + "epoch": 0.45, + "grad_norm": 4.267178095524697, + "learning_rate": 3.3931226204834397e-07, + "logits/chosen": -2.749413013458252, + "logits/rejected": -2.7500834465026855, + "logps/chosen": -342.98895263671875, + "logps/rejected": -348.8139343261719, + "loss": 0.6153, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16457152366638184, + "rewards/margins": 0.31747838854789734, + "rewards/margins_max": 0.7840696573257446, + "rewards/margins_min": -0.11327596008777618, + "rewards/margins_std": 0.40646958351135254, + "rewards/rejected": -0.48204994201660156, + "step": 1860 + }, + { + "epoch": 0.45, + "grad_norm": 6.036927076368459, + "learning_rate": 3.3735714408445e-07, + "logits/chosen": -2.6747236251831055, + "logits/rejected": -2.6891233921051025, + "logps/chosen": -266.5842590332031, + "logps/rejected": -293.5079040527344, + "loss": 0.6045, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.15758280456066132, + "rewards/margins": 0.19839924573898315, + "rewards/margins_max": 0.7733598947525024, + "rewards/margins_min": -0.34084218740463257, + "rewards/margins_std": 0.50069260597229, + "rewards/rejected": -0.3559820055961609, + "step": 1870 + }, + { + "epoch": 0.45, + "grad_norm": 4.122374190150992, + "learning_rate": 3.3539592117082746e-07, + "logits/chosen": -2.6495654582977295, + "logits/rejected": -2.6142737865448, + "logps/chosen": -293.51312255859375, + "logps/rejected": -313.4895324707031, + "loss": 0.6016, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.12317116558551788, + "rewards/margins": 0.25961264967918396, + "rewards/margins_max": 0.7596412301063538, + "rewards/margins_min": -0.23033156991004944, + "rewards/margins_std": 0.45355939865112305, + "rewards/rejected": -0.3827837407588959, + "step": 1880 + }, + { + "epoch": 0.45, + "grad_norm": 11.889700373667367, + "learning_rate": 3.3342873036744346e-07, + "logits/chosen": -2.7058122158050537, + "logits/rejected": -2.6883273124694824, + "logps/chosen": -307.9171447753906, + "logps/rejected": -334.7265625, + "loss": 0.6101, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.08564779907464981, + "rewards/margins": 0.26072975993156433, + "rewards/margins_max": 0.7731336355209351, + "rewards/margins_min": -0.2751905620098114, + "rewards/margins_std": 0.46360093355178833, + "rewards/rejected": -0.34637758135795593, + "step": 1890 + }, + { + "epoch": 0.45, + "grad_norm": 7.893029297996013, + "learning_rate": 3.3145570915133067e-07, + "logits/chosen": -2.6629598140716553, + "logits/rejected": -2.6108360290527344, + "logps/chosen": -269.05572509765625, + "logps/rejected": -288.2684631347656, + "loss": 0.5983, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04458091780543327, + "rewards/margins": 0.2424008548259735, + "rewards/margins_max": 0.7750869393348694, + "rewards/margins_min": -0.23572340607643127, + "rewards/margins_std": 0.4512661099433899, + "rewards/rejected": -0.2869817912578583, + "step": 1900 + }, + { + "epoch": 0.45, + "eval_logits/chosen": -2.6688740253448486, + "eval_logits/rejected": -2.6388726234436035, + "eval_logps/chosen": -289.76275634765625, + "eval_logps/rejected": -293.2796630859375, + "eval_loss": 0.6144084334373474, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -0.05307444930076599, + "eval_rewards/margins": 0.22118933498859406, + "eval_rewards/margins_max": 0.8922848701477051, + "eval_rewards/margins_min": -0.3931286036968231, + "eval_rewards/margins_std": 0.4326675534248352, + "eval_rewards/rejected": -0.27426376938819885, + "eval_runtime": 859.8622, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 1900 + }, + { + "epoch": 0.46, + "grad_norm": 9.609182067422951, + "learning_rate": 3.294769954069802e-07, + "logits/chosen": -2.6731858253479004, + "logits/rejected": -2.631958484649658, + "logps/chosen": -292.23406982421875, + "logps/rejected": -290.16107177734375, + "loss": 0.6018, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.07169095426797867, + "rewards/margins": 0.2285386621952057, + "rewards/margins_max": 0.7320543527603149, + "rewards/margins_min": -0.2693322002887726, + "rewards/margins_std": 0.45920902490615845, + "rewards/rejected": -0.30022960901260376, + "step": 1910 + }, + { + "epoch": 0.46, + "grad_norm": 6.526344610937495, + "learning_rate": 3.274927274167048e-07, + "logits/chosen": -2.6619555950164795, + "logits/rejected": -2.6596617698669434, + "logps/chosen": -276.9857482910156, + "logps/rejected": -289.47235107421875, + "loss": 0.6043, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10402724891901016, + "rewards/margins": 0.28490519523620605, + "rewards/margins_max": 0.7223531603813171, + "rewards/margins_min": -0.18868054449558258, + "rewards/margins_std": 0.4100664258003235, + "rewards/rejected": -0.3889324367046356, + "step": 1920 + }, + { + "epoch": 0.46, + "grad_norm": 4.150434512635606, + "learning_rate": 3.2550304385097575e-07, + "logits/chosen": -2.7032723426818848, + "logits/rejected": -2.6790528297424316, + "logps/chosen": -290.4239807128906, + "logps/rejected": -281.20416259765625, + "loss": 0.5963, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.13188329339027405, + "rewards/margins": 0.26092013716697693, + "rewards/margins_max": 0.7271699905395508, + "rewards/margins_min": -0.19916054606437683, + "rewards/margins_std": 0.4129628539085388, + "rewards/rejected": -0.3928033709526062, + "step": 1930 + }, + { + "epoch": 0.46, + "grad_norm": 12.036491223428849, + "learning_rate": 3.235080837587314e-07, + "logits/chosen": -2.6926076412200928, + "logits/rejected": -2.672503709793091, + "logps/chosen": -238.70413208007812, + "logps/rejected": -302.2066345214844, + "loss": 0.6008, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2117839753627777, + "rewards/margins": 0.24341578781604767, + "rewards/margins_max": 0.7401986122131348, + "rewards/margins_min": -0.19410011172294617, + "rewards/margins_std": 0.4155648648738861, + "rewards/rejected": -0.4551997780799866, + "step": 1940 + }, + { + "epoch": 0.47, + "grad_norm": 6.093328373334495, + "learning_rate": 3.215079865576599e-07, + "logits/chosen": -2.666715145111084, + "logits/rejected": -2.703641653060913, + "logps/chosen": -293.8852844238281, + "logps/rejected": -313.35821533203125, + "loss": 0.5883, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.12059396505355835, + "rewards/margins": 0.25971904397010803, + "rewards/margins_max": 0.8864032626152039, + "rewards/margins_min": -0.2718166708946228, + "rewards/margins_std": 0.5048220753669739, + "rewards/rejected": -0.380312979221344, + "step": 1950 + }, + { + "epoch": 0.47, + "grad_norm": 12.090046949019355, + "learning_rate": 3.1950289202445594e-07, + "logits/chosen": -2.6468160152435303, + "logits/rejected": -2.639539957046509, + "logps/chosen": -290.692138671875, + "logps/rejected": -307.38800048828125, + "loss": 0.5841, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.13301345705986023, + "rewards/margins": 0.2568240165710449, + "rewards/margins_max": 0.7523486018180847, + "rewards/margins_min": -0.24699096381664276, + "rewards/margins_std": 0.4464823603630066, + "rewards/rejected": -0.3898375630378723, + "step": 1960 + }, + { + "epoch": 0.47, + "grad_norm": 5.265987757290794, + "learning_rate": 3.174929402850528e-07, + "logits/chosen": -2.7697157859802246, + "logits/rejected": -2.6936728954315186, + "logps/chosen": -293.42962646484375, + "logps/rejected": -291.2169494628906, + "loss": 0.6268, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.14020130038261414, + "rewards/margins": 0.163181871175766, + "rewards/margins_max": 0.7737967371940613, + "rewards/margins_min": -0.38077330589294434, + "rewards/margins_std": 0.5128040313720703, + "rewards/rejected": -0.3033831715583801, + "step": 1970 + }, + { + "epoch": 0.47, + "grad_norm": 8.041305243979703, + "learning_rate": 3.15478271804829e-07, + "logits/chosen": -2.6651408672332764, + "logits/rejected": -2.6439590454101562, + "logps/chosen": -309.2423095703125, + "logps/rejected": -325.1023254394531, + "loss": 0.5719, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.058884888887405396, + "rewards/margins": 0.3680292069911957, + "rewards/margins_max": 0.8800289034843445, + "rewards/margins_min": -0.12628303468227386, + "rewards/margins_std": 0.4714101254940033, + "rewards/rejected": -0.4269140660762787, + "step": 1980 + }, + { + "epoch": 0.48, + "grad_norm": 3.27675225941295, + "learning_rate": 3.1345902737879257e-07, + "logits/chosen": -2.583885669708252, + "logits/rejected": -2.5821287631988525, + "logps/chosen": -279.5968933105469, + "logps/rejected": -300.0899353027344, + "loss": 0.5977, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.17092658579349518, + "rewards/margins": 0.29685845971107483, + "rewards/margins_max": 0.8626619577407837, + "rewards/margins_min": -0.19954678416252136, + "rewards/margins_std": 0.4820861220359802, + "rewards/rejected": -0.4677850306034088, + "step": 1990 + }, + { + "epoch": 0.48, + "grad_norm": 6.900620872096232, + "learning_rate": 3.1143534812174103e-07, + "logits/chosen": -2.7087361812591553, + "logits/rejected": -2.683499813079834, + "logps/chosen": -329.2854919433594, + "logps/rejected": -306.0649108886719, + "loss": 0.5822, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0919618010520935, + "rewards/margins": 0.4252632260322571, + "rewards/margins_max": 1.0302584171295166, + "rewards/margins_min": -0.3328133523464203, + "rewards/margins_std": 0.6099307537078857, + "rewards/rejected": -0.5172249674797058, + "step": 2000 + }, + { + "epoch": 0.48, + "eval_logits/chosen": -2.6678671836853027, + "eval_logits/rejected": -2.63775634765625, + "eval_logps/chosen": -299.4801025390625, + "eval_logps/rejected": -306.8108825683594, + "eval_loss": 0.6049104332923889, + "eval_rewards/accuracies": 0.6884999871253967, + "eval_rewards/chosen": -0.15024752914905548, + "eval_rewards/margins": 0.25932812690734863, + "eval_rewards/margins_max": 1.00696861743927, + "eval_rewards/margins_min": -0.4697433412075043, + "eval_rewards/margins_std": 0.49979886412620544, + "eval_rewards/rejected": -0.4095756411552429, + "eval_runtime": 859.8794, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 2000 + }, + { + "epoch": 0.48, + "grad_norm": 4.974932968224806, + "learning_rate": 3.094073754584001e-07, + "logits/chosen": -2.641544818878174, + "logits/rejected": -2.602346420288086, + "logps/chosen": -295.3077697753906, + "logps/rejected": -297.1141357421875, + "loss": 0.5751, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13280975818634033, + "rewards/margins": 0.3550918996334076, + "rewards/margins_max": 0.9477798342704773, + "rewards/margins_min": -0.1992892622947693, + "rewards/margins_std": 0.5173491835594177, + "rewards/rejected": -0.4879016876220703, + "step": 2010 + }, + { + "epoch": 0.48, + "grad_norm": 8.83065289789081, + "learning_rate": 3.0737525111353976e-07, + "logits/chosen": -2.7229080200195312, + "logits/rejected": -2.699981451034546, + "logps/chosen": -305.264404296875, + "logps/rejected": -297.63458251953125, + "loss": 0.5828, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1455700546503067, + "rewards/margins": 0.26840436458587646, + "rewards/margins_max": 0.7454306483268738, + "rewards/margins_min": -0.25561413168907166, + "rewards/margins_std": 0.45743808150291443, + "rewards/rejected": -0.4139743745326996, + "step": 2020 + }, + { + "epoch": 0.49, + "grad_norm": 5.411815799169501, + "learning_rate": 3.053391171020702e-07, + "logits/chosen": -2.6677589416503906, + "logits/rejected": -2.625202178955078, + "logps/chosen": -316.2569885253906, + "logps/rejected": -316.2462158203125, + "loss": 0.5928, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14155061542987823, + "rewards/margins": 0.29246291518211365, + "rewards/margins_max": 0.8275464773178101, + "rewards/margins_min": -0.31466326117515564, + "rewards/margins_std": 0.5112706422805786, + "rewards/rejected": -0.4340135455131531, + "step": 2030 + }, + { + "epoch": 0.49, + "grad_norm": 7.476710972761675, + "learning_rate": 3.0329911571911693e-07, + "logits/chosen": -2.6282687187194824, + "logits/rejected": -2.6319591999053955, + "logps/chosen": -278.00897216796875, + "logps/rejected": -305.495361328125, + "loss": 0.6206, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24457895755767822, + "rewards/margins": 0.20937521755695343, + "rewards/margins_max": 0.6903196573257446, + "rewards/margins_min": -0.41123127937316895, + "rewards/margins_std": 0.5036042332649231, + "rewards/rejected": -0.45395416021347046, + "step": 2040 + }, + { + "epoch": 0.49, + "grad_norm": 9.284960688937632, + "learning_rate": 3.012553895300765e-07, + "logits/chosen": -2.6002590656280518, + "logits/rejected": -2.6011807918548584, + "logps/chosen": -290.0956115722656, + "logps/rejected": -310.40155029296875, + "loss": 0.5935, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2200767546892166, + "rewards/margins": 0.3369477093219757, + "rewards/margins_max": 0.8728636503219604, + "rewards/margins_min": -0.18612375855445862, + "rewards/margins_std": 0.4747312664985657, + "rewards/rejected": -0.5570244193077087, + "step": 2050 + }, + { + "epoch": 0.49, + "grad_norm": 11.77746164949279, + "learning_rate": 2.9920808136065336e-07, + "logits/chosen": -2.7036004066467285, + "logits/rejected": -2.667642116546631, + "logps/chosen": -293.30377197265625, + "logps/rejected": -308.8804931640625, + "loss": 0.6148, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.13821843266487122, + "rewards/margins": 0.24729657173156738, + "rewards/margins_max": 0.877963662147522, + "rewards/margins_min": -0.2980964481830597, + "rewards/margins_std": 0.539368748664856, + "rewards/rejected": -0.385515034198761, + "step": 2060 + }, + { + "epoch": 0.5, + "grad_norm": 6.111162397381366, + "learning_rate": 2.971573342868786e-07, + "logits/chosen": -2.7201285362243652, + "logits/rejected": -2.667332887649536, + "logps/chosen": -269.16302490234375, + "logps/rejected": -274.974609375, + "loss": 0.5877, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12460535764694214, + "rewards/margins": 0.3083706498146057, + "rewards/margins_max": 0.8954262733459473, + "rewards/margins_min": -0.2783167362213135, + "rewards/margins_std": 0.5282653570175171, + "rewards/rejected": -0.4329760670661926, + "step": 2070 + }, + { + "epoch": 0.5, + "grad_norm": 10.42434722539899, + "learning_rate": 2.9510329162511054e-07, + "logits/chosen": -2.606055974960327, + "logits/rejected": -2.6232898235321045, + "logps/chosen": -314.437744140625, + "logps/rejected": -303.1037902832031, + "loss": 0.6138, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11569087207317352, + "rewards/margins": 0.2684437930583954, + "rewards/margins_max": 0.8543025851249695, + "rewards/margins_min": -0.26375895738601685, + "rewards/margins_std": 0.49721455574035645, + "rewards/rejected": -0.3841346502304077, + "step": 2080 + }, + { + "epoch": 0.5, + "grad_norm": 5.566886629490982, + "learning_rate": 2.930460969220202e-07, + "logits/chosen": -2.7032504081726074, + "logits/rejected": -2.6360905170440674, + "logps/chosen": -275.11688232421875, + "logps/rejected": -322.1624450683594, + "loss": 0.5975, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1841132640838623, + "rewards/margins": 0.2289840430021286, + "rewards/margins_max": 0.8241883516311646, + "rewards/margins_min": -0.3731764554977417, + "rewards/margins_std": 0.5407173037528992, + "rewards/rejected": -0.4130973219871521, + "step": 2090 + }, + { + "epoch": 0.5, + "grad_norm": 4.855753539641828, + "learning_rate": 2.909858939445584e-07, + "logits/chosen": -2.6886954307556152, + "logits/rejected": -2.665351390838623, + "logps/chosen": -288.3397521972656, + "logps/rejected": -290.7861328125, + "loss": 0.6013, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2644990086555481, + "rewards/margins": 0.21240103244781494, + "rewards/margins_max": 0.6695704460144043, + "rewards/margins_min": -0.22528037428855896, + "rewards/margins_std": 0.396864116191864, + "rewards/rejected": -0.4769001007080078, + "step": 2100 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -2.6592962741851807, + "eval_logits/rejected": -2.6289308071136475, + "eval_logps/chosen": -302.33001708984375, + "eval_logps/rejected": -310.3860168457031, + "eval_loss": 0.6034172177314758, + "eval_rewards/accuracies": 0.6869999766349792, + "eval_rewards/chosen": -0.17874710261821747, + "eval_rewards/margins": 0.2665804624557495, + "eval_rewards/margins_max": 1.0330960750579834, + "eval_rewards/margins_min": -0.4818904399871826, + "eval_rewards/margins_std": 0.5136880874633789, + "eval_rewards/rejected": -0.4453275799751282, + "eval_runtime": 859.4362, + "eval_samples_per_second": 4.654, + "eval_steps_per_second": 0.291, + "step": 2100 + }, + { + "epoch": 0.51, + "grad_norm": 7.587780022520818, + "learning_rate": 2.8892282666990894e-07, + "logits/chosen": -2.68300461769104, + "logits/rejected": -2.6219592094421387, + "logps/chosen": -285.99542236328125, + "logps/rejected": -277.61529541015625, + "loss": 0.5884, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11673027276992798, + "rewards/margins": 0.3741188049316406, + "rewards/margins_max": 1.0100022554397583, + "rewards/margins_min": -0.2071894407272339, + "rewards/margins_std": 0.5453618168830872, + "rewards/rejected": -0.4908490777015686, + "step": 2110 + }, + { + "epoch": 0.51, + "grad_norm": 5.437760971646844, + "learning_rate": 2.868570392754272e-07, + "logits/chosen": -2.724592447280884, + "logits/rejected": -2.70935320854187, + "logps/chosen": -336.3945617675781, + "logps/rejected": -362.13507080078125, + "loss": 0.5846, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.1303412765264511, + "rewards/margins": 0.3308197855949402, + "rewards/margins_max": 0.8329311609268188, + "rewards/margins_min": -0.2632693648338318, + "rewards/margins_std": 0.4918765425682068, + "rewards/rejected": -0.4611610770225525, + "step": 2120 + }, + { + "epoch": 0.51, + "grad_norm": 7.312307762814366, + "learning_rate": 2.8478867612856394e-07, + "logits/chosen": -2.707106828689575, + "logits/rejected": -2.6523804664611816, + "logps/chosen": -302.043212890625, + "logps/rejected": -284.9640808105469, + "loss": 0.5916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11796555668115616, + "rewards/margins": 0.3160025477409363, + "rewards/margins_max": 0.8329175710678101, + "rewards/margins_min": -0.27798742055892944, + "rewards/margins_std": 0.5030540227890015, + "rewards/rejected": -0.43396812677383423, + "step": 2130 + }, + { + "epoch": 0.51, + "grad_norm": 9.871464736301247, + "learning_rate": 2.827178817767762e-07, + "logits/chosen": -2.615812063217163, + "logits/rejected": -2.598005533218384, + "logps/chosen": -308.57672119140625, + "logps/rejected": -289.3555908203125, + "loss": 0.6014, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.07780320197343826, + "rewards/margins": 0.2799312472343445, + "rewards/margins_max": 0.8629264831542969, + "rewards/margins_min": -0.21283754706382751, + "rewards/margins_std": 0.4856076240539551, + "rewards/rejected": -0.35773441195487976, + "step": 2140 + }, + { + "epoch": 0.51, + "grad_norm": 8.961394693273633, + "learning_rate": 2.8064480093742565e-07, + "logits/chosen": -2.686300277709961, + "logits/rejected": -2.680666208267212, + "logps/chosen": -261.29705810546875, + "logps/rejected": -282.45556640625, + "loss": 0.5678, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.11843335628509521, + "rewards/margins": 0.29204291105270386, + "rewards/margins_max": 0.8688719868659973, + "rewards/margins_min": -0.21983151137828827, + "rewards/margins_std": 0.5022950172424316, + "rewards/rejected": -0.4104762077331543, + "step": 2150 + }, + { + "epoch": 0.52, + "grad_norm": 11.224170108694283, + "learning_rate": 2.7856957848766497e-07, + "logits/chosen": -2.6662299633026123, + "logits/rejected": -2.587806224822998, + "logps/chosen": -290.3630065917969, + "logps/rejected": -305.7333984375, + "loss": 0.591, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2064693421125412, + "rewards/margins": 0.29201430082321167, + "rewards/margins_max": 0.8517812490463257, + "rewards/margins_min": -0.199358269572258, + "rewards/margins_std": 0.47122445702552795, + "rewards/rejected": -0.49848371744155884, + "step": 2160 + }, + { + "epoch": 0.52, + "grad_norm": 7.270993585287031, + "learning_rate": 2.7649235945431336e-07, + "logits/chosen": -2.64408540725708, + "logits/rejected": -2.611272096633911, + "logps/chosen": -309.9103088378906, + "logps/rejected": -368.58514404296875, + "loss": 0.5759, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.16110315918922424, + "rewards/margins": 0.27896901965141296, + "rewards/margins_max": 0.7961828708648682, + "rewards/margins_min": -0.22307395935058594, + "rewards/margins_std": 0.4642201066017151, + "rewards/rejected": -0.4400722086429596, + "step": 2170 + }, + { + "epoch": 0.52, + "grad_norm": 8.25762679397384, + "learning_rate": 2.74413289003721e-07, + "logits/chosen": -2.6903395652770996, + "logits/rejected": -2.671844005584717, + "logps/chosen": -302.89886474609375, + "logps/rejected": -312.398193359375, + "loss": 0.5831, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1994825303554535, + "rewards/margins": 0.28639334440231323, + "rewards/margins_max": 0.870135486125946, + "rewards/margins_min": -0.27219831943511963, + "rewards/margins_std": 0.5184741616249084, + "rewards/rejected": -0.4858759343624115, + "step": 2180 + }, + { + "epoch": 0.52, + "grad_norm": 13.975736027478682, + "learning_rate": 2.7233251243162434e-07, + "logits/chosen": -2.672729015350342, + "logits/rejected": -2.6457486152648926, + "logps/chosen": -326.508056640625, + "logps/rejected": -334.08685302734375, + "loss": 0.5922, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.11852113902568817, + "rewards/margins": 0.3247258961200714, + "rewards/margins_max": 0.8556135892868042, + "rewards/margins_min": -0.1976144015789032, + "rewards/margins_std": 0.4724816381931305, + "rewards/rejected": -0.4432470202445984, + "step": 2190 + }, + { + "epoch": 0.53, + "grad_norm": 6.569085005357838, + "learning_rate": 2.7025017515299207e-07, + "logits/chosen": -2.640963077545166, + "logits/rejected": -2.610414743423462, + "logps/chosen": -293.7982482910156, + "logps/rejected": -280.1280822753906, + "loss": 0.6018, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.20703542232513428, + "rewards/margins": 0.19029325246810913, + "rewards/margins_max": 0.7511401176452637, + "rewards/margins_min": -0.3581925928592682, + "rewards/margins_std": 0.49416694045066833, + "rewards/rejected": -0.3973286747932434, + "step": 2200 + }, + { + "epoch": 0.53, + "eval_logits/chosen": -2.6584744453430176, + "eval_logits/rejected": -2.6287267208099365, + "eval_logps/chosen": -300.1773376464844, + "eval_logps/rejected": -308.8055419921875, + "eval_loss": 0.6019492149353027, + "eval_rewards/accuracies": 0.6924999952316284, + "eval_rewards/chosen": -0.157219797372818, + "eval_rewards/margins": 0.2723027765750885, + "eval_rewards/margins_max": 1.0473430156707764, + "eval_rewards/margins_min": -0.48959511518478394, + "eval_rewards/margins_std": 0.5205263495445251, + "eval_rewards/rejected": -0.4295225143432617, + "eval_runtime": 859.8763, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 2200 + }, + { + "epoch": 0.53, + "grad_norm": 16.78526397379326, + "learning_rate": 2.6816642269186275e-07, + "logits/chosen": -2.6230361461639404, + "logits/rejected": -2.616010904312134, + "logps/chosen": -305.01812744140625, + "logps/rejected": -302.0491027832031, + "loss": 0.5996, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21043619513511658, + "rewards/margins": 0.22241902351379395, + "rewards/margins_max": 0.7658696174621582, + "rewards/margins_min": -0.2654082179069519, + "rewards/margins_std": 0.4638918340206146, + "rewards/rejected": -0.4328552186489105, + "step": 2210 + }, + { + "epoch": 0.53, + "grad_norm": 5.8733901395154255, + "learning_rate": 2.660814006711748e-07, + "logits/chosen": -2.6130030155181885, + "logits/rejected": -2.6518306732177734, + "logps/chosen": -283.50518798828125, + "logps/rejected": -329.78179931640625, + "loss": 0.6255, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.27469900250434875, + "rewards/margins": 0.13242587447166443, + "rewards/margins_max": 0.6401777267456055, + "rewards/margins_min": -0.43729621171951294, + "rewards/margins_std": 0.47277194261550903, + "rewards/rejected": -0.40712490677833557, + "step": 2220 + }, + { + "epoch": 0.53, + "grad_norm": 8.27693449613565, + "learning_rate": 2.639952548025899e-07, + "logits/chosen": -2.700397491455078, + "logits/rejected": -2.6179628372192383, + "logps/chosen": -320.1406555175781, + "logps/rejected": -288.6668701171875, + "loss": 0.611, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1986890733242035, + "rewards/margins": 0.23419936001300812, + "rewards/margins_max": 0.895412266254425, + "rewards/margins_min": -0.37990817427635193, + "rewards/margins_std": 0.5677480697631836, + "rewards/rejected": -0.4328884482383728, + "step": 2230 + }, + { + "epoch": 0.54, + "grad_norm": 9.954155549694788, + "learning_rate": 2.619081308763097e-07, + "logits/chosen": -2.64445424079895, + "logits/rejected": -2.6310877799987793, + "logps/chosen": -312.2311706542969, + "logps/rejected": -307.53765869140625, + "loss": 0.5786, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1756788194179535, + "rewards/margins": 0.31714627146720886, + "rewards/margins_max": 0.8764044046401978, + "rewards/margins_min": -0.23695509135723114, + "rewards/margins_std": 0.5006457567214966, + "rewards/rejected": -0.49282512068748474, + "step": 2240 + }, + { + "epoch": 0.54, + "grad_norm": 6.119155586679762, + "learning_rate": 2.598201747508875e-07, + "logits/chosen": -2.6621711254119873, + "logits/rejected": -2.6769607067108154, + "logps/chosen": -330.69952392578125, + "logps/rejected": -344.57928466796875, + "loss": 0.5783, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1518358588218689, + "rewards/margins": 0.33964893221855164, + "rewards/margins_max": 0.8550474047660828, + "rewards/margins_min": -0.2645668685436249, + "rewards/margins_std": 0.507433295249939, + "rewards/rejected": -0.4914848208427429, + "step": 2250 + }, + { + "epoch": 0.54, + "grad_norm": 10.487767678301395, + "learning_rate": 2.577315323430346e-07, + "logits/chosen": -2.652210235595703, + "logits/rejected": -2.6248786449432373, + "logps/chosen": -309.04949951171875, + "logps/rejected": -323.46954345703125, + "loss": 0.5816, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.19222597777843475, + "rewards/margins": 0.30796346068382263, + "rewards/margins_max": 0.8325638771057129, + "rewards/margins_min": -0.24727854132652283, + "rewards/margins_std": 0.48955899477005005, + "rewards/rejected": -0.5001894235610962, + "step": 2260 + }, + { + "epoch": 0.54, + "grad_norm": 6.923352912553123, + "learning_rate": 2.5564234961742315e-07, + "logits/chosen": -2.677743434906006, + "logits/rejected": -2.652365207672119, + "logps/chosen": -349.2780456542969, + "logps/rejected": -337.13922119140625, + "loss": 0.5919, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13047567009925842, + "rewards/margins": 0.3868805170059204, + "rewards/margins_max": 0.9608567357063293, + "rewards/margins_min": -0.10169659554958344, + "rewards/margins_std": 0.47666463255882263, + "rewards/rejected": -0.5173561573028564, + "step": 2270 + }, + { + "epoch": 0.55, + "grad_norm": 8.10905605857629, + "learning_rate": 2.5355277257648553e-07, + "logits/chosen": -2.69606351852417, + "logits/rejected": -2.6536648273468018, + "logps/chosen": -288.4073486328125, + "logps/rejected": -295.1745300292969, + "loss": 0.5857, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.19543218612670898, + "rewards/margins": 0.2333766520023346, + "rewards/margins_max": 0.7897024154663086, + "rewards/margins_min": -0.3436315655708313, + "rewards/margins_std": 0.5261818170547485, + "rewards/rejected": -0.42880886793136597, + "step": 2280 + }, + { + "epoch": 0.55, + "grad_norm": 11.325778312969543, + "learning_rate": 2.514629472502108e-07, + "logits/chosen": -2.6397690773010254, + "logits/rejected": -2.655486822128296, + "logps/chosen": -351.07061767578125, + "logps/rejected": -333.7223815917969, + "loss": 0.5638, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14515265822410583, + "rewards/margins": 0.38497716188430786, + "rewards/margins_max": 0.9311981201171875, + "rewards/margins_min": -0.18591797351837158, + "rewards/margins_std": 0.5137249231338501, + "rewards/rejected": -0.5301297903060913, + "step": 2290 + }, + { + "epoch": 0.55, + "grad_norm": 7.7298047675465105, + "learning_rate": 2.4937301968593915e-07, + "logits/chosen": -2.635599136352539, + "logits/rejected": -2.62733793258667, + "logps/chosen": -282.60028076171875, + "logps/rejected": -295.7271728515625, + "loss": 0.6121, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2598281502723694, + "rewards/margins": 0.26622843742370605, + "rewards/margins_max": 0.7813423275947571, + "rewards/margins_min": -0.3214438557624817, + "rewards/margins_std": 0.5005481243133545, + "rewards/rejected": -0.5260566473007202, + "step": 2300 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -2.647585153579712, + "eval_logits/rejected": -2.6178154945373535, + "eval_logps/chosen": -308.79913330078125, + "eval_logps/rejected": -318.02728271484375, + "eval_loss": 0.6009542346000671, + "eval_rewards/accuracies": 0.690500020980835, + "eval_rewards/chosen": -0.24343764781951904, + "eval_rewards/margins": 0.27830231189727783, + "eval_rewards/margins_max": 1.063331127166748, + "eval_rewards/margins_min": -0.4893389046192169, + "eval_rewards/margins_std": 0.5288664698600769, + "eval_rewards/rejected": -0.5217399001121521, + "eval_runtime": 859.9102, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 2300 + }, + { + "epoch": 0.55, + "grad_norm": 7.022809875756669, + "learning_rate": 2.47283135938156e-07, + "logits/chosen": -2.6830363273620605, + "logits/rejected": -2.664031982421875, + "logps/chosen": -287.798095703125, + "logps/rejected": -279.710205078125, + "loss": 0.6063, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.20198485255241394, + "rewards/margins": 0.2973095774650574, + "rewards/margins_max": 0.9239432215690613, + "rewards/margins_min": -0.2458072006702423, + "rewards/margins_std": 0.5228606462478638, + "rewards/rejected": -0.49929437041282654, + "step": 2310 + }, + { + "epoch": 0.56, + "grad_norm": 9.306559261514748, + "learning_rate": 2.451934420582846e-07, + "logits/chosen": -2.6949825286865234, + "logits/rejected": -2.6665689945220947, + "logps/chosen": -287.1777038574219, + "logps/rejected": -296.3902282714844, + "loss": 0.6109, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.17137010395526886, + "rewards/margins": 0.2919192314147949, + "rewards/margins_max": 0.9146235585212708, + "rewards/margins_min": -0.29740530252456665, + "rewards/margins_std": 0.5488147735595703, + "rewards/rejected": -0.463289350271225, + "step": 2320 + }, + { + "epoch": 0.56, + "grad_norm": 4.078485582568216, + "learning_rate": 2.4310408408447903e-07, + "logits/chosen": -2.6312766075134277, + "logits/rejected": -2.5739798545837402, + "logps/chosen": -255.2695770263672, + "logps/rejected": -261.53936767578125, + "loss": 0.609, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.11498737335205078, + "rewards/margins": 0.24487683176994324, + "rewards/margins_max": 0.7137837409973145, + "rewards/margins_min": -0.18724948167800903, + "rewards/margins_std": 0.41405171155929565, + "rewards/rejected": -0.359864205121994, + "step": 2330 + }, + { + "epoch": 0.56, + "grad_norm": 3.069921259910948, + "learning_rate": 2.41015208031419e-07, + "logits/chosen": -2.6944103240966797, + "logits/rejected": -2.6474945545196533, + "logps/chosen": -306.9026794433594, + "logps/rejected": -310.8961486816406, + "loss": 0.5927, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.126219242811203, + "rewards/margins": 0.22689545154571533, + "rewards/margins_max": 0.8158214688301086, + "rewards/margins_min": -0.31907206773757935, + "rewards/margins_std": 0.5096734762191772, + "rewards/rejected": -0.3531147241592407, + "step": 2340 + }, + { + "epoch": 0.56, + "grad_norm": 8.258310005265388, + "learning_rate": 2.389269598801048e-07, + "logits/chosen": -2.667680025100708, + "logits/rejected": -2.59625506401062, + "logps/chosen": -292.64959716796875, + "logps/rejected": -271.0260314941406, + "loss": 0.5555, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08529751002788544, + "rewards/margins": 0.3777325749397278, + "rewards/margins_max": 0.9011771082878113, + "rewards/margins_min": -0.11987098306417465, + "rewards/margins_std": 0.4734679162502289, + "rewards/rejected": -0.4630300998687744, + "step": 2350 + }, + { + "epoch": 0.57, + "grad_norm": 8.671069518475692, + "learning_rate": 2.3683948556765624e-07, + "logits/chosen": -2.6653316020965576, + "logits/rejected": -2.629021167755127, + "logps/chosen": -275.5854187011719, + "logps/rejected": -276.62445068359375, + "loss": 0.6337, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19002783298492432, + "rewards/margins": 0.2670327126979828, + "rewards/margins_max": 0.858096718788147, + "rewards/margins_min": -0.23270674049854279, + "rewards/margins_std": 0.4833803176879883, + "rewards/rejected": -0.4570605158805847, + "step": 2360 + }, + { + "epoch": 0.57, + "grad_norm": 8.35667943542909, + "learning_rate": 2.34752930977113e-07, + "logits/chosen": -2.605914831161499, + "logits/rejected": -2.5656206607818604, + "logps/chosen": -297.6455078125, + "logps/rejected": -320.46673583984375, + "loss": 0.5856, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1851060837507248, + "rewards/margins": 0.24210119247436523, + "rewards/margins_max": 0.819743275642395, + "rewards/margins_min": -0.3182319104671478, + "rewards/margins_std": 0.5180691480636597, + "rewards/rejected": -0.4272072911262512, + "step": 2370 + }, + { + "epoch": 0.57, + "grad_norm": 7.835666822967071, + "learning_rate": 2.3266744192724052e-07, + "logits/chosen": -2.6449246406555176, + "logits/rejected": -2.627570629119873, + "logps/chosen": -317.83843994140625, + "logps/rejected": -325.9470520019531, + "loss": 0.5905, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2597433626651764, + "rewards/margins": 0.33320045471191406, + "rewards/margins_max": 0.9372183084487915, + "rewards/margins_min": -0.3607577085494995, + "rewards/margins_std": 0.5766936540603638, + "rewards/rejected": -0.5929437875747681, + "step": 2380 + }, + { + "epoch": 0.57, + "grad_norm": 3.548511010185018, + "learning_rate": 2.3058316416233864e-07, + "logits/chosen": -2.6739461421966553, + "logits/rejected": -2.666889190673828, + "logps/chosen": -312.7624816894531, + "logps/rejected": -303.9245300292969, + "loss": 0.601, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.23950240015983582, + "rewards/margins": 0.19521835446357727, + "rewards/margins_max": 0.8413310050964355, + "rewards/margins_min": -0.46826639771461487, + "rewards/margins_std": 0.5935012102127075, + "rewards/rejected": -0.4347207546234131, + "step": 2390 + }, + { + "epoch": 0.57, + "grad_norm": 9.287142029713884, + "learning_rate": 2.2850024334205654e-07, + "logits/chosen": -2.6532249450683594, + "logits/rejected": -2.6485819816589355, + "logps/chosen": -291.0581970214844, + "logps/rejected": -306.99578857421875, + "loss": 0.5698, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2209687978029251, + "rewards/margins": 0.3287777006626129, + "rewards/margins_max": 0.8810712099075317, + "rewards/margins_min": -0.21057824790477753, + "rewards/margins_std": 0.489013135433197, + "rewards/rejected": -0.5497465133666992, + "step": 2400 + }, + { + "epoch": 0.57, + "eval_logits/chosen": -2.6389448642730713, + "eval_logits/rejected": -2.609215497970581, + "eval_logps/chosen": -303.47515869140625, + "eval_logps/rejected": -313.65570068359375, + "eval_loss": 0.5978549718856812, + "eval_rewards/accuracies": 0.6919999718666077, + "eval_rewards/chosen": -0.19019848108291626, + "eval_rewards/margins": 0.2878260910511017, + "eval_rewards/margins_max": 1.0878815650939941, + "eval_rewards/margins_min": -0.4939045310020447, + "eval_rewards/margins_std": 0.5368744730949402, + "eval_rewards/rejected": -0.47802454233169556, + "eval_runtime": 859.412, + "eval_samples_per_second": 4.654, + "eval_steps_per_second": 0.291, + "step": 2400 + }, + { + "epoch": 0.58, + "grad_norm": 8.250334299423635, + "learning_rate": 2.264188250312138e-07, + "logits/chosen": -2.63791823387146, + "logits/rejected": -2.585139751434326, + "logps/chosen": -306.912841796875, + "logps/rejected": -276.64434814453125, + "loss": 0.5608, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.1724175363779068, + "rewards/margins": 0.3793061673641205, + "rewards/margins_max": 0.9429492950439453, + "rewards/margins_min": -0.1732269674539566, + "rewards/margins_std": 0.5044452548027039, + "rewards/rejected": -0.5517237186431885, + "step": 2410 + }, + { + "epoch": 0.58, + "grad_norm": 7.536721422462468, + "learning_rate": 2.2433905468962674e-07, + "logits/chosen": -2.69928240776062, + "logits/rejected": -2.678112745285034, + "logps/chosen": -320.2510070800781, + "logps/rejected": -310.3973083496094, + "loss": 0.5638, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18819960951805115, + "rewards/margins": 0.4135063588619232, + "rewards/margins_max": 0.960457444190979, + "rewards/margins_min": -0.21604296565055847, + "rewards/margins_std": 0.5411546230316162, + "rewards/rejected": -0.6017060279846191, + "step": 2420 + }, + { + "epoch": 0.58, + "grad_norm": 9.203042099584177, + "learning_rate": 2.222610776619439e-07, + "logits/chosen": -2.7014262676239014, + "logits/rejected": -2.66084623336792, + "logps/chosen": -315.51361083984375, + "logps/rejected": -298.0957336425781, + "loss": 0.5668, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1906673163175583, + "rewards/margins": 0.36204102635383606, + "rewards/margins_max": 1.0159660577774048, + "rewards/margins_min": -0.2975941002368927, + "rewards/margins_std": 0.5978912115097046, + "rewards/rejected": -0.5527083873748779, + "step": 2430 + }, + { + "epoch": 0.58, + "grad_norm": 12.089585644458106, + "learning_rate": 2.201850391674877e-07, + "logits/chosen": -2.687541961669922, + "logits/rejected": -2.644740581512451, + "logps/chosen": -320.5738220214844, + "logps/rejected": -292.30218505859375, + "loss": 0.5842, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15845851600170135, + "rewards/margins": 0.3309580087661743, + "rewards/margins_max": 0.800940990447998, + "rewards/margins_min": -0.2134658545255661, + "rewards/margins_std": 0.46003514528274536, + "rewards/rejected": -0.4894165098667145, + "step": 2440 + }, + { + "epoch": 0.59, + "grad_norm": 5.970990943647553, + "learning_rate": 2.181110842901066e-07, + "logits/chosen": -2.7046241760253906, + "logits/rejected": -2.632124423980713, + "logps/chosen": -293.5450439453125, + "logps/rejected": -290.432861328125, + "loss": 0.5783, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1822170913219452, + "rewards/margins": 0.36459654569625854, + "rewards/margins_max": 1.079773187637329, + "rewards/margins_min": -0.23687157034873962, + "rewards/margins_std": 0.5974687337875366, + "rewards/rejected": -0.5468136072158813, + "step": 2450 + }, + { + "epoch": 0.59, + "grad_norm": 6.575545070087346, + "learning_rate": 2.160393579680353e-07, + "logits/chosen": -2.59405517578125, + "logits/rejected": -2.613334894180298, + "logps/chosen": -281.93841552734375, + "logps/rejected": -316.2869567871094, + "loss": 0.5566, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.18146109580993652, + "rewards/margins": 0.46110066771507263, + "rewards/margins_max": 1.0213110446929932, + "rewards/margins_min": -0.08433142304420471, + "rewards/margins_std": 0.4864630699157715, + "rewards/rejected": -0.6425617933273315, + "step": 2460 + }, + { + "epoch": 0.59, + "grad_norm": 5.8721728725920075, + "learning_rate": 2.1397000498376634e-07, + "logits/chosen": -2.6570866107940674, + "logits/rejected": -2.5731148719787598, + "logps/chosen": -284.0189208984375, + "logps/rejected": -301.596923828125, + "loss": 0.5591, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12092554569244385, + "rewards/margins": 0.38032206892967224, + "rewards/margins_max": 0.9591177701950073, + "rewards/margins_min": -0.1604117453098297, + "rewards/margins_std": 0.5008580088615417, + "rewards/rejected": -0.5012476444244385, + "step": 2470 + }, + { + "epoch": 0.59, + "grad_norm": 12.066010433779923, + "learning_rate": 2.1190316995393144e-07, + "logits/chosen": -2.6345062255859375, + "logits/rejected": -2.571965217590332, + "logps/chosen": -289.5625915527344, + "logps/rejected": -273.22491455078125, + "loss": 0.5902, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.29963773488998413, + "rewards/margins": 0.27483218908309937, + "rewards/margins_max": 0.8773409128189087, + "rewards/margins_min": -0.2962464988231659, + "rewards/margins_std": 0.5167412161827087, + "rewards/rejected": -0.5744699239730835, + "step": 2480 + }, + { + "epoch": 0.6, + "grad_norm": 13.696080692766579, + "learning_rate": 2.098389973191953e-07, + "logits/chosen": -2.6669273376464844, + "logits/rejected": -2.647965908050537, + "logps/chosen": -314.2147521972656, + "logps/rejected": -331.3398742675781, + "loss": 0.5721, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1829395741224289, + "rewards/margins": 0.29835018515586853, + "rewards/margins_max": 0.9142980575561523, + "rewards/margins_min": -0.3180716335773468, + "rewards/margins_std": 0.5453607439994812, + "rewards/rejected": -0.48128971457481384, + "step": 2490 + }, + { + "epoch": 0.6, + "grad_norm": 9.312767076182801, + "learning_rate": 2.0777763133416118e-07, + "logits/chosen": -2.6531622409820557, + "logits/rejected": -2.606722354888916, + "logps/chosen": -290.75665283203125, + "logps/rejected": -271.52813720703125, + "loss": 0.5656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26194143295288086, + "rewards/margins": 0.3659655749797821, + "rewards/margins_max": 0.9094951748847961, + "rewards/margins_min": -0.1398560106754303, + "rewards/margins_std": 0.46722960472106934, + "rewards/rejected": -0.6279069781303406, + "step": 2500 + }, + { + "epoch": 0.6, + "eval_logits/chosen": -2.6290502548217773, + "eval_logits/rejected": -2.599135160446167, + "eval_logps/chosen": -311.5382080078125, + "eval_logps/rejected": -321.8217468261719, + "eval_loss": 0.5992329716682434, + "eval_rewards/accuracies": 0.6984999775886536, + "eval_rewards/chosen": -0.2708284258842468, + "eval_rewards/margins": 0.2888563275337219, + "eval_rewards/margins_max": 1.0979818105697632, + "eval_rewards/margins_min": -0.5097129940986633, + "eval_rewards/margins_std": 0.5454325675964355, + "eval_rewards/rejected": -0.5596847534179688, + "eval_runtime": 860.4805, + "eval_samples_per_second": 4.649, + "eval_steps_per_second": 0.291, + "step": 2500 + }, + { + "epoch": 0.6, + "grad_norm": 10.04083079635618, + "learning_rate": 2.057192160572898e-07, + "logits/chosen": -2.662008285522461, + "logits/rejected": -2.5704874992370605, + "logps/chosen": -291.0193176269531, + "logps/rejected": -324.3416442871094, + "loss": 0.5979, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.209696963429451, + "rewards/margins": 0.2942900061607361, + "rewards/margins_max": 0.9081916809082031, + "rewards/margins_min": -0.2142058163881302, + "rewards/margins_std": 0.5111109018325806, + "rewards/rejected": -0.5039870142936707, + "step": 2510 + }, + { + "epoch": 0.6, + "grad_norm": 8.642284820234895, + "learning_rate": 2.0366389534083185e-07, + "logits/chosen": -2.66066312789917, + "logits/rejected": -2.6295483112335205, + "logps/chosen": -305.4989318847656, + "logps/rejected": -297.40618896484375, + "loss": 0.5895, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1935977190732956, + "rewards/margins": 0.31689274311065674, + "rewards/margins_max": 0.9463685750961304, + "rewards/margins_min": -0.23440134525299072, + "rewards/margins_std": 0.5165520906448364, + "rewards/rejected": -0.5104904770851135, + "step": 2520 + }, + { + "epoch": 0.61, + "grad_norm": 5.672527349279183, + "learning_rate": 2.0161181282077469e-07, + "logits/chosen": -2.6334547996520996, + "logits/rejected": -2.634446620941162, + "logps/chosen": -265.6521911621094, + "logps/rejected": -294.1048889160156, + "loss": 0.5779, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2336483895778656, + "rewards/margins": 0.332066148519516, + "rewards/margins_max": 0.9206092953681946, + "rewards/margins_min": -0.1984013468027115, + "rewards/margins_std": 0.5049009323120117, + "rewards/rejected": -0.5657145977020264, + "step": 2530 + }, + { + "epoch": 0.61, + "grad_norm": 7.054123493685105, + "learning_rate": 1.9956311190680468e-07, + "logits/chosen": -2.6308562755584717, + "logits/rejected": -2.624748945236206, + "logps/chosen": -277.38079833984375, + "logps/rejected": -321.8224792480469, + "loss": 0.5978, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24823418259620667, + "rewards/margins": 0.3347433805465698, + "rewards/margins_max": 0.9458397626876831, + "rewards/margins_min": -0.2256821095943451, + "rewards/margins_std": 0.5262209177017212, + "rewards/rejected": -0.5829775929450989, + "step": 2540 + }, + { + "epoch": 0.61, + "grad_norm": 8.140310094676984, + "learning_rate": 1.9751793577228455e-07, + "logits/chosen": -2.66951322555542, + "logits/rejected": -2.6798131465911865, + "logps/chosen": -321.3728942871094, + "logps/rejected": -321.29351806640625, + "loss": 0.5892, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2094179093837738, + "rewards/margins": 0.2948618531227112, + "rewards/margins_max": 0.7485690712928772, + "rewards/margins_min": -0.18952804803848267, + "rewards/margins_std": 0.4270727038383484, + "rewards/rejected": -0.5042797327041626, + "step": 2550 + }, + { + "epoch": 0.61, + "grad_norm": 7.600463160405789, + "learning_rate": 1.9547642734424823e-07, + "logits/chosen": -2.5959692001342773, + "logits/rejected": -2.621346950531006, + "logps/chosen": -301.83380126953125, + "logps/rejected": -319.92218017578125, + "loss": 0.5753, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12533453106880188, + "rewards/margins": 0.36113643646240234, + "rewards/margins_max": 0.9919248819351196, + "rewards/margins_min": -0.17455923557281494, + "rewards/margins_std": 0.5277222394943237, + "rewards/rejected": -0.4864709973335266, + "step": 2560 + }, + { + "epoch": 0.62, + "grad_norm": 10.267465737032328, + "learning_rate": 1.9343872929341196e-07, + "logits/chosen": -2.654869556427002, + "logits/rejected": -2.629849672317505, + "logps/chosen": -307.09124755859375, + "logps/rejected": -321.4107360839844, + "loss": 0.578, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.14508172869682312, + "rewards/margins": 0.3119579553604126, + "rewards/margins_max": 0.9108166694641113, + "rewards/margins_min": -0.29992565512657166, + "rewards/margins_std": 0.5585619211196899, + "rewards/rejected": -0.4570396840572357, + "step": 2570 + }, + { + "epoch": 0.62, + "grad_norm": 3.7576589572809023, + "learning_rate": 1.9140498402420416e-07, + "logits/chosen": -2.647003173828125, + "logits/rejected": -2.624803066253662, + "logps/chosen": -324.68646240234375, + "logps/rejected": -348.1784973144531, + "loss": 0.5693, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1981527954339981, + "rewards/margins": 0.40184909105300903, + "rewards/margins_max": 0.9560012817382812, + "rewards/margins_min": -0.10828708112239838, + "rewards/margins_std": 0.47486764192581177, + "rewards/rejected": -0.6000019311904907, + "step": 2580 + }, + { + "epoch": 0.62, + "grad_norm": 9.371237851903723, + "learning_rate": 1.8937533366481308e-07, + "logits/chosen": -2.552311658859253, + "logits/rejected": -2.591034412384033, + "logps/chosen": -284.5642395019531, + "logps/rejected": -317.30535888671875, + "loss": 0.6052, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2428911030292511, + "rewards/margins": 0.2654326558113098, + "rewards/margins_max": 0.9568518400192261, + "rewards/margins_min": -0.4375254213809967, + "rewards/margins_std": 0.6340715885162354, + "rewards/rejected": -0.5083237290382385, + "step": 2590 + }, + { + "epoch": 0.62, + "grad_norm": 6.374025756592385, + "learning_rate": 1.8734992005725463e-07, + "logits/chosen": -2.4831905364990234, + "logits/rejected": -2.4986119270324707, + "logps/chosen": -315.5087890625, + "logps/rejected": -329.82440185546875, + "loss": 0.5795, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.21649488806724548, + "rewards/margins": 0.3169113099575043, + "rewards/margins_max": 0.8947137594223022, + "rewards/margins_min": -0.2509083151817322, + "rewards/margins_std": 0.5162596702575684, + "rewards/rejected": -0.5334061980247498, + "step": 2600 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.6244351863861084, + "eval_logits/rejected": -2.594435930252075, + "eval_logps/chosen": -305.5476379394531, + "eval_logps/rejected": -316.98046875, + "eval_loss": 0.5950339436531067, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": -0.21092304587364197, + "eval_rewards/margins": 0.30034908652305603, + "eval_rewards/margins_max": 1.1206122636795044, + "eval_rewards/margins_min": -0.5079042315483093, + "eval_rewards/margins_std": 0.5532759428024292, + "eval_rewards/rejected": -0.511272132396698, + "eval_runtime": 860.8369, + "eval_samples_per_second": 4.647, + "eval_steps_per_second": 0.29, + "step": 2600 + }, + { + "epoch": 0.62, + "grad_norm": 7.184907395861785, + "learning_rate": 1.853288847474594e-07, + "logits/chosen": -2.6795639991760254, + "logits/rejected": -2.6414713859558105, + "logps/chosen": -327.1631164550781, + "logps/rejected": -323.26617431640625, + "loss": 0.5642, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.19482922554016113, + "rewards/margins": 0.3318164348602295, + "rewards/margins_max": 0.9375141263008118, + "rewards/margins_min": -0.2631053328514099, + "rewards/margins_std": 0.5353230237960815, + "rewards/rejected": -0.5266456604003906, + "step": 2610 + }, + { + "epoch": 0.63, + "grad_norm": 8.375504663549572, + "learning_rate": 1.8331236897538065e-07, + "logits/chosen": -2.6555323600769043, + "logits/rejected": -2.6470930576324463, + "logps/chosen": -301.43994140625, + "logps/rejected": -320.8365783691406, + "loss": 0.6076, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.23343400657176971, + "rewards/margins": 0.2559712827205658, + "rewards/margins_max": 0.8926533460617065, + "rewards/margins_min": -0.34984445571899414, + "rewards/margins_std": 0.5504107475280762, + "rewards/rejected": -0.4894053339958191, + "step": 2620 + }, + { + "epoch": 0.63, + "grad_norm": 8.074893386184252, + "learning_rate": 1.8130051366512447e-07, + "logits/chosen": -2.6671040058135986, + "logits/rejected": -2.5697810649871826, + "logps/chosen": -306.04376220703125, + "logps/rejected": -347.21392822265625, + "loss": 0.5613, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.12057604640722275, + "rewards/margins": 0.39471858739852905, + "rewards/margins_max": 1.1163181066513062, + "rewards/margins_min": -0.3967621922492981, + "rewards/margins_std": 0.6732631921768188, + "rewards/rejected": -0.5152946710586548, + "step": 2630 + }, + { + "epoch": 0.63, + "grad_norm": 13.182140544907838, + "learning_rate": 1.792934594151003e-07, + "logits/chosen": -2.694676637649536, + "logits/rejected": -2.6797292232513428, + "logps/chosen": -293.8489685058594, + "logps/rejected": -289.4837951660156, + "loss": 0.6257, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.27340248227119446, + "rewards/margins": 0.21240122616291046, + "rewards/margins_max": 0.8069620132446289, + "rewards/margins_min": -0.30981430411338806, + "rewards/margins_std": 0.4958290457725525, + "rewards/rejected": -0.4858037531375885, + "step": 2640 + }, + { + "epoch": 0.63, + "grad_norm": 9.431714438194804, + "learning_rate": 1.7729134648819605e-07, + "logits/chosen": -2.5747389793395996, + "logits/rejected": -2.530871868133545, + "logps/chosen": -273.8680114746094, + "logps/rejected": -288.00897216796875, + "loss": 0.574, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2643434405326843, + "rewards/margins": 0.30124327540397644, + "rewards/margins_max": 0.9424476623535156, + "rewards/margins_min": -0.29725712537765503, + "rewards/margins_std": 0.5660034418106079, + "rewards/rejected": -0.5655866861343384, + "step": 2650 + }, + { + "epoch": 0.64, + "grad_norm": 8.482087952859194, + "learning_rate": 1.7529431480197533e-07, + "logits/chosen": -2.6235687732696533, + "logits/rejected": -2.5714054107666016, + "logps/chosen": -297.802978515625, + "logps/rejected": -307.982666015625, + "loss": 0.5844, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2382938116788864, + "rewards/margins": 0.28809601068496704, + "rewards/margins_max": 0.915099024772644, + "rewards/margins_min": -0.3727230429649353, + "rewards/margins_std": 0.5710189342498779, + "rewards/rejected": -0.5263898968696594, + "step": 2660 + }, + { + "epoch": 0.64, + "grad_norm": 7.397709424972579, + "learning_rate": 1.7330250391889961e-07, + "logits/chosen": -2.66947078704834, + "logits/rejected": -2.5924038887023926, + "logps/chosen": -291.0464782714844, + "logps/rejected": -260.6331481933594, + "loss": 0.576, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.10688348859548569, + "rewards/margins": 0.40493711829185486, + "rewards/margins_max": 0.9435163736343384, + "rewards/margins_min": -0.08007287979125977, + "rewards/margins_std": 0.4616405963897705, + "rewards/rejected": -0.5118206143379211, + "step": 2670 + }, + { + "epoch": 0.64, + "grad_norm": 5.814215294199613, + "learning_rate": 1.713160530365747e-07, + "logits/chosen": -2.7008068561553955, + "logits/rejected": -2.6780431270599365, + "logps/chosen": -314.0332336425781, + "logps/rejected": -321.0278625488281, + "loss": 0.6235, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.22004225850105286, + "rewards/margins": 0.1748182624578476, + "rewards/margins_max": 0.7563528418540955, + "rewards/margins_min": -0.36928868293762207, + "rewards/margins_std": 0.5032538771629333, + "rewards/rejected": -0.39486050605773926, + "step": 2680 + }, + { + "epoch": 0.64, + "grad_norm": 13.848762517517642, + "learning_rate": 1.693351009780231e-07, + "logits/chosen": -2.602003574371338, + "logits/rejected": -2.5646305084228516, + "logps/chosen": -307.02569580078125, + "logps/rejected": -298.5267639160156, + "loss": 0.5792, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.20884010195732117, + "rewards/margins": 0.3109968304634094, + "rewards/margins_max": 0.9659851789474487, + "rewards/margins_min": -0.37391597032546997, + "rewards/margins_std": 0.5974076390266418, + "rewards/rejected": -0.519836962223053, + "step": 2690 + }, + { + "epoch": 0.65, + "grad_norm": 8.596476840107231, + "learning_rate": 1.6735978618198215e-07, + "logits/chosen": -2.6759331226348877, + "logits/rejected": -2.661097288131714, + "logps/chosen": -269.54351806640625, + "logps/rejected": -326.4341125488281, + "loss": 0.5909, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19427165389060974, + "rewards/margins": 0.2538819909095764, + "rewards/margins_max": 0.9365754127502441, + "rewards/margins_min": -0.30797111988067627, + "rewards/margins_std": 0.5668118596076965, + "rewards/rejected": -0.44815367460250854, + "step": 2700 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -2.623544216156006, + "eval_logits/rejected": -2.593374252319336, + "eval_logps/chosen": -304.5151672363281, + "eval_logps/rejected": -316.2978820800781, + "eval_loss": 0.5945470929145813, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": -0.2005983293056488, + "eval_rewards/margins": 0.3038475215435028, + "eval_rewards/margins_max": 1.1335314512252808, + "eval_rewards/margins_min": -0.5149688720703125, + "eval_rewards/margins_std": 0.5597691535949707, + "eval_rewards/rejected": -0.5044458508491516, + "eval_runtime": 859.3675, + "eval_samples_per_second": 4.655, + "eval_steps_per_second": 0.291, + "step": 2700 + }, + { + "epoch": 0.65, + "grad_norm": 9.703570025523305, + "learning_rate": 1.6539024669322954e-07, + "logits/chosen": -2.6310129165649414, + "logits/rejected": -2.6164767742156982, + "logps/chosen": -305.9279479980469, + "logps/rejected": -299.69830322265625, + "loss": 0.5767, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.17514154314994812, + "rewards/margins": 0.34417447447776794, + "rewards/margins_max": 0.9141018986701965, + "rewards/margins_min": -0.21524396538734436, + "rewards/margins_std": 0.5172333121299744, + "rewards/rejected": -0.5193160176277161, + "step": 2710 + }, + { + "epoch": 0.65, + "grad_norm": 9.837477850495237, + "learning_rate": 1.6342662015293584e-07, + "logits/chosen": -2.603860378265381, + "logits/rejected": -2.58225679397583, + "logps/chosen": -327.4505615234375, + "logps/rejected": -316.6806945800781, + "loss": 0.5974, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2224818766117096, + "rewards/margins": 0.33033767342567444, + "rewards/margins_max": 0.8051918148994446, + "rewards/margins_min": -0.11464174091815948, + "rewards/margins_std": 0.4120241701602936, + "rewards/rejected": -0.552819550037384, + "step": 2720 + }, + { + "epoch": 0.65, + "grad_norm": 5.8630349897277165, + "learning_rate": 1.6146904378904536e-07, + "logits/chosen": -2.732633590698242, + "logits/rejected": -2.6950955390930176, + "logps/chosen": -367.7388000488281, + "logps/rejected": -373.8130798339844, + "loss": 0.6117, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3329017460346222, + "rewards/margins": 0.24218297004699707, + "rewards/margins_max": 0.9342014193534851, + "rewards/margins_min": -0.3903924226760864, + "rewards/margins_std": 0.5841065645217896, + "rewards/rejected": -0.5750846862792969, + "step": 2730 + }, + { + "epoch": 0.66, + "grad_norm": 7.145113479642076, + "learning_rate": 1.5951765440668635e-07, + "logits/chosen": -2.679503917694092, + "logits/rejected": -2.6337997913360596, + "logps/chosen": -326.5082092285156, + "logps/rejected": -292.46990966796875, + "loss": 0.5674, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.17851652204990387, + "rewards/margins": 0.35734179615974426, + "rewards/margins_max": 1.0165164470672607, + "rewards/margins_min": -0.28420490026474, + "rewards/margins_std": 0.5844985246658325, + "rewards/rejected": -0.5358583331108093, + "step": 2740 + }, + { + "epoch": 0.66, + "grad_norm": 11.910015321812995, + "learning_rate": 1.5757258837860998e-07, + "logits/chosen": -2.6206724643707275, + "logits/rejected": -2.5926003456115723, + "logps/chosen": -307.025146484375, + "logps/rejected": -302.80303955078125, + "loss": 0.5821, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2873552739620209, + "rewards/margins": 0.3741292953491211, + "rewards/margins_max": 0.9916057586669922, + "rewards/margins_min": -0.1654580980539322, + "rewards/margins_std": 0.5074654221534729, + "rewards/rejected": -0.6614845991134644, + "step": 2750 + }, + { + "epoch": 0.66, + "grad_norm": 8.511001199533286, + "learning_rate": 1.5563398163566034e-07, + "logits/chosen": -2.63610577583313, + "logits/rejected": -2.648735761642456, + "logps/chosen": -278.10650634765625, + "logps/rejected": -330.8636169433594, + "loss": 0.5522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.18409788608551025, + "rewards/margins": 0.3788400888442993, + "rewards/margins_max": 0.9228482246398926, + "rewards/margins_min": -0.09980174154043198, + "rewards/margins_std": 0.4648720622062683, + "rewards/rejected": -0.5629379749298096, + "step": 2760 + }, + { + "epoch": 0.66, + "grad_norm": 8.482171153544789, + "learning_rate": 1.5370196965727438e-07, + "logits/chosen": -2.635110378265381, + "logits/rejected": -2.607448101043701, + "logps/chosen": -296.49542236328125, + "logps/rejected": -314.3409423828125, + "loss": 0.5618, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.27174216508865356, + "rewards/margins": 0.286690890789032, + "rewards/margins_max": 0.8846603631973267, + "rewards/margins_min": -0.32019850611686707, + "rewards/margins_std": 0.530876100063324, + "rewards/rejected": -0.5584330558776855, + "step": 2770 + }, + { + "epoch": 0.67, + "grad_norm": 8.442325370774075, + "learning_rate": 1.5177668746201454e-07, + "logits/chosen": -2.600867748260498, + "logits/rejected": -2.615812301635742, + "logps/chosen": -275.86767578125, + "logps/rejected": -312.8405456542969, + "loss": 0.5954, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2981366515159607, + "rewards/margins": 0.296410471200943, + "rewards/margins_max": 0.9821388125419617, + "rewards/margins_min": -0.39119023084640503, + "rewards/margins_std": 0.6126337051391602, + "rewards/rejected": -0.5945470929145813, + "step": 2780 + }, + { + "epoch": 0.67, + "grad_norm": 5.197446181333802, + "learning_rate": 1.4985826959813254e-07, + "logits/chosen": -2.70615291595459, + "logits/rejected": -2.6561496257781982, + "logps/chosen": -346.88934326171875, + "logps/rejected": -350.87066650390625, + "loss": 0.5909, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.29688599705696106, + "rewards/margins": 0.2667461633682251, + "rewards/margins_max": 0.965578556060791, + "rewards/margins_min": -0.36972442269325256, + "rewards/margins_std": 0.6068300008773804, + "rewards/rejected": -0.5636321306228638, + "step": 2790 + }, + { + "epoch": 0.67, + "grad_norm": 7.0483743294802785, + "learning_rate": 1.4794685013416674e-07, + "logits/chosen": -2.624732732772827, + "logits/rejected": -2.594238758087158, + "logps/chosen": -319.1407775878906, + "logps/rejected": -318.9751281738281, + "loss": 0.6097, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1605234146118164, + "rewards/margins": 0.3256325423717499, + "rewards/margins_max": 0.9358084797859192, + "rewards/margins_min": -0.23688547313213348, + "rewards/margins_std": 0.5254564881324768, + "rewards/rejected": -0.48615598678588867, + "step": 2800 + }, + { + "epoch": 0.67, + "eval_logits/chosen": -2.621018648147583, + "eval_logits/rejected": -2.5909228324890137, + "eval_logps/chosen": -304.81005859375, + "eval_logps/rejected": -316.7603759765625, + "eval_loss": 0.5937537550926208, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -0.20354729890823364, + "eval_rewards/margins": 0.30552393198013306, + "eval_rewards/margins_max": 1.1390976905822754, + "eval_rewards/margins_min": -0.5171207189559937, + "eval_rewards/margins_std": 0.5610091090202332, + "eval_rewards/rejected": -0.5090711712837219, + "eval_runtime": 859.8338, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 2800 + }, + { + "epoch": 0.67, + "grad_norm": 7.734353791795418, + "learning_rate": 1.460425626495725e-07, + "logits/chosen": -2.626800775527954, + "logits/rejected": -2.603543758392334, + "logps/chosen": -268.3589782714844, + "logps/rejected": -282.11590576171875, + "loss": 0.5761, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18991556763648987, + "rewards/margins": 0.3082110285758972, + "rewards/margins_max": 0.8790051341056824, + "rewards/margins_min": -0.26625508069992065, + "rewards/margins_std": 0.5046501159667969, + "rewards/rejected": -0.49812665581703186, + "step": 2810 + }, + { + "epoch": 0.68, + "grad_norm": 5.283835596384424, + "learning_rate": 1.4414554022538737e-07, + "logits/chosen": -2.6725804805755615, + "logits/rejected": -2.624807119369507, + "logps/chosen": -308.9907531738281, + "logps/rejected": -298.0137634277344, + "loss": 0.5764, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.203936368227005, + "rewards/margins": 0.4011301100254059, + "rewards/margins_max": 0.9146728515625, + "rewards/margins_min": -0.11008103936910629, + "rewards/margins_std": 0.4578138291835785, + "rewards/rejected": -0.6050664186477661, + "step": 2820 + }, + { + "epoch": 0.68, + "grad_norm": 8.105790656258398, + "learning_rate": 1.4225591543493025e-07, + "logits/chosen": -2.537041187286377, + "logits/rejected": -2.5283117294311523, + "logps/chosen": -261.78497314453125, + "logps/rejected": -334.4160461425781, + "loss": 0.5651, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.16362933814525604, + "rewards/margins": 0.35795944929122925, + "rewards/margins_max": 0.9304954409599304, + "rewards/margins_min": -0.23991163074970245, + "rewards/margins_std": 0.5223571062088013, + "rewards/rejected": -0.5215888023376465, + "step": 2830 + }, + { + "epoch": 0.68, + "grad_norm": 13.215129527836645, + "learning_rate": 1.4037382033453698e-07, + "logits/chosen": -2.6501476764678955, + "logits/rejected": -2.6472105979919434, + "logps/chosen": -308.47210693359375, + "logps/rejected": -330.118408203125, + "loss": 0.5725, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.25531333684921265, + "rewards/margins": 0.2947203516960144, + "rewards/margins_max": 0.7916975021362305, + "rewards/margins_min": -0.17512428760528564, + "rewards/margins_std": 0.4317558705806732, + "rewards/rejected": -0.550033688545227, + "step": 2840 + }, + { + "epoch": 0.68, + "grad_norm": 7.861511789391789, + "learning_rate": 1.384993864543314e-07, + "logits/chosen": -2.6705267429351807, + "logits/rejected": -2.6534152030944824, + "logps/chosen": -300.2662353515625, + "logps/rejected": -340.5654296875, + "loss": 0.5674, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.17005816102027893, + "rewards/margins": 0.3681716322898865, + "rewards/margins_max": 0.962718665599823, + "rewards/margins_min": -0.232384592294693, + "rewards/margins_std": 0.5470969080924988, + "rewards/rejected": -0.5382298231124878, + "step": 2850 + }, + { + "epoch": 0.68, + "grad_norm": 9.159352476729387, + "learning_rate": 1.366327447890332e-07, + "logits/chosen": -2.6822450160980225, + "logits/rejected": -2.6347575187683105, + "logps/chosen": -319.39990234375, + "logps/rejected": -333.537109375, + "loss": 0.5934, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.13150997459888458, + "rewards/margins": 0.322347491979599, + "rewards/margins_max": 0.8425567746162415, + "rewards/margins_min": -0.22028379142284393, + "rewards/margins_std": 0.4941239356994629, + "rewards/rejected": -0.4538574814796448, + "step": 2860 + }, + { + "epoch": 0.69, + "grad_norm": 5.893212840256697, + "learning_rate": 1.3477402578880356e-07, + "logits/chosen": -2.7067980766296387, + "logits/rejected": -2.665985584259033, + "logps/chosen": -324.10211181640625, + "logps/rejected": -344.2232360839844, + "loss": 0.5912, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27206873893737793, + "rewards/margins": 0.3080797791481018, + "rewards/margins_max": 0.816716194152832, + "rewards/margins_min": -0.21273913979530334, + "rewards/margins_std": 0.47305870056152344, + "rewards/rejected": -0.5801485180854797, + "step": 2870 + }, + { + "epoch": 0.69, + "grad_norm": 17.80143142259519, + "learning_rate": 1.3292335935012854e-07, + "logits/chosen": -2.6728432178497314, + "logits/rejected": -2.6427035331726074, + "logps/chosen": -349.10595703125, + "logps/rejected": -328.55853271484375, + "loss": 0.59, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21816524863243103, + "rewards/margins": 0.3346711993217468, + "rewards/margins_max": 0.8795034289360046, + "rewards/margins_min": -0.2123207151889801, + "rewards/margins_std": 0.501558244228363, + "rewards/rejected": -0.5528364181518555, + "step": 2880 + }, + { + "epoch": 0.69, + "grad_norm": 4.457459598856668, + "learning_rate": 1.3108087480674166e-07, + "logits/chosen": -2.643859386444092, + "logits/rejected": -2.6534647941589355, + "logps/chosen": -337.7801513671875, + "logps/rejected": -360.29046630859375, + "loss": 0.5731, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.2451343983411789, + "rewards/margins": 0.37806132435798645, + "rewards/margins_max": 1.0581673383712769, + "rewards/margins_min": -0.2207886278629303, + "rewards/margins_std": 0.5833451747894287, + "rewards/rejected": -0.6231956481933594, + "step": 2890 + }, + { + "epoch": 0.69, + "grad_norm": 4.465711934743042, + "learning_rate": 1.2924670092058465e-07, + "logits/chosen": -2.6721737384796143, + "logits/rejected": -2.6429543495178223, + "logps/chosen": -322.4364929199219, + "logps/rejected": -283.9398193359375, + "loss": 0.5776, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17205238342285156, + "rewards/margins": 0.3702142536640167, + "rewards/margins_max": 0.9432505369186401, + "rewards/margins_min": -0.21967828273773193, + "rewards/margins_std": 0.5122222304344177, + "rewards/rejected": -0.5422666072845459, + "step": 2900 + }, + { + "epoch": 0.69, + "eval_logits/chosen": -2.6177093982696533, + "eval_logits/rejected": -2.587409734725952, + "eval_logps/chosen": -305.8715515136719, + "eval_logps/rejected": -318.1777648925781, + "eval_loss": 0.5928537845611572, + "eval_rewards/accuracies": 0.7039999961853027, + "eval_rewards/chosen": -0.2141621708869934, + "eval_rewards/margins": 0.3090827465057373, + "eval_rewards/margins_max": 1.1530448198318481, + "eval_rewards/margins_min": -0.5251262784004211, + "eval_rewards/margins_std": 0.5672925710678101, + "eval_rewards/rejected": -0.5232448577880859, + "eval_runtime": 859.8299, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 2900 + }, + { + "epoch": 0.7, + "grad_norm": 10.347129799745117, + "learning_rate": 1.2742096587280966e-07, + "logits/chosen": -2.597189426422119, + "logits/rejected": -2.5491786003112793, + "logps/chosen": -300.6410827636719, + "logps/rejected": -296.36639404296875, + "loss": 0.5706, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26455071568489075, + "rewards/margins": 0.3420693278312683, + "rewards/margins_max": 0.9913008809089661, + "rewards/margins_min": -0.338512122631073, + "rewards/margins_std": 0.590982973575592, + "rewards/rejected": -0.6066200137138367, + "step": 2910 + }, + { + "epoch": 0.7, + "grad_norm": 14.772272395886871, + "learning_rate": 1.2560379725482073e-07, + "logits/chosen": -2.6537861824035645, + "logits/rejected": -2.5844337940216064, + "logps/chosen": -310.41607666015625, + "logps/rejected": -300.7682189941406, + "loss": 0.5778, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17608337104320526, + "rewards/margins": 0.29788199067115784, + "rewards/margins_max": 0.9067052602767944, + "rewards/margins_min": -0.2519022822380066, + "rewards/margins_std": 0.5077629089355469, + "rewards/rejected": -0.4739653468132019, + "step": 2920 + }, + { + "epoch": 0.7, + "grad_norm": 8.34466433426549, + "learning_rate": 1.237953220593579e-07, + "logits/chosen": -2.6661570072174072, + "logits/rejected": -2.6105422973632812, + "logps/chosen": -329.9227600097656, + "logps/rejected": -317.62249755859375, + "loss": 0.5757, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2413763552904129, + "rewards/margins": 0.32379597425460815, + "rewards/margins_max": 1.0564546585083008, + "rewards/margins_min": -0.45707249641418457, + "rewards/margins_std": 0.6725181341171265, + "rewards/rejected": -0.5651723742485046, + "step": 2930 + }, + { + "epoch": 0.7, + "grad_norm": 6.24181959765327, + "learning_rate": 1.2199566667162127e-07, + "logits/chosen": -2.6748130321502686, + "logits/rejected": -2.608752965927124, + "logps/chosen": -335.2709045410156, + "logps/rejected": -319.9509582519531, + "loss": 0.5372, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1704896092414856, + "rewards/margins": 0.4461976885795593, + "rewards/margins_max": 1.0146349668502808, + "rewards/margins_min": -0.16861645877361298, + "rewards/margins_std": 0.5327773094177246, + "rewards/rejected": -0.6166872978210449, + "step": 2940 + }, + { + "epoch": 0.71, + "grad_norm": 13.571559991415354, + "learning_rate": 1.2020495686043924e-07, + "logits/chosen": -2.6338913440704346, + "logits/rejected": -2.6120593547821045, + "logps/chosen": -321.2636413574219, + "logps/rejected": -313.9801940917969, + "loss": 0.557, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19493046402931213, + "rewards/margins": 0.37187767028808594, + "rewards/margins_max": 1.043025016784668, + "rewards/margins_min": -0.22166283428668976, + "rewards/margins_std": 0.5632063150405884, + "rewards/rejected": -0.5668081641197205, + "step": 2950 + }, + { + "epoch": 0.71, + "grad_norm": 8.294276863387395, + "learning_rate": 1.1842331776947931e-07, + "logits/chosen": -2.6366591453552246, + "logits/rejected": -2.6095542907714844, + "logps/chosen": -350.4244384765625, + "logps/rejected": -306.56805419921875, + "loss": 0.5678, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.18484465777873993, + "rewards/margins": 0.3695618212223053, + "rewards/margins_max": 0.9093047976493835, + "rewards/margins_min": -0.16653835773468018, + "rewards/margins_std": 0.4747069478034973, + "rewards/rejected": -0.554406464099884, + "step": 2960 + }, + { + "epoch": 0.71, + "grad_norm": 10.14282240372356, + "learning_rate": 1.1665087390850187e-07, + "logits/chosen": -2.61653470993042, + "logits/rejected": -2.591304302215576, + "logps/chosen": -244.28515625, + "logps/rejected": -297.53997802734375, + "loss": 0.6192, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.30605635046958923, + "rewards/margins": 0.23393836617469788, + "rewards/margins_max": 0.8981086611747742, + "rewards/margins_min": -0.3801124691963196, + "rewards/margins_std": 0.5839722752571106, + "rewards/rejected": -0.5399946570396423, + "step": 2970 + }, + { + "epoch": 0.71, + "grad_norm": 7.2314257284719625, + "learning_rate": 1.1488774914465918e-07, + "logits/chosen": -2.6050806045532227, + "logits/rejected": -2.604292392730713, + "logps/chosen": -276.8834228515625, + "logps/rejected": -320.0370178222656, + "loss": 0.5755, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2090921849012375, + "rewards/margins": 0.34288525581359863, + "rewards/margins_max": 1.0640244483947754, + "rewards/margins_min": -0.3710848093032837, + "rewards/margins_std": 0.6398088335990906, + "rewards/rejected": -0.5519774556159973, + "step": 2980 + }, + { + "epoch": 0.72, + "grad_norm": 9.784779652184145, + "learning_rate": 1.1313406669383877e-07, + "logits/chosen": -2.6493895053863525, + "logits/rejected": -2.5964341163635254, + "logps/chosen": -348.2480773925781, + "logps/rejected": -330.4178771972656, + "loss": 0.6217, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.25257083773612976, + "rewards/margins": 0.30446967482566833, + "rewards/margins_max": 1.0399785041809082, + "rewards/margins_min": -0.44789689779281616, + "rewards/margins_std": 0.6760698556900024, + "rewards/rejected": -0.5570404529571533, + "step": 2990 + }, + { + "epoch": 0.72, + "grad_norm": 11.042945259967942, + "learning_rate": 1.1138994911205284e-07, + "logits/chosen": -2.627768039703369, + "logits/rejected": -2.5813279151916504, + "logps/chosen": -306.77679443359375, + "logps/rejected": -345.439208984375, + "loss": 0.575, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.12094666808843613, + "rewards/margins": 0.4055045247077942, + "rewards/margins_max": 0.9982796907424927, + "rewards/margins_min": -0.17103391885757446, + "rewards/margins_std": 0.513174295425415, + "rewards/rejected": -0.5264511108398438, + "step": 3000 + }, + { + "epoch": 0.72, + "eval_logits/chosen": -2.6164939403533936, + "eval_logits/rejected": -2.5860860347747803, + "eval_logps/chosen": -302.9332580566406, + "eval_logps/rejected": -314.7164611816406, + "eval_loss": 0.5947726368904114, + "eval_rewards/accuracies": 0.6980000138282776, + "eval_rewards/chosen": -0.18477900326251984, + "eval_rewards/margins": 0.3038530945777893, + "eval_rewards/margins_max": 1.1465470790863037, + "eval_rewards/margins_min": -0.5243028998374939, + "eval_rewards/margins_std": 0.5646576881408691, + "eval_rewards/rejected": -0.48863208293914795, + "eval_runtime": 859.7576, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 3000 + }, + { + "epoch": 0.72, + "grad_norm": 6.654501528625748, + "learning_rate": 1.0965551828687297e-07, + "logits/chosen": -2.648944616317749, + "logits/rejected": -2.5937111377716064, + "logps/chosen": -277.88275146484375, + "logps/rejected": -333.79376220703125, + "loss": 0.5816, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.11035318672657013, + "rewards/margins": 0.3619997799396515, + "rewards/margins_max": 0.9593564867973328, + "rewards/margins_min": -0.24574780464172363, + "rewards/margins_std": 0.5336962342262268, + "rewards/rejected": -0.47235292196273804, + "step": 3010 + }, + { + "epoch": 0.72, + "grad_norm": 10.100569272758296, + "learning_rate": 1.0793089542891229e-07, + "logits/chosen": -2.592881679534912, + "logits/rejected": -2.56054425239563, + "logps/chosen": -306.00885009765625, + "logps/rejected": -283.6434020996094, + "loss": 0.5527, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.11078639328479767, + "rewards/margins": 0.3766598105430603, + "rewards/margins_max": 0.9737428426742554, + "rewards/margins_min": -0.19211386144161224, + "rewards/margins_std": 0.5283175706863403, + "rewards/rejected": -0.4874461591243744, + "step": 3020 + }, + { + "epoch": 0.73, + "grad_norm": 7.425480209902673, + "learning_rate": 1.062162010633545e-07, + "logits/chosen": -2.66178560256958, + "logits/rejected": -2.619534969329834, + "logps/chosen": -305.2130126953125, + "logps/rejected": -286.83563232421875, + "loss": 0.5774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14195309579372406, + "rewards/margins": 0.35815608501434326, + "rewards/margins_max": 0.9821040034294128, + "rewards/margins_min": -0.17042629420757294, + "rewards/margins_std": 0.5124177932739258, + "rewards/rejected": -0.5001091361045837, + "step": 3030 + }, + { + "epoch": 0.73, + "grad_norm": 5.721931948084131, + "learning_rate": 1.0451155502153138e-07, + "logits/chosen": -2.6573493480682373, + "logits/rejected": -2.6277241706848145, + "logps/chosen": -311.711669921875, + "logps/rejected": -287.81158447265625, + "loss": 0.6348, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.25803881883621216, + "rewards/margins": 0.1889791637659073, + "rewards/margins_max": 0.8133442997932434, + "rewards/margins_min": -0.4662505090236664, + "rewards/margins_std": 0.5739446878433228, + "rewards/rejected": -0.44701796770095825, + "step": 3040 + }, + { + "epoch": 0.73, + "grad_norm": 5.877237745995071, + "learning_rate": 1.028170764325479e-07, + "logits/chosen": -2.6631836891174316, + "logits/rejected": -2.641143560409546, + "logps/chosen": -337.29217529296875, + "logps/rejected": -321.89019775390625, + "loss": 0.5948, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23359163105487823, + "rewards/margins": 0.2570040822029114, + "rewards/margins_max": 0.9715884327888489, + "rewards/margins_min": -0.3664648234844208, + "rewards/margins_std": 0.5953644514083862, + "rewards/rejected": -0.4905957281589508, + "step": 3050 + }, + { + "epoch": 0.73, + "grad_norm": 8.202499046058216, + "learning_rate": 1.0113288371495707e-07, + "logits/chosen": -2.5986862182617188, + "logits/rejected": -2.6056227684020996, + "logps/chosen": -324.17584228515625, + "logps/rejected": -317.966796875, + "loss": 0.574, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19230985641479492, + "rewards/margins": 0.32042282819747925, + "rewards/margins_max": 0.9411641359329224, + "rewards/margins_min": -0.2271740883588791, + "rewards/margins_std": 0.5099384188652039, + "rewards/rejected": -0.5127326250076294, + "step": 3060 + }, + { + "epoch": 0.74, + "grad_norm": 7.156920022707099, + "learning_rate": 9.945909456848434e-08, + "logits/chosen": -2.6237006187438965, + "logits/rejected": -2.5988717079162598, + "logps/chosen": -322.70654296875, + "logps/rejected": -287.52508544921875, + "loss": 0.5955, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20879539847373962, + "rewards/margins": 0.2584335207939148, + "rewards/margins_max": 0.9114171266555786, + "rewards/margins_min": -0.33635640144348145, + "rewards/margins_std": 0.5569571256637573, + "rewards/rejected": -0.4672289490699768, + "step": 3070 + }, + { + "epoch": 0.74, + "grad_norm": 8.227703134955291, + "learning_rate": 9.779582596580203e-08, + "logits/chosen": -2.5234127044677734, + "logits/rejected": -2.52382493019104, + "logps/chosen": -273.2856750488281, + "logps/rejected": -297.1725769042969, + "loss": 0.5929, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19276472926139832, + "rewards/margins": 0.35403233766555786, + "rewards/margins_max": 0.9341346621513367, + "rewards/margins_min": -0.2888151705265045, + "rewards/margins_std": 0.5561498999595642, + "rewards/rejected": -0.5467970967292786, + "step": 3080 + }, + { + "epoch": 0.74, + "grad_norm": 11.626153704091989, + "learning_rate": 9.614319414435499e-08, + "logits/chosen": -2.6854310035705566, + "logits/rejected": -2.6439242362976074, + "logps/chosen": -295.9474792480469, + "logps/rejected": -279.2301330566406, + "loss": 0.5466, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.141657292842865, + "rewards/margins": 0.32109707593917847, + "rewards/margins_max": 0.9081085920333862, + "rewards/margins_min": -0.3001248836517334, + "rewards/margins_std": 0.5518236756324768, + "rewards/rejected": -0.46275433897972107, + "step": 3090 + }, + { + "epoch": 0.74, + "grad_norm": 8.190121296600395, + "learning_rate": 9.450131459823688e-08, + "logits/chosen": -2.6764397621154785, + "logits/rejected": -2.655296802520752, + "logps/chosen": -338.6136474609375, + "logps/rejected": -311.8411865234375, + "loss": 0.5767, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11137326061725616, + "rewards/margins": 0.33382076025009155, + "rewards/margins_max": 0.9263644218444824, + "rewards/margins_min": -0.2573855519294739, + "rewards/margins_std": 0.5275608897209167, + "rewards/rejected": -0.4451940655708313, + "step": 3100 + }, + { + "epoch": 0.74, + "eval_logits/chosen": -2.616608142852783, + "eval_logits/rejected": -2.586158037185669, + "eval_logps/chosen": -304.1734313964844, + "eval_logps/rejected": -316.46478271484375, + "eval_loss": 0.5935968160629272, + "eval_rewards/accuracies": 0.7009999752044678, + "eval_rewards/chosen": -0.1971808820962906, + "eval_rewards/margins": 0.3089340329170227, + "eval_rewards/margins_max": 1.1550542116165161, + "eval_rewards/margins_min": -0.5275899171829224, + "eval_rewards/margins_std": 0.5689510107040405, + "eval_rewards/rejected": -0.5061149001121521, + "eval_runtime": 860.0822, + "eval_samples_per_second": 4.651, + "eval_steps_per_second": 0.291, + "step": 3100 + }, + { + "epoch": 0.74, + "grad_norm": 6.102975288582354, + "learning_rate": 9.287030207011929e-08, + "logits/chosen": -2.5496556758880615, + "logits/rejected": -2.5553019046783447, + "logps/chosen": -304.30828857421875, + "logps/rejected": -323.41571044921875, + "loss": 0.5947, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2754679322242737, + "rewards/margins": 0.2878427505493164, + "rewards/margins_max": 0.8745163679122925, + "rewards/margins_min": -0.3475717306137085, + "rewards/margins_std": 0.5523396134376526, + "rewards/rejected": -0.5633106827735901, + "step": 3110 + }, + { + "epoch": 0.75, + "grad_norm": 9.587786184563068, + "learning_rate": 9.125027054323256e-08, + "logits/chosen": -2.6219749450683594, + "logits/rejected": -2.595902442932129, + "logps/chosen": -343.4728088378906, + "logps/rejected": -317.55120849609375, + "loss": 0.5891, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.20043566823005676, + "rewards/margins": 0.3993472456932068, + "rewards/margins_max": 1.026168942451477, + "rewards/margins_min": -0.16250573098659515, + "rewards/margins_std": 0.5393126010894775, + "rewards/rejected": -0.5997829437255859, + "step": 3120 + }, + { + "epoch": 0.75, + "grad_norm": 8.478665223786749, + "learning_rate": 8.964133323340081e-08, + "logits/chosen": -2.572331666946411, + "logits/rejected": -2.5079894065856934, + "logps/chosen": -256.3243408203125, + "logps/rejected": -263.0610046386719, + "loss": 0.5782, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22794052958488464, + "rewards/margins": 0.32599711418151855, + "rewards/margins_max": 0.9748775362968445, + "rewards/margins_min": -0.3257297873497009, + "rewards/margins_std": 0.6075552701950073, + "rewards/rejected": -0.5539376139640808, + "step": 3130 + }, + { + "epoch": 0.75, + "grad_norm": 9.307544593099678, + "learning_rate": 8.804360258112861e-08, + "logits/chosen": -2.718839645385742, + "logits/rejected": -2.6697192192077637, + "logps/chosen": -302.1324462890625, + "logps/rejected": -286.06524658203125, + "loss": 0.576, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19155898690223694, + "rewards/margins": 0.29025158286094666, + "rewards/margins_max": 0.9243696928024292, + "rewards/margins_min": -0.32775741815567017, + "rewards/margins_std": 0.5530522465705872, + "rewards/rejected": -0.4818105697631836, + "step": 3140 + }, + { + "epoch": 0.75, + "grad_norm": 6.980694443224683, + "learning_rate": 8.645719024374446e-08, + "logits/chosen": -2.6788792610168457, + "logits/rejected": -2.623927354812622, + "logps/chosen": -327.87261962890625, + "logps/rejected": -320.5452575683594, + "loss": 0.5802, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16010625660419464, + "rewards/margins": 0.3369438350200653, + "rewards/margins_max": 0.8717771768569946, + "rewards/margins_min": -0.29793721437454224, + "rewards/margins_std": 0.5188963413238525, + "rewards/rejected": -0.49705010652542114, + "step": 3150 + }, + { + "epoch": 0.76, + "grad_norm": 7.148682436889758, + "learning_rate": 8.488220708759667e-08, + "logits/chosen": -2.6966350078582764, + "logits/rejected": -2.651716470718384, + "logps/chosen": -338.0216064453125, + "logps/rejected": -319.8200378417969, + "loss": 0.547, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2118571251630783, + "rewards/margins": 0.4064742624759674, + "rewards/margins_max": 0.971143901348114, + "rewards/margins_min": -0.24024486541748047, + "rewards/margins_std": 0.5454075336456299, + "rewards/rejected": -0.6183313727378845, + "step": 3160 + }, + { + "epoch": 0.76, + "grad_norm": 11.094830355625867, + "learning_rate": 8.331876318030585e-08, + "logits/chosen": -2.64422607421875, + "logits/rejected": -2.61445951461792, + "logps/chosen": -303.38189697265625, + "logps/rejected": -304.0430603027344, + "loss": 0.5789, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.22517380118370056, + "rewards/margins": 0.2765592336654663, + "rewards/margins_max": 0.9068194627761841, + "rewards/margins_min": -0.3106902241706848, + "rewards/margins_std": 0.5414040088653564, + "rewards/rejected": -0.501733124256134, + "step": 3170 + }, + { + "epoch": 0.76, + "grad_norm": 10.52179309256804, + "learning_rate": 8.176696778307269e-08, + "logits/chosen": -2.6210858821868896, + "logits/rejected": -2.586487293243408, + "logps/chosen": -322.38720703125, + "logps/rejected": -334.0662841796875, + "loss": 0.5626, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.23957297205924988, + "rewards/margins": 0.3708464503288269, + "rewards/margins_max": 0.9727503657341003, + "rewards/margins_min": -0.348827600479126, + "rewards/margins_std": 0.5810685753822327, + "rewards/rejected": -0.6104193925857544, + "step": 3180 + }, + { + "epoch": 0.76, + "grad_norm": 6.038743498002542, + "learning_rate": 8.022692934304238e-08, + "logits/chosen": -2.6480252742767334, + "logits/rejected": -2.568481683731079, + "logps/chosen": -321.59954833984375, + "logps/rejected": -311.34429931640625, + "loss": 0.5878, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.21423247456550598, + "rewards/margins": 0.3581593334674835, + "rewards/margins_max": 1.0015473365783691, + "rewards/margins_min": -0.23262456059455872, + "rewards/margins_std": 0.5628671646118164, + "rewards/rejected": -0.5723918080329895, + "step": 3190 + }, + { + "epoch": 0.77, + "grad_norm": 10.88907457915638, + "learning_rate": 7.869875548572588e-08, + "logits/chosen": -2.64384126663208, + "logits/rejected": -2.6285018920898438, + "logps/chosen": -272.13360595703125, + "logps/rejected": -285.25616455078125, + "loss": 0.5642, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17138831317424774, + "rewards/margins": 0.3504408299922943, + "rewards/margins_max": 0.9936060905456543, + "rewards/margins_min": -0.2875770926475525, + "rewards/margins_std": 0.5752379894256592, + "rewards/rejected": -0.5218292474746704, + "step": 3200 + }, + { + "epoch": 0.77, + "eval_logits/chosen": -2.6170332431793213, + "eval_logits/rejected": -2.586714029312134, + "eval_logps/chosen": -303.88458251953125, + "eval_logps/rejected": -316.1905822753906, + "eval_loss": 0.5936970114707947, + "eval_rewards/accuracies": 0.7009999752044678, + "eval_rewards/chosen": -0.1942928284406662, + "eval_rewards/margins": 0.30908045172691345, + "eval_rewards/margins_max": 1.1615225076675415, + "eval_rewards/margins_min": -0.5332077145576477, + "eval_rewards/margins_std": 0.5726361870765686, + "eval_rewards/rejected": -0.5033733248710632, + "eval_runtime": 860.0191, + "eval_samples_per_second": 4.651, + "eval_steps_per_second": 0.291, + "step": 3200 + }, + { + "epoch": 0.77, + "grad_norm": 11.373705444449904, + "learning_rate": 7.718255300747817e-08, + "logits/chosen": -2.60569429397583, + "logits/rejected": -2.5863037109375, + "logps/chosen": -293.9930419921875, + "logps/rejected": -334.334228515625, + "loss": 0.5621, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.1691075563430786, + "rewards/margins": 0.3850109577178955, + "rewards/margins_max": 1.034523606300354, + "rewards/margins_min": -0.20032206177711487, + "rewards/margins_std": 0.5684488415718079, + "rewards/rejected": -0.5541185140609741, + "step": 3210 + }, + { + "epoch": 0.77, + "grad_norm": 12.358749045858794, + "learning_rate": 7.567842786803502e-08, + "logits/chosen": -2.5796420574188232, + "logits/rejected": -2.5125057697296143, + "logps/chosen": -305.2764587402344, + "logps/rejected": -315.77783203125, + "loss": 0.5785, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21089377999305725, + "rewards/margins": 0.2600666880607605, + "rewards/margins_max": 0.8219484090805054, + "rewards/margins_min": -0.3330570161342621, + "rewards/margins_std": 0.5194389820098877, + "rewards/rejected": -0.4709605276584625, + "step": 3220 + }, + { + "epoch": 0.77, + "grad_norm": 11.42643961778415, + "learning_rate": 7.418648518310797e-08, + "logits/chosen": -2.6522135734558105, + "logits/rejected": -2.6124863624572754, + "logps/chosen": -300.091796875, + "logps/rejected": -266.65228271484375, + "loss": 0.5677, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.13778580725193024, + "rewards/margins": 0.3589250147342682, + "rewards/margins_max": 1.0125762224197388, + "rewards/margins_min": -0.22981949150562286, + "rewards/margins_std": 0.5771783590316772, + "rewards/rejected": -0.4967108368873596, + "step": 3230 + }, + { + "epoch": 0.78, + "grad_norm": 6.529714011316275, + "learning_rate": 7.270682921703853e-08, + "logits/chosen": -2.639592170715332, + "logits/rejected": -2.646275758743286, + "logps/chosen": -311.7178955078125, + "logps/rejected": -324.42608642578125, + "loss": 0.5831, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2275652438402176, + "rewards/margins": 0.2939935326576233, + "rewards/margins_max": 0.9402956962585449, + "rewards/margins_min": -0.37714487314224243, + "rewards/margins_std": 0.590320348739624, + "rewards/rejected": -0.5215587019920349, + "step": 3240 + }, + { + "epoch": 0.78, + "grad_norm": 8.797069703246049, + "learning_rate": 7.123956337551116e-08, + "logits/chosen": -2.6058413982391357, + "logits/rejected": -2.6359944343566895, + "logps/chosen": -296.3321838378906, + "logps/rejected": -293.48260498046875, + "loss": 0.5856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19313833117485046, + "rewards/margins": 0.36264750361442566, + "rewards/margins_max": 1.0930047035217285, + "rewards/margins_min": -0.3832333981990814, + "rewards/margins_std": 0.6633163690567017, + "rewards/rejected": -0.5557857751846313, + "step": 3250 + }, + { + "epoch": 0.78, + "grad_norm": 15.849059048709869, + "learning_rate": 6.978479019832725e-08, + "logits/chosen": -2.579946279525757, + "logits/rejected": -2.552095413208008, + "logps/chosen": -336.55401611328125, + "logps/rejected": -316.15594482421875, + "loss": 0.5497, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.14274556934833527, + "rewards/margins": 0.4389498233795166, + "rewards/margins_max": 1.0567692518234253, + "rewards/margins_min": -0.18148799240589142, + "rewards/margins_std": 0.5417019128799438, + "rewards/rejected": -0.5816953182220459, + "step": 3260 + }, + { + "epoch": 0.78, + "grad_norm": 5.846431015177925, + "learning_rate": 6.83426113522389e-08, + "logits/chosen": -2.5978214740753174, + "logits/rejected": -2.5879592895507812, + "logps/chosen": -305.3526916503906, + "logps/rejected": -297.043701171875, + "loss": 0.5654, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1424698531627655, + "rewards/margins": 0.2821696400642395, + "rewards/margins_max": 0.7785348892211914, + "rewards/margins_min": -0.21284432709217072, + "rewards/margins_std": 0.4451248049736023, + "rewards/rejected": -0.4246394634246826, + "step": 3270 + }, + { + "epoch": 0.79, + "grad_norm": 7.9976397847405485, + "learning_rate": 6.691312762384396e-08, + "logits/chosen": -2.6755032539367676, + "logits/rejected": -2.629774570465088, + "logps/chosen": -288.03515625, + "logps/rejected": -273.4284973144531, + "loss": 0.5783, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.23924663662910461, + "rewards/margins": 0.24604792892932892, + "rewards/margins_max": 0.8091939687728882, + "rewards/margins_min": -0.3419875204563141, + "rewards/margins_std": 0.5214563608169556, + "rewards/rejected": -0.4852946400642395, + "step": 3280 + }, + { + "epoch": 0.79, + "grad_norm": 8.37224977799963, + "learning_rate": 6.54964389125428e-08, + "logits/chosen": -2.586719036102295, + "logits/rejected": -2.581692934036255, + "logps/chosen": -288.68231201171875, + "logps/rejected": -328.0451965332031, + "loss": 0.5687, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2837051451206207, + "rewards/margins": 0.3233044743537903, + "rewards/margins_max": 0.906445324420929, + "rewards/margins_min": -0.27489590644836426, + "rewards/margins_std": 0.5275270342826843, + "rewards/rejected": -0.6070095896720886, + "step": 3290 + }, + { + "epoch": 0.79, + "grad_norm": 8.264026624313555, + "learning_rate": 6.409264422355642e-08, + "logits/chosen": -2.6691842079162598, + "logits/rejected": -2.666581630706787, + "logps/chosen": -325.159423828125, + "logps/rejected": -350.07513427734375, + "loss": 0.5767, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.17226542532444, + "rewards/margins": 0.4506758153438568, + "rewards/margins_max": 1.059866189956665, + "rewards/margins_min": -0.20353195071220398, + "rewards/margins_std": 0.5784635543823242, + "rewards/rejected": -0.6229413151741028, + "step": 3300 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -2.613069534301758, + "eval_logits/rejected": -2.5827982425689697, + "eval_logps/chosen": -308.21435546875, + "eval_logps/rejected": -321.5457763671875, + "eval_loss": 0.5913904905319214, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -0.23759020864963531, + "eval_rewards/margins": 0.3193349540233612, + "eval_rewards/margins_max": 1.1828374862670898, + "eval_rewards/margins_min": -0.532956063747406, + "eval_rewards/margins_std": 0.5823011994361877, + "eval_rewards/rejected": -0.5569252371788025, + "eval_runtime": 859.3972, + "eval_samples_per_second": 4.654, + "eval_steps_per_second": 0.291, + "step": 3300 + }, + { + "epoch": 0.79, + "grad_norm": 11.482562569568803, + "learning_rate": 6.27018416610078e-08, + "logits/chosen": -2.63213849067688, + "logits/rejected": -2.578720808029175, + "logps/chosen": -269.0001220703125, + "logps/rejected": -352.06964111328125, + "loss": 0.5617, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1835450977087021, + "rewards/margins": 0.31281334161758423, + "rewards/margins_max": 1.032710313796997, + "rewards/margins_min": -0.3121184706687927, + "rewards/margins_std": 0.6005935668945312, + "rewards/rejected": -0.4963584542274475, + "step": 3310 + }, + { + "epoch": 0.8, + "grad_norm": 2.8099366268464685, + "learning_rate": 6.132412842106572e-08, + "logits/chosen": -2.6385445594787598, + "logits/rejected": -2.595637798309326, + "logps/chosen": -330.910400390625, + "logps/rejected": -353.44952392578125, + "loss": 0.5931, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.27031436562538147, + "rewards/margins": 0.30159300565719604, + "rewards/margins_max": 0.9948034286499023, + "rewards/margins_min": -0.37592440843582153, + "rewards/margins_std": 0.6074448227882385, + "rewards/rejected": -0.5719074010848999, + "step": 3320 + }, + { + "epoch": 0.8, + "grad_norm": 12.653057684781732, + "learning_rate": 5.995960078515255e-08, + "logits/chosen": -2.640693187713623, + "logits/rejected": -2.5642504692077637, + "logps/chosen": -312.5872802734375, + "logps/rejected": -312.4304504394531, + "loss": 0.5964, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2662574052810669, + "rewards/margins": 0.29711824655532837, + "rewards/margins_max": 0.8822048306465149, + "rewards/margins_min": -0.3330439031124115, + "rewards/margins_std": 0.5500375628471375, + "rewards/rejected": -0.5633755922317505, + "step": 3330 + }, + { + "epoch": 0.8, + "grad_norm": 8.318222485446203, + "learning_rate": 5.860835411321494e-08, + "logits/chosen": -2.582233428955078, + "logits/rejected": -2.559011220932007, + "logps/chosen": -319.3325500488281, + "logps/rejected": -332.45709228515625, + "loss": 0.5777, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2152424156665802, + "rewards/margins": 0.3113642632961273, + "rewards/margins_max": 0.9375714063644409, + "rewards/margins_min": -0.35544976592063904, + "rewards/margins_std": 0.5669530630111694, + "rewards/rejected": -0.5266066789627075, + "step": 3340 + }, + { + "epoch": 0.8, + "grad_norm": 7.812987045365231, + "learning_rate": 5.7270482837060455e-08, + "logits/chosen": -2.6814351081848145, + "logits/rejected": -2.6515543460845947, + "logps/chosen": -319.64691162109375, + "logps/rejected": -295.6579895019531, + "loss": 0.5548, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1749572455883026, + "rewards/margins": 0.3691059947013855, + "rewards/margins_max": 1.0209461450576782, + "rewards/margins_min": -0.19284987449645996, + "rewards/margins_std": 0.5489664077758789, + "rewards/rejected": -0.5440632104873657, + "step": 3350 + }, + { + "epoch": 0.8, + "grad_norm": 10.508153020737115, + "learning_rate": 5.5946080453757425e-08, + "logits/chosen": -2.530405044555664, + "logits/rejected": -2.5279834270477295, + "logps/chosen": -310.46356201171875, + "logps/rejected": -314.398681640625, + "loss": 0.559, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08304639160633087, + "rewards/margins": 0.4107007086277008, + "rewards/margins_max": 1.032921314239502, + "rewards/margins_min": -0.22185806930065155, + "rewards/margins_std": 0.5521965026855469, + "rewards/rejected": -0.49374714493751526, + "step": 3360 + }, + { + "epoch": 0.81, + "grad_norm": 7.319287923562498, + "learning_rate": 5.4635239519101706e-08, + "logits/chosen": -2.588613986968994, + "logits/rejected": -2.61376690864563, + "logps/chosen": -299.3731994628906, + "logps/rejected": -347.53216552734375, + "loss": 0.6014, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22208890318870544, + "rewards/margins": 0.2745796740055084, + "rewards/margins_max": 0.9492992162704468, + "rewards/margins_min": -0.38629618287086487, + "rewards/margins_std": 0.5912618041038513, + "rewards/rejected": -0.49666857719421387, + "step": 3370 + }, + { + "epoch": 0.81, + "grad_norm": 7.002572812972141, + "learning_rate": 5.333805164114744e-08, + "logits/chosen": -2.588752269744873, + "logits/rejected": -2.5390632152557373, + "logps/chosen": -332.0850524902344, + "logps/rejected": -344.94659423828125, + "loss": 0.559, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20025333762168884, + "rewards/margins": 0.4261398911476135, + "rewards/margins_max": 0.9834170341491699, + "rewards/margins_min": -0.08735842257738113, + "rewards/margins_std": 0.4765813946723938, + "rewards/rejected": -0.6263931393623352, + "step": 3380 + }, + { + "epoch": 0.81, + "grad_norm": 9.862202911350407, + "learning_rate": 5.205460747380588e-08, + "logits/chosen": -2.6821560859680176, + "logits/rejected": -2.66601300239563, + "logps/chosen": -288.72625732421875, + "logps/rejected": -300.3806457519531, + "loss": 0.5808, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.215274840593338, + "rewards/margins": 0.3353163003921509, + "rewards/margins_max": 0.886604905128479, + "rewards/margins_min": -0.17076632380485535, + "rewards/margins_std": 0.4661891460418701, + "rewards/rejected": -0.5505911111831665, + "step": 3390 + }, + { + "epoch": 0.81, + "grad_norm": 4.144530789791445, + "learning_rate": 5.0784996710509785e-08, + "logits/chosen": -2.6226415634155273, + "logits/rejected": -2.6080586910247803, + "logps/chosen": -380.19482421875, + "logps/rejected": -385.8616943359375, + "loss": 0.5685, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2578801214694977, + "rewards/margins": 0.3303286135196686, + "rewards/margins_max": 0.9798241853713989, + "rewards/margins_min": -0.3123224377632141, + "rewards/margins_std": 0.5742116570472717, + "rewards/rejected": -0.5882086753845215, + "step": 3400 + }, + { + "epoch": 0.81, + "eval_logits/chosen": -2.6102540493011475, + "eval_logits/rejected": -2.5799925327301025, + "eval_logps/chosen": -306.9150085449219, + "eval_logps/rejected": -320.19580078125, + "eval_loss": 0.5913717150688171, + "eval_rewards/accuracies": 0.7045000195503235, + "eval_rewards/chosen": -0.22459664940834045, + "eval_rewards/margins": 0.3188289403915405, + "eval_rewards/margins_max": 1.185779333114624, + "eval_rewards/margins_min": -0.5379509329795837, + "eval_rewards/margins_std": 0.583387017250061, + "eval_rewards/rejected": -0.5434256196022034, + "eval_runtime": 859.4957, + "eval_samples_per_second": 4.654, + "eval_steps_per_second": 0.291, + "step": 3400 + }, + { + "epoch": 0.82, + "grad_norm": 8.964599074139267, + "learning_rate": 4.952930807794503e-08, + "logits/chosen": -2.6341657638549805, + "logits/rejected": -2.6094231605529785, + "logps/chosen": -285.22247314453125, + "logps/rejected": -332.9870300292969, + "loss": 0.5691, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.24228985607624054, + "rewards/margins": 0.3657795190811157, + "rewards/margins_max": 1.0035291910171509, + "rewards/margins_min": -0.26299047470092773, + "rewards/margins_std": 0.5673614144325256, + "rewards/rejected": -0.6080694198608398, + "step": 3410 + }, + { + "epoch": 0.82, + "grad_norm": 8.692036055130663, + "learning_rate": 4.828762932985009e-08, + "logits/chosen": -2.6468403339385986, + "logits/rejected": -2.5917208194732666, + "logps/chosen": -316.1946716308594, + "logps/rejected": -312.6452941894531, + "loss": 0.5511, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17702266573905945, + "rewards/margins": 0.4823419451713562, + "rewards/margins_max": 1.041947841644287, + "rewards/margins_min": -0.111463762819767, + "rewards/margins_std": 0.529529869556427, + "rewards/rejected": -0.6593645811080933, + "step": 3420 + }, + { + "epoch": 0.82, + "grad_norm": 11.989930696645063, + "learning_rate": 4.706004724088328e-08, + "logits/chosen": -2.5788004398345947, + "logits/rejected": -2.4974637031555176, + "logps/chosen": -332.91845703125, + "logps/rejected": -343.2325439453125, + "loss": 0.6338, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34716516733169556, + "rewards/margins": 0.15864428877830505, + "rewards/margins_max": 0.7591507434844971, + "rewards/margins_min": -0.35776883363723755, + "rewards/margins_std": 0.5207773447036743, + "rewards/rejected": -0.5058094263076782, + "step": 3430 + }, + { + "epoch": 0.82, + "grad_norm": 8.163596166406325, + "learning_rate": 4.584664760055881e-08, + "logits/chosen": -2.652082681655884, + "logits/rejected": -2.6363697052001953, + "logps/chosen": -257.03045654296875, + "logps/rejected": -270.32330322265625, + "loss": 0.5678, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1794271022081375, + "rewards/margins": 0.3605220317840576, + "rewards/margins_max": 0.9137634038925171, + "rewards/margins_min": -0.2497061789035797, + "rewards/margins_std": 0.5212998986244202, + "rewards/rejected": -0.5399491190910339, + "step": 3440 + }, + { + "epoch": 0.83, + "grad_norm": 10.712123412876426, + "learning_rate": 4.4647515207250934e-08, + "logits/chosen": -2.7117748260498047, + "logits/rejected": -2.671877861022949, + "logps/chosen": -323.82489013671875, + "logps/rejected": -321.583251953125, + "loss": 0.5762, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.23698492348194122, + "rewards/margins": 0.3007177710533142, + "rewards/margins_max": 0.8499841690063477, + "rewards/margins_min": -0.3255491554737091, + "rewards/margins_std": 0.5336715579032898, + "rewards/rejected": -0.5377026796340942, + "step": 3450 + }, + { + "epoch": 0.83, + "grad_norm": 5.333382347994325, + "learning_rate": 4.346273386226812e-08, + "logits/chosen": -2.595189094543457, + "logits/rejected": -2.606052875518799, + "logps/chosen": -323.14666748046875, + "logps/rejected": -320.80706787109375, + "loss": 0.6142, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19109676778316498, + "rewards/margins": 0.29731321334838867, + "rewards/margins_max": 0.9397839307785034, + "rewards/margins_min": -0.29314979910850525, + "rewards/margins_std": 0.5546901822090149, + "rewards/rejected": -0.48840999603271484, + "step": 3460 + }, + { + "epoch": 0.83, + "grad_norm": 7.404540045631026, + "learning_rate": 4.2292386363996484e-08, + "logits/chosen": -2.641002655029297, + "logits/rejected": -2.6082513332366943, + "logps/chosen": -332.7715759277344, + "logps/rejected": -327.356201171875, + "loss": 0.593, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.29126280546188354, + "rewards/margins": 0.3013150095939636, + "rewards/margins_max": 0.8860662579536438, + "rewards/margins_min": -0.383476197719574, + "rewards/margins_std": 0.5513616800308228, + "rewards/rejected": -0.5925778150558472, + "step": 3470 + }, + { + "epoch": 0.83, + "grad_norm": 7.847498267915795, + "learning_rate": 4.1136554502113676e-08, + "logits/chosen": -2.613243579864502, + "logits/rejected": -2.6318023204803467, + "logps/chosen": -298.5329284667969, + "logps/rejected": -346.72052001953125, + "loss": 0.5774, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.26258477568626404, + "rewards/margins": 0.27295345067977905, + "rewards/margins_max": 0.8896123170852661, + "rewards/margins_min": -0.32080337405204773, + "rewards/margins_std": 0.5299565196037292, + "rewards/rejected": -0.5355381965637207, + "step": 3480 + }, + { + "epoch": 0.84, + "grad_norm": 9.109331029455413, + "learning_rate": 3.999531905187256e-08, + "logits/chosen": -2.6425492763519287, + "logits/rejected": -2.606245756149292, + "logps/chosen": -326.7677307128906, + "logps/rejected": -343.9598693847656, + "loss": 0.584, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.20647890865802765, + "rewards/margins": 0.4059480130672455, + "rewards/margins_max": 0.9774398803710938, + "rewards/margins_min": -0.2279617339372635, + "rewards/margins_std": 0.5373811721801758, + "rewards/rejected": -0.6124268770217896, + "step": 3490 + }, + { + "epoch": 0.84, + "grad_norm": 7.824226063245459, + "learning_rate": 3.886875976845661e-08, + "logits/chosen": -2.724914789199829, + "logits/rejected": -2.6821529865264893, + "logps/chosen": -336.58465576171875, + "logps/rejected": -328.7319030761719, + "loss": 0.5687, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20324409008026123, + "rewards/margins": 0.336579293012619, + "rewards/margins_max": 0.8925960659980774, + "rewards/margins_min": -0.24249598383903503, + "rewards/margins_std": 0.5138999819755554, + "rewards/rejected": -0.5398234128952026, + "step": 3500 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -2.6082119941711426, + "eval_logits/rejected": -2.5778560638427734, + "eval_logps/chosen": -307.88323974609375, + "eval_logps/rejected": -321.4169006347656, + "eval_loss": 0.5908536911010742, + "eval_rewards/accuracies": 0.7045000195503235, + "eval_rewards/chosen": -0.23427951335906982, + "eval_rewards/margins": 0.32135695219039917, + "eval_rewards/margins_max": 1.1904600858688354, + "eval_rewards/margins_min": -0.5370433330535889, + "eval_rewards/margins_std": 0.5854602456092834, + "eval_rewards/rejected": -0.5556364059448242, + "eval_runtime": 859.4933, + "eval_samples_per_second": 4.654, + "eval_steps_per_second": 0.291, + "step": 3500 + }, + { + "epoch": 0.84, + "grad_norm": 15.226256851408955, + "learning_rate": 3.775695538140608e-08, + "logits/chosen": -2.615886688232422, + "logits/rejected": -2.5705220699310303, + "logps/chosen": -273.38330078125, + "logps/rejected": -266.79046630859375, + "loss": 0.5741, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2061402052640915, + "rewards/margins": 0.3421597182750702, + "rewards/margins_max": 1.0528227090835571, + "rewards/margins_min": -0.30337151885032654, + "rewards/margins_std": 0.6072283983230591, + "rewards/rejected": -0.5482999086380005, + "step": 3510 + }, + { + "epoch": 0.84, + "grad_norm": 7.619503353884903, + "learning_rate": 3.665998358911593e-08, + "logits/chosen": -2.643258571624756, + "logits/rejected": -2.574820041656494, + "logps/chosen": -271.6816711425781, + "logps/rejected": -307.208251953125, + "loss": 0.5823, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.200513556599617, + "rewards/margins": 0.3419400155544281, + "rewards/margins_max": 1.027790904045105, + "rewards/margins_min": -0.22767655551433563, + "rewards/margins_std": 0.560200572013855, + "rewards/rejected": -0.5424535870552063, + "step": 3520 + }, + { + "epoch": 0.85, + "grad_norm": 7.151962928488147, + "learning_rate": 3.557792105340621e-08, + "logits/chosen": -2.703946113586426, + "logits/rejected": -2.6648306846618652, + "logps/chosen": -308.2814025878906, + "logps/rejected": -313.46185302734375, + "loss": 0.6098, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2687258720397949, + "rewards/margins": 0.2833290696144104, + "rewards/margins_max": 0.9018853902816772, + "rewards/margins_min": -0.34607797861099243, + "rewards/margins_std": 0.5544721484184265, + "rewards/rejected": -0.5520548820495605, + "step": 3530 + }, + { + "epoch": 0.85, + "grad_norm": 8.264274501263046, + "learning_rate": 3.4510843394163966e-08, + "logits/chosen": -2.6094186305999756, + "logits/rejected": -2.6097640991210938, + "logps/chosen": -307.6642150878906, + "logps/rejected": -353.87811279296875, + "loss": 0.5702, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.14493492245674133, + "rewards/margins": 0.5252057313919067, + "rewards/margins_max": 1.1174616813659668, + "rewards/margins_min": -0.020700642839074135, + "rewards/margins_std": 0.5230407118797302, + "rewards/rejected": -0.6701406836509705, + "step": 3540 + }, + { + "epoch": 0.85, + "grad_norm": 15.733612739107912, + "learning_rate": 3.345882518405918e-08, + "logits/chosen": -2.622058868408203, + "logits/rejected": -2.653578996658325, + "logps/chosen": -253.8896026611328, + "logps/rejected": -303.8586120605469, + "loss": 0.5931, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.181403249502182, + "rewards/margins": 0.3778737485408783, + "rewards/margins_max": 1.0367891788482666, + "rewards/margins_min": -0.2821559011936188, + "rewards/margins_std": 0.5796774625778198, + "rewards/rejected": -0.5592769980430603, + "step": 3550 + }, + { + "epoch": 0.85, + "grad_norm": 6.2572531765682795, + "learning_rate": 3.242193994333278e-08, + "logits/chosen": -2.596344470977783, + "logits/rejected": -2.5577056407928467, + "logps/chosen": -283.0947265625, + "logps/rejected": -288.11810302734375, + "loss": 0.5641, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2708722651004791, + "rewards/margins": 0.27966800332069397, + "rewards/margins_max": 0.8920449018478394, + "rewards/margins_min": -0.3251974284648895, + "rewards/margins_std": 0.5486242175102234, + "rewards/rejected": -0.5505402684211731, + "step": 3560 + }, + { + "epoch": 0.85, + "grad_norm": 6.516126435010375, + "learning_rate": 3.14002601346591e-08, + "logits/chosen": -2.5676286220550537, + "logits/rejected": -2.6242175102233887, + "logps/chosen": -311.1514892578125, + "logps/rejected": -345.4110412597656, + "loss": 0.549, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17862629890441895, + "rewards/margins": 0.3790408670902252, + "rewards/margins_max": 0.9282873272895813, + "rewards/margins_min": -0.165249764919281, + "rewards/margins_std": 0.5055936574935913, + "rewards/rejected": -0.5576671361923218, + "step": 3570 + }, + { + "epoch": 0.86, + "grad_norm": 9.045146875912847, + "learning_rate": 3.039385715808121e-08, + "logits/chosen": -2.6263604164123535, + "logits/rejected": -2.5675251483917236, + "logps/chosen": -273.31085205078125, + "logps/rejected": -264.5960388183594, + "loss": 0.5653, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.18905775249004364, + "rewards/margins": 0.36532530188560486, + "rewards/margins_max": 0.9135506749153137, + "rewards/margins_min": -0.21762046217918396, + "rewards/margins_std": 0.5158858895301819, + "rewards/rejected": -0.5543830990791321, + "step": 3580 + }, + { + "epoch": 0.86, + "grad_norm": 3.8898146677855023, + "learning_rate": 2.9402801346021937e-08, + "logits/chosen": -2.6774134635925293, + "logits/rejected": -2.6019251346588135, + "logps/chosen": -353.1155090332031, + "logps/rejected": -326.0061950683594, + "loss": 0.579, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.24207577109336853, + "rewards/margins": 0.31224125623703003, + "rewards/margins_max": 0.984405517578125, + "rewards/margins_min": -0.32634276151657104, + "rewards/margins_std": 0.5798729658126831, + "rewards/rejected": -0.554317057132721, + "step": 3590 + }, + { + "epoch": 0.86, + "grad_norm": 12.174125022701196, + "learning_rate": 2.8427161958368002e-08, + "logits/chosen": -2.586181402206421, + "logits/rejected": -2.5371508598327637, + "logps/chosen": -314.48687744140625, + "logps/rejected": -307.24847412109375, + "loss": 0.5598, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18670877814292908, + "rewards/margins": 0.3874475359916687, + "rewards/margins_max": 0.9850362539291382, + "rewards/margins_min": -0.2439243048429489, + "rewards/margins_std": 0.5419309735298157, + "rewards/rejected": -0.5741563439369202, + "step": 3600 + }, + { + "epoch": 0.86, + "eval_logits/chosen": -2.608355760574341, + "eval_logits/rejected": -2.5780935287475586, + "eval_logps/chosen": -305.0804748535156, + "eval_logps/rejected": -317.9754333496094, + "eval_loss": 0.5924330353736877, + "eval_rewards/accuracies": 0.7059999704360962, + "eval_rewards/chosen": -0.20625153183937073, + "eval_rewards/margins": 0.31497010588645935, + "eval_rewards/margins_max": 1.1819028854370117, + "eval_rewards/margins_min": -0.539983868598938, + "eval_rewards/margins_std": 0.5816512703895569, + "eval_rewards/rejected": -0.5212216973304749, + "eval_runtime": 859.8177, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.291, + "step": 3600 + }, + { + "epoch": 0.86, + "grad_norm": 12.432739573628334, + "learning_rate": 2.7467007177630174e-08, + "logits/chosen": -2.6744627952575684, + "logits/rejected": -2.6617274284362793, + "logps/chosen": -342.64141845703125, + "logps/rejected": -362.93426513671875, + "loss": 0.5704, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2297605723142624, + "rewards/margins": 0.2869631350040436, + "rewards/margins_max": 0.9172990918159485, + "rewards/margins_min": -0.2644995450973511, + "rewards/margins_std": 0.5308641791343689, + "rewards/rejected": -0.5167237520217896, + "step": 3610 + }, + { + "epoch": 0.87, + "grad_norm": 11.55258760076882, + "learning_rate": 2.652240410417819e-08, + "logits/chosen": -2.6653380393981934, + "logits/rejected": -2.600297212600708, + "logps/chosen": -320.4395751953125, + "logps/rejected": -300.54669189453125, + "loss": 0.5932, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.20440959930419922, + "rewards/margins": 0.3110480308532715, + "rewards/margins_max": 0.8536907434463501, + "rewards/margins_min": -0.28580746054649353, + "rewards/margins_std": 0.5097913146018982, + "rewards/rejected": -0.5154576897621155, + "step": 3620 + }, + { + "epoch": 0.87, + "grad_norm": 8.302577930556795, + "learning_rate": 2.5593418751551437e-08, + "logits/chosen": -2.6613705158233643, + "logits/rejected": -2.6312716007232666, + "logps/chosen": -348.50787353515625, + "logps/rejected": -311.14068603515625, + "loss": 0.5467, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20068880915641785, + "rewards/margins": 0.41503697633743286, + "rewards/margins_max": 1.0137380361557007, + "rewards/margins_min": -0.1997760385274887, + "rewards/margins_std": 0.5454592108726501, + "rewards/rejected": -0.6157258152961731, + "step": 3630 + }, + { + "epoch": 0.87, + "grad_norm": 7.711523217369656, + "learning_rate": 2.4680116041845834e-08, + "logits/chosen": -2.5822627544403076, + "logits/rejected": -2.586742877960205, + "logps/chosen": -292.9588317871094, + "logps/rejected": -327.4941711425781, + "loss": 0.5653, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14639553427696228, + "rewards/margins": 0.4603632390499115, + "rewards/margins_max": 1.068746566772461, + "rewards/margins_min": -0.1641278713941574, + "rewards/margins_std": 0.5588363409042358, + "rewards/rejected": -0.6067588329315186, + "step": 3640 + }, + { + "epoch": 0.87, + "grad_norm": 11.034392257845468, + "learning_rate": 2.3782559801176354e-08, + "logits/chosen": -2.6122536659240723, + "logits/rejected": -2.5909423828125, + "logps/chosen": -321.4766845703125, + "logps/rejected": -365.0554504394531, + "loss": 0.5506, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1527622789144516, + "rewards/margins": 0.4362732470035553, + "rewards/margins_max": 0.9728630185127258, + "rewards/margins_min": -0.160151869058609, + "rewards/margins_std": 0.5050688982009888, + "rewards/rejected": -0.5890355110168457, + "step": 3650 + }, + { + "epoch": 0.88, + "grad_norm": 15.262643044541077, + "learning_rate": 2.290081275521688e-08, + "logits/chosen": -2.557404041290283, + "logits/rejected": -2.5633702278137207, + "logps/chosen": -280.54541015625, + "logps/rejected": -291.76019287109375, + "loss": 0.6138, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.29516711831092834, + "rewards/margins": 0.24436545372009277, + "rewards/margins_max": 0.9078601598739624, + "rewards/margins_min": -0.3921719193458557, + "rewards/margins_std": 0.5873770117759705, + "rewards/rejected": -0.5395325422286987, + "step": 3660 + }, + { + "epoch": 0.88, + "grad_norm": 9.267995701538196, + "learning_rate": 2.2034936524816388e-08, + "logits/chosen": -2.583019256591797, + "logits/rejected": -2.605175495147705, + "logps/chosen": -322.00555419921875, + "logps/rejected": -370.5304870605469, + "loss": 0.5985, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22519807517528534, + "rewards/margins": 0.2562308609485626, + "rewards/margins_max": 0.9576885104179382, + "rewards/margins_min": -0.2878001630306244, + "rewards/margins_std": 0.5579741597175598, + "rewards/rejected": -0.48142892122268677, + "step": 3670 + }, + { + "epoch": 0.88, + "grad_norm": 8.603721132974057, + "learning_rate": 2.118499162169285e-08, + "logits/chosen": -2.631331205368042, + "logits/rejected": -2.5858583450317383, + "logps/chosen": -382.18524169921875, + "logps/rejected": -342.15350341796875, + "loss": 0.5485, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19810251891613007, + "rewards/margins": 0.47410932183265686, + "rewards/margins_max": 1.0249054431915283, + "rewards/margins_min": -0.2214665710926056, + "rewards/margins_std": 0.5518790483474731, + "rewards/rejected": -0.6722118258476257, + "step": 3680 + }, + { + "epoch": 0.88, + "grad_norm": 3.7309544431859396, + "learning_rate": 2.035103744420408e-08, + "logits/chosen": -2.663212299346924, + "logits/rejected": -2.616042375564575, + "logps/chosen": -376.15008544921875, + "logps/rejected": -347.84271240234375, + "loss": 0.5919, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2270462065935135, + "rewards/margins": 0.3260009288787842, + "rewards/margins_max": 0.9850046038627625, + "rewards/margins_min": -0.3287786841392517, + "rewards/margins_std": 0.5911919474601746, + "rewards/rejected": -0.5530470609664917, + "step": 3690 + }, + { + "epoch": 0.89, + "grad_norm": 8.086879568460578, + "learning_rate": 1.953313227319689e-08, + "logits/chosen": -2.538327693939209, + "logits/rejected": -2.4988460540771484, + "logps/chosen": -330.48773193359375, + "logps/rejected": -315.3109130859375, + "loss": 0.5639, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.16558781266212463, + "rewards/margins": 0.31884345412254333, + "rewards/margins_max": 0.9515074491500854, + "rewards/margins_min": -0.43427133560180664, + "rewards/margins_std": 0.6087071299552917, + "rewards/rejected": -0.48443132638931274, + "step": 3700 + }, + { + "epoch": 0.89, + "eval_logits/chosen": -2.6055796146392822, + "eval_logits/rejected": -2.5751001834869385, + "eval_logps/chosen": -305.3578186035156, + "eval_logps/rejected": -318.4354248046875, + "eval_loss": 0.5920639634132385, + "eval_rewards/accuracies": 0.7055000066757202, + "eval_rewards/chosen": -0.2090248167514801, + "eval_rewards/margins": 0.31679674983024597, + "eval_rewards/margins_max": 1.1849194765090942, + "eval_rewards/margins_min": -0.5398852825164795, + "eval_rewards/margins_std": 0.5830652117729187, + "eval_rewards/rejected": -0.5258215665817261, + "eval_runtime": 859.9471, + "eval_samples_per_second": 4.651, + "eval_steps_per_second": 0.291, + "step": 3700 + }, + { + "epoch": 0.89, + "grad_norm": 11.463339046274596, + "learning_rate": 1.873133326793397e-08, + "logits/chosen": -2.6155009269714355, + "logits/rejected": -2.5915045738220215, + "logps/chosen": -303.8510437011719, + "logps/rejected": -318.6623229980469, + "loss": 0.5918, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.17637260258197784, + "rewards/margins": 0.3188071846961975, + "rewards/margins_max": 0.8904364705085754, + "rewards/margins_min": -0.32362252473831177, + "rewards/margins_std": 0.5524941086769104, + "rewards/rejected": -0.4951797425746918, + "step": 3710 + }, + { + "epoch": 0.89, + "grad_norm": 5.6325587031610835, + "learning_rate": 1.794569646209948e-08, + "logits/chosen": -2.5438828468322754, + "logits/rejected": -2.512012243270874, + "logps/chosen": -326.58074951171875, + "logps/rejected": -305.82073974609375, + "loss": 0.6245, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2585929334163666, + "rewards/margins": 0.22238990664482117, + "rewards/margins_max": 0.9835413694381714, + "rewards/margins_min": -0.5056635141372681, + "rewards/margins_std": 0.6667352914810181, + "rewards/rejected": -0.4809829294681549, + "step": 3720 + }, + { + "epoch": 0.89, + "grad_norm": 10.081384223474876, + "learning_rate": 1.7176276759883146e-08, + "logits/chosen": -2.5898776054382324, + "logits/rejected": -2.584012508392334, + "logps/chosen": -313.08905029296875, + "logps/rejected": -312.9795837402344, + "loss": 0.5606, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13006040453910828, + "rewards/margins": 0.4346505105495453, + "rewards/margins_max": 1.1141884326934814, + "rewards/margins_min": -0.14765407145023346, + "rewards/margins_std": 0.5635863542556763, + "rewards/rejected": -0.5647109150886536, + "step": 3730 + }, + { + "epoch": 0.9, + "grad_norm": 6.993420915130978, + "learning_rate": 1.642312793214293e-08, + "logits/chosen": -2.566262722015381, + "logits/rejected": -2.5170204639434814, + "logps/chosen": -285.74249267578125, + "logps/rejected": -342.0732116699219, + "loss": 0.5708, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.22177961468696594, + "rewards/margins": 0.4047152101993561, + "rewards/margins_max": 1.231454610824585, + "rewards/margins_min": -0.23452310264110565, + "rewards/margins_std": 0.6764256358146667, + "rewards/rejected": -0.6264947652816772, + "step": 3740 + }, + { + "epoch": 0.9, + "grad_norm": 5.971993888018905, + "learning_rate": 1.568630261264789e-08, + "logits/chosen": -2.6162006855010986, + "logits/rejected": -2.5812339782714844, + "logps/chosen": -292.12518310546875, + "logps/rejected": -277.68109130859375, + "loss": 0.5887, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1901755928993225, + "rewards/margins": 0.3246782124042511, + "rewards/margins_max": 0.9265223741531372, + "rewards/margins_min": -0.1909208595752716, + "rewards/margins_std": 0.5114433169364929, + "rewards/rejected": -0.514853835105896, + "step": 3750 + }, + { + "epoch": 0.9, + "grad_norm": 16.863431002491197, + "learning_rate": 1.49658522943992e-08, + "logits/chosen": -2.599583148956299, + "logits/rejected": -2.5607993602752686, + "logps/chosen": -247.8730926513672, + "logps/rejected": -300.6656494140625, + "loss": 0.5535, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.12827134132385254, + "rewards/margins": 0.36878079175949097, + "rewards/margins_max": 0.8943478465080261, + "rewards/margins_min": -0.16647151112556458, + "rewards/margins_std": 0.48803386092185974, + "rewards/rejected": -0.4970521926879883, + "step": 3760 + }, + { + "epoch": 0.9, + "grad_norm": 7.885285215571356, + "learning_rate": 1.4261827326032122e-08, + "logits/chosen": -2.640138626098633, + "logits/rejected": -2.595416784286499, + "logps/chosen": -318.6853942871094, + "logps/rejected": -314.74224853515625, + "loss": 0.5823, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2055031955242157, + "rewards/margins": 0.312764436006546, + "rewards/margins_max": 1.0183651447296143, + "rewards/margins_min": -0.35077372193336487, + "rewards/margins_std": 0.6288760900497437, + "rewards/rejected": -0.5182676315307617, + "step": 3770 + }, + { + "epoch": 0.91, + "grad_norm": 10.545925159775502, + "learning_rate": 1.3574276908296906e-08, + "logits/chosen": -2.5619964599609375, + "logits/rejected": -2.515324354171753, + "logps/chosen": -250.4527130126953, + "logps/rejected": -288.0833740234375, + "loss": 0.5821, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18306702375411987, + "rewards/margins": 0.3643852472305298, + "rewards/margins_max": 0.9473929405212402, + "rewards/margins_min": -0.2090524137020111, + "rewards/margins_std": 0.5079521536827087, + "rewards/rejected": -0.5474522709846497, + "step": 3780 + }, + { + "epoch": 0.91, + "grad_norm": 5.6495402461329345, + "learning_rate": 1.2903249090620849e-08, + "logits/chosen": -2.6735751628875732, + "logits/rejected": -2.598555564880371, + "logps/chosen": -355.37420654296875, + "logps/rejected": -332.27593994140625, + "loss": 0.5884, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.18271778523921967, + "rewards/margins": 0.3155055642127991, + "rewards/margins_max": 0.8619476556777954, + "rewards/margins_min": -0.32753393054008484, + "rewards/margins_std": 0.5435506105422974, + "rewards/rejected": -0.49822330474853516, + "step": 3790 + }, + { + "epoch": 0.91, + "grad_norm": 6.149064726385415, + "learning_rate": 1.2248790767750012e-08, + "logits/chosen": -2.5644943714141846, + "logits/rejected": -2.5778238773345947, + "logps/chosen": -247.0529022216797, + "logps/rejected": -301.94403076171875, + "loss": 0.5931, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2534622848033905, + "rewards/margins": 0.299783855676651, + "rewards/margins_max": 0.9191850423812866, + "rewards/margins_min": -0.3087500035762787, + "rewards/margins_std": 0.5637552738189697, + "rewards/rejected": -0.5532461404800415, + "step": 3800 + }, + { + "epoch": 0.91, + "eval_logits/chosen": -2.6080944538116455, + "eval_logits/rejected": -2.577848196029663, + "eval_logps/chosen": -304.3083801269531, + "eval_logps/rejected": -317.0423583984375, + "eval_loss": 0.5930164456367493, + "eval_rewards/accuracies": 0.7059999704360962, + "eval_rewards/chosen": -0.19853053987026215, + "eval_rewards/margins": 0.3133601248264313, + "eval_rewards/margins_max": 1.1789708137512207, + "eval_rewards/margins_min": -0.5399187803268433, + "eval_rewards/margins_std": 0.5801899433135986, + "eval_rewards/rejected": -0.5118906497955322, + "eval_runtime": 859.5908, + "eval_samples_per_second": 4.653, + "eval_steps_per_second": 0.291, + "step": 3800 + }, + { + "epoch": 0.91, + "grad_norm": 7.582877456320794, + "learning_rate": 1.1610947676472277e-08, + "logits/chosen": -2.621169090270996, + "logits/rejected": -2.5998873710632324, + "logps/chosen": -312.3177490234375, + "logps/rejected": -320.4533386230469, + "loss": 0.6041, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2622186541557312, + "rewards/margins": 0.3109205365180969, + "rewards/margins_max": 0.9574284553527832, + "rewards/margins_min": -0.3280250132083893, + "rewards/margins_std": 0.5780671834945679, + "rewards/rejected": -0.5731391906738281, + "step": 3810 + }, + { + "epoch": 0.91, + "grad_norm": 18.351199975133607, + "learning_rate": 1.0989764392420692e-08, + "logits/chosen": -2.6208126544952393, + "logits/rejected": -2.5716211795806885, + "logps/chosen": -336.3990173339844, + "logps/rejected": -344.66241455078125, + "loss": 0.5581, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.19919511675834656, + "rewards/margins": 0.34095874428749084, + "rewards/margins_max": 0.9613696336746216, + "rewards/margins_min": -0.26550573110580444, + "rewards/margins_std": 0.5569697022438049, + "rewards/rejected": -0.5401539206504822, + "step": 3820 + }, + { + "epoch": 0.92, + "grad_norm": 6.713050873669192, + "learning_rate": 1.0385284326958593e-08, + "logits/chosen": -2.6769707202911377, + "logits/rejected": -2.5937695503234863, + "logps/chosen": -335.0888671875, + "logps/rejected": -325.7705383300781, + "loss": 0.5864, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.12837204337120056, + "rewards/margins": 0.35219669342041016, + "rewards/margins_max": 1.0074574947357178, + "rewards/margins_min": -0.2811533510684967, + "rewards/margins_std": 0.5698063373565674, + "rewards/rejected": -0.4805687963962555, + "step": 3830 + }, + { + "epoch": 0.92, + "grad_norm": 5.734116218403002, + "learning_rate": 9.797549724145731e-09, + "logits/chosen": -2.7008821964263916, + "logits/rejected": -2.6449217796325684, + "logps/chosen": -350.021240234375, + "logps/rejected": -327.3391418457031, + "loss": 0.5596, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.15653374791145325, + "rewards/margins": 0.37177354097366333, + "rewards/margins_max": 1.0280953645706177, + "rewards/margins_min": -0.18916736543178558, + "rewards/margins_std": 0.5437596440315247, + "rewards/rejected": -0.528307318687439, + "step": 3840 + }, + { + "epoch": 0.92, + "grad_norm": 4.955004218260416, + "learning_rate": 9.226601657785993e-09, + "logits/chosen": -2.643749475479126, + "logits/rejected": -2.660839080810547, + "logps/chosen": -288.3301086425781, + "logps/rejected": -350.01458740234375, + "loss": 0.58, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2535906136035919, + "rewards/margins": 0.264422744512558, + "rewards/margins_max": 0.8971040844917297, + "rewards/margins_min": -0.45524635910987854, + "rewards/margins_std": 0.609712541103363, + "rewards/rejected": -0.5180133581161499, + "step": 3850 + }, + { + "epoch": 0.92, + "grad_norm": 8.30905699593047, + "learning_rate": 8.672480028556972e-09, + "logits/chosen": -2.4579296112060547, + "logits/rejected": -2.457097291946411, + "logps/chosen": -281.6809387207031, + "logps/rejected": -334.88482666015625, + "loss": 0.6003, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.15676701068878174, + "rewards/margins": 0.284981906414032, + "rewards/margins_max": 0.7929221391677856, + "rewards/margins_min": -0.18706540763378143, + "rewards/margins_std": 0.43972960114479065, + "rewards/rejected": -0.4417489171028137, + "step": 3860 + }, + { + "epoch": 0.93, + "grad_norm": 15.51133067775074, + "learning_rate": 8.13522356122151e-09, + "logits/chosen": -2.690196990966797, + "logits/rejected": -2.618220806121826, + "logps/chosen": -301.7216491699219, + "logps/rejected": -315.8422546386719, + "loss": 0.6058, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.19068580865859985, + "rewards/margins": 0.3160189986228943, + "rewards/margins_max": 0.9212614297866821, + "rewards/margins_min": -0.22967295348644257, + "rewards/margins_std": 0.5226109027862549, + "rewards/rejected": -0.5067048668861389, + "step": 3870 + }, + { + "epoch": 0.93, + "grad_norm": 14.699283629108745, + "learning_rate": 7.614869801921525e-09, + "logits/chosen": -2.632333278656006, + "logits/rejected": -2.5998964309692383, + "logps/chosen": -298.019775390625, + "logps/rejected": -307.9640808105469, + "loss": 0.5814, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22215929627418518, + "rewards/margins": 0.22724071145057678, + "rewards/margins_max": 0.8938091397285461, + "rewards/margins_min": -0.33744460344314575, + "rewards/margins_std": 0.5495746731758118, + "rewards/rejected": -0.44940000772476196, + "step": 3880 + }, + { + "epoch": 0.93, + "grad_norm": 13.135176137869319, + "learning_rate": 7.111455115553944e-09, + "logits/chosen": -2.604551315307617, + "logits/rejected": -2.5755152702331543, + "logps/chosen": -280.9961853027344, + "logps/rejected": -344.09588623046875, + "loss": 0.5858, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.1802806705236435, + "rewards/margins": 0.35020098090171814, + "rewards/margins_max": 1.0035594701766968, + "rewards/margins_min": -0.37194377183914185, + "rewards/margins_std": 0.6116623878479004, + "rewards/rejected": -0.5304816961288452, + "step": 3890 + }, + { + "epoch": 0.93, + "grad_norm": 12.509404421258807, + "learning_rate": 6.6250146832294296e-09, + "logits/chosen": -2.646723508834839, + "logits/rejected": -2.626894474029541, + "logps/chosen": -308.8915100097656, + "logps/rejected": -295.9562072753906, + "loss": 0.5542, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.19474686682224274, + "rewards/margins": 0.39745014905929565, + "rewards/margins_max": 0.9413665533065796, + "rewards/margins_min": -0.3064861595630646, + "rewards/margins_std": 0.5544202923774719, + "rewards/rejected": -0.592197060585022, + "step": 3900 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -2.606426239013672, + "eval_logits/rejected": -2.576036214828491, + "eval_logps/chosen": -304.34912109375, + "eval_logps/rejected": -317.132080078125, + "eval_loss": 0.5929449200630188, + "eval_rewards/accuracies": 0.7059999704360962, + "eval_rewards/chosen": -0.19893796741962433, + "eval_rewards/margins": 0.3138505220413208, + "eval_rewards/margins_max": 1.1806962490081787, + "eval_rewards/margins_min": -0.5397577285766602, + "eval_rewards/margins_std": 0.580825924873352, + "eval_rewards/rejected": -0.5127884745597839, + "eval_runtime": 860.1409, + "eval_samples_per_second": 4.65, + "eval_steps_per_second": 0.291, + "step": 3900 + }, + { + "epoch": 0.94, + "grad_norm": 9.576670002532317, + "learning_rate": 6.155582499813655e-09, + "logits/chosen": -2.6036484241485596, + "logits/rejected": -2.55094313621521, + "logps/chosen": -304.96661376953125, + "logps/rejected": -325.8829345703125, + "loss": 0.6165, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.24803559482097626, + "rewards/margins": 0.27249425649642944, + "rewards/margins_max": 0.9829356074333191, + "rewards/margins_min": -0.3355262279510498, + "rewards/margins_std": 0.5755178332328796, + "rewards/rejected": -0.5205298662185669, + "step": 3910 + }, + { + "epoch": 0.94, + "grad_norm": 5.535198634166414, + "learning_rate": 5.703191371551841e-09, + "logits/chosen": -2.6377830505371094, + "logits/rejected": -2.52290940284729, + "logps/chosen": -392.7371520996094, + "logps/rejected": -322.02008056640625, + "loss": 0.5467, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20820406079292297, + "rewards/margins": 0.36383289098739624, + "rewards/margins_max": 0.9824415445327759, + "rewards/margins_min": -0.23504504561424255, + "rewards/margins_std": 0.5402665138244629, + "rewards/rejected": -0.5720369219779968, + "step": 3920 + }, + { + "epoch": 0.94, + "grad_norm": 14.285955743111506, + "learning_rate": 5.267872913775756e-09, + "logits/chosen": -2.704752206802368, + "logits/rejected": -2.669785737991333, + "logps/chosen": -303.36041259765625, + "logps/rejected": -288.13775634765625, + "loss": 0.5707, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1602882742881775, + "rewards/margins": 0.34425032138824463, + "rewards/margins_max": 1.0268034934997559, + "rewards/margins_min": -0.32272079586982727, + "rewards/margins_std": 0.5999466776847839, + "rewards/rejected": -0.5045386552810669, + "step": 3930 + }, + { + "epoch": 0.94, + "grad_norm": 3.230351548565471, + "learning_rate": 4.8496575486943744e-09, + "logits/chosen": -2.663681745529175, + "logits/rejected": -2.5701091289520264, + "logps/chosen": -359.291015625, + "logps/rejected": -328.3930969238281, + "loss": 0.5593, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17103374004364014, + "rewards/margins": 0.47963079810142517, + "rewards/margins_max": 1.0605299472808838, + "rewards/margins_min": -0.1423892080783844, + "rewards/margins_std": 0.5544031262397766, + "rewards/rejected": -0.6506645679473877, + "step": 3940 + }, + { + "epoch": 0.95, + "grad_norm": 11.851201996824257, + "learning_rate": 4.448574503268076e-09, + "logits/chosen": -2.5257601737976074, + "logits/rejected": -2.5021889209747314, + "logps/chosen": -283.82159423828125, + "logps/rejected": -315.5705871582031, + "loss": 0.5681, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2183389663696289, + "rewards/margins": 0.3329612910747528, + "rewards/margins_max": 0.9345414042472839, + "rewards/margins_min": -0.2967822849750519, + "rewards/margins_std": 0.5389171838760376, + "rewards/rejected": -0.5513002276420593, + "step": 3950 + }, + { + "epoch": 0.95, + "grad_norm": 9.731317079639949, + "learning_rate": 4.064651807165781e-09, + "logits/chosen": -2.587010622024536, + "logits/rejected": -2.5509226322174072, + "logps/chosen": -276.9902648925781, + "logps/rejected": -278.53118896484375, + "loss": 0.535, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2354898452758789, + "rewards/margins": 0.45715102553367615, + "rewards/margins_max": 1.1073687076568604, + "rewards/margins_min": -0.18103381991386414, + "rewards/margins_std": 0.5672684907913208, + "rewards/rejected": -0.6926408410072327, + "step": 3960 + }, + { + "epoch": 0.95, + "grad_norm": 12.975565321409169, + "learning_rate": 3.697916290806291e-09, + "logits/chosen": -2.679515838623047, + "logits/rejected": -2.6130564212799072, + "logps/chosen": -306.14599609375, + "logps/rejected": -290.2887268066406, + "loss": 0.5486, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.13901828229427338, + "rewards/margins": 0.3719906806945801, + "rewards/margins_max": 0.9517256617546082, + "rewards/margins_min": -0.1434636414051056, + "rewards/margins_std": 0.4784785211086273, + "rewards/rejected": -0.5110089182853699, + "step": 3970 + }, + { + "epoch": 0.95, + "grad_norm": 11.34136483834659, + "learning_rate": 3.3483935834831e-09, + "logits/chosen": -2.615412950515747, + "logits/rejected": -2.565363883972168, + "logps/chosen": -314.16363525390625, + "logps/rejected": -324.04608154296875, + "loss": 0.543, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.18938302993774414, + "rewards/margins": 0.43559032678604126, + "rewards/margins_max": 1.0743236541748047, + "rewards/margins_min": -0.20624911785125732, + "rewards/margins_std": 0.5716836452484131, + "rewards/rejected": -0.6249733567237854, + "step": 3980 + }, + { + "epoch": 0.96, + "grad_norm": 9.152276228144013, + "learning_rate": 3.0161081115735456e-09, + "logits/chosen": -2.645146608352661, + "logits/rejected": -2.606229543685913, + "logps/chosen": -334.4048156738281, + "logps/rejected": -325.699462890625, + "loss": 0.5966, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.23924922943115234, + "rewards/margins": 0.23653730750083923, + "rewards/margins_max": 0.9346807599067688, + "rewards/margins_min": -0.3636035621166229, + "rewards/margins_std": 0.5894318222999573, + "rewards/rejected": -0.4757865369319916, + "step": 3990 + }, + { + "epoch": 0.96, + "grad_norm": 15.591206620302135, + "learning_rate": 2.7010830968314802e-09, + "logits/chosen": -2.6069769859313965, + "logits/rejected": -2.592172145843506, + "logps/chosen": -284.5611267089844, + "logps/rejected": -301.4425354003906, + "loss": 0.5713, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1448238044977188, + "rewards/margins": 0.37589389085769653, + "rewards/margins_max": 0.951191782951355, + "rewards/margins_min": -0.22190327942371368, + "rewards/margins_std": 0.5327543616294861, + "rewards/rejected": -0.5207176804542542, + "step": 4000 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -2.604844093322754, + "eval_logits/rejected": -2.574323892593384, + "eval_logps/chosen": -304.6741027832031, + "eval_logps/rejected": -317.602783203125, + "eval_loss": 0.5926074385643005, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -0.20218777656555176, + "eval_rewards/margins": 0.31530728936195374, + "eval_rewards/margins_max": 1.1830588579177856, + "eval_rewards/margins_min": -0.5406986474990845, + "eval_rewards/margins_std": 0.5823014974594116, + "eval_rewards/rejected": -0.5174950957298279, + "eval_runtime": 859.5521, + "eval_samples_per_second": 4.654, + "eval_steps_per_second": 0.291, + "step": 4000 + }, + { + "epoch": 0.96, + "grad_norm": 4.4781608457654665, + "learning_rate": 2.4033405547646545e-09, + "logits/chosen": -2.6150670051574707, + "logits/rejected": -2.588038921356201, + "logps/chosen": -267.67523193359375, + "logps/rejected": -355.4295349121094, + "loss": 0.558, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18685632944107056, + "rewards/margins": 0.41204652190208435, + "rewards/margins_max": 0.9812752604484558, + "rewards/margins_min": -0.22956518828868866, + "rewards/margins_std": 0.540183961391449, + "rewards/rejected": -0.5989028215408325, + "step": 4010 + }, + { + "epoch": 0.96, + "grad_norm": 8.487408455760916, + "learning_rate": 2.122901293095919e-09, + "logits/chosen": -2.6082186698913574, + "logits/rejected": -2.5535728931427, + "logps/chosen": -303.72967529296875, + "logps/rejected": -323.4769287109375, + "loss": 0.5645, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.11907126754522324, + "rewards/margins": 0.41876569390296936, + "rewards/margins_max": 1.0511444807052612, + "rewards/margins_min": -0.1879216879606247, + "rewards/margins_std": 0.567715585231781, + "rewards/rejected": -0.5378369092941284, + "step": 4020 + }, + { + "epoch": 0.97, + "grad_norm": 10.081986981087429, + "learning_rate": 1.8597849103094143e-09, + "logits/chosen": -2.6195461750030518, + "logits/rejected": -2.592846632003784, + "logps/chosen": -307.9976501464844, + "logps/rejected": -332.479248046875, + "loss": 0.6021, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20657601952552795, + "rewards/margins": 0.33928051590919495, + "rewards/margins_max": 0.9945265054702759, + "rewards/margins_min": -0.26213595271110535, + "rewards/margins_std": 0.5654140710830688, + "rewards/rejected": -0.5458565354347229, + "step": 4030 + }, + { + "epoch": 0.97, + "grad_norm": 10.391196970864554, + "learning_rate": 1.614009794280613e-09, + "logits/chosen": -2.6484408378601074, + "logits/rejected": -2.6035032272338867, + "logps/chosen": -323.63250732421875, + "logps/rejected": -338.87646484375, + "loss": 0.5795, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2887625992298126, + "rewards/margins": 0.3088279664516449, + "rewards/margins_max": 0.9695230722427368, + "rewards/margins_min": -0.4191213548183441, + "rewards/margins_std": 0.6257587671279907, + "rewards/rejected": -0.5975905656814575, + "step": 4040 + }, + { + "epoch": 0.97, + "grad_norm": 9.172514235203973, + "learning_rate": 1.3855931209914295e-09, + "logits/chosen": -2.646422863006592, + "logits/rejected": -2.6451754570007324, + "logps/chosen": -310.06585693359375, + "logps/rejected": -338.1797180175781, + "loss": 0.5912, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.26628363132476807, + "rewards/margins": 0.262619286775589, + "rewards/margins_max": 0.8607581257820129, + "rewards/margins_min": -0.3698212802410126, + "rewards/margins_std": 0.5509908199310303, + "rewards/rejected": -0.5289028882980347, + "step": 4050 + }, + { + "epoch": 0.97, + "grad_norm": 6.302490759256043, + "learning_rate": 1.1745508533298754e-09, + "logits/chosen": -2.632183313369751, + "logits/rejected": -2.570913076400757, + "logps/chosen": -308.2278137207031, + "logps/rejected": -297.3208923339844, + "loss": 0.564, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2079722136259079, + "rewards/margins": 0.38544580340385437, + "rewards/margins_max": 0.9723888635635376, + "rewards/margins_min": -0.12654462456703186, + "rewards/margins_std": 0.48897290229797363, + "rewards/rejected": -0.5934180021286011, + "step": 4060 + }, + { + "epoch": 0.97, + "grad_norm": 7.478122592260895, + "learning_rate": 9.808977399744511e-10, + "logits/chosen": -2.5542659759521484, + "logits/rejected": -2.5500786304473877, + "logps/chosen": -299.34912109375, + "logps/rejected": -301.2681884765625, + "loss": 0.5841, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18661324679851532, + "rewards/margins": 0.33656594157218933, + "rewards/margins_max": 0.9998302459716797, + "rewards/margins_min": -0.2652764916419983, + "rewards/margins_std": 0.5792658925056458, + "rewards/rejected": -0.5231791138648987, + "step": 4070 + }, + { + "epoch": 0.98, + "grad_norm": 4.014689176886764, + "learning_rate": 8.046473143635268e-10, + "logits/chosen": -2.576669692993164, + "logits/rejected": -2.5621371269226074, + "logps/chosen": -299.82330322265625, + "logps/rejected": -320.2934875488281, + "loss": 0.5964, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14786235988140106, + "rewards/margins": 0.3951955735683441, + "rewards/margins_max": 1.0109889507293701, + "rewards/margins_min": -0.2718932032585144, + "rewards/margins_std": 0.5676113963127136, + "rewards/rejected": -0.5430579781532288, + "step": 4080 + }, + { + "epoch": 0.98, + "grad_norm": 6.79237121796912, + "learning_rate": 6.458118937494317e-10, + "logits/chosen": -2.5543389320373535, + "logits/rejected": -2.5615315437316895, + "logps/chosen": -337.30828857421875, + "logps/rejected": -349.62884521484375, + "loss": 0.5622, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.17236635088920593, + "rewards/margins": 0.4054867625236511, + "rewards/margins_max": 0.9090726971626282, + "rewards/margins_min": -0.1325097382068634, + "rewards/margins_std": 0.4775218069553375, + "rewards/rejected": -0.5778530836105347, + "step": 4090 + }, + { + "epoch": 0.98, + "grad_norm": 6.85419882290456, + "learning_rate": 5.044025783377259e-10, + "logits/chosen": -2.650761127471924, + "logits/rejected": -2.633965253829956, + "logps/chosen": -330.33197021484375, + "logps/rejected": -337.37677001953125, + "loss": 0.5725, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.23919835686683655, + "rewards/margins": 0.40454989671707153, + "rewards/margins_max": 0.9539936780929565, + "rewards/margins_min": -0.12280567735433578, + "rewards/margins_std": 0.49979060888290405, + "rewards/rejected": -0.6437481641769409, + "step": 4100 + }, + { + "epoch": 0.98, + "eval_logits/chosen": -2.6056201457977295, + "eval_logits/rejected": -2.5751943588256836, + "eval_logps/chosen": -304.7070007324219, + "eval_logps/rejected": -317.59930419921875, + "eval_loss": 0.5925434827804565, + "eval_rewards/accuracies": 0.7059999704360962, + "eval_rewards/chosen": -0.20251673460006714, + "eval_rewards/margins": 0.3149436414241791, + "eval_rewards/margins_max": 1.183323860168457, + "eval_rewards/margins_min": -0.5414925217628479, + "eval_rewards/margins_std": 0.5823516845703125, + "eval_rewards/rejected": -0.5174604058265686, + "eval_runtime": 859.9542, + "eval_samples_per_second": 4.651, + "eval_steps_per_second": 0.291, + "step": 4100 + }, + { + "epoch": 0.98, + "grad_norm": 9.748854741055846, + "learning_rate": 3.8042925051148813e-10, + "logits/chosen": -2.5554616451263428, + "logits/rejected": -2.5370991230010986, + "logps/chosen": -324.4612731933594, + "logps/rejected": -315.3202209472656, + "loss": 0.5679, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.196882426738739, + "rewards/margins": 0.3231392204761505, + "rewards/margins_max": 0.942890465259552, + "rewards/margins_min": -0.21961653232574463, + "rewards/margins_std": 0.5246455669403076, + "rewards/rejected": -0.5200216174125671, + "step": 4110 + }, + { + "epoch": 0.99, + "grad_norm": 12.209770317697934, + "learning_rate": 2.7390057414064525e-10, + "logits/chosen": -2.6003313064575195, + "logits/rejected": -2.591627597808838, + "logps/chosen": -326.4041748046875, + "logps/rejected": -320.86260986328125, + "loss": 0.5498, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.19969312846660614, + "rewards/margins": 0.34613776206970215, + "rewards/margins_max": 0.9882827997207642, + "rewards/margins_min": -0.3541576564311981, + "rewards/margins_std": 0.5943008661270142, + "rewards/rejected": -0.5458309054374695, + "step": 4120 + }, + { + "epoch": 0.99, + "grad_norm": 6.440145893221407, + "learning_rate": 1.8482399397654057e-10, + "logits/chosen": -2.659862995147705, + "logits/rejected": -2.6385557651519775, + "logps/chosen": -321.1582336425781, + "logps/rejected": -349.50299072265625, + "loss": 0.5744, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.17049312591552734, + "rewards/margins": 0.303345263004303, + "rewards/margins_max": 0.8798073530197144, + "rewards/margins_min": -0.20885124802589417, + "rewards/margins_std": 0.4921353757381439, + "rewards/rejected": -0.4738383889198303, + "step": 4130 + }, + { + "epoch": 0.99, + "grad_norm": 10.2314091003016, + "learning_rate": 1.1320573513159959e-10, + "logits/chosen": -2.626286268234253, + "logits/rejected": -2.5777461528778076, + "logps/chosen": -287.6363220214844, + "logps/rejected": -287.4676818847656, + "loss": 0.5813, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2590753436088562, + "rewards/margins": 0.2685549557209015, + "rewards/margins_max": 0.9383190274238586, + "rewards/margins_min": -0.35475224256515503, + "rewards/margins_std": 0.588822066783905, + "rewards/rejected": -0.5276302099227905, + "step": 4140 + }, + { + "epoch": 0.99, + "grad_norm": 11.273979570517472, + "learning_rate": 5.905080264431705e-11, + "logits/chosen": -2.594353199005127, + "logits/rejected": -2.572741985321045, + "logps/chosen": -303.2122497558594, + "logps/rejected": -317.05902099609375, + "loss": 0.5635, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.20842579007148743, + "rewards/margins": 0.3852156698703766, + "rewards/margins_max": 0.9706063270568848, + "rewards/margins_min": -0.12086659669876099, + "rewards/margins_std": 0.4832797646522522, + "rewards/rejected": -0.593641459941864, + "step": 4150 + }, + { + "epoch": 1.0, + "grad_norm": 10.175839731117941, + "learning_rate": 2.2362981129508963e-11, + "logits/chosen": -2.6398215293884277, + "logits/rejected": -2.602846622467041, + "logps/chosen": -308.8743591308594, + "logps/rejected": -334.1455993652344, + "loss": 0.5567, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.1167726069688797, + "rewards/margins": 0.39473292231559753, + "rewards/margins_max": 1.0183244943618774, + "rewards/margins_min": -0.24444147944450378, + "rewards/margins_std": 0.5833578705787659, + "rewards/rejected": -0.5115054845809937, + "step": 4160 + }, + { + "epoch": 1.0, + "grad_norm": 11.274093259903779, + "learning_rate": 3.144834513746364e-12, + "logits/chosen": -2.6418585777282715, + "logits/rejected": -2.634779214859009, + "logps/chosen": -334.3486633300781, + "logps/rejected": -331.58697509765625, + "loss": 0.5539, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.12420052289962769, + "rewards/margins": 0.4431988298892975, + "rewards/margins_max": 1.0661565065383911, + "rewards/margins_min": -0.11877351999282837, + "rewards/margins_std": 0.5281156301498413, + "rewards/rejected": -0.5673993229866028, + "step": 4170 + }, + { + "epoch": 1.0, + "step": 4176, + "total_flos": 0.0, + "train_loss": 0.6106676421631342, + "train_runtime": 67977.0297, + "train_samples_per_second": 0.983, + "train_steps_per_second": 0.061 + } + ], + "logging_steps": 10, + "max_steps": 4176, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}