{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 4176, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.8733329991965941, "learning_rate": 1.1961722488038277e-09, "logits/chosen": -2.8505566120147705, "logits/rejected": -2.908921003341675, "logps/chosen": -429.770751953125, "logps/rejected": -264.9197998046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 1.817759545013798, "learning_rate": 1.1961722488038278e-08, "logits/chosen": -2.7373788356781006, "logits/rejected": -2.7256851196289062, "logps/chosen": -308.5910339355469, "logps/rejected": -256.5116271972656, "loss": 0.6931, "rewards/accuracies": 0.0694444477558136, "rewards/chosen": -8.499662362737581e-05, "rewards/margins": -6.767747981939465e-05, "rewards/margins_max": 0.0005438412772491574, "rewards/margins_min": -0.0006299633532762527, "rewards/margins_std": 0.0005042126285843551, "rewards/rejected": -1.731912743707653e-05, "step": 10 }, { "epoch": 0.0, "grad_norm": 1.6702737002599026, "learning_rate": 2.3923444976076555e-08, "logits/chosen": -2.7464852333068848, "logits/rejected": -2.726733922958374, "logps/chosen": -240.0852813720703, "logps/rejected": -258.0418701171875, "loss": 0.6932, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00010407304216641933, "rewards/margins": 0.00011324265506118536, "rewards/margins_max": 0.003207577858120203, "rewards/margins_min": -0.0033295839093625546, "rewards/margins_std": 0.0029281422030180693, "rewards/rejected": -9.169587428914383e-06, "step": 20 }, { "epoch": 0.01, "grad_norm": 2.107174099558945, "learning_rate": 3.588516746411483e-08, "logits/chosen": -2.8826613426208496, "logits/rejected": -2.850792407989502, "logps/chosen": -340.63238525390625, "logps/rejected": -264.9729919433594, "loss": 0.6933, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0002944297739304602, "rewards/margins": 0.0002557325060479343, "rewards/margins_max": 0.003186721820384264, "rewards/margins_min": -0.0027455384843051434, "rewards/margins_std": 0.002718889620155096, "rewards/rejected": 3.86972569685895e-05, "step": 30 }, { "epoch": 0.01, "grad_norm": 2.304921801020499, "learning_rate": 4.784688995215311e-08, "logits/chosen": -2.7977702617645264, "logits/rejected": -2.766904354095459, "logps/chosen": -264.3175354003906, "logps/rejected": -238.17086791992188, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.00032830884447321296, "rewards/margins": 0.0003703173715621233, "rewards/margins_max": 0.0032206419855356216, "rewards/margins_min": -0.0021116649731993675, "rewards/margins_std": 0.00244266539812088, "rewards/rejected": -4.200851981295273e-05, "step": 40 }, { "epoch": 0.01, "grad_norm": 1.7720520388580636, "learning_rate": 5.980861244019139e-08, "logits/chosen": -2.871934413909912, "logits/rejected": -2.8557310104370117, "logps/chosen": -328.1521911621094, "logps/rejected": -322.0428771972656, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00033686935785226524, "rewards/margins": -0.000777339213527739, "rewards/margins_max": 0.0024222906213253736, "rewards/margins_min": -0.00480139022693038, "rewards/margins_std": 0.003326979000121355, "rewards/rejected": 0.0004404698556754738, "step": 50 }, { "epoch": 0.01, "grad_norm": 1.6557868453886389, "learning_rate": 7.177033492822967e-08, "logits/chosen": -2.84224009513855, "logits/rejected": -2.7694106101989746, "logps/chosen": -306.7173767089844, "logps/rejected": -259.01873779296875, "loss": 0.693, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 8.184978651115671e-05, "rewards/margins": 0.00029427764820866287, "rewards/margins_max": 0.003731258912011981, "rewards/margins_min": -0.003034669905900955, "rewards/margins_std": 0.003077024593949318, "rewards/rejected": -0.00021242785442154855, "step": 60 }, { "epoch": 0.02, "grad_norm": 2.9243532166685755, "learning_rate": 8.373205741626794e-08, "logits/chosen": -2.7519397735595703, "logits/rejected": -2.7474653720855713, "logps/chosen": -288.6518859863281, "logps/rejected": -253.1888885498047, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00029823233489878476, "rewards/margins": 0.0004616590158548206, "rewards/margins_max": 0.003543038619682193, "rewards/margins_min": -0.0025465849321335554, "rewards/margins_std": 0.0027478071860969067, "rewards/rejected": -0.000163426753715612, "step": 70 }, { "epoch": 0.02, "grad_norm": 2.6783819549394763, "learning_rate": 9.569377990430622e-08, "logits/chosen": -2.7066688537597656, "logits/rejected": -2.737964630126953, "logps/chosen": -233.67822265625, "logps/rejected": -252.62179565429688, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.176045200030785e-05, "rewards/margins": 0.00011192444071639329, "rewards/margins_max": 0.003285625483840704, "rewards/margins_min": -0.003621011506766081, "rewards/margins_std": 0.0030982145108282566, "rewards/rejected": -0.00013368490908760577, "step": 80 }, { "epoch": 0.02, "grad_norm": 1.9682895182428097, "learning_rate": 1.076555023923445e-07, "logits/chosen": -2.8225607872009277, "logits/rejected": -2.791503429412842, "logps/chosen": -283.03143310546875, "logps/rejected": -248.53964233398438, "loss": 0.6931, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 8.696295117260888e-05, "rewards/margins": -0.0002034438803093508, "rewards/margins_max": 0.002528123091906309, "rewards/margins_min": -0.0029209128115326166, "rewards/margins_std": 0.0023842283990234137, "rewards/rejected": 0.00029040680965408683, "step": 90 }, { "epoch": 0.02, "grad_norm": 1.610086968720784, "learning_rate": 1.1961722488038278e-07, "logits/chosen": -2.8054909706115723, "logits/rejected": -2.797973871231079, "logps/chosen": -300.9483642578125, "logps/rejected": -310.73065185546875, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00017281100735999644, "rewards/margins": 0.00018569377425592393, "rewards/margins_max": 0.003404767718166113, "rewards/margins_min": -0.0027805559802800417, "rewards/margins_std": 0.002794269472360611, "rewards/rejected": -1.2882717783213593e-05, "step": 100 }, { "epoch": 0.02, "eval_logits/chosen": -2.803143262863159, "eval_logits/rejected": -2.7681620121002197, "eval_logps/chosen": -284.4388427734375, "eval_logps/rejected": -265.8543395996094, "eval_loss": 0.6930259466171265, "eval_rewards/accuracies": 0.5130000114440918, "eval_rewards/chosen": 0.0001645983284106478, "eval_rewards/margins": 0.00017520197434350848, "eval_rewards/margins_max": 0.004733717534691095, "eval_rewards/margins_min": -0.004303331486880779, "eval_rewards/margins_std": 0.002950224094092846, "eval_rewards/rejected": -1.0603625014482532e-05, "eval_runtime": 859.6856, "eval_samples_per_second": 4.653, "eval_steps_per_second": 0.291, "step": 100 }, { "epoch": 0.03, "grad_norm": 1.7824351734716144, "learning_rate": 1.3157894736842104e-07, "logits/chosen": -2.814023017883301, "logits/rejected": -2.778928756713867, "logps/chosen": -274.37091064453125, "logps/rejected": -255.2414093017578, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005407524295151234, "rewards/margins": 0.0002355809265282005, "rewards/margins_max": 0.004252096172422171, "rewards/margins_min": -0.003096726257354021, "rewards/margins_std": 0.0032623987644910812, "rewards/rejected": 0.00030517150298692286, "step": 110 }, { "epoch": 0.03, "grad_norm": 1.6289178129741724, "learning_rate": 1.4354066985645933e-07, "logits/chosen": -2.7997655868530273, "logits/rejected": -2.7307863235473633, "logps/chosen": -269.5855712890625, "logps/rejected": -221.72903442382812, "loss": 0.6929, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0008956709643825889, "rewards/margins": 0.0006299163214862347, "rewards/margins_max": 0.0045247129164636135, "rewards/margins_min": -0.003170366631820798, "rewards/margins_std": 0.003422073321416974, "rewards/rejected": 0.0002657547011040151, "step": 120 }, { "epoch": 0.03, "grad_norm": 2.12410380595078, "learning_rate": 1.555023923444976e-07, "logits/chosen": -2.8540079593658447, "logits/rejected": -2.8056535720825195, "logps/chosen": -318.8978271484375, "logps/rejected": -284.8515625, "loss": 0.6927, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0003819824196398258, "rewards/margins": 0.0004542602109722793, "rewards/margins_max": 0.0040441155433654785, "rewards/margins_min": -0.0029048118740320206, "rewards/margins_std": 0.0030683670192956924, "rewards/rejected": -7.227776950458065e-05, "step": 130 }, { "epoch": 0.03, "grad_norm": 2.1905846781155547, "learning_rate": 1.6746411483253589e-07, "logits/chosen": -2.8298187255859375, "logits/rejected": -2.8176026344299316, "logps/chosen": -288.49444580078125, "logps/rejected": -253.2882080078125, "loss": 0.693, "rewards/accuracies": 0.625, "rewards/chosen": 0.0008138801786117256, "rewards/margins": 0.00031006510835140944, "rewards/margins_max": 0.0037383928429335356, "rewards/margins_min": -0.0036947287153452635, "rewards/margins_std": 0.0032638120464980602, "rewards/rejected": 0.0005038150702603161, "step": 140 }, { "epoch": 0.04, "grad_norm": 1.894787831972788, "learning_rate": 1.7942583732057415e-07, "logits/chosen": -2.903256416320801, "logits/rejected": -2.8293240070343018, "logps/chosen": -322.9395446777344, "logps/rejected": -311.4129943847656, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.0002911566407419741, "rewards/margins": 7.383768388535827e-05, "rewards/margins_max": 0.004016853868961334, "rewards/margins_min": -0.003995803650468588, "rewards/margins_std": 0.0034933306742459536, "rewards/rejected": 0.00021731902961619198, "step": 150 }, { "epoch": 0.04, "grad_norm": 2.1224104867128992, "learning_rate": 1.9138755980861244e-07, "logits/chosen": -2.8328311443328857, "logits/rejected": -2.845745086669922, "logps/chosen": -257.3394470214844, "logps/rejected": -248.88912963867188, "loss": 0.6925, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0010881477501243353, "rewards/margins": 0.0008134182426147163, "rewards/margins_max": 0.004476240370422602, "rewards/margins_min": -0.0026891534216701984, "rewards/margins_std": 0.003181836334988475, "rewards/rejected": 0.000274729507509619, "step": 160 }, { "epoch": 0.04, "grad_norm": 2.137764237339619, "learning_rate": 2.033492822966507e-07, "logits/chosen": -2.781951904296875, "logits/rejected": -2.7471015453338623, "logps/chosen": -297.91473388671875, "logps/rejected": -237.72140502929688, "loss": 0.6925, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0007869511609897017, "rewards/margins": 0.0010647265007719398, "rewards/margins_max": 0.005684514995664358, "rewards/margins_min": -0.003047212492674589, "rewards/margins_std": 0.0038775629363954067, "rewards/rejected": -0.0002777752815745771, "step": 170 }, { "epoch": 0.04, "grad_norm": 2.366740571690379, "learning_rate": 2.15311004784689e-07, "logits/chosen": -2.8212523460388184, "logits/rejected": -2.798943281173706, "logps/chosen": -305.50994873046875, "logps/rejected": -295.97900390625, "loss": 0.6923, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0017296562436968088, "rewards/margins": 0.0019211728358641267, "rewards/margins_max": 0.005665643606334925, "rewards/margins_min": -0.002306095790117979, "rewards/margins_std": 0.003562621073797345, "rewards/rejected": -0.00019151663582306355, "step": 180 }, { "epoch": 0.05, "grad_norm": 1.4712758361311031, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.8340067863464355, "logits/rejected": -2.816643238067627, "logps/chosen": -222.89529418945312, "logps/rejected": -184.3183135986328, "loss": 0.6924, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0008180967415682971, "rewards/margins": 0.001361005357466638, "rewards/margins_max": 0.00569057185202837, "rewards/margins_min": -0.002570072654634714, "rewards/margins_std": 0.0036501861177384853, "rewards/rejected": -0.0005429086741060019, "step": 190 }, { "epoch": 0.05, "grad_norm": 4.296892533700022, "learning_rate": 2.3923444976076555e-07, "logits/chosen": -2.7995080947875977, "logits/rejected": -2.7616238594055176, "logps/chosen": -262.59808349609375, "logps/rejected": -226.07138061523438, "loss": 0.692, "rewards/accuracies": 0.625, "rewards/chosen": 0.0016514122253283858, "rewards/margins": 0.0019487269455567002, "rewards/margins_max": 0.0072821988724172115, "rewards/margins_min": -0.0027829702012240887, "rewards/margins_std": 0.004508022218942642, "rewards/rejected": -0.0002973148657474667, "step": 200 }, { "epoch": 0.05, "eval_logits/chosen": -2.8017399311065674, "eval_logits/rejected": -2.7667651176452637, "eval_logps/chosen": -284.2892150878906, "eval_logps/rejected": -265.8525085449219, "eval_loss": 0.6923297047615051, "eval_rewards/accuracies": 0.621999979019165, "eval_rewards/chosen": 0.0016612681793048978, "eval_rewards/margins": 0.0016538287745788693, "eval_rewards/margins_max": 0.00986100360751152, "eval_rewards/margins_min": -0.005685736425220966, "eval_rewards/margins_std": 0.005118357948958874, "eval_rewards/rejected": 7.439658020302886e-06, "eval_runtime": 859.1555, "eval_samples_per_second": 4.656, "eval_steps_per_second": 0.291, "step": 200 }, { "epoch": 0.05, "grad_norm": 1.9734678506216243, "learning_rate": 2.511961722488038e-07, "logits/chosen": -2.841190814971924, "logits/rejected": -2.795135498046875, "logps/chosen": -285.4710388183594, "logps/rejected": -251.4048309326172, "loss": 0.6923, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0018201196799054742, "rewards/margins": 0.001705428003333509, "rewards/margins_max": 0.007224083878099918, "rewards/margins_min": -0.0035212773364037275, "rewards/margins_std": 0.0048028877936303616, "rewards/rejected": 0.0001146918730228208, "step": 210 }, { "epoch": 0.05, "grad_norm": 1.460897516529647, "learning_rate": 2.631578947368421e-07, "logits/chosen": -2.852431058883667, "logits/rejected": -2.8073434829711914, "logps/chosen": -257.1728820800781, "logps/rejected": -236.8297882080078, "loss": 0.6921, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0019468723330646753, "rewards/margins": 0.0018834697548300028, "rewards/margins_max": 0.008035682141780853, "rewards/margins_min": -0.0036481625866144896, "rewards/margins_std": 0.0051698703318834305, "rewards/rejected": 6.34025564067997e-05, "step": 220 }, { "epoch": 0.06, "grad_norm": 1.8694177181014378, "learning_rate": 2.7511961722488034e-07, "logits/chosen": -2.8080573081970215, "logits/rejected": -2.790807008743286, "logps/chosen": -275.8009033203125, "logps/rejected": -252.9151611328125, "loss": 0.6917, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.003304890124127269, "rewards/margins": 0.0033024363219738007, "rewards/margins_max": 0.011663327924907207, "rewards/margins_min": -0.0038159037940204144, "rewards/margins_std": 0.0068631889298558235, "rewards/rejected": 2.4540815957152518e-06, "step": 230 }, { "epoch": 0.06, "grad_norm": 1.854606733355213, "learning_rate": 2.8708133971291866e-07, "logits/chosen": -2.858386278152466, "logits/rejected": -2.805567741394043, "logps/chosen": -255.62924194335938, "logps/rejected": -235.61962890625, "loss": 0.6915, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.002285485388711095, "rewards/margins": 0.0019609429873526096, "rewards/margins_max": 0.008661621250212193, "rewards/margins_min": -0.003229865338653326, "rewards/margins_std": 0.005345079582184553, "rewards/rejected": 0.0003245424304623157, "step": 240 }, { "epoch": 0.06, "grad_norm": 1.817379860770541, "learning_rate": 2.990430622009569e-07, "logits/chosen": -2.746481418609619, "logits/rejected": -2.730722188949585, "logps/chosen": -281.7100830078125, "logps/rejected": -290.0663146972656, "loss": 0.6919, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00281380582600832, "rewards/margins": 0.0020484558772295713, "rewards/margins_max": 0.010411800816655159, "rewards/margins_min": -0.006185551173985004, "rewards/margins_std": 0.0072416625916957855, "rewards/rejected": 0.0007653498323634267, "step": 250 }, { "epoch": 0.06, "grad_norm": 2.0023821710598284, "learning_rate": 3.110047846889952e-07, "logits/chosen": -2.7706775665283203, "logits/rejected": -2.822251319885254, "logps/chosen": -257.27923583984375, "logps/rejected": -275.00030517578125, "loss": 0.691, "rewards/accuracies": 0.6875, "rewards/chosen": 0.004468593746423721, "rewards/margins": 0.005076944828033447, "rewards/margins_max": 0.014329612255096436, "rewards/margins_min": -0.0037601019721478224, "rewards/margins_std": 0.007890553213655949, "rewards/rejected": -0.0006083514308556914, "step": 260 }, { "epoch": 0.06, "grad_norm": 1.7860772159155853, "learning_rate": 3.229665071770335e-07, "logits/chosen": -2.891803741455078, "logits/rejected": -2.8223443031311035, "logps/chosen": -323.32525634765625, "logps/rejected": -235.1826171875, "loss": 0.6912, "rewards/accuracies": 0.6875, "rewards/chosen": 0.004706279374659061, "rewards/margins": 0.0037411705125123262, "rewards/margins_max": 0.014509765431284904, "rewards/margins_min": -0.0071106404066085815, "rewards/margins_std": 0.009602605365216732, "rewards/rejected": 0.0009651094442233443, "step": 270 }, { "epoch": 0.07, "grad_norm": 1.5718271722210033, "learning_rate": 3.3492822966507177e-07, "logits/chosen": -2.8329367637634277, "logits/rejected": -2.8499460220336914, "logps/chosen": -253.4702606201172, "logps/rejected": -246.0491943359375, "loss": 0.6917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004767083562910557, "rewards/margins": 0.003300449578091502, "rewards/margins_max": 0.012085122987627983, "rewards/margins_min": -0.00477250013500452, "rewards/margins_std": 0.0076704369857907295, "rewards/rejected": 0.0014666334027424455, "step": 280 }, { "epoch": 0.07, "grad_norm": 1.7431756882052798, "learning_rate": 3.4688995215311004e-07, "logits/chosen": -2.779371738433838, "logits/rejected": -2.757108211517334, "logps/chosen": -265.29156494140625, "logps/rejected": -221.9234619140625, "loss": 0.6905, "rewards/accuracies": 0.6875, "rewards/chosen": 0.004595404490828514, "rewards/margins": 0.004501349292695522, "rewards/margins_max": 0.015392111614346504, "rewards/margins_min": -0.00713689997792244, "rewards/margins_std": 0.010018928907811642, "rewards/rejected": 9.405486343894154e-05, "step": 290 }, { "epoch": 0.07, "grad_norm": 1.8152609218809446, "learning_rate": 3.588516746411483e-07, "logits/chosen": -2.8642630577087402, "logits/rejected": -2.8560938835144043, "logps/chosen": -250.71426391601562, "logps/rejected": -240.01803588867188, "loss": 0.6903, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005799327045679092, "rewards/margins": 0.0027367686852812767, "rewards/margins_max": 0.013944950886070728, "rewards/margins_min": -0.009182724170386791, "rewards/margins_std": 0.010260081849992275, "rewards/rejected": 0.0030625583603978157, "step": 300 }, { "epoch": 0.07, "eval_logits/chosen": -2.7978017330169678, "eval_logits/rejected": -2.7626500129699707, "eval_logps/chosen": -283.78564453125, "eval_logps/rejected": -265.6622619628906, "eval_loss": 0.6908154487609863, "eval_rewards/accuracies": 0.6520000100135803, "eval_rewards/chosen": 0.006696476601064205, "eval_rewards/margins": 0.004786263220012188, "eval_rewards/margins_max": 0.02526562102138996, "eval_rewards/margins_min": -0.012469511479139328, "eval_rewards/margins_std": 0.012487462721765041, "eval_rewards/rejected": 0.0019102133810520172, "eval_runtime": 860.3161, "eval_samples_per_second": 4.649, "eval_steps_per_second": 0.291, "step": 300 }, { "epoch": 0.07, "grad_norm": 1.5859069821282503, "learning_rate": 3.7081339712918656e-07, "logits/chosen": -2.8701109886169434, "logits/rejected": -2.80281400680542, "logps/chosen": -256.1575622558594, "logps/rejected": -198.39547729492188, "loss": 0.6905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.005720221903175116, "rewards/margins": 0.005076803732663393, "rewards/margins_max": 0.017741765826940536, "rewards/margins_min": -0.005554481875151396, "rewards/margins_std": 0.010260584764182568, "rewards/rejected": 0.000643418519757688, "step": 310 }, { "epoch": 0.08, "grad_norm": 2.5478720556321774, "learning_rate": 3.827751196172249e-07, "logits/chosen": -2.8780677318573, "logits/rejected": -2.8603920936584473, "logps/chosen": -275.4996643066406, "logps/rejected": -353.23223876953125, "loss": 0.6898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006795539055019617, "rewards/margins": 0.004812855739146471, "rewards/margins_max": 0.02092679962515831, "rewards/margins_min": -0.010641205124557018, "rewards/margins_std": 0.014152769930660725, "rewards/rejected": 0.001982682617381215, "step": 320 }, { "epoch": 0.08, "grad_norm": 1.8090933651382484, "learning_rate": 3.9473684210526315e-07, "logits/chosen": -2.88478422164917, "logits/rejected": -2.836512327194214, "logps/chosen": -332.34991455078125, "logps/rejected": -263.26214599609375, "loss": 0.6902, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.009971674531698227, "rewards/margins": 0.008845487609505653, "rewards/margins_max": 0.024542566388845444, "rewards/margins_min": -0.005169200710952282, "rewards/margins_std": 0.013284943997859955, "rewards/rejected": 0.0011261856416240335, "step": 330 }, { "epoch": 0.08, "grad_norm": 1.889588694650712, "learning_rate": 4.066985645933014e-07, "logits/chosen": -2.8786487579345703, "logits/rejected": -2.8604178428649902, "logps/chosen": -322.4375, "logps/rejected": -265.0132141113281, "loss": 0.6894, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.009604470804333687, "rewards/margins": 0.008933277800679207, "rewards/margins_max": 0.023819511756300926, "rewards/margins_min": -0.005439485423266888, "rewards/margins_std": 0.013345139101147652, "rewards/rejected": 0.0006711935857310891, "step": 340 }, { "epoch": 0.08, "grad_norm": 2.1643558675177577, "learning_rate": 4.1866028708133973e-07, "logits/chosen": -2.800771951675415, "logits/rejected": -2.7496225833892822, "logps/chosen": -264.0378723144531, "logps/rejected": -214.9931640625, "loss": 0.6893, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.007511107716709375, "rewards/margins": 0.007657433860003948, "rewards/margins_max": 0.031329743564128876, "rewards/margins_min": -0.012957903556525707, "rewards/margins_std": 0.020018046721816063, "rewards/rejected": -0.00014632634702138603, "step": 350 }, { "epoch": 0.09, "grad_norm": 2.0226633365723554, "learning_rate": 4.30622009569378e-07, "logits/chosen": -2.8628451824188232, "logits/rejected": -2.8355746269226074, "logps/chosen": -281.36749267578125, "logps/rejected": -245.26431274414062, "loss": 0.6884, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.008619427680969238, "rewards/margins": 0.009861720725893974, "rewards/margins_max": 0.026844218373298645, "rewards/margins_min": -0.008796043694019318, "rewards/margins_std": 0.016473382711410522, "rewards/rejected": -0.0012422938598319888, "step": 360 }, { "epoch": 0.09, "grad_norm": 1.4453796371125611, "learning_rate": 4.425837320574162e-07, "logits/chosen": -2.9047369956970215, "logits/rejected": -2.8413548469543457, "logps/chosen": -296.87872314453125, "logps/rejected": -231.3509979248047, "loss": 0.6892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.006261153612285852, "rewards/margins": 0.008242874406278133, "rewards/margins_max": 0.030506301671266556, "rewards/margins_min": -0.013848531059920788, "rewards/margins_std": 0.020168842747807503, "rewards/rejected": -0.0019817203283309937, "step": 370 }, { "epoch": 0.09, "grad_norm": 2.0143229692718427, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.8520050048828125, "logits/rejected": -2.7991726398468018, "logps/chosen": -279.69146728515625, "logps/rejected": -224.68331909179688, "loss": 0.6887, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.010300575755536556, "rewards/margins": 0.011675434187054634, "rewards/margins_max": 0.03212700039148331, "rewards/margins_min": -0.008405391126871109, "rewards/margins_std": 0.018078230321407318, "rewards/rejected": -0.0013748581986874342, "step": 380 }, { "epoch": 0.09, "grad_norm": 2.173284937469911, "learning_rate": 4.665071770334928e-07, "logits/chosen": -2.7563838958740234, "logits/rejected": -2.7253875732421875, "logps/chosen": -306.5716552734375, "logps/rejected": -258.4673767089844, "loss": 0.6877, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.011723880656063557, "rewards/margins": 0.012919160537421703, "rewards/margins_max": 0.035039566457271576, "rewards/margins_min": -0.009524760767817497, "rewards/margins_std": 0.020119303837418556, "rewards/rejected": -0.0011952801141887903, "step": 390 }, { "epoch": 0.1, "grad_norm": 1.8297438667404096, "learning_rate": 4.784688995215311e-07, "logits/chosen": -2.731421947479248, "logits/rejected": -2.75995135307312, "logps/chosen": -268.7449035644531, "logps/rejected": -259.85772705078125, "loss": 0.6888, "rewards/accuracies": 0.6875, "rewards/chosen": 0.009379776194691658, "rewards/margins": 0.009554450400173664, "rewards/margins_max": 0.03678436204791069, "rewards/margins_min": -0.01677670329809189, "rewards/margins_std": 0.023921573534607887, "rewards/rejected": -0.00017467378347646445, "step": 400 }, { "epoch": 0.1, "eval_logits/chosen": -2.7923877239227295, "eval_logits/rejected": -2.7572779655456543, "eval_logps/chosen": -283.4166564941406, "eval_logps/rejected": -265.89434814453125, "eval_loss": 0.6879981756210327, "eval_rewards/accuracies": 0.6644999980926514, "eval_rewards/chosen": 0.010386648587882519, "eval_rewards/margins": 0.010797684080898762, "eval_rewards/margins_max": 0.0544959232211113, "eval_rewards/margins_min": -0.026414871215820312, "eval_rewards/margins_std": 0.026797765865921974, "eval_rewards/rejected": -0.0004110359586775303, "eval_runtime": 860.5483, "eval_samples_per_second": 4.648, "eval_steps_per_second": 0.291, "step": 400 }, { "epoch": 0.1, "grad_norm": 1.8439575337135385, "learning_rate": 4.904306220095694e-07, "logits/chosen": -2.8291707038879395, "logits/rejected": -2.7534260749816895, "logps/chosen": -321.44671630859375, "logps/rejected": -258.337158203125, "loss": 0.6871, "rewards/accuracies": 0.6875, "rewards/chosen": 0.010904048569500446, "rewards/margins": 0.014547420665621758, "rewards/margins_max": 0.048709701746702194, "rewards/margins_min": -0.015160051174461842, "rewards/margins_std": 0.028659731149673462, "rewards/rejected": -0.003643373027443886, "step": 410 }, { "epoch": 0.1, "grad_norm": 2.1480667071719846, "learning_rate": 4.999996505732917e-07, "logits/chosen": -2.8337886333465576, "logits/rejected": -2.8044726848602295, "logps/chosen": -297.41912841796875, "logps/rejected": -293.13800048828125, "loss": 0.6864, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.012047646567225456, "rewards/margins": 0.010113747790455818, "rewards/margins_max": 0.04316211864352226, "rewards/margins_min": -0.02286229468882084, "rewards/margins_std": 0.02985798381268978, "rewards/rejected": 0.0019338976126164198, "step": 420 }, { "epoch": 0.1, "grad_norm": 1.951871484954239, "learning_rate": 4.999874207410648e-07, "logits/chosen": -2.7598047256469727, "logits/rejected": -2.7764503955841064, "logps/chosen": -252.3936004638672, "logps/rejected": -261.55194091796875, "loss": 0.6858, "rewards/accuracies": 0.6875, "rewards/chosen": 0.015963982790708542, "rewards/margins": 0.011255776509642601, "rewards/margins_max": 0.044065456837415695, "rewards/margins_min": -0.015702728182077408, "rewards/margins_std": 0.026332881301641464, "rewards/rejected": 0.0047082058154046535, "step": 430 }, { "epoch": 0.11, "grad_norm": 1.815723512376768, "learning_rate": 4.999577205502039e-07, "logits/chosen": -2.7594494819641113, "logits/rejected": -2.7417349815368652, "logps/chosen": -239.5718536376953, "logps/rejected": -221.40414428710938, "loss": 0.6874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.014929292723536491, "rewards/margins": 0.01059373002499342, "rewards/margins_max": 0.04242347553372383, "rewards/margins_min": -0.02509412169456482, "rewards/margins_std": 0.03010099194943905, "rewards/rejected": 0.004335561767220497, "step": 440 }, { "epoch": 0.11, "grad_norm": 1.887094148366367, "learning_rate": 4.999105520763054e-07, "logits/chosen": -2.8326034545898438, "logits/rejected": -2.7527859210968018, "logps/chosen": -285.027099609375, "logps/rejected": -256.4057922363281, "loss": 0.6866, "rewards/accuracies": 0.75, "rewards/chosen": 0.020126910880208015, "rewards/margins": 0.012689967639744282, "rewards/margins_max": 0.0455465242266655, "rewards/margins_min": -0.027548715472221375, "rewards/margins_std": 0.03222181648015976, "rewards/rejected": 0.007436943706125021, "step": 450 }, { "epoch": 0.11, "grad_norm": 2.0064960126877613, "learning_rate": 4.998459186157357e-07, "logits/chosen": -2.8465819358825684, "logits/rejected": -2.785576820373535, "logps/chosen": -289.4577331542969, "logps/rejected": -266.01800537109375, "loss": 0.6842, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.023445971310138702, "rewards/margins": 0.014034710824489594, "rewards/margins_max": 0.05275397375226021, "rewards/margins_min": -0.020386729389429092, "rewards/margins_std": 0.03251287341117859, "rewards/rejected": 0.009411259554326534, "step": 460 }, { "epoch": 0.11, "grad_norm": 1.8984794455081915, "learning_rate": 4.997638246854011e-07, "logits/chosen": -2.885715961456299, "logits/rejected": -2.8429105281829834, "logps/chosen": -282.2930603027344, "logps/rejected": -270.14898681640625, "loss": 0.6861, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.028099358081817627, "rewards/margins": 0.014804655686020851, "rewards/margins_max": 0.05813845247030258, "rewards/margins_min": -0.024266045540571213, "rewards/margins_std": 0.037039484828710556, "rewards/rejected": 0.013294701464474201, "step": 470 }, { "epoch": 0.11, "grad_norm": 2.4713524730316134, "learning_rate": 4.996642760224317e-07, "logits/chosen": -2.7256481647491455, "logits/rejected": -2.7138657569885254, "logps/chosen": -284.6150207519531, "logps/rejected": -271.6550598144531, "loss": 0.685, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03093739226460457, "rewards/margins": 0.016154423356056213, "rewards/margins_max": 0.06530088931322098, "rewards/margins_min": -0.023924505338072777, "rewards/margins_std": 0.04057624191045761, "rewards/rejected": 0.014782967045903206, "step": 480 }, { "epoch": 0.12, "grad_norm": 1.790829381546387, "learning_rate": 4.995472795837813e-07, "logits/chosen": -2.8459765911102295, "logits/rejected": -2.7399191856384277, "logps/chosen": -251.16159057617188, "logps/rejected": -224.5177001953125, "loss": 0.6826, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.02981259487569332, "rewards/margins": 0.01665044017136097, "rewards/margins_max": 0.06273016333580017, "rewards/margins_min": -0.022494319826364517, "rewards/margins_std": 0.03761152923107147, "rewards/rejected": 0.013162153773009777, "step": 490 }, { "epoch": 0.12, "grad_norm": 1.7599716827975578, "learning_rate": 4.994128435457401e-07, "logits/chosen": -2.832188129425049, "logits/rejected": -2.7971439361572266, "logps/chosen": -308.68634033203125, "logps/rejected": -267.23687744140625, "loss": 0.6827, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.039294809103012085, "rewards/margins": 0.022731659933924675, "rewards/margins_max": 0.06454737484455109, "rewards/margins_min": -0.025807851925492287, "rewards/margins_std": 0.04009108990430832, "rewards/rejected": 0.01656315103173256, "step": 500 }, { "epoch": 0.12, "eval_logits/chosen": -2.787661075592041, "eval_logits/rejected": -2.7528791427612305, "eval_logps/chosen": -281.00518798828125, "eval_logps/rejected": -264.4715270996094, "eval_loss": 0.6834259629249573, "eval_rewards/accuracies": 0.6819999814033508, "eval_rewards/chosen": 0.034501295536756516, "eval_rewards/margins": 0.020683957263827324, "eval_rewards/margins_max": 0.09889663010835648, "eval_rewards/margins_min": -0.0453532375395298, "eval_rewards/margins_std": 0.04788992181420326, "eval_rewards/rejected": 0.01381734013557434, "eval_runtime": 859.8364, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 500 }, { "epoch": 0.12, "grad_norm": 1.8380357156008533, "learning_rate": 4.992609773033638e-07, "logits/chosen": -2.87412691116333, "logits/rejected": -2.8063013553619385, "logps/chosen": -311.1976623535156, "logps/rejected": -290.9315490722656, "loss": 0.6806, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.04149966686964035, "rewards/margins": 0.027491098269820213, "rewards/margins_max": 0.08390498906373978, "rewards/margins_min": -0.026887020096182823, "rewards/margins_std": 0.04932967200875282, "rewards/rejected": 0.014008568599820137, "step": 510 }, { "epoch": 0.12, "grad_norm": 1.8578490078020375, "learning_rate": 4.990916914698176e-07, "logits/chosen": -2.8508479595184326, "logits/rejected": -2.8774688243865967, "logps/chosen": -269.26678466796875, "logps/rejected": -282.09149169921875, "loss": 0.6833, "rewards/accuracies": 0.625, "rewards/chosen": 0.026975523680448532, "rewards/margins": 0.017464371398091316, "rewards/margins_max": 0.06073393672704697, "rewards/margins_min": -0.020872922614216805, "rewards/margins_std": 0.037673480808734894, "rewards/rejected": 0.009511154145002365, "step": 520 }, { "epoch": 0.13, "grad_norm": 1.8975594456223, "learning_rate": 4.989049978756335e-07, "logits/chosen": -2.8389906883239746, "logits/rejected": -2.795275926589966, "logps/chosen": -259.84613037109375, "logps/rejected": -223.6072235107422, "loss": 0.6806, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.03905173018574715, "rewards/margins": 0.028574619442224503, "rewards/margins_max": 0.09143301099538803, "rewards/margins_min": -0.03277328237891197, "rewards/margins_std": 0.05603231117129326, "rewards/rejected": 0.010477107018232346, "step": 530 }, { "epoch": 0.13, "grad_norm": 1.8617322709156452, "learning_rate": 4.987009095678842e-07, "logits/chosen": -2.8395779132843018, "logits/rejected": -2.7576241493225098, "logps/chosen": -335.243896484375, "logps/rejected": -256.72149658203125, "loss": 0.6757, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.047729261219501495, "rewards/margins": 0.0380299873650074, "rewards/margins_max": 0.10880953073501587, "rewards/margins_min": -0.027835842221975327, "rewards/margins_std": 0.05956585332751274, "rewards/rejected": 0.00969927478581667, "step": 540 }, { "epoch": 0.13, "grad_norm": 1.6461317379356113, "learning_rate": 4.984794408092712e-07, "logits/chosen": -2.747067928314209, "logits/rejected": -2.761674165725708, "logps/chosen": -227.9774932861328, "logps/rejected": -240.8350372314453, "loss": 0.682, "rewards/accuracies": 0.625, "rewards/chosen": 0.03224276378750801, "rewards/margins": 0.019359184429049492, "rewards/margins_max": 0.08641939610242844, "rewards/margins_min": -0.03590545803308487, "rewards/margins_std": 0.05543201044201851, "rewards/rejected": 0.012883573770523071, "step": 550 }, { "epoch": 0.13, "grad_norm": 1.898962128914956, "learning_rate": 4.982406070771277e-07, "logits/chosen": -2.8066565990448, "logits/rejected": -2.7697765827178955, "logps/chosen": -258.8771667480469, "logps/rejected": -245.0890655517578, "loss": 0.679, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.04510858654975891, "rewards/margins": 0.030218088999390602, "rewards/margins_max": 0.1033395305275917, "rewards/margins_min": -0.027472149580717087, "rewards/margins_std": 0.05755491927266121, "rewards/rejected": 0.014890496619045734, "step": 560 }, { "epoch": 0.14, "grad_norm": 2.01086881886344, "learning_rate": 4.979844250623374e-07, "logits/chosen": -2.799595355987549, "logits/rejected": -2.7691237926483154, "logps/chosen": -259.0538330078125, "logps/rejected": -282.46942138671875, "loss": 0.6795, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03679460287094116, "rewards/margins": 0.026447024196386337, "rewards/margins_max": 0.11028116941452026, "rewards/margins_min": -0.043576233088970184, "rewards/margins_std": 0.0696285218000412, "rewards/rejected": 0.010347576811909676, "step": 570 }, { "epoch": 0.14, "grad_norm": 1.8857229697039908, "learning_rate": 4.977109126681678e-07, "logits/chosen": -2.8361918926239014, "logits/rejected": -2.794586658477783, "logps/chosen": -334.4454345703125, "logps/rejected": -279.5559387207031, "loss": 0.6809, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.04045528918504715, "rewards/margins": 0.027129491791129112, "rewards/margins_max": 0.11118390411138535, "rewards/margins_min": -0.04163810983300209, "rewards/margins_std": 0.06822942197322845, "rewards/rejected": 0.013325795531272888, "step": 580 }, { "epoch": 0.14, "grad_norm": 1.8402639504479013, "learning_rate": 4.974200890090191e-07, "logits/chosen": -2.813422441482544, "logits/rejected": -2.8016512393951416, "logps/chosen": -243.6787567138672, "logps/rejected": -241.81228637695312, "loss": 0.6782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03460235148668289, "rewards/margins": 0.03581953048706055, "rewards/margins_max": 0.10852668434381485, "rewards/margins_min": -0.022002944722771645, "rewards/margins_std": 0.05729494243860245, "rewards/rejected": -0.001217175624333322, "step": 590 }, { "epoch": 0.14, "grad_norm": 1.8609566893336893, "learning_rate": 4.971119744090886e-07, "logits/chosen": -2.822237730026245, "logits/rejected": -2.7726807594299316, "logps/chosen": -262.5629577636719, "logps/rejected": -243.2275848388672, "loss": 0.6831, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.026615392416715622, "rewards/margins": 0.030522268265485764, "rewards/margins_max": 0.12365134805440903, "rewards/margins_min": -0.059167660772800446, "rewards/margins_std": 0.08123533427715302, "rewards/rejected": -0.003906878177076578, "step": 600 }, { "epoch": 0.14, "eval_logits/chosen": -2.782672643661499, "eval_logits/rejected": -2.747931957244873, "eval_logps/chosen": -281.4937438964844, "eval_logps/rejected": -266.2421875, "eval_loss": 0.6776489615440369, "eval_rewards/accuracies": 0.6909999847412109, "eval_rewards/chosen": 0.02961578033864498, "eval_rewards/margins": 0.03350492939352989, "eval_rewards/margins_max": 0.15520799160003662, "eval_rewards/margins_min": -0.06960177421569824, "eval_rewards/margins_std": 0.0744817852973938, "eval_rewards/rejected": -0.0038891462609171867, "eval_runtime": 859.6377, "eval_samples_per_second": 4.653, "eval_steps_per_second": 0.291, "step": 600 }, { "epoch": 0.15, "grad_norm": 2.2020432000907517, "learning_rate": 4.967865904009499e-07, "logits/chosen": -2.845512866973877, "logits/rejected": -2.813530683517456, "logps/chosen": -344.6908264160156, "logps/rejected": -267.20208740234375, "loss": 0.6747, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.025227338075637817, "rewards/margins": 0.038891032338142395, "rewards/margins_max": 0.1099926233291626, "rewards/margins_min": -0.021645687520503998, "rewards/margins_std": 0.059668660163879395, "rewards/rejected": -0.013663697056472301, "step": 610 }, { "epoch": 0.15, "grad_norm": 2.148480002242944, "learning_rate": 4.964439597240486e-07, "logits/chosen": -2.8236947059631348, "logits/rejected": -2.796365976333618, "logps/chosen": -382.76806640625, "logps/rejected": -295.16961669921875, "loss": 0.6724, "rewards/accuracies": 0.75, "rewards/chosen": 0.04661710932850838, "rewards/margins": 0.05233670398592949, "rewards/margins_max": 0.15269331634044647, "rewards/margins_min": -0.042940981686115265, "rewards/margins_std": 0.08610849827528, "rewards/rejected": -0.0057195937260985374, "step": 620 }, { "epoch": 0.15, "grad_norm": 2.043037719729389, "learning_rate": 4.960841063231124e-07, "logits/chosen": -2.804616928100586, "logits/rejected": -2.765625476837158, "logps/chosen": -367.92510986328125, "logps/rejected": -290.01788330078125, "loss": 0.6652, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.04542078822851181, "rewards/margins": 0.06306995451450348, "rewards/margins_max": 0.15209174156188965, "rewards/margins_min": -0.030609797686338425, "rewards/margins_std": 0.08168235421180725, "rewards/rejected": -0.017649158835411072, "step": 630 }, { "epoch": 0.15, "grad_norm": 1.9949981609135232, "learning_rate": 4.95707055346479e-07, "logits/chosen": -2.810793161392212, "logits/rejected": -2.732025623321533, "logps/chosen": -321.9665222167969, "logps/rejected": -250.7497100830078, "loss": 0.6682, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.037032295018434525, "rewards/margins": 0.06006144359707832, "rewards/margins_max": 0.14815713465213776, "rewards/margins_min": -0.021324660629034042, "rewards/margins_std": 0.07636863738298416, "rewards/rejected": -0.02302914671599865, "step": 640 }, { "epoch": 0.16, "grad_norm": 1.9041296000250105, "learning_rate": 4.95312833144337e-07, "logits/chosen": -2.838348150253296, "logits/rejected": -2.7659010887145996, "logps/chosen": -290.25091552734375, "logps/rejected": -254.9677276611328, "loss": 0.6721, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.019019629806280136, "rewards/margins": 0.04789215326309204, "rewards/margins_max": 0.16989126801490784, "rewards/margins_min": -0.050513893365859985, "rewards/margins_std": 0.10015592724084854, "rewards/rejected": -0.028872525319457054, "step": 650 }, { "epoch": 0.16, "grad_norm": 1.9181181622206498, "learning_rate": 4.949014672668858e-07, "logits/chosen": -2.8450560569763184, "logits/rejected": -2.823151111602783, "logps/chosen": -259.1734619140625, "logps/rejected": -250.8106231689453, "loss": 0.6749, "rewards/accuracies": 0.625, "rewards/chosen": 0.01695770025253296, "rewards/margins": 0.04111206904053688, "rewards/margins_max": 0.1423286646604538, "rewards/margins_min": -0.06147942692041397, "rewards/margins_std": 0.09176277369260788, "rewards/rejected": -0.02415436878800392, "step": 660 }, { "epoch": 0.16, "grad_norm": 2.240906884672249, "learning_rate": 4.944729864624097e-07, "logits/chosen": -2.922368049621582, "logits/rejected": -2.8345208168029785, "logps/chosen": -330.1218566894531, "logps/rejected": -267.60223388671875, "loss": 0.6686, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.030409136787056923, "rewards/margins": 0.05693582817912102, "rewards/margins_max": 0.15574179589748383, "rewards/margins_min": -0.03514755517244339, "rewards/margins_std": 0.08488789945840836, "rewards/rejected": -0.026526689529418945, "step": 670 }, { "epoch": 0.16, "grad_norm": 2.8553009192620267, "learning_rate": 4.940274206752687e-07, "logits/chosen": -2.7653040885925293, "logits/rejected": -2.7400851249694824, "logps/chosen": -329.28094482421875, "logps/rejected": -264.5126953125, "loss": 0.6664, "rewards/accuracies": 0.625, "rewards/chosen": 0.022837229073047638, "rewards/margins": 0.04619354009628296, "rewards/margins_max": 0.17638953030109406, "rewards/margins_min": -0.0725640207529068, "rewards/margins_std": 0.1124802827835083, "rewards/rejected": -0.02335631661117077, "step": 680 }, { "epoch": 0.17, "grad_norm": 2.103590996683848, "learning_rate": 4.935648010438058e-07, "logits/chosen": -2.783113479614258, "logits/rejected": -2.770662307739258, "logps/chosen": -260.016357421875, "logps/rejected": -268.6811828613281, "loss": 0.675, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01028151623904705, "rewards/margins": 0.04737422987818718, "rewards/margins_max": 0.16225430369377136, "rewards/margins_min": -0.055725038051605225, "rewards/margins_std": 0.0967501625418663, "rewards/rejected": -0.03709270805120468, "step": 690 }, { "epoch": 0.17, "grad_norm": 1.8356022954880307, "learning_rate": 4.930851598981713e-07, "logits/chosen": -2.806530237197876, "logits/rejected": -2.736204147338867, "logps/chosen": -281.389404296875, "logps/rejected": -251.22439575195312, "loss": 0.6652, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0015332363545894623, "rewards/margins": 0.051959507167339325, "rewards/margins_max": 0.18142695724964142, "rewards/margins_min": -0.06150681897997856, "rewards/margins_std": 0.10783376544713974, "rewards/rejected": -0.05042628198862076, "step": 700 }, { "epoch": 0.17, "eval_logits/chosen": -2.7726073265075684, "eval_logits/rejected": -2.7381980419158936, "eval_logps/chosen": -283.5948486328125, "eval_logps/rejected": -270.1202392578125, "eval_loss": 0.670020341873169, "eval_rewards/accuracies": 0.6819999814033508, "eval_rewards/chosen": 0.008604736067354679, "eval_rewards/margins": 0.05127452686429024, "eval_rewards/margins_max": 0.23500196635723114, "eval_rewards/margins_min": -0.10567907243967056, "eval_rewards/margins_std": 0.11278793215751648, "eval_rewards/rejected": -0.04266979172825813, "eval_runtime": 859.9574, "eval_samples_per_second": 4.651, "eval_steps_per_second": 0.291, "step": 700 }, { "epoch": 0.17, "grad_norm": 2.1883763329067216, "learning_rate": 4.925885307580632e-07, "logits/chosen": -2.7491278648376465, "logits/rejected": -2.691565752029419, "logps/chosen": -290.1736145019531, "logps/rejected": -249.8134765625, "loss": 0.6684, "rewards/accuracies": 0.75, "rewards/chosen": 0.021899688988924026, "rewards/margins": 0.08286824077367783, "rewards/margins_max": 0.22836923599243164, "rewards/margins_min": -0.053432680666446686, "rewards/margins_std": 0.12297092378139496, "rewards/rejected": -0.0609685480594635, "step": 710 }, { "epoch": 0.17, "grad_norm": 2.1714985473192288, "learning_rate": 4.920749483303846e-07, "logits/chosen": -2.636892080307007, "logits/rejected": -2.6579179763793945, "logps/chosen": -274.3876037597656, "logps/rejected": -278.6937255859375, "loss": 0.6688, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.008625579997897148, "rewards/margins": 0.05348697304725647, "rewards/margins_max": 0.18989379703998566, "rewards/margins_min": -0.04376517981290817, "rewards/margins_std": 0.1045072078704834, "rewards/rejected": -0.04486139863729477, "step": 720 }, { "epoch": 0.17, "grad_norm": 2.081086651231241, "learning_rate": 4.915444485068181e-07, "logits/chosen": -2.848870277404785, "logits/rejected": -2.7781660556793213, "logps/chosen": -321.5794677734375, "logps/rejected": -291.2355651855469, "loss": 0.6634, "rewards/accuracies": 0.75, "rewards/chosen": 0.019336868077516556, "rewards/margins": 0.05942217633128166, "rewards/margins_max": 0.1821795403957367, "rewards/margins_min": -0.05405784398317337, "rewards/margins_std": 0.1058388501405716, "rewards/rejected": -0.040085311979055405, "step": 730 }, { "epoch": 0.18, "grad_norm": 2.666434476598509, "learning_rate": 4.90997068361318e-07, "logits/chosen": -2.852202892303467, "logits/rejected": -2.807814359664917, "logps/chosen": -259.76470947265625, "logps/rejected": -261.9565124511719, "loss": 0.6591, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.016710925847291946, "rewards/margins": 0.0612528994679451, "rewards/margins_max": 0.20229406654834747, "rewards/margins_min": -0.06611864268779755, "rewards/margins_std": 0.1207486242055893, "rewards/rejected": -0.04454197362065315, "step": 740 }, { "epoch": 0.18, "grad_norm": 1.786016411427176, "learning_rate": 4.904328461475189e-07, "logits/chosen": -2.839444398880005, "logits/rejected": -2.8065857887268066, "logps/chosen": -283.57659912109375, "logps/rejected": -280.9451904296875, "loss": 0.6699, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.01933327689766884, "rewards/margins": 0.06404153257608414, "rewards/margins_max": 0.2093987911939621, "rewards/margins_min": -0.06361619383096695, "rewards/margins_std": 0.12587139010429382, "rewards/rejected": -0.044708251953125, "step": 750 }, { "epoch": 0.18, "grad_norm": 1.8723215045067456, "learning_rate": 4.898518212960625e-07, "logits/chosen": -2.8065075874328613, "logits/rejected": -2.8201041221618652, "logps/chosen": -275.5202331542969, "logps/rejected": -287.6792907714844, "loss": 0.661, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.004976716358214617, "rewards/margins": 0.04297412186861038, "rewards/margins_max": 0.18559980392456055, "rewards/margins_min": -0.09453781694173813, "rewards/margins_std": 0.12249946594238281, "rewards/rejected": -0.037997402250766754, "step": 760 }, { "epoch": 0.18, "grad_norm": 2.1560222118415435, "learning_rate": 4.89254034411842e-07, "logits/chosen": -2.840862512588501, "logits/rejected": -2.770017385482788, "logps/chosen": -270.3290100097656, "logps/rejected": -276.2815246582031, "loss": 0.6615, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.004142249468713999, "rewards/margins": 0.05860158056020737, "rewards/margins_max": 0.19023478031158447, "rewards/margins_min": -0.07205347716808319, "rewards/margins_std": 0.1169583648443222, "rewards/rejected": -0.054459333419799805, "step": 770 }, { "epoch": 0.19, "grad_norm": 3.9902538536219994, "learning_rate": 4.886395272711646e-07, "logits/chosen": -2.8726820945739746, "logits/rejected": -2.804377317428589, "logps/chosen": -309.981689453125, "logps/rejected": -248.2880401611328, "loss": 0.6585, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.005033843219280243, "rewards/margins": 0.08469756692647934, "rewards/margins_max": 0.24064771831035614, "rewards/margins_min": -0.07141076028347015, "rewards/margins_std": 0.142036572098732, "rewards/rejected": -0.0796637237071991, "step": 780 }, { "epoch": 0.19, "grad_norm": 2.544732251425543, "learning_rate": 4.880083428188314e-07, "logits/chosen": -2.8103771209716797, "logits/rejected": -2.7757694721221924, "logps/chosen": -306.94622802734375, "logps/rejected": -261.8299255371094, "loss": 0.6511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.006795515306293964, "rewards/margins": 0.08599305152893066, "rewards/margins_max": 0.26126423478126526, "rewards/margins_min": -0.04950443655252457, "rewards/margins_std": 0.14344367384910583, "rewards/rejected": -0.07919753342866898, "step": 790 }, { "epoch": 0.19, "grad_norm": 3.1077775541846533, "learning_rate": 4.873605251651373e-07, "logits/chosen": -2.817831039428711, "logits/rejected": -2.746727228164673, "logps/chosen": -306.8286437988281, "logps/rejected": -260.79437255859375, "loss": 0.6486, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.007715999148786068, "rewards/margins": 0.10799429565668106, "rewards/margins_max": 0.3326597809791565, "rewards/margins_min": -0.08400187641382217, "rewards/margins_std": 0.18480992317199707, "rewards/rejected": -0.10027830302715302, "step": 800 }, { "epoch": 0.19, "eval_logits/chosen": -2.7702386379241943, "eval_logits/rejected": -2.736680746078491, "eval_logps/chosen": -286.4377746582031, "eval_logps/rejected": -275.0621643066406, "eval_loss": 0.6614853739738464, "eval_rewards/accuracies": 0.6804999709129333, "eval_rewards/chosen": -0.019824357703328133, "eval_rewards/margins": 0.07226436585187912, "eval_rewards/margins_max": 0.3237099051475525, "eval_rewards/margins_min": -0.14703232049942017, "eval_rewards/margins_std": 0.15650083124637604, "eval_rewards/rejected": -0.0920887291431427, "eval_runtime": 859.6716, "eval_samples_per_second": 4.653, "eval_steps_per_second": 0.291, "step": 800 }, { "epoch": 0.19, "grad_norm": 2.1707019519648285, "learning_rate": 4.866961195827869e-07, "logits/chosen": -2.783294200897217, "logits/rejected": -2.7768232822418213, "logps/chosen": -246.5240936279297, "logps/rejected": -249.036865234375, "loss": 0.6634, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02293337881565094, "rewards/margins": 0.0682712197303772, "rewards/margins_max": 0.20986878871917725, "rewards/margins_min": -0.11614898592233658, "rewards/margins_std": 0.14644506573677063, "rewards/rejected": -0.09120459854602814, "step": 810 }, { "epoch": 0.2, "grad_norm": 3.7724079757397404, "learning_rate": 4.860151725037318e-07, "logits/chosen": -2.741666316986084, "logits/rejected": -2.730962038040161, "logps/chosen": -289.5762023925781, "logps/rejected": -270.65777587890625, "loss": 0.6453, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.005815769545733929, "rewards/margins": 0.09417177736759186, "rewards/margins_max": 0.27101415395736694, "rewards/margins_min": -0.053407151252031326, "rewards/margins_std": 0.145038440823555, "rewards/rejected": -0.09998755156993866, "step": 820 }, { "epoch": 0.2, "grad_norm": 2.2482379795645184, "learning_rate": 4.853177315159253e-07, "logits/chosen": -2.857778310775757, "logits/rejected": -2.800199031829834, "logps/chosen": -347.98565673828125, "logps/rejected": -289.58050537109375, "loss": 0.6485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.011530758813023567, "rewards/margins": 0.11128588765859604, "rewards/margins_max": 0.2874368131160736, "rewards/margins_min": -0.05928174778819084, "rewards/margins_std": 0.15356837213039398, "rewards/rejected": -0.09975512325763702, "step": 830 }, { "epoch": 0.2, "grad_norm": 2.125471175072589, "learning_rate": 4.846038453599967e-07, "logits/chosen": -2.813398599624634, "logits/rejected": -2.7322680950164795, "logps/chosen": -299.72760009765625, "logps/rejected": -268.55255126953125, "loss": 0.6597, "rewards/accuracies": 0.75, "rewards/chosen": 0.03378953039646149, "rewards/margins": 0.09911631047725677, "rewards/margins_max": 0.27514463663101196, "rewards/margins_min": -0.07189072668552399, "rewards/margins_std": 0.1509791612625122, "rewards/rejected": -0.06532677263021469, "step": 840 }, { "epoch": 0.2, "grad_norm": 2.7297000703484, "learning_rate": 4.838735639258449e-07, "logits/chosen": -2.839224100112915, "logits/rejected": -2.8198981285095215, "logps/chosen": -263.91021728515625, "logps/rejected": -281.6410217285156, "loss": 0.6563, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03757554665207863, "rewards/margins": 0.03192313760519028, "rewards/margins_max": 0.1935836672782898, "rewards/margins_min": -0.15483063459396362, "rewards/margins_std": 0.15361005067825317, "rewards/rejected": -0.0694986879825592, "step": 850 }, { "epoch": 0.21, "grad_norm": 2.322100120915896, "learning_rate": 4.831269382491519e-07, "logits/chosen": -2.7852022647857666, "logits/rejected": -2.8032796382904053, "logps/chosen": -264.82830810546875, "logps/rejected": -287.14013671875, "loss": 0.6561, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.016894642263650894, "rewards/margins": 0.053649015724658966, "rewards/margins_max": 0.28593772649765015, "rewards/margins_min": -0.16067473590373993, "rewards/margins_std": 0.19484171271324158, "rewards/rejected": -0.07054366171360016, "step": 860 }, { "epoch": 0.21, "grad_norm": 2.4021676844145188, "learning_rate": 4.823640205078166e-07, "logits/chosen": -2.8161494731903076, "logits/rejected": -2.796370029449463, "logps/chosen": -242.9220428466797, "logps/rejected": -254.65536499023438, "loss": 0.6614, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.026305362582206726, "rewards/margins": 0.06265170872211456, "rewards/margins_max": 0.23923330008983612, "rewards/margins_min": -0.13371381163597107, "rewards/margins_std": 0.16895791888237, "rewards/rejected": -0.08895707130432129, "step": 870 }, { "epoch": 0.21, "grad_norm": 2.5730950503474523, "learning_rate": 4.815848640183081e-07, "logits/chosen": -2.749844789505005, "logits/rejected": -2.7095799446105957, "logps/chosen": -328.59759521484375, "logps/rejected": -296.31890869140625, "loss": 0.6469, "rewards/accuracies": 0.6875, "rewards/chosen": 0.023172562941908836, "rewards/margins": 0.09964267909526825, "rewards/margins_max": 0.3635765612125397, "rewards/margins_min": -0.10102187097072601, "rewards/margins_std": 0.20656804740428925, "rewards/rejected": -0.07647012174129486, "step": 880 }, { "epoch": 0.21, "grad_norm": 2.409267861523612, "learning_rate": 4.807895232319393e-07, "logits/chosen": -2.766179323196411, "logits/rejected": -2.7184956073760986, "logps/chosen": -293.44976806640625, "logps/rejected": -222.43508911132812, "loss": 0.6566, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.03060835599899292, "rewards/margins": 0.08982095867395401, "rewards/margins_max": 0.30089056491851807, "rewards/margins_min": -0.08933084458112717, "rewards/margins_std": 0.17627611756324768, "rewards/rejected": -0.12042931467294693, "step": 890 }, { "epoch": 0.22, "grad_norm": 3.844275461891915, "learning_rate": 4.799780537310621e-07, "logits/chosen": -2.7792530059814453, "logits/rejected": -2.7382729053497314, "logps/chosen": -321.98876953125, "logps/rejected": -290.2458190917969, "loss": 0.6457, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.011465306393802166, "rewards/margins": 0.12456780672073364, "rewards/margins_max": 0.35805121064186096, "rewards/margins_min": -0.07632803171873093, "rewards/margins_std": 0.19223228096961975, "rewards/rejected": -0.1360331028699875, "step": 900 }, { "epoch": 0.22, "eval_logits/chosen": -2.7500076293945312, "eval_logits/rejected": -2.7167701721191406, "eval_logps/chosen": -290.4435729980469, "eval_logps/rejected": -281.3417663574219, "eval_loss": 0.6530823111534119, "eval_rewards/accuracies": 0.6754999756813049, "eval_rewards/chosen": -0.05988248437643051, "eval_rewards/margins": 0.09500282257795334, "eval_rewards/margins_max": 0.42163950204849243, "eval_rewards/margins_min": -0.19472260773181915, "eval_rewards/margins_std": 0.20590785145759583, "eval_rewards/rejected": -0.15488530695438385, "eval_runtime": 859.3926, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 900 }, { "epoch": 0.22, "grad_norm": 2.382660807799636, "learning_rate": 4.791505122251827e-07, "logits/chosen": -2.824524164199829, "logits/rejected": -2.7542624473571777, "logps/chosen": -250.38687133789062, "logps/rejected": -234.7572479248047, "loss": 0.6397, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.05901294946670532, "rewards/margins": 0.12852489948272705, "rewards/margins_max": 0.30426403880119324, "rewards/margins_min": -0.05684134364128113, "rewards/margins_std": 0.17195206880569458, "rewards/rejected": -0.18753783404827118, "step": 910 }, { "epoch": 0.22, "grad_norm": 2.3424411425122367, "learning_rate": 4.783069565469985e-07, "logits/chosen": -2.7414608001708984, "logits/rejected": -2.7249720096588135, "logps/chosen": -290.5932922363281, "logps/rejected": -290.652099609375, "loss": 0.649, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.07417134940624237, "rewards/margins": 0.09879481792449951, "rewards/margins_max": 0.35133180022239685, "rewards/margins_min": -0.12309278547763824, "rewards/margins_std": 0.21439354121685028, "rewards/rejected": -0.17296616733074188, "step": 920 }, { "epoch": 0.22, "grad_norm": 2.345627604848268, "learning_rate": 4.77447445648357e-07, "logits/chosen": -2.751471757888794, "logits/rejected": -2.7147216796875, "logps/chosen": -270.4989929199219, "logps/rejected": -233.923095703125, "loss": 0.6507, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06592012941837311, "rewards/margins": 0.09131678938865662, "rewards/margins_max": 0.3055071234703064, "rewards/margins_min": -0.1199672594666481, "rewards/margins_std": 0.19045254588127136, "rewards/rejected": -0.15723691880702972, "step": 930 }, { "epoch": 0.23, "grad_norm": 2.251657087680744, "learning_rate": 4.765720395961349e-07, "logits/chosen": -2.779428243637085, "logits/rejected": -2.778550386428833, "logps/chosen": -276.707763671875, "logps/rejected": -277.7341613769531, "loss": 0.6564, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.01194118894636631, "rewards/margins": 0.10344435274600983, "rewards/margins_max": 0.3220486044883728, "rewards/margins_min": -0.08887827396392822, "rewards/margins_std": 0.18643055856227875, "rewards/rejected": -0.09150315821170807, "step": 940 }, { "epoch": 0.23, "grad_norm": 2.435889861575151, "learning_rate": 4.7568079956804144e-07, "logits/chosen": -2.8326854705810547, "logits/rejected": -2.791243076324463, "logps/chosen": -322.36151123046875, "logps/rejected": -308.95574951171875, "loss": 0.6436, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0057469927705824375, "rewards/margins": 0.1237279623746872, "rewards/margins_max": 0.41585248708724976, "rewards/margins_min": -0.14109480381011963, "rewards/margins_std": 0.24837708473205566, "rewards/rejected": -0.129474937915802, "step": 950 }, { "epoch": 0.23, "grad_norm": 2.4980689220037164, "learning_rate": 4.74773787848342e-07, "logits/chosen": -2.847013473510742, "logits/rejected": -2.787186622619629, "logps/chosen": -304.60577392578125, "logps/rejected": -257.9570617675781, "loss": 0.6406, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.010193193331360817, "rewards/margins": 0.12367801368236542, "rewards/margins_max": 0.4210747182369232, "rewards/margins_min": -0.11235042661428452, "rewards/margins_std": 0.2386142909526825, "rewards/rejected": -0.1338711977005005, "step": 960 }, { "epoch": 0.23, "grad_norm": 2.953171825542286, "learning_rate": 4.7385106782350637e-07, "logits/chosen": -2.7796080112457275, "logits/rejected": -2.723635196685791, "logps/chosen": -322.67254638671875, "logps/rejected": -312.0019226074219, "loss": 0.6306, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.022632339969277382, "rewards/margins": 0.1675281822681427, "rewards/margins_max": 0.4663251042366028, "rewards/margins_min": -0.11809631437063217, "rewards/margins_std": 0.25347962975502014, "rewards/rejected": -0.19016052782535553, "step": 970 }, { "epoch": 0.23, "grad_norm": 2.857579235403436, "learning_rate": 4.729127039777781e-07, "logits/chosen": -2.6996328830718994, "logits/rejected": -2.6781938076019287, "logps/chosen": -257.40435791015625, "logps/rejected": -246.96444702148438, "loss": 0.6553, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.14536455273628235, "rewards/margins": 0.08878039568662643, "rewards/margins_max": 0.35398316383361816, "rewards/margins_min": -0.10649490356445312, "rewards/margins_std": 0.20907866954803467, "rewards/rejected": -0.23414495587348938, "step": 980 }, { "epoch": 0.24, "grad_norm": 4.883361297284326, "learning_rate": 4.719587618886685e-07, "logits/chosen": -2.8046514987945557, "logits/rejected": -2.7349839210510254, "logps/chosen": -316.08917236328125, "logps/rejected": -316.1483459472656, "loss": 0.6611, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.05946500971913338, "rewards/margins": 0.13033434748649597, "rewards/margins_max": 0.41287779808044434, "rewards/margins_min": -0.11944649368524551, "rewards/margins_std": 0.23220928013324738, "rewards/rejected": -0.18979936838150024, "step": 990 }, { "epoch": 0.24, "grad_norm": 2.5571895560851043, "learning_rate": 4.709893082223737e-07, "logits/chosen": -2.799964189529419, "logits/rejected": -2.7523694038391113, "logps/chosen": -304.8963928222656, "logps/rejected": -297.629638671875, "loss": 0.6356, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.06462176889181137, "rewards/margins": 0.10349669307470322, "rewards/margins_max": 0.38461166620254517, "rewards/margins_min": -0.16973380744457245, "rewards/margins_std": 0.2456401288509369, "rewards/rejected": -0.1681184470653534, "step": 1000 }, { "epoch": 0.24, "eval_logits/chosen": -2.7361843585968018, "eval_logits/rejected": -2.704155683517456, "eval_logps/chosen": -290.7086486816406, "eval_logps/rejected": -283.9889831542969, "eval_loss": 0.6448861360549927, "eval_rewards/accuracies": 0.6784999966621399, "eval_rewards/chosen": -0.0625331774353981, "eval_rewards/margins": 0.11882392317056656, "eval_rewards/margins_max": 0.52253258228302, "eval_rewards/margins_min": -0.24860599637031555, "eval_rewards/margins_std": 0.25825318694114685, "eval_rewards/rejected": -0.18135710060596466, "eval_runtime": 860.0164, "eval_samples_per_second": 4.651, "eval_steps_per_second": 0.291, "step": 1000 }, { "epoch": 0.24, "grad_norm": 3.0310220534360233, "learning_rate": 4.7000441072911554e-07, "logits/chosen": -2.7325241565704346, "logits/rejected": -2.7142276763916016, "logps/chosen": -259.64825439453125, "logps/rejected": -293.3815002441406, "loss": 0.6377, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.05959668755531311, "rewards/margins": 0.12099339067935944, "rewards/margins_max": 0.3900481164455414, "rewards/margins_min": -0.1284235268831253, "rewards/margins_std": 0.23061911761760712, "rewards/rejected": -0.18059007823467255, "step": 1010 }, { "epoch": 0.24, "grad_norm": 3.1136410073370375, "learning_rate": 4.690041382384071e-07, "logits/chosen": -2.7031478881835938, "logits/rejected": -2.720886468887329, "logps/chosen": -234.29135131835938, "logps/rejected": -248.6089324951172, "loss": 0.6333, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.01792748272418976, "rewards/margins": 0.1429601013660431, "rewards/margins_max": 0.3770293593406677, "rewards/margins_min": -0.058856137096881866, "rewards/margins_std": 0.1958865374326706, "rewards/rejected": -0.16088759899139404, "step": 1020 }, { "epoch": 0.25, "grad_norm": 3.080625031947799, "learning_rate": 4.679885606542423e-07, "logits/chosen": -2.7704384326934814, "logits/rejected": -2.7624263763427734, "logps/chosen": -259.32757568359375, "logps/rejected": -268.58782958984375, "loss": 0.6351, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02662523090839386, "rewards/margins": 0.12475328147411346, "rewards/margins_max": 0.3847391605377197, "rewards/margins_min": -0.10660214722156525, "rewards/margins_std": 0.2208033800125122, "rewards/rejected": -0.15137849748134613, "step": 1030 }, { "epoch": 0.25, "grad_norm": 4.393026000885723, "learning_rate": 4.669577489502108e-07, "logits/chosen": -2.798563003540039, "logits/rejected": -2.7397446632385254, "logps/chosen": -283.36920166015625, "logps/rejected": -280.3486022949219, "loss": 0.6213, "rewards/accuracies": 0.75, "rewards/chosen": -0.018056590110063553, "rewards/margins": 0.15028563141822815, "rewards/margins_max": 0.41389912366867065, "rewards/margins_min": -0.0801553726196289, "rewards/margins_std": 0.2179095447063446, "rewards/rejected": -0.1683422327041626, "step": 1040 }, { "epoch": 0.25, "grad_norm": 3.3263851865687215, "learning_rate": 4.6591177516453795e-07, "logits/chosen": -2.646252155303955, "logits/rejected": -2.6699962615966797, "logps/chosen": -262.1933898925781, "logps/rejected": -261.60467529296875, "loss": 0.6335, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10094554722309113, "rewards/margins": 0.09482800960540771, "rewards/margins_max": 0.33993563055992126, "rewards/margins_min": -0.14256446063518524, "rewards/margins_std": 0.22054481506347656, "rewards/rejected": -0.19577357172966003, "step": 1050 }, { "epoch": 0.25, "grad_norm": 3.0935778329126573, "learning_rate": 4.6485071239505037e-07, "logits/chosen": -2.765110731124878, "logits/rejected": -2.7562057971954346, "logps/chosen": -293.51458740234375, "logps/rejected": -281.2161865234375, "loss": 0.6342, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.002409782260656357, "rewards/margins": 0.17773596942424774, "rewards/margins_max": 0.5016456842422485, "rewards/margins_min": -0.13163571059703827, "rewards/margins_std": 0.2804378569126129, "rewards/rejected": -0.1801457554101944, "step": 1060 }, { "epoch": 0.26, "grad_norm": 2.8692948499223663, "learning_rate": 4.6377463479406777e-07, "logits/chosen": -2.7676243782043457, "logits/rejected": -2.721116304397583, "logps/chosen": -301.32562255859375, "logps/rejected": -275.3507385253906, "loss": 0.6185, "rewards/accuracies": 0.75, "rewards/chosen": -0.06775529682636261, "rewards/margins": 0.16383641958236694, "rewards/margins_max": 0.4588744640350342, "rewards/margins_min": -0.13780052959918976, "rewards/margins_std": 0.2671576738357544, "rewards/rejected": -0.23159170150756836, "step": 1070 }, { "epoch": 0.26, "grad_norm": 2.8944432789642986, "learning_rate": 4.6268361756322037e-07, "logits/chosen": -2.7499701976776123, "logits/rejected": -2.691213846206665, "logps/chosen": -320.1142272949219, "logps/rejected": -289.4026794433594, "loss": 0.6263, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.0063354698941111565, "rewards/margins": 0.18059185147285461, "rewards/margins_max": 0.4770359992980957, "rewards/margins_min": -0.0981023907661438, "rewards/margins_std": 0.2540958523750305, "rewards/rejected": -0.18692728877067566, "step": 1080 }, { "epoch": 0.26, "grad_norm": 3.3092779647822512, "learning_rate": 4.6157773694819396e-07, "logits/chosen": -2.7550148963928223, "logits/rejected": -2.742978572845459, "logps/chosen": -282.58740234375, "logps/rejected": -348.9330139160156, "loss": 0.6443, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07441152632236481, "rewards/margins": 0.12869814038276672, "rewards/margins_max": 0.483846515417099, "rewards/margins_min": -0.22396783530712128, "rewards/margins_std": 0.3191817104816437, "rewards/rejected": -0.20310965180397034, "step": 1090 }, { "epoch": 0.26, "grad_norm": 3.5516894766333014, "learning_rate": 4.60457070233401e-07, "logits/chosen": -2.646277904510498, "logits/rejected": -2.645847797393799, "logps/chosen": -261.3597412109375, "logps/rejected": -252.5585479736328, "loss": 0.6465, "rewards/accuracies": 0.75, "rewards/chosen": -0.07279185205698013, "rewards/margins": 0.14220745861530304, "rewards/margins_max": 0.4044904112815857, "rewards/margins_min": -0.1086801290512085, "rewards/margins_std": 0.2287602722644806, "rewards/rejected": -0.21499928832054138, "step": 1100 }, { "epoch": 0.26, "eval_logits/chosen": -2.730116128921509, "eval_logits/rejected": -2.698188543319702, "eval_logps/chosen": -287.3659362792969, "eval_logps/rejected": -282.8690490722656, "eval_loss": 0.637828528881073, "eval_rewards/accuracies": 0.6775000095367432, "eval_rewards/chosen": -0.029106074944138527, "eval_rewards/margins": 0.14105208218097687, "eval_rewards/margins_max": 0.6107731461524963, "eval_rewards/margins_min": -0.2945648431777954, "eval_rewards/margins_std": 0.303110808134079, "eval_rewards/rejected": -0.17015816271305084, "eval_runtime": 859.776, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 1100 }, { "epoch": 0.27, "grad_norm": 3.1790500911139357, "learning_rate": 4.5932169573657987e-07, "logits/chosen": -2.815812110900879, "logits/rejected": -2.797717571258545, "logps/chosen": -319.1174011230469, "logps/rejected": -329.5782165527344, "loss": 0.6299, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.015058162622153759, "rewards/margins": 0.18906506896018982, "rewards/margins_max": 0.4647350311279297, "rewards/margins_min": -0.12183723598718643, "rewards/margins_std": 0.25658389925956726, "rewards/rejected": -0.17400690913200378, "step": 1110 }, { "epoch": 0.27, "grad_norm": 4.171269633630187, "learning_rate": 4.581716928033216e-07, "logits/chosen": -2.7732093334198, "logits/rejected": -2.7576842308044434, "logps/chosen": -284.92010498046875, "logps/rejected": -297.70751953125, "loss": 0.641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04591558128595352, "rewards/margins": 0.09049420803785324, "rewards/margins_max": 0.4978618025779724, "rewards/margins_min": -0.2861872613430023, "rewards/margins_std": 0.34444791078567505, "rewards/rejected": -0.13640980422496796, "step": 1120 }, { "epoch": 0.27, "grad_norm": 3.4951713495995476, "learning_rate": 4.5700714180152467e-07, "logits/chosen": -2.688694477081299, "logits/rejected": -2.654418468475342, "logps/chosen": -230.10556030273438, "logps/rejected": -241.9662322998047, "loss": 0.637, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.06187693029642105, "rewards/margins": 0.12760981917381287, "rewards/margins_max": 0.45869001746177673, "rewards/margins_min": -0.14676091074943542, "rewards/margins_std": 0.2812282145023346, "rewards/rejected": -0.18948674201965332, "step": 1130 }, { "epoch": 0.27, "grad_norm": 4.7663237512836645, "learning_rate": 4.5582812411577887e-07, "logits/chosen": -2.7296979427337646, "logits/rejected": -2.7077085971832275, "logps/chosen": -284.59747314453125, "logps/rejected": -273.9245910644531, "loss": 0.6417, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05198391154408455, "rewards/margins": 0.15814228355884552, "rewards/margins_max": 0.4948105812072754, "rewards/margins_min": -0.1632794737815857, "rewards/margins_std": 0.28802981972694397, "rewards/rejected": -0.21012616157531738, "step": 1140 }, { "epoch": 0.28, "grad_norm": 3.377083311996758, "learning_rate": 4.546347221416772e-07, "logits/chosen": -2.728276252746582, "logits/rejected": -2.6977105140686035, "logps/chosen": -266.24090576171875, "logps/rejected": -267.21038818359375, "loss": 0.6267, "rewards/accuracies": 0.75, "rewards/chosen": -0.021903954446315765, "rewards/margins": 0.18644428253173828, "rewards/margins_max": 0.5090216994285583, "rewards/margins_min": -0.06704654544591904, "rewards/margins_std": 0.2600798010826111, "rewards/rejected": -0.20834822952747345, "step": 1150 }, { "epoch": 0.28, "grad_norm": 3.6845226876229042, "learning_rate": 4.534270192800581e-07, "logits/chosen": -2.707961082458496, "logits/rejected": -2.677424669265747, "logps/chosen": -262.2182922363281, "logps/rejected": -269.26556396484375, "loss": 0.6281, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.026781385764479637, "rewards/margins": 0.1710069179534912, "rewards/margins_max": 0.4876475930213928, "rewards/margins_min": -0.13151448965072632, "rewards/margins_std": 0.2734828591346741, "rewards/rejected": -0.14422553777694702, "step": 1160 }, { "epoch": 0.28, "grad_norm": 4.547609879401967, "learning_rate": 4.5220509993117684e-07, "logits/chosen": -2.789504289627075, "logits/rejected": -2.7169666290283203, "logps/chosen": -298.80975341796875, "logps/rejected": -274.0549621582031, "loss": 0.6339, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03234567120671272, "rewards/margins": 0.11823008954524994, "rewards/margins_max": 0.45920103788375854, "rewards/margins_min": -0.26762890815734863, "rewards/margins_std": 0.33254751563072205, "rewards/rejected": -0.15057575702667236, "step": 1170 }, { "epoch": 0.28, "grad_norm": 2.7231328429432455, "learning_rate": 4.509690494888071e-07, "logits/chosen": -2.766167402267456, "logits/rejected": -2.7049505710601807, "logps/chosen": -335.00714111328125, "logps/rejected": -295.6134338378906, "loss": 0.6278, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.04288125038146973, "rewards/margins": 0.17231056094169617, "rewards/margins_max": 0.47455501556396484, "rewards/margins_min": -0.13174986839294434, "rewards/margins_std": 0.2745053172111511, "rewards/rejected": -0.12942931056022644, "step": 1180 }, { "epoch": 0.28, "grad_norm": 3.9147600172988883, "learning_rate": 4.4971895433427356e-07, "logits/chosen": -2.726125955581665, "logits/rejected": -2.7111525535583496, "logps/chosen": -227.27102661132812, "logps/rejected": -236.37942504882812, "loss": 0.6119, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.00787246972322464, "rewards/margins": 0.1817532330751419, "rewards/margins_max": 0.5178753137588501, "rewards/margins_min": -0.14762163162231445, "rewards/margins_std": 0.3076411187648773, "rewards/rejected": -0.17388075590133667, "step": 1190 }, { "epoch": 0.29, "grad_norm": 5.455820024290682, "learning_rate": 4.4845490183041454e-07, "logits/chosen": -2.7426819801330566, "logits/rejected": -2.7446188926696777, "logps/chosen": -308.69561767578125, "logps/rejected": -318.27105712890625, "loss": 0.6121, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.03109120763838291, "rewards/margins": 0.18727445602416992, "rewards/margins_max": 0.45334863662719727, "rewards/margins_min": -0.10804096609354019, "rewards/margins_std": 0.2554578185081482, "rewards/rejected": -0.21836566925048828, "step": 1200 }, { "epoch": 0.29, "eval_logits/chosen": -2.7208263874053955, "eval_logits/rejected": -2.68928861618042, "eval_logps/chosen": -291.0350341796875, "eval_logps/rejected": -288.4626159667969, "eval_loss": 0.6317090392112732, "eval_rewards/accuracies": 0.6779999732971191, "eval_rewards/chosen": -0.06579707562923431, "eval_rewards/margins": 0.1602962464094162, "eval_rewards/margins_max": 0.6846780776977539, "eval_rewards/margins_min": -0.3354347348213196, "eval_rewards/margins_std": 0.3418317437171936, "eval_rewards/rejected": -0.2260933220386505, "eval_runtime": 859.8431, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 1200 }, { "epoch": 0.29, "grad_norm": 3.3676733387809215, "learning_rate": 4.4717698031547733e-07, "logits/chosen": -2.787823438644409, "logits/rejected": -2.7212958335876465, "logps/chosen": -312.23675537109375, "logps/rejected": -287.49444580078125, "loss": 0.6022, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.050309885293245316, "rewards/margins": 0.2127988636493683, "rewards/margins_max": 0.6404433250427246, "rewards/margins_min": -0.13751380145549774, "rewards/margins_std": 0.3414255380630493, "rewards/rejected": -0.2631087601184845, "step": 1210 }, { "epoch": 0.29, "grad_norm": 3.938071315431553, "learning_rate": 4.458852790969445e-07, "logits/chosen": -2.7835850715637207, "logits/rejected": -2.75274658203125, "logps/chosen": -278.7928771972656, "logps/rejected": -285.7991027832031, "loss": 0.6139, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.07205849885940552, "rewards/margins": 0.16392816603183746, "rewards/margins_max": 0.5056854486465454, "rewards/margins_min": -0.16414335370063782, "rewards/margins_std": 0.29500722885131836, "rewards/rejected": -0.23598666489124298, "step": 1220 }, { "epoch": 0.29, "grad_norm": 4.625542479996582, "learning_rate": 4.4457988844529204e-07, "logits/chosen": -2.7648215293884277, "logits/rejected": -2.73262619972229, "logps/chosen": -258.94415283203125, "logps/rejected": -301.65460205078125, "loss": 0.6296, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08725883066654205, "rewards/margins": 0.18550553917884827, "rewards/margins_max": 0.6380602121353149, "rewards/margins_min": -0.20465464890003204, "rewards/margins_std": 0.37270504236221313, "rewards/rejected": -0.2727643549442291, "step": 1230 }, { "epoch": 0.3, "grad_norm": 4.067322201866829, "learning_rate": 4.432608995876819e-07, "logits/chosen": -2.7919411659240723, "logits/rejected": -2.7073256969451904, "logps/chosen": -276.91094970703125, "logps/rejected": -264.3546142578125, "loss": 0.6487, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08658798038959503, "rewards/margins": 0.11079300940036774, "rewards/margins_max": 0.5283591747283936, "rewards/margins_min": -0.31804972887039185, "rewards/margins_std": 0.3785925805568695, "rewards/rejected": -0.19738095998764038, "step": 1240 }, { "epoch": 0.3, "grad_norm": 3.6440421509131635, "learning_rate": 4.419284047015854e-07, "logits/chosen": -2.795815944671631, "logits/rejected": -2.7724575996398926, "logps/chosen": -298.9609375, "logps/rejected": -261.5467224121094, "loss": 0.6215, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.08188175410032272, "rewards/margins": 0.18586918711662292, "rewards/margins_max": 0.5499319434165955, "rewards/margins_min": -0.1795201450586319, "rewards/margins_std": 0.32101696729660034, "rewards/rejected": -0.26775094866752625, "step": 1250 }, { "epoch": 0.3, "grad_norm": 5.21366259739191, "learning_rate": 4.4058249690834235e-07, "logits/chosen": -2.7669315338134766, "logits/rejected": -2.7613534927368164, "logps/chosen": -264.0572509765625, "logps/rejected": -264.752685546875, "loss": 0.6176, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13562199473381042, "rewards/margins": 0.15420284867286682, "rewards/margins_max": 0.44718003273010254, "rewards/margins_min": -0.14009472727775574, "rewards/margins_std": 0.26433926820755005, "rewards/rejected": -0.28982487320899963, "step": 1260 }, { "epoch": 0.3, "grad_norm": 5.1530670514385015, "learning_rate": 4.39223270266653e-07, "logits/chosen": -2.7849080562591553, "logits/rejected": -2.7341203689575195, "logps/chosen": -299.35552978515625, "logps/rejected": -314.02850341796875, "loss": 0.6043, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10228396952152252, "rewards/margins": 0.20735082030296326, "rewards/margins_max": 0.5664128661155701, "rewards/margins_min": -0.15627393126487732, "rewards/margins_std": 0.33992764353752136, "rewards/rejected": -0.30963483452796936, "step": 1270 }, { "epoch": 0.31, "grad_norm": 4.552507091267408, "learning_rate": 4.378508197660045e-07, "logits/chosen": -2.8158326148986816, "logits/rejected": -2.751063346862793, "logps/chosen": -320.4525146484375, "logps/rejected": -308.1334533691406, "loss": 0.6183, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11220784485340118, "rewards/margins": 0.23017704486846924, "rewards/margins_max": 0.6422857046127319, "rewards/margins_min": -0.21641802787780762, "rewards/margins_std": 0.3890889883041382, "rewards/rejected": -0.34238487482070923, "step": 1280 }, { "epoch": 0.31, "grad_norm": 3.7868892769010043, "learning_rate": 4.364652413200325e-07, "logits/chosen": -2.81675124168396, "logits/rejected": -2.758913993835449, "logps/chosen": -323.35797119140625, "logps/rejected": -289.88714599609375, "loss": 0.6069, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12105697393417358, "rewards/margins": 0.229929119348526, "rewards/margins_max": 0.6117097735404968, "rewards/margins_min": -0.11338132619857788, "rewards/margins_std": 0.32332319021224976, "rewards/rejected": -0.3509860932826996, "step": 1290 }, { "epoch": 0.31, "grad_norm": 7.258932464550304, "learning_rate": 4.35066631759819e-07, "logits/chosen": -2.745797634124756, "logits/rejected": -2.7432026863098145, "logps/chosen": -297.58990478515625, "logps/rejected": -297.460205078125, "loss": 0.6113, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1302531361579895, "rewards/margins": 0.22044309973716736, "rewards/margins_max": 0.6038533449172974, "rewards/margins_min": -0.1656443476676941, "rewards/margins_std": 0.34139499068260193, "rewards/rejected": -0.35069626569747925, "step": 1300 }, { "epoch": 0.31, "eval_logits/chosen": -2.72509765625, "eval_logits/rejected": -2.6940572261810303, "eval_logps/chosen": -302.64703369140625, "eval_logps/rejected": -301.4143981933594, "eval_loss": 0.6287034749984741, "eval_rewards/accuracies": 0.6819999814033508, "eval_rewards/chosen": -0.18191717565059662, "eval_rewards/margins": 0.17369407415390015, "eval_rewards/margins_max": 0.7286714911460876, "eval_rewards/margins_min": -0.3415972888469696, "eval_rewards/margins_std": 0.36210083961486816, "eval_rewards/rejected": -0.35561126470565796, "eval_runtime": 859.8191, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 1300 }, { "epoch": 0.31, "grad_norm": 4.240268786205948, "learning_rate": 4.3365508882712445e-07, "logits/chosen": -2.7560200691223145, "logits/rejected": -2.7424092292785645, "logps/chosen": -326.30792236328125, "logps/rejected": -296.25640869140625, "loss": 0.6078, "rewards/accuracies": 0.625, "rewards/chosen": -0.08834477514028549, "rewards/margins": 0.2148284614086151, "rewards/margins_max": 0.7110589146614075, "rewards/margins_min": -0.19051234424114227, "rewards/margins_std": 0.4097180962562561, "rewards/rejected": -0.3031732439994812, "step": 1310 }, { "epoch": 0.32, "grad_norm": 4.561867866097655, "learning_rate": 4.322307111675573e-07, "logits/chosen": -2.6840503215789795, "logits/rejected": -2.655874013900757, "logps/chosen": -283.03094482421875, "logps/rejected": -266.55950927734375, "loss": 0.6208, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1422191858291626, "rewards/margins": 0.20038846135139465, "rewards/margins_max": 0.6033264398574829, "rewards/margins_min": -0.1278662532567978, "rewards/margins_std": 0.32722723484039307, "rewards/rejected": -0.34260767698287964, "step": 1320 }, { "epoch": 0.32, "grad_norm": 3.4629976156007873, "learning_rate": 4.3079359832368055e-07, "logits/chosen": -2.772587299346924, "logits/rejected": -2.6875176429748535, "logps/chosen": -298.23114013671875, "logps/rejected": -250.58224487304688, "loss": 0.5873, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.05119676515460014, "rewards/margins": 0.21297863125801086, "rewards/margins_max": 0.6446580290794373, "rewards/margins_min": -0.1635059416294098, "rewards/margins_std": 0.3629075586795807, "rewards/rejected": -0.2641753554344177, "step": 1330 }, { "epoch": 0.32, "grad_norm": 5.110179032923911, "learning_rate": 4.2934385072805467e-07, "logits/chosen": -2.6937944889068604, "logits/rejected": -2.6603851318359375, "logps/chosen": -269.38800048828125, "logps/rejected": -250.0190887451172, "loss": 0.594, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.033771999180316925, "rewards/margins": 0.20248901844024658, "rewards/margins_max": 0.6091136336326599, "rewards/margins_min": -0.19329795241355896, "rewards/margins_std": 0.375012069940567, "rewards/rejected": -0.2362610101699829, "step": 1340 }, { "epoch": 0.32, "grad_norm": 4.528447830349519, "learning_rate": 4.278815696962195e-07, "logits/chosen": -2.7512173652648926, "logits/rejected": -2.728698968887329, "logps/chosen": -299.51873779296875, "logps/rejected": -314.3564758300781, "loss": 0.595, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.05349518731236458, "rewards/margins": 0.24695232510566711, "rewards/margins_max": 0.6851639151573181, "rewards/margins_min": -0.15185198187828064, "rewards/margins_std": 0.37347060441970825, "rewards/rejected": -0.3004475235939026, "step": 1350 }, { "epoch": 0.33, "grad_norm": 3.756856118552827, "learning_rate": 4.264068574196129e-07, "logits/chosen": -2.680025577545166, "logits/rejected": -2.6544578075408936, "logps/chosen": -300.19915771484375, "logps/rejected": -276.1390686035156, "loss": 0.6219, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06253058463335037, "rewards/margins": 0.24794158339500427, "rewards/margins_max": 0.6766451597213745, "rewards/margins_min": -0.20260021090507507, "rewards/margins_std": 0.3898962438106537, "rewards/rejected": -0.31047219038009644, "step": 1360 }, { "epoch": 0.33, "grad_norm": 8.004265125098687, "learning_rate": 4.2491981695843016e-07, "logits/chosen": -2.706125020980835, "logits/rejected": -2.731999158859253, "logps/chosen": -265.1845703125, "logps/rejected": -318.8197326660156, "loss": 0.6301, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.13150812685489655, "rewards/margins": 0.18570031225681305, "rewards/margins_max": 0.6965837478637695, "rewards/margins_min": -0.30658960342407227, "rewards/margins_std": 0.4410664439201355, "rewards/rejected": -0.3172084093093872, "step": 1370 }, { "epoch": 0.33, "grad_norm": 3.700065024552013, "learning_rate": 4.2342055223442093e-07, "logits/chosen": -2.7233641147613525, "logits/rejected": -2.7470736503601074, "logps/chosen": -290.7392578125, "logps/rejected": -291.89642333984375, "loss": 0.629, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11083575338125229, "rewards/margins": 0.20581002533435822, "rewards/margins_max": 0.6339712738990784, "rewards/margins_min": -0.2242089807987213, "rewards/margins_std": 0.3768702745437622, "rewards/rejected": -0.3166458010673523, "step": 1380 }, { "epoch": 0.33, "grad_norm": 4.60283629009898, "learning_rate": 4.2190916802362687e-07, "logits/chosen": -2.777204990386963, "logits/rejected": -2.7478649616241455, "logps/chosen": -266.34307861328125, "logps/rejected": -287.4773254394531, "loss": 0.6293, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15680015087127686, "rewards/margins": 0.13014456629753113, "rewards/margins_max": 0.5216537714004517, "rewards/margins_min": -0.27942514419555664, "rewards/margins_std": 0.36587920784950256, "rewards/rejected": -0.286944717168808, "step": 1390 }, { "epoch": 0.34, "grad_norm": 4.909187943690839, "learning_rate": 4.203857699490593e-07, "logits/chosen": -2.7606589794158936, "logits/rejected": -2.703326940536499, "logps/chosen": -290.61517333984375, "logps/rejected": -263.78778076171875, "loss": 0.6058, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1779560148715973, "rewards/margins": 0.22627469897270203, "rewards/margins_max": 0.6385394334793091, "rewards/margins_min": -0.21331973373889923, "rewards/margins_std": 0.38510316610336304, "rewards/rejected": -0.4042307436466217, "step": 1400 }, { "epoch": 0.34, "eval_logits/chosen": -2.7134692668914795, "eval_logits/rejected": -2.6823277473449707, "eval_logps/chosen": -297.353759765625, "eval_logps/rejected": -297.89019775390625, "eval_loss": 0.6233600378036499, "eval_rewards/accuracies": 0.6775000095367432, "eval_rewards/chosen": -0.12898415327072144, "eval_rewards/margins": 0.1913849264383316, "eval_rewards/margins_max": 0.7907646894454956, "eval_rewards/margins_min": -0.3942703902721405, "eval_rewards/margins_std": 0.3995422422885895, "eval_rewards/rejected": -0.32036906480789185, "eval_runtime": 859.9261, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 1400 }, { "epoch": 0.34, "grad_norm": 5.627722356903511, "learning_rate": 4.1885046447331816e-07, "logits/chosen": -2.676079034805298, "logits/rejected": -2.689484119415283, "logps/chosen": -316.91046142578125, "logps/rejected": -301.6855163574219, "loss": 0.5979, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.040952786803245544, "rewards/margins": 0.2542146146297455, "rewards/margins_max": 0.6011955738067627, "rewards/margins_min": -0.07450132817029953, "rewards/margins_std": 0.3058848977088928, "rewards/rejected": -0.29516738653182983, "step": 1410 }, { "epoch": 0.34, "grad_norm": 5.37700385067337, "learning_rate": 4.173033588911511e-07, "logits/chosen": -2.7434334754943848, "logits/rejected": -2.735919952392578, "logps/chosen": -325.684814453125, "logps/rejected": -336.29461669921875, "loss": 0.6186, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.0978202074766159, "rewards/margins": 0.2610817849636078, "rewards/margins_max": 0.7425618171691895, "rewards/margins_min": -0.14109277725219727, "rewards/margins_std": 0.3950235843658447, "rewards/rejected": -0.3589020371437073, "step": 1420 }, { "epoch": 0.34, "grad_norm": 5.042836371471277, "learning_rate": 4.157445613219559e-07, "logits/chosen": -2.6384220123291016, "logits/rejected": -2.6442770957946777, "logps/chosen": -285.9534606933594, "logps/rejected": -309.20098876953125, "loss": 0.5913, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.16042251884937286, "rewards/margins": 0.2354971170425415, "rewards/margins_max": 0.6843439340591431, "rewards/margins_min": -0.14893712103366852, "rewards/margins_std": 0.3668159246444702, "rewards/rejected": -0.3959196209907532, "step": 1430 }, { "epoch": 0.34, "grad_norm": 4.964379073516777, "learning_rate": 4.141741807022243e-07, "logits/chosen": -2.7422003746032715, "logits/rejected": -2.687011241912842, "logps/chosen": -315.2998352050781, "logps/rejected": -277.56182861328125, "loss": 0.6199, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11232249438762665, "rewards/margins": 0.21811863780021667, "rewards/margins_max": 0.6316202878952026, "rewards/margins_min": -0.19104191660881042, "rewards/margins_std": 0.3665235638618469, "rewards/rejected": -0.33044111728668213, "step": 1440 }, { "epoch": 0.35, "grad_norm": 3.867930208036112, "learning_rate": 4.1259232677792865e-07, "logits/chosen": -2.719663143157959, "logits/rejected": -2.714693069458008, "logps/chosen": -267.74310302734375, "logps/rejected": -259.67572021484375, "loss": 0.6273, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.07311922311782837, "rewards/margins": 0.1725226491689682, "rewards/margins_max": 0.5369999408721924, "rewards/margins_min": -0.18779130280017853, "rewards/margins_std": 0.32979249954223633, "rewards/rejected": -0.24564187228679657, "step": 1450 }, { "epoch": 0.35, "grad_norm": 3.126978743648959, "learning_rate": 4.1099911009685294e-07, "logits/chosen": -2.6542458534240723, "logits/rejected": -2.6500654220581055, "logps/chosen": -313.4648742675781, "logps/rejected": -287.6639404296875, "loss": 0.6001, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.07746293395757675, "rewards/margins": 0.19846072793006897, "rewards/margins_max": 0.6413771510124207, "rewards/margins_min": -0.24117057025432587, "rewards/margins_std": 0.39690056443214417, "rewards/rejected": -0.27592363953590393, "step": 1460 }, { "epoch": 0.35, "grad_norm": 8.246208475634898, "learning_rate": 4.093946420008668e-07, "logits/chosen": -2.6944451332092285, "logits/rejected": -2.6753087043762207, "logps/chosen": -280.6971740722656, "logps/rejected": -270.0848083496094, "loss": 0.6186, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10358381271362305, "rewards/margins": 0.18480688333511353, "rewards/margins_max": 0.67694491147995, "rewards/margins_min": -0.21014046669006348, "rewards/margins_std": 0.39692941308021545, "rewards/rejected": -0.2883906960487366, "step": 1470 }, { "epoch": 0.35, "grad_norm": 4.551051368391622, "learning_rate": 4.0777903461814443e-07, "logits/chosen": -2.7240898609161377, "logits/rejected": -2.7123374938964844, "logps/chosen": -304.209228515625, "logps/rejected": -275.527099609375, "loss": 0.6363, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.06793724000453949, "rewards/margins": 0.19897602498531342, "rewards/margins_max": 0.6630719900131226, "rewards/margins_min": -0.22334155440330505, "rewards/margins_std": 0.39632511138916016, "rewards/rejected": -0.2669132649898529, "step": 1480 }, { "epoch": 0.36, "grad_norm": 6.694572915321387, "learning_rate": 4.061524008553285e-07, "logits/chosen": -2.6535632610321045, "logits/rejected": -2.633209466934204, "logps/chosen": -263.6488037109375, "logps/rejected": -263.69085693359375, "loss": 0.5912, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0520213358104229, "rewards/margins": 0.25981205701828003, "rewards/margins_max": 0.6896715760231018, "rewards/margins_min": -0.2117619812488556, "rewards/margins_std": 0.4034877419471741, "rewards/rejected": -0.31183338165283203, "step": 1490 }, { "epoch": 0.36, "grad_norm": 5.031901252250718, "learning_rate": 4.045148543896396e-07, "logits/chosen": -2.7256431579589844, "logits/rejected": -2.683338165283203, "logps/chosen": -298.939208984375, "logps/rejected": -278.4862365722656, "loss": 0.6169, "rewards/accuracies": 0.625, "rewards/chosen": -0.132019504904747, "rewards/margins": 0.1406988799571991, "rewards/margins_max": 0.6073684096336365, "rewards/margins_min": -0.3294394612312317, "rewards/margins_std": 0.41978007555007935, "rewards/rejected": -0.2727183699607849, "step": 1500 }, { "epoch": 0.36, "eval_logits/chosen": -2.695730686187744, "eval_logits/rejected": -2.6648154258728027, "eval_logps/chosen": -296.9002990722656, "eval_logps/rejected": -298.718017578125, "eval_loss": 0.6194451451301575, "eval_rewards/accuracies": 0.6790000200271606, "eval_rewards/chosen": -0.1244499683380127, "eval_rewards/margins": 0.20419739186763763, "eval_rewards/margins_max": 0.8340767025947571, "eval_rewards/margins_min": -0.4094077944755554, "eval_rewards/margins_std": 0.4196898937225342, "eval_rewards/rejected": -0.3286473751068115, "eval_runtime": 859.603, "eval_samples_per_second": 4.653, "eval_steps_per_second": 0.291, "step": 1500 }, { "epoch": 0.36, "grad_norm": 7.5550805890009425, "learning_rate": 4.028665096609323e-07, "logits/chosen": -2.7359156608581543, "logits/rejected": -2.729952096939087, "logps/chosen": -322.425048828125, "logps/rejected": -321.57049560546875, "loss": 0.6208, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16162991523742676, "rewards/margins": 0.13901960849761963, "rewards/margins_max": 0.5425564050674438, "rewards/margins_min": -0.25365597009658813, "rewards/margins_std": 0.36137858033180237, "rewards/rejected": -0.3006495535373688, "step": 1510 }, { "epoch": 0.36, "grad_norm": 5.143757431006366, "learning_rate": 4.01207481863697e-07, "logits/chosen": -2.8021790981292725, "logits/rejected": -2.7534518241882324, "logps/chosen": -345.64410400390625, "logps/rejected": -318.15753173828125, "loss": 0.593, "rewards/accuracies": 0.75, "rewards/chosen": 0.01674334518611431, "rewards/margins": 0.309499591588974, "rewards/margins_max": 0.8094781637191772, "rewards/margins_min": -0.15684106945991516, "rewards/margins_std": 0.43172699213027954, "rewards/rejected": -0.29275625944137573, "step": 1520 }, { "epoch": 0.37, "grad_norm": 10.866386773496997, "learning_rate": 3.9953788693901e-07, "logits/chosen": -2.6948866844177246, "logits/rejected": -2.6677167415618896, "logps/chosen": -323.5318908691406, "logps/rejected": -308.58563232421875, "loss": 0.6204, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1379726380109787, "rewards/margins": 0.1759684681892395, "rewards/margins_max": 0.6472272276878357, "rewards/margins_min": -0.3483063578605652, "rewards/margins_std": 0.4497564733028412, "rewards/rejected": -0.3139411509037018, "step": 1530 }, { "epoch": 0.37, "grad_norm": 4.314291761496934, "learning_rate": 3.978578415664306e-07, "logits/chosen": -2.6268324851989746, "logits/rejected": -2.6289455890655518, "logps/chosen": -273.48516845703125, "logps/rejected": -260.3494873046875, "loss": 0.576, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14631512761116028, "rewards/margins": 0.27899685502052307, "rewards/margins_max": 0.7275093197822571, "rewards/margins_min": -0.1443461924791336, "rewards/margins_std": 0.40121975541114807, "rewards/rejected": -0.42531198263168335, "step": 1540 }, { "epoch": 0.37, "grad_norm": 5.455099453390994, "learning_rate": 3.9616746315584733e-07, "logits/chosen": -2.735430955886841, "logits/rejected": -2.6580283641815186, "logps/chosen": -334.23291015625, "logps/rejected": -278.3861999511719, "loss": 0.5905, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10325656086206436, "rewards/margins": 0.30803465843200684, "rewards/margins_max": 0.7940778136253357, "rewards/margins_min": -0.10969796031713486, "rewards/margins_std": 0.4056004583835602, "rewards/rejected": -0.4112912118434906, "step": 1550 }, { "epoch": 0.37, "grad_norm": 5.315048324638009, "learning_rate": 3.9446686983927236e-07, "logits/chosen": -2.671513795852661, "logits/rejected": -2.651416301727295, "logps/chosen": -268.4781494140625, "logps/rejected": -307.71929931640625, "loss": 0.6002, "rewards/accuracies": 0.75, "rewards/chosen": -0.10599759966135025, "rewards/margins": 0.25854843854904175, "rewards/margins_max": 0.6484243869781494, "rewards/margins_min": -0.18162080645561218, "rewards/margins_std": 0.36594900488853455, "rewards/rejected": -0.364546000957489, "step": 1560 }, { "epoch": 0.38, "grad_norm": 7.897137241515315, "learning_rate": 3.927561804625863e-07, "logits/chosen": -2.6680185794830322, "logits/rejected": -2.645139217376709, "logps/chosen": -319.42694091796875, "logps/rejected": -326.59466552734375, "loss": 0.6295, "rewards/accuracies": 0.625, "rewards/chosen": -0.04393978416919708, "rewards/margins": 0.1751846820116043, "rewards/margins_max": 0.6941227912902832, "rewards/margins_min": -0.28787922859191895, "rewards/margins_std": 0.4296892583370209, "rewards/rejected": -0.21912448108196259, "step": 1570 }, { "epoch": 0.38, "grad_norm": 4.353279911743585, "learning_rate": 3.910355145772323e-07, "logits/chosen": -2.6887450218200684, "logits/rejected": -2.6724624633789062, "logps/chosen": -281.3534240722656, "logps/rejected": -299.25909423828125, "loss": 0.5963, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.006468605250120163, "rewards/margins": 0.3195800483226776, "rewards/margins_max": 0.7575172185897827, "rewards/margins_min": -0.10346603393554688, "rewards/margins_std": 0.39106759428977966, "rewards/rejected": -0.31311145424842834, "step": 1580 }, { "epoch": 0.38, "grad_norm": 5.15471131457158, "learning_rate": 3.893049924318613e-07, "logits/chosen": -2.6913163661956787, "logits/rejected": -2.6768882274627686, "logps/chosen": -276.04986572265625, "logps/rejected": -302.8215026855469, "loss": 0.5666, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.00238300790078938, "rewards/margins": 0.2921072840690613, "rewards/margins_max": 0.6786126494407654, "rewards/margins_min": -0.09073235094547272, "rewards/margins_std": 0.35521072149276733, "rewards/rejected": -0.2944903075695038, "step": 1590 }, { "epoch": 0.38, "grad_norm": 8.551269294260868, "learning_rate": 3.875647349639286e-07, "logits/chosen": -2.7418625354766846, "logits/rejected": -2.694929838180542, "logps/chosen": -296.73883056640625, "logps/rejected": -245.3251495361328, "loss": 0.5809, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.08604216575622559, "rewards/margins": 0.27075013518333435, "rewards/margins_max": 0.7026273012161255, "rewards/margins_min": -0.14759303629398346, "rewards/margins_std": 0.38966792821884155, "rewards/rejected": -0.35679227113723755, "step": 1600 }, { "epoch": 0.38, "eval_logits/chosen": -2.6852588653564453, "eval_logits/rejected": -2.654654026031494, "eval_logps/chosen": -295.7021484375, "eval_logps/rejected": -298.765869140625, "eval_loss": 0.6163187026977539, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -0.1124684140086174, "eval_rewards/margins": 0.21665772795677185, "eval_rewards/margins_max": 0.8822912573814392, "eval_rewards/margins_min": -0.42427411675453186, "eval_rewards/margins_std": 0.4398875832557678, "eval_rewards/rejected": -0.3291260898113251, "eval_runtime": 859.8439, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 1600 }, { "epoch": 0.39, "grad_norm": 6.465168869279501, "learning_rate": 3.8581486379124185e-07, "logits/chosen": -2.760385036468506, "logits/rejected": -2.756248950958252, "logps/chosen": -324.0615234375, "logps/rejected": -299.63360595703125, "loss": 0.602, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1135634332895279, "rewards/margins": 0.28132662177085876, "rewards/margins_max": 0.75287926197052, "rewards/margins_min": -0.16431859135627747, "rewards/margins_std": 0.41888371109962463, "rewards/rejected": -0.3948900103569031, "step": 1610 }, { "epoch": 0.39, "grad_norm": 6.5531401234289905, "learning_rate": 3.840555012034622e-07, "logits/chosen": -2.6444177627563477, "logits/rejected": -2.5895159244537354, "logps/chosen": -265.99359130859375, "logps/rejected": -277.5436096191406, "loss": 0.5879, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13531748950481415, "rewards/margins": 0.22751832008361816, "rewards/margins_max": 0.6325951814651489, "rewards/margins_min": -0.1143103614449501, "rewards/margins_std": 0.34973862767219543, "rewards/rejected": -0.3628358244895935, "step": 1620 }, { "epoch": 0.39, "grad_norm": 5.518097666358285, "learning_rate": 3.822867701535578e-07, "logits/chosen": -2.6742734909057617, "logits/rejected": -2.666698694229126, "logps/chosen": -288.57415771484375, "logps/rejected": -277.10028076171875, "loss": 0.588, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17379966378211975, "rewards/margins": 0.2467818558216095, "rewards/margins_max": 0.7234400510787964, "rewards/margins_min": -0.2264634668827057, "rewards/margins_std": 0.42187896370887756, "rewards/rejected": -0.42058151960372925, "step": 1630 }, { "epoch": 0.39, "grad_norm": 18.82054437642546, "learning_rate": 3.805087942492112e-07, "logits/chosen": -2.644226312637329, "logits/rejected": -2.61177659034729, "logps/chosen": -289.1362609863281, "logps/rejected": -297.9856872558594, "loss": 0.5849, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.11222722381353378, "rewards/margins": 0.2772018015384674, "rewards/margins_max": 0.7367495894432068, "rewards/margins_min": -0.14642205834388733, "rewards/margins_std": 0.39903825521469116, "rewards/rejected": -0.3894290328025818, "step": 1640 }, { "epoch": 0.4, "grad_norm": 9.965038984024348, "learning_rate": 3.787216977441814e-07, "logits/chosen": -2.699742317199707, "logits/rejected": -2.66479229927063, "logps/chosen": -272.3443908691406, "logps/rejected": -304.8819580078125, "loss": 0.599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12519483268260956, "rewards/margins": 0.2280396968126297, "rewards/margins_max": 0.7019823789596558, "rewards/margins_min": -0.29308685660362244, "rewards/margins_std": 0.4494010806083679, "rewards/rejected": -0.35323458909988403, "step": 1650 }, { "epoch": 0.4, "grad_norm": 5.680025324160577, "learning_rate": 3.7692560552961976e-07, "logits/chosen": -2.693373203277588, "logits/rejected": -2.65922474861145, "logps/chosen": -267.5146789550781, "logps/rejected": -293.3586120605469, "loss": 0.6309, "rewards/accuracies": 0.625, "rewards/chosen": -0.16356855630874634, "rewards/margins": 0.155622199177742, "rewards/margins_max": 0.5817619562149048, "rewards/margins_min": -0.26430314779281616, "rewards/margins_std": 0.3766292929649353, "rewards/rejected": -0.31919074058532715, "step": 1660 }, { "epoch": 0.4, "grad_norm": 5.609989058268003, "learning_rate": 3.7512064312534276e-07, "logits/chosen": -2.6858413219451904, "logits/rejected": -2.6196465492248535, "logps/chosen": -321.91766357421875, "logps/rejected": -328.6977233886719, "loss": 0.5723, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07771825045347214, "rewards/margins": 0.24791796505451202, "rewards/margins_max": 0.837367057800293, "rewards/margins_min": -0.2817845046520233, "rewards/margins_std": 0.5053817629814148, "rewards/rejected": -0.32563620805740356, "step": 1670 }, { "epoch": 0.4, "grad_norm": 5.810323656426692, "learning_rate": 3.7330693667105937e-07, "logits/chosen": -2.7743630409240723, "logits/rejected": -2.713573455810547, "logps/chosen": -327.6851501464844, "logps/rejected": -276.18072509765625, "loss": 0.6031, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.08341021835803986, "rewards/margins": 0.20441654324531555, "rewards/margins_max": 0.6587679982185364, "rewards/margins_min": -0.3237987160682678, "rewards/margins_std": 0.4360232353210449, "rewards/rejected": -0.2878267467021942, "step": 1680 }, { "epoch": 0.4, "grad_norm": 8.753814737640813, "learning_rate": 3.7148461291755626e-07, "logits/chosen": -2.6960878372192383, "logits/rejected": -2.667306900024414, "logps/chosen": -290.73480224609375, "logps/rejected": -329.62548828125, "loss": 0.6176, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.19077444076538086, "rewards/margins": 0.26856881380081177, "rewards/margins_max": 0.6381527781486511, "rewards/margins_min": -0.10217100381851196, "rewards/margins_std": 0.327498197555542, "rewards/rejected": -0.4593432545661926, "step": 1690 }, { "epoch": 0.41, "grad_norm": 7.790281393627224, "learning_rate": 3.6965379921783945e-07, "logits/chosen": -2.7386374473571777, "logits/rejected": -2.715653896331787, "logps/chosen": -309.8696594238281, "logps/rejected": -312.392578125, "loss": 0.5979, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.17000164091587067, "rewards/margins": 0.2889346480369568, "rewards/margins_max": 0.8153964281082153, "rewards/margins_min": -0.23362848162651062, "rewards/margins_std": 0.46885618567466736, "rewards/rejected": -0.45893630385398865, "step": 1700 }, { "epoch": 0.41, "eval_logits/chosen": -2.677316427230835, "eval_logits/rejected": -2.646608829498291, "eval_logps/chosen": -305.7200622558594, "eval_logps/rejected": -309.882080078125, "eval_loss": 0.61611407995224, "eval_rewards/accuracies": 0.6804999709129333, "eval_rewards/chosen": -0.2126469910144806, "eval_rewards/margins": 0.22764132916927338, "eval_rewards/margins_max": 0.9152846336364746, "eval_rewards/margins_min": -0.4468873143196106, "eval_rewards/margins_std": 0.46239328384399414, "eval_rewards/rejected": -0.44028833508491516, "eval_runtime": 859.5423, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 1700 }, { "epoch": 0.41, "grad_norm": 7.902971515243615, "learning_rate": 3.6781462351823455e-07, "logits/chosen": -2.692828416824341, "logits/rejected": -2.6957290172576904, "logps/chosen": -306.51361083984375, "logps/rejected": -357.2091369628906, "loss": 0.6187, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2036452293395996, "rewards/margins": 0.17123371362686157, "rewards/margins_max": 0.6390005946159363, "rewards/margins_min": -0.3068637251853943, "rewards/margins_std": 0.42158761620521545, "rewards/rejected": -0.3748789429664612, "step": 1710 }, { "epoch": 0.41, "grad_norm": 10.017763763451027, "learning_rate": 3.6596721434944513e-07, "logits/chosen": -2.7442879676818848, "logits/rejected": -2.70235276222229, "logps/chosen": -305.9901123046875, "logps/rejected": -317.50653076171875, "loss": 0.6225, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.22123900055885315, "rewards/margins": 0.16670717298984528, "rewards/margins_max": 0.7052956819534302, "rewards/margins_min": -0.2980489730834961, "rewards/margins_std": 0.44336920976638794, "rewards/rejected": -0.38794612884521484, "step": 1720 }, { "epoch": 0.41, "grad_norm": 8.084320112999801, "learning_rate": 3.6411170081757025e-07, "logits/chosen": -2.7196030616760254, "logits/rejected": -2.6944994926452637, "logps/chosen": -303.96490478515625, "logps/rejected": -293.9749450683594, "loss": 0.6045, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11599443852901459, "rewards/margins": 0.24298004806041718, "rewards/margins_max": 0.7548770308494568, "rewards/margins_min": -0.2850678563117981, "rewards/margins_std": 0.472770631313324, "rewards/rejected": -0.35897451639175415, "step": 1730 }, { "epoch": 0.42, "grad_norm": 5.6218213743034555, "learning_rate": 3.622482125950821e-07, "logits/chosen": -2.7411789894104004, "logits/rejected": -2.7309060096740723, "logps/chosen": -330.2281188964844, "logps/rejected": -325.1976013183594, "loss": 0.6017, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.19880354404449463, "rewards/margins": 0.25061336159706116, "rewards/margins_max": 0.6677745580673218, "rewards/margins_min": -0.18377824127674103, "rewards/margins_std": 0.3760093152523041, "rewards/rejected": -0.4494169354438782, "step": 1740 }, { "epoch": 0.42, "grad_norm": 6.509142388314121, "learning_rate": 3.603768799117637e-07, "logits/chosen": -2.687448263168335, "logits/rejected": -2.6664681434631348, "logps/chosen": -311.8000793457031, "logps/rejected": -307.2388610839844, "loss": 0.6217, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2506563365459442, "rewards/margins": 0.2387455701828003, "rewards/margins_max": 0.7983524799346924, "rewards/margins_min": -0.26707369089126587, "rewards/margins_std": 0.4749852120876312, "rewards/rejected": -0.4894019067287445, "step": 1750 }, { "epoch": 0.42, "grad_norm": 4.002326288879537, "learning_rate": 3.584978335456078e-07, "logits/chosen": -2.6560323238372803, "logits/rejected": -2.6839094161987305, "logps/chosen": -289.1376647949219, "logps/rejected": -336.69024658203125, "loss": 0.5897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.15337905287742615, "rewards/margins": 0.2918064296245575, "rewards/margins_max": 0.7951470613479614, "rewards/margins_min": -0.20761564373970032, "rewards/margins_std": 0.4645652770996094, "rewards/rejected": -0.4451855719089508, "step": 1760 }, { "epoch": 0.42, "grad_norm": 6.809309185912729, "learning_rate": 3.5661120481367757e-07, "logits/chosen": -2.770693302154541, "logits/rejected": -2.741415023803711, "logps/chosen": -338.86456298828125, "logps/rejected": -325.62506103515625, "loss": 0.6122, "rewards/accuracies": 0.75, "rewards/chosen": -0.1604558527469635, "rewards/margins": 0.2866327166557312, "rewards/margins_max": 0.7840811610221863, "rewards/margins_min": -0.2536107301712036, "rewards/margins_std": 0.4627218246459961, "rewards/rejected": -0.4470886290073395, "step": 1770 }, { "epoch": 0.43, "grad_norm": 4.392717941861058, "learning_rate": 3.547171255629292e-07, "logits/chosen": -2.64208722114563, "logits/rejected": -2.5881106853485107, "logps/chosen": -280.07275390625, "logps/rejected": -261.87738037109375, "loss": 0.568, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.09692516177892685, "rewards/margins": 0.3377645015716553, "rewards/margins_max": 0.7657067179679871, "rewards/margins_min": -0.13990791141986847, "rewards/margins_std": 0.3938554525375366, "rewards/rejected": -0.4346896708011627, "step": 1780 }, { "epoch": 0.43, "grad_norm": 6.009492041627867, "learning_rate": 3.528157281609984e-07, "logits/chosen": -2.6666150093078613, "logits/rejected": -2.6736645698547363, "logps/chosen": -234.13223266601562, "logps/rejected": -236.21127319335938, "loss": 0.6364, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.20780625939369202, "rewards/margins": 0.19026105105876923, "rewards/margins_max": 0.673338770866394, "rewards/margins_min": -0.265020489692688, "rewards/margins_std": 0.41245418787002563, "rewards/rejected": -0.39806729555130005, "step": 1790 }, { "epoch": 0.43, "grad_norm": 5.409885132534405, "learning_rate": 3.5090714548694916e-07, "logits/chosen": -2.5533435344696045, "logits/rejected": -2.557516574859619, "logps/chosen": -346.5416259765625, "logps/rejected": -326.44036865234375, "loss": 0.6034, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.15006712079048157, "rewards/margins": 0.21542784571647644, "rewards/margins_max": 0.715800940990448, "rewards/margins_min": -0.31234538555145264, "rewards/margins_std": 0.468539297580719, "rewards/rejected": -0.365494966506958, "step": 1800 }, { "epoch": 0.43, "eval_logits/chosen": -2.6671693325042725, "eval_logits/rejected": -2.636549949645996, "eval_logps/chosen": -300.97119140625, "eval_logps/rejected": -305.9888610839844, "eval_loss": 0.6123643517494202, "eval_rewards/accuracies": 0.6804999709129333, "eval_rewards/chosen": -0.1651587039232254, "eval_rewards/margins": 0.23619771003723145, "eval_rewards/margins_max": 0.9409818649291992, "eval_rewards/margins_min": -0.45071518421173096, "eval_rewards/margins_std": 0.4725970923900604, "eval_rewards/rejected": -0.40135645866394043, "eval_runtime": 859.8464, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 1800 }, { "epoch": 0.43, "grad_norm": 3.8143147386331404, "learning_rate": 3.489915109219882e-07, "logits/chosen": -2.643347978591919, "logits/rejected": -2.603933811187744, "logps/chosen": -268.88848876953125, "logps/rejected": -270.46148681640625, "loss": 0.5921, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2317378968000412, "rewards/margins": 0.2574358284473419, "rewards/margins_max": 0.6683996915817261, "rewards/margins_min": -0.17655006051063538, "rewards/margins_std": 0.3807302713394165, "rewards/rejected": -0.4891737103462219, "step": 1810 }, { "epoch": 0.44, "grad_norm": 7.259545455028369, "learning_rate": 3.4706895834014294e-07, "logits/chosen": -2.7250919342041016, "logits/rejected": -2.6996560096740723, "logps/chosen": -313.46527099609375, "logps/rejected": -324.8810119628906, "loss": 0.6077, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.20322100818157196, "rewards/margins": 0.21102142333984375, "rewards/margins_max": 0.6702043414115906, "rewards/margins_min": -0.25847309827804565, "rewards/margins_std": 0.41045159101486206, "rewards/rejected": -0.4142424166202545, "step": 1820 }, { "epoch": 0.44, "grad_norm": 9.341131041045113, "learning_rate": 3.451396220989064e-07, "logits/chosen": -2.740797519683838, "logits/rejected": -2.670044183731079, "logps/chosen": -295.61309814453125, "logps/rejected": -281.2939758300781, "loss": 0.5669, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.14443984627723694, "rewards/margins": 0.23712129890918732, "rewards/margins_max": 0.760201632976532, "rewards/margins_min": -0.28850245475769043, "rewards/margins_std": 0.48045676946640015, "rewards/rejected": -0.38156113028526306, "step": 1830 }, { "epoch": 0.44, "grad_norm": 8.45587869239201, "learning_rate": 3.43203637029847e-07, "logits/chosen": -2.7226665019989014, "logits/rejected": -2.6695210933685303, "logps/chosen": -357.6163024902344, "logps/rejected": -322.4307556152344, "loss": 0.6314, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.20505478978157043, "rewards/margins": 0.16338197886943817, "rewards/margins_max": 0.6402610540390015, "rewards/margins_min": -0.3119858205318451, "rewards/margins_std": 0.4293789267539978, "rewards/rejected": -0.3684367537498474, "step": 1840 }, { "epoch": 0.44, "grad_norm": 5.549724352065707, "learning_rate": 3.4126113842918643e-07, "logits/chosen": -2.695279598236084, "logits/rejected": -2.6542508602142334, "logps/chosen": -289.8641052246094, "logps/rejected": -282.6558532714844, "loss": 0.59, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.06596537679433823, "rewards/margins": 0.26139482855796814, "rewards/margins_max": 0.7130376696586609, "rewards/margins_min": -0.16647562384605408, "rewards/margins_std": 0.39078274369239807, "rewards/rejected": -0.32736021280288696, "step": 1850 }, { "epoch": 0.45, "grad_norm": 4.267178095524697, "learning_rate": 3.3931226204834397e-07, "logits/chosen": -2.749413013458252, "logits/rejected": -2.7500834465026855, "logps/chosen": -342.98895263671875, "logps/rejected": -348.8139343261719, "loss": 0.6153, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16457152366638184, "rewards/margins": 0.31747838854789734, "rewards/margins_max": 0.7840696573257446, "rewards/margins_min": -0.11327596008777618, "rewards/margins_std": 0.40646958351135254, "rewards/rejected": -0.48204994201660156, "step": 1860 }, { "epoch": 0.45, "grad_norm": 6.036927076368459, "learning_rate": 3.3735714408445e-07, "logits/chosen": -2.6747236251831055, "logits/rejected": -2.6891233921051025, "logps/chosen": -266.5842590332031, "logps/rejected": -293.5079040527344, "loss": 0.6045, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15758280456066132, "rewards/margins": 0.19839924573898315, "rewards/margins_max": 0.7733598947525024, "rewards/margins_min": -0.34084218740463257, "rewards/margins_std": 0.50069260597229, "rewards/rejected": -0.3559820055961609, "step": 1870 }, { "epoch": 0.45, "grad_norm": 4.122374190150992, "learning_rate": 3.3539592117082746e-07, "logits/chosen": -2.6495654582977295, "logits/rejected": -2.6142737865448, "logps/chosen": -293.51312255859375, "logps/rejected": -313.4895324707031, "loss": 0.6016, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.12317116558551788, "rewards/margins": 0.25961264967918396, "rewards/margins_max": 0.7596412301063538, "rewards/margins_min": -0.23033156991004944, "rewards/margins_std": 0.45355939865112305, "rewards/rejected": -0.3827837407588959, "step": 1880 }, { "epoch": 0.45, "grad_norm": 11.889700373667367, "learning_rate": 3.3342873036744346e-07, "logits/chosen": -2.7058122158050537, "logits/rejected": -2.6883273124694824, "logps/chosen": -307.9171447753906, "logps/rejected": -334.7265625, "loss": 0.6101, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.08564779907464981, "rewards/margins": 0.26072975993156433, "rewards/margins_max": 0.7731336355209351, "rewards/margins_min": -0.2751905620098114, "rewards/margins_std": 0.46360093355178833, "rewards/rejected": -0.34637758135795593, "step": 1890 }, { "epoch": 0.45, "grad_norm": 7.893029297996013, "learning_rate": 3.3145570915133067e-07, "logits/chosen": -2.6629598140716553, "logits/rejected": -2.6108360290527344, "logps/chosen": -269.05572509765625, "logps/rejected": -288.2684631347656, "loss": 0.5983, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.04458091780543327, "rewards/margins": 0.2424008548259735, "rewards/margins_max": 0.7750869393348694, "rewards/margins_min": -0.23572340607643127, "rewards/margins_std": 0.4512661099433899, "rewards/rejected": -0.2869817912578583, "step": 1900 }, { "epoch": 0.45, "eval_logits/chosen": -2.6688740253448486, "eval_logits/rejected": -2.6388726234436035, "eval_logps/chosen": -289.76275634765625, "eval_logps/rejected": -293.2796630859375, "eval_loss": 0.6144084334373474, "eval_rewards/accuracies": 0.6899999976158142, "eval_rewards/chosen": -0.05307444930076599, "eval_rewards/margins": 0.22118933498859406, "eval_rewards/margins_max": 0.8922848701477051, "eval_rewards/margins_min": -0.3931286036968231, "eval_rewards/margins_std": 0.4326675534248352, "eval_rewards/rejected": -0.27426376938819885, "eval_runtime": 859.8622, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 1900 }, { "epoch": 0.46, "grad_norm": 9.609182067422951, "learning_rate": 3.294769954069802e-07, "logits/chosen": -2.6731858253479004, "logits/rejected": -2.631958484649658, "logps/chosen": -292.23406982421875, "logps/rejected": -290.16107177734375, "loss": 0.6018, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.07169095426797867, "rewards/margins": 0.2285386621952057, "rewards/margins_max": 0.7320543527603149, "rewards/margins_min": -0.2693322002887726, "rewards/margins_std": 0.45920902490615845, "rewards/rejected": -0.30022960901260376, "step": 1910 }, { "epoch": 0.46, "grad_norm": 6.526344610937495, "learning_rate": 3.274927274167048e-07, "logits/chosen": -2.6619555950164795, "logits/rejected": -2.6596617698669434, "logps/chosen": -276.9857482910156, "logps/rejected": -289.47235107421875, "loss": 0.6043, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10402724891901016, "rewards/margins": 0.28490519523620605, "rewards/margins_max": 0.7223531603813171, "rewards/margins_min": -0.18868054449558258, "rewards/margins_std": 0.4100664258003235, "rewards/rejected": -0.3889324367046356, "step": 1920 }, { "epoch": 0.46, "grad_norm": 4.150434512635606, "learning_rate": 3.2550304385097575e-07, "logits/chosen": -2.7032723426818848, "logits/rejected": -2.6790528297424316, "logps/chosen": -290.4239807128906, "logps/rejected": -281.20416259765625, "loss": 0.5963, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.13188329339027405, "rewards/margins": 0.26092013716697693, "rewards/margins_max": 0.7271699905395508, "rewards/margins_min": -0.19916054606437683, "rewards/margins_std": 0.4129628539085388, "rewards/rejected": -0.3928033709526062, "step": 1930 }, { "epoch": 0.46, "grad_norm": 12.036491223428849, "learning_rate": 3.235080837587314e-07, "logits/chosen": -2.6926076412200928, "logits/rejected": -2.672503709793091, "logps/chosen": -238.70413208007812, "logps/rejected": -302.2066345214844, "loss": 0.6008, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2117839753627777, "rewards/margins": 0.24341578781604767, "rewards/margins_max": 0.7401986122131348, "rewards/margins_min": -0.19410011172294617, "rewards/margins_std": 0.4155648648738861, "rewards/rejected": -0.4551997780799866, "step": 1940 }, { "epoch": 0.47, "grad_norm": 6.093328373334495, "learning_rate": 3.215079865576599e-07, "logits/chosen": -2.666715145111084, "logits/rejected": -2.703641653060913, "logps/chosen": -293.8852844238281, "logps/rejected": -313.35821533203125, "loss": 0.5883, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.12059396505355835, "rewards/margins": 0.25971904397010803, "rewards/margins_max": 0.8864032626152039, "rewards/margins_min": -0.2718166708946228, "rewards/margins_std": 0.5048220753669739, "rewards/rejected": -0.380312979221344, "step": 1950 }, { "epoch": 0.47, "grad_norm": 12.090046949019355, "learning_rate": 3.1950289202445594e-07, "logits/chosen": -2.6468160152435303, "logits/rejected": -2.639539957046509, "logps/chosen": -290.692138671875, "logps/rejected": -307.38800048828125, "loss": 0.5841, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.13301345705986023, "rewards/margins": 0.2568240165710449, "rewards/margins_max": 0.7523486018180847, "rewards/margins_min": -0.24699096381664276, "rewards/margins_std": 0.4464823603630066, "rewards/rejected": -0.3898375630378723, "step": 1960 }, { "epoch": 0.47, "grad_norm": 5.265987757290794, "learning_rate": 3.174929402850528e-07, "logits/chosen": -2.7697157859802246, "logits/rejected": -2.6936728954315186, "logps/chosen": -293.42962646484375, "logps/rejected": -291.2169494628906, "loss": 0.6268, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.14020130038261414, "rewards/margins": 0.163181871175766, "rewards/margins_max": 0.7737967371940613, "rewards/margins_min": -0.38077330589294434, "rewards/margins_std": 0.5128040313720703, "rewards/rejected": -0.3033831715583801, "step": 1970 }, { "epoch": 0.47, "grad_norm": 8.041305243979703, "learning_rate": 3.15478271804829e-07, "logits/chosen": -2.6651408672332764, "logits/rejected": -2.6439590454101562, "logps/chosen": -309.2423095703125, "logps/rejected": -325.1023254394531, "loss": 0.5719, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.058884888887405396, "rewards/margins": 0.3680292069911957, "rewards/margins_max": 0.8800289034843445, "rewards/margins_min": -0.12628303468227386, "rewards/margins_std": 0.4714101254940033, "rewards/rejected": -0.4269140660762787, "step": 1980 }, { "epoch": 0.48, "grad_norm": 3.27675225941295, "learning_rate": 3.1345902737879257e-07, "logits/chosen": -2.583885669708252, "logits/rejected": -2.5821287631988525, "logps/chosen": -279.5968933105469, "logps/rejected": -300.0899353027344, "loss": 0.5977, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17092658579349518, "rewards/margins": 0.29685845971107483, "rewards/margins_max": 0.8626619577407837, "rewards/margins_min": -0.19954678416252136, "rewards/margins_std": 0.4820861220359802, "rewards/rejected": -0.4677850306034088, "step": 1990 }, { "epoch": 0.48, "grad_norm": 6.900620872096232, "learning_rate": 3.1143534812174103e-07, "logits/chosen": -2.7087361812591553, "logits/rejected": -2.683499813079834, "logps/chosen": -329.2854919433594, "logps/rejected": -306.0649108886719, "loss": 0.5822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0919618010520935, "rewards/margins": 0.4252632260322571, "rewards/margins_max": 1.0302584171295166, "rewards/margins_min": -0.3328133523464203, "rewards/margins_std": 0.6099307537078857, "rewards/rejected": -0.5172249674797058, "step": 2000 }, { "epoch": 0.48, "eval_logits/chosen": -2.6678671836853027, "eval_logits/rejected": -2.63775634765625, "eval_logps/chosen": -299.4801025390625, "eval_logps/rejected": -306.8108825683594, "eval_loss": 0.6049104332923889, "eval_rewards/accuracies": 0.6884999871253967, "eval_rewards/chosen": -0.15024752914905548, "eval_rewards/margins": 0.25932812690734863, "eval_rewards/margins_max": 1.00696861743927, "eval_rewards/margins_min": -0.4697433412075043, "eval_rewards/margins_std": 0.49979886412620544, "eval_rewards/rejected": -0.4095756411552429, "eval_runtime": 859.8794, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 2000 }, { "epoch": 0.48, "grad_norm": 4.974932968224806, "learning_rate": 3.094073754584001e-07, "logits/chosen": -2.641544818878174, "logits/rejected": -2.602346420288086, "logps/chosen": -295.3077697753906, "logps/rejected": -297.1141357421875, "loss": 0.5751, "rewards/accuracies": 0.75, "rewards/chosen": -0.13280975818634033, "rewards/margins": 0.3550918996334076, "rewards/margins_max": 0.9477798342704773, "rewards/margins_min": -0.1992892622947693, "rewards/margins_std": 0.5173491835594177, "rewards/rejected": -0.4879016876220703, "step": 2010 }, { "epoch": 0.48, "grad_norm": 8.83065289789081, "learning_rate": 3.0737525111353976e-07, "logits/chosen": -2.7229080200195312, "logits/rejected": -2.699981451034546, "logps/chosen": -305.264404296875, "logps/rejected": -297.63458251953125, "loss": 0.5828, "rewards/accuracies": 0.75, "rewards/chosen": -0.1455700546503067, "rewards/margins": 0.26840436458587646, "rewards/margins_max": 0.7454306483268738, "rewards/margins_min": -0.25561413168907166, "rewards/margins_std": 0.45743808150291443, "rewards/rejected": -0.4139743745326996, "step": 2020 }, { "epoch": 0.49, "grad_norm": 5.411815799169501, "learning_rate": 3.053391171020702e-07, "logits/chosen": -2.6677589416503906, "logits/rejected": -2.625202178955078, "logps/chosen": -316.2569885253906, "logps/rejected": -316.2462158203125, "loss": 0.5928, "rewards/accuracies": 0.75, "rewards/chosen": -0.14155061542987823, "rewards/margins": 0.29246291518211365, "rewards/margins_max": 0.8275464773178101, "rewards/margins_min": -0.31466326117515564, "rewards/margins_std": 0.5112706422805786, "rewards/rejected": -0.4340135455131531, "step": 2030 }, { "epoch": 0.49, "grad_norm": 7.476710972761675, "learning_rate": 3.0329911571911693e-07, "logits/chosen": -2.6282687187194824, "logits/rejected": -2.6319591999053955, "logps/chosen": -278.00897216796875, "logps/rejected": -305.495361328125, "loss": 0.6206, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.24457895755767822, "rewards/margins": 0.20937521755695343, "rewards/margins_max": 0.6903196573257446, "rewards/margins_min": -0.41123127937316895, "rewards/margins_std": 0.5036042332649231, "rewards/rejected": -0.45395416021347046, "step": 2040 }, { "epoch": 0.49, "grad_norm": 9.284960688937632, "learning_rate": 3.012553895300765e-07, "logits/chosen": -2.6002590656280518, "logits/rejected": -2.6011807918548584, "logps/chosen": -290.0956115722656, "logps/rejected": -310.40155029296875, "loss": 0.5935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2200767546892166, "rewards/margins": 0.3369477093219757, "rewards/margins_max": 0.8728636503219604, "rewards/margins_min": -0.18612375855445862, "rewards/margins_std": 0.4747312664985657, "rewards/rejected": -0.5570244193077087, "step": 2050 }, { "epoch": 0.49, "grad_norm": 11.77746164949279, "learning_rate": 2.9920808136065336e-07, "logits/chosen": -2.7036004066467285, "logits/rejected": -2.667642116546631, "logps/chosen": -293.30377197265625, "logps/rejected": -308.8804931640625, "loss": 0.6148, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13821843266487122, "rewards/margins": 0.24729657173156738, "rewards/margins_max": 0.877963662147522, "rewards/margins_min": -0.2980964481830597, "rewards/margins_std": 0.539368748664856, "rewards/rejected": -0.385515034198761, "step": 2060 }, { "epoch": 0.5, "grad_norm": 6.111162397381366, "learning_rate": 2.971573342868786e-07, "logits/chosen": -2.7201285362243652, "logits/rejected": -2.667332887649536, "logps/chosen": -269.16302490234375, "logps/rejected": -274.974609375, "loss": 0.5877, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12460535764694214, "rewards/margins": 0.3083706498146057, "rewards/margins_max": 0.8954262733459473, "rewards/margins_min": -0.2783167362213135, "rewards/margins_std": 0.5282653570175171, "rewards/rejected": -0.4329760670661926, "step": 2070 }, { "epoch": 0.5, "grad_norm": 10.42434722539899, "learning_rate": 2.9510329162511054e-07, "logits/chosen": -2.606055974960327, "logits/rejected": -2.6232898235321045, "logps/chosen": -314.437744140625, "logps/rejected": -303.1037902832031, "loss": 0.6138, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11569087207317352, "rewards/margins": 0.2684437930583954, "rewards/margins_max": 0.8543025851249695, "rewards/margins_min": -0.26375895738601685, "rewards/margins_std": 0.49721455574035645, "rewards/rejected": -0.3841346502304077, "step": 2080 }, { "epoch": 0.5, "grad_norm": 5.566886629490982, "learning_rate": 2.930460969220202e-07, "logits/chosen": -2.7032504081726074, "logits/rejected": -2.6360905170440674, "logps/chosen": -275.11688232421875, "logps/rejected": -322.1624450683594, "loss": 0.5975, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1841132640838623, "rewards/margins": 0.2289840430021286, "rewards/margins_max": 0.8241883516311646, "rewards/margins_min": -0.3731764554977417, "rewards/margins_std": 0.5407173037528992, "rewards/rejected": -0.4130973219871521, "step": 2090 }, { "epoch": 0.5, "grad_norm": 4.855753539641828, "learning_rate": 2.909858939445584e-07, "logits/chosen": -2.6886954307556152, "logits/rejected": -2.665351390838623, "logps/chosen": -288.3397521972656, "logps/rejected": -290.7861328125, "loss": 0.6013, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2644990086555481, "rewards/margins": 0.21240103244781494, "rewards/margins_max": 0.6695704460144043, "rewards/margins_min": -0.22528037428855896, "rewards/margins_std": 0.396864116191864, "rewards/rejected": -0.4769001007080078, "step": 2100 }, { "epoch": 0.5, "eval_logits/chosen": -2.6592962741851807, "eval_logits/rejected": -2.6289308071136475, "eval_logps/chosen": -302.33001708984375, "eval_logps/rejected": -310.3860168457031, "eval_loss": 0.6034172177314758, "eval_rewards/accuracies": 0.6869999766349792, "eval_rewards/chosen": -0.17874710261821747, "eval_rewards/margins": 0.2665804624557495, "eval_rewards/margins_max": 1.0330960750579834, "eval_rewards/margins_min": -0.4818904399871826, "eval_rewards/margins_std": 0.5136880874633789, "eval_rewards/rejected": -0.4453275799751282, "eval_runtime": 859.4362, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 2100 }, { "epoch": 0.51, "grad_norm": 7.587780022520818, "learning_rate": 2.8892282666990894e-07, "logits/chosen": -2.68300461769104, "logits/rejected": -2.6219592094421387, "logps/chosen": -285.99542236328125, "logps/rejected": -277.61529541015625, "loss": 0.5884, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11673027276992798, "rewards/margins": 0.3741188049316406, "rewards/margins_max": 1.0100022554397583, "rewards/margins_min": -0.2071894407272339, "rewards/margins_std": 0.5453618168830872, "rewards/rejected": -0.4908490777015686, "step": 2110 }, { "epoch": 0.51, "grad_norm": 5.437760971646844, "learning_rate": 2.868570392754272e-07, "logits/chosen": -2.724592447280884, "logits/rejected": -2.70935320854187, "logps/chosen": -336.3945617675781, "logps/rejected": -362.13507080078125, "loss": 0.5846, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1303412765264511, "rewards/margins": 0.3308197855949402, "rewards/margins_max": 0.8329311609268188, "rewards/margins_min": -0.2632693648338318, "rewards/margins_std": 0.4918765425682068, "rewards/rejected": -0.4611610770225525, "step": 2120 }, { "epoch": 0.51, "grad_norm": 7.312307762814366, "learning_rate": 2.8478867612856394e-07, "logits/chosen": -2.707106828689575, "logits/rejected": -2.6523804664611816, "logps/chosen": -302.043212890625, "logps/rejected": -284.9640808105469, "loss": 0.5916, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11796555668115616, "rewards/margins": 0.3160025477409363, "rewards/margins_max": 0.8329175710678101, "rewards/margins_min": -0.27798742055892944, "rewards/margins_std": 0.5030540227890015, "rewards/rejected": -0.43396812677383423, "step": 2130 }, { "epoch": 0.51, "grad_norm": 9.871464736301247, "learning_rate": 2.827178817767762e-07, "logits/chosen": -2.615812063217163, "logits/rejected": -2.598005533218384, "logps/chosen": -308.57672119140625, "logps/rejected": -289.3555908203125, "loss": 0.6014, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.07780320197343826, "rewards/margins": 0.2799312472343445, "rewards/margins_max": 0.8629264831542969, "rewards/margins_min": -0.21283754706382751, "rewards/margins_std": 0.4856076240539551, "rewards/rejected": -0.35773441195487976, "step": 2140 }, { "epoch": 0.51, "grad_norm": 8.961394693273633, "learning_rate": 2.8064480093742565e-07, "logits/chosen": -2.686300277709961, "logits/rejected": -2.680666208267212, "logps/chosen": -261.29705810546875, "logps/rejected": -282.45556640625, "loss": 0.5678, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.11843335628509521, "rewards/margins": 0.29204291105270386, "rewards/margins_max": 0.8688719868659973, "rewards/margins_min": -0.21983151137828827, "rewards/margins_std": 0.5022950172424316, "rewards/rejected": -0.4104762077331543, "step": 2150 }, { "epoch": 0.52, "grad_norm": 11.224170108694283, "learning_rate": 2.7856957848766497e-07, "logits/chosen": -2.6662299633026123, "logits/rejected": -2.587806224822998, "logps/chosen": -290.3630065917969, "logps/rejected": -305.7333984375, "loss": 0.591, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2064693421125412, "rewards/margins": 0.29201430082321167, "rewards/margins_max": 0.8517812490463257, "rewards/margins_min": -0.199358269572258, "rewards/margins_std": 0.47122445702552795, "rewards/rejected": -0.49848371744155884, "step": 2160 }, { "epoch": 0.52, "grad_norm": 7.270993585287031, "learning_rate": 2.7649235945431336e-07, "logits/chosen": -2.64408540725708, "logits/rejected": -2.611272096633911, "logps/chosen": -309.9103088378906, "logps/rejected": -368.58514404296875, "loss": 0.5759, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.16110315918922424, "rewards/margins": 0.27896901965141296, "rewards/margins_max": 0.7961828708648682, "rewards/margins_min": -0.22307395935058594, "rewards/margins_std": 0.4642201066017151, "rewards/rejected": -0.4400722086429596, "step": 2170 }, { "epoch": 0.52, "grad_norm": 8.25762679397384, "learning_rate": 2.74413289003721e-07, "logits/chosen": -2.6903395652770996, "logits/rejected": -2.671844005584717, "logps/chosen": -302.89886474609375, "logps/rejected": -312.398193359375, "loss": 0.5831, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1994825303554535, "rewards/margins": 0.28639334440231323, "rewards/margins_max": 0.870135486125946, "rewards/margins_min": -0.27219831943511963, "rewards/margins_std": 0.5184741616249084, "rewards/rejected": -0.4858759343624115, "step": 2180 }, { "epoch": 0.52, "grad_norm": 13.975736027478682, "learning_rate": 2.7233251243162434e-07, "logits/chosen": -2.672729015350342, "logits/rejected": -2.6457486152648926, "logps/chosen": -326.508056640625, "logps/rejected": -334.08685302734375, "loss": 0.5922, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.11852113902568817, "rewards/margins": 0.3247258961200714, "rewards/margins_max": 0.8556135892868042, "rewards/margins_min": -0.1976144015789032, "rewards/margins_std": 0.4724816381931305, "rewards/rejected": -0.4432470202445984, "step": 2190 }, { "epoch": 0.53, "grad_norm": 6.569085005357838, "learning_rate": 2.7025017515299207e-07, "logits/chosen": -2.640963077545166, "logits/rejected": -2.610414743423462, "logps/chosen": -293.7982482910156, "logps/rejected": -280.1280822753906, "loss": 0.6018, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.20703542232513428, "rewards/margins": 0.19029325246810913, "rewards/margins_max": 0.7511401176452637, "rewards/margins_min": -0.3581925928592682, "rewards/margins_std": 0.49416694045066833, "rewards/rejected": -0.3973286747932434, "step": 2200 }, { "epoch": 0.53, "eval_logits/chosen": -2.6584744453430176, "eval_logits/rejected": -2.6287267208099365, "eval_logps/chosen": -300.1773376464844, "eval_logps/rejected": -308.8055419921875, "eval_loss": 0.6019492149353027, "eval_rewards/accuracies": 0.6924999952316284, "eval_rewards/chosen": -0.157219797372818, "eval_rewards/margins": 0.2723027765750885, "eval_rewards/margins_max": 1.0473430156707764, "eval_rewards/margins_min": -0.48959511518478394, "eval_rewards/margins_std": 0.5205263495445251, "eval_rewards/rejected": -0.4295225143432617, "eval_runtime": 859.8763, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 2200 }, { "epoch": 0.53, "grad_norm": 16.78526397379326, "learning_rate": 2.6816642269186275e-07, "logits/chosen": -2.6230361461639404, "logits/rejected": -2.616010904312134, "logps/chosen": -305.01812744140625, "logps/rejected": -302.0491027832031, "loss": 0.5996, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.21043619513511658, "rewards/margins": 0.22241902351379395, "rewards/margins_max": 0.7658696174621582, "rewards/margins_min": -0.2654082179069519, "rewards/margins_std": 0.4638918340206146, "rewards/rejected": -0.4328552186489105, "step": 2210 }, { "epoch": 0.53, "grad_norm": 5.8733901395154255, "learning_rate": 2.660814006711748e-07, "logits/chosen": -2.6130030155181885, "logits/rejected": -2.6518306732177734, "logps/chosen": -283.50518798828125, "logps/rejected": -329.78179931640625, "loss": 0.6255, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.27469900250434875, "rewards/margins": 0.13242587447166443, "rewards/margins_max": 0.6401777267456055, "rewards/margins_min": -0.43729621171951294, "rewards/margins_std": 0.47277194261550903, "rewards/rejected": -0.40712490677833557, "step": 2220 }, { "epoch": 0.53, "grad_norm": 8.27693449613565, "learning_rate": 2.639952548025899e-07, "logits/chosen": -2.700397491455078, "logits/rejected": -2.6179628372192383, "logps/chosen": -320.1406555175781, "logps/rejected": -288.6668701171875, "loss": 0.611, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1986890733242035, "rewards/margins": 0.23419936001300812, "rewards/margins_max": 0.895412266254425, "rewards/margins_min": -0.37990817427635193, "rewards/margins_std": 0.5677480697631836, "rewards/rejected": -0.4328884482383728, "step": 2230 }, { "epoch": 0.54, "grad_norm": 9.954155549694788, "learning_rate": 2.619081308763097e-07, "logits/chosen": -2.64445424079895, "logits/rejected": -2.6310877799987793, "logps/chosen": -312.2311706542969, "logps/rejected": -307.53765869140625, "loss": 0.5786, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.1756788194179535, "rewards/margins": 0.31714627146720886, "rewards/margins_max": 0.8764044046401978, "rewards/margins_min": -0.23695509135723114, "rewards/margins_std": 0.5006457567214966, "rewards/rejected": -0.49282512068748474, "step": 2240 }, { "epoch": 0.54, "grad_norm": 6.119155586679762, "learning_rate": 2.598201747508875e-07, "logits/chosen": -2.6621711254119873, "logits/rejected": -2.6769607067108154, "logps/chosen": -330.69952392578125, "logps/rejected": -344.57928466796875, "loss": 0.5783, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1518358588218689, "rewards/margins": 0.33964893221855164, "rewards/margins_max": 0.8550474047660828, "rewards/margins_min": -0.2645668685436249, "rewards/margins_std": 0.507433295249939, "rewards/rejected": -0.4914848208427429, "step": 2250 }, { "epoch": 0.54, "grad_norm": 10.487767678301395, "learning_rate": 2.577315323430346e-07, "logits/chosen": -2.652210235595703, "logits/rejected": -2.6248786449432373, "logps/chosen": -309.04949951171875, "logps/rejected": -323.46954345703125, "loss": 0.5816, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19222597777843475, "rewards/margins": 0.30796346068382263, "rewards/margins_max": 0.8325638771057129, "rewards/margins_min": -0.24727854132652283, "rewards/margins_std": 0.48955899477005005, "rewards/rejected": -0.5001894235610962, "step": 2260 }, { "epoch": 0.54, "grad_norm": 6.923352912553123, "learning_rate": 2.5564234961742315e-07, "logits/chosen": -2.677743434906006, "logits/rejected": -2.652365207672119, "logps/chosen": -349.2780456542969, "logps/rejected": -337.13922119140625, "loss": 0.5919, "rewards/accuracies": 0.75, "rewards/chosen": -0.13047567009925842, "rewards/margins": 0.3868805170059204, "rewards/margins_max": 0.9608567357063293, "rewards/margins_min": -0.10169659554958344, "rewards/margins_std": 0.47666463255882263, "rewards/rejected": -0.5173561573028564, "step": 2270 }, { "epoch": 0.55, "grad_norm": 8.10905605857629, "learning_rate": 2.5355277257648553e-07, "logits/chosen": -2.69606351852417, "logits/rejected": -2.6536648273468018, "logps/chosen": -288.4073486328125, "logps/rejected": -295.1745300292969, "loss": 0.5857, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.19543218612670898, "rewards/margins": 0.2333766520023346, "rewards/margins_max": 0.7897024154663086, "rewards/margins_min": -0.3436315655708313, "rewards/margins_std": 0.5261818170547485, "rewards/rejected": -0.42880886793136597, "step": 2280 }, { "epoch": 0.55, "grad_norm": 11.325778312969543, "learning_rate": 2.514629472502108e-07, "logits/chosen": -2.6397690773010254, "logits/rejected": -2.655486822128296, "logps/chosen": -351.07061767578125, "logps/rejected": -333.7223815917969, "loss": 0.5638, "rewards/accuracies": 0.75, "rewards/chosen": -0.14515265822410583, "rewards/margins": 0.38497716188430786, "rewards/margins_max": 0.9311981201171875, "rewards/margins_min": -0.18591797351837158, "rewards/margins_std": 0.5137249231338501, "rewards/rejected": -0.5301297903060913, "step": 2290 }, { "epoch": 0.55, "grad_norm": 7.7298047675465105, "learning_rate": 2.4937301968593915e-07, "logits/chosen": -2.635599136352539, "logits/rejected": -2.62733793258667, "logps/chosen": -282.60028076171875, "logps/rejected": -295.7271728515625, "loss": 0.6121, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2598281502723694, "rewards/margins": 0.26622843742370605, "rewards/margins_max": 0.7813423275947571, "rewards/margins_min": -0.3214438557624817, "rewards/margins_std": 0.5005481243133545, "rewards/rejected": -0.5260566473007202, "step": 2300 }, { "epoch": 0.55, "eval_logits/chosen": -2.647585153579712, "eval_logits/rejected": -2.6178154945373535, "eval_logps/chosen": -308.79913330078125, "eval_logps/rejected": -318.02728271484375, "eval_loss": 0.6009542346000671, "eval_rewards/accuracies": 0.690500020980835, "eval_rewards/chosen": -0.24343764781951904, "eval_rewards/margins": 0.27830231189727783, "eval_rewards/margins_max": 1.063331127166748, "eval_rewards/margins_min": -0.4893389046192169, "eval_rewards/margins_std": 0.5288664698600769, "eval_rewards/rejected": -0.5217399001121521, "eval_runtime": 859.9102, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 2300 }, { "epoch": 0.55, "grad_norm": 7.022809875756669, "learning_rate": 2.47283135938156e-07, "logits/chosen": -2.6830363273620605, "logits/rejected": -2.664031982421875, "logps/chosen": -287.798095703125, "logps/rejected": -279.710205078125, "loss": 0.6063, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.20198485255241394, "rewards/margins": 0.2973095774650574, "rewards/margins_max": 0.9239432215690613, "rewards/margins_min": -0.2458072006702423, "rewards/margins_std": 0.5228606462478638, "rewards/rejected": -0.49929437041282654, "step": 2310 }, { "epoch": 0.56, "grad_norm": 9.306559261514748, "learning_rate": 2.451934420582846e-07, "logits/chosen": -2.6949825286865234, "logits/rejected": -2.6665689945220947, "logps/chosen": -287.1777038574219, "logps/rejected": -296.3902282714844, "loss": 0.6109, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17137010395526886, "rewards/margins": 0.2919192314147949, "rewards/margins_max": 0.9146235585212708, "rewards/margins_min": -0.29740530252456665, "rewards/margins_std": 0.5488147735595703, "rewards/rejected": -0.463289350271225, "step": 2320 }, { "epoch": 0.56, "grad_norm": 4.078485582568216, "learning_rate": 2.4310408408447903e-07, "logits/chosen": -2.6312766075134277, "logits/rejected": -2.5739798545837402, "logps/chosen": -255.2695770263672, "logps/rejected": -261.53936767578125, "loss": 0.609, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11498737335205078, "rewards/margins": 0.24487683176994324, "rewards/margins_max": 0.7137837409973145, "rewards/margins_min": -0.18724948167800903, "rewards/margins_std": 0.41405171155929565, "rewards/rejected": -0.359864205121994, "step": 2330 }, { "epoch": 0.56, "grad_norm": 3.069921259910948, "learning_rate": 2.41015208031419e-07, "logits/chosen": -2.6944103240966797, "logits/rejected": -2.6474945545196533, "logps/chosen": -306.9026794433594, "logps/rejected": -310.8961486816406, "loss": 0.5927, "rewards/accuracies": 0.5625, "rewards/chosen": -0.126219242811203, "rewards/margins": 0.22689545154571533, "rewards/margins_max": 0.8158214688301086, "rewards/margins_min": -0.31907206773757935, "rewards/margins_std": 0.5096734762191772, "rewards/rejected": -0.3531147241592407, "step": 2340 }, { "epoch": 0.56, "grad_norm": 8.258310005265388, "learning_rate": 2.389269598801048e-07, "logits/chosen": -2.667680025100708, "logits/rejected": -2.59625506401062, "logps/chosen": -292.64959716796875, "logps/rejected": -271.0260314941406, "loss": 0.5555, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08529751002788544, "rewards/margins": 0.3777325749397278, "rewards/margins_max": 0.9011771082878113, "rewards/margins_min": -0.11987098306417465, "rewards/margins_std": 0.4734679162502289, "rewards/rejected": -0.4630300998687744, "step": 2350 }, { "epoch": 0.57, "grad_norm": 8.671069518475692, "learning_rate": 2.3683948556765624e-07, "logits/chosen": -2.6653316020965576, "logits/rejected": -2.629021167755127, "logps/chosen": -275.5854187011719, "logps/rejected": -276.62445068359375, "loss": 0.6337, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.19002783298492432, "rewards/margins": 0.2670327126979828, "rewards/margins_max": 0.858096718788147, "rewards/margins_min": -0.23270674049854279, "rewards/margins_std": 0.4833803176879883, "rewards/rejected": -0.4570605158805847, "step": 2360 }, { "epoch": 0.57, "grad_norm": 8.35667943542909, "learning_rate": 2.34752930977113e-07, "logits/chosen": -2.605914831161499, "logits/rejected": -2.5656206607818604, "logps/chosen": -297.6455078125, "logps/rejected": -320.46673583984375, "loss": 0.5856, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1851060837507248, "rewards/margins": 0.24210119247436523, "rewards/margins_max": 0.819743275642395, "rewards/margins_min": -0.3182319104671478, "rewards/margins_std": 0.5180691480636597, "rewards/rejected": -0.4272072911262512, "step": 2370 }, { "epoch": 0.57, "grad_norm": 7.835666822967071, "learning_rate": 2.3266744192724052e-07, "logits/chosen": -2.6449246406555176, "logits/rejected": -2.627570629119873, "logps/chosen": -317.83843994140625, "logps/rejected": -325.9470520019531, "loss": 0.5905, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2597433626651764, "rewards/margins": 0.33320045471191406, "rewards/margins_max": 0.9372183084487915, "rewards/margins_min": -0.3607577085494995, "rewards/margins_std": 0.5766936540603638, "rewards/rejected": -0.5929437875747681, "step": 2380 }, { "epoch": 0.57, "grad_norm": 3.548511010185018, "learning_rate": 2.3058316416233864e-07, "logits/chosen": -2.6739461421966553, "logits/rejected": -2.666889190673828, "logps/chosen": -312.7624816894531, "logps/rejected": -303.9245300292969, "loss": 0.601, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.23950240015983582, "rewards/margins": 0.19521835446357727, "rewards/margins_max": 0.8413310050964355, "rewards/margins_min": -0.46826639771461487, "rewards/margins_std": 0.5935012102127075, "rewards/rejected": -0.4347207546234131, "step": 2390 }, { "epoch": 0.57, "grad_norm": 9.287142029713884, "learning_rate": 2.2850024334205654e-07, "logits/chosen": -2.6532249450683594, "logits/rejected": -2.6485819816589355, "logps/chosen": -291.0581970214844, "logps/rejected": -306.99578857421875, "loss": 0.5698, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2209687978029251, "rewards/margins": 0.3287777006626129, "rewards/margins_max": 0.8810712099075317, "rewards/margins_min": -0.21057824790477753, "rewards/margins_std": 0.489013135433197, "rewards/rejected": -0.5497465133666992, "step": 2400 }, { "epoch": 0.57, "eval_logits/chosen": -2.6389448642730713, "eval_logits/rejected": -2.609215497970581, "eval_logps/chosen": -303.47515869140625, "eval_logps/rejected": -313.65570068359375, "eval_loss": 0.5978549718856812, "eval_rewards/accuracies": 0.6919999718666077, "eval_rewards/chosen": -0.19019848108291626, "eval_rewards/margins": 0.2878260910511017, "eval_rewards/margins_max": 1.0878815650939941, "eval_rewards/margins_min": -0.4939045310020447, "eval_rewards/margins_std": 0.5368744730949402, "eval_rewards/rejected": -0.47802454233169556, "eval_runtime": 859.412, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 2400 }, { "epoch": 0.58, "grad_norm": 8.250334299423635, "learning_rate": 2.264188250312138e-07, "logits/chosen": -2.63791823387146, "logits/rejected": -2.585139751434326, "logps/chosen": -306.912841796875, "logps/rejected": -276.64434814453125, "loss": 0.5608, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1724175363779068, "rewards/margins": 0.3793061673641205, "rewards/margins_max": 0.9429492950439453, "rewards/margins_min": -0.1732269674539566, "rewards/margins_std": 0.5044452548027039, "rewards/rejected": -0.5517237186431885, "step": 2410 }, { "epoch": 0.58, "grad_norm": 7.536721422462468, "learning_rate": 2.2433905468962674e-07, "logits/chosen": -2.69928240776062, "logits/rejected": -2.678112745285034, "logps/chosen": -320.2510070800781, "logps/rejected": -310.3973083496094, "loss": 0.5638, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.18819960951805115, "rewards/margins": 0.4135063588619232, "rewards/margins_max": 0.960457444190979, "rewards/margins_min": -0.21604296565055847, "rewards/margins_std": 0.5411546230316162, "rewards/rejected": -0.6017060279846191, "step": 2420 }, { "epoch": 0.58, "grad_norm": 9.203042099584177, "learning_rate": 2.222610776619439e-07, "logits/chosen": -2.7014262676239014, "logits/rejected": -2.66084623336792, "logps/chosen": -315.51361083984375, "logps/rejected": -298.0957336425781, "loss": 0.5668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1906673163175583, "rewards/margins": 0.36204102635383606, "rewards/margins_max": 1.0159660577774048, "rewards/margins_min": -0.2975941002368927, "rewards/margins_std": 0.5978912115097046, "rewards/rejected": -0.5527083873748779, "step": 2430 }, { "epoch": 0.58, "grad_norm": 12.089585644458106, "learning_rate": 2.201850391674877e-07, "logits/chosen": -2.687541961669922, "logits/rejected": -2.644740581512451, "logps/chosen": -320.5738220214844, "logps/rejected": -292.30218505859375, "loss": 0.5842, "rewards/accuracies": 0.75, "rewards/chosen": -0.15845851600170135, "rewards/margins": 0.3309580087661743, "rewards/margins_max": 0.800940990447998, "rewards/margins_min": -0.2134658545255661, "rewards/margins_std": 0.46003514528274536, "rewards/rejected": -0.4894165098667145, "step": 2440 }, { "epoch": 0.59, "grad_norm": 5.970990943647553, "learning_rate": 2.181110842901066e-07, "logits/chosen": -2.7046241760253906, "logits/rejected": -2.632124423980713, "logps/chosen": -293.5450439453125, "logps/rejected": -290.432861328125, "loss": 0.5783, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1822170913219452, "rewards/margins": 0.36459654569625854, "rewards/margins_max": 1.079773187637329, "rewards/margins_min": -0.23687157034873962, "rewards/margins_std": 0.5974687337875366, "rewards/rejected": -0.5468136072158813, "step": 2450 }, { "epoch": 0.59, "grad_norm": 6.575545070087346, "learning_rate": 2.160393579680353e-07, "logits/chosen": -2.59405517578125, "logits/rejected": -2.613334894180298, "logps/chosen": -281.93841552734375, "logps/rejected": -316.2869567871094, "loss": 0.5566, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.18146109580993652, "rewards/margins": 0.46110066771507263, "rewards/margins_max": 1.0213110446929932, "rewards/margins_min": -0.08433142304420471, "rewards/margins_std": 0.4864630699157715, "rewards/rejected": -0.6425617933273315, "step": 2460 }, { "epoch": 0.59, "grad_norm": 5.8721728725920075, "learning_rate": 2.1397000498376634e-07, "logits/chosen": -2.6570866107940674, "logits/rejected": -2.5731148719787598, "logps/chosen": -284.0189208984375, "logps/rejected": -301.596923828125, "loss": 0.5591, "rewards/accuracies": 0.75, "rewards/chosen": -0.12092554569244385, "rewards/margins": 0.38032206892967224, "rewards/margins_max": 0.9591177701950073, "rewards/margins_min": -0.1604117453098297, "rewards/margins_std": 0.5008580088615417, "rewards/rejected": -0.5012476444244385, "step": 2470 }, { "epoch": 0.59, "grad_norm": 12.066010433779923, "learning_rate": 2.1190316995393144e-07, "logits/chosen": -2.6345062255859375, "logits/rejected": -2.571965217590332, "logps/chosen": -289.5625915527344, "logps/rejected": -273.22491455078125, "loss": 0.5902, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.29963773488998413, "rewards/margins": 0.27483218908309937, "rewards/margins_max": 0.8773409128189087, "rewards/margins_min": -0.2962464988231659, "rewards/margins_std": 0.5167412161827087, "rewards/rejected": -0.5744699239730835, "step": 2480 }, { "epoch": 0.6, "grad_norm": 13.696080692766579, "learning_rate": 2.098389973191953e-07, "logits/chosen": -2.6669273376464844, "logits/rejected": -2.647965908050537, "logps/chosen": -314.2147521972656, "logps/rejected": -331.3398742675781, "loss": 0.5721, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1829395741224289, "rewards/margins": 0.29835018515586853, "rewards/margins_max": 0.9142980575561523, "rewards/margins_min": -0.3180716335773468, "rewards/margins_std": 0.5453607439994812, "rewards/rejected": -0.48128971457481384, "step": 2490 }, { "epoch": 0.6, "grad_norm": 9.312767076182801, "learning_rate": 2.0777763133416118e-07, "logits/chosen": -2.6531622409820557, "logits/rejected": -2.606722354888916, "logps/chosen": -290.75665283203125, "logps/rejected": -271.52813720703125, "loss": 0.5656, "rewards/accuracies": 0.75, "rewards/chosen": -0.26194143295288086, "rewards/margins": 0.3659655749797821, "rewards/margins_max": 0.9094951748847961, "rewards/margins_min": -0.1398560106754303, "rewards/margins_std": 0.46722960472106934, "rewards/rejected": -0.6279069781303406, "step": 2500 }, { "epoch": 0.6, "eval_logits/chosen": -2.6290502548217773, "eval_logits/rejected": -2.599135160446167, "eval_logps/chosen": -311.5382080078125, "eval_logps/rejected": -321.8217468261719, "eval_loss": 0.5992329716682434, "eval_rewards/accuracies": 0.6984999775886536, "eval_rewards/chosen": -0.2708284258842468, "eval_rewards/margins": 0.2888563275337219, "eval_rewards/margins_max": 1.0979818105697632, "eval_rewards/margins_min": -0.5097129940986633, "eval_rewards/margins_std": 0.5454325675964355, "eval_rewards/rejected": -0.5596847534179688, "eval_runtime": 860.4805, "eval_samples_per_second": 4.649, "eval_steps_per_second": 0.291, "step": 2500 }, { "epoch": 0.6, "grad_norm": 10.04083079635618, "learning_rate": 2.057192160572898e-07, "logits/chosen": -2.662008285522461, "logits/rejected": -2.5704874992370605, "logps/chosen": -291.0193176269531, "logps/rejected": -324.3416442871094, "loss": 0.5979, "rewards/accuracies": 0.6875, "rewards/chosen": -0.209696963429451, "rewards/margins": 0.2942900061607361, "rewards/margins_max": 0.9081916809082031, "rewards/margins_min": -0.2142058163881302, "rewards/margins_std": 0.5111109018325806, "rewards/rejected": -0.5039870142936707, "step": 2510 }, { "epoch": 0.6, "grad_norm": 8.642284820234895, "learning_rate": 2.0366389534083185e-07, "logits/chosen": -2.66066312789917, "logits/rejected": -2.6295483112335205, "logps/chosen": -305.4989318847656, "logps/rejected": -297.40618896484375, "loss": 0.5895, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1935977190732956, "rewards/margins": 0.31689274311065674, "rewards/margins_max": 0.9463685750961304, "rewards/margins_min": -0.23440134525299072, "rewards/margins_std": 0.5165520906448364, "rewards/rejected": -0.5104904770851135, "step": 2520 }, { "epoch": 0.61, "grad_norm": 5.672527349279183, "learning_rate": 2.0161181282077469e-07, "logits/chosen": -2.6334547996520996, "logits/rejected": -2.634446620941162, "logps/chosen": -265.6521911621094, "logps/rejected": -294.1048889160156, "loss": 0.5779, "rewards/accuracies": 0.75, "rewards/chosen": -0.2336483895778656, "rewards/margins": 0.332066148519516, "rewards/margins_max": 0.9206092953681946, "rewards/margins_min": -0.1984013468027115, "rewards/margins_std": 0.5049009323120117, "rewards/rejected": -0.5657145977020264, "step": 2530 }, { "epoch": 0.61, "grad_norm": 7.054123493685105, "learning_rate": 1.9956311190680468e-07, "logits/chosen": -2.6308562755584717, "logits/rejected": -2.624748945236206, "logps/chosen": -277.38079833984375, "logps/rejected": -321.8224792480469, "loss": 0.5978, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.24823418259620667, "rewards/margins": 0.3347433805465698, "rewards/margins_max": 0.9458397626876831, "rewards/margins_min": -0.2256821095943451, "rewards/margins_std": 0.5262209177017212, "rewards/rejected": -0.5829775929450989, "step": 2540 }, { "epoch": 0.61, "grad_norm": 8.140310094676984, "learning_rate": 1.9751793577228455e-07, "logits/chosen": -2.66951322555542, "logits/rejected": -2.6798131465911865, "logps/chosen": -321.3728942871094, "logps/rejected": -321.29351806640625, "loss": 0.5892, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2094179093837738, "rewards/margins": 0.2948618531227112, "rewards/margins_max": 0.7485690712928772, "rewards/margins_min": -0.18952804803848267, "rewards/margins_std": 0.4270727038383484, "rewards/rejected": -0.5042797327041626, "step": 2550 }, { "epoch": 0.61, "grad_norm": 7.600463160405789, "learning_rate": 1.9547642734424823e-07, "logits/chosen": -2.5959692001342773, "logits/rejected": -2.621346950531006, "logps/chosen": -301.83380126953125, "logps/rejected": -319.92218017578125, "loss": 0.5753, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12533453106880188, "rewards/margins": 0.36113643646240234, "rewards/margins_max": 0.9919248819351196, "rewards/margins_min": -0.17455923557281494, "rewards/margins_std": 0.5277222394943237, "rewards/rejected": -0.4864709973335266, "step": 2560 }, { "epoch": 0.62, "grad_norm": 10.267465737032328, "learning_rate": 1.9343872929341196e-07, "logits/chosen": -2.654869556427002, "logits/rejected": -2.629849672317505, "logps/chosen": -307.09124755859375, "logps/rejected": -321.4107360839844, "loss": 0.578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.14508172869682312, "rewards/margins": 0.3119579553604126, "rewards/margins_max": 0.9108166694641113, "rewards/margins_min": -0.29992565512657166, "rewards/margins_std": 0.5585619211196899, "rewards/rejected": -0.4570396840572357, "step": 2570 }, { "epoch": 0.62, "grad_norm": 3.7576589572809023, "learning_rate": 1.9140498402420416e-07, "logits/chosen": -2.647003173828125, "logits/rejected": -2.624803066253662, "logps/chosen": -324.68646240234375, "logps/rejected": -348.1784973144531, "loss": 0.5693, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1981527954339981, "rewards/margins": 0.40184909105300903, "rewards/margins_max": 0.9560012817382812, "rewards/margins_min": -0.10828708112239838, "rewards/margins_std": 0.47486764192581177, "rewards/rejected": -0.6000019311904907, "step": 2580 }, { "epoch": 0.62, "grad_norm": 9.371237851903723, "learning_rate": 1.8937533366481308e-07, "logits/chosen": -2.552311658859253, "logits/rejected": -2.591034412384033, "logps/chosen": -284.5642395019531, "logps/rejected": -317.30535888671875, "loss": 0.6052, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2428911030292511, "rewards/margins": 0.2654326558113098, "rewards/margins_max": 0.9568518400192261, "rewards/margins_min": -0.4375254213809967, "rewards/margins_std": 0.6340715885162354, "rewards/rejected": -0.5083237290382385, "step": 2590 }, { "epoch": 0.62, "grad_norm": 6.374025756592385, "learning_rate": 1.8734992005725463e-07, "logits/chosen": -2.4831905364990234, "logits/rejected": -2.4986119270324707, "logps/chosen": -315.5087890625, "logps/rejected": -329.82440185546875, "loss": 0.5795, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.21649488806724548, "rewards/margins": 0.3169113099575043, "rewards/margins_max": 0.8947137594223022, "rewards/margins_min": -0.2509083151817322, "rewards/margins_std": 0.5162596702575684, "rewards/rejected": -0.5334061980247498, "step": 2600 }, { "epoch": 0.62, "eval_logits/chosen": -2.6244351863861084, "eval_logits/rejected": -2.594435930252075, "eval_logps/chosen": -305.5476379394531, "eval_logps/rejected": -316.98046875, "eval_loss": 0.5950339436531067, "eval_rewards/accuracies": 0.6949999928474426, "eval_rewards/chosen": -0.21092304587364197, "eval_rewards/margins": 0.30034908652305603, "eval_rewards/margins_max": 1.1206122636795044, "eval_rewards/margins_min": -0.5079042315483093, "eval_rewards/margins_std": 0.5532759428024292, "eval_rewards/rejected": -0.511272132396698, "eval_runtime": 860.8369, "eval_samples_per_second": 4.647, "eval_steps_per_second": 0.29, "step": 2600 }, { "epoch": 0.62, "grad_norm": 7.184907395861785, "learning_rate": 1.853288847474594e-07, "logits/chosen": -2.6795639991760254, "logits/rejected": -2.6414713859558105, "logps/chosen": -327.1631164550781, "logps/rejected": -323.26617431640625, "loss": 0.5642, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.19482922554016113, "rewards/margins": 0.3318164348602295, "rewards/margins_max": 0.9375141263008118, "rewards/margins_min": -0.2631053328514099, "rewards/margins_std": 0.5353230237960815, "rewards/rejected": -0.5266456604003906, "step": 2610 }, { "epoch": 0.63, "grad_norm": 8.375504663549572, "learning_rate": 1.8331236897538065e-07, "logits/chosen": -2.6555323600769043, "logits/rejected": -2.6470930576324463, "logps/chosen": -301.43994140625, "logps/rejected": -320.8365783691406, "loss": 0.6076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.23343400657176971, "rewards/margins": 0.2559712827205658, "rewards/margins_max": 0.8926533460617065, "rewards/margins_min": -0.34984445571899414, "rewards/margins_std": 0.5504107475280762, "rewards/rejected": -0.4894053339958191, "step": 2620 }, { "epoch": 0.63, "grad_norm": 8.074893386184252, "learning_rate": 1.8130051366512447e-07, "logits/chosen": -2.6671040058135986, "logits/rejected": -2.5697810649871826, "logps/chosen": -306.04376220703125, "logps/rejected": -347.21392822265625, "loss": 0.5613, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.12057604640722275, "rewards/margins": 0.39471858739852905, "rewards/margins_max": 1.1163181066513062, "rewards/margins_min": -0.3967621922492981, "rewards/margins_std": 0.6732631921768188, "rewards/rejected": -0.5152946710586548, "step": 2630 }, { "epoch": 0.63, "grad_norm": 13.182140544907838, "learning_rate": 1.792934594151003e-07, "logits/chosen": -2.694676637649536, "logits/rejected": -2.6797292232513428, "logps/chosen": -293.8489685058594, "logps/rejected": -289.4837951660156, "loss": 0.6257, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.27340248227119446, "rewards/margins": 0.21240122616291046, "rewards/margins_max": 0.8069620132446289, "rewards/margins_min": -0.30981430411338806, "rewards/margins_std": 0.4958290457725525, "rewards/rejected": -0.4858037531375885, "step": 2640 }, { "epoch": 0.63, "grad_norm": 9.431714438194804, "learning_rate": 1.7729134648819605e-07, "logits/chosen": -2.5747389793395996, "logits/rejected": -2.530871868133545, "logps/chosen": -273.8680114746094, "logps/rejected": -288.00897216796875, "loss": 0.574, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2643434405326843, "rewards/margins": 0.30124327540397644, "rewards/margins_max": 0.9424476623535156, "rewards/margins_min": -0.29725712537765503, "rewards/margins_std": 0.5660034418106079, "rewards/rejected": -0.5655866861343384, "step": 2650 }, { "epoch": 0.64, "grad_norm": 8.482087952859194, "learning_rate": 1.7529431480197533e-07, "logits/chosen": -2.6235687732696533, "logits/rejected": -2.5714054107666016, "logps/chosen": -297.802978515625, "logps/rejected": -307.982666015625, "loss": 0.5844, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2382938116788864, "rewards/margins": 0.28809601068496704, "rewards/margins_max": 0.915099024772644, "rewards/margins_min": -0.3727230429649353, "rewards/margins_std": 0.5710189342498779, "rewards/rejected": -0.5263898968696594, "step": 2660 }, { "epoch": 0.64, "grad_norm": 7.397709424972579, "learning_rate": 1.7330250391889961e-07, "logits/chosen": -2.66947078704834, "logits/rejected": -2.5924038887023926, "logps/chosen": -291.0464782714844, "logps/rejected": -260.6331481933594, "loss": 0.576, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.10688348859548569, "rewards/margins": 0.40493711829185486, "rewards/margins_max": 0.9435163736343384, "rewards/margins_min": -0.08007287979125977, "rewards/margins_std": 0.4616405963897705, "rewards/rejected": -0.5118206143379211, "step": 2670 }, { "epoch": 0.64, "grad_norm": 5.814215294199613, "learning_rate": 1.713160530365747e-07, "logits/chosen": -2.7008068561553955, "logits/rejected": -2.6780431270599365, "logps/chosen": -314.0332336425781, "logps/rejected": -321.0278625488281, "loss": 0.6235, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.22004225850105286, "rewards/margins": 0.1748182624578476, "rewards/margins_max": 0.7563528418540955, "rewards/margins_min": -0.36928868293762207, "rewards/margins_std": 0.5032538771629333, "rewards/rejected": -0.39486050605773926, "step": 2680 }, { "epoch": 0.64, "grad_norm": 13.848762517517642, "learning_rate": 1.693351009780231e-07, "logits/chosen": -2.602003574371338, "logits/rejected": -2.5646305084228516, "logps/chosen": -307.02569580078125, "logps/rejected": -298.5267639160156, "loss": 0.5792, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.20884010195732117, "rewards/margins": 0.3109968304634094, "rewards/margins_max": 0.9659851789474487, "rewards/margins_min": -0.37391597032546997, "rewards/margins_std": 0.5974076390266418, "rewards/rejected": -0.519836962223053, "step": 2690 }, { "epoch": 0.65, "grad_norm": 8.596476840107231, "learning_rate": 1.6735978618198215e-07, "logits/chosen": -2.6759331226348877, "logits/rejected": -2.661097288131714, "logps/chosen": -269.54351806640625, "logps/rejected": -326.4341125488281, "loss": 0.5909, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.19427165389060974, "rewards/margins": 0.2538819909095764, "rewards/margins_max": 0.9365754127502441, "rewards/margins_min": -0.30797111988067627, "rewards/margins_std": 0.5668118596076965, "rewards/rejected": -0.44815367460250854, "step": 2700 }, { "epoch": 0.65, "eval_logits/chosen": -2.623544216156006, "eval_logits/rejected": -2.593374252319336, "eval_logps/chosen": -304.5151672363281, "eval_logps/rejected": -316.2978820800781, "eval_loss": 0.5945470929145813, "eval_rewards/accuracies": 0.6949999928474426, "eval_rewards/chosen": -0.2005983293056488, "eval_rewards/margins": 0.3038475215435028, "eval_rewards/margins_max": 1.1335314512252808, "eval_rewards/margins_min": -0.5149688720703125, "eval_rewards/margins_std": 0.5597691535949707, "eval_rewards/rejected": -0.5044458508491516, "eval_runtime": 859.3675, "eval_samples_per_second": 4.655, "eval_steps_per_second": 0.291, "step": 2700 }, { "epoch": 0.65, "grad_norm": 9.703570025523305, "learning_rate": 1.6539024669322954e-07, "logits/chosen": -2.6310129165649414, "logits/rejected": -2.6164767742156982, "logps/chosen": -305.9279479980469, "logps/rejected": -299.69830322265625, "loss": 0.5767, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.17514154314994812, "rewards/margins": 0.34417447447776794, "rewards/margins_max": 0.9141018986701965, "rewards/margins_min": -0.21524396538734436, "rewards/margins_std": 0.5172333121299744, "rewards/rejected": -0.5193160176277161, "step": 2710 }, { "epoch": 0.65, "grad_norm": 9.837477850495237, "learning_rate": 1.6342662015293584e-07, "logits/chosen": -2.603860378265381, "logits/rejected": -2.58225679397583, "logps/chosen": -327.4505615234375, "logps/rejected": -316.6806945800781, "loss": 0.5974, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2224818766117096, "rewards/margins": 0.33033767342567444, "rewards/margins_max": 0.8051918148994446, "rewards/margins_min": -0.11464174091815948, "rewards/margins_std": 0.4120241701602936, "rewards/rejected": -0.552819550037384, "step": 2720 }, { "epoch": 0.65, "grad_norm": 5.8630349897277165, "learning_rate": 1.6146904378904536e-07, "logits/chosen": -2.732633590698242, "logits/rejected": -2.6950955390930176, "logps/chosen": -367.7388000488281, "logps/rejected": -373.8130798339844, "loss": 0.6117, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3329017460346222, "rewards/margins": 0.24218297004699707, "rewards/margins_max": 0.9342014193534851, "rewards/margins_min": -0.3903924226760864, "rewards/margins_std": 0.5841065645217896, "rewards/rejected": -0.5750846862792969, "step": 2730 }, { "epoch": 0.66, "grad_norm": 7.145113479642076, "learning_rate": 1.5951765440668635e-07, "logits/chosen": -2.679503917694092, "logits/rejected": -2.6337997913360596, "logps/chosen": -326.5082092285156, "logps/rejected": -292.46990966796875, "loss": 0.5674, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17851652204990387, "rewards/margins": 0.35734179615974426, "rewards/margins_max": 1.0165164470672607, "rewards/margins_min": -0.28420490026474, "rewards/margins_std": 0.5844985246658325, "rewards/rejected": -0.5358583331108093, "step": 2740 }, { "epoch": 0.66, "grad_norm": 11.910015321812995, "learning_rate": 1.5757258837860998e-07, "logits/chosen": -2.6206724643707275, "logits/rejected": -2.5926003456115723, "logps/chosen": -307.025146484375, "logps/rejected": -302.80303955078125, "loss": 0.5821, "rewards/accuracies": 0.75, "rewards/chosen": -0.2873552739620209, "rewards/margins": 0.3741292953491211, "rewards/margins_max": 0.9916057586669922, "rewards/margins_min": -0.1654580980539322, "rewards/margins_std": 0.5074654221534729, "rewards/rejected": -0.6614845991134644, "step": 2750 }, { "epoch": 0.66, "grad_norm": 8.511001199533286, "learning_rate": 1.5563398163566034e-07, "logits/chosen": -2.63610577583313, "logits/rejected": -2.648735761642456, "logps/chosen": -278.10650634765625, "logps/rejected": -330.8636169433594, "loss": 0.5522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18409788608551025, "rewards/margins": 0.3788400888442993, "rewards/margins_max": 0.9228482246398926, "rewards/margins_min": -0.09980174154043198, "rewards/margins_std": 0.4648720622062683, "rewards/rejected": -0.5629379749298096, "step": 2760 }, { "epoch": 0.66, "grad_norm": 8.482171153544789, "learning_rate": 1.5370196965727438e-07, "logits/chosen": -2.635110378265381, "logits/rejected": -2.607448101043701, "logps/chosen": -296.49542236328125, "logps/rejected": -314.3409423828125, "loss": 0.5618, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.27174216508865356, "rewards/margins": 0.286690890789032, "rewards/margins_max": 0.8846603631973267, "rewards/margins_min": -0.32019850611686707, "rewards/margins_std": 0.530876100063324, "rewards/rejected": -0.5584330558776855, "step": 2770 }, { "epoch": 0.67, "grad_norm": 8.442325370774075, "learning_rate": 1.5177668746201454e-07, "logits/chosen": -2.600867748260498, "logits/rejected": -2.615812301635742, "logps/chosen": -275.86767578125, "logps/rejected": -312.8405456542969, "loss": 0.5954, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2981366515159607, "rewards/margins": 0.296410471200943, "rewards/margins_max": 0.9821388125419617, "rewards/margins_min": -0.39119023084640503, "rewards/margins_std": 0.6126337051391602, "rewards/rejected": -0.5945470929145813, "step": 2780 }, { "epoch": 0.67, "grad_norm": 5.197446181333802, "learning_rate": 1.4985826959813254e-07, "logits/chosen": -2.70615291595459, "logits/rejected": -2.6561496257781982, "logps/chosen": -346.88934326171875, "logps/rejected": -350.87066650390625, "loss": 0.5909, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.29688599705696106, "rewards/margins": 0.2667461633682251, "rewards/margins_max": 0.965578556060791, "rewards/margins_min": -0.36972442269325256, "rewards/margins_std": 0.6068300008773804, "rewards/rejected": -0.5636321306228638, "step": 2790 }, { "epoch": 0.67, "grad_norm": 7.0483743294802785, "learning_rate": 1.4794685013416674e-07, "logits/chosen": -2.624732732772827, "logits/rejected": -2.594238758087158, "logps/chosen": -319.1407775878906, "logps/rejected": -318.9751281738281, "loss": 0.6097, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.1605234146118164, "rewards/margins": 0.3256325423717499, "rewards/margins_max": 0.9358084797859192, "rewards/margins_min": -0.23688547313213348, "rewards/margins_std": 0.5254564881324768, "rewards/rejected": -0.48615598678588867, "step": 2800 }, { "epoch": 0.67, "eval_logits/chosen": -2.621018648147583, "eval_logits/rejected": -2.5909228324890137, "eval_logps/chosen": -304.81005859375, "eval_logps/rejected": -316.7603759765625, "eval_loss": 0.5937537550926208, "eval_rewards/accuracies": 0.6974999904632568, "eval_rewards/chosen": -0.20354729890823364, "eval_rewards/margins": 0.30552393198013306, "eval_rewards/margins_max": 1.1390976905822754, "eval_rewards/margins_min": -0.5171207189559937, "eval_rewards/margins_std": 0.5610091090202332, "eval_rewards/rejected": -0.5090711712837219, "eval_runtime": 859.8338, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 2800 }, { "epoch": 0.67, "grad_norm": 7.734353791795418, "learning_rate": 1.460425626495725e-07, "logits/chosen": -2.626800775527954, "logits/rejected": -2.603543758392334, "logps/chosen": -268.3589782714844, "logps/rejected": -282.11590576171875, "loss": 0.5761, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.18991556763648987, "rewards/margins": 0.3082110285758972, "rewards/margins_max": 0.8790051341056824, "rewards/margins_min": -0.26625508069992065, "rewards/margins_std": 0.5046501159667969, "rewards/rejected": -0.49812665581703186, "step": 2810 }, { "epoch": 0.68, "grad_norm": 5.283835596384424, "learning_rate": 1.4414554022538737e-07, "logits/chosen": -2.6725804805755615, "logits/rejected": -2.624807119369507, "logps/chosen": -308.9907531738281, "logps/rejected": -298.0137634277344, "loss": 0.5764, "rewards/accuracies": 0.8125, "rewards/chosen": -0.203936368227005, "rewards/margins": 0.4011301100254059, "rewards/margins_max": 0.9146728515625, "rewards/margins_min": -0.11008103936910629, "rewards/margins_std": 0.4578138291835785, "rewards/rejected": -0.6050664186477661, "step": 2820 }, { "epoch": 0.68, "grad_norm": 8.105790656258398, "learning_rate": 1.4225591543493025e-07, "logits/chosen": -2.537041187286377, "logits/rejected": -2.5283117294311523, "logps/chosen": -261.78497314453125, "logps/rejected": -334.4160461425781, "loss": 0.5651, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.16362933814525604, "rewards/margins": 0.35795944929122925, "rewards/margins_max": 0.9304954409599304, "rewards/margins_min": -0.23991163074970245, "rewards/margins_std": 0.5223571062088013, "rewards/rejected": -0.5215888023376465, "step": 2830 }, { "epoch": 0.68, "grad_norm": 13.215129527836645, "learning_rate": 1.4037382033453698e-07, "logits/chosen": -2.6501476764678955, "logits/rejected": -2.6472105979919434, "logps/chosen": -308.47210693359375, "logps/rejected": -330.118408203125, "loss": 0.5725, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25531333684921265, "rewards/margins": 0.2947203516960144, "rewards/margins_max": 0.7916975021362305, "rewards/margins_min": -0.17512428760528564, "rewards/margins_std": 0.4317558705806732, "rewards/rejected": -0.550033688545227, "step": 2840 }, { "epoch": 0.68, "grad_norm": 7.861511789391789, "learning_rate": 1.384993864543314e-07, "logits/chosen": -2.6705267429351807, "logits/rejected": -2.6534152030944824, "logps/chosen": -300.2662353515625, "logps/rejected": -340.5654296875, "loss": 0.5674, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.17005816102027893, "rewards/margins": 0.3681716322898865, "rewards/margins_max": 0.962718665599823, "rewards/margins_min": -0.232384592294693, "rewards/margins_std": 0.5470969080924988, "rewards/rejected": -0.5382298231124878, "step": 2850 }, { "epoch": 0.68, "grad_norm": 9.159352476729387, "learning_rate": 1.366327447890332e-07, "logits/chosen": -2.6822450160980225, "logits/rejected": -2.6347575187683105, "logps/chosen": -319.39990234375, "logps/rejected": -333.537109375, "loss": 0.5934, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.13150997459888458, "rewards/margins": 0.322347491979599, "rewards/margins_max": 0.8425567746162415, "rewards/margins_min": -0.22028379142284393, "rewards/margins_std": 0.4941239356994629, "rewards/rejected": -0.4538574814796448, "step": 2860 }, { "epoch": 0.69, "grad_norm": 5.893212840256697, "learning_rate": 1.3477402578880356e-07, "logits/chosen": -2.7067980766296387, "logits/rejected": -2.665985584259033, "logps/chosen": -324.10211181640625, "logps/rejected": -344.2232360839844, "loss": 0.5912, "rewards/accuracies": 0.75, "rewards/chosen": -0.27206873893737793, "rewards/margins": 0.3080797791481018, "rewards/margins_max": 0.816716194152832, "rewards/margins_min": -0.21273913979530334, "rewards/margins_std": 0.47305870056152344, "rewards/rejected": -0.5801485180854797, "step": 2870 }, { "epoch": 0.69, "grad_norm": 17.80143142259519, "learning_rate": 1.3292335935012854e-07, "logits/chosen": -2.6728432178497314, "logits/rejected": -2.6427035331726074, "logps/chosen": -349.10595703125, "logps/rejected": -328.55853271484375, "loss": 0.59, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.21816524863243103, "rewards/margins": 0.3346711993217468, "rewards/margins_max": 0.8795034289360046, "rewards/margins_min": -0.2123207151889801, "rewards/margins_std": 0.501558244228363, "rewards/rejected": -0.5528364181518555, "step": 2880 }, { "epoch": 0.69, "grad_norm": 4.457459598856668, "learning_rate": 1.3108087480674166e-07, "logits/chosen": -2.643859386444092, "logits/rejected": -2.6534647941589355, "logps/chosen": -337.7801513671875, "logps/rejected": -360.29046630859375, "loss": 0.5731, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2451343983411789, "rewards/margins": 0.37806132435798645, "rewards/margins_max": 1.0581673383712769, "rewards/margins_min": -0.2207886278629303, "rewards/margins_std": 0.5833451747894287, "rewards/rejected": -0.6231956481933594, "step": 2890 }, { "epoch": 0.69, "grad_norm": 4.465711934743042, "learning_rate": 1.2924670092058465e-07, "logits/chosen": -2.6721737384796143, "logits/rejected": -2.6429543495178223, "logps/chosen": -322.4364929199219, "logps/rejected": -283.9398193359375, "loss": 0.5776, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.17205238342285156, "rewards/margins": 0.3702142536640167, "rewards/margins_max": 0.9432505369186401, "rewards/margins_min": -0.21967828273773193, "rewards/margins_std": 0.5122222304344177, "rewards/rejected": -0.5422666072845459, "step": 2900 }, { "epoch": 0.69, "eval_logits/chosen": -2.6177093982696533, "eval_logits/rejected": -2.587409734725952, "eval_logps/chosen": -305.8715515136719, "eval_logps/rejected": -318.1777648925781, "eval_loss": 0.5928537845611572, "eval_rewards/accuracies": 0.7039999961853027, "eval_rewards/chosen": -0.2141621708869934, "eval_rewards/margins": 0.3090827465057373, "eval_rewards/margins_max": 1.1530448198318481, "eval_rewards/margins_min": -0.5251262784004211, "eval_rewards/margins_std": 0.5672925710678101, "eval_rewards/rejected": -0.5232448577880859, "eval_runtime": 859.8299, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 2900 }, { "epoch": 0.7, "grad_norm": 10.347129799745117, "learning_rate": 1.2742096587280966e-07, "logits/chosen": -2.597189426422119, "logits/rejected": -2.5491786003112793, "logps/chosen": -300.6410827636719, "logps/rejected": -296.36639404296875, "loss": 0.5706, "rewards/accuracies": 0.75, "rewards/chosen": -0.26455071568489075, "rewards/margins": 0.3420693278312683, "rewards/margins_max": 0.9913008809089661, "rewards/margins_min": -0.338512122631073, "rewards/margins_std": 0.590982973575592, "rewards/rejected": -0.6066200137138367, "step": 2910 }, { "epoch": 0.7, "grad_norm": 14.772272395886871, "learning_rate": 1.2560379725482073e-07, "logits/chosen": -2.6537861824035645, "logits/rejected": -2.5844337940216064, "logps/chosen": -310.41607666015625, "logps/rejected": -300.7682189941406, "loss": 0.5778, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.17608337104320526, "rewards/margins": 0.29788199067115784, "rewards/margins_max": 0.9067052602767944, "rewards/margins_min": -0.2519022822380066, "rewards/margins_std": 0.5077629089355469, "rewards/rejected": -0.4739653468132019, "step": 2920 }, { "epoch": 0.7, "grad_norm": 8.34466433426549, "learning_rate": 1.237953220593579e-07, "logits/chosen": -2.6661570072174072, "logits/rejected": -2.6105422973632812, "logps/chosen": -329.9227600097656, "logps/rejected": -317.62249755859375, "loss": 0.5757, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2413763552904129, "rewards/margins": 0.32379597425460815, "rewards/margins_max": 1.0564546585083008, "rewards/margins_min": -0.45707249641418457, "rewards/margins_std": 0.6725181341171265, "rewards/rejected": -0.5651723742485046, "step": 2930 }, { "epoch": 0.7, "grad_norm": 6.24181959765327, "learning_rate": 1.2199566667162127e-07, "logits/chosen": -2.6748130321502686, "logits/rejected": -2.608752965927124, "logps/chosen": -335.2709045410156, "logps/rejected": -319.9509582519531, "loss": 0.5372, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.1704896092414856, "rewards/margins": 0.4461976885795593, "rewards/margins_max": 1.0146349668502808, "rewards/margins_min": -0.16861645877361298, "rewards/margins_std": 0.5327773094177246, "rewards/rejected": -0.6166872978210449, "step": 2940 }, { "epoch": 0.71, "grad_norm": 13.571559991415354, "learning_rate": 1.2020495686043924e-07, "logits/chosen": -2.6338913440704346, "logits/rejected": -2.6120593547821045, "logps/chosen": -321.2636413574219, "logps/rejected": -313.9801940917969, "loss": 0.557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.19493046402931213, "rewards/margins": 0.37187767028808594, "rewards/margins_max": 1.043025016784668, "rewards/margins_min": -0.22166283428668976, "rewards/margins_std": 0.5632063150405884, "rewards/rejected": -0.5668081641197205, "step": 2950 }, { "epoch": 0.71, "grad_norm": 8.294276863387395, "learning_rate": 1.1842331776947931e-07, "logits/chosen": -2.6366591453552246, "logits/rejected": -2.6095542907714844, "logps/chosen": -350.4244384765625, "logps/rejected": -306.56805419921875, "loss": 0.5678, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.18484465777873993, "rewards/margins": 0.3695618212223053, "rewards/margins_max": 0.9093047976493835, "rewards/margins_min": -0.16653835773468018, "rewards/margins_std": 0.4747069478034973, "rewards/rejected": -0.554406464099884, "step": 2960 }, { "epoch": 0.71, "grad_norm": 10.14282240372356, "learning_rate": 1.1665087390850187e-07, "logits/chosen": -2.61653470993042, "logits/rejected": -2.591304302215576, "logps/chosen": -244.28515625, "logps/rejected": -297.53997802734375, "loss": 0.6192, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.30605635046958923, "rewards/margins": 0.23393836617469788, "rewards/margins_max": 0.8981086611747742, "rewards/margins_min": -0.3801124691963196, "rewards/margins_std": 0.5839722752571106, "rewards/rejected": -0.5399946570396423, "step": 2970 }, { "epoch": 0.71, "grad_norm": 7.2314257284719625, "learning_rate": 1.1488774914465918e-07, "logits/chosen": -2.6050806045532227, "logits/rejected": -2.604292392730713, "logps/chosen": -276.8834228515625, "logps/rejected": -320.0370178222656, "loss": 0.5755, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2090921849012375, "rewards/margins": 0.34288525581359863, "rewards/margins_max": 1.0640244483947754, "rewards/margins_min": -0.3710848093032837, "rewards/margins_std": 0.6398088335990906, "rewards/rejected": -0.5519774556159973, "step": 2980 }, { "epoch": 0.72, "grad_norm": 9.784779652184145, "learning_rate": 1.1313406669383877e-07, "logits/chosen": -2.6493895053863525, "logits/rejected": -2.5964341163635254, "logps/chosen": -348.2480773925781, "logps/rejected": -330.4178771972656, "loss": 0.6217, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25257083773612976, "rewards/margins": 0.30446967482566833, "rewards/margins_max": 1.0399785041809082, "rewards/margins_min": -0.44789689779281616, "rewards/margins_std": 0.6760698556900024, "rewards/rejected": -0.5570404529571533, "step": 2990 }, { "epoch": 0.72, "grad_norm": 11.042945259967942, "learning_rate": 1.1138994911205284e-07, "logits/chosen": -2.627768039703369, "logits/rejected": -2.5813279151916504, "logps/chosen": -306.77679443359375, "logps/rejected": -345.439208984375, "loss": 0.575, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.12094666808843613, "rewards/margins": 0.4055045247077942, "rewards/margins_max": 0.9982796907424927, "rewards/margins_min": -0.17103391885757446, "rewards/margins_std": 0.513174295425415, "rewards/rejected": -0.5264511108398438, "step": 3000 }, { "epoch": 0.72, "eval_logits/chosen": -2.6164939403533936, "eval_logits/rejected": -2.5860860347747803, "eval_logps/chosen": -302.9332580566406, "eval_logps/rejected": -314.7164611816406, "eval_loss": 0.5947726368904114, "eval_rewards/accuracies": 0.6980000138282776, "eval_rewards/chosen": -0.18477900326251984, "eval_rewards/margins": 0.3038530945777893, "eval_rewards/margins_max": 1.1465470790863037, "eval_rewards/margins_min": -0.5243028998374939, "eval_rewards/margins_std": 0.5646576881408691, "eval_rewards/rejected": -0.48863208293914795, "eval_runtime": 859.7576, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 3000 }, { "epoch": 0.72, "grad_norm": 6.654501528625748, "learning_rate": 1.0965551828687297e-07, "logits/chosen": -2.648944616317749, "logits/rejected": -2.5937111377716064, "logps/chosen": -277.88275146484375, "logps/rejected": -333.79376220703125, "loss": 0.5816, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.11035318672657013, "rewards/margins": 0.3619997799396515, "rewards/margins_max": 0.9593564867973328, "rewards/margins_min": -0.24574780464172363, "rewards/margins_std": 0.5336962342262268, "rewards/rejected": -0.47235292196273804, "step": 3010 }, { "epoch": 0.72, "grad_norm": 10.100569272758296, "learning_rate": 1.0793089542891229e-07, "logits/chosen": -2.592881679534912, "logits/rejected": -2.56054425239563, "logps/chosen": -306.00885009765625, "logps/rejected": -283.6434020996094, "loss": 0.5527, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.11078639328479767, "rewards/margins": 0.3766598105430603, "rewards/margins_max": 0.9737428426742554, "rewards/margins_min": -0.19211386144161224, "rewards/margins_std": 0.5283175706863403, "rewards/rejected": -0.4874461591243744, "step": 3020 }, { "epoch": 0.73, "grad_norm": 7.425480209902673, "learning_rate": 1.062162010633545e-07, "logits/chosen": -2.66178560256958, "logits/rejected": -2.619534969329834, "logps/chosen": -305.2130126953125, "logps/rejected": -286.83563232421875, "loss": 0.5774, "rewards/accuracies": 0.75, "rewards/chosen": -0.14195309579372406, "rewards/margins": 0.35815608501434326, "rewards/margins_max": 0.9821040034294128, "rewards/margins_min": -0.17042629420757294, "rewards/margins_std": 0.5124177932739258, "rewards/rejected": -0.5001091361045837, "step": 3030 }, { "epoch": 0.73, "grad_norm": 5.721931948084131, "learning_rate": 1.0451155502153138e-07, "logits/chosen": -2.6573493480682373, "logits/rejected": -2.6277241706848145, "logps/chosen": -311.711669921875, "logps/rejected": -287.81158447265625, "loss": 0.6348, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.25803881883621216, "rewards/margins": 0.1889791637659073, "rewards/margins_max": 0.8133442997932434, "rewards/margins_min": -0.4662505090236664, "rewards/margins_std": 0.5739446878433228, "rewards/rejected": -0.44701796770095825, "step": 3040 }, { "epoch": 0.73, "grad_norm": 5.877237745995071, "learning_rate": 1.028170764325479e-07, "logits/chosen": -2.6631836891174316, "logits/rejected": -2.641143560409546, "logps/chosen": -337.29217529296875, "logps/rejected": -321.89019775390625, "loss": 0.5948, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.23359163105487823, "rewards/margins": 0.2570040822029114, "rewards/margins_max": 0.9715884327888489, "rewards/margins_min": -0.3664648234844208, "rewards/margins_std": 0.5953644514083862, "rewards/rejected": -0.4905957281589508, "step": 3050 }, { "epoch": 0.73, "grad_norm": 8.202499046058216, "learning_rate": 1.0113288371495707e-07, "logits/chosen": -2.5986862182617188, "logits/rejected": -2.6056227684020996, "logps/chosen": -324.17584228515625, "logps/rejected": -317.966796875, "loss": 0.574, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.19230985641479492, "rewards/margins": 0.32042282819747925, "rewards/margins_max": 0.9411641359329224, "rewards/margins_min": -0.2271740883588791, "rewards/margins_std": 0.5099384188652039, "rewards/rejected": -0.5127326250076294, "step": 3060 }, { "epoch": 0.74, "grad_norm": 7.156920022707099, "learning_rate": 9.945909456848434e-08, "logits/chosen": -2.6237006187438965, "logits/rejected": -2.5988717079162598, "logps/chosen": -322.70654296875, "logps/rejected": -287.52508544921875, "loss": 0.5955, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.20879539847373962, "rewards/margins": 0.2584335207939148, "rewards/margins_max": 0.9114171266555786, "rewards/margins_min": -0.33635640144348145, "rewards/margins_std": 0.5569571256637573, "rewards/rejected": -0.4672289490699768, "step": 3070 }, { "epoch": 0.74, "grad_norm": 8.227703134955291, "learning_rate": 9.779582596580203e-08, "logits/chosen": -2.5234127044677734, "logits/rejected": -2.52382493019104, "logps/chosen": -273.2856750488281, "logps/rejected": -297.1725769042969, "loss": 0.5929, "rewards/accuracies": 0.75, "rewards/chosen": -0.19276472926139832, "rewards/margins": 0.35403233766555786, "rewards/margins_max": 0.9341346621513367, "rewards/margins_min": -0.2888151705265045, "rewards/margins_std": 0.5561498999595642, "rewards/rejected": -0.5467970967292786, "step": 3080 }, { "epoch": 0.74, "grad_norm": 11.626153704091989, "learning_rate": 9.614319414435499e-08, "logits/chosen": -2.6854310035705566, "logits/rejected": -2.6439242362976074, "logps/chosen": -295.9474792480469, "logps/rejected": -279.2301330566406, "loss": 0.5466, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.141657292842865, "rewards/margins": 0.32109707593917847, "rewards/margins_max": 0.9081085920333862, "rewards/margins_min": -0.3001248836517334, "rewards/margins_std": 0.5518236756324768, "rewards/rejected": -0.46275433897972107, "step": 3090 }, { "epoch": 0.74, "grad_norm": 8.190121296600395, "learning_rate": 9.450131459823688e-08, "logits/chosen": -2.6764397621154785, "logits/rejected": -2.655296802520752, "logps/chosen": -338.6136474609375, "logps/rejected": -311.8411865234375, "loss": 0.5767, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11137326061725616, "rewards/margins": 0.33382076025009155, "rewards/margins_max": 0.9263644218444824, "rewards/margins_min": -0.2573855519294739, "rewards/margins_std": 0.5275608897209167, "rewards/rejected": -0.4451940655708313, "step": 3100 }, { "epoch": 0.74, "eval_logits/chosen": -2.616608142852783, "eval_logits/rejected": -2.586158037185669, "eval_logps/chosen": -304.1734313964844, "eval_logps/rejected": -316.46478271484375, "eval_loss": 0.5935968160629272, "eval_rewards/accuracies": 0.7009999752044678, "eval_rewards/chosen": -0.1971808820962906, "eval_rewards/margins": 0.3089340329170227, "eval_rewards/margins_max": 1.1550542116165161, "eval_rewards/margins_min": -0.5275899171829224, "eval_rewards/margins_std": 0.5689510107040405, "eval_rewards/rejected": -0.5061149001121521, "eval_runtime": 860.0822, "eval_samples_per_second": 4.651, "eval_steps_per_second": 0.291, "step": 3100 }, { "epoch": 0.74, "grad_norm": 6.102975288582354, "learning_rate": 9.287030207011929e-08, "logits/chosen": -2.5496556758880615, "logits/rejected": -2.5553019046783447, "logps/chosen": -304.30828857421875, "logps/rejected": -323.41571044921875, "loss": 0.5947, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2754679322242737, "rewards/margins": 0.2878427505493164, "rewards/margins_max": 0.8745163679122925, "rewards/margins_min": -0.3475717306137085, "rewards/margins_std": 0.5523396134376526, "rewards/rejected": -0.5633106827735901, "step": 3110 }, { "epoch": 0.75, "grad_norm": 9.587786184563068, "learning_rate": 9.125027054323256e-08, "logits/chosen": -2.6219749450683594, "logits/rejected": -2.595902442932129, "logps/chosen": -343.4728088378906, "logps/rejected": -317.55120849609375, "loss": 0.5891, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.20043566823005676, "rewards/margins": 0.3993472456932068, "rewards/margins_max": 1.026168942451477, "rewards/margins_min": -0.16250573098659515, "rewards/margins_std": 0.5393126010894775, "rewards/rejected": -0.5997829437255859, "step": 3120 }, { "epoch": 0.75, "grad_norm": 8.478665223786749, "learning_rate": 8.964133323340081e-08, "logits/chosen": -2.572331666946411, "logits/rejected": -2.5079894065856934, "logps/chosen": -256.3243408203125, "logps/rejected": -263.0610046386719, "loss": 0.5782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.22794052958488464, "rewards/margins": 0.32599711418151855, "rewards/margins_max": 0.9748775362968445, "rewards/margins_min": -0.3257297873497009, "rewards/margins_std": 0.6075552701950073, "rewards/rejected": -0.5539376139640808, "step": 3130 }, { "epoch": 0.75, "grad_norm": 9.307544593099678, "learning_rate": 8.804360258112861e-08, "logits/chosen": -2.718839645385742, "logits/rejected": -2.6697192192077637, "logps/chosen": -302.1324462890625, "logps/rejected": -286.06524658203125, "loss": 0.576, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19155898690223694, "rewards/margins": 0.29025158286094666, "rewards/margins_max": 0.9243696928024292, "rewards/margins_min": -0.32775741815567017, "rewards/margins_std": 0.5530522465705872, "rewards/rejected": -0.4818105697631836, "step": 3140 }, { "epoch": 0.75, "grad_norm": 6.980694443224683, "learning_rate": 8.645719024374446e-08, "logits/chosen": -2.6788792610168457, "logits/rejected": -2.623927354812622, "logps/chosen": -327.87261962890625, "logps/rejected": -320.5452575683594, "loss": 0.5802, "rewards/accuracies": 0.75, "rewards/chosen": -0.16010625660419464, "rewards/margins": 0.3369438350200653, "rewards/margins_max": 0.8717771768569946, "rewards/margins_min": -0.29793721437454224, "rewards/margins_std": 0.5188963413238525, "rewards/rejected": -0.49705010652542114, "step": 3150 }, { "epoch": 0.76, "grad_norm": 7.148682436889758, "learning_rate": 8.488220708759667e-08, "logits/chosen": -2.6966350078582764, "logits/rejected": -2.651716470718384, "logps/chosen": -338.0216064453125, "logps/rejected": -319.8200378417969, "loss": 0.547, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2118571251630783, "rewards/margins": 0.4064742624759674, "rewards/margins_max": 0.971143901348114, "rewards/margins_min": -0.24024486541748047, "rewards/margins_std": 0.5454075336456299, "rewards/rejected": -0.6183313727378845, "step": 3160 }, { "epoch": 0.76, "grad_norm": 11.094830355625867, "learning_rate": 8.331876318030585e-08, "logits/chosen": -2.64422607421875, "logits/rejected": -2.61445951461792, "logps/chosen": -303.38189697265625, "logps/rejected": -304.0430603027344, "loss": 0.5789, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.22517380118370056, "rewards/margins": 0.2765592336654663, "rewards/margins_max": 0.9068194627761841, "rewards/margins_min": -0.3106902241706848, "rewards/margins_std": 0.5414040088653564, "rewards/rejected": -0.501733124256134, "step": 3170 }, { "epoch": 0.76, "grad_norm": 10.52179309256804, "learning_rate": 8.176696778307269e-08, "logits/chosen": -2.6210858821868896, "logits/rejected": -2.586487293243408, "logps/chosen": -322.38720703125, "logps/rejected": -334.0662841796875, "loss": 0.5626, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.23957297205924988, "rewards/margins": 0.3708464503288269, "rewards/margins_max": 0.9727503657341003, "rewards/margins_min": -0.348827600479126, "rewards/margins_std": 0.5810685753822327, "rewards/rejected": -0.6104193925857544, "step": 3180 }, { "epoch": 0.76, "grad_norm": 6.038743498002542, "learning_rate": 8.022692934304238e-08, "logits/chosen": -2.6480252742767334, "logits/rejected": -2.568481683731079, "logps/chosen": -321.59954833984375, "logps/rejected": -311.34429931640625, "loss": 0.5878, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.21423247456550598, "rewards/margins": 0.3581593334674835, "rewards/margins_max": 1.0015473365783691, "rewards/margins_min": -0.23262456059455872, "rewards/margins_std": 0.5628671646118164, "rewards/rejected": -0.5723918080329895, "step": 3190 }, { "epoch": 0.77, "grad_norm": 10.88907457915638, "learning_rate": 7.869875548572588e-08, "logits/chosen": -2.64384126663208, "logits/rejected": -2.6285018920898438, "logps/chosen": -272.13360595703125, "logps/rejected": -285.25616455078125, "loss": 0.5642, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17138831317424774, "rewards/margins": 0.3504408299922943, "rewards/margins_max": 0.9936060905456543, "rewards/margins_min": -0.2875770926475525, "rewards/margins_std": 0.5752379894256592, "rewards/rejected": -0.5218292474746704, "step": 3200 }, { "epoch": 0.77, "eval_logits/chosen": -2.6170332431793213, "eval_logits/rejected": -2.586714029312134, "eval_logps/chosen": -303.88458251953125, "eval_logps/rejected": -316.1905822753906, "eval_loss": 0.5936970114707947, "eval_rewards/accuracies": 0.7009999752044678, "eval_rewards/chosen": -0.1942928284406662, "eval_rewards/margins": 0.30908045172691345, "eval_rewards/margins_max": 1.1615225076675415, "eval_rewards/margins_min": -0.5332077145576477, "eval_rewards/margins_std": 0.5726361870765686, "eval_rewards/rejected": -0.5033733248710632, "eval_runtime": 860.0191, "eval_samples_per_second": 4.651, "eval_steps_per_second": 0.291, "step": 3200 }, { "epoch": 0.77, "grad_norm": 11.373705444449904, "learning_rate": 7.718255300747817e-08, "logits/chosen": -2.60569429397583, "logits/rejected": -2.5863037109375, "logps/chosen": -293.9930419921875, "logps/rejected": -334.334228515625, "loss": 0.5621, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1691075563430786, "rewards/margins": 0.3850109577178955, "rewards/margins_max": 1.034523606300354, "rewards/margins_min": -0.20032206177711487, "rewards/margins_std": 0.5684488415718079, "rewards/rejected": -0.5541185140609741, "step": 3210 }, { "epoch": 0.77, "grad_norm": 12.358749045858794, "learning_rate": 7.567842786803502e-08, "logits/chosen": -2.5796420574188232, "logits/rejected": -2.5125057697296143, "logps/chosen": -305.2764587402344, "logps/rejected": -315.77783203125, "loss": 0.5785, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.21089377999305725, "rewards/margins": 0.2600666880607605, "rewards/margins_max": 0.8219484090805054, "rewards/margins_min": -0.3330570161342621, "rewards/margins_std": 0.5194389820098877, "rewards/rejected": -0.4709605276584625, "step": 3220 }, { "epoch": 0.77, "grad_norm": 11.42643961778415, "learning_rate": 7.418648518310797e-08, "logits/chosen": -2.6522135734558105, "logits/rejected": -2.6124863624572754, "logps/chosen": -300.091796875, "logps/rejected": -266.65228271484375, "loss": 0.5677, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13778580725193024, "rewards/margins": 0.3589250147342682, "rewards/margins_max": 1.0125762224197388, "rewards/margins_min": -0.22981949150562286, "rewards/margins_std": 0.5771783590316772, "rewards/rejected": -0.4967108368873596, "step": 3230 }, { "epoch": 0.78, "grad_norm": 6.529714011316275, "learning_rate": 7.270682921703853e-08, "logits/chosen": -2.639592170715332, "logits/rejected": -2.646275758743286, "logps/chosen": -311.7178955078125, "logps/rejected": -324.42608642578125, "loss": 0.5831, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2275652438402176, "rewards/margins": 0.2939935326576233, "rewards/margins_max": 0.9402956962585449, "rewards/margins_min": -0.37714487314224243, "rewards/margins_std": 0.590320348739624, "rewards/rejected": -0.5215587019920349, "step": 3240 }, { "epoch": 0.78, "grad_norm": 8.797069703246049, "learning_rate": 7.123956337551116e-08, "logits/chosen": -2.6058413982391357, "logits/rejected": -2.6359944343566895, "logps/chosen": -296.3321838378906, "logps/rejected": -293.48260498046875, "loss": 0.5856, "rewards/accuracies": 0.75, "rewards/chosen": -0.19313833117485046, "rewards/margins": 0.36264750361442566, "rewards/margins_max": 1.0930047035217285, "rewards/margins_min": -0.3832333981990814, "rewards/margins_std": 0.6633163690567017, "rewards/rejected": -0.5557857751846313, "step": 3250 }, { "epoch": 0.78, "grad_norm": 15.849059048709869, "learning_rate": 6.978479019832725e-08, "logits/chosen": -2.579946279525757, "logits/rejected": -2.552095413208008, "logps/chosen": -336.55401611328125, "logps/rejected": -316.15594482421875, "loss": 0.5497, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.14274556934833527, "rewards/margins": 0.4389498233795166, "rewards/margins_max": 1.0567692518234253, "rewards/margins_min": -0.18148799240589142, "rewards/margins_std": 0.5417019128799438, "rewards/rejected": -0.5816953182220459, "step": 3260 }, { "epoch": 0.78, "grad_norm": 5.846431015177925, "learning_rate": 6.83426113522389e-08, "logits/chosen": -2.5978214740753174, "logits/rejected": -2.5879592895507812, "logps/chosen": -305.3526916503906, "logps/rejected": -297.043701171875, "loss": 0.5654, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.1424698531627655, "rewards/margins": 0.2821696400642395, "rewards/margins_max": 0.7785348892211914, "rewards/margins_min": -0.21284432709217072, "rewards/margins_std": 0.4451248049736023, "rewards/rejected": -0.4246394634246826, "step": 3270 }, { "epoch": 0.79, "grad_norm": 7.9976397847405485, "learning_rate": 6.691312762384396e-08, "logits/chosen": -2.6755032539367676, "logits/rejected": -2.629774570465088, "logps/chosen": -288.03515625, "logps/rejected": -273.4284973144531, "loss": 0.5783, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.23924663662910461, "rewards/margins": 0.24604792892932892, "rewards/margins_max": 0.8091939687728882, "rewards/margins_min": -0.3419875204563141, "rewards/margins_std": 0.5214563608169556, "rewards/rejected": -0.4852946400642395, "step": 3280 }, { "epoch": 0.79, "grad_norm": 8.37224977799963, "learning_rate": 6.54964389125428e-08, "logits/chosen": -2.586719036102295, "logits/rejected": -2.581692934036255, "logps/chosen": -288.68231201171875, "logps/rejected": -328.0451965332031, "loss": 0.5687, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2837051451206207, "rewards/margins": 0.3233044743537903, "rewards/margins_max": 0.906445324420929, "rewards/margins_min": -0.27489590644836426, "rewards/margins_std": 0.5275270342826843, "rewards/rejected": -0.6070095896720886, "step": 3290 }, { "epoch": 0.79, "grad_norm": 8.264026624313555, "learning_rate": 6.409264422355642e-08, "logits/chosen": -2.6691842079162598, "logits/rejected": -2.666581630706787, "logps/chosen": -325.159423828125, "logps/rejected": -350.07513427734375, "loss": 0.5767, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.17226542532444, "rewards/margins": 0.4506758153438568, "rewards/margins_max": 1.059866189956665, "rewards/margins_min": -0.20353195071220398, "rewards/margins_std": 0.5784635543823242, "rewards/rejected": -0.6229413151741028, "step": 3300 }, { "epoch": 0.79, "eval_logits/chosen": -2.613069534301758, "eval_logits/rejected": -2.5827982425689697, "eval_logps/chosen": -308.21435546875, "eval_logps/rejected": -321.5457763671875, "eval_loss": 0.5913904905319214, "eval_rewards/accuracies": 0.7049999833106995, "eval_rewards/chosen": -0.23759020864963531, "eval_rewards/margins": 0.3193349540233612, "eval_rewards/margins_max": 1.1828374862670898, "eval_rewards/margins_min": -0.532956063747406, "eval_rewards/margins_std": 0.5823011994361877, "eval_rewards/rejected": -0.5569252371788025, "eval_runtime": 859.3972, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 3300 }, { "epoch": 0.79, "grad_norm": 11.482562569568803, "learning_rate": 6.27018416610078e-08, "logits/chosen": -2.63213849067688, "logits/rejected": -2.578720808029175, "logps/chosen": -269.0001220703125, "logps/rejected": -352.06964111328125, "loss": 0.5617, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1835450977087021, "rewards/margins": 0.31281334161758423, "rewards/margins_max": 1.032710313796997, "rewards/margins_min": -0.3121184706687927, "rewards/margins_std": 0.6005935668945312, "rewards/rejected": -0.4963584542274475, "step": 3310 }, { "epoch": 0.8, "grad_norm": 2.8099366268464685, "learning_rate": 6.132412842106572e-08, "logits/chosen": -2.6385445594787598, "logits/rejected": -2.595637798309326, "logps/chosen": -330.910400390625, "logps/rejected": -353.44952392578125, "loss": 0.5931, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.27031436562538147, "rewards/margins": 0.30159300565719604, "rewards/margins_max": 0.9948034286499023, "rewards/margins_min": -0.37592440843582153, "rewards/margins_std": 0.6074448227882385, "rewards/rejected": -0.5719074010848999, "step": 3320 }, { "epoch": 0.8, "grad_norm": 12.653057684781732, "learning_rate": 5.995960078515255e-08, "logits/chosen": -2.640693187713623, "logits/rejected": -2.5642504692077637, "logps/chosen": -312.5872802734375, "logps/rejected": -312.4304504394531, "loss": 0.5964, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2662574052810669, "rewards/margins": 0.29711824655532837, "rewards/margins_max": 0.8822048306465149, "rewards/margins_min": -0.3330439031124115, "rewards/margins_std": 0.5500375628471375, "rewards/rejected": -0.5633755922317505, "step": 3330 }, { "epoch": 0.8, "grad_norm": 8.318222485446203, "learning_rate": 5.860835411321494e-08, "logits/chosen": -2.582233428955078, "logits/rejected": -2.559011220932007, "logps/chosen": -319.3325500488281, "logps/rejected": -332.45709228515625, "loss": 0.5777, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2152424156665802, "rewards/margins": 0.3113642632961273, "rewards/margins_max": 0.9375714063644409, "rewards/margins_min": -0.35544976592063904, "rewards/margins_std": 0.5669530630111694, "rewards/rejected": -0.5266066789627075, "step": 3340 }, { "epoch": 0.8, "grad_norm": 7.812987045365231, "learning_rate": 5.7270482837060455e-08, "logits/chosen": -2.6814351081848145, "logits/rejected": -2.6515543460845947, "logps/chosen": -319.64691162109375, "logps/rejected": -295.6579895019531, "loss": 0.5548, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1749572455883026, "rewards/margins": 0.3691059947013855, "rewards/margins_max": 1.0209461450576782, "rewards/margins_min": -0.19284987449645996, "rewards/margins_std": 0.5489664077758789, "rewards/rejected": -0.5440632104873657, "step": 3350 }, { "epoch": 0.8, "grad_norm": 10.508153020737115, "learning_rate": 5.5946080453757425e-08, "logits/chosen": -2.530405044555664, "logits/rejected": -2.5279834270477295, "logps/chosen": -310.46356201171875, "logps/rejected": -314.398681640625, "loss": 0.559, "rewards/accuracies": 0.75, "rewards/chosen": -0.08304639160633087, "rewards/margins": 0.4107007086277008, "rewards/margins_max": 1.032921314239502, "rewards/margins_min": -0.22185806930065155, "rewards/margins_std": 0.5521965026855469, "rewards/rejected": -0.49374714493751526, "step": 3360 }, { "epoch": 0.81, "grad_norm": 7.319287923562498, "learning_rate": 5.4635239519101706e-08, "logits/chosen": -2.588613986968994, "logits/rejected": -2.61376690864563, "logps/chosen": -299.3731994628906, "logps/rejected": -347.53216552734375, "loss": 0.6014, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.22208890318870544, "rewards/margins": 0.2745796740055084, "rewards/margins_max": 0.9492992162704468, "rewards/margins_min": -0.38629618287086487, "rewards/margins_std": 0.5912618041038513, "rewards/rejected": -0.49666857719421387, "step": 3370 }, { "epoch": 0.81, "grad_norm": 7.002572812972141, "learning_rate": 5.333805164114744e-08, "logits/chosen": -2.588752269744873, "logits/rejected": -2.5390632152557373, "logps/chosen": -332.0850524902344, "logps/rejected": -344.94659423828125, "loss": 0.559, "rewards/accuracies": 0.75, "rewards/chosen": -0.20025333762168884, "rewards/margins": 0.4261398911476135, "rewards/margins_max": 0.9834170341491699, "rewards/margins_min": -0.08735842257738113, "rewards/margins_std": 0.4765813946723938, "rewards/rejected": -0.6263931393623352, "step": 3380 }, { "epoch": 0.81, "grad_norm": 9.862202911350407, "learning_rate": 5.205460747380588e-08, "logits/chosen": -2.6821560859680176, "logits/rejected": -2.66601300239563, "logps/chosen": -288.72625732421875, "logps/rejected": -300.3806457519531, "loss": 0.5808, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.215274840593338, "rewards/margins": 0.3353163003921509, "rewards/margins_max": 0.886604905128479, "rewards/margins_min": -0.17076632380485535, "rewards/margins_std": 0.4661891460418701, "rewards/rejected": -0.5505911111831665, "step": 3390 }, { "epoch": 0.81, "grad_norm": 4.144530789791445, "learning_rate": 5.0784996710509785e-08, "logits/chosen": -2.6226415634155273, "logits/rejected": -2.6080586910247803, "logps/chosen": -380.19482421875, "logps/rejected": -385.8616943359375, "loss": 0.5685, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2578801214694977, "rewards/margins": 0.3303286135196686, "rewards/margins_max": 0.9798241853713989, "rewards/margins_min": -0.3123224377632141, "rewards/margins_std": 0.5742116570472717, "rewards/rejected": -0.5882086753845215, "step": 3400 }, { "epoch": 0.81, "eval_logits/chosen": -2.6102540493011475, "eval_logits/rejected": -2.5799925327301025, "eval_logps/chosen": -306.9150085449219, "eval_logps/rejected": -320.19580078125, "eval_loss": 0.5913717150688171, "eval_rewards/accuracies": 0.7045000195503235, "eval_rewards/chosen": -0.22459664940834045, "eval_rewards/margins": 0.3188289403915405, "eval_rewards/margins_max": 1.185779333114624, "eval_rewards/margins_min": -0.5379509329795837, "eval_rewards/margins_std": 0.583387017250061, "eval_rewards/rejected": -0.5434256196022034, "eval_runtime": 859.4957, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 3400 }, { "epoch": 0.82, "grad_norm": 8.964599074139267, "learning_rate": 4.952930807794503e-08, "logits/chosen": -2.6341657638549805, "logits/rejected": -2.6094231605529785, "logps/chosen": -285.22247314453125, "logps/rejected": -332.9870300292969, "loss": 0.5691, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24228985607624054, "rewards/margins": 0.3657795190811157, "rewards/margins_max": 1.0035291910171509, "rewards/margins_min": -0.26299047470092773, "rewards/margins_std": 0.5673614144325256, "rewards/rejected": -0.6080694198608398, "step": 3410 }, { "epoch": 0.82, "grad_norm": 8.692036055130663, "learning_rate": 4.828762932985009e-08, "logits/chosen": -2.6468403339385986, "logits/rejected": -2.5917208194732666, "logps/chosen": -316.1946716308594, "logps/rejected": -312.6452941894531, "loss": 0.5511, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17702266573905945, "rewards/margins": 0.4823419451713562, "rewards/margins_max": 1.041947841644287, "rewards/margins_min": -0.111463762819767, "rewards/margins_std": 0.529529869556427, "rewards/rejected": -0.6593645811080933, "step": 3420 }, { "epoch": 0.82, "grad_norm": 11.989930696645063, "learning_rate": 4.706004724088328e-08, "logits/chosen": -2.5788004398345947, "logits/rejected": -2.4974637031555176, "logps/chosen": -332.91845703125, "logps/rejected": -343.2325439453125, "loss": 0.6338, "rewards/accuracies": 0.625, "rewards/chosen": -0.34716516733169556, "rewards/margins": 0.15864428877830505, "rewards/margins_max": 0.7591507434844971, "rewards/margins_min": -0.35776883363723755, "rewards/margins_std": 0.5207773447036743, "rewards/rejected": -0.5058094263076782, "step": 3430 }, { "epoch": 0.82, "grad_norm": 8.163596166406325, "learning_rate": 4.584664760055881e-08, "logits/chosen": -2.652082681655884, "logits/rejected": -2.6363697052001953, "logps/chosen": -257.03045654296875, "logps/rejected": -270.32330322265625, "loss": 0.5678, "rewards/accuracies": 0.75, "rewards/chosen": -0.1794271022081375, "rewards/margins": 0.3605220317840576, "rewards/margins_max": 0.9137634038925171, "rewards/margins_min": -0.2497061789035797, "rewards/margins_std": 0.5212998986244202, "rewards/rejected": -0.5399491190910339, "step": 3440 }, { "epoch": 0.83, "grad_norm": 10.712123412876426, "learning_rate": 4.4647515207250934e-08, "logits/chosen": -2.7117748260498047, "logits/rejected": -2.671877861022949, "logps/chosen": -323.82489013671875, "logps/rejected": -321.583251953125, "loss": 0.5762, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23698492348194122, "rewards/margins": 0.3007177710533142, "rewards/margins_max": 0.8499841690063477, "rewards/margins_min": -0.3255491554737091, "rewards/margins_std": 0.5336715579032898, "rewards/rejected": -0.5377026796340942, "step": 3450 }, { "epoch": 0.83, "grad_norm": 5.333382347994325, "learning_rate": 4.346273386226812e-08, "logits/chosen": -2.595189094543457, "logits/rejected": -2.606052875518799, "logps/chosen": -323.14666748046875, "logps/rejected": -320.80706787109375, "loss": 0.6142, "rewards/accuracies": 0.625, "rewards/chosen": -0.19109676778316498, "rewards/margins": 0.29731321334838867, "rewards/margins_max": 0.9397839307785034, "rewards/margins_min": -0.29314979910850525, "rewards/margins_std": 0.5546901822090149, "rewards/rejected": -0.48840999603271484, "step": 3460 }, { "epoch": 0.83, "grad_norm": 7.404540045631026, "learning_rate": 4.2292386363996484e-08, "logits/chosen": -2.641002655029297, "logits/rejected": -2.6082513332366943, "logps/chosen": -332.7715759277344, "logps/rejected": -327.356201171875, "loss": 0.593, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.29126280546188354, "rewards/margins": 0.3013150095939636, "rewards/margins_max": 0.8860662579536438, "rewards/margins_min": -0.383476197719574, "rewards/margins_std": 0.5513616800308228, "rewards/rejected": -0.5925778150558472, "step": 3470 }, { "epoch": 0.83, "grad_norm": 7.847498267915795, "learning_rate": 4.1136554502113676e-08, "logits/chosen": -2.613243579864502, "logits/rejected": -2.6318023204803467, "logps/chosen": -298.5329284667969, "logps/rejected": -346.72052001953125, "loss": 0.5774, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.26258477568626404, "rewards/margins": 0.27295345067977905, "rewards/margins_max": 0.8896123170852661, "rewards/margins_min": -0.32080337405204773, "rewards/margins_std": 0.5299565196037292, "rewards/rejected": -0.5355381965637207, "step": 3480 }, { "epoch": 0.84, "grad_norm": 9.109331029455413, "learning_rate": 3.999531905187256e-08, "logits/chosen": -2.6425492763519287, "logits/rejected": -2.606245756149292, "logps/chosen": -326.7677307128906, "logps/rejected": -343.9598693847656, "loss": 0.584, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.20647890865802765, "rewards/margins": 0.4059480130672455, "rewards/margins_max": 0.9774398803710938, "rewards/margins_min": -0.2279617339372635, "rewards/margins_std": 0.5373811721801758, "rewards/rejected": -0.6124268770217896, "step": 3490 }, { "epoch": 0.84, "grad_norm": 7.824226063245459, "learning_rate": 3.886875976845661e-08, "logits/chosen": -2.724914789199829, "logits/rejected": -2.6821529865264893, "logps/chosen": -336.58465576171875, "logps/rejected": -328.7319030761719, "loss": 0.5687, "rewards/accuracies": 0.75, "rewards/chosen": -0.20324409008026123, "rewards/margins": 0.336579293012619, "rewards/margins_max": 0.8925960659980774, "rewards/margins_min": -0.24249598383903503, "rewards/margins_std": 0.5138999819755554, "rewards/rejected": -0.5398234128952026, "step": 3500 }, { "epoch": 0.84, "eval_logits/chosen": -2.6082119941711426, "eval_logits/rejected": -2.5778560638427734, "eval_logps/chosen": -307.88323974609375, "eval_logps/rejected": -321.4169006347656, "eval_loss": 0.5908536911010742, "eval_rewards/accuracies": 0.7045000195503235, "eval_rewards/chosen": -0.23427951335906982, "eval_rewards/margins": 0.32135695219039917, "eval_rewards/margins_max": 1.1904600858688354, "eval_rewards/margins_min": -0.5370433330535889, "eval_rewards/margins_std": 0.5854602456092834, "eval_rewards/rejected": -0.5556364059448242, "eval_runtime": 859.4933, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 3500 }, { "epoch": 0.84, "grad_norm": 15.226256851408955, "learning_rate": 3.775695538140608e-08, "logits/chosen": -2.615886688232422, "logits/rejected": -2.5705220699310303, "logps/chosen": -273.38330078125, "logps/rejected": -266.79046630859375, "loss": 0.5741, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2061402052640915, "rewards/margins": 0.3421597182750702, "rewards/margins_max": 1.0528227090835571, "rewards/margins_min": -0.30337151885032654, "rewards/margins_std": 0.6072283983230591, "rewards/rejected": -0.5482999086380005, "step": 3510 }, { "epoch": 0.84, "grad_norm": 7.619503353884903, "learning_rate": 3.665998358911593e-08, "logits/chosen": -2.643258571624756, "logits/rejected": -2.574820041656494, "logps/chosen": -271.6816711425781, "logps/rejected": -307.208251953125, "loss": 0.5823, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.200513556599617, "rewards/margins": 0.3419400155544281, "rewards/margins_max": 1.027790904045105, "rewards/margins_min": -0.22767655551433563, "rewards/margins_std": 0.560200572013855, "rewards/rejected": -0.5424535870552063, "step": 3520 }, { "epoch": 0.85, "grad_norm": 7.151962928488147, "learning_rate": 3.557792105340621e-08, "logits/chosen": -2.703946113586426, "logits/rejected": -2.6648306846618652, "logps/chosen": -308.2814025878906, "logps/rejected": -313.46185302734375, "loss": 0.6098, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2687258720397949, "rewards/margins": 0.2833290696144104, "rewards/margins_max": 0.9018853902816772, "rewards/margins_min": -0.34607797861099243, "rewards/margins_std": 0.5544721484184265, "rewards/rejected": -0.5520548820495605, "step": 3530 }, { "epoch": 0.85, "grad_norm": 8.264274501263046, "learning_rate": 3.4510843394163966e-08, "logits/chosen": -2.6094186305999756, "logits/rejected": -2.6097640991210938, "logps/chosen": -307.6642150878906, "logps/rejected": -353.87811279296875, "loss": 0.5702, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.14493492245674133, "rewards/margins": 0.5252057313919067, "rewards/margins_max": 1.1174616813659668, "rewards/margins_min": -0.020700642839074135, "rewards/margins_std": 0.5230407118797302, "rewards/rejected": -0.6701406836509705, "step": 3540 }, { "epoch": 0.85, "grad_norm": 15.733612739107912, "learning_rate": 3.345882518405918e-08, "logits/chosen": -2.622058868408203, "logits/rejected": -2.653578996658325, "logps/chosen": -253.8896026611328, "logps/rejected": -303.8586120605469, "loss": 0.5931, "rewards/accuracies": 0.8125, "rewards/chosen": -0.181403249502182, "rewards/margins": 0.3778737485408783, "rewards/margins_max": 1.0367891788482666, "rewards/margins_min": -0.2821559011936188, "rewards/margins_std": 0.5796774625778198, "rewards/rejected": -0.5592769980430603, "step": 3550 }, { "epoch": 0.85, "grad_norm": 6.2572531765682795, "learning_rate": 3.242193994333278e-08, "logits/chosen": -2.596344470977783, "logits/rejected": -2.5577056407928467, "logps/chosen": -283.0947265625, "logps/rejected": -288.11810302734375, "loss": 0.5641, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2708722651004791, "rewards/margins": 0.27966800332069397, "rewards/margins_max": 0.8920449018478394, "rewards/margins_min": -0.3251974284648895, "rewards/margins_std": 0.5486242175102234, "rewards/rejected": -0.5505402684211731, "step": 3560 }, { "epoch": 0.85, "grad_norm": 6.516126435010375, "learning_rate": 3.14002601346591e-08, "logits/chosen": -2.5676286220550537, "logits/rejected": -2.6242175102233887, "logps/chosen": -311.1514892578125, "logps/rejected": -345.4110412597656, "loss": 0.549, "rewards/accuracies": 0.75, "rewards/chosen": -0.17862629890441895, "rewards/margins": 0.3790408670902252, "rewards/margins_max": 0.9282873272895813, "rewards/margins_min": -0.165249764919281, "rewards/margins_std": 0.5055936574935913, "rewards/rejected": -0.5576671361923218, "step": 3570 }, { "epoch": 0.86, "grad_norm": 9.045146875912847, "learning_rate": 3.039385715808121e-08, "logits/chosen": -2.6263604164123535, "logits/rejected": -2.5675251483917236, "logps/chosen": -273.31085205078125, "logps/rejected": -264.5960388183594, "loss": 0.5653, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.18905775249004364, "rewards/margins": 0.36532530188560486, "rewards/margins_max": 0.9135506749153137, "rewards/margins_min": -0.21762046217918396, "rewards/margins_std": 0.5158858895301819, "rewards/rejected": -0.5543830990791321, "step": 3580 }, { "epoch": 0.86, "grad_norm": 3.8898146677855023, "learning_rate": 2.9402801346021937e-08, "logits/chosen": -2.6774134635925293, "logits/rejected": -2.6019251346588135, "logps/chosen": -353.1155090332031, "logps/rejected": -326.0061950683594, "loss": 0.579, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24207577109336853, "rewards/margins": 0.31224125623703003, "rewards/margins_max": 0.984405517578125, "rewards/margins_min": -0.32634276151657104, "rewards/margins_std": 0.5798729658126831, "rewards/rejected": -0.554317057132721, "step": 3590 }, { "epoch": 0.86, "grad_norm": 12.174125022701196, "learning_rate": 2.8427161958368002e-08, "logits/chosen": -2.586181402206421, "logits/rejected": -2.5371508598327637, "logps/chosen": -314.48687744140625, "logps/rejected": -307.24847412109375, "loss": 0.5598, "rewards/accuracies": 0.75, "rewards/chosen": -0.18670877814292908, "rewards/margins": 0.3874475359916687, "rewards/margins_max": 0.9850362539291382, "rewards/margins_min": -0.2439243048429489, "rewards/margins_std": 0.5419309735298157, "rewards/rejected": -0.5741563439369202, "step": 3600 }, { "epoch": 0.86, "eval_logits/chosen": -2.608355760574341, "eval_logits/rejected": -2.5780935287475586, "eval_logps/chosen": -305.0804748535156, "eval_logps/rejected": -317.9754333496094, "eval_loss": 0.5924330353736877, "eval_rewards/accuracies": 0.7059999704360962, "eval_rewards/chosen": -0.20625153183937073, "eval_rewards/margins": 0.31497010588645935, "eval_rewards/margins_max": 1.1819028854370117, "eval_rewards/margins_min": -0.539983868598938, "eval_rewards/margins_std": 0.5816512703895569, "eval_rewards/rejected": -0.5212216973304749, "eval_runtime": 859.8177, "eval_samples_per_second": 4.652, "eval_steps_per_second": 0.291, "step": 3600 }, { "epoch": 0.86, "grad_norm": 12.432739573628334, "learning_rate": 2.7467007177630174e-08, "logits/chosen": -2.6744627952575684, "logits/rejected": -2.6617274284362793, "logps/chosen": -342.64141845703125, "logps/rejected": -362.93426513671875, "loss": 0.5704, "rewards/accuracies": 0.75, "rewards/chosen": -0.2297605723142624, "rewards/margins": 0.2869631350040436, "rewards/margins_max": 0.9172990918159485, "rewards/margins_min": -0.2644995450973511, "rewards/margins_std": 0.5308641791343689, "rewards/rejected": -0.5167237520217896, "step": 3610 }, { "epoch": 0.87, "grad_norm": 11.55258760076882, "learning_rate": 2.652240410417819e-08, "logits/chosen": -2.6653380393981934, "logits/rejected": -2.600297212600708, "logps/chosen": -320.4395751953125, "logps/rejected": -300.54669189453125, "loss": 0.5932, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.20440959930419922, "rewards/margins": 0.3110480308532715, "rewards/margins_max": 0.8536907434463501, "rewards/margins_min": -0.28580746054649353, "rewards/margins_std": 0.5097913146018982, "rewards/rejected": -0.5154576897621155, "step": 3620 }, { "epoch": 0.87, "grad_norm": 8.302577930556795, "learning_rate": 2.5593418751551437e-08, "logits/chosen": -2.6613705158233643, "logits/rejected": -2.6312716007232666, "logps/chosen": -348.50787353515625, "logps/rejected": -311.14068603515625, "loss": 0.5467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.20068880915641785, "rewards/margins": 0.41503697633743286, "rewards/margins_max": 1.0137380361557007, "rewards/margins_min": -0.1997760385274887, "rewards/margins_std": 0.5454592108726501, "rewards/rejected": -0.6157258152961731, "step": 3630 }, { "epoch": 0.87, "grad_norm": 7.711523217369656, "learning_rate": 2.4680116041845834e-08, "logits/chosen": -2.5822627544403076, "logits/rejected": -2.586742877960205, "logps/chosen": -292.9588317871094, "logps/rejected": -327.4941711425781, "loss": 0.5653, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14639553427696228, "rewards/margins": 0.4603632390499115, "rewards/margins_max": 1.068746566772461, "rewards/margins_min": -0.1641278713941574, "rewards/margins_std": 0.5588363409042358, "rewards/rejected": -0.6067588329315186, "step": 3640 }, { "epoch": 0.87, "grad_norm": 11.034392257845468, "learning_rate": 2.3782559801176354e-08, "logits/chosen": -2.6122536659240723, "logits/rejected": -2.5909423828125, "logps/chosen": -321.4766845703125, "logps/rejected": -365.0554504394531, "loss": 0.5506, "rewards/accuracies": 0.75, "rewards/chosen": -0.1527622789144516, "rewards/margins": 0.4362732470035553, "rewards/margins_max": 0.9728630185127258, "rewards/margins_min": -0.160151869058609, "rewards/margins_std": 0.5050688982009888, "rewards/rejected": -0.5890355110168457, "step": 3650 }, { "epoch": 0.88, "grad_norm": 15.262643044541077, "learning_rate": 2.290081275521688e-08, "logits/chosen": -2.557404041290283, "logits/rejected": -2.5633702278137207, "logps/chosen": -280.54541015625, "logps/rejected": -291.76019287109375, "loss": 0.6138, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.29516711831092834, "rewards/margins": 0.24436545372009277, "rewards/margins_max": 0.9078601598739624, "rewards/margins_min": -0.3921719193458557, "rewards/margins_std": 0.5873770117759705, "rewards/rejected": -0.5395325422286987, "step": 3660 }, { "epoch": 0.88, "grad_norm": 9.267995701538196, "learning_rate": 2.2034936524816388e-08, "logits/chosen": -2.583019256591797, "logits/rejected": -2.605175495147705, "logps/chosen": -322.00555419921875, "logps/rejected": -370.5304870605469, "loss": 0.5985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.22519807517528534, "rewards/margins": 0.2562308609485626, "rewards/margins_max": 0.9576885104179382, "rewards/margins_min": -0.2878001630306244, "rewards/margins_std": 0.5579741597175598, "rewards/rejected": -0.48142892122268677, "step": 3670 }, { "epoch": 0.88, "grad_norm": 8.603721132974057, "learning_rate": 2.118499162169285e-08, "logits/chosen": -2.631331205368042, "logits/rejected": -2.5858583450317383, "logps/chosen": -382.18524169921875, "logps/rejected": -342.15350341796875, "loss": 0.5485, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.19810251891613007, "rewards/margins": 0.47410932183265686, "rewards/margins_max": 1.0249054431915283, "rewards/margins_min": -0.2214665710926056, "rewards/margins_std": 0.5518790483474731, "rewards/rejected": -0.6722118258476257, "step": 3680 }, { "epoch": 0.88, "grad_norm": 3.7309544431859396, "learning_rate": 2.035103744420408e-08, "logits/chosen": -2.663212299346924, "logits/rejected": -2.616042375564575, "logps/chosen": -376.15008544921875, "logps/rejected": -347.84271240234375, "loss": 0.5919, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2270462065935135, "rewards/margins": 0.3260009288787842, "rewards/margins_max": 0.9850046038627625, "rewards/margins_min": -0.3287786841392517, "rewards/margins_std": 0.5911919474601746, "rewards/rejected": -0.5530470609664917, "step": 3690 }, { "epoch": 0.89, "grad_norm": 8.086879568460578, "learning_rate": 1.953313227319689e-08, "logits/chosen": -2.538327693939209, "logits/rejected": -2.4988460540771484, "logps/chosen": -330.48773193359375, "logps/rejected": -315.3109130859375, "loss": 0.5639, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.16558781266212463, "rewards/margins": 0.31884345412254333, "rewards/margins_max": 0.9515074491500854, "rewards/margins_min": -0.43427133560180664, "rewards/margins_std": 0.6087071299552917, "rewards/rejected": -0.48443132638931274, "step": 3700 }, { "epoch": 0.89, "eval_logits/chosen": -2.6055796146392822, "eval_logits/rejected": -2.5751001834869385, "eval_logps/chosen": -305.3578186035156, "eval_logps/rejected": -318.4354248046875, "eval_loss": 0.5920639634132385, "eval_rewards/accuracies": 0.7055000066757202, "eval_rewards/chosen": -0.2090248167514801, "eval_rewards/margins": 0.31679674983024597, "eval_rewards/margins_max": 1.1849194765090942, "eval_rewards/margins_min": -0.5398852825164795, "eval_rewards/margins_std": 0.5830652117729187, "eval_rewards/rejected": -0.5258215665817261, "eval_runtime": 859.9471, "eval_samples_per_second": 4.651, "eval_steps_per_second": 0.291, "step": 3700 }, { "epoch": 0.89, "grad_norm": 11.463339046274596, "learning_rate": 1.873133326793397e-08, "logits/chosen": -2.6155009269714355, "logits/rejected": -2.5915045738220215, "logps/chosen": -303.8510437011719, "logps/rejected": -318.6623229980469, "loss": 0.5918, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.17637260258197784, "rewards/margins": 0.3188071846961975, "rewards/margins_max": 0.8904364705085754, "rewards/margins_min": -0.32362252473831177, "rewards/margins_std": 0.5524941086769104, "rewards/rejected": -0.4951797425746918, "step": 3710 }, { "epoch": 0.89, "grad_norm": 5.6325587031610835, "learning_rate": 1.794569646209948e-08, "logits/chosen": -2.5438828468322754, "logits/rejected": -2.512012243270874, "logps/chosen": -326.58074951171875, "logps/rejected": -305.82073974609375, "loss": 0.6245, "rewards/accuracies": 0.625, "rewards/chosen": -0.2585929334163666, "rewards/margins": 0.22238990664482117, "rewards/margins_max": 0.9835413694381714, "rewards/margins_min": -0.5056635141372681, "rewards/margins_std": 0.6667352914810181, "rewards/rejected": -0.4809829294681549, "step": 3720 }, { "epoch": 0.89, "grad_norm": 10.081384223474876, "learning_rate": 1.7176276759883146e-08, "logits/chosen": -2.5898776054382324, "logits/rejected": -2.584012508392334, "logps/chosen": -313.08905029296875, "logps/rejected": -312.9795837402344, "loss": 0.5606, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.13006040453910828, "rewards/margins": 0.4346505105495453, "rewards/margins_max": 1.1141884326934814, "rewards/margins_min": -0.14765407145023346, "rewards/margins_std": 0.5635863542556763, "rewards/rejected": -0.5647109150886536, "step": 3730 }, { "epoch": 0.9, "grad_norm": 6.993420915130978, "learning_rate": 1.642312793214293e-08, "logits/chosen": -2.566262722015381, "logits/rejected": -2.5170204639434814, "logps/chosen": -285.74249267578125, "logps/rejected": -342.0732116699219, "loss": 0.5708, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.22177961468696594, "rewards/margins": 0.4047152101993561, "rewards/margins_max": 1.231454610824585, "rewards/margins_min": -0.23452310264110565, "rewards/margins_std": 0.6764256358146667, "rewards/rejected": -0.6264947652816772, "step": 3740 }, { "epoch": 0.9, "grad_norm": 5.971993888018905, "learning_rate": 1.568630261264789e-08, "logits/chosen": -2.6162006855010986, "logits/rejected": -2.5812339782714844, "logps/chosen": -292.12518310546875, "logps/rejected": -277.68109130859375, "loss": 0.5887, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.1901755928993225, "rewards/margins": 0.3246782124042511, "rewards/margins_max": 0.9265223741531372, "rewards/margins_min": -0.1909208595752716, "rewards/margins_std": 0.5114433169364929, "rewards/rejected": -0.514853835105896, "step": 3750 }, { "epoch": 0.9, "grad_norm": 16.863431002491197, "learning_rate": 1.49658522943992e-08, "logits/chosen": -2.599583148956299, "logits/rejected": -2.5607993602752686, "logps/chosen": -247.8730926513672, "logps/rejected": -300.6656494140625, "loss": 0.5535, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.12827134132385254, "rewards/margins": 0.36878079175949097, "rewards/margins_max": 0.8943478465080261, "rewards/margins_min": -0.16647151112556458, "rewards/margins_std": 0.48803386092185974, "rewards/rejected": -0.4970521926879883, "step": 3760 }, { "epoch": 0.9, "grad_norm": 7.885285215571356, "learning_rate": 1.4261827326032122e-08, "logits/chosen": -2.640138626098633, "logits/rejected": -2.595416784286499, "logps/chosen": -318.6853942871094, "logps/rejected": -314.74224853515625, "loss": 0.5823, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2055031955242157, "rewards/margins": 0.312764436006546, "rewards/margins_max": 1.0183651447296143, "rewards/margins_min": -0.35077372193336487, "rewards/margins_std": 0.6288760900497437, "rewards/rejected": -0.5182676315307617, "step": 3770 }, { "epoch": 0.91, "grad_norm": 10.545925159775502, "learning_rate": 1.3574276908296906e-08, "logits/chosen": -2.5619964599609375, "logits/rejected": -2.515324354171753, "logps/chosen": -250.4527130126953, "logps/rejected": -288.0833740234375, "loss": 0.5821, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.18306702375411987, "rewards/margins": 0.3643852472305298, "rewards/margins_max": 0.9473929405212402, "rewards/margins_min": -0.2090524137020111, "rewards/margins_std": 0.5079521536827087, "rewards/rejected": -0.5474522709846497, "step": 3780 }, { "epoch": 0.91, "grad_norm": 5.6495402461329345, "learning_rate": 1.2903249090620849e-08, "logits/chosen": -2.6735751628875732, "logits/rejected": -2.598555564880371, "logps/chosen": -355.37420654296875, "logps/rejected": -332.27593994140625, "loss": 0.5884, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18271778523921967, "rewards/margins": 0.3155055642127991, "rewards/margins_max": 0.8619476556777954, "rewards/margins_min": -0.32753393054008484, "rewards/margins_std": 0.5435506105422974, "rewards/rejected": -0.49822330474853516, "step": 3790 }, { "epoch": 0.91, "grad_norm": 6.149064726385415, "learning_rate": 1.2248790767750012e-08, "logits/chosen": -2.5644943714141846, "logits/rejected": -2.5778238773345947, "logps/chosen": -247.0529022216797, "logps/rejected": -301.94403076171875, "loss": 0.5931, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2534622848033905, "rewards/margins": 0.299783855676651, "rewards/margins_max": 0.9191850423812866, "rewards/margins_min": -0.3087500035762787, "rewards/margins_std": 0.5637552738189697, "rewards/rejected": -0.5532461404800415, "step": 3800 }, { "epoch": 0.91, "eval_logits/chosen": -2.6080944538116455, "eval_logits/rejected": -2.577848196029663, "eval_logps/chosen": -304.3083801269531, "eval_logps/rejected": -317.0423583984375, "eval_loss": 0.5930164456367493, "eval_rewards/accuracies": 0.7059999704360962, "eval_rewards/chosen": -0.19853053987026215, "eval_rewards/margins": 0.3133601248264313, "eval_rewards/margins_max": 1.1789708137512207, "eval_rewards/margins_min": -0.5399187803268433, "eval_rewards/margins_std": 0.5801899433135986, "eval_rewards/rejected": -0.5118906497955322, "eval_runtime": 859.5908, "eval_samples_per_second": 4.653, "eval_steps_per_second": 0.291, "step": 3800 }, { "epoch": 0.91, "grad_norm": 7.582877456320794, "learning_rate": 1.1610947676472277e-08, "logits/chosen": -2.621169090270996, "logits/rejected": -2.5998873710632324, "logps/chosen": -312.3177490234375, "logps/rejected": -320.4533386230469, "loss": 0.6041, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2622186541557312, "rewards/margins": 0.3109205365180969, "rewards/margins_max": 0.9574284553527832, "rewards/margins_min": -0.3280250132083893, "rewards/margins_std": 0.5780671834945679, "rewards/rejected": -0.5731391906738281, "step": 3810 }, { "epoch": 0.91, "grad_norm": 18.351199975133607, "learning_rate": 1.0989764392420692e-08, "logits/chosen": -2.6208126544952393, "logits/rejected": -2.5716211795806885, "logps/chosen": -336.3990173339844, "logps/rejected": -344.66241455078125, "loss": 0.5581, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19919511675834656, "rewards/margins": 0.34095874428749084, "rewards/margins_max": 0.9613696336746216, "rewards/margins_min": -0.26550573110580444, "rewards/margins_std": 0.5569697022438049, "rewards/rejected": -0.5401539206504822, "step": 3820 }, { "epoch": 0.92, "grad_norm": 6.713050873669192, "learning_rate": 1.0385284326958593e-08, "logits/chosen": -2.6769707202911377, "logits/rejected": -2.5937695503234863, "logps/chosen": -335.0888671875, "logps/rejected": -325.7705383300781, "loss": 0.5864, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12837204337120056, "rewards/margins": 0.35219669342041016, "rewards/margins_max": 1.0074574947357178, "rewards/margins_min": -0.2811533510684967, "rewards/margins_std": 0.5698063373565674, "rewards/rejected": -0.4805687963962555, "step": 3830 }, { "epoch": 0.92, "grad_norm": 5.734116218403002, "learning_rate": 9.797549724145731e-09, "logits/chosen": -2.7008821964263916, "logits/rejected": -2.6449217796325684, "logps/chosen": -350.021240234375, "logps/rejected": -327.3391418457031, "loss": 0.5596, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.15653374791145325, "rewards/margins": 0.37177354097366333, "rewards/margins_max": 1.0280953645706177, "rewards/margins_min": -0.18916736543178558, "rewards/margins_std": 0.5437596440315247, "rewards/rejected": -0.528307318687439, "step": 3840 }, { "epoch": 0.92, "grad_norm": 4.955004218260416, "learning_rate": 9.226601657785993e-09, "logits/chosen": -2.643749475479126, "logits/rejected": -2.660839080810547, "logps/chosen": -288.3301086425781, "logps/rejected": -350.01458740234375, "loss": 0.58, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2535906136035919, "rewards/margins": 0.264422744512558, "rewards/margins_max": 0.8971040844917297, "rewards/margins_min": -0.45524635910987854, "rewards/margins_std": 0.609712541103363, "rewards/rejected": -0.5180133581161499, "step": 3850 }, { "epoch": 0.92, "grad_norm": 8.30905699593047, "learning_rate": 8.672480028556972e-09, "logits/chosen": -2.4579296112060547, "logits/rejected": -2.457097291946411, "logps/chosen": -281.6809387207031, "logps/rejected": -334.88482666015625, "loss": 0.6003, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.15676701068878174, "rewards/margins": 0.284981906414032, "rewards/margins_max": 0.7929221391677856, "rewards/margins_min": -0.18706540763378143, "rewards/margins_std": 0.43972960114479065, "rewards/rejected": -0.4417489171028137, "step": 3860 }, { "epoch": 0.93, "grad_norm": 15.51133067775074, "learning_rate": 8.13522356122151e-09, "logits/chosen": -2.690196990966797, "logits/rejected": -2.618220806121826, "logps/chosen": -301.7216491699219, "logps/rejected": -315.8422546386719, "loss": 0.6058, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.19068580865859985, "rewards/margins": 0.3160189986228943, "rewards/margins_max": 0.9212614297866821, "rewards/margins_min": -0.22967295348644257, "rewards/margins_std": 0.5226109027862549, "rewards/rejected": -0.5067048668861389, "step": 3870 }, { "epoch": 0.93, "grad_norm": 14.699283629108745, "learning_rate": 7.614869801921525e-09, "logits/chosen": -2.632333278656006, "logits/rejected": -2.5998964309692383, "logps/chosen": -298.019775390625, "logps/rejected": -307.9640808105469, "loss": 0.5814, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22215929627418518, "rewards/margins": 0.22724071145057678, "rewards/margins_max": 0.8938091397285461, "rewards/margins_min": -0.33744460344314575, "rewards/margins_std": 0.5495746731758118, "rewards/rejected": -0.44940000772476196, "step": 3880 }, { "epoch": 0.93, "grad_norm": 13.135176137869319, "learning_rate": 7.111455115553944e-09, "logits/chosen": -2.604551315307617, "logits/rejected": -2.5755152702331543, "logps/chosen": -280.9961853027344, "logps/rejected": -344.09588623046875, "loss": 0.5858, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1802806705236435, "rewards/margins": 0.35020098090171814, "rewards/margins_max": 1.0035594701766968, "rewards/margins_min": -0.37194377183914185, "rewards/margins_std": 0.6116623878479004, "rewards/rejected": -0.5304816961288452, "step": 3890 }, { "epoch": 0.93, "grad_norm": 12.509404421258807, "learning_rate": 6.6250146832294296e-09, "logits/chosen": -2.646723508834839, "logits/rejected": -2.626894474029541, "logps/chosen": -308.8915100097656, "logps/rejected": -295.9562072753906, "loss": 0.5542, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.19474686682224274, "rewards/margins": 0.39745014905929565, "rewards/margins_max": 0.9413665533065796, "rewards/margins_min": -0.3064861595630646, "rewards/margins_std": 0.5544202923774719, "rewards/rejected": -0.592197060585022, "step": 3900 }, { "epoch": 0.93, "eval_logits/chosen": -2.606426239013672, "eval_logits/rejected": -2.576036214828491, "eval_logps/chosen": -304.34912109375, "eval_logps/rejected": -317.132080078125, "eval_loss": 0.5929449200630188, "eval_rewards/accuracies": 0.7059999704360962, "eval_rewards/chosen": -0.19893796741962433, "eval_rewards/margins": 0.3138505220413208, "eval_rewards/margins_max": 1.1806962490081787, "eval_rewards/margins_min": -0.5397577285766602, "eval_rewards/margins_std": 0.580825924873352, "eval_rewards/rejected": -0.5127884745597839, "eval_runtime": 860.1409, "eval_samples_per_second": 4.65, "eval_steps_per_second": 0.291, "step": 3900 }, { "epoch": 0.94, "grad_norm": 9.576670002532317, "learning_rate": 6.155582499813655e-09, "logits/chosen": -2.6036484241485596, "logits/rejected": -2.55094313621521, "logps/chosen": -304.96661376953125, "logps/rejected": -325.8829345703125, "loss": 0.6165, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.24803559482097626, "rewards/margins": 0.27249425649642944, "rewards/margins_max": 0.9829356074333191, "rewards/margins_min": -0.3355262279510498, "rewards/margins_std": 0.5755178332328796, "rewards/rejected": -0.5205298662185669, "step": 3910 }, { "epoch": 0.94, "grad_norm": 5.535198634166414, "learning_rate": 5.703191371551841e-09, "logits/chosen": -2.6377830505371094, "logits/rejected": -2.52290940284729, "logps/chosen": -392.7371520996094, "logps/rejected": -322.02008056640625, "loss": 0.5467, "rewards/accuracies": 0.75, "rewards/chosen": -0.20820406079292297, "rewards/margins": 0.36383289098739624, "rewards/margins_max": 0.9824415445327759, "rewards/margins_min": -0.23504504561424255, "rewards/margins_std": 0.5402665138244629, "rewards/rejected": -0.5720369219779968, "step": 3920 }, { "epoch": 0.94, "grad_norm": 14.285955743111506, "learning_rate": 5.267872913775756e-09, "logits/chosen": -2.704752206802368, "logits/rejected": -2.669785737991333, "logps/chosen": -303.36041259765625, "logps/rejected": -288.13775634765625, "loss": 0.5707, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1602882742881775, "rewards/margins": 0.34425032138824463, "rewards/margins_max": 1.0268034934997559, "rewards/margins_min": -0.32272079586982727, "rewards/margins_std": 0.5999466776847839, "rewards/rejected": -0.5045386552810669, "step": 3930 }, { "epoch": 0.94, "grad_norm": 3.230351548565471, "learning_rate": 4.8496575486943744e-09, "logits/chosen": -2.663681745529175, "logits/rejected": -2.5701091289520264, "logps/chosen": -359.291015625, "logps/rejected": -328.3930969238281, "loss": 0.5593, "rewards/accuracies": 0.75, "rewards/chosen": -0.17103374004364014, "rewards/margins": 0.47963079810142517, "rewards/margins_max": 1.0605299472808838, "rewards/margins_min": -0.1423892080783844, "rewards/margins_std": 0.5544031262397766, "rewards/rejected": -0.6506645679473877, "step": 3940 }, { "epoch": 0.95, "grad_norm": 11.851201996824257, "learning_rate": 4.448574503268076e-09, "logits/chosen": -2.5257601737976074, "logits/rejected": -2.5021889209747314, "logps/chosen": -283.82159423828125, "logps/rejected": -315.5705871582031, "loss": 0.5681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2183389663696289, "rewards/margins": 0.3329612910747528, "rewards/margins_max": 0.9345414042472839, "rewards/margins_min": -0.2967822849750519, "rewards/margins_std": 0.5389171838760376, "rewards/rejected": -0.5513002276420593, "step": 3950 }, { "epoch": 0.95, "grad_norm": 9.731317079639949, "learning_rate": 4.064651807165781e-09, "logits/chosen": -2.587010622024536, "logits/rejected": -2.5509226322174072, "logps/chosen": -276.9902648925781, "logps/rejected": -278.53118896484375, "loss": 0.535, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2354898452758789, "rewards/margins": 0.45715102553367615, "rewards/margins_max": 1.1073687076568604, "rewards/margins_min": -0.18103381991386414, "rewards/margins_std": 0.5672684907913208, "rewards/rejected": -0.6926408410072327, "step": 3960 }, { "epoch": 0.95, "grad_norm": 12.975565321409169, "learning_rate": 3.697916290806291e-09, "logits/chosen": -2.679515838623047, "logits/rejected": -2.6130564212799072, "logps/chosen": -306.14599609375, "logps/rejected": -290.2887268066406, "loss": 0.5486, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.13901828229427338, "rewards/margins": 0.3719906806945801, "rewards/margins_max": 0.9517256617546082, "rewards/margins_min": -0.1434636414051056, "rewards/margins_std": 0.4784785211086273, "rewards/rejected": -0.5110089182853699, "step": 3970 }, { "epoch": 0.95, "grad_norm": 11.34136483834659, "learning_rate": 3.3483935834831e-09, "logits/chosen": -2.615412950515747, "logits/rejected": -2.565363883972168, "logps/chosen": -314.16363525390625, "logps/rejected": -324.04608154296875, "loss": 0.543, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.18938302993774414, "rewards/margins": 0.43559032678604126, "rewards/margins_max": 1.0743236541748047, "rewards/margins_min": -0.20624911785125732, "rewards/margins_std": 0.5716836452484131, "rewards/rejected": -0.6249733567237854, "step": 3980 }, { "epoch": 0.96, "grad_norm": 9.152276228144013, "learning_rate": 3.0161081115735456e-09, "logits/chosen": -2.645146608352661, "logits/rejected": -2.606229543685913, "logps/chosen": -334.4048156738281, "logps/rejected": -325.699462890625, "loss": 0.5966, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.23924922943115234, "rewards/margins": 0.23653730750083923, "rewards/margins_max": 0.9346807599067688, "rewards/margins_min": -0.3636035621166229, "rewards/margins_std": 0.5894318222999573, "rewards/rejected": -0.4757865369319916, "step": 3990 }, { "epoch": 0.96, "grad_norm": 15.591206620302135, "learning_rate": 2.7010830968314802e-09, "logits/chosen": -2.6069769859313965, "logits/rejected": -2.592172145843506, "logps/chosen": -284.5611267089844, "logps/rejected": -301.4425354003906, "loss": 0.5713, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1448238044977188, "rewards/margins": 0.37589389085769653, "rewards/margins_max": 0.951191782951355, "rewards/margins_min": -0.22190327942371368, "rewards/margins_std": 0.5327543616294861, "rewards/rejected": -0.5207176804542542, "step": 4000 }, { "epoch": 0.96, "eval_logits/chosen": -2.604844093322754, "eval_logits/rejected": -2.574323892593384, "eval_logps/chosen": -304.6741027832031, "eval_logps/rejected": -317.602783203125, "eval_loss": 0.5926074385643005, "eval_rewards/accuracies": 0.7049999833106995, "eval_rewards/chosen": -0.20218777656555176, "eval_rewards/margins": 0.31530728936195374, "eval_rewards/margins_max": 1.1830588579177856, "eval_rewards/margins_min": -0.5406986474990845, "eval_rewards/margins_std": 0.5823014974594116, "eval_rewards/rejected": -0.5174950957298279, "eval_runtime": 859.5521, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 4000 }, { "epoch": 0.96, "grad_norm": 4.4781608457654665, "learning_rate": 2.4033405547646545e-09, "logits/chosen": -2.6150670051574707, "logits/rejected": -2.588038921356201, "logps/chosen": -267.67523193359375, "logps/rejected": -355.4295349121094, "loss": 0.558, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.18685632944107056, "rewards/margins": 0.41204652190208435, "rewards/margins_max": 0.9812752604484558, "rewards/margins_min": -0.22956518828868866, "rewards/margins_std": 0.540183961391449, "rewards/rejected": -0.5989028215408325, "step": 4010 }, { "epoch": 0.96, "grad_norm": 8.487408455760916, "learning_rate": 2.122901293095919e-09, "logits/chosen": -2.6082186698913574, "logits/rejected": -2.5535728931427, "logps/chosen": -303.72967529296875, "logps/rejected": -323.4769287109375, "loss": 0.5645, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.11907126754522324, "rewards/margins": 0.41876569390296936, "rewards/margins_max": 1.0511444807052612, "rewards/margins_min": -0.1879216879606247, "rewards/margins_std": 0.567715585231781, "rewards/rejected": -0.5378369092941284, "step": 4020 }, { "epoch": 0.97, "grad_norm": 10.081986981087429, "learning_rate": 1.8597849103094143e-09, "logits/chosen": -2.6195461750030518, "logits/rejected": -2.592846632003784, "logps/chosen": -307.9976501464844, "logps/rejected": -332.479248046875, "loss": 0.6021, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.20657601952552795, "rewards/margins": 0.33928051590919495, "rewards/margins_max": 0.9945265054702759, "rewards/margins_min": -0.26213595271110535, "rewards/margins_std": 0.5654140710830688, "rewards/rejected": -0.5458565354347229, "step": 4030 }, { "epoch": 0.97, "grad_norm": 10.391196970864554, "learning_rate": 1.614009794280613e-09, "logits/chosen": -2.6484408378601074, "logits/rejected": -2.6035032272338867, "logps/chosen": -323.63250732421875, "logps/rejected": -338.87646484375, "loss": 0.5795, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2887625992298126, "rewards/margins": 0.3088279664516449, "rewards/margins_max": 0.9695230722427368, "rewards/margins_min": -0.4191213548183441, "rewards/margins_std": 0.6257587671279907, "rewards/rejected": -0.5975905656814575, "step": 4040 }, { "epoch": 0.97, "grad_norm": 9.172514235203973, "learning_rate": 1.3855931209914295e-09, "logits/chosen": -2.646422863006592, "logits/rejected": -2.6451754570007324, "logps/chosen": -310.06585693359375, "logps/rejected": -338.1797180175781, "loss": 0.5912, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.26628363132476807, "rewards/margins": 0.262619286775589, "rewards/margins_max": 0.8607581257820129, "rewards/margins_min": -0.3698212802410126, "rewards/margins_std": 0.5509908199310303, "rewards/rejected": -0.5289028882980347, "step": 4050 }, { "epoch": 0.97, "grad_norm": 6.302490759256043, "learning_rate": 1.1745508533298754e-09, "logits/chosen": -2.632183313369751, "logits/rejected": -2.570913076400757, "logps/chosen": -308.2278137207031, "logps/rejected": -297.3208923339844, "loss": 0.564, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2079722136259079, "rewards/margins": 0.38544580340385437, "rewards/margins_max": 0.9723888635635376, "rewards/margins_min": -0.12654462456703186, "rewards/margins_std": 0.48897290229797363, "rewards/rejected": -0.5934180021286011, "step": 4060 }, { "epoch": 0.97, "grad_norm": 7.478122592260895, "learning_rate": 9.808977399744511e-10, "logits/chosen": -2.5542659759521484, "logits/rejected": -2.5500786304473877, "logps/chosen": -299.34912109375, "logps/rejected": -301.2681884765625, "loss": 0.5841, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.18661324679851532, "rewards/margins": 0.33656594157218933, "rewards/margins_max": 0.9998302459716797, "rewards/margins_min": -0.2652764916419983, "rewards/margins_std": 0.5792658925056458, "rewards/rejected": -0.5231791138648987, "step": 4070 }, { "epoch": 0.98, "grad_norm": 4.014689176886764, "learning_rate": 8.046473143635268e-10, "logits/chosen": -2.576669692993164, "logits/rejected": -2.5621371269226074, "logps/chosen": -299.82330322265625, "logps/rejected": -320.2934875488281, "loss": 0.5964, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14786235988140106, "rewards/margins": 0.3951955735683441, "rewards/margins_max": 1.0109889507293701, "rewards/margins_min": -0.2718932032585144, "rewards/margins_std": 0.5676113963127136, "rewards/rejected": -0.5430579781532288, "step": 4080 }, { "epoch": 0.98, "grad_norm": 6.79237121796912, "learning_rate": 6.458118937494317e-10, "logits/chosen": -2.5543389320373535, "logits/rejected": -2.5615315437316895, "logps/chosen": -337.30828857421875, "logps/rejected": -349.62884521484375, "loss": 0.5622, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.17236635088920593, "rewards/margins": 0.4054867625236511, "rewards/margins_max": 0.9090726971626282, "rewards/margins_min": -0.1325097382068634, "rewards/margins_std": 0.4775218069553375, "rewards/rejected": -0.5778530836105347, "step": 4090 }, { "epoch": 0.98, "grad_norm": 6.85419882290456, "learning_rate": 5.044025783377259e-10, "logits/chosen": -2.650761127471924, "logits/rejected": -2.633965253829956, "logps/chosen": -330.33197021484375, "logps/rejected": -337.37677001953125, "loss": 0.5725, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.23919835686683655, "rewards/margins": 0.40454989671707153, "rewards/margins_max": 0.9539936780929565, "rewards/margins_min": -0.12280567735433578, "rewards/margins_std": 0.49979060888290405, "rewards/rejected": -0.6437481641769409, "step": 4100 }, { "epoch": 0.98, "eval_logits/chosen": -2.6056201457977295, "eval_logits/rejected": -2.5751943588256836, "eval_logps/chosen": -304.7070007324219, "eval_logps/rejected": -317.59930419921875, "eval_loss": 0.5925434827804565, "eval_rewards/accuracies": 0.7059999704360962, "eval_rewards/chosen": -0.20251673460006714, "eval_rewards/margins": 0.3149436414241791, "eval_rewards/margins_max": 1.183323860168457, "eval_rewards/margins_min": -0.5414925217628479, "eval_rewards/margins_std": 0.5823516845703125, "eval_rewards/rejected": -0.5174604058265686, "eval_runtime": 859.9542, "eval_samples_per_second": 4.651, "eval_steps_per_second": 0.291, "step": 4100 }, { "epoch": 0.98, "grad_norm": 9.748854741055846, "learning_rate": 3.8042925051148813e-10, "logits/chosen": -2.5554616451263428, "logits/rejected": -2.5370991230010986, "logps/chosen": -324.4612731933594, "logps/rejected": -315.3202209472656, "loss": 0.5679, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.196882426738739, "rewards/margins": 0.3231392204761505, "rewards/margins_max": 0.942890465259552, "rewards/margins_min": -0.21961653232574463, "rewards/margins_std": 0.5246455669403076, "rewards/rejected": -0.5200216174125671, "step": 4110 }, { "epoch": 0.99, "grad_norm": 12.209770317697934, "learning_rate": 2.7390057414064525e-10, "logits/chosen": -2.6003313064575195, "logits/rejected": -2.591627597808838, "logps/chosen": -326.4041748046875, "logps/rejected": -320.86260986328125, "loss": 0.5498, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.19969312846660614, "rewards/margins": 0.34613776206970215, "rewards/margins_max": 0.9882827997207642, "rewards/margins_min": -0.3541576564311981, "rewards/margins_std": 0.5943008661270142, "rewards/rejected": -0.5458309054374695, "step": 4120 }, { "epoch": 0.99, "grad_norm": 6.440145893221407, "learning_rate": 1.8482399397654057e-10, "logits/chosen": -2.659862995147705, "logits/rejected": -2.6385557651519775, "logps/chosen": -321.1582336425781, "logps/rejected": -349.50299072265625, "loss": 0.5744, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.17049312591552734, "rewards/margins": 0.303345263004303, "rewards/margins_max": 0.8798073530197144, "rewards/margins_min": -0.20885124802589417, "rewards/margins_std": 0.4921353757381439, "rewards/rejected": -0.4738383889198303, "step": 4130 }, { "epoch": 0.99, "grad_norm": 10.2314091003016, "learning_rate": 1.1320573513159959e-10, "logits/chosen": -2.626286268234253, "logits/rejected": -2.5777461528778076, "logps/chosen": -287.6363220214844, "logps/rejected": -287.4676818847656, "loss": 0.5813, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2590753436088562, "rewards/margins": 0.2685549557209015, "rewards/margins_max": 0.9383190274238586, "rewards/margins_min": -0.35475224256515503, "rewards/margins_std": 0.588822066783905, "rewards/rejected": -0.5276302099227905, "step": 4140 }, { "epoch": 0.99, "grad_norm": 11.273979570517472, "learning_rate": 5.905080264431705e-11, "logits/chosen": -2.594353199005127, "logits/rejected": -2.572741985321045, "logps/chosen": -303.2122497558594, "logps/rejected": -317.05902099609375, "loss": 0.5635, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.20842579007148743, "rewards/margins": 0.3852156698703766, "rewards/margins_max": 0.9706063270568848, "rewards/margins_min": -0.12086659669876099, "rewards/margins_std": 0.4832797646522522, "rewards/rejected": -0.593641459941864, "step": 4150 }, { "epoch": 1.0, "grad_norm": 10.175839731117941, "learning_rate": 2.2362981129508963e-11, "logits/chosen": -2.6398215293884277, "logits/rejected": -2.602846622467041, "logps/chosen": -308.8743591308594, "logps/rejected": -334.1455993652344, "loss": 0.5567, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1167726069688797, "rewards/margins": 0.39473292231559753, "rewards/margins_max": 1.0183244943618774, "rewards/margins_min": -0.24444147944450378, "rewards/margins_std": 0.5833578705787659, "rewards/rejected": -0.5115054845809937, "step": 4160 }, { "epoch": 1.0, "grad_norm": 11.274093259903779, "learning_rate": 3.144834513746364e-12, "logits/chosen": -2.6418585777282715, "logits/rejected": -2.634779214859009, "logps/chosen": -334.3486633300781, "logps/rejected": -331.58697509765625, "loss": 0.5539, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.12420052289962769, "rewards/margins": 0.4431988298892975, "rewards/margins_max": 1.0661565065383911, "rewards/margins_min": -0.11877351999282837, "rewards/margins_std": 0.5281156301498413, "rewards/rejected": -0.5673993229866028, "step": 4170 }, { "epoch": 1.0, "step": 4176, "total_flos": 0.0, "train_loss": 0.6106676421631342, "train_runtime": 67977.0297, "train_samples_per_second": 0.983, "train_steps_per_second": 0.061 } ], "logging_steps": 10, "max_steps": 4176, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }