{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 100, "global_step": 1540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.8666962385177612, "logits/rejected": -1.8709977865219116, "logps/chosen": -36.98939514160156, "logps/rejected": -33.66963195800781, "loss": 0.6929, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": 0.00017197892884723842, "rewards/margins": 0.0005675320862792432, "rewards/rejected": -0.0003955531574320048, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9978935718536377, "logits/rejected": -2.000532627105713, "logps/chosen": -29.66562843322754, "logps/rejected": -29.045883178710938, "loss": 0.6934, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.00023434234026353806, "rewards/margins": -0.0004099405778106302, "rewards/rejected": 0.0001755982666509226, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.9211324453353882, "logits/rejected": -1.9184545278549194, "logps/chosen": -31.41294288635254, "logps/rejected": -33.23053741455078, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 3.152530553052202e-05, "rewards/margins": 0.000152341352077201, "rewards/rejected": -0.00012081606837455183, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.017341375350952, "logits/rejected": -2.0086092948913574, "logps/chosen": -32.60146713256836, "logps/rejected": -32.49399185180664, "loss": 0.6934, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.0002466029836796224, "rewards/margins": -0.0004333632532507181, "rewards/rejected": 0.00018676018225960433, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.862633466720581, "logits/rejected": -1.8518692255020142, "logps/chosen": -33.55931091308594, "logps/rejected": -35.44870376586914, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -1.831089502957184e-05, "rewards/margins": -5.47249146620743e-05, "rewards/rejected": 3.641402145149186e-05, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9409154653549194, "logits/rejected": -1.9428699016571045, "logps/chosen": -32.53916549682617, "logps/rejected": -33.24130630493164, "loss": 0.6925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0006101715262047946, "rewards/margins": 0.0013094183523207903, "rewards/rejected": -0.0006992466514930129, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.070591926574707, "logits/rejected": -2.075544834136963, "logps/chosen": -34.023067474365234, "logps/rejected": -36.647151947021484, "loss": 0.6929, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.00034581663203425705, "rewards/margins": 0.0004369783273432404, "rewards/rejected": -0.0007827949011698365, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9308092594146729, "logits/rejected": -1.933943748474121, "logps/chosen": -34.318023681640625, "logps/rejected": -34.67802429199219, "loss": 0.6922, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0010978971840813756, "rewards/margins": 0.0019540609791874886, "rewards/rejected": -0.000856163795106113, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9380912780761719, "logits/rejected": -1.9425855875015259, "logps/chosen": -32.38385009765625, "logps/rejected": -32.35346603393555, "loss": 0.6928, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0008357145707122982, "rewards/margins": 0.0007813175907358527, "rewards/rejected": 5.439693995867856e-05, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.035137891769409, "logits/rejected": -2.0331528186798096, "logps/chosen": -32.112831115722656, "logps/rejected": -31.29166030883789, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": 0.0012800416443496943, "rewards/margins": 0.0015345367137342691, "rewards/rejected": -0.0002544948656577617, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.2300801277160645, "eval_logits/rejected": -2.225238084793091, "eval_logps/chosen": -34.04683303833008, "eval_logps/rejected": -37.53927230834961, "eval_loss": 0.6930972337722778, "eval_rewards/accuracies": 0.5186877250671387, "eval_rewards/chosen": -0.00012280470400583, "eval_rewards/margins": 0.00010372586984885857, "eval_rewards/rejected": -0.0002265305956825614, "eval_runtime": 145.7259, "eval_samples_per_second": 2.354, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.989782691001892, "logits/rejected": -1.9873950481414795, "logps/chosen": -33.12385559082031, "logps/rejected": -34.011810302734375, "loss": 0.6926, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0011996207758784294, "rewards/margins": 0.001024017808958888, "rewards/rejected": 0.00017560287960805, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.0008151531219482, "logits/rejected": -1.992500901222229, "logps/chosen": -32.320838928222656, "logps/rejected": -32.128170013427734, "loss": 0.6927, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.001240686746314168, "rewards/margins": 0.0009073130786418915, "rewards/rejected": 0.0003333735803607851, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.0289230346679688, "logits/rejected": -2.020946502685547, "logps/chosen": -30.313907623291016, "logps/rejected": -32.086116790771484, "loss": 0.6922, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0016180993989109993, "rewards/margins": 0.0019491963321343064, "rewards/rejected": -0.00033109664218500257, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9592479467391968, "logits/rejected": -1.9694607257843018, "logps/chosen": -31.223953247070312, "logps/rejected": -32.547454833984375, "loss": 0.6921, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0019570994190871716, "rewards/margins": 0.002082846825942397, "rewards/rejected": -0.00012574761058203876, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.8708124160766602, "logits/rejected": -1.8719879388809204, "logps/chosen": -33.877174377441406, "logps/rejected": -34.78774642944336, "loss": 0.6914, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.003155181184411049, "rewards/margins": 0.0034600873477756977, "rewards/rejected": -0.00030490627977997065, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.9215673208236694, "logits/rejected": -1.9181665182113647, "logps/chosen": -36.011531829833984, "logps/rejected": -32.685707092285156, "loss": 0.6925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.001601455733180046, "rewards/margins": 0.001231002388522029, "rewards/rejected": 0.00037045328645035625, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.021604061126709, "logits/rejected": -2.014291524887085, "logps/chosen": -33.482086181640625, "logps/rejected": -31.404422760009766, "loss": 0.6913, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0031577465124428272, "rewards/margins": 0.003683448536321521, "rewards/rejected": -0.0005257020820863545, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.027444839477539, "logits/rejected": -2.032665729522705, "logps/chosen": -32.183101654052734, "logps/rejected": -32.39936065673828, "loss": 0.6918, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0036168727092444897, "rewards/margins": 0.0027590212412178516, "rewards/rejected": 0.0008578516426496208, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.027879238128662, "logits/rejected": -2.025132656097412, "logps/chosen": -31.258464813232422, "logps/rejected": -31.348388671875, "loss": 0.6919, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0023048892617225647, "rewards/margins": 0.0026066480204463005, "rewards/rejected": -0.000301758642308414, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.8983129262924194, "logits/rejected": -1.902967095375061, "logps/chosen": -31.276391983032227, "logps/rejected": -32.81935119628906, "loss": 0.6914, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0032989257015287876, "rewards/margins": 0.00358308176510036, "rewards/rejected": -0.00028415597626008093, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.2249655723571777, "eval_logits/rejected": -2.220139503479004, "eval_logps/chosen": -34.04255294799805, "eval_logps/rejected": -37.55300521850586, "eval_loss": 0.6930080056190491, "eval_rewards/accuracies": 0.5245016813278198, "eval_rewards/chosen": -8.006239659152925e-05, "eval_rewards/margins": 0.00028380370349623263, "eval_rewards/rejected": -0.00036386612919159234, "eval_runtime": 145.5269, "eval_samples_per_second": 2.357, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.010593891143799, "logits/rejected": -2.021207332611084, "logps/chosen": -31.7437801361084, "logps/rejected": -33.93886947631836, "loss": 0.6916, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0022752191871404648, "rewards/margins": 0.003036911366507411, "rewards/rejected": -0.000761692295782268, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.9029136896133423, "logits/rejected": -1.9176632165908813, "logps/chosen": -29.78145408630371, "logps/rejected": -31.63638687133789, "loss": 0.6911, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.003300876123830676, "rewards/margins": 0.0040829661302268505, "rewards/rejected": -0.000782089657150209, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9591538906097412, "logits/rejected": -1.9631026983261108, "logps/chosen": -33.05189895629883, "logps/rejected": -31.594707489013672, "loss": 0.6911, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.003655704203993082, "rewards/margins": 0.004109731875360012, "rewards/rejected": -0.00045402703108265996, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9572566747665405, "logits/rejected": -1.9354870319366455, "logps/chosen": -33.83857727050781, "logps/rejected": -35.12303924560547, "loss": 0.6907, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0031517534516751766, "rewards/margins": 0.004874187987297773, "rewards/rejected": -0.0017224351176992059, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -1.998875379562378, "logits/rejected": -1.9955555200576782, "logps/chosen": -32.72559356689453, "logps/rejected": -36.2435417175293, "loss": 0.6921, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0019947518594563007, "rewards/margins": 0.0021809376776218414, "rewards/rejected": -0.0001861859782366082, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8661177158355713, "logits/rejected": -1.8636993169784546, "logps/chosen": -33.959014892578125, "logps/rejected": -35.526344299316406, "loss": 0.6919, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0022938635665923357, "rewards/margins": 0.0025111136492341757, "rewards/rejected": -0.0002172500389860943, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8511241674423218, "logits/rejected": -1.8487510681152344, "logps/chosen": -34.16337585449219, "logps/rejected": -31.830408096313477, "loss": 0.6917, "rewards/accuracies": 0.625, "rewards/chosen": 0.002342230873182416, "rewards/margins": 0.0029330006800591946, "rewards/rejected": -0.0005907699232921004, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.9535648822784424, "logits/rejected": -1.943101167678833, "logps/chosen": -35.01304244995117, "logps/rejected": -31.87521743774414, "loss": 0.6913, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0034359837882220745, "rewards/margins": 0.0037782168947160244, "rewards/rejected": -0.0003422332229092717, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.048783302307129, "logits/rejected": -2.0339112281799316, "logps/chosen": -30.716812133789062, "logps/rejected": -32.62614059448242, "loss": 0.6922, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0020731096155941486, "rewards/margins": 0.001816184027120471, "rewards/rejected": 0.00025692558847367764, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.9198474884033203, "logits/rejected": -1.917340636253357, "logps/chosen": -32.29683303833008, "logps/rejected": -30.91409683227539, "loss": 0.6895, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.0063027567230165005, "rewards/margins": 0.007275627460330725, "rewards/rejected": -0.0009728703880682588, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.221111297607422, "eval_logits/rejected": -2.2162926197052, "eval_logps/chosen": -34.0648307800293, "eval_logps/rejected": -37.58684158325195, "eval_loss": 0.6929495930671692, "eval_rewards/accuracies": 0.5419435501098633, "eval_rewards/chosen": -0.0003027978236787021, "eval_rewards/margins": 0.0003993964346591383, "eval_rewards/rejected": -0.0007021942874416709, "eval_runtime": 145.7415, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 4.84533120650964e-06, "logits/chosen": -1.9055675268173218, "logits/rejected": -1.902345895767212, "logps/chosen": -31.301956176757812, "logps/rejected": -33.823036193847656, "loss": 0.6912, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0030139132868498564, "rewards/margins": 0.0038227462209761143, "rewards/rejected": -0.0008088329923339188, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.825108134172131e-06, "logits/chosen": -1.951906442642212, "logits/rejected": -1.939772605895996, "logps/chosen": -34.27196502685547, "logps/rejected": -33.685001373291016, "loss": 0.6908, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.003241057973355055, "rewards/margins": 0.004702677950263023, "rewards/rejected": -0.0014616195112466812, "step": 320 }, { "epoch": 0.86, "learning_rate": 4.80369052967602e-06, "logits/chosen": -1.9854780435562134, "logits/rejected": -1.9840580224990845, "logps/chosen": -33.0145378112793, "logps/rejected": -32.56486511230469, "loss": 0.6904, "rewards/accuracies": 0.75, "rewards/chosen": 0.004712558351457119, "rewards/margins": 0.005565387196838856, "rewards/rejected": -0.0008528297767043114, "step": 330 }, { "epoch": 0.88, "learning_rate": 4.781089396387968e-06, "logits/chosen": -2.070883274078369, "logits/rejected": -2.055272102355957, "logps/chosen": -33.69978713989258, "logps/rejected": -33.0802001953125, "loss": 0.6909, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.004796043038368225, "rewards/margins": 0.004417680203914642, "rewards/rejected": 0.0003783629508689046, "step": 340 }, { "epoch": 0.91, "learning_rate": 4.757316345716554e-06, "logits/chosen": -1.944435477256775, "logits/rejected": -1.943645715713501, "logps/chosen": -32.76495361328125, "logps/rejected": -32.4921760559082, "loss": 0.6905, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0051714470610022545, "rewards/margins": 0.005441715009510517, "rewards/rejected": -0.0002702682395465672, "step": 350 }, { "epoch": 0.94, "learning_rate": 4.73238359114687e-06, "logits/chosen": -1.8958152532577515, "logits/rejected": -1.9060084819793701, "logps/chosen": -31.695724487304688, "logps/rejected": -35.41404342651367, "loss": 0.69, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.005089647602289915, "rewards/margins": 0.0063424864783883095, "rewards/rejected": -0.0012528380611911416, "step": 360 }, { "epoch": 0.96, "learning_rate": 4.706303941965804e-06, "logits/chosen": -2.029942035675049, "logits/rejected": -2.0236124992370605, "logps/chosen": -33.23334884643555, "logps/rejected": -29.281543731689453, "loss": 0.6908, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.004186153877526522, "rewards/margins": 0.004737343639135361, "rewards/rejected": -0.0005511896451935172, "step": 370 }, { "epoch": 0.99, "learning_rate": 4.679090796681225e-06, "logits/chosen": -1.8858661651611328, "logits/rejected": -1.8880888223648071, "logps/chosen": -33.61238098144531, "logps/rejected": -30.986286163330078, "loss": 0.6895, "rewards/accuracies": 0.75, "rewards/chosen": 0.0062666991725564, "rewards/margins": 0.007403238210827112, "rewards/rejected": -0.001136539620347321, "step": 380 }, { "epoch": 1.01, "learning_rate": 4.650758136138454e-06, "logits/chosen": -1.9126472473144531, "logits/rejected": -1.9113785028457642, "logps/chosen": -33.73168182373047, "logps/rejected": -36.05659484863281, "loss": 0.6879, "rewards/accuracies": 0.7458333373069763, "rewards/chosen": 0.006531029939651489, "rewards/margins": 0.010536923073232174, "rewards/rejected": -0.004005893599241972, "step": 390 }, { "epoch": 1.04, "learning_rate": 4.621320516337559e-06, "logits/chosen": -1.8457567691802979, "logits/rejected": -1.8373829126358032, "logps/chosen": -30.92877197265625, "logps/rejected": -36.478904724121094, "loss": 0.6863, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.008452029898762703, "rewards/margins": 0.013835062272846699, "rewards/rejected": -0.005383032839745283, "step": 400 }, { "epoch": 1.04, "eval_logits/chosen": -2.193706512451172, "eval_logits/rejected": -2.1888742446899414, "eval_logps/chosen": -34.14311218261719, "eval_logps/rejected": -37.68904113769531, "eval_loss": 0.6928316950798035, "eval_rewards/accuracies": 0.5681062936782837, "eval_rewards/chosen": -0.0010856210719794035, "eval_rewards/margins": 0.0006385648157447577, "eval_rewards/rejected": -0.001724186004139483, "eval_runtime": 146.0208, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 400 }, { "epoch": 1.06, "learning_rate": 4.590793060955158e-06, "logits/chosen": -2.0138370990753174, "logits/rejected": -2.0166878700256348, "logps/chosen": -32.178985595703125, "logps/rejected": -35.35575485229492, "loss": 0.686, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.008749553933739662, "rewards/margins": 0.014378642663359642, "rewards/rejected": -0.005629089195281267, "step": 410 }, { "epoch": 1.09, "learning_rate": 4.559191453574582e-06, "logits/chosen": -1.8486782312393188, "logits/rejected": -1.8472837209701538, "logps/chosen": -28.309524536132812, "logps/rejected": -32.836753845214844, "loss": 0.6868, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0075803459621965885, "rewards/margins": 0.012771248817443848, "rewards/rejected": -0.005190903786569834, "step": 420 }, { "epoch": 1.12, "learning_rate": 4.52653192962838e-06, "logits/chosen": -1.802756905555725, "logits/rejected": -1.7958400249481201, "logps/chosen": -33.09931182861328, "logps/rejected": -34.53899002075195, "loss": 0.6868, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.009747742675244808, "rewards/margins": 0.012684956192970276, "rewards/rejected": -0.002937213983386755, "step": 430 }, { "epoch": 1.14, "learning_rate": 4.492831268057307e-06, "logits/chosen": -1.9703264236450195, "logits/rejected": -1.9651902914047241, "logps/chosen": -30.736658096313477, "logps/rejected": -32.6190071105957, "loss": 0.6847, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.010466375388205051, "rewards/margins": 0.01698034629225731, "rewards/rejected": -0.006513969041407108, "step": 440 }, { "epoch": 1.17, "learning_rate": 4.458106782690094e-06, "logits/chosen": -1.8493196964263916, "logits/rejected": -1.8536157608032227, "logps/chosen": -33.46088409423828, "logps/rejected": -33.30448532104492, "loss": 0.6844, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.010549607686698437, "rewards/margins": 0.01770811341702938, "rewards/rejected": -0.007158507592976093, "step": 450 }, { "epoch": 1.19, "learning_rate": 4.422376313348405e-06, "logits/chosen": -1.8494908809661865, "logits/rejected": -1.843927025794983, "logps/chosen": -34.2591552734375, "logps/rejected": -35.904815673828125, "loss": 0.6827, "rewards/accuracies": 0.9375, "rewards/chosen": 0.011603695340454578, "rewards/margins": 0.02116088569164276, "rewards/rejected": -0.009557187557220459, "step": 460 }, { "epoch": 1.22, "learning_rate": 4.3856582166815696e-06, "logits/chosen": -1.868131399154663, "logits/rejected": -1.8679981231689453, "logps/chosen": -33.08659362792969, "logps/rejected": -34.75391387939453, "loss": 0.685, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.01081737782806158, "rewards/margins": 0.0165016558021307, "rewards/rejected": -0.005684278905391693, "step": 470 }, { "epoch": 1.25, "learning_rate": 4.347971356735789e-06, "logits/chosen": -1.9114658832550049, "logits/rejected": -1.8928560018539429, "logps/chosen": -32.96870040893555, "logps/rejected": -33.964908599853516, "loss": 0.6828, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.011920640245079994, "rewards/margins": 0.020869914442300797, "rewards/rejected": -0.008949270471930504, "step": 480 }, { "epoch": 1.27, "learning_rate": 4.309335095262675e-06, "logits/chosen": -1.8733381032943726, "logits/rejected": -1.8726457357406616, "logps/chosen": -30.497507095336914, "logps/rejected": -31.803579330444336, "loss": 0.6849, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.011137938126921654, "rewards/margins": 0.016713283956050873, "rewards/rejected": -0.005575346294790506, "step": 490 }, { "epoch": 1.3, "learning_rate": 4.269769281772082e-06, "logits/chosen": -1.8297357559204102, "logits/rejected": -1.8228442668914795, "logps/chosen": -31.466567993164062, "logps/rejected": -35.563499450683594, "loss": 0.6826, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.012035631574690342, "rewards/margins": 0.021403178572654724, "rewards/rejected": -0.009367546997964382, "step": 500 }, { "epoch": 1.3, "eval_logits/chosen": -2.122058868408203, "eval_logits/rejected": -2.117284059524536, "eval_logps/chosen": -34.34208297729492, "eval_logps/rejected": -37.94715118408203, "eval_loss": 0.6925419569015503, "eval_rewards/accuracies": 0.5651993155479431, "eval_rewards/chosen": -0.003075304673984647, "eval_rewards/margins": 0.001229992602020502, "eval_rewards/rejected": -0.0043052975088357925, "eval_runtime": 145.8949, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 500 }, { "epoch": 1.32, "learning_rate": 4.22929424333435e-06, "logits/chosen": -1.8198668956756592, "logits/rejected": -1.8234672546386719, "logps/chosen": -28.312463760375977, "logps/rejected": -33.89719772338867, "loss": 0.6837, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.008623605594038963, "rewards/margins": 0.019002709537744522, "rewards/rejected": -0.010379104875028133, "step": 510 }, { "epoch": 1.35, "learning_rate": 4.1879307741372085e-06, "logits/chosen": -1.8108766078948975, "logits/rejected": -1.8216520547866821, "logps/chosen": -32.165672302246094, "logps/rejected": -31.733028411865234, "loss": 0.6824, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.010966637171804905, "rewards/margins": 0.021780062466859818, "rewards/rejected": -0.010813427157700062, "step": 520 }, { "epoch": 1.38, "learning_rate": 4.145700124802693e-06, "logits/chosen": -1.74923837184906, "logits/rejected": -1.7469356060028076, "logps/chosen": -30.605663299560547, "logps/rejected": -31.276514053344727, "loss": 0.6825, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.010614162310957909, "rewards/margins": 0.021611668169498444, "rewards/rejected": -0.010997505858540535, "step": 530 }, { "epoch": 1.4, "learning_rate": 4.102623991469562e-06, "logits/chosen": -1.816229224205017, "logits/rejected": -1.8094854354858398, "logps/chosen": -33.24816131591797, "logps/rejected": -34.189598083496094, "loss": 0.6825, "rewards/accuracies": 0.8125, "rewards/chosen": 0.009846633300185204, "rewards/margins": 0.021567735821008682, "rewards/rejected": -0.011721103452146053, "step": 540 }, { "epoch": 1.43, "learning_rate": 4.058724504646834e-06, "logits/chosen": -1.7789214849472046, "logits/rejected": -1.7853628396987915, "logps/chosen": -30.978107452392578, "logps/rejected": -33.693607330322266, "loss": 0.6847, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.007750834338366985, "rewards/margins": 0.017099570482969284, "rewards/rejected": -0.009348735213279724, "step": 550 }, { "epoch": 1.45, "learning_rate": 4.014024217844167e-06, "logits/chosen": -1.8461157083511353, "logits/rejected": -1.8232545852661133, "logps/chosen": -30.5151424407959, "logps/rejected": -33.84736633300781, "loss": 0.6844, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.009224263951182365, "rewards/margins": 0.01773734949529171, "rewards/rejected": -0.008513087406754494, "step": 560 }, { "epoch": 1.48, "learning_rate": 3.968546095984911e-06, "logits/chosen": -1.7744262218475342, "logits/rejected": -1.769487738609314, "logps/chosen": -31.48854637145996, "logps/rejected": -33.041587829589844, "loss": 0.684, "rewards/accuracies": 0.8125, "rewards/chosen": 0.009281843900680542, "rewards/margins": 0.018488582223653793, "rewards/rejected": -0.009206734597682953, "step": 570 }, { "epoch": 1.51, "learning_rate": 3.922313503607806e-06, "logits/chosen": -1.8039462566375732, "logits/rejected": -1.8057708740234375, "logps/chosen": -33.585567474365234, "logps/rejected": -36.357948303222656, "loss": 0.6816, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.007902255281805992, "rewards/margins": 0.023472566157579422, "rewards/rejected": -0.015570309944450855, "step": 580 }, { "epoch": 1.53, "learning_rate": 3.875350192863368e-06, "logits/chosen": -1.779675841331482, "logits/rejected": -1.7792049646377563, "logps/chosen": -29.52834129333496, "logps/rejected": -32.76404571533203, "loss": 0.6815, "rewards/accuracies": 0.875, "rewards/chosen": 0.010637165978550911, "rewards/margins": 0.023661229759454727, "rewards/rejected": -0.013024063780903816, "step": 590 }, { "epoch": 1.56, "learning_rate": 3.8276802913111436e-06, "logits/chosen": -1.7808748483657837, "logits/rejected": -1.778590440750122, "logps/chosen": -32.0461540222168, "logps/rejected": -33.55706024169922, "loss": 0.6823, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.009521286003291607, "rewards/margins": 0.02209232933819294, "rewards/rejected": -0.012571041472256184, "step": 600 }, { "epoch": 1.56, "eval_logits/chosen": -2.0287230014801025, "eval_logits/rejected": -2.024071455001831, "eval_logps/chosen": -34.73115539550781, "eval_logps/rejected": -38.49046325683594, "eval_loss": 0.6917924880981445, "eval_rewards/accuracies": 0.5830564498901367, "eval_rewards/chosen": -0.006966045591980219, "eval_rewards/margins": 0.0027723864186555147, "eval_rewards/rejected": -0.009738431312143803, "eval_runtime": 145.7839, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 600 }, { "epoch": 1.58, "learning_rate": 3.7793282895240927e-06, "logits/chosen": -1.8076483011245728, "logits/rejected": -1.8139461278915405, "logps/chosen": -31.64394187927246, "logps/rejected": -33.57398986816406, "loss": 0.6828, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.0059343790635466576, "rewards/margins": 0.02104238048195839, "rewards/rejected": -0.015108002349734306, "step": 610 }, { "epoch": 1.61, "learning_rate": 3.730319028506478e-06, "logits/chosen": -1.753603219985962, "logits/rejected": -1.75141179561615, "logps/chosen": -33.820560455322266, "logps/rejected": -32.37050247192383, "loss": 0.6817, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.009353202767670155, "rewards/margins": 0.0232031662017107, "rewards/rejected": -0.013849964365363121, "step": 620 }, { "epoch": 1.64, "learning_rate": 3.6806776869317074e-06, "logits/chosen": -1.6926358938217163, "logits/rejected": -1.686195731163025, "logps/chosen": -34.429847717285156, "logps/rejected": -33.97523880004883, "loss": 0.6801, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.01034373790025711, "rewards/margins": 0.026524048298597336, "rewards/rejected": -0.016180310398340225, "step": 630 }, { "epoch": 1.66, "learning_rate": 3.6304297682067146e-06, "logits/chosen": -1.7083446979522705, "logits/rejected": -1.7146565914154053, "logps/chosen": -33.29853820800781, "logps/rejected": -34.668426513671875, "loss": 0.6827, "rewards/accuracies": 0.8125, "rewards/chosen": 0.006807624362409115, "rewards/margins": 0.021089300513267517, "rewards/rejected": -0.014281675219535828, "step": 640 }, { "epoch": 1.69, "learning_rate": 3.579601087369492e-06, "logits/chosen": -1.7786967754364014, "logits/rejected": -1.792654037475586, "logps/chosen": -31.198848724365234, "logps/rejected": -33.51192855834961, "loss": 0.6825, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.006536136381328106, "rewards/margins": 0.021678542718291283, "rewards/rejected": -0.015142406336963177, "step": 650 }, { "epoch": 1.71, "learning_rate": 3.5282177578265295e-06, "logits/chosen": -1.6418495178222656, "logits/rejected": -1.6386057138442993, "logps/chosen": -32.84505081176758, "logps/rejected": -36.883094787597656, "loss": 0.6772, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.010779094882309437, "rewards/margins": 0.03260749578475952, "rewards/rejected": -0.02182840369641781, "step": 660 }, { "epoch": 1.74, "learning_rate": 3.476306177936961e-06, "logits/chosen": -1.7246978282928467, "logits/rejected": -1.7246736288070679, "logps/chosen": -30.864843368530273, "logps/rejected": -36.09869384765625, "loss": 0.6799, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.004207477904856205, "rewards/margins": 0.026930591091513634, "rewards/rejected": -0.022723112255334854, "step": 670 }, { "epoch": 1.77, "learning_rate": 3.423893017450324e-06, "logits/chosen": -1.6627376079559326, "logits/rejected": -1.6593656539916992, "logps/chosen": -30.347408294677734, "logps/rejected": -34.78777313232422, "loss": 0.6809, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.005249389447271824, "rewards/margins": 0.0249490849673748, "rewards/rejected": -0.0196996983140707, "step": 680 }, { "epoch": 1.79, "learning_rate": 3.3710052038048794e-06, "logits/chosen": -1.676200270652771, "logits/rejected": -1.6763780117034912, "logps/chosen": -29.362756729125977, "logps/rejected": -32.716041564941406, "loss": 0.6792, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.008003375492990017, "rewards/margins": 0.028458837419748306, "rewards/rejected": -0.020455462858080864, "step": 690 }, { "epoch": 1.82, "learning_rate": 3.3176699082935546e-06, "logits/chosen": -1.5964815616607666, "logits/rejected": -1.599886417388916, "logps/chosen": -33.50843048095703, "logps/rejected": -33.53223419189453, "loss": 0.6784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.009700378403067589, "rewards/margins": 0.03037584200501442, "rewards/rejected": -0.02067546173930168, "step": 700 }, { "epoch": 1.82, "eval_logits/chosen": -1.9464259147644043, "eval_logits/rejected": -1.9419163465499878, "eval_logps/chosen": -35.35507583618164, "eval_logps/rejected": -39.15093994140625, "eval_loss": 0.6916440725326538, "eval_rewards/accuracies": 0.565614640712738, "eval_rewards/chosen": -0.013205258175730705, "eval_rewards/margins": 0.0031379179563373327, "eval_rewards/rejected": -0.01634317822754383, "eval_runtime": 145.8665, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 700 }, { "epoch": 1.84, "learning_rate": 3.2639145321045933e-06, "logits/chosen": -1.667773962020874, "logits/rejected": -1.6596691608428955, "logps/chosen": -36.030296325683594, "logps/rejected": -33.893470764160156, "loss": 0.681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.004817272536456585, "rewards/margins": 0.024693841114640236, "rewards/rejected": -0.019876569509506226, "step": 710 }, { "epoch": 1.87, "learning_rate": 3.2097666922441107e-06, "logits/chosen": -1.6702191829681396, "logits/rejected": -1.6717958450317383, "logps/chosen": -36.07275390625, "logps/rejected": -35.63324737548828, "loss": 0.6788, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0036263135261833668, "rewards/margins": 0.02919856831431389, "rewards/rejected": -0.025572258979082108, "step": 720 }, { "epoch": 1.9, "learning_rate": 3.1552542073477554e-06, "logits/chosen": -1.6882798671722412, "logits/rejected": -1.6859245300292969, "logps/chosen": -31.580810546875, "logps/rejected": -34.97660446166992, "loss": 0.6787, "rewards/accuracies": 0.8125, "rewards/chosen": 0.009825185872614384, "rewards/margins": 0.02962224744260311, "rewards/rejected": -0.019797060638666153, "step": 730 }, { "epoch": 1.92, "learning_rate": 3.100405083388799e-06, "logits/chosen": -1.6548511981964111, "logits/rejected": -1.6600011587142944, "logps/chosen": -30.910289764404297, "logps/rejected": -35.500179290771484, "loss": 0.6774, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.008754345588386059, "rewards/margins": 0.03220795840024948, "rewards/rejected": -0.023453611880540848, "step": 740 }, { "epoch": 1.95, "learning_rate": 3.0452474992899645e-06, "logits/chosen": -1.609222412109375, "logits/rejected": -1.6077518463134766, "logps/chosen": -32.664878845214844, "logps/rejected": -37.466697692871094, "loss": 0.6774, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.004600999411195517, "rewards/margins": 0.032408393919467926, "rewards/rejected": -0.02780739590525627, "step": 750 }, { "epoch": 1.97, "learning_rate": 2.989809792446417e-06, "logits/chosen": -1.4765026569366455, "logits/rejected": -1.472049355506897, "logps/chosen": -35.379676818847656, "logps/rejected": -38.33124542236328, "loss": 0.6749, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.006187664810568094, "rewards/margins": 0.03741595149040222, "rewards/rejected": -0.031228289008140564, "step": 760 }, { "epoch": 2.0, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -1.5943939685821533, "logits/rejected": -1.5989573001861572, "logps/chosen": -34.98912811279297, "logps/rejected": -36.04502487182617, "loss": 0.6801, "rewards/accuracies": 0.7208333611488342, "rewards/chosen": 0.00224525248631835, "rewards/margins": 0.026747092604637146, "rewards/rejected": -0.024501841515302658, "step": 770 }, { "epoch": 2.03, "learning_rate": 2.878208065043501e-06, "logits/chosen": -1.542252779006958, "logits/rejected": -1.5407251119613647, "logps/chosen": -32.663124084472656, "logps/rejected": -38.704864501953125, "loss": 0.6661, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.012384667061269283, "rewards/margins": 0.05538671463727951, "rewards/rejected": -0.04300205036997795, "step": 780 }, { "epoch": 2.05, "learning_rate": 2.8221013802485974e-06, "logits/chosen": -1.5762343406677246, "logits/rejected": -1.5737056732177734, "logps/chosen": -32.12613296508789, "logps/rejected": -36.56070327758789, "loss": 0.6701, "rewards/accuracies": 0.875, "rewards/chosen": 0.012496042996644974, "rewards/margins": 0.04723736643791199, "rewards/rejected": -0.03474132716655731, "step": 790 }, { "epoch": 2.08, "learning_rate": 2.76582921478147e-06, "logits/chosen": -1.4931247234344482, "logits/rejected": -1.487870454788208, "logps/chosen": -33.71710968017578, "logps/rejected": -34.95537567138672, "loss": 0.6729, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.006659658160060644, "rewards/margins": 0.04157082363963127, "rewards/rejected": -0.034911174327135086, "step": 800 }, { "epoch": 2.08, "eval_logits/chosen": -1.8692306280136108, "eval_logits/rejected": -1.8648308515548706, "eval_logps/chosen": -35.941200256347656, "eval_logps/rejected": -39.86221694946289, "eval_loss": 0.6910557746887207, "eval_rewards/accuracies": 0.5539867281913757, "eval_rewards/chosen": -0.019066473469138145, "eval_rewards/margins": 0.004389475099742413, "eval_rewards/rejected": -0.023455949500203133, "eval_runtime": 145.7021, "eval_samples_per_second": 2.354, "eval_steps_per_second": 0.295, "step": 800 }, { "epoch": 2.1, "learning_rate": 2.7094204786572254e-06, "logits/chosen": -1.579530954360962, "logits/rejected": -1.58658766746521, "logps/chosen": -31.1917724609375, "logps/rejected": -38.586029052734375, "loss": 0.6677, "rewards/accuracies": 0.875, "rewards/chosen": 0.009364848956465721, "rewards/margins": 0.05266670510172844, "rewards/rejected": -0.04330185800790787, "step": 810 }, { "epoch": 2.13, "learning_rate": 2.6529041520546072e-06, "logits/chosen": -1.5474834442138672, "logits/rejected": -1.5493825674057007, "logps/chosen": -31.922176361083984, "logps/rejected": -36.21441650390625, "loss": 0.6755, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.006489004008471966, "rewards/margins": 0.03633515536785126, "rewards/rejected": -0.029846150428056717, "step": 820 }, { "epoch": 2.16, "learning_rate": 2.5963092704273302e-06, "logits/chosen": -1.4332886934280396, "logits/rejected": -1.4374314546585083, "logps/chosen": -32.152000427246094, "logps/rejected": -39.53594207763672, "loss": 0.6685, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.004394562914967537, "rewards/margins": 0.05065219849348068, "rewards/rejected": -0.046257637441158295, "step": 830 }, { "epoch": 2.18, "learning_rate": 2.53966490958702e-06, "logits/chosen": -1.4998013973236084, "logits/rejected": -1.496098518371582, "logps/chosen": -32.54491424560547, "logps/rejected": -36.80445098876953, "loss": 0.6725, "rewards/accuracies": 0.8125, "rewards/chosen": 0.004237356595695019, "rewards/margins": 0.04233536496758461, "rewards/rejected": -0.03809800371527672, "step": 840 }, { "epoch": 2.21, "learning_rate": 2.4830001707654135e-06, "logits/chosen": -1.5474607944488525, "logits/rejected": -1.549788236618042, "logps/chosen": -31.952754974365234, "logps/rejected": -40.619407653808594, "loss": 0.6651, "rewards/accuracies": 0.9375, "rewards/chosen": 0.00822580885142088, "rewards/margins": 0.057800523936748505, "rewards/rejected": -0.04957471415400505, "step": 850 }, { "epoch": 2.23, "learning_rate": 2.4263441656635054e-06, "logits/chosen": -1.3754708766937256, "logits/rejected": -1.3716084957122803, "logps/chosen": -36.57474136352539, "logps/rejected": -37.647613525390625, "loss": 0.6716, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.002930630696937442, "rewards/margins": 0.04462647810578346, "rewards/rejected": -0.04755710810422897, "step": 860 }, { "epoch": 2.26, "learning_rate": 2.3697260014953107e-06, "logits/chosen": -1.4011175632476807, "logits/rejected": -1.4010181427001953, "logps/chosen": -35.5493049621582, "logps/rejected": -40.10515213012695, "loss": 0.6662, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0029350135009735823, "rewards/margins": 0.055523864924907684, "rewards/rejected": -0.052588850259780884, "step": 870 }, { "epoch": 2.29, "learning_rate": 2.3131747660339396e-06, "logits/chosen": -1.4290226697921753, "logits/rejected": -1.4171994924545288, "logps/chosen": -33.74538040161133, "logps/rejected": -38.27408981323242, "loss": 0.6654, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.0036120389122515917, "rewards/margins": 0.057132624089717865, "rewards/rejected": -0.05352058261632919, "step": 880 }, { "epoch": 2.31, "learning_rate": 2.256719512667651e-06, "logits/chosen": -1.5206860303878784, "logits/rejected": -1.5256131887435913, "logps/chosen": -33.839393615722656, "logps/rejected": -38.63503646850586, "loss": 0.6659, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.005094523541629314, "rewards/margins": 0.056891001760959625, "rewards/rejected": -0.061985522508621216, "step": 890 }, { "epoch": 2.34, "learning_rate": 2.2003892454735786e-06, "logits/chosen": -1.4437249898910522, "logits/rejected": -1.4366403818130493, "logps/chosen": -34.63188552856445, "logps/rejected": -38.323524475097656, "loss": 0.6635, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.0022219305392354727, "rewards/margins": 0.06143581122159958, "rewards/rejected": -0.05921388417482376, "step": 900 }, { "epoch": 2.34, "eval_logits/chosen": -1.7487633228302002, "eval_logits/rejected": -1.7445435523986816, "eval_logps/chosen": -37.12141799926758, "eval_logps/rejected": -41.19174575805664, "eval_loss": 0.6904172301292419, "eval_rewards/accuracies": 0.5365448594093323, "eval_rewards/chosen": -0.03086867742240429, "eval_rewards/margins": 0.00588257284834981, "eval_rewards/rejected": -0.03675125539302826, "eval_runtime": 145.8716, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 900 }, { "epoch": 2.36, "learning_rate": 2.1442129043167877e-06, "logits/chosen": -1.433040738105774, "logits/rejected": -1.4330635070800781, "logps/chosen": -31.19219970703125, "logps/rejected": -41.520694732666016, "loss": 0.6604, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.00025037964223884046, "rewards/margins": 0.06805966049432755, "rewards/rejected": -0.06831003725528717, "step": 910 }, { "epoch": 2.39, "learning_rate": 2.088219349982323e-06, "logits/chosen": -1.3752285242080688, "logits/rejected": -1.3669588565826416, "logps/chosen": -32.70459747314453, "logps/rejected": -40.24443817138672, "loss": 0.6653, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.008656435646116734, "rewards/margins": 0.058074213564395905, "rewards/rejected": -0.06673064827919006, "step": 920 }, { "epoch": 2.42, "learning_rate": 2.0324373493478803e-06, "logits/chosen": -1.5194597244262695, "logits/rejected": -1.517913818359375, "logps/chosen": -30.247411727905273, "logps/rejected": -39.21205520629883, "loss": 0.664, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.002503753872588277, "rewards/margins": 0.06096818298101425, "rewards/rejected": -0.06347193568944931, "step": 930 }, { "epoch": 2.44, "learning_rate": 1.976895560604729e-06, "logits/chosen": -1.4003164768218994, "logits/rejected": -1.4109015464782715, "logps/chosen": -35.36278533935547, "logps/rejected": -40.1971549987793, "loss": 0.6632, "rewards/accuracies": 0.8125, "rewards/chosen": -0.009377234615385532, "rewards/margins": 0.06269785016775131, "rewards/rejected": -0.07207508385181427, "step": 940 }, { "epoch": 2.47, "learning_rate": 1.921622518534466e-06, "logits/chosen": -1.4363105297088623, "logits/rejected": -1.4391801357269287, "logps/chosen": -31.704153060913086, "logps/rejected": -38.39413070678711, "loss": 0.6663, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.010564305819571018, "rewards/margins": 0.056016188114881516, "rewards/rejected": -0.06658048927783966, "step": 950 }, { "epoch": 2.49, "learning_rate": 1.8666466198491794e-06, "logits/chosen": -1.411747932434082, "logits/rejected": -1.4068377017974854, "logps/chosen": -34.955177307128906, "logps/rejected": -41.352115631103516, "loss": 0.6619, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.008799608796834946, "rewards/margins": 0.06565765291452408, "rewards/rejected": -0.07445726543664932, "step": 960 }, { "epoch": 2.52, "learning_rate": 1.8119961086025376e-06, "logits/chosen": -1.3314030170440674, "logits/rejected": -1.3333518505096436, "logps/chosen": -33.6180534362793, "logps/rejected": -42.525047302246094, "loss": 0.6617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.008053036406636238, "rewards/margins": 0.06538151204586029, "rewards/rejected": -0.07343455404043198, "step": 970 }, { "epoch": 2.55, "learning_rate": 1.7576990616793139e-06, "logits/chosen": -1.3536522388458252, "logits/rejected": -1.3478691577911377, "logps/chosen": -37.943336486816406, "logps/rejected": -44.42793655395508, "loss": 0.6663, "rewards/accuracies": 0.75, "rewards/chosen": -0.025382736697793007, "rewards/margins": 0.0562770739197731, "rewards/rejected": -0.08165980130434036, "step": 980 }, { "epoch": 2.57, "learning_rate": 1.7037833743707892e-06, "logits/chosen": -1.3406635522842407, "logits/rejected": -1.3342511653900146, "logps/chosen": -32.205875396728516, "logps/rejected": -44.02067565917969, "loss": 0.6603, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.012029221281409264, "rewards/margins": 0.0682787075638771, "rewards/rejected": -0.08030791580677032, "step": 990 }, { "epoch": 2.6, "learning_rate": 1.6502767460434588e-06, "logits/chosen": -1.3228440284729004, "logits/rejected": -1.3131605386734009, "logps/chosen": -33.538795471191406, "logps/rejected": -35.858123779296875, "loss": 0.6719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.01790105737745762, "rewards/margins": 0.04468285292387009, "rewards/rejected": -0.06258390843868256, "step": 1000 }, { "epoch": 2.6, "eval_logits/chosen": -1.660041332244873, "eval_logits/rejected": -1.6560044288635254, "eval_logps/chosen": -38.48649597167969, "eval_logps/rejected": -42.736080169677734, "eval_loss": 0.6896607875823975, "eval_rewards/accuracies": 0.5485880374908447, "eval_rewards/chosen": -0.04451945051550865, "eval_rewards/margins": 0.007675125263631344, "eval_rewards/rejected": -0.05219458416104317, "eval_runtime": 145.8291, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 1000 }, { "epoch": 2.62, "learning_rate": 1.5972066659083796e-06, "logits/chosen": -1.3907979726791382, "logits/rejected": -1.3904699087142944, "logps/chosen": -33.401824951171875, "logps/rejected": -37.548805236816406, "loss": 0.6661, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.015776285901665688, "rewards/margins": 0.05712694674730301, "rewards/rejected": -0.07290322333574295, "step": 1010 }, { "epoch": 2.65, "learning_rate": 1.5446003988985041e-06, "logits/chosen": -1.4413875341415405, "logits/rejected": -1.4416849613189697, "logps/chosen": -33.36183547973633, "logps/rejected": -38.4256477355957, "loss": 0.665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.014182562939822674, "rewards/margins": 0.05841582268476486, "rewards/rejected": -0.07259838283061981, "step": 1020 }, { "epoch": 2.68, "learning_rate": 1.4924849716612211e-06, "logits/chosen": -1.3972914218902588, "logits/rejected": -1.4021806716918945, "logps/chosen": -34.44452667236328, "logps/rejected": -34.45269775390625, "loss": 0.6727, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.021644581109285355, "rewards/margins": 0.043045226484537125, "rewards/rejected": -0.06468981504440308, "step": 1030 }, { "epoch": 2.7, "learning_rate": 1.440887158673332e-06, "logits/chosen": -1.3861340284347534, "logits/rejected": -1.377633810043335, "logps/chosen": -32.912872314453125, "logps/rejected": -42.408958435058594, "loss": 0.6588, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.01734977774322033, "rewards/margins": 0.07173751294612885, "rewards/rejected": -0.08908729255199432, "step": 1040 }, { "epoch": 2.73, "learning_rate": 1.3898334684855647e-06, "logits/chosen": -1.3528214693069458, "logits/rejected": -1.3631798028945923, "logps/chosen": -35.93256378173828, "logps/rejected": -40.24216079711914, "loss": 0.666, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.023794613778591156, "rewards/margins": 0.05617685988545418, "rewards/rejected": -0.07997147738933563, "step": 1050 }, { "epoch": 2.75, "learning_rate": 1.3393501301037245e-06, "logits/chosen": -1.4136943817138672, "logits/rejected": -1.405368447303772, "logps/chosen": -35.31805419921875, "logps/rejected": -45.923988342285156, "loss": 0.6566, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.018172740936279297, "rewards/margins": 0.07775326073169708, "rewards/rejected": -0.09592600166797638, "step": 1060 }, { "epoch": 2.78, "learning_rate": 1.2894630795134454e-06, "logits/chosen": -1.320759892463684, "logits/rejected": -1.3234620094299316, "logps/chosen": -37.43547439575195, "logps/rejected": -40.544471740722656, "loss": 0.6625, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.014461873099207878, "rewards/margins": 0.06400416046380997, "rewards/rejected": -0.078466035425663, "step": 1070 }, { "epoch": 2.81, "learning_rate": 1.2401979463554984e-06, "logits/chosen": -1.4195467233657837, "logits/rejected": -1.4183709621429443, "logps/chosen": -34.790035247802734, "logps/rejected": -43.464012145996094, "loss": 0.6569, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.017323976382613182, "rewards/margins": 0.07615931332111359, "rewards/rejected": -0.09348328411579132, "step": 1080 }, { "epoch": 2.83, "learning_rate": 1.1915800407584705e-06, "logits/chosen": -1.4136641025543213, "logits/rejected": -1.4168442487716675, "logps/chosen": -32.618568420410156, "logps/rejected": -41.96255111694336, "loss": 0.6611, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.017252514138817787, "rewards/margins": 0.06689468771219254, "rewards/rejected": -0.08414719998836517, "step": 1090 }, { "epoch": 2.86, "learning_rate": 1.1436343403356019e-06, "logits/chosen": -1.3993356227874756, "logits/rejected": -1.4037957191467285, "logps/chosen": -35.935604095458984, "logps/rejected": -36.58147430419922, "loss": 0.6749, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.023666027933359146, "rewards/margins": 0.038131967186927795, "rewards/rejected": -0.06179799512028694, "step": 1100 }, { "epoch": 2.86, "eval_logits/chosen": -1.6285730600357056, "eval_logits/rejected": -1.6245777606964111, "eval_logps/chosen": -38.960731506347656, "eval_logps/rejected": -43.267208099365234, "eval_loss": 0.6894330978393555, "eval_rewards/accuracies": 0.5365448594093323, "eval_rewards/chosen": -0.04926181212067604, "eval_rewards/margins": 0.008244064636528492, "eval_rewards/rejected": -0.057505879551172256, "eval_runtime": 145.7626, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 1100 }, { "epoch": 2.88, "learning_rate": 1.0963854773524548e-06, "logits/chosen": -1.3872336149215698, "logits/rejected": -1.3884273767471313, "logps/chosen": -34.366111755371094, "logps/rejected": -38.44298553466797, "loss": 0.6646, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.014259574934840202, "rewards/margins": 0.06001114100217819, "rewards/rejected": -0.07427072525024414, "step": 1110 }, { "epoch": 2.91, "learning_rate": 1.049857726072005e-06, "logits/chosen": -1.2578824758529663, "logits/rejected": -1.2607439756393433, "logps/chosen": -36.28580856323242, "logps/rejected": -40.985992431640625, "loss": 0.6636, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.017208317294716835, "rewards/margins": 0.06223895400762558, "rewards/rejected": -0.07944727689027786, "step": 1120 }, { "epoch": 2.94, "learning_rate": 1.0040749902836508e-06, "logits/chosen": -1.2802751064300537, "logits/rejected": -1.2788641452789307, "logps/chosen": -33.49232864379883, "logps/rejected": -38.528602600097656, "loss": 0.67, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.024806050583720207, "rewards/margins": 0.04882260411977768, "rewards/rejected": -0.07362865656614304, "step": 1130 }, { "epoch": 2.96, "learning_rate": 9.59060791022566e-07, "logits/chosen": -1.3983075618743896, "logits/rejected": -1.3943830728530884, "logps/chosen": -34.20863723754883, "logps/rejected": -41.15024185180664, "loss": 0.6594, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.011688429862260818, "rewards/margins": 0.07027387619018555, "rewards/rejected": -0.08196230232715607, "step": 1140 }, { "epoch": 2.99, "learning_rate": 9.148382544856885e-07, "logits/chosen": -1.3060632944107056, "logits/rejected": -1.2978880405426025, "logps/chosen": -35.88400650024414, "logps/rejected": -39.35108184814453, "loss": 0.6663, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.024322666227817535, "rewards/margins": 0.05626412108540535, "rewards/rejected": -0.08058679848909378, "step": 1150 }, { "epoch": 3.01, "learning_rate": 8.714301001505568e-07, "logits/chosen": -1.332467794418335, "logits/rejected": -1.3338046073913574, "logps/chosen": -35.79069900512695, "logps/rejected": -38.749183654785156, "loss": 0.6671, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.021089451387524605, "rewards/margins": 0.05438787862658501, "rewards/rejected": -0.07547733187675476, "step": 1160 }, { "epoch": 3.04, "learning_rate": 8.288586291031025e-07, "logits/chosen": -1.4120080471038818, "logits/rejected": -1.4067761898040771, "logps/chosen": -35.47734069824219, "logps/rejected": -40.39026641845703, "loss": 0.667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.019513698294758797, "rewards/margins": 0.05497678369283676, "rewards/rejected": -0.074490487575531, "step": 1170 }, { "epoch": 3.06, "learning_rate": 7.871457125803897e-07, "logits/chosen": -1.3105064630508423, "logits/rejected": -1.3196675777435303, "logps/chosen": -35.979042053222656, "logps/rejected": -40.29875946044922, "loss": 0.6678, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.026460427790880203, "rewards/margins": 0.05291280895471573, "rewards/rejected": -0.07937324047088623, "step": 1180 }, { "epoch": 3.09, "learning_rate": 7.463127807341966e-07, "logits/chosen": -1.3309608697891235, "logits/rejected": -1.325539231300354, "logps/chosen": -33.38233947753906, "logps/rejected": -41.38855743408203, "loss": 0.66, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.012087189592421055, "rewards/margins": 0.06896394491195679, "rewards/rejected": -0.08105112612247467, "step": 1190 }, { "epoch": 3.12, "learning_rate": 7.063808116212021e-07, "logits/chosen": -1.2911185026168823, "logits/rejected": -1.2924482822418213, "logps/chosen": -35.253963470458984, "logps/rejected": -42.57808303833008, "loss": 0.6551, "rewards/accuracies": 0.75, "rewards/chosen": -0.017635192722082138, "rewards/margins": 0.08054514229297638, "rewards/rejected": -0.09818033874034882, "step": 1200 }, { "epoch": 3.12, "eval_logits/chosen": -1.6238328218460083, "eval_logits/rejected": -1.6198344230651855, "eval_logps/chosen": -39.01145935058594, "eval_logps/rejected": -43.33791732788086, "eval_loss": 0.6893402338027954, "eval_rewards/accuracies": 0.5365448594093323, "eval_rewards/chosen": -0.049769096076488495, "eval_rewards/margins": 0.008443917147815228, "eval_rewards/rejected": -0.058213010430336, "eval_runtime": 145.8737, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 1200 }, { "epoch": 3.14, "learning_rate": 6.673703204254348e-07, "logits/chosen": -1.2456345558166504, "logits/rejected": -1.2452775239944458, "logps/chosen": -37.221336364746094, "logps/rejected": -42.06071853637695, "loss": 0.6544, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.01168847642838955, "rewards/margins": 0.08182507008314133, "rewards/rejected": -0.09351354837417603, "step": 1210 }, { "epoch": 3.17, "learning_rate": 6.293013489185315e-07, "logits/chosen": -1.3760040998458862, "logits/rejected": -1.3689346313476562, "logps/chosen": -33.46622848510742, "logps/rejected": -42.4327392578125, "loss": 0.6562, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.017625439912080765, "rewards/margins": 0.07791656255722046, "rewards/rejected": -0.09554200619459152, "step": 1220 }, { "epoch": 3.19, "learning_rate": 5.921934551632086e-07, "logits/chosen": -1.2549601793289185, "logits/rejected": -1.2447240352630615, "logps/chosen": -35.567508697509766, "logps/rejected": -42.02611541748047, "loss": 0.6551, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.011818263679742813, "rewards/margins": 0.07950461655855179, "rewards/rejected": -0.0913228839635849, "step": 1230 }, { "epoch": 3.22, "learning_rate": 5.560657034652405e-07, "logits/chosen": -1.3439350128173828, "logits/rejected": -1.338648796081543, "logps/chosen": -33.191280364990234, "logps/rejected": -36.837867736816406, "loss": 0.6678, "rewards/accuracies": 0.6875, "rewards/chosen": -0.024910490959882736, "rewards/margins": 0.05479263514280319, "rewards/rejected": -0.07970312982797623, "step": 1240 }, { "epoch": 3.25, "learning_rate": 5.2093665457911e-07, "logits/chosen": -1.3508336544036865, "logits/rejected": -1.3587679862976074, "logps/chosen": -37.35521697998047, "logps/rejected": -39.65736770629883, "loss": 0.6635, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.019598282873630524, "rewards/margins": 0.06179226562380791, "rewards/rejected": -0.08139055222272873, "step": 1250 }, { "epoch": 3.27, "learning_rate": 4.868243561723535e-07, "logits/chosen": -1.3516252040863037, "logits/rejected": -1.3514872789382935, "logps/chosen": -35.94284439086914, "logps/rejected": -42.4056510925293, "loss": 0.6611, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.029142867773771286, "rewards/margins": 0.0667649507522583, "rewards/rejected": -0.09590782225131989, "step": 1260 }, { "epoch": 3.3, "learning_rate": 4.537463335535161e-07, "logits/chosen": -1.265148401260376, "logits/rejected": -1.2630140781402588, "logps/chosen": -34.26659393310547, "logps/rejected": -43.09412384033203, "loss": 0.6533, "rewards/accuracies": 0.8125, "rewards/chosen": -0.01175761315971613, "rewards/margins": 0.08397753536701202, "rewards/rejected": -0.09573514014482498, "step": 1270 }, { "epoch": 3.32, "learning_rate": 4.217195806684629e-07, "logits/chosen": -1.1799885034561157, "logits/rejected": -1.1763312816619873, "logps/chosen": -36.85099411010742, "logps/rejected": -38.88633346557617, "loss": 0.6628, "rewards/accuracies": 0.75, "rewards/chosen": -0.016135532408952713, "rewards/margins": 0.06346292048692703, "rewards/rejected": -0.07959844172000885, "step": 1280 }, { "epoch": 3.35, "learning_rate": 3.907605513696808e-07, "logits/chosen": -1.353476881980896, "logits/rejected": -1.339864730834961, "logps/chosen": -36.990108489990234, "logps/rejected": -45.27104568481445, "loss": 0.6558, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.0264253169298172, "rewards/margins": 0.0786319449543953, "rewards/rejected": -0.1050572618842125, "step": 1290 }, { "epoch": 3.38, "learning_rate": 3.6088515096305675e-07, "logits/chosen": -1.304051160812378, "logits/rejected": -1.3077205419540405, "logps/chosen": -35.44999313354492, "logps/rejected": -47.28888702392578, "loss": 0.6489, "rewards/accuracies": 0.875, "rewards/chosen": -0.019323688000440598, "rewards/margins": 0.09253297001123428, "rewards/rejected": -0.11185667663812637, "step": 1300 }, { "epoch": 3.38, "eval_logits/chosen": -1.6211615800857544, "eval_logits/rejected": -1.6171820163726807, "eval_logps/chosen": -39.072174072265625, "eval_logps/rejected": -43.41142654418945, "eval_loss": 0.6892833113670349, "eval_rewards/accuracies": 0.5394518375396729, "eval_rewards/chosen": -0.050376225262880325, "eval_rewards/margins": 0.008571851067245007, "eval_rewards/rejected": -0.058948077261447906, "eval_runtime": 145.8858, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 1300 }, { "epoch": 3.4, "learning_rate": 3.321087280364757e-07, "logits/chosen": -1.2905550003051758, "logits/rejected": -1.2912893295288086, "logps/chosen": -38.21614074707031, "logps/rejected": -47.26013946533203, "loss": 0.6547, "rewards/accuracies": 0.8125, "rewards/chosen": -0.01972118392586708, "rewards/margins": 0.08203905820846558, "rewards/rejected": -0.10176024585962296, "step": 1310 }, { "epoch": 3.43, "learning_rate": 3.044460665744284e-07, "logits/chosen": -1.3596677780151367, "logits/rejected": -1.3584003448486328, "logps/chosen": -33.941978454589844, "logps/rejected": -39.85774230957031, "loss": 0.6596, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.01848122850060463, "rewards/margins": 0.07056263089179993, "rewards/rejected": -0.08904386311769485, "step": 1320 }, { "epoch": 3.45, "learning_rate": 2.779113783626916e-07, "logits/chosen": -1.2977750301361084, "logits/rejected": -1.2993113994598389, "logps/chosen": -35.68281936645508, "logps/rejected": -42.771202087402344, "loss": 0.6558, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.013085673563182354, "rewards/margins": 0.07813958078622818, "rewards/rejected": -0.09122525155544281, "step": 1330 }, { "epoch": 3.48, "learning_rate": 2.5251829568697204e-07, "logits/chosen": -1.3435966968536377, "logits/rejected": -1.3425482511520386, "logps/chosen": -32.46406555175781, "logps/rejected": -40.374244689941406, "loss": 0.659, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.012653985992074013, "rewards/margins": 0.07116873562335968, "rewards/rejected": -0.08382271975278854, "step": 1340 }, { "epoch": 3.51, "learning_rate": 2.2827986432927774e-07, "logits/chosen": -1.362319827079773, "logits/rejected": -1.3474690914154053, "logps/chosen": -36.95580291748047, "logps/rejected": -47.64240264892578, "loss": 0.6519, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.026391323655843735, "rewards/margins": 0.08738837391138077, "rewards/rejected": -0.11377968639135361, "step": 1350 }, { "epoch": 3.53, "learning_rate": 2.0520853686560177e-07, "logits/chosen": -1.3275715112686157, "logits/rejected": -1.3374977111816406, "logps/chosen": -33.35503387451172, "logps/rejected": -41.160377502441406, "loss": 0.6572, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.012865647673606873, "rewards/margins": 0.07634075731039047, "rewards/rejected": -0.08920640498399734, "step": 1360 }, { "epoch": 3.56, "learning_rate": 1.833161662683672e-07, "logits/chosen": -1.4463578462600708, "logits/rejected": -1.4461679458618164, "logps/chosen": -32.90170669555664, "logps/rejected": -47.23381423950195, "loss": 0.6432, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.008712759241461754, "rewards/margins": 0.10531653463840485, "rewards/rejected": -0.11402928829193115, "step": 1370 }, { "epoch": 3.58, "learning_rate": 1.626139998169246e-07, "logits/chosen": -1.3187510967254639, "logits/rejected": -1.3259624242782593, "logps/chosen": -35.57271957397461, "logps/rejected": -47.857994079589844, "loss": 0.6506, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.013922369107604027, "rewards/margins": 0.09016112238168716, "rewards/rejected": -0.10408350080251694, "step": 1380 }, { "epoch": 3.61, "learning_rate": 1.4311267331922535e-07, "logits/chosen": -1.2786071300506592, "logits/rejected": -1.2746905088424683, "logps/chosen": -35.84669876098633, "logps/rejected": -39.81802749633789, "loss": 0.659, "rewards/accuracies": 0.8125, "rewards/chosen": -0.010794862173497677, "rewards/margins": 0.07141149789094925, "rewards/rejected": -0.0822063684463501, "step": 1390 }, { "epoch": 3.64, "learning_rate": 1.2482220564763669e-07, "logits/chosen": -1.401760458946228, "logits/rejected": -1.4005050659179688, "logps/chosen": -32.612770080566406, "logps/rejected": -40.44251251220703, "loss": 0.6597, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.013162161223590374, "rewards/margins": 0.06981828063726425, "rewards/rejected": -0.0829804539680481, "step": 1400 }, { "epoch": 3.64, "eval_logits/chosen": -1.6204686164855957, "eval_logits/rejected": -1.6164851188659668, "eval_logps/chosen": -39.084259033203125, "eval_logps/rejected": -43.422786712646484, "eval_loss": 0.6892901659011841, "eval_rewards/accuracies": 0.5423588156700134, "eval_rewards/chosen": -0.050497058779001236, "eval_rewards/margins": 0.008564572781324387, "eval_rewards/rejected": -0.05906163901090622, "eval_runtime": 145.7418, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 1400 }, { "epoch": 3.66, "learning_rate": 1.0775199359171346e-07, "logits/chosen": -1.372238278388977, "logits/rejected": -1.3673722743988037, "logps/chosen": -35.539161682128906, "logps/rejected": -37.22252655029297, "loss": 0.6659, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.018966300413012505, "rewards/margins": 0.05771438404917717, "rewards/rejected": -0.07668069750070572, "step": 1410 }, { "epoch": 3.69, "learning_rate": 9.191080703056604e-08, "logits/chosen": -1.321447730064392, "logits/rejected": -1.3224408626556396, "logps/chosen": -34.87453079223633, "logps/rejected": -43.16680145263672, "loss": 0.6608, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.015096550807356834, "rewards/margins": 0.06733135879039764, "rewards/rejected": -0.08242791891098022, "step": 1420 }, { "epoch": 3.71, "learning_rate": 7.730678442730539e-08, "logits/chosen": -1.271436095237732, "logits/rejected": -1.265836477279663, "logps/chosen": -35.28139114379883, "logps/rejected": -47.02886199951172, "loss": 0.6503, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.01139620877802372, "rewards/margins": 0.09199832379817963, "rewards/rejected": -0.10339454561471939, "step": 1430 }, { "epoch": 3.74, "learning_rate": 6.394742864787806e-08, "logits/chosen": -1.285681962966919, "logits/rejected": -1.2799713611602783, "logps/chosen": -30.83676528930664, "logps/rejected": -40.77880096435547, "loss": 0.6574, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.015235202386975288, "rewards/margins": 0.07524626702070236, "rewards/rejected": -0.0904814749956131, "step": 1440 }, { "epoch": 3.77, "learning_rate": 5.183960310644748e-08, "logits/chosen": -1.337096929550171, "logits/rejected": -1.3268693685531616, "logps/chosen": -34.95880889892578, "logps/rejected": -44.96342086791992, "loss": 0.6587, "rewards/accuracies": 0.8125, "rewards/chosen": -0.02477274276316166, "rewards/margins": 0.07200786471366882, "rewards/rejected": -0.09678061306476593, "step": 1450 }, { "epoch": 3.79, "learning_rate": 4.098952823928693e-08, "logits/chosen": -1.2949423789978027, "logits/rejected": -1.2914998531341553, "logps/chosen": -35.32928466796875, "logps/rejected": -39.03660583496094, "loss": 0.664, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.02019507996737957, "rewards/margins": 0.06126277893781662, "rewards/rejected": -0.08145786076784134, "step": 1460 }, { "epoch": 3.82, "learning_rate": 3.1402778309014284e-08, "logits/chosen": -1.3512227535247803, "logits/rejected": -1.3558355569839478, "logps/chosen": -33.2025260925293, "logps/rejected": -41.91522979736328, "loss": 0.656, "rewards/accuracies": 0.8125, "rewards/chosen": -0.011333522386848927, "rewards/margins": 0.07819454371929169, "rewards/rejected": -0.08952806890010834, "step": 1470 }, { "epoch": 3.84, "learning_rate": 2.3084278540791427e-08, "logits/chosen": -1.3535398244857788, "logits/rejected": -1.3626043796539307, "logps/chosen": -32.97187042236328, "logps/rejected": -37.688446044921875, "loss": 0.6605, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.012766157276928425, "rewards/margins": 0.06783264130353928, "rewards/rejected": -0.08059880137443542, "step": 1480 }, { "epoch": 3.87, "learning_rate": 1.6038302591975807e-08, "logits/chosen": -1.2888884544372559, "logits/rejected": -1.2834962606430054, "logps/chosen": -35.54216003417969, "logps/rejected": -40.272682189941406, "loss": 0.6634, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.018856767565011978, "rewards/margins": 0.061765290796756744, "rewards/rejected": -0.08062206208705902, "step": 1490 }, { "epoch": 3.9, "learning_rate": 1.0268470356514237e-08, "logits/chosen": -1.3495625257492065, "logits/rejected": -1.346825122833252, "logps/chosen": -35.6667366027832, "logps/rejected": -43.17388153076172, "loss": 0.6557, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.020411662757396698, "rewards/margins": 0.0787249356508255, "rewards/rejected": -0.0991365909576416, "step": 1500 }, { "epoch": 3.9, "eval_logits/chosen": -1.6205651760101318, "eval_logits/rejected": -1.616579294204712, "eval_logps/chosen": -39.0870246887207, "eval_logps/rejected": -43.4185791015625, "eval_loss": 0.6893215179443359, "eval_rewards/accuracies": 0.5423588156700134, "eval_rewards/chosen": -0.05052470788359642, "eval_rewards/margins": 0.008494864217936993, "eval_rewards/rejected": -0.05901956930756569, "eval_runtime": 145.7021, "eval_samples_per_second": 2.354, "eval_steps_per_second": 0.295, "step": 1500 }, { "epoch": 3.92, "learning_rate": 5.777746105209147e-09, "logits/chosen": -1.4113116264343262, "logits/rejected": -1.411259651184082, "logps/chosen": -30.929424285888672, "logps/rejected": -41.88774871826172, "loss": 0.6548, "rewards/accuracies": 0.75, "rewards/chosen": -0.011131499893963337, "rewards/margins": 0.08034153282642365, "rewards/rejected": -0.09147302061319351, "step": 1510 }, { "epoch": 3.95, "learning_rate": 2.5684369628148352e-09, "logits/chosen": -1.276719331741333, "logits/rejected": -1.2754055261611938, "logps/chosen": -34.34500503540039, "logps/rejected": -42.410675048828125, "loss": 0.6587, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.013031329028308392, "rewards/margins": 0.07233807444572449, "rewards/rejected": -0.08536941558122635, "step": 1520 }, { "epoch": 3.97, "learning_rate": 6.421917227455999e-10, "logits/chosen": -1.4115439653396606, "logits/rejected": -1.4091360569000244, "logps/chosen": -33.317054748535156, "logps/rejected": -40.02583694458008, "loss": 0.6612, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.018429961055517197, "rewards/margins": 0.0669919028878212, "rewards/rejected": -0.0854218453168869, "step": 1530 }, { "epoch": 4.0, "learning_rate": 0.0, "logits/chosen": -1.398667573928833, "logits/rejected": -1.3992483615875244, "logps/chosen": -33.088409423828125, "logps/rejected": -36.9452018737793, "loss": 0.6707, "rewards/accuracies": 0.7791666388511658, "rewards/chosen": -0.024684693664312363, "rewards/margins": 0.0468655489385128, "rewards/rejected": -0.07155025750398636, "step": 1540 }, { "epoch": 4.0, "step": 1540, "total_flos": 0.0, "train_loss": 0.5396727961379212, "train_runtime": 10793.7948, "train_samples_per_second": 1.141, "train_steps_per_second": 0.143 } ], "logging_steps": 10, "max_steps": 1540, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }