{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.3046875, "learning_rate": 1.282051282051282e-07, "logits/chosen": 88.18099975585938, "logits/rejected": 88.25153350830078, "logps/chosen": -29.073104858398438, "logps/rejected": -26.25731658935547, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 1.09375, "learning_rate": 1.282051282051282e-06, "logits/chosen": 81.09645080566406, "logits/rejected": 80.80389404296875, "logps/chosen": -34.27156066894531, "logps/rejected": -33.039093017578125, "loss": 0.9995, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": -0.00029834467568434775, "rewards/margins": 0.0005084889708086848, "rewards/rejected": -0.0008068337920121849, "step": 10 }, { "epoch": 0.05, "grad_norm": 1.2734375, "learning_rate": 2.564102564102564e-06, "logits/chosen": 80.66552734375, "logits/rejected": 80.5560073852539, "logps/chosen": -33.4774055480957, "logps/rejected": -30.691213607788086, "loss": 0.9987, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0021250424906611443, "rewards/margins": 0.0012622694484889507, "rewards/rejected": 0.000862772751133889, "step": 20 }, { "epoch": 0.08, "grad_norm": 1.2109375, "learning_rate": 3.846153846153847e-06, "logits/chosen": 82.51115417480469, "logits/rejected": 82.54508972167969, "logps/chosen": -33.80036926269531, "logps/rejected": -31.189748764038086, "loss": 1.0002, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.002124499063938856, "rewards/margins": -0.00021631647541653365, "rewards/rejected": 0.0023408152628690004, "step": 30 }, { "epoch": 0.1, "grad_norm": 1.1796875, "learning_rate": 4.999896948438434e-06, "logits/chosen": 81.10090637207031, "logits/rejected": 81.09576416015625, "logps/chosen": -32.7674560546875, "logps/rejected": -33.11550521850586, "loss": 0.9985, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.0040660640224814415, "rewards/margins": 0.0015358638484030962, "rewards/rejected": 0.0025301999412477016, "step": 40 }, { "epoch": 0.13, "grad_norm": 1.1875, "learning_rate": 4.987541037542187e-06, "logits/chosen": 78.85154724121094, "logits/rejected": 78.85734558105469, "logps/chosen": -30.360393524169922, "logps/rejected": -30.609283447265625, "loss": 0.9962, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.008448543958365917, "rewards/margins": 0.0038085163105279207, "rewards/rejected": 0.0046400283463299274, "step": 50 }, { "epoch": 0.16, "grad_norm": 0.96484375, "learning_rate": 4.954691471941119e-06, "logits/chosen": 83.49021911621094, "logits/rejected": 83.54866027832031, "logps/chosen": -30.763973236083984, "logps/rejected": -29.17538833618164, "loss": 1.0, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.004111888352781534, "rewards/margins": -3.9446913433494046e-05, "rewards/rejected": 0.004151335451751947, "step": 60 }, { "epoch": 0.18, "grad_norm": 1.2109375, "learning_rate": 4.901618883413549e-06, "logits/chosen": 84.11228942871094, "logits/rejected": 84.1441650390625, "logps/chosen": -30.222454071044922, "logps/rejected": -32.666595458984375, "loss": 0.9996, "rewards/accuracies": 0.5, "rewards/chosen": 0.0040657008066773415, "rewards/margins": 0.0003545849467627704, "rewards/rejected": 0.003711115103214979, "step": 70 }, { "epoch": 0.21, "grad_norm": 1.2421875, "learning_rate": 4.828760511501322e-06, "logits/chosen": 81.86946868896484, "logits/rejected": 81.84814453125, "logps/chosen": -30.959096908569336, "logps/rejected": -30.652545928955078, "loss": 0.9956, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.005458436906337738, "rewards/margins": 0.0044073979370296, "rewards/rejected": 0.001051038852892816, "step": 80 }, { "epoch": 0.23, "grad_norm": 1.484375, "learning_rate": 4.7367166013034295e-06, "logits/chosen": 78.68418884277344, "logits/rejected": 78.65721893310547, "logps/chosen": -32.17829513549805, "logps/rejected": -30.884775161743164, "loss": 0.9961, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.00459885410964489, "rewards/margins": 0.00393189862370491, "rewards/rejected": 0.0006669552531093359, "step": 90 }, { "epoch": 0.26, "grad_norm": 1.2421875, "learning_rate": 4.626245458345211e-06, "logits/chosen": 83.70716857910156, "logits/rejected": 83.73751068115234, "logps/chosen": -33.73701477050781, "logps/rejected": -31.63702964782715, "loss": 0.9955, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.005679761990904808, "rewards/margins": 0.004544637631624937, "rewards/rejected": 0.0011351245921105146, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": 98.73394012451172, "eval_logits/rejected": 98.7273941040039, "eval_logps/chosen": -32.38990783691406, "eval_logps/rejected": -35.92463684082031, "eval_loss": 0.999876856803894, "eval_rewards/accuracies": 0.5186877250671387, "eval_rewards/chosen": 0.0005326389218680561, "eval_rewards/margins": 0.00011375291069271043, "eval_rewards/rejected": 0.00041888616397045553, "eval_runtime": 104.2424, "eval_samples_per_second": 3.29, "eval_steps_per_second": 0.413, "step": 100 }, { "epoch": 0.29, "grad_norm": 1.4140625, "learning_rate": 4.498257201263691e-06, "logits/chosen": 83.85816955566406, "logits/rejected": 83.75128936767578, "logps/chosen": -32.19211959838867, "logps/rejected": -32.65901565551758, "loss": 0.9917, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.008306830190122128, "rewards/margins": 0.008338114246726036, "rewards/rejected": -3.12842421408277e-05, "step": 110 }, { "epoch": 0.31, "grad_norm": 1.3046875, "learning_rate": 4.353806263777678e-06, "logits/chosen": 83.82106018066406, "logits/rejected": 83.92265319824219, "logps/chosen": -28.150625228881836, "logps/rejected": -35.3939208984375, "loss": 0.9929, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.007789556868374348, "rewards/margins": 0.0071373311802744865, "rewards/rejected": 0.000652224407531321, "step": 120 }, { "epoch": 0.34, "grad_norm": 0.9453125, "learning_rate": 4.1940827077152755e-06, "logits/chosen": 80.96563720703125, "logits/rejected": 80.99563598632812, "logps/chosen": -30.216140747070312, "logps/rejected": -31.844036102294922, "loss": 0.994, "rewards/accuracies": 0.625, "rewards/chosen": 0.006912143435329199, "rewards/margins": 0.006036223843693733, "rewards/rejected": 0.0008759202319197357, "step": 130 }, { "epoch": 0.36, "grad_norm": 1.1171875, "learning_rate": 4.0204024186666215e-06, "logits/chosen": 81.863525390625, "logits/rejected": 81.86921691894531, "logps/chosen": -26.845142364501953, "logps/rejected": -33.07027816772461, "loss": 0.9875, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.006021805107593536, "rewards/margins": 0.012490840628743172, "rewards/rejected": -0.006469034589827061, "step": 140 }, { "epoch": 0.39, "grad_norm": 1.21875, "learning_rate": 3.834196265035119e-06, "logits/chosen": 80.13746643066406, "logits/rejected": 80.10902404785156, "logps/chosen": -28.976547241210938, "logps/rejected": -33.208518981933594, "loss": 0.9895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.004125660751014948, "rewards/margins": 0.010545835830271244, "rewards/rejected": -0.006420175079256296, "step": 150 }, { "epoch": 0.42, "grad_norm": 1.3515625, "learning_rate": 3.636998309800573e-06, "logits/chosen": 81.72142028808594, "logits/rejected": 81.74298858642578, "logps/chosen": -33.8978157043457, "logps/rejected": -30.907711029052734, "loss": 0.9881, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.002389371395111084, "rewards/margins": 0.011941083706915379, "rewards/rejected": -0.009551710449159145, "step": 160 }, { "epoch": 0.44, "grad_norm": 1.4765625, "learning_rate": 3.4304331721118078e-06, "logits/chosen": 82.35487365722656, "logits/rejected": 82.30474090576172, "logps/chosen": -30.870525360107422, "logps/rejected": -33.04078674316406, "loss": 0.9859, "rewards/accuracies": 0.6875, "rewards/chosen": 0.003887615632265806, "rewards/margins": 0.014094889163970947, "rewards/rejected": -0.010207273997366428, "step": 170 }, { "epoch": 0.47, "grad_norm": 1.21875, "learning_rate": 3.2162026428305436e-06, "logits/chosen": 79.4852066040039, "logits/rejected": 79.46187591552734, "logps/chosen": -31.02083396911621, "logps/rejected": -32.165191650390625, "loss": 0.9882, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0017631975933909416, "rewards/margins": 0.011843027547001839, "rewards/rejected": -0.010079829022288322, "step": 180 }, { "epoch": 0.49, "grad_norm": 0.9609375, "learning_rate": 2.996071664294641e-06, "logits/chosen": 80.97419738769531, "logits/rejected": 80.94820404052734, "logps/chosen": -30.60634994506836, "logps/rejected": -31.083566665649414, "loss": 0.9922, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0016006485093384981, "rewards/margins": 0.007756076753139496, "rewards/rejected": -0.006155428942292929, "step": 190 }, { "epoch": 0.52, "grad_norm": 1.453125, "learning_rate": 2.7718537898066833e-06, "logits/chosen": 76.12115478515625, "logits/rejected": 76.07009887695312, "logps/chosen": -34.18424606323242, "logps/rejected": -33.341392517089844, "loss": 0.9858, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005684881471097469, "rewards/margins": 0.014164777472615242, "rewards/rejected": -0.008479896001517773, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": 98.05126190185547, "eval_logits/rejected": 98.02639770507812, "eval_logps/chosen": -32.97175979614258, "eval_logps/rejected": -36.845333099365234, "eval_loss": 0.9964954853057861, "eval_rewards/accuracies": 0.5274086594581604, "eval_rewards/chosen": -0.005285844672471285, "eval_rewards/margins": 0.0035022026859223843, "eval_rewards/rejected": -0.008788047358393669, "eval_runtime": 104.1082, "eval_samples_per_second": 3.295, "eval_steps_per_second": 0.413, "step": 200 }, { "epoch": 0.55, "grad_norm": 1.84375, "learning_rate": 2.5453962426402006e-06, "logits/chosen": 78.64119720458984, "logits/rejected": 78.55430603027344, "logps/chosen": -33.689414978027344, "logps/rejected": -36.20193862915039, "loss": 0.9853, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.002563029993325472, "rewards/margins": 0.014721485786139965, "rewards/rejected": -0.012158457189798355, "step": 210 }, { "epoch": 0.57, "grad_norm": 1.484375, "learning_rate": 2.3185646976551794e-06, "logits/chosen": 80.5840835571289, "logits/rejected": 80.67861938476562, "logps/chosen": -31.57720947265625, "logps/rejected": -31.91719627380371, "loss": 0.9844, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.003717451822012663, "rewards/margins": 0.015599893406033516, "rewards/rejected": -0.011882440187036991, "step": 220 }, { "epoch": 0.6, "grad_norm": 1.3984375, "learning_rate": 2.0932279108998323e-06, "logits/chosen": 77.53582763671875, "logits/rejected": 77.5838851928711, "logps/chosen": -32.72165298461914, "logps/rejected": -35.34224319458008, "loss": 0.9859, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0008409392321482301, "rewards/margins": 0.014141863211989403, "rewards/rejected": -0.013300922699272633, "step": 230 }, { "epoch": 0.62, "grad_norm": 1.671875, "learning_rate": 1.8712423238279358e-06, "logits/chosen": 79.64659118652344, "logits/rejected": 79.958984375, "logps/chosen": -31.332469940185547, "logps/rejected": -32.86049270629883, "loss": 0.9832, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.003555959090590477, "rewards/margins": 0.016840480268001556, "rewards/rejected": -0.01328451931476593, "step": 240 }, { "epoch": 0.65, "grad_norm": 1.484375, "learning_rate": 1.6544367689701824e-06, "logits/chosen": 78.12522888183594, "logits/rejected": 78.17500305175781, "logps/chosen": -27.822484970092773, "logps/rejected": -31.34881019592285, "loss": 0.9878, "rewards/accuracies": 0.625, "rewards/chosen": -0.002265883143991232, "rewards/margins": 0.012214846909046173, "rewards/rejected": -0.014480730518698692, "step": 250 }, { "epoch": 0.68, "grad_norm": 1.6171875, "learning_rate": 1.4445974030621963e-06, "logits/chosen": 75.08821105957031, "logits/rejected": 75.22389221191406, "logps/chosen": -31.112863540649414, "logps/rejected": -38.40215301513672, "loss": 0.9725, "rewards/accuracies": 0.75, "rewards/chosen": 0.004481295123696327, "rewards/margins": 0.027486557140946388, "rewards/rejected": -0.023005260154604912, "step": 260 }, { "epoch": 0.7, "grad_norm": 1.3515625, "learning_rate": 1.243452991757889e-06, "logits/chosen": 74.10564422607422, "logits/rejected": 74.13673400878906, "logps/chosen": -32.07135772705078, "logps/rejected": -33.231197357177734, "loss": 0.9857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.002924318192526698, "rewards/margins": 0.01433342695236206, "rewards/rejected": -0.017257746309041977, "step": 270 }, { "epoch": 0.73, "grad_norm": 2.03125, "learning_rate": 1.0526606671603523e-06, "logits/chosen": 77.0625991821289, "logits/rejected": 76.84493255615234, "logps/chosen": -32.438629150390625, "logps/rejected": -31.16558265686035, "loss": 0.9913, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.006287367548793554, "rewards/margins": 0.008698503486812115, "rewards/rejected": -0.014985869638621807, "step": 280 }, { "epoch": 0.75, "grad_norm": 1.34375, "learning_rate": 8.737922755071455e-07, "logits/chosen": 77.05482482910156, "logits/rejected": 76.97974395751953, "logps/chosen": -34.25292205810547, "logps/rejected": -34.649898529052734, "loss": 0.9752, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0014867703430354595, "rewards/margins": 0.02484356239438057, "rewards/rejected": -0.026330333203077316, "step": 290 }, { "epoch": 0.78, "grad_norm": 1.4921875, "learning_rate": 7.08321427484816e-07, "logits/chosen": 72.6989974975586, "logits/rejected": 72.833984375, "logps/chosen": -33.264137268066406, "logps/rejected": -30.818592071533203, "loss": 0.9814, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0014156814431771636, "rewards/margins": 0.018581366166472435, "rewards/rejected": -0.017165686935186386, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": 97.67390441894531, "eval_logits/rejected": 97.64021301269531, "eval_logps/chosen": -33.30087661743164, "eval_logps/rejected": -37.35591125488281, "eval_loss": 0.99467533826828, "eval_rewards/accuracies": 0.5888704061508179, "eval_rewards/chosen": -0.008576988242566586, "eval_rewards/margins": 0.00531682837754488, "eval_rewards/rejected": -0.01389381755143404, "eval_runtime": 103.9426, "eval_samples_per_second": 3.3, "eval_steps_per_second": 0.414, "step": 300 }, { "epoch": 0.81, "grad_norm": 1.609375, "learning_rate": 5.576113578589035e-07, "logits/chosen": 80.20366668701172, "logits/rejected": 80.20387268066406, "logps/chosen": -30.933481216430664, "logps/rejected": -34.256614685058594, "loss": 0.9801, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0006994610885158181, "rewards/margins": 0.019869104027748108, "rewards/rejected": -0.020568564534187317, "step": 310 }, { "epoch": 0.83, "grad_norm": 1.5390625, "learning_rate": 4.229036944380913e-07, "logits/chosen": 77.26924133300781, "logits/rejected": 77.28764343261719, "logps/chosen": -31.467296600341797, "logps/rejected": -30.592571258544922, "loss": 0.982, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0013126448029652238, "rewards/margins": 0.01804344728589058, "rewards/rejected": -0.01673080213367939, "step": 320 }, { "epoch": 0.86, "grad_norm": 1.875, "learning_rate": 3.053082288996112e-07, "logits/chosen": 74.20513916015625, "logits/rejected": 74.25221252441406, "logps/chosen": -29.909320831298828, "logps/rejected": -34.725521087646484, "loss": 0.9747, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.004432853776961565, "rewards/margins": 0.025339430198073387, "rewards/rejected": -0.02090657688677311, "step": 330 }, { "epoch": 0.88, "grad_norm": 1.7578125, "learning_rate": 2.0579377374915805e-07, "logits/chosen": 78.9554672241211, "logits/rejected": 78.990478515625, "logps/chosen": -33.333351135253906, "logps/rejected": -35.866722106933594, "loss": 0.9762, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.0012120162136852741, "rewards/margins": 0.023768287152051926, "rewards/rejected": -0.024980302900075912, "step": 340 }, { "epoch": 0.91, "grad_norm": 1.5859375, "learning_rate": 1.2518018074041684e-07, "logits/chosen": 77.8236083984375, "logits/rejected": 77.84061431884766, "logps/chosen": -33.3131103515625, "logps/rejected": -35.05299758911133, "loss": 0.9771, "rewards/accuracies": 0.75, "rewards/chosen": 0.0033062633592635393, "rewards/margins": 0.0229182131588459, "rewards/rejected": -0.01961195096373558, "step": 350 }, { "epoch": 0.94, "grad_norm": 1.5234375, "learning_rate": 6.41315865106129e-08, "logits/chosen": 79.53218078613281, "logits/rejected": 79.56050109863281, "logps/chosen": -29.001379013061523, "logps/rejected": -33.16984176635742, "loss": 0.9793, "rewards/accuracies": 0.6875, "rewards/chosen": 0.005508318077772856, "rewards/margins": 0.020708225667476654, "rewards/rejected": -0.015199905261397362, "step": 360 }, { "epoch": 0.96, "grad_norm": 1.7890625, "learning_rate": 2.3150941078050325e-08, "logits/chosen": 78.969970703125, "logits/rejected": 78.9748306274414, "logps/chosen": -33.2999267578125, "logps/rejected": -37.32087326049805, "loss": 0.9821, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.004237635992467403, "rewards/margins": 0.017877381294965744, "rewards/rejected": -0.022115018218755722, "step": 370 }, { "epoch": 0.99, "grad_norm": 1.359375, "learning_rate": 2.575864278703266e-09, "logits/chosen": 72.49492645263672, "logits/rejected": 72.36249542236328, "logps/chosen": -30.828378677368164, "logps/rejected": -29.885875701904297, "loss": 0.9853, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0025225188583135605, "rewards/margins": 0.014730495400726795, "rewards/rejected": -0.01725301705300808, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.9878765378679548, "train_runtime": 2559.7639, "train_samples_per_second": 1.203, "train_steps_per_second": 0.15 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }