{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.8665436506271362, "logits/rejected": -1.8708542585372925, "logps/chosen": -36.982460021972656, "logps/rejected": -33.656585693359375, "loss": 0.6882, "rewards/accuracies": 0.5138888955116272, "rewards/chosen": 0.0048276386223733425, "rewards/margins": 0.010130541399121284, "rewards/rejected": -0.005302901845425367, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9975639581680298, "logits/rejected": -2.000220537185669, "logps/chosen": -29.6324462890625, "logps/rejected": -29.057437896728516, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0019495869055390358, "rewards/margins": 0.0007479515625163913, "rewards/rejected": 0.0012016354594379663, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.9208921194076538, "logits/rejected": -1.918195128440857, "logps/chosen": -31.420969009399414, "logps/rejected": -33.22509765625, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.000974868715275079, "rewards/margins": 0.0003514128620736301, "rewards/rejected": -0.0013262818101793528, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.018131971359253, "logits/rejected": -2.0093750953674316, "logps/chosen": -32.56100082397461, "logps/rejected": -32.525699615478516, "loss": 0.6905, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0031610552687197924, "rewards/margins": 0.0057663023471832275, "rewards/rejected": -0.002605247776955366, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8627980947494507, "logits/rejected": -1.852043867111206, "logps/chosen": -33.57464599609375, "logps/rejected": -35.459083557128906, "loss": 0.6945, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.003433463629335165, "rewards/margins": -0.0020852875895798206, "rewards/rejected": -0.0013481763890013099, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9412128925323486, "logits/rejected": -1.9431606531143188, "logps/chosen": -32.51996612548828, "logps/rejected": -33.226768493652344, "loss": 0.6806, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01604365184903145, "rewards/margins": 0.02712194062769413, "rewards/rejected": -0.011078287847340107, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.072383165359497, "logits/rejected": -2.0773634910583496, "logps/chosen": -33.964813232421875, "logps/rejected": -36.62574005126953, "loss": 0.6861, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004735334776341915, "rewards/margins": 0.016109289601445198, "rewards/rejected": -0.011373954825103283, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9332096576690674, "logits/rejected": -1.9363422393798828, "logps/chosen": -34.310150146484375, "logps/rejected": -34.669891357421875, "loss": 0.6749, "rewards/accuracies": 0.625, "rewards/chosen": 0.023532329127192497, "rewards/margins": 0.03902902454137802, "rewards/rejected": -0.015496693551540375, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9401609897613525, "logits/rejected": -1.944685935974121, "logps/chosen": -32.39472961425781, "logps/rejected": -32.360809326171875, "loss": 0.6865, "rewards/accuracies": 0.625, "rewards/chosen": 0.01453828252851963, "rewards/margins": 0.014918209984898567, "rewards/rejected": -0.000379929319024086, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.0377113819122314, "logits/rejected": -2.035726308822632, "logps/chosen": -32.14560317993164, "logps/rejected": -31.297948837280273, "loss": 0.6813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01904645375907421, "rewards/margins": 0.0253940187394619, "rewards/rejected": -0.006347564049065113, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.231863498687744, "eval_logits/rejected": -2.227025270462036, "eval_logps/chosen": -34.02951431274414, "eval_logps/rejected": -37.534454345703125, "eval_loss": 0.6914929151535034, "eval_rewards/accuracies": 0.5220099687576294, "eval_rewards/chosen": 0.0010076783364638686, "eval_rewards/margins": 0.004574176389724016, "eval_rewards/rejected": -0.0035664979368448257, "eval_runtime": 145.7925, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.992661476135254, "logits/rejected": -1.9902756214141846, "logps/chosen": -33.13728713989258, "logps/rejected": -34.051490783691406, "loss": 0.6834, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.02130572497844696, "rewards/margins": 0.025729293003678322, "rewards/rejected": -0.004423565696924925, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.0037741661071777, "logits/rejected": -1.995439887046814, "logps/chosen": -32.33051681518555, "logps/rejected": -32.13713836669922, "loss": 0.6853, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.022878289222717285, "rewards/margins": 0.01800495944917202, "rewards/rejected": 0.004873327445238829, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.032257318496704, "logits/rejected": -2.024292230606079, "logps/chosen": -30.306774139404297, "logps/rejected": -32.081024169921875, "loss": 0.6754, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03378837928175926, "rewards/margins": 0.03939158096909523, "rewards/rejected": -0.00560319609940052, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9630043506622314, "logits/rejected": -1.973233938217163, "logps/chosen": -31.224035263061523, "logps/rejected": -32.56679916381836, "loss": 0.6718, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0391254760324955, "rewards/margins": 0.045508723706007004, "rewards/rejected": -0.006383246276527643, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.874066710472107, "logits/rejected": -1.8752658367156982, "logps/chosen": -33.90449905395508, "logps/rejected": -34.78742218017578, "loss": 0.6642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05763882398605347, "rewards/margins": 0.06367169320583344, "rewards/rejected": -0.006032869219779968, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.9251247644424438, "logits/rejected": -1.9217197895050049, "logps/chosen": -36.002262115478516, "logps/rejected": -32.70859909057617, "loss": 0.6788, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03388362005352974, "rewards/margins": 0.031053191050887108, "rewards/rejected": 0.0028304329607635736, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.025562286376953, "logits/rejected": -2.018228530883789, "logps/chosen": -33.46672439575195, "logps/rejected": -31.409042358398438, "loss": 0.657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06622789800167084, "rewards/margins": 0.07766623049974442, "rewards/rejected": -0.011438337154686451, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.0323550701141357, "logits/rejected": -2.037605047225952, "logps/chosen": -32.23218536376953, "logps/rejected": -32.439491271972656, "loss": 0.6682, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.06252055615186691, "rewards/margins": 0.05339093878865242, "rewards/rejected": 0.009129621088504791, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.0334537029266357, "logits/rejected": -2.030702590942383, "logps/chosen": -31.266775131225586, "logps/rejected": -31.347997665405273, "loss": 0.67, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.04443611577153206, "rewards/margins": 0.050393205136060715, "rewards/rejected": -0.005957084707915783, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.9037307500839233, "logits/rejected": -1.9083878993988037, "logps/chosen": -31.30868148803711, "logps/rejected": -32.823123931884766, "loss": 0.6632, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05951983854174614, "rewards/margins": 0.0659579336643219, "rewards/rejected": -0.006438094191253185, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.22880482673645, "eval_logits/rejected": -2.2239701747894287, "eval_logps/chosen": -34.05141067504883, "eval_logps/rejected": -37.58522033691406, "eval_loss": 0.6888246536254883, "eval_rewards/accuracies": 0.5660299062728882, "eval_rewards/chosen": -0.003371814964339137, "eval_rewards/margins": 0.01034836657345295, "eval_rewards/rejected": -0.013720180839300156, "eval_runtime": 145.665, "eval_samples_per_second": 2.355, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.016369342803955, "logits/rejected": -2.027027130126953, "logps/chosen": -31.75009536743164, "logps/rejected": -33.965911865234375, "loss": 0.6631, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04424174129962921, "rewards/margins": 0.06488426774740219, "rewards/rejected": -0.020642530173063278, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.909157156944275, "logits/rejected": -1.9239362478256226, "logps/chosen": -29.8031005859375, "logps/rejected": -31.647253036499023, "loss": 0.6564, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.06168792396783829, "rewards/margins": 0.07950346171855927, "rewards/rejected": -0.017815548926591873, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9656283855438232, "logits/rejected": -1.9696086645126343, "logps/chosen": -33.09147644042969, "logps/rejected": -31.6539306640625, "loss": 0.6543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06519898772239685, "rewards/margins": 0.08612470328807831, "rewards/rejected": -0.02092570997774601, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.963721513748169, "logits/rejected": -1.9419059753417969, "logps/chosen": -33.84326934814453, "logps/rejected": -35.13850784301758, "loss": 0.6479, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.06209472939372063, "rewards/margins": 0.0996372401714325, "rewards/rejected": -0.03754251450300217, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.004868268966675, "logits/rejected": -2.0015549659729004, "logps/chosen": -32.729896545410156, "logps/rejected": -36.27812957763672, "loss": 0.6706, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.03903444483876228, "rewards/margins": 0.04967557638883591, "rewards/rejected": -0.010641133412718773, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8725497722625732, "logits/rejected": -1.8701589107513428, "logps/chosen": -33.99323654174805, "logps/rejected": -35.557159423828125, "loss": 0.6706, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.039031852036714554, "rewards/margins": 0.04953894019126892, "rewards/rejected": -0.010507088154554367, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8571383953094482, "logits/rejected": -1.854741096496582, "logps/chosen": -34.20746612548828, "logps/rejected": -31.83416748046875, "loss": 0.6708, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0380266010761261, "rewards/margins": 0.05059366300702095, "rewards/rejected": -0.01256705541163683, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.9600715637207031, "logits/rejected": -1.9495933055877686, "logps/chosen": -34.99934387207031, "logps/rejected": -31.900096893310547, "loss": 0.6544, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.07145937532186508, "rewards/margins": 0.08328022807836533, "rewards/rejected": -0.011820845305919647, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.0559241771698, "logits/rejected": -2.04101300239563, "logps/chosen": -30.744888305664062, "logps/rejected": -32.62079620361328, "loss": 0.6809, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.035847391933202744, "rewards/margins": 0.029639026150107384, "rewards/rejected": 0.006208371836692095, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.9259971380233765, "logits/rejected": -1.9234883785247803, "logps/chosen": -32.35750961303711, "logps/rejected": -30.92331314086914, "loss": 0.6327, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.11392021179199219, "rewards/margins": 0.13522081077098846, "rewards/rejected": -0.021300604566931725, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2262163162231445, "eval_logits/rejected": -2.2213830947875977, "eval_logps/chosen": -34.093963623046875, "eval_logps/rejected": -37.60972213745117, "eval_loss": 0.6908519268035889, "eval_rewards/accuracies": 0.49501657485961914, "eval_rewards/chosen": -0.011881927028298378, "eval_rewards/margins": 0.006738076452165842, "eval_rewards/rejected": -0.018620004877448082, "eval_runtime": 145.4585, "eval_samples_per_second": 2.358, "eval_steps_per_second": 0.296, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589035e-07, "logits/chosen": -1.9112951755523682, "logits/rejected": -1.9080655574798584, "logps/chosen": -31.314626693725586, "logps/rejected": -33.85969543457031, "loss": 0.6566, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.05774437263607979, "rewards/margins": 0.08125325292348862, "rewards/rejected": -0.023508887737989426, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380913e-07, "logits/chosen": -1.960738182067871, "logits/rejected": -1.9485470056533813, "logps/chosen": -34.32622528076172, "logps/rejected": -33.65083312988281, "loss": 0.6586, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0539703294634819, "rewards/margins": 0.0763682946562767, "rewards/rejected": -0.022397976368665695, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-07, "logits/chosen": -1.9961421489715576, "logits/rejected": -1.9947240352630615, "logps/chosen": -33.134159088134766, "logps/rejected": -32.56267547607422, "loss": 0.6536, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.07032694667577744, "rewards/margins": 0.08694546669721603, "rewards/rejected": -0.0166185162961483, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -2.083061933517456, "logits/rejected": -2.0673813819885254, "logps/chosen": -33.78687286376953, "logps/rejected": -33.12049102783203, "loss": 0.6568, "rewards/accuracies": 0.75, "rewards/chosen": 0.07850398868322372, "rewards/margins": 0.07899539172649384, "rewards/rejected": -0.0004914018791168928, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -1.9557991027832031, "logits/rejected": -1.9549639225006104, "logps/chosen": -32.84474563598633, "logps/rejected": -32.53619384765625, "loss": 0.648, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0874704122543335, "rewards/margins": 0.10167930275201797, "rewards/rejected": -0.014208881184458733, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-08, "logits/chosen": -1.9108169078826904, "logits/rejected": -1.9210844039916992, "logps/chosen": -31.8836612701416, "logps/rejected": -35.36156463623047, "loss": 0.6572, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06420587003231049, "rewards/margins": 0.07876714318990707, "rewards/rejected": -0.014561265707015991, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.0503978729248047, "logits/rejected": -2.043942928314209, "logps/chosen": -33.33507537841797, "logps/rejected": -29.28571128845215, "loss": 0.6582, "rewards/accuracies": 0.75, "rewards/chosen": 0.06337906420230865, "rewards/margins": 0.07523626834154129, "rewards/rejected": -0.011857211589813232, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-09, "logits/chosen": -1.9104009866714478, "logits/rejected": -1.9126161336898804, "logps/chosen": -33.88220977783203, "logps/rejected": -30.987438201904297, "loss": 0.65, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.07136790454387665, "rewards/margins": 0.09433047473430634, "rewards/rejected": -0.02296256460249424, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.6689951481757226, "train_runtime": 3249.3574, "train_samples_per_second": 0.948, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }