{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 26.125, "learning_rate": 1.282051282051282e-07, "logits/chosen": 88.18099975585938, "logits/rejected": 88.25153350830078, "logps/chosen": -29.073104858398438, "logps/rejected": -26.25731658935547, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 20.75, "learning_rate": 1.282051282051282e-06, "logits/chosen": 81.08618927001953, "logits/rejected": 80.79019165039062, "logps/chosen": -34.27778625488281, "logps/rejected": -33.089111328125, "loss": 0.6843, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": -0.014423643238842487, "rewards/margins": 0.037857502698898315, "rewards/rejected": -0.05228114500641823, "step": 10 }, { "epoch": 0.05, "grad_norm": 23.5, "learning_rate": 2.564102564102564e-06, "logits/chosen": 80.65704345703125, "logits/rejected": 80.54637908935547, "logps/chosen": -33.64678192138672, "logps/rejected": -30.826171875, "loss": 0.6907, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.017251869663596153, "rewards/margins": 0.03672502189874649, "rewards/rejected": -0.019473150372505188, "step": 20 }, { "epoch": 0.08, "grad_norm": 23.375, "learning_rate": 3.846153846153847e-06, "logits/chosen": 82.53121185302734, "logits/rejected": 82.56344604492188, "logps/chosen": -33.85637664794922, "logps/rejected": -31.18195152282715, "loss": 0.7305, "rewards/accuracies": 0.4375, "rewards/chosen": 0.06257696449756622, "rewards/margins": -0.03417497128248215, "rewards/rejected": 0.09675192832946777, "step": 30 }, { "epoch": 0.1, "grad_norm": 21.625, "learning_rate": 4.999896948438434e-06, "logits/chosen": 81.05611419677734, "logits/rejected": 81.05009460449219, "logps/chosen": -32.67264175415039, "logps/rejected": -33.195556640625, "loss": 0.6623, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.20056962966918945, "rewards/margins": 0.13138213753700256, "rewards/rejected": 0.0691874772310257, "step": 40 }, { "epoch": 0.13, "grad_norm": 16.5, "learning_rate": 4.987541037542187e-06, "logits/chosen": 78.71464538574219, "logits/rejected": 78.72676086425781, "logps/chosen": -30.6053466796875, "logps/rejected": -30.86154556274414, "loss": 0.6674, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.23996099829673767, "rewards/margins": 0.15526394546031952, "rewards/rejected": 0.08469705283641815, "step": 50 }, { "epoch": 0.16, "grad_norm": 20.375, "learning_rate": 4.954691471941119e-06, "logits/chosen": 83.24500274658203, "logits/rejected": 83.30210876464844, "logps/chosen": -30.880474090576172, "logps/rejected": -29.403039932250977, "loss": 0.7121, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.11787523329257965, "rewards/margins": 0.04288250952959061, "rewards/rejected": 0.07499273121356964, "step": 60 }, { "epoch": 0.18, "grad_norm": 30.75, "learning_rate": 4.901618883413549e-06, "logits/chosen": 83.83995056152344, "logits/rejected": 83.8766098022461, "logps/chosen": -30.424224853515625, "logps/rejected": -32.918212890625, "loss": 0.704, "rewards/accuracies": 0.5625, "rewards/chosen": 0.08192038536071777, "rewards/margins": 0.03412293642759323, "rewards/rejected": 0.04779745265841484, "step": 70 }, { "epoch": 0.21, "grad_norm": 20.5, "learning_rate": 4.828760511501322e-06, "logits/chosen": 81.50079345703125, "logits/rejected": 81.48374938964844, "logps/chosen": -31.288936614990234, "logps/rejected": -30.928760528564453, "loss": 0.6548, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.08640192449092865, "rewards/margins": 0.15484753251075745, "rewards/rejected": -0.0684456080198288, "step": 80 }, { "epoch": 0.23, "grad_norm": 27.375, "learning_rate": 4.7367166013034295e-06, "logits/chosen": 78.15573120117188, "logits/rejected": 78.12377166748047, "logps/chosen": -32.45014190673828, "logps/rejected": -31.189159393310547, "loss": 0.6587, "rewards/accuracies": 0.5625, "rewards/chosen": 0.07521401345729828, "rewards/margins": 0.17028900980949402, "rewards/rejected": -0.09507499635219574, "step": 90 }, { "epoch": 0.26, "grad_norm": 21.25, "learning_rate": 4.626245458345211e-06, "logits/chosen": 83.35581970214844, "logits/rejected": 83.3793716430664, "logps/chosen": -33.93380355834961, "logps/rejected": -31.7375431060791, "loss": 0.6649, "rewards/accuracies": 0.625, "rewards/chosen": 0.14847493171691895, "rewards/margins": 0.14327403903007507, "rewards/rejected": 0.005200871266424656, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": 98.69609832763672, "eval_logits/rejected": 98.68218231201172, "eval_logps/chosen": -32.47989273071289, "eval_logps/rejected": -36.04493713378906, "eval_loss": 0.7133548259735107, "eval_rewards/accuracies": 0.5220099687576294, "eval_rewards/chosen": -0.01468663476407528, "eval_rewards/margins": 0.01667696051299572, "eval_rewards/rejected": -0.0313635915517807, "eval_runtime": 104.2778, "eval_samples_per_second": 3.289, "eval_steps_per_second": 0.412, "step": 100 }, { "epoch": 0.29, "grad_norm": 27.375, "learning_rate": 4.498257201263691e-06, "logits/chosen": 83.54657745361328, "logits/rejected": 83.43326568603516, "logps/chosen": -32.3030891418457, "logps/rejected": -32.73809051513672, "loss": 0.6007, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2878861129283905, "rewards/margins": 0.3207677900791168, "rewards/rejected": -0.03288168087601662, "step": 110 }, { "epoch": 0.31, "grad_norm": 30.625, "learning_rate": 4.353806263777678e-06, "logits/chosen": 83.636474609375, "logits/rejected": 83.75120544433594, "logps/chosen": -28.302623748779297, "logps/rejected": -35.44631576538086, "loss": 0.6186, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2507820725440979, "rewards/margins": 0.2456505298614502, "rewards/rejected": 0.005131528712809086, "step": 120 }, { "epoch": 0.34, "grad_norm": 19.375, "learning_rate": 4.1940827077152755e-06, "logits/chosen": 80.81904602050781, "logits/rejected": 80.84658813476562, "logps/chosen": -30.323780059814453, "logps/rejected": -32.11851119995117, "loss": 0.6101, "rewards/accuracies": 0.625, "rewards/chosen": 0.2334306687116623, "rewards/margins": 0.3081851899623871, "rewards/rejected": -0.07475452125072479, "step": 130 }, { "epoch": 0.36, "grad_norm": 16.375, "learning_rate": 4.0204024186666215e-06, "logits/chosen": 81.91621398925781, "logits/rejected": 81.9150161743164, "logps/chosen": -26.948715209960938, "logps/rejected": -33.053627014160156, "loss": 0.5561, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.19944298267364502, "rewards/margins": 0.45154237747192383, "rewards/rejected": -0.2520993649959564, "step": 140 }, { "epoch": 0.39, "grad_norm": 17.75, "learning_rate": 3.834196265035119e-06, "logits/chosen": 80.3634262084961, "logits/rejected": 80.32890319824219, "logps/chosen": -28.899206161499023, "logps/rejected": -33.3048095703125, "loss": 0.5375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1959628015756607, "rewards/margins": 0.49128589034080505, "rewards/rejected": -0.29532313346862793, "step": 150 }, { "epoch": 0.42, "grad_norm": 29.625, "learning_rate": 3.636998309800573e-06, "logits/chosen": 82.13862609863281, "logits/rejected": 82.1554183959961, "logps/chosen": -33.77667236328125, "logps/rejected": -30.610042572021484, "loss": 0.6209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14403103291988373, "rewards/margins": 0.4070332646369934, "rewards/rejected": -0.2630022168159485, "step": 160 }, { "epoch": 0.44, "grad_norm": 21.375, "learning_rate": 3.4304331721118078e-06, "logits/chosen": 82.94571685791016, "logits/rejected": 82.90376281738281, "logps/chosen": -30.815759658813477, "logps/rejected": -32.71514129638672, "loss": 0.5622, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.17741146683692932, "rewards/margins": 0.45544299483299255, "rewards/rejected": -0.2780315577983856, "step": 170 }, { "epoch": 0.47, "grad_norm": 17.25, "learning_rate": 3.2162026428305436e-06, "logits/chosen": 80.42796325683594, "logits/rejected": 80.40777587890625, "logps/chosen": -30.608139038085938, "logps/rejected": -31.78671646118164, "loss": 0.5453, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23560428619384766, "rewards/margins": 0.48740649223327637, "rewards/rejected": -0.25180214643478394, "step": 180 }, { "epoch": 0.49, "grad_norm": 10.5, "learning_rate": 2.996071664294641e-06, "logits/chosen": 82.06446838378906, "logits/rejected": 82.06089782714844, "logps/chosen": -30.304092407226562, "logps/rejected": -30.887014389038086, "loss": 0.6087, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1849292516708374, "rewards/margins": 0.3525257706642151, "rewards/rejected": -0.16759653389453888, "step": 190 }, { "epoch": 0.52, "grad_norm": 14.75, "learning_rate": 2.7718537898066833e-06, "logits/chosen": 77.5021743774414, "logits/rejected": 77.44898223876953, "logps/chosen": -33.84641647338867, "logps/rejected": -32.781463623046875, "loss": 0.6015, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.36252856254577637, "rewards/margins": 0.477752149105072, "rewards/rejected": -0.11522357165813446, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": 98.46090698242188, "eval_logits/rejected": 98.43693542480469, "eval_logps/chosen": -32.81344985961914, "eval_logps/rejected": -36.51091384887695, "eval_loss": 0.6983966827392578, "eval_rewards/accuracies": 0.5402824282646179, "eval_rewards/chosen": -0.14810949563980103, "eval_rewards/margins": 0.0696462020277977, "eval_rewards/rejected": -0.21775569021701813, "eval_runtime": 104.0141, "eval_samples_per_second": 3.298, "eval_steps_per_second": 0.413, "step": 200 }, { "epoch": 0.55, "grad_norm": 30.375, "learning_rate": 2.5453962426402006e-06, "logits/chosen": 80.12822723388672, "logits/rejected": 80.03518676757812, "logps/chosen": -33.25244903564453, "logps/rejected": -35.43198013305664, "loss": 0.5658, "rewards/accuracies": 0.75, "rewards/chosen": 0.27730754017829895, "rewards/margins": 0.4556616246700287, "rewards/rejected": -0.17835409939289093, "step": 210 }, { "epoch": 0.57, "grad_norm": 17.625, "learning_rate": 2.3185646976551794e-06, "logits/chosen": 82.24729919433594, "logits/rejected": 82.3295669555664, "logps/chosen": -31.125244140625, "logps/rejected": -31.354183197021484, "loss": 0.5216, "rewards/accuracies": 0.75, "rewards/chosen": 0.3294834792613983, "rewards/margins": 0.57957524061203, "rewards/rejected": -0.2500917315483093, "step": 220 }, { "epoch": 0.6, "grad_norm": 21.125, "learning_rate": 2.0932279108998323e-06, "logits/chosen": 79.34232330322266, "logits/rejected": 79.39698791503906, "logps/chosen": -32.36313247680664, "logps/rejected": -34.425838470458984, "loss": 0.618, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.17704486846923828, "rewards/margins": 0.34251970052719116, "rewards/rejected": -0.16547484695911407, "step": 230 }, { "epoch": 0.62, "grad_norm": 22.5, "learning_rate": 1.8712423238279358e-06, "logits/chosen": 81.70603942871094, "logits/rejected": 81.9966049194336, "logps/chosen": -30.78457260131836, "logps/rejected": -32.03525161743164, "loss": 0.5166, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3613971471786499, "rewards/margins": 0.5626804232597351, "rewards/rejected": -0.20128324627876282, "step": 240 }, { "epoch": 0.65, "grad_norm": 23.375, "learning_rate": 1.6544367689701824e-06, "logits/chosen": 80.38089752197266, "logits/rejected": 80.44111633300781, "logps/chosen": -26.974994659423828, "logps/rejected": -30.13335609436035, "loss": 0.6427, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.24836082756519318, "rewards/margins": 0.34140732884407043, "rewards/rejected": -0.09304650872945786, "step": 250 }, { "epoch": 0.68, "grad_norm": 20.125, "learning_rate": 1.4445974030621963e-06, "logits/chosen": 77.59573364257812, "logits/rejected": 77.74366760253906, "logps/chosen": -30.3758487701416, "logps/rejected": -36.788978576660156, "loss": 0.4885, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.4740586280822754, "rewards/margins": 0.749001145362854, "rewards/rejected": -0.2749424874782562, "step": 260 }, { "epoch": 0.7, "grad_norm": 14.875, "learning_rate": 1.243452991757889e-06, "logits/chosen": 76.95594787597656, "logits/rejected": 76.9817886352539, "logps/chosen": -30.940093994140625, "logps/rejected": -31.99422264099121, "loss": 0.5356, "rewards/accuracies": 0.75, "rewards/chosen": 0.3355324864387512, "rewards/margins": 0.5310525894165039, "rewards/rejected": -0.19552013278007507, "step": 270 }, { "epoch": 0.73, "grad_norm": 22.375, "learning_rate": 1.0526606671603523e-06, "logits/chosen": 79.73005676269531, "logits/rejected": 79.521484375, "logps/chosen": -31.06637954711914, "logps/rejected": -30.05121421813965, "loss": 0.5946, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2974059581756592, "rewards/margins": 0.4510935842990875, "rewards/rejected": -0.15368762612342834, "step": 280 }, { "epoch": 0.75, "grad_norm": 18.125, "learning_rate": 8.737922755071455e-07, "logits/chosen": 79.8319091796875, "logits/rejected": 79.74632263183594, "logps/chosen": -33.005733489990234, "logps/rejected": -32.86864471435547, "loss": 0.4772, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.43940457701683044, "rewards/margins": 0.780119001865387, "rewards/rejected": -0.3407144248485565, "step": 290 }, { "epoch": 0.78, "grad_norm": 23.375, "learning_rate": 7.08321427484816e-07, "logits/chosen": 75.52354431152344, "logits/rejected": 75.6214599609375, "logps/chosen": -32.215476989746094, "logps/rejected": -29.42368507385254, "loss": 0.5603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.47609180212020874, "rewards/margins": 0.6047566533088684, "rewards/rejected": -0.1286647617816925, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": 98.4832534790039, "eval_logits/rejected": 98.45703125, "eval_logps/chosen": -32.67014694213867, "eval_logps/rejected": -36.31406784057617, "eval_loss": 0.7083631753921509, "eval_rewards/accuracies": 0.5398671627044678, "eval_rewards/chosen": -0.09078802913427353, "eval_rewards/margins": 0.04822726547718048, "eval_rewards/rejected": -0.1390153020620346, "eval_runtime": 103.8446, "eval_samples_per_second": 3.303, "eval_steps_per_second": 0.414, "step": 300 }, { "epoch": 0.81, "grad_norm": 17.375, "learning_rate": 5.576113578589035e-07, "logits/chosen": 82.69928741455078, "logits/rejected": 82.7286376953125, "logps/chosen": -30.082874298095703, "logps/rejected": -32.789588928222656, "loss": 0.5344, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.3122648298740387, "rewards/margins": 0.5481952428817749, "rewards/rejected": -0.2359304428100586, "step": 310 }, { "epoch": 0.83, "grad_norm": 17.75, "learning_rate": 4.229036944380913e-07, "logits/chosen": 80.08171081542969, "logits/rejected": 80.08326721191406, "logps/chosen": -30.4410457611084, "logps/rejected": -29.53485107421875, "loss": 0.4679, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4630066454410553, "rewards/margins": 0.7091516852378845, "rewards/rejected": -0.24614505469799042, "step": 320 }, { "epoch": 0.86, "grad_norm": 14.6875, "learning_rate": 3.053082288996112e-07, "logits/chosen": 77.24263000488281, "logits/rejected": 77.28250122070312, "logps/chosen": -28.96456527709961, "logps/rejected": -33.16306686401367, "loss": 0.4741, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5552161931991577, "rewards/margins": 0.7664985656738281, "rewards/rejected": -0.21128229796886444, "step": 330 }, { "epoch": 0.88, "grad_norm": 29.875, "learning_rate": 2.0579377374915805e-07, "logits/chosen": 81.6305923461914, "logits/rejected": 81.66109466552734, "logps/chosen": -32.311283111572266, "logps/rejected": -34.081214904785156, "loss": 0.5334, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3603467345237732, "rewards/margins": 0.6453540921211243, "rewards/rejected": -0.2850072979927063, "step": 340 }, { "epoch": 0.91, "grad_norm": 11.625, "learning_rate": 1.2518018074041684e-07, "logits/chosen": 80.61808776855469, "logits/rejected": 80.62725067138672, "logps/chosen": -32.49099349975586, "logps/rejected": -33.57762908935547, "loss": 0.5483, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4610978960990906, "rewards/margins": 0.6554309129714966, "rewards/rejected": -0.19433295726776123, "step": 350 }, { "epoch": 0.94, "grad_norm": 15.8125, "learning_rate": 6.41315865106129e-08, "logits/chosen": 82.13656616210938, "logits/rejected": 82.17839050292969, "logps/chosen": -28.390300750732422, "logps/rejected": -31.930139541625977, "loss": 0.5259, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.46476393938064575, "rewards/margins": 0.5768795013427734, "rewards/rejected": -0.11211560666561127, "step": 360 }, { "epoch": 0.96, "grad_norm": 20.375, "learning_rate": 2.3150941078050325e-08, "logits/chosen": 81.61884307861328, "logits/rejected": 81.6422348022461, "logps/chosen": -31.991628646850586, "logps/rejected": -35.56809616088867, "loss": 0.5548, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3538144528865814, "rewards/margins": 0.5373049974441528, "rewards/rejected": -0.1834905445575714, "step": 370 }, { "epoch": 0.99, "grad_norm": 21.375, "learning_rate": 2.575864278703266e-09, "logits/chosen": 75.43504333496094, "logits/rejected": 75.3118667602539, "logps/chosen": -29.84956932067871, "logps/rejected": -28.521799087524414, "loss": 0.5873, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.29062420129776, "rewards/margins": 0.4351147711277008, "rewards/rejected": -0.1444905549287796, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.5890550520512965, "train_runtime": 2558.5725, "train_samples_per_second": 1.203, "train_steps_per_second": 0.15 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }