{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.8664777278900146, "logits/rejected": -1.8707849979400635, "logps/chosen": -36.99364471435547, "logps/rejected": -33.650604248046875, "loss": 0.9766, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.00906434003263712, "rewards/margins": 0.023435616865754128, "rewards/rejected": -0.014371277764439583, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9982150793075562, "logits/rejected": -2.0008621215820312, "logps/chosen": -29.64394760131836, "logps/rejected": -29.04986000061035, "loss": 1.0107, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0012267641723155975, "rewards/margins": -0.010734880343079567, "rewards/rejected": 0.00950811617076397, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.920768141746521, "logits/rejected": -1.9180870056152344, "logps/chosen": -31.416461944580078, "logps/rejected": -33.2098274230957, "loss": 1.0063, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.000256747764069587, "rewards/margins": -0.0063001858070492744, "rewards/rejected": 0.00604343693703413, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.0177602767944336, "logits/rejected": -2.009014129638672, "logps/chosen": -32.56236267089844, "logps/rejected": -32.517822265625, "loss": 0.9863, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.010110180824995041, "rewards/margins": 0.013717299327254295, "rewards/rejected": -0.003607118036597967, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.862694501876831, "logits/rejected": -1.8519262075424194, "logps/chosen": -33.541160583496094, "logps/rejected": -35.44048309326172, "loss": 0.9969, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.011423684656620026, "rewards/margins": 0.003121361369267106, "rewards/rejected": 0.00830232072621584, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9417282342910767, "logits/rejected": -1.9436867237091064, "logps/chosen": -32.52958679199219, "logps/rejected": -33.216880798339844, "loss": 0.9296, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.04941769689321518, "rewards/margins": 0.08126799017190933, "rewards/rejected": -0.031850285828113556, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.0729386806488037, "logits/rejected": -2.0779125690460205, "logps/chosen": -33.99254608154297, "logps/rejected": -36.62586212158203, "loss": 0.9629, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0028400986921042204, "rewards/margins": 0.037055134773254395, "rewards/rejected": -0.03989524394273758, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9335737228393555, "logits/rejected": -1.9366981983184814, "logps/chosen": -34.332157135009766, "logps/rejected": -34.641021728515625, "loss": 0.9026, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06695752590894699, "rewards/margins": 0.10098665952682495, "rewards/rejected": -0.03402913734316826, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9401309490203857, "logits/rejected": -1.9446433782577515, "logps/chosen": -32.37213897705078, "logps/rejected": -32.343849182128906, "loss": 0.9438, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.06670050323009491, "rewards/margins": 0.05615914613008499, "rewards/rejected": 0.010541360825300217, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.037087917327881, "logits/rejected": -2.035101890563965, "logps/chosen": -32.13945388793945, "logps/rejected": -31.313283920288086, "loss": 0.8961, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.07096613943576813, "rewards/margins": 0.10391455888748169, "rewards/rejected": -0.032948415726423264, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.231482982635498, "eval_logits/rejected": -2.226637363433838, "eval_logps/chosen": -34.02524185180664, "eval_logps/rejected": -37.54085159301758, "eval_loss": 0.9766585230827332, "eval_rewards/accuracies": 0.5365448594093323, "eval_rewards/chosen": 0.006516099441796541, "eval_rewards/margins": 0.02347717247903347, "eval_rewards/rejected": -0.016961071640253067, "eval_runtime": 145.8279, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.9920060634613037, "logits/rejected": -1.9896419048309326, "logps/chosen": -33.146766662597656, "logps/rejected": -34.02008819580078, "loss": 0.9486, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0679345652461052, "rewards/margins": 0.061437882483005524, "rewards/rejected": 0.006496679037809372, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.003952741622925, "logits/rejected": -1.9956319332122803, "logps/chosen": -32.33639144897461, "logps/rejected": -32.133079528808594, "loss": 0.9488, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.07595954835414886, "rewards/margins": 0.05606143921613693, "rewards/rejected": 0.019898109138011932, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.0316150188446045, "logits/rejected": -2.0236544609069824, "logps/chosen": -30.298206329345703, "logps/rejected": -32.07080841064453, "loss": 0.8911, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.12425784766674042, "rewards/margins": 0.1367165446281433, "rewards/rejected": -0.0124586820602417, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.962376356124878, "logits/rejected": -1.9726108312606812, "logps/chosen": -31.235275268554688, "logps/rejected": -32.56925964355469, "loss": 0.8524, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1290682703256607, "rewards/margins": 0.15313370525836945, "rewards/rejected": -0.024065453559160233, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.8732143640518188, "logits/rejected": -1.874371886253357, "logps/chosen": -33.8985481262207, "logps/rejected": -34.81908416748047, "loss": 0.786, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.20590214431285858, "rewards/margins": 0.24918103218078613, "rewards/rejected": -0.04327889531850815, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.924538016319275, "logits/rejected": -1.921121597290039, "logps/chosen": -36.01353454589844, "logps/rejected": -32.723262786865234, "loss": 0.8894, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.11069967597723007, "rewards/margins": 0.11105670034885406, "rewards/rejected": -0.0003570284752640873, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.0247209072113037, "logits/rejected": -2.017392635345459, "logps/chosen": -33.49879455566406, "logps/rejected": -31.44363784790039, "loss": 0.7434, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2093469649553299, "rewards/margins": 0.2735980153083801, "rewards/rejected": -0.06425107270479202, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.031324863433838, "logits/rejected": -2.0365915298461914, "logps/chosen": -32.253074645996094, "logps/rejected": -32.45112609863281, "loss": 0.8312, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.20420043170452118, "rewards/margins": 0.18039169907569885, "rewards/rejected": 0.023808732628822327, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.0317559242248535, "logits/rejected": -2.0289719104766846, "logps/chosen": -31.279537200927734, "logps/rejected": -31.34115219116211, "loss": 0.8405, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.14659160375595093, "rewards/margins": 0.16264860332012177, "rewards/rejected": -0.016057008877396584, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.9018064737319946, "logits/rejected": -1.9064457416534424, "logps/chosen": -31.301830291748047, "logps/rejected": -32.8339729309082, "loss": 0.7699, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21311891078948975, "rewards/margins": 0.2432461678981781, "rewards/rejected": -0.03012726828455925, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.2282192707061768, "eval_logits/rejected": -2.223379611968994, "eval_logps/chosen": -34.05467224121094, "eval_logps/rejected": -37.57374572753906, "eval_loss": 0.974229633808136, "eval_rewards/accuracies": 0.530315637588501, "eval_rewards/chosen": -0.01408342458307743, "eval_rewards/margins": 0.02590302750468254, "eval_rewards/rejected": -0.03998645395040512, "eval_runtime": 145.7826, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.014596462249756, "logits/rejected": -2.0252418518066406, "logps/chosen": -31.797557830810547, "logps/rejected": -33.982398986816406, "loss": 0.8139, "rewards/accuracies": 0.625, "rewards/chosen": 0.12162177264690399, "rewards/margins": 0.20541362464427948, "rewards/rejected": -0.08379185199737549, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.9068737030029297, "logits/rejected": -1.9216482639312744, "logps/chosen": -29.838830947875977, "logps/rejected": -31.62994956970215, "loss": 0.766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1908968687057495, "rewards/margins": 0.24113738536834717, "rewards/rejected": -0.05024053901433945, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9636850357055664, "logits/rejected": -1.967655897140503, "logps/chosen": -33.12433624267578, "logps/rejected": -31.637094497680664, "loss": 0.7833, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.20519034564495087, "rewards/margins": 0.2666449546813965, "rewards/rejected": -0.06145460531115532, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9611847400665283, "logits/rejected": -1.9393657445907593, "logps/chosen": -33.867958068847656, "logps/rejected": -35.12390899658203, "loss": 0.737, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2000524252653122, "rewards/margins": 0.32123422622680664, "rewards/rejected": -0.12118180096149445, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.001960277557373, "logits/rejected": -1.9986454248428345, "logps/chosen": -32.73499298095703, "logps/rejected": -36.28093719482422, "loss": 0.8434, "rewards/accuracies": 0.625, "rewards/chosen": 0.13304933905601501, "rewards/margins": 0.17225751280784607, "rewards/rejected": -0.03920816630125046, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8687576055526733, "logits/rejected": -1.866320013999939, "logps/chosen": -33.981781005859375, "logps/rejected": -35.54584503173828, "loss": 0.8296, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14463281631469727, "rewards/margins": 0.1734902262687683, "rewards/rejected": -0.028857415542006493, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.853939414024353, "logits/rejected": -1.8515437841415405, "logps/chosen": -34.234046936035156, "logps/rejected": -31.837631225585938, "loss": 0.8487, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.11448518931865692, "rewards/margins": 0.16089434921741486, "rewards/rejected": -0.04640916362404823, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.9570705890655518, "logits/rejected": -1.9465986490249634, "logps/chosen": -35.030006408691406, "logps/rejected": -31.88030433654785, "loss": 0.7553, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.22864773869514465, "rewards/margins": 0.2561652660369873, "rewards/rejected": -0.02751758135855198, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.0522685050964355, "logits/rejected": -2.0373730659484863, "logps/chosen": -30.7352352142334, "logps/rejected": -32.61699676513672, "loss": 0.9072, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.13222160935401917, "rewards/margins": 0.10783363878726959, "rewards/rejected": 0.024387964978814125, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.9228973388671875, "logits/rejected": -1.9203764200210571, "logps/chosen": -32.44710159301758, "logps/rejected": -30.934436798095703, "loss": 0.6723, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3360074460506439, "rewards/margins": 0.418344646692276, "rewards/rejected": -0.08233721554279327, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2234153747558594, "eval_logits/rejected": -2.218602180480957, "eval_logps/chosen": -34.08680725097656, "eval_logps/rejected": -37.60466003417969, "eval_loss": 0.976102888584137, "eval_rewards/accuracies": 0.529900312423706, "eval_rewards/chosen": -0.036580219864845276, "eval_rewards/margins": 0.02504708059132099, "eval_rewards/rejected": -0.06162729859352112, "eval_runtime": 145.7665, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589035e-07, "logits/chosen": -1.9082481861114502, "logits/rejected": -1.9050118923187256, "logps/chosen": -31.349285125732422, "logps/rejected": -33.84658432006836, "loss": 0.7796, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.17784307897090912, "rewards/margins": 0.2509470283985138, "rewards/rejected": -0.07310393452644348, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380913e-07, "logits/chosen": -1.9580894708633423, "logits/rejected": -1.9458973407745361, "logps/chosen": -34.3031005859375, "logps/rejected": -33.67659378051758, "loss": 0.7302, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2050826996564865, "rewards/margins": 0.3015114367008209, "rewards/rejected": -0.09642868489027023, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-07, "logits/chosen": -1.9932842254638672, "logits/rejected": -1.9918495416641235, "logps/chosen": -33.17847442626953, "logps/rejected": -32.54157638549805, "loss": 0.7677, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.21512338519096375, "rewards/margins": 0.2585209906101227, "rewards/rejected": -0.04339758679270744, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -2.0800719261169434, "logits/rejected": -2.064396381378174, "logps/chosen": -33.80484390258789, "logps/rejected": -33.1123046875, "loss": 0.7636, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2621825039386749, "rewards/margins": 0.25817227363586426, "rewards/rejected": 0.004010227043181658, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -1.9522559642791748, "logits/rejected": -1.951424241065979, "logps/chosen": -32.8499755859375, "logps/rejected": -32.56407165527344, "loss": 0.6881, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3024839758872986, "rewards/margins": 0.3717316687107086, "rewards/rejected": -0.06924761831760406, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-08, "logits/chosen": -1.9075695276260376, "logits/rejected": -1.917851209640503, "logps/chosen": -31.882221221923828, "logps/rejected": -35.31555938720703, "loss": 0.7689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22572879493236542, "rewards/margins": 0.24449090659618378, "rewards/rejected": -0.018762132152915, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.0478641986846924, "logits/rejected": -2.041414737701416, "logps/chosen": -33.331912994384766, "logps/rejected": -29.259756088256836, "loss": 0.7658, "rewards/accuracies": 0.75, "rewards/chosen": 0.2240387201309204, "rewards/margins": 0.2473684549331665, "rewards/rejected": -0.02332974039018154, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-09, "logits/chosen": -1.9081246852874756, "logits/rejected": -1.9103105068206787, "logps/chosen": -33.882568359375, "logps/rejected": -30.96805191040039, "loss": 0.741, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.24953576922416687, "rewards/margins": 0.3163323998451233, "rewards/rejected": -0.06679664552211761, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.8438688600218142, "train_runtime": 3250.9917, "train_samples_per_second": 0.947, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }