{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.8667426109313965, "logits/rejected": -1.8710602521896362, "logps/chosen": -36.991912841796875, "logps/rejected": -33.67206954956055, "loss": 0.9547, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": 0.011750025674700737, "rewards/margins": 0.04534290358424187, "rewards/rejected": -0.03359287604689598, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9977442026138306, "logits/rejected": -2.0003952980041504, "logps/chosen": -29.659366607666016, "logps/rejected": -29.05437660217285, "loss": 1.021, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.013736436143517494, "rewards/margins": -0.02098780684173107, "rewards/rejected": 0.007251373026520014, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.920693039894104, "logits/rejected": -1.91802179813385, "logps/chosen": -31.39971351623535, "logps/rejected": -33.21495819091797, "loss": 0.9897, "rewards/accuracies": 0.5625, "rewards/chosen": 0.013103686273097992, "rewards/margins": 0.010304747149348259, "rewards/rejected": 0.002798942383378744, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.018057107925415, "logits/rejected": -2.0093047618865967, "logps/chosen": -32.565284729003906, "logps/rejected": -32.50053405761719, "loss": 1.0005, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.009215259924530983, "rewards/margins": -0.0004936732584610581, "rewards/rejected": 0.009708933532238007, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8627817630767822, "logits/rejected": -1.851999044418335, "logps/chosen": -33.549964904785156, "logps/rejected": -35.44340896606445, "loss": 1.0011, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0060120681300759315, "rewards/margins": -0.0011377219343557954, "rewards/rejected": 0.007149793207645416, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9416770935058594, "logits/rejected": -1.9436094760894775, "logps/chosen": -32.53351593017578, "logps/rejected": -33.217529296875, "loss": 0.9247, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05333293229341507, "rewards/margins": 0.09025315940380096, "rewards/rejected": -0.036920223385095596, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.072779655456543, "logits/rejected": -2.077756881713867, "logps/chosen": -34.002342224121094, "logps/rejected": -36.633216857910156, "loss": 0.9596, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.011083832010626793, "rewards/margins": 0.040393490344285965, "rewards/rejected": -0.05147732421755791, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9329026937484741, "logits/rejected": -1.936031699180603, "logps/chosen": -34.33915710449219, "logps/rejected": -34.65736770629883, "loss": 0.8892, "rewards/accuracies": 0.5625, "rewards/chosen": 0.07092130184173584, "rewards/margins": 0.12288935482501984, "rewards/rejected": -0.051968056708574295, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9402366876602173, "logits/rejected": -1.9447383880615234, "logps/chosen": -32.383628845214844, "logps/rejected": -32.351661682128906, "loss": 0.9388, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.06703362613916397, "rewards/margins": 0.06123671680688858, "rewards/rejected": 0.0057969121262431145, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.0364508628845215, "logits/rejected": -2.0344765186309814, "logps/chosen": -32.12981033325195, "logps/rejected": -31.279254913330078, "loss": 0.9007, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.08881844580173492, "rewards/margins": 0.09925105422735214, "rewards/rejected": -0.010432596318423748, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.231006622314453, "eval_logits/rejected": -2.2261619567871094, "eval_logps/chosen": -34.019046783447266, "eval_logps/rejected": -37.540069580078125, "eval_loss": 0.96916264295578, "eval_rewards/accuracies": 0.5751661062240601, "eval_rewards/chosen": 0.012404282577335835, "eval_rewards/margins": 0.031165316700935364, "eval_rewards/rejected": -0.018761036917567253, "eval_runtime": 146.0189, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.991776466369629, "logits/rejected": -1.9893957376480103, "logps/chosen": -33.122779846191406, "logps/rejected": -33.99274444580078, "loss": 0.9495, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.09682749211788177, "rewards/margins": 0.0675249844789505, "rewards/rejected": 0.029302507638931274, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.003622531890869, "logits/rejected": -1.9952924251556396, "logps/chosen": -32.31382369995117, "logps/rejected": -32.122901916503906, "loss": 0.9341, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.10486602783203125, "rewards/margins": 0.07398126274347305, "rewards/rejected": 0.030884763225913048, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.0311331748962402, "logits/rejected": -2.0231704711914062, "logps/chosen": -30.32816505432129, "logps/rejected": -32.052425384521484, "loss": 0.9159, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.11803986132144928, "rewards/margins": 0.11757204681634903, "rewards/rejected": 0.0004678152617998421, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9613821506500244, "logits/rejected": -1.9716154336929321, "logps/chosen": -31.22269630432129, "logps/rejected": -32.548851013183594, "loss": 0.8406, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15757359564304352, "rewards/margins": 0.16875064373016357, "rewards/rejected": -0.01117704901844263, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.8720792531967163, "logits/rejected": -1.8732519149780273, "logps/chosen": -33.89937973022461, "logps/rejected": -34.796844482421875, "loss": 0.7907, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.23464930057525635, "rewards/margins": 0.2663186490535736, "rewards/rejected": -0.03166933357715607, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.923437476158142, "logits/rejected": -1.9200271368026733, "logps/chosen": -35.97971725463867, "logps/rejected": -32.6976203918457, "loss": 0.8704, "rewards/accuracies": 0.625, "rewards/chosen": 0.1535695493221283, "rewards/margins": 0.1334635317325592, "rewards/rejected": 0.02010601945221424, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.0249733924865723, "logits/rejected": -2.017641067504883, "logps/chosen": -33.441261291503906, "logps/rejected": -31.389623641967773, "loss": 0.7234, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2852834165096283, "rewards/margins": 0.31550443172454834, "rewards/rejected": -0.03022097982466221, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.031703472137451, "logits/rejected": -2.036947727203369, "logps/chosen": -32.208961486816406, "logps/rejected": -32.41345977783203, "loss": 0.8097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.26866063475608826, "rewards/margins": 0.21131709218025208, "rewards/rejected": 0.05734356492757797, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.032174587249756, "logits/rejected": -2.029404401779175, "logps/chosen": -31.245046615600586, "logps/rejected": -31.299081802368164, "loss": 0.8381, "rewards/accuracies": 0.625, "rewards/chosen": 0.19512517750263214, "rewards/margins": 0.17981843650341034, "rewards/rejected": 0.015306718647480011, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.9025481939315796, "logits/rejected": -1.9071909189224243, "logps/chosen": -31.24801254272461, "logps/rejected": -32.809139251708984, "loss": 0.7243, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.28661441802978516, "rewards/margins": 0.3011789917945862, "rewards/rejected": -0.014564569108188152, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.227985382080078, "eval_logits/rejected": -2.2231621742248535, "eval_logps/chosen": -34.01750946044922, "eval_logps/rejected": -37.539207458496094, "eval_loss": 0.9690985083580017, "eval_rewards/accuracies": 0.565614640712738, "eval_rewards/chosen": 0.013635948300361633, "eval_rewards/margins": 0.03170585632324219, "eval_rewards/rejected": -0.018069909885525703, "eval_runtime": 145.4584, "eval_samples_per_second": 2.358, "eval_steps_per_second": 0.296, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.0139317512512207, "logits/rejected": -2.0245535373687744, "logps/chosen": -31.7435359954834, "logps/rejected": -33.9241828918457, "loss": 0.7947, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1822136789560318, "rewards/margins": 0.23139998316764832, "rewards/rejected": -0.04918632656335831, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.9063589572906494, "logits/rejected": -1.9211170673370361, "logps/chosen": -29.79791831970215, "logps/rejected": -31.584829330444336, "loss": 0.7478, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2508983910083771, "rewards/margins": 0.2722209095954895, "rewards/rejected": -0.02132250741124153, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9628435373306274, "logits/rejected": -1.9668251276016235, "logps/chosen": -33.099082946777344, "logps/rejected": -31.59071922302246, "loss": 0.7716, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.25470608472824097, "rewards/margins": 0.28783971071243286, "rewards/rejected": -0.033133648335933685, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9614289999008179, "logits/rejected": -1.939612627029419, "logps/chosen": -33.833091735839844, "logps/rejected": -35.08003234863281, "loss": 0.7052, "rewards/accuracies": 0.6875, "rewards/chosen": 0.25653010606765747, "rewards/margins": 0.3599211871623993, "rewards/rejected": -0.10339111089706421, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.0032362937927246, "logits/rejected": -1.999916434288025, "logps/chosen": -32.70505142211914, "logps/rejected": -36.224037170410156, "loss": 0.8429, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17601460218429565, "rewards/margins": 0.17530463635921478, "rewards/rejected": 0.0007099613430909812, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8701452016830444, "logits/rejected": -1.8677012920379639, "logps/chosen": -33.97339630126953, "logps/rejected": -35.50096130371094, "loss": 0.8408, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17200371623039246, "rewards/margins": 0.16907431185245514, "rewards/rejected": 0.002929417882114649, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8542945384979248, "logits/rejected": -1.851894736289978, "logps/chosen": -34.1937370300293, "logps/rejected": -31.8076114654541, "loss": 0.8124, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1630934774875641, "rewards/margins": 0.1921185553073883, "rewards/rejected": -0.029025081545114517, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.9576352834701538, "logits/rejected": -1.9471737146377563, "logps/chosen": -34.961952209472656, "logps/rejected": -31.824676513671875, "loss": 0.7162, "rewards/accuracies": 0.75, "rewards/chosen": 0.31575411558151245, "rewards/margins": 0.30270126461982727, "rewards/rejected": 0.013052871450781822, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.052537202835083, "logits/rejected": -2.037666082382202, "logps/chosen": -30.679821014404297, "logps/rejected": -32.591346740722656, "loss": 0.8779, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1954430788755417, "rewards/margins": 0.14705480635166168, "rewards/rejected": 0.048388272523880005, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.9234060049057007, "logits/rejected": -1.9208656549453735, "logps/chosen": -32.422889709472656, "logps/rejected": -30.887353897094727, "loss": 0.6515, "rewards/accuracies": 0.75, "rewards/chosen": 0.4033745229244232, "rewards/margins": 0.45980948209762573, "rewards/rejected": -0.056434907019138336, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2234299182891846, "eval_logits/rejected": -2.218611001968384, "eval_logps/chosen": -34.04222106933594, "eval_logps/rejected": -37.57155990600586, "eval_loss": 0.9644458889961243, "eval_rewards/accuracies": 0.560215950012207, "eval_rewards/chosen": -0.006133326329290867, "eval_rewards/margins": 0.03781980276107788, "eval_rewards/rejected": -0.04395313188433647, "eval_runtime": 145.7457, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589035e-07, "logits/chosen": -1.9086744785308838, "logits/rejected": -1.905432939529419, "logps/chosen": -31.29376220703125, "logps/rejected": -33.765647888183594, "loss": 0.7696, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.24766740202903748, "rewards/margins": 0.2664671242237091, "rewards/rejected": -0.01879967749118805, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380913e-07, "logits/chosen": -1.958169937133789, "logits/rejected": -1.9459987878799438, "logps/chosen": -34.305747985839844, "logps/rejected": -33.66083526611328, "loss": 0.7197, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23226289451122284, "rewards/margins": 0.32985854148864746, "rewards/rejected": -0.0975956842303276, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-07, "logits/chosen": -1.9929885864257812, "logits/rejected": -1.9915746450424194, "logps/chosen": -33.1512336730957, "logps/rejected": -32.53498077392578, "loss": 0.7279, "rewards/accuracies": 0.75, "rewards/chosen": 0.26764601469039917, "rewards/margins": 0.3119625747203827, "rewards/rejected": -0.04431656002998352, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -2.0791096687316895, "logits/rejected": -2.063427686691284, "logps/chosen": -33.73707580566406, "logps/rejected": -33.045745849609375, "loss": 0.7446, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3538528084754944, "rewards/margins": 0.2960215210914612, "rewards/rejected": 0.05783123895525932, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -1.952497124671936, "logits/rejected": -1.951664686203003, "logps/chosen": -32.81846618652344, "logps/rejected": -32.528770446777344, "loss": 0.6539, "rewards/accuracies": 0.75, "rewards/chosen": 0.37090426683425903, "rewards/margins": 0.42179951071739197, "rewards/rejected": -0.05089529603719711, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-08, "logits/chosen": -1.907370924949646, "logits/rejected": -1.9176517724990845, "logps/chosen": -31.86977195739746, "logps/rejected": -35.2895393371582, "loss": 0.7562, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2679324746131897, "rewards/margins": 0.26855653524398804, "rewards/rejected": -0.0006240725633688271, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.047156572341919, "logits/rejected": -2.0407111644744873, "logps/chosen": -33.288124084472656, "logps/rejected": -29.224395751953125, "loss": 0.7344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2910745441913605, "rewards/margins": 0.28944963216781616, "rewards/rejected": 0.0016249760519713163, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-09, "logits/chosen": -1.9066269397735596, "logits/rejected": -1.9088102579116821, "logps/chosen": -33.847068786621094, "logps/rejected": -30.924524307250977, "loss": 0.7185, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.31358352303504944, "rewards/margins": 0.35510215163230896, "rewards/rejected": -0.04151865094900131, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.8298485310046704, "train_runtime": 3249.7236, "train_samples_per_second": 0.947, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }