{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.8668904304504395, "logits/rejected": -1.8712035417556763, "logps/chosen": -36.981239318847656, "logps/rejected": -33.63866424560547, "loss": 0.6837, "rewards/accuracies": 0.5138888955116272, "rewards/chosen": 0.017751876264810562, "rewards/margins": 0.023763436824083328, "rewards/rejected": -0.0060115596279501915, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9975477457046509, "logits/rejected": -2.0001885890960693, "logps/chosen": -29.638402938842773, "logps/rejected": -29.045080184936523, "loss": 0.7005, "rewards/accuracies": 0.4375, "rewards/chosen": 0.002654150128364563, "rewards/margins": -0.010201702825725079, "rewards/rejected": 0.012855852022767067, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.9209153652191162, "logits/rejected": -1.9182332754135132, "logps/chosen": -31.392269134521484, "logps/rejected": -33.214996337890625, "loss": 0.6893, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.016679534688591957, "rewards/margins": 0.014253495261073112, "rewards/rejected": 0.0024260382633656263, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.017472505569458, "logits/rejected": -2.0087223052978516, "logps/chosen": -32.56648635864258, "logps/rejected": -32.53681564331055, "loss": 0.6837, "rewards/accuracies": 0.5625, "rewards/chosen": 0.007227012421935797, "rewards/margins": 0.02412785217165947, "rewards/rejected": -0.016900835558772087, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8619842529296875, "logits/rejected": -1.8512147665023804, "logps/chosen": -33.57755661010742, "logps/rejected": -35.45317840576172, "loss": 0.7035, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.014055396430194378, "rewards/margins": -0.013470378704369068, "rewards/rejected": -0.0005850124871358275, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.940241813659668, "logits/rejected": -1.942185401916504, "logps/chosen": -32.55712127685547, "logps/rejected": -33.23926544189453, "loss": 0.6629, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.030143504962325096, "rewards/margins": 0.07766537368297577, "rewards/rejected": -0.047521863132715225, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.070986270904541, "logits/rejected": -2.0759458541870117, "logps/chosen": -33.997291564941406, "logps/rejected": -36.65125274658203, "loss": 0.6775, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.006164415739476681, "rewards/margins": 0.051500022411346436, "rewards/rejected": -0.057664431631565094, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9328521490097046, "logits/rejected": -1.935974359512329, "logps/chosen": -34.317806243896484, "logps/rejected": -34.648555755615234, "loss": 0.6465, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.07700104266405106, "rewards/margins": 0.11630574613809586, "rewards/rejected": -0.0393047034740448, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.941239356994629, "logits/rejected": -1.9457323551177979, "logps/chosen": -32.385948181152344, "logps/rejected": -32.34455108642578, "loss": 0.6805, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.057031381875276566, "rewards/margins": 0.046979233622550964, "rewards/rejected": 0.010052147321403027, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.0390255451202393, "logits/rejected": -2.0370407104492188, "logps/chosen": -32.15161895751953, "logps/rejected": -31.304489135742188, "loss": 0.6577, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.06245182827115059, "rewards/margins": 0.08924683928489685, "rewards/rejected": -0.02679501473903656, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.233787775039673, "eval_logits/rejected": -2.228933572769165, "eval_logps/chosen": -34.04001235961914, "eval_logps/rejected": -37.535667419433594, "eval_loss": 0.6948937773704529, "eval_rewards/accuracies": 0.5315614938735962, "eval_rewards/chosen": -0.0038188453763723373, "eval_rewards/margins": 0.00951432902365923, "eval_rewards/rejected": -0.013333176262676716, "eval_runtime": 145.7707, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.994390845298767, "logits/rejected": -1.991999864578247, "logps/chosen": -33.1099739074707, "logps/rejected": -34.02802276611328, "loss": 0.6747, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.09369214624166489, "rewards/margins": 0.0927465409040451, "rewards/rejected": 0.0009456165134906769, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.0057625770568848, "logits/rejected": -1.9974448680877686, "logps/chosen": -32.31781768798828, "logps/rejected": -32.132328033447266, "loss": 0.6694, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0889604315161705, "rewards/margins": 0.06853620707988739, "rewards/rejected": 0.02042422816157341, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.0336880683898926, "logits/rejected": -2.0257279872894287, "logps/chosen": -30.32443618774414, "logps/rejected": -32.07221221923828, "loss": 0.6522, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10589826107025146, "rewards/margins": 0.11934350430965424, "rewards/rejected": -0.013445250689983368, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9642670154571533, "logits/rejected": -1.9745124578475952, "logps/chosen": -31.212024688720703, "logps/rejected": -32.565834045410156, "loss": 0.6272, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14534947276115417, "rewards/margins": 0.16701875627040863, "rewards/rejected": -0.02166926860809326, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.8759937286376953, "logits/rejected": -1.8771553039550781, "logps/chosen": -33.917335510253906, "logps/rejected": -34.8185920715332, "loss": 0.6064, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.19274887442588806, "rewards/margins": 0.235683411359787, "rewards/rejected": -0.042934536933898926, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.927369475364685, "logits/rejected": -1.9239553213119507, "logps/chosen": -35.98323440551758, "logps/rejected": -32.705108642578125, "loss": 0.6464, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.13190819323062897, "rewards/margins": 0.11955627053976059, "rewards/rejected": 0.012351910583674908, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.028454542160034, "logits/rejected": -2.0210976600646973, "logps/chosen": -33.50457763671875, "logps/rejected": -31.386072158813477, "loss": 0.6077, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.20529839396476746, "rewards/margins": 0.22925393283367157, "rewards/rejected": -0.02395555004477501, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.0345230102539062, "logits/rejected": -2.0397608280181885, "logps/chosen": -32.19758224487305, "logps/rejected": -32.435523986816406, "loss": 0.6083, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.24304144084453583, "rewards/margins": 0.2083090990781784, "rewards/rejected": 0.03473237156867981, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.035979747772217, "logits/rejected": -2.0332181453704834, "logps/chosen": -31.255151748657227, "logps/rejected": -31.321630477905273, "loss": 0.6326, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.163661926984787, "rewards/margins": 0.16605310142040253, "rewards/rejected": -0.0023911758325994015, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.9058892726898193, "logits/rejected": -1.910540223121643, "logps/chosen": -31.33197021484375, "logps/rejected": -32.817604064941406, "loss": 0.6156, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.19201864302158356, "rewards/margins": 0.21068540215492249, "rewards/rejected": -0.01866675540804863, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.2316529750823975, "eval_logits/rejected": -2.2268118858337402, "eval_logps/chosen": -34.053470611572266, "eval_logps/rejected": -37.557804107666016, "eval_loss": 0.6943473219871521, "eval_rewards/accuracies": 0.5191029906272888, "eval_rewards/chosen": -0.013246187008917332, "eval_rewards/margins": 0.015584951266646385, "eval_rewards/rejected": -0.028831137344241142, "eval_runtime": 145.8114, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.01808762550354, "logits/rejected": -2.0287296772003174, "logps/chosen": -31.763586044311523, "logps/rejected": -33.926063537597656, "loss": 0.622, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.14540112018585205, "rewards/margins": 0.1897541582584381, "rewards/rejected": -0.04435301572084427, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.910658597946167, "logits/rejected": -1.9254140853881836, "logps/chosen": -29.832544326782227, "logps/rejected": -31.612533569335938, "loss": 0.6024, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.19529443979263306, "rewards/margins": 0.23334410786628723, "rewards/rejected": -0.038049641996622086, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9673402309417725, "logits/rejected": -1.9713077545166016, "logps/chosen": -33.062660217285156, "logps/rejected": -31.608203887939453, "loss": 0.5911, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24836687743663788, "rewards/margins": 0.2895973324775696, "rewards/rejected": -0.0412304513156414, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9657704830169678, "logits/rejected": -1.9439115524291992, "logps/chosen": -33.82358932495117, "logps/rejected": -35.09131622314453, "loss": 0.5733, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23111140727996826, "rewards/margins": 0.32947883009910583, "rewards/rejected": -0.09836738556623459, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.007312536239624, "logits/rejected": -2.003988742828369, "logps/chosen": -32.68491744995117, "logps/rejected": -36.24928283691406, "loss": 0.6228, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1681055724620819, "rewards/margins": 0.18515849113464355, "rewards/rejected": -0.01705293543636799, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8734614849090576, "logits/rejected": -1.8710591793060303, "logps/chosen": -33.96207809448242, "logps/rejected": -35.5266227722168, "loss": 0.6286, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15842413902282715, "rewards/margins": 0.17382602393627167, "rewards/rejected": -0.015401872806251049, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8584483861923218, "logits/rejected": -1.8560174703598022, "logps/chosen": -34.1807975769043, "logps/rejected": -31.80449867248535, "loss": 0.6325, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15176042914390564, "rewards/margins": 0.17497751116752625, "rewards/rejected": -0.02321707457304001, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.9630708694458008, "logits/rejected": -1.9525701999664307, "logps/chosen": -35.00975036621094, "logps/rejected": -31.84867286682129, "loss": 0.5978, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2428235560655594, "rewards/margins": 0.24820086359977722, "rewards/rejected": -0.005377279128879309, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.0586793422698975, "logits/rejected": -2.043766498565674, "logps/chosen": -30.7253360748291, "logps/rejected": -32.66551971435547, "loss": 0.6462, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.13915367424488068, "rewards/margins": 0.14873233437538147, "rewards/rejected": -0.009578653611242771, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.9287078380584717, "logits/rejected": -1.92616868019104, "logps/chosen": -32.39979553222656, "logps/rejected": -30.87893295288086, "loss": 0.5468, "rewards/accuracies": 0.75, "rewards/chosen": 0.36911940574645996, "rewards/margins": 0.4126061797142029, "rewards/rejected": -0.04348675161600113, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2277145385742188, "eval_logits/rejected": -2.222882032394409, "eval_logps/chosen": -34.06844711303711, "eval_logps/rejected": -37.586029052734375, "eval_loss": 0.6902604699134827, "eval_rewards/accuracies": 0.5191029906272888, "eval_rewards/chosen": -0.023726314306259155, "eval_rewards/margins": 0.024859989061951637, "eval_rewards/rejected": -0.04858630895614624, "eval_runtime": 145.7514, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589035e-07, "logits/chosen": -1.9126994609832764, "logits/rejected": -1.909444808959961, "logps/chosen": -31.307331085205078, "logps/rejected": -33.79678726196289, "loss": 0.6022, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.20721367001533508, "rewards/margins": 0.24546091258525848, "rewards/rejected": -0.038247235119342804, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380913e-07, "logits/chosen": -1.9636253118515015, "logits/rejected": -1.9514182806015015, "logps/chosen": -34.32239532470703, "logps/rejected": -33.65345764160156, "loss": 0.5933, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.19157564640045166, "rewards/margins": 0.27180662751197815, "rewards/rejected": -0.08023098856210709, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-07, "logits/chosen": -1.9983371496200562, "logits/rejected": -1.9969165325164795, "logps/chosen": -33.176841735839844, "logps/rejected": -32.538509368896484, "loss": 0.5981, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21626754105091095, "rewards/margins": 0.257517009973526, "rewards/rejected": -0.04124947637319565, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -2.085096836090088, "logits/rejected": -2.069387435913086, "logps/chosen": -33.758583068847656, "logps/rejected": -33.068748474121094, "loss": 0.5952, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.29456597566604614, "rewards/margins": 0.26006320118904114, "rewards/rejected": 0.034502796828746796, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -1.9578405618667603, "logits/rejected": -1.9569900035858154, "logps/chosen": -32.835628509521484, "logps/rejected": -32.520538330078125, "loss": 0.5707, "rewards/accuracies": 0.75, "rewards/chosen": 0.3125234544277191, "rewards/margins": 0.3512992262840271, "rewards/rejected": -0.038775794208049774, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-08, "logits/chosen": -1.9134842157363892, "logits/rejected": -1.9237855672836304, "logps/chosen": -31.848628997802734, "logps/rejected": -35.309120178222656, "loss": 0.5923, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24924305081367493, "rewards/margins": 0.2634957432746887, "rewards/rejected": -0.014252680353820324, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.052818775177002, "logits/rejected": -2.046323776245117, "logps/chosen": -33.34727096557617, "logps/rejected": -29.256671905517578, "loss": 0.5993, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21328690648078918, "rewards/margins": 0.23445896804332733, "rewards/rejected": -0.021172069013118744, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-09, "logits/chosen": -1.9116294384002686, "logits/rejected": -1.913830041885376, "logps/chosen": -33.83928680419922, "logps/rejected": -30.931400299072266, "loss": 0.5714, "rewards/accuracies": 0.75, "rewards/chosen": 0.27983543276786804, "rewards/margins": 0.3209769129753113, "rewards/rejected": -0.04114149510860443, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.6291831790626823, "train_runtime": 3251.1508, "train_samples_per_second": 0.947, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }