{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-08, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.2820512820512818e-07, "logits/chosen": -1.8662465810775757, "logits/rejected": -1.8705615997314453, "logps/chosen": -36.9855842590332, "logps/rejected": -33.65031433105469, "loss": 0.4935, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.016807515174150467, "rewards/margins": 0.03299880772829056, "rewards/rejected": -0.01619129255414009, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.5641025641025636e-07, "logits/chosen": -1.9972314834594727, "logits/rejected": -1.999875783920288, "logps/chosen": -29.622329711914062, "logps/rejected": -29.04340171813965, "loss": 0.4989, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.01589435711503029, "rewards/margins": -0.00013793967082165182, "rewards/rejected": 0.016032297164201736, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -1.9198119640350342, "logits/rejected": -1.9171171188354492, "logps/chosen": -31.40401268005371, "logps/rejected": -33.211997985839844, "loss": 0.4999, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0096666868776083, "rewards/margins": 0.0044937655329704285, "rewards/rejected": 0.005172918550670147, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438433e-07, "logits/chosen": -2.0162081718444824, "logits/rejected": -2.007472515106201, "logps/chosen": -32.587196350097656, "logps/rejected": -32.514732360839844, "loss": 0.5011, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.00831048097461462, "rewards/margins": -0.00666379788890481, "rewards/rejected": -0.0016466856468468904, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542186e-07, "logits/chosen": -1.8641865253448486, "logits/rejected": -1.8533992767333984, "logps/chosen": -33.56541061401367, "logps/rejected": -35.421974182128906, "loss": 0.5049, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.006343575660139322, "rewards/margins": -0.030638951808214188, "rewards/rejected": 0.02429538033902645, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941118e-07, "logits/chosen": -1.945642113685608, "logits/rejected": -1.9475781917572021, "logps/chosen": -32.572914123535156, "logps/rejected": -33.161590576171875, "loss": 0.4972, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.021815191954374313, "rewards/margins": 0.01398499310016632, "rewards/rejected": 0.007830199785530567, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413548e-07, "logits/chosen": -2.0798556804656982, "logits/rejected": -2.0848286151885986, "logps/chosen": -33.97870635986328, "logps/rejected": -36.580543518066406, "loss": 0.4965, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.007821302860975266, "rewards/margins": 0.017155062407255173, "rewards/rejected": -0.009333762340247631, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-07, "logits/chosen": -1.9424225091934204, "logits/rejected": -1.9455715417861938, "logps/chosen": -34.389671325683594, "logps/rejected": -34.575042724609375, "loss": 0.4968, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.030510548502206802, "rewards/margins": 0.016619805246591568, "rewards/rejected": 0.013890743255615234, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.736716601303429e-07, "logits/chosen": -1.9505561590194702, "logits/rejected": -1.9550676345825195, "logps/chosen": -32.456565856933594, "logps/rejected": -32.361209869384766, "loss": 0.4976, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.008682352490723133, "rewards/margins": 0.0105207534506917, "rewards/rejected": -0.0018384016584604979, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.62624545834521e-07, "logits/chosen": -2.048992156982422, "logits/rejected": -2.0470006465911865, "logps/chosen": -32.2261848449707, "logps/rejected": -31.262670516967773, "loss": 0.4981, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.01171906292438507, "rewards/margins": 0.008886159397661686, "rewards/rejected": 0.0028329025954008102, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.244107484817505, "eval_logits/rejected": -2.239222764968872, "eval_logps/chosen": -34.02288818359375, "eval_logps/rejected": -37.50926208496094, "eval_loss": 0.49965521693229675, "eval_rewards/accuracies": 0.5078904032707214, "eval_rewards/chosen": 0.009330343455076218, "eval_rewards/margins": 0.003444999223574996, "eval_rewards/rejected": 0.005885345861315727, "eval_runtime": 146.2405, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.4982572012636904e-07, "logits/chosen": -2.005356788635254, "logits/rejected": -2.0029444694519043, "logps/chosen": -33.24960708618164, "logps/rejected": -34.023834228515625, "loss": 0.5009, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.004633937496691942, "rewards/margins": -0.009062351658940315, "rewards/rejected": 0.00442841649055481, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777677e-07, "logits/chosen": -2.017059564590454, "logits/rejected": -2.008686065673828, "logps/chosen": -32.44651412963867, "logps/rejected": -32.1725959777832, "loss": 0.4978, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0012888375204056501, "rewards/margins": 0.007584270089864731, "rewards/rejected": -0.008873110637068748, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.194082707715275e-07, "logits/chosen": -2.0462288856506348, "logits/rejected": -2.038172960281372, "logps/chosen": -30.503637313842773, "logps/rejected": -32.05141067504883, "loss": 0.5043, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.022338179871439934, "rewards/margins": -0.023616474121809006, "rewards/rejected": 0.0012782930862158537, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.020402418666621e-07, "logits/chosen": -1.9769847393035889, "logits/rejected": -1.9872528314590454, "logps/chosen": -31.388320922851562, "logps/rejected": -32.554039001464844, "loss": 0.4928, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.02507348358631134, "rewards/margins": 0.04039759188890457, "rewards/rejected": -0.015324106439948082, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.8341962650351185e-07, "logits/chosen": -1.8903350830078125, "logits/rejected": -1.8914152383804321, "logps/chosen": -34.154296875, "logps/rejected": -34.76646423339844, "loss": 0.4927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.030716974288225174, "rewards/margins": 0.03808692842721939, "rewards/rejected": -0.007369957864284515, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800572e-07, "logits/chosen": -1.9426672458648682, "logits/rejected": -1.939186453819275, "logps/chosen": -36.157989501953125, "logps/rejected": -32.7253303527832, "loss": 0.4972, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.010952227748930454, "rewards/margins": 0.013016450218856335, "rewards/rejected": -0.0020642229355871677, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.430433172111807e-07, "logits/chosen": -2.0421078205108643, "logits/rejected": -2.0347187519073486, "logps/chosen": -33.78765106201172, "logps/rejected": -31.363611221313477, "loss": 0.496, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.008164674043655396, "rewards/margins": 0.0175738874822855, "rewards/rejected": -0.009409213438630104, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.216202642830543e-07, "logits/chosen": -2.0476274490356445, "logits/rejected": -2.0528929233551025, "logps/chosen": -32.52192306518555, "logps/rejected": -32.50886917114258, "loss": 0.4919, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.01828894577920437, "rewards/margins": 0.037272557616233826, "rewards/rejected": -0.018983609974384308, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.9960716642946403e-07, "logits/chosen": -2.0479583740234375, "logits/rejected": -2.0451717376708984, "logps/chosen": -31.4959774017334, "logps/rejected": -31.333343505859375, "loss": 0.4986, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.005620955489575863, "rewards/margins": 0.006484903395175934, "rewards/rejected": -0.012105859816074371, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.771853789806683e-07, "logits/chosen": -1.9188188314437866, "logits/rejected": -1.9234987497329712, "logps/chosen": -31.607952117919922, "logps/rejected": -32.79724884033203, "loss": 0.4987, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0013334359973669052, "rewards/margins": 0.0037218607030808926, "rewards/rejected": -0.005055299494415522, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.243473768234253, "eval_logits/rejected": -2.2385945320129395, "eval_logps/chosen": -34.013954162597656, "eval_logps/rejected": -37.49945068359375, "eval_loss": 0.49993959069252014, "eval_rewards/accuracies": 0.5074750781059265, "eval_rewards/chosen": 0.016473928466439247, "eval_rewards/margins": 0.0027353279292583466, "eval_rewards/rejected": 0.0137386005371809, "eval_runtime": 145.8931, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402e-07, "logits/chosen": -2.0315659046173096, "logits/rejected": -2.0422449111938477, "logps/chosen": -31.945932388305664, "logps/rejected": -33.85708999633789, "loss": 0.497, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.020296860486268997, "rewards/margins": 0.015811126679182053, "rewards/rejected": 0.0044857352040708065, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.318564697655179e-07, "logits/chosen": -1.9255174398422241, "logits/rejected": -1.9403587579727173, "logps/chosen": -30.07940101623535, "logps/rejected": -31.557880401611328, "loss": 0.4944, "rewards/accuracies": 0.5625, "rewards/chosen": 0.02570931240916252, "rewards/margins": 0.02547053061425686, "rewards/rejected": 0.00023878086358308792, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.093227910899832e-07, "logits/chosen": -1.9828882217407227, "logits/rejected": -1.986853003501892, "logps/chosen": -33.40575408935547, "logps/rejected": -31.552501678466797, "loss": 0.4976, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.00937105156481266, "rewards/margins": 0.011929656378924847, "rewards/rejected": -0.0025586034171283245, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279356e-07, "logits/chosen": -1.982568383216858, "logits/rejected": -1.9606094360351562, "logps/chosen": -34.157630920410156, "logps/rejected": -34.96028518676758, "loss": 0.4991, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00310176657512784, "rewards/margins": 0.004490714054554701, "rewards/rejected": -0.007592480629682541, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.654436768970182e-07, "logits/chosen": -2.024078845977783, "logits/rejected": -2.0207676887512207, "logps/chosen": -32.89586639404297, "logps/rejected": -36.22296905517578, "loss": 0.4952, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.023358644917607307, "rewards/margins": 0.02179408073425293, "rewards/rejected": 0.0015645644161850214, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.444597403062196e-07, "logits/chosen": -1.8911396265029907, "logits/rejected": -1.8886913061141968, "logps/chosen": -34.1867790222168, "logps/rejected": -35.52009201049805, "loss": 0.4978, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0012989563401788473, "rewards/margins": 0.013674641028046608, "rewards/rejected": -0.012375684455037117, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.2434529917578887e-07, "logits/chosen": -1.8761117458343506, "logits/rejected": -1.8735707998275757, "logps/chosen": -34.38152313232422, "logps/rejected": -31.744131088256836, "loss": 0.5019, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.012860517017543316, "rewards/margins": -0.008900386281311512, "rewards/rejected": 0.02176090143620968, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603521e-07, "logits/chosen": -1.979835867881775, "logits/rejected": -1.9692022800445557, "logps/chosen": -35.314170837402344, "logps/rejected": -31.835962295532227, "loss": 0.4935, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.033973388373851776, "rewards/margins": 0.029950443655252457, "rewards/rejected": 0.004022946115583181, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071453e-08, "logits/chosen": -2.075986385345459, "logits/rejected": -2.0609803199768066, "logps/chosen": -30.91201400756836, "logps/rejected": -32.63774871826172, "loss": 0.5005, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.009686904028058052, "rewards/margins": -0.0015827339375391603, "rewards/rejected": 0.011269642040133476, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-08, "logits/chosen": -1.9475170373916626, "logits/rejected": -1.944972038269043, "logps/chosen": -32.88249969482422, "logps/rejected": -30.8377685546875, "loss": 0.4901, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.03568952530622482, "rewards/margins": 0.052456192672252655, "rewards/rejected": -0.01676667109131813, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.244415283203125, "eval_logits/rejected": -2.2395384311676025, "eval_logps/chosen": -34.01211166381836, "eval_logps/rejected": -37.495113372802734, "eval_loss": 0.5000770092010498, "eval_rewards/accuracies": 0.5045680999755859, "eval_rewards/chosen": 0.01795424334704876, "eval_rewards/margins": 0.0007461770437657833, "eval_rewards/rejected": 0.01720806397497654, "eval_runtime": 145.9415, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589034e-08, "logits/chosen": -1.9293571710586548, "logits/rejected": -1.926099419593811, "logps/chosen": -31.5543212890625, "logps/rejected": -33.74175262451172, "loss": 0.4912, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.039222296327352524, "rewards/margins": 0.03890404850244522, "rewards/rejected": 0.00031825463520362973, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380912e-08, "logits/chosen": -1.9810116291046143, "logits/rejected": -1.9687116146087646, "logps/chosen": -34.56316375732422, "logps/rejected": -33.561100006103516, "loss": 0.4923, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.026327725499868393, "rewards/margins": 0.04413483291864395, "rewards/rejected": -0.01780710555613041, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-08, "logits/chosen": -2.0162246227264404, "logits/rejected": -2.014770269393921, "logps/chosen": -33.477134704589844, "logps/rejected": -32.470088958740234, "loss": 0.4997, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.006926077418029308, "rewards/margins": -0.0006712455069646239, "rewards/rejected": 0.007597323507070541, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.05793773749158e-08, "logits/chosen": -2.103529214859009, "logits/rejected": -2.0877299308776855, "logps/chosen": -34.1456298828125, "logps/rejected": -33.083770751953125, "loss": 0.5012, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.027009794488549232, "rewards/margins": -0.0004009060503449291, "rewards/rejected": 0.02741069719195366, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.251801807404168e-08, "logits/chosen": -1.9753166437149048, "logits/rejected": -1.974352478981018, "logps/chosen": -33.25692367553711, "logps/rejected": -32.45876693725586, "loss": 0.4973, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.02013799361884594, "rewards/margins": 0.015033388510346413, "rewards/rejected": 0.005104603711515665, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-09, "logits/chosen": -1.931349515914917, "logits/rejected": -1.941706895828247, "logps/chosen": -32.206031799316406, "logps/rejected": -35.306983947753906, "loss": 0.4989, "rewards/accuracies": 0.5, "rewards/chosen": -0.001073227496817708, "rewards/margins": 0.013507463037967682, "rewards/rejected": -0.01458069123327732, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050324e-09, "logits/chosen": -2.070504665374756, "logits/rejected": -2.0639472007751465, "logps/chosen": -33.65058135986328, "logps/rejected": -29.202342987060547, "loss": 0.504, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.0011108577018603683, "rewards/margins": -0.018153894692659378, "rewards/rejected": 0.01926475390791893, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-10, "logits/chosen": -1.9301570653915405, "logits/rejected": -1.9323084354400635, "logps/chosen": -34.23206329345703, "logps/rejected": -30.903820037841797, "loss": 0.4945, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005591380409896374, "rewards/margins": 0.03054152801632881, "rewards/rejected": -0.024950148537755013, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.10985468208015739, "train_runtime": 628.1848, "train_samples_per_second": 4.901, "train_steps_per_second": 0.613 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }