{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-08, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.2820512820512818e-07, "logits/chosen": -1.866280436515808, "logits/rejected": -1.8705918788909912, "logps/chosen": -37.00367736816406, "logps/rejected": -33.67123794555664, "loss": 0.4986, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.000584296474698931, "rewards/margins": 0.008817334659397602, "rewards/rejected": -0.00823303870856762, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.5641025641025636e-07, "logits/chosen": -1.9974257946014404, "logits/rejected": -2.0000691413879395, "logps/chosen": -29.649478912353516, "logps/rejected": -29.038330078125, "loss": 0.5011, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0014564015436917543, "rewards/margins": -0.0064794206991791725, "rewards/rejected": 0.0050230189226567745, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -1.9199994802474976, "logits/rejected": -1.91730535030365, "logps/chosen": -31.410457611083984, "logps/rejected": -33.23019027709961, "loss": 0.4992, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0011272300034761429, "rewards/margins": 0.0034732469357550144, "rewards/rejected": -0.0023460157681256533, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438433e-07, "logits/chosen": -2.017059326171875, "logits/rejected": -2.008305788040161, "logps/chosen": -32.582275390625, "logps/rejected": -32.48918914794922, "loss": 0.5013, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0010934959864243865, "rewards/margins": -0.005789449438452721, "rewards/rejected": 0.004695953335613012, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542186e-07, "logits/chosen": -1.8646684885025024, "logits/rejected": -1.853882074356079, "logps/chosen": -33.567169189453125, "logps/rejected": -35.43851852416992, "loss": 0.5009, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0019382357131689787, "rewards/margins": -0.00470335315912962, "rewards/rejected": 0.0027651176787912846, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941118e-07, "logits/chosen": -1.9455400705337524, "logits/rejected": -1.947479486465454, "logps/chosen": -32.57770919799805, "logps/rejected": -33.181541442871094, "loss": 0.4989, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004494256805628538, "rewards/margins": 0.006526687648147345, "rewards/rejected": -0.00203243107534945, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413548e-07, "logits/chosen": -2.079427719116211, "logits/rejected": -2.084414482116699, "logps/chosen": -33.99236297607422, "logps/rejected": -36.600887298583984, "loss": 0.4989, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0007750942604616284, "rewards/margins": 0.005627653561532497, "rewards/rejected": -0.006402746774256229, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-07, "logits/chosen": -1.9420673847198486, "logits/rejected": -1.9452102184295654, "logps/chosen": -34.41423416137695, "logps/rejected": -34.5662727355957, "loss": 0.5004, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0027157063595950603, "rewards/margins": -0.0025114950258284807, "rewards/rejected": 0.00522720068693161, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.736716601303429e-07, "logits/chosen": -1.951348066329956, "logits/rejected": -1.9558594226837158, "logps/chosen": -32.46399688720703, "logps/rejected": -32.344329833984375, "loss": 0.5004, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0006849506171420217, "rewards/margins": -0.0022309008054435253, "rewards/rejected": 0.002915852004662156, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.62624545834521e-07, "logits/chosen": -2.0500051975250244, "logits/rejected": -2.0480096340179443, "logps/chosen": -32.25367736816406, "logps/rejected": -31.2783260345459, "loss": 0.5, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.002567791845649481, "rewards/margins": -0.00014422755339182913, "rewards/rejected": -0.0024235642049461603, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.244023323059082, "eval_logits/rejected": -2.2391417026519775, "eval_logps/chosen": -34.02798080444336, "eval_logps/rejected": -37.506290435791016, "eval_loss": 0.500300407409668, "eval_rewards/accuracies": 0.49335551261901855, "eval_rewards/chosen": 0.0013138726353645325, "eval_rewards/margins": -0.0007514380267821252, "eval_rewards/rejected": 0.0020653100218623877, "eval_runtime": 146.2282, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.4982572012636904e-07, "logits/chosen": -2.0061328411102295, "logits/rejected": -2.0037178993225098, "logps/chosen": -33.24314880371094, "logps/rejected": -34.023292541503906, "loss": 0.5003, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0001343002513749525, "rewards/margins": -0.001081160269677639, "rewards/rejected": 0.0012154604773968458, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777677e-07, "logits/chosen": -2.0173094272613525, "logits/rejected": -2.008920431137085, "logps/chosen": -32.45183563232422, "logps/rejected": -32.17412185668945, "loss": 0.5002, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0013863157946616411, "rewards/margins": 0.0011368464911356568, "rewards/rejected": -0.002523162867873907, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.194082707715275e-07, "logits/chosen": -2.0478129386901855, "logits/rejected": -2.0397789478302, "logps/chosen": -30.514944076538086, "logps/rejected": -32.053070068359375, "loss": 0.5012, "rewards/accuracies": 0.5, "rewards/chosen": -0.007845849730074406, "rewards/margins": -0.007833347655832767, "rewards/rejected": -1.2502726349339355e-05, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.020402418666621e-07, "logits/chosen": -1.978299856185913, "logits/rejected": -1.9885809421539307, "logps/chosen": -31.413021087646484, "logps/rejected": -32.54629135131836, "loss": 0.4995, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.0013283130247145891, "rewards/margins": 0.0036107772029936314, "rewards/rejected": -0.002282463712617755, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.8341962650351185e-07, "logits/chosen": -1.8922052383422852, "logits/rejected": -1.8932926654815674, "logps/chosen": -34.210052490234375, "logps/rejected": -34.78533935546875, "loss": 0.4998, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.003472150769084692, "rewards/margins": 0.0021447453182190657, "rewards/rejected": -0.005616897251456976, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800572e-07, "logits/chosen": -1.9435640573501587, "logits/rejected": -1.9400733709335327, "logps/chosen": -36.18402862548828, "logps/rejected": -32.75020217895508, "loss": 0.4998, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0024705410469323397, "rewards/margins": 0.0030191238038241863, "rewards/rejected": -0.00548966508358717, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.430433172111807e-07, "logits/chosen": -2.043527603149414, "logits/rejected": -2.0361220836639404, "logps/chosen": -33.81038284301758, "logps/rejected": -31.369457244873047, "loss": 0.4997, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.002503907773643732, "rewards/margins": 0.0010178396478295326, "rewards/rejected": -0.0035217474214732647, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.216202642830543e-07, "logits/chosen": -2.0494205951690674, "logits/rejected": -2.0546982288360596, "logps/chosen": -32.51402282714844, "logps/rejected": -32.5015983581543, "loss": 0.4982, "rewards/accuracies": 0.625, "rewards/chosen": 0.0061522433534264565, "rewards/margins": 0.009443378075957298, "rewards/rejected": -0.003291133791208267, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.9960716642946403e-07, "logits/chosen": -2.0493359565734863, "logits/rejected": -2.046536922454834, "logps/chosen": -31.47686195373535, "logps/rejected": -31.326452255249023, "loss": 0.4991, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.002418341813609004, "rewards/margins": 0.004066127352416515, "rewards/rejected": -0.0016477858880534768, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.771853789806683e-07, "logits/chosen": -1.9191436767578125, "logits/rejected": -1.9238201379776, "logps/chosen": -31.60672378540039, "logps/rejected": -32.80228805541992, "loss": 0.4994, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -8.77844140632078e-05, "rewards/margins": 0.002183270873501897, "rewards/rejected": -0.0022710547782480717, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.2449820041656494, "eval_logits/rejected": -2.2400975227355957, "eval_logps/chosen": -34.03920364379883, "eval_logps/rejected": -37.51057434082031, "eval_loss": 0.5004644989967346, "eval_rewards/accuracies": 0.5020764470100403, "eval_rewards/chosen": -0.0009311072644777596, "eval_rewards/margins": -0.002139872871339321, "eval_rewards/rejected": 0.0012087655486539006, "eval_runtime": 146.0512, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.294, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402e-07, "logits/chosen": -2.03255033493042, "logits/rejected": -2.0432262420654297, "logps/chosen": -31.951080322265625, "logps/rejected": -33.896484375, "loss": 0.498, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004044383764266968, "rewards/margins": 0.010801524855196476, "rewards/rejected": -0.006757141090929508, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.318564697655179e-07, "logits/chosen": -1.9258638620376587, "logits/rejected": -1.9407155513763428, "logps/chosen": -30.115543365478516, "logps/rejected": -31.575191497802734, "loss": 0.4992, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0008008191362023354, "rewards/margins": 0.002602284774184227, "rewards/rejected": -0.003403103444725275, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.093227910899832e-07, "logits/chosen": -1.983538269996643, "logits/rejected": -1.9875080585479736, "logps/chosen": -33.397071838378906, "logps/rejected": -31.56662940979004, "loss": 0.4985, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.004079356789588928, "rewards/margins": 0.007544734515249729, "rewards/rejected": -0.0034653779584914446, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279356e-07, "logits/chosen": -1.9831326007843018, "logits/rejected": -1.9611790180206299, "logps/chosen": -34.164363861083984, "logps/rejected": -34.951324462890625, "loss": 0.5004, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0021236296743154526, "rewards/margins": -0.0020176086109131575, "rewards/rejected": -0.00010602096881484613, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.654436768970182e-07, "logits/chosen": -2.0248496532440186, "logits/rejected": -2.021538734436035, "logps/chosen": -32.92078399658203, "logps/rejected": -36.21001434326172, "loss": 0.5005, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.0008573724189773202, "rewards/margins": -0.002124571241438389, "rewards/rejected": 0.002981943776831031, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.444597403062196e-07, "logits/chosen": -1.8912862539291382, "logits/rejected": -1.8888485431671143, "logps/chosen": -34.20059585571289, "logps/rejected": -35.51679992675781, "loss": 0.5001, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0024391734041273594, "rewards/margins": -3.7313438951969147e-06, "rewards/rejected": -0.002435441594570875, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.2434529917578887e-07, "logits/chosen": -1.876117467880249, "logits/rejected": -1.8735599517822266, "logps/chosen": -34.39899444580078, "logps/rejected": -31.73566246032715, "loss": 0.5014, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.00027935029356740415, "rewards/margins": -0.007413160987198353, "rewards/rejected": 0.007133810315281153, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603521e-07, "logits/chosen": -1.9805666208267212, "logits/rejected": -1.9699329137802124, "logps/chosen": -35.328285217285156, "logps/rejected": -31.84103012084961, "loss": 0.4988, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.005670648999512196, "rewards/margins": 0.0056780558079481125, "rewards/rejected": -7.406389158859383e-06, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071453e-08, "logits/chosen": -2.0757369995117188, "logits/rejected": -2.06070613861084, "logps/chosen": -30.9174861907959, "logps/rejected": -32.63935470581055, "loss": 0.5004, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.001327360630966723, "rewards/margins": -0.0011682776967063546, "rewards/rejected": 0.0024956378620117903, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-08, "logits/chosen": -1.947251319885254, "logits/rejected": -1.9447133541107178, "logps/chosen": -32.908634185791016, "logps/rejected": -30.82659912109375, "loss": 0.499, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0036955091636627913, "rewards/margins": 0.005653515458106995, "rewards/rejected": -0.001958005130290985, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2445528507232666, "eval_logits/rejected": -2.23966383934021, "eval_logps/chosen": -34.04175567626953, "eval_logps/rejected": -37.499141693115234, "eval_loss": 0.5009724497795105, "eval_rewards/accuracies": 0.4431063234806061, "eval_rewards/chosen": -0.0014407250564545393, "eval_rewards/margins": -0.004937068559229374, "eval_rewards/rejected": 0.003496343968436122, "eval_runtime": 145.9859, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589034e-08, "logits/chosen": -1.9292316436767578, "logits/rejected": -1.9259681701660156, "logps/chosen": -31.583276748657227, "logps/rejected": -33.727840423583984, "loss": 0.4993, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.004015020560473204, "rewards/margins": 0.001152882701717317, "rewards/rejected": 0.002862137509509921, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380912e-08, "logits/chosen": -1.9808372259140015, "logits/rejected": -1.9685356616973877, "logps/chosen": -34.57278823852539, "logps/rejected": -33.57910919189453, "loss": 0.498, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004657471086829901, "rewards/margins": 0.01271037757396698, "rewards/rejected": -0.008052906021475792, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-08, "logits/chosen": -2.0161705017089844, "logits/rejected": -2.0147035121917725, "logps/chosen": -33.47340393066406, "logps/rejected": -32.46953582763672, "loss": 0.4998, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.0024778605438768864, "rewards/margins": 0.00046821607975289226, "rewards/rejected": 0.0020096441730856895, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.05793773749158e-08, "logits/chosen": -2.1033377647399902, "logits/rejected": -2.0875496864318848, "logps/chosen": -34.152557373046875, "logps/rejected": -33.08795166015625, "loss": 0.5003, "rewards/accuracies": 0.5, "rewards/chosen": 0.005367305129766464, "rewards/margins": -0.0006489218212664127, "rewards/rejected": 0.006016227416694164, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.251801807404168e-08, "logits/chosen": -1.9754228591918945, "logits/rejected": -1.974477767944336, "logps/chosen": -33.24272537231445, "logps/rejected": -32.46410369873047, "loss": 0.4986, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.007874632254242897, "rewards/margins": 0.007666703313589096, "rewards/rejected": 0.00020792819850612432, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-09, "logits/chosen": -1.9320882558822632, "logits/rejected": -1.9424550533294678, "logps/chosen": -32.212371826171875, "logps/rejected": -35.286354064941406, "loss": 0.5003, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0015360517427325249, "rewards/margins": -0.002017115242779255, "rewards/rejected": 0.00048106274334713817, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050324e-09, "logits/chosen": -2.070864200592041, "logits/rejected": -2.0643177032470703, "logps/chosen": -33.652870178222656, "logps/rejected": -29.220972061157227, "loss": 0.5004, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.00018037435074802488, "rewards/margins": -0.001271072425879538, "rewards/rejected": 0.001090698060579598, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-10, "logits/chosen": -1.9306039810180664, "logits/rejected": -1.932790756225586, "logps/chosen": -34.26443862915039, "logps/rejected": -30.895788192749023, "loss": 0.5001, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.005078143440186977, "rewards/margins": -0.00044635325320996344, "rewards/rejected": -0.004631790332496166, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.4997496530607149, "train_runtime": 3257.6696, "train_samples_per_second": 0.945, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }