diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 4.0, + "epoch": 1.0, "eval_steps": 100, - "global_step": 1540, + "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -15,7 +15,7 @@ "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, - "loss": 0.5, + "loss": 0.3086, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -25,2537 +25,597 @@ { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, - "logits/chosen": -1.8665987253189087, - "logits/rejected": -1.8709272146224976, - "logps/chosen": -36.985595703125, - "logps/rejected": -33.68160629272461, - "loss": 0.4886, - "rewards/accuracies": 0.5694444179534912, - "rewards/chosen": 0.018904482945799828, - "rewards/margins": 0.06528304517269135, - "rewards/rejected": -0.04637856408953667, + "logits/chosen": -1.8665881156921387, + "logits/rejected": -1.8709055185317993, + "logps/chosen": -36.99662399291992, + "logps/rejected": -33.65571594238281, + "loss": 0.3052, + "rewards/accuracies": 0.5416666865348816, + "rewards/chosen": 0.008980684913694859, + "rewards/margins": 0.032059140503406525, + "rewards/rejected": -0.02307845838367939, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, - "logits/chosen": -1.997780203819275, - "logits/rejected": -2.000434398651123, - "logps/chosen": -29.643661499023438, - "logps/rejected": -29.043325424194336, - "loss": 0.5031, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.001316396868787706, - "rewards/margins": -0.019422104582190514, - "rewards/rejected": 0.018105709925293922, + "logits/chosen": -1.9977455139160156, + "logits/rejected": -2.000382423400879, + "logps/chosen": -29.642925262451172, + "logps/rejected": -29.056737899780273, + "loss": 0.3525, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.0006573178106918931, + "rewards/margins": -0.00669272243976593, + "rewards/rejected": 0.006035405211150646, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, - "logits/chosen": -1.9207446575164795, - "logits/rejected": -1.918060064315796, - "logps/chosen": -31.41064453125, - "logps/rejected": -33.227088928222656, - "loss": 0.4976, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 0.004905471112579107, - "rewards/margins": 0.012669263407588005, - "rewards/rejected": -0.007763790898025036, + "logits/chosen": -1.9204607009887695, + "logits/rejected": -1.9177772998809814, + "logps/chosen": -31.42336654663086, + "logps/rejected": -33.22785568237305, + "loss": 0.3603, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.006546213291585445, + "rewards/margins": 0.0019128695130348206, + "rewards/rejected": -0.00845908559858799, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, - "logits/chosen": -2.017446517944336, - "logits/rejected": -2.0087125301361084, - "logps/chosen": -32.553016662597656, - "logps/rejected": -32.50551986694336, - "loss": 0.4977, + "logits/chosen": -2.0172362327575684, + "logits/rejected": -2.008507251739502, + "logps/chosen": -32.56964874267578, + "logps/rejected": -32.50572967529297, + "loss": 0.3558, "rewards/accuracies": 0.5, - "rewards/chosen": 0.021415216848254204, - "rewards/margins": 0.014982220716774464, - "rewards/rejected": 0.006432999856770039, + "rewards/chosen": 0.006439635064452887, + "rewards/margins": 0.00019515231542754918, + "rewards/rejected": 0.006244482938200235, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, - "logits/chosen": -1.8627235889434814, - "logits/rejected": -1.851959228515625, - "logps/chosen": -33.5064697265625, - "logps/rejected": -35.43267059326172, - "loss": 0.4951, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.0459100641310215, - "rewards/margins": 0.02820250764489174, - "rewards/rejected": 0.017707552760839462, + "logits/chosen": -1.8618619441986084, + "logits/rejected": -1.8510783910751343, + "logps/chosen": -33.56026077270508, + "logps/rejected": -35.45254898071289, + "loss": 0.3691, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0025013976264744997, + "rewards/margins": -0.002315213903784752, + "rewards/rejected": -0.00018618404283188283, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, - "logits/chosen": -1.9425691366195679, - "logits/rejected": -1.94449782371521, - "logps/chosen": -32.46650695800781, - "logps/rejected": -33.15652847290039, - "loss": 0.4765, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.12030963599681854, - "rewards/margins": 0.10694190114736557, - "rewards/rejected": 0.013367725536227226, + "logits/chosen": -1.9393202066421509, + "logits/rejected": -1.9412600994110107, + "logps/chosen": -32.57838439941406, + "logps/rejected": -33.215576171875, + "loss": 0.3063, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.01962057128548622, + "rewards/margins": 0.05940054729580879, + "rewards/rejected": -0.03977997973561287, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, - "logits/chosen": -2.073408842086792, - "logits/rejected": -2.078367233276367, - "logps/chosen": -33.917694091796875, - "logps/rejected": -36.547218322753906, - "loss": 0.4901, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.06371410191059113, - "rewards/margins": 0.04422418028116226, - "rewards/rejected": 0.019489921629428864, + "logits/chosen": -2.0718436241149902, + "logits/rejected": -2.0768017768859863, + "logps/chosen": -33.97806167602539, + "logps/rejected": -36.63082504272461, + "loss": 0.4257, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00938049890100956, + "rewards/margins": 0.06513925641775131, + "rewards/rejected": -0.0557587556540966, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, - "logits/chosen": -1.9349607229232788, - "logits/rejected": -1.9380786418914795, - "logps/chosen": -34.223785400390625, - "logps/rejected": -34.53069305419922, - "loss": 0.4713, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.1836203634738922, - "rewards/margins": 0.12807974219322205, - "rewards/rejected": 0.05554063245654106, + "logits/chosen": -1.9351739883422852, + "logits/rejected": -1.9383188486099243, + "logps/chosen": -34.33073043823242, + "logps/rejected": -34.61904525756836, + "loss": 0.2902, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08736880123615265, + "rewards/margins": 0.11134655773639679, + "rewards/rejected": -0.023977745324373245, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, - "logits/chosen": -1.9439691305160522, - "logits/rejected": -1.9484784603118896, - "logps/chosen": -32.27050018310547, - "logps/rejected": -32.26476287841797, - "loss": 0.4779, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.1772255003452301, - "rewards/margins": 0.09250012785196304, - "rewards/rejected": 0.08472537249326706, + "logits/chosen": -1.944392204284668, + "logits/rejected": -1.9489190578460693, + "logps/chosen": -32.419586181640625, + "logps/rejected": -32.3698844909668, + "loss": 0.3611, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.043051257729530334, + "rewards/margins": 0.052930813282728195, + "rewards/rejected": -0.009879561141133308, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, - "logits/chosen": -2.0411603450775146, - "logits/rejected": -2.039163112640381, - "logps/chosen": -31.98573875427246, - "logps/rejected": -31.193227767944336, - "loss": 0.4639, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.22958631813526154, - "rewards/margins": 0.16390272974967957, - "rewards/rejected": 0.06568360328674316, + "logits/chosen": -2.0419769287109375, + "logits/rejected": -2.0399627685546875, + "logps/chosen": -32.174407958984375, + "logps/rejected": -31.26608657836914, + "loss": 0.3038, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.059785228222608566, + "rewards/margins": 0.05967242643237114, + "rewards/rejected": 0.00011279433965682983, "step": 100 }, { "epoch": 0.26, - "eval_logits/chosen": -2.235391855239868, - "eval_logits/rejected": -2.2305493354797363, - "eval_logps/chosen": -33.869815826416016, - "eval_logps/rejected": -37.382774353027344, - "eval_loss": 0.4939241409301758, - "eval_rewards/accuracies": 0.5627076625823975, - "eval_rewards/chosen": 0.1482628434896469, - "eval_rewards/margins": 0.02780282311141491, - "eval_rewards/rejected": 0.12046003341674805, - "eval_runtime": 145.9747, - "eval_samples_per_second": 2.35, + "eval_logits/chosen": -2.2372143268585205, + "eval_logits/rejected": -2.2323503494262695, + "eval_logps/chosen": -34.01276779174805, + "eval_logps/rejected": -37.51152420043945, + "eval_loss": 0.3511974811553955, + "eval_rewards/accuracies": 0.5423588156700134, + "eval_rewards/chosen": 0.01960929110646248, + "eval_rewards/margins": 0.01502405758947134, + "eval_rewards/rejected": 0.004585230257362127, + "eval_runtime": 145.9032, + "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, - "logits/chosen": -1.997287392616272, - "logits/rejected": -1.9949369430541992, - "logps/chosen": -32.96843719482422, - "logps/rejected": -33.866310119628906, - "loss": 0.4739, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 0.24784216284751892, - "rewards/margins": 0.10108550637960434, - "rewards/rejected": 0.14675670862197876, + "logits/chosen": -1.998891830444336, + "logits/rejected": -1.996492624282837, + "logps/chosen": -33.14598083496094, + "logps/rejected": -34.020729064941406, + "loss": 0.4689, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.08805312216281891, + "rewards/margins": 0.08027410507202148, + "rewards/rejected": 0.007779018487781286, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, - "logits/chosen": -2.008091688156128, - "logits/rejected": -1.9997599124908447, - "logps/chosen": -32.20352554321289, - "logps/rejected": -31.995223999023438, - "loss": 0.485, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.2172430008649826, - "rewards/margins": 0.06759083271026611, - "rewards/rejected": 0.1496521681547165, + "logits/chosen": -2.010932445526123, + "logits/rejected": -2.0025696754455566, + "logps/chosen": -32.37172317504883, + "logps/rejected": -32.118797302246094, + "loss": 0.4465, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.06586603820323944, + "rewards/margins": 0.02742874063551426, + "rewards/rejected": 0.03843729570508003, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, - "logits/chosen": -2.035614490509033, - "logits/rejected": -2.027682304382324, - "logps/chosen": -30.1588077545166, - "logps/rejected": -31.886260986328125, - "loss": 0.4717, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.2852162718772888, - "rewards/margins": 0.1351451873779297, - "rewards/rejected": 0.15007111430168152, + "logits/chosen": -2.0387539863586426, + "logits/rejected": -2.030724287033081, + "logps/chosen": -30.41655921936035, + "logps/rejected": -32.060333251953125, + "loss": 0.3844, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.05324209854006767, + "rewards/margins": 0.05983499437570572, + "rewards/rejected": -0.00659290561452508, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, - "logits/chosen": -1.965490698814392, - "logits/rejected": -1.9756921529769897, - "logps/chosen": -31.065088272094727, - "logps/rejected": -32.42934036254883, - "loss": 0.4482, + "logits/chosen": -1.967858076095581, + "logits/rejected": -1.9781148433685303, + "logps/chosen": -31.223413467407227, + "logps/rejected": -32.55517578125, + "loss": 0.3904, "rewards/accuracies": 0.6875, - "rewards/chosen": 0.3191176950931549, - "rewards/margins": 0.22413134574890137, - "rewards/rejected": 0.09498633444309235, + "rewards/chosen": 0.1766217201948166, + "rewards/margins": 0.1948881596326828, + "rewards/rejected": -0.01826643943786621, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, - "logits/chosen": -1.8782259225845337, - "logits/rejected": -1.8793823719024658, - "logps/chosen": -33.68832778930664, - "logps/rejected": -34.58278274536133, - "loss": 0.4367, + "logits/chosen": -1.880910873413086, + "logits/rejected": -1.8820507526397705, + "logps/chosen": -34.01464080810547, + "logps/rejected": -34.783546447753906, + "loss": 0.3871, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.45392999053001404, - "rewards/margins": 0.2969031035900116, - "rewards/rejected": 0.15702682733535767, + "rewards/chosen": 0.160243421792984, + "rewards/margins": 0.1839032918214798, + "rewards/rejected": -0.023659853264689445, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, - "logits/chosen": -1.9295704364776611, - "logits/rejected": -1.9261808395385742, - "logps/chosen": -35.74212646484375, - "logps/rejected": -32.51028060913086, - "loss": 0.4538, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.38659390807151794, - "rewards/margins": 0.1953679323196411, - "rewards/rejected": 0.1912260353565216, + "logits/chosen": -1.933895468711853, + "logits/rejected": -1.9304730892181396, + "logps/chosen": -36.02853775024414, + "logps/rejected": -32.699058532714844, + "loss": 0.2729, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1288261115550995, + "rewards/margins": 0.10750452429056168, + "rewards/rejected": 0.021321602165699005, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, - "logits/chosen": -2.031176805496216, - "logits/rejected": -2.023855686187744, - "logps/chosen": -33.24225616455078, - "logps/rejected": -31.193195343017578, - "loss": 0.42, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.5000473260879517, - "rewards/margins": 0.3572581112384796, - "rewards/rejected": 0.1427893042564392, + "logits/chosen": -2.0341715812683105, + "logits/rejected": -2.0267820358276367, + "logps/chosen": -33.55347442626953, + "logps/rejected": -31.3526554107666, + "loss": 0.294, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.21994857490062714, + "rewards/margins": 0.22067300975322723, + "rewards/rejected": -0.0007244400912895799, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, - "logits/chosen": -2.038222074508667, - "logits/rejected": -2.0434067249298096, - "logps/chosen": -31.95560646057129, - "logps/rejected": -32.17836380004883, - "loss": 0.4404, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.5302629470825195, - "rewards/margins": 0.2541634440422058, - "rewards/rejected": 0.2760995924472809, + "logits/chosen": -2.0400891304016113, + "logits/rejected": -2.045360565185547, + "logps/chosen": -32.370338439941406, + "logps/rejected": -32.4719123840332, + "loss": 0.2829, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.15700635313987732, + "rewards/margins": 0.145101398229599, + "rewards/rejected": 0.011904975399374962, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, - "logits/chosen": -2.0387401580810547, - "logits/rejected": -2.036006450653076, - "logps/chosen": -31.0674991607666, - "logps/rejected": -31.083877563476562, - "loss": 0.4607, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.3793107569217682, - "rewards/margins": 0.16841106116771698, - "rewards/rejected": 0.21089968085289001, + "logits/chosen": -2.041393280029297, + "logits/rejected": -2.038623809814453, + "logps/chosen": -31.328174591064453, + "logps/rejected": -31.316492080688477, + "loss": 0.3044, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.14470075070858002, + "rewards/margins": 0.1431477963924408, + "rewards/rejected": 0.0015529401134699583, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, - "logits/chosen": -1.9085681438446045, - "logits/rejected": -1.9132543802261353, - "logps/chosen": -31.083459854125977, - "logps/rejected": -32.602638244628906, - "loss": 0.4308, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.4705420434474945, - "rewards/margins": 0.301074743270874, - "rewards/rejected": 0.16946731507778168, + "logits/chosen": -1.9122215509414673, + "logits/rejected": -1.9168663024902344, + "logps/chosen": -31.424020767211914, + "logps/rejected": -32.784080505371094, + "loss": 0.3157, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.16403506696224213, + "rewards/margins": 0.1578713059425354, + "rewards/rejected": 0.0061637843027710915, "step": 200 }, { "epoch": 0.52, - "eval_logits/chosen": -2.2338287830352783, - "eval_logits/rejected": -2.229020357131958, - "eval_logps/chosen": -33.7449951171875, - "eval_logps/rejected": -37.27743911743164, - "eval_loss": 0.48942965269088745, - "eval_rewards/accuracies": 0.5544019937515259, - "eval_rewards/chosen": 0.26059985160827637, - "eval_rewards/margins": 0.0453372597694397, - "eval_rewards/rejected": 0.21526260673999786, - "eval_runtime": 145.8953, - "eval_samples_per_second": 2.351, - "eval_steps_per_second": 0.295, + "eval_logits/chosen": -2.237051010131836, + "eval_logits/rejected": -2.2321863174438477, + "eval_logps/chosen": -34.018070220947266, + "eval_logps/rejected": -37.51838684082031, + "eval_loss": 0.3716273605823517, + "eval_rewards/accuracies": 0.5245016813278198, + "eval_rewards/chosen": 0.014834923669695854, + "eval_rewards/margins": 0.016425320878624916, + "eval_rewards/rejected": -0.0015903981402516365, + "eval_runtime": 145.5151, + "eval_samples_per_second": 2.357, + "eval_steps_per_second": 0.296, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, - "logits/chosen": -2.0216596126556396, - "logits/rejected": -2.032275915145874, - "logps/chosen": -31.500268936157227, - "logps/rejected": -33.663352966308594, - "loss": 0.4458, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.42393389344215393, - "rewards/margins": 0.2445230931043625, - "rewards/rejected": 0.17941072583198547, + "logits/chosen": -2.023789882659912, + "logits/rejected": -2.034484386444092, + "logps/chosen": -31.767370223999023, + "logps/rejected": -33.890621185302734, + "loss": 0.275, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.18354059755802155, + "rewards/margins": 0.20867136120796204, + "rewards/rejected": -0.025130782276391983, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, - "logits/chosen": -1.9136396646499634, - "logits/rejected": -1.928344964981079, - "logps/chosen": -29.588964462280273, - "logps/rejected": -31.396224975585938, - "loss": 0.4269, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.4703185558319092, - "rewards/margins": 0.3245617151260376, - "rewards/rejected": 0.1457568258047104, + "logits/chosen": -1.916685700416565, + "logits/rejected": -1.9314892292022705, + "logps/chosen": -29.956628799438477, + "logps/rejected": -31.564035415649414, + "loss": 0.2958, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.13942097127437592, + "rewards/margins": 0.14469322562217712, + "rewards/rejected": -0.005272268317639828, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, - "logits/chosen": -1.970298171043396, - "logits/rejected": -1.974283218383789, - "logps/chosen": -32.81959915161133, - "logps/rejected": -31.408565521240234, - "loss": 0.4109, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.5380831956863403, - "rewards/margins": 0.4114208221435547, - "rewards/rejected": 0.12666237354278564, + "logits/chosen": -1.9737945795059204, + "logits/rejected": -1.9777710437774658, + "logps/chosen": -33.19129180908203, + "logps/rejected": -31.5566463470459, + "loss": 0.287, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.20355579257011414, + "rewards/margins": 0.21016716957092285, + "rewards/rejected": -0.0066113718785345554, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, - "logits/chosen": -1.9695065021514893, - "logits/rejected": -1.9477574825286865, - "logps/chosen": -33.58247756958008, - "logps/rejected": -34.828121185302734, - "loss": 0.4129, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.5141419172286987, - "rewards/margins": 0.403735876083374, - "rewards/rejected": 0.11040612310171127, + "logits/chosen": -1.9743419885635376, + "logits/rejected": -1.9523779153823853, + "logps/chosen": -33.9401969909668, + "logps/rejected": -35.008758544921875, + "loss": 0.3185, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.1921955645084381, + "rewards/margins": 0.24436470866203308, + "rewards/rejected": -0.052169155329465866, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, - "logits/chosen": -2.0098202228546143, - "logits/rejected": -2.0065340995788574, - "logps/chosen": -32.43529510498047, - "logps/rejected": -35.97461700439453, - "loss": 0.4514, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.44079580903053284, - "rewards/margins": 0.2155180424451828, - "rewards/rejected": 0.22527781128883362, + "logits/chosen": -2.0161709785461426, + "logits/rejected": -2.0128414630889893, + "logps/chosen": -32.74829864501953, + "logps/rejected": -36.236392974853516, + "loss": 0.2847, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.15909257531166077, + "rewards/margins": 0.1694144755601883, + "rewards/rejected": -0.010321905836462975, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, - "logits/chosen": -1.8770506381988525, - "logits/rejected": -1.8746120929718018, - "logps/chosen": -33.7199821472168, - "logps/rejected": -35.28092575073242, - "loss": 0.4498, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.42157459259033203, - "rewards/margins": 0.2202514111995697, - "rewards/rejected": 0.2013232260942459, + "logits/chosen": -1.8847742080688477, + "logits/rejected": -1.8823268413543701, + "logps/chosen": -34.01182174682617, + "logps/rejected": -35.481346130371094, + "loss": 0.3073, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.1589193046092987, + "rewards/margins": 0.13797220587730408, + "rewards/rejected": 0.020947108045220375, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, - "logits/chosen": -1.8618510961532593, - "logits/rejected": -1.8593294620513916, - "logps/chosen": -33.92017364501953, - "logps/rejected": -31.6002197265625, - "loss": 0.4397, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.42968273162841797, - "rewards/margins": 0.27568089962005615, - "rewards/rejected": 0.15400180220603943, + "logits/chosen": -1.8693536520004272, + "logits/rejected": -1.866838812828064, + "logps/chosen": -34.20549392700195, + "logps/rejected": -31.726673126220703, + "loss": 0.2733, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17289286851882935, + "rewards/margins": 0.1327010840177536, + "rewards/rejected": 0.040191780775785446, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, - "logits/chosen": -1.9657011032104492, - "logits/rejected": -1.9552650451660156, - "logps/chosen": -34.72232437133789, - "logps/rejected": -31.632369995117188, - "loss": 0.4114, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.5708868503570557, - "rewards/margins": 0.3831265866756439, - "rewards/rejected": 0.18776027858257294, + "logits/chosen": -1.9730831384658813, + "logits/rejected": -1.962480902671814, + "logps/chosen": -35.06049728393555, + "logps/rejected": -31.792781829833984, + "loss": 0.2523, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2665289342403412, + "rewards/margins": 0.22314274311065674, + "rewards/rejected": 0.04338619112968445, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, - "logits/chosen": -2.0614376068115234, - "logits/rejected": -2.046600341796875, - "logps/chosen": -30.400625228881836, - "logps/rejected": -32.34136199951172, - "loss": 0.456, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.4711507260799408, - "rewards/margins": 0.19172403216362, - "rewards/rejected": 0.2794266939163208, + "logits/chosen": -2.0680813789367676, + "logits/rejected": -2.053079128265381, + "logps/chosen": -30.738479614257812, + "logps/rejected": -32.61243438720703, + "loss": 0.3194, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.16707859933376312, + "rewards/margins": 0.13161785900592804, + "rewards/rejected": 0.035460732877254486, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, - "logits/chosen": -1.9332094192504883, - "logits/rejected": -1.9307467937469482, - "logps/chosen": -32.10976028442383, - "logps/rejected": -30.661523818969727, - "loss": 0.374, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.7356175184249878, - "rewards/margins": 0.5958597660064697, - "rewards/rejected": 0.13975778222084045, + "logits/chosen": -1.9398882389068604, + "logits/rejected": -1.9373395442962646, + "logps/chosen": -32.6181640625, + "logps/rejected": -30.843700408935547, + "loss": 0.2156, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2780519425868988, + "rewards/margins": 0.3022567331790924, + "rewards/rejected": -0.024204757064580917, "step": 300 }, { "epoch": 0.78, - "eval_logits/chosen": -2.2307660579681396, - "eval_logits/rejected": -2.22594952583313, - "eval_logps/chosen": -33.74896240234375, - "eval_logps/rejected": -37.275413513183594, - "eval_loss": 0.49038100242614746, - "eval_rewards/accuracies": 0.5220099687576294, - "eval_rewards/chosen": 0.2570302486419678, - "eval_rewards/margins": 0.039943769574165344, - "eval_rewards/rejected": 0.21708647906780243, - "eval_runtime": 145.8077, - "eval_samples_per_second": 2.352, + "eval_logits/chosen": -2.2364187240600586, + "eval_logits/rejected": -2.2315518856048584, + "eval_logps/chosen": -34.01428985595703, + "eval_logps/rejected": -37.496952056884766, + "eval_loss": 0.38450533151626587, + "eval_rewards/accuracies": 0.49335551261901855, + "eval_rewards/chosen": 0.018236981704831123, + "eval_rewards/margins": 0.0005384809919632971, + "eval_rewards/rejected": 0.01769850216805935, + "eval_runtime": 145.7485, + "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, - "grad_norm": 9.0, - "learning_rate": 4.84533120650964e-06, - "logits/chosen": -2.0677387714385986, - "logits/rejected": -2.055021286010742, - "logps/chosen": -31.79671859741211, - "logps/rejected": -32.5869026184082, - "loss": 0.3713, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.6585079431533813, - "rewards/margins": 0.6097704172134399, - "rewards/rejected": 0.04873766377568245, + "learning_rate": 5.576113578589035e-07, + "logits/chosen": -1.922579050064087, + "logits/rejected": -1.9193273782730103, + "logps/chosen": -31.345911026000977, + "logps/rejected": -33.72126007080078, + "loss": 0.2917, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.23169513046741486, + "rewards/margins": 0.21289470791816711, + "rewards/rejected": 0.018800420686602592, "step": 310 }, { "epoch": 0.83, - "grad_norm": 8.4375, - "learning_rate": 4.825108134172131e-06, - "logits/chosen": -1.979414939880371, - "logits/rejected": -1.9707914590835571, - "logps/chosen": -31.38620948791504, - "logps/rejected": -30.1413631439209, - "loss": 0.3423, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.8718989491462708, - "rewards/margins": 0.7818731069564819, - "rewards/rejected": 0.09002566337585449, + "learning_rate": 4.229036944380913e-07, + "logits/chosen": -1.9754743576049805, + "logits/rejected": -1.9631853103637695, + "logps/chosen": -34.408077239990234, + "logps/rejected": -33.58232879638672, + "loss": 0.2394, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1691979616880417, + "rewards/margins": 0.20833876729011536, + "rewards/rejected": -0.03914082050323486, "step": 320 }, { "epoch": 0.86, - "grad_norm": 6.5625, - "learning_rate": 4.80369052967602e-06, - "logits/chosen": -1.9164222478866577, - "logits/rejected": -1.9284029006958008, - "logps/chosen": -29.445837020874023, - "logps/rejected": -33.39659118652344, - "loss": 0.3128, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.9300888776779175, - "rewards/margins": 0.9249893426895142, - "rewards/rejected": 0.005099574103951454, + "learning_rate": 3.053082288996112e-07, + "logits/chosen": -2.0105607509613037, + "logits/rejected": -2.009115219116211, + "logps/chosen": -33.31591033935547, + "logps/rejected": -32.47368621826172, + "loss": 0.2769, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.15289874374866486, + "rewards/margins": 0.14759239554405212, + "rewards/rejected": 0.005306343547999859, "step": 330 }, { "epoch": 0.88, - "grad_norm": 11.0, - "learning_rate": 4.781089396387968e-06, - "logits/chosen": -1.8785717487335205, - "logits/rejected": -1.8694852590560913, - "logps/chosen": -33.58583450317383, - "logps/rejected": -35.898094177246094, - "loss": 0.3059, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.0017844438552856, - "rewards/margins": 1.0390136241912842, - "rewards/rejected": -0.037229109555482864, + "learning_rate": 2.0579377374915805e-07, + "logits/chosen": -2.096872091293335, + "logits/rejected": -2.0811073780059814, + "logps/chosen": -33.87510681152344, + "logps/rejected": -33.06427764892578, + "loss": 0.2792, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.27385497093200684, + "rewards/margins": 0.2254684418439865, + "rewards/rejected": 0.04838654398918152, "step": 340 }, { "epoch": 0.91, - "grad_norm": 6.28125, - "learning_rate": 4.757316345716554e-06, - "logits/chosen": -1.9316015243530273, - "logits/rejected": -1.9322454929351807, - "logps/chosen": -33.23583221435547, - "logps/rejected": -33.75827407836914, - "loss": 0.3127, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 1.0525028705596924, - "rewards/margins": 0.970870316028595, - "rewards/rejected": 0.08163277059793472, + "learning_rate": 1.2518018074041684e-07, + "logits/chosen": -1.969496488571167, + "logits/rejected": -1.9685735702514648, + "logps/chosen": -32.98945999145508, + "logps/rejected": -32.4643440246582, + "loss": 0.2958, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.26337358355522156, + "rewards/margins": 0.26264840364456177, + "rewards/rejected": 0.0007252089562825859, "step": 350 }, { "epoch": 0.94, - "grad_norm": 7.71875, - "learning_rate": 4.73238359114687e-06, - "logits/chosen": -2.0583748817443848, - "logits/rejected": -2.0645618438720703, - "logps/chosen": -30.67654037475586, - "logps/rejected": -32.611019134521484, - "loss": 0.3507, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.8070351481437683, - "rewards/margins": 0.7554978132247925, - "rewards/rejected": 0.05153726786375046, + "learning_rate": 6.41315865106129e-08, + "logits/chosen": -1.9255645275115967, + "logits/rejected": -1.9359004497528076, + "logps/chosen": -32.013362884521484, + "logps/rejected": -35.26326370239258, + "loss": 0.3425, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1721937507390976, + "rewards/margins": 0.14924712479114532, + "rewards/rejected": 0.022946633398532867, "step": 360 }, { "epoch": 0.96, - "grad_norm": 9.3125, - "learning_rate": 4.706303941965804e-06, - "logits/chosen": -1.9886471033096313, - "logits/rejected": -1.9882348775863647, - "logps/chosen": -32.35576629638672, - "logps/rejected": -35.93062210083008, - "loss": 0.3254, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.9802559018135071, - "rewards/margins": 0.873017430305481, - "rewards/rejected": 0.10723841190338135, + "learning_rate": 2.3150941078050325e-08, + "logits/chosen": -2.0643956661224365, + "logits/rejected": -2.057886838912964, + "logps/chosen": -33.48772430419922, + "logps/rejected": -29.191638946533203, + "loss": 0.298, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14782151579856873, + "rewards/margins": 0.11651048809289932, + "rewards/rejected": 0.03131101652979851, "step": 370 }, { "epoch": 0.99, - "grad_norm": 7.0, - "learning_rate": 4.679090796681225e-06, - "logits/chosen": -2.0210633277893066, - "logits/rejected": -2.0164623260498047, - "logps/chosen": -29.66098403930664, - "logps/rejected": -29.14764976501465, - "loss": 0.3253, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.944656491279602, - "rewards/margins": 0.8850381970405579, - "rewards/rejected": 0.059618253260850906, + "learning_rate": 2.575864278703266e-09, + "logits/chosen": -1.9235725402832031, + "logits/rejected": -1.9257465600967407, + "logps/chosen": -33.965919494628906, + "logps/rejected": -30.839218139648438, + "loss": 0.2616, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.24581687152385712, + "rewards/margins": 0.2157471626996994, + "rewards/rejected": 0.03006969951093197, "step": 380 }, { - "epoch": 1.01, - "grad_norm": 9.125, - "learning_rate": 4.650758136138454e-06, - "logits/chosen": -1.7931617498397827, - "logits/rejected": -1.7995964288711548, - "logps/chosen": -31.082958221435547, - "logps/rejected": -36.22032928466797, - "loss": 0.2684, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 1.1724803447723389, - "rewards/margins": 1.2643165588378906, - "rewards/rejected": -0.09183625876903534, - "step": 390 - }, - { - "epoch": 1.04, - "grad_norm": 6.71875, - "learning_rate": 4.621320516337559e-06, - "logits/chosen": -1.946929931640625, - "logits/rejected": -1.9408056735992432, - "logps/chosen": -32.456817626953125, - "logps/rejected": -32.25872039794922, - "loss": 0.2788, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 1.2382924556732178, - "rewards/margins": 1.1931540966033936, - "rewards/rejected": 0.045138586312532425, - "step": 400 - }, - { - "epoch": 1.04, - "eval_logits/chosen": -2.208998918533325, - "eval_logits/rejected": -2.2041895389556885, - "eval_logps/chosen": -33.622596740722656, - "eval_logps/rejected": -37.19999694824219, - "eval_loss": 0.47984185814857483, - "eval_rewards/accuracies": 0.5772424936294556, - "eval_rewards/chosen": 0.370761513710022, - "eval_rewards/margins": 0.08579742908477783, - "eval_rewards/rejected": 0.28496408462524414, - "eval_runtime": 145.5426, - "eval_samples_per_second": 2.357, - "eval_steps_per_second": 0.295, - "step": 400 - }, - { - "epoch": 1.06, - "grad_norm": 7.46875, - "learning_rate": 4.590793060955158e-06, - "logits/chosen": -1.944295883178711, - "logits/rejected": -1.9515262842178345, - "logps/chosen": -28.099727630615234, - "logps/rejected": -29.202655792236328, - "loss": 0.3113, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.9185803532600403, - "rewards/margins": 1.0125702619552612, - "rewards/rejected": -0.09398989379405975, - "step": 410 - }, - { - "epoch": 1.09, - "grad_norm": 5.90625, - "learning_rate": 4.559191453574582e-06, - "logits/chosen": -1.9665906429290771, - "logits/rejected": -1.9656177759170532, - "logps/chosen": -33.038299560546875, - "logps/rejected": -30.731185913085938, - "loss": 0.3292, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 1.091166377067566, - "rewards/margins": 0.9004921913146973, - "rewards/rejected": 0.19067440927028656, - "step": 420 - }, - { - "epoch": 1.12, - "grad_norm": 6.5625, - "learning_rate": 4.52653192962838e-06, - "logits/chosen": -1.960172414779663, - "logits/rejected": -1.9431111812591553, - "logps/chosen": -29.893768310546875, - "logps/rejected": -33.135128021240234, - "loss": 0.2907, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.9750927686691284, - "rewards/margins": 1.1630522012710571, - "rewards/rejected": -0.18795931339263916, - "step": 430 - }, - { - "epoch": 1.14, - "grad_norm": 7.5, - "learning_rate": 4.492831268057307e-06, - "logits/chosen": -1.9910480976104736, - "logits/rejected": -1.9929386377334595, - "logps/chosen": -35.075401306152344, - "logps/rejected": -34.91225051879883, - "loss": 0.2412, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.3832672834396362, - "rewards/margins": 1.4303205013275146, - "rewards/rejected": -0.047053247690200806, - "step": 440 - }, - { - "epoch": 1.17, - "grad_norm": 7.40625, - "learning_rate": 4.458106782690094e-06, - "logits/chosen": -2.0687942504882812, - "logits/rejected": -2.0686755180358887, - "logps/chosen": -31.33111000061035, - "logps/rejected": -33.56393051147461, - "loss": 0.2933, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 1.1720173358917236, - "rewards/margins": 1.1754536628723145, - "rewards/rejected": -0.003436268772929907, - "step": 450 - }, - { - "epoch": 1.19, - "grad_norm": 6.21875, - "learning_rate": 4.422376313348405e-06, - "logits/chosen": -2.0110697746276855, - "logits/rejected": -2.00362229347229, - "logps/chosen": -30.857044219970703, - "logps/rejected": -36.18989181518555, - "loss": 0.2415, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.3940247297286987, - "rewards/margins": 1.516292691230774, - "rewards/rejected": -0.122267946600914, - "step": 460 - }, - { - "epoch": 1.22, - "grad_norm": 7.5, - "learning_rate": 4.3856582166815696e-06, - "logits/chosen": -1.9154274463653564, - "logits/rejected": -1.911982774734497, - "logps/chosen": -32.375511169433594, - "logps/rejected": -32.90439224243164, - "loss": 0.258, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 1.401104211807251, - "rewards/margins": 1.383563756942749, - "rewards/rejected": 0.017540520057082176, - "step": 470 - }, - { - "epoch": 1.25, - "grad_norm": 5.9375, - "learning_rate": 4.347971356735789e-06, - "logits/chosen": -2.04083251953125, - "logits/rejected": -2.0339038372039795, - "logps/chosen": -29.64451026916504, - "logps/rejected": -32.0025634765625, - "loss": 0.3003, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 1.0656226873397827, - "rewards/margins": 1.0737924575805664, - "rewards/rejected": -0.008169800043106079, - "step": 480 - }, - { - "epoch": 1.27, - "grad_norm": 5.75, - "learning_rate": 4.309335095262675e-06, - "logits/chosen": -1.9866294860839844, - "logits/rejected": -1.9882761240005493, - "logps/chosen": -33.75426483154297, - "logps/rejected": -33.61265182495117, - "loss": 0.2426, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.439030647277832, - "rewards/margins": 1.4130573272705078, - "rewards/rejected": 0.02597307227551937, - "step": 490 - }, - { - "epoch": 1.3, - "grad_norm": 7.5625, - "learning_rate": 4.269769281772082e-06, - "logits/chosen": -1.875967025756836, - "logits/rejected": -1.873674988746643, - "logps/chosen": -31.584508895874023, - "logps/rejected": -36.59474563598633, - "loss": 0.2438, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.4290008544921875, - "rewards/margins": 1.5463793277740479, - "rewards/rejected": -0.11737833172082901, - "step": 500 - }, - { - "epoch": 1.3, - "eval_logits/chosen": -2.2097504138946533, - "eval_logits/rejected": -2.2049450874328613, - "eval_logps/chosen": -33.5948486328125, - "eval_logps/rejected": -37.14616775512695, - "eval_loss": 0.48535072803497314, - "eval_rewards/accuracies": 0.5394518375396729, - "eval_rewards/chosen": 0.3957318663597107, - "eval_rewards/margins": 0.06232503429055214, - "eval_rewards/rejected": 0.33340683579444885, - "eval_runtime": 145.3203, - "eval_samples_per_second": 2.36, - "eval_steps_per_second": 0.296, - "step": 500 - }, - { - "epoch": 1.32, - "grad_norm": 5.6875, - "learning_rate": 4.22929424333435e-06, - "logits/chosen": -1.9793621301651, - "logits/rejected": -1.984104871749878, - "logps/chosen": -31.774097442626953, - "logps/rejected": -31.5977725982666, - "loss": 0.2427, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 1.5149345397949219, - "rewards/margins": 1.4714324474334717, - "rewards/rejected": 0.043502308428287506, - "step": 510 - }, - { - "epoch": 1.35, - "grad_norm": 6.5, - "learning_rate": 4.1879307741372085e-06, - "logits/chosen": -2.0099575519561768, - "logits/rejected": -2.0209403038024902, - "logps/chosen": -29.757198333740234, - "logps/rejected": -31.555334091186523, - "loss": 0.2409, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.4698936939239502, - "rewards/margins": 1.5127770900726318, - "rewards/rejected": -0.04288337752223015, - "step": 520 - }, - { - "epoch": 1.38, - "grad_norm": 4.625, - "learning_rate": 4.145700124802693e-06, - "logits/chosen": -1.9370012283325195, - "logits/rejected": -1.9336122274398804, - "logps/chosen": -31.002681732177734, - "logps/rejected": -32.36973190307617, - "loss": 0.2675, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 1.2760448455810547, - "rewards/margins": 1.2823030948638916, - "rewards/rejected": -0.006258136127144098, - "step": 530 - }, - { - "epoch": 1.4, - "grad_norm": 7.3125, - "learning_rate": 4.102623991469562e-06, - "logits/chosen": -1.8017065525054932, - "logits/rejected": -1.8108688592910767, - "logps/chosen": -30.88136863708496, - "logps/rejected": -31.948009490966797, - "loss": 0.2414, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.5192195177078247, - "rewards/margins": 1.6020395755767822, - "rewards/rejected": -0.08282008022069931, - "step": 540 - }, - { - "epoch": 1.43, - "grad_norm": 6.375, - "learning_rate": 4.058724504646834e-06, - "logits/chosen": -1.8965680599212646, - "logits/rejected": -1.8902814388275146, - "logps/chosen": -31.88201332092285, - "logps/rejected": -30.860742568969727, - "loss": 0.2485, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.6436617374420166, - "rewards/margins": 1.5261151790618896, - "rewards/rejected": 0.1175464615225792, - "step": 550 - }, - { - "epoch": 1.45, - "grad_norm": 4.71875, - "learning_rate": 4.014024217844167e-06, - "logits/chosen": -1.986634612083435, - "logits/rejected": -1.9846274852752686, - "logps/chosen": -32.79119110107422, - "logps/rejected": -31.430110931396484, - "loss": 0.2457, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 1.6020066738128662, - "rewards/margins": 1.5414594411849976, - "rewards/rejected": 0.060547251254320145, - "step": 560 - }, - { - "epoch": 1.48, - "grad_norm": 7.3125, - "learning_rate": 3.968546095984911e-06, - "logits/chosen": -1.8208553791046143, - "logits/rejected": -1.8186527490615845, - "logps/chosen": -31.178722381591797, - "logps/rejected": -30.978708267211914, - "loss": 0.2544, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 1.455919623374939, - "rewards/margins": 1.4346846342086792, - "rewards/rejected": 0.021234944462776184, - "step": 570 - }, - { - "epoch": 1.51, - "grad_norm": 6.125, - "learning_rate": 3.922313503607806e-06, - "logits/chosen": -1.9555221796035767, - "logits/rejected": -1.95218026638031, - "logps/chosen": -29.46148681640625, - "logps/rejected": -34.874351501464844, - "loss": 0.2218, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.4788440465927124, - "rewards/margins": 1.7022136449813843, - "rewards/rejected": -0.22336962819099426, - "step": 580 - }, - { - "epoch": 1.53, - "grad_norm": 6.46875, - "learning_rate": 3.875350192863368e-06, - "logits/chosen": -1.893742322921753, - "logits/rejected": -1.8973023891448975, - "logps/chosen": -28.269176483154297, - "logps/rejected": -30.657907485961914, - "loss": 0.2751, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 1.2112681865692139, - "rewards/margins": 1.210404634475708, - "rewards/rejected": 0.0008634254336357117, - "step": 590 - }, - { - "epoch": 1.56, - "grad_norm": 7.8125, - "learning_rate": 3.8276802913111436e-06, - "logits/chosen": -1.931885004043579, - "logits/rejected": -1.9316871166229248, - "logps/chosen": -30.367956161499023, - "logps/rejected": -31.157052993774414, - "loss": 0.233, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.6682202816009521, - "rewards/margins": 1.6358535289764404, - "rewards/rejected": 0.032366663217544556, - "step": 600 - }, - { - "epoch": 1.56, - "eval_logits/chosen": -2.196866750717163, - "eval_logits/rejected": -2.192082166671753, - "eval_logps/chosen": -33.557369232177734, - "eval_logps/rejected": -37.16165542602539, - "eval_loss": 0.4758525788784027, - "eval_rewards/accuracies": 0.550664484500885, - "eval_rewards/chosen": 0.42946678400039673, - "eval_rewards/margins": 0.10999691486358643, - "eval_rewards/rejected": 0.3194698989391327, - "eval_runtime": 145.1602, - "eval_samples_per_second": 2.363, - "eval_steps_per_second": 0.296, - "step": 600 - }, - { - "epoch": 1.58, - "grad_norm": 6.4375, - "learning_rate": 3.7793282895240927e-06, - "logits/chosen": -1.9942924976348877, - "logits/rejected": -1.9949829578399658, - "logps/chosen": -32.94305419921875, - "logps/rejected": -33.07844543457031, - "loss": 0.2036, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.878381371498108, - "rewards/margins": 1.8732118606567383, - "rewards/rejected": 0.00516938790678978, - "step": 610 - }, - { - "epoch": 1.61, - "grad_norm": 4.3125, - "learning_rate": 3.730319028506478e-06, - "logits/chosen": -1.9580987691879272, - "logits/rejected": -1.9557793140411377, - "logps/chosen": -31.34842872619629, - "logps/rejected": -32.18495178222656, - "loss": 0.226, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.599273920059204, - "rewards/margins": 1.702805757522583, - "rewards/rejected": -0.10353174060583115, - "step": 620 - }, - { - "epoch": 1.64, - "grad_norm": 8.375, - "learning_rate": 3.6806776869317074e-06, - "logits/chosen": -1.971247911453247, - "logits/rejected": -1.9623386859893799, - "logps/chosen": -30.84600830078125, - "logps/rejected": -31.0206356048584, - "loss": 0.2237, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 1.737097144126892, - "rewards/margins": 1.8138401508331299, - "rewards/rejected": -0.07674300670623779, - "step": 630 - }, - { - "epoch": 1.66, - "grad_norm": 7.53125, - "learning_rate": 3.6304297682067146e-06, - "logits/chosen": -1.9715744256973267, - "logits/rejected": -1.9683929681777954, - "logps/chosen": -30.522232055664062, - "logps/rejected": -32.50453567504883, - "loss": 0.2273, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.5085704326629639, - "rewards/margins": 1.585178256034851, - "rewards/rejected": -0.07660768926143646, - "step": 640 - }, - { - "epoch": 1.69, - "grad_norm": 6.09375, - "learning_rate": 3.579601087369492e-06, - "logits/chosen": -1.978017807006836, - "logits/rejected": -1.980383276939392, - "logps/chosen": -31.817895889282227, - "logps/rejected": -33.76380920410156, - "loss": 0.2009, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.6522296667099, - "rewards/margins": 1.7246001958847046, - "rewards/rejected": -0.07237066328525543, - "step": 650 - }, - { - "epoch": 1.71, - "grad_norm": 8.75, - "learning_rate": 3.5282177578265295e-06, - "logits/chosen": -1.893317461013794, - "logits/rejected": -1.8938663005828857, - "logps/chosen": -32.03578186035156, - "logps/rejected": -31.60036277770996, - "loss": 0.2051, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.7199455499649048, - "rewards/margins": 1.7137399911880493, - "rewards/rejected": 0.006205317564308643, - "step": 660 - }, - { - "epoch": 1.74, - "grad_norm": 6.90625, - "learning_rate": 3.476306177936961e-06, - "logits/chosen": -1.9339790344238281, - "logits/rejected": -1.9244499206542969, - "logps/chosen": -32.11154556274414, - "logps/rejected": -32.44702911376953, - "loss": 0.2005, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.7007629871368408, - "rewards/margins": 1.8260762691497803, - "rewards/rejected": -0.1253131628036499, - "step": 670 - }, - { - "epoch": 1.77, - "grad_norm": 3.796875, - "learning_rate": 3.423893017450324e-06, - "logits/chosen": -1.8335208892822266, - "logits/rejected": -1.8305240869522095, - "logps/chosen": -29.47957420349121, - "logps/rejected": -34.32526397705078, - "loss": 0.2123, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.7832454442977905, - "rewards/margins": 1.7892627716064453, - "rewards/rejected": -0.0060173808597028255, - "step": 680 - }, - { - "epoch": 1.79, - "grad_norm": 5.09375, - "learning_rate": 3.3710052038048794e-06, - "logits/chosen": -1.8939129114151, - "logits/rejected": -1.8931477069854736, - "logps/chosen": -33.063987731933594, - "logps/rejected": -35.4408073425293, - "loss": 0.1817, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.9856712818145752, - "rewards/margins": 2.0639584064483643, - "rewards/rejected": -0.07828731834888458, - "step": 690 - }, - { - "epoch": 1.82, - "grad_norm": 4.09375, - "learning_rate": 3.3176699082935546e-06, - "logits/chosen": -1.871268630027771, - "logits/rejected": -1.874311089515686, - "logps/chosen": -30.577835083007812, - "logps/rejected": -35.52479553222656, - "loss": 0.2347, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.8225120306015015, - "rewards/margins": 1.7173261642456055, - "rewards/rejected": 0.10518588870763779, - "step": 700 - }, - { - "epoch": 1.82, - "eval_logits/chosen": -2.190619945526123, - "eval_logits/rejected": -2.1858627796173096, - "eval_logps/chosen": -33.56110763549805, - "eval_logps/rejected": -37.17965316772461, - "eval_loss": 0.47246184945106506, - "eval_rewards/accuracies": 0.5921927094459534, - "eval_rewards/chosen": 0.42609870433807373, - "eval_rewards/margins": 0.12282571941614151, - "eval_rewards/rejected": 0.3032729923725128, - "eval_runtime": 145.3847, - "eval_samples_per_second": 2.359, - "eval_steps_per_second": 0.296, - "step": 700 - }, - { - "epoch": 1.84, - "grad_norm": 6.84375, - "learning_rate": 3.2639145321045933e-06, - "logits/chosen": -1.9779857397079468, - "logits/rejected": -1.9807437658309937, - "logps/chosen": -32.9129638671875, - "logps/rejected": -34.280601501464844, - "loss": 0.2289, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 1.7843647003173828, - "rewards/margins": 1.7937663793563843, - "rewards/rejected": -0.009401577524840832, - "step": 710 - }, - { - "epoch": 1.87, - "grad_norm": 8.0, - "learning_rate": 3.2097666922441107e-06, - "logits/chosen": -1.8327629566192627, - "logits/rejected": -1.8268474340438843, - "logps/chosen": -32.74610137939453, - "logps/rejected": -32.28891372680664, - "loss": 0.2118, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 1.8784115314483643, - "rewards/margins": 1.8217761516571045, - "rewards/rejected": 0.0566352978348732, - "step": 720 - }, - { - "epoch": 1.9, - "grad_norm": 3.96875, - "learning_rate": 3.1552542073477554e-06, - "logits/chosen": -2.0041515827178955, - "logits/rejected": -2.001244068145752, - "logps/chosen": -28.74154281616211, - "logps/rejected": -31.609844207763672, - "loss": 0.2078, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.720298409461975, - "rewards/margins": 1.8657814264297485, - "rewards/rejected": -0.14548318088054657, - "step": 730 - }, - { - "epoch": 1.92, - "grad_norm": 4.125, - "learning_rate": 3.100405083388799e-06, - "logits/chosen": -1.8444658517837524, - "logits/rejected": -1.8444955348968506, - "logps/chosen": -31.312763214111328, - "logps/rejected": -37.6312141418457, - "loss": 0.1873, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.9353710412979126, - "rewards/margins": 1.938436508178711, - "rewards/rejected": -0.003065618919208646, - "step": 740 - }, - { - "epoch": 1.95, - "grad_norm": 3.0625, - "learning_rate": 3.0452474992899645e-06, - "logits/chosen": -1.727169394493103, - "logits/rejected": -1.7323789596557617, - "logps/chosen": -34.66059112548828, - "logps/rejected": -33.93421173095703, - "loss": 0.197, - "rewards/accuracies": 0.875, - "rewards/chosen": 2.044684648513794, - "rewards/margins": 2.01037335395813, - "rewards/rejected": 0.03431138023734093, - "step": 750 - }, - { - "epoch": 1.97, - "grad_norm": 5.90625, - "learning_rate": 2.989809792446417e-06, - "logits/chosen": -1.933237075805664, - "logits/rejected": -1.934692621231079, - "logps/chosen": -30.707813262939453, - "logps/rejected": -32.81720733642578, - "loss": 0.2075, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.7080799341201782, - "rewards/margins": 1.7422746419906616, - "rewards/rejected": -0.034194689244031906, - "step": 760 - }, - { - "epoch": 2.0, - "grad_norm": 6.0, - "learning_rate": 2.9341204441673267e-06, - "logits/chosen": -1.9043340682983398, - "logits/rejected": -1.9034773111343384, - "logps/chosen": -30.087554931640625, - "logps/rejected": -34.8135871887207, - "loss": 0.2157, - "rewards/accuracies": 0.8833333849906921, - "rewards/chosen": 1.879249930381775, - "rewards/margins": 1.6861387491226196, - "rewards/rejected": 0.19311121106147766, - "step": 770 - }, - { - "epoch": 2.03, - "grad_norm": 4.0625, - "learning_rate": 2.878208065043501e-06, - "logits/chosen": -1.904748558998108, - "logits/rejected": -1.9041064977645874, - "logps/chosen": -32.38899612426758, - "logps/rejected": -31.7285099029541, - "loss": 0.1535, - "rewards/accuracies": 0.9375, - "rewards/chosen": 2.033520460128784, - "rewards/margins": 2.3116116523742676, - "rewards/rejected": -0.27809059619903564, - "step": 780 - }, - { - "epoch": 2.05, - "grad_norm": 5.0625, - "learning_rate": 2.8221013802485974e-06, - "logits/chosen": -1.9349530935287476, - "logits/rejected": -1.9334694147109985, - "logps/chosen": -27.4735050201416, - "logps/rejected": -33.12495040893555, - "loss": 0.1493, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.8807083368301392, - "rewards/margins": 2.3204848766326904, - "rewards/rejected": -0.4397762715816498, - "step": 790 - }, - { - "epoch": 2.08, - "grad_norm": 3.53125, - "learning_rate": 2.76582921478147e-06, - "logits/chosen": -1.9856784343719482, - "logits/rejected": -1.9823191165924072, - "logps/chosen": -30.055683135986328, - "logps/rejected": -34.58222961425781, - "loss": 0.1395, - "rewards/accuracies": 0.9375, - "rewards/chosen": 2.134375810623169, - "rewards/margins": 2.4792473316192627, - "rewards/rejected": -0.3448713719844818, - "step": 800 - }, - { - "epoch": 2.08, - "eval_logits/chosen": -2.1801018714904785, - "eval_logits/rejected": -2.175370454788208, - "eval_logps/chosen": -33.61674499511719, - "eval_logps/rejected": -37.23181915283203, - "eval_loss": 0.47494587302207947, - "eval_rewards/accuracies": 0.5568937063217163, - "eval_rewards/chosen": 0.37602853775024414, - "eval_rewards/margins": 0.11970727145671844, - "eval_rewards/rejected": 0.2563212811946869, - "eval_runtime": 145.2165, - "eval_samples_per_second": 2.362, - "eval_steps_per_second": 0.296, - "step": 800 - }, - { - "epoch": 2.1, - "grad_norm": 2.484375, - "learning_rate": 2.7094204786572254e-06, - "logits/chosen": -1.8192085027694702, - "logits/rejected": -1.8114818334579468, - "logps/chosen": -31.662174224853516, - "logps/rejected": -35.00957489013672, - "loss": 0.1102, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 2.5146613121032715, - "rewards/margins": 2.919175624847412, - "rewards/rejected": -0.4045144021511078, - "step": 810 - }, - { - "epoch": 2.13, - "grad_norm": 4.78125, - "learning_rate": 2.6529041520546072e-06, - "logits/chosen": -1.8948942422866821, - "logits/rejected": -1.9053504467010498, - "logps/chosen": -33.113929748535156, - "logps/rejected": -32.70395278930664, - "loss": 0.153, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.3235878944396973, - "rewards/margins": 2.5688529014587402, - "rewards/rejected": -0.24526500701904297, - "step": 820 - }, - { - "epoch": 2.16, - "grad_norm": 3.9375, - "learning_rate": 2.5963092704273302e-06, - "logits/chosen": -1.9460933208465576, - "logits/rejected": -1.9504921436309814, - "logps/chosen": -32.59149169921875, - "logps/rejected": -29.507518768310547, - "loss": 0.1482, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 2.0801615715026855, - "rewards/margins": 2.2955098152160645, - "rewards/rejected": -0.21534815430641174, - "step": 830 - }, - { - "epoch": 2.18, - "grad_norm": 5.40625, - "learning_rate": 2.53966490958702e-06, - "logits/chosen": -1.9453834295272827, - "logits/rejected": -1.95318603515625, - "logps/chosen": -32.17658233642578, - "logps/rejected": -30.290695190429688, - "loss": 0.1261, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.3503057956695557, - "rewards/margins": 2.7489988803863525, - "rewards/rejected": -0.39869317412376404, - "step": 840 - }, - { - "epoch": 2.21, - "grad_norm": 4.3125, - "learning_rate": 2.4830001707654135e-06, - "logits/chosen": -1.8776146173477173, - "logits/rejected": -1.8682146072387695, - "logps/chosen": -29.36258888244629, - "logps/rejected": -32.322593688964844, - "loss": 0.1372, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.1875457763671875, - "rewards/margins": 2.4654555320739746, - "rewards/rejected": -0.27790942788124084, - "step": 850 - }, - { - "epoch": 2.23, - "grad_norm": 3.96875, - "learning_rate": 2.4263441656635054e-06, - "logits/chosen": -2.0152745246887207, - "logits/rejected": -2.0056991577148438, - "logps/chosen": -23.986736297607422, - "logps/rejected": -30.276992797851562, - "loss": 0.1555, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 2.0136044025421143, - "rewards/margins": 2.3777928352355957, - "rewards/rejected": -0.3641887307167053, - "step": 860 - }, - { - "epoch": 2.26, - "grad_norm": 3.6875, - "learning_rate": 2.3697260014953107e-06, - "logits/chosen": -1.8663276433944702, - "logits/rejected": -1.867753267288208, - "logps/chosen": -31.755935668945312, - "logps/rejected": -30.341991424560547, - "loss": 0.1304, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 2.3193624019622803, - "rewards/margins": 2.6429049968719482, - "rewards/rejected": -0.3235425353050232, - "step": 870 - }, - { - "epoch": 2.29, - "grad_norm": 3.203125, - "learning_rate": 2.3131747660339396e-06, - "logits/chosen": -1.8777776956558228, - "logits/rejected": -1.878570318222046, - "logps/chosen": -30.02212142944336, - "logps/rejected": -33.44975662231445, - "loss": 0.1505, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 2.1304116249084473, - "rewards/margins": 2.4893665313720703, - "rewards/rejected": -0.3589547574520111, - "step": 880 - }, - { - "epoch": 2.31, - "grad_norm": 3.21875, - "learning_rate": 2.256719512667651e-06, - "logits/chosen": -1.7799694538116455, - "logits/rejected": -1.7784477472305298, - "logps/chosen": -33.0731086730957, - "logps/rejected": -36.56109619140625, - "loss": 0.1232, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.3326289653778076, - "rewards/margins": 2.94810152053833, - "rewards/rejected": -0.6154726147651672, - "step": 890 - }, - { - "epoch": 2.34, - "grad_norm": 3.34375, - "learning_rate": 2.2003892454735786e-06, - "logits/chosen": -1.9299914836883545, - "logits/rejected": -1.9232423305511475, - "logps/chosen": -29.402902603149414, - "logps/rejected": -33.13259506225586, - "loss": 0.131, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 2.3397605419158936, - "rewards/margins": 2.676835536956787, - "rewards/rejected": -0.3370749354362488, - "step": 900 - }, - { - "epoch": 2.34, - "eval_logits/chosen": -2.178741455078125, - "eval_logits/rejected": -2.1739933490753174, - "eval_logps/chosen": -33.59278106689453, - "eval_logps/rejected": -37.21134948730469, - "eval_loss": 0.4745200574398041, - "eval_rewards/accuracies": 0.5714285373687744, - "eval_rewards/chosen": 0.39759397506713867, - "eval_rewards/margins": 0.12284980714321136, - "eval_rewards/rejected": 0.2747441232204437, - "eval_runtime": 145.1513, - "eval_samples_per_second": 2.363, - "eval_steps_per_second": 0.296, - "step": 900 - }, - { - "epoch": 2.36, - "grad_norm": 4.09375, - "learning_rate": 2.1442129043167877e-06, - "logits/chosen": -1.971784234046936, - "logits/rejected": -1.9671688079833984, - "logps/chosen": -31.10721778869629, - "logps/rejected": -34.966033935546875, - "loss": 0.1265, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.2054171562194824, - "rewards/margins": 2.5990865230560303, - "rewards/rejected": -0.3936692774295807, - "step": 910 - }, - { - "epoch": 2.39, - "grad_norm": 3.859375, - "learning_rate": 2.088219349982323e-06, - "logits/chosen": -1.8900690078735352, - "logits/rejected": -1.895115613937378, - "logps/chosen": -32.61782455444336, - "logps/rejected": -32.775604248046875, - "loss": 0.1299, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.457686424255371, - "rewards/margins": 2.7058145999908447, - "rewards/rejected": -0.2481282651424408, - "step": 920 - }, - { - "epoch": 2.42, - "grad_norm": 4.3125, - "learning_rate": 2.0324373493478803e-06, - "logits/chosen": -1.9874694347381592, - "logits/rejected": -1.9783003330230713, - "logps/chosen": -29.76763916015625, - "logps/rejected": -34.72348403930664, - "loss": 0.1274, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 2.265561819076538, - "rewards/margins": 2.518176555633545, - "rewards/rejected": -0.252614825963974, - "step": 930 - }, - { - "epoch": 2.44, - "grad_norm": 3.984375, - "learning_rate": 1.976895560604729e-06, - "logits/chosen": -1.9131931066513062, - "logits/rejected": -1.9100478887557983, - "logps/chosen": -29.089197158813477, - "logps/rejected": -32.67066192626953, - "loss": 0.1646, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.979331374168396, - "rewards/margins": 2.1640915870666504, - "rewards/rejected": -0.18476030230522156, - "step": 940 - }, - { - "epoch": 2.47, - "grad_norm": 3.359375, - "learning_rate": 1.921622518534466e-06, - "logits/chosen": -1.8536300659179688, - "logits/rejected": -1.8610553741455078, - "logps/chosen": -30.455718994140625, - "logps/rejected": -35.732582092285156, - "loss": 0.1727, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 2.1738009452819824, - "rewards/margins": 2.3381874561309814, - "rewards/rejected": -0.1643865555524826, - "step": 950 - }, - { - "epoch": 2.49, - "grad_norm": 4.25, - "learning_rate": 1.8666466198491794e-06, - "logits/chosen": -1.8633100986480713, - "logits/rejected": -1.8566601276397705, - "logps/chosen": -31.251073837280273, - "logps/rejected": -37.12010192871094, - "loss": 0.1258, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 2.319697856903076, - "rewards/margins": 2.7888638973236084, - "rewards/rejected": -0.46916627883911133, - "step": 960 - }, - { - "epoch": 2.52, - "grad_norm": 3.4375, - "learning_rate": 1.8119961086025376e-06, - "logits/chosen": -1.8636627197265625, - "logits/rejected": -1.8642667531967163, - "logps/chosen": -28.609216690063477, - "logps/rejected": -33.2410888671875, - "loss": 0.1212, - "rewards/accuracies": 0.9375, - "rewards/chosen": 2.364471912384033, - "rewards/margins": 2.598318099975586, - "rewards/rejected": -0.23384615778923035, - "step": 970 - }, - { - "epoch": 2.55, - "grad_norm": 3.828125, - "learning_rate": 1.7576990616793139e-06, - "logits/chosen": -1.868687391281128, - "logits/rejected": -1.8798682689666748, - "logps/chosen": -30.348342895507812, - "logps/rejected": -34.40825653076172, - "loss": 0.1265, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 2.4555139541625977, - "rewards/margins": 2.704857587814331, - "rewards/rejected": -0.2493438720703125, - "step": 980 - }, - { - "epoch": 2.57, - "grad_norm": 2.6875, - "learning_rate": 1.7037833743707892e-06, - "logits/chosen": -1.9379189014434814, - "logits/rejected": -1.9354835748672485, - "logps/chosen": -33.531471252441406, - "logps/rejected": -32.576297760009766, - "loss": 0.1497, - "rewards/accuracies": 0.875, - "rewards/chosen": 2.3306241035461426, - "rewards/margins": 2.489985704421997, - "rewards/rejected": -0.15936140716075897, - "step": 990 - }, - { - "epoch": 2.6, - "grad_norm": 5.625, - "learning_rate": 1.6502767460434588e-06, - "logits/chosen": -1.908524751663208, - "logits/rejected": -1.9124901294708252, - "logps/chosen": -31.943683624267578, - "logps/rejected": -34.699554443359375, - "loss": 0.1437, - "rewards/accuracies": 0.9375, - "rewards/chosen": 2.267646074295044, - "rewards/margins": 2.459801435470581, - "rewards/rejected": -0.19215506315231323, - "step": 1000 - }, - { - "epoch": 2.6, - "eval_logits/chosen": -2.172844409942627, - "eval_logits/rejected": -2.168105363845825, - "eval_logps/chosen": -33.605751037597656, - "eval_logps/rejected": -37.2342414855957, - "eval_loss": 0.4726148843765259, - "eval_rewards/accuracies": 0.5685215592384338, - "eval_rewards/chosen": 0.38591769337654114, - "eval_rewards/margins": 0.1317760944366455, - "eval_rewards/rejected": 0.25414159893989563, - "eval_runtime": 145.3481, - "eval_samples_per_second": 2.36, - "eval_steps_per_second": 0.296, - "step": 1000 - }, - { - "epoch": 2.62, - "grad_norm": 6.8125, - "learning_rate": 1.5972066659083796e-06, - "logits/chosen": -1.9187676906585693, - "logits/rejected": -1.9236234426498413, - "logps/chosen": -30.417675018310547, - "logps/rejected": -32.636802673339844, - "loss": 0.1716, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 2.1444733142852783, - "rewards/margins": 2.2484617233276367, - "rewards/rejected": -0.10398862510919571, - "step": 1010 - }, - { - "epoch": 2.65, - "grad_norm": 3.515625, - "learning_rate": 1.5446003988985041e-06, - "logits/chosen": -1.8794755935668945, - "logits/rejected": -1.8822200298309326, - "logps/chosen": -27.984928131103516, - "logps/rejected": -32.084327697753906, - "loss": 0.1328, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.2146403789520264, - "rewards/margins": 2.5612902641296387, - "rewards/rejected": -0.34664976596832275, - "step": 1020 - }, - { - "epoch": 2.68, - "grad_norm": 2.65625, - "learning_rate": 1.4924849716612211e-06, - "logits/chosen": -1.9341211318969727, - "logits/rejected": -1.9285913705825806, - "logps/chosen": -31.582775115966797, - "logps/rejected": -33.81012725830078, - "loss": 0.1451, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.1657042503356934, - "rewards/margins": 2.428621768951416, - "rewards/rejected": -0.2629176080226898, - "step": 1030 - }, - { - "epoch": 2.7, - "grad_norm": 5.53125, - "learning_rate": 1.440887158673332e-06, - "logits/chosen": -1.8665565252304077, - "logits/rejected": -1.8703277111053467, - "logps/chosen": -33.26926040649414, - "logps/rejected": -34.53216552734375, - "loss": 0.1518, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.2360215187072754, - "rewards/margins": 2.3373520374298096, - "rewards/rejected": -0.10133042186498642, - "step": 1040 - }, - { - "epoch": 2.73, - "grad_norm": 3.46875, - "learning_rate": 1.3898334684855647e-06, - "logits/chosen": -1.8647922277450562, - "logits/rejected": -1.8776315450668335, - "logps/chosen": -29.479755401611328, - "logps/rejected": -33.282066345214844, - "loss": 0.1532, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 2.144817590713501, - "rewards/margins": 2.3806402683258057, - "rewards/rejected": -0.2358228713274002, - "step": 1050 - }, - { - "epoch": 2.75, - "grad_norm": 4.625, - "learning_rate": 1.3393501301037245e-06, - "logits/chosen": -1.8554801940917969, - "logits/rejected": -1.849180817604065, - "logps/chosen": -29.633052825927734, - "logps/rejected": -33.600250244140625, - "loss": 0.1356, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 2.273649215698242, - "rewards/margins": 2.527707576751709, - "rewards/rejected": -0.254058301448822, - "step": 1060 - }, - { - "epoch": 2.78, - "grad_norm": 3.375, - "learning_rate": 1.2894630795134454e-06, - "logits/chosen": -1.9811935424804688, - "logits/rejected": -1.9819939136505127, - "logps/chosen": -31.09320068359375, - "logps/rejected": -33.08074188232422, - "loss": 0.1343, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.208488941192627, - "rewards/margins": 2.4209327697753906, - "rewards/rejected": -0.21244390308856964, - "step": 1070 - }, - { - "epoch": 2.81, - "grad_norm": 5.8125, - "learning_rate": 1.2401979463554984e-06, - "logits/chosen": -1.912397027015686, - "logits/rejected": -1.9113376140594482, - "logps/chosen": -31.625701904296875, - "logps/rejected": -33.2298583984375, - "loss": 0.1702, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 2.2177155017852783, - "rewards/margins": 2.2519402503967285, - "rewards/rejected": -0.03422477841377258, - "step": 1080 - }, - { - "epoch": 2.83, - "grad_norm": 3.078125, - "learning_rate": 1.1915800407584705e-06, - "logits/chosen": -1.9387133121490479, - "logits/rejected": -1.9312927722930908, - "logps/chosen": -31.74675941467285, - "logps/rejected": -31.231353759765625, - "loss": 0.143, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.2312071323394775, - "rewards/margins": 2.4020464420318604, - "rewards/rejected": -0.17083951830863953, - "step": 1090 - }, - { - "epoch": 2.86, - "grad_norm": 3.5, - "learning_rate": 1.1436343403356019e-06, - "logits/chosen": -1.908151388168335, - "logits/rejected": -1.907440185546875, - "logps/chosen": -32.65666961669922, - "logps/rejected": -36.80711364746094, - "loss": 0.1187, - "rewards/accuracies": 1.0, - "rewards/chosen": 2.4855704307556152, - "rewards/margins": 2.8945364952087402, - "rewards/rejected": -0.4089658856391907, - "step": 1100 - }, - { - "epoch": 2.86, - "eval_logits/chosen": -2.1719584465026855, - "eval_logits/rejected": -2.1672279834747314, - "eval_logps/chosen": -33.60622024536133, - "eval_logps/rejected": -37.22234344482422, - "eval_loss": 0.47522035241127014, - "eval_rewards/accuracies": 0.5539867281913757, - "eval_rewards/chosen": 0.3854992091655731, - "eval_rewards/margins": 0.12064921855926514, - "eval_rewards/rejected": 0.2648499608039856, - "eval_runtime": 145.2443, - "eval_samples_per_second": 2.362, - "eval_steps_per_second": 0.296, - "step": 1100 - }, - { - "epoch": 2.88, - "grad_norm": 2.890625, - "learning_rate": 1.0963854773524548e-06, - "logits/chosen": -1.9528541564941406, - "logits/rejected": -1.9581425189971924, - "logps/chosen": -32.96589279174805, - "logps/rejected": -35.81908416748047, - "loss": 0.154, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 2.251096725463867, - "rewards/margins": 2.4990475177764893, - "rewards/rejected": -0.24795059859752655, - "step": 1110 - }, - { - "epoch": 2.91, - "grad_norm": 2.234375, - "learning_rate": 1.049857726072005e-06, - "logits/chosen": -1.92671799659729, - "logits/rejected": -1.924842119216919, - "logps/chosen": -30.239665985107422, - "logps/rejected": -33.359275817871094, - "loss": 0.1469, - "rewards/accuracies": 0.9375, - "rewards/chosen": 2.1388847827911377, - "rewards/margins": 2.477752208709717, - "rewards/rejected": -0.3388676643371582, - "step": 1120 - }, - { - "epoch": 2.94, - "grad_norm": 2.765625, - "learning_rate": 1.0040749902836508e-06, - "logits/chosen": -1.8301622867584229, - "logits/rejected": -1.8328670263290405, - "logps/chosen": -27.269672393798828, - "logps/rejected": -30.77303695678711, - "loss": 0.1185, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.231065273284912, - "rewards/margins": 2.7062644958496094, - "rewards/rejected": -0.47519931197166443, - "step": 1130 - }, - { - "epoch": 2.96, - "grad_norm": 3.734375, - "learning_rate": 9.59060791022566e-07, - "logits/chosen": -1.9288259744644165, - "logits/rejected": -1.925840973854065, - "logps/chosen": -30.6153564453125, - "logps/rejected": -32.5478401184082, - "loss": 0.1456, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 2.3295514583587646, - "rewards/margins": 2.3484175205230713, - "rewards/rejected": -0.01886589825153351, - "step": 1140 - }, - { - "epoch": 2.99, - "grad_norm": 4.6875, - "learning_rate": 9.148382544856885e-07, - "logits/chosen": -1.8482894897460938, - "logits/rejected": -1.842042326927185, - "logps/chosen": -25.929004669189453, - "logps/rejected": -32.56935119628906, - "loss": 0.1583, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.9880859851837158, - "rewards/margins": 2.367035388946533, - "rewards/rejected": -0.37894967198371887, - "step": 1150 - }, - { - "epoch": 3.01, - "grad_norm": 2.3125, - "learning_rate": 8.714301001505568e-07, - "logits/chosen": -1.963226556777954, - "logits/rejected": -1.9608166217803955, - "logps/chosen": -30.5425968170166, - "logps/rejected": -35.541412353515625, - "loss": 0.1322, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 2.298123359680176, - "rewards/margins": 2.7030162811279297, - "rewards/rejected": -0.40489259362220764, - "step": 1160 - }, - { - "epoch": 3.04, - "grad_norm": 2.375, - "learning_rate": 8.288586291031025e-07, - "logits/chosen": -1.9463905096054077, - "logits/rejected": -1.9444236755371094, - "logps/chosen": -28.985403060913086, - "logps/rejected": -32.826229095458984, - "loss": 0.1166, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 2.366102695465088, - "rewards/margins": 2.7820885181427, - "rewards/rejected": -0.41598600149154663, - "step": 1170 - }, - { - "epoch": 3.06, - "grad_norm": 3.015625, - "learning_rate": 7.871457125803897e-07, - "logits/chosen": -1.9402363300323486, - "logits/rejected": -1.928139328956604, - "logps/chosen": -33.41131591796875, - "logps/rejected": -34.492095947265625, - "loss": 0.1248, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 2.370587110519409, - "rewards/margins": 2.749145030975342, - "rewards/rejected": -0.3785577118396759, - "step": 1180 - }, - { - "epoch": 3.09, - "grad_norm": 5.0, - "learning_rate": 7.463127807341966e-07, - "logits/chosen": -1.832439661026001, - "logits/rejected": -1.8273050785064697, - "logps/chosen": -32.512786865234375, - "logps/rejected": -34.4256591796875, - "loss": 0.1203, - "rewards/accuracies": 0.9375, - "rewards/chosen": 2.5532097816467285, - "rewards/margins": 2.7222681045532227, - "rewards/rejected": -0.16905789077281952, - "step": 1190 - }, - { - "epoch": 3.12, - "grad_norm": 3.03125, - "learning_rate": 7.063808116212021e-07, - "logits/chosen": -1.8775498867034912, - "logits/rejected": -1.8805272579193115, - "logps/chosen": -30.205677032470703, - "logps/rejected": -31.677600860595703, - "loss": 0.1561, - "rewards/accuracies": 0.9375, - "rewards/chosen": 2.1684765815734863, - "rewards/margins": 2.340757369995117, - "rewards/rejected": -0.17228101193904877, - "step": 1200 - }, - { - "epoch": 3.12, - "eval_logits/chosen": -2.172138214111328, - "eval_logits/rejected": -2.1674084663391113, - "eval_logps/chosen": -33.60100173950195, - "eval_logps/rejected": -37.23575973510742, - "eval_loss": 0.4716731905937195, - "eval_rewards/accuracies": 0.5714285373687744, - "eval_rewards/chosen": 0.39019396901130676, - "eval_rewards/margins": 0.13741979002952576, - "eval_rewards/rejected": 0.2527742087841034, - "eval_runtime": 145.1136, - "eval_samples_per_second": 2.364, - "eval_steps_per_second": 0.296, - "step": 1200 - }, - { - "epoch": 3.14, - "grad_norm": 3.84375, - "learning_rate": 6.673703204254348e-07, - "logits/chosen": -1.903577446937561, - "logits/rejected": -1.8992490768432617, - "logps/chosen": -28.683547973632812, - "logps/rejected": -31.243778228759766, - "loss": 0.1197, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.3311078548431396, - "rewards/margins": 2.63321852684021, - "rewards/rejected": -0.30211085081100464, - "step": 1210 - }, - { - "epoch": 3.17, - "grad_norm": 3.0625, - "learning_rate": 6.293013489185315e-07, - "logits/chosen": -1.9149614572525024, - "logits/rejected": -1.9096797704696655, - "logps/chosen": -32.52824020385742, - "logps/rejected": -35.000606536865234, - "loss": 0.1192, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.471128225326538, - "rewards/margins": 2.8587820529937744, - "rewards/rejected": -0.38765376806259155, - "step": 1220 - }, - { - "epoch": 3.19, - "grad_norm": 2.578125, - "learning_rate": 5.921934551632086e-07, - "logits/chosen": -1.917303442955017, - "logits/rejected": -1.9042131900787354, - "logps/chosen": -30.63210105895996, - "logps/rejected": -34.53245544433594, - "loss": 0.1356, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 2.215723991394043, - "rewards/margins": 2.5870730876922607, - "rewards/rejected": -0.37134915590286255, - "step": 1230 - }, - { - "epoch": 3.22, - "grad_norm": 2.125, - "learning_rate": 5.560657034652405e-07, - "logits/chosen": -1.9619649648666382, - "logits/rejected": -1.9593585729599, - "logps/chosen": -32.446205139160156, - "logps/rejected": -32.16899490356445, - "loss": 0.1303, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 2.3280248641967773, - "rewards/margins": 2.6242423057556152, - "rewards/rejected": -0.29621756076812744, - "step": 1240 - }, - { - "epoch": 3.25, - "grad_norm": 2.46875, - "learning_rate": 5.2093665457911e-07, - "logits/chosen": -1.8923189640045166, - "logits/rejected": -1.8896526098251343, - "logps/chosen": -31.654674530029297, - "logps/rejected": -35.40267562866211, - "loss": 0.1013, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.4673264026641846, - "rewards/margins": 3.0123214721679688, - "rewards/rejected": -0.5449954867362976, - "step": 1250 - }, - { - "epoch": 3.27, - "grad_norm": 2.40625, - "learning_rate": 4.868243561723535e-07, - "logits/chosen": -1.9567426443099976, - "logits/rejected": -1.9514939785003662, - "logps/chosen": -28.140026092529297, - "logps/rejected": -32.6693115234375, - "loss": 0.1201, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 2.3586957454681396, - "rewards/margins": 2.61677885055542, - "rewards/rejected": -0.25808292627334595, - "step": 1260 - }, - { - "epoch": 3.3, - "grad_norm": 2.953125, - "learning_rate": 4.537463335535161e-07, - "logits/chosen": -1.9826234579086304, - "logits/rejected": -1.9873912334442139, - "logps/chosen": -30.03902816772461, - "logps/rejected": -32.010902404785156, - "loss": 0.1241, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 2.404773235321045, - "rewards/margins": 2.7013566493988037, - "rewards/rejected": -0.2965838313102722, - "step": 1270 - }, - { - "epoch": 3.32, - "grad_norm": 5.03125, - "learning_rate": 4.217195806684629e-07, - "logits/chosen": -1.8774194717407227, - "logits/rejected": -1.8847315311431885, - "logps/chosen": -32.28123092651367, - "logps/rejected": -32.95527648925781, - "loss": 0.135, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.3634095191955566, - "rewards/margins": 2.6847171783447266, - "rewards/rejected": -0.3213076591491699, - "step": 1280 - }, - { - "epoch": 3.35, - "grad_norm": 3.421875, - "learning_rate": 3.907605513696808e-07, - "logits/chosen": -1.7818914651870728, - "logits/rejected": -1.7842298746109009, - "logps/chosen": -30.396137237548828, - "logps/rejected": -36.838951110839844, - "loss": 0.138, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.4621834754943848, - "rewards/margins": 2.7183938026428223, - "rewards/rejected": -0.2562103867530823, - "step": 1290 - }, - { - "epoch": 3.38, - "grad_norm": 1.3671875, - "learning_rate": 3.6088515096305675e-07, - "logits/chosen": -1.836519479751587, - "logits/rejected": -1.8399410247802734, - "logps/chosen": -30.34341812133789, - "logps/rejected": -32.9296875, - "loss": 0.108, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 2.5859272480010986, - "rewards/margins": 3.004894971847534, - "rewards/rejected": -0.41896796226501465, - "step": 1300 - }, - { - "epoch": 3.38, - "eval_logits/chosen": -2.171917200088501, - "eval_logits/rejected": -2.1671812534332275, - "eval_logps/chosen": -33.60578536987305, - "eval_logps/rejected": -37.23236083984375, - "eval_loss": 0.47303175926208496, - "eval_rewards/accuracies": 0.5689368844032288, - "eval_rewards/chosen": 0.38588812947273254, - "eval_rewards/margins": 0.13005691766738892, - "eval_rewards/rejected": 0.25583121180534363, - "eval_runtime": 145.1226, - "eval_samples_per_second": 2.364, - "eval_steps_per_second": 0.296, - "step": 1300 - }, - { - "epoch": 3.4, - "grad_norm": 2.828125, - "learning_rate": 3.321087280364757e-07, - "logits/chosen": -1.9456993341445923, - "logits/rejected": -1.9257911443710327, - "logps/chosen": -28.394466400146484, - "logps/rejected": -35.65379333496094, - "loss": 0.1234, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 2.447040557861328, - "rewards/margins": 2.885685682296753, - "rewards/rejected": -0.43864506483078003, - "step": 1310 - }, - { - "epoch": 3.43, - "grad_norm": 2.828125, - "learning_rate": 3.044460665744284e-07, - "logits/chosen": -1.9243189096450806, - "logits/rejected": -1.9299890995025635, - "logps/chosen": -30.12347984313965, - "logps/rejected": -32.91923141479492, - "loss": 0.0969, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.6192753314971924, - "rewards/margins": 3.0049073696136475, - "rewards/rejected": -0.38563182950019836, - "step": 1320 - }, - { - "epoch": 3.45, - "grad_norm": 3.71875, - "learning_rate": 2.779113783626916e-07, - "logits/chosen": -1.895808219909668, - "logits/rejected": -1.8909356594085693, - "logps/chosen": -30.40378761291504, - "logps/rejected": -35.07976531982422, - "loss": 0.1147, - "rewards/accuracies": 0.9375, - "rewards/chosen": 2.356848955154419, - "rewards/margins": 2.8135764598846436, - "rewards/rejected": -0.4567275941371918, - "step": 1330 - }, - { - "epoch": 3.48, - "grad_norm": 1.90625, - "learning_rate": 2.5251829568697204e-07, - "logits/chosen": -1.757712960243225, - "logits/rejected": -1.7673540115356445, - "logps/chosen": -31.065128326416016, - "logps/rejected": -31.27077293395996, - "loss": 0.1313, - "rewards/accuracies": 0.9375, - "rewards/chosen": 2.4610185623168945, - "rewards/margins": 2.6740119457244873, - "rewards/rejected": -0.2129933089017868, - "step": 1340 - }, - { - "epoch": 3.51, - "grad_norm": 3.078125, - "learning_rate": 2.2827986432927774e-07, - "logits/chosen": -1.8187271356582642, - "logits/rejected": -1.8234609365463257, - "logps/chosen": -31.141077041625977, - "logps/rejected": -33.421714782714844, - "loss": 0.1021, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 2.528876781463623, - "rewards/margins": 2.724090099334717, - "rewards/rejected": -0.19521372020244598, - "step": 1350 - }, - { - "epoch": 3.53, - "grad_norm": 4.03125, - "learning_rate": 2.0520853686560177e-07, - "logits/chosen": -1.847139596939087, - "logits/rejected": -1.8526817560195923, - "logps/chosen": -31.114614486694336, - "logps/rejected": -35.04568862915039, - "loss": 0.122, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 2.4453420639038086, - "rewards/margins": 2.879702091217041, - "rewards/rejected": -0.43436020612716675, - "step": 1360 - }, - { - "epoch": 3.56, - "grad_norm": 1.9375, - "learning_rate": 1.833161662683672e-07, - "logits/chosen": -1.938638687133789, - "logits/rejected": -1.934190034866333, - "logps/chosen": -30.08294105529785, - "logps/rejected": -31.296899795532227, - "loss": 0.1307, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 2.2974698543548584, - "rewards/margins": 2.6439690589904785, - "rewards/rejected": -0.3464987277984619, - "step": 1370 - }, - { - "epoch": 3.58, - "grad_norm": 4.1875, - "learning_rate": 1.626139998169246e-07, - "logits/chosen": -1.8553369045257568, - "logits/rejected": -1.8571650981903076, - "logps/chosen": -27.17293357849121, - "logps/rejected": -31.064167022705078, - "loss": 0.1414, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.237731456756592, - "rewards/margins": 2.5257174968719482, - "rewards/rejected": -0.2879858911037445, - "step": 1380 - }, - { - "epoch": 3.61, - "grad_norm": 5.40625, - "learning_rate": 1.4311267331922535e-07, - "logits/chosen": -1.8484070301055908, - "logits/rejected": -1.8530298471450806, - "logps/chosen": -29.575347900390625, - "logps/rejected": -33.01356887817383, - "loss": 0.1445, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 2.1997337341308594, - "rewards/margins": 2.2226874828338623, - "rewards/rejected": -0.022953515872359276, - "step": 1390 - }, - { - "epoch": 3.64, - "grad_norm": 1.7109375, - "learning_rate": 1.2482220564763669e-07, - "logits/chosen": -1.9163728952407837, - "logits/rejected": -1.919600486755371, - "logps/chosen": -32.167964935302734, - "logps/rejected": -33.807533264160156, - "loss": 0.1141, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 2.486348867416382, - "rewards/margins": 2.831922769546509, - "rewards/rejected": -0.34557363390922546, - "step": 1400 - }, - { - "epoch": 3.64, - "eval_logits/chosen": -2.171635627746582, - "eval_logits/rejected": -2.166898250579834, - "eval_logps/chosen": -33.605712890625, - "eval_logps/rejected": -37.237552642822266, - "eval_loss": 0.4719361364841461, - "eval_rewards/accuracies": 0.5510797500610352, - "eval_rewards/chosen": 0.38595661520957947, - "eval_rewards/margins": 0.1347959190607071, - "eval_rewards/rejected": 0.25116074085235596, - "eval_runtime": 145.1512, - "eval_samples_per_second": 2.363, - "eval_steps_per_second": 0.296, - "step": 1400 - }, - { - "epoch": 3.66, - "grad_norm": 3.140625, - "learning_rate": 1.0775199359171346e-07, - "logits/chosen": -1.984933614730835, - "logits/rejected": -1.9782533645629883, - "logps/chosen": -31.038227081298828, - "logps/rejected": -34.51592254638672, - "loss": 0.1285, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 2.48789119720459, - "rewards/margins": 2.7321090698242188, - "rewards/rejected": -0.24421784281730652, - "step": 1410 - }, - { - "epoch": 3.69, - "grad_norm": 4.4375, - "learning_rate": 9.191080703056604e-08, - "logits/chosen": -1.8718812465667725, - "logits/rejected": -1.8825832605361938, - "logps/chosen": -31.401153564453125, - "logps/rejected": -34.048274993896484, - "loss": 0.112, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.4733338356018066, - "rewards/margins": 2.884812355041504, - "rewards/rejected": -0.41147857904434204, - "step": 1420 - }, - { - "epoch": 3.71, - "grad_norm": 3.890625, - "learning_rate": 7.730678442730539e-08, - "logits/chosen": -1.9544727802276611, - "logits/rejected": -1.966357946395874, - "logps/chosen": -32.22984313964844, - "logps/rejected": -34.12482452392578, - "loss": 0.1131, - "rewards/accuracies": 0.9375, - "rewards/chosen": 2.5478649139404297, - "rewards/margins": 2.985705852508545, - "rewards/rejected": -0.43784099817276, - "step": 1430 - }, - { - "epoch": 3.74, - "grad_norm": 3.359375, - "learning_rate": 6.394742864787806e-08, - "logits/chosen": -1.9448413848876953, - "logits/rejected": -1.9469585418701172, - "logps/chosen": -30.53590965270996, - "logps/rejected": -34.36793899536133, - "loss": 0.1126, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 2.405226230621338, - "rewards/margins": 2.8624002933502197, - "rewards/rejected": -0.4571738839149475, - "step": 1440 - }, - { - "epoch": 3.77, - "grad_norm": 4.625, - "learning_rate": 5.183960310644748e-08, - "logits/chosen": -1.9278266429901123, - "logits/rejected": -1.9208500385284424, - "logps/chosen": -32.90120315551758, - "logps/rejected": -34.015830993652344, - "loss": 0.1376, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 2.4004130363464355, - "rewards/margins": 2.6212949752807617, - "rewards/rejected": -0.22088181972503662, - "step": 1450 - }, - { - "epoch": 3.79, - "grad_norm": 2.640625, - "learning_rate": 4.098952823928693e-08, - "logits/chosen": -1.8969509601593018, - "logits/rejected": -1.902937650680542, - "logps/chosen": -28.635034561157227, - "logps/rejected": -34.15027618408203, - "loss": 0.1233, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 2.3203296661376953, - "rewards/margins": 2.7525792121887207, - "rewards/rejected": -0.4322497844696045, - "step": 1460 - }, - { - "epoch": 3.82, - "grad_norm": 3.234375, - "learning_rate": 3.1402778309014284e-08, - "logits/chosen": -1.8424794673919678, - "logits/rejected": -1.8404964208602905, - "logps/chosen": -28.250411987304688, - "logps/rejected": -30.827194213867188, - "loss": 0.1287, - "rewards/accuracies": 0.9375, - "rewards/chosen": 2.335628032684326, - "rewards/margins": 2.5452919006347656, - "rewards/rejected": -0.20966355502605438, - "step": 1470 - }, - { - "epoch": 3.84, - "grad_norm": 3.5625, - "learning_rate": 2.3084278540791427e-08, - "logits/chosen": -2.0166893005371094, - "logits/rejected": -2.0111632347106934, - "logps/chosen": -32.708534240722656, - "logps/rejected": -32.34349822998047, - "loss": 0.1253, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 2.319053888320923, - "rewards/margins": 2.5771634578704834, - "rewards/rejected": -0.25810959935188293, - "step": 1480 - }, - { - "epoch": 3.87, - "grad_norm": 3.28125, - "learning_rate": 1.6038302591975807e-08, - "logits/chosen": -1.9086692333221436, - "logits/rejected": -1.9107682704925537, - "logps/chosen": -26.228809356689453, - "logps/rejected": -28.285873413085938, - "loss": 0.1564, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 2.0228161811828613, - "rewards/margins": 2.2600486278533936, - "rewards/rejected": -0.23723240196704865, - "step": 1490 - }, - { - "epoch": 3.9, - "grad_norm": 4.21875, - "learning_rate": 1.0268470356514237e-08, - "logits/chosen": -1.91689932346344, - "logits/rejected": -1.9114105701446533, - "logps/chosen": -30.34906578063965, - "logps/rejected": -32.491310119628906, - "loss": 0.1302, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.206552505493164, - "rewards/margins": 2.676675796508789, - "rewards/rejected": -0.47012314200401306, - "step": 1500 - }, - { - "epoch": 3.9, - "eval_logits/chosen": -2.1712188720703125, - "eval_logits/rejected": -2.1664905548095703, - "eval_logps/chosen": -33.60893249511719, - "eval_logps/rejected": -37.23386001586914, - "eval_loss": 0.4735250771045685, - "eval_rewards/accuracies": 0.5539867281913757, - "eval_rewards/chosen": 0.38305899500846863, - "eval_rewards/margins": 0.12857645750045776, - "eval_rewards/rejected": 0.2544824779033661, - "eval_runtime": 145.3788, - "eval_samples_per_second": 2.359, - "eval_steps_per_second": 0.296, - "step": 1500 - }, - { - "epoch": 3.92, - "grad_norm": 6.34375, - "learning_rate": 5.777746105209147e-09, - "logits/chosen": -1.842259407043457, - "logits/rejected": -1.846498727798462, - "logps/chosen": -31.69326400756836, - "logps/rejected": -34.58927536010742, - "loss": 0.147, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 2.229959487915039, - "rewards/margins": 2.3909640312194824, - "rewards/rejected": -0.16100430488586426, - "step": 1510 - }, - { - "epoch": 3.95, - "grad_norm": 1.6484375, - "learning_rate": 2.5684369628148352e-09, - "logits/chosen": -1.9006131887435913, - "logits/rejected": -1.8990404605865479, - "logps/chosen": -28.169570922851562, - "logps/rejected": -32.99760437011719, - "loss": 0.1284, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.308145046234131, - "rewards/margins": 2.6752429008483887, - "rewards/rejected": -0.3670978248119354, - "step": 1520 - }, - { - "epoch": 3.97, - "grad_norm": 4.125, - "learning_rate": 6.421917227455999e-10, - "logits/chosen": -1.9976537227630615, - "logits/rejected": -1.9900623559951782, - "logps/chosen": -25.804935455322266, - "logps/rejected": -29.05643653869629, - "loss": 0.1396, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 2.032208204269409, - "rewards/margins": 2.3785529136657715, - "rewards/rejected": -0.34634485840797424, - "step": 1530 - }, - { - "epoch": 4.0, - "grad_norm": 2.546875, - "learning_rate": 0.0, - "logits/chosen": -1.9000084400177002, - "logits/rejected": -1.8901214599609375, - "logps/chosen": -30.514385223388672, - "logps/rejected": -35.95043182373047, - "loss": 0.1117, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 2.2966294288635254, - "rewards/margins": 2.781961679458618, - "rewards/rejected": -0.48533210158348083, - "step": 1540 - }, - { - "epoch": 4.0, - "step": 1540, + "epoch": 1.0, + "step": 385, "total_flos": 0.0, - "train_loss": 0.14454748347982183, - "train_runtime": 10766.5084, - "train_samples_per_second": 1.144, - "train_steps_per_second": 0.143 + "train_loss": 0.31703355428460356, + "train_runtime": 3251.5033, + "train_samples_per_second": 0.947, + "train_steps_per_second": 0.118 } ], "logging_steps": 10, - "max_steps": 1540, + "max_steps": 385, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4,