diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 4.0, + "epoch": 1.0, "eval_steps": 100, - "global_step": 1540, + "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -15,7 +15,7 @@ "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, - "loss": 0.6931, + "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -25,2537 +25,597 @@ { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, - "logits/chosen": -1.866413950920105, - "logits/rejected": -1.8707411289215088, - "logps/chosen": -36.98916244506836, - "logps/rejected": -33.67436981201172, - "loss": 0.6701, - "rewards/accuracies": 0.5416666865348816, - "rewards/chosen": 0.01569323241710663, - "rewards/margins": 0.05555717274546623, - "rewards/rejected": -0.039863936603069305, + "logits/chosen": -1.8665987253189087, + "logits/rejected": -1.8709272146224976, + "logps/chosen": -36.985595703125, + "logps/rejected": -33.68160629272461, + "loss": 0.4886, + "rewards/accuracies": 0.5694444179534912, + "rewards/chosen": 0.018904482945799828, + "rewards/margins": 0.06528304517269135, + "rewards/rejected": -0.04637856408953667, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, - "logits/chosen": -1.9979650974273682, - "logits/rejected": -2.0006086826324463, - "logps/chosen": -29.624820709228516, - "logps/rejected": -29.0762939453125, - "loss": 0.6837, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.01563635841012001, - "rewards/margins": 0.027204299345612526, - "rewards/rejected": -0.01156794372946024, + "logits/chosen": -1.997780203819275, + "logits/rejected": -2.000434398651123, + "logps/chosen": -29.643661499023438, + "logps/rejected": -29.043325424194336, + "loss": 0.5031, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.001316396868787706, + "rewards/margins": -0.019422104582190514, + "rewards/rejected": 0.018105709925293922, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, - "logits/chosen": -1.921021819114685, - "logits/rejected": -1.9183374643325806, - "logps/chosen": -31.40532875061035, - "logps/rejected": -33.23241424560547, - "loss": 0.6877, + "logits/chosen": -1.9207446575164795, + "logits/rejected": -1.918060064315796, + "logps/chosen": -31.41064453125, + "logps/rejected": -33.227088928222656, + "loss": 0.4976, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 0.00968973059207201, - "rewards/margins": 0.022251319140195847, - "rewards/rejected": -0.012561586685478687, + "rewards/chosen": 0.004905471112579107, + "rewards/margins": 0.012669263407588005, + "rewards/rejected": -0.007763790898025036, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, - "logits/chosen": -2.0176353454589844, - "logits/rejected": -2.008906364440918, - "logps/chosen": -32.574256896972656, - "logps/rejected": -32.53368377685547, - "loss": 0.6874, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": 0.0022967704571783543, - "rewards/margins": 0.02120940014719963, - "rewards/rejected": -0.018912632018327713, + "logits/chosen": -2.017446517944336, + "logits/rejected": -2.0087125301361084, + "logps/chosen": -32.553016662597656, + "logps/rejected": -32.50551986694336, + "loss": 0.4977, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.021415216848254204, + "rewards/margins": 0.014982220716774464, + "rewards/rejected": 0.006432999856770039, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, - "logits/chosen": -1.8619186878204346, - "logits/rejected": -1.85114324092865, - "logps/chosen": -33.55537414550781, - "logps/rejected": -35.45675277709961, - "loss": 0.6957, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": 0.001892436295747757, - "rewards/margins": 0.005858602002263069, - "rewards/rejected": -0.003966164775192738, + "logits/chosen": -1.8627235889434814, + "logits/rejected": -1.851959228515625, + "logps/chosen": -33.5064697265625, + "logps/rejected": -35.43267059326172, + "loss": 0.4951, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0459100641310215, + "rewards/margins": 0.02820250764489174, + "rewards/rejected": 0.017707552760839462, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, - "logits/chosen": -1.9400945901870728, - "logits/rejected": -1.9420464038848877, - "logps/chosen": -32.56509780883789, - "logps/rejected": -33.2406120300293, - "loss": 0.6632, + "logits/chosen": -1.9425691366195679, + "logits/rejected": -1.94449782371521, + "logps/chosen": -32.46650695800781, + "logps/rejected": -33.15652847290039, + "loss": 0.4765, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.031578924506902695, - "rewards/margins": 0.09388783574104309, - "rewards/rejected": -0.062308914959430695, + "rewards/chosen": 0.12030963599681854, + "rewards/margins": 0.10694190114736557, + "rewards/rejected": 0.013367725536227226, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, - "logits/chosen": -2.0712790489196777, - "logits/rejected": -2.0762436389923096, - "logps/chosen": -33.981910705566406, - "logps/rejected": -36.62363815307617, - "loss": 0.6833, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 0.005918038543313742, - "rewards/margins": 0.05520814657211304, - "rewards/rejected": -0.04929010197520256, + "logits/chosen": -2.073408842086792, + "logits/rejected": -2.078367233276367, + "logps/chosen": -33.917694091796875, + "logps/rejected": -36.547218322753906, + "loss": 0.4901, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.06371410191059113, + "rewards/margins": 0.04422418028116226, + "rewards/rejected": 0.019489921629428864, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, - "logits/chosen": -1.9327905178070068, - "logits/rejected": -1.935909628868103, - "logps/chosen": -34.32685470581055, - "logps/rejected": -34.65606689453125, - "loss": 0.639, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.09085920453071594, - "rewards/margins": 0.14815348386764526, - "rewards/rejected": -0.057294271886348724, + "logits/chosen": -1.9349607229232788, + "logits/rejected": -1.9380786418914795, + "logps/chosen": -34.223785400390625, + "logps/rejected": -34.53069305419922, + "loss": 0.4713, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.1836203634738922, + "rewards/margins": 0.12807974219322205, + "rewards/rejected": 0.05554063245654106, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, - "logits/chosen": -1.9414918422698975, - "logits/rejected": -1.946007490158081, - "logps/chosen": -32.406803131103516, - "logps/rejected": -32.36021041870117, - "loss": 0.6792, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.054556868970394135, - "rewards/margins": 0.05573350936174393, - "rewards/rejected": -0.0011766403913497925, + "logits/chosen": -1.9439691305160522, + "logits/rejected": -1.9484784603118896, + "logps/chosen": -32.27050018310547, + "logps/rejected": -32.26476287841797, + "loss": 0.4779, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1772255003452301, + "rewards/margins": 0.09250012785196304, + "rewards/rejected": 0.08472537249326706, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, - "logits/chosen": -2.039034128189087, - "logits/rejected": -2.0370402336120605, - "logps/chosen": -32.172786712646484, - "logps/rejected": -31.333194732666016, - "loss": 0.6464, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.06124376505613327, - "rewards/margins": 0.12152798473834991, - "rewards/rejected": -0.06028420478105545, + "logits/chosen": -2.0411603450775146, + "logits/rejected": -2.039163112640381, + "logps/chosen": -31.98573875427246, + "logps/rejected": -31.193227767944336, + "loss": 0.4639, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.22958631813526154, + "rewards/margins": 0.16390272974967957, + "rewards/rejected": 0.06568360328674316, "step": 100 }, { "epoch": 0.26, - "eval_logits/chosen": -2.2339773178100586, - "eval_logits/rejected": -2.229137420654297, - "eval_logps/chosen": -34.04054641723633, - "eval_logps/rejected": -37.549957275390625, - "eval_loss": 0.6902773976325989, - "eval_rewards/accuracies": 0.5685215592384338, - "eval_rewards/chosen": -0.005393954925239086, - "eval_rewards/margins": 0.024608083069324493, - "eval_rewards/rejected": -0.030002037063241005, - "eval_runtime": 146.034, - "eval_samples_per_second": 2.349, - "eval_steps_per_second": 0.294, + "eval_logits/chosen": -2.235391855239868, + "eval_logits/rejected": -2.2305493354797363, + "eval_logps/chosen": -33.869815826416016, + "eval_logps/rejected": -37.382774353027344, + "eval_loss": 0.4939241409301758, + "eval_rewards/accuracies": 0.5627076625823975, + "eval_rewards/chosen": 0.1482628434896469, + "eval_rewards/margins": 0.02780282311141491, + "eval_rewards/rejected": 0.12046003341674805, + "eval_runtime": 145.9747, + "eval_samples_per_second": 2.35, + "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, - "logits/chosen": -1.994192123413086, - "logits/rejected": -1.9918158054351807, - "logps/chosen": -33.142940521240234, - "logps/rejected": -34.01188278198242, - "loss": 0.6911, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.09078876674175262, - "rewards/margins": 0.07505009323358536, - "rewards/rejected": 0.015738680958747864, + "logits/chosen": -1.997287392616272, + "logits/rejected": -1.9949369430541992, + "logps/chosen": -32.96843719482422, + "logps/rejected": -33.866310119628906, + "loss": 0.4739, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.24784216284751892, + "rewards/margins": 0.10108550637960434, + "rewards/rejected": 0.14675670862197876, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, - "logits/chosen": -2.0053954124450684, - "logits/rejected": -1.997046709060669, - "logps/chosen": -32.33894348144531, - "logps/rejected": -32.1308708190918, - "loss": 0.6746, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.09536493569612503, - "rewards/margins": 0.06779730319976807, - "rewards/rejected": 0.027567636221647263, + "logits/chosen": -2.008091688156128, + "logits/rejected": -1.9997599124908447, + "logps/chosen": -32.20352554321289, + "logps/rejected": -31.995223999023438, + "loss": 0.485, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.2172430008649826, + "rewards/margins": 0.06759083271026611, + "rewards/rejected": 0.1496521681547165, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, - "logits/chosen": -2.0336387157440186, - "logits/rejected": -2.025650978088379, - "logps/chosen": -30.345691680908203, - "logps/rejected": -32.078697204589844, - "loss": 0.6527, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.11702337116003036, - "rewards/margins": 0.14014457166194916, - "rewards/rejected": -0.023121213540434837, + "logits/chosen": -2.035614490509033, + "logits/rejected": -2.027682304382324, + "logps/chosen": -30.1588077545166, + "logps/rejected": -31.886260986328125, + "loss": 0.4717, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.2852162718772888, + "rewards/margins": 0.1351451873779297, + "rewards/rejected": 0.15007111430168152, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, - "logits/chosen": -1.9642337560653687, - "logits/rejected": -1.9744552373886108, - "logps/chosen": -31.243911743164062, - "logps/rejected": -32.590267181396484, - "loss": 0.6171, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.1581769436597824, - "rewards/margins": 0.20802685618400574, - "rewards/rejected": -0.04984992742538452, + "logits/chosen": -1.965490698814392, + "logits/rejected": -1.9756921529769897, + "logps/chosen": -31.065088272094727, + "logps/rejected": -32.42934036254883, + "loss": 0.4482, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3191176950931549, + "rewards/margins": 0.22413134574890137, + "rewards/rejected": 0.09498633444309235, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, - "logits/chosen": -1.876604437828064, - "logits/rejected": -1.8777605295181274, - "logps/chosen": -33.938690185546875, - "logps/rejected": -34.807891845703125, - "loss": 0.6043, + "logits/chosen": -1.8782259225845337, + "logits/rejected": -1.8793823719024658, + "logps/chosen": -33.68832778930664, + "logps/rejected": -34.58278274536133, + "loss": 0.4367, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.22860188782215118, - "rewards/margins": 0.2741745412349701, - "rewards/rejected": -0.0455726757645607, + "rewards/chosen": 0.45392999053001404, + "rewards/margins": 0.2969031035900116, + "rewards/rejected": 0.15702682733535767, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, - "logits/chosen": -1.9282041788101196, - "logits/rejected": -1.9247684478759766, - "logps/chosen": -36.02125930786133, - "logps/rejected": -32.71831130981445, - "loss": 0.6454, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.13537634909152985, - "rewards/margins": 0.13137592375278473, - "rewards/rejected": 0.004000450484454632, + "logits/chosen": -1.9295704364776611, + "logits/rejected": -1.9261808395385742, + "logps/chosen": -35.74212646484375, + "logps/rejected": -32.51028060913086, + "loss": 0.4538, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.38659390807151794, + "rewards/margins": 0.1953679323196411, + "rewards/rejected": 0.1912260353565216, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, - "logits/chosen": -2.029125928878784, - "logits/rejected": -2.0217747688293457, - "logps/chosen": -33.49839401245117, - "logps/rejected": -31.400177001953125, - "loss": 0.5828, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.26951926946640015, - "rewards/margins": 0.3130132555961609, - "rewards/rejected": -0.04349397122859955, + "logits/chosen": -2.031176805496216, + "logits/rejected": -2.023855686187744, + "logps/chosen": -33.24225616455078, + "logps/rejected": -31.193195343017578, + "loss": 0.42, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5000473260879517, + "rewards/margins": 0.3572581112384796, + "rewards/rejected": 0.1427893042564392, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, - "logits/chosen": -2.0355944633483887, - "logits/rejected": -2.040832042694092, - "logps/chosen": -32.235923767089844, - "logps/rejected": -32.460418701171875, - "loss": 0.5943, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.2779761850833893, - "rewards/margins": 0.2557251751422882, - "rewards/rejected": 0.02225096896290779, + "logits/chosen": -2.038222074508667, + "logits/rejected": -2.0434067249298096, + "logps/chosen": -31.95560646057129, + "logps/rejected": -32.17836380004883, + "loss": 0.4404, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5302629470825195, + "rewards/margins": 0.2541634440422058, + "rewards/rejected": 0.2760995924472809, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, - "logits/chosen": -2.0362112522125244, - "logits/rejected": -2.0334599018096924, - "logps/chosen": -31.269250869750977, - "logps/rejected": -31.325435638427734, - "loss": 0.6245, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.19773444533348083, - "rewards/margins": 0.20423230528831482, - "rewards/rejected": -0.0064978525042533875, + "logits/chosen": -2.0387401580810547, + "logits/rejected": -2.036006450653076, + "logps/chosen": -31.0674991607666, + "logps/rejected": -31.083877563476562, + "loss": 0.4607, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3793107569217682, + "rewards/margins": 0.16841106116771698, + "rewards/rejected": 0.21089968085289001, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, - "logits/chosen": -1.9060389995574951, - "logits/rejected": -1.9106788635253906, - "logps/chosen": -31.306299209594727, - "logps/rejected": -32.81407165527344, - "loss": 0.5931, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.2699825167655945, - "rewards/margins": 0.2908058166503906, - "rewards/rejected": -0.02082330361008644, + "logits/chosen": -1.9085681438446045, + "logits/rejected": -1.9132543802261353, + "logps/chosen": -31.083459854125977, + "logps/rejected": -32.602638244628906, + "loss": 0.4308, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4705420434474945, + "rewards/margins": 0.301074743270874, + "rewards/rejected": 0.16946731507778168, "step": 200 }, { "epoch": 0.52, - "eval_logits/chosen": -2.231553792953491, - "eval_logits/rejected": -2.2267112731933594, - "eval_logps/chosen": -34.07304763793945, - "eval_logps/rejected": -37.57693862915039, - "eval_loss": 0.6979728937149048, - "eval_rewards/accuracies": 0.5157807469367981, - "eval_rewards/chosen": -0.03464451804757118, - "eval_rewards/margins": 0.019641490653157234, - "eval_rewards/rejected": -0.054286014288663864, - "eval_runtime": 145.8095, - "eval_samples_per_second": 2.352, + "eval_logits/chosen": -2.2338287830352783, + "eval_logits/rejected": -2.229020357131958, + "eval_logps/chosen": -33.7449951171875, + "eval_logps/rejected": -37.27743911743164, + "eval_loss": 0.48942965269088745, + "eval_rewards/accuracies": 0.5544019937515259, + "eval_rewards/chosen": 0.26059985160827637, + "eval_rewards/margins": 0.0453372597694397, + "eval_rewards/rejected": 0.21526260673999786, + "eval_runtime": 145.8953, + "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, - "logits/chosen": -2.018519163131714, - "logits/rejected": -2.0291810035705566, - "logps/chosen": -31.742992401123047, - "logps/rejected": -33.946937561035156, - "loss": 0.5902, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.2054794579744339, - "rewards/margins": 0.2812942862510681, - "rewards/rejected": -0.07581482082605362, + "logits/chosen": -2.0216596126556396, + "logits/rejected": -2.032275915145874, + "logps/chosen": -31.500268936157227, + "logps/rejected": -33.663352966308594, + "loss": 0.4458, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.42393389344215393, + "rewards/margins": 0.2445230931043625, + "rewards/rejected": 0.17941072583198547, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, - "logits/chosen": -1.911586046218872, - "logits/rejected": -1.9263393878936768, - "logps/chosen": -29.84616470336914, - "logps/rejected": -31.615009307861328, - "loss": 0.5879, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.23883743584156036, - "rewards/margins": 0.2899848222732544, - "rewards/rejected": -0.051147449761629105, + "logits/chosen": -1.9136396646499634, + "logits/rejected": -1.928344964981079, + "logps/chosen": -29.588964462280273, + "logps/rejected": -31.396224975585938, + "loss": 0.4269, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4703185558319092, + "rewards/margins": 0.3245617151260376, + "rewards/rejected": 0.1457568258047104, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, - "logits/chosen": -1.9677941799163818, - "logits/rejected": -1.9717823266983032, - "logps/chosen": -33.100074768066406, - "logps/rejected": -31.62213134765625, - "loss": 0.5748, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.28565075993537903, - "rewards/margins": 0.3511958718299866, - "rewards/rejected": -0.06554517149925232, + "logits/chosen": -1.970298171043396, + "logits/rejected": -1.974283218383789, + "logps/chosen": -32.81959915161133, + "logps/rejected": -31.408565521240234, + "loss": 0.4109, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5380831956863403, + "rewards/margins": 0.4114208221435547, + "rewards/rejected": 0.12666237354278564, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, - "logits/chosen": -1.9661725759506226, - "logits/rejected": -1.944300651550293, - "logps/chosen": -33.841453552246094, - "logps/rejected": -35.11375045776367, - "loss": 0.5473, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.2810631990432739, - "rewards/margins": 0.4277234673500061, - "rewards/rejected": -0.14666026830673218, + "logits/chosen": -1.9695065021514893, + "logits/rejected": -1.9477574825286865, + "logps/chosen": -33.58247756958008, + "logps/rejected": -34.828121185302734, + "loss": 0.4129, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5141419172286987, + "rewards/margins": 0.403735876083374, + "rewards/rejected": 0.11040612310171127, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, - "logits/chosen": -2.007416009902954, - "logits/rejected": -2.0040948390960693, - "logps/chosen": -32.70330810546875, - "logps/rejected": -36.29412841796875, - "loss": 0.5992, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.1995842456817627, - "rewards/margins": 0.2618715763092041, - "rewards/rejected": -0.06228730082511902, + "logits/chosen": -2.0098202228546143, + "logits/rejected": -2.0065340995788574, + "logps/chosen": -32.43529510498047, + "logps/rejected": -35.97461700439453, + "loss": 0.4514, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.44079580903053284, + "rewards/margins": 0.2155180424451828, + "rewards/rejected": 0.22527781128883362, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, - "logits/chosen": -1.8749721050262451, - "logits/rejected": -1.8725513219833374, - "logps/chosen": -34.00068664550781, - "logps/rejected": -35.53888702392578, - "loss": 0.6254, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.16894161701202393, - "rewards/margins": 0.1997825801372528, - "rewards/rejected": -0.030840963125228882, + "logits/chosen": -1.8770506381988525, + "logits/rejected": -1.8746120929718018, + "logps/chosen": -33.7199821472168, + "logps/rejected": -35.28092575073242, + "loss": 0.4498, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.42157459259033203, + "rewards/margins": 0.2202514111995697, + "rewards/rejected": 0.2013232260942459, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, - "logits/chosen": -1.8600317239761353, - "logits/rejected": -1.8576066493988037, - "logps/chosen": -34.1875, - "logps/rejected": -31.8159122467041, - "loss": 0.616, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.1890900433063507, - "rewards/margins": 0.22921428084373474, - "rewards/rejected": -0.04012420028448105, + "logits/chosen": -1.8618510961532593, + "logits/rejected": -1.8593294620513916, + "logps/chosen": -33.92017364501953, + "logps/rejected": -31.6002197265625, + "loss": 0.4397, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.42968273162841797, + "rewards/margins": 0.27568089962005615, + "rewards/rejected": 0.15400180220603943, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, - "logits/chosen": -1.9631398916244507, - "logits/rejected": -1.9526073932647705, - "logps/chosen": -35.023719787597656, - "logps/rejected": -31.869693756103516, - "loss": 0.5782, + "logits/chosen": -1.9657011032104492, + "logits/rejected": -1.9552650451660156, + "logps/chosen": -34.72232437133789, + "logps/rejected": -31.632369995117188, + "loss": 0.4114, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.29963088035583496, - "rewards/margins": 0.32546472549438477, - "rewards/rejected": -0.025833839550614357, + "rewards/chosen": 0.5708868503570557, + "rewards/margins": 0.3831265866756439, + "rewards/rejected": 0.18776027858257294, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, - "logits/chosen": -2.0582926273345947, - "logits/rejected": -2.0433640480041504, - "logps/chosen": -30.733753204345703, - "logps/rejected": -32.67460632324219, - "loss": 0.6392, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.17133468389511108, - "rewards/margins": 0.19182677567005157, - "rewards/rejected": -0.020492086187005043, + "logits/chosen": -2.0614376068115234, + "logits/rejected": -2.046600341796875, + "logps/chosen": -30.400625228881836, + "logps/rejected": -32.34136199951172, + "loss": 0.456, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4711507260799408, + "rewards/margins": 0.19172403216362, + "rewards/rejected": 0.2794266939163208, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, - "logits/chosen": -1.929610013961792, - "logits/rejected": -1.9270601272583008, - "logps/chosen": -32.42620086669922, - "logps/rejected": -30.873455047607422, - "loss": 0.5301, + "logits/chosen": -1.9332094192504883, + "logits/rejected": -1.9307467937469482, + "logps/chosen": -32.10976028442383, + "logps/rejected": -30.661523818969727, + "loss": 0.374, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.450817346572876, - "rewards/margins": 0.5018006563186646, - "rewards/rejected": -0.050983332097530365, + "rewards/chosen": 0.7356175184249878, + "rewards/margins": 0.5958597660064697, + "rewards/rejected": 0.13975778222084045, "step": 300 }, { "epoch": 0.78, - "eval_logits/chosen": -2.229154348373413, - "eval_logits/rejected": -2.2243051528930664, - "eval_logps/chosen": -34.09621810913086, - "eval_logps/rejected": -37.59999084472656, - "eval_loss": 0.6972895860671997, - "eval_rewards/accuracies": 0.5390365719795227, - "eval_rewards/chosen": -0.05550166219472885, - "eval_rewards/margins": 0.019528048112988472, - "eval_rewards/rejected": -0.07502970844507217, - "eval_runtime": 145.7792, - "eval_samples_per_second": 2.353, + "eval_logits/chosen": -2.2307660579681396, + "eval_logits/rejected": -2.22594952583313, + "eval_logps/chosen": -33.74896240234375, + "eval_logps/rejected": -37.275413513183594, + "eval_loss": 0.49038100242614746, + "eval_rewards/accuracies": 0.5220099687576294, + "eval_rewards/chosen": 0.2570302486419678, + "eval_rewards/margins": 0.039943769574165344, + "eval_rewards/rejected": 0.21708647906780243, + "eval_runtime": 145.8077, + "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, - "grad_norm": 11.75, - "learning_rate": 4.84533120650964e-06, - "logits/chosen": -2.0636165142059326, - "logits/rejected": -2.0508041381835938, - "logps/chosen": -32.113487243652344, - "logps/rejected": -32.89537811279297, - "loss": 0.4684, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.3734120726585388, - "rewards/margins": 0.6023003458976746, - "rewards/rejected": -0.2288883477449417, + "learning_rate": 5.576113578589035e-07, + "logits/chosen": -1.9164836406707764, + "logits/rejected": -1.913336992263794, + "logps/chosen": -31.014041900634766, + "logps/rejected": -33.548377990722656, + "loss": 0.4218, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5303782224655151, + "rewards/margins": 0.3559793531894684, + "rewards/rejected": 0.1743989735841751, "step": 310 }, { "epoch": 0.83, - "grad_norm": 10.875, - "learning_rate": 4.825108134172131e-06, - "logits/chosen": -1.9748560190200806, - "logits/rejected": -1.9662139415740967, - "logps/chosen": -31.80029296875, - "logps/rejected": -30.449291229248047, - "loss": 0.463, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.49922245740890503, - "rewards/margins": 0.6863331198692322, - "rewards/rejected": -0.18711069226264954, + "learning_rate": 4.229036944380913e-07, + "logits/chosen": -1.967760682106018, + "logits/rejected": -1.955615758895874, + "logps/chosen": -34.05602264404297, + "logps/rejected": -33.42683410644531, + "loss": 0.4147, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.48604661226272583, + "rewards/margins": 0.385240375995636, + "rewards/rejected": 0.10080619156360626, "step": 320 }, { "epoch": 0.86, - "grad_norm": 12.875, - "learning_rate": 4.80369052967602e-06, - "logits/chosen": -1.910094976425171, - "logits/rejected": -1.9221827983856201, - "logps/chosen": -29.87582778930664, - "logps/rejected": -33.66598129272461, - "loss": 0.4237, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.5430954694747925, - "rewards/margins": 0.7804504632949829, - "rewards/rejected": -0.23735502362251282, + "learning_rate": 3.053082288996112e-07, + "logits/chosen": -2.003138780593872, + "logits/rejected": -2.001786470413208, + "logps/chosen": -32.86919403076172, + "logps/rejected": -32.247493743896484, + "loss": 0.4259, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.5549365878105164, + "rewards/margins": 0.3460560441017151, + "rewards/rejected": 0.20888061821460724, "step": 330 }, { "epoch": 0.88, - "grad_norm": 18.375, - "learning_rate": 4.781089396387968e-06, - "logits/chosen": -1.8735644817352295, - "logits/rejected": -1.8643461465835571, - "logps/chosen": -34.02741241455078, - "logps/rejected": -36.179935455322266, - "loss": 0.4051, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.6043617725372314, - "rewards/margins": 0.89524906873703, - "rewards/rejected": -0.29088738560676575, + "learning_rate": 2.0579377374915805e-07, + "logits/chosen": -2.0895984172821045, + "logits/rejected": -2.073963165283203, + "logps/chosen": -33.46659469604492, + "logps/rejected": -32.82307815551758, + "loss": 0.4171, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6415189504623413, + "rewards/margins": 0.37605759501457214, + "rewards/rejected": 0.26546135544776917, "step": 340 }, { "epoch": 0.91, - "grad_norm": 12.25, - "learning_rate": 4.757316345716554e-06, - "logits/chosen": -1.9254121780395508, - "logits/rejected": -1.9260631799697876, - "logps/chosen": -33.68886947631836, - "logps/rejected": -34.135963439941406, - "loss": 0.4149, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.6447720527648926, - "rewards/margins": 0.9030619859695435, - "rewards/rejected": -0.2582899332046509, + "learning_rate": 1.2518018074041684e-07, + "logits/chosen": -1.9623206853866577, + "logits/rejected": -1.9614824056625366, + "logps/chosen": -32.549312591552734, + "logps/rejected": -32.24496078491211, + "loss": 0.4, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.6595046520233154, + "rewards/margins": 0.46133819222450256, + "rewards/rejected": 0.19816650450229645, "step": 350 }, { "epoch": 0.94, - "grad_norm": 14.9375, - "learning_rate": 4.73238359114687e-06, - "logits/chosen": -2.052173376083374, - "logits/rejected": -2.0583267211914062, - "logps/chosen": -31.09401512145996, - "logps/rejected": -33.012630462646484, - "loss": 0.4478, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.43130677938461304, - "rewards/margins": 0.7412186861038208, - "rewards/rejected": -0.30991190671920776, + "learning_rate": 6.41315865106129e-08, + "logits/chosen": -1.9189163446426392, + "logits/rejected": -1.9292027950286865, + "logps/chosen": -31.60186767578125, + "logps/rejected": -34.987525939941406, + "loss": 0.4401, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5425392985343933, + "rewards/margins": 0.27143171429634094, + "rewards/rejected": 0.27110758423805237, "step": 360 }, { "epoch": 0.96, - "grad_norm": 28.0, - "learning_rate": 4.706303941965804e-06, - "logits/chosen": -1.9808381795883179, - "logits/rejected": -1.9804092645645142, - "logps/chosen": -32.843143463134766, - "logps/rejected": -36.34934616088867, - "loss": 0.4406, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.5416162610054016, - "rewards/margins": 0.8112291097640991, - "rewards/rejected": -0.2696128487586975, + "learning_rate": 2.3150941078050325e-08, + "logits/chosen": -2.0577945709228516, + "logits/rejected": -2.051274061203003, + "logps/chosen": -33.05121612548828, + "logps/rejected": -28.990320205688477, + "loss": 0.4233, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5406805276870728, + "rewards/margins": 0.3281847834587097, + "rewards/rejected": 0.21249575912952423, "step": 370 }, { "epoch": 0.99, - "grad_norm": 10.5, - "learning_rate": 4.679090796681225e-06, - "logits/chosen": -2.012341022491455, - "logits/rejected": -2.0077567100524902, - "logps/chosen": -30.083026885986328, - "logps/rejected": -29.55636215209961, - "loss": 0.411, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.5648170709609985, - "rewards/margins": 0.873041033744812, - "rewards/rejected": -0.30822402238845825, + "learning_rate": 2.575864278703266e-09, + "logits/chosen": -1.917284607887268, + "logits/rejected": -1.9194421768188477, + "logps/chosen": -33.59749984741211, + "logps/rejected": -30.708057403564453, + "loss": 0.4061, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5773938298225403, + "rewards/margins": 0.429278701543808, + "rewards/rejected": 0.14811506867408752, "step": 380 }, { - "epoch": 1.01, - "grad_norm": 13.75, - "learning_rate": 4.650758136138454e-06, - "logits/chosen": -1.784257173538208, - "logits/rejected": -1.7905915975570679, - "logps/chosen": -31.67917823791504, - "logps/rejected": -36.660545349121094, - "loss": 0.3793, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.635884165763855, - "rewards/margins": 1.1239113807678223, - "rewards/rejected": -0.4880271553993225, - "step": 390 - }, - { - "epoch": 1.04, - "grad_norm": 10.625, - "learning_rate": 4.621320516337559e-06, - "logits/chosen": -1.937954306602478, - "logits/rejected": -1.931673288345337, - "logps/chosen": -33.02653121948242, - "logps/rejected": -32.67657470703125, - "loss": 0.389, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.7255499958992004, - "rewards/margins": 1.0564748048782349, - "rewards/rejected": -0.33092474937438965, - "step": 400 - }, - { - "epoch": 1.04, - "eval_logits/chosen": -2.2031846046447754, - "eval_logits/rejected": -2.198335647583008, - "eval_logps/chosen": -34.16795349121094, - "eval_logps/rejected": -37.72208786010742, - "eval_loss": 0.6933022737503052, - "eval_rewards/accuracies": 0.550664484500885, - "eval_rewards/chosen": -0.12006273865699768, - "eval_rewards/margins": 0.06485801190137863, - "eval_rewards/rejected": -0.1849207729101181, - "eval_runtime": 145.5175, - "eval_samples_per_second": 2.357, - "eval_steps_per_second": 0.295, - "step": 400 - }, - { - "epoch": 1.06, - "grad_norm": 13.375, - "learning_rate": 4.590793060955158e-06, - "logits/chosen": -1.9355392456054688, - "logits/rejected": -1.9427944421768188, - "logps/chosen": -28.549734115600586, - "logps/rejected": -29.689483642578125, - "loss": 0.3677, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.5135748982429504, - "rewards/margins": 1.0457103252410889, - "rewards/rejected": -0.5321354866027832, - "step": 410 - }, - { - "epoch": 1.09, - "grad_norm": 11.375, - "learning_rate": 4.559191453574582e-06, - "logits/chosen": -1.9551494121551514, - "logits/rejected": -1.9541441202163696, - "logps/chosen": -33.60606002807617, - "logps/rejected": -31.240774154663086, - "loss": 0.4587, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.5801833868026733, - "rewards/margins": 0.8481400609016418, - "rewards/rejected": -0.2679567039012909, - "step": 420 - }, - { - "epoch": 1.12, - "grad_norm": 15.125, - "learning_rate": 4.52653192962838e-06, - "logits/chosen": -1.9499473571777344, - "logits/rejected": -1.932682752609253, - "logps/chosen": -30.44845199584961, - "logps/rejected": -33.599735260009766, - "loss": 0.3784, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.4758743345737457, - "rewards/margins": 1.0819809436798096, - "rewards/rejected": -0.6061066389083862, - "step": 430 - }, - { - "epoch": 1.14, - "grad_norm": 14.1875, - "learning_rate": 4.492831268057307e-06, - "logits/chosen": -1.9813868999481201, - "logits/rejected": -1.9835193157196045, - "logps/chosen": -35.81322479248047, - "logps/rejected": -35.490821838378906, - "loss": 0.313, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.7192217111587524, - "rewards/margins": 1.2869927883148193, - "rewards/rejected": -0.5677711367607117, - "step": 440 - }, - { - "epoch": 1.17, - "grad_norm": 12.3125, - "learning_rate": 4.458106782690094e-06, - "logits/chosen": -2.05714750289917, - "logits/rejected": -2.0569212436676025, - "logps/chosen": -31.911890029907227, - "logps/rejected": -34.006473541259766, - "loss": 0.3759, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.6493128538131714, - "rewards/margins": 1.0510342121124268, - "rewards/rejected": -0.40172141790390015, - "step": 450 - }, - { - "epoch": 1.19, - "grad_norm": 11.9375, - "learning_rate": 4.422376313348405e-06, - "logits/chosen": -2.001530885696411, - "logits/rejected": -1.9940494298934937, - "logps/chosen": -31.56293296813965, - "logps/rejected": -36.774314880371094, - "loss": 0.3025, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.7587249279022217, - "rewards/margins": 1.4069726467132568, - "rewards/rejected": -0.6482478380203247, - "step": 460 - }, - { - "epoch": 1.22, - "grad_norm": 20.375, - "learning_rate": 4.3856582166815696e-06, - "logits/chosen": -1.9044301509857178, - "logits/rejected": -1.900957465171814, - "logps/chosen": -33.156280517578125, - "logps/rejected": -33.48976516723633, - "loss": 0.3626, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.6984156966209412, - "rewards/margins": 1.2077140808105469, - "rewards/rejected": -0.5092984437942505, - "step": 470 - }, - { - "epoch": 1.25, - "grad_norm": 14.0625, - "learning_rate": 4.347971356735789e-06, - "logits/chosen": -2.0298960208892822, - "logits/rejected": -2.0229077339172363, - "logps/chosen": -30.285165786743164, - "logps/rejected": -32.622947692871094, - "loss": 0.4059, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.48903241753578186, - "rewards/margins": 1.0555496215820312, - "rewards/rejected": -0.566517174243927, - "step": 480 - }, - { - "epoch": 1.27, - "grad_norm": 9.4375, - "learning_rate": 4.309335095262675e-06, - "logits/chosen": -1.9742801189422607, - "logits/rejected": -1.975847840309143, - "logps/chosen": -34.49077606201172, - "logps/rejected": -34.25959396362305, - "loss": 0.3167, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.7761706113815308, - "rewards/margins": 1.3324440717697144, - "rewards/rejected": -0.5562735795974731, - "step": 490 - }, - { - "epoch": 1.3, - "grad_norm": 20.25, - "learning_rate": 4.269769281772082e-06, - "logits/chosen": -1.8631584644317627, - "logits/rejected": -1.860769510269165, - "logps/chosen": -32.35368347167969, - "logps/rejected": -37.259803771972656, - "loss": 0.322, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.7367446422576904, - "rewards/margins": 1.452678918838501, - "rewards/rejected": -0.715934157371521, - "step": 500 - }, - { - "epoch": 1.3, - "eval_logits/chosen": -2.201748847961426, - "eval_logits/rejected": -2.1969025135040283, - "eval_logps/chosen": -34.34733200073242, - "eval_logps/rejected": -37.91175842285156, - "eval_loss": 0.7055429816246033, - "eval_rewards/accuracies": 0.5514950156211853, - "eval_rewards/chosen": -0.281506210565567, - "eval_rewards/margins": 0.07411985099315643, - "eval_rewards/rejected": -0.35562604665756226, - "eval_runtime": 145.2329, - "eval_samples_per_second": 2.362, - "eval_steps_per_second": 0.296, - "step": 500 - }, - { - "epoch": 1.32, - "grad_norm": 14.9375, - "learning_rate": 4.22929424333435e-06, - "logits/chosen": -1.9645631313323975, - "logits/rejected": -1.9692989587783813, - "logps/chosen": -32.608741760253906, - "logps/rejected": -32.16291809082031, - "loss": 0.3561, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.7637556791305542, - "rewards/margins": 1.2288849353790283, - "rewards/rejected": -0.46512943506240845, - "step": 510 - }, - { - "epoch": 1.35, - "grad_norm": 14.375, - "learning_rate": 4.1879307741372085e-06, - "logits/chosen": -1.9946361780166626, - "logits/rejected": -2.005610466003418, - "logps/chosen": -30.554733276367188, - "logps/rejected": -32.16284942626953, - "loss": 0.358, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.7521130442619324, - "rewards/margins": 1.3417600393295288, - "rewards/rejected": -0.589647114276886, - "step": 520 - }, - { - "epoch": 1.38, - "grad_norm": 6.53125, - "learning_rate": 4.145700124802693e-06, - "logits/chosen": -1.9223169088363647, - "logits/rejected": -1.9189828634262085, - "logps/chosen": -31.747081756591797, - "logps/rejected": -33.04930877685547, - "loss": 0.349, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.6060883402824402, - "rewards/margins": 1.2239677906036377, - "rewards/rejected": -0.6178793907165527, - "step": 530 - }, - { - "epoch": 1.4, - "grad_norm": 11.0, - "learning_rate": 4.102623991469562e-06, - "logits/chosen": -1.7875115871429443, - "logits/rejected": -1.7967026233673096, - "logps/chosen": -31.79451560974121, - "logps/rejected": -32.5256233215332, - "loss": 0.3606, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.697385847568512, - "rewards/margins": 1.3000586032867432, - "rewards/rejected": -0.602672815322876, - "step": 540 - }, - { - "epoch": 1.43, - "grad_norm": 13.1875, - "learning_rate": 4.058724504646834e-06, - "logits/chosen": -1.8841272592544556, - "logits/rejected": -1.8778330087661743, - "logps/chosen": -32.83342742919922, - "logps/rejected": -31.528995513916016, - "loss": 0.3722, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.7873939275741577, - "rewards/margins": 1.2712757587432861, - "rewards/rejected": -0.48388180136680603, - "step": 550 - }, - { - "epoch": 1.45, - "grad_norm": 10.125, - "learning_rate": 4.014024217844167e-06, - "logits/chosen": -1.9727208614349365, - "logits/rejected": -1.9707006216049194, - "logps/chosen": -33.61963653564453, - "logps/rejected": -31.988178253173828, - "loss": 0.3617, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.8564074635505676, - "rewards/margins": 1.298119306564331, - "rewards/rejected": -0.4417116641998291, - "step": 560 - }, - { - "epoch": 1.48, - "grad_norm": 12.625, - "learning_rate": 3.968546095984911e-06, - "logits/chosen": -1.8058189153671265, - "logits/rejected": -1.8036988973617554, - "logps/chosen": -31.945751190185547, - "logps/rejected": -31.496994018554688, - "loss": 0.3948, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.7655945420265198, - "rewards/margins": 1.210815191268921, - "rewards/rejected": -0.44522079825401306, - "step": 570 - }, - { - "epoch": 1.51, - "grad_norm": 13.125, - "learning_rate": 3.922313503607806e-06, - "logits/chosen": -1.9404058456420898, - "logits/rejected": -1.9370429515838623, - "logps/chosen": -30.20537757873535, - "logps/rejected": -35.37580871582031, - "loss": 0.32, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.8093468546867371, - "rewards/margins": 1.4840278625488281, - "rewards/rejected": -0.6746810078620911, - "step": 580 - }, - { - "epoch": 1.53, - "grad_norm": 9.6875, - "learning_rate": 3.875350192863368e-06, - "logits/chosen": -1.8791511058807373, - "logits/rejected": -1.8827145099639893, - "logps/chosen": -28.8929386138916, - "logps/rejected": -31.133419036865234, - "loss": 0.3795, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.6498847603797913, - "rewards/margins": 1.0769810676574707, - "rewards/rejected": -0.4270961880683899, - "step": 590 - }, - { - "epoch": 1.56, - "grad_norm": 15.0625, - "learning_rate": 3.8276802913111436e-06, - "logits/chosen": -1.915834665298462, - "logits/rejected": -1.9157018661499023, - "logps/chosen": -31.168197631835938, - "logps/rejected": -31.740047454833984, - "loss": 0.327, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.9480020403862, - "rewards/margins": 1.4403297901153564, - "rewards/rejected": -0.49232783913612366, - "step": 600 - }, - { - "epoch": 1.56, - "eval_logits/chosen": -2.1867270469665527, - "eval_logits/rejected": -2.1818978786468506, - "eval_logps/chosen": -34.19493103027344, - "eval_logps/rejected": -37.843685150146484, - "eval_loss": 0.6703336834907532, - "eval_rewards/accuracies": 0.5805647969245911, - "eval_rewards/chosen": -0.14434270560741425, - "eval_rewards/margins": 0.15001599490642548, - "eval_rewards/rejected": -0.2943587005138397, - "eval_runtime": 145.4291, - "eval_samples_per_second": 2.359, - "eval_steps_per_second": 0.296, - "step": 600 - }, - { - "epoch": 1.58, - "grad_norm": 13.75, - "learning_rate": 3.7793282895240927e-06, - "logits/chosen": -1.9787023067474365, - "logits/rejected": -1.9794394969940186, - "logps/chosen": -33.91895294189453, - "logps/rejected": -33.77281951904297, - "loss": 0.2961, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.0000724792480469, - "rewards/margins": 1.6198437213897705, - "rewards/rejected": -0.6197710037231445, - "step": 610 - }, - { - "epoch": 1.61, - "grad_norm": 10.25, - "learning_rate": 3.730319028506478e-06, - "logits/chosen": -1.9425113201141357, - "logits/rejected": -1.9399850368499756, - "logps/chosen": -32.121673583984375, - "logps/rejected": -32.70948791503906, - "loss": 0.3125, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.9033487439155579, - "rewards/margins": 1.4789619445800781, - "rewards/rejected": -0.5756131410598755, - "step": 620 - }, - { - "epoch": 1.64, - "grad_norm": 30.75, - "learning_rate": 3.6806776869317074e-06, - "logits/chosen": -1.953460693359375, - "logits/rejected": -1.9444434642791748, - "logps/chosen": -31.709529876708984, - "logps/rejected": -31.5223388671875, - "loss": 0.3619, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.9599300622940063, - "rewards/margins": 1.488205909729004, - "rewards/rejected": -0.5282759070396423, - "step": 630 - }, - { - "epoch": 1.66, - "grad_norm": 17.25, - "learning_rate": 3.6304297682067146e-06, - "logits/chosen": -1.9538257122039795, - "logits/rejected": -1.9505916833877563, - "logps/chosen": -31.257614135742188, - "logps/rejected": -33.02531814575195, - "loss": 0.3186, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.8467265367507935, - "rewards/margins": 1.3920328617095947, - "rewards/rejected": -0.5453063249588013, - "step": 640 - }, - { - "epoch": 1.69, - "grad_norm": 9.4375, - "learning_rate": 3.579601087369492e-06, - "logits/chosen": -1.9619709253311157, - "logits/rejected": -1.9643146991729736, - "logps/chosen": -32.65868377685547, - "logps/rejected": -34.36846160888672, - "loss": 0.2785, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.8955224752426147, - "rewards/margins": 1.512078881263733, - "rewards/rejected": -0.6165562868118286, - "step": 650 - }, - { - "epoch": 1.71, - "grad_norm": 19.625, - "learning_rate": 3.5282177578265295e-06, - "logits/chosen": -1.8742033243179321, - "logits/rejected": -1.8746894598007202, - "logps/chosen": -32.91667175292969, - "logps/rejected": -32.129493713378906, - "loss": 0.328, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.9271462559700012, - "rewards/margins": 1.3971627950668335, - "rewards/rejected": -0.4700165390968323, - "step": 660 - }, - { - "epoch": 1.74, - "grad_norm": 14.1875, - "learning_rate": 3.476306177936961e-06, - "logits/chosen": -1.9133888483047485, - "logits/rejected": -1.9037227630615234, - "logps/chosen": -32.933998107910156, - "logps/rejected": -33.00373077392578, - "loss": 0.2651, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.9605558514595032, - "rewards/margins": 1.586902379989624, - "rewards/rejected": -0.6263464689254761, - "step": 670 - }, - { - "epoch": 1.77, - "grad_norm": 10.1875, - "learning_rate": 3.423893017450324e-06, - "logits/chosen": -1.8113712072372437, - "logits/rejected": -1.8081716299057007, - "logps/chosen": -30.349456787109375, - "logps/rejected": -34.96870422363281, - "loss": 0.2808, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.0003511905670166, - "rewards/margins": 1.5854613780975342, - "rewards/rejected": -0.5851101875305176, - "step": 680 - }, - { - "epoch": 1.79, - "grad_norm": 8.4375, - "learning_rate": 3.3710052038048794e-06, - "logits/chosen": -1.8722549676895142, - "logits/rejected": -1.8714803457260132, - "logps/chosen": -34.03639602661133, - "logps/rejected": -36.12696075439453, - "loss": 0.2342, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.110505223274231, - "rewards/margins": 1.8063312768936157, - "rewards/rejected": -0.6958259344100952, - "step": 690 - }, - { - "epoch": 1.82, - "grad_norm": 6.90625, - "learning_rate": 3.3176699082935546e-06, - "logits/chosen": -1.8479416370391846, - "logits/rejected": -1.8507680892944336, - "logps/chosen": -31.403573989868164, - "logps/rejected": -36.341434478759766, - "loss": 0.3034, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.079347014427185, - "rewards/margins": 1.7091315984725952, - "rewards/rejected": -0.6297845840454102, - "step": 700 - }, - { - "epoch": 1.82, - "eval_logits/chosen": -2.174877166748047, - "eval_logits/rejected": -2.1700735092163086, - "eval_logps/chosen": -34.240230560302734, - "eval_logps/rejected": -37.869354248046875, - "eval_loss": 0.6868197917938232, - "eval_rewards/accuracies": 0.565614640712738, - "eval_rewards/chosen": -0.18511110544204712, - "eval_rewards/margins": 0.13234683871269226, - "eval_rewards/rejected": -0.31745797395706177, - "eval_runtime": 145.1401, - "eval_samples_per_second": 2.363, - "eval_steps_per_second": 0.296, - "step": 700 - }, - { - "epoch": 1.84, - "grad_norm": 15.9375, - "learning_rate": 3.2639145321045933e-06, - "logits/chosen": -1.9555838108062744, - "logits/rejected": -1.958298683166504, - "logps/chosen": -33.82456970214844, - "logps/rejected": -34.84859085083008, - "loss": 0.3553, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.9639171361923218, - "rewards/margins": 1.4845099449157715, - "rewards/rejected": -0.5205925703048706, - "step": 710 - }, - { - "epoch": 1.87, - "grad_norm": 14.375, - "learning_rate": 3.2097666922441107e-06, - "logits/chosen": -1.8090355396270752, - "logits/rejected": -1.8031476736068726, - "logps/chosen": -33.71675491333008, - "logps/rejected": -32.93245315551758, - "loss": 0.323, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 1.0048266649246216, - "rewards/margins": 1.527376413345337, - "rewards/rejected": -0.5225496292114258, - "step": 720 - }, - { - "epoch": 1.9, - "grad_norm": 7.46875, - "learning_rate": 3.1552542073477554e-06, - "logits/chosen": -1.9834911823272705, - "logits/rejected": -1.9804458618164062, - "logps/chosen": -29.5953426361084, - "logps/rejected": -32.31734085083008, - "loss": 0.2567, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9518804550170898, - "rewards/margins": 1.7341070175170898, - "rewards/rejected": -0.7822265028953552, - "step": 730 - }, - { - "epoch": 1.92, - "grad_norm": 7.28125, - "learning_rate": 3.100405083388799e-06, - "logits/chosen": -1.8222984075546265, - "logits/rejected": -1.8223203420639038, - "logps/chosen": -32.201148986816406, - "logps/rejected": -38.2669792175293, - "loss": 0.2789, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.1358217000961304, - "rewards/margins": 1.7110786437988281, - "rewards/rejected": -0.5752568244934082, - "step": 740 - }, - { - "epoch": 1.95, - "grad_norm": 6.75, - "learning_rate": 3.0452474992899645e-06, - "logits/chosen": -1.7060989141464233, - "logits/rejected": -1.711395025253296, - "logps/chosen": -35.7166748046875, - "logps/rejected": -34.62081527709961, - "loss": 0.3267, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.09420907497406, - "rewards/margins": 1.6778392791748047, - "rewards/rejected": -0.5836302638053894, - "step": 750 - }, - { - "epoch": 1.97, - "grad_norm": 9.0, - "learning_rate": 2.989809792446417e-06, - "logits/chosen": -1.9088201522827148, - "logits/rejected": -1.910348892211914, - "logps/chosen": -31.48809814453125, - "logps/rejected": -33.39298629760742, - "loss": 0.2839, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.005824327468872, - "rewards/margins": 1.5582213401794434, - "rewards/rejected": -0.5523970723152161, - "step": 760 - }, - { - "epoch": 2.0, - "grad_norm": 14.625, - "learning_rate": 2.9341204441673267e-06, - "logits/chosen": -1.882775068283081, - "logits/rejected": -1.8818010091781616, - "logps/chosen": -30.975149154663086, - "logps/rejected": -35.45886993408203, - "loss": 0.3076, - "rewards/accuracies": 0.908333420753479, - "rewards/chosen": 1.0804139375686646, - "rewards/margins": 1.4680591821670532, - "rewards/rejected": -0.3876451253890991, - "step": 770 - }, - { - "epoch": 2.03, - "grad_norm": 6.53125, - "learning_rate": 2.878208065043501e-06, - "logits/chosen": -1.882063627243042, - "logits/rejected": -1.8813728094100952, - "logps/chosen": -33.322509765625, - "logps/rejected": -32.38517379760742, - "loss": 0.1884, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.193357229232788, - "rewards/margins": 2.062441349029541, - "rewards/rejected": -0.8690838813781738, - "step": 780 - }, - { - "epoch": 2.05, - "grad_norm": 10.0625, - "learning_rate": 2.8221013802485974e-06, - "logits/chosen": -1.9113022089004517, - "logits/rejected": -1.909850835800171, - "logps/chosen": -28.325769424438477, - "logps/rejected": -33.69379425048828, - "loss": 0.1718, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.1136726140975952, - "rewards/margins": 2.0654101371765137, - "rewards/rejected": -0.9517375826835632, - "step": 790 - }, - { - "epoch": 2.08, - "grad_norm": 4.375, - "learning_rate": 2.76582921478147e-06, - "logits/chosen": -1.9627540111541748, - "logits/rejected": -1.959313988685608, - "logps/chosen": -31.032363891601562, - "logps/rejected": -35.28049087524414, - "loss": 0.1649, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.2553597688674927, - "rewards/margins": 2.2286696434020996, - "rewards/rejected": -0.9733098745346069, - "step": 800 - }, - { - "epoch": 2.08, - "eval_logits/chosen": -2.164193868637085, - "eval_logits/rejected": -2.1594185829162598, - "eval_logps/chosen": -34.28217315673828, - "eval_logps/rejected": -37.94434356689453, - "eval_loss": 0.6811564564704895, - "eval_rewards/accuracies": 0.595099687576294, - "eval_rewards/chosen": -0.22285737097263336, - "eval_rewards/margins": 0.16209454834461212, - "eval_rewards/rejected": -0.3849518895149231, - "eval_runtime": 145.2348, - "eval_samples_per_second": 2.362, - "eval_steps_per_second": 0.296, - "step": 800 - }, - { - "epoch": 2.1, - "grad_norm": 7.375, - "learning_rate": 2.7094204786572254e-06, - "logits/chosen": -1.793891191482544, - "logits/rejected": -1.7861675024032593, - "logps/chosen": -32.827354431152344, - "logps/rejected": -35.588600158691406, - "loss": 0.1606, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.465995192527771, - "rewards/margins": 2.391630172729492, - "rewards/rejected": -0.9256349802017212, - "step": 810 - }, - { - "epoch": 2.13, - "grad_norm": 8.75, - "learning_rate": 2.6529041520546072e-06, - "logits/chosen": -1.8692048788070679, - "logits/rejected": -1.879642128944397, - "logps/chosen": -34.127784729003906, - "logps/rejected": -33.262062072753906, - "loss": 0.1935, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4111144542694092, - "rewards/margins": 2.158674955368042, - "rewards/rejected": -0.7475605607032776, - "step": 820 - }, - { - "epoch": 2.16, - "grad_norm": 11.375, - "learning_rate": 2.5963092704273302e-06, - "logits/chosen": -1.918723702430725, - "logits/rejected": -1.9233121871948242, - "logps/chosen": -33.50402069091797, - "logps/rejected": -30.132360458374023, - "loss": 0.1956, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2588849067687988, - "rewards/margins": 2.036588430404663, - "rewards/rejected": -0.777703583240509, - "step": 830 - }, - { - "epoch": 2.18, - "grad_norm": 6.65625, - "learning_rate": 2.53966490958702e-06, - "logits/chosen": -1.9170500040054321, - "logits/rejected": -1.9250987768173218, - "logps/chosen": -33.287925720214844, - "logps/rejected": -30.88974952697754, - "loss": 0.1648, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.350099802017212, - "rewards/margins": 2.28794264793396, - "rewards/rejected": -0.937842845916748, - "step": 840 - }, - { - "epoch": 2.21, - "grad_norm": 9.1875, - "learning_rate": 2.4830001707654135e-06, - "logits/chosen": -1.8453495502471924, - "logits/rejected": -1.835889458656311, - "logps/chosen": -30.441539764404297, - "logps/rejected": -32.993690490722656, - "loss": 0.1814, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.2164894342422485, - "rewards/margins": 2.098388195037842, - "rewards/rejected": -0.8818984031677246, - "step": 850 - }, - { - "epoch": 2.23, - "grad_norm": 5.25, - "learning_rate": 2.4263441656635054e-06, - "logits/chosen": -1.983048677444458, - "logits/rejected": -1.973232626914978, - "logps/chosen": -24.832033157348633, - "logps/rejected": -30.95058250427246, - "loss": 0.1791, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.2528350353240967, - "rewards/margins": 2.2232518196105957, - "rewards/rejected": -0.9704168438911438, - "step": 860 - }, - { - "epoch": 2.26, - "grad_norm": 7.1875, - "learning_rate": 2.3697260014953107e-06, - "logits/chosen": -1.8357470035552979, - "logits/rejected": -1.8370872735977173, - "logps/chosen": -32.829017639160156, - "logps/rejected": -30.976612091064453, - "loss": 0.1644, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.353589653968811, - "rewards/margins": 2.2482855319976807, - "rewards/rejected": -0.8946956396102905, - "step": 870 - }, - { - "epoch": 2.29, - "grad_norm": 6.46875, - "learning_rate": 2.3131747660339396e-06, - "logits/chosen": -1.8432958126068115, - "logits/rejected": -1.8441736698150635, - "logps/chosen": -31.128490447998047, - "logps/rejected": -34.13156509399414, - "loss": 0.2286, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.1346783638000488, - "rewards/margins": 2.107259750366211, - "rewards/rejected": -0.9725813865661621, - "step": 880 - }, - { - "epoch": 2.31, - "grad_norm": 6.4375, - "learning_rate": 2.256719512667651e-06, - "logits/chosen": -1.747079849243164, - "logits/rejected": -1.7454869747161865, - "logps/chosen": -34.19733810424805, - "logps/rejected": -37.23557662963867, - "loss": 0.1614, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3208175897598267, - "rewards/margins": 2.543323040008545, - "rewards/rejected": -1.2225055694580078, - "step": 890 - }, - { - "epoch": 2.34, - "grad_norm": 6.0, - "learning_rate": 2.2003892454735786e-06, - "logits/chosen": -1.8969894647598267, - "logits/rejected": -1.8902244567871094, - "logps/chosen": -30.53342056274414, - "logps/rejected": -33.870323181152344, - "loss": 0.1691, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3222969770431519, - "rewards/margins": 2.3233275413513184, - "rewards/rejected": -1.0010308027267456, - "step": 900 - }, - { - "epoch": 2.34, - "eval_logits/chosen": -2.15236496925354, - "eval_logits/rejected": -2.1475820541381836, - "eval_logps/chosen": -34.313846588134766, - "eval_logps/rejected": -37.98138427734375, - "eval_loss": 0.6880638599395752, - "eval_rewards/accuracies": 0.5830564498901367, - "eval_rewards/chosen": -0.2513664662837982, - "eval_rewards/margins": 0.16692043840885162, - "eval_rewards/rejected": -0.41828688979148865, - "eval_runtime": 145.3621, - "eval_samples_per_second": 2.36, - "eval_steps_per_second": 0.296, - "step": 900 - }, - { - "epoch": 2.36, - "grad_norm": 12.0625, - "learning_rate": 2.1442129043167877e-06, - "logits/chosen": -1.9361755847930908, - "logits/rejected": -1.9315325021743774, - "logps/chosen": -32.13105010986328, - "logps/rejected": -35.70573425292969, - "loss": 0.1596, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2839645147323608, - "rewards/margins": 2.3433659076690674, - "rewards/rejected": -1.0594011545181274, - "step": 910 - }, - { - "epoch": 2.39, - "grad_norm": 6.5, - "learning_rate": 2.088219349982323e-06, - "logits/chosen": -1.8552948236465454, - "logits/rejected": -1.860346794128418, - "logps/chosen": -33.7907829284668, - "logps/rejected": -33.559757232666016, - "loss": 0.1777, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4020235538482666, - "rewards/margins": 2.3558895587921143, - "rewards/rejected": -0.9538658261299133, - "step": 920 - }, - { - "epoch": 2.42, - "grad_norm": 6.53125, - "learning_rate": 2.0324373493478803e-06, - "logits/chosen": -1.953168272972107, - "logits/rejected": -1.943918228149414, - "logps/chosen": -30.902240753173828, - "logps/rejected": -35.58992385864258, - "loss": 0.1581, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.2444217205047607, - "rewards/margins": 2.276834011077881, - "rewards/rejected": -1.032412052154541, - "step": 930 - }, - { - "epoch": 2.44, - "grad_norm": 8.5, - "learning_rate": 1.976895560604729e-06, - "logits/chosen": -1.879119873046875, - "logits/rejected": -1.8759711980819702, - "logps/chosen": -30.023090362548828, - "logps/rejected": -33.454833984375, - "loss": 0.2017, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.1388328075408936, - "rewards/margins": 2.029343366622925, - "rewards/rejected": -0.8905106782913208, - "step": 940 - }, - { - "epoch": 2.47, - "grad_norm": 4.625, - "learning_rate": 1.921622518534466e-06, - "logits/chosen": -1.8175594806671143, - "logits/rejected": -1.8249561786651611, - "logps/chosen": -31.570093154907227, - "logps/rejected": -36.65325164794922, - "loss": 0.2168, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.1708660125732422, - "rewards/margins": 2.163855791091919, - "rewards/rejected": -0.9929895401000977, - "step": 950 - }, - { - "epoch": 2.49, - "grad_norm": 3.953125, - "learning_rate": 1.8666466198491794e-06, - "logits/chosen": -1.8278528451919556, - "logits/rejected": -1.8209831714630127, - "logps/chosen": -32.37910842895508, - "logps/rejected": -37.90230178833008, - "loss": 0.1494, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3044663667678833, - "rewards/margins": 2.4776108264923096, - "rewards/rejected": -1.1731446981430054, - "step": 960 - }, - { - "epoch": 2.52, - "grad_norm": 5.375, - "learning_rate": 1.8119961086025376e-06, - "logits/chosen": -1.8297231197357178, - "logits/rejected": -1.8303101062774658, - "logps/chosen": -29.768722534179688, - "logps/rejected": -33.98983383178711, - "loss": 0.1847, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.3209145069122314, - "rewards/margins": 2.2286314964294434, - "rewards/rejected": -0.9077168703079224, - "step": 970 - }, - { - "epoch": 2.55, - "grad_norm": 7.9375, - "learning_rate": 1.7576990616793139e-06, - "logits/chosen": -1.8326854705810547, - "logits/rejected": -1.8439195156097412, - "logps/chosen": -31.450542449951172, - "logps/rejected": -35.19971466064453, - "loss": 0.168, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.4635357856750488, - "rewards/margins": 2.4251866340637207, - "rewards/rejected": -0.9616511464118958, - "step": 980 - }, - { - "epoch": 2.57, - "grad_norm": 6.0, - "learning_rate": 1.7037833743707892e-06, - "logits/chosen": -1.903786301612854, - "logits/rejected": -1.901314377784729, - "logps/chosen": -34.712928771972656, - "logps/rejected": -33.3343620300293, - "loss": 0.2409, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.2673081159591675, - "rewards/margins": 2.1089327335357666, - "rewards/rejected": -0.8416244387626648, - "step": 990 - }, - { - "epoch": 2.6, - "grad_norm": 9.5, - "learning_rate": 1.6502767460434588e-06, - "logits/chosen": -1.875101089477539, - "logits/rejected": -1.8791959285736084, - "logps/chosen": -33.045143127441406, - "logps/rejected": -35.46049880981445, - "loss": 0.1953, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2763327360153198, - "rewards/margins": 2.153337001800537, - "rewards/rejected": -0.8770040273666382, - "step": 1000 - }, - { - "epoch": 2.6, - "eval_logits/chosen": -2.1447298526763916, - "eval_logits/rejected": -2.139974594116211, - "eval_logps/chosen": -34.36631774902344, - "eval_logps/rejected": -38.036582946777344, - "eval_loss": 0.695711076259613, - "eval_rewards/accuracies": 0.5917773842811584, - "eval_rewards/chosen": -0.2985913157463074, - "eval_rewards/margins": 0.16937348246574402, - "eval_rewards/rejected": -0.4679647982120514, - "eval_runtime": 145.3859, - "eval_samples_per_second": 2.359, - "eval_steps_per_second": 0.296, - "step": 1000 - }, - { - "epoch": 2.62, - "grad_norm": 20.25, - "learning_rate": 1.5972066659083796e-06, - "logits/chosen": -1.8841043710708618, - "logits/rejected": -1.8890196084976196, - "logps/chosen": -31.524646759033203, - "logps/rejected": -33.430824279785156, - "loss": 0.2343, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.1482000350952148, - "rewards/margins": 1.9668105840682983, - "rewards/rejected": -0.8186105489730835, - "step": 1010 - }, - { - "epoch": 2.65, - "grad_norm": 15.5625, - "learning_rate": 1.5446003988985041e-06, - "logits/chosen": -1.843640685081482, - "logits/rejected": -1.8461425304412842, - "logps/chosen": -29.06394386291504, - "logps/rejected": -32.841705322265625, - "loss": 0.1766, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2435271739959717, - "rewards/margins": 2.2718143463134766, - "rewards/rejected": -1.0282870531082153, - "step": 1020 - }, - { - "epoch": 2.68, - "grad_norm": 4.625, - "learning_rate": 1.4924849716612211e-06, - "logits/chosen": -1.8973257541656494, - "logits/rejected": -1.8918126821517944, - "logps/chosen": -32.64158248901367, - "logps/rejected": -34.6368293762207, - "loss": 0.1812, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.2127745151519775, - "rewards/margins": 2.2197234630584717, - "rewards/rejected": -1.0069488286972046, - "step": 1030 - }, - { - "epoch": 2.7, - "grad_norm": 17.875, - "learning_rate": 1.440887158673332e-06, - "logits/chosen": -1.8320951461791992, - "logits/rejected": -1.836050033569336, - "logps/chosen": -34.43301010131836, - "logps/rejected": -35.46531677246094, - "loss": 0.1938, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.1886460781097412, - "rewards/margins": 2.1298069953918457, - "rewards/rejected": -0.9411608576774597, - "step": 1040 - }, - { - "epoch": 2.73, - "grad_norm": 7.09375, - "learning_rate": 1.3898334684855647e-06, - "logits/chosen": -1.8290131092071533, - "logits/rejected": -1.841897964477539, - "logps/chosen": -30.590591430664062, - "logps/rejected": -34.06149673461914, - "loss": 0.2044, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.1450647115707397, - "rewards/margins": 2.082374095916748, - "rewards/rejected": -0.9373094439506531, - "step": 1050 - }, - { - "epoch": 2.75, - "grad_norm": 19.25, - "learning_rate": 1.3393501301037245e-06, - "logits/chosen": -1.8185697793960571, - "logits/rejected": -1.8119618892669678, - "logps/chosen": -30.647253036499023, - "logps/rejected": -34.33002471923828, - "loss": 0.1727, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3608683347702026, - "rewards/margins": 2.2717270851135254, - "rewards/rejected": -0.9108586311340332, - "step": 1060 - }, - { - "epoch": 2.78, - "grad_norm": 5.1875, - "learning_rate": 1.2894630795134454e-06, - "logits/chosen": -1.944183349609375, - "logits/rejected": -1.944902777671814, - "logps/chosen": -32.107669830322266, - "logps/rejected": -33.907928466796875, - "loss": 0.1656, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2954630851745605, - "rewards/margins": 2.2523720264434814, - "rewards/rejected": -0.9569088816642761, - "step": 1070 - }, - { - "epoch": 2.81, - "grad_norm": 10.0, - "learning_rate": 1.2401979463554984e-06, - "logits/chosen": -1.8745800256729126, - "logits/rejected": -1.8734228610992432, - "logps/chosen": -32.74195861816406, - "logps/rejected": -34.108428955078125, - "loss": 0.2386, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.2130850553512573, - "rewards/margins": 2.0380234718322754, - "rewards/rejected": -0.8249381184577942, - "step": 1080 - }, - { - "epoch": 2.83, - "grad_norm": 4.96875, - "learning_rate": 1.1915800407584705e-06, - "logits/chosen": -1.8999583721160889, - "logits/rejected": -1.8924366235733032, - "logps/chosen": -32.777381896972656, - "logps/rejected": -31.974105834960938, - "loss": 0.1764, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3036444187164307, - "rewards/margins": 2.142960548400879, - "rewards/rejected": -0.8393163681030273, - "step": 1090 - }, - { - "epoch": 2.86, - "grad_norm": 5.125, - "learning_rate": 1.1436343403356019e-06, - "logits/chosen": -1.8712406158447266, - "logits/rejected": -1.870269775390625, - "logps/chosen": -33.90606689453125, - "logps/rejected": -37.61457061767578, - "loss": 0.1463, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.361112117767334, - "rewards/margins": 2.496788263320923, - "rewards/rejected": -1.1356757879257202, - "step": 1100 - }, - { - "epoch": 2.86, - "eval_logits/chosen": -2.142699956893921, - "eval_logits/rejected": -2.1379306316375732, - "eval_logps/chosen": -34.36824035644531, - "eval_logps/rejected": -38.02313995361328, - "eval_loss": 0.7009721994400024, - "eval_rewards/accuracies": 0.5714285373687744, - "eval_rewards/chosen": -0.30031847953796387, - "eval_rewards/margins": 0.15554992854595184, - "eval_rewards/rejected": -0.4558684229850769, - "eval_runtime": 145.3955, - "eval_samples_per_second": 2.359, - "eval_steps_per_second": 0.296, - "step": 1100 - }, - { - "epoch": 2.88, - "grad_norm": 6.53125, - "learning_rate": 1.0963854773524548e-06, - "logits/chosen": -1.9193534851074219, - "logits/rejected": -1.9247900247573853, - "logps/chosen": -34.15703582763672, - "logps/rejected": -36.63218688964844, - "loss": 0.2075, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.179071307182312, - "rewards/margins": 2.15881085395813, - "rewards/rejected": -0.9797393679618835, - "step": 1110 - }, - { - "epoch": 2.91, - "grad_norm": 4.625, - "learning_rate": 1.049857726072005e-06, - "logits/chosen": -1.888942003250122, - "logits/rejected": -1.8869132995605469, - "logps/chosen": -31.29522132873535, - "logps/rejected": -34.09874725341797, - "loss": 0.1873, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.1888843774795532, - "rewards/margins": 2.193281650543213, - "rewards/rejected": -1.0043971538543701, - "step": 1120 - }, - { - "epoch": 2.94, - "grad_norm": 5.90625, - "learning_rate": 1.0040749902836508e-06, - "logits/chosen": -1.7918437719345093, - "logits/rejected": -1.7945436239242554, - "logps/chosen": -28.33380126953125, - "logps/rejected": -31.490909576416016, - "loss": 0.156, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.273350715637207, - "rewards/margins": 2.394632577896118, - "rewards/rejected": -1.1212818622589111, - "step": 1130 - }, - { - "epoch": 2.96, - "grad_norm": 5.8125, - "learning_rate": 9.59060791022566e-07, - "logits/chosen": -1.8893934488296509, - "logits/rejected": -1.8862323760986328, - "logps/chosen": -31.71720314025879, - "logps/rejected": -33.38925552368164, - "loss": 0.2101, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.337890863418579, - "rewards/margins": 2.114030122756958, - "rewards/rejected": -0.7761393189430237, - "step": 1140 - }, - { - "epoch": 2.99, - "grad_norm": 8.8125, - "learning_rate": 9.148382544856885e-07, - "logits/chosen": -1.8082507848739624, - "logits/rejected": -1.8019250631332397, - "logps/chosen": -26.8853702545166, - "logps/rejected": -33.35715103149414, - "loss": 0.1765, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.1273572444915771, - "rewards/margins": 2.2153234481811523, - "rewards/rejected": -1.0879663228988647, - "step": 1150 - }, - { - "epoch": 3.01, - "grad_norm": 3.359375, - "learning_rate": 8.714301001505568e-07, - "logits/chosen": -1.922569990158081, - "logits/rejected": -1.920064926147461, - "logps/chosen": -31.649404525756836, - "logps/rejected": -36.346885681152344, - "loss": 0.1692, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3019969463348389, - "rewards/margins": 2.431814670562744, - "rewards/rejected": -1.1298176050186157, - "step": 1160 - }, - { - "epoch": 3.04, - "grad_norm": 4.625, - "learning_rate": 8.288586291031025e-07, - "logits/chosen": -1.9079128503799438, - "logits/rejected": -1.9058338403701782, - "logps/chosen": -30.213918685913086, - "logps/rejected": -33.71696472167969, - "loss": 0.1383, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2604410648345947, - "rewards/margins": 2.478085517883301, - "rewards/rejected": -1.217644453048706, - "step": 1170 - }, - { - "epoch": 3.06, - "grad_norm": 4.96875, - "learning_rate": 7.871457125803897e-07, - "logits/chosen": -1.9004592895507812, - "logits/rejected": -1.8882315158843994, - "logps/chosen": -34.54669189453125, - "logps/rejected": -35.38771057128906, - "loss": 0.1478, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.348745584487915, - "rewards/margins": 2.5333569049835205, - "rewards/rejected": -1.1846110820770264, - "step": 1180 - }, - { - "epoch": 3.09, - "grad_norm": 6.75, - "learning_rate": 7.463127807341966e-07, - "logits/chosen": -1.7955455780029297, - "logits/rejected": -1.790220022201538, - "logps/chosen": -33.658992767333984, - "logps/rejected": -35.314979553222656, - "loss": 0.1451, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.52162504196167, - "rewards/margins": 2.4910683631896973, - "rewards/rejected": -0.9694433212280273, - "step": 1190 - }, - { - "epoch": 3.12, - "grad_norm": 6.90625, - "learning_rate": 7.063808116212021e-07, - "logits/chosen": -1.8400506973266602, - "logits/rejected": -1.8432636260986328, - "logps/chosen": -31.264917373657227, - "logps/rejected": -32.59490203857422, - "loss": 0.1796, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2151604890823364, - "rewards/margins": 2.21301007270813, - "rewards/rejected": -0.9978495836257935, - "step": 1200 - }, - { - "epoch": 3.12, - "eval_logits/chosen": -2.142347574234009, - "eval_logits/rejected": -2.137584686279297, - "eval_logps/chosen": -34.35411071777344, - "eval_logps/rejected": -38.025672912597656, - "eval_loss": 0.6907772421836853, - "eval_rewards/accuracies": 0.5747508406639099, - "eval_rewards/chosen": -0.2876059412956238, - "eval_rewards/margins": 0.1705409586429596, - "eval_rewards/rejected": -0.45814695954322815, - "eval_runtime": 145.3519, - "eval_samples_per_second": 2.36, - "eval_steps_per_second": 0.296, - "step": 1200 - }, - { - "epoch": 3.14, - "grad_norm": 9.9375, - "learning_rate": 6.673703204254348e-07, - "logits/chosen": -1.8635101318359375, - "logits/rejected": -1.858994722366333, - "logps/chosen": -29.87615966796875, - "logps/rejected": -32.12238311767578, - "loss": 0.1548, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2577589750289917, - "rewards/margins": 2.350618362426758, - "rewards/rejected": -1.0928595066070557, - "step": 1210 - }, - { - "epoch": 3.17, - "grad_norm": 5.375, - "learning_rate": 6.293013489185315e-07, - "logits/chosen": -1.8775825500488281, - "logits/rejected": -1.8721100091934204, - "logps/chosen": -33.751792907714844, - "logps/rejected": -35.85100555419922, - "loss": 0.1601, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.3699350357055664, - "rewards/margins": 2.522946834564209, - "rewards/rejected": -1.1530119180679321, - "step": 1220 - }, - { - "epoch": 3.19, - "grad_norm": 4.25, - "learning_rate": 5.921934551632086e-07, - "logits/chosen": -1.879861831665039, - "logits/rejected": -1.8666155338287354, - "logps/chosen": -31.769283294677734, - "logps/rejected": -35.38819122314453, - "loss": 0.1889, - "rewards/accuracies": 0.875, - "rewards/chosen": 1.1922564506530762, - "rewards/margins": 2.333768606185913, - "rewards/rejected": -1.141512393951416, - "step": 1230 - }, - { - "epoch": 3.22, - "grad_norm": 3.765625, - "learning_rate": 5.560657034652405e-07, - "logits/chosen": -1.9246861934661865, - "logits/rejected": -1.922136664390564, - "logps/chosen": -33.49988555908203, - "logps/rejected": -32.82632064819336, - "loss": 0.1608, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3797134160995483, - "rewards/margins": 2.2675235271453857, - "rewards/rejected": -0.8878101110458374, - "step": 1240 - }, - { - "epoch": 3.25, - "grad_norm": 5.96875, - "learning_rate": 5.2093665457911e-07, - "logits/chosen": -1.851680040359497, - "logits/rejected": -1.8488292694091797, - "logps/chosen": -32.87358856201172, - "logps/rejected": -36.248497009277344, - "loss": 0.1342, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3703018426895142, - "rewards/margins": 2.6765310764312744, - "rewards/rejected": -1.3062288761138916, - "step": 1250 - }, - { - "epoch": 3.27, - "grad_norm": 4.5625, - "learning_rate": 4.868243561723535e-07, - "logits/chosen": -1.9186599254608154, - "logits/rejected": -1.913433313369751, - "logps/chosen": -29.251794815063477, - "logps/rejected": -33.522377014160156, - "loss": 0.1506, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3581030368804932, - "rewards/margins": 2.383944511413574, - "rewards/rejected": -1.025841474533081, - "step": 1260 - }, - { - "epoch": 3.3, - "grad_norm": 4.15625, - "learning_rate": 4.537463335535161e-07, - "logits/chosen": -1.9449501037597656, - "logits/rejected": -1.9497982263565063, - "logps/chosen": -31.12078285217285, - "logps/rejected": -32.769309997558594, - "loss": 0.1426, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.4311946630477905, - "rewards/margins": 2.410348892211914, - "rewards/rejected": -0.9791544079780579, - "step": 1270 - }, - { - "epoch": 3.32, - "grad_norm": 7.09375, - "learning_rate": 4.217195806684629e-07, - "logits/chosen": -1.8380733728408813, - "logits/rejected": -1.8454326391220093, - "logps/chosen": -33.449485778808594, - "logps/rejected": -33.86452102661133, - "loss": 0.1535, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3119791746139526, - "rewards/margins": 2.4516043663024902, - "rewards/rejected": -1.1396249532699585, - "step": 1280 - }, - { - "epoch": 3.35, - "grad_norm": 4.625, - "learning_rate": 3.907605513696808e-07, - "logits/chosen": -1.7485191822052002, - "logits/rejected": -1.7507747411727905, - "logps/chosen": -31.557483673095703, - "logps/rejected": -37.77085494995117, - "loss": 0.1711, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.416973352432251, - "rewards/margins": 2.511898994445801, - "rewards/rejected": -1.0949256420135498, - "step": 1290 - }, - { - "epoch": 3.38, - "grad_norm": 4.65625, - "learning_rate": 3.6088515096305675e-07, - "logits/chosen": -1.7980448007583618, - "logits/rejected": -1.8015098571777344, - "logps/chosen": -31.520572662353516, - "logps/rejected": -33.67357635498047, - "loss": 0.1264, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.5264867544174194, - "rewards/margins": 2.614957332611084, - "rewards/rejected": -1.088470458984375, - "step": 1300 - }, - { - "epoch": 3.38, - "eval_logits/chosen": -2.1421782970428467, - "eval_logits/rejected": -2.1374199390411377, - "eval_logps/chosen": -34.34251022338867, - "eval_logps/rejected": -38.01955795288086, - "eval_loss": 0.6911265850067139, - "eval_rewards/accuracies": 0.5892857313156128, - "eval_rewards/chosen": -0.27716198563575745, - "eval_rewards/margins": 0.17548424005508423, - "eval_rewards/rejected": -0.4526461660861969, - "eval_runtime": 145.3336, - "eval_samples_per_second": 2.36, - "eval_steps_per_second": 0.296, - "step": 1300 - }, - { - "epoch": 3.4, - "grad_norm": 3.578125, - "learning_rate": 3.321087280364757e-07, - "logits/chosen": -1.9077885150909424, - "logits/rejected": -1.8874790668487549, - "logps/chosen": -29.57822036743164, - "logps/rejected": -36.54710388183594, - "loss": 0.1528, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.3816629648208618, - "rewards/margins": 2.624289035797119, - "rewards/rejected": -1.242626428604126, - "step": 1310 - }, - { - "epoch": 3.43, - "grad_norm": 5.71875, - "learning_rate": 3.044460665744284e-07, - "logits/chosen": -1.8845760822296143, - "logits/rejected": -1.8903357982635498, - "logps/chosen": -31.33782958984375, - "logps/rejected": -33.613182067871094, - "loss": 0.1368, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.5263617038726807, - "rewards/margins": 2.5365498065948486, - "rewards/rejected": -1.0101878643035889, - "step": 1320 - }, - { - "epoch": 3.45, - "grad_norm": 3.9375, - "learning_rate": 2.779113783626916e-07, - "logits/chosen": -1.8593124151229858, - "logits/rejected": -1.8543720245361328, - "logps/chosen": -31.642696380615234, - "logps/rejected": -36.003231048583984, - "loss": 0.1361, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2418328523635864, - "rewards/margins": 2.529675245285034, - "rewards/rejected": -1.2878425121307373, - "step": 1330 - }, - { - "epoch": 3.48, - "grad_norm": 5.03125, - "learning_rate": 2.5251829568697204e-07, - "logits/chosen": -1.7216873168945312, - "logits/rejected": -1.731299638748169, - "logps/chosen": -32.13956069946289, - "logps/rejected": -32.022666931152344, - "loss": 0.153, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.4940283298492432, - "rewards/margins": 2.383723735809326, - "rewards/rejected": -0.8896951675415039, - "step": 1340 - }, - { - "epoch": 3.51, - "grad_norm": 4.46875, - "learning_rate": 2.2827986432927774e-07, - "logits/chosen": -1.779524803161621, - "logits/rejected": -1.784233808517456, - "logps/chosen": -32.241004943847656, - "logps/rejected": -34.2801513671875, - "loss": 0.1178, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.5389398336410522, - "rewards/margins": 2.5067484378814697, - "rewards/rejected": -0.9678082466125488, - "step": 1350 - }, - { - "epoch": 3.53, - "grad_norm": 22.75, - "learning_rate": 2.0520853686560177e-07, - "logits/chosen": -1.808258056640625, - "logits/rejected": -1.813680648803711, - "logps/chosen": -32.30830001831055, - "logps/rejected": -35.810401916503906, - "loss": 0.1575, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.371025800704956, - "rewards/margins": 2.493631601333618, - "rewards/rejected": -1.1226056814193726, - "step": 1360 - }, - { - "epoch": 3.56, - "grad_norm": 3.953125, - "learning_rate": 1.833161662683672e-07, - "logits/chosen": -1.900948166847229, - "logits/rejected": -1.8964240550994873, - "logps/chosen": -31.239208221435547, - "logps/rejected": -32.06488800048828, - "loss": 0.1478, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2568284273147583, - "rewards/margins": 2.2945187091827393, - "rewards/rejected": -1.0376904010772705, - "step": 1370 - }, - { - "epoch": 3.58, - "grad_norm": 6.21875, - "learning_rate": 1.626139998169246e-07, - "logits/chosen": -1.8157840967178345, - "logits/rejected": -1.817800760269165, - "logps/chosen": -28.267253875732422, - "logps/rejected": -31.886409759521484, - "loss": 0.1623, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2528432607650757, - "rewards/margins": 2.2808501720428467, - "rewards/rejected": -1.0280072689056396, - "step": 1380 - }, - { - "epoch": 3.61, - "grad_norm": 9.25, - "learning_rate": 1.4311267331922535e-07, - "logits/chosen": -1.8119876384735107, - "logits/rejected": -1.8166316747665405, - "logps/chosen": -30.596643447875977, - "logps/rejected": -33.92557907104492, - "loss": 0.1638, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.280565619468689, - "rewards/margins": 2.124329090118408, - "rewards/rejected": -0.8437638282775879, - "step": 1390 - }, - { - "epoch": 3.64, - "grad_norm": 3.5, - "learning_rate": 1.2482220564763669e-07, - "logits/chosen": -1.8785665035247803, - "logits/rejected": -1.881792664527893, - "logps/chosen": -33.31572723388672, - "logps/rejected": -34.651222229003906, - "loss": 0.1206, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.4533611536026, - "rewards/margins": 2.5582594871520996, - "rewards/rejected": -1.10489821434021, - "step": 1400 - }, - { - "epoch": 3.64, - "eval_logits/chosen": -2.1418943405151367, - "eval_logits/rejected": -2.137127637863159, - "eval_logps/chosen": -34.35324478149414, - "eval_logps/rejected": -38.02573013305664, - "eval_loss": 0.692409336566925, - "eval_rewards/accuracies": 0.5917773842811584, - "eval_rewards/chosen": -0.28682059049606323, - "eval_rewards/margins": 0.17138195037841797, - "eval_rewards/rejected": -0.4582025408744812, - "eval_runtime": 145.3556, - "eval_samples_per_second": 2.36, - "eval_steps_per_second": 0.296, - "step": 1400 - }, - { - "epoch": 3.66, - "grad_norm": 6.125, - "learning_rate": 1.0775199359171346e-07, - "logits/chosen": -1.946838140487671, - "logits/rejected": -1.939971923828125, - "logps/chosen": -32.17081832885742, - "logps/rejected": -35.340518951416016, - "loss": 0.1438, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.4685560464859009, - "rewards/margins": 2.454911470413208, - "rewards/rejected": -0.9863556027412415, - "step": 1410 - }, - { - "epoch": 3.69, - "grad_norm": 11.0, - "learning_rate": 9.191080703056604e-08, - "logits/chosen": -1.8355038166046143, - "logits/rejected": -1.8465068340301514, - "logps/chosen": -32.56521987915039, - "logps/rejected": -34.721900939941406, - "loss": 0.1399, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.4256752729415894, - "rewards/margins": 2.4434168338775635, - "rewards/rejected": -1.0177414417266846, - "step": 1420 - }, - { - "epoch": 3.71, - "grad_norm": 7.5625, - "learning_rate": 7.730678442730539e-08, - "logits/chosen": -1.9164823293685913, - "logits/rejected": -1.928348183631897, - "logps/chosen": -33.428531646728516, - "logps/rejected": -34.8869514465332, - "loss": 0.1342, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.4690487384796143, - "rewards/margins": 2.5928056240081787, - "rewards/rejected": -1.123757004737854, - "step": 1430 - }, - { - "epoch": 3.74, - "grad_norm": 9.5625, - "learning_rate": 6.394742864787806e-08, - "logits/chosen": -1.9064290523529053, - "logits/rejected": -1.9086523056030273, - "logps/chosen": -31.696964263916016, - "logps/rejected": -35.06704330444336, - "loss": 0.1635, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3602821826934814, - "rewards/margins": 2.4466512203216553, - "rewards/rejected": -1.086369276046753, - "step": 1440 - }, - { - "epoch": 3.77, - "grad_norm": 5.46875, - "learning_rate": 5.183960310644748e-08, - "logits/chosen": -1.8891935348510742, - "logits/rejected": -1.8821351528167725, - "logps/chosen": -33.91851043701172, - "logps/rejected": -34.860595703125, - "loss": 0.1585, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4848333597183228, - "rewards/margins": 2.4660050868988037, - "rewards/rejected": -0.981171727180481, - "step": 1450 - }, - { - "epoch": 3.79, - "grad_norm": 4.96875, - "learning_rate": 4.098952823928693e-08, - "logits/chosen": -1.8560640811920166, - "logits/rejected": -1.8620649576187134, - "logps/chosen": -29.741008758544922, - "logps/rejected": -34.997169494628906, - "loss": 0.1378, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3249523639678955, - "rewards/margins": 2.519402027130127, - "rewards/rejected": -1.194449782371521, - "step": 1460 - }, - { - "epoch": 3.82, - "grad_norm": 5.40625, - "learning_rate": 3.1402778309014284e-08, - "logits/chosen": -1.803776502609253, - "logits/rejected": -1.801746129989624, - "logps/chosen": -29.282577514648438, - "logps/rejected": -31.621936798095703, - "loss": 0.1565, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4066790342330933, - "rewards/margins": 2.331610918045044, - "rewards/rejected": -0.9249318242073059, - "step": 1470 - }, - { - "epoch": 3.84, - "grad_norm": 6.1875, - "learning_rate": 2.3084278540791427e-08, - "logits/chosen": -1.9790493249893188, - "logits/rejected": -1.9734687805175781, - "logps/chosen": -33.892913818359375, - "logps/rejected": -33.2518310546875, - "loss": 0.1448, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.253114104270935, - "rewards/margins": 2.3287227153778076, - "rewards/rejected": -1.0756088495254517, - "step": 1480 - }, - { - "epoch": 3.87, - "grad_norm": 7.28125, - "learning_rate": 1.6038302591975807e-08, - "logits/chosen": -1.8717892169952393, - "logits/rejected": -1.8740745782852173, - "logps/chosen": -27.078582763671875, - "logps/rejected": -29.020471572875977, - "loss": 0.1879, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.2580173015594482, - "rewards/margins": 2.156388759613037, - "rewards/rejected": -0.8983713984489441, - "step": 1490 - }, - { - "epoch": 3.9, - "grad_norm": 5.75, - "learning_rate": 1.0268470356514237e-08, - "logits/chosen": -1.8792669773101807, - "logits/rejected": -1.8736785650253296, - "logps/chosen": -31.454416275024414, - "logps/rejected": -33.20934295654297, - "loss": 0.1645, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2117339372634888, - "rewards/margins": 2.328080892562866, - "rewards/rejected": -1.116347074508667, - "step": 1500 - }, - { - "epoch": 3.9, - "eval_logits/chosen": -2.1418604850769043, - "eval_logits/rejected": -2.1370973587036133, - "eval_logps/chosen": -34.35454177856445, - "eval_logps/rejected": -38.024742126464844, - "eval_loss": 0.6943246126174927, - "eval_rewards/accuracies": 0.5718438625335693, - "eval_rewards/chosen": -0.28799253702163696, - "eval_rewards/margins": 0.16931606829166412, - "eval_rewards/rejected": -0.4573085606098175, - "eval_runtime": 145.3262, - "eval_samples_per_second": 2.36, - "eval_steps_per_second": 0.296, - "step": 1500 - }, - { - "epoch": 3.92, - "grad_norm": 9.4375, - "learning_rate": 5.777746105209147e-09, - "logits/chosen": -1.8055070638656616, - "logits/rejected": -1.8097158670425415, - "logps/chosen": -32.785335540771484, - "logps/rejected": -35.42612075805664, - "loss": 0.1841, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.2470948696136475, - "rewards/margins": 2.1612586975097656, - "rewards/rejected": -0.9141640663146973, - "step": 1510 - }, - { - "epoch": 3.95, - "grad_norm": 5.40625, - "learning_rate": 2.5684369628148352e-09, - "logits/chosen": -1.8615461587905884, - "logits/rejected": -1.8599445819854736, - "logps/chosen": -29.254053115844727, - "logps/rejected": -33.70402908325195, - "loss": 0.1759, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3321115970611572, - "rewards/margins": 2.334989547729492, - "rewards/rejected": -1.002877950668335, - "step": 1520 - }, - { - "epoch": 3.97, - "grad_norm": 8.8125, - "learning_rate": 6.421917227455999e-10, - "logits/chosen": -1.960646390914917, - "logits/rejected": -1.9528770446777344, - "logps/chosen": -26.7325439453125, - "logps/rejected": -29.855510711669922, - "loss": 0.1584, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.1973587274551392, - "rewards/margins": 2.2628719806671143, - "rewards/rejected": -1.065513253211975, - "step": 1530 - }, - { - "epoch": 4.0, - "grad_norm": 7.0625, - "learning_rate": 0.0, - "logits/chosen": -1.863152265548706, - "logits/rejected": -1.85297429561615, - "logps/chosen": -31.610912322998047, - "logps/rejected": -36.648475646972656, - "loss": 0.1361, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.3097574710845947, - "rewards/margins": 2.4233245849609375, - "rewards/rejected": -1.1135669946670532, - "step": 1540 - }, - { - "epoch": 4.0, - "step": 1540, + "epoch": 1.0, + "step": 385, "total_flos": 0.0, - "train_loss": 0.19153660760297403, - "train_runtime": 10767.2812, - "train_samples_per_second": 1.144, - "train_steps_per_second": 0.143 + "train_loss": 0.447351616079157, + "train_runtime": 3253.4458, + "train_samples_per_second": 0.946, + "train_steps_per_second": 0.118 } ], "logging_steps": 10, - "max_steps": 1540, + "max_steps": 385, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4,