diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 4.0, + "epoch": 1.0, "eval_steps": 100, - "global_step": 1540, + "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -15,7 +15,7 @@ "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, - "loss": 1.0, + "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -25,2537 +25,597 @@ { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, - "logits/chosen": -1.8667490482330322, - "logits/rejected": -1.8710733652114868, - "logps/chosen": -36.97007369995117, - "logps/rejected": -33.66944885253906, - "loss": 0.9317, - "rewards/accuracies": 0.5694444179534912, - "rewards/chosen": 0.03287407010793686, - "rewards/margins": 0.06830974668264389, - "rewards/rejected": -0.03543568402528763, + "logits/chosen": -1.866413950920105, + "logits/rejected": -1.8707411289215088, + "logps/chosen": -36.98916244506836, + "logps/rejected": -33.67436981201172, + "loss": 0.6701, + "rewards/accuracies": 0.5416666865348816, + "rewards/chosen": 0.01569323241710663, + "rewards/margins": 0.05555717274546623, + "rewards/rejected": -0.039863936603069305, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, - "logits/chosen": -1.9981460571289062, - "logits/rejected": -2.000789165496826, - "logps/chosen": -29.641231536865234, - "logps/rejected": -29.06744384765625, - "loss": 0.9955, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": 0.0008672710391692817, - "rewards/margins": 0.004467610269784927, - "rewards/rejected": -0.0036003391724079847, + "logits/chosen": -1.9979650974273682, + "logits/rejected": -2.0006086826324463, + "logps/chosen": -29.624820709228516, + "logps/rejected": -29.0762939453125, + "loss": 0.6837, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.01563635841012001, + "rewards/margins": 0.027204299345612526, + "rewards/rejected": -0.01156794372946024, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, - "logits/chosen": -1.920600175857544, - "logits/rejected": -1.917925238609314, - "logps/chosen": -31.395061492919922, - "logps/rejected": -33.240909576416016, - "loss": 0.9609, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.01893135905265808, - "rewards/margins": 0.03913776949048042, - "rewards/rejected": -0.020206410437822342, + "logits/chosen": -1.921021819114685, + "logits/rejected": -1.9183374643325806, + "logps/chosen": -31.40532875061035, + "logps/rejected": -33.23241424560547, + "loss": 0.6877, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00968973059207201, + "rewards/margins": 0.022251319140195847, + "rewards/rejected": -0.012561586685478687, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, - "logits/chosen": -2.017815113067627, - "logits/rejected": -2.0090720653533936, - "logps/chosen": -32.5806884765625, - "logps/rejected": -32.515098571777344, - "loss": 1.0013, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": -0.003494268748909235, - "rewards/margins": -0.0013132141903042793, - "rewards/rejected": -0.002181055024266243, + "logits/chosen": -2.0176353454589844, + "logits/rejected": -2.008906364440918, + "logps/chosen": -32.574256896972656, + "logps/rejected": -32.53368377685547, + "loss": 0.6874, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0022967704571783543, + "rewards/margins": 0.02120940014719963, + "rewards/rejected": -0.018912632018327713, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, - "logits/chosen": -1.8630876541137695, - "logits/rejected": -1.8523353338241577, - "logps/chosen": -33.549766540527344, - "logps/rejected": -35.46318435668945, - "loss": 0.9833, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.006940312683582306, - "rewards/margins": 0.016694897785782814, - "rewards/rejected": -0.009754580445587635, + "logits/chosen": -1.8619186878204346, + "logits/rejected": -1.85114324092865, + "logps/chosen": -33.55537414550781, + "logps/rejected": -35.45675277709961, + "loss": 0.6957, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.001892436295747757, + "rewards/margins": 0.005858602002263069, + "rewards/rejected": -0.003966164775192738, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, - "logits/chosen": -1.9419746398925781, - "logits/rejected": -1.943914771080017, - "logps/chosen": -32.527896881103516, - "logps/rejected": -33.21547317504883, - "loss": 0.9153, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.06505907326936722, - "rewards/margins": 0.10474522411823273, - "rewards/rejected": -0.03968615084886551, + "logits/chosen": -1.9400945901870728, + "logits/rejected": -1.9420464038848877, + "logps/chosen": -32.56509780883789, + "logps/rejected": -33.2406120300293, + "loss": 0.6632, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.031578924506902695, + "rewards/margins": 0.09388783574104309, + "rewards/rejected": -0.062308914959430695, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, - "logits/chosen": -2.07257080078125, - "logits/rejected": -2.0775399208068848, - "logps/chosen": -34.00202560424805, - "logps/rejected": -36.622886657714844, - "loss": 0.9636, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -0.012188142165541649, - "rewards/margins": 0.03642461448907852, - "rewards/rejected": -0.04861275106668472, + "logits/chosen": -2.0712790489196777, + "logits/rejected": -2.0762436389923096, + "logps/chosen": -33.981910705566406, + "logps/rejected": -36.62363815307617, + "loss": 0.6833, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.005918038543313742, + "rewards/margins": 0.05520814657211304, + "rewards/rejected": -0.04929010197520256, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, - "logits/chosen": -1.9333629608154297, - "logits/rejected": -1.9364970922470093, - "logps/chosen": -34.302101135253906, - "logps/rejected": -34.63160705566406, - "loss": 0.862, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.1131378561258316, - "rewards/margins": 0.1484164148569107, - "rewards/rejected": -0.0352785661816597, + "logits/chosen": -1.9327905178070068, + "logits/rejected": -1.935909628868103, + "logps/chosen": -34.32685470581055, + "logps/rejected": -34.65606689453125, + "loss": 0.639, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.09085920453071594, + "rewards/margins": 0.14815348386764526, + "rewards/rejected": -0.057294271886348724, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, - "logits/chosen": -1.9408857822418213, - "logits/rejected": -1.945412039756775, - "logps/chosen": -32.36528015136719, - "logps/rejected": -32.34526824951172, - "loss": 0.9225, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.09192506223917007, - "rewards/margins": 0.07964853197336197, - "rewards/rejected": 0.012276534922420979, + "logits/chosen": -1.9414918422698975, + "logits/rejected": -1.946007490158081, + "logps/chosen": -32.406803131103516, + "logps/rejected": -32.36021041870117, + "loss": 0.6792, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.054556868970394135, + "rewards/margins": 0.05573350936174393, + "rewards/rejected": -0.0011766403913497925, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, - "logits/chosen": -2.037550210952759, - "logits/rejected": -2.0355725288391113, - "logps/chosen": -32.142730712890625, - "logps/rejected": -31.29366683959961, - "loss": 0.8913, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.08829358220100403, - "rewards/margins": 0.1130049005150795, - "rewards/rejected": -0.02471131458878517, + "logits/chosen": -2.039034128189087, + "logits/rejected": -2.0370402336120605, + "logps/chosen": -32.172786712646484, + "logps/rejected": -31.333194732666016, + "loss": 0.6464, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06124376505613327, + "rewards/margins": 0.12152798473834991, + "rewards/rejected": -0.06028420478105545, "step": 100 }, { "epoch": 0.26, - "eval_logits/chosen": -2.232161283493042, - "eval_logits/rejected": -2.2273108959198, - "eval_logps/chosen": -34.040714263916016, - "eval_logps/rejected": -37.54047775268555, - "eval_loss": 0.9844526052474976, - "eval_rewards/accuracies": 0.5195183157920837, - "eval_rewards/chosen": -0.005542654078453779, - "eval_rewards/margins": 0.015924591571092606, - "eval_rewards/rejected": -0.021467244252562523, - "eval_runtime": 145.9018, - "eval_samples_per_second": 2.351, - "eval_steps_per_second": 0.295, + "eval_logits/chosen": -2.2339773178100586, + "eval_logits/rejected": -2.229137420654297, + "eval_logps/chosen": -34.04054641723633, + "eval_logps/rejected": -37.549957275390625, + "eval_loss": 0.6902773976325989, + "eval_rewards/accuracies": 0.5685215592384338, + "eval_rewards/chosen": -0.005393954925239086, + "eval_rewards/margins": 0.024608083069324493, + "eval_rewards/rejected": -0.030002037063241005, + "eval_runtime": 146.034, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, - "logits/chosen": -1.9926517009735107, - "logits/rejected": -1.9902753829956055, - "logps/chosen": -33.12412643432617, - "logps/rejected": -34.011417388916016, - "loss": 0.9361, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.10772128403186798, - "rewards/margins": 0.09156213700771332, - "rewards/rejected": 0.016159160062670708, + "logits/chosen": -1.994192123413086, + "logits/rejected": -1.9918158054351807, + "logps/chosen": -33.142940521240234, + "logps/rejected": -34.01188278198242, + "loss": 0.6911, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.09078876674175262, + "rewards/margins": 0.07505009323358536, + "rewards/rejected": 0.015738680958747864, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, - "logits/chosen": -2.00441312789917, - "logits/rejected": -1.996093988418579, - "logps/chosen": -32.33955383300781, - "logps/rejected": -32.13432312011719, - "loss": 0.9401, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.09481850266456604, - "rewards/margins": 0.07035262137651443, - "rewards/rejected": 0.024465877562761307, + "logits/chosen": -2.0053954124450684, + "logits/rejected": -1.997046709060669, + "logps/chosen": -32.33894348144531, + "logps/rejected": -32.1308708190918, + "loss": 0.6746, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09536493569612503, + "rewards/margins": 0.06779730319976807, + "rewards/rejected": 0.027567636221647263, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, - "logits/chosen": -2.0318965911865234, - "logits/rejected": -2.023927688598633, - "logps/chosen": -30.336984634399414, - "logps/rejected": -32.0634765625, - "loss": 0.9061, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.12486012279987335, - "rewards/margins": 0.1342838853597641, - "rewards/rejected": -0.009423775598406792, + "logits/chosen": -2.0336387157440186, + "logits/rejected": -2.025650978088379, + "logps/chosen": -30.345691680908203, + "logps/rejected": -32.078697204589844, + "loss": 0.6527, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.11702337116003036, + "rewards/margins": 0.14014457166194916, + "rewards/rejected": -0.023121213540434837, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, - "logits/chosen": -1.9620994329452515, - "logits/rejected": -1.9723354578018188, - "logps/chosen": -31.222240447998047, - "logps/rejected": -32.57916259765625, - "loss": 0.795, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.17767605185508728, - "rewards/margins": 0.21753115952014923, - "rewards/rejected": -0.03985511139035225, + "logits/chosen": -1.9642337560653687, + "logits/rejected": -1.9744552373886108, + "logps/chosen": -31.243911743164062, + "logps/rejected": -32.590267181396484, + "loss": 0.6171, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.1581769436597824, + "rewards/margins": 0.20802685618400574, + "rewards/rejected": -0.04984992742538452, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, - "logits/chosen": -1.8727748394012451, - "logits/rejected": -1.87395441532135, - "logps/chosen": -33.931861877441406, - "logps/rejected": -34.79869842529297, - "loss": 0.7946, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.23474939167499542, - "rewards/margins": 0.272051066160202, - "rewards/rejected": -0.03730170056223869, + "logits/chosen": -1.876604437828064, + "logits/rejected": -1.8777605295181274, + "logps/chosen": -33.938690185546875, + "logps/rejected": -34.807891845703125, + "loss": 0.6043, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.22860188782215118, + "rewards/margins": 0.2741745412349701, + "rewards/rejected": -0.0455726757645607, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, - "logits/chosen": -1.9248745441436768, - "logits/rejected": -1.9214649200439453, - "logps/chosen": -36.014469146728516, - "logps/rejected": -32.73783493041992, - "loss": 0.8532, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.14148668944835663, - "rewards/margins": 0.15506146848201752, - "rewards/rejected": -0.013574766926467419, + "logits/chosen": -1.9282041788101196, + "logits/rejected": -1.9247684478759766, + "logps/chosen": -36.02125930786133, + "logps/rejected": -32.71831130981445, + "loss": 0.6454, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.13537634909152985, + "rewards/margins": 0.13137592375278473, + "rewards/rejected": 0.004000450484454632, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, - "logits/chosen": -2.025555372238159, - "logits/rejected": -2.01819109916687, - "logps/chosen": -33.50218200683594, - "logps/rejected": -31.41971206665039, - "loss": 0.7292, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.26611366868019104, - "rewards/margins": 0.3271873891353607, - "rewards/rejected": -0.06107370927929878, + "logits/chosen": -2.029125928878784, + "logits/rejected": -2.0217747688293457, + "logps/chosen": -33.49839401245117, + "logps/rejected": -31.400177001953125, + "loss": 0.5828, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.26951926946640015, + "rewards/margins": 0.3130132555961609, + "rewards/rejected": -0.04349397122859955, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, - "logits/chosen": -2.0320167541503906, - "logits/rejected": -2.037261486053467, - "logps/chosen": -32.24850845336914, - "logps/rejected": -32.45344924926758, - "loss": 0.7865, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.2666531801223755, - "rewards/margins": 0.23813048005104065, - "rewards/rejected": 0.028522688895463943, + "logits/chosen": -2.0355944633483887, + "logits/rejected": -2.040832042694092, + "logps/chosen": -32.235923767089844, + "logps/rejected": -32.460418701171875, + "loss": 0.5943, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2779761850833893, + "rewards/margins": 0.2557251751422882, + "rewards/rejected": 0.02225096896290779, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, - "logits/chosen": -2.032525062561035, - "logits/rejected": -2.0297436714172363, - "logps/chosen": -31.313217163085938, - "logps/rejected": -31.349472045898438, - "loss": 0.8387, + "logits/chosen": -2.0362112522125244, + "logits/rejected": -2.0334599018096924, + "logps/chosen": -31.269250869750977, + "logps/rejected": -31.325435638427734, + "loss": 0.6245, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.1581628918647766, - "rewards/margins": 0.18629543483257294, - "rewards/rejected": -0.02813255414366722, + "rewards/chosen": 0.19773444533348083, + "rewards/margins": 0.20423230528831482, + "rewards/rejected": -0.0064978525042533875, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, - "logits/chosen": -1.902632713317871, - "logits/rejected": -1.907284140586853, - "logps/chosen": -31.320043563842773, - "logps/rejected": -32.85698699951172, - "loss": 0.7293, + "logits/chosen": -1.9060389995574951, + "logits/rejected": -1.9106788635253906, + "logps/chosen": -31.306299209594727, + "logps/rejected": -32.81407165527344, + "loss": 0.5931, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.2576131224632263, - "rewards/margins": 0.31706100702285767, - "rewards/rejected": -0.05944784730672836, + "rewards/chosen": 0.2699825167655945, + "rewards/margins": 0.2908058166503906, + "rewards/rejected": -0.02082330361008644, "step": 200 }, { "epoch": 0.52, - "eval_logits/chosen": -2.228637933731079, - "eval_logits/rejected": -2.2238004207611084, - "eval_logps/chosen": -34.053680419921875, - "eval_logps/rejected": -37.581058502197266, - "eval_loss": 0.9601577520370483, - "eval_rewards/accuracies": 0.5714285373687744, - "eval_rewards/chosen": -0.01721162348985672, - "eval_rewards/margins": 0.040783192962408066, - "eval_rewards/rejected": -0.057994820177555084, - "eval_runtime": 145.5388, - "eval_samples_per_second": 2.357, + "eval_logits/chosen": -2.231553792953491, + "eval_logits/rejected": -2.2267112731933594, + "eval_logps/chosen": -34.07304763793945, + "eval_logps/rejected": -37.57693862915039, + "eval_loss": 0.6979728937149048, + "eval_rewards/accuracies": 0.5157807469367981, + "eval_rewards/chosen": -0.03464451804757118, + "eval_rewards/margins": 0.019641490653157234, + "eval_rewards/rejected": -0.054286014288663864, + "eval_runtime": 145.8095, + "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, - "logits/chosen": -2.0149245262145996, - "logits/rejected": -2.025560140609741, - "logps/chosen": -31.77438735961914, - "logps/rejected": -33.95419692993164, - "loss": 0.7666, + "logits/chosen": -2.018519163131714, + "logits/rejected": -2.0291810035705566, + "logps/chosen": -31.742992401123047, + "logps/rejected": -33.946937561035156, + "loss": 0.5902, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.17722666263580322, - "rewards/margins": 0.25957340002059937, - "rewards/rejected": -0.08234670013189316, + "rewards/chosen": 0.2054794579744339, + "rewards/margins": 0.2812942862510681, + "rewards/rejected": -0.07581482082605362, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, - "logits/chosen": -1.906951904296875, - "logits/rejected": -1.9217418432235718, - "logps/chosen": -29.83829116821289, - "logps/rejected": -31.636096954345703, - "loss": 0.7204, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.2459266185760498, - "rewards/margins": 0.3160557448863983, - "rewards/rejected": -0.07012919336557388, + "logits/chosen": -1.911586046218872, + "logits/rejected": -1.9263393878936768, + "logps/chosen": -29.84616470336914, + "logps/rejected": -31.615009307861328, + "loss": 0.5879, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.23883743584156036, + "rewards/margins": 0.2899848222732544, + "rewards/rejected": -0.051147449761629105, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, - "logits/chosen": -1.9629713296890259, - "logits/rejected": -1.9669532775878906, - "logps/chosen": -33.124656677246094, - "logps/rejected": -31.630443572998047, - "loss": 0.7348, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.2635299265384674, - "rewards/margins": 0.3365571200847626, - "rewards/rejected": -0.07302714884281158, + "logits/chosen": -1.9677941799163818, + "logits/rejected": -1.9717823266983032, + "logps/chosen": -33.100074768066406, + "logps/rejected": -31.62213134765625, + "loss": 0.5748, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.28565075993537903, + "rewards/margins": 0.3511958718299866, + "rewards/rejected": -0.06554517149925232, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, - "logits/chosen": -1.9614177942276, - "logits/rejected": -1.9395818710327148, - "logps/chosen": -33.87095260620117, - "logps/rejected": -35.10104751586914, - "loss": 0.6865, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.254514217376709, - "rewards/margins": 0.38974156975746155, - "rewards/rejected": -0.13522735238075256, + "logits/chosen": -1.9661725759506226, + "logits/rejected": -1.944300651550293, + "logps/chosen": -33.841453552246094, + "logps/rejected": -35.11375045776367, + "loss": 0.5473, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.2810631990432739, + "rewards/margins": 0.4277234673500061, + "rewards/rejected": -0.14666026830673218, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, - "logits/chosen": -2.0029492378234863, - "logits/rejected": -1.999629020690918, - "logps/chosen": -32.730865478515625, - "logps/rejected": -36.28009796142578, - "loss": 0.8055, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.17478153109550476, - "rewards/margins": 0.2244400531053543, - "rewards/rejected": -0.04965851828455925, + "logits/chosen": -2.007416009902954, + "logits/rejected": -2.0040948390960693, + "logps/chosen": -32.70330810546875, + "logps/rejected": -36.29412841796875, + "loss": 0.5992, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1995842456817627, + "rewards/margins": 0.2618715763092041, + "rewards/rejected": -0.06228730082511902, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, - "logits/chosen": -1.8703190088272095, - "logits/rejected": -1.8679043054580688, - "logps/chosen": -33.98231887817383, - "logps/rejected": -35.54644775390625, - "loss": 0.8042, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.18547315895557404, - "rewards/margins": 0.22311437129974365, - "rewards/rejected": -0.03764120861887932, + "logits/chosen": -1.8749721050262451, + "logits/rejected": -1.8725513219833374, + "logps/chosen": -34.00068664550781, + "logps/rejected": -35.53888702392578, + "loss": 0.6254, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16894161701202393, + "rewards/margins": 0.1997825801372528, + "rewards/rejected": -0.030840963125228882, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, - "logits/chosen": -1.8551464080810547, - "logits/rejected": -1.852746605873108, - "logps/chosen": -34.20850372314453, - "logps/rejected": -31.803356170654297, - "loss": 0.8096, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.170187309384346, - "rewards/margins": 0.19900819659233093, - "rewards/rejected": -0.02882089652121067, + "logits/chosen": -1.8600317239761353, + "logits/rejected": -1.8576066493988037, + "logps/chosen": -34.1875, + "logps/rejected": -31.8159122467041, + "loss": 0.616, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.1890900433063507, + "rewards/margins": 0.22921428084373474, + "rewards/rejected": -0.04012420028448105, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, - "logits/chosen": -1.9582526683807373, - "logits/rejected": -1.947749376296997, - "logps/chosen": -35.0114631652832, - "logps/rejected": -31.88564682006836, - "loss": 0.692, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.3106640875339508, - "rewards/margins": 0.3508565425872803, - "rewards/rejected": -0.040192440152168274, + "logits/chosen": -1.9631398916244507, + "logits/rejected": -1.9526073932647705, + "logps/chosen": -35.023719787597656, + "logps/rejected": -31.869693756103516, + "loss": 0.5782, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.29963088035583496, + "rewards/margins": 0.32546472549438477, + "rewards/rejected": -0.025833839550614357, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, - "logits/chosen": -2.053699493408203, - "logits/rejected": -2.038789749145508, - "logps/chosen": -30.727243423461914, - "logps/rejected": -32.641685485839844, - "loss": 0.8827, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.17719359695911407, - "rewards/margins": 0.16805905103683472, - "rewards/rejected": 0.009134533815085888, + "logits/chosen": -2.0582926273345947, + "logits/rejected": -2.0433640480041504, + "logps/chosen": -30.733753204345703, + "logps/rejected": -32.67460632324219, + "loss": 0.6392, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.17133468389511108, + "rewards/margins": 0.19182677567005157, + "rewards/rejected": -0.020492086187005043, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, - "logits/chosen": -1.9243850708007812, - "logits/rejected": -1.9218356609344482, - "logps/chosen": -32.43050003051758, - "logps/rejected": -30.8950138092041, - "loss": 0.6144, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.446951299905777, - "rewards/margins": 0.5173346400260925, - "rewards/rejected": -0.07038338482379913, + "logits/chosen": -1.929610013961792, + "logits/rejected": -1.9270601272583008, + "logps/chosen": -32.42620086669922, + "logps/rejected": -30.873455047607422, + "loss": 0.5301, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.450817346572876, + "rewards/margins": 0.5018006563186646, + "rewards/rejected": -0.050983332097530365, "step": 300 }, { "epoch": 0.78, - "eval_logits/chosen": -2.2248916625976562, - "eval_logits/rejected": -2.2200686931610107, - "eval_logps/chosen": -34.08660125732422, - "eval_logps/rejected": -37.603153228759766, - "eval_loss": 0.9712583422660828, - "eval_rewards/accuracies": 0.5282392501831055, - "eval_rewards/chosen": -0.046843186020851135, - "eval_rewards/margins": 0.031036507338285446, - "eval_rewards/rejected": -0.07787969708442688, - "eval_runtime": 145.8399, - "eval_samples_per_second": 2.352, + "eval_logits/chosen": -2.229154348373413, + "eval_logits/rejected": -2.2243051528930664, + "eval_logps/chosen": -34.09621810913086, + "eval_logps/rejected": -37.59999084472656, + "eval_loss": 0.6972895860671997, + "eval_rewards/accuracies": 0.5390365719795227, + "eval_rewards/chosen": -0.05550166219472885, + "eval_rewards/margins": 0.019528048112988472, + "eval_rewards/rejected": -0.07502970844507217, + "eval_runtime": 145.7792, + "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, - "grad_norm": 24.75, - "learning_rate": 4.84533120650964e-06, - "logits/chosen": -2.057966709136963, - "logits/rejected": -2.045175790786743, - "logps/chosen": -32.121585845947266, - "logps/rejected": -32.866825103759766, - "loss": 0.5062, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.3661278784275055, - "rewards/margins": 0.5693171620368958, - "rewards/rejected": -0.20318928360939026, + "learning_rate": 5.576113578589035e-07, + "logits/chosen": -1.9142345190048218, + "logits/rejected": -1.9109809398651123, + "logps/chosen": -31.33791732788086, + "logps/rejected": -33.82014465332031, + "loss": 0.5861, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23888680338859558, + "rewards/margins": 0.30907896161079407, + "rewards/rejected": -0.07019217312335968, "step": 310 }, { "epoch": 0.83, - "grad_norm": 23.875, - "learning_rate": 4.825108134172131e-06, - "logits/chosen": -1.9696298837661743, - "logits/rejected": -1.9610140323638916, - "logps/chosen": -31.775466918945312, - "logps/rejected": -30.438364028930664, - "loss": 0.4574, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.5215664505958557, - "rewards/margins": 0.6988391876220703, - "rewards/rejected": -0.1772727519273758, + "learning_rate": 4.229036944380913e-07, + "logits/chosen": -1.9650068283081055, + "logits/rejected": -1.9527791738510132, + "logps/chosen": -34.34791946411133, + "logps/rejected": -33.650447845458984, + "loss": 0.5828, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.22334297001361847, + "rewards/margins": 0.3237887918949127, + "rewards/rejected": -0.10044582933187485, "step": 320 }, { "epoch": 0.86, - "grad_norm": 30.5, - "learning_rate": 4.80369052967602e-06, - "logits/chosen": -1.9035661220550537, - "logits/rejected": -1.9156697988510132, - "logps/chosen": -29.8248291015625, - "logps/rejected": -33.69133758544922, - "loss": 0.3788, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.5889965295791626, - "rewards/margins": 0.8491708040237427, - "rewards/rejected": -0.2601741552352905, + "learning_rate": 3.053082288996112e-07, + "logits/chosen": -2.00040602684021, + "logits/rejected": -1.9989902973175049, + "logps/chosen": -33.210105895996094, + "logps/rejected": -32.56142807006836, + "loss": 0.5803, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.24812059104442596, + "rewards/margins": 0.32178014516830444, + "rewards/rejected": -0.07365953922271729, "step": 330 }, { "epoch": 0.88, - "grad_norm": 47.0, - "learning_rate": 4.781089396387968e-06, - "logits/chosen": -1.8689849376678467, - "logits/rejected": -1.8597549200057983, - "logps/chosen": -34.021995544433594, - "logps/rejected": -36.21257400512695, - "loss": 0.3569, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.6092386841773987, - "rewards/margins": 0.9295026659965515, - "rewards/rejected": -0.32026407122612, + "learning_rate": 2.0579377374915805e-07, + "logits/chosen": -2.0870866775512695, + "logits/rejected": -2.0713772773742676, + "logps/chosen": -33.80995178222656, + "logps/rejected": -33.120697021484375, + "loss": 0.5723, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3324963450431824, + "rewards/margins": 0.33488941192626953, + "rewards/rejected": -0.0023930787574499846, "step": 340 }, { "epoch": 0.91, - "grad_norm": 27.0, - "learning_rate": 4.757316345716554e-06, - "logits/chosen": -1.9230518341064453, - "logits/rejected": -1.923689842224121, - "logps/chosen": -33.71295166015625, - "logps/rejected": -34.07986068725586, - "loss": 0.4213, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.6230990886688232, - "rewards/margins": 0.8308955430984497, - "rewards/rejected": -0.20779642462730408, + "learning_rate": 1.2518018074041684e-07, + "logits/chosen": -1.959240198135376, + "logits/rejected": -1.9583876132965088, + "logps/chosen": -32.863216400146484, + "logps/rejected": -32.54397201538086, + "loss": 0.5523, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.37699171900749207, + "rewards/margins": 0.44793614745140076, + "rewards/rejected": -0.0709443911910057, "step": 350 }, { "epoch": 0.94, - "grad_norm": 28.625, - "learning_rate": 4.73238359114687e-06, - "logits/chosen": -2.050320863723755, - "logits/rejected": -2.056476593017578, - "logps/chosen": -31.107620239257812, - "logps/rejected": -32.95317840576172, - "loss": 0.4902, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.41906651854515076, - "rewards/margins": 0.6754721403121948, - "rewards/rejected": -0.25640562176704407, + "learning_rate": 6.41315865106129e-08, + "logits/chosen": -1.9147189855575562, + "logits/rejected": -1.9250224828720093, + "logps/chosen": -31.902795791625977, + "logps/rejected": -35.3552131652832, + "loss": 0.5743, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.2717086672782898, + "rewards/margins": 0.33151620626449585, + "rewards/rejected": -0.05980752781033516, "step": 360 }, { "epoch": 0.96, - "grad_norm": 41.75, - "learning_rate": 4.706303941965804e-06, - "logits/chosen": -1.9822814464569092, - "logits/rejected": -1.9818360805511475, - "logps/chosen": -32.8140869140625, - "logps/rejected": -36.2825813293457, - "loss": 0.4132, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.5677691102027893, - "rewards/margins": 0.7772952318191528, - "rewards/rejected": -0.2095261812210083, + "learning_rate": 2.3150941078050325e-08, + "logits/chosen": -2.05413556098938, + "logits/rejected": -2.047651767730713, + "logps/chosen": -33.377376556396484, + "logps/rejected": -29.2799072265625, + "loss": 0.5801, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.24713313579559326, + "rewards/margins": 0.2952673137187958, + "rewards/rejected": -0.04813414067029953, "step": 370 }, { "epoch": 0.99, - "grad_norm": 31.125, - "learning_rate": 4.679090796681225e-06, - "logits/chosen": -2.0170412063598633, - "logits/rejected": -2.012453556060791, - "logps/chosen": -30.147674560546875, - "logps/rejected": -29.48748207092285, - "loss": 0.4139, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.506630539894104, - "rewards/margins": 0.7528629302978516, - "rewards/rejected": -0.24623243510723114, + "learning_rate": 2.575864278703266e-09, + "logits/chosen": -1.9141871929168701, + "logits/rejected": -1.9163949489593506, + "logps/chosen": -33.87698745727539, + "logps/rejected": -30.976858139038086, + "loss": 0.5489, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.32585546374320984, + "rewards/margins": 0.4196627140045166, + "rewards/rejected": -0.09380728751420975, "step": 380 }, { - "epoch": 1.01, - "grad_norm": 31.75, - "learning_rate": 4.650758136138454e-06, - "logits/chosen": -1.7892462015151978, - "logits/rejected": -1.7956597805023193, - "logps/chosen": -31.721576690673828, - "logps/rejected": -36.598609924316406, - "loss": 0.3273, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.5977230668067932, - "rewards/margins": 1.0300120115280151, - "rewards/rejected": -0.4322889745235443, - "step": 390 - }, - { - "epoch": 1.04, - "grad_norm": 22.625, - "learning_rate": 4.621320516337559e-06, - "logits/chosen": -1.9460976123809814, - "logits/rejected": -1.9397977590560913, - "logps/chosen": -33.06171417236328, - "logps/rejected": -32.60498809814453, - "loss": 0.3632, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.6938852667808533, - "rewards/margins": 0.9603848457336426, - "rewards/rejected": -0.2664996087551117, - "step": 400 - }, - { - "epoch": 1.04, - "eval_logits/chosen": -2.212498426437378, - "eval_logits/rejected": -2.207648992538452, - "eval_logps/chosen": -34.13554000854492, - "eval_logps/rejected": -37.67595291137695, - "eval_loss": 0.9494805932044983, - "eval_rewards/accuracies": 0.560215950012207, - "eval_rewards/chosen": -0.09088955074548721, - "eval_rewards/margins": 0.05251322314143181, - "eval_rewards/rejected": -0.14340277016162872, - "eval_runtime": 145.4418, - "eval_samples_per_second": 2.358, - "eval_steps_per_second": 0.296, - "step": 400 - }, - { - "epoch": 1.06, - "grad_norm": 29.25, - "learning_rate": 4.590793060955158e-06, - "logits/chosen": -1.9469482898712158, - "logits/rejected": -1.9542169570922852, - "logps/chosen": -28.586200714111328, - "logps/rejected": -29.615833282470703, - "loss": 0.3369, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.4807574152946472, - "rewards/margins": 0.9466059803962708, - "rewards/rejected": -0.4658486247062683, - "step": 410 - }, - { - "epoch": 1.09, - "grad_norm": 29.625, - "learning_rate": 4.559191453574582e-06, - "logits/chosen": -1.967563271522522, - "logits/rejected": -1.9665197134017944, - "logps/chosen": -33.637752532958984, - "logps/rejected": -31.163837432861328, - "loss": 0.4787, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.551658034324646, - "rewards/margins": 0.7503688335418701, - "rewards/rejected": -0.19871078431606293, - "step": 420 - }, - { - "epoch": 1.12, - "grad_norm": 26.125, - "learning_rate": 4.52653192962838e-06, - "logits/chosen": -1.9623167514801025, - "logits/rejected": -1.9450109004974365, - "logps/chosen": -30.47269630432129, - "logps/rejected": -33.48539733886719, - "loss": 0.3673, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.4540596604347229, - "rewards/margins": 0.9572601318359375, - "rewards/rejected": -0.5032004117965698, - "step": 430 - }, - { - "epoch": 1.14, - "grad_norm": 18.125, - "learning_rate": 4.492831268057307e-06, - "logits/chosen": -1.9934860467910767, - "logits/rejected": -1.9955333471298218, - "logps/chosen": -35.85193634033203, - "logps/rejected": -35.39922332763672, - "loss": 0.2297, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.6843839883804321, - "rewards/margins": 1.1697149276733398, - "rewards/rejected": -0.4853309988975525, - "step": 440 - }, - { - "epoch": 1.17, - "grad_norm": 26.625, - "learning_rate": 4.458106782690094e-06, - "logits/chosen": -2.072458267211914, - "logits/rejected": -2.0722086429595947, - "logps/chosen": -31.9611759185791, - "logps/rejected": -33.9387321472168, - "loss": 0.3876, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.6049578189849854, - "rewards/margins": 0.9457138776779175, - "rewards/rejected": -0.3407560884952545, - "step": 450 - }, - { - "epoch": 1.19, - "grad_norm": 30.625, - "learning_rate": 4.422376313348405e-06, - "logits/chosen": -2.0193634033203125, - "logits/rejected": -2.011868715286255, - "logps/chosen": -31.689464569091797, - "logps/rejected": -36.604766845703125, - "loss": 0.2964, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.6448441743850708, - "rewards/margins": 1.140498399734497, - "rewards/rejected": -0.4956541061401367, - "step": 460 - }, - { - "epoch": 1.22, - "grad_norm": 28.75, - "learning_rate": 4.3856582166815696e-06, - "logits/chosen": -1.9231773614883423, - "logits/rejected": -1.9196126461029053, - "logps/chosen": -33.275428771972656, - "logps/rejected": -33.32958221435547, - "loss": 0.3447, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.5911833047866821, - "rewards/margins": 0.9563148617744446, - "rewards/rejected": -0.36513155698776245, - "step": 470 - }, - { - "epoch": 1.25, - "grad_norm": 31.0, - "learning_rate": 4.347971356735789e-06, - "logits/chosen": -2.050720453262329, - "logits/rejected": -2.0436971187591553, - "logps/chosen": -30.2794132232666, - "logps/rejected": -32.480506896972656, - "loss": 0.3824, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.49421030282974243, - "rewards/margins": 0.9325267672538757, - "rewards/rejected": -0.4383164346218109, - "step": 480 - }, - { - "epoch": 1.27, - "grad_norm": 14.3125, - "learning_rate": 4.309335095262675e-06, - "logits/chosen": -1.996100664138794, - "logits/rejected": -1.997636079788208, - "logps/chosen": -34.52549743652344, - "logps/rejected": -34.070865631103516, - "loss": 0.2553, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.7449228763580322, - "rewards/margins": 1.1313400268554688, - "rewards/rejected": -0.38641709089279175, - "step": 490 - }, - { - "epoch": 1.3, - "grad_norm": 36.25, - "learning_rate": 4.269769281772082e-06, - "logits/chosen": -1.8836076259613037, - "logits/rejected": -1.8812100887298584, - "logps/chosen": -32.4937629699707, - "logps/rejected": -37.06315231323242, - "loss": 0.2994, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.6106754541397095, - "rewards/margins": 1.149621605873108, - "rewards/rejected": -0.538946270942688, - "step": 500 - }, - { - "epoch": 1.3, - "eval_logits/chosen": -2.221041202545166, - "eval_logits/rejected": -2.2161660194396973, - "eval_logps/chosen": -34.217567443847656, - "eval_logps/rejected": -37.774173736572266, - "eval_loss": 0.9461130499839783, - "eval_rewards/accuracies": 0.5539867281913757, - "eval_rewards/chosen": -0.16471394896507263, - "eval_rewards/margins": 0.0670846477150917, - "eval_rewards/rejected": -0.23179861903190613, - "eval_runtime": 145.1678, - "eval_samples_per_second": 2.363, - "eval_steps_per_second": 0.296, - "step": 500 - }, - { - "epoch": 1.32, - "grad_norm": 28.125, - "learning_rate": 4.22929424333435e-06, - "logits/chosen": -1.9868427515029907, - "logits/rejected": -1.9915640354156494, - "logps/chosen": -32.677276611328125, - "logps/rejected": -32.02418899536133, - "loss": 0.3257, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.702068030834198, - "rewards/margins": 1.0423411130905151, - "rewards/rejected": -0.3402729630470276, - "step": 510 - }, - { - "epoch": 1.35, - "grad_norm": 31.25, - "learning_rate": 4.1879307741372085e-06, - "logits/chosen": -2.0190398693084717, - "logits/rejected": -2.0300133228302, - "logps/chosen": -30.644947052001953, - "logps/rejected": -32.037715911865234, - "loss": 0.2992, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6709223389625549, - "rewards/margins": 1.147947907447815, - "rewards/rejected": -0.4770255982875824, - "step": 520 - }, - { - "epoch": 1.38, - "grad_norm": 16.625, - "learning_rate": 4.145700124802693e-06, - "logits/chosen": -1.9455944299697876, - "logits/rejected": -1.9422004222869873, - "logps/chosen": -31.76815414428711, - "logps/rejected": -32.878482818603516, - "loss": 0.3351, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.5871203541755676, - "rewards/margins": 1.0512535572052002, - "rewards/rejected": -0.4641333222389221, - "step": 530 - }, - { - "epoch": 1.4, - "grad_norm": 23.875, - "learning_rate": 4.102623991469562e-06, - "logits/chosen": -1.8109652996063232, - "logits/rejected": -1.8201634883880615, - "logps/chosen": -31.86029624938965, - "logps/rejected": -32.3470458984375, - "loss": 0.3393, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.6381845474243164, - "rewards/margins": 1.0801360607147217, - "rewards/rejected": -0.4419515132904053, - "step": 540 - }, - { - "epoch": 1.43, - "grad_norm": 27.25, - "learning_rate": 4.058724504646834e-06, - "logits/chosen": -1.9057979583740234, - "logits/rejected": -1.8994461297988892, - "logps/chosen": -32.93577575683594, - "logps/rejected": -31.36001968383789, - "loss": 0.3481, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.6952788233757019, - "rewards/margins": 1.0270812511444092, - "rewards/rejected": -0.33180245757102966, - "step": 550 - }, - { - "epoch": 1.45, - "grad_norm": 36.5, - "learning_rate": 4.014024217844167e-06, - "logits/chosen": -1.9947702884674072, - "logits/rejected": -1.9926884174346924, - "logps/chosen": -33.76757049560547, - "logps/rejected": -31.845256805419922, - "loss": 0.3511, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.7232670187950134, - "rewards/margins": 1.0363487005233765, - "rewards/rejected": -0.3130818009376526, - "step": 560 - }, - { - "epoch": 1.48, - "grad_norm": 24.875, - "learning_rate": 3.968546095984911e-06, - "logits/chosen": -1.8271764516830444, - "logits/rejected": -1.8250176906585693, - "logps/chosen": -32.03794479370117, - "logps/rejected": -31.38640785217285, - "loss": 0.4057, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.6826205253601074, - "rewards/margins": 1.0283124446868896, - "rewards/rejected": -0.34569185972213745, - "step": 570 - }, - { - "epoch": 1.51, - "grad_norm": 18.625, - "learning_rate": 3.922313503607806e-06, - "logits/chosen": -1.9618475437164307, - "logits/rejected": -1.9585418701171875, - "logps/chosen": -30.302867889404297, - "logps/rejected": -35.241397857666016, - "loss": 0.2905, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.7215973138809204, - "rewards/margins": 1.2753074169158936, - "rewards/rejected": -0.5537099242210388, - "step": 580 - }, - { - "epoch": 1.53, - "grad_norm": 22.0, - "learning_rate": 3.875350192863368e-06, - "logits/chosen": -1.902073860168457, - "logits/rejected": -1.9056593179702759, - "logps/chosen": -28.98307228088379, - "logps/rejected": -31.06064796447754, - "loss": 0.389, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.56876540184021, - "rewards/margins": 0.9303674697875977, - "rewards/rejected": -0.36160212755203247, - "step": 590 - }, - { - "epoch": 1.56, - "grad_norm": 26.25, - "learning_rate": 3.8276802913111436e-06, - "logits/chosen": -1.9391911029815674, - "logits/rejected": -1.939034104347229, - "logps/chosen": -31.3705997467041, - "logps/rejected": -31.58413314819336, - "loss": 0.3408, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.7658407092094421, - "rewards/margins": 1.1178474426269531, - "rewards/rejected": -0.352006733417511, - "step": 600 - }, - { - "epoch": 1.56, - "eval_logits/chosen": -2.206589460372925, - "eval_logits/rejected": -2.201718807220459, - "eval_logps/chosen": -34.10955047607422, - "eval_logps/rejected": -37.704795837402344, - "eval_loss": 0.9076833724975586, - "eval_rewards/accuracies": 0.5867940187454224, - "eval_rewards/chosen": -0.06749669462442398, - "eval_rewards/margins": 0.10186038166284561, - "eval_rewards/rejected": -0.1693570613861084, - "eval_runtime": 145.2918, - "eval_samples_per_second": 2.361, - "eval_steps_per_second": 0.296, - "step": 600 - }, - { - "epoch": 1.58, - "grad_norm": 29.125, - "learning_rate": 3.7793282895240927e-06, - "logits/chosen": -2.0020594596862793, - "logits/rejected": -2.0028228759765625, - "logps/chosen": -34.12175750732422, - "logps/rejected": -33.66775894165039, - "loss": 0.2586, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.8175474405288696, - "rewards/margins": 1.3427560329437256, - "rewards/rejected": -0.5252088308334351, - "step": 610 - }, - { - "epoch": 1.61, - "grad_norm": 22.875, - "learning_rate": 3.730319028506478e-06, - "logits/chosen": -1.964914321899414, - "logits/rejected": -1.962311029434204, - "logps/chosen": -32.368412017822266, - "logps/rejected": -32.58821487426758, - "loss": 0.2969, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.6812867522239685, - "rewards/margins": 1.1477558612823486, - "rewards/rejected": -0.4664689898490906, - "step": 620 - }, - { - "epoch": 1.64, - "grad_norm": 41.75, - "learning_rate": 3.6806776869317074e-06, - "logits/chosen": -1.978055715560913, - "logits/rejected": -1.9689325094223022, - "logps/chosen": -31.888168334960938, - "logps/rejected": -31.396778106689453, - "loss": 0.3729, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.7991548180580139, - "rewards/margins": 1.214425802230835, - "rewards/rejected": -0.4152711033821106, - "step": 630 - }, - { - "epoch": 1.66, - "grad_norm": 38.0, - "learning_rate": 3.6304297682067146e-06, - "logits/chosen": -1.9791650772094727, - "logits/rejected": -1.9759613275527954, - "logps/chosen": -31.385793685913086, - "logps/rejected": -32.911964416503906, - "loss": 0.2822, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7313672304153442, - "rewards/margins": 1.1746617555618286, - "rewards/rejected": -0.44329458475112915, - "step": 640 - }, - { - "epoch": 1.69, - "grad_norm": 23.875, - "learning_rate": 3.579601087369492e-06, - "logits/chosen": -1.9867624044418335, - "logits/rejected": -1.9891353845596313, - "logps/chosen": -32.82892608642578, - "logps/rejected": -34.27159881591797, - "loss": 0.2431, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.7423025369644165, - "rewards/margins": 1.2716872692108154, - "rewards/rejected": -0.5293846726417542, - "step": 650 - }, - { - "epoch": 1.71, - "grad_norm": 37.75, - "learning_rate": 3.5282177578265295e-06, - "logits/chosen": -1.899405837059021, - "logits/rejected": -1.899852991104126, - "logps/chosen": -33.07904815673828, - "logps/rejected": -32.00278091430664, - "loss": 0.3078, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.7810029983520508, - "rewards/margins": 1.136976957321167, - "rewards/rejected": -0.3559740483760834, - "step": 660 - }, - { - "epoch": 1.74, - "grad_norm": 18.875, - "learning_rate": 3.476306177936961e-06, - "logits/chosen": -1.941097617149353, - "logits/rejected": -1.9313017129898071, - "logps/chosen": -33.12556838989258, - "logps/rejected": -32.835227966308594, - "loss": 0.1901, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.788142740726471, - "rewards/margins": 1.2628332376480103, - "rewards/rejected": -0.4746905267238617, - "step": 670 - }, - { - "epoch": 1.77, - "grad_norm": 34.0, - "learning_rate": 3.423893017450324e-06, - "logits/chosen": -1.8415699005126953, - "logits/rejected": -1.838326096534729, - "logps/chosen": -30.559249877929688, - "logps/rejected": -34.762123107910156, - "loss": 0.2502, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.8115372657775879, - "rewards/margins": 1.2107248306274414, - "rewards/rejected": -0.3991875946521759, - "step": 680 - }, - { - "epoch": 1.79, - "grad_norm": 17.75, - "learning_rate": 3.3710052038048794e-06, - "logits/chosen": -1.902061104774475, - "logits/rejected": -1.9013097286224365, - "logps/chosen": -34.259620666503906, - "logps/rejected": -35.9239501953125, - "loss": 0.183, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9095993041992188, - "rewards/margins": 1.42271089553833, - "rewards/rejected": -0.5131114721298218, - "step": 690 - }, - { - "epoch": 1.82, - "grad_norm": 12.625, - "learning_rate": 3.3176699082935546e-06, - "logits/chosen": -1.8813689947128296, - "logits/rejected": -1.884253740310669, - "logps/chosen": -31.644283294677734, - "logps/rejected": -36.16461181640625, - "loss": 0.2796, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.8627067804336548, - "rewards/margins": 1.3333566188812256, - "rewards/rejected": -0.4706498980522156, - "step": 700 - }, - { - "epoch": 1.82, - "eval_logits/chosen": -2.206071615219116, - "eval_logits/rejected": -2.2012383937835693, - "eval_logps/chosen": -34.13777542114258, - "eval_logps/rejected": -37.69728088378906, - "eval_loss": 0.9425109624862671, - "eval_rewards/accuracies": 0.5568937063217163, - "eval_rewards/chosen": -0.0929015502333641, - "eval_rewards/margins": 0.06969217956066132, - "eval_rewards/rejected": -0.16259372234344482, - "eval_runtime": 145.1331, - "eval_samples_per_second": 2.363, - "eval_steps_per_second": 0.296, - "step": 700 - }, - { - "epoch": 1.84, - "grad_norm": 28.75, - "learning_rate": 3.2639145321045933e-06, - "logits/chosen": -1.9908039569854736, - "logits/rejected": -1.9935470819473267, - "logps/chosen": -33.92770767211914, - "logps/rejected": -34.66431427001953, - "loss": 0.3611, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.8710969090461731, - "rewards/margins": 1.2258384227752686, - "rewards/rejected": -0.3547416031360626, - "step": 710 - }, - { - "epoch": 1.87, - "grad_norm": 37.5, - "learning_rate": 3.2097666922441107e-06, - "logits/chosen": -1.845144271850586, - "logits/rejected": -1.8392305374145508, - "logps/chosen": -33.98247146606445, - "logps/rejected": -32.777137756347656, - "loss": 0.3525, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.7656866908073425, - "rewards/margins": 1.1484500169754028, - "rewards/rejected": -0.38276320695877075, - "step": 720 - }, - { - "epoch": 1.9, - "grad_norm": 18.375, - "learning_rate": 3.1552542073477554e-06, - "logits/chosen": -2.0191843509674072, - "logits/rejected": -2.0161468982696533, - "logps/chosen": -29.76511573791504, - "logps/rejected": -32.01858901977539, - "loss": 0.2082, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.7990841865539551, - "rewards/margins": 1.3124363422393799, - "rewards/rejected": -0.5133520364761353, - "step": 730 - }, - { - "epoch": 1.92, - "grad_norm": 34.0, - "learning_rate": 3.100405083388799e-06, - "logits/chosen": -1.8623883724212646, - "logits/rejected": -1.8623119592666626, - "logps/chosen": -32.44645309448242, - "logps/rejected": -38.02064895629883, - "loss": 0.2773, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.9150499105453491, - "rewards/margins": 1.2686070203781128, - "rewards/rejected": -0.3535570502281189, - "step": 740 - }, - { - "epoch": 1.95, - "grad_norm": 14.5, - "learning_rate": 3.0452474992899645e-06, - "logits/chosen": -1.7451330423355103, - "logits/rejected": -1.7503840923309326, - "logps/chosen": -35.91836166381836, - "logps/rejected": -34.34960174560547, - "loss": 0.3131, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.9126898646354675, - "rewards/margins": 1.2522337436676025, - "rewards/rejected": -0.339543879032135, - "step": 750 - }, - { - "epoch": 1.97, - "grad_norm": 33.5, - "learning_rate": 2.989809792446417e-06, - "logits/chosen": -1.9522911310195923, - "logits/rejected": -1.9537473917007446, - "logps/chosen": -31.657379150390625, - "logps/rejected": -33.18722915649414, - "loss": 0.2682, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.853469967842102, - "rewards/margins": 1.2206826210021973, - "rewards/rejected": -0.36721271276474, - "step": 760 - }, - { - "epoch": 2.0, - "grad_norm": 25.125, - "learning_rate": 2.9341204441673267e-06, - "logits/chosen": -1.9250198602676392, - "logits/rejected": -1.9241039752960205, - "logps/chosen": -31.101261138916016, - "logps/rejected": -35.2042350769043, - "loss": 0.2804, - "rewards/accuracies": 0.8958333730697632, - "rewards/chosen": 0.9669157266616821, - "rewards/margins": 1.1253904104232788, - "rewards/rejected": -0.1584748923778534, - "step": 770 - }, - { - "epoch": 2.03, - "grad_norm": 6.28125, - "learning_rate": 2.878208065043501e-06, - "logits/chosen": -1.9248275756835938, - "logits/rejected": -1.9241101741790771, - "logps/chosen": -33.638389587402344, - "logps/rejected": -32.00913619995117, - "loss": 0.1397, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.9090627431869507, - "rewards/margins": 1.4397172927856445, - "rewards/rejected": -0.5306544899940491, - "step": 780 - }, - { - "epoch": 2.05, - "grad_norm": 22.875, - "learning_rate": 2.8221013802485974e-06, - "logits/chosen": -1.9553604125976562, - "logits/rejected": -1.9538936614990234, - "logps/chosen": -28.537487030029297, - "logps/rejected": -33.33250045776367, - "loss": 0.084, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9231246709823608, - "rewards/margins": 1.5496938228607178, - "rewards/rejected": -0.6265692114830017, - "step": 790 - }, - { - "epoch": 2.08, - "grad_norm": 0.0135498046875, - "learning_rate": 2.76582921478147e-06, - "logits/chosen": -2.0081448554992676, - "logits/rejected": -2.0046160221099854, - "logps/chosen": -31.324649810791016, - "logps/rejected": -34.89632797241211, - "loss": 0.1052, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.9923051595687866, - "rewards/margins": 1.6198686361312866, - "rewards/rejected": -0.6275635957717896, - "step": 800 - }, - { - "epoch": 2.08, - "eval_logits/chosen": -2.205134868621826, - "eval_logits/rejected": -2.200315475463867, - "eval_logps/chosen": -34.128814697265625, - "eval_logps/rejected": -37.72360610961914, - "eval_loss": 0.9124869704246521, - "eval_rewards/accuracies": 0.5926079750061035, - "eval_rewards/chosen": -0.08483708649873734, - "eval_rewards/margins": 0.10145010054111481, - "eval_rewards/rejected": -0.18628719449043274, - "eval_runtime": 145.104, - "eval_samples_per_second": 2.364, - "eval_steps_per_second": 0.296, - "step": 800 - }, - { - "epoch": 2.1, - "grad_norm": 15.1875, - "learning_rate": 2.7094204786572254e-06, - "logits/chosen": -1.8404695987701416, - "logits/rejected": -1.8326365947723389, - "logps/chosen": -33.10143280029297, - "logps/rejected": -35.159244537353516, - "loss": 0.1258, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.2193273305892944, - "rewards/margins": 1.7585445642471313, - "rewards/rejected": -0.5392170548439026, - "step": 810 - }, - { - "epoch": 2.13, - "grad_norm": 21.125, - "learning_rate": 2.6529041520546072e-06, - "logits/chosen": -1.9172642230987549, - "logits/rejected": -1.927819013595581, - "logps/chosen": -34.42330551147461, - "logps/rejected": -32.97254180908203, - "loss": 0.1226, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.1451458930969238, - "rewards/margins": 1.632136583328247, - "rewards/rejected": -0.48699086904525757, - "step": 820 - }, - { - "epoch": 2.16, - "grad_norm": 28.5, - "learning_rate": 2.5963092704273302e-06, - "logits/chosen": -1.9668289422988892, - "logits/rejected": -1.971239686012268, - "logps/chosen": -33.753257751464844, - "logps/rejected": -29.762826919555664, - "loss": 0.1291, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.034569501876831, - "rewards/margins": 1.479695439338684, - "rewards/rejected": -0.44512590765953064, - "step": 830 - }, - { - "epoch": 2.18, - "grad_norm": 14.5, - "learning_rate": 2.53966490958702e-06, - "logits/chosen": -1.9671608209609985, - "logits/rejected": -1.9750837087631226, - "logps/chosen": -33.52558135986328, - "logps/rejected": -30.574993133544922, - "loss": 0.071, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.1362085342407227, - "rewards/margins": 1.7907711267471313, - "rewards/rejected": -0.6545625329017639, - "step": 840 - }, - { - "epoch": 2.21, - "grad_norm": 22.75, - "learning_rate": 2.4830001707654135e-06, - "logits/chosen": -1.8968629837036133, - "logits/rejected": -1.8873090744018555, - "logps/chosen": -30.793508529663086, - "logps/rejected": -32.640350341796875, - "loss": 0.1021, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8997215032577515, - "rewards/margins": 1.4636071920394897, - "rewards/rejected": -0.5638857483863831, - "step": 850 - }, - { - "epoch": 2.23, - "grad_norm": 15.4375, - "learning_rate": 2.4263441656635054e-06, - "logits/chosen": -2.0392231941223145, - "logits/rejected": -2.02933931350708, - "logps/chosen": -25.14605140686035, - "logps/rejected": -30.546062469482422, - "loss": 0.1161, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.9702181816101074, - "rewards/margins": 1.576568603515625, - "rewards/rejected": -0.6063503623008728, - "step": 860 - }, - { - "epoch": 2.26, - "grad_norm": 19.375, - "learning_rate": 2.3697260014953107e-06, - "logits/chosen": -1.8909342288970947, - "logits/rejected": -1.8921743631362915, - "logps/chosen": -33.14449691772461, - "logps/rejected": -30.5850772857666, - "loss": 0.0889, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0696533918380737, - "rewards/margins": 1.61197030544281, - "rewards/rejected": -0.5423170328140259, - "step": 870 - }, - { - "epoch": 2.29, - "grad_norm": 10.875, - "learning_rate": 2.3131747660339396e-06, - "logits/chosen": -1.9035065174102783, - "logits/rejected": -1.9043560028076172, - "logps/chosen": -31.356319427490234, - "logps/rejected": -33.71599197387695, - "loss": 0.2119, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.929639458656311, - "rewards/margins": 1.5282022953033447, - "rewards/rejected": -0.5985628962516785, - "step": 880 - }, - { - "epoch": 2.31, - "grad_norm": 28.75, - "learning_rate": 2.256719512667651e-06, - "logits/chosen": -1.8071092367172241, - "logits/rejected": -1.8054319620132446, - "logps/chosen": -34.50264358520508, - "logps/rejected": -36.736839294433594, - "loss": 0.091, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0460432767868042, - "rewards/margins": 1.8196884393692017, - "rewards/rejected": -0.7736451029777527, - "step": 890 - }, - { - "epoch": 2.34, - "grad_norm": 11.75, - "learning_rate": 2.2003892454735786e-06, - "logits/chosen": -1.9551169872283936, - "logits/rejected": -1.9482837915420532, - "logps/chosen": -30.755849838256836, - "logps/rejected": -33.490779876708984, - "loss": 0.095, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.122107982635498, - "rewards/margins": 1.781549096107483, - "rewards/rejected": -0.6594408750534058, - "step": 900 - }, - { - "epoch": 2.34, - "eval_logits/chosen": -2.2067153453826904, - "eval_logits/rejected": -2.2018842697143555, - "eval_logps/chosen": -34.123661041259766, - "eval_logps/rejected": -37.73240280151367, - "eval_loss": 0.900454580783844, - "eval_rewards/accuracies": 0.5539867281913757, - "eval_rewards/chosen": -0.08019853383302689, - "eval_rewards/margins": 0.11400878429412842, - "eval_rewards/rejected": -0.19420728087425232, - "eval_runtime": 145.2038, - "eval_samples_per_second": 2.362, - "eval_steps_per_second": 0.296, - "step": 900 - }, - { - "epoch": 2.36, - "grad_norm": 25.625, - "learning_rate": 2.1442129043167877e-06, - "logits/chosen": -1.9985930919647217, - "logits/rejected": -1.9938539266586304, - "logps/chosen": -32.36955261230469, - "logps/rejected": -35.18764877319336, - "loss": 0.091, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0693137645721436, - "rewards/margins": 1.6624386310577393, - "rewards/rejected": -0.5931245684623718, - "step": 910 - }, - { - "epoch": 2.39, - "grad_norm": 9.6875, - "learning_rate": 2.088219349982323e-06, - "logits/chosen": -1.9170243740081787, - "logits/rejected": -1.9220654964447021, - "logps/chosen": -34.132789611816406, - "logps/rejected": -33.121482849121094, - "loss": 0.1312, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.0942142009735107, - "rewards/margins": 1.6536308526992798, - "rewards/rejected": -0.5594164133071899, - "step": 920 - }, - { - "epoch": 2.42, - "grad_norm": 16.75, - "learning_rate": 2.0324373493478803e-06, - "logits/chosen": -2.0138721466064453, - "logits/rejected": -2.004511833190918, - "logps/chosen": -31.145587921142578, - "logps/rejected": -35.140323638916016, - "loss": 0.09, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0254095792770386, - "rewards/margins": 1.653183937072754, - "rewards/rejected": -0.6277744174003601, - "step": 930 - }, - { - "epoch": 2.44, - "grad_norm": 17.0, - "learning_rate": 1.976895560604729e-06, - "logits/chosen": -1.9402869939804077, - "logits/rejected": -1.9371931552886963, - "logps/chosen": -30.240589141845703, - "logps/rejected": -33.107887268066406, - "loss": 0.1529, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.9430824518203735, - "rewards/margins": 1.5213444232940674, - "rewards/rejected": -0.578262209892273, - "step": 940 - }, - { - "epoch": 2.47, - "grad_norm": 4.25, - "learning_rate": 1.921622518534466e-06, - "logits/chosen": -1.8814115524291992, - "logits/rejected": -1.8888870477676392, - "logps/chosen": -31.7093448638916, - "logps/rejected": -36.24521255493164, - "loss": 0.181, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.0455348491668701, - "rewards/margins": 1.671287178993225, - "rewards/rejected": -0.625752329826355, - "step": 950 - }, - { - "epoch": 2.49, - "grad_norm": 0.04150390625, - "learning_rate": 1.8666466198491794e-06, - "logits/chosen": -1.8903650045394897, - "logits/rejected": -1.8835119009017944, - "logps/chosen": -32.601585388183594, - "logps/rejected": -37.4316520690918, - "loss": 0.0724, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.104241132736206, - "rewards/margins": 1.853803277015686, - "rewards/rejected": -0.7495620846748352, - "step": 960 - }, - { - "epoch": 2.52, - "grad_norm": 0.0, - "learning_rate": 1.8119961086025376e-06, - "logits/chosen": -1.8931461572647095, - "logits/rejected": -1.8937944173812866, - "logps/chosen": -30.099145889282227, - "logps/rejected": -33.60529327392578, - "loss": 0.1061, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0235319137573242, - "rewards/margins": 1.585160493850708, - "rewards/rejected": -0.5616283416748047, - "step": 970 - }, - { - "epoch": 2.55, - "grad_norm": 9.75, - "learning_rate": 1.7576990616793139e-06, - "logits/chosen": -1.8976783752441406, - "logits/rejected": -1.9089406728744507, - "logps/chosen": -31.712087631225586, - "logps/rejected": -34.73202133178711, - "loss": 0.1285, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.2281408309936523, - "rewards/margins": 1.7688732147216797, - "rewards/rejected": -0.5407322645187378, - "step": 980 - }, - { - "epoch": 2.57, - "grad_norm": 9.875, - "learning_rate": 1.7037833743707892e-06, - "logits/chosen": -1.9692922830581665, - "logits/rejected": -1.966718077659607, - "logps/chosen": -34.942657470703125, - "logps/rejected": -32.915771484375, - "loss": 0.1929, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.0605528354644775, - "rewards/margins": 1.525442361831665, - "rewards/rejected": -0.4648895263671875, - "step": 990 - }, - { - "epoch": 2.6, - "grad_norm": 19.25, - "learning_rate": 1.6502767460434588e-06, - "logits/chosen": -1.9397118091583252, - "logits/rejected": -1.9436838626861572, - "logps/chosen": -33.27742385864258, - "logps/rejected": -35.068485260009766, - "loss": 0.123, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.0672835111618042, - "rewards/margins": 1.5914794206619263, - "rewards/rejected": -0.5241960287094116, - "step": 1000 - }, - { - "epoch": 2.6, - "eval_logits/chosen": -2.2042503356933594, - "eval_logits/rejected": -2.199423313140869, - "eval_logps/chosen": -34.13529586791992, - "eval_logps/rejected": -37.72508239746094, - "eval_loss": 0.9194008708000183, - "eval_rewards/accuracies": 0.5510797500610352, - "eval_rewards/chosen": -0.09066839516162872, - "eval_rewards/margins": 0.09694948047399521, - "eval_rewards/rejected": -0.18761785328388214, - "eval_runtime": 145.3351, - "eval_samples_per_second": 2.36, - "eval_steps_per_second": 0.296, - "step": 1000 - }, - { - "epoch": 2.62, - "grad_norm": 28.0, - "learning_rate": 1.5972066659083796e-06, - "logits/chosen": -1.9486963748931885, - "logits/rejected": -1.9537280797958374, - "logps/chosen": -31.729883193969727, - "logps/rejected": -33.12669372558594, - "loss": 0.1503, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.9634879231452942, - "rewards/margins": 1.5083791017532349, - "rewards/rejected": -0.5448910593986511, - "step": 1010 - }, - { - "epoch": 2.65, - "grad_norm": 21.125, - "learning_rate": 1.5446003988985041e-06, - "logits/chosen": -1.9090359210968018, - "logits/rejected": -1.911655068397522, - "logps/chosen": -29.359201431274414, - "logps/rejected": -32.33182144165039, - "loss": 0.1044, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9777968525886536, - "rewards/margins": 1.5471875667572021, - "rewards/rejected": -0.5693906545639038, - "step": 1020 - }, - { - "epoch": 2.68, - "grad_norm": 0.0, - "learning_rate": 1.4924849716612211e-06, - "logits/chosen": -1.96365225315094, - "logits/rejected": -1.9579105377197266, - "logps/chosen": -32.81294631958008, - "logps/rejected": -34.20869064331055, - "loss": 0.1032, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0585477352142334, - "rewards/margins": 1.6801731586456299, - "rewards/rejected": -0.6216254830360413, - "step": 1030 - }, - { - "epoch": 2.7, - "grad_norm": 28.875, - "learning_rate": 1.440887158673332e-06, - "logits/chosen": -1.8949559926986694, - "logits/rejected": -1.8988254070281982, - "logps/chosen": -34.670326232910156, - "logps/rejected": -35.01986312866211, - "loss": 0.1415, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9750639200210571, - "rewards/margins": 1.515320062637329, - "rewards/rejected": -0.5402560830116272, - "step": 1040 - }, - { - "epoch": 2.73, - "grad_norm": 0.0, - "learning_rate": 1.3898334684855647e-06, - "logits/chosen": -1.8945505619049072, - "logits/rejected": -1.9076248407363892, - "logps/chosen": -30.79265785217285, - "logps/rejected": -33.660736083984375, - "loss": 0.1249, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9632073640823364, - "rewards/margins": 1.5398341417312622, - "rewards/rejected": -0.5766268372535706, - "step": 1050 - }, - { - "epoch": 2.75, - "grad_norm": 25.75, - "learning_rate": 1.3393501301037245e-06, - "logits/chosen": -1.8843669891357422, - "logits/rejected": -1.8777261972427368, - "logps/chosen": -31.016742706298828, - "logps/rejected": -33.897727966308594, - "loss": 0.0762, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.028328776359558, - "rewards/margins": 1.5501177310943604, - "rewards/rejected": -0.5217889547348022, - "step": 1060 - }, - { - "epoch": 2.78, - "grad_norm": 0.0, - "learning_rate": 1.2894630795134454e-06, - "logits/chosen": -2.0126426219940186, - "logits/rejected": -2.0134384632110596, - "logps/chosen": -32.44770050048828, - "logps/rejected": -33.501129150390625, - "loss": 0.0927, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.98944091796875, - "rewards/margins": 1.5802329778671265, - "rewards/rejected": -0.590792179107666, - "step": 1070 - }, - { - "epoch": 2.81, - "grad_norm": 26.125, - "learning_rate": 1.2401979463554984e-06, - "logits/chosen": -1.9425004720687866, - "logits/rejected": -1.9413063526153564, - "logps/chosen": -33.02809524536133, - "logps/rejected": -33.649681091308594, - "loss": 0.2202, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.9555650949478149, - "rewards/margins": 1.3676296472549438, - "rewards/rejected": -0.4120645523071289, - "step": 1080 - }, - { - "epoch": 2.83, - "grad_norm": 0.0, - "learning_rate": 1.1915800407584705e-06, - "logits/chosen": -1.970933198928833, - "logits/rejected": -1.9633777141571045, - "logps/chosen": -33.092559814453125, - "logps/rejected": -31.60101890563965, - "loss": 0.0991, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0199841260910034, - "rewards/margins": 1.5235220193862915, - "rewards/rejected": -0.5035377740859985, - "step": 1090 - }, - { - "epoch": 2.86, - "grad_norm": 25.625, - "learning_rate": 1.1436343403356019e-06, - "logits/chosen": -1.939095139503479, - "logits/rejected": -1.9381242990493774, - "logps/chosen": -34.21430206298828, - "logps/rejected": -37.063575744628906, - "loss": 0.0894, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.083695650100708, - "rewards/margins": 1.7234748601913452, - "rewards/rejected": -0.6397790908813477, - "step": 1100 - }, - { - "epoch": 2.86, - "eval_logits/chosen": -2.2049529552459717, - "eval_logits/rejected": -2.200129747390747, - "eval_logps/chosen": -34.136192321777344, - "eval_logps/rejected": -37.72666931152344, - "eval_loss": 0.9181937575340271, - "eval_rewards/accuracies": 0.5336378812789917, - "eval_rewards/chosen": -0.09147636592388153, - "eval_rewards/margins": 0.09756775945425034, - "eval_rewards/rejected": -0.18904413282871246, - "eval_runtime": 145.1219, - "eval_samples_per_second": 2.364, - "eval_steps_per_second": 0.296, - "step": 1100 - }, - { - "epoch": 2.88, - "grad_norm": 16.625, - "learning_rate": 1.0963854773524548e-06, - "logits/chosen": -1.9868457317352295, - "logits/rejected": -1.9922664165496826, - "logps/chosen": -34.364707946777344, - "logps/rejected": -36.20539093017578, - "loss": 0.1547, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.992165744304657, - "rewards/margins": 1.5877904891967773, - "rewards/rejected": -0.5956246852874756, - "step": 1110 - }, - { - "epoch": 2.91, - "grad_norm": 29.0, - "learning_rate": 1.049857726072005e-06, - "logits/chosen": -1.9590438604354858, - "logits/rejected": -1.9570538997650146, - "logps/chosen": -31.564861297607422, - "logps/rejected": -33.61270523071289, - "loss": 0.1168, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9462112188339233, - "rewards/margins": 1.5131675004959106, - "rewards/rejected": -0.5669562220573425, - "step": 1120 - }, - { - "epoch": 2.94, - "grad_norm": 17.0, - "learning_rate": 1.0040749902836508e-06, - "logits/chosen": -1.8618561029434204, - "logits/rejected": -1.8646554946899414, - "logps/chosen": -28.602588653564453, - "logps/rejected": -31.00104331970215, - "loss": 0.0896, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0314428806304932, - "rewards/margins": 1.7118492126464844, - "rewards/rejected": -0.6804062128067017, - "step": 1130 - }, - { - "epoch": 2.96, - "grad_norm": 11.6875, - "learning_rate": 9.59060791022566e-07, - "logits/chosen": -1.9582477807998657, - "logits/rejected": -1.955122709274292, - "logps/chosen": -32.12145233154297, - "logps/rejected": -33.0306282043457, - "loss": 0.1423, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9740654826164246, - "rewards/margins": 1.4274402856826782, - "rewards/rejected": -0.4533747136592865, - "step": 1140 - }, - { - "epoch": 2.99, - "grad_norm": 11.875, - "learning_rate": 9.148382544856885e-07, - "logits/chosen": -1.878685712814331, - "logits/rejected": -1.8723747730255127, - "logps/chosen": -27.11667251586914, - "logps/rejected": -32.9290771484375, - "loss": 0.1014, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.9191849827766418, - "rewards/margins": 1.6218817234039307, - "rewards/rejected": -0.7026965618133545, - "step": 1150 - }, - { - "epoch": 3.01, - "grad_norm": 0.0, - "learning_rate": 8.714301001505568e-07, - "logits/chosen": -1.9950920343399048, - "logits/rejected": -1.9926071166992188, - "logps/chosen": -31.993701934814453, - "logps/rejected": -35.8871955871582, - "loss": 0.0971, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.9921283721923828, - "rewards/margins": 1.7082250118255615, - "rewards/rejected": -0.7160967588424683, - "step": 1160 - }, - { - "epoch": 3.04, - "grad_norm": 0.0, - "learning_rate": 8.288586291031025e-07, - "logits/chosen": -1.9799201488494873, - "logits/rejected": -1.977949857711792, - "logps/chosen": -30.49740982055664, - "logps/rejected": -33.171287536621094, - "loss": 0.0419, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.005298376083374, - "rewards/margins": 1.7318332195281982, - "rewards/rejected": -0.7265347242355347, - "step": 1170 - }, - { - "epoch": 3.06, - "grad_norm": 12.9375, - "learning_rate": 7.871457125803897e-07, - "logits/chosen": -1.972078561782837, - "logits/rejected": -1.959705114364624, - "logps/chosen": -34.829872131347656, - "logps/rejected": -34.836021423339844, - "loss": 0.0889, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0938866138458252, - "rewards/margins": 1.7819774150848389, - "rewards/rejected": -0.6880909204483032, - "step": 1180 - }, - { - "epoch": 3.09, - "grad_norm": 0.0, - "learning_rate": 7.463127807341966e-07, - "logits/chosen": -1.865128755569458, - "logits/rejected": -1.8597571849822998, - "logps/chosen": -34.058189392089844, - "logps/rejected": -34.86783981323242, - "loss": 0.0598, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.1623505353927612, - "rewards/margins": 1.729365587234497, - "rewards/rejected": -0.5670150518417358, - "step": 1190 - }, - { - "epoch": 3.12, - "grad_norm": 13.8125, - "learning_rate": 7.063808116212021e-07, - "logits/chosen": -1.9072997570037842, - "logits/rejected": -1.9104669094085693, - "logps/chosen": -31.529245376586914, - "logps/rejected": -32.14056396484375, - "loss": 0.1086, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.9772661328315735, - "rewards/margins": 1.5662124156951904, - "rewards/rejected": -0.5889462232589722, - "step": 1200 - }, - { - "epoch": 3.12, - "eval_logits/chosen": -2.205395221710205, - "eval_logits/rejected": -2.2005577087402344, - "eval_logps/chosen": -34.13056945800781, - "eval_logps/rejected": -37.736209869384766, - "eval_loss": 0.9022552967071533, - "eval_rewards/accuracies": 0.5627076625823975, - "eval_rewards/chosen": -0.08641883730888367, - "eval_rewards/margins": 0.11121704429388046, - "eval_rewards/rejected": -0.19763588905334473, - "eval_runtime": 145.103, - "eval_samples_per_second": 2.364, - "eval_steps_per_second": 0.296, - "step": 1200 - }, - { - "epoch": 3.14, - "grad_norm": 23.125, - "learning_rate": 6.673703204254348e-07, - "logits/chosen": -1.9353859424591064, - "logits/rejected": -1.9308178424835205, - "logps/chosen": -30.187076568603516, - "logps/rejected": -31.651758193969727, - "loss": 0.0719, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.9779340028762817, - "rewards/margins": 1.647231101989746, - "rewards/rejected": -0.6692970991134644, - "step": 1210 - }, - { - "epoch": 3.17, - "grad_norm": 0.0, - "learning_rate": 6.293013489185315e-07, - "logits/chosen": -1.9491630792617798, - "logits/rejected": -1.9436299800872803, - "logps/chosen": -34.10167694091797, - "logps/rejected": -35.294559478759766, - "loss": 0.0867, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.055037260055542, - "rewards/margins": 1.7072465419769287, - "rewards/rejected": -0.6522093415260315, - "step": 1220 - }, - { - "epoch": 3.19, - "grad_norm": 0.0, - "learning_rate": 5.921934551632086e-07, - "logits/chosen": -1.9506566524505615, - "logits/rejected": -1.9374538660049438, - "logps/chosen": -31.99393081665039, - "logps/rejected": -34.90449142456055, - "loss": 0.1538, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.9900773167610168, - "rewards/margins": 1.6962600946426392, - "rewards/rejected": -0.706182599067688, - "step": 1230 - }, - { - "epoch": 3.22, - "grad_norm": 0.0, - "learning_rate": 5.560657034652405e-07, - "logits/chosen": -1.9950263500213623, - "logits/rejected": -1.9922975301742554, - "logps/chosen": -33.79475402832031, - "logps/rejected": -32.4438362121582, - "loss": 0.0771, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.114330530166626, - "rewards/margins": 1.6578998565673828, - "rewards/rejected": -0.5435694456100464, - "step": 1240 - }, - { - "epoch": 3.25, - "grad_norm": 16.5, - "learning_rate": 5.2093665457911e-07, - "logits/chosen": -1.9237819910049438, - "logits/rejected": -1.9209115505218506, - "logps/chosen": -33.26008987426758, - "logps/rejected": -35.61740493774414, - "loss": 0.0971, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.0224542617797852, - "rewards/margins": 1.7607052326202393, - "rewards/rejected": -0.7382508516311646, - "step": 1250 - }, - { - "epoch": 3.27, - "grad_norm": 15.125, - "learning_rate": 4.868243561723535e-07, - "logits/chosen": -1.988936185836792, - "logits/rejected": -1.9837830066680908, - "logps/chosen": -29.6236629486084, - "logps/rejected": -33.01030349731445, - "loss": 0.0567, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0234204530715942, - "rewards/margins": 1.5883967876434326, - "rewards/rejected": -0.5649763345718384, - "step": 1260 - }, - { - "epoch": 3.3, - "grad_norm": 0.0, - "learning_rate": 4.537463335535161e-07, - "logits/chosen": -2.0158066749572754, - "logits/rejected": -2.0205185413360596, - "logps/chosen": -31.42230796813965, - "logps/rejected": -32.35493087768555, - "loss": 0.077, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.1598228216171265, - "rewards/margins": 1.766034483909607, - "rewards/rejected": -0.6062116026878357, - "step": 1270 - }, - { - "epoch": 3.32, - "grad_norm": 14.25, - "learning_rate": 4.217195806684629e-07, - "logits/chosen": -1.9084632396697998, - "logits/rejected": -1.91574227809906, - "logps/chosen": -33.739585876464844, - "logps/rejected": -33.33063507080078, - "loss": 0.0795, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0508931875228882, - "rewards/margins": 1.7100213766098022, - "rewards/rejected": -0.6591281890869141, - "step": 1280 - }, - { - "epoch": 3.35, - "grad_norm": 15.5, - "learning_rate": 3.907605513696808e-07, - "logits/chosen": -1.8143936395645142, - "logits/rejected": -1.8166534900665283, - "logps/chosen": -31.8719539642334, - "logps/rejected": -37.17786407470703, - "loss": 0.0853, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.1339490413665771, - "rewards/margins": 1.6951805353164673, - "rewards/rejected": -0.5612314343452454, - "step": 1290 - }, - { - "epoch": 3.38, - "grad_norm": 28.25, - "learning_rate": 3.6088515096305675e-07, - "logits/chosen": -1.867759346961975, - "logits/rejected": -1.8711694478988647, - "logps/chosen": -32.002017974853516, - "logps/rejected": -33.140037536621094, - "loss": 0.0577, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.0931856632232666, - "rewards/margins": 1.7014715671539307, - "rewards/rejected": -0.6082859039306641, - "step": 1300 - }, - { - "epoch": 3.38, - "eval_logits/chosen": -2.204974412918091, - "eval_logits/rejected": -2.2001500129699707, - "eval_logps/chosen": -34.137020111083984, - "eval_logps/rejected": -37.73167037963867, - "eval_loss": 0.9154069423675537, - "eval_rewards/accuracies": 0.5598006844520569, - "eval_rewards/chosen": -0.09222196787595749, - "eval_rewards/margins": 0.10132110863924026, - "eval_rewards/rejected": -0.19354306161403656, - "eval_runtime": 145.1705, - "eval_samples_per_second": 2.363, - "eval_steps_per_second": 0.296, - "step": 1300 - }, - { - "epoch": 3.4, - "grad_norm": 0.0301513671875, - "learning_rate": 3.321087280364757e-07, - "logits/chosen": -1.97760808467865, - "logits/rejected": -1.9573789834976196, - "logps/chosen": -29.92989158630371, - "logps/rejected": -35.94379425048828, - "loss": 0.1028, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.065155029296875, - "rewards/margins": 1.764804482460022, - "rewards/rejected": -0.6996492743492126, - "step": 1310 - }, - { - "epoch": 3.43, - "grad_norm": 3.5, - "learning_rate": 3.044460665744284e-07, - "logits/chosen": -1.9561742544174194, - "logits/rejected": -1.9619709253311157, - "logps/chosen": -31.752899169921875, - "logps/rejected": -33.09962463378906, - "loss": 0.0827, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.15280020236969, - "rewards/margins": 1.7007869482040405, - "rewards/rejected": -0.5479868650436401, - "step": 1320 - }, - { - "epoch": 3.45, - "grad_norm": 0.0211181640625, - "learning_rate": 2.779113783626916e-07, - "logits/chosen": -1.9276313781738281, - "logits/rejected": -1.9227180480957031, - "logps/chosen": -31.962305068969727, - "logps/rejected": -35.37018585205078, - "loss": 0.0646, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.9541842341423035, - "rewards/margins": 1.6722910404205322, - "rewards/rejected": -0.7181065678596497, - "step": 1330 - }, - { - "epoch": 3.48, - "grad_norm": 0.0, - "learning_rate": 2.5251829568697204e-07, - "logits/chosen": -1.790858507156372, - "logits/rejected": -1.8005390167236328, - "logps/chosen": -32.53480911254883, - "logps/rejected": -31.600765228271484, - "loss": 0.0607, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.1383044719696045, - "rewards/margins": 1.6482902765274048, - "rewards/rejected": -0.5099858045578003, - "step": 1340 - }, - { - "epoch": 3.51, - "grad_norm": 0.0, - "learning_rate": 2.2827986432927774e-07, - "logits/chosen": -1.8518263101577759, - "logits/rejected": -1.8565566539764404, - "logps/chosen": -32.678627014160156, - "logps/rejected": -33.76792526245117, - "loss": 0.0315, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.1450836658477783, - "rewards/margins": 1.6518867015838623, - "rewards/rejected": -0.5068029165267944, - "step": 1350 - }, - { - "epoch": 3.53, - "grad_norm": 31.375, - "learning_rate": 2.0520853686560177e-07, - "logits/chosen": -1.8785499334335327, - "logits/rejected": -1.8840558528900146, - "logps/chosen": -32.6619873046875, - "logps/rejected": -35.23099136352539, - "loss": 0.0754, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0527045726776123, - "rewards/margins": 1.6538400650024414, - "rewards/rejected": -0.6011354923248291, - "step": 1360 - }, - { - "epoch": 3.56, - "grad_norm": 0.02587890625, - "learning_rate": 1.833161662683672e-07, - "logits/chosen": -1.9697166681289673, - "logits/rejected": -1.9651731252670288, - "logps/chosen": -31.484020233154297, - "logps/rejected": -31.601449966430664, - "loss": 0.0507, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0364978313446045, - "rewards/margins": 1.657091498374939, - "rewards/rejected": -0.6205938458442688, - "step": 1370 - }, - { - "epoch": 3.58, - "grad_norm": 16.875, - "learning_rate": 1.626139998169246e-07, - "logits/chosen": -1.888527274131775, - "logits/rejected": -1.8903592824935913, - "logps/chosen": -28.53927993774414, - "logps/rejected": -31.388671875, - "loss": 0.0877, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0080214738845825, - "rewards/margins": 1.5880650281906128, - "rewards/rejected": -0.5800435543060303, - "step": 1380 - }, - { - "epoch": 3.61, - "grad_norm": 21.125, - "learning_rate": 1.4311267331922535e-07, - "logits/chosen": -1.8796398639678955, - "logits/rejected": -1.8842071294784546, - "logps/chosen": -30.890060424804688, - "logps/rejected": -33.532711029052734, - "loss": 0.0687, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0164906978607178, - "rewards/margins": 1.50667405128479, - "rewards/rejected": -0.49018335342407227, - "step": 1390 - }, - { - "epoch": 3.64, - "grad_norm": 0.0, - "learning_rate": 1.2482220564763669e-07, - "logits/chosen": -1.9481089115142822, - "logits/rejected": -1.9513651132583618, - "logps/chosen": -33.666908264160156, - "logps/rejected": -34.179168701171875, - "loss": 0.0375, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.1372981071472168, - "rewards/margins": 1.8173433542251587, - "rewards/rejected": -0.6800452470779419, - "step": 1400 - }, - { - "epoch": 3.64, - "eval_logits/chosen": -2.204983949661255, - "eval_logits/rejected": -2.200157403945923, - "eval_logps/chosen": -34.134159088134766, - "eval_logps/rejected": -37.7177848815918, - "eval_loss": 0.9233020544052124, - "eval_rewards/accuracies": 0.5568937063217163, - "eval_rewards/chosen": -0.08964409679174423, - "eval_rewards/margins": 0.09140493720769882, - "eval_rewards/rejected": -0.18104901909828186, - "eval_runtime": 145.1261, - "eval_samples_per_second": 2.363, - "eval_steps_per_second": 0.296, - "step": 1400 - }, - { - "epoch": 3.66, - "grad_norm": 14.875, - "learning_rate": 1.0775199359171346e-07, - "logits/chosen": -2.017505407333374, - "logits/rejected": -2.010617256164551, - "logps/chosen": -32.54536056518555, - "logps/rejected": -34.80001449584961, - "loss": 0.0811, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.131469488143921, - "rewards/margins": 1.6313743591308594, - "rewards/rejected": -0.4999050498008728, - "step": 1410 - }, - { - "epoch": 3.69, - "grad_norm": 18.5, - "learning_rate": 9.191080703056604e-08, - "logits/chosen": -1.9044952392578125, - "logits/rejected": -1.9153999090194702, - "logps/chosen": -32.92113494873047, - "logps/rejected": -34.28981018066406, - "loss": 0.0577, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.1053516864776611, - "rewards/margins": 1.7342084646224976, - "rewards/rejected": -0.6288568377494812, - "step": 1420 - }, - { - "epoch": 3.71, - "grad_norm": 19.0, - "learning_rate": 7.730678442730539e-08, - "logits/chosen": -1.9869391918182373, - "logits/rejected": -1.9988434314727783, - "logps/chosen": -33.78189468383789, - "logps/rejected": -34.36466598510742, - "loss": 0.0555, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.1510204076766968, - "rewards/margins": 1.8047173023223877, - "rewards/rejected": -0.6536968350410461, - "step": 1430 - }, - { - "epoch": 3.74, - "grad_norm": 16.5, - "learning_rate": 6.394742864787806e-08, - "logits/chosen": -1.977538824081421, - "logits/rejected": -1.979612112045288, - "logps/chosen": -32.0444450378418, - "logps/rejected": -34.551475524902344, - "loss": 0.1098, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0475465059280396, - "rewards/margins": 1.6699031591415405, - "rewards/rejected": -0.6223568320274353, - "step": 1440 - }, - { - "epoch": 3.77, - "grad_norm": 0.01312255859375, - "learning_rate": 5.183960310644748e-08, - "logits/chosen": -1.9596837759017944, - "logits/rejected": -1.9525411128997803, - "logps/chosen": -34.377952575683594, - "logps/rejected": -34.37388229370117, - "loss": 0.0864, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0713341236114502, - "rewards/margins": 1.6144660711288452, - "rewards/rejected": -0.5431317090988159, - "step": 1450 - }, - { - "epoch": 3.79, - "grad_norm": 9.6875, - "learning_rate": 4.098952823928693e-08, - "logits/chosen": -1.9263242483139038, - "logits/rejected": -1.9324493408203125, - "logps/chosen": -30.057865142822266, - "logps/rejected": -34.518531799316406, - "loss": 0.0742, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0397825241088867, - "rewards/margins": 1.8034608364105225, - "rewards/rejected": -0.7636784315109253, - "step": 1460 - }, - { - "epoch": 3.82, - "grad_norm": 11.9375, - "learning_rate": 3.1402778309014284e-08, - "logits/chosen": -1.8729248046875, - "logits/rejected": -1.8707479238510132, - "logps/chosen": -29.638717651367188, - "logps/rejected": -31.215627670288086, - "loss": 0.0943, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.0861549377441406, - "rewards/margins": 1.6454105377197266, - "rewards/rejected": -0.5592554211616516, - "step": 1470 - }, - { - "epoch": 3.84, - "grad_norm": 0.0, - "learning_rate": 2.3084278540791427e-08, - "logits/chosen": -2.048567295074463, - "logits/rejected": -2.042868137359619, - "logps/chosen": -34.20145797729492, - "logps/rejected": -32.80525588989258, - "loss": 0.0547, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.9754201173782349, - "rewards/margins": 1.649107575416565, - "rewards/rejected": -0.673687756061554, - "step": 1480 - }, - { - "epoch": 3.87, - "grad_norm": 24.5, - "learning_rate": 1.6038302591975807e-08, - "logits/chosen": -1.939672827720642, - "logits/rejected": -1.9418550729751587, - "logps/chosen": -27.397912979125977, - "logps/rejected": -28.60453224182129, - "loss": 0.1225, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.9706217050552368, - "rewards/margins": 1.4946484565734863, - "rewards/rejected": -0.5240268111228943, - "step": 1490 - }, - { - "epoch": 3.9, - "grad_norm": 15.5625, - "learning_rate": 1.0268470356514237e-08, - "logits/chosen": -1.9486587047576904, - "logits/rejected": -1.9430547952651978, - "logps/chosen": -31.683019638061523, - "logps/rejected": -32.714141845703125, - "loss": 0.0724, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0059928894042969, - "rewards/margins": 1.6766607761383057, - "rewards/rejected": -0.6706677675247192, - "step": 1500 - }, - { - "epoch": 3.9, - "eval_logits/chosen": -2.204876184463501, - "eval_logits/rejected": -2.200045108795166, - "eval_logps/chosen": -34.13526153564453, - "eval_logps/rejected": -37.724769592285156, - "eval_loss": 0.9175900816917419, - "eval_rewards/accuracies": 0.5627076625823975, - "eval_rewards/chosen": -0.09064043313264847, - "eval_rewards/margins": 0.09669183194637299, - "eval_rewards/rejected": -0.18733225762844086, - "eval_runtime": 145.3111, - "eval_samples_per_second": 2.36, - "eval_steps_per_second": 0.296, - "step": 1500 - }, - { - "epoch": 3.92, - "grad_norm": 18.625, - "learning_rate": 5.777746105209147e-09, - "logits/chosen": -1.87344491481781, - "logits/rejected": -1.8777220249176025, - "logps/chosen": -33.06303405761719, - "logps/rejected": -35.00675964355469, - "loss": 0.1331, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.9971674084663391, - "rewards/margins": 1.5339066982269287, - "rewards/rejected": -0.5367392301559448, - "step": 1510 - }, - { - "epoch": 3.95, - "grad_norm": 6.90625, - "learning_rate": 2.5684369628148352e-09, - "logits/chosen": -1.9323539733886719, - "logits/rejected": -1.9306873083114624, - "logps/chosen": -29.582763671875, - "logps/rejected": -33.175575256347656, - "loss": 0.1185, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.0362732410430908, - "rewards/margins": 1.5635396242141724, - "rewards/rejected": -0.5272663831710815, - "step": 1520 - }, - { - "epoch": 3.97, - "grad_norm": 18.875, - "learning_rate": 6.421917227455999e-10, - "logits/chosen": -2.030973434448242, - "logits/rejected": -2.0232560634613037, - "logps/chosen": -27.034753799438477, - "logps/rejected": -29.34645652770996, - "loss": 0.0492, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9253705143928528, - "rewards/margins": 1.5327354669570923, - "rewards/rejected": -0.6073648929595947, - "step": 1530 - }, - { - "epoch": 4.0, - "grad_norm": 14.4375, - "learning_rate": 0.0, - "logits/chosen": -1.9311290979385376, - "logits/rejected": -1.9209285974502563, - "logps/chosen": -31.977828979492188, - "logps/rejected": -36.17755126953125, - "loss": 0.0394, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.979533851146698, - "rewards/margins": 1.6692752838134766, - "rewards/rejected": -0.689741313457489, - "step": 1540 - }, - { - "epoch": 4.0, - "step": 1540, + "epoch": 1.0, + "step": 385, "total_flos": 0.0, - "train_loss": 0.15107146371881683, - "train_runtime": 10740.8737, - "train_samples_per_second": 1.147, - "train_steps_per_second": 0.143 + "train_loss": 0.6175476637753573, + "train_runtime": 3252.7839, + "train_samples_per_second": 0.947, + "train_steps_per_second": 0.118 } ], "logging_steps": 10, - "max_steps": 1540, + "max_steps": 385, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4,