{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 100, "global_step": 1540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.866413950920105, "logits/rejected": -1.8707411289215088, "logps/chosen": -36.98916244506836, "logps/rejected": -33.67436981201172, "loss": 0.6701, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.01569323241710663, "rewards/margins": 0.05555717274546623, "rewards/rejected": -0.039863936603069305, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9979650974273682, "logits/rejected": -2.0006086826324463, "logps/chosen": -29.624820709228516, "logps/rejected": -29.0762939453125, "loss": 0.6837, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.01563635841012001, "rewards/margins": 0.027204299345612526, "rewards/rejected": -0.01156794372946024, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.921021819114685, "logits/rejected": -1.9183374643325806, "logps/chosen": -31.40532875061035, "logps/rejected": -33.23241424560547, "loss": 0.6877, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00968973059207201, "rewards/margins": 0.022251319140195847, "rewards/rejected": -0.012561586685478687, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.0176353454589844, "logits/rejected": -2.008906364440918, "logps/chosen": -32.574256896972656, "logps/rejected": -32.53368377685547, "loss": 0.6874, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0022967704571783543, "rewards/margins": 0.02120940014719963, "rewards/rejected": -0.018912632018327713, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8619186878204346, "logits/rejected": -1.85114324092865, "logps/chosen": -33.55537414550781, "logps/rejected": -35.45675277709961, "loss": 0.6957, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.001892436295747757, "rewards/margins": 0.005858602002263069, "rewards/rejected": -0.003966164775192738, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9400945901870728, "logits/rejected": -1.9420464038848877, "logps/chosen": -32.56509780883789, "logps/rejected": -33.2406120300293, "loss": 0.6632, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.031578924506902695, "rewards/margins": 0.09388783574104309, "rewards/rejected": -0.062308914959430695, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.0712790489196777, "logits/rejected": -2.0762436389923096, "logps/chosen": -33.981910705566406, "logps/rejected": -36.62363815307617, "loss": 0.6833, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.005918038543313742, "rewards/margins": 0.05520814657211304, "rewards/rejected": -0.04929010197520256, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9327905178070068, "logits/rejected": -1.935909628868103, "logps/chosen": -34.32685470581055, "logps/rejected": -34.65606689453125, "loss": 0.639, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.09085920453071594, "rewards/margins": 0.14815348386764526, "rewards/rejected": -0.057294271886348724, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9414918422698975, "logits/rejected": -1.946007490158081, "logps/chosen": -32.406803131103516, "logps/rejected": -32.36021041870117, "loss": 0.6792, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.054556868970394135, "rewards/margins": 0.05573350936174393, "rewards/rejected": -0.0011766403913497925, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.039034128189087, "logits/rejected": -2.0370402336120605, "logps/chosen": -32.172786712646484, "logps/rejected": -31.333194732666016, "loss": 0.6464, "rewards/accuracies": 0.625, "rewards/chosen": 0.06124376505613327, "rewards/margins": 0.12152798473834991, "rewards/rejected": -0.06028420478105545, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.2339773178100586, "eval_logits/rejected": -2.229137420654297, "eval_logps/chosen": -34.04054641723633, "eval_logps/rejected": -37.549957275390625, "eval_loss": 0.6902773976325989, "eval_rewards/accuracies": 0.5685215592384338, "eval_rewards/chosen": -0.005393954925239086, "eval_rewards/margins": 0.024608083069324493, "eval_rewards/rejected": -0.030002037063241005, "eval_runtime": 146.034, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.994192123413086, "logits/rejected": -1.9918158054351807, "logps/chosen": -33.142940521240234, "logps/rejected": -34.01188278198242, "loss": 0.6911, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.09078876674175262, "rewards/margins": 0.07505009323358536, "rewards/rejected": 0.015738680958747864, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.0053954124450684, "logits/rejected": -1.997046709060669, "logps/chosen": -32.33894348144531, "logps/rejected": -32.1308708190918, "loss": 0.6746, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09536493569612503, "rewards/margins": 0.06779730319976807, "rewards/rejected": 0.027567636221647263, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.0336387157440186, "logits/rejected": -2.025650978088379, "logps/chosen": -30.345691680908203, "logps/rejected": -32.078697204589844, "loss": 0.6527, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.11702337116003036, "rewards/margins": 0.14014457166194916, "rewards/rejected": -0.023121213540434837, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9642337560653687, "logits/rejected": -1.9744552373886108, "logps/chosen": -31.243911743164062, "logps/rejected": -32.590267181396484, "loss": 0.6171, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1581769436597824, "rewards/margins": 0.20802685618400574, "rewards/rejected": -0.04984992742538452, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.876604437828064, "logits/rejected": -1.8777605295181274, "logps/chosen": -33.938690185546875, "logps/rejected": -34.807891845703125, "loss": 0.6043, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.22860188782215118, "rewards/margins": 0.2741745412349701, "rewards/rejected": -0.0455726757645607, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.9282041788101196, "logits/rejected": -1.9247684478759766, "logps/chosen": -36.02125930786133, "logps/rejected": -32.71831130981445, "loss": 0.6454, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.13537634909152985, "rewards/margins": 0.13137592375278473, "rewards/rejected": 0.004000450484454632, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.029125928878784, "logits/rejected": -2.0217747688293457, "logps/chosen": -33.49839401245117, "logps/rejected": -31.400177001953125, "loss": 0.5828, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.26951926946640015, "rewards/margins": 0.3130132555961609, "rewards/rejected": -0.04349397122859955, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.0355944633483887, "logits/rejected": -2.040832042694092, "logps/chosen": -32.235923767089844, "logps/rejected": -32.460418701171875, "loss": 0.5943, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2779761850833893, "rewards/margins": 0.2557251751422882, "rewards/rejected": 0.02225096896290779, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.0362112522125244, "logits/rejected": -2.0334599018096924, "logps/chosen": -31.269250869750977, "logps/rejected": -31.325435638427734, "loss": 0.6245, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.19773444533348083, "rewards/margins": 0.20423230528831482, "rewards/rejected": -0.0064978525042533875, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.9060389995574951, "logits/rejected": -1.9106788635253906, "logps/chosen": -31.306299209594727, "logps/rejected": -32.81407165527344, "loss": 0.5931, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2699825167655945, "rewards/margins": 0.2908058166503906, "rewards/rejected": -0.02082330361008644, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.231553792953491, "eval_logits/rejected": -2.2267112731933594, "eval_logps/chosen": -34.07304763793945, "eval_logps/rejected": -37.57693862915039, "eval_loss": 0.6979728937149048, "eval_rewards/accuracies": 0.5157807469367981, "eval_rewards/chosen": -0.03464451804757118, "eval_rewards/margins": 0.019641490653157234, "eval_rewards/rejected": -0.054286014288663864, "eval_runtime": 145.8095, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.018519163131714, "logits/rejected": -2.0291810035705566, "logps/chosen": -31.742992401123047, "logps/rejected": -33.946937561035156, "loss": 0.5902, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2054794579744339, "rewards/margins": 0.2812942862510681, "rewards/rejected": -0.07581482082605362, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.911586046218872, "logits/rejected": -1.9263393878936768, "logps/chosen": -29.84616470336914, "logps/rejected": -31.615009307861328, "loss": 0.5879, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.23883743584156036, "rewards/margins": 0.2899848222732544, "rewards/rejected": -0.051147449761629105, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9677941799163818, "logits/rejected": -1.9717823266983032, "logps/chosen": -33.100074768066406, "logps/rejected": -31.62213134765625, "loss": 0.5748, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.28565075993537903, "rewards/margins": 0.3511958718299866, "rewards/rejected": -0.06554517149925232, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9661725759506226, "logits/rejected": -1.944300651550293, "logps/chosen": -33.841453552246094, "logps/rejected": -35.11375045776367, "loss": 0.5473, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.2810631990432739, "rewards/margins": 0.4277234673500061, "rewards/rejected": -0.14666026830673218, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.007416009902954, "logits/rejected": -2.0040948390960693, "logps/chosen": -32.70330810546875, "logps/rejected": -36.29412841796875, "loss": 0.5992, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1995842456817627, "rewards/margins": 0.2618715763092041, "rewards/rejected": -0.06228730082511902, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8749721050262451, "logits/rejected": -1.8725513219833374, "logps/chosen": -34.00068664550781, "logps/rejected": -35.53888702392578, "loss": 0.6254, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16894161701202393, "rewards/margins": 0.1997825801372528, "rewards/rejected": -0.030840963125228882, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8600317239761353, "logits/rejected": -1.8576066493988037, "logps/chosen": -34.1875, "logps/rejected": -31.8159122467041, "loss": 0.616, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1890900433063507, "rewards/margins": 0.22921428084373474, "rewards/rejected": -0.04012420028448105, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.9631398916244507, "logits/rejected": -1.9526073932647705, "logps/chosen": -35.023719787597656, "logps/rejected": -31.869693756103516, "loss": 0.5782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.29963088035583496, "rewards/margins": 0.32546472549438477, "rewards/rejected": -0.025833839550614357, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.0582926273345947, "logits/rejected": -2.0433640480041504, "logps/chosen": -30.733753204345703, "logps/rejected": -32.67460632324219, "loss": 0.6392, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.17133468389511108, "rewards/margins": 0.19182677567005157, "rewards/rejected": -0.020492086187005043, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.929610013961792, "logits/rejected": -1.9270601272583008, "logps/chosen": -32.42620086669922, "logps/rejected": -30.873455047607422, "loss": 0.5301, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.450817346572876, "rewards/margins": 0.5018006563186646, "rewards/rejected": -0.050983332097530365, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.229154348373413, "eval_logits/rejected": -2.2243051528930664, "eval_logps/chosen": -34.09621810913086, "eval_logps/rejected": -37.59999084472656, "eval_loss": 0.6972895860671997, "eval_rewards/accuracies": 0.5390365719795227, "eval_rewards/chosen": -0.05550166219472885, "eval_rewards/margins": 0.019528048112988472, "eval_rewards/rejected": -0.07502970844507217, "eval_runtime": 145.7792, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "grad_norm": 11.75, "learning_rate": 4.84533120650964e-06, "logits/chosen": -2.0636165142059326, "logits/rejected": -2.0508041381835938, "logps/chosen": -32.113487243652344, "logps/rejected": -32.89537811279297, "loss": 0.4684, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.3734120726585388, "rewards/margins": 0.6023003458976746, "rewards/rejected": -0.2288883477449417, "step": 310 }, { "epoch": 0.83, "grad_norm": 10.875, "learning_rate": 4.825108134172131e-06, "logits/chosen": -1.9748560190200806, "logits/rejected": -1.9662139415740967, "logps/chosen": -31.80029296875, "logps/rejected": -30.449291229248047, "loss": 0.463, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.49922245740890503, "rewards/margins": 0.6863331198692322, "rewards/rejected": -0.18711069226264954, "step": 320 }, { "epoch": 0.86, "grad_norm": 12.875, "learning_rate": 4.80369052967602e-06, "logits/chosen": -1.910094976425171, "logits/rejected": -1.9221827983856201, "logps/chosen": -29.87582778930664, "logps/rejected": -33.66598129272461, "loss": 0.4237, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.5430954694747925, "rewards/margins": 0.7804504632949829, "rewards/rejected": -0.23735502362251282, "step": 330 }, { "epoch": 0.88, "grad_norm": 18.375, "learning_rate": 4.781089396387968e-06, "logits/chosen": -1.8735644817352295, "logits/rejected": -1.8643461465835571, "logps/chosen": -34.02741241455078, "logps/rejected": -36.179935455322266, "loss": 0.4051, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6043617725372314, "rewards/margins": 0.89524906873703, "rewards/rejected": -0.29088738560676575, "step": 340 }, { "epoch": 0.91, "grad_norm": 12.25, "learning_rate": 4.757316345716554e-06, "logits/chosen": -1.9254121780395508, "logits/rejected": -1.9260631799697876, "logps/chosen": -33.68886947631836, "logps/rejected": -34.135963439941406, "loss": 0.4149, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.6447720527648926, "rewards/margins": 0.9030619859695435, "rewards/rejected": -0.2582899332046509, "step": 350 }, { "epoch": 0.94, "grad_norm": 14.9375, "learning_rate": 4.73238359114687e-06, "logits/chosen": -2.052173376083374, "logits/rejected": -2.0583267211914062, "logps/chosen": -31.09401512145996, "logps/rejected": -33.012630462646484, "loss": 0.4478, "rewards/accuracies": 0.875, "rewards/chosen": 0.43130677938461304, "rewards/margins": 0.7412186861038208, "rewards/rejected": -0.30991190671920776, "step": 360 }, { "epoch": 0.96, "grad_norm": 28.0, "learning_rate": 4.706303941965804e-06, "logits/chosen": -1.9808381795883179, "logits/rejected": -1.9804092645645142, "logps/chosen": -32.843143463134766, "logps/rejected": -36.34934616088867, "loss": 0.4406, "rewards/accuracies": 0.875, "rewards/chosen": 0.5416162610054016, "rewards/margins": 0.8112291097640991, "rewards/rejected": -0.2696128487586975, "step": 370 }, { "epoch": 0.99, "grad_norm": 10.5, "learning_rate": 4.679090796681225e-06, "logits/chosen": -2.012341022491455, "logits/rejected": -2.0077567100524902, "logps/chosen": -30.083026885986328, "logps/rejected": -29.55636215209961, "loss": 0.411, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.5648170709609985, "rewards/margins": 0.873041033744812, "rewards/rejected": -0.30822402238845825, "step": 380 }, { "epoch": 1.01, "grad_norm": 13.75, "learning_rate": 4.650758136138454e-06, "logits/chosen": -1.784257173538208, "logits/rejected": -1.7905915975570679, "logps/chosen": -31.67917823791504, "logps/rejected": -36.660545349121094, "loss": 0.3793, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.635884165763855, "rewards/margins": 1.1239113807678223, "rewards/rejected": -0.4880271553993225, "step": 390 }, { "epoch": 1.04, "grad_norm": 10.625, "learning_rate": 4.621320516337559e-06, "logits/chosen": -1.937954306602478, "logits/rejected": -1.931673288345337, "logps/chosen": -33.02653121948242, "logps/rejected": -32.67657470703125, "loss": 0.389, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7255499958992004, "rewards/margins": 1.0564748048782349, "rewards/rejected": -0.33092474937438965, "step": 400 }, { "epoch": 1.04, "eval_logits/chosen": -2.2031846046447754, "eval_logits/rejected": -2.198335647583008, "eval_logps/chosen": -34.16795349121094, "eval_logps/rejected": -37.72208786010742, "eval_loss": 0.6933022737503052, "eval_rewards/accuracies": 0.550664484500885, "eval_rewards/chosen": -0.12006273865699768, "eval_rewards/margins": 0.06485801190137863, "eval_rewards/rejected": -0.1849207729101181, "eval_runtime": 145.5175, "eval_samples_per_second": 2.357, "eval_steps_per_second": 0.295, "step": 400 }, { "epoch": 1.06, "grad_norm": 13.375, "learning_rate": 4.590793060955158e-06, "logits/chosen": -1.9355392456054688, "logits/rejected": -1.9427944421768188, "logps/chosen": -28.549734115600586, "logps/rejected": -29.689483642578125, "loss": 0.3677, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5135748982429504, "rewards/margins": 1.0457103252410889, "rewards/rejected": -0.5321354866027832, "step": 410 }, { "epoch": 1.09, "grad_norm": 11.375, "learning_rate": 4.559191453574582e-06, "logits/chosen": -1.9551494121551514, "logits/rejected": -1.9541441202163696, "logps/chosen": -33.60606002807617, "logps/rejected": -31.240774154663086, "loss": 0.4587, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.5801833868026733, "rewards/margins": 0.8481400609016418, "rewards/rejected": -0.2679567039012909, "step": 420 }, { "epoch": 1.12, "grad_norm": 15.125, "learning_rate": 4.52653192962838e-06, "logits/chosen": -1.9499473571777344, "logits/rejected": -1.932682752609253, "logps/chosen": -30.44845199584961, "logps/rejected": -33.599735260009766, "loss": 0.3784, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.4758743345737457, "rewards/margins": 1.0819809436798096, "rewards/rejected": -0.6061066389083862, "step": 430 }, { "epoch": 1.14, "grad_norm": 14.1875, "learning_rate": 4.492831268057307e-06, "logits/chosen": -1.9813868999481201, "logits/rejected": -1.9835193157196045, "logps/chosen": -35.81322479248047, "logps/rejected": -35.490821838378906, "loss": 0.313, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7192217111587524, "rewards/margins": 1.2869927883148193, "rewards/rejected": -0.5677711367607117, "step": 440 }, { "epoch": 1.17, "grad_norm": 12.3125, "learning_rate": 4.458106782690094e-06, "logits/chosen": -2.05714750289917, "logits/rejected": -2.0569212436676025, "logps/chosen": -31.911890029907227, "logps/rejected": -34.006473541259766, "loss": 0.3759, "rewards/accuracies": 0.875, "rewards/chosen": 0.6493128538131714, "rewards/margins": 1.0510342121124268, "rewards/rejected": -0.40172141790390015, "step": 450 }, { "epoch": 1.19, "grad_norm": 11.9375, "learning_rate": 4.422376313348405e-06, "logits/chosen": -2.001530885696411, "logits/rejected": -1.9940494298934937, "logps/chosen": -31.56293296813965, "logps/rejected": -36.774314880371094, "loss": 0.3025, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7587249279022217, "rewards/margins": 1.4069726467132568, "rewards/rejected": -0.6482478380203247, "step": 460 }, { "epoch": 1.22, "grad_norm": 20.375, "learning_rate": 4.3856582166815696e-06, "logits/chosen": -1.9044301509857178, "logits/rejected": -1.900957465171814, "logps/chosen": -33.156280517578125, "logps/rejected": -33.48976516723633, "loss": 0.3626, "rewards/accuracies": 0.875, "rewards/chosen": 0.6984156966209412, "rewards/margins": 1.2077140808105469, "rewards/rejected": -0.5092984437942505, "step": 470 }, { "epoch": 1.25, "grad_norm": 14.0625, "learning_rate": 4.347971356735789e-06, "logits/chosen": -2.0298960208892822, "logits/rejected": -2.0229077339172363, "logps/chosen": -30.285165786743164, "logps/rejected": -32.622947692871094, "loss": 0.4059, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.48903241753578186, "rewards/margins": 1.0555496215820312, "rewards/rejected": -0.566517174243927, "step": 480 }, { "epoch": 1.27, "grad_norm": 9.4375, "learning_rate": 4.309335095262675e-06, "logits/chosen": -1.9742801189422607, "logits/rejected": -1.975847840309143, "logps/chosen": -34.49077606201172, "logps/rejected": -34.25959396362305, "loss": 0.3167, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.7761706113815308, "rewards/margins": 1.3324440717697144, "rewards/rejected": -0.5562735795974731, "step": 490 }, { "epoch": 1.3, "grad_norm": 20.25, "learning_rate": 4.269769281772082e-06, "logits/chosen": -1.8631584644317627, "logits/rejected": -1.860769510269165, "logps/chosen": -32.35368347167969, "logps/rejected": -37.259803771972656, "loss": 0.322, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.7367446422576904, "rewards/margins": 1.452678918838501, "rewards/rejected": -0.715934157371521, "step": 500 }, { "epoch": 1.3, "eval_logits/chosen": -2.201748847961426, "eval_logits/rejected": -2.1969025135040283, "eval_logps/chosen": -34.34733200073242, "eval_logps/rejected": -37.91175842285156, "eval_loss": 0.7055429816246033, "eval_rewards/accuracies": 0.5514950156211853, "eval_rewards/chosen": -0.281506210565567, "eval_rewards/margins": 0.07411985099315643, "eval_rewards/rejected": -0.35562604665756226, "eval_runtime": 145.2329, "eval_samples_per_second": 2.362, "eval_steps_per_second": 0.296, "step": 500 }, { "epoch": 1.32, "grad_norm": 14.9375, "learning_rate": 4.22929424333435e-06, "logits/chosen": -1.9645631313323975, "logits/rejected": -1.9692989587783813, "logps/chosen": -32.608741760253906, "logps/rejected": -32.16291809082031, "loss": 0.3561, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7637556791305542, "rewards/margins": 1.2288849353790283, "rewards/rejected": -0.46512943506240845, "step": 510 }, { "epoch": 1.35, "grad_norm": 14.375, "learning_rate": 4.1879307741372085e-06, "logits/chosen": -1.9946361780166626, "logits/rejected": -2.005610466003418, "logps/chosen": -30.554733276367188, "logps/rejected": -32.16284942626953, "loss": 0.358, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.7521130442619324, "rewards/margins": 1.3417600393295288, "rewards/rejected": -0.589647114276886, "step": 520 }, { "epoch": 1.38, "grad_norm": 6.53125, "learning_rate": 4.145700124802693e-06, "logits/chosen": -1.9223169088363647, "logits/rejected": -1.9189828634262085, "logps/chosen": -31.747081756591797, "logps/rejected": -33.04930877685547, "loss": 0.349, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6060883402824402, "rewards/margins": 1.2239677906036377, "rewards/rejected": -0.6178793907165527, "step": 530 }, { "epoch": 1.4, "grad_norm": 11.0, "learning_rate": 4.102623991469562e-06, "logits/chosen": -1.7875115871429443, "logits/rejected": -1.7967026233673096, "logps/chosen": -31.79451560974121, "logps/rejected": -32.5256233215332, "loss": 0.3606, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.697385847568512, "rewards/margins": 1.3000586032867432, "rewards/rejected": -0.602672815322876, "step": 540 }, { "epoch": 1.43, "grad_norm": 13.1875, "learning_rate": 4.058724504646834e-06, "logits/chosen": -1.8841272592544556, "logits/rejected": -1.8778330087661743, "logps/chosen": -32.83342742919922, "logps/rejected": -31.528995513916016, "loss": 0.3722, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.7873939275741577, "rewards/margins": 1.2712757587432861, "rewards/rejected": -0.48388180136680603, "step": 550 }, { "epoch": 1.45, "grad_norm": 10.125, "learning_rate": 4.014024217844167e-06, "logits/chosen": -1.9727208614349365, "logits/rejected": -1.9707006216049194, "logps/chosen": -33.61963653564453, "logps/rejected": -31.988178253173828, "loss": 0.3617, "rewards/accuracies": 0.875, "rewards/chosen": 0.8564074635505676, "rewards/margins": 1.298119306564331, "rewards/rejected": -0.4417116641998291, "step": 560 }, { "epoch": 1.48, "grad_norm": 12.625, "learning_rate": 3.968546095984911e-06, "logits/chosen": -1.8058189153671265, "logits/rejected": -1.8036988973617554, "logps/chosen": -31.945751190185547, "logps/rejected": -31.496994018554688, "loss": 0.3948, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.7655945420265198, "rewards/margins": 1.210815191268921, "rewards/rejected": -0.44522079825401306, "step": 570 }, { "epoch": 1.51, "grad_norm": 13.125, "learning_rate": 3.922313503607806e-06, "logits/chosen": -1.9404058456420898, "logits/rejected": -1.9370429515838623, "logps/chosen": -30.20537757873535, "logps/rejected": -35.37580871582031, "loss": 0.32, "rewards/accuracies": 0.875, "rewards/chosen": 0.8093468546867371, "rewards/margins": 1.4840278625488281, "rewards/rejected": -0.6746810078620911, "step": 580 }, { "epoch": 1.53, "grad_norm": 9.6875, "learning_rate": 3.875350192863368e-06, "logits/chosen": -1.8791511058807373, "logits/rejected": -1.8827145099639893, "logps/chosen": -28.8929386138916, "logps/rejected": -31.133419036865234, "loss": 0.3795, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6498847603797913, "rewards/margins": 1.0769810676574707, "rewards/rejected": -0.4270961880683899, "step": 590 }, { "epoch": 1.56, "grad_norm": 15.0625, "learning_rate": 3.8276802913111436e-06, "logits/chosen": -1.915834665298462, "logits/rejected": -1.9157018661499023, "logps/chosen": -31.168197631835938, "logps/rejected": -31.740047454833984, "loss": 0.327, "rewards/accuracies": 0.875, "rewards/chosen": 0.9480020403862, "rewards/margins": 1.4403297901153564, "rewards/rejected": -0.49232783913612366, "step": 600 }, { "epoch": 1.56, "eval_logits/chosen": -2.1867270469665527, "eval_logits/rejected": -2.1818978786468506, "eval_logps/chosen": -34.19493103027344, "eval_logps/rejected": -37.843685150146484, "eval_loss": 0.6703336834907532, "eval_rewards/accuracies": 0.5805647969245911, "eval_rewards/chosen": -0.14434270560741425, "eval_rewards/margins": 0.15001599490642548, "eval_rewards/rejected": -0.2943587005138397, "eval_runtime": 145.4291, "eval_samples_per_second": 2.359, "eval_steps_per_second": 0.296, "step": 600 }, { "epoch": 1.58, "grad_norm": 13.75, "learning_rate": 3.7793282895240927e-06, "logits/chosen": -1.9787023067474365, "logits/rejected": -1.9794394969940186, "logps/chosen": -33.91895294189453, "logps/rejected": -33.77281951904297, "loss": 0.2961, "rewards/accuracies": 0.875, "rewards/chosen": 1.0000724792480469, "rewards/margins": 1.6198437213897705, "rewards/rejected": -0.6197710037231445, "step": 610 }, { "epoch": 1.61, "grad_norm": 10.25, "learning_rate": 3.730319028506478e-06, "logits/chosen": -1.9425113201141357, "logits/rejected": -1.9399850368499756, "logps/chosen": -32.121673583984375, "logps/rejected": -32.70948791503906, "loss": 0.3125, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9033487439155579, "rewards/margins": 1.4789619445800781, "rewards/rejected": -0.5756131410598755, "step": 620 }, { "epoch": 1.64, "grad_norm": 30.75, "learning_rate": 3.6806776869317074e-06, "logits/chosen": -1.953460693359375, "logits/rejected": -1.9444434642791748, "logps/chosen": -31.709529876708984, "logps/rejected": -31.5223388671875, "loss": 0.3619, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.9599300622940063, "rewards/margins": 1.488205909729004, "rewards/rejected": -0.5282759070396423, "step": 630 }, { "epoch": 1.66, "grad_norm": 17.25, "learning_rate": 3.6304297682067146e-06, "logits/chosen": -1.9538257122039795, "logits/rejected": -1.9505916833877563, "logps/chosen": -31.257614135742188, "logps/rejected": -33.02531814575195, "loss": 0.3186, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8467265367507935, "rewards/margins": 1.3920328617095947, "rewards/rejected": -0.5453063249588013, "step": 640 }, { "epoch": 1.69, "grad_norm": 9.4375, "learning_rate": 3.579601087369492e-06, "logits/chosen": -1.9619709253311157, "logits/rejected": -1.9643146991729736, "logps/chosen": -32.65868377685547, "logps/rejected": -34.36846160888672, "loss": 0.2785, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8955224752426147, "rewards/margins": 1.512078881263733, "rewards/rejected": -0.6165562868118286, "step": 650 }, { "epoch": 1.71, "grad_norm": 19.625, "learning_rate": 3.5282177578265295e-06, "logits/chosen": -1.8742033243179321, "logits/rejected": -1.8746894598007202, "logps/chosen": -32.91667175292969, "logps/rejected": -32.129493713378906, "loss": 0.328, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.9271462559700012, "rewards/margins": 1.3971627950668335, "rewards/rejected": -0.4700165390968323, "step": 660 }, { "epoch": 1.74, "grad_norm": 14.1875, "learning_rate": 3.476306177936961e-06, "logits/chosen": -1.9133888483047485, "logits/rejected": -1.9037227630615234, "logps/chosen": -32.933998107910156, "logps/rejected": -33.00373077392578, "loss": 0.2651, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.9605558514595032, "rewards/margins": 1.586902379989624, "rewards/rejected": -0.6263464689254761, "step": 670 }, { "epoch": 1.77, "grad_norm": 10.1875, "learning_rate": 3.423893017450324e-06, "logits/chosen": -1.8113712072372437, "logits/rejected": -1.8081716299057007, "logps/chosen": -30.349456787109375, "logps/rejected": -34.96870422363281, "loss": 0.2808, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0003511905670166, "rewards/margins": 1.5854613780975342, "rewards/rejected": -0.5851101875305176, "step": 680 }, { "epoch": 1.79, "grad_norm": 8.4375, "learning_rate": 3.3710052038048794e-06, "logits/chosen": -1.8722549676895142, "logits/rejected": -1.8714803457260132, "logps/chosen": -34.03639602661133, "logps/rejected": -36.12696075439453, "loss": 0.2342, "rewards/accuracies": 0.9375, "rewards/chosen": 1.110505223274231, "rewards/margins": 1.8063312768936157, "rewards/rejected": -0.6958259344100952, "step": 690 }, { "epoch": 1.82, "grad_norm": 6.90625, "learning_rate": 3.3176699082935546e-06, "logits/chosen": -1.8479416370391846, "logits/rejected": -1.8507680892944336, "logps/chosen": -31.403573989868164, "logps/rejected": -36.341434478759766, "loss": 0.3034, "rewards/accuracies": 0.875, "rewards/chosen": 1.079347014427185, "rewards/margins": 1.7091315984725952, "rewards/rejected": -0.6297845840454102, "step": 700 }, { "epoch": 1.82, "eval_logits/chosen": -2.174877166748047, "eval_logits/rejected": -2.1700735092163086, "eval_logps/chosen": -34.240230560302734, "eval_logps/rejected": -37.869354248046875, "eval_loss": 0.6868197917938232, "eval_rewards/accuracies": 0.565614640712738, "eval_rewards/chosen": -0.18511110544204712, "eval_rewards/margins": 0.13234683871269226, "eval_rewards/rejected": -0.31745797395706177, "eval_runtime": 145.1401, "eval_samples_per_second": 2.363, "eval_steps_per_second": 0.296, "step": 700 }, { "epoch": 1.84, "grad_norm": 15.9375, "learning_rate": 3.2639145321045933e-06, "logits/chosen": -1.9555838108062744, "logits/rejected": -1.958298683166504, "logps/chosen": -33.82456970214844, "logps/rejected": -34.84859085083008, "loss": 0.3553, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.9639171361923218, "rewards/margins": 1.4845099449157715, "rewards/rejected": -0.5205925703048706, "step": 710 }, { "epoch": 1.87, "grad_norm": 14.375, "learning_rate": 3.2097666922441107e-06, "logits/chosen": -1.8090355396270752, "logits/rejected": -1.8031476736068726, "logps/chosen": -33.71675491333008, "logps/rejected": -32.93245315551758, "loss": 0.323, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 1.0048266649246216, "rewards/margins": 1.527376413345337, "rewards/rejected": -0.5225496292114258, "step": 720 }, { "epoch": 1.9, "grad_norm": 7.46875, "learning_rate": 3.1552542073477554e-06, "logits/chosen": -1.9834911823272705, "logits/rejected": -1.9804458618164062, "logps/chosen": -29.5953426361084, "logps/rejected": -32.31734085083008, "loss": 0.2567, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9518804550170898, "rewards/margins": 1.7341070175170898, "rewards/rejected": -0.7822265028953552, "step": 730 }, { "epoch": 1.92, "grad_norm": 7.28125, "learning_rate": 3.100405083388799e-06, "logits/chosen": -1.8222984075546265, "logits/rejected": -1.8223203420639038, "logps/chosen": -32.201148986816406, "logps/rejected": -38.2669792175293, "loss": 0.2789, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.1358217000961304, "rewards/margins": 1.7110786437988281, "rewards/rejected": -0.5752568244934082, "step": 740 }, { "epoch": 1.95, "grad_norm": 6.75, "learning_rate": 3.0452474992899645e-06, "logits/chosen": -1.7060989141464233, "logits/rejected": -1.711395025253296, "logps/chosen": -35.7166748046875, "logps/rejected": -34.62081527709961, "loss": 0.3267, "rewards/accuracies": 0.875, "rewards/chosen": 1.09420907497406, "rewards/margins": 1.6778392791748047, "rewards/rejected": -0.5836302638053894, "step": 750 }, { "epoch": 1.97, "grad_norm": 9.0, "learning_rate": 2.989809792446417e-06, "logits/chosen": -1.9088201522827148, "logits/rejected": -1.910348892211914, "logps/chosen": -31.48809814453125, "logps/rejected": -33.39298629760742, "loss": 0.2839, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.005824327468872, "rewards/margins": 1.5582213401794434, "rewards/rejected": -0.5523970723152161, "step": 760 }, { "epoch": 2.0, "grad_norm": 14.625, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -1.882775068283081, "logits/rejected": -1.8818010091781616, "logps/chosen": -30.975149154663086, "logps/rejected": -35.45886993408203, "loss": 0.3076, "rewards/accuracies": 0.908333420753479, "rewards/chosen": 1.0804139375686646, "rewards/margins": 1.4680591821670532, "rewards/rejected": -0.3876451253890991, "step": 770 }, { "epoch": 2.03, "grad_norm": 6.53125, "learning_rate": 2.878208065043501e-06, "logits/chosen": -1.882063627243042, "logits/rejected": -1.8813728094100952, "logps/chosen": -33.322509765625, "logps/rejected": -32.38517379760742, "loss": 0.1884, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.193357229232788, "rewards/margins": 2.062441349029541, "rewards/rejected": -0.8690838813781738, "step": 780 }, { "epoch": 2.05, "grad_norm": 10.0625, "learning_rate": 2.8221013802485974e-06, "logits/chosen": -1.9113022089004517, "logits/rejected": -1.909850835800171, "logps/chosen": -28.325769424438477, "logps/rejected": -33.69379425048828, "loss": 0.1718, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1136726140975952, "rewards/margins": 2.0654101371765137, "rewards/rejected": -0.9517375826835632, "step": 790 }, { "epoch": 2.08, "grad_norm": 4.375, "learning_rate": 2.76582921478147e-06, "logits/chosen": -1.9627540111541748, "logits/rejected": -1.959313988685608, "logps/chosen": -31.032363891601562, "logps/rejected": -35.28049087524414, "loss": 0.1649, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2553597688674927, "rewards/margins": 2.2286696434020996, "rewards/rejected": -0.9733098745346069, "step": 800 }, { "epoch": 2.08, "eval_logits/chosen": -2.164193868637085, "eval_logits/rejected": -2.1594185829162598, "eval_logps/chosen": -34.28217315673828, "eval_logps/rejected": -37.94434356689453, "eval_loss": 0.6811564564704895, "eval_rewards/accuracies": 0.595099687576294, "eval_rewards/chosen": -0.22285737097263336, "eval_rewards/margins": 0.16209454834461212, "eval_rewards/rejected": -0.3849518895149231, "eval_runtime": 145.2348, "eval_samples_per_second": 2.362, "eval_steps_per_second": 0.296, "step": 800 }, { "epoch": 2.1, "grad_norm": 7.375, "learning_rate": 2.7094204786572254e-06, "logits/chosen": -1.793891191482544, "logits/rejected": -1.7861675024032593, "logps/chosen": -32.827354431152344, "logps/rejected": -35.588600158691406, "loss": 0.1606, "rewards/accuracies": 0.9375, "rewards/chosen": 1.465995192527771, "rewards/margins": 2.391630172729492, "rewards/rejected": -0.9256349802017212, "step": 810 }, { "epoch": 2.13, "grad_norm": 8.75, "learning_rate": 2.6529041520546072e-06, "logits/chosen": -1.8692048788070679, "logits/rejected": -1.879642128944397, "logps/chosen": -34.127784729003906, "logps/rejected": -33.262062072753906, "loss": 0.1935, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4111144542694092, "rewards/margins": 2.158674955368042, "rewards/rejected": -0.7475605607032776, "step": 820 }, { "epoch": 2.16, "grad_norm": 11.375, "learning_rate": 2.5963092704273302e-06, "logits/chosen": -1.918723702430725, "logits/rejected": -1.9233121871948242, "logps/chosen": -33.50402069091797, "logps/rejected": -30.132360458374023, "loss": 0.1956, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2588849067687988, "rewards/margins": 2.036588430404663, "rewards/rejected": -0.777703583240509, "step": 830 }, { "epoch": 2.18, "grad_norm": 6.65625, "learning_rate": 2.53966490958702e-06, "logits/chosen": -1.9170500040054321, "logits/rejected": -1.9250987768173218, "logps/chosen": -33.287925720214844, "logps/rejected": -30.88974952697754, "loss": 0.1648, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.350099802017212, "rewards/margins": 2.28794264793396, "rewards/rejected": -0.937842845916748, "step": 840 }, { "epoch": 2.21, "grad_norm": 9.1875, "learning_rate": 2.4830001707654135e-06, "logits/chosen": -1.8453495502471924, "logits/rejected": -1.835889458656311, "logps/chosen": -30.441539764404297, "logps/rejected": -32.993690490722656, "loss": 0.1814, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2164894342422485, "rewards/margins": 2.098388195037842, "rewards/rejected": -0.8818984031677246, "step": 850 }, { "epoch": 2.23, "grad_norm": 5.25, "learning_rate": 2.4263441656635054e-06, "logits/chosen": -1.983048677444458, "logits/rejected": -1.973232626914978, "logps/chosen": -24.832033157348633, "logps/rejected": -30.95058250427246, "loss": 0.1791, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2528350353240967, "rewards/margins": 2.2232518196105957, "rewards/rejected": -0.9704168438911438, "step": 860 }, { "epoch": 2.26, "grad_norm": 7.1875, "learning_rate": 2.3697260014953107e-06, "logits/chosen": -1.8357470035552979, "logits/rejected": -1.8370872735977173, "logps/chosen": -32.829017639160156, "logps/rejected": -30.976612091064453, "loss": 0.1644, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.353589653968811, "rewards/margins": 2.2482855319976807, "rewards/rejected": -0.8946956396102905, "step": 870 }, { "epoch": 2.29, "grad_norm": 6.46875, "learning_rate": 2.3131747660339396e-06, "logits/chosen": -1.8432958126068115, "logits/rejected": -1.8441736698150635, "logps/chosen": -31.128490447998047, "logps/rejected": -34.13156509399414, "loss": 0.2286, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1346783638000488, "rewards/margins": 2.107259750366211, "rewards/rejected": -0.9725813865661621, "step": 880 }, { "epoch": 2.31, "grad_norm": 6.4375, "learning_rate": 2.256719512667651e-06, "logits/chosen": -1.747079849243164, "logits/rejected": -1.7454869747161865, "logps/chosen": -34.19733810424805, "logps/rejected": -37.23557662963867, "loss": 0.1614, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3208175897598267, "rewards/margins": 2.543323040008545, "rewards/rejected": -1.2225055694580078, "step": 890 }, { "epoch": 2.34, "grad_norm": 6.0, "learning_rate": 2.2003892454735786e-06, "logits/chosen": -1.8969894647598267, "logits/rejected": -1.8902244567871094, "logps/chosen": -30.53342056274414, "logps/rejected": -33.870323181152344, "loss": 0.1691, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3222969770431519, "rewards/margins": 2.3233275413513184, "rewards/rejected": -1.0010308027267456, "step": 900 }, { "epoch": 2.34, "eval_logits/chosen": -2.15236496925354, "eval_logits/rejected": -2.1475820541381836, "eval_logps/chosen": -34.313846588134766, "eval_logps/rejected": -37.98138427734375, "eval_loss": 0.6880638599395752, "eval_rewards/accuracies": 0.5830564498901367, "eval_rewards/chosen": -0.2513664662837982, "eval_rewards/margins": 0.16692043840885162, "eval_rewards/rejected": -0.41828688979148865, "eval_runtime": 145.3621, "eval_samples_per_second": 2.36, "eval_steps_per_second": 0.296, "step": 900 }, { "epoch": 2.36, "grad_norm": 12.0625, "learning_rate": 2.1442129043167877e-06, "logits/chosen": -1.9361755847930908, "logits/rejected": -1.9315325021743774, "logps/chosen": -32.13105010986328, "logps/rejected": -35.70573425292969, "loss": 0.1596, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2839645147323608, "rewards/margins": 2.3433659076690674, "rewards/rejected": -1.0594011545181274, "step": 910 }, { "epoch": 2.39, "grad_norm": 6.5, "learning_rate": 2.088219349982323e-06, "logits/chosen": -1.8552948236465454, "logits/rejected": -1.860346794128418, "logps/chosen": -33.7907829284668, "logps/rejected": -33.559757232666016, "loss": 0.1777, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4020235538482666, "rewards/margins": 2.3558895587921143, "rewards/rejected": -0.9538658261299133, "step": 920 }, { "epoch": 2.42, "grad_norm": 6.53125, "learning_rate": 2.0324373493478803e-06, "logits/chosen": -1.953168272972107, "logits/rejected": -1.943918228149414, "logps/chosen": -30.902240753173828, "logps/rejected": -35.58992385864258, "loss": 0.1581, "rewards/accuracies": 1.0, "rewards/chosen": 1.2444217205047607, "rewards/margins": 2.276834011077881, "rewards/rejected": -1.032412052154541, "step": 930 }, { "epoch": 2.44, "grad_norm": 8.5, "learning_rate": 1.976895560604729e-06, "logits/chosen": -1.879119873046875, "logits/rejected": -1.8759711980819702, "logps/chosen": -30.023090362548828, "logps/rejected": -33.454833984375, "loss": 0.2017, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1388328075408936, "rewards/margins": 2.029343366622925, "rewards/rejected": -0.8905106782913208, "step": 940 }, { "epoch": 2.47, "grad_norm": 4.625, "learning_rate": 1.921622518534466e-06, "logits/chosen": -1.8175594806671143, "logits/rejected": -1.8249561786651611, "logps/chosen": -31.570093154907227, "logps/rejected": -36.65325164794922, "loss": 0.2168, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.1708660125732422, "rewards/margins": 2.163855791091919, "rewards/rejected": -0.9929895401000977, "step": 950 }, { "epoch": 2.49, "grad_norm": 3.953125, "learning_rate": 1.8666466198491794e-06, "logits/chosen": -1.8278528451919556, "logits/rejected": -1.8209831714630127, "logps/chosen": -32.37910842895508, "logps/rejected": -37.90230178833008, "loss": 0.1494, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3044663667678833, "rewards/margins": 2.4776108264923096, "rewards/rejected": -1.1731446981430054, "step": 960 }, { "epoch": 2.52, "grad_norm": 5.375, "learning_rate": 1.8119961086025376e-06, "logits/chosen": -1.8297231197357178, "logits/rejected": -1.8303101062774658, "logps/chosen": -29.768722534179688, "logps/rejected": -33.98983383178711, "loss": 0.1847, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3209145069122314, "rewards/margins": 2.2286314964294434, "rewards/rejected": -0.9077168703079224, "step": 970 }, { "epoch": 2.55, "grad_norm": 7.9375, "learning_rate": 1.7576990616793139e-06, "logits/chosen": -1.8326854705810547, "logits/rejected": -1.8439195156097412, "logps/chosen": -31.450542449951172, "logps/rejected": -35.19971466064453, "loss": 0.168, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.4635357856750488, "rewards/margins": 2.4251866340637207, "rewards/rejected": -0.9616511464118958, "step": 980 }, { "epoch": 2.57, "grad_norm": 6.0, "learning_rate": 1.7037833743707892e-06, "logits/chosen": -1.903786301612854, "logits/rejected": -1.901314377784729, "logps/chosen": -34.712928771972656, "logps/rejected": -33.3343620300293, "loss": 0.2409, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2673081159591675, "rewards/margins": 2.1089327335357666, "rewards/rejected": -0.8416244387626648, "step": 990 }, { "epoch": 2.6, "grad_norm": 9.5, "learning_rate": 1.6502767460434588e-06, "logits/chosen": -1.875101089477539, "logits/rejected": -1.8791959285736084, "logps/chosen": -33.045143127441406, "logps/rejected": -35.46049880981445, "loss": 0.1953, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2763327360153198, "rewards/margins": 2.153337001800537, "rewards/rejected": -0.8770040273666382, "step": 1000 }, { "epoch": 2.6, "eval_logits/chosen": -2.1447298526763916, "eval_logits/rejected": -2.139974594116211, "eval_logps/chosen": -34.36631774902344, "eval_logps/rejected": -38.036582946777344, "eval_loss": 0.695711076259613, "eval_rewards/accuracies": 0.5917773842811584, "eval_rewards/chosen": -0.2985913157463074, "eval_rewards/margins": 0.16937348246574402, "eval_rewards/rejected": -0.4679647982120514, "eval_runtime": 145.3859, "eval_samples_per_second": 2.359, "eval_steps_per_second": 0.296, "step": 1000 }, { "epoch": 2.62, "grad_norm": 20.25, "learning_rate": 1.5972066659083796e-06, "logits/chosen": -1.8841043710708618, "logits/rejected": -1.8890196084976196, "logps/chosen": -31.524646759033203, "logps/rejected": -33.430824279785156, "loss": 0.2343, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1482000350952148, "rewards/margins": 1.9668105840682983, "rewards/rejected": -0.8186105489730835, "step": 1010 }, { "epoch": 2.65, "grad_norm": 15.5625, "learning_rate": 1.5446003988985041e-06, "logits/chosen": -1.843640685081482, "logits/rejected": -1.8461425304412842, "logps/chosen": -29.06394386291504, "logps/rejected": -32.841705322265625, "loss": 0.1766, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2435271739959717, "rewards/margins": 2.2718143463134766, "rewards/rejected": -1.0282870531082153, "step": 1020 }, { "epoch": 2.68, "grad_norm": 4.625, "learning_rate": 1.4924849716612211e-06, "logits/chosen": -1.8973257541656494, "logits/rejected": -1.8918126821517944, "logps/chosen": -32.64158248901367, "logps/rejected": -34.6368293762207, "loss": 0.1812, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2127745151519775, "rewards/margins": 2.2197234630584717, "rewards/rejected": -1.0069488286972046, "step": 1030 }, { "epoch": 2.7, "grad_norm": 17.875, "learning_rate": 1.440887158673332e-06, "logits/chosen": -1.8320951461791992, "logits/rejected": -1.836050033569336, "logps/chosen": -34.43301010131836, "logps/rejected": -35.46531677246094, "loss": 0.1938, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.1886460781097412, "rewards/margins": 2.1298069953918457, "rewards/rejected": -0.9411608576774597, "step": 1040 }, { "epoch": 2.73, "grad_norm": 7.09375, "learning_rate": 1.3898334684855647e-06, "logits/chosen": -1.8290131092071533, "logits/rejected": -1.841897964477539, "logps/chosen": -30.590591430664062, "logps/rejected": -34.06149673461914, "loss": 0.2044, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1450647115707397, "rewards/margins": 2.082374095916748, "rewards/rejected": -0.9373094439506531, "step": 1050 }, { "epoch": 2.75, "grad_norm": 19.25, "learning_rate": 1.3393501301037245e-06, "logits/chosen": -1.8185697793960571, "logits/rejected": -1.8119618892669678, "logps/chosen": -30.647253036499023, "logps/rejected": -34.33002471923828, "loss": 0.1727, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3608683347702026, "rewards/margins": 2.2717270851135254, "rewards/rejected": -0.9108586311340332, "step": 1060 }, { "epoch": 2.78, "grad_norm": 5.1875, "learning_rate": 1.2894630795134454e-06, "logits/chosen": -1.944183349609375, "logits/rejected": -1.944902777671814, "logps/chosen": -32.107669830322266, "logps/rejected": -33.907928466796875, "loss": 0.1656, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2954630851745605, "rewards/margins": 2.2523720264434814, "rewards/rejected": -0.9569088816642761, "step": 1070 }, { "epoch": 2.81, "grad_norm": 10.0, "learning_rate": 1.2401979463554984e-06, "logits/chosen": -1.8745800256729126, "logits/rejected": -1.8734228610992432, "logps/chosen": -32.74195861816406, "logps/rejected": -34.108428955078125, "loss": 0.2386, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.2130850553512573, "rewards/margins": 2.0380234718322754, "rewards/rejected": -0.8249381184577942, "step": 1080 }, { "epoch": 2.83, "grad_norm": 4.96875, "learning_rate": 1.1915800407584705e-06, "logits/chosen": -1.8999583721160889, "logits/rejected": -1.8924366235733032, "logps/chosen": -32.777381896972656, "logps/rejected": -31.974105834960938, "loss": 0.1764, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3036444187164307, "rewards/margins": 2.142960548400879, "rewards/rejected": -0.8393163681030273, "step": 1090 }, { "epoch": 2.86, "grad_norm": 5.125, "learning_rate": 1.1436343403356019e-06, "logits/chosen": -1.8712406158447266, "logits/rejected": -1.870269775390625, "logps/chosen": -33.90606689453125, "logps/rejected": -37.61457061767578, "loss": 0.1463, "rewards/accuracies": 1.0, "rewards/chosen": 1.361112117767334, "rewards/margins": 2.496788263320923, "rewards/rejected": -1.1356757879257202, "step": 1100 }, { "epoch": 2.86, "eval_logits/chosen": -2.142699956893921, "eval_logits/rejected": -2.1379306316375732, "eval_logps/chosen": -34.36824035644531, "eval_logps/rejected": -38.02313995361328, "eval_loss": 0.7009721994400024, "eval_rewards/accuracies": 0.5714285373687744, "eval_rewards/chosen": -0.30031847953796387, "eval_rewards/margins": 0.15554992854595184, "eval_rewards/rejected": -0.4558684229850769, "eval_runtime": 145.3955, "eval_samples_per_second": 2.359, "eval_steps_per_second": 0.296, "step": 1100 }, { "epoch": 2.88, "grad_norm": 6.53125, "learning_rate": 1.0963854773524548e-06, "logits/chosen": -1.9193534851074219, "logits/rejected": -1.9247900247573853, "logps/chosen": -34.15703582763672, "logps/rejected": -36.63218688964844, "loss": 0.2075, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.179071307182312, "rewards/margins": 2.15881085395813, "rewards/rejected": -0.9797393679618835, "step": 1110 }, { "epoch": 2.91, "grad_norm": 4.625, "learning_rate": 1.049857726072005e-06, "logits/chosen": -1.888942003250122, "logits/rejected": -1.8869132995605469, "logps/chosen": -31.29522132873535, "logps/rejected": -34.09874725341797, "loss": 0.1873, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.1888843774795532, "rewards/margins": 2.193281650543213, "rewards/rejected": -1.0043971538543701, "step": 1120 }, { "epoch": 2.94, "grad_norm": 5.90625, "learning_rate": 1.0040749902836508e-06, "logits/chosen": -1.7918437719345093, "logits/rejected": -1.7945436239242554, "logps/chosen": -28.33380126953125, "logps/rejected": -31.490909576416016, "loss": 0.156, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.273350715637207, "rewards/margins": 2.394632577896118, "rewards/rejected": -1.1212818622589111, "step": 1130 }, { "epoch": 2.96, "grad_norm": 5.8125, "learning_rate": 9.59060791022566e-07, "logits/chosen": -1.8893934488296509, "logits/rejected": -1.8862323760986328, "logps/chosen": -31.71720314025879, "logps/rejected": -33.38925552368164, "loss": 0.2101, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.337890863418579, "rewards/margins": 2.114030122756958, "rewards/rejected": -0.7761393189430237, "step": 1140 }, { "epoch": 2.99, "grad_norm": 8.8125, "learning_rate": 9.148382544856885e-07, "logits/chosen": -1.8082507848739624, "logits/rejected": -1.8019250631332397, "logps/chosen": -26.8853702545166, "logps/rejected": -33.35715103149414, "loss": 0.1765, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1273572444915771, "rewards/margins": 2.2153234481811523, "rewards/rejected": -1.0879663228988647, "step": 1150 }, { "epoch": 3.01, "grad_norm": 3.359375, "learning_rate": 8.714301001505568e-07, "logits/chosen": -1.922569990158081, "logits/rejected": -1.920064926147461, "logps/chosen": -31.649404525756836, "logps/rejected": -36.346885681152344, "loss": 0.1692, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3019969463348389, "rewards/margins": 2.431814670562744, "rewards/rejected": -1.1298176050186157, "step": 1160 }, { "epoch": 3.04, "grad_norm": 4.625, "learning_rate": 8.288586291031025e-07, "logits/chosen": -1.9079128503799438, "logits/rejected": -1.9058338403701782, "logps/chosen": -30.213918685913086, "logps/rejected": -33.71696472167969, "loss": 0.1383, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2604410648345947, "rewards/margins": 2.478085517883301, "rewards/rejected": -1.217644453048706, "step": 1170 }, { "epoch": 3.06, "grad_norm": 4.96875, "learning_rate": 7.871457125803897e-07, "logits/chosen": -1.9004592895507812, "logits/rejected": -1.8882315158843994, "logps/chosen": -34.54669189453125, "logps/rejected": -35.38771057128906, "loss": 0.1478, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.348745584487915, "rewards/margins": 2.5333569049835205, "rewards/rejected": -1.1846110820770264, "step": 1180 }, { "epoch": 3.09, "grad_norm": 6.75, "learning_rate": 7.463127807341966e-07, "logits/chosen": -1.7955455780029297, "logits/rejected": -1.790220022201538, "logps/chosen": -33.658992767333984, "logps/rejected": -35.314979553222656, "loss": 0.1451, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.52162504196167, "rewards/margins": 2.4910683631896973, "rewards/rejected": -0.9694433212280273, "step": 1190 }, { "epoch": 3.12, "grad_norm": 6.90625, "learning_rate": 7.063808116212021e-07, "logits/chosen": -1.8400506973266602, "logits/rejected": -1.8432636260986328, "logps/chosen": -31.264917373657227, "logps/rejected": -32.59490203857422, "loss": 0.1796, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2151604890823364, "rewards/margins": 2.21301007270813, "rewards/rejected": -0.9978495836257935, "step": 1200 }, { "epoch": 3.12, "eval_logits/chosen": -2.142347574234009, "eval_logits/rejected": -2.137584686279297, "eval_logps/chosen": -34.35411071777344, "eval_logps/rejected": -38.025672912597656, "eval_loss": 0.6907772421836853, "eval_rewards/accuracies": 0.5747508406639099, "eval_rewards/chosen": -0.2876059412956238, "eval_rewards/margins": 0.1705409586429596, "eval_rewards/rejected": -0.45814695954322815, "eval_runtime": 145.3519, "eval_samples_per_second": 2.36, "eval_steps_per_second": 0.296, "step": 1200 }, { "epoch": 3.14, "grad_norm": 9.9375, "learning_rate": 6.673703204254348e-07, "logits/chosen": -1.8635101318359375, "logits/rejected": -1.858994722366333, "logps/chosen": -29.87615966796875, "logps/rejected": -32.12238311767578, "loss": 0.1548, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2577589750289917, "rewards/margins": 2.350618362426758, "rewards/rejected": -1.0928595066070557, "step": 1210 }, { "epoch": 3.17, "grad_norm": 5.375, "learning_rate": 6.293013489185315e-07, "logits/chosen": -1.8775825500488281, "logits/rejected": -1.8721100091934204, "logps/chosen": -33.751792907714844, "logps/rejected": -35.85100555419922, "loss": 0.1601, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3699350357055664, "rewards/margins": 2.522946834564209, "rewards/rejected": -1.1530119180679321, "step": 1220 }, { "epoch": 3.19, "grad_norm": 4.25, "learning_rate": 5.921934551632086e-07, "logits/chosen": -1.879861831665039, "logits/rejected": -1.8666155338287354, "logps/chosen": -31.769283294677734, "logps/rejected": -35.38819122314453, "loss": 0.1889, "rewards/accuracies": 0.875, "rewards/chosen": 1.1922564506530762, "rewards/margins": 2.333768606185913, "rewards/rejected": -1.141512393951416, "step": 1230 }, { "epoch": 3.22, "grad_norm": 3.765625, "learning_rate": 5.560657034652405e-07, "logits/chosen": -1.9246861934661865, "logits/rejected": -1.922136664390564, "logps/chosen": -33.49988555908203, "logps/rejected": -32.82632064819336, "loss": 0.1608, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3797134160995483, "rewards/margins": 2.2675235271453857, "rewards/rejected": -0.8878101110458374, "step": 1240 }, { "epoch": 3.25, "grad_norm": 5.96875, "learning_rate": 5.2093665457911e-07, "logits/chosen": -1.851680040359497, "logits/rejected": -1.8488292694091797, "logps/chosen": -32.87358856201172, "logps/rejected": -36.248497009277344, "loss": 0.1342, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3703018426895142, "rewards/margins": 2.6765310764312744, "rewards/rejected": -1.3062288761138916, "step": 1250 }, { "epoch": 3.27, "grad_norm": 4.5625, "learning_rate": 4.868243561723535e-07, "logits/chosen": -1.9186599254608154, "logits/rejected": -1.913433313369751, "logps/chosen": -29.251794815063477, "logps/rejected": -33.522377014160156, "loss": 0.1506, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3581030368804932, "rewards/margins": 2.383944511413574, "rewards/rejected": -1.025841474533081, "step": 1260 }, { "epoch": 3.3, "grad_norm": 4.15625, "learning_rate": 4.537463335535161e-07, "logits/chosen": -1.9449501037597656, "logits/rejected": -1.9497982263565063, "logps/chosen": -31.12078285217285, "logps/rejected": -32.769309997558594, "loss": 0.1426, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.4311946630477905, "rewards/margins": 2.410348892211914, "rewards/rejected": -0.9791544079780579, "step": 1270 }, { "epoch": 3.32, "grad_norm": 7.09375, "learning_rate": 4.217195806684629e-07, "logits/chosen": -1.8380733728408813, "logits/rejected": -1.8454326391220093, "logps/chosen": -33.449485778808594, "logps/rejected": -33.86452102661133, "loss": 0.1535, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3119791746139526, "rewards/margins": 2.4516043663024902, "rewards/rejected": -1.1396249532699585, "step": 1280 }, { "epoch": 3.35, "grad_norm": 4.625, "learning_rate": 3.907605513696808e-07, "logits/chosen": -1.7485191822052002, "logits/rejected": -1.7507747411727905, "logps/chosen": -31.557483673095703, "logps/rejected": -37.77085494995117, "loss": 0.1711, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.416973352432251, "rewards/margins": 2.511898994445801, "rewards/rejected": -1.0949256420135498, "step": 1290 }, { "epoch": 3.38, "grad_norm": 4.65625, "learning_rate": 3.6088515096305675e-07, "logits/chosen": -1.7980448007583618, "logits/rejected": -1.8015098571777344, "logps/chosen": -31.520572662353516, "logps/rejected": -33.67357635498047, "loss": 0.1264, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5264867544174194, "rewards/margins": 2.614957332611084, "rewards/rejected": -1.088470458984375, "step": 1300 }, { "epoch": 3.38, "eval_logits/chosen": -2.1421782970428467, "eval_logits/rejected": -2.1374199390411377, "eval_logps/chosen": -34.34251022338867, "eval_logps/rejected": -38.01955795288086, "eval_loss": 0.6911265850067139, "eval_rewards/accuracies": 0.5892857313156128, "eval_rewards/chosen": -0.27716198563575745, "eval_rewards/margins": 0.17548424005508423, "eval_rewards/rejected": -0.4526461660861969, "eval_runtime": 145.3336, "eval_samples_per_second": 2.36, "eval_steps_per_second": 0.296, "step": 1300 }, { "epoch": 3.4, "grad_norm": 3.578125, "learning_rate": 3.321087280364757e-07, "logits/chosen": -1.9077885150909424, "logits/rejected": -1.8874790668487549, "logps/chosen": -29.57822036743164, "logps/rejected": -36.54710388183594, "loss": 0.1528, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3816629648208618, "rewards/margins": 2.624289035797119, "rewards/rejected": -1.242626428604126, "step": 1310 }, { "epoch": 3.43, "grad_norm": 5.71875, "learning_rate": 3.044460665744284e-07, "logits/chosen": -1.8845760822296143, "logits/rejected": -1.8903357982635498, "logps/chosen": -31.33782958984375, "logps/rejected": -33.613182067871094, "loss": 0.1368, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.5263617038726807, "rewards/margins": 2.5365498065948486, "rewards/rejected": -1.0101878643035889, "step": 1320 }, { "epoch": 3.45, "grad_norm": 3.9375, "learning_rate": 2.779113783626916e-07, "logits/chosen": -1.8593124151229858, "logits/rejected": -1.8543720245361328, "logps/chosen": -31.642696380615234, "logps/rejected": -36.003231048583984, "loss": 0.1361, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2418328523635864, "rewards/margins": 2.529675245285034, "rewards/rejected": -1.2878425121307373, "step": 1330 }, { "epoch": 3.48, "grad_norm": 5.03125, "learning_rate": 2.5251829568697204e-07, "logits/chosen": -1.7216873168945312, "logits/rejected": -1.731299638748169, "logps/chosen": -32.13956069946289, "logps/rejected": -32.022666931152344, "loss": 0.153, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4940283298492432, "rewards/margins": 2.383723735809326, "rewards/rejected": -0.8896951675415039, "step": 1340 }, { "epoch": 3.51, "grad_norm": 4.46875, "learning_rate": 2.2827986432927774e-07, "logits/chosen": -1.779524803161621, "logits/rejected": -1.784233808517456, "logps/chosen": -32.241004943847656, "logps/rejected": -34.2801513671875, "loss": 0.1178, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5389398336410522, "rewards/margins": 2.5067484378814697, "rewards/rejected": -0.9678082466125488, "step": 1350 }, { "epoch": 3.53, "grad_norm": 22.75, "learning_rate": 2.0520853686560177e-07, "logits/chosen": -1.808258056640625, "logits/rejected": -1.813680648803711, "logps/chosen": -32.30830001831055, "logps/rejected": -35.810401916503906, "loss": 0.1575, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.371025800704956, "rewards/margins": 2.493631601333618, "rewards/rejected": -1.1226056814193726, "step": 1360 }, { "epoch": 3.56, "grad_norm": 3.953125, "learning_rate": 1.833161662683672e-07, "logits/chosen": -1.900948166847229, "logits/rejected": -1.8964240550994873, "logps/chosen": -31.239208221435547, "logps/rejected": -32.06488800048828, "loss": 0.1478, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2568284273147583, "rewards/margins": 2.2945187091827393, "rewards/rejected": -1.0376904010772705, "step": 1370 }, { "epoch": 3.58, "grad_norm": 6.21875, "learning_rate": 1.626139998169246e-07, "logits/chosen": -1.8157840967178345, "logits/rejected": -1.817800760269165, "logps/chosen": -28.267253875732422, "logps/rejected": -31.886409759521484, "loss": 0.1623, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2528432607650757, "rewards/margins": 2.2808501720428467, "rewards/rejected": -1.0280072689056396, "step": 1380 }, { "epoch": 3.61, "grad_norm": 9.25, "learning_rate": 1.4311267331922535e-07, "logits/chosen": -1.8119876384735107, "logits/rejected": -1.8166316747665405, "logps/chosen": -30.596643447875977, "logps/rejected": -33.92557907104492, "loss": 0.1638, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.280565619468689, "rewards/margins": 2.124329090118408, "rewards/rejected": -0.8437638282775879, "step": 1390 }, { "epoch": 3.64, "grad_norm": 3.5, "learning_rate": 1.2482220564763669e-07, "logits/chosen": -1.8785665035247803, "logits/rejected": -1.881792664527893, "logps/chosen": -33.31572723388672, "logps/rejected": -34.651222229003906, "loss": 0.1206, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4533611536026, "rewards/margins": 2.5582594871520996, "rewards/rejected": -1.10489821434021, "step": 1400 }, { "epoch": 3.64, "eval_logits/chosen": -2.1418943405151367, "eval_logits/rejected": -2.137127637863159, "eval_logps/chosen": -34.35324478149414, "eval_logps/rejected": -38.02573013305664, "eval_loss": 0.692409336566925, "eval_rewards/accuracies": 0.5917773842811584, "eval_rewards/chosen": -0.28682059049606323, "eval_rewards/margins": 0.17138195037841797, "eval_rewards/rejected": -0.4582025408744812, "eval_runtime": 145.3556, "eval_samples_per_second": 2.36, "eval_steps_per_second": 0.296, "step": 1400 }, { "epoch": 3.66, "grad_norm": 6.125, "learning_rate": 1.0775199359171346e-07, "logits/chosen": -1.946838140487671, "logits/rejected": -1.939971923828125, "logps/chosen": -32.17081832885742, "logps/rejected": -35.340518951416016, "loss": 0.1438, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4685560464859009, "rewards/margins": 2.454911470413208, "rewards/rejected": -0.9863556027412415, "step": 1410 }, { "epoch": 3.69, "grad_norm": 11.0, "learning_rate": 9.191080703056604e-08, "logits/chosen": -1.8355038166046143, "logits/rejected": -1.8465068340301514, "logps/chosen": -32.56521987915039, "logps/rejected": -34.721900939941406, "loss": 0.1399, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4256752729415894, "rewards/margins": 2.4434168338775635, "rewards/rejected": -1.0177414417266846, "step": 1420 }, { "epoch": 3.71, "grad_norm": 7.5625, "learning_rate": 7.730678442730539e-08, "logits/chosen": -1.9164823293685913, "logits/rejected": -1.928348183631897, "logps/chosen": -33.428531646728516, "logps/rejected": -34.8869514465332, "loss": 0.1342, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.4690487384796143, "rewards/margins": 2.5928056240081787, "rewards/rejected": -1.123757004737854, "step": 1430 }, { "epoch": 3.74, "grad_norm": 9.5625, "learning_rate": 6.394742864787806e-08, "logits/chosen": -1.9064290523529053, "logits/rejected": -1.9086523056030273, "logps/chosen": -31.696964263916016, "logps/rejected": -35.06704330444336, "loss": 0.1635, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3602821826934814, "rewards/margins": 2.4466512203216553, "rewards/rejected": -1.086369276046753, "step": 1440 }, { "epoch": 3.77, "grad_norm": 5.46875, "learning_rate": 5.183960310644748e-08, "logits/chosen": -1.8891935348510742, "logits/rejected": -1.8821351528167725, "logps/chosen": -33.91851043701172, "logps/rejected": -34.860595703125, "loss": 0.1585, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4848333597183228, "rewards/margins": 2.4660050868988037, "rewards/rejected": -0.981171727180481, "step": 1450 }, { "epoch": 3.79, "grad_norm": 4.96875, "learning_rate": 4.098952823928693e-08, "logits/chosen": -1.8560640811920166, "logits/rejected": -1.8620649576187134, "logps/chosen": -29.741008758544922, "logps/rejected": -34.997169494628906, "loss": 0.1378, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3249523639678955, "rewards/margins": 2.519402027130127, "rewards/rejected": -1.194449782371521, "step": 1460 }, { "epoch": 3.82, "grad_norm": 5.40625, "learning_rate": 3.1402778309014284e-08, "logits/chosen": -1.803776502609253, "logits/rejected": -1.801746129989624, "logps/chosen": -29.282577514648438, "logps/rejected": -31.621936798095703, "loss": 0.1565, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4066790342330933, "rewards/margins": 2.331610918045044, "rewards/rejected": -0.9249318242073059, "step": 1470 }, { "epoch": 3.84, "grad_norm": 6.1875, "learning_rate": 2.3084278540791427e-08, "logits/chosen": -1.9790493249893188, "logits/rejected": -1.9734687805175781, "logps/chosen": -33.892913818359375, "logps/rejected": -33.2518310546875, "loss": 0.1448, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.253114104270935, "rewards/margins": 2.3287227153778076, "rewards/rejected": -1.0756088495254517, "step": 1480 }, { "epoch": 3.87, "grad_norm": 7.28125, "learning_rate": 1.6038302591975807e-08, "logits/chosen": -1.8717892169952393, "logits/rejected": -1.8740745782852173, "logps/chosen": -27.078582763671875, "logps/rejected": -29.020471572875977, "loss": 0.1879, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.2580173015594482, "rewards/margins": 2.156388759613037, "rewards/rejected": -0.8983713984489441, "step": 1490 }, { "epoch": 3.9, "grad_norm": 5.75, "learning_rate": 1.0268470356514237e-08, "logits/chosen": -1.8792669773101807, "logits/rejected": -1.8736785650253296, "logps/chosen": -31.454416275024414, "logps/rejected": -33.20934295654297, "loss": 0.1645, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2117339372634888, "rewards/margins": 2.328080892562866, "rewards/rejected": -1.116347074508667, "step": 1500 }, { "epoch": 3.9, "eval_logits/chosen": -2.1418604850769043, "eval_logits/rejected": -2.1370973587036133, "eval_logps/chosen": -34.35454177856445, "eval_logps/rejected": -38.024742126464844, "eval_loss": 0.6943246126174927, "eval_rewards/accuracies": 0.5718438625335693, "eval_rewards/chosen": -0.28799253702163696, "eval_rewards/margins": 0.16931606829166412, "eval_rewards/rejected": -0.4573085606098175, "eval_runtime": 145.3262, "eval_samples_per_second": 2.36, "eval_steps_per_second": 0.296, "step": 1500 }, { "epoch": 3.92, "grad_norm": 9.4375, "learning_rate": 5.777746105209147e-09, "logits/chosen": -1.8055070638656616, "logits/rejected": -1.8097158670425415, "logps/chosen": -32.785335540771484, "logps/rejected": -35.42612075805664, "loss": 0.1841, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.2470948696136475, "rewards/margins": 2.1612586975097656, "rewards/rejected": -0.9141640663146973, "step": 1510 }, { "epoch": 3.95, "grad_norm": 5.40625, "learning_rate": 2.5684369628148352e-09, "logits/chosen": -1.8615461587905884, "logits/rejected": -1.8599445819854736, "logps/chosen": -29.254053115844727, "logps/rejected": -33.70402908325195, "loss": 0.1759, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3321115970611572, "rewards/margins": 2.334989547729492, "rewards/rejected": -1.002877950668335, "step": 1520 }, { "epoch": 3.97, "grad_norm": 8.8125, "learning_rate": 6.421917227455999e-10, "logits/chosen": -1.960646390914917, "logits/rejected": -1.9528770446777344, "logps/chosen": -26.7325439453125, "logps/rejected": -29.855510711669922, "loss": 0.1584, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1973587274551392, "rewards/margins": 2.2628719806671143, "rewards/rejected": -1.065513253211975, "step": 1530 }, { "epoch": 4.0, "grad_norm": 7.0625, "learning_rate": 0.0, "logits/chosen": -1.863152265548706, "logits/rejected": -1.85297429561615, "logps/chosen": -31.610912322998047, "logps/rejected": -36.648475646972656, "loss": 0.1361, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3097574710845947, "rewards/margins": 2.4233245849609375, "rewards/rejected": -1.1135669946670532, "step": 1540 }, { "epoch": 4.0, "step": 1540, "total_flos": 0.0, "train_loss": 0.19153660760297403, "train_runtime": 10767.2812, "train_samples_per_second": 1.144, "train_steps_per_second": 0.143 } ], "logging_steps": 10, "max_steps": 1540, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }