{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 980, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010204081632653062, "grad_norm": 2.5858418941497803, "learning_rate": 1.020408163265306e-08, "logits/chosen": -0.8166377544403076, "logits/rejected": -0.6783266663551331, "logps/chosen": -295.1116943359375, "logps/rejected": -327.4919128417969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01020408163265306, "grad_norm": 2.2085180282592773, "learning_rate": 1.0204081632653061e-07, "logits/chosen": -0.7580730319023132, "logits/rejected": -0.7665800452232361, "logps/chosen": -232.833984375, "logps/rejected": -262.5196533203125, "loss": 0.6943, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.008211496286094189, "rewards/margins": 0.006794700864702463, "rewards/rejected": 0.0014167949557304382, "step": 10 }, { "epoch": 0.02040816326530612, "grad_norm": 2.0046942234039307, "learning_rate": 2.0408163265306121e-07, "logits/chosen": -0.7919167280197144, "logits/rejected": -0.7875319719314575, "logps/chosen": -208.1246795654297, "logps/rejected": -255.45565795898438, "loss": 0.6917, "rewards/accuracies": 0.5, "rewards/chosen": -0.0008393492316827178, "rewards/margins": 0.014187255874276161, "rewards/rejected": -0.015026603825390339, "step": 20 }, { "epoch": 0.030612244897959183, "grad_norm": 1.6893500089645386, "learning_rate": 3.0612244897959183e-07, "logits/chosen": -0.653283953666687, "logits/rejected": -0.7346900105476379, "logps/chosen": -205.4357147216797, "logps/rejected": -266.03204345703125, "loss": 0.6916, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.015907617285847664, "rewards/margins": -0.014567399397492409, "rewards/rejected": -0.0013402182376012206, "step": 30 }, { "epoch": 0.04081632653061224, "grad_norm": 1.8859857320785522, "learning_rate": 4.0816326530612243e-07, "logits/chosen": -0.7067805528640747, "logits/rejected": -0.7997003793716431, "logps/chosen": -158.7813720703125, "logps/rejected": -174.23060607910156, "loss": 0.6932, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0005168341449461877, "rewards/margins": 0.005659103859215975, "rewards/rejected": -0.006175938528031111, "step": 40 }, { "epoch": 0.05102040816326531, "grad_norm": 1.9115869998931885, "learning_rate": 5.10204081632653e-07, "logits/chosen": -0.714581310749054, "logits/rejected": -0.7048059701919556, "logps/chosen": -247.39810180664062, "logps/rejected": -303.0687255859375, "loss": 0.6939, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.016289008781313896, "rewards/margins": 0.003998421598225832, "rewards/rejected": 0.0122905895113945, "step": 50 }, { "epoch": 0.061224489795918366, "grad_norm": 1.7422277927398682, "learning_rate": 6.122448979591837e-07, "logits/chosen": -0.6196914315223694, "logits/rejected": -0.5781084895133972, "logps/chosen": -144.5482940673828, "logps/rejected": -166.89369201660156, "loss": 0.6864, "rewards/accuracies": 0.625, "rewards/chosen": 0.03231963887810707, "rewards/margins": 0.028604138642549515, "rewards/rejected": 0.003715501632541418, "step": 60 }, { "epoch": 0.07142857142857142, "grad_norm": 1.6072555780410767, "learning_rate": 7.142857142857143e-07, "logits/chosen": -0.8848626017570496, "logits/rejected": -0.8458296656608582, "logps/chosen": -295.02587890625, "logps/rejected": -262.872802734375, "loss": 0.6872, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.042309124022722244, "rewards/margins": 0.015812452882528305, "rewards/rejected": 0.02649666927754879, "step": 70 }, { "epoch": 0.08163265306122448, "grad_norm": 2.0791561603546143, "learning_rate": 8.163265306122449e-07, "logits/chosen": -0.6937960386276245, "logits/rejected": -0.7234374284744263, "logps/chosen": -224.0985565185547, "logps/rejected": -269.337158203125, "loss": 0.6732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.07420104742050171, "rewards/margins": 0.04235782474279404, "rewards/rejected": 0.03184322267770767, "step": 80 }, { "epoch": 0.09183673469387756, "grad_norm": 1.962824821472168, "learning_rate": 9.183673469387755e-07, "logits/chosen": -0.8258784413337708, "logits/rejected": -0.8503522872924805, "logps/chosen": -144.6414337158203, "logps/rejected": -175.49205017089844, "loss": 0.6669, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11745353788137436, "rewards/margins": 0.07300996780395508, "rewards/rejected": 0.04444356635212898, "step": 90 }, { "epoch": 0.10204081632653061, "grad_norm": 2.2074368000030518, "learning_rate": 9.999873129474573e-07, "logits/chosen": -0.6715458035469055, "logits/rejected": -0.738847553730011, "logps/chosen": -164.20828247070312, "logps/rejected": -208.1692352294922, "loss": 0.6534, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.24040062725543976, "rewards/margins": 0.17234429717063904, "rewards/rejected": 0.06805632263422012, "step": 100 }, { "epoch": 0.10204081632653061, "eval_logits/chosen": -0.6843910217285156, "eval_logits/rejected": -0.6762082576751709, "eval_logps/chosen": -236.70948791503906, "eval_logps/rejected": -269.5726623535156, "eval_loss": 0.6139070391654968, "eval_rewards/accuracies": 0.7572254538536072, "eval_rewards/chosen": 0.2871367931365967, "eval_rewards/margins": 0.19106332957744598, "eval_rewards/rejected": 0.09607347846031189, "eval_runtime": 255.7234, "eval_samples_per_second": 10.805, "eval_steps_per_second": 1.353, "step": 100 }, { "epoch": 0.11224489795918367, "grad_norm": 2.093169689178467, "learning_rate": 9.99543333708549e-07, "logits/chosen": -0.6496793031692505, "logits/rejected": -0.5985936522483826, "logps/chosen": -185.32540893554688, "logps/rejected": -167.50845336914062, "loss": 0.636, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.29740238189697266, "rewards/margins": 0.16500218212604523, "rewards/rejected": 0.13240019977092743, "step": 110 }, { "epoch": 0.12244897959183673, "grad_norm": 1.4166107177734375, "learning_rate": 9.98465645540859e-07, "logits/chosen": -0.7910449504852295, "logits/rejected": -0.8481178283691406, "logps/chosen": -147.35057067871094, "logps/rejected": -244.04727172851562, "loss": 0.6246, "rewards/accuracies": 0.75, "rewards/chosen": 0.2833811640739441, "rewards/margins": 0.13916133344173431, "rewards/rejected": 0.14421981573104858, "step": 120 }, { "epoch": 0.1326530612244898, "grad_norm": 1.4679032564163208, "learning_rate": 9.9675561557426e-07, "logits/chosen": -0.6362483501434326, "logits/rejected": -0.6534683704376221, "logps/chosen": -150.24880981445312, "logps/rejected": -175.07742309570312, "loss": 0.5876, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.458209753036499, "rewards/margins": 0.2481471598148346, "rewards/rejected": 0.21006262302398682, "step": 130 }, { "epoch": 0.14285714285714285, "grad_norm": 1.6511420011520386, "learning_rate": 9.944154131125642e-07, "logits/chosen": -0.6063439846038818, "logits/rejected": -0.6045389771461487, "logps/chosen": -191.49533081054688, "logps/rejected": -255.36972045898438, "loss": 0.5851, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5490959882736206, "rewards/margins": 0.31212958693504333, "rewards/rejected": 0.2369663417339325, "step": 140 }, { "epoch": 0.15306122448979592, "grad_norm": 1.6853809356689453, "learning_rate": 9.914480068815961e-07, "logits/chosen": -0.7999967336654663, "logits/rejected": -0.8724571466445923, "logps/chosen": -184.25137329101562, "logps/rejected": -204.84927368164062, "loss": 0.5554, "rewards/accuracies": 0.75, "rewards/chosen": 0.7743161916732788, "rewards/margins": 0.4069131910800934, "rewards/rejected": 0.3674030303955078, "step": 150 }, { "epoch": 0.16326530612244897, "grad_norm": 1.6386051177978516, "learning_rate": 9.878571612631363e-07, "logits/chosen": -0.7798652052879333, "logits/rejected": -0.7520347833633423, "logps/chosen": -186.4380645751953, "logps/rejected": -206.5469207763672, "loss": 0.5345, "rewards/accuracies": 0.875, "rewards/chosen": 0.9827211499214172, "rewards/margins": 0.4804193377494812, "rewards/rejected": 0.502301812171936, "step": 160 }, { "epoch": 0.17346938775510204, "grad_norm": 1.3212541341781616, "learning_rate": 9.836474315195147e-07, "logits/chosen": -0.7808311581611633, "logits/rejected": -0.8207923769950867, "logps/chosen": -186.6453094482422, "logps/rejected": -277.3121032714844, "loss": 0.5479, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.9796573519706726, "rewards/margins": 0.272901713848114, "rewards/rejected": 0.7067556381225586, "step": 170 }, { "epoch": 0.1836734693877551, "grad_norm": 1.7019201517105103, "learning_rate": 9.788241580149122e-07, "logits/chosen": -0.7383798360824585, "logits/rejected": -0.7045127153396606, "logps/chosen": -183.7281036376953, "logps/rejected": -172.53787231445312, "loss": 0.5175, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.2867637872695923, "rewards/margins": 0.5766666531562805, "rewards/rejected": 0.7100971341133118, "step": 180 }, { "epoch": 0.19387755102040816, "grad_norm": 1.4821258783340454, "learning_rate": 9.73393459440701e-07, "logits/chosen": -0.7343258261680603, "logits/rejected": -0.7688428163528442, "logps/chosen": -228.34228515625, "logps/rejected": -278.99908447265625, "loss": 0.4814, "rewards/accuracies": 0.875, "rewards/chosen": 1.2823985815048218, "rewards/margins": 0.6514226794242859, "rewards/rejected": 0.6309759616851807, "step": 190 }, { "epoch": 0.20408163265306123, "grad_norm": 1.6141135692596436, "learning_rate": 9.673622250534155e-07, "logits/chosen": -0.6541659235954285, "logits/rejected": -0.6301986575126648, "logps/chosen": -157.97817993164062, "logps/rejected": -167.7241973876953, "loss": 0.4902, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.514700174331665, "rewards/margins": 0.7473451495170593, "rewards/rejected": 0.7673550844192505, "step": 200 }, { "epoch": 0.20408163265306123, "eval_logits/chosen": -0.650209903717041, "eval_logits/rejected": -0.6512798070907593, "eval_logps/chosen": -213.61082458496094, "eval_logps/rejected": -260.0234375, "eval_loss": 0.452963650226593, "eval_rewards/accuracies": 0.8063583970069885, "eval_rewards/chosen": 1.4420698881149292, "eval_rewards/margins": 0.8685339689254761, "eval_rewards/rejected": 0.5735359191894531, "eval_runtime": 252.7043, "eval_samples_per_second": 10.934, "eval_steps_per_second": 1.369, "step": 200 }, { "epoch": 0.21428571428571427, "grad_norm": 1.1991658210754395, "learning_rate": 9.607381059352038e-07, "logits/chosen": -0.6899908781051636, "logits/rejected": -0.6769914627075195, "logps/chosen": -170.54293823242188, "logps/rejected": -229.1009521484375, "loss": 0.4374, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.6567569971084595, "rewards/margins": 0.7052143812179565, "rewards/rejected": 0.9515425562858582, "step": 210 }, { "epoch": 0.22448979591836735, "grad_norm": 1.43000328540802, "learning_rate": 9.535295052878449e-07, "logits/chosen": -0.6404227614402771, "logits/rejected": -0.6296104192733765, "logps/chosen": -118.60369873046875, "logps/rejected": -171.08534240722656, "loss": 0.4418, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.7140804529190063, "rewards/margins": 0.9742295145988464, "rewards/rejected": 0.7398509979248047, "step": 220 }, { "epoch": 0.23469387755102042, "grad_norm": 1.6586986780166626, "learning_rate": 9.457455677726447e-07, "logits/chosen": -0.7370392084121704, "logits/rejected": -0.7159712910652161, "logps/chosen": -156.8678436279297, "logps/rejected": -178.65988159179688, "loss": 0.4, "rewards/accuracies": 0.875, "rewards/chosen": 1.953412413597107, "rewards/margins": 1.0005583763122559, "rewards/rejected": 0.9528541564941406, "step": 230 }, { "epoch": 0.24489795918367346, "grad_norm": 1.3758224248886108, "learning_rate": 9.37396167909733e-07, "logits/chosen": -0.70029217004776, "logits/rejected": -0.6873424053192139, "logps/chosen": -127.82401275634766, "logps/rejected": -177.5338592529297, "loss": 0.4333, "rewards/accuracies": 0.875, "rewards/chosen": 1.9316447973251343, "rewards/margins": 0.856708824634552, "rewards/rejected": 1.0749361515045166, "step": 240 }, { "epoch": 0.25510204081632654, "grad_norm": 1.4639925956726074, "learning_rate": 9.284918975514797e-07, "logits/chosen": -0.6979633569717407, "logits/rejected": -0.7350119352340698, "logps/chosen": -141.00392150878906, "logps/rejected": -214.472412109375, "loss": 0.4482, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.902488350868225, "rewards/margins": 0.7867880463600159, "rewards/rejected": 1.1157002449035645, "step": 250 }, { "epoch": 0.2653061224489796, "grad_norm": 1.1242108345031738, "learning_rate": 9.190440524459202e-07, "logits/chosen": -0.5260006785392761, "logits/rejected": -0.6740385293960571, "logps/chosen": -176.22897338867188, "logps/rejected": -269.9410400390625, "loss": 0.4658, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.040607213973999, "rewards/margins": 1.309356689453125, "rewards/rejected": 0.7312506437301636, "step": 260 }, { "epoch": 0.2755102040816326, "grad_norm": 1.5435305833816528, "learning_rate": 9.09064617907235e-07, "logits/chosen": -0.7274152040481567, "logits/rejected": -0.7272646427154541, "logps/chosen": -203.15975952148438, "logps/rejected": -226.68399047851562, "loss": 0.4237, "rewards/accuracies": 0.875, "rewards/chosen": 2.141923666000366, "rewards/margins": 1.0537580251693726, "rewards/rejected": 1.0881658792495728, "step": 270 }, { "epoch": 0.2857142857142857, "grad_norm": 1.1201239824295044, "learning_rate": 8.985662536114612e-07, "logits/chosen": -0.6508474349975586, "logits/rejected": -0.659797191619873, "logps/chosen": -146.76266479492188, "logps/rejected": -182.32579040527344, "loss": 0.4144, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.2803127765655518, "rewards/margins": 1.108865737915039, "rewards/rejected": 1.1714469194412231, "step": 280 }, { "epoch": 0.29591836734693877, "grad_norm": 0.9688098430633545, "learning_rate": 8.875622775367259e-07, "logits/chosen": -0.6407650709152222, "logits/rejected": -0.6345282793045044, "logps/chosen": -178.30838012695312, "logps/rejected": -213.2600860595703, "loss": 0.4007, "rewards/accuracies": 0.75, "rewards/chosen": 2.2921342849731445, "rewards/margins": 0.9978988766670227, "rewards/rejected": 1.2942354679107666, "step": 290 }, { "epoch": 0.30612244897959184, "grad_norm": 1.3581178188323975, "learning_rate": 8.760666490683719e-07, "logits/chosen": -0.643558144569397, "logits/rejected": -0.6327681541442871, "logps/chosen": -128.29537963867188, "logps/rejected": -162.97642517089844, "loss": 0.391, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.3287272453308105, "rewards/margins": 1.0853456258773804, "rewards/rejected": 1.2433817386627197, "step": 300 }, { "epoch": 0.30612244897959184, "eval_logits/chosen": -0.6297730803489685, "eval_logits/rejected": -0.6321488618850708, "eval_logps/chosen": -204.23443603515625, "eval_logps/rejected": -257.63165283203125, "eval_loss": 0.39348161220550537, "eval_rewards/accuracies": 0.8381502628326416, "eval_rewards/chosen": 1.910889744758606, "eval_rewards/margins": 1.217763066291809, "eval_rewards/rejected": 0.6931266188621521, "eval_runtime": 252.9021, "eval_samples_per_second": 10.925, "eval_steps_per_second": 1.368, "step": 300 }, { "epoch": 0.3163265306122449, "grad_norm": 1.3367196321487427, "learning_rate": 8.640939512904095e-07, "logits/chosen": -0.6319596767425537, "logits/rejected": -0.6244379281997681, "logps/chosen": -146.67283630371094, "logps/rejected": -179.8704833984375, "loss": 0.386, "rewards/accuracies": 0.875, "rewards/chosen": 2.33925199508667, "rewards/margins": 1.3749719858169556, "rewards/rejected": 0.9642800092697144, "step": 310 }, { "epoch": 0.32653061224489793, "grad_norm": 0.9586181044578552, "learning_rate": 8.516593724857597e-07, "logits/chosen": -0.6093655824661255, "logits/rejected": -0.580748438835144, "logps/chosen": -180.91659545898438, "logps/rejected": -233.35824584960938, "loss": 0.4029, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.2102105617523193, "rewards/margins": 1.3727915287017822, "rewards/rejected": 0.8374192118644714, "step": 320 }, { "epoch": 0.336734693877551, "grad_norm": 0.8618925213813782, "learning_rate": 8.387786868687548e-07, "logits/chosen": -0.5689299702644348, "logits/rejected": -0.5300137400627136, "logps/chosen": -109.52386474609375, "logps/rejected": -144.4683837890625, "loss": 0.3223, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3346762657165527, "rewards/margins": 1.4390984773635864, "rewards/rejected": 0.8955775499343872, "step": 330 }, { "epoch": 0.3469387755102041, "grad_norm": 1.1246938705444336, "learning_rate": 8.254682345743405e-07, "logits/chosen": -0.769761323928833, "logits/rejected": -0.7216005921363831, "logps/chosen": -199.35218811035156, "logps/rejected": -197.91156005859375, "loss": 0.4085, "rewards/accuracies": 0.75, "rewards/chosen": 2.1999125480651855, "rewards/margins": 1.3574109077453613, "rewards/rejected": 0.8425019383430481, "step": 340 }, { "epoch": 0.35714285714285715, "grad_norm": 1.2478692531585693, "learning_rate": 8.117449009293668e-07, "logits/chosen": -0.7673205733299255, "logits/rejected": -0.7887976765632629, "logps/chosen": -165.0550537109375, "logps/rejected": -216.4749298095703, "loss": 0.3823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4004361629486084, "rewards/margins": 1.426731824874878, "rewards/rejected": 0.9737041592597961, "step": 350 }, { "epoch": 0.3673469387755102, "grad_norm": 1.0376545190811157, "learning_rate": 7.976260950322571e-07, "logits/chosen": -0.6736082434654236, "logits/rejected": -0.6928958892822266, "logps/chosen": -181.90908813476562, "logps/rejected": -217.79177856445312, "loss": 0.3887, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8513981103897095, "rewards/margins": 1.058131217956543, "rewards/rejected": 0.7932666540145874, "step": 360 }, { "epoch": 0.37755102040816324, "grad_norm": 1.8091073036193848, "learning_rate": 7.831297276682368e-07, "logits/chosen": -0.6461857557296753, "logits/rejected": -0.7057845592498779, "logps/chosen": -89.2549057006836, "logps/rejected": -165.2856903076172, "loss": 0.3589, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.603538990020752, "rewards/margins": 1.7810055017471313, "rewards/rejected": 0.8225336074829102, "step": 370 }, { "epoch": 0.3877551020408163, "grad_norm": 1.5131062269210815, "learning_rate": 7.682741885881314e-07, "logits/chosen": -0.6561241149902344, "logits/rejected": -0.6394567489624023, "logps/chosen": -191.556640625, "logps/rejected": -244.42593383789062, "loss": 0.3447, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 2.0958144664764404, "rewards/margins": 1.2701359987258911, "rewards/rejected": 0.8256783485412598, "step": 380 }, { "epoch": 0.3979591836734694, "grad_norm": 1.401531457901001, "learning_rate": 7.530783231795614e-07, "logits/chosen": -0.5236614942550659, "logits/rejected": -0.6208306550979614, "logps/chosen": -173.1316680908203, "logps/rejected": -263.10003662109375, "loss": 0.3731, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.393113613128662, "rewards/margins": 1.6618480682373047, "rewards/rejected": 0.7312653660774231, "step": 390 }, { "epoch": 0.40816326530612246, "grad_norm": 1.200061559677124, "learning_rate": 7.375614085601264e-07, "logits/chosen": -0.6140845417976379, "logits/rejected": -0.575400173664093, "logps/chosen": -180.64183044433594, "logps/rejected": -241.3333282470703, "loss": 0.3497, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.477724075317383, "rewards/margins": 1.4085882902145386, "rewards/rejected": 1.0691356658935547, "step": 400 }, { "epoch": 0.40816326530612246, "eval_logits/chosen": -0.6313372850418091, "eval_logits/rejected": -0.6323339343070984, "eval_logps/chosen": -203.02207946777344, "eval_logps/rejected": -260.01409912109375, "eval_loss": 0.36327043175697327, "eval_rewards/accuracies": 0.8468208312988281, "eval_rewards/chosen": 1.9715064764022827, "eval_rewards/margins": 1.3975027799606323, "eval_rewards/rejected": 0.5740035772323608, "eval_runtime": 252.8433, "eval_samples_per_second": 10.928, "eval_steps_per_second": 1.368, "step": 400 }, { "epoch": 0.41836734693877553, "grad_norm": 1.6836450099945068, "learning_rate": 7.217431291229067e-07, "logits/chosen": -0.7939841747283936, "logits/rejected": -0.613411545753479, "logps/chosen": -217.92153930664062, "logps/rejected": -224.36270141601562, "loss": 0.3867, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.1043503284454346, "rewards/margins": 1.5646950006484985, "rewards/rejected": 0.5396553874015808, "step": 410 }, { "epoch": 0.42857142857142855, "grad_norm": 1.3317594528198242, "learning_rate": 7.056435515653058e-07, "logits/chosen": -0.6276999115943909, "logits/rejected": -0.5372880697250366, "logps/chosen": -198.774658203125, "logps/rejected": -198.1266632080078, "loss": 0.3866, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.228754997253418, "rewards/margins": 1.542759895324707, "rewards/rejected": 0.6859949231147766, "step": 420 }, { "epoch": 0.4387755102040816, "grad_norm": 1.6838312149047852, "learning_rate": 6.892830994329088e-07, "logits/chosen": -0.5538614392280579, "logits/rejected": -0.5668340921401978, "logps/chosen": -115.6775131225586, "logps/rejected": -192.1859588623047, "loss": 0.3508, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.350921630859375, "rewards/margins": 1.5608246326446533, "rewards/rejected": 0.7900969982147217, "step": 430 }, { "epoch": 0.4489795918367347, "grad_norm": 1.517739176750183, "learning_rate": 6.726825272106538e-07, "logits/chosen": -0.7243350744247437, "logits/rejected": -0.6703056693077087, "logps/chosen": -159.9098663330078, "logps/rejected": -242.1475067138672, "loss": 0.3489, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.179281711578369, "rewards/margins": 1.4223108291625977, "rewards/rejected": 0.7569707632064819, "step": 440 }, { "epoch": 0.45918367346938777, "grad_norm": 1.332316517829895, "learning_rate": 6.558628939941791e-07, "logits/chosen": -0.6250364184379578, "logits/rejected": -0.6584871411323547, "logps/chosen": -141.18350219726562, "logps/rejected": -210.0460205078125, "loss": 0.3277, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.4289634227752686, "rewards/margins": 1.7670814990997314, "rewards/rejected": 0.6618821024894714, "step": 450 }, { "epoch": 0.46938775510204084, "grad_norm": 1.1579140424728394, "learning_rate": 6.388455367747502e-07, "logits/chosen": -0.6399953365325928, "logits/rejected": -0.6188939809799194, "logps/chosen": -122.9395751953125, "logps/rejected": -187.66537475585938, "loss": 0.3203, "rewards/accuracies": 0.875, "rewards/chosen": 2.4994819164276123, "rewards/margins": 1.6673539876937866, "rewards/rejected": 0.8321278691291809, "step": 460 }, { "epoch": 0.47959183673469385, "grad_norm": 1.3472042083740234, "learning_rate": 6.216520433716544e-07, "logits/chosen": -0.5729564428329468, "logits/rejected": -0.5995985865592957, "logps/chosen": -162.9604034423828, "logps/rejected": -261.8568420410156, "loss": 0.3378, "rewards/accuracies": 0.875, "rewards/chosen": 2.0735116004943848, "rewards/margins": 1.7422335147857666, "rewards/rejected": 0.3312779664993286, "step": 470 }, { "epoch": 0.4897959183673469, "grad_norm": 1.2735215425491333, "learning_rate": 6.043042250464004e-07, "logits/chosen": -0.6866289377212524, "logits/rejected": -0.6188154220581055, "logps/chosen": -123.27571868896484, "logps/rejected": -148.08094787597656, "loss": 0.2965, "rewards/accuracies": 0.875, "rewards/chosen": 2.6011900901794434, "rewards/margins": 1.8508793115615845, "rewards/rejected": 0.7503107190132141, "step": 480 }, { "epoch": 0.5, "grad_norm": 1.0802596807479858, "learning_rate": 5.868240888334652e-07, "logits/chosen": -0.803338885307312, "logits/rejected": -0.7212686538696289, "logps/chosen": -173.48219299316406, "logps/rejected": -273.45672607421875, "loss": 0.3035, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5986487865448, "rewards/margins": 2.1381328105926514, "rewards/rejected": 0.4605160653591156, "step": 490 }, { "epoch": 0.5102040816326531, "grad_norm": 0.8763641715049744, "learning_rate": 5.69233809622687e-07, "logits/chosen": -0.49485841393470764, "logits/rejected": -0.5676048398017883, "logps/chosen": -129.4870147705078, "logps/rejected": -191.2211151123047, "loss": 0.3378, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.351245403289795, "rewards/margins": 1.694441556930542, "rewards/rejected": 0.6568037271499634, "step": 500 }, { "epoch": 0.5102040816326531, "eval_logits/chosen": -0.6102895736694336, "eval_logits/rejected": -0.6196657419204712, "eval_logps/chosen": -201.76104736328125, "eval_logps/rejected": -262.2907409667969, "eval_loss": 0.34211036562919617, "eval_rewards/accuracies": 0.8699421882629395, "eval_rewards/chosen": 2.034559726715088, "eval_rewards/margins": 1.5743900537490845, "eval_rewards/rejected": 0.46016958355903625, "eval_runtime": 252.9052, "eval_samples_per_second": 10.925, "eval_steps_per_second": 1.368, "step": 500 }, { "epoch": 0.5204081632653061, "grad_norm": 1.9054065942764282, "learning_rate": 5.515557020287218e-07, "logits/chosen": -0.6607390642166138, "logits/rejected": -0.6723104119300842, "logps/chosen": -144.58413696289062, "logps/rejected": -223.1163330078125, "loss": 0.3585, "rewards/accuracies": 0.875, "rewards/chosen": 2.4164395332336426, "rewards/margins": 1.9343305826187134, "rewards/rejected": 0.4821089804172516, "step": 510 }, { "epoch": 0.5306122448979592, "grad_norm": 1.4127832651138306, "learning_rate": 5.338121920832475e-07, "logits/chosen": -0.5705611109733582, "logits/rejected": -0.6025998592376709, "logps/chosen": -219.2863311767578, "logps/rejected": -222.2490692138672, "loss": 0.3387, "rewards/accuracies": 0.875, "rewards/chosen": 2.2865653038024902, "rewards/margins": 1.6282856464385986, "rewards/rejected": 0.6582795977592468, "step": 520 }, { "epoch": 0.5408163265306123, "grad_norm": 1.9716460704803467, "learning_rate": 5.160257887858277e-07, "logits/chosen": -0.7576996088027954, "logits/rejected": -0.7345749139785767, "logps/chosen": -144.2480926513672, "logps/rejected": -193.87539672851562, "loss": 0.3335, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3815081119537354, "rewards/margins": 1.888880968093872, "rewards/rejected": 0.49262747168540955, "step": 530 }, { "epoch": 0.5510204081632653, "grad_norm": 1.2313483953475952, "learning_rate": 4.982190555495235e-07, "logits/chosen": -0.6418560147285461, "logits/rejected": -0.6474324464797974, "logps/chosen": -136.88446044921875, "logps/rejected": -218.2066192626953, "loss": 0.3287, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 2.419318675994873, "rewards/margins": 1.6617262363433838, "rewards/rejected": 0.7575927376747131, "step": 540 }, { "epoch": 0.5612244897959183, "grad_norm": 1.5645098686218262, "learning_rate": 4.804145815774786e-07, "logits/chosen": -0.6569366455078125, "logits/rejected": -0.669120192527771, "logps/chosen": -142.89028930664062, "logps/rejected": -210.3660888671875, "loss": 0.3265, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.5318100452423096, "rewards/margins": 1.5718120336532593, "rewards/rejected": 0.9599977731704712, "step": 550 }, { "epoch": 0.5714285714285714, "grad_norm": 1.0555964708328247, "learning_rate": 4.626349532067879e-07, "logits/chosen": -0.7187200784683228, "logits/rejected": -0.7504357099533081, "logps/chosen": -145.81968688964844, "logps/rejected": -256.0328063964844, "loss": 0.321, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.87051522731781, "rewards/margins": 1.6857688426971436, "rewards/rejected": 0.18474629521369934, "step": 560 }, { "epoch": 0.5816326530612245, "grad_norm": 3.175672769546509, "learning_rate": 4.4490272525599936e-07, "logits/chosen": -0.7243942618370056, "logits/rejected": -0.6909176707267761, "logps/chosen": -112.03414154052734, "logps/rejected": -205.05517578125, "loss": 0.3165, "rewards/accuracies": 0.875, "rewards/chosen": 2.3773770332336426, "rewards/margins": 2.077335834503174, "rewards/rejected": 0.3000412583351135, "step": 570 }, { "epoch": 0.5918367346938775, "grad_norm": 0.9734162092208862, "learning_rate": 4.272403924126035e-07, "logits/chosen": -0.6561388373374939, "logits/rejected": -0.5848960280418396, "logps/chosen": -148.91261291503906, "logps/rejected": -197.70758056640625, "loss": 0.2936, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.421445369720459, "rewards/margins": 2.3003313541412354, "rewards/rejected": 0.12111417204141617, "step": 580 }, { "epoch": 0.6020408163265306, "grad_norm": 1.2602763175964355, "learning_rate": 4.096703606968006e-07, "logits/chosen": -0.5611236095428467, "logits/rejected": -0.5989875793457031, "logps/chosen": -181.88597106933594, "logps/rejected": -338.6321105957031, "loss": 0.3525, "rewards/accuracies": 0.875, "rewards/chosen": 2.1114940643310547, "rewards/margins": 1.8167270421981812, "rewards/rejected": 0.294766902923584, "step": 590 }, { "epoch": 0.6122448979591837, "grad_norm": 1.2026368379592896, "learning_rate": 3.9221491903775013e-07, "logits/chosen": -0.6353614926338196, "logits/rejected": -0.6344829797744751, "logps/chosen": -225.2716064453125, "logps/rejected": -285.7774963378906, "loss": 0.2904, "rewards/accuracies": 0.875, "rewards/chosen": 1.963526725769043, "rewards/margins": 1.5393855571746826, "rewards/rejected": 0.4241412281990051, "step": 600 }, { "epoch": 0.6122448979591837, "eval_logits/chosen": -0.6159467697143555, "eval_logits/rejected": -0.6221225261688232, "eval_logps/chosen": -203.55426025390625, "eval_logps/rejected": -265.3277587890625, "eval_loss": 0.32874658703804016, "eval_rewards/accuracies": 0.8757225275039673, "eval_rewards/chosen": 1.9448989629745483, "eval_rewards/margins": 1.636578917503357, "eval_rewards/rejected": 0.3083205819129944, "eval_runtime": 252.8477, "eval_samples_per_second": 10.928, "eval_steps_per_second": 1.368, "step": 600 }, { "epoch": 0.6224489795918368, "grad_norm": 0.9654647707939148, "learning_rate": 3.7489621099836043e-07, "logits/chosen": -0.6111562848091125, "logits/rejected": -0.5714690089225769, "logps/chosen": -193.060302734375, "logps/rejected": -250.1613311767578, "loss": 0.3157, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.0583136081695557, "rewards/margins": 1.5960966348648071, "rewards/rejected": 0.4622170329093933, "step": 610 }, { "epoch": 0.6326530612244898, "grad_norm": 0.9556177854537964, "learning_rate": 3.577362066844838e-07, "logits/chosen": -0.6297867894172668, "logits/rejected": -0.6922434568405151, "logps/chosen": -128.23373413085938, "logps/rejected": -200.610595703125, "loss": 0.3082, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.2646005153656006, "rewards/margins": 1.6751960515975952, "rewards/rejected": 0.5894044041633606, "step": 620 }, { "epoch": 0.6428571428571429, "grad_norm": 1.8962053060531616, "learning_rate": 3.4075667487415785e-07, "logits/chosen": -0.5675973892211914, "logits/rejected": -0.6235415935516357, "logps/chosen": -201.10992431640625, "logps/rejected": -309.8710632324219, "loss": 0.3583, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.077495813369751, "rewards/margins": 2.106235980987549, "rewards/rejected": -0.028740186244249344, "step": 630 }, { "epoch": 0.6530612244897959, "grad_norm": 2.3064181804656982, "learning_rate": 3.239791554022449e-07, "logits/chosen": -0.647456705570221, "logits/rejected": -0.595936119556427, "logps/chosen": -185.43714904785156, "logps/rejected": -198.25482177734375, "loss": 0.3458, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3314576148986816, "rewards/margins": 1.9370521306991577, "rewards/rejected": 0.3944053649902344, "step": 640 }, { "epoch": 0.6632653061224489, "grad_norm": 1.1449445486068726, "learning_rate": 3.0742493183550454e-07, "logits/chosen": -0.6164785623550415, "logits/rejected": -0.5928055047988892, "logps/chosen": -167.90647888183594, "logps/rejected": -219.0677490234375, "loss": 0.2919, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.3085310459136963, "rewards/margins": 1.8304294347763062, "rewards/rejected": 0.47810110449790955, "step": 650 }, { "epoch": 0.673469387755102, "grad_norm": 1.494554877281189, "learning_rate": 2.911150044727605e-07, "logits/chosen": -0.6391149163246155, "logits/rejected": -0.6734142303466797, "logps/chosen": -145.32362365722656, "logps/rejected": -197.6822967529297, "loss": 0.326, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4695799350738525, "rewards/margins": 1.6128828525543213, "rewards/rejected": 0.8566972017288208, "step": 660 }, { "epoch": 0.6836734693877551, "grad_norm": 0.9126285314559937, "learning_rate": 2.750700637044155e-07, "logits/chosen": -0.6191089749336243, "logits/rejected": -0.7010880708694458, "logps/chosen": -159.2322998046875, "logps/rejected": -246.2435760498047, "loss": 0.2968, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.389880895614624, "rewards/margins": 2.214604377746582, "rewards/rejected": 0.175276517868042, "step": 670 }, { "epoch": 0.6938775510204082, "grad_norm": 1.308219075202942, "learning_rate": 2.593104637651087e-07, "logits/chosen": -0.5017037987709045, "logits/rejected": -0.5034186244010925, "logps/chosen": -121.1073226928711, "logps/rejected": -187.53866577148438, "loss": 0.3082, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.530785083770752, "rewards/margins": 2.0506749153137207, "rewards/rejected": 0.4801098704338074, "step": 680 }, { "epoch": 0.7040816326530612, "grad_norm": 1.0809426307678223, "learning_rate": 2.438561969128114e-07, "logits/chosen": -0.590795636177063, "logits/rejected": -0.6325095891952515, "logps/chosen": -134.36793518066406, "logps/rejected": -201.61770629882812, "loss": 0.3408, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.304894208908081, "rewards/margins": 1.7218987941741943, "rewards/rejected": 0.5829951763153076, "step": 690 }, { "epoch": 0.7142857142857143, "grad_norm": 1.6444560289382935, "learning_rate": 2.2872686806712032e-07, "logits/chosen": -0.6764811277389526, "logits/rejected": -0.6604726910591125, "logps/chosen": -178.37112426757812, "logps/rejected": -279.1824645996094, "loss": 0.3053, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4546444416046143, "rewards/margins": 1.9984443187713623, "rewards/rejected": 0.4562003016471863, "step": 700 }, { "epoch": 0.7142857142857143, "eval_logits/chosen": -0.6110620498657227, "eval_logits/rejected": -0.616197943687439, "eval_logps/chosen": -202.58566284179688, "eval_logps/rejected": -266.28179931640625, "eval_loss": 0.3206620216369629, "eval_rewards/accuracies": 0.8901734352111816, "eval_rewards/chosen": 1.9933290481567383, "eval_rewards/margins": 1.7327111959457397, "eval_rewards/rejected": 0.2606178820133209, "eval_runtime": 253.1776, "eval_samples_per_second": 10.913, "eval_steps_per_second": 1.367, "step": 700 }, { "epoch": 0.7244897959183674, "grad_norm": 2.030115842819214, "learning_rate": 2.1394166993891526e-07, "logits/chosen": -0.5332853198051453, "logits/rejected": -0.6465424299240112, "logps/chosen": -162.91802978515625, "logps/rejected": -236.94656372070312, "loss": 0.3133, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.1907992362976074, "rewards/margins": 2.2500343322753906, "rewards/rejected": -0.05923491716384888, "step": 710 }, { "epoch": 0.7346938775510204, "grad_norm": 1.1324609518051147, "learning_rate": 1.995193586829387e-07, "logits/chosen": -0.658591091632843, "logits/rejected": -0.6164897084236145, "logps/chosen": -170.56906127929688, "logps/rejected": -209.83834838867188, "loss": 0.2705, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4906888008117676, "rewards/margins": 1.8935827016830444, "rewards/rejected": 0.597105860710144, "step": 720 }, { "epoch": 0.7448979591836735, "grad_norm": 1.2381037473678589, "learning_rate": 1.8547823010417873e-07, "logits/chosen": -0.5904741883277893, "logits/rejected": -0.571013331413269, "logps/chosen": -144.27389526367188, "logps/rejected": -181.3529510498047, "loss": 0.3058, "rewards/accuracies": 0.875, "rewards/chosen": 2.409674882888794, "rewards/margins": 1.6706383228302002, "rewards/rejected": 0.739036500453949, "step": 730 }, { "epoch": 0.7551020408163265, "grad_norm": 1.4913420677185059, "learning_rate": 1.7183609644824092e-07, "logits/chosen": -0.7272003293037415, "logits/rejected": -0.7424389123916626, "logps/chosen": -169.88607788085938, "logps/rejected": -232.5887908935547, "loss": 0.3306, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 2.0715317726135254, "rewards/margins": 1.9093472957611084, "rewards/rejected": 0.16218456625938416, "step": 740 }, { "epoch": 0.7653061224489796, "grad_norm": 1.1602520942687988, "learning_rate": 1.5861026380515163e-07, "logits/chosen": -0.7100823521614075, "logits/rejected": -0.6146202683448792, "logps/chosen": -166.2672576904297, "logps/rejected": -280.4190673828125, "loss": 0.2727, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.526275157928467, "rewards/margins": 2.7400615215301514, "rewards/rejected": -0.21378597617149353, "step": 750 }, { "epoch": 0.7755102040816326, "grad_norm": 1.6637645959854126, "learning_rate": 1.4581751015526033e-07, "logits/chosen": -0.6776250600814819, "logits/rejected": -0.6692344546318054, "logps/chosen": -113.79902648925781, "logps/rejected": -179.08889770507812, "loss": 0.3184, "rewards/accuracies": 0.875, "rewards/chosen": 2.358813762664795, "rewards/margins": 2.0016608238220215, "rewards/rejected": 0.3571527600288391, "step": 760 }, { "epoch": 0.7857142857142857, "grad_norm": 0.9753669500350952, "learning_rate": 1.3347406408508694e-07, "logits/chosen": -0.5141528844833374, "logits/rejected": -0.5624244809150696, "logps/chosen": -95.5937728881836, "logps/rejected": -185.74070739746094, "loss": 0.2773, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.0672779083251953, "rewards/margins": 2.953110694885254, "rewards/rejected": 0.11416707187891006, "step": 770 }, { "epoch": 0.7959183673469388, "grad_norm": 2.153414249420166, "learning_rate": 1.2159558420011905e-07, "logits/chosen": -0.7019663453102112, "logits/rejected": -0.6160884499549866, "logps/chosen": -162.27789306640625, "logps/rejected": -200.33497619628906, "loss": 0.3196, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.516826629638672, "rewards/margins": 1.9273223876953125, "rewards/rejected": 0.5895041823387146, "step": 780 }, { "epoch": 0.8061224489795918, "grad_norm": 1.181531548500061, "learning_rate": 1.1019713926067392e-07, "logits/chosen": -0.6071494817733765, "logits/rejected": -0.6017253398895264, "logps/chosen": -134.8675537109375, "logps/rejected": -199.9590301513672, "loss": 0.3118, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.1532888412475586, "rewards/margins": 1.8393771648406982, "rewards/rejected": 0.31391164660453796, "step": 790 }, { "epoch": 0.8163265306122449, "grad_norm": 1.002733826637268, "learning_rate": 9.929318906602174e-08, "logits/chosen": -0.6381187438964844, "logits/rejected": -0.6446717977523804, "logps/chosen": -127.46337890625, "logps/rejected": -191.8046112060547, "loss": 0.2655, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.273101568222046, "rewards/margins": 2.162346363067627, "rewards/rejected": 0.11075510829687119, "step": 800 }, { "epoch": 0.8163265306122449, "eval_logits/chosen": -0.602638304233551, "eval_logits/rejected": -0.6127411723136902, "eval_logps/chosen": -202.7614288330078, "eval_logps/rejected": -266.9698486328125, "eval_loss": 0.315766304731369, "eval_rewards/accuracies": 0.8815028667449951, "eval_rewards/chosen": 1.9845408201217651, "eval_rewards/margins": 1.7583247423171997, "eval_rewards/rejected": 0.22621627151966095, "eval_runtime": 252.9772, "eval_samples_per_second": 10.922, "eval_steps_per_second": 1.368, "step": 800 }, { "epoch": 0.826530612244898, "grad_norm": 1.031960129737854, "learning_rate": 8.889756611102539e-08, "logits/chosen": -0.6104857921600342, "logits/rejected": -0.6152311563491821, "logps/chosen": -177.21951293945312, "logps/rejected": -195.98184204101562, "loss": 0.2861, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.403092622756958, "rewards/margins": 2.128187656402588, "rewards/rejected": 0.27490508556365967, "step": 810 }, { "epoch": 0.8367346938775511, "grad_norm": 1.2540313005447388, "learning_rate": 7.902345803856264e-08, "logits/chosen": -0.5539565682411194, "logits/rejected": -0.6319509744644165, "logps/chosen": -136.90011596679688, "logps/rejected": -277.3617248535156, "loss": 0.2758, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.5164365768432617, "rewards/margins": 2.1890451908111572, "rewards/rejected": 0.32739144563674927, "step": 820 }, { "epoch": 0.8469387755102041, "grad_norm": 1.9569461345672607, "learning_rate": 6.968339090999186e-08, "logits/chosen": -0.7001439332962036, "logits/rejected": -0.7415611743927002, "logps/chosen": -111.2337646484375, "logps/rejected": -191.3740692138672, "loss": 0.2879, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.318671703338623, "rewards/margins": 2.1912620067596436, "rewards/rejected": 0.12740962207317352, "step": 830 }, { "epoch": 0.8571428571428571, "grad_norm": 1.5039212703704834, "learning_rate": 6.088921331488566e-08, "logits/chosen": -0.5268384218215942, "logits/rejected": -0.5624841451644897, "logps/chosen": -117.46417236328125, "logps/rejected": -222.89755249023438, "loss": 0.3116, "rewards/accuracies": 0.875, "rewards/chosen": 2.4481053352355957, "rewards/margins": 2.348428249359131, "rewards/rejected": 0.09967675060033798, "step": 840 }, { "epoch": 0.8673469387755102, "grad_norm": 0.9287813305854797, "learning_rate": 5.2652081340188506e-08, "logits/chosen": -0.5797746777534485, "logits/rejected": -0.5913048386573792, "logps/chosen": -94.5796890258789, "logps/rejected": -174.916748046875, "loss": 0.2915, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9385693073272705, "rewards/margins": 2.669956684112549, "rewards/rejected": 0.2686125636100769, "step": 850 }, { "epoch": 0.8775510204081632, "grad_norm": 1.252423644065857, "learning_rate": 4.498244441786675e-08, "logits/chosen": -0.604209303855896, "logits/rejected": -0.6816591620445251, "logps/chosen": -102.5703125, "logps/rejected": -223.8843994140625, "loss": 0.2842, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.3505682945251465, "rewards/margins": 2.114715814590454, "rewards/rejected": 0.23585255444049835, "step": 860 }, { "epoch": 0.8877551020408163, "grad_norm": 1.17081880569458, "learning_rate": 3.789003206900537e-08, "logits/chosen": -0.8669666051864624, "logits/rejected": -0.807847797870636, "logps/chosen": -248.0942840576172, "logps/rejected": -313.04302978515625, "loss": 0.317, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.950645089149475, "rewards/margins": 1.5294300317764282, "rewards/rejected": 0.4212152063846588, "step": 870 }, { "epoch": 0.8979591836734694, "grad_norm": 0.9833048582077026, "learning_rate": 3.1383841561166134e-08, "logits/chosen": -0.6270785331726074, "logits/rejected": -0.6980186104774475, "logps/chosen": -178.78965759277344, "logps/rejected": -230.45925903320312, "loss": 0.3263, "rewards/accuracies": 0.875, "rewards/chosen": 2.228593349456787, "rewards/margins": 2.083462953567505, "rewards/rejected": 0.14513027667999268, "step": 880 }, { "epoch": 0.9081632653061225, "grad_norm": 0.9479549527168274, "learning_rate": 2.547212649466568e-08, "logits/chosen": -0.7561019062995911, "logits/rejected": -0.7451134324073792, "logps/chosen": -134.2133331298828, "logps/rejected": -225.8154754638672, "loss": 0.2947, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.320394992828369, "rewards/margins": 1.929671049118042, "rewards/rejected": 0.39072394371032715, "step": 890 }, { "epoch": 0.9183673469387755, "grad_norm": 1.0661786794662476, "learning_rate": 2.0162386332251648e-08, "logits/chosen": -0.6181257963180542, "logits/rejected": -0.6277596354484558, "logps/chosen": -109.55928039550781, "logps/rejected": -188.4981689453125, "loss": 0.2943, "rewards/accuracies": 0.875, "rewards/chosen": 2.673180103302002, "rewards/margins": 2.270214319229126, "rewards/rejected": 0.40296584367752075, "step": 900 }, { "epoch": 0.9183673469387755, "eval_logits/chosen": -0.6051714420318604, "eval_logits/rejected": -0.6136297583580017, "eval_logps/chosen": -202.5171356201172, "eval_logps/rejected": -267.1376647949219, "eval_loss": 0.3143753111362457, "eval_rewards/accuracies": 0.884393036365509, "eval_rewards/chosen": 1.9967551231384277, "eval_rewards/margins": 1.7789306640625, "eval_rewards/rejected": 0.21782423555850983, "eval_runtime": 252.9824, "eval_samples_per_second": 10.922, "eval_steps_per_second": 1.368, "step": 900 }, { "epoch": 0.9285714285714286, "grad_norm": 1.4716278314590454, "learning_rate": 1.5461356885461075e-08, "logits/chosen": -0.639533519744873, "logits/rejected": -0.6445420980453491, "logps/chosen": -115.40907287597656, "logps/rejected": -226.23721313476562, "loss": 0.288, "rewards/accuracies": 0.875, "rewards/chosen": 2.6810295581817627, "rewards/margins": 2.130437135696411, "rewards/rejected": 0.5505925416946411, "step": 910 }, { "epoch": 0.9387755102040817, "grad_norm": 1.3117539882659912, "learning_rate": 1.1375001769727999e-08, "logits/chosen": -0.6442250609397888, "logits/rejected": -0.6030440926551819, "logps/chosen": -154.71214294433594, "logps/rejected": -220.8012237548828, "loss": 0.2717, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.1535449028015137, "rewards/margins": 1.923103928565979, "rewards/rejected": 0.23044133186340332, "step": 920 }, { "epoch": 0.9489795918367347, "grad_norm": 1.646316409111023, "learning_rate": 7.908504839081342e-09, "logits/chosen": -0.7338714599609375, "logits/rejected": -0.7258167266845703, "logps/chosen": -154.7112274169922, "logps/rejected": -180.6020965576172, "loss": 0.2866, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.1198439598083496, "rewards/margins": 1.7403348684310913, "rewards/rejected": 0.3795092701911926, "step": 930 }, { "epoch": 0.9591836734693877, "grad_norm": 1.2458878755569458, "learning_rate": 5.0662636100292086e-09, "logits/chosen": -0.6468678712844849, "logits/rejected": -0.5771836042404175, "logps/chosen": -185.34632873535156, "logps/rejected": -211.8957061767578, "loss": 0.2936, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.3122048377990723, "rewards/margins": 2.1624603271484375, "rewards/rejected": 0.14974427223205566, "step": 940 }, { "epoch": 0.9693877551020408, "grad_norm": 1.061425805091858, "learning_rate": 2.851883682973233e-09, "logits/chosen": -0.6436801552772522, "logits/rejected": -0.6932533979415894, "logps/chosen": -126.62858581542969, "logps/rejected": -224.2174072265625, "loss": 0.299, "rewards/accuracies": 0.875, "rewards/chosen": 2.1702253818511963, "rewards/margins": 2.06597638130188, "rewards/rejected": 0.1042490229010582, "step": 950 }, { "epoch": 0.9795918367346939, "grad_norm": 1.376145362854004, "learning_rate": 1.2681741682282754e-09, "logits/chosen": -0.6445289850234985, "logits/rejected": -0.5357323884963989, "logps/chosen": -160.71707153320312, "logps/rejected": -172.3175811767578, "loss": 0.3125, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.2926859855651855, "rewards/margins": 1.9077179431915283, "rewards/rejected": 0.384968101978302, "step": 960 }, { "epoch": 0.9897959183673469, "grad_norm": 1.1405550241470337, "learning_rate": 3.171441224514848e-10, "logits/chosen": -0.7027498483657837, "logits/rejected": -0.6814337968826294, "logps/chosen": -189.13955688476562, "logps/rejected": -220.84585571289062, "loss": 0.2744, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.1381711959838867, "rewards/margins": 1.7079559564590454, "rewards/rejected": 0.4302152693271637, "step": 970 }, { "epoch": 1.0, "grad_norm": 1.1128407716751099, "learning_rate": 0.0, "logits/chosen": -0.6814506649971008, "logits/rejected": -0.6116394996643066, "logps/chosen": -214.19149780273438, "logps/rejected": -261.8013916015625, "loss": 0.276, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.9612834453582764, "rewards/margins": 1.9090359210968018, "rewards/rejected": 0.05224757641553879, "step": 980 }, { "epoch": 1.0, "step": 980, "total_flos": 0.0, "train_loss": 0.39260031933687173, "train_runtime": 7916.79, "train_samples_per_second": 3.96, "train_steps_per_second": 0.124 } ], "logging_steps": 10, "max_steps": 980, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }