{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 100, "global_step": 1540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 6.25, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.866089105606079, "logits/rejected": -1.8704073429107666, "logps/chosen": -36.98554229736328, "logps/rejected": -33.6707763671875, "loss": 5.979, "rewards/accuracies": 0.5972222089767456, "rewards/chosen": 0.004210897721350193, "rewards/margins": 0.01235075294971466, "rewards/rejected": -0.008139855228364468, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9978349208831787, "logits/rejected": -2.0004687309265137, "logps/chosen": -29.640878677368164, "logps/rejected": -29.042272567749023, "loss": 6.3859, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.00026315837749280035, "rewards/margins": -0.003971050027757883, "rewards/rejected": 0.004234207794070244, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.9206383228302002, "logits/rejected": -1.9179503917694092, "logps/chosen": -31.404415130615234, "logps/rejected": -33.228981018066406, "loss": 6.1874, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00233639357611537, "rewards/margins": 0.004440182354301214, "rewards/rejected": -0.002103788312524557, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.017291784286499, "logits/rejected": -2.008547067642212, "logps/chosen": -32.58599090576172, "logps/rejected": -32.512664794921875, "loss": 6.3474, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.001837015151977539, "rewards/margins": -0.0018385002622380853, "rewards/rejected": 1.4854595065116882e-06, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8622735738754272, "logits/rejected": -1.8514816761016846, "logps/chosen": -33.54685592651367, "logps/rejected": -35.447818756103516, "loss": 6.2748, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0021248008124530315, "rewards/margins": 0.0012201189529150724, "rewards/rejected": 0.0009046817431226373, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9413617849349976, "logits/rejected": -1.943302869796753, "logps/chosen": -32.52573013305664, "logps/rejected": -33.22004318237305, "loss": 5.8451, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.014890777878463268, "rewards/margins": 0.024623576551675797, "rewards/rejected": -0.009732798673212528, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.0721487998962402, "logits/rejected": -2.07711124420166, "logps/chosen": -33.97162628173828, "logps/rejected": -36.63127517700195, "loss": 6.0458, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0033715851604938507, "rewards/margins": 0.015852421522140503, "rewards/rejected": -0.012480835430324078, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9334779977798462, "logits/rejected": -1.936608910560608, "logps/chosen": -34.305667877197266, "logps/rejected": -34.637855529785156, "loss": 5.6523, "rewards/accuracies": 0.625, "rewards/chosen": 0.02442934736609459, "rewards/margins": 0.033519335091114044, "rewards/rejected": -0.009089985862374306, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9406198263168335, "logits/rejected": -1.9451316595077515, "logps/chosen": -32.37959289550781, "logps/rejected": -32.313934326171875, "loss": 6.1848, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.017566002905368805, "rewards/margins": 0.008571788668632507, "rewards/rejected": 0.008994214236736298, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.0386805534362793, "logits/rejected": -2.036684513092041, "logps/chosen": -32.129981994628906, "logps/rejected": -31.296749114990234, "loss": 5.6745, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.022171173244714737, "rewards/margins": 0.028278371319174767, "rewards/rejected": -0.006107199937105179, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.2338244915008545, "eval_logits/rejected": -2.2289819717407227, "eval_logps/chosen": -34.01533508300781, "eval_logps/rejected": -37.518131256103516, "eval_loss": 6.2508440017700195, "eval_rewards/accuracies": 0.5460963845252991, "eval_rewards/chosen": 0.0038431365974247456, "eval_rewards/margins": 0.0041458746418356895, "eval_rewards/rejected": -0.0003027375496458262, "eval_runtime": 145.7849, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.9937114715576172, "logits/rejected": -1.9913352727890015, "logps/chosen": -33.10139465332031, "logps/rejected": -33.98957061767578, "loss": 6.1578, "rewards/accuracies": 0.625, "rewards/chosen": 0.028484445065259933, "rewards/margins": 0.02052464708685875, "rewards/rejected": 0.007959800772368908, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.0058560371398926, "logits/rejected": -1.9975417852401733, "logps/chosen": -32.32807159423828, "logps/rejected": -32.11988067626953, "loss": 6.0258, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.02336641401052475, "rewards/margins": 0.015041453763842583, "rewards/rejected": 0.008324960246682167, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.034005641937256, "logits/rejected": -2.026031494140625, "logps/chosen": -30.320354461669922, "logps/rejected": -32.04728698730469, "loss": 5.7894, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.031072016805410385, "rewards/margins": 0.029927905648946762, "rewards/rejected": 0.0011441137176007032, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9651100635528564, "logits/rejected": -1.9753506183624268, "logps/chosen": -31.207500457763672, "logps/rejected": -32.54130172729492, "loss": 5.4157, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.042432788759469986, "rewards/margins": 0.0437164343893528, "rewards/rejected": -0.0012836471432819963, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.8767722845077515, "logits/rejected": -1.8779083490371704, "logps/chosen": -33.90843963623047, "logps/rejected": -34.76317596435547, "loss": 5.2936, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05685017257928848, "rewards/margins": 0.058033354580402374, "rewards/rejected": -0.0011831853771582246, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.9284919500350952, "logits/rejected": -1.9250543117523193, "logps/chosen": -36.01051712036133, "logps/rejected": -32.69367980957031, "loss": 5.7928, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.032231830060482025, "rewards/margins": 0.026417434215545654, "rewards/rejected": 0.005814394913613796, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.029017925262451, "logits/rejected": -2.0216681957244873, "logps/chosen": -33.47245407104492, "logps/rejected": -31.359905242919922, "loss": 5.0567, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0650816410779953, "rewards/margins": 0.06669269502162933, "rewards/rejected": -0.0016110436990857124, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.0359654426574707, "logits/rejected": -2.041189670562744, "logps/chosen": -32.1973876953125, "logps/rejected": -32.39836883544922, "loss": 5.2683, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0694805309176445, "rewards/margins": 0.05212607979774475, "rewards/rejected": 0.017354462295770645, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.0371925830841064, "logits/rejected": -2.0344390869140625, "logps/chosen": -31.23935317993164, "logps/rejected": -31.283512115478516, "loss": 5.5783, "rewards/accuracies": 0.625, "rewards/chosen": 0.049920208752155304, "rewards/margins": 0.04297895357012749, "rewards/rejected": 0.006941256113350391, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.9059925079345703, "logits/rejected": -1.9106495380401611, "logps/chosen": -31.285137176513672, "logps/rejected": -32.777244567871094, "loss": 5.2135, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.064228855073452, "rewards/margins": 0.061491239815950394, "rewards/rejected": 0.002737621311098337, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.231491804122925, "eval_logits/rejected": -2.2266557216644287, "eval_logps/chosen": -34.00144577026367, "eval_logps/rejected": -37.5041618347168, "eval_loss": 6.2880659103393555, "eval_rewards/accuracies": 0.5402824282646179, "eval_rewards/chosen": 0.006621644366532564, "eval_rewards/margins": 0.00412956066429615, "eval_rewards/rejected": 0.0024920827709138393, "eval_runtime": 145.8338, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.017942428588867, "logits/rejected": -2.0285964012145996, "logps/chosen": -31.721837997436523, "logps/rejected": -33.87845993041992, "loss": 5.312, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0498933307826519, "rewards/margins": 0.05304562300443649, "rewards/rejected": -0.003152288496494293, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.9104692935943604, "logits/rejected": -1.9252119064331055, "logps/chosen": -29.818248748779297, "logps/rejected": -31.569311141967773, "loss": 5.1276, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.05865820124745369, "rewards/margins": 0.060884904116392136, "rewards/rejected": -0.002226702868938446, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9671802520751953, "logits/rejected": -1.9711711406707764, "logps/chosen": -33.06322479248047, "logps/rejected": -31.57196617126465, "loss": 4.9986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0708485022187233, "rewards/margins": 0.075381800532341, "rewards/rejected": -0.004533302970230579, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9659277200698853, "logits/rejected": -1.9440826177597046, "logps/chosen": -33.80937957763672, "logps/rejected": -35.049232482910156, "loss": 4.6987, "rewards/accuracies": 0.75, "rewards/chosen": 0.06887368112802505, "rewards/margins": 0.08856189250946045, "rewards/rejected": -0.019688209518790245, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.0067532062530518, "logits/rejected": -2.0034396648406982, "logps/chosen": -32.64842987060547, "logps/rejected": -36.244728088378906, "loss": 5.1569, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05532795190811157, "rewards/margins": 0.05928860232234001, "rewards/rejected": -0.003960648085922003, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8740017414093018, "logits/rejected": -1.8715832233428955, "logps/chosen": -33.948036193847656, "logps/rejected": -35.48664093017578, "loss": 5.4652, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.04807313531637192, "rewards/margins": 0.0444767102599144, "rewards/rejected": 0.0035964243579655886, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8596279621124268, "logits/rejected": -1.8571679592132568, "logps/chosen": -34.14976119995117, "logps/rejected": -31.774593353271484, "loss": 5.3924, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.049567773938179016, "rewards/margins": 0.05022105574607849, "rewards/rejected": -0.0006532802362926304, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.9632982015609741, "logits/rejected": -1.9527626037597656, "logps/chosen": -34.97089767456055, "logps/rejected": -31.810266494750977, "loss": 4.9049, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.07714872062206268, "rewards/margins": 0.07100304216146469, "rewards/rejected": 0.0061456747353076935, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.0585777759552, "logits/rejected": -2.043663740158081, "logps/chosen": -30.67743492126465, "logps/rejected": -32.60033416748047, "loss": 5.7039, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04933839291334152, "rewards/margins": 0.039037786424160004, "rewards/rejected": 0.01030060462653637, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.9290869235992432, "logits/rejected": -1.9265540838241577, "logps/chosen": -32.41482925415039, "logps/rejected": -30.851070404052734, "loss": 4.3883, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1024569720029831, "rewards/margins": 0.10930945724248886, "rewards/rejected": -0.006852488033473492, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2290894985198975, "eval_logits/rejected": -2.2242588996887207, "eval_logps/chosen": -34.0192756652832, "eval_logps/rejected": -37.53245544433594, "eval_loss": 6.2381510734558105, "eval_rewards/accuracies": 0.5166113376617432, "eval_rewards/chosen": 0.0030557620339095592, "eval_rewards/margins": 0.0062218476086854935, "eval_rewards/rejected": -0.003166085807606578, "eval_runtime": 145.7657, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 4.84533120650964e-06, "logits/chosen": -1.9132274389266968, "logits/rejected": -1.9099791049957275, "logps/chosen": -31.283920288085938, "logps/rejected": -33.752479553222656, "loss": 5.1369, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.06388597935438156, "rewards/margins": 0.06595198810100555, "rewards/rejected": -0.002066010609269142, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.825108134172131e-06, "logits/chosen": -1.961586356163025, "logits/rejected": -1.949385404586792, "logps/chosen": -34.29393768310547, "logps/rejected": -33.63207244873047, "loss": 4.8373, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0604279451072216, "rewards/margins": 0.07907330244779587, "rewards/rejected": -0.01864534802734852, "step": 320 }, { "epoch": 0.86, "learning_rate": 4.80369052967602e-06, "logits/chosen": -1.9958088397979736, "logits/rejected": -1.9943931102752686, "logps/chosen": -33.11198043823242, "logps/rejected": -32.48575973510742, "loss": 4.8522, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.07476169615983963, "rewards/margins": 0.07599709928035736, "rewards/rejected": -0.0012354092905297875, "step": 330 }, { "epoch": 0.88, "learning_rate": 4.781089396387968e-06, "logits/chosen": -2.0807433128356934, "logits/rejected": -2.0650699138641357, "logps/chosen": -33.721527099609375, "logps/rejected": -33.0389518737793, "loss": 4.8887, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.09157303720712662, "rewards/margins": 0.07575628161430359, "rewards/rejected": 0.015816759318113327, "step": 340 }, { "epoch": 0.91, "learning_rate": 4.757316345716554e-06, "logits/chosen": -1.9550502300262451, "logits/rejected": -1.9541908502578735, "logps/chosen": -32.77659606933594, "logps/rejected": -32.46695327758789, "loss": 4.6833, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.10109980404376984, "rewards/margins": 0.10146143287420273, "rewards/rejected": -0.0003616276371758431, "step": 350 }, { "epoch": 0.94, "learning_rate": 4.73238359114687e-06, "logits/chosen": -1.9093825817108154, "logits/rejected": -1.919638991355896, "logps/chosen": -31.725839614868164, "logps/rejected": -35.360939025878906, "loss": 4.4471, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.09577047824859619, "rewards/margins": 0.11020632833242416, "rewards/rejected": -0.014435847289860249, "step": 360 }, { "epoch": 0.96, "learning_rate": 4.706303941965804e-06, "logits/chosen": -2.0445544719696045, "logits/rejected": -2.038135051727295, "logps/chosen": -33.22490692138672, "logps/rejected": -29.234344482421875, "loss": 4.6128, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.08541258424520493, "rewards/margins": 0.08699630200862885, "rewards/rejected": -0.0015837062383070588, "step": 370 }, { "epoch": 0.99, "learning_rate": 4.679090796681225e-06, "logits/chosen": -1.9023174047470093, "logits/rejected": -1.9045337438583374, "logps/chosen": -33.59349060058594, "logps/rejected": -30.87937355041504, "loss": 4.2966, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.12911152839660645, "rewards/margins": 0.13046079874038696, "rewards/rejected": -0.0013492825673893094, "step": 380 }, { "epoch": 1.01, "learning_rate": 4.650758136138454e-06, "logits/chosen": -1.9307386875152588, "logits/rejected": -1.9295110702514648, "logps/chosen": -33.762290954589844, "logps/rejected": -35.92414474487305, "loss": 3.4964, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.1244986280798912, "rewards/margins": 0.1781271994113922, "rewards/rejected": -0.05362857133150101, "step": 390 }, { "epoch": 1.04, "learning_rate": 4.621320516337559e-06, "logits/chosen": -1.8655402660369873, "logits/rejected": -1.8571646213531494, "logps/chosen": -31.09210205078125, "logps/rejected": -36.32701873779297, "loss": 2.9753, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.13637451827526093, "rewards/margins": 0.2136588841676712, "rewards/rejected": -0.07728435099124908, "step": 400 }, { "epoch": 1.04, "eval_logits/chosen": -2.2141356468200684, "eval_logits/rejected": -2.209277629852295, "eval_logps/chosen": -33.99992752075195, "eval_logps/rejected": -37.56977081298828, "eval_loss": 6.036898612976074, "eval_rewards/accuracies": 0.6034052968025208, "eval_rewards/chosen": 0.006925266236066818, "eval_rewards/margins": 0.01755475252866745, "eval_rewards/rejected": -0.010629487223923206, "eval_runtime": 146.4885, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.294, "step": 400 }, { "epoch": 1.06, "learning_rate": 4.590793060955158e-06, "logits/chosen": -2.038944721221924, "logits/rejected": -2.0418522357940674, "logps/chosen": -32.3203010559082, "logps/rejected": -35.14866638183594, "loss": 2.7334, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.1467280387878418, "rewards/margins": 0.2178923636674881, "rewards/rejected": -0.07116430997848511, "step": 410 }, { "epoch": 1.09, "learning_rate": 4.559191453574582e-06, "logits/chosen": -1.8793596029281616, "logits/rejected": -1.877995252609253, "logps/chosen": -28.413448333740234, "logps/rejected": -32.73353958129883, "loss": 2.6582, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1308210790157318, "rewards/margins": 0.21399688720703125, "rewards/rejected": -0.08317580074071884, "step": 420 }, { "epoch": 1.12, "learning_rate": 4.52653192962838e-06, "logits/chosen": -1.8368927240371704, "logits/rejected": -1.8299328088760376, "logps/chosen": -33.15730667114258, "logps/rejected": -34.441612243652344, "loss": 2.4992, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.18335571885108948, "rewards/margins": 0.2226235568523407, "rewards/rejected": -0.039267830550670624, "step": 430 }, { "epoch": 1.14, "learning_rate": 4.492831268057307e-06, "logits/chosen": -2.0078189373016357, "logits/rejected": -2.0026912689208984, "logps/chosen": -30.97007179260254, "logps/rejected": -32.39707565307617, "loss": 2.5198, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.16264474391937256, "rewards/margins": 0.24853797256946564, "rewards/rejected": -0.08589322865009308, "step": 440 }, { "epoch": 1.17, "learning_rate": 4.458106782690094e-06, "logits/chosen": -1.8930528163909912, "logits/rejected": -1.897268295288086, "logps/chosen": -33.69664764404297, "logps/rejected": -33.11066818237305, "loss": 2.0077, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.16384032368659973, "rewards/margins": 0.26824674010276794, "rewards/rejected": -0.1044064313173294, "step": 450 }, { "epoch": 1.19, "learning_rate": 4.422376313348405e-06, "logits/chosen": -1.9001489877700806, "logits/rejected": -1.8944313526153564, "logps/chosen": -34.53886032104492, "logps/rejected": -35.5705451965332, "loss": 2.054, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.17613370716571808, "rewards/margins": 0.30042320489883423, "rewards/rejected": -0.12428952753543854, "step": 460 }, { "epoch": 1.22, "learning_rate": 4.3856582166815696e-06, "logits/chosen": -1.929231882095337, "logits/rejected": -1.9289064407348633, "logps/chosen": -33.301231384277344, "logps/rejected": -34.58076858520508, "loss": 2.5855, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.17341957986354828, "rewards/margins": 0.2524762451648712, "rewards/rejected": -0.07905664294958115, "step": 470 }, { "epoch": 1.25, "learning_rate": 4.347971356735789e-06, "logits/chosen": -1.9759957790374756, "logits/rejected": -1.9571311473846436, "logps/chosen": -33.301368713378906, "logps/rejected": -33.618614196777344, "loss": 2.3815, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.171879380941391, "rewards/margins": 0.28160572052001953, "rewards/rejected": -0.10972632467746735, "step": 480 }, { "epoch": 1.27, "learning_rate": 4.309335095262675e-06, "logits/chosen": -1.9400146007537842, "logits/rejected": -1.9393583536148071, "logps/chosen": -30.776290893554688, "logps/rejected": -31.59432029724121, "loss": 2.4535, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.16700208187103271, "rewards/margins": 0.2366572916507721, "rewards/rejected": -0.06965517997741699, "step": 490 }, { "epoch": 1.3, "learning_rate": 4.269769281772082e-06, "logits/chosen": -1.9039013385772705, "logits/rejected": -1.8969634771347046, "logps/chosen": -31.851303100585938, "logps/rejected": -35.185157775878906, "loss": 2.4163, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.16376493871212006, "rewards/margins": 0.2754477858543396, "rewards/rejected": -0.11168281733989716, "step": 500 }, { "epoch": 1.3, "eval_logits/chosen": -2.190660238265991, "eval_logits/rejected": -2.185832977294922, "eval_logps/chosen": -34.10919189453125, "eval_logps/rejected": -37.70389938354492, "eval_loss": 6.067742347717285, "eval_rewards/accuracies": 0.5801494717597961, "eval_rewards/chosen": -0.014928131364285946, "eval_rewards/margins": 0.02252793498337269, "eval_rewards/rejected": -0.03745606541633606, "eval_runtime": 145.8919, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 500 }, { "epoch": 1.32, "learning_rate": 4.22929424333435e-06, "logits/chosen": -1.9019960165023804, "logits/rejected": -1.9057096242904663, "logps/chosen": -28.43143081665039, "logps/rejected": -33.44213104248047, "loss": 2.4231, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.1486787497997284, "rewards/margins": 0.2652469575405121, "rewards/rejected": -0.11656824499368668, "step": 510 }, { "epoch": 1.35, "learning_rate": 4.1879307741372085e-06, "logits/chosen": -1.9110679626464844, "logits/rejected": -1.921726942062378, "logps/chosen": -32.49781036376953, "logps/rejected": -31.106653213500977, "loss": 2.6543, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.15290473401546478, "rewards/margins": 0.2438981980085373, "rewards/rejected": -0.0909934788942337, "step": 520 }, { "epoch": 1.38, "learning_rate": 4.145700124802693e-06, "logits/chosen": -1.8584115505218506, "logits/rejected": -1.8560092449188232, "logps/chosen": -30.80838394165039, "logps/rejected": -30.5560302734375, "loss": 2.4362, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.17173974215984344, "rewards/margins": 0.2475927770137787, "rewards/rejected": -0.07585303485393524, "step": 530 }, { "epoch": 1.4, "learning_rate": 4.102623991469562e-06, "logits/chosen": -1.9417930841445923, "logits/rejected": -1.9347444772720337, "logps/chosen": -33.26526641845703, "logps/rejected": -33.53062057495117, "loss": 2.2112, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.19351224601268768, "rewards/margins": 0.2961391806602478, "rewards/rejected": -0.10262690484523773, "step": 540 }, { "epoch": 1.43, "learning_rate": 4.058724504646834e-06, "logits/chosen": -1.9108800888061523, "logits/rejected": -1.9172462224960327, "logps/chosen": -30.884307861328125, "logps/rejected": -33.133323669433594, "loss": 2.3198, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.17377518117427826, "rewards/margins": 0.248693585395813, "rewards/rejected": -0.07491841167211533, "step": 550 }, { "epoch": 1.45, "learning_rate": 4.014024217844167e-06, "logits/chosen": -1.9825620651245117, "logits/rejected": -1.9594837427139282, "logps/chosen": -30.586624145507812, "logps/rejected": -33.408687591552734, "loss": 2.6429, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.17018920183181763, "rewards/margins": 0.25271496176719666, "rewards/rejected": -0.08252575248479843, "step": 560 }, { "epoch": 1.48, "learning_rate": 3.968546095984911e-06, "logits/chosen": -1.9179880619049072, "logits/rejected": -1.9130771160125732, "logps/chosen": -31.49106216430664, "logps/rejected": -32.453086853027344, "loss": 2.7291, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.18513426184654236, "rewards/margins": 0.25156813859939575, "rewards/rejected": -0.0664338767528534, "step": 570 }, { "epoch": 1.51, "learning_rate": 3.922313503607806e-06, "logits/chosen": -1.9604476690292358, "logits/rejected": -1.9623527526855469, "logps/chosen": -33.557342529296875, "logps/rejected": -35.4670524597168, "loss": 2.2236, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1636902391910553, "rewards/margins": 0.29691803455352783, "rewards/rejected": -0.13322779536247253, "step": 580 }, { "epoch": 1.53, "learning_rate": 3.875350192863368e-06, "logits/chosen": -1.9461857080459595, "logits/rejected": -1.9455292224884033, "logps/chosen": -29.717754364013672, "logps/rejected": -31.953426361083984, "loss": 2.4743, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.17486092448234558, "rewards/margins": 0.2732195258140564, "rewards/rejected": -0.09835861623287201, "step": 590 }, { "epoch": 1.56, "learning_rate": 3.8276802913111436e-06, "logits/chosen": -1.9628851413726807, "logits/rejected": -1.960695505142212, "logps/chosen": -32.0013542175293, "logps/rejected": -32.742637634277344, "loss": 2.52, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19938594102859497, "rewards/margins": 0.2879222333431244, "rewards/rejected": -0.08853629976511002, "step": 600 }, { "epoch": 1.56, "eval_logits/chosen": -2.1998937129974365, "eval_logits/rejected": -2.195075750350952, "eval_logps/chosen": -34.08315658569336, "eval_logps/rejected": -37.69052505493164, "eval_loss": 5.999026775360107, "eval_rewards/accuracies": 0.5747508406639099, "eval_rewards/chosen": -0.009720847941935062, "eval_rewards/margins": 0.025059644132852554, "eval_rewards/rejected": -0.03478049114346504, "eval_runtime": 145.9294, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 600 }, { "epoch": 1.58, "learning_rate": 3.7793282895240927e-06, "logits/chosen": -2.0093092918395996, "logits/rejected": -2.0159478187561035, "logps/chosen": -31.2258358001709, "logps/rejected": -32.541290283203125, "loss": 2.1606, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.20230896770954132, "rewards/margins": 0.29792895913124084, "rewards/rejected": -0.09561996161937714, "step": 610 }, { "epoch": 1.61, "learning_rate": 3.730319028506478e-06, "logits/chosen": -1.9672517776489258, "logits/rejected": -1.9648239612579346, "logps/chosen": -33.72132110595703, "logps/rejected": -31.379138946533203, "loss": 2.198, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.20691220462322235, "rewards/margins": 0.2856389880180359, "rewards/rejected": -0.07872680574655533, "step": 620 }, { "epoch": 1.64, "learning_rate": 3.6806776869317074e-06, "logits/chosen": -1.912590742111206, "logits/rejected": -1.9057750701904297, "logps/chosen": -34.36874008178711, "logps/rejected": -32.927589416503906, "loss": 2.2377, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2190968096256256, "rewards/margins": 0.3331736922264099, "rewards/rejected": -0.11407686769962311, "step": 630 }, { "epoch": 1.66, "learning_rate": 3.6304297682067146e-06, "logits/chosen": -1.9224069118499756, "logits/rejected": -1.9286394119262695, "logps/chosen": -33.01720428466797, "logps/rejected": -33.65998840332031, "loss": 2.4802, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.19241946935653687, "rewards/margins": 0.2763652503490448, "rewards/rejected": -0.08394578844308853, "step": 640 }, { "epoch": 1.69, "learning_rate": 3.579601087369492e-06, "logits/chosen": -2.0105245113372803, "logits/rejected": -2.024742603302002, "logps/chosen": -30.962276458740234, "logps/rejected": -32.379337310791016, "loss": 2.7242, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.17803707718849182, "rewards/margins": 0.25436651706695557, "rewards/rejected": -0.07632941007614136, "step": 650 }, { "epoch": 1.71, "learning_rate": 3.5282177578265295e-06, "logits/chosen": -1.8864635229110718, "logits/rejected": -1.8834316730499268, "logps/chosen": -32.803226470947266, "logps/rejected": -35.24103546142578, "loss": 2.0371, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.22394683957099915, "rewards/margins": 0.3321036398410797, "rewards/rejected": -0.10815682262182236, "step": 660 }, { "epoch": 1.74, "learning_rate": 3.476306177936961e-06, "logits/chosen": -1.9782884120941162, "logits/rejected": -1.9784494638442993, "logps/chosen": -30.333904266357422, "logps/rejected": -34.28630828857422, "loss": 2.1727, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.19033712148666382, "rewards/margins": 0.28232187032699585, "rewards/rejected": -0.09198474884033203, "step": 670 }, { "epoch": 1.77, "learning_rate": 3.423893017450324e-06, "logits/chosen": -1.9290120601654053, "logits/rejected": -1.9259191751480103, "logps/chosen": -29.953277587890625, "logps/rejected": -33.294857025146484, "loss": 2.5836, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.18381401896476746, "rewards/margins": 0.27922549843788147, "rewards/rejected": -0.09541147202253342, "step": 680 }, { "epoch": 1.79, "learning_rate": 3.3710052038048794e-06, "logits/chosen": -1.9663722515106201, "logits/rejected": -1.9663045406341553, "logps/chosen": -29.003856658935547, "logps/rejected": -31.04767417907715, "loss": 2.0189, "rewards/accuracies": 0.9375, "rewards/chosen": 0.23184648156166077, "rewards/margins": 0.30728113651275635, "rewards/rejected": -0.07543464004993439, "step": 690 }, { "epoch": 1.82, "learning_rate": 3.3176699082935546e-06, "logits/chosen": -1.8771839141845703, "logits/rejected": -1.8805001974105835, "logps/chosen": -33.498069763183594, "logps/rejected": -31.84822654724121, "loss": 2.9186, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.19608056545257568, "rewards/margins": 0.272787868976593, "rewards/rejected": -0.07670729607343674, "step": 700 }, { "epoch": 1.82, "eval_logits/chosen": -2.2096614837646484, "eval_logits/rejected": -2.2048423290252686, "eval_logps/chosen": -34.122745513916016, "eval_logps/rejected": -37.69883728027344, "eval_loss": 6.169560432434082, "eval_rewards/accuracies": 0.5598006844520569, "eval_rewards/chosen": -0.01763882488012314, "eval_rewards/margins": 0.01880452036857605, "eval_rewards/rejected": -0.03644334897398949, "eval_runtime": 145.9673, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 700 }, { "epoch": 1.84, "learning_rate": 3.2639145321045933e-06, "logits/chosen": -1.9619114398956299, "logits/rejected": -1.9527734518051147, "logps/chosen": -35.47718811035156, "logps/rejected": -32.34703826904297, "loss": 2.0062, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.20696695148944855, "rewards/margins": 0.2952113747596741, "rewards/rejected": -0.08824445307254791, "step": 710 }, { "epoch": 1.87, "learning_rate": 3.2097666922441107e-06, "logits/chosen": -1.9771125316619873, "logits/rejected": -1.9781534671783447, "logps/chosen": -35.389427185058594, "logps/rejected": -33.445552825927734, "loss": 2.2199, "rewards/accuracies": 0.9375, "rewards/chosen": 0.20919255912303925, "rewards/margins": 0.28309938311576843, "rewards/rejected": -0.07390682399272919, "step": 720 }, { "epoch": 1.9, "learning_rate": 3.1552542073477554e-06, "logits/chosen": -1.9929240942001343, "logits/rejected": -1.9904762506484985, "logps/chosen": -31.665563583374023, "logps/rejected": -33.510894775390625, "loss": 2.1505, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17955288290977478, "rewards/margins": 0.28235217928886414, "rewards/rejected": -0.10279928147792816, "step": 730 }, { "epoch": 1.92, "learning_rate": 3.100405083388799e-06, "logits/chosen": -1.9763247966766357, "logits/rejected": -1.981488823890686, "logps/chosen": -30.730270385742188, "logps/rejected": -33.69140625, "loss": 1.8054, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.21109160780906677, "rewards/margins": 0.3184094727039337, "rewards/rejected": -0.10731786489486694, "step": 740 }, { "epoch": 1.95, "learning_rate": 3.0452474992899645e-06, "logits/chosen": -1.9217841625213623, "logits/rejected": -1.9206024408340454, "logps/chosen": -32.20301818847656, "logps/rejected": -35.21897888183594, "loss": 2.2296, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.18439263105392456, "rewards/margins": 0.29099735617637634, "rewards/rejected": -0.10660471022129059, "step": 750 }, { "epoch": 1.97, "learning_rate": 2.989809792446417e-06, "logits/chosen": -1.801102638244629, "logits/rejected": -1.7960617542266846, "logps/chosen": -35.06943130493164, "logps/rejected": -35.63484573364258, "loss": 2.4449, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.1858016401529312, "rewards/margins": 0.27108871936798096, "rewards/rejected": -0.08528711646795273, "step": 760 }, { "epoch": 2.0, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -1.941235899925232, "logits/rejected": -1.9449260234832764, "logps/chosen": -34.17765808105469, "logps/rejected": -33.9752311706543, "loss": 2.3588, "rewards/accuracies": 0.8833333849906921, "rewards/chosen": 0.2072003185749054, "rewards/margins": 0.2832789123058319, "rewards/rejected": -0.07607860863208771, "step": 770 }, { "epoch": 2.03, "learning_rate": 2.878208065043501e-06, "logits/chosen": -1.884577751159668, "logits/rejected": -1.8827228546142578, "logps/chosen": -32.60283279418945, "logps/rejected": -35.19242858886719, "loss": 0.9771, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.25975239276885986, "rewards/margins": 0.41730666160583496, "rewards/rejected": -0.1575542539358139, "step": 780 }, { "epoch": 2.05, "learning_rate": 2.8221013802485974e-06, "logits/chosen": -1.933282494544983, "logits/rejected": -1.931807279586792, "logps/chosen": -32.13750076293945, "logps/rejected": -33.67426300048828, "loss": 1.1284, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24764792621135712, "rewards/margins": 0.3651863932609558, "rewards/rejected": -0.11753849685192108, "step": 790 }, { "epoch": 2.08, "learning_rate": 2.76582921478147e-06, "logits/chosen": -1.8644397258758545, "logits/rejected": -1.8584182262420654, "logps/chosen": -33.09177780151367, "logps/rejected": -32.095069885253906, "loss": 1.2867, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2582601010799408, "rewards/margins": 0.38442230224609375, "rewards/rejected": -0.12616217136383057, "step": 800 }, { "epoch": 2.08, "eval_logits/chosen": -2.2108798027038574, "eval_logits/rejected": -2.2060329914093018, "eval_logps/chosen": -34.095699310302734, "eval_logps/rejected": -37.697025299072266, "eval_loss": 6.059365272521973, "eval_rewards/accuracies": 0.5776578187942505, "eval_rewards/chosen": -0.012229476124048233, "eval_rewards/margins": 0.023852398619055748, "eval_rewards/rejected": -0.03608187288045883, "eval_runtime": 145.9095, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 800 }, { "epoch": 2.1, "learning_rate": 2.7094204786572254e-06, "logits/chosen": -1.965205430984497, "logits/rejected": -1.972726821899414, "logps/chosen": -30.922103881835938, "logps/rejected": -34.932464599609375, "loss": 1.1344, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.24123027920722961, "rewards/margins": 0.37655484676361084, "rewards/rejected": -0.13532456755638123, "step": 810 }, { "epoch": 2.13, "learning_rate": 2.6529041520546072e-06, "logits/chosen": -1.9390350580215454, "logits/rejected": -1.9418513774871826, "logps/chosen": -31.403156280517578, "logps/rejected": -33.695457458496094, "loss": 1.4773, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.23358435928821564, "rewards/margins": 0.32671505212783813, "rewards/rejected": -0.09313070774078369, "step": 820 }, { "epoch": 2.16, "learning_rate": 2.5963092704273302e-06, "logits/chosen": -1.8422248363494873, "logits/rejected": -1.8463550806045532, "logps/chosen": -31.341222763061523, "logps/rejected": -35.71354675292969, "loss": 0.9876, "rewards/accuracies": 1.0, "rewards/chosen": 0.2500464916229248, "rewards/margins": 0.4107195734977722, "rewards/rejected": -0.16067302227020264, "step": 830 }, { "epoch": 2.18, "learning_rate": 2.53966490958702e-06, "logits/chosen": -1.904828429222107, "logits/rejected": -1.9009695053100586, "logps/chosen": -31.746740341186523, "logps/rejected": -33.4873161315918, "loss": 1.6306, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2443816363811493, "rewards/margins": 0.3429158329963684, "rewards/rejected": -0.09853418916463852, "step": 840 }, { "epoch": 2.21, "learning_rate": 2.4830001707654135e-06, "logits/chosen": -1.98971426486969, "logits/rejected": -1.991711974143982, "logps/chosen": -31.50253677368164, "logps/rejected": -36.3741569519043, "loss": 0.9313, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2545596957206726, "rewards/margins": 0.39700326323509216, "rewards/rejected": -0.14244356751441956, "step": 850 }, { "epoch": 2.23, "learning_rate": 2.4263441656635054e-06, "logits/chosen": -1.7937977313995361, "logits/rejected": -1.7878059148788452, "logps/chosen": -35.0206184387207, "logps/rejected": -33.611572265625, "loss": 1.1249, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2522119879722595, "rewards/margins": 0.39614588022232056, "rewards/rejected": -0.14393387734889984, "step": 860 }, { "epoch": 2.26, "learning_rate": 2.3697260014953107e-06, "logits/chosen": -1.8565397262573242, "logits/rejected": -1.85663640499115, "logps/chosen": -34.53380584716797, "logps/rejected": -35.518733978271484, "loss": 1.1272, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2618001401424408, "rewards/margins": 0.396293580532074, "rewards/rejected": -0.13449345529079437, "step": 870 }, { "epoch": 2.29, "learning_rate": 2.3131747660339396e-06, "logits/chosen": -1.9093639850616455, "logits/rejected": -1.897884726524353, "logps/chosen": -32.93827819824219, "logps/rejected": -33.690372467041016, "loss": 0.865, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.23366093635559082, "rewards/margins": 0.3873291015625, "rewards/rejected": -0.15366819500923157, "step": 880 }, { "epoch": 2.31, "learning_rate": 2.256719512667651e-06, "logits/chosen": -2.010274648666382, "logits/rejected": -2.014864444732666, "logps/chosen": -32.14999771118164, "logps/rejected": -33.120323181152344, "loss": 1.1326, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.23598790168762207, "rewards/margins": 0.3727555572986603, "rewards/rejected": -0.1367676854133606, "step": 890 }, { "epoch": 2.34, "learning_rate": 2.2003892454735786e-06, "logits/chosen": -1.9262173175811768, "logits/rejected": -1.9189611673355103, "logps/chosen": -33.60108184814453, "logps/rejected": -33.117454528808594, "loss": 0.8862, "rewards/accuracies": 1.0, "rewards/chosen": 0.2505994439125061, "rewards/margins": 0.39366382360458374, "rewards/rejected": -0.14306436479091644, "step": 900 }, { "epoch": 2.34, "eval_logits/chosen": -2.207599639892578, "eval_logits/rejected": -2.202746629714966, "eval_logps/chosen": -34.117210388183594, "eval_logps/rejected": -37.7178955078125, "eval_loss": 6.062132835388184, "eval_rewards/accuracies": 0.5917773842811584, "eval_rewards/chosen": -0.01653219200670719, "eval_rewards/margins": 0.02372238226234913, "eval_rewards/rejected": -0.04025457799434662, "eval_runtime": 145.9709, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 900 }, { "epoch": 2.36, "learning_rate": 2.1442129043167877e-06, "logits/chosen": -1.9199193716049194, "logits/rejected": -1.9204641580581665, "logps/chosen": -30.064319610595703, "logps/rejected": -35.48960494995117, "loss": 1.2088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.22056813538074493, "rewards/margins": 0.38055095076560974, "rewards/rejected": -0.15998278558254242, "step": 910 }, { "epoch": 2.39, "learning_rate": 2.088219349982323e-06, "logits/chosen": -1.8791996240615845, "logits/rejected": -1.871145248413086, "logps/chosen": -30.649662017822266, "logps/rejected": -34.292842864990234, "loss": 1.1982, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.23785707354545593, "rewards/margins": 0.3821527063846588, "rewards/rejected": -0.14429563283920288, "step": 920 }, { "epoch": 2.42, "learning_rate": 2.0324373493478803e-06, "logits/chosen": -2.044569969177246, "logits/rejected": -2.045048236846924, "logps/chosen": -28.890884399414062, "logps/rejected": -33.5275993347168, "loss": 1.6139, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.22123007476329803, "rewards/margins": 0.3537783920764923, "rewards/rejected": -0.13254830241203308, "step": 930 }, { "epoch": 2.44, "learning_rate": 1.976895560604729e-06, "logits/chosen": -1.9257404804229736, "logits/rejected": -1.935403823852539, "logps/chosen": -33.113128662109375, "logps/rejected": -33.752899169921875, "loss": 1.3358, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2623876631259918, "rewards/margins": 0.415038526058197, "rewards/rejected": -0.15265092253684998, "step": 940 }, { "epoch": 2.47, "learning_rate": 1.921622518534466e-06, "logits/chosen": -1.9671064615249634, "logits/rejected": -1.9709781408309937, "logps/chosen": -29.538061141967773, "logps/rejected": -32.42096710205078, "loss": 1.1713, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.22193200886249542, "rewards/margins": 0.3589089512825012, "rewards/rejected": -0.13697698712348938, "step": 950 }, { "epoch": 2.49, "learning_rate": 1.8666466198491794e-06, "logits/chosen": -1.9648845195770264, "logits/rejected": -1.96088445186615, "logps/chosen": -32.74964141845703, "logps/rejected": -34.44969940185547, "loss": 1.3059, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.26511532068252563, "rewards/margins": 0.3737781345844269, "rewards/rejected": -0.10866282880306244, "step": 960 }, { "epoch": 2.52, "learning_rate": 1.8119961086025376e-06, "logits/chosen": -1.8809791803359985, "logits/rejected": -1.883603811264038, "logps/chosen": -31.52639389038086, "logps/rejected": -35.896881103515625, "loss": 0.8379, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2572704553604126, "rewards/margins": 0.40032801032066345, "rewards/rejected": -0.14305754005908966, "step": 970 }, { "epoch": 2.55, "learning_rate": 1.7576990616793139e-06, "logits/chosen": -1.9164276123046875, "logits/rejected": -1.9099451303482056, "logps/chosen": -34.2746696472168, "logps/rejected": -36.992088317871094, "loss": 1.5424, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.2260798215866089, "rewards/margins": 0.3721070885658264, "rewards/rejected": -0.14602726697921753, "step": 980 }, { "epoch": 2.57, "learning_rate": 1.7037833743707892e-06, "logits/chosen": -1.890472412109375, "logits/rejected": -1.8852546215057373, "logps/chosen": -29.865234375, "logps/rejected": -36.65885543823242, "loss": 1.2374, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.22754326462745667, "rewards/margins": 0.3613375723361969, "rewards/rejected": -0.13379430770874023, "step": 990 }, { "epoch": 2.6, "learning_rate": 1.6502767460434588e-06, "logits/chosen": -1.8666467666625977, "logits/rejected": -1.8558076620101929, "logps/chosen": -30.546798706054688, "logps/rejected": -30.192230224609375, "loss": 1.2395, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2403787076473236, "rewards/margins": 0.35887858271598816, "rewards/rejected": -0.11849988996982574, "step": 1000 }, { "epoch": 2.6, "eval_logits/chosen": -2.2049996852874756, "eval_logits/rejected": -2.2001616954803467, "eval_logps/chosen": -34.11613845825195, "eval_logps/rejected": -37.72568130493164, "eval_loss": 6.000017166137695, "eval_rewards/accuracies": 0.5863787531852722, "eval_rewards/chosen": -0.016317714005708694, "eval_rewards/margins": 0.025494439527392387, "eval_rewards/rejected": -0.04181215539574623, "eval_runtime": 145.9275, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 1000 }, { "epoch": 2.62, "learning_rate": 1.5972066659083796e-06, "logits/chosen": -1.9777822494506836, "logits/rejected": -1.977115273475647, "logps/chosen": -30.621231079101562, "logps/rejected": -30.826059341430664, "loss": 1.3924, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2405930459499359, "rewards/margins": 0.35410913825035095, "rewards/rejected": -0.11351609230041504, "step": 1010 }, { "epoch": 2.65, "learning_rate": 1.5446003988985041e-06, "logits/chosen": -2.015455961227417, "logits/rejected": -2.0161612033843994, "logps/chosen": -30.704198837280273, "logps/rejected": -31.847875595092773, "loss": 1.1721, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.24787600338459015, "rewards/margins": 0.3842898905277252, "rewards/rejected": -0.1364138424396515, "step": 1020 }, { "epoch": 2.68, "learning_rate": 1.4924849716612211e-06, "logits/chosen": -1.9872283935546875, "logits/rejected": -1.9903920888900757, "logps/chosen": -31.16965675354004, "logps/rejected": -28.573312759399414, "loss": 1.2521, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2220829278230667, "rewards/margins": 0.3400026857852936, "rewards/rejected": -0.11791972070932388, "step": 1030 }, { "epoch": 2.7, "learning_rate": 1.440887158673332e-06, "logits/chosen": -2.000020980834961, "logits/rejected": -1.9921737909317017, "logps/chosen": -29.917871475219727, "logps/rejected": -34.14818572998047, "loss": 1.036, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.25200483202934265, "rewards/margins": 0.38159602880477905, "rewards/rejected": -0.12959113717079163, "step": 1040 }, { "epoch": 2.73, "learning_rate": 1.3898334684855647e-06, "logits/chosen": -1.9298603534698486, "logits/rejected": -1.940890908241272, "logps/chosen": -32.330406188964844, "logps/rejected": -32.83357238769531, "loss": 1.158, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2445407211780548, "rewards/margins": 0.3622528612613678, "rewards/rejected": -0.1177120953798294, "step": 1050 }, { "epoch": 2.75, "learning_rate": 1.3393501301037245e-06, "logits/chosen": -2.009378433227539, "logits/rejected": -1.9999854564666748, "logps/chosen": -32.38091278076172, "logps/rejected": -37.074951171875, "loss": 1.4986, "rewards/accuracies": 0.9375, "rewards/chosen": 0.22397327423095703, "rewards/margins": 0.37268632650375366, "rewards/rejected": -0.1487130969762802, "step": 1060 }, { "epoch": 2.78, "learning_rate": 1.2894630795134454e-06, "logits/chosen": -1.9161951541900635, "logits/rejected": -1.9179637432098389, "logps/chosen": -34.65896987915039, "logps/rejected": -33.29723358154297, "loss": 1.1095, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2660633623600006, "rewards/margins": 0.3859363794326782, "rewards/rejected": -0.11987300962209702, "step": 1070 }, { "epoch": 2.81, "learning_rate": 1.2401979463554984e-06, "logits/chosen": -2.0510499477386475, "logits/rejected": -2.052177906036377, "logps/chosen": -31.77374839782715, "logps/rejected": -34.914920806884766, "loss": 1.054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2567782998085022, "rewards/margins": 0.4166257381439209, "rewards/rejected": -0.15984737873077393, "step": 1080 }, { "epoch": 2.83, "learning_rate": 1.1915800407584705e-06, "logits/chosen": -2.019425868988037, "logits/rejected": -2.023533344268799, "logps/chosen": -29.67460060119629, "logps/rejected": -34.19086456298828, "loss": 1.3668, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.2437433898448944, "rewards/margins": 0.37235018610954285, "rewards/rejected": -0.12860681116580963, "step": 1090 }, { "epoch": 2.86, "learning_rate": 1.1436343403356019e-06, "logits/chosen": -2.0084166526794434, "logits/rejected": -2.0137572288513184, "logps/chosen": -32.371253967285156, "logps/rejected": -30.9876651763916, "loss": 1.4312, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.23955067992210388, "rewards/margins": 0.35674941539764404, "rewards/rejected": -0.11719872802495956, "step": 1100 }, { "epoch": 2.86, "eval_logits/chosen": -2.2037713527679443, "eval_logits/rejected": -2.1989424228668213, "eval_logps/chosen": -34.106727600097656, "eval_logps/rejected": -37.72097396850586, "eval_loss": 5.990525722503662, "eval_rewards/accuracies": 0.5859634280204773, "eval_rewards/chosen": -0.01443566381931305, "eval_rewards/margins": 0.02643514797091484, "eval_rewards/rejected": -0.04087081179022789, "eval_runtime": 145.9163, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 1100 }, { "epoch": 2.88, "learning_rate": 1.0963854773524548e-06, "logits/chosen": -2.000826835632324, "logits/rejected": -2.000603199005127, "logps/chosen": -31.66360092163086, "logps/rejected": -31.520954132080078, "loss": 1.1158, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2553100287914276, "rewards/margins": 0.3563171923160553, "rewards/rejected": -0.1010071411728859, "step": 1110 }, { "epoch": 2.91, "learning_rate": 1.049857726072005e-06, "logits/chosen": -1.8451296091079712, "logits/rejected": -1.8467060327529907, "logps/chosen": -33.390525817871094, "logps/rejected": -33.623374938964844, "loss": 1.7743, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2348897010087967, "rewards/margins": 0.3513115346431732, "rewards/rejected": -0.11642180383205414, "step": 1120 }, { "epoch": 2.94, "learning_rate": 1.0040749902836508e-06, "logits/chosen": -1.884866714477539, "logits/rejected": -1.882128357887268, "logps/chosen": -29.975570678710938, "logps/rejected": -31.670913696289062, "loss": 2.1098, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.20722976326942444, "rewards/margins": 0.3082646131515503, "rewards/rejected": -0.10103483498096466, "step": 1130 }, { "epoch": 2.96, "learning_rate": 9.59060791022566e-07, "logits/chosen": -2.0206472873687744, "logits/rejected": -2.0149998664855957, "logps/chosen": -31.671985626220703, "logps/rejected": -33.523948669433594, "loss": 1.0808, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.27356189489364624, "rewards/margins": 0.3875501751899719, "rewards/rejected": -0.11398820579051971, "step": 1140 }, { "epoch": 2.99, "learning_rate": 9.148382544856885e-07, "logits/chosen": -1.8714290857315063, "logits/rejected": -1.8613548278808594, "logps/chosen": -32.308326721191406, "logps/rejected": -31.882171630859375, "loss": 1.3498, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.22868108749389648, "rewards/margins": 0.34663525223731995, "rewards/rejected": -0.11795412003993988, "step": 1150 }, { "epoch": 3.01, "learning_rate": 8.714301001505568e-07, "logits/chosen": -1.9497884511947632, "logits/rejected": -1.9496195316314697, "logps/chosen": -32.30144119262695, "logps/rejected": -31.75592613220215, "loss": 0.782, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2760619819164276, "rewards/margins": 0.3869572877883911, "rewards/rejected": -0.1108952984213829, "step": 1160 }, { "epoch": 3.04, "learning_rate": 8.288586291031025e-07, "logits/chosen": -2.022653818130493, "logits/rejected": -2.017073154449463, "logps/chosen": -32.25920867919922, "logps/rejected": -33.56011962890625, "loss": 0.9637, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.25335273146629333, "rewards/margins": 0.37713244557380676, "rewards/rejected": -0.12377973645925522, "step": 1170 }, { "epoch": 3.06, "learning_rate": 7.871457125803897e-07, "logits/chosen": -1.8782762289047241, "logits/rejected": -1.8860442638397217, "logps/chosen": -32.099754333496094, "logps/rejected": -33.019081115722656, "loss": 0.8431, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.24664826691150665, "rewards/margins": 0.3781769871711731, "rewards/rejected": -0.13152867555618286, "step": 1180 }, { "epoch": 3.09, "learning_rate": 7.463127807341966e-07, "logits/chosen": -1.954120397567749, "logits/rejected": -1.9483168125152588, "logps/chosen": -30.81561851501465, "logps/rejected": -33.87861251831055, "loss": 0.9223, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2715989053249359, "rewards/margins": 0.3906315267086029, "rewards/rejected": -0.11903263628482819, "step": 1190 }, { "epoch": 3.12, "learning_rate": 7.063808116212021e-07, "logits/chosen": -1.893996000289917, "logits/rejected": -1.8961610794067383, "logps/chosen": -32.268707275390625, "logps/rejected": -33.56250762939453, "loss": 1.0133, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24434716999530792, "rewards/margins": 0.4048388600349426, "rewards/rejected": -0.1604916751384735, "step": 1200 }, { "epoch": 3.12, "eval_logits/chosen": -2.204813241958618, "eval_logits/rejected": -2.1999785900115967, "eval_logps/chosen": -34.118247985839844, "eval_logps/rejected": -37.71463394165039, "eval_loss": 6.110255241394043, "eval_rewards/accuracies": 0.5888704061508179, "eval_rewards/chosen": -0.016739506274461746, "eval_rewards/margins": 0.022863931953907013, "eval_rewards/rejected": -0.03960343450307846, "eval_runtime": 145.9534, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 1200 }, { "epoch": 3.14, "learning_rate": 6.673703204254348e-07, "logits/chosen": -1.8266493082046509, "logits/rejected": -1.8255360126495361, "logps/chosen": -34.7088623046875, "logps/rejected": -33.4069709777832, "loss": 0.8168, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.268724262714386, "rewards/margins": 0.40824633836746216, "rewards/rejected": -0.13952204585075378, "step": 1210 }, { "epoch": 3.17, "learning_rate": 6.293013489185315e-07, "logits/chosen": -2.001779317855835, "logits/rejected": -1.9957544803619385, "logps/chosen": -30.487533569335938, "logps/rejected": -33.67655563354492, "loss": 0.7601, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.24323053658008575, "rewards/margins": 0.40283313393592834, "rewards/rejected": -0.15960261225700378, "step": 1220 }, { "epoch": 3.19, "learning_rate": 5.921934551632086e-07, "logits/chosen": -1.850182294845581, "logits/rejected": -1.8384950160980225, "logps/chosen": -33.03240203857422, "logps/rejected": -33.570960998535156, "loss": 0.8285, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2706558108329773, "rewards/margins": 0.4060825705528259, "rewards/rejected": -0.13542678952217102, "step": 1230 }, { "epoch": 3.22, "learning_rate": 5.560657034652405e-07, "logits/chosen": -1.94095778465271, "logits/rejected": -1.934069275856018, "logps/chosen": -29.424108505249023, "logps/rejected": -29.53556251525879, "loss": 0.8909, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.25522512197494507, "rewards/margins": 0.38882559537887573, "rewards/rejected": -0.13360042870044708, "step": 1240 }, { "epoch": 3.25, "learning_rate": 5.2093665457911e-07, "logits/chosen": -1.9732204675674438, "logits/rejected": -1.97970449924469, "logps/chosen": -33.94004440307617, "logps/rejected": -32.08849334716797, "loss": 0.8667, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.29106852412223816, "rewards/margins": 0.4051054120063782, "rewards/rejected": -0.11403689533472061, "step": 1250 }, { "epoch": 3.27, "learning_rate": 4.868243561723535e-07, "logits/chosen": -1.9436290264129639, "logits/rejected": -1.944186806678772, "logps/chosen": -31.774776458740234, "logps/rejected": -33.607566833496094, "loss": 0.7227, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.25075581669807434, "rewards/margins": 0.40929579734802246, "rewards/rejected": -0.15853998064994812, "step": 1260 }, { "epoch": 3.3, "learning_rate": 4.537463335535161e-07, "logits/chosen": -1.8878374099731445, "logits/rejected": -1.887284517288208, "logps/chosen": -31.73285484313965, "logps/rejected": -34.178916931152344, "loss": 0.8078, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2715964913368225, "rewards/margins": 0.4032578468322754, "rewards/rejected": -0.13166138529777527, "step": 1270 }, { "epoch": 3.32, "learning_rate": 4.217195806684629e-07, "logits/chosen": -1.7711877822875977, "logits/rejected": -1.7664165496826172, "logps/chosen": -33.83073425292969, "logps/rejected": -31.557464599609375, "loss": 0.7326, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.28134211897850037, "rewards/margins": 0.407537043094635, "rewards/rejected": -0.12619495391845703, "step": 1280 }, { "epoch": 3.35, "learning_rate": 3.907605513696808e-07, "logits/chosen": -1.9766957759857178, "logits/rejected": -1.96097731590271, "logps/chosen": -33.02162551879883, "logps/rejected": -35.638221740722656, "loss": 0.6214, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.26518964767456055, "rewards/margins": 0.4397699236869812, "rewards/rejected": -0.17458033561706543, "step": 1290 }, { "epoch": 3.38, "learning_rate": 3.6088515096305675e-07, "logits/chosen": -1.9200379848480225, "logits/rejected": -1.9249954223632812, "logps/chosen": -32.194488525390625, "logps/rejected": -36.9532356262207, "loss": 0.5152, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.26462727785110474, "rewards/margins": 0.4346303343772888, "rewards/rejected": -0.17000307142734528, "step": 1300 }, { "epoch": 3.38, "eval_logits/chosen": -2.2051475048065186, "eval_logits/rejected": -2.200305461883545, "eval_logps/chosen": -34.10041427612305, "eval_logps/rejected": -37.70795822143555, "eval_loss": 6.057835102081299, "eval_rewards/accuracies": 0.5544019937515259, "eval_rewards/chosen": -0.013172022067010403, "eval_rewards/margins": 0.025095123797655106, "eval_rewards/rejected": -0.038267143070697784, "eval_runtime": 145.9616, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 1300 }, { "epoch": 3.4, "learning_rate": 3.321087280364757e-07, "logits/chosen": -1.8912603855133057, "logits/rejected": -1.8913657665252686, "logps/chosen": -34.96311950683594, "logps/rejected": -37.694114685058594, "loss": 1.0811, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2561797797679901, "rewards/margins": 0.37817925214767456, "rewards/rejected": -0.12199944257736206, "step": 1310 }, { "epoch": 3.43, "learning_rate": 3.044460665744284e-07, "logits/chosen": -1.9785616397857666, "logits/rejected": -1.9770876169204712, "logps/chosen": -30.837026596069336, "logps/rejected": -31.65818214416504, "loss": 0.8959, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.25136545300483704, "rewards/margins": 0.39232978224754333, "rewards/rejected": -0.1409643441438675, "step": 1320 }, { "epoch": 3.45, "learning_rate": 2.779113783626916e-07, "logits/chosen": -1.882939100265503, "logits/rejected": -1.884426474571228, "logps/chosen": -33.03701400756836, "logps/rejected": -34.348941802978516, "loss": 0.6463, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.26744818687438965, "rewards/margins": 0.4075010418891907, "rewards/rejected": -0.14005282521247864, "step": 1330 }, { "epoch": 3.48, "learning_rate": 2.5251829568697204e-07, "logits/chosen": -1.9446265697479248, "logits/rejected": -1.9434131383895874, "logps/chosen": -29.983739852905273, "logps/rejected": -32.796180725097656, "loss": 0.8896, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.24298477172851562, "rewards/margins": 0.4038262367248535, "rewards/rejected": -0.1608414649963379, "step": 1340 }, { "epoch": 3.51, "learning_rate": 2.2827986432927774e-07, "logits/chosen": -1.9611021280288696, "logits/rejected": -1.9463632106781006, "logps/chosen": -33.147804260253906, "logps/rejected": -36.96843338012695, "loss": 0.8124, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.23377390205860138, "rewards/margins": 0.37457385659217834, "rewards/rejected": -0.14079990983009338, "step": 1350 }, { "epoch": 3.53, "learning_rate": 2.0520853686560177e-07, "logits/chosen": -1.9647136926651, "logits/rejected": -1.9772529602050781, "logps/chosen": -30.748397827148438, "logps/rejected": -32.809547424316406, "loss": 1.0166, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2640145421028137, "rewards/margins": 0.37797611951828003, "rewards/rejected": -0.1139616146683693, "step": 1360 }, { "epoch": 3.56, "learning_rate": 1.833161662683672e-07, "logits/chosen": -2.0538225173950195, "logits/rejected": -2.0530457496643066, "logps/chosen": -30.740360260009766, "logps/rejected": -36.674530029296875, "loss": 0.5522, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.25801384449005127, "rewards/margins": 0.4267433285713196, "rewards/rejected": -0.16872946918010712, "step": 1370 }, { "epoch": 3.58, "learning_rate": 1.626139998169246e-07, "logits/chosen": -1.9202001094818115, "logits/rejected": -1.928109884262085, "logps/chosen": -32.956661224365234, "logps/rejected": -38.160308837890625, "loss": 1.16, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24476349353790283, "rewards/margins": 0.3868965208530426, "rewards/rejected": -0.14213302731513977, "step": 1380 }, { "epoch": 3.61, "learning_rate": 1.4311267331922535e-07, "logits/chosen": -1.8742387294769287, "logits/rejected": -1.8700668811798096, "logps/chosen": -33.33854675292969, "logps/rejected": -32.12687301635742, "loss": 0.9939, "rewards/accuracies": 0.9375, "rewards/chosen": 0.28573352098464966, "rewards/margins": 0.39162954688072205, "rewards/rejected": -0.10589603334665298, "step": 1390 }, { "epoch": 3.64, "learning_rate": 1.2482220564763669e-07, "logits/chosen": -2.037750720977783, "logits/rejected": -2.0352249145507812, "logps/chosen": -30.064016342163086, "logps/rejected": -32.856353759765625, "loss": 0.8378, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.24650637805461884, "rewards/margins": 0.38888365030288696, "rewards/rejected": -0.14237727224826813, "step": 1400 }, { "epoch": 3.64, "eval_logits/chosen": -2.2052032947540283, "eval_logits/rejected": -2.2003655433654785, "eval_logps/chosen": -34.10353088378906, "eval_logps/rejected": -37.7112922668457, "eval_loss": 6.057151794433594, "eval_rewards/accuracies": 0.5747508406639099, "eval_rewards/chosen": -0.013795554637908936, "eval_rewards/margins": 0.025138981640338898, "eval_rewards/rejected": -0.03893453627824783, "eval_runtime": 145.9658, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 1400 }, { "epoch": 3.66, "learning_rate": 1.0775199359171346e-07, "logits/chosen": -1.9774906635284424, "logits/rejected": -1.9731305837631226, "logps/chosen": -32.331031799316406, "logps/rejected": -30.20021629333496, "loss": 0.8015, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.26229986548423767, "rewards/margins": 0.39145129919052124, "rewards/rejected": -0.1291513890028, "step": 1410 }, { "epoch": 3.69, "learning_rate": 9.191080703056604e-08, "logits/chosen": -1.935434341430664, "logits/rejected": -1.936626672744751, "logps/chosen": -32.0463981628418, "logps/rejected": -35.533714294433594, "loss": 1.0098, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2636954188346863, "rewards/margins": 0.38563698530197144, "rewards/rejected": -0.12194149196147919, "step": 1420 }, { "epoch": 3.71, "learning_rate": 7.730678442730539e-08, "logits/chosen": -1.8892452716827393, "logits/rejected": -1.8824679851531982, "logps/chosen": -32.84967803955078, "logps/rejected": -37.3257942199707, "loss": 0.883, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2584185004234314, "rewards/margins": 0.3856961131095886, "rewards/rejected": -0.12727761268615723, "step": 1430 }, { "epoch": 3.74, "learning_rate": 6.394742864787806e-08, "logits/chosen": -1.9004735946655273, "logits/rejected": -1.8949673175811768, "logps/chosen": -28.052719116210938, "logps/rejected": -32.38301467895508, "loss": 1.1273, "rewards/accuracies": 0.9375, "rewards/chosen": 0.25210484862327576, "rewards/margins": 0.38257673382759094, "rewards/rejected": -0.13047190010547638, "step": 1440 }, { "epoch": 3.77, "learning_rate": 5.183960310644748e-08, "logits/chosen": -1.9209520816802979, "logits/rejected": -1.910509705543518, "logps/chosen": -31.346166610717773, "logps/rejected": -36.17350387573242, "loss": 0.819, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.22707252204418182, "rewards/margins": 0.4047010540962219, "rewards/rejected": -0.17762848734855652, "step": 1450 }, { "epoch": 3.79, "learning_rate": 4.098952823928693e-08, "logits/chosen": -1.8970378637313843, "logits/rejected": -1.8941428661346436, "logps/chosen": -32.03502655029297, "logps/rejected": -31.428430557250977, "loss": 1.1237, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.25494998693466187, "rewards/margins": 0.36247119307518005, "rewards/rejected": -0.1075211763381958, "step": 1460 }, { "epoch": 3.82, "learning_rate": 3.1402778309014284e-08, "logits/chosen": -1.9586317539215088, "logits/rejected": -1.9652128219604492, "logps/chosen": -30.69277000427246, "logps/rejected": -33.610328674316406, "loss": 0.8233, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2752807140350342, "rewards/margins": 0.4048624038696289, "rewards/rejected": -0.12958170473575592, "step": 1470 }, { "epoch": 3.84, "learning_rate": 2.3084278540791427e-08, "logits/chosen": -1.9582951068878174, "logits/rejected": -1.9687860012054443, "logps/chosen": -30.41074562072754, "logps/rejected": -30.278295516967773, "loss": 0.8239, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.25690194964408875, "rewards/margins": 0.38684824109077454, "rewards/rejected": -0.1299462616443634, "step": 1480 }, { "epoch": 3.87, "learning_rate": 1.6038302591975807e-08, "logits/chosen": -1.8908532857894897, "logits/rejected": -1.8837330341339111, "logps/chosen": -32.43671417236328, "logps/rejected": -32.93724822998047, "loss": 0.9657, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.24395498633384705, "rewards/margins": 0.38930970430374146, "rewards/rejected": -0.14535477757453918, "step": 1490 }, { "epoch": 3.9, "learning_rate": 1.0268470356514237e-08, "logits/chosen": -1.943483591079712, "logits/rejected": -1.940323829650879, "logps/chosen": -32.4309196472168, "logps/rejected": -34.01084518432617, "loss": 0.9599, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2389308512210846, "rewards/margins": 0.38905516266822815, "rewards/rejected": -0.15012428164482117, "step": 1500 }, { "epoch": 3.9, "eval_logits/chosen": -2.2052109241485596, "eval_logits/rejected": -2.2003753185272217, "eval_logps/chosen": -34.09724044799805, "eval_logps/rejected": -37.70912551879883, "eval_loss": 6.034818649291992, "eval_rewards/accuracies": 0.5834717750549316, "eval_rewards/chosen": -0.012537354603409767, "eval_rewards/margins": 0.025963816791772842, "eval_rewards/rejected": -0.03850117325782776, "eval_runtime": 145.968, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 1500 }, { "epoch": 3.92, "learning_rate": 5.777746105209147e-09, "logits/chosen": -2.0217642784118652, "logits/rejected": -2.022365093231201, "logps/chosen": -28.69793701171875, "logps/rejected": -33.473472595214844, "loss": 1.3374, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.22366802394390106, "rewards/margins": 0.37027162313461304, "rewards/rejected": -0.14660362899303436, "step": 1510 }, { "epoch": 3.95, "learning_rate": 2.5684369628148352e-09, "logits/chosen": -1.8759753704071045, "logits/rejected": -1.8758065700531006, "logps/chosen": -31.9100341796875, "logps/rejected": -34.565345764160156, "loss": 1.196, "rewards/accuracies": 0.9375, "rewards/chosen": 0.22636690735816956, "rewards/margins": 0.3646892309188843, "rewards/rejected": -0.13832230865955353, "step": 1520 }, { "epoch": 3.97, "learning_rate": 6.421917227455999e-10, "logits/chosen": -2.0270986557006836, "logits/rejected": -2.024442195892334, "logps/chosen": -30.32328224182129, "logps/rejected": -32.253662109375, "loss": 0.9082, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.23015546798706055, "rewards/margins": 0.384157657623291, "rewards/rejected": -0.15400215983390808, "step": 1530 }, { "epoch": 4.0, "learning_rate": 0.0, "logits/chosen": -2.0110628604888916, "logits/rejected": -2.0121970176696777, "logps/chosen": -29.453231811523438, "logps/rejected": -30.480737686157227, "loss": 0.9153, "rewards/accuracies": 0.98333340883255, "rewards/chosen": 0.23334148526191711, "rewards/margins": 0.37145254015922546, "rewards/rejected": -0.13811106979846954, "step": 1540 }, { "epoch": 4.0, "step": 1540, "total_flos": 0.0, "train_loss": 1.3887401007986688, "train_runtime": 10806.9485, "train_samples_per_second": 1.14, "train_steps_per_second": 0.143 } ], "logging_steps": 10, "max_steps": 1540, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }