{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6155917425310937, "eval_steps": 10, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004615976407231696, "grad_norm": 60.83765068742266, "learning_rate": 1.1494252873563218e-08, "logits/chosen": 0.4711977541446686, "logits/rejected": 0.4847034811973572, "logps/chosen": -41.84939193725586, "logps/rejected": -44.508792877197266, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.009231952814463392, "grad_norm": 89.49857504360673, "learning_rate": 2.2988505747126436e-08, "logits/chosen": 0.4102262556552887, "logits/rejected": 0.4489870071411133, "logps/chosen": -33.33359909057617, "logps/rejected": -48.11466979980469, "loss": 0.6965, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.016991177573800087, "rewards/margins": -0.00249446090310812, "rewards/rejected": 0.01948563940823078, "step": 4 }, { "epoch": 0.01384792922169509, "grad_norm": 91.503921376188, "learning_rate": 3.448275862068965e-08, "logits/chosen": 0.4212642312049866, "logits/rejected": 0.448761522769928, "logps/chosen": -39.75364685058594, "logps/rejected": -51.98044967651367, "loss": 0.7058, "rewards/accuracies": 0.4305555522441864, "rewards/chosen": 0.01352924108505249, "rewards/margins": -0.021095700562000275, "rewards/rejected": 0.034624941647052765, "step": 6 }, { "epoch": 0.018463905628926785, "grad_norm": 74.67414376851612, "learning_rate": 4.597701149425287e-08, "logits/chosen": 0.3533351719379425, "logits/rejected": 0.38716739416122437, "logps/chosen": -42.66749954223633, "logps/rejected": -59.93525695800781, "loss": 0.682, "rewards/accuracies": 0.5972222089767456, "rewards/chosen": 0.15689438581466675, "rewards/margins": 0.02920585870742798, "rewards/rejected": 0.12768852710723877, "step": 8 }, { "epoch": 0.023079882036158482, "grad_norm": 68.70984683419653, "learning_rate": 5.747126436781609e-08, "logits/chosen": 0.49728691577911377, "logits/rejected": 0.5158182978630066, "logps/chosen": -40.442108154296875, "logps/rejected": -47.894962310791016, "loss": 0.6784, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": 0.20257243514060974, "rewards/margins": 0.033093564212322235, "rewards/rejected": 0.1694788932800293, "step": 10 }, { "epoch": 0.023079882036158482, "eval_logits/chosen": 0.3284655511379242, "eval_logits/rejected": 0.3523290753364563, "eval_logps/chosen": -41.368160247802734, "eval_logps/rejected": -47.68316650390625, "eval_loss": 0.6900005340576172, "eval_rewards/accuracies": 0.5040322542190552, "eval_rewards/chosen": 0.18856020271778107, "eval_rewards/margins": 0.010975954122841358, "eval_rewards/rejected": 0.1775842159986496, "eval_runtime": 223.5149, "eval_samples_per_second": 7.758, "eval_steps_per_second": 1.942, "step": 10 }, { "epoch": 0.02769585844339018, "grad_norm": 83.08821357925031, "learning_rate": 6.89655172413793e-08, "logits/chosen": 0.39168137311935425, "logits/rejected": 0.428312748670578, "logps/chosen": -40.189659118652344, "logps/rejected": -55.229732513427734, "loss": 0.6825, "rewards/accuracies": 0.5138888955116272, "rewards/chosen": 0.10889428108930588, "rewards/margins": 0.025906018912792206, "rewards/rejected": 0.08298826217651367, "step": 12 }, { "epoch": 0.032311834850621876, "grad_norm": 83.2788469670015, "learning_rate": 8.045977011494252e-08, "logits/chosen": 0.4244603216648102, "logits/rejected": 0.45606857538223267, "logps/chosen": -45.81875228881836, "logps/rejected": -59.79555130004883, "loss": 0.707, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.0007680323324166238, "rewards/margins": -0.02245757356286049, "rewards/rejected": 0.023225605487823486, "step": 14 }, { "epoch": 0.03692781125785357, "grad_norm": 65.27464739827121, "learning_rate": 9.195402298850574e-08, "logits/chosen": 0.43778783082962036, "logits/rejected": 0.47771337628364563, "logps/chosen": -33.643489837646484, "logps/rejected": -47.315940856933594, "loss": 0.6907, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.16155114769935608, "rewards/margins": 0.009526676498353481, "rewards/rejected": 0.15202444791793823, "step": 16 }, { "epoch": 0.04154378766508527, "grad_norm": 60.46601141846051, "learning_rate": 1.0344827586206897e-07, "logits/chosen": 0.4576772153377533, "logits/rejected": 0.4669303894042969, "logps/chosen": -49.01601791381836, "logps/rejected": -44.165489196777344, "loss": 0.7024, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": 0.1639019399881363, "rewards/margins": -0.013081331737339497, "rewards/rejected": 0.1769832819700241, "step": 18 }, { "epoch": 0.046159764072316964, "grad_norm": 79.31933330847342, "learning_rate": 1.1494252873563217e-07, "logits/chosen": 0.40101033449172974, "logits/rejected": 0.4429229199886322, "logps/chosen": -42.295860290527344, "logps/rejected": -61.62363052368164, "loss": 0.6993, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": 0.1744026243686676, "rewards/margins": -0.005564332008361816, "rewards/rejected": 0.17996692657470703, "step": 20 }, { "epoch": 0.046159764072316964, "eval_logits/chosen": 0.3300890624523163, "eval_logits/rejected": 0.3539319634437561, "eval_logps/chosen": -41.36879348754883, "eval_logps/rejected": -47.67192840576172, "eval_loss": 0.6927710771560669, "eval_rewards/accuracies": 0.4694700539112091, "eval_rewards/chosen": 0.1882432997226715, "eval_rewards/margins": 0.005040565971285105, "eval_rewards/rejected": 0.18320275843143463, "eval_runtime": 220.5959, "eval_samples_per_second": 7.861, "eval_steps_per_second": 1.967, "step": 20 }, { "epoch": 0.05077574047954866, "grad_norm": 74.95614553584633, "learning_rate": 1.2643678160919542e-07, "logits/chosen": 0.35644879937171936, "logits/rejected": 0.39824995398521423, "logps/chosen": -44.09666442871094, "logps/rejected": -67.98532104492188, "loss": 0.6849, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.061192478984594345, "rewards/margins": 0.024405598640441895, "rewards/rejected": 0.03678688034415245, "step": 22 }, { "epoch": 0.05539171688678036, "grad_norm": 59.95529358393387, "learning_rate": 1.379310344827586e-07, "logits/chosen": 0.4076593816280365, "logits/rejected": 0.4187220335006714, "logps/chosen": -50.34169006347656, "logps/rejected": -52.33488464355469, "loss": 0.6737, "rewards/accuracies": 0.5972222089767456, "rewards/chosen": 0.18035806715488434, "rewards/margins": 0.046983275562524796, "rewards/rejected": 0.13337479531764984, "step": 24 }, { "epoch": 0.06000769329401205, "grad_norm": 58.82337491053465, "learning_rate": 1.4942528735632184e-07, "logits/chosen": 0.38400429487228394, "logits/rejected": 0.3896331191062927, "logps/chosen": -45.30482482910156, "logps/rejected": -38.63485336303711, "loss": 0.6927, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.12515152990818024, "rewards/margins": 0.005594419315457344, "rewards/rejected": 0.11955711245536804, "step": 26 }, { "epoch": 0.06462366970124375, "grad_norm": 79.835671840217, "learning_rate": 1.6091954022988505e-07, "logits/chosen": 0.38133352994918823, "logits/rejected": 0.4193841814994812, "logps/chosen": -46.66801452636719, "logps/rejected": -66.68572998046875, "loss": 0.6817, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.1615171581506729, "rewards/margins": 0.03017430752515793, "rewards/rejected": 0.1313428282737732, "step": 28 }, { "epoch": 0.06923964610847544, "grad_norm": 63.4551490148668, "learning_rate": 1.7241379310344828e-07, "logits/chosen": 0.38135284185409546, "logits/rejected": 0.4095006585121155, "logps/chosen": -40.06434631347656, "logps/rejected": -49.53153610229492, "loss": 0.6732, "rewards/accuracies": 0.625, "rewards/chosen": 0.18324324488639832, "rewards/margins": 0.04405728355050087, "rewards/rejected": 0.13918595016002655, "step": 30 }, { "epoch": 0.06923964610847544, "eval_logits/chosen": 0.33052363991737366, "eval_logits/rejected": 0.3544217050075531, "eval_logps/chosen": -41.5091438293457, "eval_logps/rejected": -47.86111068725586, "eval_loss": 0.6812014579772949, "eval_rewards/accuracies": 0.546658992767334, "eval_rewards/chosen": 0.11806601285934448, "eval_rewards/margins": 0.029455602169036865, "eval_rewards/rejected": 0.08861041069030762, "eval_runtime": 220.5898, "eval_samples_per_second": 7.861, "eval_steps_per_second": 1.967, "step": 30 }, { "epoch": 0.07385562251570714, "grad_norm": 54.00177702879831, "learning_rate": 1.839080459770115e-07, "logits/chosen": 0.4309755563735962, "logits/rejected": 0.45285335183143616, "logps/chosen": -42.45962905883789, "logps/rejected": -47.46916198730469, "loss": 0.6778, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.059671804308891296, "rewards/margins": 0.03700065612792969, "rewards/rejected": 0.022671150043606758, "step": 32 }, { "epoch": 0.07847159892293884, "grad_norm": 55.987754524641964, "learning_rate": 1.9540229885057472e-07, "logits/chosen": 0.3958838880062103, "logits/rejected": 0.43136459589004517, "logps/chosen": -37.61958694458008, "logps/rejected": -52.296146392822266, "loss": 0.6756, "rewards/accuracies": 0.5138888955116272, "rewards/chosen": 0.2221948355436325, "rewards/margins": 0.04236772283911705, "rewards/rejected": 0.17982712388038635, "step": 34 }, { "epoch": 0.08308757533017054, "grad_norm": 67.31410028619514, "learning_rate": 2.0689655172413793e-07, "logits/chosen": 0.44812121987342834, "logits/rejected": 0.46431127190589905, "logps/chosen": -42.98078155517578, "logps/rejected": -41.65153884887695, "loss": 0.6493, "rewards/accuracies": 0.6388888955116272, "rewards/chosen": 0.30534830689430237, "rewards/margins": 0.09969804435968399, "rewards/rejected": 0.20565026998519897, "step": 36 }, { "epoch": 0.08770355173740223, "grad_norm": 57.904024813693326, "learning_rate": 2.1839080459770114e-07, "logits/chosen": 0.49128374457359314, "logits/rejected": 0.5145975351333618, "logps/chosen": -44.50560760498047, "logps/rejected": -49.38070297241211, "loss": 0.6628, "rewards/accuracies": 0.5972222089767456, "rewards/chosen": 0.18226391077041626, "rewards/margins": 0.0719500482082367, "rewards/rejected": 0.11031384021043777, "step": 38 }, { "epoch": 0.09231952814463393, "grad_norm": 64.05496800782066, "learning_rate": 2.2988505747126435e-07, "logits/chosen": 0.46414005756378174, "logits/rejected": 0.47909700870513916, "logps/chosen": -45.80656433105469, "logps/rejected": -48.13614273071289, "loss": 0.6614, "rewards/accuracies": 0.6388888955116272, "rewards/chosen": 0.1019454374909401, "rewards/margins": 0.07323868572711945, "rewards/rejected": 0.028706755489110947, "step": 40 }, { "epoch": 0.09231952814463393, "eval_logits/chosen": 0.3304091989994049, "eval_logits/rejected": 0.35432368516921997, "eval_logps/chosen": -41.51032638549805, "eval_logps/rejected": -47.951194763183594, "eval_loss": 0.6623325347900391, "eval_rewards/accuracies": 0.5748847723007202, "eval_rewards/chosen": 0.11747448146343231, "eval_rewards/margins": 0.07390521466732025, "eval_rewards/rejected": 0.04356926307082176, "eval_runtime": 220.5888, "eval_samples_per_second": 7.861, "eval_steps_per_second": 1.967, "step": 40 }, { "epoch": 0.09693550455186563, "grad_norm": 57.52177717893154, "learning_rate": 2.413793103448276e-07, "logits/chosen": 0.40689817070961, "logits/rejected": 0.427402138710022, "logps/chosen": -38.75439453125, "logps/rejected": -44.31669235229492, "loss": 0.6302, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": 0.2862703502178192, "rewards/margins": 0.14747940003871918, "rewards/rejected": 0.13879093527793884, "step": 42 }, { "epoch": 0.10155148095909731, "grad_norm": 64.43087177898828, "learning_rate": 2.5287356321839084e-07, "logits/chosen": 0.38502392172813416, "logits/rejected": 0.42915642261505127, "logps/chosen": -44.23611831665039, "logps/rejected": -70.150634765625, "loss": 0.6523, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.29370981454849243, "rewards/margins": 0.11090421676635742, "rewards/rejected": 0.18280558288097382, "step": 44 }, { "epoch": 0.10616745736632902, "grad_norm": 60.64457511313282, "learning_rate": 2.64367816091954e-07, "logits/chosen": 0.4625084698200226, "logits/rejected": 0.47940170764923096, "logps/chosen": -47.40989685058594, "logps/rejected": -50.2266731262207, "loss": 0.6586, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.19395428895950317, "rewards/margins": 0.09819034487009048, "rewards/rejected": 0.09576395153999329, "step": 46 }, { "epoch": 0.11078343377356072, "grad_norm": 48.97275141927136, "learning_rate": 2.758620689655172e-07, "logits/chosen": 0.40377330780029297, "logits/rejected": 0.4251302480697632, "logps/chosen": -40.91835021972656, "logps/rejected": -46.69221878051758, "loss": 0.6553, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.19001546502113342, "rewards/margins": 0.11491294950246811, "rewards/rejected": 0.07510250806808472, "step": 48 }, { "epoch": 0.1153994101807924, "grad_norm": 50.405334242894924, "learning_rate": 2.873563218390804e-07, "logits/chosen": 0.42986366152763367, "logits/rejected": 0.4425734579563141, "logps/chosen": -45.240882873535156, "logps/rejected": -45.33219528198242, "loss": 0.6545, "rewards/accuracies": 0.5972222089767456, "rewards/chosen": 0.27720221877098083, "rewards/margins": 0.10149689018726349, "rewards/rejected": 0.17570529878139496, "step": 50 }, { "epoch": 0.1153994101807924, "eval_logits/chosen": 0.3320508301258087, "eval_logits/rejected": 0.35591834783554077, "eval_logps/chosen": -41.36655807495117, "eval_logps/rejected": -48.028892517089844, "eval_loss": 0.6255878210067749, "eval_rewards/accuracies": 0.6278801560401917, "eval_rewards/chosen": 0.1893603652715683, "eval_rewards/margins": 0.18464061617851257, "eval_rewards/rejected": 0.004719759337604046, "eval_runtime": 220.5277, "eval_samples_per_second": 7.863, "eval_steps_per_second": 1.968, "step": 50 }, { "epoch": 0.1200153865880241, "grad_norm": 59.45036930086866, "learning_rate": 2.988505747126437e-07, "logits/chosen": 0.4412320852279663, "logits/rejected": 0.47745391726493835, "logps/chosen": -38.808204650878906, "logps/rejected": -57.61214828491211, "loss": 0.6523, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.24403540790081024, "rewards/margins": 0.14384596049785614, "rewards/rejected": 0.10018942505121231, "step": 52 }, { "epoch": 0.1246313629952558, "grad_norm": 54.50350019894718, "learning_rate": 3.103448275862069e-07, "logits/chosen": 0.305615097284317, "logits/rejected": 0.3378358781337738, "logps/chosen": -41.46311950683594, "logps/rejected": -55.873138427734375, "loss": 0.6345, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": 0.2176976054906845, "rewards/margins": 0.17212893068790436, "rewards/rejected": 0.045568663626909256, "step": 54 }, { "epoch": 0.1292473394024875, "grad_norm": 49.044811443941626, "learning_rate": 3.218390804597701e-07, "logits/chosen": 0.4806426763534546, "logits/rejected": 0.5007810592651367, "logps/chosen": -37.00300216674805, "logps/rejected": -42.795040130615234, "loss": 0.6005, "rewards/accuracies": 0.6944444179534912, "rewards/chosen": 0.41853082180023193, "rewards/margins": 0.23157899081707, "rewards/rejected": 0.18695180118083954, "step": 56 }, { "epoch": 0.1338633158097192, "grad_norm": 54.19272761171978, "learning_rate": 3.333333333333333e-07, "logits/chosen": 0.4073159098625183, "logits/rejected": 0.4315372109413147, "logps/chosen": -39.63461685180664, "logps/rejected": -41.75359344482422, "loss": 0.5767, "rewards/accuracies": 0.75, "rewards/chosen": 0.488341748714447, "rewards/margins": 0.2964838743209839, "rewards/rejected": 0.19185791909694672, "step": 58 }, { "epoch": 0.13847929221695088, "grad_norm": 45.35161413256781, "learning_rate": 3.4482758620689656e-07, "logits/chosen": 0.3869187831878662, "logits/rejected": 0.4154462218284607, "logps/chosen": -40.21774673461914, "logps/rejected": -49.05698013305664, "loss": 0.5939, "rewards/accuracies": 0.6944444179534912, "rewards/chosen": 0.4058813452720642, "rewards/margins": 0.2916874289512634, "rewards/rejected": 0.11419390141963959, "step": 60 }, { "epoch": 0.13847929221695088, "eval_logits/chosen": 0.3351307511329651, "eval_logits/rejected": 0.35898876190185547, "eval_logps/chosen": -40.8883171081543, "eval_logps/rejected": -47.73066711425781, "eval_loss": 0.5981891751289368, "eval_rewards/accuracies": 0.6745391488075256, "eval_rewards/chosen": 0.4284805655479431, "eval_rewards/margins": 0.2746467888355255, "eval_rewards/rejected": 0.15383380651474, "eval_runtime": 220.5776, "eval_samples_per_second": 7.861, "eval_steps_per_second": 1.968, "step": 60 }, { "epoch": 0.1430952686241826, "grad_norm": 43.3905484769897, "learning_rate": 3.5632183908045977e-07, "logits/chosen": 0.4458725154399872, "logits/rejected": 0.46262863278388977, "logps/chosen": -40.7205924987793, "logps/rejected": -47.21548080444336, "loss": 0.6053, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.42287477850914, "rewards/margins": 0.2391357719898224, "rewards/rejected": 0.18373897671699524, "step": 62 }, { "epoch": 0.14771124503141428, "grad_norm": 46.32576522285691, "learning_rate": 3.67816091954023e-07, "logits/chosen": 0.42775771021842957, "logits/rejected": 0.4581214487552643, "logps/chosen": -42.59015655517578, "logps/rejected": -51.6392822265625, "loss": 0.5669, "rewards/accuracies": 0.75, "rewards/chosen": 0.43508684635162354, "rewards/margins": 0.3611146807670593, "rewards/rejected": 0.0739721804857254, "step": 64 }, { "epoch": 0.152327221438646, "grad_norm": 42.51968356117282, "learning_rate": 3.793103448275862e-07, "logits/chosen": 0.4170711636543274, "logits/rejected": 0.45475757122039795, "logps/chosen": -38.8193359375, "logps/rejected": -59.24808120727539, "loss": 0.5514, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": 0.5248206257820129, "rewards/margins": 0.48520517349243164, "rewards/rejected": 0.03961547836661339, "step": 66 }, { "epoch": 0.15694319784587768, "grad_norm": 59.1347578756685, "learning_rate": 3.9080459770114945e-07, "logits/chosen": 0.3444980978965759, "logits/rejected": 0.38142290711402893, "logps/chosen": -37.63268280029297, "logps/rejected": -56.55868911743164, "loss": 0.6421, "rewards/accuracies": 0.6805555820465088, "rewards/chosen": 0.28356897830963135, "rewards/margins": 0.37217438220977783, "rewards/rejected": -0.0886053591966629, "step": 68 }, { "epoch": 0.16155917425310937, "grad_norm": 43.01512643240851, "learning_rate": 4.0229885057471266e-07, "logits/chosen": 0.47459664940834045, "logits/rejected": 0.5048218369483948, "logps/chosen": -37.06074905395508, "logps/rejected": -41.83311462402344, "loss": 0.4985, "rewards/accuracies": 0.75, "rewards/chosen": 0.7238032221794128, "rewards/margins": 0.6140663623809814, "rewards/rejected": 0.10973668098449707, "step": 70 }, { "epoch": 0.16155917425310937, "eval_logits/chosen": 0.3373754024505615, "eval_logits/rejected": 0.361337274312973, "eval_logps/chosen": -40.72093200683594, "eval_logps/rejected": -47.9627685546875, "eval_loss": 0.5673334002494812, "eval_rewards/accuracies": 0.7073732614517212, "eval_rewards/chosen": 0.512172520160675, "eval_rewards/margins": 0.47438928484916687, "eval_rewards/rejected": 0.03778325766324997, "eval_runtime": 220.4667, "eval_samples_per_second": 7.865, "eval_steps_per_second": 1.969, "step": 70 }, { "epoch": 0.16617515066034108, "grad_norm": 58.03313651952633, "learning_rate": 4.1379310344827586e-07, "logits/chosen": 0.47328370809555054, "logits/rejected": 0.516916036605835, "logps/chosen": -39.381927490234375, "logps/rejected": -63.04606628417969, "loss": 0.5446, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": 0.5364831686019897, "rewards/margins": 0.5998459458351135, "rewards/rejected": -0.06336280703544617, "step": 72 }, { "epoch": 0.17079112706757277, "grad_norm": 38.72118579621577, "learning_rate": 4.25287356321839e-07, "logits/chosen": 0.4743606746196747, "logits/rejected": 0.48414406180381775, "logps/chosen": -47.13395690917969, "logps/rejected": -47.23988723754883, "loss": 0.6296, "rewards/accuracies": 0.6944444179534912, "rewards/chosen": 0.46243613958358765, "rewards/margins": 0.4000816345214844, "rewards/rejected": 0.06235449016094208, "step": 74 }, { "epoch": 0.17540710347480445, "grad_norm": 52.02457163056824, "learning_rate": 4.367816091954023e-07, "logits/chosen": 0.4869605302810669, "logits/rejected": 0.5183277726173401, "logps/chosen": -41.5470085144043, "logps/rejected": -52.64150619506836, "loss": 0.5554, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": 0.6152575612068176, "rewards/margins": 0.5033391118049622, "rewards/rejected": 0.11191850155591965, "step": 76 }, { "epoch": 0.18002307988203617, "grad_norm": 39.942306949985145, "learning_rate": 4.482758620689655e-07, "logits/chosen": 0.4678427278995514, "logits/rejected": 0.4919649660587311, "logps/chosen": -36.33488845825195, "logps/rejected": -46.28294372558594, "loss": 0.5509, "rewards/accuracies": 0.6944444179534912, "rewards/chosen": 0.7630440592765808, "rewards/margins": 0.5350204110145569, "rewards/rejected": 0.22802363336086273, "step": 78 }, { "epoch": 0.18463905628926786, "grad_norm": 49.68265630941736, "learning_rate": 4.597701149425287e-07, "logits/chosen": 0.4118601381778717, "logits/rejected": 0.4340742528438568, "logps/chosen": -36.466026306152344, "logps/rejected": -40.359230041503906, "loss": 0.5161, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": 0.7488323450088501, "rewards/margins": 0.6430253982543945, "rewards/rejected": 0.1058068722486496, "step": 80 }, { "epoch": 0.18463905628926786, "eval_logits/chosen": 0.3424670994281769, "eval_logits/rejected": 0.3665529489517212, "eval_logps/chosen": -40.603668212890625, "eval_logps/rejected": -48.121337890625, "eval_loss": 0.5314013957977295, "eval_rewards/accuracies": 0.7206221222877502, "eval_rewards/chosen": 0.5708039999008179, "eval_rewards/margins": 0.6123039126396179, "eval_rewards/rejected": -0.04149990156292915, "eval_runtime": 220.269, "eval_samples_per_second": 7.872, "eval_steps_per_second": 1.97, "step": 80 }, { "epoch": 0.18925503269649954, "grad_norm": 40.18854063526523, "learning_rate": 4.712643678160919e-07, "logits/chosen": 0.4146896302700043, "logits/rejected": 0.44372716546058655, "logps/chosen": -44.112205505371094, "logps/rejected": -54.97979736328125, "loss": 0.5066, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.5255990624427795, "rewards/margins": 0.6622204780578613, "rewards/rejected": -0.1366213709115982, "step": 82 }, { "epoch": 0.19387100910373126, "grad_norm": 35.8737356471814, "learning_rate": 4.827586206896552e-07, "logits/chosen": 0.46901315450668335, "logits/rejected": 0.5202505588531494, "logps/chosen": -37.11308288574219, "logps/rejected": -64.51854705810547, "loss": 0.4791, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.6412889957427979, "rewards/margins": 0.8545607924461365, "rewards/rejected": -0.213271826505661, "step": 84 }, { "epoch": 0.19848698551096294, "grad_norm": 38.06201763208911, "learning_rate": 4.942528735632184e-07, "logits/chosen": 0.4869195520877838, "logits/rejected": 0.5158190727233887, "logps/chosen": -41.02754592895508, "logps/rejected": -52.270511627197266, "loss": 0.4592, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.6756889820098877, "rewards/margins": 0.815551221370697, "rewards/rejected": -0.13986223936080933, "step": 86 }, { "epoch": 0.20310296191819463, "grad_norm": 36.34414677933188, "learning_rate": 4.999979670146248e-07, "logits/chosen": 0.4226466119289398, "logits/rejected": 0.4440222680568695, "logps/chosen": -45.02897644042969, "logps/rejected": -53.814674377441406, "loss": 0.4665, "rewards/accuracies": 0.875, "rewards/chosen": 0.5262306928634644, "rewards/margins": 0.772502601146698, "rewards/rejected": -0.24627192318439484, "step": 88 }, { "epoch": 0.20771893832542634, "grad_norm": 47.62232189356859, "learning_rate": 4.99981703330008e-07, "logits/chosen": 0.43458905816078186, "logits/rejected": 0.45631253719329834, "logps/chosen": -39.44232177734375, "logps/rejected": -49.5074462890625, "loss": 0.508, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.6604294180870056, "rewards/margins": 0.6735664010047913, "rewards/rejected": -0.013136889785528183, "step": 90 }, { "epoch": 0.20771893832542634, "eval_logits/chosen": 0.3509339988231659, "eval_logits/rejected": 0.37502503395080566, "eval_logps/chosen": -40.31688690185547, "eval_logps/rejected": -48.16210174560547, "eval_loss": 0.4914422631263733, "eval_rewards/accuracies": 0.7263824939727783, "eval_rewards/chosen": 0.7141958475112915, "eval_rewards/margins": 0.7760785222053528, "eval_rewards/rejected": -0.06188271939754486, "eval_runtime": 220.2045, "eval_samples_per_second": 7.874, "eval_steps_per_second": 1.971, "step": 90 }, { "epoch": 0.21233491473265803, "grad_norm": 33.53538998473167, "learning_rate": 4.99949177018813e-07, "logits/chosen": 0.4293578863143921, "logits/rejected": 0.46211349964141846, "logps/chosen": -34.20891571044922, "logps/rejected": -45.82402420043945, "loss": 0.4007, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.9528428316116333, "rewards/margins": 1.0176244974136353, "rewards/rejected": -0.06478171050548553, "step": 92 }, { "epoch": 0.21695089113988972, "grad_norm": 47.45137697847349, "learning_rate": 4.999003901970474e-07, "logits/chosen": 0.4385245442390442, "logits/rejected": 0.45108774304389954, "logps/chosen": -47.24710464477539, "logps/rejected": -47.30147171020508, "loss": 0.5534, "rewards/accuracies": 0.6944444179534912, "rewards/chosen": 0.644627034664154, "rewards/margins": 0.6343204975128174, "rewards/rejected": 0.010306484065949917, "step": 94 }, { "epoch": 0.22156686754712143, "grad_norm": 33.39629568865361, "learning_rate": 4.998353460385512e-07, "logits/chosen": 0.4504711329936981, "logits/rejected": 0.48663392663002014, "logps/chosen": -40.03446578979492, "logps/rejected": -55.506591796875, "loss": 0.4222, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.6967446804046631, "rewards/margins": 1.0778069496154785, "rewards/rejected": -0.381062388420105, "step": 96 }, { "epoch": 0.22618284395435312, "grad_norm": 34.18594601316725, "learning_rate": 4.997540487747892e-07, "logits/chosen": 0.38444679975509644, "logits/rejected": 0.4130491614341736, "logps/chosen": -37.72957992553711, "logps/rejected": -57.71113967895508, "loss": 0.4716, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.864948570728302, "rewards/margins": 1.0170652866363525, "rewards/rejected": -0.152116596698761, "step": 98 }, { "epoch": 0.2307988203615848, "grad_norm": 31.852168197293704, "learning_rate": 4.996565036945769e-07, "logits/chosen": 0.4658397436141968, "logits/rejected": 0.4849558472633362, "logps/chosen": -44.069618225097656, "logps/rejected": -46.06491470336914, "loss": 0.4924, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.5598255395889282, "rewards/margins": 0.8147852420806885, "rewards/rejected": -0.25495976209640503, "step": 100 }, { "epoch": 0.2307988203615848, "eval_logits/chosen": 0.3590577244758606, "eval_logits/rejected": 0.38313183188438416, "eval_logps/chosen": -40.04033660888672, "eval_logps/rejected": -48.23354721069336, "eval_loss": 0.4618569314479828, "eval_rewards/accuracies": 0.7298387289047241, "eval_rewards/chosen": 0.852470874786377, "eval_rewards/margins": 0.9500778913497925, "eval_rewards/rejected": -0.09760700911283493, "eval_runtime": 220.4716, "eval_samples_per_second": 7.865, "eval_steps_per_second": 1.969, "step": 100 }, { "epoch": 0.23541479676881652, "grad_norm": 32.563251146586694, "learning_rate": 4.995427171437356e-07, "logits/chosen": 0.41394177079200745, "logits/rejected": 0.4560126066207886, "logps/chosen": -36.68212890625, "logps/rejected": -56.006553649902344, "loss": 0.3851, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.7943739891052246, "rewards/margins": 1.1945956945419312, "rewards/rejected": -0.40022173523902893, "step": 102 }, { "epoch": 0.2400307731760482, "grad_norm": 35.159104202159625, "learning_rate": 4.994126965246796e-07, "logits/chosen": 0.43339937925338745, "logits/rejected": 0.45789778232574463, "logps/chosen": -40.00631332397461, "logps/rejected": -48.161224365234375, "loss": 0.4153, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.7441484928131104, "rewards/margins": 1.0307202339172363, "rewards/rejected": -0.28657177090644836, "step": 104 }, { "epoch": 0.24464674958327992, "grad_norm": 35.54279884741835, "learning_rate": 4.992664502959351e-07, "logits/chosen": 0.42503511905670166, "logits/rejected": 0.48626741766929626, "logps/chosen": -36.73310852050781, "logps/rejected": -73.78736877441406, "loss": 0.3536, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.8739730715751648, "rewards/margins": 1.558694839477539, "rewards/rejected": -0.6847219467163086, "step": 106 }, { "epoch": 0.2492627259905116, "grad_norm": 45.173611856138976, "learning_rate": 4.991039879715898e-07, "logits/chosen": 0.4289478361606598, "logits/rejected": 0.46912992000579834, "logps/chosen": -40.94606399536133, "logps/rejected": -58.62925338745117, "loss": 0.4057, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 1.023528814315796, "rewards/margins": 1.251956582069397, "rewards/rejected": -0.22842761874198914, "step": 108 }, { "epoch": 0.2538787023977433, "grad_norm": 25.213187587591246, "learning_rate": 4.989253201206736e-07, "logits/chosen": 0.4647282361984253, "logits/rejected": 0.4716295003890991, "logps/chosen": -40.334922790527344, "logps/rejected": -41.65603256225586, "loss": 0.4339, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": 0.9747889637947083, "rewards/margins": 1.0528353452682495, "rewards/rejected": -0.07804636657238007, "step": 110 }, { "epoch": 0.2538787023977433, "eval_logits/chosen": 0.36145398020744324, "eval_logits/rejected": 0.38587653636932373, "eval_logps/chosen": -39.77558135986328, "eval_logps/rejected": -48.301231384277344, "eval_loss": 0.43463748693466187, "eval_rewards/accuracies": 0.7379032373428345, "eval_rewards/chosen": 0.9848493337631226, "eval_rewards/margins": 1.1162999868392944, "eval_rewards/rejected": -0.1314505934715271, "eval_runtime": 220.4446, "eval_samples_per_second": 7.866, "eval_steps_per_second": 1.969, "step": 110 }, { "epoch": 0.258494678804975, "grad_norm": 39.895524747857486, "learning_rate": 4.987304583664712e-07, "logits/chosen": 0.4972270429134369, "logits/rejected": 0.5156663060188293, "logps/chosen": -46.859134674072266, "logps/rejected": -53.12602996826172, "loss": 0.4463, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": 0.8810398578643799, "rewards/margins": 0.9829990863800049, "rewards/rejected": -0.10195919126272202, "step": 112 }, { "epoch": 0.26311065521220667, "grad_norm": 36.88032065773003, "learning_rate": 4.985194153857662e-07, "logits/chosen": 0.4386284351348877, "logits/rejected": 0.4557953476905823, "logps/chosen": -36.74658203125, "logps/rejected": -39.56464767456055, "loss": 0.4788, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.9255303144454956, "rewards/margins": 0.9156983494758606, "rewards/rejected": 0.009831971488893032, "step": 114 }, { "epoch": 0.2677266316194384, "grad_norm": 23.636821560598456, "learning_rate": 4.982922049080163e-07, "logits/chosen": 0.40630775690078735, "logits/rejected": 0.4236665964126587, "logps/chosen": -35.141971588134766, "logps/rejected": -42.14583969116211, "loss": 0.3872, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.8281899690628052, "rewards/margins": 1.215153455734253, "rewards/rejected": -0.386963427066803, "step": 116 }, { "epoch": 0.2723426080266701, "grad_norm": 38.873691089935114, "learning_rate": 4.980488417144599e-07, "logits/chosen": 0.37884485721588135, "logits/rejected": 0.4280329644680023, "logps/chosen": -41.57583999633789, "logps/rejected": -71.53160095214844, "loss": 0.4818, "rewards/accuracies": 0.75, "rewards/chosen": 0.7444247603416443, "rewards/margins": 1.1956461668014526, "rewards/rejected": -0.4512213468551636, "step": 118 }, { "epoch": 0.27695858443390176, "grad_norm": 27.126567445081033, "learning_rate": 4.977893416371544e-07, "logits/chosen": 0.4753592908382416, "logits/rejected": 0.4997613728046417, "logps/chosen": -34.07433319091797, "logps/rejected": -45.33045959472656, "loss": 0.3838, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.8865776062011719, "rewards/margins": 1.4225349426269531, "rewards/rejected": -0.5359571576118469, "step": 120 }, { "epoch": 0.27695858443390176, "eval_logits/chosen": 0.3657575249671936, "eval_logits/rejected": 0.39033937454223633, "eval_logps/chosen": -39.95652770996094, "eval_logps/rejected": -48.739437103271484, "eval_loss": 0.410579651594162, "eval_rewards/accuracies": 0.7540322542190552, "eval_rewards/chosen": 0.8943750858306885, "eval_rewards/margins": 1.2449262142181396, "eval_rewards/rejected": -0.35055097937583923, "eval_runtime": 220.2442, "eval_samples_per_second": 7.873, "eval_steps_per_second": 1.971, "step": 120 }, { "epoch": 0.28157456084113347, "grad_norm": 26.815456443053705, "learning_rate": 4.975137215579469e-07, "logits/chosen": 0.5420396327972412, "logits/rejected": 0.5500521659851074, "logps/chosen": -45.788516998291016, "logps/rejected": -42.21580505371094, "loss": 0.4117, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.8019249439239502, "rewards/margins": 1.2268595695495605, "rewards/rejected": -0.42493465542793274, "step": 122 }, { "epoch": 0.2861905372483652, "grad_norm": 30.749785890404876, "learning_rate": 4.972219994073755e-07, "logits/chosen": 0.49169254302978516, "logits/rejected": 0.5404393672943115, "logps/chosen": -38.644107818603516, "logps/rejected": -67.01266479492188, "loss": 0.3844, "rewards/accuracies": 0.75, "rewards/chosen": 0.8940033316612244, "rewards/margins": 1.6317830085754395, "rewards/rejected": -0.7377796173095703, "step": 124 }, { "epoch": 0.2908065136555969, "grad_norm": 29.538977791375373, "learning_rate": 4.969141941635025e-07, "logits/chosen": 0.47598233819007874, "logits/rejected": 0.5060492753982544, "logps/chosen": -40.60331344604492, "logps/rejected": -59.37862014770508, "loss": 0.4476, "rewards/accuracies": 0.75, "rewards/chosen": 0.5469496250152588, "rewards/margins": 1.4448275566101074, "rewards/rejected": -0.8978776931762695, "step": 126 }, { "epoch": 0.29542249006282856, "grad_norm": 50.663011631161446, "learning_rate": 4.965903258506806e-07, "logits/chosen": 0.49228647351264954, "logits/rejected": 0.5329996943473816, "logps/chosen": -39.90941619873047, "logps/rejected": -61.23884963989258, "loss": 0.347, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.8301137685775757, "rewards/margins": 1.5421695709228516, "rewards/rejected": -0.7120558619499207, "step": 128 }, { "epoch": 0.30003846647006027, "grad_norm": 32.98345370505989, "learning_rate": 4.962504155382493e-07, "logits/chosen": 0.4239842891693115, "logits/rejected": 0.44136151671409607, "logps/chosen": -36.07121276855469, "logps/rejected": -41.06203079223633, "loss": 0.3667, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.7973310351371765, "rewards/margins": 1.2334084510803223, "rewards/rejected": -0.4360772669315338, "step": 130 }, { "epoch": 0.30003846647006027, "eval_logits/chosen": 0.3723231256008148, "eval_logits/rejected": 0.3968786299228668, "eval_logps/chosen": -39.925048828125, "eval_logps/rejected": -48.9533576965332, "eval_loss": 0.39173391461372375, "eval_rewards/accuracies": 0.7753456234931946, "eval_rewards/chosen": 0.9101160168647766, "eval_rewards/margins": 1.3676302433013916, "eval_rewards/rejected": -0.4575144052505493, "eval_runtime": 220.267, "eval_samples_per_second": 7.872, "eval_steps_per_second": 1.97, "step": 130 }, { "epoch": 0.304654442877292, "grad_norm": 28.392702162901728, "learning_rate": 4.958944853391652e-07, "logits/chosen": 0.520796537399292, "logits/rejected": 0.5420558452606201, "logps/chosen": -37.87763595581055, "logps/rejected": -46.05318069458008, "loss": 0.3819, "rewards/accuracies": 0.75, "rewards/chosen": 0.932469367980957, "rewards/margins": 1.2907413244247437, "rewards/rejected": -0.3582719564437866, "step": 132 }, { "epoch": 0.30927041928452365, "grad_norm": 27.83688192223066, "learning_rate": 4.955225584085624e-07, "logits/chosen": 0.42395105957984924, "logits/rejected": 0.44882073998451233, "logps/chosen": -36.98991775512695, "logps/rejected": -51.79054260253906, "loss": 0.3951, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.9578195214271545, "rewards/margins": 1.4403272867202759, "rewards/rejected": -0.48250770568847656, "step": 134 }, { "epoch": 0.31388639569175536, "grad_norm": 27.432482792006017, "learning_rate": 4.951346589422467e-07, "logits/chosen": 0.483965128660202, "logits/rejected": 0.5153691172599792, "logps/chosen": -37.48245620727539, "logps/rejected": -54.50342559814453, "loss": 0.3942, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 1.0384331941604614, "rewards/margins": 1.5820738077163696, "rewards/rejected": -0.5436408519744873, "step": 136 }, { "epoch": 0.3185023720989871, "grad_norm": 46.62557646275611, "learning_rate": 4.94730812175122e-07, "logits/chosen": 0.43841731548309326, "logits/rejected": 0.4499746561050415, "logps/chosen": -38.93119812011719, "logps/rejected": -42.26424026489258, "loss": 0.4384, "rewards/accuracies": 0.6527777910232544, "rewards/chosen": 0.8961164951324463, "rewards/margins": 1.247178554534912, "rewards/rejected": -0.3510621190071106, "step": 138 }, { "epoch": 0.32311834850621873, "grad_norm": 34.05743924648359, "learning_rate": 4.943110443795476e-07, "logits/chosen": 0.49757227301597595, "logits/rejected": 0.5091323852539062, "logps/chosen": -42.93407440185547, "logps/rejected": -45.01084899902344, "loss": 0.4061, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.8557752370834351, "rewards/margins": 1.3424351215362549, "rewards/rejected": -0.48665979504585266, "step": 140 }, { "epoch": 0.32311834850621873, "eval_logits/chosen": 0.3763599395751953, "eval_logits/rejected": 0.4011194705963135, "eval_logps/chosen": -39.798316955566406, "eval_logps/rejected": -49.11299133300781, "eval_loss": 0.3788905441761017, "eval_rewards/accuracies": 0.764976978302002, "eval_rewards/chosen": 0.9734821915626526, "eval_rewards/margins": 1.5108132362365723, "eval_rewards/rejected": -0.5373309850692749, "eval_runtime": 220.3233, "eval_samples_per_second": 7.87, "eval_steps_per_second": 1.97, "step": 140 }, { "epoch": 0.32773432491345045, "grad_norm": 36.481001944632766, "learning_rate": 4.938753828636297e-07, "logits/chosen": 0.4888935089111328, "logits/rejected": 0.4963880777359009, "logps/chosen": -46.02848815917969, "logps/rejected": -44.94346237182617, "loss": 0.4382, "rewards/accuracies": 0.75, "rewards/chosen": 0.7767104506492615, "rewards/margins": 1.235382080078125, "rewards/rejected": -0.45867156982421875, "step": 142 }, { "epoch": 0.33235030132068216, "grad_norm": 27.008693506029694, "learning_rate": 4.934238559694447e-07, "logits/chosen": 0.460690975189209, "logits/rejected": 0.5057052969932556, "logps/chosen": -38.473411560058594, "logps/rejected": -54.91615295410156, "loss": 0.3338, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.8698298335075378, "rewards/margins": 1.6055673360824585, "rewards/rejected": -0.7357374429702759, "step": 144 }, { "epoch": 0.3369662777279138, "grad_norm": 32.261266015848825, "learning_rate": 4.929564930711957e-07, "logits/chosen": 0.4295574426651001, "logits/rejected": 0.4522492587566376, "logps/chosen": -39.829490661621094, "logps/rejected": -44.733333587646484, "loss": 0.3533, "rewards/accuracies": 0.875, "rewards/chosen": 0.7346515655517578, "rewards/margins": 1.3469676971435547, "rewards/rejected": -0.6123161315917969, "step": 146 }, { "epoch": 0.34158225413514554, "grad_norm": 28.797840444924386, "learning_rate": 4.924733245733008e-07, "logits/chosen": 0.5410254001617432, "logits/rejected": 0.5485421419143677, "logps/chosen": -43.81610870361328, "logps/rejected": -40.52272033691406, "loss": 0.3651, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.9063374996185303, "rewards/margins": 1.2729685306549072, "rewards/rejected": -0.366630882024765, "step": 148 }, { "epoch": 0.34619823054237725, "grad_norm": 30.202896827963542, "learning_rate": 4.91974381908416e-07, "logits/chosen": 0.42066994309425354, "logits/rejected": 0.4589553475379944, "logps/chosen": -38.81809997558594, "logps/rejected": -58.59386444091797, "loss": 0.3446, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.6800815463066101, "rewards/margins": 1.928250789642334, "rewards/rejected": -1.2481693029403687, "step": 150 }, { "epoch": 0.34619823054237725, "eval_logits/chosen": 0.3821311295032501, "eval_logits/rejected": 0.40684476494789124, "eval_logps/chosen": -40.001861572265625, "eval_logps/rejected": -49.4797477722168, "eval_loss": 0.3633531332015991, "eval_rewards/accuracies": 0.7724654674530029, "eval_rewards/chosen": 0.8717083930969238, "eval_rewards/margins": 1.5924171209335327, "eval_rewards/rejected": -0.7207087278366089, "eval_runtime": 220.1362, "eval_samples_per_second": 7.877, "eval_steps_per_second": 1.972, "step": 150 }, { "epoch": 0.3508142069496089, "grad_norm": 26.385033894551455, "learning_rate": 4.914596975353898e-07, "logits/chosen": 0.4991176426410675, "logits/rejected": 0.5242553353309631, "logps/chosen": -38.974281311035156, "logps/rejected": -48.54939270019531, "loss": 0.3721, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.771294355392456, "rewards/margins": 1.5243595838546753, "rewards/rejected": -0.7530653476715088, "step": 152 }, { "epoch": 0.3554301833568406, "grad_norm": 42.428423927932826, "learning_rate": 4.909293049371519e-07, "logits/chosen": 0.5288230180740356, "logits/rejected": 0.5352779626846313, "logps/chosen": -45.90478515625, "logps/rejected": -44.53614044189453, "loss": 0.3542, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.7464312314987183, "rewards/margins": 1.5150079727172852, "rewards/rejected": -0.7685766220092773, "step": 154 }, { "epoch": 0.36004615976407234, "grad_norm": 36.75812549927479, "learning_rate": 4.903832386185343e-07, "logits/chosen": 0.47585126757621765, "logits/rejected": 0.49040529131889343, "logps/chosen": -44.172325134277344, "logps/rejected": -43.98606872558594, "loss": 0.3956, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": 0.5973650813102722, "rewards/margins": 1.340658187866211, "rewards/rejected": -0.743293046951294, "step": 156 }, { "epoch": 0.364662136171304, "grad_norm": 26.152211217958236, "learning_rate": 4.89821534104028e-07, "logits/chosen": 0.39484938979148865, "logits/rejected": 0.42477357387542725, "logps/chosen": -41.93134307861328, "logps/rejected": -56.39106750488281, "loss": 0.3275, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.827555239200592, "rewards/margins": 1.9599329233169556, "rewards/rejected": -1.1323776245117188, "step": 158 }, { "epoch": 0.3692781125785357, "grad_norm": 29.041350828980583, "learning_rate": 4.892442279354698e-07, "logits/chosen": 0.4744550287723541, "logits/rejected": 0.5093830227851868, "logps/chosen": -42.794578552246094, "logps/rejected": -59.93064498901367, "loss": 0.3605, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.540644645690918, "rewards/margins": 1.6665728092193604, "rewards/rejected": -1.125928282737732, "step": 160 }, { "epoch": 0.3692781125785357, "eval_logits/chosen": 0.38920047879219055, "eval_logits/rejected": 0.41388043761253357, "eval_logps/chosen": -40.3745231628418, "eval_logps/rejected": -49.94725036621094, "eval_loss": 0.3510279059410095, "eval_rewards/accuracies": 0.7920507192611694, "eval_rewards/chosen": 0.6853779554367065, "eval_rewards/margins": 1.6398398876190186, "eval_rewards/rejected": -0.9544618129730225, "eval_runtime": 220.1812, "eval_samples_per_second": 7.875, "eval_steps_per_second": 1.971, "step": 160 }, { "epoch": 0.3738940889857674, "grad_norm": 32.36067481486556, "learning_rate": 4.886513576696673e-07, "logits/chosen": 0.4680570960044861, "logits/rejected": 0.5030277371406555, "logps/chosen": -42.39280700683594, "logps/rejected": -58.18678283691406, "loss": 0.392, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.7217347621917725, "rewards/margins": 1.8615412712097168, "rewards/rejected": -1.1398065090179443, "step": 162 }, { "epoch": 0.3785100653929991, "grad_norm": 27.802667550507227, "learning_rate": 4.880429618759543e-07, "logits/chosen": 0.46893131732940674, "logits/rejected": 0.4787411093711853, "logps/chosen": -45.52459716796875, "logps/rejected": -46.459312438964844, "loss": 0.3819, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.870037317276001, "rewards/margins": 1.4820109605789185, "rewards/rejected": -0.6119736433029175, "step": 164 }, { "epoch": 0.3831260418002308, "grad_norm": 27.278325528930967, "learning_rate": 4.874190801336817e-07, "logits/chosen": 0.46610963344573975, "logits/rejected": 0.4872422218322754, "logps/chosen": -44.28363037109375, "logps/rejected": -51.54701232910156, "loss": 0.323, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.6502636075019836, "rewards/margins": 1.7216179370880127, "rewards/rejected": -1.0713541507720947, "step": 166 }, { "epoch": 0.3877420182074625, "grad_norm": 25.09454062223173, "learning_rate": 4.867797530296431e-07, "logits/chosen": 0.4582709074020386, "logits/rejected": 0.48244646191596985, "logps/chosen": -45.76988983154297, "logps/rejected": -55.2458610534668, "loss": 0.2842, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.6319215297698975, "rewards/margins": 2.007154941558838, "rewards/rejected": -1.3752332925796509, "step": 168 }, { "epoch": 0.39235799461469417, "grad_norm": 25.014228656107395, "learning_rate": 4.861250221554343e-07, "logits/chosen": 0.4760267436504364, "logits/rejected": 0.5161222219467163, "logps/chosen": -36.09988021850586, "logps/rejected": -58.49198913574219, "loss": 0.317, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.6937295794487, "rewards/margins": 2.0070507526397705, "rewards/rejected": -1.3133213520050049, "step": 170 }, { "epoch": 0.39235799461469417, "eval_logits/chosen": 0.39130648970603943, "eval_logits/rejected": 0.41622862219810486, "eval_logps/chosen": -40.4833984375, "eval_logps/rejected": -50.18775177001953, "eval_loss": 0.343056857585907, "eval_rewards/accuracies": 0.796658992767334, "eval_rewards/chosen": 0.6309407949447632, "eval_rewards/margins": 1.7056493759155273, "eval_rewards/rejected": -1.0747085809707642, "eval_runtime": 220.3261, "eval_samples_per_second": 7.87, "eval_steps_per_second": 1.97, "step": 170 }, { "epoch": 0.3969739710219259, "grad_norm": 21.660777806253456, "learning_rate": 4.854549301047476e-07, "logits/chosen": 0.5408195853233337, "logits/rejected": 0.5565234422683716, "logps/chosen": -42.90623474121094, "logps/rejected": -43.590702056884766, "loss": 0.373, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.6533936262130737, "rewards/margins": 1.5095248222351074, "rewards/rejected": -0.8561312556266785, "step": 172 }, { "epoch": 0.4015899474291576, "grad_norm": 32.27746768838142, "learning_rate": 4.847695204706005e-07, "logits/chosen": 0.47649839520454407, "logits/rejected": 0.49190616607666016, "logps/chosen": -38.49553680419922, "logps/rejected": -40.65150451660156, "loss": 0.3558, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.7918031811714172, "rewards/margins": 1.4087355136871338, "rewards/rejected": -0.6169323325157166, "step": 174 }, { "epoch": 0.40620592383638926, "grad_norm": 31.844706703711985, "learning_rate": 4.840688378425e-07, "logits/chosen": 0.5188453793525696, "logits/rejected": 0.5562708973884583, "logps/chosen": -46.135372161865234, "logps/rejected": -56.292930603027344, "loss": 0.261, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.7925480604171753, "rewards/margins": 2.1678171157836914, "rewards/rejected": -1.3752690553665161, "step": 176 }, { "epoch": 0.410821900243621, "grad_norm": 26.376171346573187, "learning_rate": 4.833529278035422e-07, "logits/chosen": 0.357127845287323, "logits/rejected": 0.4103134572505951, "logps/chosen": -37.78556442260742, "logps/rejected": -67.52072143554688, "loss": 0.2899, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.9015005826950073, "rewards/margins": 2.719820261001587, "rewards/rejected": -1.81831955909729, "step": 178 }, { "epoch": 0.4154378766508527, "grad_norm": 26.0393680431703, "learning_rate": 4.826218369274459e-07, "logits/chosen": 0.4666251540184021, "logits/rejected": 0.5160384178161621, "logps/chosen": -39.356258392333984, "logps/rejected": -62.83391571044922, "loss": 0.3066, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.8675535917282104, "rewards/margins": 2.234145164489746, "rewards/rejected": -1.3665915727615356, "step": 180 }, { "epoch": 0.4154378766508527, "eval_logits/chosen": 0.3935144245624542, "eval_logits/rejected": 0.41844189167022705, "eval_logps/chosen": -39.861793518066406, "eval_logps/rejected": -49.855037689208984, "eval_loss": 0.3321295380592346, "eval_rewards/accuracies": 0.7926267385482788, "eval_rewards/chosen": 0.941743791103363, "eval_rewards/margins": 1.8500969409942627, "eval_rewards/rejected": -0.9083530902862549, "eval_runtime": 220.3176, "eval_samples_per_second": 7.87, "eval_steps_per_second": 1.97, "step": 180 }, { "epoch": 0.42005385305808435, "grad_norm": 23.061889635448846, "learning_rate": 4.818756127755237e-07, "logits/chosen": 0.49034425616264343, "logits/rejected": 0.5069853663444519, "logps/chosen": -37.846553802490234, "logps/rejected": -41.30693817138672, "loss": 0.2693, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 1.0121811628341675, "rewards/margins": 1.8432265520095825, "rewards/rejected": -0.831045389175415, "step": 182 }, { "epoch": 0.42466982946531606, "grad_norm": 22.17904586209137, "learning_rate": 4.811143038935873e-07, "logits/chosen": 0.5580455660820007, "logits/rejected": 0.5748550295829773, "logps/chosen": -42.32413101196289, "logps/rejected": -46.0750732421875, "loss": 0.3264, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 1.0455000400543213, "rewards/margins": 1.93173086643219, "rewards/rejected": -0.8862307667732239, "step": 184 }, { "epoch": 0.4292858058725478, "grad_norm": 30.29917055573095, "learning_rate": 4.803379598087899e-07, "logits/chosen": 0.5174715518951416, "logits/rejected": 0.5311744213104248, "logps/chosen": -40.50711441040039, "logps/rejected": -40.298824310302734, "loss": 0.316, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.9772326350212097, "rewards/margins": 1.7723863124847412, "rewards/rejected": -0.795153796672821, "step": 186 }, { "epoch": 0.43390178227977944, "grad_norm": 40.38001852713412, "learning_rate": 4.795466310264034e-07, "logits/chosen": 0.42736437916755676, "logits/rejected": 0.463912695646286, "logps/chosen": -39.35895919799805, "logps/rejected": -64.93545532226562, "loss": 0.4185, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.5966134667396545, "rewards/margins": 1.980704665184021, "rewards/rejected": -1.3840913772583008, "step": 188 }, { "epoch": 0.43851775868701115, "grad_norm": 17.949323810733784, "learning_rate": 4.787403690265335e-07, "logits/chosen": 0.5044853091239929, "logits/rejected": 0.5284148454666138, "logps/chosen": -39.47854995727539, "logps/rejected": -49.92608642578125, "loss": 0.3266, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 1.0101630687713623, "rewards/margins": 1.9091652631759644, "rewards/rejected": -0.8990020751953125, "step": 190 }, { "epoch": 0.43851775868701115, "eval_logits/chosen": 0.3972060978412628, "eval_logits/rejected": 0.4221220314502716, "eval_logps/chosen": -39.831119537353516, "eval_logps/rejected": -50.09023666381836, "eval_loss": 0.3243154287338257, "eval_rewards/accuracies": 0.7914746403694153, "eval_rewards/chosen": 0.9570826292037964, "eval_rewards/margins": 1.9830337762832642, "eval_rewards/rejected": -1.0259510278701782, "eval_runtime": 220.3237, "eval_samples_per_second": 7.87, "eval_steps_per_second": 1.97, "step": 190 }, { "epoch": 0.44313373509424286, "grad_norm": 36.065620072852695, "learning_rate": 4.779192262607702e-07, "logits/chosen": 0.5138534903526306, "logits/rejected": 0.544155478477478, "logps/chosen": -43.310760498046875, "logps/rejected": -59.56623840332031, "loss": 0.3542, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.9537274837493896, "rewards/margins": 2.111888885498047, "rewards/rejected": -1.1581614017486572, "step": 192 }, { "epoch": 0.4477497115014745, "grad_norm": 24.653058016123207, "learning_rate": 4.770832561487758e-07, "logits/chosen": 0.4504295885562897, "logits/rejected": 0.46597781777381897, "logps/chosen": -41.51498794555664, "logps/rejected": -43.07120132446289, "loss": 0.2587, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.9096213579177856, "rewards/margins": 2.131375551223755, "rewards/rejected": -1.2217543125152588, "step": 194 }, { "epoch": 0.45236568790870624, "grad_norm": 36.95305184003922, "learning_rate": 4.762325130748097e-07, "logits/chosen": 0.5585076808929443, "logits/rejected": 0.5717556476593018, "logps/chosen": -47.50046920776367, "logps/rejected": -44.811973571777344, "loss": 0.3412, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.9956084489822388, "rewards/margins": 1.8879083395004272, "rewards/rejected": -0.8922999501228333, "step": 196 }, { "epoch": 0.45698166431593795, "grad_norm": 16.999205852011567, "learning_rate": 4.7536705238418995e-07, "logits/chosen": 0.47373294830322266, "logits/rejected": 0.49137142300605774, "logps/chosen": -42.69048309326172, "logps/rejected": -50.26279067993164, "loss": 0.275, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.8595349788665771, "rewards/margins": 2.2136645317077637, "rewards/rejected": -1.3541297912597656, "step": 198 }, { "epoch": 0.4615976407231696, "grad_norm": 33.06750404898565, "learning_rate": 4.7448693037969336e-07, "logits/chosen": 0.5136507749557495, "logits/rejected": 0.527184247970581, "logps/chosen": -41.794132232666016, "logps/rejected": -48.2490119934082, "loss": 0.2986, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.8771340847015381, "rewards/margins": 1.9797013998031616, "rewards/rejected": -1.102567195892334, "step": 200 }, { "epoch": 0.4615976407231696, "eval_logits/chosen": 0.39842745661735535, "eval_logits/rejected": 0.42355066537857056, "eval_logps/chosen": -40.12582778930664, "eval_logps/rejected": -50.502620697021484, "eval_loss": 0.3160472810268402, "eval_rewards/accuracies": 0.7978110313415527, "eval_rewards/chosen": 0.8097268342971802, "eval_rewards/margins": 2.041868209838867, "eval_rewards/rejected": -1.2321414947509766, "eval_runtime": 220.4769, "eval_samples_per_second": 7.865, "eval_steps_per_second": 1.968, "step": 200 }, { "epoch": 0.4662136171304013, "grad_norm": 27.6688088244827, "learning_rate": 4.735922043178923e-07, "logits/chosen": 0.5529847741127014, "logits/rejected": 0.5818406939506531, "logps/chosen": -42.29270553588867, "logps/rejected": -57.84202575683594, "loss": 0.2725, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.8104487061500549, "rewards/margins": 2.3321969509124756, "rewards/rejected": -1.521748423576355, "step": 202 }, { "epoch": 0.47082959353763304, "grad_norm": 23.404484719369563, "learning_rate": 4.7268293240543017e-07, "logits/chosen": 0.48225533962249756, "logits/rejected": 0.5109025239944458, "logps/chosen": -40.953433990478516, "logps/rejected": -55.026153564453125, "loss": 0.3435, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.8147386908531189, "rewards/margins": 2.057671546936035, "rewards/rejected": -1.2429331541061401, "step": 204 }, { "epoch": 0.4754455699448647, "grad_norm": 29.663210206611154, "learning_rate": 4.717591737952344e-07, "logits/chosen": 0.48208919167518616, "logits/rejected": 0.517291247844696, "logps/chosen": -36.30723190307617, "logps/rejected": -54.3764533996582, "loss": 0.3135, "rewards/accuracies": 0.75, "rewards/chosen": 0.8137081861495972, "rewards/margins": 2.101260185241699, "rewards/rejected": -1.287551999092102, "step": 206 }, { "epoch": 0.4800615463520964, "grad_norm": 29.39474251364716, "learning_rate": 4.7082098858266837e-07, "logits/chosen": 0.48040205240249634, "logits/rejected": 0.5284512042999268, "logps/chosen": -31.84227180480957, "logps/rejected": -61.47830581665039, "loss": 0.3821, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.455925315618515, "rewards/margins": 2.105367422103882, "rewards/rejected": -1.649442195892334, "step": 208 }, { "epoch": 0.4846775227593281, "grad_norm": 15.879511628269139, "learning_rate": 4.698684378016222e-07, "logits/chosen": 0.4825616478919983, "logits/rejected": 0.5131646394729614, "logps/chosen": -43.97586441040039, "logps/rejected": -58.62031936645508, "loss": 0.271, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.6879336833953857, "rewards/margins": 2.212796211242676, "rewards/rejected": -1.5248624086380005, "step": 210 }, { "epoch": 0.4846775227593281, "eval_logits/chosen": 0.40579578280448914, "eval_logits/rejected": 0.43089571595191956, "eval_logps/chosen": -40.55123519897461, "eval_logps/rejected": -51.04283905029297, "eval_loss": 0.3111670911312103, "eval_rewards/accuracies": 0.804147481918335, "eval_rewards/chosen": 0.597020149230957, "eval_rewards/margins": 2.099271535873413, "eval_rewards/rejected": -1.502251386642456, "eval_runtime": 220.3759, "eval_samples_per_second": 7.868, "eval_steps_per_second": 1.969, "step": 210 }, { "epoch": 0.48929349916655984, "grad_norm": 33.220388342252136, "learning_rate": 4.6890158342054174e-07, "logits/chosen": 0.46122825145721436, "logits/rejected": 0.48773014545440674, "logps/chosen": -38.094722747802734, "logps/rejected": -50.649871826171875, "loss": 0.3288, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.5131194591522217, "rewards/margins": 2.1884312629699707, "rewards/rejected": -1.6753116846084595, "step": 212 }, { "epoch": 0.4939094755737915, "grad_norm": 27.37607169791161, "learning_rate": 4.679204883383973e-07, "logits/chosen": 0.45677465200424194, "logits/rejected": 0.5006839632987976, "logps/chosen": -36.343292236328125, "logps/rejected": -65.76275634765625, "loss": 0.301, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.5972538590431213, "rewards/margins": 2.6644716262817383, "rewards/rejected": -2.0672178268432617, "step": 214 }, { "epoch": 0.4985254519810232, "grad_norm": 28.712191033509406, "learning_rate": 4.669252163805919e-07, "logits/chosen": 0.48203393816947937, "logits/rejected": 0.5129568576812744, "logps/chosen": -40.263328552246094, "logps/rejected": -53.96393966674805, "loss": 0.3434, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.3674449920654297, "rewards/margins": 2.094463348388672, "rewards/rejected": -1.7270184755325317, "step": 216 }, { "epoch": 0.5031414283882549, "grad_norm": 21.430060439194165, "learning_rate": 4.65915832294809e-07, "logits/chosen": 0.5647565722465515, "logits/rejected": 0.6052375435829163, "logps/chosen": -37.24385070800781, "logps/rejected": -58.28202438354492, "loss": 0.2945, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.5437911748886108, "rewards/margins": 2.518171787261963, "rewards/rejected": -1.9743802547454834, "step": 218 }, { "epoch": 0.5077574047954866, "grad_norm": 24.194015322932014, "learning_rate": 4.6489240174680026e-07, "logits/chosen": 0.5365298390388489, "logits/rejected": 0.5451048612594604, "logps/chosen": -40.26055145263672, "logps/rejected": -40.11984634399414, "loss": 0.4064, "rewards/accuracies": 0.75, "rewards/chosen": 0.4349411725997925, "rewards/margins": 1.4253244400024414, "rewards/rejected": -0.9903832674026489, "step": 220 }, { "epoch": 0.5077574047954866, "eval_logits/chosen": 0.40611767768859863, "eval_logits/rejected": 0.43133166432380676, "eval_logps/chosen": -40.628150939941406, "eval_logps/rejected": -51.22826385498047, "eval_loss": 0.30713996291160583, "eval_rewards/accuracies": 0.8018433451652527, "eval_rewards/chosen": 0.5585668087005615, "eval_rewards/margins": 2.153529644012451, "eval_rewards/rejected": -1.5949628353118896, "eval_runtime": 220.3416, "eval_samples_per_second": 7.87, "eval_steps_per_second": 1.97, "step": 220 }, { "epoch": 0.5123733812027182, "grad_norm": 23.39715012730976, "learning_rate": 4.638549913161138e-07, "logits/chosen": 0.5600088834762573, "logits/rejected": 0.5736495852470398, "logps/chosen": -46.20627212524414, "logps/rejected": -47.1099739074707, "loss": 0.2227, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.7162383794784546, "rewards/margins": 2.4795196056365967, "rewards/rejected": -1.763281226158142, "step": 222 }, { "epoch": 0.51698935760995, "grad_norm": 23.70013936518676, "learning_rate": 4.6280366849176267e-07, "logits/chosen": 0.553576648235321, "logits/rejected": 0.5800661444664001, "logps/chosen": -41.73429870605469, "logps/rejected": -47.09934997558594, "loss": 0.2708, "rewards/accuracies": 0.875, "rewards/chosen": 0.6174063682556152, "rewards/margins": 2.10538649559021, "rewards/rejected": -1.4879801273345947, "step": 224 }, { "epoch": 0.5216053340171817, "grad_norm": 19.39438827436505, "learning_rate": 4.6173850166783446e-07, "logits/chosen": 0.5699052810668945, "logits/rejected": 0.5908712148666382, "logps/chosen": -40.74462127685547, "logps/rejected": -53.7403450012207, "loss": 0.2716, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.5502187609672546, "rewards/margins": 2.0002176761627197, "rewards/rejected": -1.4499988555908203, "step": 226 }, { "epoch": 0.5262213104244133, "grad_norm": 24.49934372199594, "learning_rate": 4.606595601390417e-07, "logits/chosen": 0.46904435753822327, "logits/rejected": 0.5106580257415771, "logps/chosen": -39.85272979736328, "logps/rejected": -61.70741653442383, "loss": 0.2336, "rewards/accuracies": 0.875, "rewards/chosen": 0.3319948613643646, "rewards/margins": 2.6446897983551025, "rewards/rejected": -2.312695026397705, "step": 228 }, { "epoch": 0.5308372868316451, "grad_norm": 28.165420212664795, "learning_rate": 4.595669140962143e-07, "logits/chosen": 0.4127655625343323, "logits/rejected": 0.479299396276474, "logps/chosen": -34.939422607421875, "logps/rejected": -78.63516235351562, "loss": 0.3107, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.18619215488433838, "rewards/margins": 2.8822548389434814, "rewards/rejected": -2.6960630416870117, "step": 230 }, { "epoch": 0.5308372868316451, "eval_logits/chosen": 0.4082220494747162, "eval_logits/rejected": 0.4335884749889374, "eval_logps/chosen": -40.824676513671875, "eval_logps/rejected": -51.529090881347656, "eval_loss": 0.30161648988723755, "eval_rewards/accuracies": 0.8104838728904724, "eval_rewards/chosen": 0.4603023827075958, "eval_rewards/margins": 2.205678939819336, "eval_rewards/rejected": -1.745376706123352, "eval_runtime": 220.269, "eval_samples_per_second": 7.872, "eval_steps_per_second": 1.97, "step": 230 }, { "epoch": 0.5354532632388768, "grad_norm": 16.564014282307436, "learning_rate": 4.5846063462173284e-07, "logits/chosen": 0.5141347050666809, "logits/rejected": 0.5398997664451599, "logps/chosen": -38.93478012084961, "logps/rejected": -53.1637077331543, "loss": 0.2932, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.3137105405330658, "rewards/margins": 2.214162826538086, "rewards/rejected": -1.9004522562026978, "step": 232 }, { "epoch": 0.5400692396461084, "grad_norm": 30.180896923031582, "learning_rate": 4.573407936849044e-07, "logits/chosen": 0.49748367071151733, "logits/rejected": 0.502750039100647, "logps/chosen": -46.67736053466797, "logps/rejected": -48.594566345214844, "loss": 0.3143, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.39324572682380676, "rewards/margins": 1.9298076629638672, "rewards/rejected": -1.5365619659423828, "step": 234 }, { "epoch": 0.5446852160533402, "grad_norm": 43.03719615392396, "learning_rate": 4.5620746413728063e-07, "logits/chosen": 0.5845724940299988, "logits/rejected": 0.5915371775627136, "logps/chosen": -52.0160026550293, "logps/rejected": -49.12672805786133, "loss": 0.2833, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.14373371005058289, "rewards/margins": 2.1639184951782227, "rewards/rejected": -2.0201845169067383, "step": 236 }, { "epoch": 0.5493011924605719, "grad_norm": 21.1030283537707, "learning_rate": 4.550607197079185e-07, "logits/chosen": 0.552834153175354, "logits/rejected": 0.5818264484405518, "logps/chosen": -38.04405212402344, "logps/rejected": -46.87253189086914, "loss": 0.2897, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.4428212344646454, "rewards/margins": 1.7602063417434692, "rewards/rejected": -1.317385196685791, "step": 238 }, { "epoch": 0.5539171688678035, "grad_norm": 14.340136381864786, "learning_rate": 4.5390063499858353e-07, "logits/chosen": 0.5454181432723999, "logits/rejected": 0.5769542455673218, "logps/chosen": -47.16811752319336, "logps/rejected": -62.15293884277344, "loss": 0.2046, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.36544325947761536, "rewards/margins": 2.6488418579101562, "rewards/rejected": -2.2833986282348633, "step": 240 }, { "epoch": 0.5539171688678035, "eval_logits/chosen": 0.41252779960632324, "eval_logits/rejected": 0.4378991425037384, "eval_logps/chosen": -40.974342346191406, "eval_logps/rejected": -51.8930778503418, "eval_loss": 0.2962896525859833, "eval_rewards/accuracies": 0.8070276379585266, "eval_rewards/chosen": 0.3854685127735138, "eval_rewards/margins": 2.312840223312378, "eval_rewards/rejected": -1.927371859550476, "eval_runtime": 220.4271, "eval_samples_per_second": 7.867, "eval_steps_per_second": 1.969, "step": 240 }, { "epoch": 0.5585331452750353, "grad_norm": 20.121912107871452, "learning_rate": 4.5272728547889687e-07, "logits/chosen": 0.5017317533493042, "logits/rejected": 0.5252359509468079, "logps/chosen": -43.418678283691406, "logps/rejected": -51.78999710083008, "loss": 0.2157, "rewards/accuracies": 0.875, "rewards/chosen": 0.29254353046417236, "rewards/margins": 2.571570873260498, "rewards/rejected": -2.2790274620056152, "step": 242 }, { "epoch": 0.5631491216822669, "grad_norm": 36.79556689673262, "learning_rate": 4.5154074748142535e-07, "logits/chosen": 0.5326908230781555, "logits/rejected": 0.5592876672744751, "logps/chosen": -45.176578521728516, "logps/rejected": -55.26374053955078, "loss": 0.2959, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.26444554328918457, "rewards/margins": 2.2498509883880615, "rewards/rejected": -1.985405445098877, "step": 244 }, { "epoch": 0.5677650980894986, "grad_norm": 30.279268688467162, "learning_rate": 4.503410981967158e-07, "logits/chosen": 0.508591890335083, "logits/rejected": 0.5472189784049988, "logps/chosen": -37.81255340576172, "logps/rejected": -59.81355285644531, "loss": 0.3387, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.324074387550354, "rewards/margins": 2.479010581970215, "rewards/rejected": -2.1549363136291504, "step": 246 }, { "epoch": 0.5723810744967304, "grad_norm": 32.696656835575155, "learning_rate": 4.4912841566827333e-07, "logits/chosen": 0.5358154773712158, "logits/rejected": 0.572979211807251, "logps/chosen": -40.84016799926758, "logps/rejected": -57.57326889038086, "loss": 0.2559, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.6461736559867859, "rewards/margins": 2.717188835144043, "rewards/rejected": -2.0710153579711914, "step": 248 }, { "epoch": 0.576997050903962, "grad_norm": 26.864795137183627, "learning_rate": 4.4790277878748415e-07, "logits/chosen": 0.5129296779632568, "logits/rejected": 0.543644368648529, "logps/chosen": -36.90694046020508, "logps/rejected": -51.41253662109375, "loss": 0.2466, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.4497109651565552, "rewards/margins": 2.559537172317505, "rewards/rejected": -2.1098265647888184, "step": 250 }, { "epoch": 0.576997050903962, "eval_logits/chosen": 0.4140053689479828, "eval_logits/rejected": 0.43953680992126465, "eval_logps/chosen": -40.92128372192383, "eval_logps/rejected": -52.06728744506836, "eval_loss": 0.29202744364738464, "eval_rewards/accuracies": 0.8064516186714172, "eval_rewards/chosen": 0.41199636459350586, "eval_rewards/margins": 2.4264743328094482, "eval_rewards/rejected": -2.0144779682159424, "eval_runtime": 220.3958, "eval_samples_per_second": 7.868, "eval_steps_per_second": 1.969, "step": 250 }, { "epoch": 0.5816130273111938, "grad_norm": 34.34355868179491, "learning_rate": 4.466642672884835e-07, "logits/chosen": 0.5273095965385437, "logits/rejected": 0.5604310631752014, "logps/chosen": -39.039512634277344, "logps/rejected": -52.470951080322266, "loss": 0.2676, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.256040096282959, "rewards/margins": 2.4306235313415527, "rewards/rejected": -2.1745834350585938, "step": 252 }, { "epoch": 0.5862290037184255, "grad_norm": 27.545044099293104, "learning_rate": 4.454129617429682e-07, "logits/chosen": 0.515310525894165, "logits/rejected": 0.5264334678649902, "logps/chosen": -41.25297546386719, "logps/rejected": -44.831031799316406, "loss": 0.2921, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.2963744103908539, "rewards/margins": 2.2201662063598633, "rewards/rejected": -1.9237921237945557, "step": 254 }, { "epoch": 0.5908449801256571, "grad_norm": 16.22258168997157, "learning_rate": 4.441489435549551e-07, "logits/chosen": 0.5497354865074158, "logits/rejected": 0.5820472240447998, "logps/chosen": -45.16104507446289, "logps/rejected": -60.09016799926758, "loss": 0.2492, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.36222705245018005, "rewards/margins": 2.6290435791015625, "rewards/rejected": -2.2668166160583496, "step": 256 }, { "epoch": 0.5954609565328889, "grad_norm": 22.519317936372268, "learning_rate": 4.4287229495548573e-07, "logits/chosen": 0.5290111303329468, "logits/rejected": 0.550987184047699, "logps/chosen": -45.896942138671875, "logps/rejected": -57.38431930541992, "loss": 0.2158, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.3132680654525757, "rewards/margins": 2.935549020767212, "rewards/rejected": -2.622281074523926, "step": 258 }, { "epoch": 0.6000769329401205, "grad_norm": 33.27879387908239, "learning_rate": 4.415830989972761e-07, "logits/chosen": 0.613827645778656, "logits/rejected": 0.6395273208618164, "logps/chosen": -40.98984146118164, "logps/rejected": -48.8809700012207, "loss": 0.3209, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.3634183704853058, "rewards/margins": 2.285569190979004, "rewards/rejected": -1.922150731086731, "step": 260 }, { "epoch": 0.6000769329401205, "eval_logits/chosen": 0.41586774587631226, "eval_logits/rejected": 0.4413994550704956, "eval_logps/chosen": -41.435340881347656, "eval_logps/rejected": -52.66230773925781, "eval_loss": 0.28806936740875244, "eval_rewards/accuracies": 0.8116359710693359, "eval_rewards/chosen": 0.15496963262557983, "eval_rewards/margins": 2.4669582843780518, "eval_rewards/rejected": -2.3119888305664062, "eval_runtime": 220.1153, "eval_samples_per_second": 7.878, "eval_steps_per_second": 1.972, "step": 260 }, { "epoch": 0.6046929093473522, "grad_norm": 28.090703957454657, "learning_rate": 4.402814395493142e-07, "logits/chosen": 0.49612462520599365, "logits/rejected": 0.4979320168495178, "logps/chosen": -40.7058219909668, "logps/rejected": -38.908050537109375, "loss": 0.3653, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.15811699628829956, "rewards/margins": 1.8890395164489746, "rewards/rejected": -1.7309226989746094, "step": 262 }, { "epoch": 0.609308885754584, "grad_norm": 20.963207734816056, "learning_rate": 4.3896740129140354e-07, "logits/chosen": 0.49926820397377014, "logits/rejected": 0.518930196762085, "logps/chosen": -41.947425842285156, "logps/rejected": -42.273597717285156, "loss": 0.2493, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.2666120231151581, "rewards/margins": 2.4279704093933105, "rewards/rejected": -2.161358594894409, "step": 264 }, { "epoch": 0.6139248621618156, "grad_norm": 24.847993356607933, "learning_rate": 4.3764106970865456e-07, "logits/chosen": 0.5007407665252686, "logits/rejected": 0.5330516695976257, "logps/chosen": -36.07570266723633, "logps/rejected": -50.92935562133789, "loss": 0.3174, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.02925288677215576, "rewards/margins": 2.231614589691162, "rewards/rejected": -2.202361583709717, "step": 266 }, { "epoch": 0.6185408385690473, "grad_norm": 26.539349634561272, "learning_rate": 4.3630253108592305e-07, "logits/chosen": 0.5235443115234375, "logits/rejected": 0.5463228821754456, "logps/chosen": -48.52283477783203, "logps/rejected": -54.78059387207031, "loss": 0.2266, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.005189484916627407, "rewards/margins": 2.9114773273468018, "rewards/rejected": -2.9062881469726562, "step": 268 }, { "epoch": 0.6231568149762791, "grad_norm": 35.3397663590889, "learning_rate": 4.3495187250219723e-07, "logits/chosen": 0.4959086775779724, "logits/rejected": 0.5330989360809326, "logps/chosen": -37.50285339355469, "logps/rejected": -56.99623489379883, "loss": 0.2865, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.16485626995563507, "rewards/margins": 2.9254465103149414, "rewards/rejected": -3.0903029441833496, "step": 270 }, { "epoch": 0.6231568149762791, "eval_logits/chosen": 0.4182251989841461, "eval_logits/rejected": 0.44391536712646484, "eval_logps/chosen": -41.51067352294922, "eval_logps/rejected": -52.77988052368164, "eval_loss": 0.2869359254837036, "eval_rewards/accuracies": 0.8116359710693359, "eval_rewards/chosen": 0.11730305105447769, "eval_rewards/margins": 2.488077163696289, "eval_rewards/rejected": -2.3707735538482666, "eval_runtime": 220.1579, "eval_samples_per_second": 7.876, "eval_steps_per_second": 1.971, "step": 270 }, { "epoch": 0.6277727913835107, "grad_norm": 23.403340630174217, "learning_rate": 4.3358918182493253e-07, "logits/chosen": 0.5670427083969116, "logits/rejected": 0.5846278071403503, "logps/chosen": -41.197166442871094, "logps/rejected": -48.75783920288086, "loss": 0.229, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.05103777348995209, "rewards/margins": 2.2875313758850098, "rewards/rejected": -2.338569164276123, "step": 272 }, { "epoch": 0.6323887677907424, "grad_norm": 31.35543837574939, "learning_rate": 4.3221454770433554e-07, "logits/chosen": 0.5044899582862854, "logits/rejected": 0.5252879858016968, "logps/chosen": -46.43470764160156, "logps/rejected": -50.872764587402344, "loss": 0.2558, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.030280061066150665, "rewards/margins": 2.529269218444824, "rewards/rejected": -2.4989893436431885, "step": 274 }, { "epoch": 0.6370047441979741, "grad_norm": 27.239886684790495, "learning_rate": 4.308280595675966e-07, "logits/chosen": 0.5399680733680725, "logits/rejected": 0.5539530515670776, "logps/chosen": -45.22441101074219, "logps/rejected": -51.61985397338867, "loss": 0.3439, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": -0.1256939023733139, "rewards/margins": 2.2664339542388916, "rewards/rejected": -2.392127752304077, "step": 276 }, { "epoch": 0.6416207206052058, "grad_norm": 29.254953852014435, "learning_rate": 4.2942980761307227e-07, "logits/chosen": 0.5513600707054138, "logits/rejected": 0.5763798356056213, "logps/chosen": -42.95576477050781, "logps/rejected": -53.852542877197266, "loss": 0.2795, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.3323157727718353, "rewards/margins": 2.3478498458862305, "rewards/rejected": -2.680166006088257, "step": 278 }, { "epoch": 0.6462366970124375, "grad_norm": 16.01715280590405, "learning_rate": 4.2801988280441765e-07, "logits/chosen": 0.5487841367721558, "logits/rejected": 0.5692893862724304, "logps/chosen": -45.817508697509766, "logps/rejected": -54.61252975463867, "loss": 0.2162, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.03073420189321041, "rewards/margins": 2.809882402420044, "rewards/rejected": -2.840616226196289, "step": 280 }, { "epoch": 0.6462366970124375, "eval_logits/chosen": 0.41910773515701294, "eval_logits/rejected": 0.44490164518356323, "eval_logps/chosen": -41.43645477294922, "eval_logps/rejected": -52.90102005004883, "eval_loss": 0.2802717387676239, "eval_rewards/accuracies": 0.8104838728904724, "eval_rewards/chosen": 0.15440984070301056, "eval_rewards/margins": 2.585754156112671, "eval_rewards/rejected": -2.431344509124756, "eval_runtime": 220.3099, "eval_samples_per_second": 7.871, "eval_steps_per_second": 1.97, "step": 280 }, { "epoch": 0.6508526734196692, "grad_norm": 21.181113416054586, "learning_rate": 4.2659837686466813e-07, "logits/chosen": 0.498602032661438, "logits/rejected": 0.5217832922935486, "logps/chosen": -40.613285064697266, "logps/rejected": -50.06806945800781, "loss": 0.262, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.07628664374351501, "rewards/margins": 2.542593240737915, "rewards/rejected": -2.466306447982788, "step": 282 }, { "epoch": 0.6554686498269009, "grad_norm": 27.465624654814576, "learning_rate": 4.25165382270273e-07, "logits/chosen": 0.5099713206291199, "logits/rejected": 0.5337219834327698, "logps/chosen": -37.57986831665039, "logps/rejected": -45.39601516723633, "loss": 0.2483, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.15927743911743164, "rewards/margins": 2.373776912689209, "rewards/rejected": -2.2144994735717773, "step": 284 }, { "epoch": 0.6600846262341326, "grad_norm": 24.232084058794833, "learning_rate": 4.2372099224507875e-07, "logits/chosen": 0.47430500388145447, "logits/rejected": 0.5168524980545044, "logps/chosen": -34.61323547363281, "logps/rejected": -60.36859130859375, "loss": 0.2904, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.013289166614413261, "rewards/margins": 2.84716534614563, "rewards/rejected": -2.860454797744751, "step": 286 }, { "epoch": 0.6647006026413643, "grad_norm": 28.26074226923709, "learning_rate": 4.2226530075426503e-07, "logits/chosen": 0.5559656620025635, "logits/rejected": 0.562049150466919, "logps/chosen": -48.77291488647461, "logps/rejected": -52.30695343017578, "loss": 0.2904, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.016986362636089325, "rewards/margins": 2.4160873889923096, "rewards/rejected": -2.3991012573242188, "step": 288 }, { "epoch": 0.669316579048596, "grad_norm": 25.964047989048964, "learning_rate": 4.2079840249823106e-07, "logits/chosen": 0.5188059210777283, "logits/rejected": 0.5476034879684448, "logps/chosen": -43.39430236816406, "logps/rejected": -63.02970886230469, "loss": 0.2964, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.22233732044696808, "rewards/margins": 2.6584837436676025, "rewards/rejected": -2.8808212280273438, "step": 290 }, { "epoch": 0.669316579048596, "eval_logits/chosen": 0.41873642802238464, "eval_logits/rejected": 0.44454658031463623, "eval_logps/chosen": -41.64173126220703, "eval_logps/rejected": -53.234169006347656, "eval_loss": 0.27578282356262207, "eval_rewards/accuracies": 0.8127880096435547, "eval_rewards/chosen": 0.05177304521203041, "eval_rewards/margins": 2.6496896743774414, "eval_rewards/rejected": -2.597916603088379, "eval_runtime": 220.2319, "eval_samples_per_second": 7.874, "eval_steps_per_second": 1.971, "step": 290 }, { "epoch": 0.6739325554558276, "grad_norm": 28.11981406671555, "learning_rate": 4.193203929064353e-07, "logits/chosen": 0.5352766513824463, "logits/rejected": 0.5633915066719055, "logps/chosen": -43.08574676513672, "logps/rejected": -63.65277099609375, "loss": 0.292, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -0.09769348800182343, "rewards/margins": 2.7585980892181396, "rewards/rejected": -2.8562917709350586, "step": 292 }, { "epoch": 0.6785485318630594, "grad_norm": 22.159785280949862, "learning_rate": 4.1783136813118705e-07, "logits/chosen": 0.5104035139083862, "logits/rejected": 0.5326347947120667, "logps/chosen": -44.235877990722656, "logps/rejected": -53.24985885620117, "loss": 0.2764, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.3315318822860718, "rewards/margins": 2.574824810028076, "rewards/rejected": -2.9063568115234375, "step": 294 }, { "epoch": 0.6831645082702911, "grad_norm": 16.58376439365046, "learning_rate": 4.163314250413913e-07, "logits/chosen": 0.5757681131362915, "logits/rejected": 0.6053035855293274, "logps/chosen": -40.00181579589844, "logps/rejected": -50.29273986816406, "loss": 0.193, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.18691450357437134, "rewards/margins": 2.6521503925323486, "rewards/rejected": -2.465236186981201, "step": 296 }, { "epoch": 0.6877804846775227, "grad_norm": 32.319500846176076, "learning_rate": 4.1482066121624716e-07, "logits/chosen": 0.5265994668006897, "logits/rejected": 0.5376725792884827, "logps/chosen": -42.3819580078125, "logps/rejected": -43.448524475097656, "loss": 0.3285, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.1531985104084015, "rewards/margins": 2.268404245376587, "rewards/rejected": -2.115206003189087, "step": 298 }, { "epoch": 0.6923964610847545, "grad_norm": 23.349636529497012, "learning_rate": 4.1329917493889933e-07, "logits/chosen": 0.43518775701522827, "logits/rejected": 0.46238911151885986, "logps/chosen": -39.432003021240234, "logps/rejected": -52.38154983520508, "loss": 0.2382, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.1465599089860916, "rewards/margins": 2.628819704055786, "rewards/rejected": -2.7753796577453613, "step": 300 }, { "epoch": 0.6923964610847545, "eval_logits/chosen": 0.4236195683479309, "eval_logits/rejected": 0.4493381381034851, "eval_logps/chosen": -41.62788009643555, "eval_logps/rejected": -53.235809326171875, "eval_loss": 0.2743636965751648, "eval_rewards/accuracies": 0.8122119903564453, "eval_rewards/chosen": 0.05869903042912483, "eval_rewards/margins": 2.6574366092681885, "eval_rewards/rejected": -2.5987374782562256, "eval_runtime": 220.281, "eval_samples_per_second": 7.872, "eval_steps_per_second": 1.97, "step": 300 }, { "epoch": 0.6970124374919862, "grad_norm": 23.497513813632327, "learning_rate": 4.117670651900446e-07, "logits/chosen": 0.5692274570465088, "logits/rejected": 0.5857737064361572, "logps/chosen": -44.88375473022461, "logps/rejected": -50.89904022216797, "loss": 0.3059, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.18370471894741058, "rewards/margins": 2.1322684288024902, "rewards/rejected": -2.3159730434417725, "step": 302 }, { "epoch": 0.7016284138992178, "grad_norm": 31.67224576363876, "learning_rate": 4.1022443164149237e-07, "logits/chosen": 0.48219427466392517, "logits/rejected": 0.5107440948486328, "logps/chosen": -46.37804412841797, "logps/rejected": -62.33393859863281, "loss": 0.2685, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.04567752406001091, "rewards/margins": 2.84682559967041, "rewards/rejected": -2.892503261566162, "step": 304 }, { "epoch": 0.7062443903064496, "grad_norm": 19.857257644454698, "learning_rate": 4.086713746496808e-07, "logits/chosen": 0.5637336373329163, "logits/rejected": 0.588976263999939, "logps/chosen": -39.28482437133789, "logps/rejected": -50.71957778930664, "loss": 0.2575, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.2317693531513214, "rewards/margins": 2.6872549057006836, "rewards/rejected": -2.4554860591888428, "step": 306 }, { "epoch": 0.7108603667136812, "grad_norm": 17.71463775233371, "learning_rate": 4.0710799524914805e-07, "logits/chosen": 0.5934479832649231, "logits/rejected": 0.6081465482711792, "logps/chosen": -50.33334732055664, "logps/rejected": -55.25143814086914, "loss": 0.2103, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.034923017024993896, "rewards/margins": 2.6961231231689453, "rewards/rejected": -2.731046199798584, "step": 308 }, { "epoch": 0.7154763431209129, "grad_norm": 19.132153588643654, "learning_rate": 4.055343951459592e-07, "logits/chosen": 0.5560102462768555, "logits/rejected": 0.5947719812393188, "logps/chosen": -37.43670654296875, "logps/rejected": -57.06461715698242, "loss": 0.226, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.07254935055971146, "rewards/margins": 2.918682336807251, "rewards/rejected": -2.991231918334961, "step": 310 }, { "epoch": 0.7154763431209129, "eval_logits/chosen": 0.42303159832954407, "eval_logits/rejected": 0.44889286160469055, "eval_logps/chosen": -41.60685348510742, "eval_logps/rejected": -53.284358978271484, "eval_loss": 0.27253130078315735, "eval_rewards/accuracies": 0.8133640289306641, "eval_rewards/chosen": 0.06921074539422989, "eval_rewards/margins": 2.692223072052002, "eval_rewards/rejected": -2.6230127811431885, "eval_runtime": 220.2961, "eval_samples_per_second": 7.871, "eval_steps_per_second": 1.97, "step": 310 }, { "epoch": 0.7200923195281447, "grad_norm": 20.574269162073108, "learning_rate": 4.0395067671108985e-07, "logits/chosen": 0.47218936681747437, "logits/rejected": 0.5014721155166626, "logps/chosen": -35.916664123535156, "logps/rejected": -44.856101989746094, "loss": 0.2579, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.182376891374588, "rewards/margins": 2.569021701812744, "rewards/rejected": -2.3866446018218994, "step": 312 }, { "epoch": 0.7247082959353763, "grad_norm": 30.250167869534483, "learning_rate": 4.0235694297376637e-07, "logits/chosen": 0.5631113648414612, "logits/rejected": 0.5769122242927551, "logps/chosen": -49.87733459472656, "logps/rejected": -55.8229866027832, "loss": 0.2861, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.1988232433795929, "rewards/margins": 2.635685443878174, "rewards/rejected": -2.4368624687194824, "step": 314 }, { "epoch": 0.729324272342608, "grad_norm": 32.09859733085628, "learning_rate": 4.0075329761476347e-07, "logits/chosen": 0.5582194924354553, "logits/rejected": 0.5716796517372131, "logps/chosen": -44.06077575683594, "logps/rejected": -48.060577392578125, "loss": 0.2637, "rewards/accuracies": 0.875, "rewards/chosen": -0.14417774975299835, "rewards/margins": 2.182429313659668, "rewards/rejected": -2.3266072273254395, "step": 316 }, { "epoch": 0.7339402487498398, "grad_norm": 20.839702603979845, "learning_rate": 3.991398449596588e-07, "logits/chosen": 0.5104639530181885, "logits/rejected": 0.5302228331565857, "logps/chosen": -46.450565338134766, "logps/rejected": -56.8250732421875, "loss": 0.2178, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.05337013676762581, "rewards/margins": 2.7899389266967773, "rewards/rejected": -2.7365689277648926, "step": 318 }, { "epoch": 0.7385562251570714, "grad_norm": 35.607964067039056, "learning_rate": 3.9751668997204647e-07, "logits/chosen": 0.573165774345398, "logits/rejected": 0.592732310295105, "logps/chosen": -46.10280990600586, "logps/rejected": -53.3104248046875, "loss": 0.238, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.08940169960260391, "rewards/margins": 2.5656909942626953, "rewards/rejected": -2.655092716217041, "step": 320 }, { "epoch": 0.7385562251570714, "eval_logits/chosen": 0.4224054217338562, "eval_logits/rejected": 0.4482380449771881, "eval_logps/chosen": -41.65960693359375, "eval_logps/rejected": -53.47556686401367, "eval_loss": 0.2701371908187866, "eval_rewards/accuracies": 0.8185483813285828, "eval_rewards/chosen": 0.04283595457673073, "eval_rewards/margins": 2.761453866958618, "eval_rewards/rejected": -2.718618154525757, "eval_runtime": 220.4956, "eval_samples_per_second": 7.864, "eval_steps_per_second": 1.968, "step": 320 }, { "epoch": 0.7431722015643031, "grad_norm": 40.34998221595971, "learning_rate": 3.958839382467084e-07, "logits/chosen": 0.5077357888221741, "logits/rejected": 0.5302278995513916, "logps/chosen": -38.23583984375, "logps/rejected": -49.62001037597656, "loss": 0.2911, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.431808739900589, "rewards/margins": 2.4383790493011475, "rewards/rejected": -2.0065698623657227, "step": 322 }, { "epoch": 0.7477881779715349, "grad_norm": 37.34949673704143, "learning_rate": 3.9424169600274494e-07, "logits/chosen": 0.5166856646537781, "logits/rejected": 0.5311781167984009, "logps/chosen": -43.24025344848633, "logps/rejected": -48.49333190917969, "loss": 0.3054, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.1698003113269806, "rewards/margins": 2.2522177696228027, "rewards/rejected": -2.422018051147461, "step": 324 }, { "epoch": 0.7524041543787665, "grad_norm": 25.91010722050029, "learning_rate": 3.9259007007666436e-07, "logits/chosen": 0.5167285203933716, "logits/rejected": 0.5338759422302246, "logps/chosen": -44.82267761230469, "logps/rejected": -55.40620803833008, "loss": 0.2723, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.06528851389884949, "rewards/margins": 2.759828805923462, "rewards/rejected": -2.694540023803711, "step": 326 }, { "epoch": 0.7570201307859982, "grad_norm": 30.862683948057615, "learning_rate": 3.909291679154332e-07, "logits/chosen": 0.5040656328201294, "logits/rejected": 0.5386430025100708, "logps/chosen": -42.25190734863281, "logps/rejected": -62.51930618286133, "loss": 0.2759, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.2548324167728424, "rewards/margins": 3.0783848762512207, "rewards/rejected": -3.333217144012451, "step": 328 }, { "epoch": 0.7616361071932299, "grad_norm": 19.125155732205084, "learning_rate": 3.892590975694858e-07, "logits/chosen": 0.49563461542129517, "logits/rejected": 0.539116621017456, "logps/chosen": -39.31736755371094, "logps/rejected": -60.45228576660156, "loss": 0.2182, "rewards/accuracies": 0.875, "rewards/chosen": 0.16796639561653137, "rewards/margins": 3.4695467948913574, "rewards/rejected": -3.301579713821411, "step": 330 }, { "epoch": 0.7616361071932299, "eval_logits/chosen": 0.4225333333015442, "eval_logits/rejected": 0.44842836260795593, "eval_logps/chosen": -41.670494079589844, "eval_logps/rejected": -53.553314208984375, "eval_loss": 0.2688952684402466, "eval_rewards/accuracies": 0.8145161271095276, "eval_rewards/chosen": 0.037393342703580856, "eval_rewards/margins": 2.7948849201202393, "eval_rewards/rejected": -2.7574915885925293, "eval_runtime": 220.4734, "eval_samples_per_second": 7.865, "eval_steps_per_second": 1.968, "step": 330 }, { "epoch": 0.7662520836004616, "grad_norm": 20.197390141727503, "learning_rate": 3.875799676856952e-07, "logits/chosen": 0.5481100082397461, "logits/rejected": 0.5680783987045288, "logps/chosen": -43.26856994628906, "logps/rejected": -54.90293884277344, "loss": 0.2148, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.2920362651348114, "rewards/margins": 2.9112956523895264, "rewards/rejected": -3.20333194732666, "step": 332 }, { "epoch": 0.7708680600076933, "grad_norm": 28.41138671183374, "learning_rate": 3.858918875003053e-07, "logits/chosen": 0.5375738143920898, "logits/rejected": 0.5755133628845215, "logps/chosen": -41.622859954833984, "logps/rejected": -61.92311096191406, "loss": 0.2733, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.09028860926628113, "rewards/margins": 3.286768674850464, "rewards/rejected": -3.3770573139190674, "step": 334 }, { "epoch": 0.775484036414925, "grad_norm": 16.265551276537238, "learning_rate": 3.8419496683182396e-07, "logits/chosen": 0.5556432604789734, "logits/rejected": 0.5942565202713013, "logps/chosen": -41.74842071533203, "logps/rejected": -57.50096893310547, "loss": 0.1896, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.0623447448015213, "rewards/margins": 2.878957748413086, "rewards/rejected": -2.941302537918091, "step": 336 }, { "epoch": 0.7801000128221567, "grad_norm": 26.59915287717055, "learning_rate": 3.824893160738792e-07, "logits/chosen": 0.5246456861495972, "logits/rejected": 0.553848385810852, "logps/chosen": -42.39156723022461, "logps/rejected": -57.20592498779297, "loss": 0.2682, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.07438618689775467, "rewards/margins": 3.046879291534424, "rewards/rejected": -3.1212656497955322, "step": 338 }, { "epoch": 0.7847159892293883, "grad_norm": 23.023616857684974, "learning_rate": 3.8077504618803737e-07, "logits/chosen": 0.580450713634491, "logits/rejected": 0.5835237503051758, "logps/chosen": -48.9189567565918, "logps/rejected": -47.836578369140625, "loss": 0.2668, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.06672815978527069, "rewards/margins": 2.457933187484741, "rewards/rejected": -2.5246615409851074, "step": 340 }, { "epoch": 0.7847159892293883, "eval_logits/chosen": 0.4240727126598358, "eval_logits/rejected": 0.4500102698802948, "eval_logps/chosen": -41.714290618896484, "eval_logps/rejected": -53.6696662902832, "eval_loss": 0.2670309841632843, "eval_rewards/accuracies": 0.8179723620414734, "eval_rewards/chosen": 0.015493539161980152, "eval_rewards/margins": 2.83115816116333, "eval_rewards/rejected": -2.815664768218994, "eval_runtime": 220.6721, "eval_samples_per_second": 7.858, "eval_steps_per_second": 1.967, "step": 340 }, { "epoch": 0.7893319656366201, "grad_norm": 16.479244956266236, "learning_rate": 3.7905226869658446e-07, "logits/chosen": 0.4684799015522003, "logits/rejected": 0.4874458909034729, "logps/chosen": -43.62626647949219, "logps/rejected": -55.70362854003906, "loss": 0.2494, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.13190358877182007, "rewards/margins": 2.8091206550598145, "rewards/rejected": -2.6772167682647705, "step": 342 }, { "epoch": 0.7939479420438518, "grad_norm": 24.369877883114157, "learning_rate": 3.773210956752709e-07, "logits/chosen": 0.544243574142456, "logits/rejected": 0.5578660368919373, "logps/chosen": -40.1495246887207, "logps/rejected": -44.17314910888672, "loss": 0.2798, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.020625757053494453, "rewards/margins": 2.502214193344116, "rewards/rejected": -2.481588363647461, "step": 344 }, { "epoch": 0.7985639184510834, "grad_norm": 25.623903462647995, "learning_rate": 3.7558163974602093e-07, "logits/chosen": 0.474899560213089, "logits/rejected": 0.5161857008934021, "logps/chosen": -37.74607467651367, "logps/rejected": -55.48906707763672, "loss": 0.2419, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.001830246765166521, "rewards/margins": 2.923034906387329, "rewards/rejected": -2.924865245819092, "step": 346 }, { "epoch": 0.8031798948583152, "grad_norm": 25.184522607734593, "learning_rate": 3.73834014069605e-07, "logits/chosen": 0.558302104473114, "logits/rejected": 0.5833041667938232, "logps/chosen": -48.4046630859375, "logps/rejected": -61.20756149291992, "loss": 0.2374, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.1346227377653122, "rewards/margins": 2.8843278884887695, "rewards/rejected": -3.0189502239227295, "step": 348 }, { "epoch": 0.8077958712655469, "grad_norm": 24.77024105098058, "learning_rate": 3.7207833233827914e-07, "logits/chosen": 0.4649287462234497, "logits/rejected": 0.482571542263031, "logps/chosen": -44.39389419555664, "logps/rejected": -58.24624252319336, "loss": 0.2534, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.14530682563781738, "rewards/margins": 3.2228527069091797, "rewards/rejected": -3.368159532546997, "step": 350 }, { "epoch": 0.8077958712655469, "eval_logits/chosen": 0.42746230959892273, "eval_logits/rejected": 0.45336535573005676, "eval_logps/chosen": -42.037269592285156, "eval_logps/rejected": -54.03358459472656, "eval_loss": 0.2634715437889099, "eval_rewards/accuracies": 0.8168202638626099, "eval_rewards/chosen": -0.1459963023662567, "eval_rewards/margins": 2.8516335487365723, "eval_rewards/rejected": -2.9976296424865723, "eval_runtime": 220.3701, "eval_samples_per_second": 7.869, "eval_steps_per_second": 1.969, "step": 350 }, { "epoch": 0.8124118476727785, "grad_norm": 26.201135314502036, "learning_rate": 3.7031470876838786e-07, "logits/chosen": 0.5293068289756775, "logits/rejected": 0.5655782222747803, "logps/chosen": -42.89842224121094, "logps/rejected": -63.14483642578125, "loss": 0.2516, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.4706004559993744, "rewards/margins": 2.8817062377929688, "rewards/rejected": -3.352307081222534, "step": 352 }, { "epoch": 0.8170278240800103, "grad_norm": 22.294887268242963, "learning_rate": 3.6854325809293455e-07, "logits/chosen": 0.49771615862846375, "logits/rejected": 0.5413529276847839, "logps/chosen": -36.90867233276367, "logps/rejected": -64.4770278930664, "loss": 0.2284, "rewards/accuracies": 0.875, "rewards/chosen": -0.27428972721099854, "rewards/margins": 3.501157522201538, "rewards/rejected": -3.775447368621826, "step": 354 }, { "epoch": 0.821643800487242, "grad_norm": 28.188753078893058, "learning_rate": 3.6676409555411653e-07, "logits/chosen": 0.5484297871589661, "logits/rejected": 0.5813949704170227, "logps/chosen": -45.460365295410156, "logps/rejected": -60.86439895629883, "loss": 0.2542, "rewards/accuracies": 0.875, "rewards/chosen": -0.4246326684951782, "rewards/margins": 3.2056918144226074, "rewards/rejected": -3.630324363708496, "step": 356 }, { "epoch": 0.8262597768944736, "grad_norm": 17.14121226520804, "learning_rate": 3.6497733689582866e-07, "logits/chosen": 0.48876845836639404, "logits/rejected": 0.5145962238311768, "logps/chosen": -39.37761688232422, "logps/rejected": -49.643211364746094, "loss": 0.2016, "rewards/accuracies": 0.875, "rewards/chosen": -0.21149006485939026, "rewards/margins": 2.893353223800659, "rewards/rejected": -3.1048433780670166, "step": 358 }, { "epoch": 0.8308757533017054, "grad_norm": 35.17955186267088, "learning_rate": 3.631830983561335e-07, "logits/chosen": 0.573662519454956, "logits/rejected": 0.5948094725608826, "logps/chosen": -47.85080337524414, "logps/rejected": -52.225006103515625, "loss": 0.2586, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.3559052646160126, "rewards/margins": 2.786222219467163, "rewards/rejected": -3.142127513885498, "step": 360 }, { "epoch": 0.8308757533017054, "eval_logits/chosen": 0.42756161093711853, "eval_logits/rejected": 0.45349106192588806, "eval_logps/chosen": -42.38340759277344, "eval_logps/rejected": -54.44844436645508, "eval_loss": 0.2630784213542938, "eval_rewards/accuracies": 0.8179723620414734, "eval_rewards/chosen": -0.31906506419181824, "eval_rewards/margins": 2.8859920501708984, "eval_rewards/rejected": -3.205056667327881, "eval_runtime": 220.2057, "eval_samples_per_second": 7.874, "eval_steps_per_second": 1.971, "step": 360 }, { "epoch": 0.835491729708937, "grad_norm": 36.03053976982613, "learning_rate": 3.613814966596991e-07, "logits/chosen": 0.5263631343841553, "logits/rejected": 0.5573300123214722, "logps/chosen": -43.24696731567383, "logps/rejected": -57.23331069946289, "loss": 0.2526, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.4683598279953003, "rewards/margins": 3.082267999649048, "rewards/rejected": -3.5506277084350586, "step": 362 }, { "epoch": 0.8401077061161687, "grad_norm": 15.328563865471402, "learning_rate": 3.595726490102059e-07, "logits/chosen": 0.5707637071609497, "logits/rejected": 0.6143693327903748, "logps/chosen": -40.44147491455078, "logps/rejected": -62.61209487915039, "loss": 0.1294, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.3496915102005005, "rewards/margins": 3.486618995666504, "rewards/rejected": -3.836310863494873, "step": 364 }, { "epoch": 0.8447236825234005, "grad_norm": 15.002635114989888, "learning_rate": 3.577566730827214e-07, "logits/chosen": 0.5126733779907227, "logits/rejected": 0.5439874529838562, "logps/chosen": -40.29549789428711, "logps/rejected": -56.204898834228516, "loss": 0.2951, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -0.3362084925174713, "rewards/margins": 2.846021890640259, "rewards/rejected": -3.182230234146118, "step": 366 }, { "epoch": 0.8493396589306321, "grad_norm": 25.52691859216037, "learning_rate": 3.559336870160453e-07, "logits/chosen": 0.5128374099731445, "logits/rejected": 0.5424924492835999, "logps/chosen": -38.71543884277344, "logps/rejected": -52.61689758300781, "loss": 0.2084, "rewards/accuracies": 0.875, "rewards/chosen": -0.28658950328826904, "rewards/margins": 3.0817792415618896, "rewards/rejected": -3.368368625640869, "step": 368 }, { "epoch": 0.8539556353378638, "grad_norm": 30.283513234320385, "learning_rate": 3.541038094050241e-07, "logits/chosen": 0.515430212020874, "logits/rejected": 0.5466374158859253, "logps/chosen": -45.59136962890625, "logps/rejected": -63.18849182128906, "loss": 0.2378, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.5768634676933289, "rewards/margins": 3.5630674362182617, "rewards/rejected": -4.139930725097656, "step": 370 }, { "epoch": 0.8539556353378638, "eval_logits/chosen": 0.4274056553840637, "eval_logits/rejected": 0.45338377356529236, "eval_logps/chosen": -43.063682556152344, "eval_logps/rejected": -55.225093841552734, "eval_loss": 0.2617854177951813, "eval_rewards/accuracies": 0.817396342754364, "eval_rewards/chosen": -0.659203290939331, "eval_rewards/margins": 2.9341788291931152, "eval_rewards/rejected": -3.5933821201324463, "eval_runtime": 220.2088, "eval_samples_per_second": 7.874, "eval_steps_per_second": 1.971, "step": 370 }, { "epoch": 0.8585716117450956, "grad_norm": 30.9826241797592, "learning_rate": 3.52267159292835e-07, "logits/chosen": 0.4993041455745697, "logits/rejected": 0.5248599052429199, "logps/chosen": -44.83211898803711, "logps/rejected": -61.29323959350586, "loss": 0.2333, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.7047384977340698, "rewards/margins": 3.358118772506714, "rewards/rejected": -4.062856674194336, "step": 372 }, { "epoch": 0.8631875881523272, "grad_norm": 16.52463887201103, "learning_rate": 3.5042385616324236e-07, "logits/chosen": 0.4287330210208893, "logits/rejected": 0.46707651019096375, "logps/chosen": -36.363590240478516, "logps/rejected": -59.82657241821289, "loss": 0.22, "rewards/accuracies": 0.875, "rewards/chosen": -0.8720024228096008, "rewards/margins": 3.389249086380005, "rewards/rejected": -4.261251449584961, "step": 374 }, { "epoch": 0.8678035645595589, "grad_norm": 15.500715269356169, "learning_rate": 3.485740199328244e-07, "logits/chosen": 0.5408291816711426, "logits/rejected": 0.5578600764274597, "logps/chosen": -50.285335540771484, "logps/rejected": -54.07209014892578, "loss": 0.1876, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.5448592305183411, "rewards/margins": 3.2346181869506836, "rewards/rejected": -3.779477119445801, "step": 376 }, { "epoch": 0.8724195409667906, "grad_norm": 12.222084345575727, "learning_rate": 3.4671777094317196e-07, "logits/chosen": 0.5013281106948853, "logits/rejected": 0.5262949466705322, "logps/chosen": -46.47956848144531, "logps/rejected": -53.49814224243164, "loss": 0.1677, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.7341945767402649, "rewards/margins": 3.0543222427368164, "rewards/rejected": -3.7885169982910156, "step": 378 }, { "epoch": 0.8770355173740223, "grad_norm": 22.531696347522484, "learning_rate": 3.448552299530595e-07, "logits/chosen": 0.5649933218955994, "logits/rejected": 0.5860426425933838, "logps/chosen": -42.52098083496094, "logps/rejected": -52.308616638183594, "loss": 0.3071, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -0.9869860410690308, "rewards/margins": 2.7113142013549805, "rewards/rejected": -3.698300361633301, "step": 380 }, { "epoch": 0.8770355173740223, "eval_logits/chosen": 0.4274827539920807, "eval_logits/rejected": 0.45349830389022827, "eval_logps/chosen": -43.129615783691406, "eval_logps/rejected": -55.33893585205078, "eval_loss": 0.2627149224281311, "eval_rewards/accuracies": 0.8156682252883911, "eval_rewards/chosen": -0.6921693086624146, "eval_rewards/margins": 2.958131790161133, "eval_rewards/rejected": -3.650301218032837, "eval_runtime": 220.3046, "eval_samples_per_second": 7.871, "eval_steps_per_second": 1.97, "step": 380 }, { "epoch": 0.881651493781254, "grad_norm": 39.03269809250303, "learning_rate": 3.429865181305894e-07, "logits/chosen": 0.5594089031219482, "logits/rejected": 0.5762946605682373, "logps/chosen": -46.85918045043945, "logps/rejected": -55.68655776977539, "loss": 0.2915, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.9153691530227661, "rewards/margins": 2.779404401779175, "rewards/rejected": -3.694772958755493, "step": 382 }, { "epoch": 0.8862674701884857, "grad_norm": 25.617922410092657, "learning_rate": 3.411117570453091e-07, "logits/chosen": 0.5484945774078369, "logits/rejected": 0.5738579034805298, "logps/chosen": -42.73631286621094, "logps/rejected": -53.853271484375, "loss": 0.2369, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.7328565120697021, "rewards/margins": 2.8266656398773193, "rewards/rejected": -3.5595223903656006, "step": 384 }, { "epoch": 0.8908834465957174, "grad_norm": 30.869961559508535, "learning_rate": 3.392310686603025e-07, "logits/chosen": 0.534080982208252, "logits/rejected": 0.5444844365119934, "logps/chosen": -42.41215515136719, "logps/rejected": -50.85294723510742, "loss": 0.2909, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": -0.9006066918373108, "rewards/margins": 2.361262559890747, "rewards/rejected": -3.261868953704834, "step": 386 }, { "epoch": 0.895499423002949, "grad_norm": 19.657432685783167, "learning_rate": 3.3734457532425554e-07, "logits/chosen": 0.5231594443321228, "logits/rejected": 0.5530441403388977, "logps/chosen": -42.48830795288086, "logps/rejected": -57.00692367553711, "loss": 0.2606, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.6170899271965027, "rewards/margins": 3.237041711807251, "rewards/rejected": -3.8541314601898193, "step": 388 }, { "epoch": 0.9001153994101808, "grad_norm": 24.399140672578795, "learning_rate": 3.354523997634969e-07, "logits/chosen": 0.540899932384491, "logits/rejected": 0.5695917010307312, "logps/chosen": -44.531185150146484, "logps/rejected": -58.8494873046875, "loss": 0.2251, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.7790883183479309, "rewards/margins": 3.128167152404785, "rewards/rejected": -3.9072554111480713, "step": 390 }, { "epoch": 0.9001153994101808, "eval_logits/chosen": 0.42857107520103455, "eval_logits/rejected": 0.4546278119087219, "eval_logps/chosen": -43.16852951049805, "eval_logps/rejected": -55.42344665527344, "eval_loss": 0.2621525228023529, "eval_rewards/accuracies": 0.8179723620414734, "eval_rewards/chosen": -0.7116276621818542, "eval_rewards/margins": 2.980929374694824, "eval_rewards/rejected": -3.6925570964813232, "eval_runtime": 220.3143, "eval_samples_per_second": 7.871, "eval_steps_per_second": 1.97, "step": 390 }, { "epoch": 0.9047313758174125, "grad_norm": 35.01908054863291, "learning_rate": 3.3355466507401374e-07, "logits/chosen": 0.5315423607826233, "logits/rejected": 0.5454668998718262, "logps/chosen": -42.16218185424805, "logps/rejected": -44.85585403442383, "loss": 0.372, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": -0.805086612701416, "rewards/margins": 2.338005542755127, "rewards/rejected": -3.143092155456543, "step": 392 }, { "epoch": 0.9093473522246441, "grad_norm": 21.288998506479572, "learning_rate": 3.3165149471344394e-07, "logits/chosen": 0.5552914142608643, "logits/rejected": 0.5818530321121216, "logps/chosen": -42.95904541015625, "logps/rejected": -52.76212692260742, "loss": 0.2934, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.9580552577972412, "rewards/margins": 2.6676671504974365, "rewards/rejected": -3.6257221698760986, "step": 394 }, { "epoch": 0.9139633286318759, "grad_norm": 25.556003693396036, "learning_rate": 3.297430124930444e-07, "logits/chosen": 0.582655668258667, "logits/rejected": 0.5952574014663696, "logps/chosen": -48.771934509277344, "logps/rejected": -54.426483154296875, "loss": 0.3223, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": -0.6146318912506104, "rewards/margins": 2.4974234104156494, "rewards/rejected": -3.112055540084839, "step": 396 }, { "epoch": 0.9185793050391076, "grad_norm": 23.905362174336005, "learning_rate": 3.2782934256963647e-07, "logits/chosen": 0.5089656114578247, "logits/rejected": 0.5398065447807312, "logps/chosen": -45.75530242919922, "logps/rejected": -61.64253234863281, "loss": 0.2549, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.6105983853340149, "rewards/margins": 3.1589841842651367, "rewards/rejected": -3.769582509994507, "step": 398 }, { "epoch": 0.9231952814463392, "grad_norm": 24.17532494020093, "learning_rate": 3.259106094375289e-07, "logits/chosen": 0.539167046546936, "logits/rejected": 0.5812445282936096, "logps/chosen": -39.31736755371094, "logps/rejected": -63.33793640136719, "loss": 0.2698, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.3948301374912262, "rewards/margins": 3.442387819290161, "rewards/rejected": -3.8372182846069336, "step": 400 }, { "epoch": 0.9231952814463392, "eval_logits/chosen": 0.42656469345092773, "eval_logits/rejected": 0.45276370644569397, "eval_logps/chosen": -42.66855239868164, "eval_logps/rejected": -55.0075798034668, "eval_loss": 0.2560158371925354, "eval_rewards/accuracies": 0.8231566548347473, "eval_rewards/chosen": -0.46163854002952576, "eval_rewards/margins": 3.0229856967926025, "eval_rewards/rejected": -3.4846243858337402, "eval_runtime": 220.2216, "eval_samples_per_second": 7.874, "eval_steps_per_second": 1.971, "step": 400 }, { "epoch": 0.927811257853571, "grad_norm": 30.671620714098214, "learning_rate": 3.239869379204189e-07, "logits/chosen": 0.4974105656147003, "logits/rejected": 0.5221477746963501, "logps/chosen": -45.057281494140625, "logps/rejected": -56.83816909790039, "loss": 0.2017, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.5868238210678101, "rewards/margins": 3.3964414596557617, "rewards/rejected": -3.9832653999328613, "step": 402 }, { "epoch": 0.9324272342608027, "grad_norm": 24.915176146115876, "learning_rate": 3.2205845316327144e-07, "logits/chosen": 0.5429517030715942, "logits/rejected": 0.5683455467224121, "logps/chosen": -34.97327423095703, "logps/rejected": -46.666717529296875, "loss": 0.3399, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": -0.43591320514678955, "rewards/margins": 2.185106039047241, "rewards/rejected": -2.6210196018218994, "step": 404 }, { "epoch": 0.9370432106680343, "grad_norm": 23.867375292949593, "learning_rate": 3.2012528062417845e-07, "logits/chosen": 0.5323294997215271, "logits/rejected": 0.5459015369415283, "logps/chosen": -43.10551071166992, "logps/rejected": -47.71934127807617, "loss": 0.2436, "rewards/accuracies": 0.875, "rewards/chosen": -0.7240028977394104, "rewards/margins": 2.4708030223846436, "rewards/rejected": -3.1948060989379883, "step": 406 }, { "epoch": 0.9416591870752661, "grad_norm": 15.007721932706033, "learning_rate": 3.1818754606619643e-07, "logits/chosen": 0.5331852436065674, "logits/rejected": 0.564946174621582, "logps/chosen": -36.540283203125, "logps/rejected": -57.03317642211914, "loss": 0.2822, "rewards/accuracies": 0.75, "rewards/chosen": -0.16474466025829315, "rewards/margins": 3.167923927307129, "rewards/rejected": -3.3326683044433594, "step": 408 }, { "epoch": 0.9462751634824977, "grad_norm": 22.364487052769828, "learning_rate": 3.162453755491655e-07, "logits/chosen": 0.49684393405914307, "logits/rejected": 0.5316374897956848, "logps/chosen": -38.39241027832031, "logps/rejected": -59.15244674682617, "loss": 0.1874, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.322665810585022, "rewards/margins": 3.4969892501831055, "rewards/rejected": -3.819655179977417, "step": 410 }, { "epoch": 0.9462751634824977, "eval_logits/chosen": 0.4290708899497986, "eval_logits/rejected": 0.45515918731689453, "eval_logps/chosen": -42.679603576660156, "eval_logps/rejected": -55.10276412963867, "eval_loss": 0.2565246820449829, "eval_rewards/accuracies": 0.8191244006156921, "eval_rewards/chosen": -0.467162162065506, "eval_rewards/margins": 3.065053939819336, "eval_rewards/rejected": -3.5322158336639404, "eval_runtime": 220.2891, "eval_samples_per_second": 7.871, "eval_steps_per_second": 1.97, "step": 410 }, { "epoch": 0.9508911398897294, "grad_norm": 25.19862106785063, "learning_rate": 3.142988954215079e-07, "logits/chosen": 0.5264102816581726, "logits/rejected": 0.5622512698173523, "logps/chosen": -43.48373794555664, "logps/rejected": -66.42120361328125, "loss": 0.2996, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.48827776312828064, "rewards/margins": 3.3450686931610107, "rewards/rejected": -3.833346128463745, "step": 412 }, { "epoch": 0.9555071162969612, "grad_norm": 21.74301345510537, "learning_rate": 3.1234823231200925e-07, "logits/chosen": 0.5031583309173584, "logits/rejected": 0.5540390014648438, "logps/chosen": -40.93600845336914, "logps/rejected": -66.30878448486328, "loss": 0.2428, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.5792509317398071, "rewards/margins": 3.6368870735168457, "rewards/rejected": -4.2161383628845215, "step": 414 }, { "epoch": 0.9601230927041928, "grad_norm": 22.436508219334904, "learning_rate": 3.1039351312157993e-07, "logits/chosen": 0.56053227186203, "logits/rejected": 0.590539813041687, "logps/chosen": -41.67660140991211, "logps/rejected": -58.28109359741211, "loss": 0.2048, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.6333367228507996, "rewards/margins": 3.312451124191284, "rewards/rejected": -3.9457881450653076, "step": 416 }, { "epoch": 0.9647390691114246, "grad_norm": 36.50210265432233, "learning_rate": 3.0843486501499967e-07, "logits/chosen": 0.508413553237915, "logits/rejected": 0.5429882407188416, "logps/chosen": -42.58755111694336, "logps/rejected": -52.10399627685547, "loss": 0.3069, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.4402269721031189, "rewards/margins": 2.6428239345550537, "rewards/rejected": -3.0830507278442383, "step": 418 }, { "epoch": 0.9693550455186563, "grad_norm": 19.432988353108243, "learning_rate": 3.064724154126449e-07, "logits/chosen": 0.48101869225502014, "logits/rejected": 0.49470260739326477, "logps/chosen": -43.99076461791992, "logps/rejected": -47.8154411315918, "loss": 0.2486, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.6770768761634827, "rewards/margins": 2.6182446479797363, "rewards/rejected": -3.2953217029571533, "step": 420 }, { "epoch": 0.9693550455186563, "eval_logits/chosen": 0.4298844337463379, "eval_logits/rejected": 0.45596131682395935, "eval_logps/chosen": -42.74457550048828, "eval_logps/rejected": -55.1827278137207, "eval_loss": 0.2540464699268341, "eval_rewards/accuracies": 0.8231566548347473, "eval_rewards/chosen": -0.4996483027935028, "eval_rewards/margins": 3.072551727294922, "eval_rewards/rejected": -3.572199821472168, "eval_runtime": 220.4655, "eval_samples_per_second": 7.865, "eval_steps_per_second": 1.969, "step": 420 }, { "epoch": 0.9739710219258879, "grad_norm": 21.396529357952137, "learning_rate": 3.045062919821995e-07, "logits/chosen": 0.5096142292022705, "logits/rejected": 0.5509178638458252, "logps/chosen": -40.65134811401367, "logps/rejected": -64.13406372070312, "loss": 0.2407, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.25429394841194153, "rewards/margins": 3.5406899452209473, "rewards/rejected": -3.7949838638305664, "step": 422 }, { "epoch": 0.9785869983331197, "grad_norm": 27.30197314549755, "learning_rate": 3.0253662263034925e-07, "logits/chosen": 0.5253940224647522, "logits/rejected": 0.5617537498474121, "logps/chosen": -44.63224792480469, "logps/rejected": -62.29665756225586, "loss": 0.2666, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.6128353476524353, "rewards/margins": 3.4666247367858887, "rewards/rejected": -4.079460144042969, "step": 424 }, { "epoch": 0.9832029747403513, "grad_norm": 40.51282949087652, "learning_rate": 3.005635354944606e-07, "logits/chosen": 0.5502428412437439, "logits/rejected": 0.5616468787193298, "logps/chosen": -46.97676467895508, "logps/rejected": -46.36595153808594, "loss": 0.2894, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.7273317575454712, "rewards/margins": 2.6478114128112793, "rewards/rejected": -3.375143051147461, "step": 426 }, { "epoch": 0.987818951147583, "grad_norm": 23.92512657865844, "learning_rate": 2.9858715893424504e-07, "logits/chosen": 0.5228149890899658, "logits/rejected": 0.5698718428611755, "logps/chosen": -40.91889953613281, "logps/rejected": -64.06893920898438, "loss": 0.1794, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.32393407821655273, "rewards/margins": 3.8048884868621826, "rewards/rejected": -4.128821849822998, "step": 428 }, { "epoch": 0.9924349275548148, "grad_norm": 18.33017798245734, "learning_rate": 2.966076215234082e-07, "logits/chosen": 0.5833015441894531, "logits/rejected": 0.6151509881019592, "logps/chosen": -47.47243118286133, "logps/rejected": -64.26097869873047, "loss": 0.2098, "rewards/accuracies": 0.875, "rewards/chosen": -0.2687421441078186, "rewards/margins": 3.582411766052246, "rewards/rejected": -3.85115385055542, "step": 430 }, { "epoch": 0.9924349275548148, "eval_logits/chosen": 0.42911431193351746, "eval_logits/rejected": 0.45535048842430115, "eval_logps/chosen": -42.6432991027832, "eval_logps/rejected": -55.0967903137207, "eval_loss": 0.25298023223876953, "eval_rewards/accuracies": 0.8237327337265015, "eval_rewards/chosen": -0.4490084946155548, "eval_rewards/margins": 3.0802206993103027, "eval_rewards/rejected": -3.5292294025421143, "eval_runtime": 220.5016, "eval_samples_per_second": 7.864, "eval_steps_per_second": 1.968, "step": 430 }, { "epoch": 0.9970509039620464, "grad_norm": 24.845062608395242, "learning_rate": 2.94625052041286e-07, "logits/chosen": 0.529398500919342, "logits/rejected": 0.5461426377296448, "logps/chosen": -42.26673889160156, "logps/rejected": -52.43321228027344, "loss": 0.2582, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.38506922125816345, "rewards/margins": 2.947833299636841, "rewards/rejected": -3.332902431488037, "step": 432 }, { "epoch": 1.001666880369278, "grad_norm": 14.705625802608846, "learning_rate": 2.926395794644665e-07, "logits/chosen": 0.5060461759567261, "logits/rejected": 0.5222041010856628, "logps/chosen": -45.8979606628418, "logps/rejected": -55.48097229003906, "loss": 0.1798, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.3213649392127991, "rewards/margins": 3.302720308303833, "rewards/rejected": -3.6240854263305664, "step": 434 }, { "epoch": 1.0062828567765099, "grad_norm": 24.90302953634143, "learning_rate": 2.906513329583991e-07, "logits/chosen": 0.5120677351951599, "logits/rejected": 0.5406749844551086, "logps/chosen": -40.07225036621094, "logps/rejected": -54.882259368896484, "loss": 0.2186, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.5253066420555115, "rewards/margins": 3.1281352043151855, "rewards/rejected": -3.653441905975342, "step": 436 }, { "epoch": 1.0108988331837414, "grad_norm": 20.006366802619794, "learning_rate": 2.886604418689921e-07, "logits/chosen": 0.48885577917099, "logits/rejected": 0.5327137112617493, "logps/chosen": -38.752708435058594, "logps/rejected": -66.8874740600586, "loss": 0.2705, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -0.5506837368011475, "rewards/margins": 3.6388425827026367, "rewards/rejected": -4.189526557922363, "step": 438 }, { "epoch": 1.0155148095909732, "grad_norm": 11.538422039384988, "learning_rate": 2.866670357141979e-07, "logits/chosen": 0.5471632480621338, "logits/rejected": 0.5706813931465149, "logps/chosen": -44.1706428527832, "logps/rejected": -54.80915832519531, "loss": 0.2123, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.5128348469734192, "rewards/margins": 3.5640437602996826, "rewards/rejected": -4.076879024505615, "step": 440 }, { "epoch": 1.0155148095909732, "eval_logits/chosen": 0.42714568972587585, "eval_logits/rejected": 0.4533489942550659, "eval_logps/chosen": -42.395565032958984, "eval_logps/rejected": -54.934104919433594, "eval_loss": 0.2539977729320526, "eval_rewards/accuracies": 0.8231566548347473, "eval_rewards/chosen": -0.3251444697380066, "eval_rewards/margins": 3.122741937637329, "eval_rewards/rejected": -3.4478864669799805, "eval_runtime": 220.3559, "eval_samples_per_second": 7.869, "eval_steps_per_second": 1.97, "step": 440 }, { "epoch": 1.020130785998205, "grad_norm": 16.119320288131345, "learning_rate": 2.8467124417558737e-07, "logits/chosen": 0.5559278130531311, "logits/rejected": 0.5782606601715088, "logps/chosen": -43.08287048339844, "logps/rejected": -55.4886474609375, "loss": 0.2118, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.22590351104736328, "rewards/margins": 3.3553009033203125, "rewards/rejected": -3.581204414367676, "step": 442 }, { "epoch": 1.0247467624054365, "grad_norm": 21.10014479926061, "learning_rate": 2.8267319708991253e-07, "logits/chosen": 0.5570061206817627, "logits/rejected": 0.5741885304450989, "logps/chosen": -46.57928466796875, "logps/rejected": -48.77629089355469, "loss": 0.2203, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.2255779653787613, "rewards/margins": 2.8583762645721436, "rewards/rejected": -3.083954334259033, "step": 444 }, { "epoch": 1.0293627388126683, "grad_norm": 21.99323071947427, "learning_rate": 2.806730244406612e-07, "logits/chosen": 0.5444987416267395, "logits/rejected": 0.5731097459793091, "logps/chosen": -40.73080825805664, "logps/rejected": -52.80342102050781, "loss": 0.2407, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.2986847758293152, "rewards/margins": 3.0820257663726807, "rewards/rejected": -3.3807103633880615, "step": 446 }, { "epoch": 1.0339787152199, "grad_norm": 17.17450683483707, "learning_rate": 2.786708563496001e-07, "logits/chosen": 0.5541989207267761, "logits/rejected": 0.5817456841468811, "logps/chosen": -45.73213195800781, "logps/rejected": -61.18666458129883, "loss": 0.1772, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.05669987201690674, "rewards/margins": 3.8165981769561768, "rewards/rejected": -3.873298168182373, "step": 448 }, { "epoch": 1.0385946916271316, "grad_norm": 27.653708636239905, "learning_rate": 2.7666682306830994e-07, "logits/chosen": 0.5207394957542419, "logits/rejected": 0.5322983860969543, "logps/chosen": -41.09166717529297, "logps/rejected": -43.31468200683594, "loss": 0.2544, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.3381701707839966, "rewards/margins": 2.6456761360168457, "rewards/rejected": -2.9838459491729736, "step": 450 }, { "epoch": 1.0385946916271316, "eval_logits/chosen": 0.43128177523612976, "eval_logits/rejected": 0.4573296308517456, "eval_logps/chosen": -42.16498565673828, "eval_logps/rejected": -54.75392150878906, "eval_loss": 0.2521970570087433, "eval_rewards/accuracies": 0.8248847723007202, "eval_rewards/chosen": -0.20985357463359833, "eval_rewards/margins": 3.147939920425415, "eval_rewards/rejected": -3.3577938079833984, "eval_runtime": 220.2887, "eval_samples_per_second": 7.871, "eval_steps_per_second": 1.97, "step": 450 }, { "epoch": 1.0432106680343634, "grad_norm": 26.863807248353726, "learning_rate": 2.746610549697119e-07, "logits/chosen": 0.5497666001319885, "logits/rejected": 0.5746829509735107, "logps/chosen": -42.95619583129883, "logps/rejected": -57.17405700683594, "loss": 0.2279, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.33137860894203186, "rewards/margins": 3.0671894550323486, "rewards/rejected": -3.3985676765441895, "step": 452 }, { "epoch": 1.0478266444415951, "grad_norm": 15.765922708965844, "learning_rate": 2.7265368253958615e-07, "logits/chosen": 0.5027904510498047, "logits/rejected": 0.5187773108482361, "logps/chosen": -40.01198959350586, "logps/rejected": -49.16390609741211, "loss": 0.1826, "rewards/accuracies": 0.875, "rewards/chosen": -0.024355987086892128, "rewards/margins": 3.001004219055176, "rewards/rejected": -3.025360107421875, "step": 454 }, { "epoch": 1.0524426208488267, "grad_norm": 13.117750938407347, "learning_rate": 2.706448363680831e-07, "logits/chosen": 0.5505272746086121, "logits/rejected": 0.592627763748169, "logps/chosen": -40.86323928833008, "logps/rejected": -65.0215072631836, "loss": 0.1182, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": -0.19750367105007172, "rewards/margins": 4.092833995819092, "rewards/rejected": -4.290337562561035, "step": 456 }, { "epoch": 1.0570585972560584, "grad_norm": 16.896591758231867, "learning_rate": 2.686346471412277e-07, "logits/chosen": 0.4872972071170807, "logits/rejected": 0.5277370810508728, "logps/chosen": -44.69199752807617, "logps/rejected": -65.82919311523438, "loss": 0.1481, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.253704696893692, "rewards/margins": 3.8575947284698486, "rewards/rejected": -4.111299514770508, "step": 458 }, { "epoch": 1.0616745736632902, "grad_norm": 20.974972760985903, "learning_rate": 2.6662324563241805e-07, "logits/chosen": 0.5082690119743347, "logits/rejected": 0.5304160118103027, "logps/chosen": -39.70173263549805, "logps/rejected": -50.749732971191406, "loss": 0.218, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.058096084743738174, "rewards/margins": 2.925325632095337, "rewards/rejected": -2.983421802520752, "step": 460 }, { "epoch": 1.0616745736632902, "eval_logits/chosen": 0.42715081572532654, "eval_logits/rejected": 0.45357510447502136, "eval_logps/chosen": -41.917137145996094, "eval_logps/rejected": -54.64493179321289, "eval_loss": 0.2522634267807007, "eval_rewards/accuracies": 0.8231566548347473, "eval_rewards/chosen": -0.08592969179153442, "eval_rewards/margins": 3.217369556427002, "eval_rewards/rejected": -3.3032991886138916, "eval_runtime": 220.2922, "eval_samples_per_second": 7.871, "eval_steps_per_second": 1.97, "step": 460 }, { "epoch": 1.0662905500705218, "grad_norm": 14.344965515087893, "learning_rate": 2.6461076269391713e-07, "logits/chosen": 0.5723965167999268, "logits/rejected": 0.6080074310302734, "logps/chosen": -47.22536087036133, "logps/rejected": -63.04933166503906, "loss": 0.1633, "rewards/accuracies": 0.875, "rewards/chosen": -0.08401741087436676, "rewards/margins": 4.024357318878174, "rewards/rejected": -4.10837459564209, "step": 462 }, { "epoch": 1.0709065264777535, "grad_norm": 22.161377940303407, "learning_rate": 2.625973292483409e-07, "logits/chosen": 0.49575677514076233, "logits/rejected": 0.5175695419311523, "logps/chosen": -49.86793518066406, "logps/rejected": -61.0032958984375, "loss": 0.2086, "rewards/accuracies": 0.875, "rewards/chosen": -0.2437991052865982, "rewards/margins": 3.3475723266601562, "rewards/rejected": -3.5913712978363037, "step": 464 }, { "epoch": 1.0755225028849853, "grad_norm": 9.157546830395537, "learning_rate": 2.6058307628014065e-07, "logits/chosen": 0.5648156404495239, "logits/rejected": 0.5903113484382629, "logps/chosen": -47.16014099121094, "logps/rejected": -58.00987243652344, "loss": 0.1681, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.20527897775173187, "rewards/margins": 3.885181427001953, "rewards/rejected": -4.090460777282715, "step": 466 }, { "epoch": 1.0801384792922168, "grad_norm": 20.418800394750264, "learning_rate": 2.5856813482708217e-07, "logits/chosen": 0.5167273879051208, "logits/rejected": 0.5341954827308655, "logps/chosen": -44.03962707519531, "logps/rejected": -48.64061737060547, "loss": 0.205, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.06323742121458054, "rewards/margins": 3.104510545730591, "rewards/rejected": -3.041273355484009, "step": 468 }, { "epoch": 1.0847544556994486, "grad_norm": 24.70628607742756, "learning_rate": 2.565526359717206e-07, "logits/chosen": 0.537581205368042, "logits/rejected": 0.5596475005149841, "logps/chosen": -37.46675109863281, "logps/rejected": -45.9968147277832, "loss": 0.3005, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.24194829165935516, "rewards/margins": 2.6193909645080566, "rewards/rejected": -2.8613390922546387, "step": 470 }, { "epoch": 1.0847544556994486, "eval_logits/chosen": 0.4362466037273407, "eval_logits/rejected": 0.4623866379261017, "eval_logps/chosen": -42.15773010253906, "eval_logps/rejected": -54.935401916503906, "eval_loss": 0.24963192641735077, "eval_rewards/accuracies": 0.8260368704795837, "eval_rewards/chosen": -0.20622780919075012, "eval_rewards/margins": 3.242306709289551, "eval_rewards/rejected": -3.4485342502593994, "eval_runtime": 220.4037, "eval_samples_per_second": 7.867, "eval_steps_per_second": 1.969, "step": 470 }, { "epoch": 1.0893704321066804, "grad_norm": 27.430779359112005, "learning_rate": 2.545367108328731e-07, "logits/chosen": 0.5652859807014465, "logits/rejected": 0.591205358505249, "logps/chosen": -43.71979904174805, "logps/rejected": -53.00830841064453, "loss": 0.2156, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.20083469152450562, "rewards/margins": 3.2087488174438477, "rewards/rejected": -3.409583330154419, "step": 472 }, { "epoch": 1.0939864085139122, "grad_norm": 13.134510140867176, "learning_rate": 2.525204905570889e-07, "logits/chosen": 0.5791910290718079, "logits/rejected": 0.6038353443145752, "logps/chosen": -46.998390197753906, "logps/rejected": -59.18220520019531, "loss": 0.1707, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.05355483293533325, "rewards/margins": 3.5535666942596436, "rewards/rejected": -3.607121706008911, "step": 474 }, { "epoch": 1.0986023849211437, "grad_norm": 19.90392742325827, "learning_rate": 2.505041063101171e-07, "logits/chosen": 0.5816848278045654, "logits/rejected": 0.6008831858634949, "logps/chosen": -47.19880676269531, "logps/rejected": -51.822105407714844, "loss": 0.2218, "rewards/accuracies": 0.875, "rewards/chosen": 0.03883904218673706, "rewards/margins": 3.348583221435547, "rewards/rejected": -3.309744358062744, "step": 476 }, { "epoch": 1.1032183613283755, "grad_norm": 17.00116980477646, "learning_rate": 2.4848768926837466e-07, "logits/chosen": 0.5338962078094482, "logits/rejected": 0.5906614065170288, "logps/chosen": -40.04157257080078, "logps/rejected": -76.84749603271484, "loss": 0.1893, "rewards/accuracies": 0.875, "rewards/chosen": -0.1601162701845169, "rewards/margins": 4.218037128448486, "rewards/rejected": -4.378152847290039, "step": 478 }, { "epoch": 1.107834337735607, "grad_norm": 15.038557815597683, "learning_rate": 2.464713706104113e-07, "logits/chosen": 0.5352125763893127, "logits/rejected": 0.5612537264823914, "logps/chosen": -43.91660690307617, "logps/rejected": -56.44979476928711, "loss": 0.1633, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": -0.2793487310409546, "rewards/margins": 3.6175765991210938, "rewards/rejected": -3.896925210952759, "step": 480 }, { "epoch": 1.107834337735607, "eval_logits/chosen": 0.43004509806632996, "eval_logits/rejected": 0.4563468098640442, "eval_logps/chosen": -42.171958923339844, "eval_logps/rejected": -54.986507415771484, "eval_loss": 0.24832715094089508, "eval_rewards/accuracies": 0.8271889686584473, "eval_rewards/chosen": -0.21334028244018555, "eval_rewards/margins": 3.2607483863830566, "eval_rewards/rejected": -3.474088668823242, "eval_runtime": 220.2251, "eval_samples_per_second": 7.874, "eval_steps_per_second": 1.971, "step": 480 }, { "epoch": 1.1124503141428388, "grad_norm": 22.9744657106464, "learning_rate": 2.444552815083767e-07, "logits/chosen": 0.6254298686981201, "logits/rejected": 0.6373676061630249, "logps/chosen": -42.673282623291016, "logps/rejected": -45.563087463378906, "loss": 0.2114, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.057508740574121475, "rewards/margins": 3.0235791206359863, "rewards/rejected": -3.081087350845337, "step": 482 }, { "epoch": 1.1170662905500706, "grad_norm": 17.674691508042564, "learning_rate": 2.4243955311948693e-07, "logits/chosen": 0.5245480537414551, "logits/rejected": 0.5648095011711121, "logps/chosen": -39.3298225402832, "logps/rejected": -61.31127166748047, "loss": 0.2236, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.1908557116985321, "rewards/margins": 3.677870512008667, "rewards/rejected": -3.8687260150909424, "step": 484 }, { "epoch": 1.1216822669573023, "grad_norm": 19.4717194397301, "learning_rate": 2.4042431657749115e-07, "logits/chosen": 0.585620105266571, "logits/rejected": 0.6345695853233337, "logps/chosen": -41.645267486572266, "logps/rejected": -72.78955078125, "loss": 0.1703, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.20467931032180786, "rewards/margins": 4.08174991607666, "rewards/rejected": -4.286429405212402, "step": 486 }, { "epoch": 1.1262982433645339, "grad_norm": 30.909727917565508, "learning_rate": 2.384097029841419e-07, "logits/chosen": 0.4901224672794342, "logits/rejected": 0.5071887969970703, "logps/chosen": -43.30605697631836, "logps/rejected": -50.992618560791016, "loss": 0.2185, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.18728405237197876, "rewards/margins": 2.9479784965515137, "rewards/rejected": -3.1352624893188477, "step": 488 }, { "epoch": 1.1309142197717656, "grad_norm": 16.93415094151409, "learning_rate": 2.3639584340066544e-07, "logits/chosen": 0.5211553573608398, "logits/rejected": 0.5518543124198914, "logps/chosen": -37.83938980102539, "logps/rejected": -53.91053009033203, "loss": 0.234, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.05988183990120888, "rewards/margins": 3.5345206260681152, "rewards/rejected": -3.4746387004852295, "step": 490 }, { "epoch": 1.1309142197717656, "eval_logits/chosen": 0.43326738476753235, "eval_logits/rejected": 0.45958051085472107, "eval_logps/chosen": -41.84520721435547, "eval_logps/rejected": -54.6281852722168, "eval_loss": 0.24792973697185516, "eval_rewards/accuracies": 0.8220046162605286, "eval_rewards/chosen": -0.04996471852064133, "eval_rewards/margins": 3.244964361190796, "eval_rewards/rejected": -3.294929265975952, "eval_runtime": 220.3046, "eval_samples_per_second": 7.871, "eval_steps_per_second": 1.97, "step": 490 }, { "epoch": 1.1355301961789972, "grad_norm": 16.790260075155444, "learning_rate": 2.3438286883923539e-07, "logits/chosen": 0.5881079435348511, "logits/rejected": 0.6105315685272217, "logps/chosen": -46.794837951660156, "logps/rejected": -53.43986511230469, "loss": 0.2269, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.08306831121444702, "rewards/margins": 3.1719002723693848, "rewards/rejected": -3.088831663131714, "step": 492 }, { "epoch": 1.140146172586229, "grad_norm": 22.957641710400285, "learning_rate": 2.323709102544506e-07, "logits/chosen": 0.6002509593963623, "logits/rejected": 0.6072889566421509, "logps/chosen": -39.66600036621094, "logps/rejected": -41.07653045654297, "loss": 0.2857, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.20397840440273285, "rewards/margins": 2.4769766330718994, "rewards/rejected": -2.272998094558716, "step": 494 }, { "epoch": 1.1447621489934607, "grad_norm": 27.504424003065566, "learning_rate": 2.3036009853481474e-07, "logits/chosen": 0.5301830768585205, "logits/rejected": 0.5608452558517456, "logps/chosen": -39.39542770385742, "logps/rejected": -58.36659622192383, "loss": 0.2681, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.2189822793006897, "rewards/margins": 3.4378933906555176, "rewards/rejected": -3.6568756103515625, "step": 496 }, { "epoch": 1.1493781254006925, "grad_norm": 16.835368907101664, "learning_rate": 2.283505644942223e-07, "logits/chosen": 0.5190525054931641, "logits/rejected": 0.5493537783622742, "logps/chosen": -34.43808364868164, "logps/rejected": -54.84063720703125, "loss": 0.1937, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.13352231681346893, "rewards/margins": 3.440141201019287, "rewards/rejected": -3.3066186904907227, "step": 498 }, { "epoch": 1.153994101807924, "grad_norm": 14.320814422051418, "learning_rate": 2.2634243886344781e-07, "logits/chosen": 0.5132643580436707, "logits/rejected": 0.5381724834442139, "logps/chosen": -41.94618225097656, "logps/rejected": -54.74879455566406, "loss": 0.243, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.1846380978822708, "rewards/margins": 3.523959159851074, "rewards/rejected": -3.3393211364746094, "step": 500 }, { "epoch": 1.153994101807924, "eval_logits/chosen": 0.43241602182388306, "eval_logits/rejected": 0.45862025022506714, "eval_logps/chosen": -41.512245178222656, "eval_logps/rejected": -54.365325927734375, "eval_loss": 0.24479356408119202, "eval_rewards/accuracies": 0.8289170265197754, "eval_rewards/chosen": 0.11651827394962311, "eval_rewards/margins": 3.2800135612487793, "eval_rewards/rejected": -3.1634950637817383, "eval_runtime": 220.3257, "eval_samples_per_second": 7.87, "eval_steps_per_second": 1.97, "step": 500 }, { "epoch": 1.1586100782151558, "grad_norm": 17.24901468893502, "learning_rate": 2.2433585228164115e-07, "logits/chosen": 0.5386977791786194, "logits/rejected": 0.5774834156036377, "logps/chosen": -43.753910064697266, "logps/rejected": -65.60494232177734, "loss": 0.1918, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.19071653485298157, "rewards/margins": 4.159061908721924, "rewards/rejected": -3.9683446884155273, "step": 502 }, { "epoch": 1.1632260546223874, "grad_norm": 22.994462305856853, "learning_rate": 2.2233093528782938e-07, "logits/chosen": 0.5429908037185669, "logits/rejected": 0.5663915872573853, "logps/chosen": -49.295047760009766, "logps/rejected": -58.83778381347656, "loss": 0.1741, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.27108439803123474, "rewards/margins": 3.4974775314331055, "rewards/rejected": -3.226392984390259, "step": 504 }, { "epoch": 1.1678420310296191, "grad_norm": 19.749474882703815, "learning_rate": 2.2032781831242367e-07, "logits/chosen": 0.5360143184661865, "logits/rejected": 0.5641200542449951, "logps/chosen": -35.82609558105469, "logps/rejected": -44.779361724853516, "loss": 0.2253, "rewards/accuracies": 0.875, "rewards/chosen": 0.4115668535232544, "rewards/margins": 2.9376118183135986, "rewards/rejected": -2.526045083999634, "step": 506 }, { "epoch": 1.172458007436851, "grad_norm": 29.881557534524536, "learning_rate": 2.183266316687347e-07, "logits/chosen": 0.5799429416656494, "logits/rejected": 0.5963388681411743, "logps/chosen": -42.11252975463867, "logps/rejected": -44.56486511230469, "loss": 0.2905, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.2770005762577057, "rewards/margins": 2.54060435295105, "rewards/rejected": -2.263603687286377, "step": 508 }, { "epoch": 1.1770739838440827, "grad_norm": 11.72889590765659, "learning_rate": 2.16327505544495e-07, "logits/chosen": 0.5231108069419861, "logits/rejected": 0.5499060153961182, "logps/chosen": -43.436798095703125, "logps/rejected": -57.92034912109375, "loss": 0.1472, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.47280406951904297, "rewards/margins": 4.098244667053223, "rewards/rejected": -3.625440835952759, "step": 510 }, { "epoch": 1.1770739838440827, "eval_logits/chosen": 0.43323588371276855, "eval_logits/rejected": 0.4594508111476898, "eval_logps/chosen": -41.14154815673828, "eval_logps/rejected": -54.075172424316406, "eval_loss": 0.247583270072937, "eval_rewards/accuracies": 0.828341007232666, "eval_rewards/chosen": 0.30186572670936584, "eval_rewards/margins": 3.3202853202819824, "eval_rewards/rejected": -3.0184197425842285, "eval_runtime": 220.3645, "eval_samples_per_second": 7.869, "eval_steps_per_second": 1.969, "step": 510 }, { "epoch": 1.1816899602513142, "grad_norm": 19.02915371887465, "learning_rate": 2.143305699933892e-07, "logits/chosen": 0.5309435725212097, "logits/rejected": 0.5609121322631836, "logps/chosen": -39.10821533203125, "logps/rejected": -55.85133743286133, "loss": 0.2148, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.35923802852630615, "rewards/margins": 3.6412789821624756, "rewards/rejected": -3.282041549682617, "step": 512 }, { "epoch": 1.186305936658546, "grad_norm": 18.184730820886717, "learning_rate": 2.1233595492659382e-07, "logits/chosen": 0.6312618851661682, "logits/rejected": 0.6453579068183899, "logps/chosen": -48.93413543701172, "logps/rejected": -50.58020782470703, "loss": 0.1701, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.28959882259368896, "rewards/margins": 3.4992854595184326, "rewards/rejected": -3.209686040878296, "step": 514 }, { "epoch": 1.1909219130657775, "grad_norm": 21.115621604290848, "learning_rate": 2.1034379010432542e-07, "logits/chosen": 0.5738712549209595, "logits/rejected": 0.5990296006202698, "logps/chosen": -36.4149055480957, "logps/rejected": -47.95274353027344, "loss": 0.2192, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.35762646794319153, "rewards/margins": 3.1450395584106445, "rewards/rejected": -2.7874133586883545, "step": 516 }, { "epoch": 1.1955378894730093, "grad_norm": 18.313049973835163, "learning_rate": 2.0835420512739957e-07, "logits/chosen": 0.48849010467529297, "logits/rejected": 0.5418619513511658, "logps/chosen": -39.52627182006836, "logps/rejected": -70.53701782226562, "loss": 0.1678, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.39579084515571594, "rewards/margins": 4.528857231140137, "rewards/rejected": -4.133066654205322, "step": 518 }, { "epoch": 1.200153865880241, "grad_norm": 18.512425100692376, "learning_rate": 2.0636732942879917e-07, "logits/chosen": 0.5643823146820068, "logits/rejected": 0.5917804837226868, "logps/chosen": -43.44633483886719, "logps/rejected": -56.26163101196289, "loss": 0.166, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.33819130063056946, "rewards/margins": 3.693488121032715, "rewards/rejected": -3.3552963733673096, "step": 520 }, { "epoch": 1.200153865880241, "eval_logits/chosen": 0.4335879981517792, "eval_logits/rejected": 0.45994046330451965, "eval_logps/chosen": -41.402774810791016, "eval_logps/rejected": -54.35234451293945, "eval_loss": 0.2449788749217987, "eval_rewards/accuracies": 0.8317972421646118, "eval_rewards/chosen": 0.1712525486946106, "eval_rewards/margins": 3.328258991241455, "eval_rewards/rejected": -3.1570065021514893, "eval_runtime": 220.2998, "eval_samples_per_second": 7.871, "eval_steps_per_second": 1.97, "step": 520 }, { "epoch": 1.2047698422874729, "grad_norm": 11.696545134195471, "learning_rate": 2.0438329226525415e-07, "logits/chosen": 0.5642399787902832, "logits/rejected": 0.587860643863678, "logps/chosen": -41.212337493896484, "logps/rejected": -43.521636962890625, "loss": 0.2246, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.5518161058425903, "rewards/margins": 2.9677634239196777, "rewards/rejected": -2.415947675704956, "step": 522 }, { "epoch": 1.2093858186947044, "grad_norm": 24.196902238001236, "learning_rate": 2.0240222270883288e-07, "logits/chosen": 0.5227870941162109, "logits/rejected": 0.5579611659049988, "logps/chosen": -44.49864196777344, "logps/rejected": -64.84123229980469, "loss": 0.2314, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.16053809225559235, "rewards/margins": 3.896054267883301, "rewards/rejected": -3.73551607131958, "step": 524 }, { "epoch": 1.2140017951019362, "grad_norm": 12.971615376216704, "learning_rate": 2.0042424963854542e-07, "logits/chosen": 0.5063973665237427, "logits/rejected": 0.5544097423553467, "logps/chosen": -40.40736389160156, "logps/rejected": -70.9152603149414, "loss": 0.1526, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": 0.3248124122619629, "rewards/margins": 4.234506607055664, "rewards/rejected": -3.9096946716308594, "step": 526 }, { "epoch": 1.2186177715091677, "grad_norm": 14.0866861852398, "learning_rate": 1.9844950173195883e-07, "logits/chosen": 0.5182596445083618, "logits/rejected": 0.549498975276947, "logps/chosen": -39.39563751220703, "logps/rejected": -54.05485153198242, "loss": 0.1818, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.22824376821517944, "rewards/margins": 3.397740364074707, "rewards/rejected": -3.169497013092041, "step": 528 }, { "epoch": 1.2232337479163995, "grad_norm": 13.76916365285817, "learning_rate": 1.964781074568265e-07, "logits/chosen": 0.5031299591064453, "logits/rejected": 0.5121724009513855, "logps/chosen": -41.18108367919922, "logps/rejected": -45.627994537353516, "loss": 0.1945, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.03019801713526249, "rewards/margins": 3.0934128761291504, "rewards/rejected": -3.0632145404815674, "step": 530 }, { "epoch": 1.2232337479163995, "eval_logits/chosen": 0.43405523896217346, "eval_logits/rejected": 0.46039465069770813, "eval_logps/chosen": -41.60369110107422, "eval_logps/rejected": -54.51262664794922, "eval_loss": 0.24258121848106384, "eval_rewards/accuracies": 0.8335253596305847, "eval_rewards/chosen": 0.07079467922449112, "eval_rewards/margins": 3.3079416751861572, "eval_rewards/rejected": -3.2371468544006348, "eval_runtime": 220.2641, "eval_samples_per_second": 7.872, "eval_steps_per_second": 1.97, "step": 530 }, { "epoch": 1.2278497243236313, "grad_norm": 16.411903473780164, "learning_rate": 1.9451019506273018e-07, "logits/chosen": 0.541588306427002, "logits/rejected": 0.5615941286087036, "logps/chosen": -36.563297271728516, "logps/rejected": -48.32072448730469, "loss": 0.2351, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.17822687327861786, "rewards/margins": 2.845065116882324, "rewards/rejected": -2.6668384075164795, "step": 532 }, { "epoch": 1.232465700730863, "grad_norm": 13.467269631637619, "learning_rate": 1.9254589257273712e-07, "logits/chosen": 0.5137292146682739, "logits/rejected": 0.5505712032318115, "logps/chosen": -36.598384857177734, "logps/rejected": -57.48229217529297, "loss": 0.1473, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.19568167626857758, "rewards/margins": 4.128161907196045, "rewards/rejected": -3.9324798583984375, "step": 534 }, { "epoch": 1.2370816771380946, "grad_norm": 24.645788661655104, "learning_rate": 1.9058532777507141e-07, "logits/chosen": 0.5294635891914368, "logits/rejected": 0.5472697615623474, "logps/chosen": -39.22220230102539, "logps/rejected": -49.91395950317383, "loss": 0.2172, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.11992057412862778, "rewards/margins": 3.224815845489502, "rewards/rejected": -3.1048953533172607, "step": 536 }, { "epoch": 1.2416976535453264, "grad_norm": 18.291984511184836, "learning_rate": 1.886286282148002e-07, "logits/chosen": 0.5298857688903809, "logits/rejected": 0.5633623600006104, "logps/chosen": -41.294647216796875, "logps/rejected": -57.79304885864258, "loss": 0.2731, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.11145944148302078, "rewards/margins": 3.1801443099975586, "rewards/rejected": -3.2916040420532227, "step": 538 }, { "epoch": 1.246313629952558, "grad_norm": 17.71916747448851, "learning_rate": 1.8667592118553693e-07, "logits/chosen": 0.5349301099777222, "logits/rejected": 0.5512058734893799, "logps/chosen": -43.72676467895508, "logps/rejected": -52.80296325683594, "loss": 0.2216, "rewards/accuracies": 0.875, "rewards/chosen": -0.03689540922641754, "rewards/margins": 3.2271673679351807, "rewards/rejected": -3.2640628814697266, "step": 540 }, { "epoch": 1.246313629952558, "eval_logits/chosen": 0.4325529932975769, "eval_logits/rejected": 0.45892781019210815, "eval_logps/chosen": -41.67875289916992, "eval_logps/rejected": -54.59620666503906, "eval_loss": 0.24205271899700165, "eval_rewards/accuracies": 0.8277649879455566, "eval_rewards/chosen": 0.03326287120580673, "eval_rewards/margins": 3.312199115753174, "eval_rewards/rejected": -3.2789359092712402, "eval_runtime": 220.1774, "eval_samples_per_second": 7.875, "eval_steps_per_second": 1.971, "step": 540 }, { "epoch": 1.2509296063597897, "grad_norm": 15.1063531754732, "learning_rate": 1.8472733372115956e-07, "logits/chosen": 0.4958040416240692, "logits/rejected": 0.5259097814559937, "logps/chosen": -43.43186950683594, "logps/rejected": -60.27039337158203, "loss": 0.1823, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": -0.40173831582069397, "rewards/margins": 3.8025894165039062, "rewards/rejected": -4.2043280601501465, "step": 542 }, { "epoch": 1.2555455827670214, "grad_norm": 23.60965925798032, "learning_rate": 1.8278299258754692e-07, "logits/chosen": 0.47050708532333374, "logits/rejected": 0.5154716968536377, "logps/chosen": -43.42805480957031, "logps/rejected": -71.56327056884766, "loss": 0.2284, "rewards/accuracies": 0.875, "rewards/chosen": -0.35217729210853577, "rewards/margins": 4.311697483062744, "rewards/rejected": -4.663875102996826, "step": 544 }, { "epoch": 1.2601615591742532, "grad_norm": 11.785150141913245, "learning_rate": 1.808430242743316e-07, "logits/chosen": 0.46195343136787415, "logits/rejected": 0.4784909784793854, "logps/chosen": -42.974945068359375, "logps/rejected": -54.21615219116211, "loss": 0.1867, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.31646448373794556, "rewards/margins": 3.5641021728515625, "rewards/rejected": -3.2476377487182617, "step": 546 }, { "epoch": 1.2647775355814848, "grad_norm": 13.346160813344762, "learning_rate": 1.7890755498667104e-07, "logits/chosen": 0.5626040101051331, "logits/rejected": 0.5980097651481628, "logps/chosen": -36.59039306640625, "logps/rejected": -55.57601547241211, "loss": 0.182, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.17459021508693695, "rewards/margins": 3.451416015625, "rewards/rejected": -3.2768259048461914, "step": 548 }, { "epoch": 1.2693935119887165, "grad_norm": 25.621843956328824, "learning_rate": 1.7697671063703756e-07, "logits/chosen": 0.5085393786430359, "logits/rejected": 0.5440909266471863, "logps/chosen": -39.27238464355469, "logps/rejected": -59.40525817871094, "loss": 0.2243, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.011964095756411552, "rewards/margins": 3.6004185676574707, "rewards/rejected": -3.588454246520996, "step": 550 }, { "epoch": 1.2693935119887165, "eval_logits/chosen": 0.4355390965938568, "eval_logits/rejected": 0.46181005239486694, "eval_logps/chosen": -41.701602935791016, "eval_logps/rejected": -54.663360595703125, "eval_loss": 0.24010230600833893, "eval_rewards/accuracies": 0.8260368704795837, "eval_rewards/chosen": 0.0218377523124218, "eval_rewards/margins": 3.3343515396118164, "eval_rewards/rejected": -3.312513828277588, "eval_runtime": 220.234, "eval_samples_per_second": 7.873, "eval_steps_per_second": 1.971, "step": 550 }, { "epoch": 1.274009488395948, "grad_norm": 29.85339571581757, "learning_rate": 1.750506168370267e-07, "logits/chosen": 0.5484946370124817, "logits/rejected": 0.5642725229263306, "logps/chosen": -40.738338470458984, "logps/rejected": -47.2222900390625, "loss": 0.2665, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.49148619174957275, "rewards/margins": 3.0378835201263428, "rewards/rejected": -2.5463972091674805, "step": 552 }, { "epoch": 1.2786254648031798, "grad_norm": 11.606234417897845, "learning_rate": 1.7312939888918594e-07, "logits/chosen": 0.5540368556976318, "logits/rejected": 0.5830137729644775, "logps/chosen": -43.42100143432617, "logps/rejected": -63.07583999633789, "loss": 0.1529, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.060752179473638535, "rewards/margins": 3.951368570327759, "rewards/rejected": -3.8906164169311523, "step": 554 }, { "epoch": 1.2832414412104116, "grad_norm": 8.195981315855988, "learning_rate": 1.712131817788628e-07, "logits/chosen": 0.5598903298377991, "logits/rejected": 0.582931637763977, "logps/chosen": -39.05931854248047, "logps/rejected": -49.5858154296875, "loss": 0.2278, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.035774629563093185, "rewards/margins": 3.2900662422180176, "rewards/rejected": -3.325840950012207, "step": 556 }, { "epoch": 1.2878574176176434, "grad_norm": 10.58953396876903, "learning_rate": 1.693020901660738e-07, "logits/chosen": 0.5586022138595581, "logits/rejected": 0.5835521221160889, "logps/chosen": -46.566070556640625, "logps/rejected": -56.1746940612793, "loss": 0.1347, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1323520541191101, "rewards/margins": 3.951080322265625, "rewards/rejected": -3.81872820854187, "step": 558 }, { "epoch": 1.292473394024875, "grad_norm": 20.647672350132265, "learning_rate": 1.6739624837739518e-07, "logits/chosen": 0.4893258512020111, "logits/rejected": 0.5065658092498779, "logps/chosen": -46.70867919921875, "logps/rejected": -53.02800369262695, "loss": 0.2073, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.15436476469039917, "rewards/margins": 3.050819158554077, "rewards/rejected": -3.205183744430542, "step": 560 }, { "epoch": 1.292473394024875, "eval_logits/chosen": 0.4335208237171173, "eval_logits/rejected": 0.45989227294921875, "eval_logps/chosen": -41.82432556152344, "eval_logps/rejected": -54.859825134277344, "eval_loss": 0.23924875259399414, "eval_rewards/accuracies": 0.8312212228775024, "eval_rewards/chosen": -0.03952277451753616, "eval_rewards/margins": 3.371224880218506, "eval_rewards/rejected": -3.410747766494751, "eval_runtime": 220.3082, "eval_samples_per_second": 7.871, "eval_steps_per_second": 1.97, "step": 560 }, { "epoch": 1.2970893704321067, "grad_norm": 15.328848187023517, "learning_rate": 1.6549578039787434e-07, "logits/chosen": 0.5223647356033325, "logits/rejected": 0.5576710104942322, "logps/chosen": -43.448875427246094, "logps/rejected": -67.14339447021484, "loss": 0.2405, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.1683083474636078, "rewards/margins": 3.6626782417297363, "rewards/rejected": -3.830986499786377, "step": 562 }, { "epoch": 1.3017053468393383, "grad_norm": 14.362719389125761, "learning_rate": 1.6360080986296384e-07, "logits/chosen": 0.5163556337356567, "logits/rejected": 0.5569749474525452, "logps/chosen": -37.78327941894531, "logps/rejected": -64.23339080810547, "loss": 0.186, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.047993943095207214, "rewards/margins": 4.109629154205322, "rewards/rejected": -4.157623291015625, "step": 564 }, { "epoch": 1.30632132324657, "grad_norm": 8.849930925918736, "learning_rate": 1.6171146005047894e-07, "logits/chosen": 0.5622715353965759, "logits/rejected": 0.5891626477241516, "logps/chosen": -46.50107955932617, "logps/rejected": -63.37003707885742, "loss": 0.1689, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.1092919185757637, "rewards/margins": 4.0769548416137695, "rewards/rejected": -3.967662811279297, "step": 566 }, { "epoch": 1.3109372996538018, "grad_norm": 16.110148125770678, "learning_rate": 1.5982785387257694e-07, "logits/chosen": 0.5649956464767456, "logits/rejected": 0.5782197117805481, "logps/chosen": -43.4311408996582, "logps/rejected": -49.03315734863281, "loss": 0.2002, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.15342091023921967, "rewards/margins": 2.909942150115967, "rewards/rejected": -3.0633630752563477, "step": 568 }, { "epoch": 1.3155532760610336, "grad_norm": 23.725153045927403, "learning_rate": 1.5795011386776159e-07, "logits/chosen": 0.5103439688682556, "logits/rejected": 0.5300507545471191, "logps/chosen": -42.80021667480469, "logps/rejected": -47.7119255065918, "loss": 0.2255, "rewards/accuracies": 0.875, "rewards/chosen": -0.14677530527114868, "rewards/margins": 3.0557618141174316, "rewards/rejected": -3.2025370597839355, "step": 570 }, { "epoch": 1.3155532760610336, "eval_logits/chosen": 0.43335986137390137, "eval_logits/rejected": 0.4598417580127716, "eval_logps/chosen": -41.851680755615234, "eval_logps/rejected": -54.97309112548828, "eval_loss": 0.23906731605529785, "eval_rewards/accuracies": 0.835829496383667, "eval_rewards/chosen": -0.05320117622613907, "eval_rewards/margins": 3.4141783714294434, "eval_rewards/rejected": -3.467379570007324, "eval_runtime": 220.3588, "eval_samples_per_second": 7.869, "eval_steps_per_second": 1.97, "step": 570 }, { "epoch": 1.320169252468265, "grad_norm": 16.172756609459842, "learning_rate": 1.560783621929113e-07, "logits/chosen": 0.5175637006759644, "logits/rejected": 0.5229324102401733, "logps/chosen": -49.446102142333984, "logps/rejected": -55.164894104003906, "loss": 0.1869, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.08015252649784088, "rewards/margins": 3.3609066009521484, "rewards/rejected": -3.2807538509368896, "step": 572 }, { "epoch": 1.3247852288754969, "grad_norm": 24.72268513177688, "learning_rate": 1.5421272061533177e-07, "logits/chosen": 0.5066720247268677, "logits/rejected": 0.5451788306236267, "logps/chosen": -37.343570709228516, "logps/rejected": -60.23046112060547, "loss": 0.2949, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.1486133188009262, "rewards/margins": 3.3898818492889404, "rewards/rejected": -3.2412681579589844, "step": 574 }, { "epoch": 1.3294012052827284, "grad_norm": 18.734543272703554, "learning_rate": 1.5235331050483513e-07, "logits/chosen": 0.5524860620498657, "logits/rejected": 0.5772072672843933, "logps/chosen": -43.33749771118164, "logps/rejected": -56.5976676940918, "loss": 0.2367, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.3428743779659271, "rewards/margins": 3.3112895488739014, "rewards/rejected": -3.6541638374328613, "step": 576 }, { "epoch": 1.3340171816899602, "grad_norm": 15.636365920242639, "learning_rate": 1.5050025282584327e-07, "logits/chosen": 0.5805926322937012, "logits/rejected": 0.6090676188468933, "logps/chosen": -49.13417434692383, "logps/rejected": -64.1076431274414, "loss": 0.1791, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": -0.08167193830013275, "rewards/margins": 3.955726146697998, "rewards/rejected": -4.037397861480713, "step": 578 }, { "epoch": 1.338633158097192, "grad_norm": 15.524132351808905, "learning_rate": 1.4865366812951921e-07, "logits/chosen": 0.598872721195221, "logits/rejected": 0.62497878074646, "logps/chosen": -36.58146667480469, "logps/rejected": -46.25484085083008, "loss": 0.1893, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.01747778430581093, "rewards/margins": 3.4903595447540283, "rewards/rejected": -3.5078377723693848, "step": 580 }, { "epoch": 1.338633158097192, "eval_logits/chosen": 0.4342789053916931, "eval_logits/rejected": 0.46078288555145264, "eval_logps/chosen": -42.1205940246582, "eval_logps/rejected": -55.25835418701172, "eval_loss": 0.2389531433582306, "eval_rewards/accuracies": 0.8352534770965576, "eval_rewards/chosen": -0.18765874207019806, "eval_rewards/margins": 3.4223523139953613, "eval_rewards/rejected": -3.610011339187622, "eval_runtime": 220.361, "eval_samples_per_second": 7.869, "eval_steps_per_second": 1.969, "step": 580 }, { "epoch": 1.3432491345044237, "grad_norm": 22.418640332294185, "learning_rate": 1.4681367654592446e-07, "logits/chosen": 0.583182692527771, "logits/rejected": 0.596510112285614, "logps/chosen": -45.08745574951172, "logps/rejected": -52.57502746582031, "loss": 0.1635, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.24202129244804382, "rewards/margins": 3.0601682662963867, "rewards/rejected": -3.302189826965332, "step": 582 }, { "epoch": 1.3478651109116553, "grad_norm": 16.477398466397805, "learning_rate": 1.4498039777620353e-07, "logits/chosen": 0.5257098078727722, "logits/rejected": 0.5561378598213196, "logps/chosen": -49.92831039428711, "logps/rejected": -66.70814514160156, "loss": 0.1983, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.07970259338617325, "rewards/margins": 4.159069538116455, "rewards/rejected": -4.079366683959961, "step": 584 }, { "epoch": 1.352481087318887, "grad_norm": 21.638282072644653, "learning_rate": 1.4315395108479728e-07, "logits/chosen": 0.5448426008224487, "logits/rejected": 0.5733739733695984, "logps/chosen": -42.567203521728516, "logps/rejected": -59.23841094970703, "loss": 0.1872, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.3566977083683014, "rewards/margins": 3.441741943359375, "rewards/rejected": -3.7984399795532227, "step": 586 }, { "epoch": 1.3570970637261186, "grad_norm": 22.386629994354788, "learning_rate": 1.4133445529168365e-07, "logits/chosen": 0.5482079982757568, "logits/rejected": 0.5674624443054199, "logps/chosen": -47.31834030151367, "logps/rejected": -59.47747802734375, "loss": 0.1735, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.25350263714790344, "rewards/margins": 3.711785316467285, "rewards/rejected": -3.965287923812866, "step": 588 }, { "epoch": 1.3617130401333504, "grad_norm": 14.716672759245373, "learning_rate": 1.395220287646483e-07, "logits/chosen": 0.5413531064987183, "logits/rejected": 0.5619943141937256, "logps/chosen": -45.74396514892578, "logps/rejected": -54.50990295410156, "loss": 0.1609, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.3855374753475189, "rewards/margins": 3.439289093017578, "rewards/rejected": -3.82482647895813, "step": 590 }, { "epoch": 1.3617130401333504, "eval_logits/chosen": 0.43462061882019043, "eval_logits/rejected": 0.461076945066452, "eval_logps/chosen": -42.448509216308594, "eval_logps/rejected": -55.58904266357422, "eval_loss": 0.2393806427717209, "eval_rewards/accuracies": 0.8317972421646118, "eval_rewards/chosen": -0.3516136407852173, "eval_rewards/margins": 3.4237425327301025, "eval_rewards/rejected": -3.7753562927246094, "eval_runtime": 220.4141, "eval_samples_per_second": 7.867, "eval_steps_per_second": 1.969, "step": 590 }, { "epoch": 1.3663290165405821, "grad_norm": 21.200823940085225, "learning_rate": 1.377167894115837e-07, "logits/chosen": 0.562565803527832, "logits/rejected": 0.6183031797409058, "logps/chosen": -38.32450866699219, "logps/rejected": -68.53689575195312, "loss": 0.179, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.12118612229824066, "rewards/margins": 4.04473876953125, "rewards/rejected": -4.165925025939941, "step": 592 }, { "epoch": 1.370944992947814, "grad_norm": 13.082922810935031, "learning_rate": 1.3591885467281877e-07, "logits/chosen": 0.4695725440979004, "logits/rejected": 0.4965362548828125, "logps/chosen": -39.13195037841797, "logps/rejected": -58.23176574707031, "loss": 0.1861, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.30562350153923035, "rewards/margins": 3.781522035598755, "rewards/rejected": -4.0871453285217285, "step": 594 }, { "epoch": 1.3755609693550455, "grad_norm": 34.97692684836387, "learning_rate": 1.3412834151347896e-07, "logits/chosen": 0.5469548106193542, "logits/rejected": 0.5717971324920654, "logps/chosen": -44.02994155883789, "logps/rejected": -57.28227996826172, "loss": 0.2084, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.3421705365180969, "rewards/margins": 3.692906379699707, "rewards/rejected": -4.035076141357422, "step": 596 }, { "epoch": 1.3801769457622772, "grad_norm": 14.254996050777464, "learning_rate": 1.323453664158769e-07, "logits/chosen": 0.5193799138069153, "logits/rejected": 0.5635771155357361, "logps/chosen": -40.06482696533203, "logps/rejected": -67.0745620727539, "loss": 0.2322, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.5795915126800537, "rewards/margins": 3.6668989658355713, "rewards/rejected": -4.246490001678467, "step": 598 }, { "epoch": 1.3847929221695088, "grad_norm": 18.46063830068681, "learning_rate": 1.3057004537193422e-07, "logits/chosen": 0.5273723602294922, "logits/rejected": 0.5402401685714722, "logps/chosen": -45.491241455078125, "logps/rejected": -53.827972412109375, "loss": 0.185, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.32591530680656433, "rewards/margins": 3.758335590362549, "rewards/rejected": -4.084251403808594, "step": 600 }, { "epoch": 1.3847929221695088, "eval_logits/chosen": 0.4299531877040863, "eval_logits/rejected": 0.45649805665016174, "eval_logps/chosen": -42.37248992919922, "eval_logps/rejected": -55.565975189208984, "eval_loss": 0.23996217548847198, "eval_rewards/accuracies": 0.8300691246986389, "eval_rewards/chosen": -0.31360533833503723, "eval_rewards/margins": 3.450216054916382, "eval_rewards/rejected": -3.7638211250305176, "eval_runtime": 220.4449, "eval_samples_per_second": 7.866, "eval_steps_per_second": 1.969, "step": 600 }, { "epoch": 1.3894088985767405, "grad_norm": 24.193490725343704, "learning_rate": 1.2880249387563662e-07, "logits/chosen": 0.5480252504348755, "logits/rejected": 0.5805102586746216, "logps/chosen": -43.4918098449707, "logps/rejected": -62.1549072265625, "loss": 0.1713, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.4662397801876068, "rewards/margins": 3.974961280822754, "rewards/rejected": -4.441201210021973, "step": 602 }, { "epoch": 1.3940248749839723, "grad_norm": 8.975682909766576, "learning_rate": 1.2704282691551938e-07, "logits/chosen": 0.45732539892196655, "logits/rejected": 0.5041163563728333, "logps/chosen": -40.32965850830078, "logps/rejected": -67.52854919433594, "loss": 0.1754, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.014653444290161133, "rewards/margins": 4.295289993286133, "rewards/rejected": -4.280636787414551, "step": 604 }, { "epoch": 1.398640851391204, "grad_norm": 27.018968489026342, "learning_rate": 1.2529115896718714e-07, "logits/chosen": 0.5242836475372314, "logits/rejected": 0.5399221777915955, "logps/chosen": -45.72035217285156, "logps/rejected": -52.612548828125, "loss": 0.2076, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.37530067563056946, "rewards/margins": 3.2071659564971924, "rewards/rejected": -3.5824666023254395, "step": 606 }, { "epoch": 1.4032568277984356, "grad_norm": 13.414881670063712, "learning_rate": 1.2354760398586708e-07, "logits/chosen": 0.5383539199829102, "logits/rejected": 0.5773718953132629, "logps/chosen": -48.75130081176758, "logps/rejected": -72.36872863769531, "loss": 0.1511, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.44930893182754517, "rewards/margins": 4.512818336486816, "rewards/rejected": -4.962126731872559, "step": 608 }, { "epoch": 1.4078728042056674, "grad_norm": 7.330900567316457, "learning_rate": 1.2181227539899468e-07, "logits/chosen": 0.5381309986114502, "logits/rejected": 0.5586973428726196, "logps/chosen": -45.09908676147461, "logps/rejected": -58.20050811767578, "loss": 0.1744, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.2882673442363739, "rewards/margins": 3.7085728645324707, "rewards/rejected": -3.996840238571167, "step": 610 }, { "epoch": 1.4078728042056674, "eval_logits/chosen": 0.4304519295692444, "eval_logits/rejected": 0.45695292949676514, "eval_logps/chosen": -42.44300842285156, "eval_logps/rejected": -55.6538200378418, "eval_loss": 0.238841712474823, "eval_rewards/accuracies": 0.8352534770965576, "eval_rewards/chosen": -0.34886524081230164, "eval_rewards/margins": 3.4588773250579834, "eval_rewards/rejected": -3.8077423572540283, "eval_runtime": 220.5308, "eval_samples_per_second": 7.863, "eval_steps_per_second": 1.968, "step": 610 }, { "epoch": 1.412488780612899, "grad_norm": 10.681757170937171, "learning_rate": 1.2008528609883557e-07, "logits/chosen": 0.5007774233818054, "logits/rejected": 0.5296944379806519, "logps/chosen": -47.22381591796875, "logps/rejected": -64.06365966796875, "loss": 0.1531, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": -0.030609939247369766, "rewards/margins": 4.320724010467529, "rewards/rejected": -4.351334571838379, "step": 612 }, { "epoch": 1.4171047570201307, "grad_norm": 10.655182313924602, "learning_rate": 1.1836674843514042e-07, "logits/chosen": 0.5347999930381775, "logits/rejected": 0.564474880695343, "logps/chosen": -37.77484893798828, "logps/rejected": -54.86954879760742, "loss": 0.175, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.38236376643180847, "rewards/margins": 3.763371706008911, "rewards/rejected": -4.145735263824463, "step": 614 }, { "epoch": 1.4217207334273625, "grad_norm": 4.808937007878847, "learning_rate": 1.1665677420783671e-07, "logits/chosen": 0.5504859089851379, "logits/rejected": 0.5750877261161804, "logps/chosen": -43.14183807373047, "logps/rejected": -53.28805160522461, "loss": 0.1417, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.12460337579250336, "rewards/margins": 3.7694272994995117, "rewards/rejected": -3.894031047821045, "step": 616 }, { "epoch": 1.4263367098345943, "grad_norm": 25.84566759360446, "learning_rate": 1.149554746597553e-07, "logits/chosen": 0.5723487734794617, "logits/rejected": 0.6003535389900208, "logps/chosen": -45.33318328857422, "logps/rejected": -59.90052795410156, "loss": 0.262, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.3526383936405182, "rewards/margins": 3.843003988265991, "rewards/rejected": -4.195642471313477, "step": 618 }, { "epoch": 1.4309526862418258, "grad_norm": 16.545628594299828, "learning_rate": 1.1326296046939333e-07, "logits/chosen": 0.5338951945304871, "logits/rejected": 0.5544497966766357, "logps/chosen": -39.78907775878906, "logps/rejected": -49.23013687133789, "loss": 0.2511, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.12468406558036804, "rewards/margins": 3.2776834964752197, "rewards/rejected": -3.402367353439331, "step": 620 }, { "epoch": 1.4309526862418258, "eval_logits/chosen": 0.43395209312438965, "eval_logits/rejected": 0.46030664443969727, "eval_logps/chosen": -42.41356658935547, "eval_logps/rejected": -55.699623107910156, "eval_loss": 0.23819313943386078, "eval_rewards/accuracies": 0.8317972421646118, "eval_rewards/chosen": -0.3341463804244995, "eval_rewards/margins": 3.4965004920959473, "eval_rewards/rejected": -3.8306467533111572, "eval_runtime": 220.4261, "eval_samples_per_second": 7.867, "eval_steps_per_second": 1.969, "step": 620 }, { "epoch": 1.4355686626490576, "grad_norm": 18.451086465748666, "learning_rate": 1.1157934174371413e-07, "logits/chosen": 0.497620165348053, "logits/rejected": 0.5271977782249451, "logps/chosen": -44.88563919067383, "logps/rejected": -63.52084732055664, "loss": 0.1973, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.4545660614967346, "rewards/margins": 4.014831066131592, "rewards/rejected": -4.469396591186523, "step": 622 }, { "epoch": 1.4401846390562894, "grad_norm": 15.41826391561629, "learning_rate": 1.0990472801098419e-07, "logits/chosen": 0.49964290857315063, "logits/rejected": 0.5341427326202393, "logps/chosen": -39.38306427001953, "logps/rejected": -59.41951370239258, "loss": 0.1465, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.07668253034353256, "rewards/margins": 4.010004043579102, "rewards/rejected": -4.086687088012695, "step": 624 }, { "epoch": 1.444800615463521, "grad_norm": 13.657128245878823, "learning_rate": 1.0823922821364795e-07, "logits/chosen": 0.5488825440406799, "logits/rejected": 0.5648425221443176, "logps/chosen": -49.72515869140625, "logps/rejected": -57.29216766357422, "loss": 0.1844, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.15428660809993744, "rewards/margins": 3.7048492431640625, "rewards/rejected": -3.859135627746582, "step": 626 }, { "epoch": 1.4494165918707527, "grad_norm": 17.171702939592354, "learning_rate": 1.0658295070124026e-07, "logits/chosen": 0.5274313688278198, "logits/rejected": 0.540188729763031, "logps/chosen": -47.955406188964844, "logps/rejected": -54.03617477416992, "loss": 0.2187, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.16990727186203003, "rewards/margins": 3.60162091255188, "rewards/rejected": -3.7715280055999756, "step": 628 }, { "epoch": 1.4540325682779844, "grad_norm": 25.795693399142227, "learning_rate": 1.0493600322333762e-07, "logits/chosen": 0.5215524435043335, "logits/rejected": 0.5590708255767822, "logps/chosen": -44.3021354675293, "logps/rejected": -73.55774688720703, "loss": 0.141, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.3967975080013275, "rewards/margins": 4.7301740646362305, "rewards/rejected": -5.12697172164917, "step": 630 }, { "epoch": 1.4540325682779844, "eval_logits/chosen": 0.43174034357070923, "eval_logits/rejected": 0.4582732319831848, "eval_logps/chosen": -42.194610595703125, "eval_logps/rejected": -55.55934524536133, "eval_loss": 0.23693177103996277, "eval_rewards/accuracies": 0.8317972421646118, "eval_rewards/chosen": -0.22466643154621124, "eval_rewards/margins": 3.535839080810547, "eval_rewards/rejected": -3.7605059146881104, "eval_runtime": 220.3801, "eval_samples_per_second": 7.868, "eval_steps_per_second": 1.969, "step": 630 }, { "epoch": 1.458648544685216, "grad_norm": 14.475820972948407, "learning_rate": 1.0329849292254883e-07, "logits/chosen": 0.596792995929718, "logits/rejected": 0.624647855758667, "logps/chosen": -45.63186264038086, "logps/rejected": -62.25794982910156, "loss": 0.1936, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.2872418463230133, "rewards/margins": 3.9080302715301514, "rewards/rejected": -4.195271968841553, "step": 632 }, { "epoch": 1.4632645210924478, "grad_norm": 26.862980766739724, "learning_rate": 1.0167052632754458e-07, "logits/chosen": 0.5725838541984558, "logits/rejected": 0.5932745337486267, "logps/chosen": -41.20800018310547, "logps/rejected": -51.21732711791992, "loss": 0.227, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.39695149660110474, "rewards/margins": 2.928715229034424, "rewards/rejected": -3.325666666030884, "step": 634 }, { "epoch": 1.4678804974996795, "grad_norm": 13.962052681918495, "learning_rate": 1.0005220934612713e-07, "logits/chosen": 0.6229636669158936, "logits/rejected": 0.6402004361152649, "logps/chosen": -46.95052719116211, "logps/rejected": -53.86199951171875, "loss": 0.1824, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.3401051461696625, "rewards/margins": 3.6175549030303955, "rewards/rejected": -3.95766019821167, "step": 636 }, { "epoch": 1.472496473906911, "grad_norm": 9.092245687630806, "learning_rate": 9.844364725834056e-08, "logits/chosen": 0.48213544487953186, "logits/rejected": 0.5316063761711121, "logps/chosen": -45.23646545410156, "logps/rejected": -75.49991607666016, "loss": 0.0997, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": -0.1606331765651703, "rewards/margins": 5.202739238739014, "rewards/rejected": -5.363372802734375, "step": 638 }, { "epoch": 1.4771124503141428, "grad_norm": 18.96340702396886, "learning_rate": 9.68449447096217e-08, "logits/chosen": 0.4373500943183899, "logits/rejected": 0.4579113721847534, "logps/chosen": -39.44499588012695, "logps/rejected": -51.54633712768555, "loss": 0.3299, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -0.2867163419723511, "rewards/margins": 3.076793670654297, "rewards/rejected": -3.3635098934173584, "step": 640 }, { "epoch": 1.4771124503141428, "eval_logits/chosen": 0.4346330463886261, "eval_logits/rejected": 0.461146742105484, "eval_logps/chosen": -42.071449279785156, "eval_logps/rejected": -55.46683883666992, "eval_loss": 0.23784740269184113, "eval_rewards/accuracies": 0.835829496383667, "eval_rewards/chosen": -0.16308562457561493, "eval_rewards/margins": 3.551164388656616, "eval_rewards/rejected": -3.714250087738037, "eval_runtime": 220.3881, "eval_samples_per_second": 7.868, "eval_steps_per_second": 1.969, "step": 640 }, { "epoch": 1.4817284267213746, "grad_norm": 22.570461867090884, "learning_rate": 9.525620570399259e-08, "logits/chosen": 0.5038811564445496, "logits/rejected": 0.5432533025741577, "logps/chosen": -44.41080856323242, "logps/rejected": -65.23593139648438, "loss": 0.1275, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": -0.2485545426607132, "rewards/margins": 4.013004779815674, "rewards/rejected": -4.261559009552002, "step": 642 }, { "epoch": 1.4863444031286062, "grad_norm": 11.127499049370783, "learning_rate": 9.36775335972943e-08, "logits/chosen": 0.4518318772315979, "logits/rejected": 0.531367838382721, "logps/chosen": -39.415767669677734, "logps/rejected": -98.71846771240234, "loss": 0.1566, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.000497970322612673, "rewards/margins": 6.575231075286865, "rewards/rejected": -6.575727939605713, "step": 644 }, { "epoch": 1.490960379535838, "grad_norm": 24.53509661266678, "learning_rate": 9.210903109046284e-08, "logits/chosen": 0.46663856506347656, "logits/rejected": 0.5147727727890015, "logps/chosen": -43.30581283569336, "logps/rejected": -63.16206741333008, "loss": 0.1684, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.5338683128356934, "rewards/margins": 4.3571882247924805, "rewards/rejected": -4.89105749130249, "step": 646 }, { "epoch": 1.4955763559430697, "grad_norm": 11.303027411423997, "learning_rate": 9.05508002228485e-08, "logits/chosen": 0.529050350189209, "logits/rejected": 0.5628350377082825, "logps/chosen": -38.363826751708984, "logps/rejected": -53.06625747680664, "loss": 0.2071, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.031818799674510956, "rewards/margins": 3.961611032485962, "rewards/rejected": -3.929792642593384, "step": 648 }, { "epoch": 1.5001923323503012, "grad_norm": 10.500286558923209, "learning_rate": 8.900294236557707e-08, "logits/chosen": 0.49337685108184814, "logits/rejected": 0.5243138074874878, "logps/chosen": -37.17765808105469, "logps/rejected": -49.10523986816406, "loss": 0.2143, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.008912450633943081, "rewards/margins": 3.240175485610962, "rewards/rejected": -3.2312631607055664, "step": 650 }, { "epoch": 1.5001923323503012, "eval_logits/chosen": 0.4313080310821533, "eval_logits/rejected": 0.45790737867355347, "eval_logps/chosen": -42.17680740356445, "eval_logps/rejected": -55.66178894042969, "eval_loss": 0.2398524433374405, "eval_rewards/accuracies": 0.8306451439857483, "eval_rewards/chosen": -0.21576282382011414, "eval_rewards/margins": 3.59596586227417, "eval_rewards/rejected": -3.8117284774780273, "eval_runtime": 220.4293, "eval_samples_per_second": 7.866, "eval_steps_per_second": 1.969, "step": 650 }, { "epoch": 1.504808308757533, "grad_norm": 21.390880404408534, "learning_rate": 8.746555821495561e-08, "logits/chosen": 0.4801899492740631, "logits/rejected": 0.5136987566947937, "logps/chosen": -43.907596588134766, "logps/rejected": -62.06863021850586, "loss": 0.1972, "rewards/accuracies": 0.875, "rewards/chosen": -0.21802374720573425, "rewards/margins": 4.019637584686279, "rewards/rejected": -4.237661361694336, "step": 652 }, { "epoch": 1.5094242851647648, "grad_norm": 17.814740010117944, "learning_rate": 8.593874778592122e-08, "logits/chosen": 0.4772498309612274, "logits/rejected": 0.5082363486289978, "logps/chosen": -36.85258483886719, "logps/rejected": -49.34876251220703, "loss": 0.1537, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.038329627364873886, "rewards/margins": 3.5393142700195312, "rewards/rejected": -3.577643394470215, "step": 654 }, { "epoch": 1.5140402615719966, "grad_norm": 24.684686325904988, "learning_rate": 8.442261040553472e-08, "logits/chosen": 0.5512763857841492, "logits/rejected": 0.5618037581443787, "logps/chosen": -44.694515228271484, "logps/rejected": -49.48525619506836, "loss": 0.1683, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.0919620469212532, "rewards/margins": 3.498401403427124, "rewards/rejected": -3.406439781188965, "step": 656 }, { "epoch": 1.518656237979228, "grad_norm": 21.50701378180569, "learning_rate": 8.291724470651903e-08, "logits/chosen": 0.49069249629974365, "logits/rejected": 0.5210825800895691, "logps/chosen": -44.639766693115234, "logps/rejected": -57.28916549682617, "loss": 0.2335, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.4066303074359894, "rewards/margins": 3.4069387912750244, "rewards/rejected": -3.813568592071533, "step": 658 }, { "epoch": 1.5232722143864597, "grad_norm": 11.082339838552715, "learning_rate": 8.14227486208423e-08, "logits/chosen": 0.4665941596031189, "logits/rejected": 0.4930134415626526, "logps/chosen": -37.94073486328125, "logps/rejected": -53.0433464050293, "loss": 0.1797, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.09727773815393448, "rewards/margins": 3.91404128074646, "rewards/rejected": -3.8167638778686523, "step": 660 }, { "epoch": 1.5232722143864597, "eval_logits/chosen": 0.43500614166259766, "eval_logits/rejected": 0.4616233706474304, "eval_logps/chosen": -42.075767517089844, "eval_logps/rejected": -55.58706283569336, "eval_loss": 0.2391819953918457, "eval_rewards/accuracies": 0.8306451439857483, "eval_rewards/chosen": -0.1652439683675766, "eval_rewards/margins": 3.609118938446045, "eval_rewards/rejected": -3.774362802505493, "eval_runtime": 220.4966, "eval_samples_per_second": 7.864, "eval_steps_per_second": 1.968, "step": 660 }, { "epoch": 1.5278881907936914, "grad_norm": 17.884909353386927, "learning_rate": 7.993921937334716e-08, "logits/chosen": 0.5584304332733154, "logits/rejected": 0.5700749754905701, "logps/chosen": -41.323944091796875, "logps/rejected": -49.892147064208984, "loss": 0.2096, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.27705806493759155, "rewards/margins": 3.536667823791504, "rewards/rejected": -3.813725709915161, "step": 662 }, { "epoch": 1.5325041672009232, "grad_norm": 6.982953174746173, "learning_rate": 7.846675347542578e-08, "logits/chosen": 0.5807335376739502, "logits/rejected": 0.6132792234420776, "logps/chosen": -37.81986999511719, "logps/rejected": -49.71797180175781, "loss": 0.1272, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.3861154019832611, "rewards/margins": 4.170031547546387, "rewards/rejected": -3.783916473388672, "step": 664 }, { "epoch": 1.537120143608155, "grad_norm": 18.18022469520284, "learning_rate": 7.700544671874079e-08, "logits/chosen": 0.6006969213485718, "logits/rejected": 0.6162829995155334, "logps/chosen": -47.33814239501953, "logps/rejected": -52.70623016357422, "loss": 0.1962, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.2818297743797302, "rewards/margins": 3.495248317718506, "rewards/rejected": -3.7770779132843018, "step": 666 }, { "epoch": 1.5417361200153867, "grad_norm": 17.752568042598934, "learning_rate": 7.555539416899437e-08, "logits/chosen": 0.5043608546257019, "logits/rejected": 0.535383939743042, "logps/chosen": -37.40916442871094, "logps/rejected": -52.42148971557617, "loss": 0.2323, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.4006814658641815, "rewards/margins": 3.385708808898926, "rewards/rejected": -3.7863900661468506, "step": 668 }, { "epoch": 1.5463520964226183, "grad_norm": 14.165329854797266, "learning_rate": 7.41166901597429e-08, "logits/chosen": 0.5081818699836731, "logits/rejected": 0.5341579914093018, "logps/chosen": -42.154205322265625, "logps/rejected": -55.97992706298828, "loss": 0.1774, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.05981425940990448, "rewards/margins": 3.988154172897339, "rewards/rejected": -4.047967910766602, "step": 670 }, { "epoch": 1.5463520964226183, "eval_logits/chosen": 0.4372006952762604, "eval_logits/rejected": 0.46362602710723877, "eval_logps/chosen": -42.13774490356445, "eval_logps/rejected": -55.63636779785156, "eval_loss": 0.23786574602127075, "eval_rewards/accuracies": 0.8329492807388306, "eval_rewards/chosen": -0.19623348116874695, "eval_rewards/margins": 3.602783203125, "eval_rewards/rejected": -3.7990164756774902, "eval_runtime": 220.5205, "eval_samples_per_second": 7.863, "eval_steps_per_second": 1.968, "step": 670 }, { "epoch": 1.5509680728298498, "grad_norm": 22.84931762442886, "learning_rate": 7.268942828626046e-08, "logits/chosen": 0.5015777349472046, "logits/rejected": 0.5260412096977234, "logps/chosen": -39.39936828613281, "logps/rejected": -50.80826950073242, "loss": 0.2259, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.02117648348212242, "rewards/margins": 3.6156790256500244, "rewards/rejected": -3.6368556022644043, "step": 672 }, { "epoch": 1.5555840492370816, "grad_norm": 10.729660784502734, "learning_rate": 7.127370139945018e-08, "logits/chosen": 0.5064399242401123, "logits/rejected": 0.542765736579895, "logps/chosen": -41.118350982666016, "logps/rejected": -57.55162048339844, "loss": 0.1581, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": -0.18698811531066895, "rewards/margins": 4.028824806213379, "rewards/rejected": -4.215813159942627, "step": 674 }, { "epoch": 1.5602000256443134, "grad_norm": 12.758336439580667, "learning_rate": 6.986960159980326e-08, "logits/chosen": 0.5471921563148499, "logits/rejected": 0.5656020045280457, "logps/chosen": -44.28984069824219, "logps/rejected": -53.67868423461914, "loss": 0.1621, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": -0.007483018562197685, "rewards/margins": 3.514232873916626, "rewards/rejected": -3.5217158794403076, "step": 676 }, { "epoch": 1.5648160020515451, "grad_norm": 25.743372698631337, "learning_rate": 6.847722023140776e-08, "logits/chosen": 0.5099420547485352, "logits/rejected": 0.5306479930877686, "logps/chosen": -38.24551773071289, "logps/rejected": -46.37004470825195, "loss": 0.2453, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.13890628516674042, "rewards/margins": 3.242166757583618, "rewards/rejected": -3.381072998046875, "step": 678 }, { "epoch": 1.569431978458777, "grad_norm": 29.001544411683714, "learning_rate": 6.709664787600616e-08, "logits/chosen": 0.5341071486473083, "logits/rejected": 0.549387514591217, "logps/chosen": -38.39107131958008, "logps/rejected": -45.22284698486328, "loss": 0.2519, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": -0.32282909750938416, "rewards/margins": 2.876624822616577, "rewards/rejected": -3.1994540691375732, "step": 680 }, { "epoch": 1.569431978458777, "eval_logits/chosen": 0.4367799460887909, "eval_logits/rejected": 0.46335569024086, "eval_logps/chosen": -42.14803695678711, "eval_logps/rejected": -55.68684005737305, "eval_loss": 0.23701736330986023, "eval_rewards/accuracies": 0.8335253596305847, "eval_rewards/chosen": -0.20137952268123627, "eval_rewards/margins": 3.622871160507202, "eval_rewards/rejected": -3.8242506980895996, "eval_runtime": 220.405, "eval_samples_per_second": 7.867, "eval_steps_per_second": 1.969, "step": 680 }, { "epoch": 1.5740479548660085, "grad_norm": 26.57226192590101, "learning_rate": 6.572797434710219e-08, "logits/chosen": 0.47764989733695984, "logits/rejected": 0.5231152772903442, "logps/chosen": -39.2479362487793, "logps/rejected": -67.22251892089844, "loss": 0.1985, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.03589929640293121, "rewards/margins": 4.406409740447998, "rewards/rejected": -4.370510578155518, "step": 682 }, { "epoch": 1.57866393127324, "grad_norm": 7.8158043752344115, "learning_rate": 6.437128868411856e-08, "logits/chosen": 0.5327097177505493, "logits/rejected": 0.5473262071609497, "logps/chosen": -38.83921813964844, "logps/rejected": -47.30848693847656, "loss": 0.212, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.002615167060866952, "rewards/margins": 3.3942179679870605, "rewards/rejected": -3.3916027545928955, "step": 684 }, { "epoch": 1.5832799076804718, "grad_norm": 11.10012939486401, "learning_rate": 6.302667914660384e-08, "logits/chosen": 0.5219799280166626, "logits/rejected": 0.55839604139328, "logps/chosen": -37.46578598022461, "logps/rejected": -54.46531295776367, "loss": 0.2233, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.02391706593334675, "rewards/margins": 3.7034101486206055, "rewards/rejected": -3.679492950439453, "step": 686 }, { "epoch": 1.5878958840877035, "grad_norm": 19.67549763311113, "learning_rate": 6.169423320849112e-08, "logits/chosen": 0.5211795568466187, "logits/rejected": 0.5298517346382141, "logps/chosen": -45.8150520324707, "logps/rejected": -47.33256149291992, "loss": 0.2021, "rewards/accuracies": 0.875, "rewards/chosen": -0.2716074287891388, "rewards/margins": 3.559727191925049, "rewards/rejected": -3.831334352493286, "step": 688 }, { "epoch": 1.5925118604949353, "grad_norm": 15.711220514951888, "learning_rate": 6.037403755240748e-08, "logits/chosen": 0.5544189810752869, "logits/rejected": 0.5787670612335205, "logps/chosen": -45.216304779052734, "logps/rejected": -59.76258850097656, "loss": 0.1572, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": -0.14509858191013336, "rewards/margins": 3.88366436958313, "rewards/rejected": -4.0287628173828125, "step": 690 }, { "epoch": 1.5925118604949353, "eval_logits/chosen": 0.43276646733283997, "eval_logits/rejected": 0.45934849977493286, "eval_logps/chosen": -42.20445251464844, "eval_logps/rejected": -55.753753662109375, "eval_loss": 0.23724210262298584, "eval_rewards/accuracies": 0.8317972421646118, "eval_rewards/chosen": -0.2295861542224884, "eval_rewards/margins": 3.6281206607818604, "eval_rewards/rejected": -3.8577067852020264, "eval_runtime": 220.4833, "eval_samples_per_second": 7.865, "eval_steps_per_second": 1.968, "step": 690 }, { "epoch": 1.597127836902167, "grad_norm": 14.487508826565733, "learning_rate": 5.9066178064034326e-08, "logits/chosen": 0.4430210590362549, "logits/rejected": 0.4965353012084961, "logps/chosen": -33.27760696411133, "logps/rejected": -71.74127197265625, "loss": 0.2328, "rewards/accuracies": 0.875, "rewards/chosen": -0.2861379384994507, "rewards/margins": 4.55012321472168, "rewards/rejected": -4.836262226104736, "step": 692 }, { "epoch": 1.6017438133093986, "grad_norm": 23.580990452467088, "learning_rate": 5.777073982652064e-08, "logits/chosen": 0.5170236825942993, "logits/rejected": 0.5521243214607239, "logps/chosen": -35.71030044555664, "logps/rejected": -52.74575424194336, "loss": 0.2247, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": -0.3935600519180298, "rewards/margins": 3.574741840362549, "rewards/rejected": -3.96830153465271, "step": 694 }, { "epoch": 1.6063597897166302, "grad_norm": 13.54068941517088, "learning_rate": 5.6487807114947325e-08, "logits/chosen": 0.551853358745575, "logits/rejected": 0.5928479433059692, "logps/chosen": -42.63957214355469, "logps/rejected": -70.68295288085938, "loss": 0.1803, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": -0.2251981645822525, "rewards/margins": 4.277625560760498, "rewards/rejected": -4.502823352813721, "step": 696 }, { "epoch": 1.610975766123862, "grad_norm": 27.742044897151906, "learning_rate": 5.521746339084532e-08, "logits/chosen": 0.5765677094459534, "logits/rejected": 0.5921374559402466, "logps/chosen": -47.175655364990234, "logps/rejected": -58.09642028808594, "loss": 0.2516, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": -0.3188338875770569, "rewards/margins": 3.57645583152771, "rewards/rejected": -3.895289897918701, "step": 698 }, { "epoch": 1.6155917425310937, "grad_norm": 13.652878320465026, "learning_rate": 5.39597912967652e-08, "logits/chosen": 0.5359885692596436, "logits/rejected": 0.575743556022644, "logps/chosen": -38.843807220458984, "logps/rejected": -61.49338150024414, "loss": 0.1886, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -0.01514108944684267, "rewards/margins": 4.108646392822266, "rewards/rejected": -4.1237874031066895, "step": 700 }, { "epoch": 1.6155917425310937, "eval_logits/chosen": 0.43191081285476685, "eval_logits/rejected": 0.4585791528224945, "eval_logps/chosen": -42.20844268798828, "eval_logps/rejected": -55.773094177246094, "eval_loss": 0.23592650890350342, "eval_rewards/accuracies": 0.8364055156707764, "eval_rewards/chosen": -0.23158276081085205, "eval_rewards/margins": 3.635798692703247, "eval_rewards/rejected": -3.8673815727233887, "eval_runtime": 220.5019, "eval_samples_per_second": 7.864, "eval_steps_per_second": 1.968, "step": 700 } ], "logging_steps": 2, "max_steps": 866, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }