{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 400, "global_step": 975, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "count/fg_chosen": 30.85714340209961, "count/fg_rejected": 7.4285712242126465, "epoch": 0.010256410256410256, "fg_kl": NaN, "fg_logps/policy_KL": -11.948674201965332, "fg_logps/policy_chosen": -6.262445449829102, "fg_logps/policy_rejected": -8.74467945098877, "fg_logps/reference_KL": -11.94157600402832, "fg_logps/reference_chosen": -6.2594828605651855, "fg_logps/reference_rejected": -8.742448806762695, "fg_loss": 0.8008173704147339, "fg_rewards/chosen_sum": -0.008917576633393764, "fg_rewards/rejected_sum": -0.0010543327080085874, "grad_norm": 70.97090228694296, "kl": 0.15787295997142792, "learning_rate": 2.5110157309792834e-07, "logps/chosen": -366.76351768092104, "logps/rejected": -369.69268508184524, "loss": 0.6347, "rewards/chosen": 0.014076207813463713, "rewards/margins": -0.000743936476552097, "rewards/rejected": 0.01482014429001581, "step": 10 }, { "count/fg_chosen": 26.352941513061523, "count/fg_rejected": 6.058823585510254, "epoch": 0.020512820512820513, "fg_kl": NaN, "fg_logps/policy_KL": -10.825294494628906, "fg_logps/policy_chosen": -5.95189905166626, "fg_logps/policy_rejected": -5.48292350769043, "fg_logps/reference_KL": -10.80107307434082, "fg_logps/reference_chosen": -5.9293718338012695, "fg_logps/reference_rejected": -5.445353984832764, "fg_loss": 0.7557108402252197, "fg_rewards/chosen_sum": -0.05455803498625755, "fg_rewards/rejected_sum": -0.025491168722510338, "grad_norm": 76.27540979070403, "kl": 0.05270981788635254, "learning_rate": 3.2669067855881653e-07, "logps/chosen": -385.705078125, "logps/rejected": -347.460890436747, "loss": 0.5881, "rewards/chosen": 0.02315927016270625, "rewards/margins": 0.05135368041786262, "rewards/rejected": -0.028194410255156368, "step": 20 }, { "count/fg_chosen": 27.0, "count/fg_rejected": 7.176470756530762, "epoch": 0.03076923076923077, "fg_kl": NaN, "fg_logps/policy_KL": -12.925114631652832, "fg_logps/policy_chosen": -6.261083602905273, "fg_logps/policy_rejected": -7.534660816192627, "fg_logps/reference_KL": -12.83480167388916, "fg_logps/reference_chosen": -6.232151985168457, "fg_logps/reference_rejected": -7.494457244873047, "fg_loss": 0.8402522206306458, "fg_rewards/chosen_sum": -0.0777626782655716, "fg_rewards/rejected_sum": -0.03122476302087307, "grad_norm": 112.80411347358924, "kl": 0.02636871300637722, "learning_rate": 3.709074707164929e-07, "logps/chosen": -380.789990234375, "logps/rejected": -344.6407958984375, "loss": 0.5949, "rewards/chosen": 0.017523756623268126, "rewards/margins": 0.0999738484621048, "rewards/rejected": -0.08245009183883667, "step": 30 }, { "count/fg_chosen": 32.60869598388672, "count/fg_rejected": 8.136363983154297, "epoch": 0.041025641025641026, "fg_kl": NaN, "fg_logps/policy_KL": -11.220458030700684, "fg_logps/policy_chosen": -5.84191370010376, "fg_logps/policy_rejected": -6.710254192352295, "fg_logps/reference_KL": -11.054718971252441, "fg_logps/reference_chosen": -5.7772111892700195, "fg_logps/reference_rejected": -6.645753860473633, "fg_loss": 0.808053195476532, "fg_rewards/chosen_sum": -0.1892387568950653, "fg_rewards/rejected_sum": -0.05010434612631798, "grad_norm": 50.84808526748507, "kl": 0.0, "learning_rate": 4.022797840197047e-07, "logps/chosen": -383.87660435267856, "logps/rejected": -374.25223581414474, "loss": 0.6121, "rewards/chosen": 0.04867589473724365, "rewards/margins": 0.2174466848373413, "rewards/rejected": -0.16877079010009766, "step": 40 }, { "count/fg_chosen": 32.79999923706055, "count/fg_rejected": 9.533333778381348, "epoch": 0.05128205128205128, "fg_kl": NaN, "fg_logps/policy_KL": -10.373883247375488, "fg_logps/policy_chosen": -5.664600849151611, "fg_logps/policy_rejected": -5.962148189544678, "fg_logps/reference_KL": -10.145182609558105, "fg_logps/reference_chosen": -5.588218688964844, "fg_logps/reference_rejected": -5.897520065307617, "fg_loss": 0.8677656054496765, "fg_rewards/chosen_sum": -0.25619086623191833, "fg_rewards/rejected_sum": -0.07288213074207306, "grad_norm": 64.57457570643511, "kl": 0.009938049130141735, "learning_rate": 4.2661404073496845e-07, "logps/chosen": -346.134577371988, "logps/rejected": -368.41335227272725, "loss": 0.5682, "rewards/chosen": 0.2387204227677311, "rewards/margins": 0.3148248284061905, "rewards/rejected": -0.07610440563845944, "step": 50 }, { "count/fg_chosen": 26.294116973876953, "count/fg_rejected": 5.470588207244873, "epoch": 0.06153846153846154, "fg_kl": NaN, "fg_logps/policy_KL": -12.107590675354004, "fg_logps/policy_chosen": -6.114685535430908, "fg_logps/policy_rejected": -7.69861364364624, "fg_logps/reference_KL": -11.640838623046875, "fg_logps/reference_chosen": -6.00569486618042, "fg_logps/reference_rejected": -7.617056369781494, "fg_loss": 0.7082093954086304, "fg_rewards/chosen_sum": -0.27744388580322266, "fg_rewards/rejected_sum": -0.06493094563484192, "grad_norm": 49.41694776530553, "kl": 0.0, "learning_rate": 4.4649657617738114e-07, "logps/chosen": -353.69510690789474, "logps/rejected": -366.0445033482143, "loss": 0.5548, "rewards/chosen": 0.17124160967375102, "rewards/margins": 0.3800723295761529, "rewards/rejected": -0.20883071990240187, "step": 60 }, { "count/fg_chosen": 32.19047546386719, "count/fg_rejected": 6.238095283508301, "epoch": 0.07179487179487179, "fg_kl": NaN, "fg_logps/policy_KL": -12.570515632629395, "fg_logps/policy_chosen": -6.555771827697754, "fg_logps/policy_rejected": -8.589411735534668, "fg_logps/reference_KL": -12.056818008422852, "fg_logps/reference_chosen": -6.406361103057861, "fg_logps/reference_rejected": -8.353743553161621, "fg_loss": 0.716395378112793, "fg_rewards/chosen_sum": -0.46484148502349854, "fg_rewards/rejected_sum": -0.1304880827665329, "grad_norm": 68.38293723456421, "kl": 0.0, "learning_rate": 4.633070203674842e-07, "logps/chosen": -319.37548828125, "logps/rejected": -425.4502418154762, "loss": 0.6056, "rewards/chosen": -0.016339432252080816, "rewards/margins": 0.2762198054551481, "rewards/rejected": -0.2925592377072289, "step": 70 }, { "count/fg_chosen": 34.77777862548828, "count/fg_rejected": 9.470588684082031, "epoch": 0.08205128205128205, "fg_kl": NaN, "fg_logps/policy_KL": -10.578932762145996, "fg_logps/policy_chosen": -5.850771427154541, "fg_logps/policy_rejected": -7.156460762023926, "fg_logps/reference_KL": -10.092942237854004, "fg_logps/reference_chosen": -5.712654113769531, "fg_logps/reference_rejected": -6.944825172424316, "fg_loss": 0.8019319176673889, "fg_rewards/chosen_sum": -0.4671042263507843, "fg_rewards/rejected_sum": -0.17894208431243896, "grad_norm": 39.81642138358546, "kl": 0.0, "learning_rate": 4.77868889480593e-07, "logps/chosen": -317.1737351190476, "logps/rejected": -377.7265625, "loss": 0.5778, "rewards/chosen": 0.11412754512968518, "rewards/margins": 0.5135003988605394, "rewards/rejected": -0.39937285373085424, "step": 80 }, { "count/fg_chosen": 31.0625, "count/fg_rejected": 6.25, "epoch": 0.09230769230769231, "fg_kl": NaN, "fg_logps/policy_KL": -11.664382934570312, "fg_logps/policy_chosen": -6.444965839385986, "fg_logps/policy_rejected": -8.933675765991211, "fg_logps/reference_KL": -10.985946655273438, "fg_logps/reference_chosen": -6.227105140686035, "fg_logps/reference_rejected": -8.714274406433105, "fg_loss": 0.6460863351821899, "fg_rewards/chosen_sum": -0.5869948267936707, "fg_rewards/rejected_sum": -0.14423823356628418, "grad_norm": 55.62438439854401, "kl": 0.0, "learning_rate": 4.907133683350575e-07, "logps/chosen": -404.47572544642856, "logps/rejected": -416.0439453125, "loss": 0.5292, "rewards/chosen": 0.30132850011189777, "rewards/margins": 0.755986305705288, "rewards/rejected": -0.4546578055933902, "step": 90 }, { "count/fg_chosen": 28.772727966308594, "count/fg_rejected": 6.333333492279053, "epoch": 0.10256410256410256, "fg_kl": NaN, "fg_logps/policy_KL": -12.864214897155762, "fg_logps/policy_chosen": -6.919012546539307, "fg_logps/policy_rejected": -9.152473449707031, "fg_logps/reference_KL": -12.052042007446289, "fg_logps/reference_chosen": -6.679240703582764, "fg_logps/reference_rejected": -8.906697273254395, "fg_loss": 0.7610839605331421, "fg_rewards/chosen_sum": -0.674491822719574, "fg_rewards/rejected_sum": -0.1936338096857071, "grad_norm": 63.5056444865937, "kl": 0.0, "learning_rate": 4.994298745724059e-07, "logps/chosen": -340.2066359747024, "logps/rejected": -390.9894377055921, "loss": 0.6206, "rewards/chosen": 0.13452401615324475, "rewards/margins": 0.2098735791997503, "rewards/rejected": -0.07534956304650557, "step": 100 }, { "count/fg_chosen": 32.42856979370117, "count/fg_rejected": 5.736842155456543, "epoch": 0.11282051282051282, "fg_kl": NaN, "fg_logps/policy_KL": -12.667691230773926, "fg_logps/policy_chosen": -6.325592041015625, "fg_logps/policy_rejected": -7.5075507164001465, "fg_logps/reference_KL": -11.706281661987305, "fg_logps/reference_chosen": -6.0957818031311035, "fg_logps/reference_rejected": -7.153979778289795, "fg_loss": 0.6360421180725098, "fg_rewards/chosen_sum": -0.7651198506355286, "fg_rewards/rejected_sum": -0.1911022961139679, "grad_norm": 45.1771736885249, "kl": 0.0, "learning_rate": 4.937286202964652e-07, "logps/chosen": -358.7214664152299, "logps/rejected": -373.4951305650685, "loss": 0.5458, "rewards/chosen": 0.35928555192618533, "rewards/margins": 0.8201791045234402, "rewards/rejected": -0.4608935525972549, "step": 110 }, { "count/fg_chosen": 36.45000076293945, "count/fg_rejected": 10.210526466369629, "epoch": 0.12307692307692308, "fg_kl": NaN, "fg_logps/policy_KL": -12.05976676940918, "fg_logps/policy_chosen": -6.244026184082031, "fg_logps/policy_rejected": -7.9034271240234375, "fg_logps/reference_KL": -11.134529113769531, "fg_logps/reference_chosen": -6.0136542320251465, "fg_logps/reference_rejected": -7.628241062164307, "fg_loss": 0.881417453289032, "fg_rewards/chosen_sum": -0.7559553384780884, "fg_rewards/rejected_sum": -0.3012525737285614, "grad_norm": 50.09822696941802, "kl": 0.015017986297607422, "learning_rate": 4.880273660205244e-07, "logps/chosen": -320.32579210069446, "logps/rejected": -356.7809392755682, "loss": 0.601, "rewards/chosen": 0.6363146040174696, "rewards/margins": 0.7909924068836252, "rewards/rejected": -0.15467780286615546, "step": 120 }, { "count/fg_chosen": 36.400001525878906, "count/fg_rejected": 7.933333396911621, "epoch": 0.13333333333333333, "fg_kl": NaN, "fg_logps/policy_KL": -11.468809127807617, "fg_logps/policy_chosen": -6.23488712310791, "fg_logps/policy_rejected": -6.54256010055542, "fg_logps/reference_KL": -10.525192260742188, "fg_logps/reference_chosen": -6.026561260223389, "fg_logps/reference_rejected": -6.267870903015137, "fg_loss": 0.8011055588722229, "fg_rewards/chosen_sum": -0.647752583026886, "fg_rewards/rejected_sum": -0.20049738883972168, "grad_norm": 50.115512333731104, "kl": 0.03811788558959961, "learning_rate": 4.823261117445838e-07, "logps/chosen": -450.6170099431818, "logps/rejected": -392.42621527777777, "loss": 0.5078, "rewards/chosen": 1.0679140090942383, "rewards/margins": 1.1960734128952026, "rewards/rejected": -0.12815940380096436, "step": 130 }, { "count/fg_chosen": 31.4761905670166, "count/fg_rejected": 8.699999809265137, "epoch": 0.14358974358974358, "fg_kl": NaN, "fg_logps/policy_KL": -12.193472862243652, "fg_logps/policy_chosen": -6.388846397399902, "fg_logps/policy_rejected": -7.679136753082275, "fg_logps/reference_KL": -11.09471607208252, "fg_logps/reference_chosen": -6.169702529907227, "fg_logps/reference_rejected": -7.332272529602051, "fg_loss": 0.7396747469902039, "fg_rewards/chosen_sum": -0.5886417627334595, "fg_rewards/rejected_sum": -0.24698862433433533, "grad_norm": 48.94452126464587, "kl": 0.03351273387670517, "learning_rate": 4.766248574686431e-07, "logps/chosen": -334.752628279321, "logps/rejected": -386.6670292721519, "loss": 0.5475, "rewards/chosen": 0.8302505869924286, "rewards/margins": 1.2393360176688526, "rewards/rejected": -0.40908543067642406, "step": 140 }, { "count/fg_chosen": 26.105262756347656, "count/fg_rejected": 4.526315689086914, "epoch": 0.15384615384615385, "fg_kl": NaN, "fg_logps/policy_KL": -13.575007438659668, "fg_logps/policy_chosen": -7.0048089027404785, "fg_logps/policy_rejected": -8.861261367797852, "fg_logps/reference_KL": -12.408534049987793, "fg_logps/reference_chosen": -6.816770076751709, "fg_logps/reference_rejected": -8.450145721435547, "fg_loss": 0.7104328870773315, "fg_rewards/chosen_sum": -0.4196644723415375, "fg_rewards/rejected_sum": -0.17191696166992188, "grad_norm": 49.69486730354108, "kl": 0.05726609379053116, "learning_rate": 4.7092360319270236e-07, "logps/chosen": -418.9573688271605, "logps/rejected": -390.3833069620253, "loss": 0.5064, "rewards/chosen": 0.9700225076557677, "rewards/margins": 1.2821282176491542, "rewards/rejected": -0.31210570999338655, "step": 150 }, { "count/fg_chosen": 25.764705657958984, "count/fg_rejected": 4.125, "epoch": 0.1641025641025641, "fg_kl": NaN, "fg_logps/policy_KL": -14.1893892288208, "fg_logps/policy_chosen": -6.999147415161133, "fg_logps/policy_rejected": -7.968658447265625, "fg_logps/reference_KL": -12.764519691467285, "fg_logps/reference_chosen": -6.616185188293457, "fg_logps/reference_rejected": -7.533709526062012, "fg_loss": 0.7398098707199097, "fg_rewards/chosen_sum": -0.8501734733581543, "fg_rewards/rejected_sum": -0.15099674463272095, "grad_norm": 41.227539280479604, "kl": 0.0, "learning_rate": 4.652223489167617e-07, "logps/chosen": -377.22486787683823, "logps/rejected": -458.93584408967394, "loss": 0.4883, "rewards/chosen": 0.8492268955006319, "rewards/margins": 1.2064810067491458, "rewards/rejected": -0.3572541112485139, "step": 160 }, { "count/fg_chosen": 33.06666564941406, "count/fg_rejected": 7.615384578704834, "epoch": 0.17435897435897435, "fg_kl": NaN, "fg_logps/policy_KL": -12.796510696411133, "fg_logps/policy_chosen": -6.570502758026123, "fg_logps/policy_rejected": -7.280531883239746, "fg_logps/reference_KL": -11.486601829528809, "fg_logps/reference_chosen": -6.288327693939209, "fg_logps/reference_rejected": -6.919613361358643, "fg_loss": 0.7529634237289429, "fg_rewards/chosen_sum": -0.7710135579109192, "fg_rewards/rejected_sum": -0.2511799931526184, "grad_norm": 50.55336505699433, "kl": 0.0, "learning_rate": 4.5952109464082095e-07, "logps/chosen": -339.1629430259146, "logps/rejected": -407.0223607772436, "loss": 0.4934, "rewards/chosen": 0.810060268495141, "rewards/margins": 1.2479861452103855, "rewards/rejected": -0.4379258767152444, "step": 170 }, { "count/fg_chosen": 32.875, "count/fg_rejected": 8.25, "epoch": 0.18461538461538463, "fg_kl": NaN, "fg_logps/policy_KL": -12.054594993591309, "fg_logps/policy_chosen": -6.169870853424072, "fg_logps/policy_rejected": -7.339913368225098, "fg_logps/reference_KL": -10.784111022949219, "fg_logps/reference_chosen": -5.852520942687988, "fg_logps/reference_rejected": -6.8034443855285645, "fg_loss": 0.7260686755180359, "fg_rewards/chosen_sum": -0.8828132152557373, "fg_rewards/rejected_sum": -0.5087793469429016, "grad_norm": 79.2618606536439, "kl": 0.0, "learning_rate": 4.5381984036488027e-07, "logps/chosen": -298.9371427210366, "logps/rejected": -415.6966145833333, "loss": 0.5143, "rewards/chosen": 0.3949350496617759, "rewards/margins": 1.4249423386679356, "rewards/rejected": -1.0300072890061598, "step": 180 }, { "count/fg_chosen": 31.363636016845703, "count/fg_rejected": 6.409090995788574, "epoch": 0.19487179487179487, "fg_kl": NaN, "fg_logps/policy_KL": -11.836668968200684, "fg_logps/policy_chosen": -6.473412990570068, "fg_logps/policy_rejected": -7.306280612945557, "fg_logps/reference_KL": -10.574777603149414, "fg_logps/reference_chosen": -6.141035079956055, "fg_logps/reference_rejected": -7.045315265655518, "fg_loss": 0.7152173519134521, "fg_rewards/chosen_sum": -0.9164342880249023, "fg_rewards/rejected_sum": -0.21355971693992615, "grad_norm": 62.51457716342411, "kl": 0.0, "learning_rate": 4.4811858608893954e-07, "logps/chosen": -378.7162252286585, "logps/rejected": -380.27271133814105, "loss": 0.482, "rewards/chosen": 0.9614362949278297, "rewards/margins": 1.7487119516035108, "rewards/rejected": -0.7872756566756811, "step": 190 }, { "count/fg_chosen": 27.789474487304688, "count/fg_rejected": 6.263157844543457, "epoch": 0.20512820512820512, "fg_kl": NaN, "fg_logps/policy_KL": -14.003227233886719, "fg_logps/policy_chosen": -6.706553936004639, "fg_logps/policy_rejected": -7.728447437286377, "fg_logps/reference_KL": -12.440956115722656, "fg_logps/reference_chosen": -6.388577938079834, "fg_logps/reference_rejected": -7.369418621063232, "fg_loss": 0.7191720008850098, "fg_rewards/chosen_sum": -0.8144214749336243, "fg_rewards/rejected_sum": -0.220667764544487, "grad_norm": 53.07709071243361, "kl": 0.016681909561157227, "learning_rate": 4.4241733181299887e-07, "logps/chosen": -376.6458753360215, "logps/rejected": -420.5650652985075, "loss": 0.5059, "rewards/chosen": 0.6015197179650748, "rewards/margins": 1.3459980290767528, "rewards/rejected": -0.744478311111678, "step": 200 }, { "count/fg_chosen": 35.38461685180664, "count/fg_rejected": 9.230769157409668, "epoch": 0.2153846153846154, "fg_kl": NaN, "fg_logps/policy_KL": -13.624795913696289, "fg_logps/policy_chosen": -6.518624305725098, "fg_logps/policy_rejected": -6.696259021759033, "fg_logps/reference_KL": -12.152335166931152, "fg_logps/reference_chosen": -6.259435653686523, "fg_logps/reference_rejected": -6.192153453826904, "fg_loss": 0.7262544631958008, "fg_rewards/chosen_sum": -0.6904063820838928, "fg_rewards/rejected_sum": -0.5146002173423767, "grad_norm": 53.74268239721948, "kl": 0.0, "learning_rate": 4.3671607753705814e-07, "logps/chosen": -386.4969911317568, "logps/rejected": -467.1247274709302, "loss": 0.4207, "rewards/chosen": 1.3281025242161106, "rewards/margins": 2.8707167735120773, "rewards/rejected": -1.5426142492959665, "step": 210 }, { "count/fg_chosen": 24.214284896850586, "count/fg_rejected": 7.5, "epoch": 0.22564102564102564, "fg_kl": NaN, "fg_logps/policy_KL": -12.594059944152832, "fg_logps/policy_chosen": -6.371434688568115, "fg_logps/policy_rejected": -7.165874004364014, "fg_logps/reference_KL": -10.995756149291992, "fg_logps/reference_chosen": -6.087092399597168, "fg_logps/reference_rejected": -6.730601787567139, "fg_loss": 0.7775211334228516, "fg_rewards/chosen_sum": -0.6547192931175232, "fg_rewards/rejected_sum": -0.29393261671066284, "grad_norm": 46.801027582446935, "kl": 0.0, "learning_rate": 4.3101482326111746e-07, "logps/chosen": -344.7091128700658, "logps/rejected": -398.7763671875, "loss": 0.4334, "rewards/chosen": 0.5747735876786081, "rewards/margins": 2.151842461492782, "rewards/rejected": -1.577068873814174, "step": 220 }, { "count/fg_chosen": 29.272727966308594, "count/fg_rejected": 6.55555534362793, "epoch": 0.2358974358974359, "fg_kl": NaN, "fg_logps/policy_KL": -11.965290069580078, "fg_logps/policy_chosen": -6.299896717071533, "fg_logps/policy_rejected": -7.80208158493042, "fg_logps/reference_KL": -10.39778995513916, "fg_logps/reference_chosen": -5.838742256164551, "fg_logps/reference_rejected": -7.279135704040527, "fg_loss": 0.6748415231704712, "fg_rewards/chosen_sum": -1.2193199396133423, "fg_rewards/rejected_sum": -0.36254453659057617, "grad_norm": 45.120354528449354, "kl": 0.0, "learning_rate": 4.2531356898517673e-07, "logps/chosen": -358.53559470663265, "logps/rejected": -378.2181829637097, "loss": 0.4775, "rewards/chosen": 0.5253227389588648, "rewards/margins": 2.214199475507504, "rewards/rejected": -1.688876736548639, "step": 230 }, { "count/fg_chosen": 36.1875, "count/fg_rejected": 7.199999809265137, "epoch": 0.24615384615384617, "fg_kl": NaN, "fg_logps/policy_KL": -12.929790496826172, "fg_logps/policy_chosen": -6.825524806976318, "fg_logps/policy_rejected": -7.702876091003418, "fg_logps/reference_KL": -11.064438819885254, "fg_logps/reference_chosen": -6.324741363525391, "fg_logps/reference_rejected": -7.179370403289795, "fg_loss": 0.7855690717697144, "fg_rewards/chosen_sum": -1.7501062154769897, "fg_rewards/rejected_sum": -0.501429557800293, "grad_norm": 41.89950021868993, "kl": 0.26182326674461365, "learning_rate": 4.1961231470923605e-07, "logps/chosen": -347.2475725446429, "logps/rejected": -382.114453125, "loss": 0.4878, "rewards/chosen": 1.19210935320173, "rewards/margins": 1.8935284205845424, "rewards/rejected": -0.7014190673828125, "step": 240 }, { "count/fg_chosen": 29.399999618530273, "count/fg_rejected": 7.266666889190674, "epoch": 0.2564102564102564, "fg_kl": NaN, "fg_logps/policy_KL": -14.258040428161621, "fg_logps/policy_chosen": -6.943975925445557, "fg_logps/policy_rejected": -7.556437969207764, "fg_logps/reference_KL": -11.820046424865723, "fg_logps/reference_chosen": -6.191815376281738, "fg_logps/reference_rejected": -6.7565178871154785, "fg_loss": 0.8718132972717285, "fg_rewards/chosen_sum": -1.6466922760009766, "fg_rewards/rejected_sum": -0.6005190014839172, "grad_norm": 41.93986307357718, "kl": 0.0, "learning_rate": 4.139110604332953e-07, "logps/chosen": -327.5843017578125, "logps/rejected": -385.35126953125, "loss": 0.4012, "rewards/chosen": 1.1960113525390625, "rewards/margins": 2.935526466369629, "rewards/rejected": -1.7395151138305665, "step": 250 }, { "count/fg_chosen": 33.38461685180664, "count/fg_rejected": 7.692307472229004, "epoch": 0.26666666666666666, "fg_kl": NaN, "fg_logps/policy_KL": -12.9034423828125, "fg_logps/policy_chosen": -6.196410655975342, "fg_logps/policy_rejected": -7.9534759521484375, "fg_logps/reference_KL": -10.658266067504883, "fg_logps/reference_chosen": -5.453469276428223, "fg_logps/reference_rejected": -7.014803409576416, "fg_loss": 0.9994122982025146, "fg_rewards/chosen_sum": -1.9320785999298096, "fg_rewards/rejected_sum": -0.8277677893638611, "grad_norm": 32.044513351466335, "kl": 0.0, "learning_rate": 4.0820980615735465e-07, "logps/chosen": -334.1531723484849, "logps/rejected": -391.7591838430851, "loss": 0.4248, "rewards/chosen": 0.9683192859996449, "rewards/margins": 2.343336492719226, "rewards/rejected": -1.3750172067195812, "step": 260 }, { "count/fg_chosen": 25.071428298950195, "count/fg_rejected": 5.142857074737549, "epoch": 0.27692307692307694, "fg_kl": NaN, "fg_logps/policy_KL": -14.50527286529541, "fg_logps/policy_chosen": -7.0650315284729, "fg_logps/policy_rejected": -8.667463302612305, "fg_logps/reference_KL": -12.12157154083252, "fg_logps/reference_chosen": -6.389377593994141, "fg_logps/reference_rejected": -7.870638370513916, "fg_loss": 0.7973106503486633, "fg_rewards/chosen_sum": -1.2882376909255981, "fg_rewards/rejected_sum": -0.4027543365955353, "grad_norm": 68.19722281582398, "kl": 0.020351696759462357, "learning_rate": 4.025085518814139e-07, "logps/chosen": -405.6799411525974, "logps/rejected": -402.68011106927713, "loss": 0.4322, "rewards/chosen": 0.5332601472928926, "rewards/margins": 2.588090307778585, "rewards/rejected": -2.0548301604856927, "step": 270 }, { "count/fg_chosen": 31.959999084472656, "count/fg_rejected": 9.0, "epoch": 0.28717948717948716, "fg_kl": NaN, "fg_logps/policy_KL": -12.018957138061523, "fg_logps/policy_chosen": -6.5620503425598145, "fg_logps/policy_rejected": -8.468265533447266, "fg_logps/reference_KL": -9.829309463500977, "fg_logps/reference_chosen": -5.755721569061279, "fg_logps/reference_rejected": -7.4278693199157715, "fg_loss": 0.8410596251487732, "fg_rewards/chosen_sum": -2.1979994773864746, "fg_rewards/rejected_sum": -1.1406161785125732, "grad_norm": 42.49380746961441, "kl": 0.0, "learning_rate": 3.9680729760547324e-07, "logps/chosen": -333.30290316358025, "logps/rejected": -438.11288568037975, "loss": 0.5378, "rewards/chosen": 0.4965087513864776, "rewards/margins": 2.022118021313297, "rewards/rejected": -1.5256092699268196, "step": 280 }, { "count/fg_chosen": 31.38888931274414, "count/fg_rejected": 9.166666984558105, "epoch": 0.29743589743589743, "fg_kl": NaN, "fg_logps/policy_KL": -13.498015403747559, "fg_logps/policy_chosen": -6.605287551879883, "fg_logps/policy_rejected": -7.190271377563477, "fg_logps/reference_KL": -11.353104591369629, "fg_logps/reference_chosen": -6.0449957847595215, "fg_logps/reference_rejected": -6.696670055389404, "fg_loss": 0.9254876971244812, "fg_rewards/chosen_sum": -1.749000072479248, "fg_rewards/rejected_sum": -0.5272374153137207, "grad_norm": 34.10871711923179, "kl": 0.0, "learning_rate": 3.9110604332953246e-07, "logps/chosen": -374.3355087652439, "logps/rejected": -443.0232371794872, "loss": 0.4904, "rewards/chosen": 0.7626230658554449, "rewards/margins": 2.348516941368766, "rewards/rejected": -1.5858938755133214, "step": 290 }, { "count/fg_chosen": 29.75, "count/fg_rejected": 7.6315789222717285, "epoch": 0.3076923076923077, "fg_kl": NaN, "fg_logps/policy_KL": -12.35108757019043, "fg_logps/policy_chosen": -6.358473300933838, "fg_logps/policy_rejected": -7.603394508361816, "fg_logps/reference_KL": -10.117968559265137, "fg_logps/reference_chosen": -5.656960964202881, "fg_logps/reference_rejected": -6.988058090209961, "fg_loss": 0.687099277973175, "fg_rewards/chosen_sum": -1.7474342584609985, "fg_rewards/rejected_sum": -0.6550286412239075, "grad_norm": 45.87667859445539, "kl": 0.0, "learning_rate": 3.854047890535917e-07, "logps/chosen": -291.94080528846155, "logps/rejected": -454.1446265243902, "loss": 0.5046, "rewards/chosen": -0.05552493608914889, "rewards/margins": 2.302112236702867, "rewards/rejected": -2.357637172792016, "step": 300 }, { "count/fg_chosen": 29.052631378173828, "count/fg_rejected": 6.157894611358643, "epoch": 0.31794871794871793, "fg_kl": NaN, "fg_logps/policy_KL": -13.878127098083496, "fg_logps/policy_chosen": -6.794922828674316, "fg_logps/policy_rejected": -8.655734062194824, "fg_logps/reference_KL": -11.557811737060547, "fg_logps/reference_chosen": -6.259448528289795, "fg_logps/reference_rejected": -7.914809226989746, "fg_loss": 0.7258095145225525, "fg_rewards/chosen_sum": -1.6086143255233765, "fg_rewards/rejected_sum": -0.5489023327827454, "grad_norm": 43.21903612458155, "kl": 0.0, "learning_rate": 3.7970353477765105e-07, "logps/chosen": -361.2526117369186, "logps/rejected": -382.5555320945946, "loss": 0.4464, "rewards/chosen": 0.8190518756245457, "rewards/margins": 3.1284712498327383, "rewards/rejected": -2.3094193742081925, "step": 310 }, { "count/fg_chosen": 27.66666603088379, "count/fg_rejected": 7.05555534362793, "epoch": 0.3282051282051282, "fg_kl": NaN, "fg_logps/policy_KL": -14.007112503051758, "fg_logps/policy_chosen": -7.029316425323486, "fg_logps/policy_rejected": -8.816922187805176, "fg_logps/reference_KL": -11.414546012878418, "fg_logps/reference_chosen": -6.1291069984436035, "fg_logps/reference_rejected": -8.072396278381348, "fg_loss": 0.6982179880142212, "fg_rewards/chosen_sum": -1.864067554473877, "fg_rewards/rejected_sum": -0.625391960144043, "grad_norm": 49.56835553157457, "kl": 0.0, "learning_rate": 3.740022805017103e-07, "logps/chosen": -308.43726245777026, "logps/rejected": -462.0808502906977, "loss": 0.4611, "rewards/chosen": 0.7936567358068518, "rewards/margins": 2.709756816430035, "rewards/rejected": -1.9161000806231832, "step": 320 }, { "count/fg_chosen": 29.941177368164062, "count/fg_rejected": 6.125, "epoch": 0.3384615384615385, "fg_kl": NaN, "fg_logps/policy_KL": -13.976774215698242, "fg_logps/policy_chosen": -6.938152313232422, "fg_logps/policy_rejected": -7.871105670928955, "fg_logps/reference_KL": -11.392000198364258, "fg_logps/reference_chosen": -6.208968162536621, "fg_logps/reference_rejected": -6.786693096160889, "fg_loss": 0.8347401022911072, "fg_rewards/chosen_sum": -2.1311469078063965, "fg_rewards/rejected_sum": -0.6044603586196899, "grad_norm": 23.82687521831931, "kl": 0.0, "learning_rate": 3.6830102622576964e-07, "logps/chosen": -316.02463269589555, "logps/rejected": -366.19430443548384, "loss": 0.4596, "rewards/chosen": 0.36891575713655844, "rewards/margins": 2.952398068754502, "rewards/rejected": -2.5834823116179435, "step": 330 }, { "count/fg_chosen": 31.647058486938477, "count/fg_rejected": 5.0, "epoch": 0.3487179487179487, "fg_kl": NaN, "fg_logps/policy_KL": -13.484383583068848, "fg_logps/policy_chosen": -6.541139602661133, "fg_logps/policy_rejected": -8.020905494689941, "fg_logps/reference_KL": -11.032403945922852, "fg_logps/reference_chosen": -5.8485236167907715, "fg_logps/reference_rejected": -7.10052490234375, "fg_loss": 0.7300294041633606, "fg_rewards/chosen_sum": -1.9582923650741577, "fg_rewards/rejected_sum": -0.4550693929195404, "grad_norm": 44.41360203093714, "kl": 0.0, "learning_rate": 3.625997719498289e-07, "logps/chosen": -337.7124953497024, "logps/rejected": -364.32930715460526, "loss": 0.4522, "rewards/chosen": 0.633344604855492, "rewards/margins": 2.6144102545907923, "rewards/rejected": -1.9810656497353, "step": 340 }, { "count/fg_chosen": 34.35293960571289, "count/fg_rejected": 8.764705657958984, "epoch": 0.358974358974359, "fg_kl": NaN, "fg_logps/policy_KL": -14.230541229248047, "fg_logps/policy_chosen": -7.154247760772705, "fg_logps/policy_rejected": -7.5230560302734375, "fg_logps/reference_KL": -11.367884635925293, "fg_logps/reference_chosen": -6.23461389541626, "fg_logps/reference_rejected": -6.495339870452881, "fg_loss": 0.853543221950531, "fg_rewards/chosen_sum": -2.904083490371704, "fg_rewards/rejected_sum": -0.9132155179977417, "grad_norm": 30.3057952063642, "kl": 0.0, "learning_rate": 3.5689851767388824e-07, "logps/chosen": -394.6584884129214, "logps/rejected": -393.08568992077466, "loss": 0.4936, "rewards/chosen": 0.6523181615250834, "rewards/margins": 2.7101455334716062, "rewards/rejected": -2.057827371946523, "step": 350 }, { "count/fg_chosen": 27.30769157409668, "count/fg_rejected": 5.0, "epoch": 0.36923076923076925, "fg_kl": NaN, "fg_logps/policy_KL": -12.898691177368164, "fg_logps/policy_chosen": -7.163309574127197, "fg_logps/policy_rejected": -6.87624979019165, "fg_logps/reference_KL": -10.903498649597168, "fg_logps/reference_chosen": -6.516329288482666, "fg_logps/reference_rejected": -6.057809829711914, "fg_loss": 0.7118747234344482, "fg_rewards/chosen_sum": -1.5648789405822754, "fg_rewards/rejected_sum": -0.6483681201934814, "grad_norm": 37.36414008433845, "kl": 0.0, "learning_rate": 3.511972633979475e-07, "logps/chosen": -316.7720240542763, "logps/rejected": -391.03125, "loss": 0.4128, "rewards/chosen": 1.343739258615594, "rewards/margins": 2.428480033587692, "rewards/rejected": -1.0847407749720983, "step": 360 }, { "count/fg_chosen": 31.190475463867188, "count/fg_rejected": 7.050000190734863, "epoch": 0.37948717948717947, "fg_kl": NaN, "fg_logps/policy_KL": -12.466145515441895, "fg_logps/policy_chosen": -6.301754951477051, "fg_logps/policy_rejected": -8.657445907592773, "fg_logps/reference_KL": -10.610649108886719, "fg_logps/reference_chosen": -5.920670509338379, "fg_logps/reference_rejected": -8.133522987365723, "fg_loss": 0.7434370517730713, "fg_rewards/chosen_sum": -0.8639131188392639, "fg_rewards/rejected_sum": -0.45794281363487244, "grad_norm": 28.069125042652725, "kl": 0.0, "learning_rate": 3.4549600912200683e-07, "logps/chosen": -327.7210542485955, "logps/rejected": -410.5584286971831, "loss": 0.4659, "rewards/chosen": 1.6249963996115695, "rewards/margins": 2.5865509134622515, "rewards/rejected": -0.9615545138506822, "step": 370 }, { "count/fg_chosen": 34.900001525878906, "count/fg_rejected": 8.899999618530273, "epoch": 0.38974358974358975, "fg_kl": NaN, "fg_logps/policy_KL": -13.721631050109863, "fg_logps/policy_chosen": -6.37019681930542, "fg_logps/policy_rejected": -6.883843898773193, "fg_logps/reference_KL": -11.473298072814941, "fg_logps/reference_chosen": -6.092167854309082, "fg_logps/reference_rejected": -6.5457611083984375, "fg_loss": 0.8677409887313843, "fg_rewards/chosen_sum": -0.9150064587593079, "fg_rewards/rejected_sum": -0.41792982816696167, "grad_norm": 42.9333549511222, "kl": 0.0, "learning_rate": 3.397947548460661e-07, "logps/chosen": -340.5213176448171, "logps/rejected": -451.7316706730769, "loss": 0.4514, "rewards/chosen": 1.505601836413872, "rewards/margins": 2.4657741472674877, "rewards/rejected": -0.9601723108536158, "step": 380 }, { "count/fg_chosen": 29.55555534362793, "count/fg_rejected": 7.0, "epoch": 0.4, "fg_kl": NaN, "fg_logps/policy_KL": -12.328727722167969, "fg_logps/policy_chosen": -6.023360729217529, "fg_logps/policy_rejected": -7.300014972686768, "fg_logps/reference_KL": -10.421500205993652, "fg_logps/reference_chosen": -5.618011474609375, "fg_logps/reference_rejected": -6.620323181152344, "fg_loss": 0.7781895399093628, "fg_rewards/chosen_sum": -0.9389697313308716, "fg_rewards/rejected_sum": -0.5253291130065918, "grad_norm": 42.84097066535792, "kl": 0.0, "learning_rate": 3.340935005701254e-07, "logps/chosen": -364.2041149400685, "logps/rejected": -441.3032956178161, "loss": 0.4992, "rewards/chosen": 1.0798116187526756, "rewards/margins": 2.0560204480843476, "rewards/rejected": -0.9762088293316721, "step": 390 }, { "count/fg_chosen": 26.72222137451172, "count/fg_rejected": 6.647058963775635, "epoch": 0.41025641025641024, "fg_kl": NaN, "fg_logps/policy_KL": -13.881956100463867, "fg_logps/policy_chosen": -6.5600266456604, "fg_logps/policy_rejected": -9.088929176330566, "fg_logps/reference_KL": -11.40665054321289, "fg_logps/reference_chosen": -5.7827043533325195, "fg_logps/reference_rejected": -8.174890518188477, "fg_loss": 0.8777969479560852, "fg_rewards/chosen_sum": -1.556259036064148, "fg_rewards/rejected_sum": -0.7908374667167664, "grad_norm": 37.72027450146743, "kl": 0.0, "learning_rate": 3.283922462941847e-07, "logps/chosen": -365.9978794642857, "logps/rejected": -436.88159722222224, "loss": 0.4478, "rewards/chosen": 1.2335292271205358, "rewards/margins": 2.4691440885029143, "rewards/rejected": -1.2356148613823785, "step": 400 }, { "epoch": 0.41025641025641024, "eval_count/fg_chosen": 30.183246612548828, "eval_count/fg_rejected": 6.92391300201416, "eval_fg_kl": NaN, "eval_fg_logps/policy_KL": -13.678318977355957, "eval_fg_logps/policy_chosen": -6.628693580627441, "eval_fg_logps/policy_rejected": -8.363188743591309, "eval_fg_logps/reference_KL": -11.47359848022461, "eval_fg_logps/reference_chosen": -6.041894912719727, "eval_fg_logps/reference_rejected": -7.58065938949585, "eval_fg_loss": 0.7654322385787964, "eval_fg_rewards/chosen_sum": -1.3938791751861572, "eval_fg_rewards/rejected_sum": -0.6767725944519043, "eval_kl": 0.02797871269285679, "eval_logps/chosen": -340.2313144329897, "eval_logps/rejected": -400.85385283893396, "eval_loss": 0.4325231909751892, "eval_rewards/chosen": 1.316945568665879, "eval_rewards/margins": 3.0533541780318263, "eval_rewards/rejected": -1.7364086093659472, "eval_runtime": 492.9712, "eval_samples_per_second": 3.515, "eval_steps_per_second": 0.88, "step": 400 }, { "count/fg_chosen": 26.549999237060547, "count/fg_rejected": 6.25, "epoch": 0.4205128205128205, "fg_kl": NaN, "fg_logps/policy_KL": -15.237287521362305, "fg_logps/policy_chosen": -6.820374488830566, "fg_logps/policy_rejected": -8.927366256713867, "fg_logps/reference_KL": -12.311280250549316, "fg_logps/reference_chosen": -5.970030784606934, "fg_logps/reference_rejected": -7.938845634460449, "fg_loss": 0.8091492056846619, "fg_rewards/chosen_sum": -1.5172061920166016, "fg_rewards/rejected_sum": -0.7380185723304749, "grad_norm": 53.93549024472509, "kl": 0.0, "learning_rate": 3.22690992018244e-07, "logps/chosen": -324.6869419642857, "logps/rejected": -405.54951054216866, "loss": 0.4023, "rewards/chosen": 1.265897478376116, "rewards/margins": 3.5306600810328366, "rewards/rejected": -2.2647626026567207, "step": 410 }, { "count/fg_chosen": 23.399999618530273, "count/fg_rejected": 6.133333206176758, "epoch": 0.4307692307692308, "fg_kl": NaN, "fg_logps/policy_KL": -14.516840934753418, "fg_logps/policy_chosen": -7.477798938751221, "fg_logps/policy_rejected": -8.535691261291504, "fg_logps/reference_KL": -11.834728240966797, "fg_logps/reference_chosen": -6.342043876647949, "fg_logps/reference_rejected": -7.10928201675415, "fg_loss": 0.8881044387817383, "fg_rewards/chosen_sum": -2.0730390548706055, "fg_rewards/rejected_sum": -0.9379479289054871, "grad_norm": 37.934930263204464, "kl": 0.04062976688146591, "learning_rate": 3.169897377423033e-07, "logps/chosen": -352.2984280873494, "logps/rejected": -437.3393871753247, "loss": 0.4353, "rewards/chosen": 0.722329794642437, "rewards/margins": 3.3173880871799777, "rewards/rejected": -2.5950582925375407, "step": 420 }, { "count/fg_chosen": 29.41176414489746, "count/fg_rejected": 5.882352828979492, "epoch": 0.441025641025641, "fg_kl": NaN, "fg_logps/policy_KL": -13.61406421661377, "fg_logps/policy_chosen": -6.908777713775635, "fg_logps/policy_rejected": -9.259625434875488, "fg_logps/reference_KL": -10.859848976135254, "fg_logps/reference_chosen": -5.828268527984619, "fg_logps/reference_rejected": -7.893514156341553, "fg_loss": 0.7920488119125366, "fg_rewards/chosen_sum": -2.7851388454437256, "fg_rewards/rejected_sum": -0.8430763483047485, "grad_norm": 31.263236198590103, "kl": 0.20134501159191132, "learning_rate": 3.112884834663626e-07, "logps/chosen": -338.0028831845238, "logps/rejected": -437.03207236842104, "loss": 0.4237, "rewards/chosen": 1.1830097380138578, "rewards/margins": 3.346872267567723, "rewards/rejected": -2.163862529553865, "step": 430 }, { "count/fg_chosen": 31.16666603088379, "count/fg_rejected": 5.583333492279053, "epoch": 0.4512820512820513, "fg_kl": NaN, "fg_logps/policy_KL": -12.71406078338623, "fg_logps/policy_chosen": -6.013169765472412, "fg_logps/policy_rejected": -7.012132167816162, "fg_logps/reference_KL": -10.454259872436523, "fg_logps/reference_chosen": -5.288631439208984, "fg_logps/reference_rejected": -6.411142826080322, "fg_loss": 0.8265379071235657, "fg_rewards/chosen_sum": -1.9277740716934204, "fg_rewards/rejected_sum": -0.4081937372684479, "grad_norm": 33.498855345311206, "kl": 0.0, "learning_rate": 3.055872291904219e-07, "logps/chosen": -433.9810126582278, "logps/rejected": -409.45997299382714, "loss": 0.4124, "rewards/chosen": 0.23994885215276404, "rewards/margins": 2.5844254342442956, "rewards/rejected": -2.3444765820915316, "step": 440 }, { "count/fg_chosen": 28.214284896850586, "count/fg_rejected": 7.0714287757873535, "epoch": 0.46153846153846156, "fg_kl": NaN, "fg_logps/policy_KL": -14.437283515930176, "fg_logps/policy_chosen": -7.329289436340332, "fg_logps/policy_rejected": -7.971861839294434, "fg_logps/reference_KL": -11.504508018493652, "fg_logps/reference_chosen": -6.307824611663818, "fg_logps/reference_rejected": -7.138981342315674, "fg_loss": 0.8858200907707214, "fg_rewards/chosen_sum": -2.4875144958496094, "fg_rewards/rejected_sum": -0.8280299305915833, "grad_norm": 31.189671182424355, "kl": 0.0, "learning_rate": 2.998859749144812e-07, "logps/chosen": -298.6210195806962, "logps/rejected": -419.6058545524691, "loss": 0.4201, "rewards/chosen": 0.8161652963372725, "rewards/margins": 3.091305375788468, "rewards/rejected": -2.275140079451196, "step": 450 }, { "count/fg_chosen": 32.3636360168457, "count/fg_rejected": 4.7272725105285645, "epoch": 0.4717948717948718, "fg_kl": NaN, "fg_logps/policy_KL": -13.52069091796875, "fg_logps/policy_chosen": -6.610226154327393, "fg_logps/policy_rejected": -10.518632888793945, "fg_logps/reference_KL": -10.879704475402832, "fg_logps/reference_chosen": -5.88496732711792, "fg_logps/reference_rejected": -9.407367706298828, "fg_loss": 0.6551663279533386, "fg_rewards/chosen_sum": -2.258774757385254, "fg_rewards/rejected_sum": -0.5948446989059448, "grad_norm": 33.51878345246348, "kl": 0.029797697439789772, "learning_rate": 2.941847206385404e-07, "logps/chosen": -332.60402610085225, "logps/rejected": -395.40771484375, "loss": 0.4001, "rewards/chosen": 1.4196222478693181, "rewards/margins": 3.0095812864977907, "rewards/rejected": -1.5899590386284723, "step": 460 }, { "count/fg_chosen": 31.5, "count/fg_rejected": 5.800000190734863, "epoch": 0.48205128205128206, "fg_kl": NaN, "fg_logps/policy_KL": -12.285706520080566, "fg_logps/policy_chosen": -6.048055648803711, "fg_logps/policy_rejected": -8.306843757629395, "fg_logps/reference_KL": -10.14644718170166, "fg_logps/reference_chosen": -5.535238742828369, "fg_logps/reference_rejected": -7.302800178527832, "fg_loss": 0.6428090333938599, "fg_rewards/chosen_sum": -1.36484956741333, "fg_rewards/rejected_sum": -0.6093672513961792, "grad_norm": 26.805133801472735, "kl": 0.17673882842063904, "learning_rate": 2.8848346636259974e-07, "logps/chosen": -317.91790291432585, "logps/rejected": -368.47114326584506, "loss": 0.4744, "rewards/chosen": 1.5331906093640273, "rewards/margins": 2.4303819928332997, "rewards/rejected": -0.8971913834692726, "step": 470 }, { "count/fg_chosen": 39.266666412353516, "count/fg_rejected": 7.4666666984558105, "epoch": 0.49230769230769234, "fg_kl": NaN, "fg_logps/policy_KL": -13.738459587097168, "fg_logps/policy_chosen": -6.287077903747559, "fg_logps/policy_rejected": -6.787537097930908, "fg_logps/reference_KL": -11.216691017150879, "fg_logps/reference_chosen": -5.826966762542725, "fg_logps/reference_rejected": -6.379599571228027, "fg_loss": 0.7447641491889954, "fg_rewards/chosen_sum": -0.980557382106781, "fg_rewards/rejected_sum": -0.506367564201355, "grad_norm": 39.97199247238054, "kl": 0.0, "learning_rate": 2.82782212086659e-07, "logps/chosen": -401.7761665239726, "logps/rejected": -363.2634698275862, "loss": 0.5103, "rewards/chosen": 1.34750000418049, "rewards/margins": 1.8919780588848274, "rewards/rejected": -0.5444780547043373, "step": 480 }, { "count/fg_chosen": 26.875, "count/fg_rejected": 5.400000095367432, "epoch": 0.5025641025641026, "fg_kl": NaN, "fg_logps/policy_KL": -16.641780853271484, "fg_logps/policy_chosen": -8.056283950805664, "fg_logps/policy_rejected": -8.251357078552246, "fg_logps/reference_KL": -13.59717845916748, "fg_logps/reference_chosen": -7.162622928619385, "fg_logps/reference_rejected": -7.323818683624268, "fg_loss": 0.8411279916763306, "fg_rewards/chosen_sum": -1.9043647050857544, "fg_rewards/rejected_sum": -0.7839928269386292, "grad_norm": 29.60091559742249, "kl": 0.22372007369995117, "learning_rate": 2.7708095781071834e-07, "logps/chosen": -324.56468441611844, "logps/rejected": -461.18638392857144, "loss": 0.4437, "rewards/chosen": 1.3998164126747532, "rewards/margins": 2.890947968141178, "rewards/rejected": -1.4911315554664248, "step": 490 }, { "count/fg_chosen": 27.3157901763916, "count/fg_rejected": 5.294117450714111, "epoch": 0.5128205128205128, "fg_kl": NaN, "fg_logps/policy_KL": -14.019055366516113, "fg_logps/policy_chosen": -6.862349033355713, "fg_logps/policy_rejected": -8.392266273498535, "fg_logps/reference_KL": -11.065834999084473, "fg_logps/reference_chosen": -6.182069301605225, "fg_logps/reference_rejected": -7.583798408508301, "fg_loss": 0.8486608266830444, "fg_rewards/chosen_sum": -1.7188913822174072, "fg_rewards/rejected_sum": -0.4710962176322937, "grad_norm": 37.93308715806046, "kl": 0.0, "learning_rate": 2.713797035347776e-07, "logps/chosen": -336.33485504518075, "logps/rejected": -412.9320211038961, "loss": 0.4063, "rewards/chosen": 1.6464567988751881, "rewards/margins": 3.429747315121637, "rewards/rejected": -1.783290516246449, "step": 500 }, { "count/fg_chosen": 32.52941131591797, "count/fg_rejected": 6.1875, "epoch": 0.5230769230769231, "fg_kl": NaN, "fg_logps/policy_KL": -14.041118621826172, "fg_logps/policy_chosen": -6.475778579711914, "fg_logps/policy_rejected": -8.933878898620605, "fg_logps/reference_KL": -11.386185646057129, "fg_logps/reference_chosen": -6.088446617126465, "fg_logps/reference_rejected": -8.2723970413208, "fg_loss": 0.6639065742492676, "fg_rewards/chosen_sum": -1.039247989654541, "fg_rewards/rejected_sum": -0.31398898363113403, "grad_norm": 51.003351409032156, "kl": 0.0, "learning_rate": 2.6567844925883693e-07, "logps/chosen": -315.5387290396341, "logps/rejected": -434.9411057692308, "loss": 0.4328, "rewards/chosen": 1.3493434626881669, "rewards/margins": 2.8553199195503964, "rewards/rejected": -1.5059764568622296, "step": 510 }, { "count/fg_chosen": 29.549999237060547, "count/fg_rejected": 7.300000190734863, "epoch": 0.5333333333333333, "fg_kl": NaN, "fg_logps/policy_KL": -15.461477279663086, "fg_logps/policy_chosen": -7.2638678550720215, "fg_logps/policy_rejected": -8.44337272644043, "fg_logps/reference_KL": -12.010942459106445, "fg_logps/reference_chosen": -6.417178153991699, "fg_logps/reference_rejected": -7.042668342590332, "fg_loss": 0.8494647145271301, "fg_rewards/chosen_sum": -2.0671496391296387, "fg_rewards/rejected_sum": -0.9835360646247864, "grad_norm": 41.33647069198984, "kl": 0.0, "learning_rate": 2.599771949828962e-07, "logps/chosen": -332.418183117378, "logps/rejected": -371.52498998397436, "loss": 0.455, "rewards/chosen": 1.320475787651248, "rewards/margins": 3.076387394659962, "rewards/rejected": -1.755911607008714, "step": 520 }, { "count/fg_chosen": 30.526315689086914, "count/fg_rejected": 9.11111068725586, "epoch": 0.5435897435897435, "fg_kl": NaN, "fg_logps/policy_KL": -13.753694534301758, "fg_logps/policy_chosen": -6.313483238220215, "fg_logps/policy_rejected": -8.319221496582031, "fg_logps/reference_KL": -10.8753080368042, "fg_logps/reference_chosen": -5.6761603355407715, "fg_logps/reference_rejected": -7.430839538574219, "fg_loss": 0.7786957621574402, "fg_rewards/chosen_sum": -1.3800252676010132, "fg_rewards/rejected_sum": -1.1956380605697632, "grad_norm": 34.93340197656234, "kl": 0.0, "learning_rate": 2.542759407069555e-07, "logps/chosen": -311.92927758487656, "logps/rejected": -372.22604331487344, "loss": 0.448, "rewards/chosen": 1.0696545824592496, "rewards/margins": 3.1661021045864253, "rewards/rejected": -2.0964475221271757, "step": 530 }, { "count/fg_chosen": 26.764705657958984, "count/fg_rejected": 6.352941036224365, "epoch": 0.5538461538461539, "fg_kl": NaN, "fg_logps/policy_KL": -13.03756046295166, "fg_logps/policy_chosen": -6.464415073394775, "fg_logps/policy_rejected": -7.652871608734131, "fg_logps/reference_KL": -10.35094165802002, "fg_logps/reference_chosen": -5.7773261070251465, "fg_logps/reference_rejected": -6.825320720672607, "fg_loss": 0.8047051429748535, "fg_rewards/chosen_sum": -1.6725194454193115, "fg_rewards/rejected_sum": -0.7678513526916504, "grad_norm": 41.240539642737886, "kl": 0.0, "learning_rate": 2.485746864310148e-07, "logps/chosen": -345.1589215158046, "logps/rejected": -397.16462435787673, "loss": 0.4683, "rewards/chosen": 1.1091806696749282, "rewards/margins": 2.9815141440301294, "rewards/rejected": -1.8723334743552011, "step": 540 }, { "count/fg_chosen": 36.0, "count/fg_rejected": 7.176470756530762, "epoch": 0.5641025641025641, "fg_kl": NaN, "fg_logps/policy_KL": -13.34708023071289, "fg_logps/policy_chosen": -6.144561767578125, "fg_logps/policy_rejected": -7.657267093658447, "fg_logps/reference_KL": -10.591930389404297, "fg_logps/reference_chosen": -5.485869884490967, "fg_logps/reference_rejected": -6.727408409118652, "fg_loss": 0.7655860185623169, "fg_rewards/chosen_sum": -1.8279484510421753, "fg_rewards/rejected_sum": -0.5319306254386902, "grad_norm": 34.418931407344175, "kl": 0.0, "learning_rate": 2.428734321550741e-07, "logps/chosen": -332.24548669763516, "logps/rejected": -384.9080214389535, "loss": 0.4533, "rewards/chosen": 1.4185297166978992, "rewards/margins": 3.59055198175513, "rewards/rejected": -2.172022265057231, "step": 550 }, { "count/fg_chosen": 30.6842098236084, "count/fg_rejected": 5.842105388641357, "epoch": 0.5743589743589743, "fg_kl": NaN, "fg_logps/policy_KL": -12.044573783874512, "fg_logps/policy_chosen": -6.35300874710083, "fg_logps/policy_rejected": -8.149062156677246, "fg_logps/reference_KL": -9.564573287963867, "fg_logps/reference_chosen": -5.577574253082275, "fg_logps/reference_rejected": -7.246463298797607, "fg_loss": 0.7160053849220276, "fg_rewards/chosen_sum": -1.8285123109817505, "fg_rewards/rejected_sum": -0.428337961435318, "grad_norm": 32.969063143022574, "kl": 0.0, "learning_rate": 2.371721778791334e-07, "logps/chosen": -404.944683908046, "logps/rejected": -394.23758561643837, "loss": 0.4521, "rewards/chosen": 1.1878378857141254, "rewards/margins": 2.883744642389532, "rewards/rejected": -1.6959067566754067, "step": 560 }, { "count/fg_chosen": 29.5, "count/fg_rejected": 6.733333110809326, "epoch": 0.5846153846153846, "fg_kl": NaN, "fg_logps/policy_KL": -12.527681350708008, "fg_logps/policy_chosen": -6.548896789550781, "fg_logps/policy_rejected": -6.040011882781982, "fg_logps/reference_KL": -9.865092277526855, "fg_logps/reference_chosen": -5.893582344055176, "fg_logps/reference_rejected": -5.497416019439697, "fg_loss": 0.5755335092544556, "fg_rewards/chosen_sum": -1.5131157636642456, "fg_rewards/rejected_sum": -0.35849064588546753, "grad_norm": 40.43474566516004, "kl": 0.0, "learning_rate": 2.314709236031927e-07, "logps/chosen": -354.1348353794643, "logps/rejected": -417.60911800986844, "loss": 0.4105, "rewards/chosen": 0.7818209330240885, "rewards/margins": 3.425596471418414, "rewards/rejected": -2.6437755383943258, "step": 570 }, { "count/fg_chosen": 31.428571701049805, "count/fg_rejected": 7.599999904632568, "epoch": 0.5948717948717949, "fg_kl": NaN, "fg_logps/policy_KL": -13.412928581237793, "fg_logps/policy_chosen": -6.371100902557373, "fg_logps/policy_rejected": -7.6736016273498535, "fg_logps/reference_KL": -10.896791458129883, "fg_logps/reference_chosen": -5.65976095199585, "fg_logps/reference_rejected": -6.982507228851318, "fg_loss": 0.7268858551979065, "fg_rewards/chosen_sum": -1.6253752708435059, "fg_rewards/rejected_sum": -0.8460947871208191, "grad_norm": 57.39186053322085, "kl": 0.0, "learning_rate": 2.2576966932725198e-07, "logps/chosen": -291.66895736882714, "logps/rejected": -384.9341623813291, "loss": 0.4683, "rewards/chosen": 1.6188120900848766, "rewards/margins": 2.9102534159847977, "rewards/rejected": -1.2914413258999209, "step": 580 }, { "count/fg_chosen": 28.904762268066406, "count/fg_rejected": 6.526315689086914, "epoch": 0.6051282051282051, "fg_kl": NaN, "fg_logps/policy_KL": -13.970428466796875, "fg_logps/policy_chosen": -6.568854808807373, "fg_logps/policy_rejected": -8.221002578735352, "fg_logps/reference_KL": -11.08731746673584, "fg_logps/reference_chosen": -5.649308204650879, "fg_logps/reference_rejected": -7.402557373046875, "fg_loss": 0.705423891544342, "fg_rewards/chosen_sum": -2.060758590698242, "fg_rewards/rejected_sum": -0.8252547979354858, "grad_norm": 35.44361919546434, "kl": 0.0, "learning_rate": 2.2006841505131128e-07, "logps/chosen": -442.12862723214283, "logps/rejected": -406.4741981907895, "loss": 0.4365, "rewards/chosen": 1.5486488342285156, "rewards/margins": 3.623551418906764, "rewards/rejected": -2.0749025846782483, "step": 590 }, { "count/fg_chosen": 27.83333396911621, "count/fg_rejected": 6.2727274894714355, "epoch": 0.6153846153846154, "fg_kl": NaN, "fg_logps/policy_KL": -14.578009605407715, "fg_logps/policy_chosen": -7.306002140045166, "fg_logps/policy_rejected": -8.329404830932617, "fg_logps/reference_KL": -11.412123680114746, "fg_logps/reference_chosen": -6.204747676849365, "fg_logps/reference_rejected": -6.661261081695557, "fg_loss": 0.649915337562561, "fg_rewards/chosen_sum": -2.443979263305664, "fg_rewards/rejected_sum": -0.9118065237998962, "grad_norm": 48.33661978525442, "kl": 0.0, "learning_rate": 2.1436716077537057e-07, "logps/chosen": -353.3546720805921, "logps/rejected": -475.7469773065476, "loss": 0.4239, "rewards/chosen": 1.3294219970703125, "rewards/margins": 3.233419145856585, "rewards/rejected": -1.9039971487862724, "step": 600 }, { "count/fg_chosen": 25.647058486938477, "count/fg_rejected": 6.470588207244873, "epoch": 0.6256410256410256, "fg_kl": NaN, "fg_logps/policy_KL": -13.512959480285645, "fg_logps/policy_chosen": -6.967007160186768, "fg_logps/policy_rejected": -8.427515029907227, "fg_logps/reference_KL": -10.780054092407227, "fg_logps/reference_chosen": -6.145755290985107, "fg_logps/reference_rejected": -7.377350807189941, "fg_loss": 0.8846892714500427, "fg_rewards/chosen_sum": -1.872863531112671, "fg_rewards/rejected_sum": -0.8110222816467285, "grad_norm": 27.94628622877484, "kl": 0.0, "learning_rate": 2.0866590649942987e-07, "logps/chosen": -325.96284054487177, "logps/rejected": -358.4112280868902, "loss": 0.443, "rewards/chosen": 1.2946516183706431, "rewards/margins": 2.8088877894417656, "rewards/rejected": -1.5142361710711223, "step": 610 }, { "count/fg_chosen": 34.238094329833984, "count/fg_rejected": 6.949999809265137, "epoch": 0.6358974358974359, "fg_kl": NaN, "fg_logps/policy_KL": -14.278539657592773, "fg_logps/policy_chosen": -6.847230434417725, "fg_logps/policy_rejected": -9.253725051879883, "fg_logps/reference_KL": -11.118898391723633, "fg_logps/reference_chosen": -5.77498722076416, "fg_logps/reference_rejected": -7.873915195465088, "fg_loss": 0.7475059628486633, "fg_rewards/chosen_sum": -2.7328929901123047, "fg_rewards/rejected_sum": -1.1806801557540894, "grad_norm": 37.21517260596127, "kl": 0.0, "learning_rate": 2.0296465222348917e-07, "logps/chosen": -345.37012924382714, "logps/rejected": -452.38132911392404, "loss": 0.4182, "rewards/chosen": 2.110634132667824, "rewards/margins": 4.059031805296879, "rewards/rejected": -1.9483976726290546, "step": 620 }, { "count/fg_chosen": 28.3125, "count/fg_rejected": 7.4375, "epoch": 0.6461538461538462, "fg_kl": NaN, "fg_logps/policy_KL": -15.633122444152832, "fg_logps/policy_chosen": -6.758295059204102, "fg_logps/policy_rejected": -8.180941581726074, "fg_logps/reference_KL": -12.073482513427734, "fg_logps/reference_chosen": -5.663504600524902, "fg_logps/reference_rejected": -6.7309250831604, "fg_loss": 0.8814060091972351, "fg_rewards/chosen_sum": -2.544795513153076, "fg_rewards/rejected_sum": -1.284183144569397, "grad_norm": 42.32817075648944, "kl": 0.0, "learning_rate": 1.9726339794754846e-07, "logps/chosen": -351.775993441358, "logps/rejected": -522.9557950949367, "loss": 0.4343, "rewards/chosen": 1.261369964222849, "rewards/margins": 3.565065836083015, "rewards/rejected": -2.303695871860166, "step": 630 }, { "count/fg_chosen": 33.64706039428711, "count/fg_rejected": 8.058823585510254, "epoch": 0.6564102564102564, "fg_kl": NaN, "fg_logps/policy_KL": -13.795089721679688, "fg_logps/policy_chosen": -6.418089389801025, "fg_logps/policy_rejected": -8.086543083190918, "fg_logps/reference_KL": -10.756563186645508, "fg_logps/reference_chosen": -5.482754230499268, "fg_logps/reference_rejected": -6.624981880187988, "fg_loss": 0.8172480463981628, "fg_rewards/chosen_sum": -2.2949178218841553, "fg_rewards/rejected_sum": -1.4898707866668701, "grad_norm": 49.06648139657291, "kl": 0.0, "learning_rate": 1.9156214367160776e-07, "logps/chosen": -323.18095703125, "logps/rejected": -457.679443359375, "loss": 0.3975, "rewards/chosen": 1.7368663787841796, "rewards/margins": 4.7420207977294915, "rewards/rejected": -3.0051544189453123, "step": 640 }, { "count/fg_chosen": 26.785715103149414, "count/fg_rejected": 6.0, "epoch": 0.6666666666666666, "fg_kl": NaN, "fg_logps/policy_KL": -16.923561096191406, "fg_logps/policy_chosen": -7.209476470947266, "fg_logps/policy_rejected": -8.816498756408691, "fg_logps/reference_KL": -12.868348121643066, "fg_logps/reference_chosen": -5.93485164642334, "fg_logps/reference_rejected": -7.260035991668701, "fg_loss": 0.7890381217002869, "fg_rewards/chosen_sum": -2.429896831512451, "fg_rewards/rejected_sum": -0.9942983388900757, "grad_norm": 42.684857833622196, "kl": 0.0, "learning_rate": 1.8586088939566706e-07, "logps/chosen": -307.12525576636904, "logps/rejected": -396.83095189144734, "loss": 0.4325, "rewards/chosen": 1.7140017918178014, "rewards/margins": 3.020266568750367, "rewards/rejected": -1.3062647769325657, "step": 650 }, { "count/fg_chosen": 32.578948974609375, "count/fg_rejected": 5.5789475440979, "epoch": 0.676923076923077, "fg_kl": NaN, "fg_logps/policy_KL": -13.829089164733887, "fg_logps/policy_chosen": -6.605353832244873, "fg_logps/policy_rejected": -8.679577827453613, "fg_logps/reference_KL": -11.020931243896484, "fg_logps/reference_chosen": -5.843183517456055, "fg_logps/reference_rejected": -7.641061305999756, "fg_loss": 0.7101105451583862, "fg_rewards/chosen_sum": -1.6802619695663452, "fg_rewards/rejected_sum": -0.5640377998352051, "grad_norm": 51.20082166149615, "kl": 0.0, "learning_rate": 1.8015963511972635e-07, "logps/chosen": -326.6531840479651, "logps/rejected": -467.38508234797297, "loss": 0.3955, "rewards/chosen": 1.7110205362009447, "rewards/margins": 3.708857912151114, "rewards/rejected": -1.9978373759501689, "step": 660 }, { "count/fg_chosen": 34.52941131591797, "count/fg_rejected": 8.882352828979492, "epoch": 0.6871794871794872, "fg_kl": NaN, "fg_logps/policy_KL": -15.263018608093262, "fg_logps/policy_chosen": -7.245337009429932, "fg_logps/policy_rejected": -10.788996696472168, "fg_logps/reference_KL": -11.636683464050293, "fg_logps/reference_chosen": -6.2825164794921875, "fg_logps/reference_rejected": -8.302498817443848, "fg_loss": 0.9282689094543457, "fg_rewards/chosen_sum": -2.5015087127685547, "fg_rewards/rejected_sum": -1.4177420139312744, "grad_norm": 43.77061774737123, "kl": 0.16583053767681122, "learning_rate": 1.7445838084378562e-07, "logps/chosen": -347.34482020547944, "logps/rejected": -440.41316451149424, "loss": 0.4428, "rewards/chosen": 1.4226540343402183, "rewards/margins": 3.992055567094349, "rewards/rejected": -2.5694015327541306, "step": 670 }, { "count/fg_chosen": 33.0625, "count/fg_rejected": 6.1875, "epoch": 0.6974358974358974, "fg_kl": NaN, "fg_logps/policy_KL": -13.69076156616211, "fg_logps/policy_chosen": -6.070934295654297, "fg_logps/policy_rejected": -9.86292552947998, "fg_logps/reference_KL": -10.596210479736328, "fg_logps/reference_chosen": -5.411348342895508, "fg_logps/reference_rejected": -8.194981575012207, "fg_loss": 0.7803856730461121, "fg_rewards/chosen_sum": -1.5216065645217896, "fg_rewards/rejected_sum": -0.7260585427284241, "grad_norm": 41.07172408412241, "kl": 0.0, "learning_rate": 1.6875712656784492e-07, "logps/chosen": -321.7051943824405, "logps/rejected": -402.75840357730266, "loss": 0.431, "rewards/chosen": 1.6818878537132627, "rewards/margins": 2.9571292035860526, "rewards/rejected": -1.27524134987279, "step": 680 }, { "count/fg_chosen": 33.04166793823242, "count/fg_rejected": 7.956521511077881, "epoch": 0.7076923076923077, "fg_kl": NaN, "fg_logps/policy_KL": -13.681419372558594, "fg_logps/policy_chosen": -6.311405181884766, "fg_logps/policy_rejected": -8.458182334899902, "fg_logps/reference_KL": -10.43921947479248, "fg_logps/reference_chosen": -5.538773059844971, "fg_logps/reference_rejected": -7.221550941467285, "fg_loss": 0.774190366268158, "fg_rewards/chosen_sum": -2.0744409561157227, "fg_rewards/rejected_sum": -1.0601459741592407, "grad_norm": 26.76398723713879, "kl": 0.0, "learning_rate": 1.6305587229190422e-07, "logps/chosen": -337.47130408653845, "logps/rejected": -426.1006573932927, "loss": 0.4689, "rewards/chosen": 1.7125216753054888, "rewards/margins": 3.957819735280718, "rewards/rejected": -2.245298059975229, "step": 690 }, { "count/fg_chosen": 29.933332443237305, "count/fg_rejected": 7.5, "epoch": 0.717948717948718, "fg_kl": NaN, "fg_logps/policy_KL": -18.62053871154785, "fg_logps/policy_chosen": -6.723959445953369, "fg_logps/policy_rejected": -8.809980392456055, "fg_logps/reference_KL": -14.621339797973633, "fg_logps/reference_chosen": -5.710059642791748, "fg_logps/reference_rejected": -7.013789176940918, "fg_loss": 0.7358676791191101, "fg_rewards/chosen_sum": -2.391604423522949, "fg_rewards/rejected_sum": -1.1176395416259766, "grad_norm": 32.71207561201088, "kl": 0.0, "learning_rate": 1.573546180159635e-07, "logps/chosen": -377.94707661290323, "logps/rejected": -423.6705923507463, "loss": 0.3983, "rewards/chosen": 1.9157637729439685, "rewards/margins": 3.7593996736406803, "rewards/rejected": -1.8436359006967118, "step": 700 }, { "count/fg_chosen": 34.0, "count/fg_rejected": 7.0, "epoch": 0.7282051282051282, "fg_kl": NaN, "fg_logps/policy_KL": -12.678239822387695, "fg_logps/policy_chosen": -5.730792999267578, "fg_logps/policy_rejected": -6.297366142272949, "fg_logps/reference_KL": -10.122818946838379, "fg_logps/reference_chosen": -5.305339813232422, "fg_logps/reference_rejected": -5.903895378112793, "fg_loss": 0.8335784673690796, "fg_rewards/chosen_sum": -1.278421401977539, "fg_rewards/rejected_sum": -0.4984094202518463, "grad_norm": 30.093125040941494, "kl": 0.0, "learning_rate": 1.516533637400228e-07, "logps/chosen": -311.1296672077922, "logps/rejected": -378.9050263554217, "loss": 0.4411, "rewards/chosen": 1.7025673606178977, "rewards/margins": 4.042804529998545, "rewards/rejected": -2.3402371693806474, "step": 710 }, { "count/fg_chosen": 25.214284896850586, "count/fg_rejected": 4.538461685180664, "epoch": 0.7384615384615385, "fg_kl": NaN, "fg_logps/policy_KL": -15.262101173400879, "fg_logps/policy_chosen": -6.9626898765563965, "fg_logps/policy_rejected": -9.714632034301758, "fg_logps/reference_KL": -11.78027057647705, "fg_logps/reference_chosen": -6.246325969696045, "fg_logps/reference_rejected": -8.87566089630127, "fg_loss": 0.7088484764099121, "fg_rewards/chosen_sum": -1.5614358186721802, "fg_rewards/rejected_sum": -0.42503052949905396, "grad_norm": 41.939062686169684, "kl": 0.0, "learning_rate": 1.459521094640821e-07, "logps/chosen": -357.324462890625, "logps/rejected": -446.333984375, "loss": 0.3344, "rewards/chosen": 1.5038203239440917, "rewards/margins": 4.055677318572998, "rewards/rejected": -2.5518569946289062, "step": 720 }, { "count/fg_chosen": 34.75, "count/fg_rejected": 8.470588684082031, "epoch": 0.7487179487179487, "fg_kl": NaN, "fg_logps/policy_KL": -13.44446086883545, "fg_logps/policy_chosen": -6.577872276306152, "fg_logps/policy_rejected": -8.81990909576416, "fg_logps/reference_KL": -10.716168403625488, "fg_logps/reference_chosen": -6.01681661605835, "fg_logps/reference_rejected": -8.056208610534668, "fg_loss": 0.7597875595092773, "fg_rewards/chosen_sum": -1.6217119693756104, "fg_rewards/rejected_sum": -1.0568969249725342, "grad_norm": 29.217422708191275, "kl": 0.0, "learning_rate": 1.402508551881414e-07, "logps/chosen": -325.07026041666666, "logps/rejected": -434.04232536764704, "loss": 0.3915, "rewards/chosen": 2.13581298828125, "rewards/margins": 5.36644473805147, "rewards/rejected": -3.2306317497702204, "step": 730 }, { "count/fg_chosen": 32.095237731933594, "count/fg_rejected": 8.7619047164917, "epoch": 0.7589743589743589, "fg_kl": NaN, "fg_logps/policy_KL": -16.643653869628906, "fg_logps/policy_chosen": -7.633936405181885, "fg_logps/policy_rejected": -8.764749526977539, "fg_logps/reference_KL": -12.597454071044922, "fg_logps/reference_chosen": -6.6152801513671875, "fg_logps/reference_rejected": -7.493948936462402, "fg_loss": 0.8724325895309448, "fg_rewards/chosen_sum": -2.190063714981079, "fg_rewards/rejected_sum": -1.0442728996276855, "grad_norm": 35.508557979266605, "kl": 0.0, "learning_rate": 1.345496009122007e-07, "logps/chosen": -323.4399604301948, "logps/rejected": -447.21136106927713, "loss": 0.4821, "rewards/chosen": 1.2747364787312296, "rewards/margins": 3.8379994181843546, "rewards/rejected": -2.563262939453125, "step": 740 }, { "count/fg_chosen": 35.55555725097656, "count/fg_rejected": 7.764705657958984, "epoch": 0.7692307692307693, "fg_kl": NaN, "fg_logps/policy_KL": -12.14694881439209, "fg_logps/policy_chosen": -5.938383102416992, "fg_logps/policy_rejected": -6.999124526977539, "fg_logps/reference_KL": -9.31165599822998, "fg_logps/reference_chosen": -5.441190242767334, "fg_logps/reference_rejected": -6.349566459655762, "fg_loss": 0.7159730792045593, "fg_rewards/chosen_sum": -1.1845917701721191, "fg_rewards/rejected_sum": -0.6632856726646423, "grad_norm": 29.30713270318622, "kl": 0.0, "learning_rate": 1.2884834663625997e-07, "logps/chosen": -351.3742959665698, "logps/rejected": -477.6983741554054, "loss": 0.4247, "rewards/chosen": 1.214830709058185, "rewards/margins": 3.774690938550372, "rewards/rejected": -2.5598602294921875, "step": 750 }, { "count/fg_chosen": 29.450000762939453, "count/fg_rejected": 8.941176414489746, "epoch": 0.7794871794871795, "fg_kl": NaN, "fg_logps/policy_KL": -13.91418743133545, "fg_logps/policy_chosen": -6.500802516937256, "fg_logps/policy_rejected": -7.696014404296875, "fg_logps/reference_KL": -10.872803688049316, "fg_logps/reference_chosen": -6.060533046722412, "fg_logps/reference_rejected": -6.9095940589904785, "fg_loss": 0.8821809887886047, "fg_rewards/chosen_sum": -1.222095012664795, "fg_rewards/rejected_sum": -0.8427726030349731, "grad_norm": 23.409794702847652, "kl": 0.0, "learning_rate": 1.2314709236031927e-07, "logps/chosen": -350.1331449468085, "logps/rejected": -399.28329190340907, "loss": 0.4556, "rewards/chosen": 1.079000432440575, "rewards/margins": 3.400327179525684, "rewards/rejected": -2.321326747085109, "step": 760 }, { "count/fg_chosen": 26.789474487304688, "count/fg_rejected": 7.0, "epoch": 0.7897435897435897, "fg_kl": NaN, "fg_logps/policy_KL": -14.873922348022461, "fg_logps/policy_chosen": -7.308094024658203, "fg_logps/policy_rejected": -7.175009727478027, "fg_logps/reference_KL": -11.33347225189209, "fg_logps/reference_chosen": -6.172826766967773, "fg_logps/reference_rejected": -6.013812065124512, "fg_loss": 0.7144444584846497, "fg_rewards/chosen_sum": -2.5322296619415283, "fg_rewards/rejected_sum": -0.9527682065963745, "grad_norm": 39.24437423103846, "kl": 0.0, "learning_rate": 1.1744583808437855e-07, "logps/chosen": -331.80161458333333, "logps/rejected": -438.73373161764704, "loss": 0.4217, "rewards/chosen": 1.7163297526041668, "rewards/margins": 3.8777695360370714, "rewards/rejected": -2.1614397834329044, "step": 770 }, { "count/fg_chosen": 32.1875, "count/fg_rejected": 5.8125, "epoch": 0.8, "fg_kl": NaN, "fg_logps/policy_KL": -13.304938316345215, "fg_logps/policy_chosen": -6.930516242980957, "fg_logps/policy_rejected": -7.694925785064697, "fg_logps/reference_KL": -10.620773315429688, "fg_logps/reference_chosen": -6.272948265075684, "fg_logps/reference_rejected": -6.997465133666992, "fg_loss": 0.7136435508728027, "fg_rewards/chosen_sum": -1.7926644086837769, "fg_rewards/rejected_sum": -0.3232946991920471, "grad_norm": 24.909110123805196, "kl": 0.0, "learning_rate": 1.1174458380843785e-07, "logps/chosen": -303.2761627906977, "logps/rejected": -392.0064400337838, "loss": 0.4265, "rewards/chosen": 1.246623904206032, "rewards/margins": 3.1565479731724597, "rewards/rejected": -1.9099240689664274, "step": 780 }, { "count/fg_chosen": 31.0, "count/fg_rejected": 6.0, "epoch": 0.8102564102564103, "fg_kl": NaN, "fg_logps/policy_KL": -13.473762512207031, "fg_logps/policy_chosen": -6.058125972747803, "fg_logps/policy_rejected": -7.64716100692749, "fg_logps/reference_KL": -10.569419860839844, "fg_logps/reference_chosen": -5.636739730834961, "fg_logps/reference_rejected": -6.955509662628174, "fg_loss": 0.6918079257011414, "fg_rewards/chosen_sum": -0.8551385998725891, "fg_rewards/rejected_sum": -0.39899593591690063, "grad_norm": 22.682895979877173, "kl": 0.0, "learning_rate": 1.0604332953249714e-07, "logps/chosen": -329.2703077936747, "logps/rejected": -374.5271154626623, "loss": 0.4628, "rewards/chosen": 1.6247121052569653, "rewards/margins": 2.985569189456945, "rewards/rejected": -1.3608570841999796, "step": 790 }, { "count/fg_chosen": 32.46666717529297, "count/fg_rejected": 5.666666507720947, "epoch": 0.8205128205128205, "fg_kl": NaN, "fg_logps/policy_KL": -14.543877601623535, "fg_logps/policy_chosen": -7.2420654296875, "fg_logps/policy_rejected": -8.499801635742188, "fg_logps/reference_KL": -11.280120849609375, "fg_logps/reference_chosen": -6.594120502471924, "fg_logps/reference_rejected": -7.241811275482178, "fg_loss": 0.6977981925010681, "fg_rewards/chosen_sum": -1.8107116222381592, "fg_rewards/rejected_sum": -0.6922832727432251, "grad_norm": 39.98049317260069, "kl": 0.0, "learning_rate": 1.0034207525655644e-07, "logps/chosen": -359.1409722222222, "logps/rejected": -515.9350446428572, "loss": 0.4043, "rewards/chosen": 1.5236521402994792, "rewards/margins": 3.4145729428245906, "rewards/rejected": -1.8909208025251116, "step": 800 }, { "epoch": 0.8205128205128205, "eval_count/fg_chosen": 30.183246612548828, "eval_count/fg_rejected": 6.92391300201416, "eval_fg_kl": NaN, "eval_fg_logps/policy_KL": -14.794645309448242, "eval_fg_logps/policy_chosen": -6.733245849609375, "eval_fg_logps/policy_rejected": -8.626864433288574, "eval_fg_logps/reference_KL": -11.47359848022461, "eval_fg_logps/reference_chosen": -6.041894912719727, "eval_fg_logps/reference_rejected": -7.58065938949585, "eval_fg_loss": 0.762517511844635, "eval_fg_rewards/chosen_sum": -1.556026816368103, "eval_fg_rewards/rejected_sum": -0.9032577276229858, "eval_kl": 0.014131884090602398, "eval_logps/chosen": -336.04120131729667, "eval_logps/rejected": -406.1173232908459, "eval_loss": 0.41103363037109375, "eval_rewards/chosen": 1.7359535243503006, "eval_rewards/margins": 3.998709942730949, "eval_rewards/rejected": -2.262756418380649, "eval_runtime": 462.7715, "eval_samples_per_second": 3.745, "eval_steps_per_second": 0.938, "step": 800 }, { "count/fg_chosen": 25.733333587646484, "count/fg_rejected": 8.800000190734863, "epoch": 0.8307692307692308, "fg_kl": NaN, "fg_logps/policy_KL": -14.521686553955078, "fg_logps/policy_chosen": -6.098317623138428, "fg_logps/policy_rejected": -7.37031888961792, "fg_logps/reference_KL": -11.138436317443848, "fg_logps/reference_chosen": -5.529090881347656, "fg_logps/reference_rejected": -6.567668437957764, "fg_loss": 0.7037224173545837, "fg_rewards/chosen_sum": -1.2442917823791504, "fg_rewards/rejected_sum": -0.9032351970672607, "grad_norm": 39.139843341578626, "kl": 0.0, "learning_rate": 9.464082098061574e-08, "logps/chosen": -351.8864535108025, "logps/rejected": -414.8218453322785, "loss": 0.4442, "rewards/chosen": 1.5498073248215665, "rewards/margins": 3.332525065809996, "rewards/rejected": -1.7827177409884296, "step": 810 }, { "count/fg_chosen": 29.53333282470703, "count/fg_rejected": 8.714285850524902, "epoch": 0.841025641025641, "fg_kl": NaN, "fg_logps/policy_KL": -16.194475173950195, "fg_logps/policy_chosen": -7.472283840179443, "fg_logps/policy_rejected": -8.897085189819336, "fg_logps/reference_KL": -12.283650398254395, "fg_logps/reference_chosen": -6.056351184844971, "fg_logps/reference_rejected": -7.35612154006958, "fg_loss": 0.8785532712936401, "fg_rewards/chosen_sum": -2.9767565727233887, "fg_rewards/rejected_sum": -1.322200059890747, "grad_norm": 18.084277972168177, "kl": 0.11083474010229111, "learning_rate": 8.893956670467502e-08, "logps/chosen": -357.466950491573, "logps/rejected": -436.3477937940141, "loss": 0.4174, "rewards/chosen": 1.0562572693556882, "rewards/margins": 4.643260735464843, "rewards/rejected": -3.587003466109155, "step": 820 }, { "count/fg_chosen": 38.94117736816406, "count/fg_rejected": 9.133333206176758, "epoch": 0.8512820512820513, "fg_kl": NaN, "fg_logps/policy_KL": -16.23822021484375, "fg_logps/policy_chosen": -6.3059844970703125, "fg_logps/policy_rejected": -7.230159282684326, "fg_logps/reference_KL": -12.178247451782227, "fg_logps/reference_chosen": -5.708430290222168, "fg_logps/reference_rejected": -5.989579200744629, "fg_loss": 0.8925216794013977, "fg_rewards/chosen_sum": -1.6965184211730957, "fg_rewards/rejected_sum": -1.320400357246399, "grad_norm": 36.47201338586735, "kl": 0.0, "learning_rate": 8.323831242873432e-08, "logps/chosen": -357.6101471656977, "logps/rejected": -458.4434121621622, "loss": 0.4677, "rewards/chosen": 1.8875178847202034, "rewards/margins": 3.2487285789043776, "rewards/rejected": -1.3612106941841744, "step": 830 }, { "count/fg_chosen": 32.33333206176758, "count/fg_rejected": 6.941176414489746, "epoch": 0.8615384615384616, "fg_kl": NaN, "fg_logps/policy_KL": -13.08393669128418, "fg_logps/policy_chosen": -6.780251502990723, "fg_logps/policy_rejected": -7.011376857757568, "fg_logps/reference_KL": -10.381501197814941, "fg_logps/reference_chosen": -6.354065418243408, "fg_logps/reference_rejected": -6.308195114135742, "fg_loss": 0.7612662315368652, "fg_rewards/chosen_sum": -1.2925324440002441, "fg_rewards/rejected_sum": -0.4022027552127838, "grad_norm": 32.781155840821306, "kl": 0.0, "learning_rate": 7.753705815279361e-08, "logps/chosen": -331.03251953125, "logps/rejected": -414.158447265625, "loss": 0.4681, "rewards/chosen": 1.1188287734985352, "rewards/margins": 3.2083324432373046, "rewards/rejected": -2.0895036697387694, "step": 840 }, { "count/fg_chosen": 31.047618865966797, "count/fg_rejected": 9.380952835083008, "epoch": 0.8717948717948718, "fg_kl": NaN, "fg_logps/policy_KL": -14.173805236816406, "fg_logps/policy_chosen": -6.497308731079102, "fg_logps/policy_rejected": -8.013084411621094, "fg_logps/reference_KL": -10.995800018310547, "fg_logps/reference_chosen": -5.791131973266602, "fg_logps/reference_rejected": -7.026320457458496, "fg_loss": 0.8731069564819336, "fg_rewards/chosen_sum": -1.6361898183822632, "fg_rewards/rejected_sum": -1.0837599039077759, "grad_norm": 29.75869838237667, "kl": 0.0, "learning_rate": 7.183580387685291e-08, "logps/chosen": -385.02049512987014, "logps/rejected": -396.0473926957831, "loss": 0.4773, "rewards/chosen": 1.234356471470424, "rewards/margins": 3.495361367519464, "rewards/rejected": -2.26100489604904, "step": 850 }, { "count/fg_chosen": 30.14285659790039, "count/fg_rejected": 5.857142925262451, "epoch": 0.882051282051282, "fg_kl": NaN, "fg_logps/policy_KL": -15.854009628295898, "fg_logps/policy_chosen": -6.418879508972168, "fg_logps/policy_rejected": -8.779900550842285, "fg_logps/reference_KL": -12.032855033874512, "fg_logps/reference_chosen": -5.604477405548096, "fg_logps/reference_rejected": -7.313387870788574, "fg_loss": 0.8508789539337158, "fg_rewards/chosen_sum": -1.4381144046783447, "fg_rewards/rejected_sum": -0.7955017685890198, "grad_norm": 25.28218871544972, "kl": 0.0, "learning_rate": 6.613454960091219e-08, "logps/chosen": -300.05953414351853, "logps/rejected": -383.09023931962025, "loss": 0.3998, "rewards/chosen": 1.808587156696084, "rewards/margins": 3.9609218286823378, "rewards/rejected": -2.152334671986254, "step": 860 }, { "count/fg_chosen": 28.25, "count/fg_rejected": 5.800000190734863, "epoch": 0.8923076923076924, "fg_kl": NaN, "fg_logps/policy_KL": -14.163013458251953, "fg_logps/policy_chosen": -6.579830646514893, "fg_logps/policy_rejected": -7.748932361602783, "fg_logps/reference_KL": -10.930763244628906, "fg_logps/reference_chosen": -5.884364128112793, "fg_logps/reference_rejected": -6.846892356872559, "fg_loss": 0.7400026321411133, "fg_rewards/chosen_sum": -1.4189780950546265, "fg_rewards/rejected_sum": -0.8359920978546143, "grad_norm": 35.432818767338894, "kl": 0.0, "learning_rate": 6.043329532497149e-08, "logps/chosen": -343.70828125, "logps/rejected": -468.51098345588235, "loss": 0.3876, "rewards/chosen": 1.4050553385416666, "rewards/margins": 4.349431846469056, "rewards/rejected": -2.9443765079273896, "step": 870 }, { "count/fg_chosen": 23.538461685180664, "count/fg_rejected": 6.166666507720947, "epoch": 0.9025641025641026, "fg_kl": NaN, "fg_logps/policy_KL": -15.849803924560547, "fg_logps/policy_chosen": -6.889503002166748, "fg_logps/policy_rejected": -9.813651084899902, "fg_logps/reference_KL": -11.724679946899414, "fg_logps/reference_chosen": -6.115569114685059, "fg_logps/reference_rejected": -8.466334342956543, "fg_loss": 0.923600435256958, "fg_rewards/chosen_sum": -1.7626667022705078, "fg_rewards/rejected_sum": -0.8649892210960388, "grad_norm": 28.479901425326158, "kl": 0.0, "learning_rate": 5.4732041049030787e-08, "logps/chosen": -310.8540810032895, "logps/rejected": -370.5546642485119, "loss": 0.3796, "rewards/chosen": 2.1148808127955387, "rewards/margins": 4.169318722603016, "rewards/rejected": -2.054437909807478, "step": 880 }, { "count/fg_chosen": 28.647058486938477, "count/fg_rejected": 7.214285850524902, "epoch": 0.9128205128205128, "fg_kl": NaN, "fg_logps/policy_KL": -14.122632026672363, "fg_logps/policy_chosen": -6.575372219085693, "fg_logps/policy_rejected": -8.797600746154785, "fg_logps/reference_KL": -11.014878273010254, "fg_logps/reference_chosen": -6.147511005401611, "fg_logps/reference_rejected": -7.686254024505615, "fg_loss": 0.7267153263092041, "fg_rewards/chosen_sum": -1.0292545557022095, "fg_rewards/rejected_sum": -1.021606206893921, "grad_norm": 14.860543570560992, "kl": 0.0, "learning_rate": 4.9030786773090077e-08, "logps/chosen": -310.51392463235294, "logps/rejected": -438.26770833333336, "loss": 0.4036, "rewards/chosen": 1.7528151568244486, "rewards/margins": 4.661474010991116, "rewards/rejected": -2.9086588541666667, "step": 890 }, { "count/fg_chosen": 25.649999618530273, "count/fg_rejected": 5.099999904632568, "epoch": 0.9230769230769231, "fg_kl": NaN, "fg_logps/policy_KL": -15.193872451782227, "fg_logps/policy_chosen": -6.921626091003418, "fg_logps/policy_rejected": -9.522343635559082, "fg_logps/reference_KL": -11.5450439453125, "fg_logps/reference_chosen": -6.295389175415039, "fg_logps/reference_rejected": -8.422341346740723, "fg_loss": 0.8100715279579163, "fg_rewards/chosen_sum": -1.2453607320785522, "fg_rewards/rejected_sum": -0.6394971609115601, "grad_norm": 26.313505637704687, "kl": 0.0, "learning_rate": 4.332953249714937e-08, "logps/chosen": -379.3786095727848, "logps/rejected": -469.6590470679012, "loss": 0.4082, "rewards/chosen": 1.5717159222952928, "rewards/margins": 4.730814041206847, "rewards/rejected": -3.1590981189115546, "step": 900 }, { "count/fg_chosen": 31.809524536132812, "count/fg_rejected": 7.949999809265137, "epoch": 0.9333333333333333, "fg_kl": NaN, "fg_logps/policy_KL": -13.88550090789795, "fg_logps/policy_chosen": -6.575550556182861, "fg_logps/policy_rejected": -8.97218132019043, "fg_logps/reference_KL": -10.913046836853027, "fg_logps/reference_chosen": -5.786440372467041, "fg_logps/reference_rejected": -8.161312103271484, "fg_loss": 0.6918947696685791, "fg_rewards/chosen_sum": -1.8372831344604492, "fg_rewards/rejected_sum": -0.7708438038825989, "grad_norm": 30.704094667430244, "kl": 0.0, "learning_rate": 3.762827822120866e-08, "logps/chosen": -342.4310891544118, "logps/rejected": -465.26286458333334, "loss": 0.394, "rewards/chosen": 1.6974679385914522, "rewards/margins": 5.162705975700828, "rewards/rejected": -3.465238037109375, "step": 910 }, { "count/fg_chosen": 33.44444274902344, "count/fg_rejected": 7.05555534362793, "epoch": 0.9435897435897436, "fg_kl": NaN, "fg_logps/policy_KL": -16.852989196777344, "fg_logps/policy_chosen": -8.116382598876953, "fg_logps/policy_rejected": -9.44908332824707, "fg_logps/reference_KL": -12.696809768676758, "fg_logps/reference_chosen": -6.711515426635742, "fg_logps/reference_rejected": -8.2290678024292, "fg_loss": 0.8823240995407104, "fg_rewards/chosen_sum": -2.763169527053833, "fg_rewards/rejected_sum": -0.838661789894104, "grad_norm": 28.750070922344392, "kl": 0.0, "learning_rate": 3.192702394526796e-08, "logps/chosen": -369.98503449675326, "logps/rejected": -378.85448042168673, "loss": 0.455, "rewards/chosen": 1.3270432113052963, "rewards/margins": 4.1537548429882785, "rewards/rejected": -2.826711631682982, "step": 920 }, { "count/fg_chosen": 31.0, "count/fg_rejected": 7.222222328186035, "epoch": 0.9538461538461539, "fg_kl": NaN, "fg_logps/policy_KL": -16.319583892822266, "fg_logps/policy_chosen": -7.1412739753723145, "fg_logps/policy_rejected": -8.26430892944336, "fg_logps/reference_KL": -12.383834838867188, "fg_logps/reference_chosen": -6.26031494140625, "fg_logps/reference_rejected": -7.345766544342041, "fg_loss": 0.7640350461006165, "fg_rewards/chosen_sum": -1.9626283645629883, "fg_rewards/rejected_sum": -0.9573346376419067, "grad_norm": 30.27804248529551, "kl": 0.0, "learning_rate": 2.6225769669327253e-08, "logps/chosen": -328.4565281723485, "logps/rejected": -376.20595079787233, "loss": 0.3767, "rewards/chosen": 2.0332070552941524, "rewards/margins": 5.176523121305622, "rewards/rejected": -3.1433160660114696, "step": 930 }, { "count/fg_chosen": 30.625, "count/fg_rejected": 7.0, "epoch": 0.9641025641025641, "fg_kl": NaN, "fg_logps/policy_KL": -15.706071853637695, "fg_logps/policy_chosen": -7.53517484664917, "fg_logps/policy_rejected": -10.4346284866333, "fg_logps/reference_KL": -11.768562316894531, "fg_logps/reference_chosen": -6.489566326141357, "fg_logps/reference_rejected": -9.170198440551758, "fg_loss": 0.7636561989784241, "fg_rewards/chosen_sum": -2.5118489265441895, "fg_rewards/rejected_sum": -0.7650282979011536, "grad_norm": 39.27702261397548, "kl": 0.0, "learning_rate": 2.0524515393386543e-08, "logps/chosen": -346.63032670454544, "logps/rejected": -424.40479103915663, "loss": 0.4212, "rewards/chosen": 1.459620389071378, "rewards/margins": 4.2732272947213215, "rewards/rejected": -2.8136069056499435, "step": 940 }, { "count/fg_chosen": 25.272727966308594, "count/fg_rejected": 4.363636493682861, "epoch": 0.9743589743589743, "fg_kl": NaN, "fg_logps/policy_KL": -14.18252182006836, "fg_logps/policy_chosen": -6.842197895050049, "fg_logps/policy_rejected": -7.672128677368164, "fg_logps/reference_KL": -10.544868469238281, "fg_logps/reference_chosen": -5.763217449188232, "fg_logps/reference_rejected": -6.418917655944824, "fg_loss": 0.6513127088546753, "fg_rewards/chosen_sum": -1.7274693250656128, "fg_rewards/rejected_sum": -0.795689582824707, "grad_norm": 23.77562108897806, "kl": 0.0, "learning_rate": 1.4823261117445838e-08, "logps/chosen": -349.6753555689103, "logps/rejected": -403.2014100609756, "loss": 0.3828, "rewards/chosen": 1.3632464286608574, "rewards/margins": 3.730917743923815, "rewards/rejected": -2.367671315262957, "step": 950 }, { "count/fg_chosen": 26.649999618530273, "count/fg_rejected": 6.5, "epoch": 0.9846153846153847, "fg_kl": NaN, "fg_logps/policy_KL": -15.6975736618042, "fg_logps/policy_chosen": -7.5320539474487305, "fg_logps/policy_rejected": -9.357258796691895, "fg_logps/reference_KL": -12.039111137390137, "fg_logps/reference_chosen": -6.5699005126953125, "fg_logps/reference_rejected": -8.148908615112305, "fg_loss": 0.6992577910423279, "fg_rewards/chosen_sum": -1.6473188400268555, "fg_rewards/rejected_sum": -0.9504286646842957, "grad_norm": 29.616218116706296, "kl": 0.0, "learning_rate": 9.122006841505132e-09, "logps/chosen": -366.2652652138158, "logps/rejected": -412.12672061011904, "loss": 0.4654, "rewards/chosen": 0.8081491369950143, "rewards/margins": 3.221180578819791, "rewards/rejected": -2.413031441824777, "step": 960 }, { "count/fg_chosen": 27.294116973876953, "count/fg_rejected": 6.294117450714111, "epoch": 0.9948717948717949, "fg_kl": NaN, "fg_logps/policy_KL": -15.349953651428223, "fg_logps/policy_chosen": -7.822242259979248, "fg_logps/policy_rejected": -8.426324844360352, "fg_logps/reference_KL": -11.580068588256836, "fg_logps/reference_chosen": -6.358785629272461, "fg_logps/reference_rejected": -7.00697135925293, "fg_loss": 0.8312911987304688, "fg_rewards/chosen_sum": -2.5030882358551025, "fg_rewards/rejected_sum": -0.9714083671569824, "grad_norm": 26.639106291790316, "kl": 0.0, "learning_rate": 3.420752565564424e-09, "logps/chosen": -419.9399809966216, "logps/rejected": -412.7228379360465, "loss": 0.4666, "rewards/chosen": 0.5730146459631018, "rewards/margins": 2.736655932413865, "rewards/rejected": -2.1636412864507633, "step": 970 }, { "epoch": 1.0, "step": 975, "total_flos": 0.0, "train_loss": 0.45996271347388246, "train_runtime": 8430.3956, "train_samples_per_second": 1.85, "train_steps_per_second": 0.116 } ], "logging_steps": 10, "max_steps": 975, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }