{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 314, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 1.5625e-08, "logits/generated": -3.0857884883880615, "logits/real": -2.988919258117676, "logps/generated": -105.47663879394531, "logps/real": -159.43017578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/generated": 0.0, "rewards/margins": 0.0, "rewards/real": 0.0, "step": 1 }, { "epoch": 0.06, "learning_rate": 1.5624999999999999e-07, "logits/generated": -3.0524420738220215, "logits/real": -2.888369560241699, "logps/generated": -98.16116333007812, "logps/real": -124.24053192138672, "loss": 0.4499, "rewards/accuracies": 0.7916666865348816, "rewards/generated": -0.5709724426269531, "rewards/margins": 0.7495355606079102, "rewards/real": 0.17856311798095703, "step": 10 }, { "epoch": 0.13, "learning_rate": 3.1249999999999997e-07, "logits/generated": -3.0165483951568604, "logits/real": -2.757232189178467, "logps/generated": -125.3254165649414, "logps/real": -120.93900299072266, "loss": 0.0982, "rewards/accuracies": 1.0, "rewards/generated": -2.5599706172943115, "rewards/margins": 3.3923709392547607, "rewards/real": 0.8324005007743835, "step": 20 }, { "epoch": 0.19, "learning_rate": 4.6874999999999996e-07, "logits/generated": -2.9717984199523926, "logits/real": -2.703868865966797, "logps/generated": -137.14996337890625, "logps/real": -138.37025451660156, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/generated": -3.964935302734375, "rewards/margins": 4.9714460372924805, "rewards/real": 1.0065107345581055, "step": 30 }, { "epoch": 0.25, "learning_rate": 4.858156028368794e-07, "logits/generated": -2.896350145339966, "logits/real": -2.632850408554077, "logps/generated": -149.0387420654297, "logps/real": -131.4845733642578, "loss": 0.0209, "rewards/accuracies": 0.987500011920929, "rewards/generated": -5.3386735916137695, "rewards/margins": 6.378259658813477, "rewards/real": 1.0395863056182861, "step": 40 }, { "epoch": 0.32, "learning_rate": 4.6808510638297873e-07, "logits/generated": -2.864506244659424, "logits/real": -2.635793685913086, "logps/generated": -161.58311462402344, "logps/real": -133.5462646484375, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/generated": -6.72866678237915, "rewards/margins": 7.655924320220947, "rewards/real": 0.9272577166557312, "step": 50 }, { "epoch": 0.38, "learning_rate": 4.50354609929078e-07, "logits/generated": -2.8477931022644043, "logits/real": -2.5890610218048096, "logps/generated": -163.30792236328125, "logps/real": -120.96208190917969, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/generated": -7.271246433258057, "rewards/margins": 7.847572326660156, "rewards/real": 0.5763252973556519, "step": 60 }, { "epoch": 0.45, "learning_rate": 4.326241134751773e-07, "logits/generated": -2.817225694656372, "logits/real": -2.618878126144409, "logps/generated": -178.3530731201172, "logps/real": -133.58116149902344, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/generated": -8.513948440551758, "rewards/margins": 9.06941032409668, "rewards/real": 0.5554608702659607, "step": 70 }, { "epoch": 0.51, "learning_rate": 4.148936170212766e-07, "logits/generated": -2.743182420730591, "logits/real": -2.5085911750793457, "logps/generated": -182.21102905273438, "logps/real": -141.52438354492188, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/generated": -9.143477439880371, "rewards/margins": 8.894549369812012, "rewards/real": -0.24892878532409668, "step": 80 }, { "epoch": 0.57, "learning_rate": 3.971631205673759e-07, "logits/generated": -2.667142152786255, "logits/real": -2.3680872917175293, "logps/generated": -221.4196014404297, "logps/real": -156.9970703125, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/generated": -11.756684303283691, "rewards/margins": 11.24407958984375, "rewards/real": -0.5126041769981384, "step": 90 }, { "epoch": 0.64, "learning_rate": 3.7943262411347514e-07, "logits/generated": -2.716907024383545, "logits/real": -2.384260654449463, "logps/generated": -195.7281951904297, "logps/real": -141.03294372558594, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/generated": -10.495864868164062, "rewards/margins": 10.102469444274902, "rewards/real": -0.39339518547058105, "step": 100 }, { "epoch": 0.64, "eval_logits/generated": -2.708615779876709, "eval_logits/real": -2.4488255977630615, "eval_logps/generated": -208.8447265625, "eval_logps/real": -143.2593536376953, "eval_loss": 0.008355233818292618, "eval_rewards/accuracies": 0.9992038011550903, "eval_rewards/generated": -11.555154800415039, "eval_rewards/margins": 10.695781707763672, "eval_rewards/real": -0.8593728542327881, "eval_runtime": 338.0923, "eval_samples_per_second": 14.789, "eval_steps_per_second": 0.464, "step": 100 }, { "epoch": 0.7, "learning_rate": 3.617021276595745e-07, "logits/generated": -2.6799185276031494, "logits/real": -2.4362854957580566, "logps/generated": -229.9918975830078, "logps/real": -144.79042053222656, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/generated": -13.137945175170898, "rewards/margins": 11.87834358215332, "rewards/real": -1.259602427482605, "step": 110 }, { "epoch": 0.76, "learning_rate": 3.4397163120567375e-07, "logits/generated": -2.70621919631958, "logits/real": -2.4253954887390137, "logps/generated": -228.62405395507812, "logps/real": -158.47494506835938, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/generated": -13.207649230957031, "rewards/margins": 12.149269104003906, "rewards/real": -1.0583809614181519, "step": 120 }, { "epoch": 0.83, "learning_rate": 3.2624113475177305e-07, "logits/generated": -2.662909746170044, "logits/real": -2.3800911903381348, "logps/generated": -227.56619262695312, "logps/real": -156.76431274414062, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/generated": -13.599740982055664, "rewards/margins": 12.402183532714844, "rewards/real": -1.1975574493408203, "step": 130 }, { "epoch": 0.89, "learning_rate": 3.085106382978723e-07, "logits/generated": -2.69126033782959, "logits/real": -2.4004807472229004, "logps/generated": -229.597900390625, "logps/real": -146.2201385498047, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/generated": -13.650607109069824, "rewards/margins": 12.299604415893555, "rewards/real": -1.3510032892227173, "step": 140 }, { "epoch": 0.96, "learning_rate": 2.907801418439716e-07, "logits/generated": -2.6775763034820557, "logits/real": -2.324918508529663, "logps/generated": -218.3529510498047, "logps/real": -140.95547485351562, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/generated": -12.812429428100586, "rewards/margins": 12.05975341796875, "rewards/real": -0.752677321434021, "step": 150 }, { "epoch": 1.02, "learning_rate": 2.730496453900709e-07, "logits/generated": -2.6712429523468018, "logits/real": -2.2760770320892334, "logps/generated": -196.49375915527344, "logps/real": -121.7732925415039, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/generated": -10.348185539245605, "rewards/margins": 9.94708251953125, "rewards/real": -0.40110310912132263, "step": 160 }, { "epoch": 1.08, "learning_rate": 2.5531914893617016e-07, "logits/generated": -2.6528658866882324, "logits/real": -2.1584651470184326, "logps/generated": -213.1004638671875, "logps/real": -121.10084533691406, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/generated": -11.93848991394043, "rewards/margins": 11.793214797973633, "rewards/real": -0.14527548849582672, "step": 170 }, { "epoch": 1.15, "learning_rate": 2.375886524822695e-07, "logits/generated": -2.6257143020629883, "logits/real": -2.2728638648986816, "logps/generated": -228.39248657226562, "logps/real": -152.61045837402344, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/generated": -13.100919723510742, "rewards/margins": 12.766759872436523, "rewards/real": -0.3341600298881531, "step": 180 }, { "epoch": 1.21, "learning_rate": 2.198581560283688e-07, "logits/generated": -2.609994411468506, "logits/real": -2.2576498985290527, "logps/generated": -222.8015594482422, "logps/real": -145.87921142578125, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/generated": -13.061747550964355, "rewards/margins": 12.396347999572754, "rewards/real": -0.6654000282287598, "step": 190 }, { "epoch": 1.27, "learning_rate": 2.0212765957446807e-07, "logits/generated": -2.600148916244507, "logits/real": -2.1764094829559326, "logps/generated": -236.9789581298828, "logps/real": -137.09503173828125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/generated": -14.19157886505127, "rewards/margins": 13.38233470916748, "rewards/real": -0.8092441558837891, "step": 200 }, { "epoch": 1.27, "eval_logits/generated": -2.600484609603882, "eval_logits/real": -2.2709195613861084, "eval_logps/generated": -234.47348022460938, "eval_logps/real": -144.0410919189453, "eval_loss": 0.0056638033129274845, "eval_rewards/accuracies": 1.0, "eval_rewards/generated": -14.118030548095703, "eval_rewards/margins": 13.180484771728516, "eval_rewards/real": -0.9375430345535278, "eval_runtime": 334.9223, "eval_samples_per_second": 14.929, "eval_steps_per_second": 0.469, "step": 200 }, { "epoch": 1.34, "learning_rate": 1.8439716312056735e-07, "logits/generated": -2.617143154144287, "logits/real": -2.263977289199829, "logps/generated": -242.97866821289062, "logps/real": -152.24502563476562, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/generated": -14.723902702331543, "rewards/margins": 13.91853141784668, "rewards/real": -0.8053719401359558, "step": 210 }, { "epoch": 1.4, "learning_rate": 1.6666666666666665e-07, "logits/generated": -2.5576019287109375, "logits/real": -2.3169806003570557, "logps/generated": -233.3745880126953, "logps/real": -145.44570922851562, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/generated": -14.311078071594238, "rewards/margins": 13.486970901489258, "rewards/real": -0.8241075277328491, "step": 220 }, { "epoch": 1.46, "learning_rate": 1.4893617021276595e-07, "logits/generated": -2.553480386734009, "logits/real": -2.315396785736084, "logps/generated": -243.16702270507812, "logps/real": -168.13119506835938, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/generated": -14.701271057128906, "rewards/margins": 13.865699768066406, "rewards/real": -0.8355696797370911, "step": 230 }, { "epoch": 1.53, "learning_rate": 1.3120567375886523e-07, "logits/generated": -2.6132516860961914, "logits/real": -2.2840161323547363, "logps/generated": -246.56961059570312, "logps/real": -152.701171875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/generated": -14.995321273803711, "rewards/margins": 14.276300430297852, "rewards/real": -0.7190229892730713, "step": 240 }, { "epoch": 1.59, "learning_rate": 1.1347517730496453e-07, "logits/generated": -2.591722249984741, "logits/real": -2.277804374694824, "logps/generated": -263.14129638671875, "logps/real": -158.726806640625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/generated": -16.25307273864746, "rewards/margins": 15.087217330932617, "rewards/real": -1.1658554077148438, "step": 250 }, { "epoch": 1.66, "learning_rate": 9.574468085106382e-08, "logits/generated": -2.569918155670166, "logits/real": -2.1422371864318848, "logps/generated": -244.6776123046875, "logps/real": -145.0582733154297, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/generated": -15.619791984558105, "rewards/margins": 14.159820556640625, "rewards/real": -1.4599710702896118, "step": 260 }, { "epoch": 1.72, "learning_rate": 7.801418439716311e-08, "logits/generated": -2.539750099182129, "logits/real": -2.137476682662964, "logps/generated": -265.46429443359375, "logps/real": -163.37588500976562, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/generated": -16.41511344909668, "rewards/margins": 14.795074462890625, "rewards/real": -1.6200393438339233, "step": 270 }, { "epoch": 1.78, "learning_rate": 6.02836879432624e-08, "logits/generated": -2.574856996536255, "logits/real": -2.0835378170013428, "logps/generated": -262.6173095703125, "logps/real": -142.27984619140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/generated": -16.557262420654297, "rewards/margins": 15.236516952514648, "rewards/real": -1.3207473754882812, "step": 280 }, { "epoch": 1.85, "learning_rate": 4.25531914893617e-08, "logits/generated": -2.5489261150360107, "logits/real": -2.2142765522003174, "logps/generated": -268.306396484375, "logps/real": -147.05963134765625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/generated": -17.089282989501953, "rewards/margins": 15.543319702148438, "rewards/real": -1.5459634065628052, "step": 290 }, { "epoch": 1.91, "learning_rate": 2.4822695035460993e-08, "logits/generated": -2.556087017059326, "logits/real": -2.1201999187469482, "logps/generated": -249.31875610351562, "logps/real": -138.080322265625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/generated": -15.890460014343262, "rewards/margins": 14.16200065612793, "rewards/real": -1.728456735610962, "step": 300 }, { "epoch": 1.91, "eval_logits/generated": -2.5437142848968506, "eval_logits/real": -2.19522762298584, "eval_logps/generated": -257.48858642578125, "eval_logps/real": -151.0966796875, "eval_loss": 0.005383754149079323, "eval_rewards/accuracies": 1.0, "eval_rewards/generated": -16.419538497924805, "eval_rewards/margins": 14.776435852050781, "eval_rewards/real": -1.6431050300598145, "eval_runtime": 335.2989, "eval_samples_per_second": 14.912, "eval_steps_per_second": 0.468, "step": 300 }, { "epoch": 1.97, "learning_rate": 7.092198581560283e-09, "logits/generated": -2.5656914710998535, "logits/real": -2.300166368484497, "logps/generated": -260.5529479980469, "logps/real": -153.91053771972656, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/generated": -16.434940338134766, "rewards/margins": 14.854705810546875, "rewards/real": -1.5802339315414429, "step": 310 }, { "epoch": 2.0, "step": 314, "total_flos": 0.0, "train_loss": 0.025043586236395084, "train_runtime": 2649.1704, "train_samples_per_second": 3.775, "train_steps_per_second": 0.119 } ], "logging_steps": 10, "max_steps": 314, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }