{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 45.75, "learning_rate": 1.282051282051282e-07, "logits/chosen": 88.18099975585938, "logits/rejected": 88.25153350830078, "logps/chosen": -29.073104858398438, "logps/rejected": -26.25731658935547, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 38.75, "learning_rate": 1.282051282051282e-06, "logits/chosen": 81.08214569091797, "logits/rejected": 80.78972625732422, "logps/chosen": -34.26863098144531, "logps/rejected": -33.00303649902344, "loss": 0.7238, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": -0.018833572044968605, "rewards/margins": 0.012407698668539524, "rewards/rejected": -0.03124127723276615, "step": 10 }, { "epoch": 0.05, "grad_norm": 37.25, "learning_rate": 2.564102564102564e-06, "logits/chosen": 80.68824005126953, "logits/rejected": 80.57817840576172, "logps/chosen": -33.58771514892578, "logps/rejected": -30.75152015686035, "loss": 0.7285, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0715368390083313, "rewards/margins": 0.05335939675569534, "rewards/rejected": 0.01817743293941021, "step": 20 }, { "epoch": 0.08, "grad_norm": 39.0, "learning_rate": 3.846153846153847e-06, "logits/chosen": 82.5134506225586, "logits/rejected": 82.5453872680664, "logps/chosen": -33.79930877685547, "logps/rejected": -31.215984344482422, "loss": 0.7655, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.14945873618125916, "rewards/margins": 0.00396394869312644, "rewards/rejected": 0.14549477398395538, "step": 30 }, { "epoch": 0.1, "grad_norm": 44.5, "learning_rate": 4.999896948438434e-06, "logits/chosen": 81.0338363647461, "logits/rejected": 81.03011322021484, "logps/chosen": -32.87316131591797, "logps/rejected": -33.17707061767578, "loss": 0.7622, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.21063438057899475, "rewards/margins": 0.07661890983581543, "rewards/rejected": 0.13401541113853455, "step": 40 }, { "epoch": 0.13, "grad_norm": 29.125, "learning_rate": 4.987541037542187e-06, "logits/chosen": 78.6342544555664, "logits/rejected": 78.64932250976562, "logps/chosen": -30.660537719726562, "logps/rejected": -30.76174545288086, "loss": 0.7328, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.38129910826683044, "rewards/margins": 0.16322235763072968, "rewards/rejected": 0.21807675063610077, "step": 50 }, { "epoch": 0.16, "grad_norm": 38.0, "learning_rate": 4.954691471941119e-06, "logits/chosen": 83.13832092285156, "logits/rejected": 83.19276428222656, "logps/chosen": -30.93692970275879, "logps/rejected": -29.44403648376465, "loss": 0.7486, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.16676156222820282, "rewards/margins": 0.0642227977514267, "rewards/rejected": 0.10253874957561493, "step": 60 }, { "epoch": 0.18, "grad_norm": 64.5, "learning_rate": 4.901618883413549e-06, "logits/chosen": 83.7562255859375, "logits/rejected": 83.78288269042969, "logps/chosen": -30.605281829833984, "logps/rejected": -33.032676696777344, "loss": 0.7785, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.01661914773285389, "rewards/margins": 0.013097524642944336, "rewards/rejected": 0.003521624254062772, "step": 70 }, { "epoch": 0.21, "grad_norm": 42.5, "learning_rate": 4.828760511501322e-06, "logits/chosen": 81.37464904785156, "logits/rejected": 81.36463165283203, "logps/chosen": -31.443639755249023, "logps/rejected": -30.998950958251953, "loss": 0.6854, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04291192442178726, "rewards/margins": 0.211822509765625, "rewards/rejected": -0.16891059279441833, "step": 80 }, { "epoch": 0.23, "grad_norm": 38.5, "learning_rate": 4.7367166013034295e-06, "logits/chosen": 78.1283187866211, "logits/rejected": 78.10060119628906, "logps/chosen": -32.54193878173828, "logps/rejected": -31.2618408203125, "loss": 0.6597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06736886501312256, "rewards/margins": 0.2846258580684662, "rewards/rejected": -0.21725702285766602, "step": 90 }, { "epoch": 0.26, "grad_norm": 33.0, "learning_rate": 4.626245458345211e-06, "logits/chosen": 83.37786865234375, "logits/rejected": 83.40235900878906, "logps/chosen": -34.06679153442383, "logps/rejected": -31.954029083251953, "loss": 0.6331, "rewards/accuracies": 0.625, "rewards/chosen": 0.166738823056221, "rewards/margins": 0.30917495489120483, "rewards/rejected": -0.14243611693382263, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": 98.75102233886719, "eval_logits/rejected": 98.73809814453125, "eval_logps/chosen": -32.44057846069336, "eval_logps/rejected": -36.09393310546875, "eval_loss": 0.7266324758529663, "eval_rewards/accuracies": 0.545265793800354, "eval_rewards/chosen": 0.0018192834686487913, "eval_rewards/margins": 0.09100572764873505, "eval_rewards/rejected": -0.08918644487857819, "eval_runtime": 104.1233, "eval_samples_per_second": 3.294, "eval_steps_per_second": 0.413, "step": 100 }, { "epoch": 0.29, "grad_norm": 53.25, "learning_rate": 4.498257201263691e-06, "logits/chosen": 83.52274322509766, "logits/rejected": 83.4115219116211, "logps/chosen": -32.51097869873047, "logps/rejected": -32.80630874633789, "loss": 0.6201, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.35827895998954773, "rewards/margins": 0.46357136964797974, "rewards/rejected": -0.10529237985610962, "step": 110 }, { "epoch": 0.31, "grad_norm": 50.25, "learning_rate": 4.353806263777678e-06, "logits/chosen": 83.7201919555664, "logits/rejected": 83.82737731933594, "logps/chosen": -28.233470916748047, "logps/rejected": -35.50123977661133, "loss": 0.5892, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4872798025608063, "rewards/margins": 0.5167454481124878, "rewards/rejected": -0.029465626925230026, "step": 120 }, { "epoch": 0.34, "grad_norm": 26.0, "learning_rate": 4.1940827077152755e-06, "logits/chosen": 80.89537048339844, "logits/rejected": 80.91288757324219, "logps/chosen": -30.439437866210938, "logps/rejected": -32.11792755126953, "loss": 0.6342, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.32754257321357727, "rewards/margins": 0.457952082157135, "rewards/rejected": -0.13040950894355774, "step": 130 }, { "epoch": 0.36, "grad_norm": 30.5, "learning_rate": 4.0204024186666215e-06, "logits/chosen": 82.11260223388672, "logits/rejected": 82.12245178222656, "logps/chosen": -27.101327896118164, "logps/rejected": -33.005577087402344, "loss": 0.5503, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.24219810962677002, "rewards/margins": 0.6497381329536438, "rewards/rejected": -0.4075400233268738, "step": 140 }, { "epoch": 0.39, "grad_norm": 28.25, "learning_rate": 3.834196265035119e-06, "logits/chosen": 80.61543273925781, "logits/rejected": 80.58251953125, "logps/chosen": -28.909435272216797, "logps/rejected": -33.041297912597656, "loss": 0.5489, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.3357718586921692, "rewards/margins": 0.6681298613548279, "rewards/rejected": -0.3323580324649811, "step": 150 }, { "epoch": 0.42, "grad_norm": 55.5, "learning_rate": 3.636998309800573e-06, "logits/chosen": 82.49334716796875, "logits/rejected": 82.49332427978516, "logps/chosen": -33.531585693359375, "logps/rejected": -30.385196685791016, "loss": 0.6226, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4236120283603668, "rewards/margins": 0.7264719009399414, "rewards/rejected": -0.3028598725795746, "step": 160 }, { "epoch": 0.44, "grad_norm": 42.25, "learning_rate": 3.4304331721118078e-06, "logits/chosen": 83.25149536132812, "logits/rejected": 83.19024658203125, "logps/chosen": -30.89450454711914, "logps/rejected": -32.51388931274414, "loss": 0.5987, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2553495168685913, "rewards/margins": 0.601028323173523, "rewards/rejected": -0.3456788957118988, "step": 170 }, { "epoch": 0.47, "grad_norm": 34.25, "learning_rate": 3.2162026428305436e-06, "logits/chosen": 80.78834533691406, "logits/rejected": 80.77064514160156, "logps/chosen": -30.47861671447754, "logps/rejected": -31.64987564086914, "loss": 0.5119, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5029744505882263, "rewards/margins": 0.8478416204452515, "rewards/rejected": -0.34486719965934753, "step": 180 }, { "epoch": 0.49, "grad_norm": 21.125, "learning_rate": 2.996071664294641e-06, "logits/chosen": 82.49182891845703, "logits/rejected": 82.4795150756836, "logps/chosen": -30.340301513671875, "logps/rejected": -30.779190063476562, "loss": 0.6399, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2982791066169739, "rewards/margins": 0.5160930752754211, "rewards/rejected": -0.21781396865844727, "step": 190 }, { "epoch": 0.52, "grad_norm": 17.375, "learning_rate": 2.7718537898066833e-06, "logits/chosen": 77.9924545288086, "logits/rejected": 77.93614196777344, "logps/chosen": -33.81483459472656, "logps/rejected": -32.65379333496094, "loss": 0.6048, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6565331816673279, "rewards/margins": 0.7688080072402954, "rewards/rejected": -0.1122748851776123, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": 98.63562774658203, "eval_logits/rejected": 98.61270141601562, "eval_logps/chosen": -32.66818618774414, "eval_logps/rejected": -36.3549919128418, "eval_loss": 0.7483024001121521, "eval_rewards/accuracies": 0.5282392501831055, "eval_rewards/chosen": -0.15750552713871002, "eval_rewards/margins": 0.1144195944070816, "eval_rewards/rejected": -0.27192509174346924, "eval_runtime": 104.0056, "eval_samples_per_second": 3.298, "eval_steps_per_second": 0.413, "step": 200 }, { "epoch": 0.55, "grad_norm": 67.5, "learning_rate": 2.5453962426402006e-06, "logits/chosen": 80.62068176269531, "logits/rejected": 80.52841186523438, "logps/chosen": -33.23737716674805, "logps/rejected": -35.3394889831543, "loss": 0.5657, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.49584078788757324, "rewards/margins": 0.7432178258895874, "rewards/rejected": -0.2473769634962082, "step": 210 }, { "epoch": 0.57, "grad_norm": 22.875, "learning_rate": 2.3185646976551794e-06, "logits/chosen": 82.79103088378906, "logits/rejected": 82.86891174316406, "logps/chosen": -31.00775718688965, "logps/rejected": -31.1812801361084, "loss": 0.4699, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6588366627693176, "rewards/margins": 0.9754641652107239, "rewards/rejected": -0.31662750244140625, "step": 220 }, { "epoch": 0.6, "grad_norm": 39.75, "learning_rate": 2.0932279108998323e-06, "logits/chosen": 79.89860534667969, "logits/rejected": 79.95353698730469, "logps/chosen": -32.31645965576172, "logps/rejected": -34.39720153808594, "loss": 0.6045, "rewards/accuracies": 0.625, "rewards/chosen": 0.3425000309944153, "rewards/margins": 0.612037718296051, "rewards/rejected": -0.26953771710395813, "step": 230 }, { "epoch": 0.62, "grad_norm": 34.0, "learning_rate": 1.8712423238279358e-06, "logits/chosen": 82.30177307128906, "logits/rejected": 82.58096313476562, "logps/chosen": -30.619409561157227, "logps/rejected": -31.930099487304688, "loss": 0.4503, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.7480591535568237, "rewards/margins": 1.0266973972320557, "rewards/rejected": -0.2786383032798767, "step": 240 }, { "epoch": 0.65, "grad_norm": 37.25, "learning_rate": 1.6544367689701824e-06, "logits/chosen": 81.00114440917969, "logits/rejected": 81.05775451660156, "logps/chosen": -26.927043914794922, "logps/rejected": -30.175378799438477, "loss": 0.5818, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.46819525957107544, "rewards/margins": 0.6604421138763428, "rewards/rejected": -0.19224683940410614, "step": 250 }, { "epoch": 0.68, "grad_norm": 28.125, "learning_rate": 1.4445974030621963e-06, "logits/chosen": 78.21713256835938, "logits/rejected": 78.345458984375, "logps/chosen": -30.480411529541016, "logps/rejected": -36.508689880371094, "loss": 0.4701, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7564077377319336, "rewards/margins": 1.0413528680801392, "rewards/rejected": -0.28494516015052795, "step": 260 }, { "epoch": 0.7, "grad_norm": 24.875, "learning_rate": 1.243452991757889e-06, "logits/chosen": 77.48748779296875, "logits/rejected": 77.51399230957031, "logps/chosen": -30.899953842163086, "logps/rejected": -31.809417724609375, "loss": 0.5373, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.6152798533439636, "rewards/margins": 0.828079104423523, "rewards/rejected": -0.2127993404865265, "step": 270 }, { "epoch": 0.73, "grad_norm": 40.5, "learning_rate": 1.0526606671603523e-06, "logits/chosen": 80.2722396850586, "logits/rejected": 80.06110382080078, "logps/chosen": -31.229726791381836, "logps/rejected": -29.85305404663086, "loss": 0.6573, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.406115859746933, "rewards/margins": 0.5363569259643555, "rewards/rejected": -0.1302410513162613, "step": 280 }, { "epoch": 0.75, "grad_norm": 23.5, "learning_rate": 8.737922755071455e-07, "logits/chosen": 80.33818054199219, "logits/rejected": 80.25775146484375, "logps/chosen": -33.049842834472656, "logps/rejected": -32.65058135986328, "loss": 0.4554, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.7380836606025696, "rewards/margins": 1.1816825866699219, "rewards/rejected": -0.44359898567199707, "step": 290 }, { "epoch": 0.78, "grad_norm": 40.0, "learning_rate": 7.08321427484816e-07, "logits/chosen": 76.02481079101562, "logits/rejected": 76.12067413330078, "logps/chosen": -32.21509552001953, "logps/rejected": -29.180316925048828, "loss": 0.5829, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8334287405014038, "rewards/margins": 0.8882354497909546, "rewards/rejected": -0.054806679487228394, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": 98.6636962890625, "eval_logits/rejected": 98.63994598388672, "eval_logps/chosen": -32.58477020263672, "eval_logps/rejected": -36.28050231933594, "eval_loss": 0.7390850782394409, "eval_rewards/accuracies": 0.5485880374908447, "eval_rewards/chosen": -0.09911961853504181, "eval_rewards/margins": 0.12066645920276642, "eval_rewards/rejected": -0.21978609263896942, "eval_runtime": 103.9194, "eval_samples_per_second": 3.301, "eval_steps_per_second": 0.414, "step": 300 }, { "epoch": 0.81, "grad_norm": 31.5, "learning_rate": 5.576113578589035e-07, "logits/chosen": 83.141357421875, "logits/rejected": 83.17015075683594, "logps/chosen": -30.017724990844727, "logps/rejected": -32.537620544433594, "loss": 0.5115, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.5920647382736206, "rewards/margins": 0.8285657167434692, "rewards/rejected": -0.23650094866752625, "step": 310 }, { "epoch": 0.83, "grad_norm": 25.5, "learning_rate": 4.229036944380913e-07, "logits/chosen": 80.59849548339844, "logits/rejected": 80.60069274902344, "logps/chosen": -30.53042221069336, "logps/rejected": -29.161365509033203, "loss": 0.4928, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.7476966977119446, "rewards/margins": 0.9170078039169312, "rewards/rejected": -0.16931119561195374, "step": 320 }, { "epoch": 0.86, "grad_norm": 22.25, "learning_rate": 3.053082288996112e-07, "logits/chosen": 77.74131774902344, "logits/rejected": 77.79161071777344, "logps/chosen": -29.038299560546875, "logps/rejected": -32.908966064453125, "loss": 0.458, "rewards/accuracies": 0.75, "rewards/chosen": 0.9200155138969421, "rewards/margins": 1.111884355545044, "rewards/rejected": -0.19186890125274658, "step": 330 }, { "epoch": 0.88, "grad_norm": 57.25, "learning_rate": 2.0579377374915805e-07, "logits/chosen": 82.08992767333984, "logits/rejected": 82.12026977539062, "logps/chosen": -32.29141616821289, "logps/rejected": -33.880916595458984, "loss": 0.5523, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6445139050483704, "rewards/margins": 1.003070592880249, "rewards/rejected": -0.3585566580295563, "step": 340 }, { "epoch": 0.91, "grad_norm": 14.125, "learning_rate": 1.2518018074041684e-07, "logits/chosen": 81.13373565673828, "logits/rejected": 81.14064025878906, "logps/chosen": -32.35675048828125, "logps/rejected": -33.414161682128906, "loss": 0.4904, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.9008921384811401, "rewards/margins": 1.1265466213226318, "rewards/rejected": -0.22565443813800812, "step": 350 }, { "epoch": 0.94, "grad_norm": 27.625, "learning_rate": 6.41315865106129e-08, "logits/chosen": 82.62144470214844, "logits/rejected": 82.64656066894531, "logps/chosen": -28.411449432373047, "logps/rejected": -31.78824806213379, "loss": 0.5051, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7985339760780334, "rewards/margins": 0.8954124450683594, "rewards/rejected": -0.09687861800193787, "step": 360 }, { "epoch": 0.96, "grad_norm": 43.0, "learning_rate": 2.3150941078050325e-08, "logits/chosen": 82.0575942993164, "logits/rejected": 82.07881164550781, "logps/chosen": -31.82853126525879, "logps/rejected": -35.34919357299805, "loss": 0.6002, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7333434820175171, "rewards/margins": 0.9012172818183899, "rewards/rejected": -0.1678738296031952, "step": 370 }, { "epoch": 0.99, "grad_norm": 34.75, "learning_rate": 2.575864278703266e-09, "logits/chosen": 75.95097351074219, "logits/rejected": 75.82870483398438, "logps/chosen": -29.8321475982666, "logps/rejected": -28.438806533813477, "loss": 0.5739, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5207866430282593, "rewards/margins": 0.7155483365058899, "rewards/rejected": -0.19476178288459778, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.5948986065852178, "train_runtime": 2557.7017, "train_samples_per_second": 1.204, "train_steps_per_second": 0.151 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }