{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 1000, "global_step": 111303, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02695345138945042, "grad_norm": 1.7572588920593262, "learning_rate": 4.955077581017583e-05, "loss": 2.2431, "step": 1000 }, { "epoch": 0.05390690277890084, "grad_norm": 1.843064308166504, "learning_rate": 4.9101551620351654e-05, "loss": 1.8256, "step": 2000 }, { "epoch": 0.08086035416835126, "grad_norm": 1.7338684797286987, "learning_rate": 4.865232743052748e-05, "loss": 1.5467, "step": 3000 }, { "epoch": 0.10781380555780168, "grad_norm": 1.794568657875061, "learning_rate": 4.8203103240703305e-05, "loss": 1.3891, "step": 4000 }, { "epoch": 0.13476725694725208, "grad_norm": 1.6764180660247803, "learning_rate": 4.7753879050879134e-05, "loss": 1.3107, "step": 5000 }, { "epoch": 0.16172070833670252, "grad_norm": 1.5753543376922607, "learning_rate": 4.7304654861054956e-05, "loss": 1.2522, "step": 6000 }, { "epoch": 0.18867415972615292, "grad_norm": 1.5777322053909302, "learning_rate": 4.6855430671230785e-05, "loss": 1.2054, "step": 7000 }, { "epoch": 0.21562761111560336, "grad_norm": 1.6358872652053833, "learning_rate": 4.640620648140661e-05, "loss": 1.1639, "step": 8000 }, { "epoch": 0.24258106250505376, "grad_norm": 1.4646263122558594, "learning_rate": 4.595698229158244e-05, "loss": 1.1349, "step": 9000 }, { "epoch": 0.26953451389450417, "grad_norm": 1.8237578868865967, "learning_rate": 4.5507758101758266e-05, "loss": 1.1017, "step": 10000 }, { "epoch": 0.2964879652839546, "grad_norm": 4.1678619384765625, "learning_rate": 4.5058533911934095e-05, "loss": 1.0802, "step": 11000 }, { "epoch": 0.32344141667340504, "grad_norm": 1.6083228588104248, "learning_rate": 4.4609309722109924e-05, "loss": 1.0584, "step": 12000 }, { "epoch": 0.35039486806285547, "grad_norm": 1.5944287776947021, "learning_rate": 4.4160085532285746e-05, "loss": 1.0376, "step": 13000 }, { "epoch": 0.37734831945230585, "grad_norm": 1.5678300857543945, "learning_rate": 4.3710861342461575e-05, "loss": 1.0186, "step": 14000 }, { "epoch": 0.4043017708417563, "grad_norm": 1.5760828256607056, "learning_rate": 4.32616371526374e-05, "loss": 1.0006, "step": 15000 }, { "epoch": 0.4312552222312067, "grad_norm": 1.5422290563583374, "learning_rate": 4.281241296281323e-05, "loss": 0.9868, "step": 16000 }, { "epoch": 0.45820867362065715, "grad_norm": 1.8263678550720215, "learning_rate": 4.236318877298905e-05, "loss": 0.9737, "step": 17000 }, { "epoch": 0.4851621250101075, "grad_norm": 1.4930555820465088, "learning_rate": 4.191396458316488e-05, "loss": 0.9595, "step": 18000 }, { "epoch": 0.512115576399558, "grad_norm": 1.548412561416626, "learning_rate": 4.14647403933407e-05, "loss": 0.9448, "step": 19000 }, { "epoch": 0.5390690277890083, "grad_norm": 1.478004813194275, "learning_rate": 4.101551620351653e-05, "loss": 0.9351, "step": 20000 }, { "epoch": 0.5660224791784588, "grad_norm": 1.5493645668029785, "learning_rate": 4.056629201369235e-05, "loss": 0.9235, "step": 21000 }, { "epoch": 0.5929759305679092, "grad_norm": 1.5261894464492798, "learning_rate": 4.011706782386818e-05, "loss": 0.9133, "step": 22000 }, { "epoch": 0.6199293819573597, "grad_norm": 1.3742462396621704, "learning_rate": 3.9667843634044e-05, "loss": 0.9027, "step": 23000 }, { "epoch": 0.6468828333468101, "grad_norm": 1.451717495918274, "learning_rate": 3.921861944421983e-05, "loss": 0.8937, "step": 24000 }, { "epoch": 0.6738362847362604, "grad_norm": 1.5163180828094482, "learning_rate": 3.876939525439566e-05, "loss": 0.8837, "step": 25000 }, { "epoch": 0.7007897361257109, "grad_norm": 1.312432885169983, "learning_rate": 3.832017106457148e-05, "loss": 0.8735, "step": 26000 }, { "epoch": 0.7277431875151613, "grad_norm": 1.4663609266281128, "learning_rate": 3.787094687474731e-05, "loss": 0.8663, "step": 27000 }, { "epoch": 0.7546966389046117, "grad_norm": 1.457512617111206, "learning_rate": 3.742172268492314e-05, "loss": 0.8587, "step": 28000 }, { "epoch": 0.7816500902940622, "grad_norm": 1.4610168933868408, "learning_rate": 3.697249849509897e-05, "loss": 0.8514, "step": 29000 }, { "epoch": 0.8086035416835126, "grad_norm": 1.6427557468414307, "learning_rate": 3.652327430527479e-05, "loss": 0.8457, "step": 30000 }, { "epoch": 0.8355569930729629, "grad_norm": 1.3497254848480225, "learning_rate": 3.607405011545062e-05, "loss": 0.8372, "step": 31000 }, { "epoch": 0.8625104444624134, "grad_norm": 1.597374677658081, "learning_rate": 3.5624825925626444e-05, "loss": 0.8308, "step": 32000 }, { "epoch": 0.8894638958518638, "grad_norm": 1.5185362100601196, "learning_rate": 3.517560173580227e-05, "loss": 0.8262, "step": 33000 }, { "epoch": 0.9164173472413143, "grad_norm": 1.452099323272705, "learning_rate": 3.4726377545978095e-05, "loss": 0.8188, "step": 34000 }, { "epoch": 0.9433707986307647, "grad_norm": 1.4626882076263428, "learning_rate": 3.4277153356153925e-05, "loss": 0.8128, "step": 35000 }, { "epoch": 0.970324250020215, "grad_norm": 1.329575538635254, "learning_rate": 3.3827929166329754e-05, "loss": 0.8057, "step": 36000 }, { "epoch": 0.9972777014096655, "grad_norm": 1.3917378187179565, "learning_rate": 3.3378704976505576e-05, "loss": 0.8014, "step": 37000 }, { "epoch": 1.024231152799116, "grad_norm": 1.4865970611572266, "learning_rate": 3.2929480786681405e-05, "loss": 0.7947, "step": 38000 }, { "epoch": 1.0511846041885664, "grad_norm": 1.4632256031036377, "learning_rate": 3.248025659685723e-05, "loss": 0.7898, "step": 39000 }, { "epoch": 1.0781380555780167, "grad_norm": 1.5164929628372192, "learning_rate": 3.2031032407033056e-05, "loss": 0.7843, "step": 40000 }, { "epoch": 1.1050915069674672, "grad_norm": 1.4825857877731323, "learning_rate": 3.158180821720888e-05, "loss": 0.7796, "step": 41000 }, { "epoch": 1.1320449583569177, "grad_norm": 1.4526509046554565, "learning_rate": 3.113258402738471e-05, "loss": 0.7771, "step": 42000 }, { "epoch": 1.158998409746368, "grad_norm": 1.4606186151504517, "learning_rate": 3.068335983756053e-05, "loss": 0.7728, "step": 43000 }, { "epoch": 1.1859518611358184, "grad_norm": 3.3901302814483643, "learning_rate": 3.0234135647736362e-05, "loss": 0.7664, "step": 44000 }, { "epoch": 1.212905312525269, "grad_norm": 1.4377284049987793, "learning_rate": 2.9784911457912185e-05, "loss": 0.7656, "step": 45000 }, { "epoch": 1.2398587639147194, "grad_norm": 1.432142734527588, "learning_rate": 2.9335687268088014e-05, "loss": 0.7592, "step": 46000 }, { "epoch": 1.2668122153041697, "grad_norm": 1.3727272748947144, "learning_rate": 2.8886463078263843e-05, "loss": 0.7563, "step": 47000 }, { "epoch": 1.2937656666936201, "grad_norm": 1.5601295232772827, "learning_rate": 2.8437238888439665e-05, "loss": 0.7509, "step": 48000 }, { "epoch": 1.3207191180830704, "grad_norm": 1.4392520189285278, "learning_rate": 2.7988014698615494e-05, "loss": 0.7461, "step": 49000 }, { "epoch": 1.347672569472521, "grad_norm": 1.365379810333252, "learning_rate": 2.7538790508791316e-05, "loss": 0.7416, "step": 50000 }, { "epoch": 1.3746260208619714, "grad_norm": 1.352710247039795, "learning_rate": 2.7089566318967145e-05, "loss": 0.7403, "step": 51000 }, { "epoch": 1.4015794722514219, "grad_norm": 1.3562721014022827, "learning_rate": 2.664034212914297e-05, "loss": 0.7359, "step": 52000 }, { "epoch": 1.4285329236408721, "grad_norm": 1.627213954925537, "learning_rate": 2.61911179393188e-05, "loss": 0.7329, "step": 53000 }, { "epoch": 1.4554863750303226, "grad_norm": 1.4729622602462769, "learning_rate": 2.5741893749494622e-05, "loss": 0.7302, "step": 54000 }, { "epoch": 1.4824398264197731, "grad_norm": 1.4408637285232544, "learning_rate": 2.529266955967045e-05, "loss": 0.7283, "step": 55000 }, { "epoch": 1.5093932778092234, "grad_norm": 1.504920482635498, "learning_rate": 2.4843445369846277e-05, "loss": 0.7233, "step": 56000 }, { "epoch": 1.5363467291986739, "grad_norm": 1.4795109033584595, "learning_rate": 2.4394221180022103e-05, "loss": 0.7211, "step": 57000 }, { "epoch": 1.5633001805881244, "grad_norm": 1.4444007873535156, "learning_rate": 2.394499699019793e-05, "loss": 0.7162, "step": 58000 }, { "epoch": 1.5902536319775749, "grad_norm": 1.4556983709335327, "learning_rate": 2.3495772800373754e-05, "loss": 0.7156, "step": 59000 }, { "epoch": 1.6172070833670251, "grad_norm": 1.5000152587890625, "learning_rate": 2.304654861054958e-05, "loss": 0.7118, "step": 60000 }, { "epoch": 1.6441605347564756, "grad_norm": 1.385746717453003, "learning_rate": 2.259732442072541e-05, "loss": 0.7091, "step": 61000 }, { "epoch": 1.6711139861459259, "grad_norm": 1.5274490118026733, "learning_rate": 2.2148100230901235e-05, "loss": 0.7073, "step": 62000 }, { "epoch": 1.6980674375353764, "grad_norm": 1.411332607269287, "learning_rate": 2.169887604107706e-05, "loss": 0.703, "step": 63000 }, { "epoch": 1.7250208889248269, "grad_norm": 1.6316828727722168, "learning_rate": 2.124965185125289e-05, "loss": 0.702, "step": 64000 }, { "epoch": 1.7519743403142773, "grad_norm": 1.4710853099822998, "learning_rate": 2.0800427661428715e-05, "loss": 0.6995, "step": 65000 }, { "epoch": 1.7789277917037276, "grad_norm": 1.984465479850769, "learning_rate": 2.035120347160454e-05, "loss": 0.6976, "step": 66000 }, { "epoch": 1.805881243093178, "grad_norm": 1.5006201267242432, "learning_rate": 1.9901979281780366e-05, "loss": 0.6917, "step": 67000 }, { "epoch": 1.8328346944826284, "grad_norm": 1.4539721012115479, "learning_rate": 1.9452755091956192e-05, "loss": 0.6906, "step": 68000 }, { "epoch": 1.8597881458720789, "grad_norm": 1.4637404680252075, "learning_rate": 1.9003530902132018e-05, "loss": 0.6866, "step": 69000 }, { "epoch": 1.8867415972615293, "grad_norm": 1.4094914197921753, "learning_rate": 1.8554306712307847e-05, "loss": 0.6856, "step": 70000 }, { "epoch": 1.9136950486509798, "grad_norm": 1.828584909439087, "learning_rate": 1.8105082522483672e-05, "loss": 0.6838, "step": 71000 }, { "epoch": 1.9406485000404303, "grad_norm": 1.3690617084503174, "learning_rate": 1.7655858332659498e-05, "loss": 0.6815, "step": 72000 }, { "epoch": 1.9676019514298806, "grad_norm": 1.438297986984253, "learning_rate": 1.7206634142835324e-05, "loss": 0.678, "step": 73000 }, { "epoch": 1.9945554028193309, "grad_norm": 1.482967495918274, "learning_rate": 1.675740995301115e-05, "loss": 0.6761, "step": 74000 }, { "epoch": 2.0215088542087813, "grad_norm": 1.5024503469467163, "learning_rate": 1.6308185763186975e-05, "loss": 0.6719, "step": 75000 }, { "epoch": 2.048462305598232, "grad_norm": 1.444847583770752, "learning_rate": 1.5858961573362804e-05, "loss": 0.6715, "step": 76000 }, { "epoch": 2.0754157569876823, "grad_norm": 1.4710804224014282, "learning_rate": 1.540973738353863e-05, "loss": 0.6697, "step": 77000 }, { "epoch": 2.102369208377133, "grad_norm": 1.4638535976409912, "learning_rate": 1.4960513193714457e-05, "loss": 0.6676, "step": 78000 }, { "epoch": 2.1293226597665833, "grad_norm": 1.4491604566574097, "learning_rate": 1.4511289003890283e-05, "loss": 0.666, "step": 79000 }, { "epoch": 2.1562761111560333, "grad_norm": 1.5296227931976318, "learning_rate": 1.4062064814066108e-05, "loss": 0.6638, "step": 80000 }, { "epoch": 2.183229562545484, "grad_norm": 1.4576656818389893, "learning_rate": 1.3612840624241934e-05, "loss": 0.6603, "step": 81000 }, { "epoch": 2.2101830139349343, "grad_norm": 1.3864960670471191, "learning_rate": 1.3163616434417761e-05, "loss": 0.6593, "step": 82000 }, { "epoch": 2.237136465324385, "grad_norm": 1.7719892263412476, "learning_rate": 1.2714392244593587e-05, "loss": 0.6602, "step": 83000 }, { "epoch": 2.2640899167138353, "grad_norm": 1.3673619031906128, "learning_rate": 1.2265168054769415e-05, "loss": 0.656, "step": 84000 }, { "epoch": 2.291043368103286, "grad_norm": 1.4584944248199463, "learning_rate": 1.181594386494524e-05, "loss": 0.6536, "step": 85000 }, { "epoch": 2.317996819492736, "grad_norm": 1.3770402669906616, "learning_rate": 1.1366719675121066e-05, "loss": 0.6519, "step": 86000 }, { "epoch": 2.3449502708821863, "grad_norm": 1.3435842990875244, "learning_rate": 1.0917495485296893e-05, "loss": 0.6498, "step": 87000 }, { "epoch": 2.371903722271637, "grad_norm": 1.4926843643188477, "learning_rate": 1.0468271295472719e-05, "loss": 0.6497, "step": 88000 }, { "epoch": 2.3988571736610873, "grad_norm": 1.4428088665008545, "learning_rate": 1.0019047105648545e-05, "loss": 0.648, "step": 89000 }, { "epoch": 2.425810625050538, "grad_norm": 1.3555138111114502, "learning_rate": 9.569822915824372e-06, "loss": 0.6463, "step": 90000 }, { "epoch": 2.4527640764399883, "grad_norm": 1.8231887817382812, "learning_rate": 9.1205987260002e-06, "loss": 0.6441, "step": 91000 }, { "epoch": 2.4797175278294388, "grad_norm": 1.4294542074203491, "learning_rate": 8.671374536176025e-06, "loss": 0.6443, "step": 92000 }, { "epoch": 2.506670979218889, "grad_norm": 1.4158700704574585, "learning_rate": 8.22215034635185e-06, "loss": 0.6408, "step": 93000 }, { "epoch": 2.5336244306083393, "grad_norm": 1.3847190141677856, "learning_rate": 7.772926156527676e-06, "loss": 0.6407, "step": 94000 }, { "epoch": 2.56057788199779, "grad_norm": 1.4408886432647705, "learning_rate": 7.323701966703503e-06, "loss": 0.6408, "step": 95000 }, { "epoch": 2.5875313333872403, "grad_norm": 1.5943797826766968, "learning_rate": 6.87447777687933e-06, "loss": 0.6379, "step": 96000 }, { "epoch": 2.6144847847766908, "grad_norm": 1.4184848070144653, "learning_rate": 6.425253587055157e-06, "loss": 0.6367, "step": 97000 }, { "epoch": 2.641438236166141, "grad_norm": 1.4443740844726562, "learning_rate": 5.976029397230982e-06, "loss": 0.635, "step": 98000 }, { "epoch": 2.6683916875555918, "grad_norm": 1.5342421531677246, "learning_rate": 5.526805207406809e-06, "loss": 0.6349, "step": 99000 }, { "epoch": 2.695345138945042, "grad_norm": 1.4228054285049438, "learning_rate": 5.077581017582635e-06, "loss": 0.6349, "step": 100000 }, { "epoch": 2.7222985903344923, "grad_norm": 1.5616410970687866, "learning_rate": 4.628356827758461e-06, "loss": 0.6345, "step": 101000 }, { "epoch": 2.7492520417239428, "grad_norm": 1.4892140626907349, "learning_rate": 4.1791326379342876e-06, "loss": 0.6322, "step": 102000 }, { "epoch": 2.7762054931133933, "grad_norm": 1.48268723487854, "learning_rate": 3.7299084481101145e-06, "loss": 0.6318, "step": 103000 }, { "epoch": 2.8031589445028438, "grad_norm": 1.6489795446395874, "learning_rate": 3.2806842582859406e-06, "loss": 0.6295, "step": 104000 }, { "epoch": 2.830112395892294, "grad_norm": 1.458978295326233, "learning_rate": 2.8314600684617667e-06, "loss": 0.6291, "step": 105000 }, { "epoch": 2.8570658472817443, "grad_norm": 1.338670015335083, "learning_rate": 2.3822358786375928e-06, "loss": 0.6265, "step": 106000 }, { "epoch": 2.8840192986711948, "grad_norm": 1.3654705286026, "learning_rate": 1.9330116888134193e-06, "loss": 0.6259, "step": 107000 }, { "epoch": 2.9109727500606453, "grad_norm": 1.3295788764953613, "learning_rate": 1.4837874989892456e-06, "loss": 0.6274, "step": 108000 }, { "epoch": 2.9379262014500958, "grad_norm": 1.3103362321853638, "learning_rate": 1.0345633091650721e-06, "loss": 0.6261, "step": 109000 }, { "epoch": 2.9648796528395462, "grad_norm": 1.3937286138534546, "learning_rate": 5.853391193408983e-07, "loss": 0.6253, "step": 110000 }, { "epoch": 2.9918331042289967, "grad_norm": 1.3265410661697388, "learning_rate": 1.3611492951672462e-07, "loss": 0.6257, "step": 111000 }, { "epoch": 3.0, "step": 111303, "total_flos": 2.6556045146219667e+19, "train_loss": 0.8057731239829722, "train_runtime": 323766.3615, "train_samples_per_second": 88.005, "train_steps_per_second": 0.344 } ], "logging_steps": 1000, "max_steps": 111303, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.6556045146219667e+19, "train_batch_size": 256, "trial_name": null, "trial_params": null }