{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 1000,
  "global_step": 111303,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02695345138945042,
      "grad_norm": 1.7572588920593262,
      "learning_rate": 4.955077581017583e-05,
      "loss": 2.2431,
      "step": 1000
    },
    {
      "epoch": 0.05390690277890084,
      "grad_norm": 1.843064308166504,
      "learning_rate": 4.9101551620351654e-05,
      "loss": 1.8256,
      "step": 2000
    },
    {
      "epoch": 0.08086035416835126,
      "grad_norm": 1.7338684797286987,
      "learning_rate": 4.865232743052748e-05,
      "loss": 1.5467,
      "step": 3000
    },
    {
      "epoch": 0.10781380555780168,
      "grad_norm": 1.794568657875061,
      "learning_rate": 4.8203103240703305e-05,
      "loss": 1.3891,
      "step": 4000
    },
    {
      "epoch": 0.13476725694725208,
      "grad_norm": 1.6764180660247803,
      "learning_rate": 4.7753879050879134e-05,
      "loss": 1.3107,
      "step": 5000
    },
    {
      "epoch": 0.16172070833670252,
      "grad_norm": 1.5753543376922607,
      "learning_rate": 4.7304654861054956e-05,
      "loss": 1.2522,
      "step": 6000
    },
    {
      "epoch": 0.18867415972615292,
      "grad_norm": 1.5777322053909302,
      "learning_rate": 4.6855430671230785e-05,
      "loss": 1.2054,
      "step": 7000
    },
    {
      "epoch": 0.21562761111560336,
      "grad_norm": 1.6358872652053833,
      "learning_rate": 4.640620648140661e-05,
      "loss": 1.1639,
      "step": 8000
    },
    {
      "epoch": 0.24258106250505376,
      "grad_norm": 1.4646263122558594,
      "learning_rate": 4.595698229158244e-05,
      "loss": 1.1349,
      "step": 9000
    },
    {
      "epoch": 0.26953451389450417,
      "grad_norm": 1.8237578868865967,
      "learning_rate": 4.5507758101758266e-05,
      "loss": 1.1017,
      "step": 10000
    },
    {
      "epoch": 0.2964879652839546,
      "grad_norm": 4.1678619384765625,
      "learning_rate": 4.5058533911934095e-05,
      "loss": 1.0802,
      "step": 11000
    },
    {
      "epoch": 0.32344141667340504,
      "grad_norm": 1.6083228588104248,
      "learning_rate": 4.4609309722109924e-05,
      "loss": 1.0584,
      "step": 12000
    },
    {
      "epoch": 0.35039486806285547,
      "grad_norm": 1.5944287776947021,
      "learning_rate": 4.4160085532285746e-05,
      "loss": 1.0376,
      "step": 13000
    },
    {
      "epoch": 0.37734831945230585,
      "grad_norm": 1.5678300857543945,
      "learning_rate": 4.3710861342461575e-05,
      "loss": 1.0186,
      "step": 14000
    },
    {
      "epoch": 0.4043017708417563,
      "grad_norm": 1.5760828256607056,
      "learning_rate": 4.32616371526374e-05,
      "loss": 1.0006,
      "step": 15000
    },
    {
      "epoch": 0.4312552222312067,
      "grad_norm": 1.5422290563583374,
      "learning_rate": 4.281241296281323e-05,
      "loss": 0.9868,
      "step": 16000
    },
    {
      "epoch": 0.45820867362065715,
      "grad_norm": 1.8263678550720215,
      "learning_rate": 4.236318877298905e-05,
      "loss": 0.9737,
      "step": 17000
    },
    {
      "epoch": 0.4851621250101075,
      "grad_norm": 1.4930555820465088,
      "learning_rate": 4.191396458316488e-05,
      "loss": 0.9595,
      "step": 18000
    },
    {
      "epoch": 0.512115576399558,
      "grad_norm": 1.548412561416626,
      "learning_rate": 4.14647403933407e-05,
      "loss": 0.9448,
      "step": 19000
    },
    {
      "epoch": 0.5390690277890083,
      "grad_norm": 1.478004813194275,
      "learning_rate": 4.101551620351653e-05,
      "loss": 0.9351,
      "step": 20000
    },
    {
      "epoch": 0.5660224791784588,
      "grad_norm": 1.5493645668029785,
      "learning_rate": 4.056629201369235e-05,
      "loss": 0.9235,
      "step": 21000
    },
    {
      "epoch": 0.5929759305679092,
      "grad_norm": 1.5261894464492798,
      "learning_rate": 4.011706782386818e-05,
      "loss": 0.9133,
      "step": 22000
    },
    {
      "epoch": 0.6199293819573597,
      "grad_norm": 1.3742462396621704,
      "learning_rate": 3.9667843634044e-05,
      "loss": 0.9027,
      "step": 23000
    },
    {
      "epoch": 0.6468828333468101,
      "grad_norm": 1.451717495918274,
      "learning_rate": 3.921861944421983e-05,
      "loss": 0.8937,
      "step": 24000
    },
    {
      "epoch": 0.6738362847362604,
      "grad_norm": 1.5163180828094482,
      "learning_rate": 3.876939525439566e-05,
      "loss": 0.8837,
      "step": 25000
    },
    {
      "epoch": 0.7007897361257109,
      "grad_norm": 1.312432885169983,
      "learning_rate": 3.832017106457148e-05,
      "loss": 0.8735,
      "step": 26000
    },
    {
      "epoch": 0.7277431875151613,
      "grad_norm": 1.4663609266281128,
      "learning_rate": 3.787094687474731e-05,
      "loss": 0.8663,
      "step": 27000
    },
    {
      "epoch": 0.7546966389046117,
      "grad_norm": 1.457512617111206,
      "learning_rate": 3.742172268492314e-05,
      "loss": 0.8587,
      "step": 28000
    },
    {
      "epoch": 0.7816500902940622,
      "grad_norm": 1.4610168933868408,
      "learning_rate": 3.697249849509897e-05,
      "loss": 0.8514,
      "step": 29000
    },
    {
      "epoch": 0.8086035416835126,
      "grad_norm": 1.6427557468414307,
      "learning_rate": 3.652327430527479e-05,
      "loss": 0.8457,
      "step": 30000
    },
    {
      "epoch": 0.8355569930729629,
      "grad_norm": 1.3497254848480225,
      "learning_rate": 3.607405011545062e-05,
      "loss": 0.8372,
      "step": 31000
    },
    {
      "epoch": 0.8625104444624134,
      "grad_norm": 1.597374677658081,
      "learning_rate": 3.5624825925626444e-05,
      "loss": 0.8308,
      "step": 32000
    },
    {
      "epoch": 0.8894638958518638,
      "grad_norm": 1.5185362100601196,
      "learning_rate": 3.517560173580227e-05,
      "loss": 0.8262,
      "step": 33000
    },
    {
      "epoch": 0.9164173472413143,
      "grad_norm": 1.452099323272705,
      "learning_rate": 3.4726377545978095e-05,
      "loss": 0.8188,
      "step": 34000
    },
    {
      "epoch": 0.9433707986307647,
      "grad_norm": 1.4626882076263428,
      "learning_rate": 3.4277153356153925e-05,
      "loss": 0.8128,
      "step": 35000
    },
    {
      "epoch": 0.970324250020215,
      "grad_norm": 1.329575538635254,
      "learning_rate": 3.3827929166329754e-05,
      "loss": 0.8057,
      "step": 36000
    },
    {
      "epoch": 0.9972777014096655,
      "grad_norm": 1.3917378187179565,
      "learning_rate": 3.3378704976505576e-05,
      "loss": 0.8014,
      "step": 37000
    },
    {
      "epoch": 1.024231152799116,
      "grad_norm": 1.4865970611572266,
      "learning_rate": 3.2929480786681405e-05,
      "loss": 0.7947,
      "step": 38000
    },
    {
      "epoch": 1.0511846041885664,
      "grad_norm": 1.4632256031036377,
      "learning_rate": 3.248025659685723e-05,
      "loss": 0.7898,
      "step": 39000
    },
    {
      "epoch": 1.0781380555780167,
      "grad_norm": 1.5164929628372192,
      "learning_rate": 3.2031032407033056e-05,
      "loss": 0.7843,
      "step": 40000
    },
    {
      "epoch": 1.1050915069674672,
      "grad_norm": 1.4825857877731323,
      "learning_rate": 3.158180821720888e-05,
      "loss": 0.7796,
      "step": 41000
    },
    {
      "epoch": 1.1320449583569177,
      "grad_norm": 1.4526509046554565,
      "learning_rate": 3.113258402738471e-05,
      "loss": 0.7771,
      "step": 42000
    },
    {
      "epoch": 1.158998409746368,
      "grad_norm": 1.4606186151504517,
      "learning_rate": 3.068335983756053e-05,
      "loss": 0.7728,
      "step": 43000
    },
    {
      "epoch": 1.1859518611358184,
      "grad_norm": 3.3901302814483643,
      "learning_rate": 3.0234135647736362e-05,
      "loss": 0.7664,
      "step": 44000
    },
    {
      "epoch": 1.212905312525269,
      "grad_norm": 1.4377284049987793,
      "learning_rate": 2.9784911457912185e-05,
      "loss": 0.7656,
      "step": 45000
    },
    {
      "epoch": 1.2398587639147194,
      "grad_norm": 1.432142734527588,
      "learning_rate": 2.9335687268088014e-05,
      "loss": 0.7592,
      "step": 46000
    },
    {
      "epoch": 1.2668122153041697,
      "grad_norm": 1.3727272748947144,
      "learning_rate": 2.8886463078263843e-05,
      "loss": 0.7563,
      "step": 47000
    },
    {
      "epoch": 1.2937656666936201,
      "grad_norm": 1.5601295232772827,
      "learning_rate": 2.8437238888439665e-05,
      "loss": 0.7509,
      "step": 48000
    },
    {
      "epoch": 1.3207191180830704,
      "grad_norm": 1.4392520189285278,
      "learning_rate": 2.7988014698615494e-05,
      "loss": 0.7461,
      "step": 49000
    },
    {
      "epoch": 1.347672569472521,
      "grad_norm": 1.365379810333252,
      "learning_rate": 2.7538790508791316e-05,
      "loss": 0.7416,
      "step": 50000
    },
    {
      "epoch": 1.3746260208619714,
      "grad_norm": 1.352710247039795,
      "learning_rate": 2.7089566318967145e-05,
      "loss": 0.7403,
      "step": 51000
    },
    {
      "epoch": 1.4015794722514219,
      "grad_norm": 1.3562721014022827,
      "learning_rate": 2.664034212914297e-05,
      "loss": 0.7359,
      "step": 52000
    },
    {
      "epoch": 1.4285329236408721,
      "grad_norm": 1.627213954925537,
      "learning_rate": 2.61911179393188e-05,
      "loss": 0.7329,
      "step": 53000
    },
    {
      "epoch": 1.4554863750303226,
      "grad_norm": 1.4729622602462769,
      "learning_rate": 2.5741893749494622e-05,
      "loss": 0.7302,
      "step": 54000
    },
    {
      "epoch": 1.4824398264197731,
      "grad_norm": 1.4408637285232544,
      "learning_rate": 2.529266955967045e-05,
      "loss": 0.7283,
      "step": 55000
    },
    {
      "epoch": 1.5093932778092234,
      "grad_norm": 1.504920482635498,
      "learning_rate": 2.4843445369846277e-05,
      "loss": 0.7233,
      "step": 56000
    },
    {
      "epoch": 1.5363467291986739,
      "grad_norm": 1.4795109033584595,
      "learning_rate": 2.4394221180022103e-05,
      "loss": 0.7211,
      "step": 57000
    },
    {
      "epoch": 1.5633001805881244,
      "grad_norm": 1.4444007873535156,
      "learning_rate": 2.394499699019793e-05,
      "loss": 0.7162,
      "step": 58000
    },
    {
      "epoch": 1.5902536319775749,
      "grad_norm": 1.4556983709335327,
      "learning_rate": 2.3495772800373754e-05,
      "loss": 0.7156,
      "step": 59000
    },
    {
      "epoch": 1.6172070833670251,
      "grad_norm": 1.5000152587890625,
      "learning_rate": 2.304654861054958e-05,
      "loss": 0.7118,
      "step": 60000
    },
    {
      "epoch": 1.6441605347564756,
      "grad_norm": 1.385746717453003,
      "learning_rate": 2.259732442072541e-05,
      "loss": 0.7091,
      "step": 61000
    },
    {
      "epoch": 1.6711139861459259,
      "grad_norm": 1.5274490118026733,
      "learning_rate": 2.2148100230901235e-05,
      "loss": 0.7073,
      "step": 62000
    },
    {
      "epoch": 1.6980674375353764,
      "grad_norm": 1.411332607269287,
      "learning_rate": 2.169887604107706e-05,
      "loss": 0.703,
      "step": 63000
    },
    {
      "epoch": 1.7250208889248269,
      "grad_norm": 1.6316828727722168,
      "learning_rate": 2.124965185125289e-05,
      "loss": 0.702,
      "step": 64000
    },
    {
      "epoch": 1.7519743403142773,
      "grad_norm": 1.4710853099822998,
      "learning_rate": 2.0800427661428715e-05,
      "loss": 0.6995,
      "step": 65000
    },
    {
      "epoch": 1.7789277917037276,
      "grad_norm": 1.984465479850769,
      "learning_rate": 2.035120347160454e-05,
      "loss": 0.6976,
      "step": 66000
    },
    {
      "epoch": 1.805881243093178,
      "grad_norm": 1.5006201267242432,
      "learning_rate": 1.9901979281780366e-05,
      "loss": 0.6917,
      "step": 67000
    },
    {
      "epoch": 1.8328346944826284,
      "grad_norm": 1.4539721012115479,
      "learning_rate": 1.9452755091956192e-05,
      "loss": 0.6906,
      "step": 68000
    },
    {
      "epoch": 1.8597881458720789,
      "grad_norm": 1.4637404680252075,
      "learning_rate": 1.9003530902132018e-05,
      "loss": 0.6866,
      "step": 69000
    },
    {
      "epoch": 1.8867415972615293,
      "grad_norm": 1.4094914197921753,
      "learning_rate": 1.8554306712307847e-05,
      "loss": 0.6856,
      "step": 70000
    },
    {
      "epoch": 1.9136950486509798,
      "grad_norm": 1.828584909439087,
      "learning_rate": 1.8105082522483672e-05,
      "loss": 0.6838,
      "step": 71000
    },
    {
      "epoch": 1.9406485000404303,
      "grad_norm": 1.3690617084503174,
      "learning_rate": 1.7655858332659498e-05,
      "loss": 0.6815,
      "step": 72000
    },
    {
      "epoch": 1.9676019514298806,
      "grad_norm": 1.438297986984253,
      "learning_rate": 1.7206634142835324e-05,
      "loss": 0.678,
      "step": 73000
    },
    {
      "epoch": 1.9945554028193309,
      "grad_norm": 1.482967495918274,
      "learning_rate": 1.675740995301115e-05,
      "loss": 0.6761,
      "step": 74000
    },
    {
      "epoch": 2.0215088542087813,
      "grad_norm": 1.5024503469467163,
      "learning_rate": 1.6308185763186975e-05,
      "loss": 0.6719,
      "step": 75000
    },
    {
      "epoch": 2.048462305598232,
      "grad_norm": 1.444847583770752,
      "learning_rate": 1.5858961573362804e-05,
      "loss": 0.6715,
      "step": 76000
    },
    {
      "epoch": 2.0754157569876823,
      "grad_norm": 1.4710804224014282,
      "learning_rate": 1.540973738353863e-05,
      "loss": 0.6697,
      "step": 77000
    },
    {
      "epoch": 2.102369208377133,
      "grad_norm": 1.4638535976409912,
      "learning_rate": 1.4960513193714457e-05,
      "loss": 0.6676,
      "step": 78000
    },
    {
      "epoch": 2.1293226597665833,
      "grad_norm": 1.4491604566574097,
      "learning_rate": 1.4511289003890283e-05,
      "loss": 0.666,
      "step": 79000
    },
    {
      "epoch": 2.1562761111560333,
      "grad_norm": 1.5296227931976318,
      "learning_rate": 1.4062064814066108e-05,
      "loss": 0.6638,
      "step": 80000
    },
    {
      "epoch": 2.183229562545484,
      "grad_norm": 1.4576656818389893,
      "learning_rate": 1.3612840624241934e-05,
      "loss": 0.6603,
      "step": 81000
    },
    {
      "epoch": 2.2101830139349343,
      "grad_norm": 1.3864960670471191,
      "learning_rate": 1.3163616434417761e-05,
      "loss": 0.6593,
      "step": 82000
    },
    {
      "epoch": 2.237136465324385,
      "grad_norm": 1.7719892263412476,
      "learning_rate": 1.2714392244593587e-05,
      "loss": 0.6602,
      "step": 83000
    },
    {
      "epoch": 2.2640899167138353,
      "grad_norm": 1.3673619031906128,
      "learning_rate": 1.2265168054769415e-05,
      "loss": 0.656,
      "step": 84000
    },
    {
      "epoch": 2.291043368103286,
      "grad_norm": 1.4584944248199463,
      "learning_rate": 1.181594386494524e-05,
      "loss": 0.6536,
      "step": 85000
    },
    {
      "epoch": 2.317996819492736,
      "grad_norm": 1.3770402669906616,
      "learning_rate": 1.1366719675121066e-05,
      "loss": 0.6519,
      "step": 86000
    },
    {
      "epoch": 2.3449502708821863,
      "grad_norm": 1.3435842990875244,
      "learning_rate": 1.0917495485296893e-05,
      "loss": 0.6498,
      "step": 87000
    },
    {
      "epoch": 2.371903722271637,
      "grad_norm": 1.4926843643188477,
      "learning_rate": 1.0468271295472719e-05,
      "loss": 0.6497,
      "step": 88000
    },
    {
      "epoch": 2.3988571736610873,
      "grad_norm": 1.4428088665008545,
      "learning_rate": 1.0019047105648545e-05,
      "loss": 0.648,
      "step": 89000
    },
    {
      "epoch": 2.425810625050538,
      "grad_norm": 1.3555138111114502,
      "learning_rate": 9.569822915824372e-06,
      "loss": 0.6463,
      "step": 90000
    },
    {
      "epoch": 2.4527640764399883,
      "grad_norm": 1.8231887817382812,
      "learning_rate": 9.1205987260002e-06,
      "loss": 0.6441,
      "step": 91000
    },
    {
      "epoch": 2.4797175278294388,
      "grad_norm": 1.4294542074203491,
      "learning_rate": 8.671374536176025e-06,
      "loss": 0.6443,
      "step": 92000
    },
    {
      "epoch": 2.506670979218889,
      "grad_norm": 1.4158700704574585,
      "learning_rate": 8.22215034635185e-06,
      "loss": 0.6408,
      "step": 93000
    },
    {
      "epoch": 2.5336244306083393,
      "grad_norm": 1.3847190141677856,
      "learning_rate": 7.772926156527676e-06,
      "loss": 0.6407,
      "step": 94000
    },
    {
      "epoch": 2.56057788199779,
      "grad_norm": 1.4408886432647705,
      "learning_rate": 7.323701966703503e-06,
      "loss": 0.6408,
      "step": 95000
    },
    {
      "epoch": 2.5875313333872403,
      "grad_norm": 1.5943797826766968,
      "learning_rate": 6.87447777687933e-06,
      "loss": 0.6379,
      "step": 96000
    },
    {
      "epoch": 2.6144847847766908,
      "grad_norm": 1.4184848070144653,
      "learning_rate": 6.425253587055157e-06,
      "loss": 0.6367,
      "step": 97000
    },
    {
      "epoch": 2.641438236166141,
      "grad_norm": 1.4443740844726562,
      "learning_rate": 5.976029397230982e-06,
      "loss": 0.635,
      "step": 98000
    },
    {
      "epoch": 2.6683916875555918,
      "grad_norm": 1.5342421531677246,
      "learning_rate": 5.526805207406809e-06,
      "loss": 0.6349,
      "step": 99000
    },
    {
      "epoch": 2.695345138945042,
      "grad_norm": 1.4228054285049438,
      "learning_rate": 5.077581017582635e-06,
      "loss": 0.6349,
      "step": 100000
    },
    {
      "epoch": 2.7222985903344923,
      "grad_norm": 1.5616410970687866,
      "learning_rate": 4.628356827758461e-06,
      "loss": 0.6345,
      "step": 101000
    },
    {
      "epoch": 2.7492520417239428,
      "grad_norm": 1.4892140626907349,
      "learning_rate": 4.1791326379342876e-06,
      "loss": 0.6322,
      "step": 102000
    },
    {
      "epoch": 2.7762054931133933,
      "grad_norm": 1.48268723487854,
      "learning_rate": 3.7299084481101145e-06,
      "loss": 0.6318,
      "step": 103000
    },
    {
      "epoch": 2.8031589445028438,
      "grad_norm": 1.6489795446395874,
      "learning_rate": 3.2806842582859406e-06,
      "loss": 0.6295,
      "step": 104000
    },
    {
      "epoch": 2.830112395892294,
      "grad_norm": 1.458978295326233,
      "learning_rate": 2.8314600684617667e-06,
      "loss": 0.6291,
      "step": 105000
    },
    {
      "epoch": 2.8570658472817443,
      "grad_norm": 1.338670015335083,
      "learning_rate": 2.3822358786375928e-06,
      "loss": 0.6265,
      "step": 106000
    },
    {
      "epoch": 2.8840192986711948,
      "grad_norm": 1.3654705286026,
      "learning_rate": 1.9330116888134193e-06,
      "loss": 0.6259,
      "step": 107000
    },
    {
      "epoch": 2.9109727500606453,
      "grad_norm": 1.3295788764953613,
      "learning_rate": 1.4837874989892456e-06,
      "loss": 0.6274,
      "step": 108000
    },
    {
      "epoch": 2.9379262014500958,
      "grad_norm": 1.3103362321853638,
      "learning_rate": 1.0345633091650721e-06,
      "loss": 0.6261,
      "step": 109000
    },
    {
      "epoch": 2.9648796528395462,
      "grad_norm": 1.3937286138534546,
      "learning_rate": 5.853391193408983e-07,
      "loss": 0.6253,
      "step": 110000
    },
    {
      "epoch": 2.9918331042289967,
      "grad_norm": 1.3265410661697388,
      "learning_rate": 1.3611492951672462e-07,
      "loss": 0.6257,
      "step": 111000
    },
    {
      "epoch": 3.0,
      "step": 111303,
      "total_flos": 2.6556045146219667e+19,
      "train_loss": 0.8057731239829722,
      "train_runtime": 323766.3615,
      "train_samples_per_second": 88.005,
      "train_steps_per_second": 0.344
    }
  ],
  "logging_steps": 1000,
  "max_steps": 111303,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.6556045146219667e+19,
  "train_batch_size": 256,
  "trial_name": null,
  "trial_params": null
}