mini_qwen_pt / trainer_state.json
qiufengqijun
Upload weights
b42ae7d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999537845806875,
"eval_steps": 500,
"global_step": 13523,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014788934179999814,
"grad_norm": 1.9570056200027466,
"learning_rate": 1.4781966001478198e-06,
"loss": 12.123,
"step": 20
},
{
"epoch": 0.002957786835999963,
"grad_norm": 3.435842990875244,
"learning_rate": 2.9563932002956396e-06,
"loss": 11.8952,
"step": 40
},
{
"epoch": 0.0044366802539999445,
"grad_norm": 1.3055179119110107,
"learning_rate": 4.434589800443459e-06,
"loss": 11.1244,
"step": 60
},
{
"epoch": 0.005915573671999926,
"grad_norm": 1.1435202360153198,
"learning_rate": 5.912786400591279e-06,
"loss": 10.6584,
"step": 80
},
{
"epoch": 0.007394467089999908,
"grad_norm": 1.1122593879699707,
"learning_rate": 7.390983000739099e-06,
"loss": 10.3924,
"step": 100
},
{
"epoch": 0.008873360507999889,
"grad_norm": 1.0903944969177246,
"learning_rate": 8.869179600886918e-06,
"loss": 10.1278,
"step": 120
},
{
"epoch": 0.010352253925999871,
"grad_norm": 1.0405408143997192,
"learning_rate": 1.0347376201034738e-05,
"loss": 9.829,
"step": 140
},
{
"epoch": 0.011831147343999851,
"grad_norm": 1.032538652420044,
"learning_rate": 1.1825572801182558e-05,
"loss": 9.4957,
"step": 160
},
{
"epoch": 0.013310040761999833,
"grad_norm": 1.4152177572250366,
"learning_rate": 1.3303769401330377e-05,
"loss": 9.1722,
"step": 180
},
{
"epoch": 0.014788934179999816,
"grad_norm": 0.8978266716003418,
"learning_rate": 1.4781966001478198e-05,
"loss": 8.8736,
"step": 200
},
{
"epoch": 0.016267827597999798,
"grad_norm": 1.0230133533477783,
"learning_rate": 1.6260162601626018e-05,
"loss": 8.6163,
"step": 220
},
{
"epoch": 0.017746721015999778,
"grad_norm": 1.3886386156082153,
"learning_rate": 1.7738359201773837e-05,
"loss": 8.3772,
"step": 240
},
{
"epoch": 0.019225614433999758,
"grad_norm": 0.8950226306915283,
"learning_rate": 1.9216555801921658e-05,
"loss": 8.1872,
"step": 260
},
{
"epoch": 0.020704507851999742,
"grad_norm": 1.3098183870315552,
"learning_rate": 2.0694752402069477e-05,
"loss": 8.0067,
"step": 280
},
{
"epoch": 0.022183401269999722,
"grad_norm": 1.3033353090286255,
"learning_rate": 2.2172949002217298e-05,
"loss": 7.8361,
"step": 300
},
{
"epoch": 0.023662294687999703,
"grad_norm": 1.6088228225708008,
"learning_rate": 2.3651145602365117e-05,
"loss": 7.69,
"step": 320
},
{
"epoch": 0.025141188105999687,
"grad_norm": 1.0888606309890747,
"learning_rate": 2.5129342202512935e-05,
"loss": 7.5744,
"step": 340
},
{
"epoch": 0.026620081523999667,
"grad_norm": 1.0944548845291138,
"learning_rate": 2.6607538802660753e-05,
"loss": 7.4501,
"step": 360
},
{
"epoch": 0.028098974941999647,
"grad_norm": 1.5041922330856323,
"learning_rate": 2.8085735402808578e-05,
"loss": 7.3575,
"step": 380
},
{
"epoch": 0.02957786835999963,
"grad_norm": 1.4672595262527466,
"learning_rate": 2.9563932002956397e-05,
"loss": 7.2633,
"step": 400
},
{
"epoch": 0.03105676177799961,
"grad_norm": 1.3001948595046997,
"learning_rate": 3.104212860310421e-05,
"loss": 7.1749,
"step": 420
},
{
"epoch": 0.032535655195999595,
"grad_norm": 1.4149699211120605,
"learning_rate": 3.2520325203252037e-05,
"loss": 7.098,
"step": 440
},
{
"epoch": 0.03401454861399957,
"grad_norm": 1.6322951316833496,
"learning_rate": 3.3998521803399855e-05,
"loss": 7.015,
"step": 460
},
{
"epoch": 0.035493442031999556,
"grad_norm": 1.659485101699829,
"learning_rate": 3.547671840354767e-05,
"loss": 6.9398,
"step": 480
},
{
"epoch": 0.03697233544999954,
"grad_norm": 1.7957265377044678,
"learning_rate": 3.69549150036955e-05,
"loss": 6.8648,
"step": 500
},
{
"epoch": 0.038451228867999517,
"grad_norm": 1.4912447929382324,
"learning_rate": 3.8433111603843317e-05,
"loss": 6.7973,
"step": 520
},
{
"epoch": 0.0399301222859995,
"grad_norm": 1.7237913608551025,
"learning_rate": 3.9911308203991135e-05,
"loss": 6.7331,
"step": 540
},
{
"epoch": 0.041409015703999484,
"grad_norm": 1.8182610273361206,
"learning_rate": 4.138950480413895e-05,
"loss": 6.668,
"step": 560
},
{
"epoch": 0.04288790912199946,
"grad_norm": 1.6812163591384888,
"learning_rate": 4.286770140428677e-05,
"loss": 6.5894,
"step": 580
},
{
"epoch": 0.044366802539999445,
"grad_norm": 1.818665623664856,
"learning_rate": 4.4345898004434597e-05,
"loss": 6.5361,
"step": 600
},
{
"epoch": 0.04584569595799943,
"grad_norm": 1.3113698959350586,
"learning_rate": 4.5824094604582415e-05,
"loss": 6.4732,
"step": 620
},
{
"epoch": 0.047324589375999405,
"grad_norm": 1.9587410688400269,
"learning_rate": 4.730229120473023e-05,
"loss": 6.4143,
"step": 640
},
{
"epoch": 0.04880348279399939,
"grad_norm": 1.4764151573181152,
"learning_rate": 4.878048780487805e-05,
"loss": 6.358,
"step": 660
},
{
"epoch": 0.05028237621199937,
"grad_norm": 1.5685200691223145,
"learning_rate": 5.025868440502587e-05,
"loss": 6.3084,
"step": 680
},
{
"epoch": 0.05176126962999935,
"grad_norm": 2.1411592960357666,
"learning_rate": 5.173688100517369e-05,
"loss": 6.2515,
"step": 700
},
{
"epoch": 0.053240163047999334,
"grad_norm": 2.6792619228363037,
"learning_rate": 5.3215077605321506e-05,
"loss": 6.2091,
"step": 720
},
{
"epoch": 0.05471905646599932,
"grad_norm": 1.5457326173782349,
"learning_rate": 5.4693274205469325e-05,
"loss": 6.1512,
"step": 740
},
{
"epoch": 0.056197949883999294,
"grad_norm": 1.931794285774231,
"learning_rate": 5.6171470805617157e-05,
"loss": 6.0981,
"step": 760
},
{
"epoch": 0.05767684330199928,
"grad_norm": 2.3924379348754883,
"learning_rate": 5.7649667405764975e-05,
"loss": 6.0439,
"step": 780
},
{
"epoch": 0.05915573671999926,
"grad_norm": 2.1078522205352783,
"learning_rate": 5.912786400591279e-05,
"loss": 6.0081,
"step": 800
},
{
"epoch": 0.06063463013799924,
"grad_norm": 1.8126791715621948,
"learning_rate": 6.060606060606061e-05,
"loss": 5.9435,
"step": 820
},
{
"epoch": 0.06211352355599922,
"grad_norm": 1.6939939260482788,
"learning_rate": 6.208425720620842e-05,
"loss": 5.9,
"step": 840
},
{
"epoch": 0.0635924169739992,
"grad_norm": 1.7903132438659668,
"learning_rate": 6.356245380635625e-05,
"loss": 5.8536,
"step": 860
},
{
"epoch": 0.06507131039199919,
"grad_norm": 2.1418817043304443,
"learning_rate": 6.504065040650407e-05,
"loss": 5.8192,
"step": 880
},
{
"epoch": 0.06655020380999917,
"grad_norm": 1.6386531591415405,
"learning_rate": 6.651884700665188e-05,
"loss": 5.768,
"step": 900
},
{
"epoch": 0.06802909722799914,
"grad_norm": 1.82034432888031,
"learning_rate": 6.799704360679971e-05,
"loss": 5.7162,
"step": 920
},
{
"epoch": 0.06950799064599913,
"grad_norm": 1.9206963777542114,
"learning_rate": 6.947524020694752e-05,
"loss": 5.6755,
"step": 940
},
{
"epoch": 0.07098688406399911,
"grad_norm": 1.4253259897232056,
"learning_rate": 7.095343680709535e-05,
"loss": 5.6321,
"step": 960
},
{
"epoch": 0.07246577748199909,
"grad_norm": 2.0578746795654297,
"learning_rate": 7.243163340724317e-05,
"loss": 5.5907,
"step": 980
},
{
"epoch": 0.07394467089999908,
"grad_norm": 1.4132108688354492,
"learning_rate": 7.3909830007391e-05,
"loss": 5.5483,
"step": 1000
},
{
"epoch": 0.07542356431799906,
"grad_norm": 1.6758071184158325,
"learning_rate": 7.538802660753881e-05,
"loss": 5.5136,
"step": 1020
},
{
"epoch": 0.07690245773599903,
"grad_norm": 1.5184019804000854,
"learning_rate": 7.686622320768663e-05,
"loss": 5.4715,
"step": 1040
},
{
"epoch": 0.07838135115399902,
"grad_norm": 1.731789231300354,
"learning_rate": 7.834441980783444e-05,
"loss": 5.4289,
"step": 1060
},
{
"epoch": 0.079860244571999,
"grad_norm": 1.4423941373825073,
"learning_rate": 7.982261640798227e-05,
"loss": 5.3799,
"step": 1080
},
{
"epoch": 0.08133913798999898,
"grad_norm": 1.200088620185852,
"learning_rate": 8.130081300813008e-05,
"loss": 5.3446,
"step": 1100
},
{
"epoch": 0.08281803140799897,
"grad_norm": 1.5034804344177246,
"learning_rate": 8.27790096082779e-05,
"loss": 5.3011,
"step": 1120
},
{
"epoch": 0.08429692482599895,
"grad_norm": 1.6272141933441162,
"learning_rate": 8.425720620842572e-05,
"loss": 5.2573,
"step": 1140
},
{
"epoch": 0.08577581824399892,
"grad_norm": 1.6940892934799194,
"learning_rate": 8.573540280857354e-05,
"loss": 5.2206,
"step": 1160
},
{
"epoch": 0.08725471166199891,
"grad_norm": 1.531122088432312,
"learning_rate": 8.721359940872137e-05,
"loss": 5.1842,
"step": 1180
},
{
"epoch": 0.08873360507999889,
"grad_norm": 1.3891607522964478,
"learning_rate": 8.869179600886919e-05,
"loss": 5.1574,
"step": 1200
},
{
"epoch": 0.09021249849799887,
"grad_norm": 1.5175141096115112,
"learning_rate": 9.0169992609017e-05,
"loss": 5.0965,
"step": 1220
},
{
"epoch": 0.09169139191599886,
"grad_norm": 1.2954392433166504,
"learning_rate": 9.164818920916483e-05,
"loss": 5.0615,
"step": 1240
},
{
"epoch": 0.09317028533399883,
"grad_norm": 1.1776789426803589,
"learning_rate": 9.312638580931264e-05,
"loss": 5.0263,
"step": 1260
},
{
"epoch": 0.09464917875199881,
"grad_norm": 1.342835545539856,
"learning_rate": 9.460458240946047e-05,
"loss": 4.9938,
"step": 1280
},
{
"epoch": 0.0961280721699988,
"grad_norm": 1.5098336935043335,
"learning_rate": 9.608277900960828e-05,
"loss": 4.9579,
"step": 1300
},
{
"epoch": 0.09760696558799878,
"grad_norm": 1.3883858919143677,
"learning_rate": 9.75609756097561e-05,
"loss": 4.9159,
"step": 1320
},
{
"epoch": 0.09908585900599876,
"grad_norm": 1.6131935119628906,
"learning_rate": 9.903917220990391e-05,
"loss": 4.8716,
"step": 1340
},
{
"epoch": 0.10056475242399875,
"grad_norm": 1.3793425559997559,
"learning_rate": 9.999991836910476e-05,
"loss": 4.8389,
"step": 1360
},
{
"epoch": 0.10204364584199872,
"grad_norm": 1.2413076162338257,
"learning_rate": 9.999878553677705e-05,
"loss": 4.8044,
"step": 1380
},
{
"epoch": 0.1035225392599987,
"grad_norm": 1.4875175952911377,
"learning_rate": 9.99963199901083e-05,
"loss": 4.759,
"step": 1400
},
{
"epoch": 0.10500143267799869,
"grad_norm": 1.281230092048645,
"learning_rate": 9.999252179481748e-05,
"loss": 4.733,
"step": 1420
},
{
"epoch": 0.10648032609599867,
"grad_norm": 1.179935336112976,
"learning_rate": 9.998739105214525e-05,
"loss": 4.6965,
"step": 1440
},
{
"epoch": 0.10795921951399864,
"grad_norm": 1.2033872604370117,
"learning_rate": 9.998092789885118e-05,
"loss": 4.649,
"step": 1460
},
{
"epoch": 0.10943811293199864,
"grad_norm": 1.310261607170105,
"learning_rate": 9.997313250721026e-05,
"loss": 4.6158,
"step": 1480
},
{
"epoch": 0.11091700634999861,
"grad_norm": 1.1370333433151245,
"learning_rate": 9.996400508500809e-05,
"loss": 4.5917,
"step": 1500
},
{
"epoch": 0.11239589976799859,
"grad_norm": 0.9518343210220337,
"learning_rate": 9.995354587553553e-05,
"loss": 4.5477,
"step": 1520
},
{
"epoch": 0.11387479318599858,
"grad_norm": 1.1209640502929688,
"learning_rate": 9.994175515758211e-05,
"loss": 4.5169,
"step": 1540
},
{
"epoch": 0.11535368660399856,
"grad_norm": 1.1134682893753052,
"learning_rate": 9.992863324542865e-05,
"loss": 4.4921,
"step": 1560
},
{
"epoch": 0.11683258002199853,
"grad_norm": 1.1962740421295166,
"learning_rate": 9.991418048883885e-05,
"loss": 4.4678,
"step": 1580
},
{
"epoch": 0.11831147343999852,
"grad_norm": 1.0190341472625732,
"learning_rate": 9.989839727305e-05,
"loss": 4.4265,
"step": 1600
},
{
"epoch": 0.1197903668579985,
"grad_norm": 1.1323659420013428,
"learning_rate": 9.988128401876267e-05,
"loss": 4.3951,
"step": 1620
},
{
"epoch": 0.12126926027599848,
"grad_norm": 1.2068976163864136,
"learning_rate": 9.986284118212951e-05,
"loss": 4.3762,
"step": 1640
},
{
"epoch": 0.12274815369399847,
"grad_norm": 1.1199101209640503,
"learning_rate": 9.984306925474313e-05,
"loss": 4.3519,
"step": 1660
},
{
"epoch": 0.12422704711199845,
"grad_norm": 0.8594743013381958,
"learning_rate": 9.982196876362298e-05,
"loss": 4.3268,
"step": 1680
},
{
"epoch": 0.12570594052999842,
"grad_norm": 1.0981128215789795,
"learning_rate": 9.979954027120124e-05,
"loss": 4.3018,
"step": 1700
},
{
"epoch": 0.1271848339479984,
"grad_norm": 0.9453332424163818,
"learning_rate": 9.97757843753079e-05,
"loss": 4.2747,
"step": 1720
},
{
"epoch": 0.1286637273659984,
"grad_norm": 0.9754221439361572,
"learning_rate": 9.975070170915481e-05,
"loss": 4.2539,
"step": 1740
},
{
"epoch": 0.13014262078399838,
"grad_norm": 0.7794106602668762,
"learning_rate": 9.972429294131878e-05,
"loss": 4.2331,
"step": 1760
},
{
"epoch": 0.13162151420199836,
"grad_norm": 0.8084755539894104,
"learning_rate": 9.969655877572379e-05,
"loss": 4.2076,
"step": 1780
},
{
"epoch": 0.13310040761999833,
"grad_norm": 0.9451693296432495,
"learning_rate": 9.96674999516222e-05,
"loss": 4.2023,
"step": 1800
},
{
"epoch": 0.1345793010379983,
"grad_norm": 0.9662824869155884,
"learning_rate": 9.963711724357503e-05,
"loss": 4.1661,
"step": 1820
},
{
"epoch": 0.1360581944559983,
"grad_norm": 0.8646146655082703,
"learning_rate": 9.960541146143138e-05,
"loss": 4.1529,
"step": 1840
},
{
"epoch": 0.1375370878739983,
"grad_norm": 0.819580078125,
"learning_rate": 9.957238345030681e-05,
"loss": 4.1353,
"step": 1860
},
{
"epoch": 0.13901598129199827,
"grad_norm": 0.793268620967865,
"learning_rate": 9.953803409056077e-05,
"loss": 4.1205,
"step": 1880
},
{
"epoch": 0.14049487470999825,
"grad_norm": 0.8794734477996826,
"learning_rate": 9.950236429777319e-05,
"loss": 4.1034,
"step": 1900
},
{
"epoch": 0.14197376812799822,
"grad_norm": 0.8757349252700806,
"learning_rate": 9.946537502272004e-05,
"loss": 4.0896,
"step": 1920
},
{
"epoch": 0.1434526615459982,
"grad_norm": 0.806181788444519,
"learning_rate": 9.942706725134801e-05,
"loss": 4.0792,
"step": 1940
},
{
"epoch": 0.14493155496399818,
"grad_norm": 0.568131148815155,
"learning_rate": 9.938744200474825e-05,
"loss": 4.0483,
"step": 1960
},
{
"epoch": 0.14641044838199818,
"grad_norm": 0.9386783242225647,
"learning_rate": 9.934650033912909e-05,
"loss": 4.0349,
"step": 1980
},
{
"epoch": 0.14788934179999816,
"grad_norm": 0.8668307065963745,
"learning_rate": 9.930424334578793e-05,
"loss": 4.0249,
"step": 2000
},
{
"epoch": 0.14936823521799814,
"grad_norm": 0.7728129625320435,
"learning_rate": 9.926067215108216e-05,
"loss": 4.001,
"step": 2020
},
{
"epoch": 0.1508471286359981,
"grad_norm": 0.8983877301216125,
"learning_rate": 9.92157879163991e-05,
"loss": 4.0099,
"step": 2040
},
{
"epoch": 0.1523260220539981,
"grad_norm": 0.7290263772010803,
"learning_rate": 9.916959183812508e-05,
"loss": 3.9816,
"step": 2060
},
{
"epoch": 0.15380491547199807,
"grad_norm": 1.0002912282943726,
"learning_rate": 9.912208514761353e-05,
"loss": 3.964,
"step": 2080
},
{
"epoch": 0.15528380888999807,
"grad_norm": 0.8696877956390381,
"learning_rate": 9.907326911115215e-05,
"loss": 3.9532,
"step": 2100
},
{
"epoch": 0.15676270230799805,
"grad_norm": 0.9264429211616516,
"learning_rate": 9.90231450299292e-05,
"loss": 3.9405,
"step": 2120
},
{
"epoch": 0.15824159572599802,
"grad_norm": 0.6036892533302307,
"learning_rate": 9.897171423999877e-05,
"loss": 3.9308,
"step": 2140
},
{
"epoch": 0.159720489143998,
"grad_norm": 0.6206973791122437,
"learning_rate": 9.891897811224516e-05,
"loss": 3.9089,
"step": 2160
},
{
"epoch": 0.16119938256199798,
"grad_norm": 0.9498934149742126,
"learning_rate": 9.886493805234642e-05,
"loss": 3.9101,
"step": 2180
},
{
"epoch": 0.16267827597999795,
"grad_norm": 0.8084043264389038,
"learning_rate": 9.880959550073676e-05,
"loss": 3.9108,
"step": 2200
},
{
"epoch": 0.16415716939799796,
"grad_norm": 0.7810977697372437,
"learning_rate": 9.875295193256829e-05,
"loss": 3.8923,
"step": 2220
},
{
"epoch": 0.16563606281599794,
"grad_norm": 0.5951938033103943,
"learning_rate": 9.869500885767156e-05,
"loss": 3.8676,
"step": 2240
},
{
"epoch": 0.1671149562339979,
"grad_norm": 0.7140426635742188,
"learning_rate": 9.863576782051544e-05,
"loss": 3.8717,
"step": 2260
},
{
"epoch": 0.1685938496519979,
"grad_norm": 0.7328889966011047,
"learning_rate": 9.857523040016588e-05,
"loss": 3.8585,
"step": 2280
},
{
"epoch": 0.17007274306999787,
"grad_norm": 0.9172821044921875,
"learning_rate": 9.851339821024383e-05,
"loss": 3.8515,
"step": 2300
},
{
"epoch": 0.17155163648799784,
"grad_norm": 0.70406574010849,
"learning_rate": 9.845027289888226e-05,
"loss": 3.8322,
"step": 2320
},
{
"epoch": 0.17303052990599785,
"grad_norm": 0.6545581221580505,
"learning_rate": 9.838585614868221e-05,
"loss": 3.8342,
"step": 2340
},
{
"epoch": 0.17450942332399783,
"grad_norm": 0.8262337446212769,
"learning_rate": 9.832014967666788e-05,
"loss": 3.8178,
"step": 2360
},
{
"epoch": 0.1759883167419978,
"grad_norm": 0.748437225818634,
"learning_rate": 9.825315523424097e-05,
"loss": 3.8054,
"step": 2380
},
{
"epoch": 0.17746721015999778,
"grad_norm": 0.7961335778236389,
"learning_rate": 9.818487460713397e-05,
"loss": 3.803,
"step": 2400
},
{
"epoch": 0.17894610357799776,
"grad_norm": 0.5949457287788391,
"learning_rate": 9.811530961536246e-05,
"loss": 3.7988,
"step": 2420
},
{
"epoch": 0.18042499699599773,
"grad_norm": 0.6500332355499268,
"learning_rate": 9.804446211317677e-05,
"loss": 3.7902,
"step": 2440
},
{
"epoch": 0.18190389041399774,
"grad_norm": 0.5734246969223022,
"learning_rate": 9.797233398901238e-05,
"loss": 3.7788,
"step": 2460
},
{
"epoch": 0.18338278383199771,
"grad_norm": 0.6358067393302917,
"learning_rate": 9.78989271654397e-05,
"loss": 3.7581,
"step": 2480
},
{
"epoch": 0.1848616772499977,
"grad_norm": 0.7676229476928711,
"learning_rate": 9.78242435991128e-05,
"loss": 3.7566,
"step": 2500
},
{
"epoch": 0.18634057066799767,
"grad_norm": 0.5594522356987,
"learning_rate": 9.774828528071722e-05,
"loss": 3.7552,
"step": 2520
},
{
"epoch": 0.18781946408599764,
"grad_norm": 0.7414741516113281,
"learning_rate": 9.767105423491694e-05,
"loss": 3.7404,
"step": 2540
},
{
"epoch": 0.18929835750399762,
"grad_norm": 0.6007790565490723,
"learning_rate": 9.759255252030042e-05,
"loss": 3.7308,
"step": 2560
},
{
"epoch": 0.19077725092199763,
"grad_norm": 0.6344082355499268,
"learning_rate": 9.751278222932569e-05,
"loss": 3.7179,
"step": 2580
},
{
"epoch": 0.1922561443399976,
"grad_norm": 0.6184104681015015,
"learning_rate": 9.743174548826461e-05,
"loss": 3.7177,
"step": 2600
},
{
"epoch": 0.19373503775799758,
"grad_norm": 0.785652756690979,
"learning_rate": 9.734944445714618e-05,
"loss": 3.7022,
"step": 2620
},
{
"epoch": 0.19521393117599756,
"grad_norm": 0.664434015750885,
"learning_rate": 9.726588132969901e-05,
"loss": 3.6885,
"step": 2640
},
{
"epoch": 0.19669282459399753,
"grad_norm": 0.6987696290016174,
"learning_rate": 9.718105833329272e-05,
"loss": 3.682,
"step": 2660
},
{
"epoch": 0.1981717180119975,
"grad_norm": 0.5085122585296631,
"learning_rate": 9.709497772887874e-05,
"loss": 3.6707,
"step": 2680
},
{
"epoch": 0.19965061142999752,
"grad_norm": 0.8911309838294983,
"learning_rate": 9.700764181092988e-05,
"loss": 3.6517,
"step": 2700
},
{
"epoch": 0.2011295048479975,
"grad_norm": 0.7100036144256592,
"learning_rate": 9.691905290737932e-05,
"loss": 3.6738,
"step": 2720
},
{
"epoch": 0.20260839826599747,
"grad_norm": 0.5330691933631897,
"learning_rate": 9.682921337955847e-05,
"loss": 3.664,
"step": 2740
},
{
"epoch": 0.20408729168399745,
"grad_norm": 0.5505249500274658,
"learning_rate": 9.673812562213401e-05,
"loss": 3.6491,
"step": 2760
},
{
"epoch": 0.20556618510199742,
"grad_norm": 0.7107018232345581,
"learning_rate": 9.664579206304413e-05,
"loss": 3.6406,
"step": 2780
},
{
"epoch": 0.2070450785199974,
"grad_norm": 0.5617266893386841,
"learning_rate": 9.65522151634338e-05,
"loss": 3.653,
"step": 2800
},
{
"epoch": 0.2085239719379974,
"grad_norm": 0.5702326893806458,
"learning_rate": 9.64573974175891e-05,
"loss": 3.6311,
"step": 2820
},
{
"epoch": 0.21000286535599738,
"grad_norm": 0.5759734511375427,
"learning_rate": 9.636134135287081e-05,
"loss": 3.6256,
"step": 2840
},
{
"epoch": 0.21148175877399736,
"grad_norm": 0.6595752835273743,
"learning_rate": 9.626404952964704e-05,
"loss": 3.6184,
"step": 2860
},
{
"epoch": 0.21296065219199733,
"grad_norm": 0.7071236371994019,
"learning_rate": 9.616552454122492e-05,
"loss": 3.6138,
"step": 2880
},
{
"epoch": 0.2144395456099973,
"grad_norm": 0.7660998702049255,
"learning_rate": 9.606576901378156e-05,
"loss": 3.6059,
"step": 2900
},
{
"epoch": 0.2159184390279973,
"grad_norm": 0.9190542101860046,
"learning_rate": 9.596478560629397e-05,
"loss": 3.5887,
"step": 2920
},
{
"epoch": 0.2173973324459973,
"grad_norm": 0.5795056223869324,
"learning_rate": 9.586257701046824e-05,
"loss": 3.5981,
"step": 2940
},
{
"epoch": 0.21887622586399727,
"grad_norm": 0.607071578502655,
"learning_rate": 9.575914595066777e-05,
"loss": 3.592,
"step": 2960
},
{
"epoch": 0.22035511928199725,
"grad_norm": 0.7824068069458008,
"learning_rate": 9.565449518384066e-05,
"loss": 3.5919,
"step": 2980
},
{
"epoch": 0.22183401269999722,
"grad_norm": 0.5169054269790649,
"learning_rate": 9.554862749944622e-05,
"loss": 3.5899,
"step": 3000
},
{
"epoch": 0.2233129061179972,
"grad_norm": 0.8486248850822449,
"learning_rate": 9.544154571938062e-05,
"loss": 3.5707,
"step": 3020
},
{
"epoch": 0.22479179953599718,
"grad_norm": 0.47671154141426086,
"learning_rate": 9.533325269790167e-05,
"loss": 3.559,
"step": 3040
},
{
"epoch": 0.22627069295399718,
"grad_norm": 0.5938573479652405,
"learning_rate": 9.522375132155272e-05,
"loss": 3.5422,
"step": 3060
},
{
"epoch": 0.22774958637199716,
"grad_norm": 0.6117560267448425,
"learning_rate": 9.511304450908576e-05,
"loss": 3.5671,
"step": 3080
},
{
"epoch": 0.22922847978999714,
"grad_norm": 0.6173937916755676,
"learning_rate": 9.500113521138361e-05,
"loss": 3.5669,
"step": 3100
},
{
"epoch": 0.2307073732079971,
"grad_norm": 0.726667046546936,
"learning_rate": 9.488802641138125e-05,
"loss": 3.5366,
"step": 3120
},
{
"epoch": 0.2321862666259971,
"grad_norm": 0.5627657771110535,
"learning_rate": 9.477372112398629e-05,
"loss": 3.53,
"step": 3140
},
{
"epoch": 0.23366516004399707,
"grad_norm": 0.49706488847732544,
"learning_rate": 9.465822239599864e-05,
"loss": 3.5406,
"step": 3160
},
{
"epoch": 0.23514405346199707,
"grad_norm": 0.9899396896362305,
"learning_rate": 9.454153330602932e-05,
"loss": 3.5231,
"step": 3180
},
{
"epoch": 0.23662294687999705,
"grad_norm": 0.4798751771450043,
"learning_rate": 9.442365696441835e-05,
"loss": 3.5116,
"step": 3200
},
{
"epoch": 0.23810184029799702,
"grad_norm": 0.6276853084564209,
"learning_rate": 9.430459651315185e-05,
"loss": 3.5184,
"step": 3220
},
{
"epoch": 0.239580733715997,
"grad_norm": 0.4986541271209717,
"learning_rate": 9.418435512577833e-05,
"loss": 3.5119,
"step": 3240
},
{
"epoch": 0.24105962713399698,
"grad_norm": 0.535453736782074,
"learning_rate": 9.406293600732408e-05,
"loss": 3.5147,
"step": 3260
},
{
"epoch": 0.24253852055199696,
"grad_norm": 0.5945438146591187,
"learning_rate": 9.39403423942077e-05,
"loss": 3.5023,
"step": 3280
},
{
"epoch": 0.24401741396999696,
"grad_norm": 0.6451681852340698,
"learning_rate": 9.381657755415387e-05,
"loss": 3.4846,
"step": 3300
},
{
"epoch": 0.24549630738799694,
"grad_norm": 0.6193166375160217,
"learning_rate": 9.369164478610631e-05,
"loss": 3.488,
"step": 3320
},
{
"epoch": 0.24697520080599691,
"grad_norm": 0.7059178352355957,
"learning_rate": 9.35655474201397e-05,
"loss": 3.4883,
"step": 3340
},
{
"epoch": 0.2484540942239969,
"grad_norm": 0.6481304168701172,
"learning_rate": 9.343828881737107e-05,
"loss": 3.4762,
"step": 3360
},
{
"epoch": 0.24993298764199687,
"grad_norm": 0.5440752506256104,
"learning_rate": 9.330987236987008e-05,
"loss": 3.481,
"step": 3380
},
{
"epoch": 0.25141188105999684,
"grad_norm": 0.5582643747329712,
"learning_rate": 9.318030150056869e-05,
"loss": 3.4755,
"step": 3400
},
{
"epoch": 0.25289077447799685,
"grad_norm": 0.6249572038650513,
"learning_rate": 9.304957966316995e-05,
"loss": 3.4775,
"step": 3420
},
{
"epoch": 0.2543696678959968,
"grad_norm": 0.6695943474769592,
"learning_rate": 9.291771034205578e-05,
"loss": 3.463,
"step": 3440
},
{
"epoch": 0.2558485613139968,
"grad_norm": 0.4462078809738159,
"learning_rate": 9.27846970521943e-05,
"loss": 3.4561,
"step": 3460
},
{
"epoch": 0.2573274547319968,
"grad_norm": 0.49235352873802185,
"learning_rate": 9.265054333904601e-05,
"loss": 3.4515,
"step": 3480
},
{
"epoch": 0.25880634814999676,
"grad_norm": 0.6507192254066467,
"learning_rate": 9.251525277846929e-05,
"loss": 3.4514,
"step": 3500
},
{
"epoch": 0.26028524156799676,
"grad_norm": 0.4588228166103363,
"learning_rate": 9.237882897662515e-05,
"loss": 3.4286,
"step": 3520
},
{
"epoch": 0.2617641349859967,
"grad_norm": 0.575430691242218,
"learning_rate": 9.224127556988107e-05,
"loss": 3.4458,
"step": 3540
},
{
"epoch": 0.2632430284039967,
"grad_norm": 0.7287342548370361,
"learning_rate": 9.210259622471403e-05,
"loss": 3.4318,
"step": 3560
},
{
"epoch": 0.26472192182199666,
"grad_norm": 0.6866022348403931,
"learning_rate": 9.19627946376129e-05,
"loss": 3.4361,
"step": 3580
},
{
"epoch": 0.26620081523999667,
"grad_norm": 0.5268846750259399,
"learning_rate": 9.182187453497974e-05,
"loss": 3.4364,
"step": 3600
},
{
"epoch": 0.2676797086579967,
"grad_norm": 0.6380168795585632,
"learning_rate": 9.167983967303066e-05,
"loss": 3.4389,
"step": 3620
},
{
"epoch": 0.2691586020759966,
"grad_norm": 0.6250066757202148,
"learning_rate": 9.153669383769556e-05,
"loss": 3.4322,
"step": 3640
},
{
"epoch": 0.2706374954939966,
"grad_norm": 0.6497014164924622,
"learning_rate": 9.139244084451729e-05,
"loss": 3.4068,
"step": 3660
},
{
"epoch": 0.2721163889119966,
"grad_norm": 0.8837792277336121,
"learning_rate": 9.124708453854983e-05,
"loss": 3.4132,
"step": 3680
},
{
"epoch": 0.2735952823299966,
"grad_norm": 0.5183786153793335,
"learning_rate": 9.110062879425602e-05,
"loss": 3.4081,
"step": 3700
},
{
"epoch": 0.2750741757479966,
"grad_norm": 0.7497463226318359,
"learning_rate": 9.095307751540407e-05,
"loss": 3.3986,
"step": 3720
},
{
"epoch": 0.27655306916599653,
"grad_norm": 0.5026047825813293,
"learning_rate": 9.080443463496363e-05,
"loss": 3.4111,
"step": 3740
},
{
"epoch": 0.27803196258399654,
"grad_norm": 0.4640219211578369,
"learning_rate": 9.06547041150009e-05,
"loss": 3.3865,
"step": 3760
},
{
"epoch": 0.2795108560019965,
"grad_norm": 0.5095507502555847,
"learning_rate": 9.050388994657303e-05,
"loss": 3.3915,
"step": 3780
},
{
"epoch": 0.2809897494199965,
"grad_norm": 0.5542161464691162,
"learning_rate": 9.035199614962178e-05,
"loss": 3.3924,
"step": 3800
},
{
"epoch": 0.28246864283799644,
"grad_norm": 0.44914740324020386,
"learning_rate": 9.019902677286631e-05,
"loss": 3.3968,
"step": 3820
},
{
"epoch": 0.28394753625599645,
"grad_norm": 0.4764072000980377,
"learning_rate": 9.004498589369532e-05,
"loss": 3.3937,
"step": 3840
},
{
"epoch": 0.28542642967399645,
"grad_norm": 1.0480468273162842,
"learning_rate": 8.98898776180583e-05,
"loss": 3.3926,
"step": 3860
},
{
"epoch": 0.2869053230919964,
"grad_norm": 0.5355066061019897,
"learning_rate": 8.973370608035612e-05,
"loss": 3.3895,
"step": 3880
},
{
"epoch": 0.2883842165099964,
"grad_norm": 0.4495852589607239,
"learning_rate": 8.957647544333088e-05,
"loss": 3.3717,
"step": 3900
},
{
"epoch": 0.28986310992799635,
"grad_norm": 0.5025330781936646,
"learning_rate": 8.941818989795487e-05,
"loss": 3.3653,
"step": 3920
},
{
"epoch": 0.29134200334599636,
"grad_norm": 0.7565049529075623,
"learning_rate": 8.925885366331887e-05,
"loss": 3.3668,
"step": 3940
},
{
"epoch": 0.29282089676399636,
"grad_norm": 0.8078230619430542,
"learning_rate": 8.909847098651978e-05,
"loss": 3.3678,
"step": 3960
},
{
"epoch": 0.2942997901819963,
"grad_norm": 0.532131552696228,
"learning_rate": 8.893704614254725e-05,
"loss": 3.3616,
"step": 3980
},
{
"epoch": 0.2957786835999963,
"grad_norm": 0.6017030477523804,
"learning_rate": 8.877458343416993e-05,
"loss": 3.349,
"step": 4000
},
{
"epoch": 0.29725757701799627,
"grad_norm": 0.5634870529174805,
"learning_rate": 8.861108719182061e-05,
"loss": 3.3385,
"step": 4020
},
{
"epoch": 0.29873647043599627,
"grad_norm": 0.5135075449943542,
"learning_rate": 8.844656177348087e-05,
"loss": 3.353,
"step": 4040
},
{
"epoch": 0.3002153638539962,
"grad_norm": 0.49317190051078796,
"learning_rate": 8.828101156456493e-05,
"loss": 3.3455,
"step": 4060
},
{
"epoch": 0.3016942572719962,
"grad_norm": 0.5618060827255249,
"learning_rate": 8.811444097780273e-05,
"loss": 3.3444,
"step": 4080
},
{
"epoch": 0.30317315068999623,
"grad_norm": 0.5211082100868225,
"learning_rate": 8.79468544531223e-05,
"loss": 3.3491,
"step": 4100
},
{
"epoch": 0.3046520441079962,
"grad_norm": 0.5708051919937134,
"learning_rate": 8.777825645753144e-05,
"loss": 3.3345,
"step": 4120
},
{
"epoch": 0.3061309375259962,
"grad_norm": 0.5056930184364319,
"learning_rate": 8.760865148499862e-05,
"loss": 3.3333,
"step": 4140
},
{
"epoch": 0.30760983094399613,
"grad_norm": 0.5034912824630737,
"learning_rate": 8.743804405633327e-05,
"loss": 3.3313,
"step": 4160
},
{
"epoch": 0.30908872436199614,
"grad_norm": 0.6101865768432617,
"learning_rate": 8.726643871906512e-05,
"loss": 3.3211,
"step": 4180
},
{
"epoch": 0.31056761777999614,
"grad_norm": 0.49354320764541626,
"learning_rate": 8.709384004732322e-05,
"loss": 3.328,
"step": 4200
},
{
"epoch": 0.3120465111979961,
"grad_norm": 1.0049197673797607,
"learning_rate": 8.69202526417138e-05,
"loss": 3.3256,
"step": 4220
},
{
"epoch": 0.3135254046159961,
"grad_norm": 0.4796050786972046,
"learning_rate": 8.67456811291977e-05,
"loss": 3.3264,
"step": 4240
},
{
"epoch": 0.31500429803399604,
"grad_norm": 0.6114419102668762,
"learning_rate": 8.657013016296716e-05,
"loss": 3.3041,
"step": 4260
},
{
"epoch": 0.31648319145199605,
"grad_norm": 0.6853553652763367,
"learning_rate": 8.639360442232163e-05,
"loss": 3.3123,
"step": 4280
},
{
"epoch": 0.317962084869996,
"grad_norm": 0.4117718040943146,
"learning_rate": 8.621610861254307e-05,
"loss": 3.3036,
"step": 4300
},
{
"epoch": 0.319440978287996,
"grad_norm": 0.4868248701095581,
"learning_rate": 8.60376474647707e-05,
"loss": 3.3112,
"step": 4320
},
{
"epoch": 0.320919871705996,
"grad_norm": 0.4655211865901947,
"learning_rate": 8.585822573587463e-05,
"loss": 3.2959,
"step": 4340
},
{
"epoch": 0.32239876512399596,
"grad_norm": 0.4244300127029419,
"learning_rate": 8.567784820832926e-05,
"loss": 3.3006,
"step": 4360
},
{
"epoch": 0.32387765854199596,
"grad_norm": 0.5585177540779114,
"learning_rate": 8.549651969008572e-05,
"loss": 3.304,
"step": 4380
},
{
"epoch": 0.3253565519599959,
"grad_norm": 0.4044816493988037,
"learning_rate": 8.531424501444376e-05,
"loss": 3.2943,
"step": 4400
},
{
"epoch": 0.3268354453779959,
"grad_norm": 0.5332701802253723,
"learning_rate": 8.513102903992285e-05,
"loss": 3.2691,
"step": 4420
},
{
"epoch": 0.3283143387959959,
"grad_norm": 0.6828725934028625,
"learning_rate": 8.494687665013274e-05,
"loss": 3.2757,
"step": 4440
},
{
"epoch": 0.32979323221399587,
"grad_norm": 0.4340764284133911,
"learning_rate": 8.476179275364331e-05,
"loss": 3.2798,
"step": 4460
},
{
"epoch": 0.3312721256319959,
"grad_norm": 0.5927674770355225,
"learning_rate": 8.457578228385362e-05,
"loss": 3.277,
"step": 4480
},
{
"epoch": 0.3327510190499958,
"grad_norm": 0.5142761468887329,
"learning_rate": 8.438885019886051e-05,
"loss": 3.2745,
"step": 4500
},
{
"epoch": 0.3342299124679958,
"grad_norm": 0.5035094618797302,
"learning_rate": 8.420100148132643e-05,
"loss": 3.282,
"step": 4520
},
{
"epoch": 0.33570880588599583,
"grad_norm": 0.4529162049293518,
"learning_rate": 8.40122411383466e-05,
"loss": 3.2741,
"step": 4540
},
{
"epoch": 0.3371876993039958,
"grad_norm": 0.47236135601997375,
"learning_rate": 8.382257420131554e-05,
"loss": 3.2566,
"step": 4560
},
{
"epoch": 0.3386665927219958,
"grad_norm": 0.5067903995513916,
"learning_rate": 8.363200572579297e-05,
"loss": 3.2729,
"step": 4580
},
{
"epoch": 0.34014548613999573,
"grad_norm": 0.5891897678375244,
"learning_rate": 8.344054079136911e-05,
"loss": 3.254,
"step": 4600
},
{
"epoch": 0.34162437955799574,
"grad_norm": 0.4857490062713623,
"learning_rate": 8.324818450152917e-05,
"loss": 3.2704,
"step": 4620
},
{
"epoch": 0.3431032729759957,
"grad_norm": 0.5922226309776306,
"learning_rate": 8.305494198351741e-05,
"loss": 3.2511,
"step": 4640
},
{
"epoch": 0.3445821663939957,
"grad_norm": 0.5176606178283691,
"learning_rate": 8.286081838820047e-05,
"loss": 3.2577,
"step": 4660
},
{
"epoch": 0.3460610598119957,
"grad_norm": 0.4542312026023865,
"learning_rate": 8.266581888993e-05,
"loss": 3.269,
"step": 4680
},
{
"epoch": 0.34753995322999565,
"grad_norm": 0.4864133596420288,
"learning_rate": 8.246994868640478e-05,
"loss": 3.2468,
"step": 4700
},
{
"epoch": 0.34901884664799565,
"grad_norm": 0.5213157534599304,
"learning_rate": 8.227321299853225e-05,
"loss": 3.2431,
"step": 4720
},
{
"epoch": 0.3504977400659956,
"grad_norm": 0.495194673538208,
"learning_rate": 8.207561707028921e-05,
"loss": 3.26,
"step": 4740
},
{
"epoch": 0.3519766334839956,
"grad_norm": 0.47876933217048645,
"learning_rate": 8.187716616858217e-05,
"loss": 3.2397,
"step": 4760
},
{
"epoch": 0.3534555269019956,
"grad_norm": 0.558392345905304,
"learning_rate": 8.167786558310679e-05,
"loss": 3.2357,
"step": 4780
},
{
"epoch": 0.35493442031999556,
"grad_norm": 0.5333178043365479,
"learning_rate": 8.147772062620715e-05,
"loss": 3.2374,
"step": 4800
},
{
"epoch": 0.35641331373799556,
"grad_norm": 0.41947266459465027,
"learning_rate": 8.127673663273388e-05,
"loss": 3.238,
"step": 4820
},
{
"epoch": 0.3578922071559955,
"grad_norm": 0.6376889944076538,
"learning_rate": 8.107491895990213e-05,
"loss": 3.2295,
"step": 4840
},
{
"epoch": 0.3593711005739955,
"grad_norm": 0.46790727972984314,
"learning_rate": 8.087227298714865e-05,
"loss": 3.2203,
"step": 4860
},
{
"epoch": 0.36084999399199547,
"grad_norm": 0.4850638508796692,
"learning_rate": 8.06688041159886e-05,
"loss": 3.2282,
"step": 4880
},
{
"epoch": 0.36232888740999547,
"grad_norm": 0.48408469557762146,
"learning_rate": 8.04645177698713e-05,
"loss": 3.2156,
"step": 4900
},
{
"epoch": 0.3638077808279955,
"grad_norm": 0.4044775068759918,
"learning_rate": 8.025941939403589e-05,
"loss": 3.2054,
"step": 4920
},
{
"epoch": 0.3652866742459954,
"grad_norm": 0.5881346464157104,
"learning_rate": 8.005351445536611e-05,
"loss": 3.2179,
"step": 4940
},
{
"epoch": 0.36676556766399543,
"grad_norm": 0.49967604875564575,
"learning_rate": 7.984680844224455e-05,
"loss": 3.2243,
"step": 4960
},
{
"epoch": 0.3682444610819954,
"grad_norm": 0.3812451958656311,
"learning_rate": 7.963930686440638e-05,
"loss": 3.2071,
"step": 4980
},
{
"epoch": 0.3697233544999954,
"grad_norm": 0.5718510150909424,
"learning_rate": 7.943101525279254e-05,
"loss": 3.2097,
"step": 5000
},
{
"epoch": 0.3712022479179954,
"grad_norm": 0.4486338198184967,
"learning_rate": 7.922193915940223e-05,
"loss": 3.2108,
"step": 5020
},
{
"epoch": 0.37268114133599534,
"grad_norm": 0.3966203033924103,
"learning_rate": 7.901208415714498e-05,
"loss": 3.2079,
"step": 5040
},
{
"epoch": 0.37416003475399534,
"grad_norm": 0.5968387722969055,
"learning_rate": 7.880145583969208e-05,
"loss": 3.2194,
"step": 5060
},
{
"epoch": 0.3756389281719953,
"grad_norm": 0.4266614019870758,
"learning_rate": 7.859005982132746e-05,
"loss": 3.2041,
"step": 5080
},
{
"epoch": 0.3771178215899953,
"grad_norm": 0.39778637886047363,
"learning_rate": 7.83779017367981e-05,
"loss": 3.1994,
"step": 5100
},
{
"epoch": 0.37859671500799524,
"grad_norm": 0.5236369967460632,
"learning_rate": 7.816498724116384e-05,
"loss": 3.1862,
"step": 5120
},
{
"epoch": 0.38007560842599525,
"grad_norm": 0.7279762625694275,
"learning_rate": 7.79513220096465e-05,
"loss": 3.1994,
"step": 5140
},
{
"epoch": 0.38155450184399525,
"grad_norm": 0.4763568639755249,
"learning_rate": 7.773691173747878e-05,
"loss": 3.1906,
"step": 5160
},
{
"epoch": 0.3830333952619952,
"grad_norm": 0.44299814105033875,
"learning_rate": 7.752176213975242e-05,
"loss": 3.1834,
"step": 5180
},
{
"epoch": 0.3845122886799952,
"grad_norm": 0.5032374262809753,
"learning_rate": 7.73058789512658e-05,
"loss": 3.195,
"step": 5200
},
{
"epoch": 0.38599118209799516,
"grad_norm": 0.4971736669540405,
"learning_rate": 7.708926792637109e-05,
"loss": 3.1912,
"step": 5220
},
{
"epoch": 0.38747007551599516,
"grad_norm": 0.3745681941509247,
"learning_rate": 7.687193483882094e-05,
"loss": 3.1822,
"step": 5240
},
{
"epoch": 0.38894896893399517,
"grad_norm": 0.45209985971450806,
"learning_rate": 7.665388548161449e-05,
"loss": 3.1747,
"step": 5260
},
{
"epoch": 0.3904278623519951,
"grad_norm": 0.45653989911079407,
"learning_rate": 7.643512566684302e-05,
"loss": 3.1586,
"step": 5280
},
{
"epoch": 0.3919067557699951,
"grad_norm": 0.5007410049438477,
"learning_rate": 7.621566122553503e-05,
"loss": 3.1777,
"step": 5300
},
{
"epoch": 0.39338564918799507,
"grad_norm": 0.39367878437042236,
"learning_rate": 7.599549800750075e-05,
"loss": 3.1713,
"step": 5320
},
{
"epoch": 0.3948645426059951,
"grad_norm": 0.41411903500556946,
"learning_rate": 7.577464188117629e-05,
"loss": 3.1743,
"step": 5340
},
{
"epoch": 0.396343436023995,
"grad_norm": 0.45292773842811584,
"learning_rate": 7.555309873346719e-05,
"loss": 3.1615,
"step": 5360
},
{
"epoch": 0.397822329441995,
"grad_norm": 0.8281717300415039,
"learning_rate": 7.533087446959146e-05,
"loss": 3.167,
"step": 5380
},
{
"epoch": 0.39930122285999503,
"grad_norm": 0.4002739489078522,
"learning_rate": 7.510797501292224e-05,
"loss": 3.1778,
"step": 5400
},
{
"epoch": 0.400780116277995,
"grad_norm": 0.4849472641944885,
"learning_rate": 7.488440630482993e-05,
"loss": 3.156,
"step": 5420
},
{
"epoch": 0.402259009695995,
"grad_norm": 0.5112612247467041,
"learning_rate": 7.466017430452372e-05,
"loss": 3.1722,
"step": 5440
},
{
"epoch": 0.40373790311399493,
"grad_norm": 0.7139009833335876,
"learning_rate": 7.443528498889282e-05,
"loss": 3.1638,
"step": 5460
},
{
"epoch": 0.40521679653199494,
"grad_norm": 0.508050262928009,
"learning_rate": 7.420974435234718e-05,
"loss": 3.178,
"step": 5480
},
{
"epoch": 0.40669568994999494,
"grad_norm": 0.42061784863471985,
"learning_rate": 7.398355840665762e-05,
"loss": 3.1644,
"step": 5500
},
{
"epoch": 0.4081745833679949,
"grad_norm": 0.4205974340438843,
"learning_rate": 7.375673318079566e-05,
"loss": 3.1405,
"step": 5520
},
{
"epoch": 0.4096534767859949,
"grad_norm": 0.37122201919555664,
"learning_rate": 7.352927472077278e-05,
"loss": 3.1446,
"step": 5540
},
{
"epoch": 0.41113237020399485,
"grad_norm": 0.42649346590042114,
"learning_rate": 7.330118908947927e-05,
"loss": 3.1553,
"step": 5560
},
{
"epoch": 0.41261126362199485,
"grad_norm": 0.4024769365787506,
"learning_rate": 7.307248236652264e-05,
"loss": 3.1468,
"step": 5580
},
{
"epoch": 0.4140901570399948,
"grad_norm": 0.44164013862609863,
"learning_rate": 7.284316064806555e-05,
"loss": 3.1431,
"step": 5600
},
{
"epoch": 0.4155690504579948,
"grad_norm": 0.43745094537734985,
"learning_rate": 7.261323004666332e-05,
"loss": 3.1566,
"step": 5620
},
{
"epoch": 0.4170479438759948,
"grad_norm": 0.5233656764030457,
"learning_rate": 7.238269669110104e-05,
"loss": 3.1387,
"step": 5640
},
{
"epoch": 0.41852683729399476,
"grad_norm": 0.5196412801742554,
"learning_rate": 7.215156672623011e-05,
"loss": 3.1359,
"step": 5660
},
{
"epoch": 0.42000573071199476,
"grad_norm": 0.46823379397392273,
"learning_rate": 7.191984631280457e-05,
"loss": 3.1274,
"step": 5680
},
{
"epoch": 0.4214846241299947,
"grad_norm": 0.4213380217552185,
"learning_rate": 7.168754162731682e-05,
"loss": 3.1261,
"step": 5700
},
{
"epoch": 0.4229635175479947,
"grad_norm": 0.48972517251968384,
"learning_rate": 7.145465886183291e-05,
"loss": 3.1367,
"step": 5720
},
{
"epoch": 0.4244424109659947,
"grad_norm": 0.4298087954521179,
"learning_rate": 7.122120422382771e-05,
"loss": 3.1342,
"step": 5740
},
{
"epoch": 0.42592130438399467,
"grad_norm": 0.6111768484115601,
"learning_rate": 7.098718393601922e-05,
"loss": 3.1323,
"step": 5760
},
{
"epoch": 0.4274001978019947,
"grad_norm": 0.4182634949684143,
"learning_rate": 7.075260423620284e-05,
"loss": 3.1206,
"step": 5780
},
{
"epoch": 0.4288790912199946,
"grad_norm": 0.4418911337852478,
"learning_rate": 7.051747137708503e-05,
"loss": 3.1252,
"step": 5800
},
{
"epoch": 0.43035798463799463,
"grad_norm": 0.4269157350063324,
"learning_rate": 7.028179162611668e-05,
"loss": 3.1291,
"step": 5820
},
{
"epoch": 0.4318368780559946,
"grad_norm": 0.38284796476364136,
"learning_rate": 7.004557126532608e-05,
"loss": 3.1272,
"step": 5840
},
{
"epoch": 0.4333157714739946,
"grad_norm": 0.42110738158226013,
"learning_rate": 6.98088165911514e-05,
"loss": 3.1277,
"step": 5860
},
{
"epoch": 0.4347946648919946,
"grad_norm": 0.45251357555389404,
"learning_rate": 6.957153391427293e-05,
"loss": 3.1258,
"step": 5880
},
{
"epoch": 0.43627355830999454,
"grad_norm": 0.5021226406097412,
"learning_rate": 6.933372955944478e-05,
"loss": 3.1132,
"step": 5900
},
{
"epoch": 0.43775245172799454,
"grad_norm": 0.5621367692947388,
"learning_rate": 6.909540986532644e-05,
"loss": 3.1223,
"step": 5920
},
{
"epoch": 0.4392313451459945,
"grad_norm": 0.48778969049453735,
"learning_rate": 6.885658118431367e-05,
"loss": 3.1239,
"step": 5940
},
{
"epoch": 0.4407102385639945,
"grad_norm": 0.4777956008911133,
"learning_rate": 6.861724988236926e-05,
"loss": 3.1096,
"step": 5960
},
{
"epoch": 0.4421891319819945,
"grad_norm": 0.5108891725540161,
"learning_rate": 6.83774223388533e-05,
"loss": 3.1172,
"step": 5980
},
{
"epoch": 0.44366802539999445,
"grad_norm": 0.42329996824264526,
"learning_rate": 6.813710494635325e-05,
"loss": 3.0999,
"step": 6000
},
{
"epoch": 0.44514691881799445,
"grad_norm": 0.538500964641571,
"learning_rate": 6.789630411051336e-05,
"loss": 3.1098,
"step": 6020
},
{
"epoch": 0.4466258122359944,
"grad_norm": 0.51045823097229,
"learning_rate": 6.765502624986409e-05,
"loss": 3.1021,
"step": 6040
},
{
"epoch": 0.4481047056539944,
"grad_norm": 0.46791911125183105,
"learning_rate": 6.741327779565096e-05,
"loss": 3.1031,
"step": 6060
},
{
"epoch": 0.44958359907199436,
"grad_norm": 0.4351001977920532,
"learning_rate": 6.71710651916631e-05,
"loss": 3.0976,
"step": 6080
},
{
"epoch": 0.45106249248999436,
"grad_norm": 0.3884891867637634,
"learning_rate": 6.692839489406155e-05,
"loss": 3.0977,
"step": 6100
},
{
"epoch": 0.45254138590799436,
"grad_norm": 0.44683268666267395,
"learning_rate": 6.668527337120717e-05,
"loss": 3.0915,
"step": 6120
},
{
"epoch": 0.4540202793259943,
"grad_norm": 0.36208999156951904,
"learning_rate": 6.644170710348813e-05,
"loss": 3.1036,
"step": 6140
},
{
"epoch": 0.4554991727439943,
"grad_norm": 0.6256937384605408,
"learning_rate": 6.619770258314729e-05,
"loss": 3.0841,
"step": 6160
},
{
"epoch": 0.45697806616199427,
"grad_norm": 0.44526803493499756,
"learning_rate": 6.595326631410911e-05,
"loss": 3.0801,
"step": 6180
},
{
"epoch": 0.45845695957999427,
"grad_norm": 0.37642255425453186,
"learning_rate": 6.570840481180624e-05,
"loss": 3.0923,
"step": 6200
},
{
"epoch": 0.4599358529979943,
"grad_norm": 0.4022856056690216,
"learning_rate": 6.546312460300595e-05,
"loss": 3.0865,
"step": 6220
},
{
"epoch": 0.4614147464159942,
"grad_norm": 0.41262638568878174,
"learning_rate": 6.521743222563608e-05,
"loss": 3.0895,
"step": 6240
},
{
"epoch": 0.46289363983399423,
"grad_norm": 0.6894219517707825,
"learning_rate": 6.49713342286108e-05,
"loss": 3.0882,
"step": 6260
},
{
"epoch": 0.4643725332519942,
"grad_norm": 0.4044055938720703,
"learning_rate": 6.4724837171656e-05,
"loss": 3.0811,
"step": 6280
},
{
"epoch": 0.4658514266699942,
"grad_norm": 0.5523516535758972,
"learning_rate": 6.447794762513456e-05,
"loss": 3.0687,
"step": 6300
},
{
"epoch": 0.46733032008799413,
"grad_norm": 0.6067591309547424,
"learning_rate": 6.42306721698711e-05,
"loss": 3.0651,
"step": 6320
},
{
"epoch": 0.46880921350599414,
"grad_norm": 0.48093098402023315,
"learning_rate": 6.398301739697661e-05,
"loss": 3.0862,
"step": 6340
},
{
"epoch": 0.47028810692399414,
"grad_norm": 0.516197144985199,
"learning_rate": 6.373498990767281e-05,
"loss": 3.0879,
"step": 6360
},
{
"epoch": 0.4717670003419941,
"grad_norm": 0.4190840721130371,
"learning_rate": 6.348659631311608e-05,
"loss": 3.0786,
"step": 6380
},
{
"epoch": 0.4732458937599941,
"grad_norm": 0.42481333017349243,
"learning_rate": 6.32378432342214e-05,
"loss": 3.0701,
"step": 6400
},
{
"epoch": 0.47472478717799405,
"grad_norm": 0.5522997379302979,
"learning_rate": 6.29887373014857e-05,
"loss": 3.0722,
"step": 6420
},
{
"epoch": 0.47620368059599405,
"grad_norm": 0.3823126554489136,
"learning_rate": 6.27392851548112e-05,
"loss": 3.0722,
"step": 6440
},
{
"epoch": 0.47768257401399405,
"grad_norm": 0.38790881633758545,
"learning_rate": 6.248949344332853e-05,
"loss": 3.0726,
"step": 6460
},
{
"epoch": 0.479161467431994,
"grad_norm": 0.503336489200592,
"learning_rate": 6.223936882521935e-05,
"loss": 3.0652,
"step": 6480
},
{
"epoch": 0.480640360849994,
"grad_norm": 0.5279501080513,
"learning_rate": 6.198891796753885e-05,
"loss": 3.0771,
"step": 6500
},
{
"epoch": 0.48211925426799396,
"grad_norm": 0.4080502986907959,
"learning_rate": 6.17381475460382e-05,
"loss": 3.064,
"step": 6520
},
{
"epoch": 0.48359814768599396,
"grad_norm": 0.45085135102272034,
"learning_rate": 6.148706424498649e-05,
"loss": 3.0594,
"step": 6540
},
{
"epoch": 0.4850770411039939,
"grad_norm": 0.42239508032798767,
"learning_rate": 6.123567475699261e-05,
"loss": 3.064,
"step": 6560
},
{
"epoch": 0.4865559345219939,
"grad_norm": 0.43709495663642883,
"learning_rate": 6.098398578282682e-05,
"loss": 3.0563,
"step": 6580
},
{
"epoch": 0.4880348279399939,
"grad_norm": 0.6891195178031921,
"learning_rate": 6.073200403124222e-05,
"loss": 3.0594,
"step": 6600
},
{
"epoch": 0.48951372135799387,
"grad_norm": 0.37419646978378296,
"learning_rate": 6.047973621879577e-05,
"loss": 3.0448,
"step": 6620
},
{
"epoch": 0.4909926147759939,
"grad_norm": 0.3710575997829437,
"learning_rate": 6.0227189069669464e-05,
"loss": 3.0518,
"step": 6640
},
{
"epoch": 0.4924715081939938,
"grad_norm": 0.7165172696113586,
"learning_rate": 5.997436931549096e-05,
"loss": 3.0589,
"step": 6660
},
{
"epoch": 0.49395040161199383,
"grad_norm": 0.48645517230033875,
"learning_rate": 5.972128369515415e-05,
"loss": 3.0507,
"step": 6680
},
{
"epoch": 0.49542929502999383,
"grad_norm": 0.3613664507865906,
"learning_rate": 5.9467938954639624e-05,
"loss": 3.05,
"step": 6700
},
{
"epoch": 0.4969081884479938,
"grad_norm": 0.44066616892814636,
"learning_rate": 5.921434184683479e-05,
"loss": 3.0452,
"step": 6720
},
{
"epoch": 0.4983870818659938,
"grad_norm": 0.4224984049797058,
"learning_rate": 5.896049913135386e-05,
"loss": 3.0474,
"step": 6740
},
{
"epoch": 0.49986597528399374,
"grad_norm": 0.4076259434223175,
"learning_rate": 5.870641757435775e-05,
"loss": 3.0424,
"step": 6760
},
{
"epoch": 0.5013448687019937,
"grad_norm": 0.6098340153694153,
"learning_rate": 5.845210394837366e-05,
"loss": 3.0581,
"step": 6780
},
{
"epoch": 0.5028237621199937,
"grad_norm": 1.0002901554107666,
"learning_rate": 5.8197565032114533e-05,
"loss": 3.0335,
"step": 6800
},
{
"epoch": 0.5043026555379937,
"grad_norm": 0.4866860508918762,
"learning_rate": 5.7942807610298456e-05,
"loss": 3.0329,
"step": 6820
},
{
"epoch": 0.5057815489559937,
"grad_norm": 0.4324921667575836,
"learning_rate": 5.768783847346779e-05,
"loss": 3.0366,
"step": 6840
},
{
"epoch": 0.5072604423739937,
"grad_norm": 0.40503060817718506,
"learning_rate": 5.743266441780808e-05,
"loss": 3.0461,
"step": 6860
},
{
"epoch": 0.5087393357919936,
"grad_norm": 0.38576483726501465,
"learning_rate": 5.717729224496703e-05,
"loss": 3.0238,
"step": 6880
},
{
"epoch": 0.5102182292099936,
"grad_norm": 0.4007696211338043,
"learning_rate": 5.6921728761873086e-05,
"loss": 3.0221,
"step": 6900
},
{
"epoch": 0.5116971226279936,
"grad_norm": 0.4254515469074249,
"learning_rate": 5.6665980780554096e-05,
"loss": 3.0421,
"step": 6920
},
{
"epoch": 0.5131760160459936,
"grad_norm": 0.42919921875,
"learning_rate": 5.6410055117955695e-05,
"loss": 3.0435,
"step": 6940
},
{
"epoch": 0.5146549094639936,
"grad_norm": 0.45048367977142334,
"learning_rate": 5.615395859575958e-05,
"loss": 3.0331,
"step": 6960
},
{
"epoch": 0.5161338028819935,
"grad_norm": 0.3860481381416321,
"learning_rate": 5.589769804020173e-05,
"loss": 3.0255,
"step": 6980
},
{
"epoch": 0.5176126962999935,
"grad_norm": 0.3789386749267578,
"learning_rate": 5.5641280281890394e-05,
"loss": 3.0364,
"step": 7000
},
{
"epoch": 0.5190915897179935,
"grad_norm": 0.3918616473674774,
"learning_rate": 5.538471215562406e-05,
"loss": 3.0288,
"step": 7020
},
{
"epoch": 0.5205704831359935,
"grad_norm": 0.5674075484275818,
"learning_rate": 5.5128000500209254e-05,
"loss": 3.034,
"step": 7040
},
{
"epoch": 0.5220493765539935,
"grad_norm": 0.38289138674736023,
"learning_rate": 5.48711521582783e-05,
"loss": 3.0228,
"step": 7060
},
{
"epoch": 0.5235282699719934,
"grad_norm": 0.5652275681495667,
"learning_rate": 5.461417397610682e-05,
"loss": 3.0148,
"step": 7080
},
{
"epoch": 0.5250071633899934,
"grad_norm": 0.39682313799858093,
"learning_rate": 5.4357072803431396e-05,
"loss": 3.0168,
"step": 7100
},
{
"epoch": 0.5264860568079934,
"grad_norm": 0.5409131646156311,
"learning_rate": 5.4099855493266896e-05,
"loss": 3.0071,
"step": 7120
},
{
"epoch": 0.5279649502259934,
"grad_norm": 0.465202659368515,
"learning_rate": 5.3842528901723786e-05,
"loss": 3.0236,
"step": 7140
},
{
"epoch": 0.5294438436439933,
"grad_norm": 0.4230177104473114,
"learning_rate": 5.358509988782543e-05,
"loss": 3.0209,
"step": 7160
},
{
"epoch": 0.5309227370619933,
"grad_norm": 0.3867465555667877,
"learning_rate": 5.332757531332529e-05,
"loss": 3.0212,
"step": 7180
},
{
"epoch": 0.5324016304799933,
"grad_norm": 0.57347172498703,
"learning_rate": 5.306996204252397e-05,
"loss": 3.0197,
"step": 7200
},
{
"epoch": 0.5338805238979933,
"grad_norm": 0.45516273379325867,
"learning_rate": 5.2812266942086256e-05,
"loss": 3.0118,
"step": 7220
},
{
"epoch": 0.5353594173159933,
"grad_norm": 0.45842480659484863,
"learning_rate": 5.2554496880858106e-05,
"loss": 3.0229,
"step": 7240
},
{
"epoch": 0.5368383107339932,
"grad_norm": 0.4081624448299408,
"learning_rate": 5.2296658729683555e-05,
"loss": 3.0109,
"step": 7260
},
{
"epoch": 0.5383172041519932,
"grad_norm": 0.36024734377861023,
"learning_rate": 5.203875936122158e-05,
"loss": 3.007,
"step": 7280
},
{
"epoch": 0.5397960975699932,
"grad_norm": 0.5755016803741455,
"learning_rate": 5.178080564976287e-05,
"loss": 3.0073,
"step": 7300
},
{
"epoch": 0.5412749909879933,
"grad_norm": 0.4267408847808838,
"learning_rate": 5.152280447104665e-05,
"loss": 3.0077,
"step": 7320
},
{
"epoch": 0.5427538844059933,
"grad_norm": 0.4339446723461151,
"learning_rate": 5.126476270207739e-05,
"loss": 2.9991,
"step": 7340
},
{
"epoch": 0.5442327778239932,
"grad_norm": 0.3711448907852173,
"learning_rate": 5.1006687220941455e-05,
"loss": 3.0091,
"step": 7360
},
{
"epoch": 0.5457116712419932,
"grad_norm": 0.4235258996486664,
"learning_rate": 5.074858490662384e-05,
"loss": 3.0015,
"step": 7380
},
{
"epoch": 0.5471905646599932,
"grad_norm": 0.3901888430118561,
"learning_rate": 5.0490462638824764e-05,
"loss": 2.9862,
"step": 7400
},
{
"epoch": 0.5486694580779932,
"grad_norm": 0.40519407391548157,
"learning_rate": 5.023232729777628e-05,
"loss": 3.0052,
"step": 7420
},
{
"epoch": 0.5501483514959932,
"grad_norm": 0.5243799686431885,
"learning_rate": 4.997418576405896e-05,
"loss": 3.0002,
"step": 7440
},
{
"epoch": 0.5516272449139931,
"grad_norm": 0.444050133228302,
"learning_rate": 4.9716044918418414e-05,
"loss": 3.0037,
"step": 7460
},
{
"epoch": 0.5531061383319931,
"grad_norm": 0.3496316075325012,
"learning_rate": 4.945791164158188e-05,
"loss": 3.0084,
"step": 7480
},
{
"epoch": 0.5545850317499931,
"grad_norm": 0.5127915740013123,
"learning_rate": 4.9199792814074896e-05,
"loss": 2.9986,
"step": 7500
},
{
"epoch": 0.5560639251679931,
"grad_norm": 0.4601123332977295,
"learning_rate": 4.8941695316037865e-05,
"loss": 3.0057,
"step": 7520
},
{
"epoch": 0.5575428185859931,
"grad_norm": 0.48755237460136414,
"learning_rate": 4.868362602704258e-05,
"loss": 2.9809,
"step": 7540
},
{
"epoch": 0.559021712003993,
"grad_norm": 0.3724111318588257,
"learning_rate": 4.842559182590899e-05,
"loss": 2.9975,
"step": 7560
},
{
"epoch": 0.560500605421993,
"grad_norm": 0.46181684732437134,
"learning_rate": 4.816759959052177e-05,
"loss": 2.9781,
"step": 7580
},
{
"epoch": 0.561979498839993,
"grad_norm": 0.39748480916023254,
"learning_rate": 4.790965619764698e-05,
"loss": 2.9965,
"step": 7600
},
{
"epoch": 0.563458392257993,
"grad_norm": 0.5718439221382141,
"learning_rate": 4.76517685227488e-05,
"loss": 2.9806,
"step": 7620
},
{
"epoch": 0.5649372856759929,
"grad_norm": 0.5939317941665649,
"learning_rate": 4.7393943439806264e-05,
"loss": 2.9801,
"step": 7640
},
{
"epoch": 0.5664161790939929,
"grad_norm": 0.4281553626060486,
"learning_rate": 4.713618782112997e-05,
"loss": 2.9829,
"step": 7660
},
{
"epoch": 0.5678950725119929,
"grad_norm": 0.37646615505218506,
"learning_rate": 4.6878508537179015e-05,
"loss": 2.9829,
"step": 7680
},
{
"epoch": 0.5693739659299929,
"grad_norm": 0.4106582701206207,
"learning_rate": 4.662091245637777e-05,
"loss": 2.9694,
"step": 7700
},
{
"epoch": 0.5708528593479929,
"grad_norm": 0.3310515582561493,
"learning_rate": 4.6363406444932814e-05,
"loss": 2.9799,
"step": 7720
},
{
"epoch": 0.5723317527659928,
"grad_norm": 0.36721667647361755,
"learning_rate": 4.610599736664996e-05,
"loss": 2.9794,
"step": 7740
},
{
"epoch": 0.5738106461839928,
"grad_norm": 0.45474308729171753,
"learning_rate": 4.5848692082751296e-05,
"loss": 2.9848,
"step": 7760
},
{
"epoch": 0.5752895396019928,
"grad_norm": 0.6072131991386414,
"learning_rate": 4.559149745169218e-05,
"loss": 2.972,
"step": 7780
},
{
"epoch": 0.5767684330199928,
"grad_norm": 0.486600786447525,
"learning_rate": 4.533442032897864e-05,
"loss": 2.9602,
"step": 7800
},
{
"epoch": 0.5782473264379928,
"grad_norm": 0.4024549126625061,
"learning_rate": 4.5077467566984474e-05,
"loss": 2.9852,
"step": 7820
},
{
"epoch": 0.5797262198559927,
"grad_norm": 0.3547488749027252,
"learning_rate": 4.4820646014768644e-05,
"loss": 2.9794,
"step": 7840
},
{
"epoch": 0.5812051132739927,
"grad_norm": 0.38729000091552734,
"learning_rate": 4.456396251789274e-05,
"loss": 2.9822,
"step": 7860
},
{
"epoch": 0.5826840066919927,
"grad_norm": 0.35460221767425537,
"learning_rate": 4.430742391823853e-05,
"loss": 2.9768,
"step": 7880
},
{
"epoch": 0.5841629001099927,
"grad_norm": 0.3545529544353485,
"learning_rate": 4.405103705382547e-05,
"loss": 2.9681,
"step": 7900
},
{
"epoch": 0.5856417935279927,
"grad_norm": 0.3542696237564087,
"learning_rate": 4.379480875862859e-05,
"loss": 2.9748,
"step": 7920
},
{
"epoch": 0.5871206869459926,
"grad_norm": 0.34213724732398987,
"learning_rate": 4.3538745862396275e-05,
"loss": 2.969,
"step": 7940
},
{
"epoch": 0.5885995803639926,
"grad_norm": 0.35730448365211487,
"learning_rate": 4.328285519046815e-05,
"loss": 2.9627,
"step": 7960
},
{
"epoch": 0.5900784737819926,
"grad_norm": 0.4420771598815918,
"learning_rate": 4.302714356359327e-05,
"loss": 2.9781,
"step": 7980
},
{
"epoch": 0.5915573671999926,
"grad_norm": 0.47289857268333435,
"learning_rate": 4.2771617797748256e-05,
"loss": 2.9637,
"step": 8000
},
{
"epoch": 0.5930362606179926,
"grad_norm": 0.4006676971912384,
"learning_rate": 4.251628470395556e-05,
"loss": 2.9721,
"step": 8020
},
{
"epoch": 0.5945151540359925,
"grad_norm": 0.39483192563056946,
"learning_rate": 4.226115108810201e-05,
"loss": 2.9607,
"step": 8040
},
{
"epoch": 0.5959940474539925,
"grad_norm": 0.49096304178237915,
"learning_rate": 4.20062237507574e-05,
"loss": 2.9567,
"step": 8060
},
{
"epoch": 0.5974729408719925,
"grad_norm": 0.373417466878891,
"learning_rate": 4.175150948699311e-05,
"loss": 2.965,
"step": 8080
},
{
"epoch": 0.5989518342899925,
"grad_norm": 0.33696213364601135,
"learning_rate": 4.149701508620109e-05,
"loss": 2.9636,
"step": 8100
},
{
"epoch": 0.6004307277079924,
"grad_norm": 0.5063782930374146,
"learning_rate": 4.124274733191291e-05,
"loss": 2.9737,
"step": 8120
},
{
"epoch": 0.6019096211259924,
"grad_norm": 0.39363813400268555,
"learning_rate": 4.098871300161878e-05,
"loss": 2.9516,
"step": 8140
},
{
"epoch": 0.6033885145439924,
"grad_norm": 0.3740212023258209,
"learning_rate": 4.07349188665871e-05,
"loss": 2.9472,
"step": 8160
},
{
"epoch": 0.6048674079619925,
"grad_norm": 0.42378878593444824,
"learning_rate": 4.048137169168385e-05,
"loss": 2.9684,
"step": 8180
},
{
"epoch": 0.6063463013799925,
"grad_norm": 0.4358353614807129,
"learning_rate": 4.02280782351923e-05,
"loss": 2.9643,
"step": 8200
},
{
"epoch": 0.6078251947979924,
"grad_norm": 0.35567548871040344,
"learning_rate": 3.997504524863291e-05,
"loss": 2.9435,
"step": 8220
},
{
"epoch": 0.6093040882159924,
"grad_norm": 0.3486579358577728,
"learning_rate": 3.972227947658325e-05,
"loss": 2.9605,
"step": 8240
},
{
"epoch": 0.6107829816339924,
"grad_norm": 0.42745381593704224,
"learning_rate": 3.946978765649838e-05,
"loss": 2.9481,
"step": 8260
},
{
"epoch": 0.6122618750519924,
"grad_norm": 0.4889651834964752,
"learning_rate": 3.921757651853117e-05,
"loss": 2.9492,
"step": 8280
},
{
"epoch": 0.6137407684699924,
"grad_norm": 0.44278714060783386,
"learning_rate": 3.896565278535291e-05,
"loss": 2.9578,
"step": 8300
},
{
"epoch": 0.6152196618879923,
"grad_norm": 0.42498791217803955,
"learning_rate": 3.8714023171974135e-05,
"loss": 2.9439,
"step": 8320
},
{
"epoch": 0.6166985553059923,
"grad_norm": 0.36626169085502625,
"learning_rate": 3.846269438556568e-05,
"loss": 2.9549,
"step": 8340
},
{
"epoch": 0.6181774487239923,
"grad_norm": 0.369567334651947,
"learning_rate": 3.8211673125279776e-05,
"loss": 2.947,
"step": 8360
},
{
"epoch": 0.6196563421419923,
"grad_norm": 0.43409767746925354,
"learning_rate": 3.7960966082071636e-05,
"loss": 2.9363,
"step": 8380
},
{
"epoch": 0.6211352355599923,
"grad_norm": 0.4202839434146881,
"learning_rate": 3.771057993852101e-05,
"loss": 2.9501,
"step": 8400
},
{
"epoch": 0.6226141289779922,
"grad_norm": 0.3709544241428375,
"learning_rate": 3.746052136865409e-05,
"loss": 2.9452,
"step": 8420
},
{
"epoch": 0.6240930223959922,
"grad_norm": 0.3776955008506775,
"learning_rate": 3.721079703776561e-05,
"loss": 2.9249,
"step": 8440
},
{
"epoch": 0.6255719158139922,
"grad_norm": 0.41565999388694763,
"learning_rate": 3.6961413602241215e-05,
"loss": 2.9304,
"step": 8460
},
{
"epoch": 0.6270508092319922,
"grad_norm": 0.3948330581188202,
"learning_rate": 3.6712377709379944e-05,
"loss": 2.9371,
"step": 8480
},
{
"epoch": 0.6285297026499922,
"grad_norm": 0.3861006498336792,
"learning_rate": 3.646369599721716e-05,
"loss": 2.9399,
"step": 8500
},
{
"epoch": 0.6300085960679921,
"grad_norm": 0.3641924560070038,
"learning_rate": 3.621537509434757e-05,
"loss": 2.9283,
"step": 8520
},
{
"epoch": 0.6314874894859921,
"grad_norm": 0.4140797555446625,
"learning_rate": 3.596742161974848e-05,
"loss": 2.9321,
"step": 8540
},
{
"epoch": 0.6329663829039921,
"grad_norm": 0.40179234743118286,
"learning_rate": 3.571984218260348e-05,
"loss": 2.9439,
"step": 8560
},
{
"epoch": 0.6344452763219921,
"grad_norm": 0.4169887602329254,
"learning_rate": 3.547264338212619e-05,
"loss": 2.9299,
"step": 8580
},
{
"epoch": 0.635924169739992,
"grad_norm": 0.4229363203048706,
"learning_rate": 3.522583180738436e-05,
"loss": 2.927,
"step": 8600
},
{
"epoch": 0.637403063157992,
"grad_norm": 0.33680644631385803,
"learning_rate": 3.497941403712429e-05,
"loss": 2.9373,
"step": 8620
},
{
"epoch": 0.638881956575992,
"grad_norm": 0.39601895213127136,
"learning_rate": 3.473339663959547e-05,
"loss": 2.9363,
"step": 8640
},
{
"epoch": 0.640360849993992,
"grad_norm": 0.356684148311615,
"learning_rate": 3.448778617237543e-05,
"loss": 2.9275,
"step": 8660
},
{
"epoch": 0.641839743411992,
"grad_norm": 0.37500935792922974,
"learning_rate": 3.424258918219503e-05,
"loss": 2.9224,
"step": 8680
},
{
"epoch": 0.6433186368299919,
"grad_norm": 0.3620283901691437,
"learning_rate": 3.399781220476394e-05,
"loss": 2.9294,
"step": 8700
},
{
"epoch": 0.6447975302479919,
"grad_norm": 0.3849022090435028,
"learning_rate": 3.3753461764596375e-05,
"loss": 2.9332,
"step": 8720
},
{
"epoch": 0.6462764236659919,
"grad_norm": 0.598598837852478,
"learning_rate": 3.350954437483725e-05,
"loss": 2.9268,
"step": 8740
},
{
"epoch": 0.6477553170839919,
"grad_norm": 0.42141565680503845,
"learning_rate": 3.326606653708857e-05,
"loss": 2.926,
"step": 8760
},
{
"epoch": 0.6492342105019919,
"grad_norm": 0.39355704188346863,
"learning_rate": 3.302303474123608e-05,
"loss": 2.9302,
"step": 8780
},
{
"epoch": 0.6507131039199918,
"grad_norm": 0.3644985258579254,
"learning_rate": 3.278045546527633e-05,
"loss": 2.9178,
"step": 8800
},
{
"epoch": 0.6521919973379918,
"grad_norm": 0.3427523672580719,
"learning_rate": 3.253833517514397e-05,
"loss": 2.9291,
"step": 8820
},
{
"epoch": 0.6536708907559918,
"grad_norm": 0.433736652135849,
"learning_rate": 3.22966803245394e-05,
"loss": 2.914,
"step": 8840
},
{
"epoch": 0.6551497841739918,
"grad_norm": 0.38325321674346924,
"learning_rate": 3.205549735475677e-05,
"loss": 2.9242,
"step": 8860
},
{
"epoch": 0.6566286775919918,
"grad_norm": 0.4170295000076294,
"learning_rate": 3.181479269451231e-05,
"loss": 2.9175,
"step": 8880
},
{
"epoch": 0.6581075710099917,
"grad_norm": 0.4253075420856476,
"learning_rate": 3.1574572759772885e-05,
"loss": 2.9211,
"step": 8900
},
{
"epoch": 0.6595864644279917,
"grad_norm": 0.38273829221725464,
"learning_rate": 3.133484395358507e-05,
"loss": 2.914,
"step": 8920
},
{
"epoch": 0.6610653578459917,
"grad_norm": 0.3915143609046936,
"learning_rate": 3.109561266590445e-05,
"loss": 2.9207,
"step": 8940
},
{
"epoch": 0.6625442512639917,
"grad_norm": 0.37426161766052246,
"learning_rate": 3.085688527342524e-05,
"loss": 2.927,
"step": 8960
},
{
"epoch": 0.6640231446819918,
"grad_norm": 0.34895965456962585,
"learning_rate": 3.06186681394104e-05,
"loss": 2.9157,
"step": 8980
},
{
"epoch": 0.6655020380999916,
"grad_norm": 0.3564130663871765,
"learning_rate": 3.038096761352199e-05,
"loss": 2.9178,
"step": 9000
},
{
"epoch": 0.6669809315179916,
"grad_norm": 0.3817369043827057,
"learning_rate": 3.0143790031651863e-05,
"loss": 2.9252,
"step": 9020
},
{
"epoch": 0.6684598249359917,
"grad_norm": 0.37359967827796936,
"learning_rate": 2.9907141715752906e-05,
"loss": 2.9134,
"step": 9040
},
{
"epoch": 0.6699387183539917,
"grad_norm": 0.3740251660346985,
"learning_rate": 2.9671028973670418e-05,
"loss": 2.9175,
"step": 9060
},
{
"epoch": 0.6714176117719917,
"grad_norm": 0.3896474242210388,
"learning_rate": 2.943545809897398e-05,
"loss": 2.9153,
"step": 9080
},
{
"epoch": 0.6728965051899916,
"grad_norm": 0.4986639618873596,
"learning_rate": 2.9200435370789792e-05,
"loss": 2.9215,
"step": 9100
},
{
"epoch": 0.6743753986079916,
"grad_norm": 0.3836432099342346,
"learning_rate": 2.8965967053633225e-05,
"loss": 2.9123,
"step": 9120
},
{
"epoch": 0.6758542920259916,
"grad_norm": 0.3539137840270996,
"learning_rate": 2.873205939724185e-05,
"loss": 2.9172,
"step": 9140
},
{
"epoch": 0.6773331854439916,
"grad_norm": 0.4474085569381714,
"learning_rate": 2.8498718636408862e-05,
"loss": 2.9126,
"step": 9160
},
{
"epoch": 0.6788120788619915,
"grad_norm": 0.3727508783340454,
"learning_rate": 2.8265950990816926e-05,
"loss": 2.9136,
"step": 9180
},
{
"epoch": 0.6802909722799915,
"grad_norm": 0.3365872800350189,
"learning_rate": 2.8033762664872293e-05,
"loss": 2.9074,
"step": 9200
},
{
"epoch": 0.6817698656979915,
"grad_norm": 0.3774373233318329,
"learning_rate": 2.7802159847539545e-05,
"loss": 2.9078,
"step": 9220
},
{
"epoch": 0.6832487591159915,
"grad_norm": 0.34899139404296875,
"learning_rate": 2.757114871217656e-05,
"loss": 2.9117,
"step": 9240
},
{
"epoch": 0.6847276525339915,
"grad_norm": 0.3489275276660919,
"learning_rate": 2.7340735416369934e-05,
"loss": 2.9,
"step": 9260
},
{
"epoch": 0.6862065459519914,
"grad_norm": 0.3772989511489868,
"learning_rate": 2.7110926101770927e-05,
"loss": 2.8968,
"step": 9280
},
{
"epoch": 0.6876854393699914,
"grad_norm": 0.3743598461151123,
"learning_rate": 2.688172689393172e-05,
"loss": 2.8978,
"step": 9300
},
{
"epoch": 0.6891643327879914,
"grad_norm": 0.3543947637081146,
"learning_rate": 2.665314390214212e-05,
"loss": 2.9029,
"step": 9320
},
{
"epoch": 0.6906432262059914,
"grad_norm": 0.3778015673160553,
"learning_rate": 2.6425183219266746e-05,
"loss": 2.8875,
"step": 9340
},
{
"epoch": 0.6921221196239914,
"grad_norm": 0.3994954824447632,
"learning_rate": 2.6197850921582633e-05,
"loss": 2.8988,
"step": 9360
},
{
"epoch": 0.6936010130419913,
"grad_norm": 0.4375861883163452,
"learning_rate": 2.5971153068617195e-05,
"loss": 2.8888,
"step": 9380
},
{
"epoch": 0.6950799064599913,
"grad_norm": 0.3965347111225128,
"learning_rate": 2.57450957029868e-05,
"loss": 2.896,
"step": 9400
},
{
"epoch": 0.6965587998779913,
"grad_norm": 0.3397294580936432,
"learning_rate": 2.5519684850235703e-05,
"loss": 2.8979,
"step": 9420
},
{
"epoch": 0.6980376932959913,
"grad_norm": 0.38435131311416626,
"learning_rate": 2.529492651867531e-05,
"loss": 2.8914,
"step": 9440
},
{
"epoch": 0.6995165867139913,
"grad_norm": 0.4583021402359009,
"learning_rate": 2.5070826699224202e-05,
"loss": 2.8994,
"step": 9460
},
{
"epoch": 0.7009954801319912,
"grad_norm": 0.35780495405197144,
"learning_rate": 2.4847391365248346e-05,
"loss": 2.904,
"step": 9480
},
{
"epoch": 0.7024743735499912,
"grad_norm": 0.48425179719924927,
"learning_rate": 2.4624626472401834e-05,
"loss": 2.8902,
"step": 9500
},
{
"epoch": 0.7039532669679912,
"grad_norm": 0.34029942750930786,
"learning_rate": 2.440253795846827e-05,
"loss": 2.8964,
"step": 9520
},
{
"epoch": 0.7054321603859912,
"grad_norm": 0.33855918049812317,
"learning_rate": 2.4181131743202377e-05,
"loss": 2.8917,
"step": 9540
},
{
"epoch": 0.7069110538039912,
"grad_norm": 0.3716065287590027,
"learning_rate": 2.3960413728172277e-05,
"loss": 2.9,
"step": 9560
},
{
"epoch": 0.7083899472219911,
"grad_norm": 0.3275023102760315,
"learning_rate": 2.374038979660214e-05,
"loss": 2.9032,
"step": 9580
},
{
"epoch": 0.7098688406399911,
"grad_norm": 0.3434765040874481,
"learning_rate": 2.352106581321542e-05,
"loss": 2.8992,
"step": 9600
},
{
"epoch": 0.7113477340579911,
"grad_norm": 0.3282793462276459,
"learning_rate": 2.3302447624078427e-05,
"loss": 2.8918,
"step": 9620
},
{
"epoch": 0.7128266274759911,
"grad_norm": 0.4167431890964508,
"learning_rate": 2.3084541056444654e-05,
"loss": 2.8844,
"step": 9640
},
{
"epoch": 0.714305520893991,
"grad_norm": 0.3788709342479706,
"learning_rate": 2.2867351918599333e-05,
"loss": 2.8737,
"step": 9660
},
{
"epoch": 0.715784414311991,
"grad_norm": 0.32435911893844604,
"learning_rate": 2.2650885999704628e-05,
"loss": 2.8946,
"step": 9680
},
{
"epoch": 0.717263307729991,
"grad_norm": 0.37471237778663635,
"learning_rate": 2.243514906964539e-05,
"loss": 2.8935,
"step": 9700
},
{
"epoch": 0.718742201147991,
"grad_norm": 0.3652307093143463,
"learning_rate": 2.222014687887532e-05,
"loss": 2.8767,
"step": 9720
},
{
"epoch": 0.720221094565991,
"grad_norm": 0.37537747621536255,
"learning_rate": 2.2005885158263645e-05,
"loss": 2.8802,
"step": 9740
},
{
"epoch": 0.7216999879839909,
"grad_norm": 0.40164393186569214,
"learning_rate": 2.1792369618942455e-05,
"loss": 2.881,
"step": 9760
},
{
"epoch": 0.7231788814019909,
"grad_norm": 0.35087114572525024,
"learning_rate": 2.1579605952154435e-05,
"loss": 2.8904,
"step": 9780
},
{
"epoch": 0.7246577748199909,
"grad_norm": 0.4332689046859741,
"learning_rate": 2.136759982910107e-05,
"loss": 2.8778,
"step": 9800
},
{
"epoch": 0.726136668237991,
"grad_norm": 0.34787076711654663,
"learning_rate": 2.1156356900791695e-05,
"loss": 2.8845,
"step": 9820
},
{
"epoch": 0.727615561655991,
"grad_norm": 0.37883126735687256,
"learning_rate": 2.0945882797892673e-05,
"loss": 2.8876,
"step": 9840
},
{
"epoch": 0.7290944550739908,
"grad_norm": 0.3691736161708832,
"learning_rate": 2.0736183130577335e-05,
"loss": 2.8887,
"step": 9860
},
{
"epoch": 0.7305733484919908,
"grad_norm": 0.31982922554016113,
"learning_rate": 2.0527263488376552e-05,
"loss": 2.8815,
"step": 9880
},
{
"epoch": 0.7320522419099909,
"grad_norm": 0.3566115200519562,
"learning_rate": 2.031912944002966e-05,
"loss": 2.8884,
"step": 9900
},
{
"epoch": 0.7335311353279909,
"grad_norm": 0.33468520641326904,
"learning_rate": 2.0111786533336e-05,
"loss": 2.8818,
"step": 9920
},
{
"epoch": 0.7350100287459909,
"grad_norm": 0.3208761513233185,
"learning_rate": 1.9905240295007145e-05,
"loss": 2.8803,
"step": 9940
},
{
"epoch": 0.7364889221639908,
"grad_norm": 0.34477704763412476,
"learning_rate": 1.9699496230519497e-05,
"loss": 2.8917,
"step": 9960
},
{
"epoch": 0.7379678155819908,
"grad_norm": 0.37035301327705383,
"learning_rate": 1.949455982396755e-05,
"loss": 2.8786,
"step": 9980
},
{
"epoch": 0.7394467089999908,
"grad_norm": 0.3365253210067749,
"learning_rate": 1.929043653791775e-05,
"loss": 2.8675,
"step": 10000
},
{
"epoch": 0.7409256024179908,
"grad_norm": 0.3333218991756439,
"learning_rate": 1.9087131813262886e-05,
"loss": 2.8687,
"step": 10020
},
{
"epoch": 0.7424044958359908,
"grad_norm": 0.3710993230342865,
"learning_rate": 1.8884651069076992e-05,
"loss": 2.8718,
"step": 10040
},
{
"epoch": 0.7438833892539907,
"grad_norm": 0.36842554807662964,
"learning_rate": 1.8682999702471014e-05,
"loss": 2.8631,
"step": 10060
},
{
"epoch": 0.7453622826719907,
"grad_norm": 0.35305920243263245,
"learning_rate": 1.8482183088448862e-05,
"loss": 2.8708,
"step": 10080
},
{
"epoch": 0.7468411760899907,
"grad_norm": 0.3375717103481293,
"learning_rate": 1.828220657976419e-05,
"loss": 2.8817,
"step": 10100
},
{
"epoch": 0.7483200695079907,
"grad_norm": 0.37821289896965027,
"learning_rate": 1.8083075506777676e-05,
"loss": 2.8787,
"step": 10120
},
{
"epoch": 0.7497989629259906,
"grad_norm": 0.3393423557281494,
"learning_rate": 1.7884795177314995e-05,
"loss": 2.8681,
"step": 10140
},
{
"epoch": 0.7512778563439906,
"grad_norm": 0.35140156745910645,
"learning_rate": 1.7687370876525273e-05,
"loss": 2.8742,
"step": 10160
},
{
"epoch": 0.7527567497619906,
"grad_norm": 0.3378312587738037,
"learning_rate": 1.7490807866740268e-05,
"loss": 2.8736,
"step": 10180
},
{
"epoch": 0.7542356431799906,
"grad_norm": 0.37517204880714417,
"learning_rate": 1.7295111387334103e-05,
"loss": 2.8623,
"step": 10200
},
{
"epoch": 0.7557145365979906,
"grad_norm": 0.3355712890625,
"learning_rate": 1.7100286654583543e-05,
"loss": 2.8721,
"step": 10220
},
{
"epoch": 0.7571934300159905,
"grad_norm": 0.3331904411315918,
"learning_rate": 1.690633886152903e-05,
"loss": 2.8701,
"step": 10240
},
{
"epoch": 0.7586723234339905,
"grad_norm": 0.34373047947883606,
"learning_rate": 1.6713273177836276e-05,
"loss": 2.8718,
"step": 10260
},
{
"epoch": 0.7601512168519905,
"grad_norm": 0.3202342987060547,
"learning_rate": 1.6521094749658328e-05,
"loss": 2.8658,
"step": 10280
},
{
"epoch": 0.7616301102699905,
"grad_norm": 0.33778509497642517,
"learning_rate": 1.6329808699498588e-05,
"loss": 2.8786,
"step": 10300
},
{
"epoch": 0.7631090036879905,
"grad_norm": 0.33873429894447327,
"learning_rate": 1.613942012607414e-05,
"loss": 2.8731,
"step": 10320
},
{
"epoch": 0.7645878971059904,
"grad_norm": 0.3424777090549469,
"learning_rate": 1.5949934104179887e-05,
"loss": 2.8715,
"step": 10340
},
{
"epoch": 0.7660667905239904,
"grad_norm": 0.33158713579177856,
"learning_rate": 1.5761355684553286e-05,
"loss": 2.8545,
"step": 10360
},
{
"epoch": 0.7675456839419904,
"grad_norm": 0.3395291566848755,
"learning_rate": 1.557368989373973e-05,
"loss": 2.8533,
"step": 10380
},
{
"epoch": 0.7690245773599904,
"grad_norm": 0.31933024525642395,
"learning_rate": 1.5386941733958503e-05,
"loss": 2.8651,
"step": 10400
},
{
"epoch": 0.7705034707779904,
"grad_norm": 0.3164694309234619,
"learning_rate": 1.5201116182969538e-05,
"loss": 2.8773,
"step": 10420
},
{
"epoch": 0.7719823641959903,
"grad_norm": 0.35544392466545105,
"learning_rate": 1.50162181939407e-05,
"loss": 2.859,
"step": 10440
},
{
"epoch": 0.7734612576139903,
"grad_norm": 0.3556651175022125,
"learning_rate": 1.4832252695315691e-05,
"loss": 2.8463,
"step": 10460
},
{
"epoch": 0.7749401510319903,
"grad_norm": 0.335028737783432,
"learning_rate": 1.4649224590682802e-05,
"loss": 2.8635,
"step": 10480
},
{
"epoch": 0.7764190444499903,
"grad_norm": 0.4239474833011627,
"learning_rate": 1.4467138758644139e-05,
"loss": 2.8493,
"step": 10500
},
{
"epoch": 0.7778979378679903,
"grad_norm": 0.3199774920940399,
"learning_rate": 1.4286000052685556e-05,
"loss": 2.8687,
"step": 10520
},
{
"epoch": 0.7793768312859902,
"grad_norm": 0.3779512643814087,
"learning_rate": 1.4105813301047366e-05,
"loss": 2.8518,
"step": 10540
},
{
"epoch": 0.7808557247039902,
"grad_norm": 0.3382132649421692,
"learning_rate": 1.3926583306595581e-05,
"loss": 2.8572,
"step": 10560
},
{
"epoch": 0.7823346181219902,
"grad_norm": 0.3185078203678131,
"learning_rate": 1.374831484669392e-05,
"loss": 2.8607,
"step": 10580
},
{
"epoch": 0.7838135115399902,
"grad_norm": 0.35780152678489685,
"learning_rate": 1.3571012673076472e-05,
"loss": 2.8564,
"step": 10600
},
{
"epoch": 0.7852924049579901,
"grad_norm": 0.3039771616458893,
"learning_rate": 1.3394681511721013e-05,
"loss": 2.8587,
"step": 10620
},
{
"epoch": 0.7867712983759901,
"grad_norm": 0.3119048774242401,
"learning_rate": 1.3219326062723042e-05,
"loss": 2.864,
"step": 10640
},
{
"epoch": 0.7882501917939901,
"grad_norm": 0.3685562312602997,
"learning_rate": 1.304495100017053e-05,
"loss": 2.8551,
"step": 10660
},
{
"epoch": 0.7897290852119901,
"grad_norm": 0.32328301668167114,
"learning_rate": 1.2871560972019314e-05,
"loss": 2.8537,
"step": 10680
},
{
"epoch": 0.7912079786299901,
"grad_norm": 0.32044264674186707,
"learning_rate": 1.2699160599969174e-05,
"loss": 2.8647,
"step": 10700
},
{
"epoch": 0.79268687204799,
"grad_norm": 0.39615657925605774,
"learning_rate": 1.2527754479340703e-05,
"loss": 2.8558,
"step": 10720
},
{
"epoch": 0.79416576546599,
"grad_norm": 0.31399622559547424,
"learning_rate": 1.2357347178952788e-05,
"loss": 2.8582,
"step": 10740
},
{
"epoch": 0.79564465888399,
"grad_norm": 0.33324578404426575,
"learning_rate": 1.2187943241000794e-05,
"loss": 2.8447,
"step": 10760
},
{
"epoch": 0.7971235523019901,
"grad_norm": 0.32412442564964294,
"learning_rate": 1.2019547180935552e-05,
"loss": 2.842,
"step": 10780
},
{
"epoch": 0.7986024457199901,
"grad_norm": 0.3198014795780182,
"learning_rate": 1.1852163487342981e-05,
"loss": 2.8594,
"step": 10800
},
{
"epoch": 0.80008133913799,
"grad_norm": 0.3332209289073944,
"learning_rate": 1.1685796621824423e-05,
"loss": 2.8542,
"step": 10820
},
{
"epoch": 0.80156023255599,
"grad_norm": 0.3251478374004364,
"learning_rate": 1.1520451018877742e-05,
"loss": 2.8623,
"step": 10840
},
{
"epoch": 0.80303912597399,
"grad_norm": 0.3332981765270233,
"learning_rate": 1.1356131085779131e-05,
"loss": 2.8566,
"step": 10860
},
{
"epoch": 0.80451801939199,
"grad_norm": 0.30493640899658203,
"learning_rate": 1.1192841202465565e-05,
"loss": 2.8596,
"step": 10880
},
{
"epoch": 0.80599691280999,
"grad_norm": 0.3335663974285126,
"learning_rate": 1.1030585721418174e-05,
"loss": 2.854,
"step": 10900
},
{
"epoch": 0.8074758062279899,
"grad_norm": 0.3442290127277374,
"learning_rate": 1.0869368967546134e-05,
"loss": 2.8471,
"step": 10920
},
{
"epoch": 0.8089546996459899,
"grad_norm": 0.3200606107711792,
"learning_rate": 1.0709195238071407e-05,
"loss": 2.8553,
"step": 10940
},
{
"epoch": 0.8104335930639899,
"grad_norm": 0.30462324619293213,
"learning_rate": 1.0550068802414231e-05,
"loss": 2.8487,
"step": 10960
},
{
"epoch": 0.8119124864819899,
"grad_norm": 0.3395856022834778,
"learning_rate": 1.0391993902079295e-05,
"loss": 2.8472,
"step": 10980
},
{
"epoch": 0.8133913798999899,
"grad_norm": 0.3614775836467743,
"learning_rate": 1.0234974750542647e-05,
"loss": 2.8427,
"step": 11000
},
{
"epoch": 0.8148702733179898,
"grad_norm": 0.3020230829715729,
"learning_rate": 1.0079015533139463e-05,
"loss": 2.8606,
"step": 11020
},
{
"epoch": 0.8163491667359898,
"grad_norm": 0.32456544041633606,
"learning_rate": 9.924120406952431e-06,
"loss": 2.8508,
"step": 11040
},
{
"epoch": 0.8178280601539898,
"grad_norm": 0.3214119076728821,
"learning_rate": 9.77029350070095e-06,
"loss": 2.8391,
"step": 11060
},
{
"epoch": 0.8193069535719898,
"grad_norm": 0.3201681077480316,
"learning_rate": 9.61753891463109e-06,
"loss": 2.8532,
"step": 11080
},
{
"epoch": 0.8207858469899897,
"grad_norm": 0.323337584733963,
"learning_rate": 9.465860720406327e-06,
"loss": 2.8499,
"step": 11100
},
{
"epoch": 0.8222647404079897,
"grad_norm": 0.31912675499916077,
"learning_rate": 9.315262960998911e-06,
"loss": 2.852,
"step": 11120
},
{
"epoch": 0.8237436338259897,
"grad_norm": 0.31801870465278625,
"learning_rate": 9.165749650582239e-06,
"loss": 2.8373,
"step": 11140
},
{
"epoch": 0.8252225272439897,
"grad_norm": 0.3083365559577942,
"learning_rate": 9.017324774423785e-06,
"loss": 2.8565,
"step": 11160
},
{
"epoch": 0.8267014206619897,
"grad_norm": 0.34097760915756226,
"learning_rate": 8.869992288778834e-06,
"loss": 2.8389,
"step": 11180
},
{
"epoch": 0.8281803140799896,
"grad_norm": 0.32595744729042053,
"learning_rate": 8.72375612078511e-06,
"loss": 2.8588,
"step": 11200
},
{
"epoch": 0.8296592074979896,
"grad_norm": 0.3241618275642395,
"learning_rate": 8.578620168358082e-06,
"loss": 2.8527,
"step": 11220
},
{
"epoch": 0.8311381009159896,
"grad_norm": 0.31303274631500244,
"learning_rate": 8.434588300086988e-06,
"loss": 2.8326,
"step": 11240
},
{
"epoch": 0.8326169943339896,
"grad_norm": 0.3417539596557617,
"learning_rate": 8.291664355131818e-06,
"loss": 2.8477,
"step": 11260
},
{
"epoch": 0.8340958877519896,
"grad_norm": 0.3075898289680481,
"learning_rate": 8.149852143120923e-06,
"loss": 2.8353,
"step": 11280
},
{
"epoch": 0.8355747811699895,
"grad_norm": 0.32699164748191833,
"learning_rate": 8.009155444049499e-06,
"loss": 2.8432,
"step": 11300
},
{
"epoch": 0.8370536745879895,
"grad_norm": 0.29232412576675415,
"learning_rate": 7.869578008178808e-06,
"loss": 2.8538,
"step": 11320
},
{
"epoch": 0.8385325680059895,
"grad_norm": 0.2949979901313782,
"learning_rate": 7.731123555936232e-06,
"loss": 2.8494,
"step": 11340
},
{
"epoch": 0.8400114614239895,
"grad_norm": 0.2993783950805664,
"learning_rate": 7.593795777816071e-06,
"loss": 2.8439,
"step": 11360
},
{
"epoch": 0.8414903548419895,
"grad_norm": 0.31987783312797546,
"learning_rate": 7.457598334281235e-06,
"loss": 2.8364,
"step": 11380
},
{
"epoch": 0.8429692482599894,
"grad_norm": 0.3066832721233368,
"learning_rate": 7.322534855665636e-06,
"loss": 2.8414,
"step": 11400
},
{
"epoch": 0.8444481416779894,
"grad_norm": 0.3674749433994293,
"learning_rate": 7.1886089420773965e-06,
"loss": 2.8346,
"step": 11420
},
{
"epoch": 0.8459270350959894,
"grad_norm": 0.3142234981060028,
"learning_rate": 7.055824163302943e-06,
"loss": 2.8478,
"step": 11440
},
{
"epoch": 0.8474059285139894,
"grad_norm": 0.30251550674438477,
"learning_rate": 6.924184058711836e-06,
"loss": 2.8447,
"step": 11460
},
{
"epoch": 0.8488848219319894,
"grad_norm": 0.35557475686073303,
"learning_rate": 6.7936921371623885e-06,
"loss": 2.8387,
"step": 11480
},
{
"epoch": 0.8503637153499893,
"grad_norm": 0.2999821901321411,
"learning_rate": 6.6643518769082036e-06,
"loss": 2.8484,
"step": 11500
},
{
"epoch": 0.8518426087679893,
"grad_norm": 0.29102715849876404,
"learning_rate": 6.536166725505405e-06,
"loss": 2.8418,
"step": 11520
},
{
"epoch": 0.8533215021859893,
"grad_norm": 0.3709971606731415,
"learning_rate": 6.4091400997207785e-06,
"loss": 2.8393,
"step": 11540
},
{
"epoch": 0.8548003956039893,
"grad_norm": 0.3058640658855438,
"learning_rate": 6.2832753854406846e-06,
"loss": 2.8428,
"step": 11560
},
{
"epoch": 0.8562792890219892,
"grad_norm": 0.2915048599243164,
"learning_rate": 6.158575937580818e-06,
"loss": 2.8446,
"step": 11580
},
{
"epoch": 0.8577581824399892,
"grad_norm": 0.31149548292160034,
"learning_rate": 6.035045079996743e-06,
"loss": 2.8438,
"step": 11600
},
{
"epoch": 0.8592370758579893,
"grad_norm": 0.2985529601573944,
"learning_rate": 5.9126861053953595e-06,
"loss": 2.8246,
"step": 11620
},
{
"epoch": 0.8607159692759893,
"grad_norm": 0.33099082112312317,
"learning_rate": 5.791502275247079e-06,
"loss": 2.8412,
"step": 11640
},
{
"epoch": 0.8621948626939893,
"grad_norm": 0.28865981101989746,
"learning_rate": 5.6714968196989295e-06,
"loss": 2.8299,
"step": 11660
},
{
"epoch": 0.8636737561119892,
"grad_norm": 0.34115445613861084,
"learning_rate": 5.5526729374884456e-06,
"loss": 2.8368,
"step": 11680
},
{
"epoch": 0.8651526495299892,
"grad_norm": 0.3019537925720215,
"learning_rate": 5.435033795858385e-06,
"loss": 2.8424,
"step": 11700
},
{
"epoch": 0.8666315429479892,
"grad_norm": 0.2919292449951172,
"learning_rate": 5.318582530472338e-06,
"loss": 2.8449,
"step": 11720
},
{
"epoch": 0.8681104363659892,
"grad_norm": 0.2975643575191498,
"learning_rate": 5.203322245331127e-06,
"loss": 2.8484,
"step": 11740
},
{
"epoch": 0.8695893297839892,
"grad_norm": 0.30803442001342773,
"learning_rate": 5.089256012690069e-06,
"loss": 2.839,
"step": 11760
},
{
"epoch": 0.8710682232019891,
"grad_norm": 0.3415025770664215,
"learning_rate": 4.976386872977107e-06,
"loss": 2.8406,
"step": 11780
},
{
"epoch": 0.8725471166199891,
"grad_norm": 0.3077727258205414,
"learning_rate": 4.864717834711735e-06,
"loss": 2.8262,
"step": 11800
},
{
"epoch": 0.8740260100379891,
"grad_norm": 0.3027855455875397,
"learning_rate": 4.75425187442482e-06,
"loss": 2.8394,
"step": 11820
},
{
"epoch": 0.8755049034559891,
"grad_norm": 0.3020201027393341,
"learning_rate": 4.644991936579268e-06,
"loss": 2.8397,
"step": 11840
},
{
"epoch": 0.8769837968739891,
"grad_norm": 0.2942678928375244,
"learning_rate": 4.536940933491552e-06,
"loss": 2.8506,
"step": 11860
},
{
"epoch": 0.878462690291989,
"grad_norm": 0.30446386337280273,
"learning_rate": 4.43010174525404e-06,
"loss": 2.8323,
"step": 11880
},
{
"epoch": 0.879941583709989,
"grad_norm": 0.2892758250236511,
"learning_rate": 4.324477219658274e-06,
"loss": 2.8268,
"step": 11900
},
{
"epoch": 0.881420477127989,
"grad_norm": 0.29356256127357483,
"learning_rate": 4.220070172119045e-06,
"loss": 2.8561,
"step": 11920
},
{
"epoch": 0.882899370545989,
"grad_norm": 0.2972046136856079,
"learning_rate": 4.116883385599335e-06,
"loss": 2.8459,
"step": 11940
},
{
"epoch": 0.884378263963989,
"grad_norm": 0.30883651971817017,
"learning_rate": 4.01491961053615e-06,
"loss": 2.8526,
"step": 11960
},
{
"epoch": 0.8858571573819889,
"grad_norm": 0.30948570370674133,
"learning_rate": 3.914181564767216e-06,
"loss": 2.8335,
"step": 11980
},
{
"epoch": 0.8873360507999889,
"grad_norm": 0.2896897494792938,
"learning_rate": 3.8146719334585246e-06,
"loss": 2.8353,
"step": 12000
},
{
"epoch": 0.8888149442179889,
"grad_norm": 0.29304638504981995,
"learning_rate": 3.7163933690327447e-06,
"loss": 2.8352,
"step": 12020
},
{
"epoch": 0.8902938376359889,
"grad_norm": 0.29079097509384155,
"learning_rate": 3.619348491098562e-06,
"loss": 2.8256,
"step": 12040
},
{
"epoch": 0.8917727310539888,
"grad_norm": 0.3122529089450836,
"learning_rate": 3.5235398863808055e-06,
"loss": 2.8211,
"step": 12060
},
{
"epoch": 0.8932516244719888,
"grad_norm": 0.2927321493625641,
"learning_rate": 3.4289701086515357e-06,
"loss": 2.8338,
"step": 12080
},
{
"epoch": 0.8947305178899888,
"grad_norm": 0.2869907319545746,
"learning_rate": 3.3356416786619716e-06,
"loss": 2.8313,
"step": 12100
},
{
"epoch": 0.8962094113079888,
"grad_norm": 0.27835631370544434,
"learning_rate": 3.2435570840752605e-06,
"loss": 2.8346,
"step": 12120
},
{
"epoch": 0.8976883047259888,
"grad_norm": 0.2780158817768097,
"learning_rate": 3.152718779400221e-06,
"loss": 2.8315,
"step": 12140
},
{
"epoch": 0.8991671981439887,
"grad_norm": 0.2955233156681061,
"learning_rate": 3.0631291859259114e-06,
"loss": 2.8241,
"step": 12160
},
{
"epoch": 0.9006460915619887,
"grad_norm": 0.29205450415611267,
"learning_rate": 2.9747906916570258e-06,
"loss": 2.8308,
"step": 12180
},
{
"epoch": 0.9021249849799887,
"grad_norm": 0.289033979177475,
"learning_rate": 2.8877056512503386e-06,
"loss": 2.8469,
"step": 12200
},
{
"epoch": 0.9036038783979887,
"grad_norm": 0.29402533173561096,
"learning_rate": 2.8018763859518736e-06,
"loss": 2.82,
"step": 12220
},
{
"epoch": 0.9050827718159887,
"grad_norm": 0.30112123489379883,
"learning_rate": 2.7173051835350517e-06,
"loss": 2.8269,
"step": 12240
},
{
"epoch": 0.9065616652339886,
"grad_norm": 0.2986692488193512,
"learning_rate": 2.6339942982397116e-06,
"loss": 2.8269,
"step": 12260
},
{
"epoch": 0.9080405586519886,
"grad_norm": 0.3106101453304291,
"learning_rate": 2.5519459507120313e-06,
"loss": 2.8415,
"step": 12280
},
{
"epoch": 0.9095194520699886,
"grad_norm": 0.2930283844470978,
"learning_rate": 2.471162327945303e-06,
"loss": 2.8353,
"step": 12300
},
{
"epoch": 0.9109983454879886,
"grad_norm": 0.28059104084968567,
"learning_rate": 2.3916455832216964e-06,
"loss": 2.8318,
"step": 12320
},
{
"epoch": 0.9124772389059886,
"grad_norm": 0.2927623987197876,
"learning_rate": 2.313397836054815e-06,
"loss": 2.841,
"step": 12340
},
{
"epoch": 0.9139561323239885,
"grad_norm": 0.28432729840278625,
"learning_rate": 2.2364211721331964e-06,
"loss": 2.8294,
"step": 12360
},
{
"epoch": 0.9154350257419885,
"grad_norm": 0.2854309678077698,
"learning_rate": 2.1607176432647703e-06,
"loss": 2.8389,
"step": 12380
},
{
"epoch": 0.9169139191599885,
"grad_norm": 0.2870195209980011,
"learning_rate": 2.0862892673221224e-06,
"loss": 2.8355,
"step": 12400
},
{
"epoch": 0.9183928125779885,
"grad_norm": 0.27523091435432434,
"learning_rate": 2.01313802818871e-06,
"loss": 2.8379,
"step": 12420
},
{
"epoch": 0.9198717059959886,
"grad_norm": 0.2815629839897156,
"learning_rate": 1.9412658757060053e-06,
"loss": 2.8279,
"step": 12440
},
{
"epoch": 0.9213505994139884,
"grad_norm": 0.28886112570762634,
"learning_rate": 1.870674725621513e-06,
"loss": 2.8242,
"step": 12460
},
{
"epoch": 0.9228294928319885,
"grad_norm": 0.2753719985485077,
"learning_rate": 1.80136645953769e-06,
"loss": 2.8234,
"step": 12480
},
{
"epoch": 0.9243083862499885,
"grad_norm": 0.2705097496509552,
"learning_rate": 1.7333429248618194e-06,
"loss": 2.8209,
"step": 12500
},
{
"epoch": 0.9257872796679885,
"grad_norm": 0.284212589263916,
"learning_rate": 1.6666059347567485e-06,
"loss": 2.838,
"step": 12520
},
{
"epoch": 0.9272661730859884,
"grad_norm": 0.28033483028411865,
"learning_rate": 1.6011572680925458e-06,
"loss": 2.827,
"step": 12540
},
{
"epoch": 0.9287450665039884,
"grad_norm": 0.27618134021759033,
"learning_rate": 1.5369986693991255e-06,
"loss": 2.8415,
"step": 12560
},
{
"epoch": 0.9302239599219884,
"grad_norm": 0.28289562463760376,
"learning_rate": 1.474131848819721e-06,
"loss": 2.834,
"step": 12580
},
{
"epoch": 0.9317028533399884,
"grad_norm": 0.2737962305545807,
"learning_rate": 1.4125584820652959e-06,
"loss": 2.8228,
"step": 12600
},
{
"epoch": 0.9331817467579884,
"grad_norm": 0.27976194024086,
"learning_rate": 1.352280210369894e-06,
"loss": 2.8387,
"step": 12620
},
{
"epoch": 0.9346606401759883,
"grad_norm": 0.27253544330596924,
"learning_rate": 1.2932986404468883e-06,
"loss": 2.8417,
"step": 12640
},
{
"epoch": 0.9361395335939883,
"grad_norm": 0.2787373661994934,
"learning_rate": 1.2356153444461393e-06,
"loss": 2.8295,
"step": 12660
},
{
"epoch": 0.9376184270119883,
"grad_norm": 0.27786681056022644,
"learning_rate": 1.1792318599121165e-06,
"loss": 2.8238,
"step": 12680
},
{
"epoch": 0.9390973204299883,
"grad_norm": 0.2707980275154114,
"learning_rate": 1.1241496897428872e-06,
"loss": 2.8216,
"step": 12700
},
{
"epoch": 0.9405762138479883,
"grad_norm": 0.2854357063770294,
"learning_rate": 1.0703703021500811e-06,
"loss": 2.8108,
"step": 12720
},
{
"epoch": 0.9420551072659882,
"grad_norm": 0.2822173833847046,
"learning_rate": 1.0178951306197337e-06,
"loss": 2.8093,
"step": 12740
},
{
"epoch": 0.9435340006839882,
"grad_norm": 0.29024040699005127,
"learning_rate": 9.667255738740943e-07,
"loss": 2.8258,
"step": 12760
},
{
"epoch": 0.9450128941019882,
"grad_norm": 0.2967122793197632,
"learning_rate": 9.168629958343334e-07,
"loss": 2.842,
"step": 12780
},
{
"epoch": 0.9464917875199882,
"grad_norm": 0.2722231149673462,
"learning_rate": 8.683087255841881e-07,
"loss": 2.8341,
"step": 12800
},
{
"epoch": 0.9479706809379882,
"grad_norm": 0.2952738106250763,
"learning_rate": 8.210640573345474e-07,
"loss": 2.8212,
"step": 12820
},
{
"epoch": 0.9494495743559881,
"grad_norm": 0.27017560601234436,
"learning_rate": 7.751302503889224e-07,
"loss": 2.8123,
"step": 12840
},
{
"epoch": 0.9509284677739881,
"grad_norm": 0.2811236083507538,
"learning_rate": 7.305085291099301e-07,
"loss": 2.8426,
"step": 12860
},
{
"epoch": 0.9524073611919881,
"grad_norm": 0.282913476228714,
"learning_rate": 6.872000828866131e-07,
"loss": 2.8348,
"step": 12880
},
{
"epoch": 0.9538862546099881,
"grad_norm": 0.2759126126766205,
"learning_rate": 6.452060661027548e-07,
"loss": 2.8301,
"step": 12900
},
{
"epoch": 0.9553651480279881,
"grad_norm": 0.2853533923625946,
"learning_rate": 6.045275981061138e-07,
"loss": 2.8415,
"step": 12920
},
{
"epoch": 0.956844041445988,
"grad_norm": 0.2731573283672333,
"learning_rate": 5.651657631785878e-07,
"loss": 2.826,
"step": 12940
},
{
"epoch": 0.958322934863988,
"grad_norm": 0.2759709060192108,
"learning_rate": 5.271216105072863e-07,
"loss": 2.8261,
"step": 12960
},
{
"epoch": 0.959801828281988,
"grad_norm": 0.2832717001438141,
"learning_rate": 4.903961541565971e-07,
"loss": 2.8332,
"step": 12980
},
{
"epoch": 0.961280721699988,
"grad_norm": 0.269037127494812,
"learning_rate": 4.5499037304115866e-07,
"loss": 2.8229,
"step": 13000
},
{
"epoch": 0.9627596151179879,
"grad_norm": 0.271410197019577,
"learning_rate": 4.2090521089972466e-07,
"loss": 2.8401,
"step": 13020
},
{
"epoch": 0.9642385085359879,
"grad_norm": 0.26483696699142456,
"learning_rate": 3.8814157627005685e-07,
"loss": 2.8376,
"step": 13040
},
{
"epoch": 0.9657174019539879,
"grad_norm": 0.2761934697628021,
"learning_rate": 3.567003424646831e-07,
"loss": 2.8374,
"step": 13060
},
{
"epoch": 0.9671962953719879,
"grad_norm": 0.27471932768821716,
"learning_rate": 3.265823475476215e-07,
"loss": 2.8358,
"step": 13080
},
{
"epoch": 0.9686751887899879,
"grad_norm": 0.27371978759765625,
"learning_rate": 2.97788394312043e-07,
"loss": 2.8289,
"step": 13100
},
{
"epoch": 0.9701540822079878,
"grad_norm": 0.2889103889465332,
"learning_rate": 2.7031925025888247e-07,
"loss": 2.8145,
"step": 13120
},
{
"epoch": 0.9716329756259878,
"grad_norm": 0.2687681317329407,
"learning_rate": 2.441756475763668e-07,
"loss": 2.818,
"step": 13140
},
{
"epoch": 0.9731118690439878,
"grad_norm": 0.2686457931995392,
"learning_rate": 2.1935828312050766e-07,
"loss": 2.8344,
"step": 13160
},
{
"epoch": 0.9745907624619878,
"grad_norm": 0.26769590377807617,
"learning_rate": 1.9586781839652235e-07,
"loss": 2.8236,
"step": 13180
},
{
"epoch": 0.9760696558799878,
"grad_norm": 0.27022501826286316,
"learning_rate": 1.737048795412033e-07,
"loss": 2.8307,
"step": 13200
},
{
"epoch": 0.9775485492979877,
"grad_norm": 0.2741018533706665,
"learning_rate": 1.5287005730623138e-07,
"loss": 2.8312,
"step": 13220
},
{
"epoch": 0.9790274427159877,
"grad_norm": 0.27768802642822266,
"learning_rate": 1.333639070424164e-07,
"loss": 2.8281,
"step": 13240
},
{
"epoch": 0.9805063361339877,
"grad_norm": 0.26736685633659363,
"learning_rate": 1.1518694868491442e-07,
"loss": 2.8342,
"step": 13260
},
{
"epoch": 0.9819852295519877,
"grad_norm": 0.26495057344436646,
"learning_rate": 9.833966673935546e-08,
"loss": 2.8236,
"step": 13280
},
{
"epoch": 0.9834641229699878,
"grad_norm": 0.27052661776542664,
"learning_rate": 8.282251026893728e-08,
"loss": 2.8214,
"step": 13300
},
{
"epoch": 0.9849430163879876,
"grad_norm": 0.2683194875717163,
"learning_rate": 6.863589288244043e-08,
"loss": 2.8468,
"step": 13320
},
{
"epoch": 0.9864219098059877,
"grad_norm": 0.27812352776527405,
"learning_rate": 5.5780192723214884e-08,
"loss": 2.8254,
"step": 13340
},
{
"epoch": 0.9879008032239877,
"grad_norm": 0.2842520773410797,
"learning_rate": 4.425575245911029e-08,
"loss": 2.8273,
"step": 13360
},
{
"epoch": 0.9893796966419877,
"grad_norm": 0.2864263355731964,
"learning_rate": 3.406287927332219e-08,
"loss": 2.8311,
"step": 13380
},
{
"epoch": 0.9908585900599877,
"grad_norm": 0.26490774750709534,
"learning_rate": 2.520184485620969e-08,
"loss": 2.8298,
"step": 13400
},
{
"epoch": 0.9923374834779876,
"grad_norm": 0.2666003406047821,
"learning_rate": 1.7672885398067883e-08,
"loss": 2.8303,
"step": 13420
},
{
"epoch": 0.9938163768959876,
"grad_norm": 0.27174392342567444,
"learning_rate": 1.147620158281626e-08,
"loss": 2.8177,
"step": 13440
},
{
"epoch": 0.9952952703139876,
"grad_norm": 0.2677934467792511,
"learning_rate": 6.6119585826529554e-09,
"loss": 2.8123,
"step": 13460
},
{
"epoch": 0.9967741637319876,
"grad_norm": 0.2655700445175171,
"learning_rate": 3.0802860536582876e-09,
"loss": 2.8268,
"step": 13480
},
{
"epoch": 0.9982530571499876,
"grad_norm": 0.2759760022163391,
"learning_rate": 8.812781323253027e-10,
"loss": 2.8247,
"step": 13500
},
{
"epoch": 0.9997319505679875,
"grad_norm": 0.2634597718715668,
"learning_rate": 1.4993433072874042e-11,
"loss": 2.831,
"step": 13520
}
],
"logging_steps": 20,
"max_steps": 13523,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.070897645108016e+19,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}