{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.48482497818287595,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019392999127315039,
"grad_norm": 0.5741730332374573,
"learning_rate": 0.00019987071333915123,
"loss": 1.7671,
"step": 20
},
{
"epoch": 0.0038785998254630078,
"grad_norm": 0.3640196621417999,
"learning_rate": 0.00019974142667830247,
"loss": 1.2552,
"step": 40
},
{
"epoch": 0.005817899738194512,
"grad_norm": 0.33685317635536194,
"learning_rate": 0.0001996121400174537,
"loss": 1.1058,
"step": 60
},
{
"epoch": 0.0077571996509260156,
"grad_norm": 0.5886279940605164,
"learning_rate": 0.00019948285335660496,
"loss": 1.1087,
"step": 80
},
{
"epoch": 0.009696499563657519,
"grad_norm": 0.5692952871322632,
"learning_rate": 0.00019935356669575617,
"loss": 1.1196,
"step": 100
},
{
"epoch": 0.011635799476389024,
"grad_norm": 0.49794259667396545,
"learning_rate": 0.00019922428003490741,
"loss": 1.1866,
"step": 120
},
{
"epoch": 0.013575099389120528,
"grad_norm": 0.4546428322792053,
"learning_rate": 0.00019909499337405863,
"loss": 1.1637,
"step": 140
},
{
"epoch": 0.015514399301852031,
"grad_norm": 0.4478498697280884,
"learning_rate": 0.00019896570671320987,
"loss": 1.1465,
"step": 160
},
{
"epoch": 0.017453699214583535,
"grad_norm": 0.5994399785995483,
"learning_rate": 0.00019883642005236112,
"loss": 1.1346,
"step": 180
},
{
"epoch": 0.019392999127315038,
"grad_norm": 0.44898244738578796,
"learning_rate": 0.00019870713339151233,
"loss": 1.1577,
"step": 200
},
{
"epoch": 0.02133229904004654,
"grad_norm": 0.4165457785129547,
"learning_rate": 0.00019857784673066358,
"loss": 1.1123,
"step": 220
},
{
"epoch": 0.02327159895277805,
"grad_norm": 0.4125240743160248,
"learning_rate": 0.0001984485600698148,
"loss": 1.1449,
"step": 240
},
{
"epoch": 0.025210898865509552,
"grad_norm": 0.49324268102645874,
"learning_rate": 0.00019831927340896604,
"loss": 1.0512,
"step": 260
},
{
"epoch": 0.027150198778241055,
"grad_norm": 0.5381457209587097,
"learning_rate": 0.00019818998674811725,
"loss": 1.1418,
"step": 280
},
{
"epoch": 0.02908949869097256,
"grad_norm": 0.47153565287590027,
"learning_rate": 0.00019806070008726852,
"loss": 1.0823,
"step": 300
},
{
"epoch": 0.031028798603704062,
"grad_norm": 0.5824480056762695,
"learning_rate": 0.00019793141342641974,
"loss": 1.178,
"step": 320
},
{
"epoch": 0.03296809851643557,
"grad_norm": 0.4500732719898224,
"learning_rate": 0.00019780212676557098,
"loss": 1.1021,
"step": 340
},
{
"epoch": 0.03490739842916707,
"grad_norm": 0.5821068286895752,
"learning_rate": 0.0001976728401047222,
"loss": 1.0628,
"step": 360
},
{
"epoch": 0.036846698341898576,
"grad_norm": 0.39794814586639404,
"learning_rate": 0.00019754355344387344,
"loss": 1.0391,
"step": 380
},
{
"epoch": 0.038785998254630076,
"grad_norm": 0.38640904426574707,
"learning_rate": 0.00019741426678302469,
"loss": 1.0417,
"step": 400
},
{
"epoch": 0.04072529816736158,
"grad_norm": 0.3635335862636566,
"learning_rate": 0.0001972849801221759,
"loss": 0.9385,
"step": 420
},
{
"epoch": 0.04266459808009308,
"grad_norm": 0.40371426939964294,
"learning_rate": 0.00019715569346132714,
"loss": 1.0481,
"step": 440
},
{
"epoch": 0.04460389799282459,
"grad_norm": 0.37828192114830017,
"learning_rate": 0.00019702640680047836,
"loss": 1.0186,
"step": 460
},
{
"epoch": 0.0465431979055561,
"grad_norm": 0.4067387282848358,
"learning_rate": 0.0001968971201396296,
"loss": 1.0012,
"step": 480
},
{
"epoch": 0.0484824978182876,
"grad_norm": 0.42465779185295105,
"learning_rate": 0.00019676783347878082,
"loss": 1.11,
"step": 500
},
{
"epoch": 0.050421797731019104,
"grad_norm": 0.4794886112213135,
"learning_rate": 0.0001966385468179321,
"loss": 1.0221,
"step": 520
},
{
"epoch": 0.052361097643750604,
"grad_norm": 0.40525391697883606,
"learning_rate": 0.0001965092601570833,
"loss": 0.993,
"step": 540
},
{
"epoch": 0.05430039755648211,
"grad_norm": 0.47314971685409546,
"learning_rate": 0.00019637997349623455,
"loss": 1.0939,
"step": 560
},
{
"epoch": 0.05623969746921361,
"grad_norm": 0.30406779050827026,
"learning_rate": 0.00019625068683538577,
"loss": 1.087,
"step": 580
},
{
"epoch": 0.05817899738194512,
"grad_norm": 0.4694748818874359,
"learning_rate": 0.00019612140017453698,
"loss": 1.1412,
"step": 600
},
{
"epoch": 0.060118297294676624,
"grad_norm": 0.4146924316883087,
"learning_rate": 0.00019599211351368823,
"loss": 1.0378,
"step": 620
},
{
"epoch": 0.062057597207408124,
"grad_norm": 0.40017783641815186,
"learning_rate": 0.00019586282685283947,
"loss": 1.0924,
"step": 640
},
{
"epoch": 0.06399689712013963,
"grad_norm": 0.6136152744293213,
"learning_rate": 0.0001957335401919907,
"loss": 1.0273,
"step": 660
},
{
"epoch": 0.06593619703287114,
"grad_norm": 0.46415477991104126,
"learning_rate": 0.00019560425353114193,
"loss": 1.0324,
"step": 680
},
{
"epoch": 0.06787549694560263,
"grad_norm": 0.579818606376648,
"learning_rate": 0.00019547496687029317,
"loss": 1.0475,
"step": 700
},
{
"epoch": 0.06981479685833414,
"grad_norm": 0.35234829783439636,
"learning_rate": 0.0001953456802094444,
"loss": 0.9991,
"step": 720
},
{
"epoch": 0.07175409677106565,
"grad_norm": 0.4519262909889221,
"learning_rate": 0.00019521639354859563,
"loss": 1.0038,
"step": 740
},
{
"epoch": 0.07369339668379715,
"grad_norm": 0.47795727849006653,
"learning_rate": 0.00019508710688774687,
"loss": 1.0077,
"step": 760
},
{
"epoch": 0.07563269659652866,
"grad_norm": 1.1003836393356323,
"learning_rate": 0.0001949578202268981,
"loss": 0.9657,
"step": 780
},
{
"epoch": 0.07757199650926015,
"grad_norm": 0.38754114508628845,
"learning_rate": 0.00019482853356604933,
"loss": 1.1136,
"step": 800
},
{
"epoch": 0.07951129642199166,
"grad_norm": 0.42165011167526245,
"learning_rate": 0.00019469924690520055,
"loss": 1.0678,
"step": 820
},
{
"epoch": 0.08145059633472317,
"grad_norm": 0.28059273958206177,
"learning_rate": 0.0001945699602443518,
"loss": 1.1409,
"step": 840
},
{
"epoch": 0.08338989624745467,
"grad_norm": 0.4145539700984955,
"learning_rate": 0.00019444067358350304,
"loss": 1.1071,
"step": 860
},
{
"epoch": 0.08532919616018617,
"grad_norm": 0.46548235416412354,
"learning_rate": 0.00019431138692265428,
"loss": 0.9627,
"step": 880
},
{
"epoch": 0.08726849607291767,
"grad_norm": 0.41764864325523376,
"learning_rate": 0.0001941821002618055,
"loss": 0.9689,
"step": 900
},
{
"epoch": 0.08920779598564918,
"grad_norm": 0.5186419486999512,
"learning_rate": 0.00019405281360095674,
"loss": 1.079,
"step": 920
},
{
"epoch": 0.09114709589838069,
"grad_norm": 0.4437251091003418,
"learning_rate": 0.00019392352694010796,
"loss": 1.0988,
"step": 940
},
{
"epoch": 0.0930863958111122,
"grad_norm": 0.37555068731307983,
"learning_rate": 0.0001937942402792592,
"loss": 1.0355,
"step": 960
},
{
"epoch": 0.09502569572384369,
"grad_norm": 0.305160790681839,
"learning_rate": 0.00019366495361841044,
"loss": 0.9559,
"step": 980
},
{
"epoch": 0.0969649956365752,
"grad_norm": 0.36364415287971497,
"learning_rate": 0.00019353566695756166,
"loss": 1.0374,
"step": 1000
},
{
"epoch": 0.0989042955493067,
"grad_norm": 0.3676876127719879,
"learning_rate": 0.0001934063802967129,
"loss": 1.1034,
"step": 1020
},
{
"epoch": 0.10084359546203821,
"grad_norm": 0.41380801796913147,
"learning_rate": 0.00019327709363586412,
"loss": 0.9731,
"step": 1040
},
{
"epoch": 0.10278289537476971,
"grad_norm": 0.3573336899280548,
"learning_rate": 0.00019314780697501536,
"loss": 1.0479,
"step": 1060
},
{
"epoch": 0.10472219528750121,
"grad_norm": 0.45588839054107666,
"learning_rate": 0.00019301852031416658,
"loss": 1.0819,
"step": 1080
},
{
"epoch": 0.10666149520023271,
"grad_norm": 0.37001991271972656,
"learning_rate": 0.00019288923365331785,
"loss": 1.0197,
"step": 1100
},
{
"epoch": 0.10860079511296422,
"grad_norm": 0.41651812195777893,
"learning_rate": 0.00019275994699246906,
"loss": 1.0142,
"step": 1120
},
{
"epoch": 0.11054009502569573,
"grad_norm": 0.4468708038330078,
"learning_rate": 0.0001926306603316203,
"loss": 1.1007,
"step": 1140
},
{
"epoch": 0.11247939493842722,
"grad_norm": 0.3199480473995209,
"learning_rate": 0.00019250137367077152,
"loss": 1.004,
"step": 1160
},
{
"epoch": 0.11441869485115873,
"grad_norm": 0.4078716039657593,
"learning_rate": 0.00019237208700992274,
"loss": 1.0726,
"step": 1180
},
{
"epoch": 0.11635799476389024,
"grad_norm": 0.42165908217430115,
"learning_rate": 0.000192242800349074,
"loss": 1.0257,
"step": 1200
},
{
"epoch": 0.11829729467662174,
"grad_norm": 0.3128000795841217,
"learning_rate": 0.00019211351368822523,
"loss": 1.0694,
"step": 1220
},
{
"epoch": 0.12023659458935325,
"grad_norm": 0.3406975567340851,
"learning_rate": 0.00019198422702737647,
"loss": 1.0577,
"step": 1240
},
{
"epoch": 0.12217589450208474,
"grad_norm": 0.3885703384876251,
"learning_rate": 0.00019185494036652768,
"loss": 1.0074,
"step": 1260
},
{
"epoch": 0.12411519441481625,
"grad_norm": 0.37669333815574646,
"learning_rate": 0.00019172565370567893,
"loss": 1.012,
"step": 1280
},
{
"epoch": 0.12605449432754776,
"grad_norm": 0.4600653350353241,
"learning_rate": 0.00019159636704483014,
"loss": 1.0039,
"step": 1300
},
{
"epoch": 0.12799379424027926,
"grad_norm": 0.3202829957008362,
"learning_rate": 0.0001914670803839814,
"loss": 1.0403,
"step": 1320
},
{
"epoch": 0.12993309415301077,
"grad_norm": 0.441177636384964,
"learning_rate": 0.00019133779372313263,
"loss": 1.0413,
"step": 1340
},
{
"epoch": 0.13187239406574228,
"grad_norm": 0.39573097229003906,
"learning_rate": 0.00019120850706228385,
"loss": 1.0821,
"step": 1360
},
{
"epoch": 0.13381169397847378,
"grad_norm": 0.45021307468414307,
"learning_rate": 0.0001910792204014351,
"loss": 1.0128,
"step": 1380
},
{
"epoch": 0.13575099389120526,
"grad_norm": 0.44374045729637146,
"learning_rate": 0.0001909499337405863,
"loss": 1.0713,
"step": 1400
},
{
"epoch": 0.13769029380393677,
"grad_norm": 0.33873313665390015,
"learning_rate": 0.00019082064707973755,
"loss": 1.0951,
"step": 1420
},
{
"epoch": 0.13962959371666828,
"grad_norm": 0.36703407764434814,
"learning_rate": 0.0001906913604188888,
"loss": 1.0662,
"step": 1440
},
{
"epoch": 0.14156889362939978,
"grad_norm": 0.31029993295669556,
"learning_rate": 0.00019056207375804004,
"loss": 1.0673,
"step": 1460
},
{
"epoch": 0.1435081935421313,
"grad_norm": 0.35601869225502014,
"learning_rate": 0.00019043278709719125,
"loss": 1.0696,
"step": 1480
},
{
"epoch": 0.1454474934548628,
"grad_norm": 0.39259615540504456,
"learning_rate": 0.0001903035004363425,
"loss": 1.0173,
"step": 1500
},
{
"epoch": 0.1473867933675943,
"grad_norm": 0.39904189109802246,
"learning_rate": 0.0001901742137754937,
"loss": 1.1188,
"step": 1520
},
{
"epoch": 0.1493260932803258,
"grad_norm": 0.395321786403656,
"learning_rate": 0.00019004492711464495,
"loss": 1.0039,
"step": 1540
},
{
"epoch": 0.15126539319305732,
"grad_norm": 0.37806248664855957,
"learning_rate": 0.0001899156404537962,
"loss": 1.1001,
"step": 1560
},
{
"epoch": 0.1532046931057888,
"grad_norm": 0.3514845669269562,
"learning_rate": 0.00018978635379294741,
"loss": 1.0592,
"step": 1580
},
{
"epoch": 0.1551439930185203,
"grad_norm": 0.3304252624511719,
"learning_rate": 0.00018965706713209866,
"loss": 1.0237,
"step": 1600
},
{
"epoch": 0.1570832929312518,
"grad_norm": 0.31678342819213867,
"learning_rate": 0.00018952778047124987,
"loss": 0.962,
"step": 1620
},
{
"epoch": 0.15902259284398332,
"grad_norm": 0.34594252705574036,
"learning_rate": 0.00018939849381040112,
"loss": 0.9677,
"step": 1640
},
{
"epoch": 0.16096189275671482,
"grad_norm": 0.35923656821250916,
"learning_rate": 0.00018926920714955236,
"loss": 1.0206,
"step": 1660
},
{
"epoch": 0.16290119266944633,
"grad_norm": 0.4773354232311249,
"learning_rate": 0.0001891399204887036,
"loss": 1.0682,
"step": 1680
},
{
"epoch": 0.16484049258217784,
"grad_norm": 0.3007306456565857,
"learning_rate": 0.00018901063382785482,
"loss": 0.9395,
"step": 1700
},
{
"epoch": 0.16677979249490935,
"grad_norm": 0.40934816002845764,
"learning_rate": 0.00018888134716700606,
"loss": 1.08,
"step": 1720
},
{
"epoch": 0.16871909240764085,
"grad_norm": 0.35403525829315186,
"learning_rate": 0.00018875206050615728,
"loss": 0.9955,
"step": 1740
},
{
"epoch": 0.17065839232037233,
"grad_norm": 0.38091763854026794,
"learning_rate": 0.00018862277384530852,
"loss": 1.0324,
"step": 1760
},
{
"epoch": 0.17259769223310384,
"grad_norm": 0.37331679463386536,
"learning_rate": 0.00018849348718445977,
"loss": 1.0621,
"step": 1780
},
{
"epoch": 0.17453699214583535,
"grad_norm": 0.23007287085056305,
"learning_rate": 0.00018836420052361098,
"loss": 1.1031,
"step": 1800
},
{
"epoch": 0.17647629205856685,
"grad_norm": 0.32545435428619385,
"learning_rate": 0.00018823491386276223,
"loss": 0.9931,
"step": 1820
},
{
"epoch": 0.17841559197129836,
"grad_norm": 0.44806909561157227,
"learning_rate": 0.00018810562720191344,
"loss": 1.0262,
"step": 1840
},
{
"epoch": 0.18035489188402987,
"grad_norm": 0.37432861328125,
"learning_rate": 0.00018797634054106468,
"loss": 0.9484,
"step": 1860
},
{
"epoch": 0.18229419179676137,
"grad_norm": 0.32971423864364624,
"learning_rate": 0.00018784705388021593,
"loss": 1.0482,
"step": 1880
},
{
"epoch": 0.18423349170949288,
"grad_norm": 0.2877088189125061,
"learning_rate": 0.00018771776721936717,
"loss": 1.1487,
"step": 1900
},
{
"epoch": 0.1861727916222244,
"grad_norm": 0.45374777913093567,
"learning_rate": 0.0001875884805585184,
"loss": 1.0074,
"step": 1920
},
{
"epoch": 0.1881120915349559,
"grad_norm": 0.38346460461616516,
"learning_rate": 0.0001874591938976696,
"loss": 1.0462,
"step": 1940
},
{
"epoch": 0.19005139144768737,
"grad_norm": 0.3672585189342499,
"learning_rate": 0.00018732990723682085,
"loss": 1.0646,
"step": 1960
},
{
"epoch": 0.19199069136041888,
"grad_norm": 0.33648261427879333,
"learning_rate": 0.00018720062057597206,
"loss": 0.9792,
"step": 1980
},
{
"epoch": 0.1939299912731504,
"grad_norm": 0.37683922052383423,
"learning_rate": 0.00018707133391512333,
"loss": 1.015,
"step": 2000
},
{
"epoch": 0.1958692911858819,
"grad_norm": 0.4112173616886139,
"learning_rate": 0.00018694204725427455,
"loss": 0.9402,
"step": 2020
},
{
"epoch": 0.1978085910986134,
"grad_norm": 0.33208009600639343,
"learning_rate": 0.0001868127605934258,
"loss": 1.0754,
"step": 2040
},
{
"epoch": 0.1997478910113449,
"grad_norm": 0.3744449317455292,
"learning_rate": 0.000186683473932577,
"loss": 0.9324,
"step": 2060
},
{
"epoch": 0.20168719092407641,
"grad_norm": 0.30895674228668213,
"learning_rate": 0.00018655418727172825,
"loss": 1.0611,
"step": 2080
},
{
"epoch": 0.20362649083680792,
"grad_norm": 0.3781767189502716,
"learning_rate": 0.00018642490061087947,
"loss": 1.0332,
"step": 2100
},
{
"epoch": 0.20556579074953943,
"grad_norm": 0.40028223395347595,
"learning_rate": 0.0001862956139500307,
"loss": 0.9508,
"step": 2120
},
{
"epoch": 0.2075050906622709,
"grad_norm": 0.36377599835395813,
"learning_rate": 0.00018616632728918195,
"loss": 0.9085,
"step": 2140
},
{
"epoch": 0.20944439057500241,
"grad_norm": 0.28180891275405884,
"learning_rate": 0.00018603704062833317,
"loss": 1.0315,
"step": 2160
},
{
"epoch": 0.21138369048773392,
"grad_norm": 0.41635292768478394,
"learning_rate": 0.00018590775396748441,
"loss": 1.0278,
"step": 2180
},
{
"epoch": 0.21332299040046543,
"grad_norm": 0.45736029744148254,
"learning_rate": 0.00018577846730663563,
"loss": 0.9909,
"step": 2200
},
{
"epoch": 0.21526229031319694,
"grad_norm": 0.4075273871421814,
"learning_rate": 0.00018564918064578687,
"loss": 0.9759,
"step": 2220
},
{
"epoch": 0.21720159022592844,
"grad_norm": 0.3832456171512604,
"learning_rate": 0.00018551989398493812,
"loss": 1.0131,
"step": 2240
},
{
"epoch": 0.21914089013865995,
"grad_norm": 0.39031124114990234,
"learning_rate": 0.00018539060732408936,
"loss": 1.0805,
"step": 2260
},
{
"epoch": 0.22108019005139146,
"grad_norm": 0.27879253029823303,
"learning_rate": 0.00018526132066324058,
"loss": 1.0528,
"step": 2280
},
{
"epoch": 0.22301948996412296,
"grad_norm": 0.35129788517951965,
"learning_rate": 0.00018513203400239182,
"loss": 1.0757,
"step": 2300
},
{
"epoch": 0.22495878987685444,
"grad_norm": 0.3349596858024597,
"learning_rate": 0.00018500274734154304,
"loss": 1.0534,
"step": 2320
},
{
"epoch": 0.22689808978958595,
"grad_norm": 0.38909098505973816,
"learning_rate": 0.00018487346068069428,
"loss": 1.0352,
"step": 2340
},
{
"epoch": 0.22883738970231746,
"grad_norm": 0.36970818042755127,
"learning_rate": 0.00018474417401984552,
"loss": 1.0365,
"step": 2360
},
{
"epoch": 0.23077668961504896,
"grad_norm": 0.3963492810726166,
"learning_rate": 0.00018461488735899674,
"loss": 0.9925,
"step": 2380
},
{
"epoch": 0.23271598952778047,
"grad_norm": 0.36081552505493164,
"learning_rate": 0.00018448560069814798,
"loss": 1.1273,
"step": 2400
},
{
"epoch": 0.23465528944051198,
"grad_norm": 0.34602901339530945,
"learning_rate": 0.0001843563140372992,
"loss": 0.9404,
"step": 2420
},
{
"epoch": 0.23659458935324348,
"grad_norm": 0.4692002832889557,
"learning_rate": 0.00018422702737645044,
"loss": 1.0589,
"step": 2440
},
{
"epoch": 0.238533889265975,
"grad_norm": 0.2668192386627197,
"learning_rate": 0.00018409774071560168,
"loss": 1.0315,
"step": 2460
},
{
"epoch": 0.2404731891787065,
"grad_norm": 0.32327035069465637,
"learning_rate": 0.00018396845405475293,
"loss": 1.0387,
"step": 2480
},
{
"epoch": 0.24241248909143798,
"grad_norm": 0.3037966787815094,
"learning_rate": 0.00018383916739390414,
"loss": 1.0396,
"step": 2500
},
{
"epoch": 0.24435178900416948,
"grad_norm": 0.4088995158672333,
"learning_rate": 0.00018370988073305536,
"loss": 0.9526,
"step": 2520
},
{
"epoch": 0.246291088916901,
"grad_norm": 0.3864790201187134,
"learning_rate": 0.0001835805940722066,
"loss": 1.1621,
"step": 2540
},
{
"epoch": 0.2482303888296325,
"grad_norm": 0.3342384099960327,
"learning_rate": 0.00018345130741135782,
"loss": 1.1299,
"step": 2560
},
{
"epoch": 0.250169688742364,
"grad_norm": 0.3113463222980499,
"learning_rate": 0.0001833220207505091,
"loss": 1.0323,
"step": 2580
},
{
"epoch": 0.2521089886550955,
"grad_norm": 0.33959150314331055,
"learning_rate": 0.0001831927340896603,
"loss": 1.0998,
"step": 2600
},
{
"epoch": 0.254048288567827,
"grad_norm": 0.35392144322395325,
"learning_rate": 0.00018306344742881155,
"loss": 1.1187,
"step": 2620
},
{
"epoch": 0.2559875884805585,
"grad_norm": 0.36454859375953674,
"learning_rate": 0.00018293416076796277,
"loss": 1.0935,
"step": 2640
},
{
"epoch": 0.25792688839329003,
"grad_norm": 0.5319137573242188,
"learning_rate": 0.000182804874107114,
"loss": 0.9902,
"step": 2660
},
{
"epoch": 0.25986618830602154,
"grad_norm": 0.3222362697124481,
"learning_rate": 0.00018267558744626525,
"loss": 1.0396,
"step": 2680
},
{
"epoch": 0.26180548821875305,
"grad_norm": 0.3684043288230896,
"learning_rate": 0.00018254630078541647,
"loss": 1.004,
"step": 2700
},
{
"epoch": 0.26374478813148455,
"grad_norm": 0.33372971415519714,
"learning_rate": 0.0001824170141245677,
"loss": 1.0479,
"step": 2720
},
{
"epoch": 0.26568408804421606,
"grad_norm": 0.28436896204948425,
"learning_rate": 0.00018228772746371893,
"loss": 0.9811,
"step": 2740
},
{
"epoch": 0.26762338795694757,
"grad_norm": 0.42128750681877136,
"learning_rate": 0.00018215844080287017,
"loss": 1.0022,
"step": 2760
},
{
"epoch": 0.269562687869679,
"grad_norm": 0.38527703285217285,
"learning_rate": 0.0001820291541420214,
"loss": 1.0407,
"step": 2780
},
{
"epoch": 0.2715019877824105,
"grad_norm": 0.39973485469818115,
"learning_rate": 0.00018189986748117266,
"loss": 1.0739,
"step": 2800
},
{
"epoch": 0.27344128769514203,
"grad_norm": 0.39641934633255005,
"learning_rate": 0.00018177058082032387,
"loss": 0.9724,
"step": 2820
},
{
"epoch": 0.27538058760787354,
"grad_norm": 0.3733800947666168,
"learning_rate": 0.00018164129415947512,
"loss": 0.9696,
"step": 2840
},
{
"epoch": 0.27731988752060505,
"grad_norm": 0.42063865065574646,
"learning_rate": 0.00018151200749862633,
"loss": 1.0355,
"step": 2860
},
{
"epoch": 0.27925918743333655,
"grad_norm": 0.3460356593132019,
"learning_rate": 0.00018138272083777758,
"loss": 0.9773,
"step": 2880
},
{
"epoch": 0.28119848734606806,
"grad_norm": 0.45516788959503174,
"learning_rate": 0.0001812534341769288,
"loss": 1.0263,
"step": 2900
},
{
"epoch": 0.28313778725879957,
"grad_norm": 0.3858850300312042,
"learning_rate": 0.00018112414751608004,
"loss": 1.0375,
"step": 2920
},
{
"epoch": 0.2850770871715311,
"grad_norm": 0.419709712266922,
"learning_rate": 0.00018099486085523128,
"loss": 0.9879,
"step": 2940
},
{
"epoch": 0.2870163870842626,
"grad_norm": 0.37489813566207886,
"learning_rate": 0.0001808655741943825,
"loss": 1.0098,
"step": 2960
},
{
"epoch": 0.2889556869969941,
"grad_norm": 0.36090582609176636,
"learning_rate": 0.00018073628753353374,
"loss": 1.0185,
"step": 2980
},
{
"epoch": 0.2908949869097256,
"grad_norm": 0.42885035276412964,
"learning_rate": 0.00018060700087268495,
"loss": 0.9338,
"step": 3000
},
{
"epoch": 0.2928342868224571,
"grad_norm": 0.3570854961872101,
"learning_rate": 0.00018047771421183622,
"loss": 1.0928,
"step": 3020
},
{
"epoch": 0.2947735867351886,
"grad_norm": 0.3873838484287262,
"learning_rate": 0.00018034842755098744,
"loss": 1.0241,
"step": 3040
},
{
"epoch": 0.2967128866479201,
"grad_norm": 0.5299991965293884,
"learning_rate": 0.00018021914089013868,
"loss": 0.994,
"step": 3060
},
{
"epoch": 0.2986521865606516,
"grad_norm": 0.40146002173423767,
"learning_rate": 0.0001800898542292899,
"loss": 1.035,
"step": 3080
},
{
"epoch": 0.30059148647338313,
"grad_norm": 0.4087996780872345,
"learning_rate": 0.00017996056756844112,
"loss": 1.0682,
"step": 3100
},
{
"epoch": 0.30253078638611464,
"grad_norm": 0.39104148745536804,
"learning_rate": 0.00017983128090759236,
"loss": 1.0486,
"step": 3120
},
{
"epoch": 0.30447008629884614,
"grad_norm": 0.36926761269569397,
"learning_rate": 0.0001797019942467436,
"loss": 1.0358,
"step": 3140
},
{
"epoch": 0.3064093862115776,
"grad_norm": 0.32747143507003784,
"learning_rate": 0.00017957270758589485,
"loss": 0.9951,
"step": 3160
},
{
"epoch": 0.3083486861243091,
"grad_norm": 0.42642009258270264,
"learning_rate": 0.00017944342092504606,
"loss": 1.0647,
"step": 3180
},
{
"epoch": 0.3102879860370406,
"grad_norm": 0.4502064883708954,
"learning_rate": 0.0001793141342641973,
"loss": 0.9939,
"step": 3200
},
{
"epoch": 0.3122272859497721,
"grad_norm": 0.5583937764167786,
"learning_rate": 0.00017918484760334852,
"loss": 0.9618,
"step": 3220
},
{
"epoch": 0.3141665858625036,
"grad_norm": 0.30554625391960144,
"learning_rate": 0.00017905556094249977,
"loss": 1.0496,
"step": 3240
},
{
"epoch": 0.31610588577523513,
"grad_norm": 0.4174688458442688,
"learning_rate": 0.000178926274281651,
"loss": 1.0396,
"step": 3260
},
{
"epoch": 0.31804518568796664,
"grad_norm": 0.410165011882782,
"learning_rate": 0.00017879698762080222,
"loss": 1.0758,
"step": 3280
},
{
"epoch": 0.31998448560069814,
"grad_norm": 0.34972333908081055,
"learning_rate": 0.00017866770095995347,
"loss": 1.034,
"step": 3300
},
{
"epoch": 0.32192378551342965,
"grad_norm": 0.4535263478755951,
"learning_rate": 0.00017853841429910468,
"loss": 0.977,
"step": 3320
},
{
"epoch": 0.32386308542616116,
"grad_norm": 0.42111313343048096,
"learning_rate": 0.00017840912763825593,
"loss": 1.0414,
"step": 3340
},
{
"epoch": 0.32580238533889266,
"grad_norm": 0.38072848320007324,
"learning_rate": 0.00017827984097740717,
"loss": 0.9297,
"step": 3360
},
{
"epoch": 0.32774168525162417,
"grad_norm": 0.4085475504398346,
"learning_rate": 0.00017815055431655841,
"loss": 0.997,
"step": 3380
},
{
"epoch": 0.3296809851643557,
"grad_norm": 0.37679746747016907,
"learning_rate": 0.00017802126765570963,
"loss": 1.0623,
"step": 3400
},
{
"epoch": 0.3316202850770872,
"grad_norm": 0.398362398147583,
"learning_rate": 0.00017789198099486087,
"loss": 1.0116,
"step": 3420
},
{
"epoch": 0.3335595849898187,
"grad_norm": 0.4207305610179901,
"learning_rate": 0.0001777626943340121,
"loss": 0.9085,
"step": 3440
},
{
"epoch": 0.3354988849025502,
"grad_norm": 0.48672834038734436,
"learning_rate": 0.00017763340767316333,
"loss": 0.9377,
"step": 3460
},
{
"epoch": 0.3374381848152817,
"grad_norm": 0.500481128692627,
"learning_rate": 0.00017750412101231458,
"loss": 0.9824,
"step": 3480
},
{
"epoch": 0.3393774847280132,
"grad_norm": 0.3926856517791748,
"learning_rate": 0.0001773748343514658,
"loss": 0.9774,
"step": 3500
},
{
"epoch": 0.34131678464074466,
"grad_norm": 0.5099716782569885,
"learning_rate": 0.00017724554769061704,
"loss": 1.0182,
"step": 3520
},
{
"epoch": 0.34325608455347617,
"grad_norm": 0.3890150785446167,
"learning_rate": 0.00017711626102976825,
"loss": 1.0333,
"step": 3540
},
{
"epoch": 0.3451953844662077,
"grad_norm": 0.39873331785202026,
"learning_rate": 0.0001769869743689195,
"loss": 0.9978,
"step": 3560
},
{
"epoch": 0.3471346843789392,
"grad_norm": 0.41631972789764404,
"learning_rate": 0.0001768576877080707,
"loss": 1.0392,
"step": 3580
},
{
"epoch": 0.3490739842916707,
"grad_norm": 0.37472009658813477,
"learning_rate": 0.00017672840104722198,
"loss": 1.0268,
"step": 3600
},
{
"epoch": 0.3510132842044022,
"grad_norm": 0.41661858558654785,
"learning_rate": 0.0001765991143863732,
"loss": 1.0529,
"step": 3620
},
{
"epoch": 0.3529525841171337,
"grad_norm": 0.4210350811481476,
"learning_rate": 0.00017646982772552444,
"loss": 1.0517,
"step": 3640
},
{
"epoch": 0.3548918840298652,
"grad_norm": 0.39142927527427673,
"learning_rate": 0.00017634054106467566,
"loss": 1.0273,
"step": 3660
},
{
"epoch": 0.3568311839425967,
"grad_norm": 0.34525975584983826,
"learning_rate": 0.00017621125440382687,
"loss": 1.0621,
"step": 3680
},
{
"epoch": 0.3587704838553282,
"grad_norm": 0.3758637607097626,
"learning_rate": 0.00017608196774297812,
"loss": 1.1157,
"step": 3700
},
{
"epoch": 0.36070978376805973,
"grad_norm": 0.33912938833236694,
"learning_rate": 0.00017595268108212936,
"loss": 1.0442,
"step": 3720
},
{
"epoch": 0.36264908368079124,
"grad_norm": 0.41856470704078674,
"learning_rate": 0.0001758233944212806,
"loss": 1.0365,
"step": 3740
},
{
"epoch": 0.36458838359352275,
"grad_norm": 0.361141562461853,
"learning_rate": 0.00017569410776043182,
"loss": 0.9991,
"step": 3760
},
{
"epoch": 0.36652768350625425,
"grad_norm": 0.3321206867694855,
"learning_rate": 0.00017556482109958306,
"loss": 0.9699,
"step": 3780
},
{
"epoch": 0.36846698341898576,
"grad_norm": 0.2905769646167755,
"learning_rate": 0.00017543553443873428,
"loss": 1.0107,
"step": 3800
},
{
"epoch": 0.37040628333171727,
"grad_norm": 0.3579418659210205,
"learning_rate": 0.00017530624777788552,
"loss": 1.0296,
"step": 3820
},
{
"epoch": 0.3723455832444488,
"grad_norm": 0.3342229425907135,
"learning_rate": 0.00017517696111703677,
"loss": 0.9138,
"step": 3840
},
{
"epoch": 0.3742848831571803,
"grad_norm": 0.30966290831565857,
"learning_rate": 0.00017504767445618798,
"loss": 0.9813,
"step": 3860
},
{
"epoch": 0.3762241830699118,
"grad_norm": 0.29645347595214844,
"learning_rate": 0.00017491838779533922,
"loss": 0.9978,
"step": 3880
},
{
"epoch": 0.37816348298264324,
"grad_norm": 0.4376464784145355,
"learning_rate": 0.00017478910113449044,
"loss": 0.9493,
"step": 3900
},
{
"epoch": 0.38010278289537475,
"grad_norm": 0.3137510120868683,
"learning_rate": 0.00017465981447364168,
"loss": 1.1089,
"step": 3920
},
{
"epoch": 0.38204208280810625,
"grad_norm": 0.3381621241569519,
"learning_rate": 0.00017453052781279293,
"loss": 0.9413,
"step": 3940
},
{
"epoch": 0.38398138272083776,
"grad_norm": 0.3513394296169281,
"learning_rate": 0.00017440124115194417,
"loss": 1.0343,
"step": 3960
},
{
"epoch": 0.38592068263356927,
"grad_norm": 0.4285072684288025,
"learning_rate": 0.0001742719544910954,
"loss": 1.0056,
"step": 3980
},
{
"epoch": 0.3878599825463008,
"grad_norm": 0.33688676357269287,
"learning_rate": 0.00017414266783024663,
"loss": 0.942,
"step": 4000
},
{
"epoch": 0.3897992824590323,
"grad_norm": 0.33148035407066345,
"learning_rate": 0.00017401338116939785,
"loss": 1.0435,
"step": 4020
},
{
"epoch": 0.3917385823717638,
"grad_norm": 0.35128480195999146,
"learning_rate": 0.0001738840945085491,
"loss": 0.9324,
"step": 4040
},
{
"epoch": 0.3936778822844953,
"grad_norm": 0.4236750304698944,
"learning_rate": 0.00017375480784770033,
"loss": 1.0786,
"step": 4060
},
{
"epoch": 0.3956171821972268,
"grad_norm": 0.37926408648490906,
"learning_rate": 0.00017362552118685155,
"loss": 1.0738,
"step": 4080
},
{
"epoch": 0.3975564821099583,
"grad_norm": 0.38108745217323303,
"learning_rate": 0.0001734962345260028,
"loss": 1.0038,
"step": 4100
},
{
"epoch": 0.3994957820226898,
"grad_norm": 0.4261017143726349,
"learning_rate": 0.000173366947865154,
"loss": 1.0599,
"step": 4120
},
{
"epoch": 0.4014350819354213,
"grad_norm": 0.5772719383239746,
"learning_rate": 0.00017323766120430525,
"loss": 0.9949,
"step": 4140
},
{
"epoch": 0.40337438184815283,
"grad_norm": 0.8369653224945068,
"learning_rate": 0.0001731083745434565,
"loss": 1.0253,
"step": 4160
},
{
"epoch": 0.40531368176088434,
"grad_norm": 0.47140738368034363,
"learning_rate": 0.00017297908788260774,
"loss": 1.0016,
"step": 4180
},
{
"epoch": 0.40725298167361584,
"grad_norm": 0.33165350556373596,
"learning_rate": 0.00017284980122175895,
"loss": 1.0053,
"step": 4200
},
{
"epoch": 0.40919228158634735,
"grad_norm": 0.35803553462028503,
"learning_rate": 0.0001727205145609102,
"loss": 1.0511,
"step": 4220
},
{
"epoch": 0.41113158149907886,
"grad_norm": 0.2802737057209015,
"learning_rate": 0.0001725912279000614,
"loss": 1.0259,
"step": 4240
},
{
"epoch": 0.4130708814118103,
"grad_norm": 0.4227242171764374,
"learning_rate": 0.00017246194123921266,
"loss": 1.015,
"step": 4260
},
{
"epoch": 0.4150101813245418,
"grad_norm": 0.3336915075778961,
"learning_rate": 0.0001723326545783639,
"loss": 1.0925,
"step": 4280
},
{
"epoch": 0.4169494812372733,
"grad_norm": 0.4097139537334442,
"learning_rate": 0.00017220336791751512,
"loss": 0.982,
"step": 4300
},
{
"epoch": 0.41888878115000483,
"grad_norm": 0.33094799518585205,
"learning_rate": 0.00017207408125666636,
"loss": 0.9775,
"step": 4320
},
{
"epoch": 0.42082808106273634,
"grad_norm": 0.4628673195838928,
"learning_rate": 0.00017194479459581758,
"loss": 1.0447,
"step": 4340
},
{
"epoch": 0.42276738097546784,
"grad_norm": 0.34319597482681274,
"learning_rate": 0.00017181550793496882,
"loss": 1.0082,
"step": 4360
},
{
"epoch": 0.42470668088819935,
"grad_norm": 0.4299662709236145,
"learning_rate": 0.00017168622127412004,
"loss": 1.0364,
"step": 4380
},
{
"epoch": 0.42664598080093086,
"grad_norm": 0.26310792565345764,
"learning_rate": 0.00017155693461327128,
"loss": 0.9894,
"step": 4400
},
{
"epoch": 0.42858528071366236,
"grad_norm": 0.3722013235092163,
"learning_rate": 0.00017142764795242252,
"loss": 1.0648,
"step": 4420
},
{
"epoch": 0.43052458062639387,
"grad_norm": 0.31930679082870483,
"learning_rate": 0.00017129836129157374,
"loss": 0.9883,
"step": 4440
},
{
"epoch": 0.4324638805391254,
"grad_norm": 0.40671297907829285,
"learning_rate": 0.00017116907463072498,
"loss": 1.019,
"step": 4460
},
{
"epoch": 0.4344031804518569,
"grad_norm": 0.2964651882648468,
"learning_rate": 0.0001710397879698762,
"loss": 1.0149,
"step": 4480
},
{
"epoch": 0.4363424803645884,
"grad_norm": 0.37347179651260376,
"learning_rate": 0.00017091050130902747,
"loss": 1.0291,
"step": 4500
},
{
"epoch": 0.4382817802773199,
"grad_norm": 0.4047054946422577,
"learning_rate": 0.00017078121464817868,
"loss": 1.033,
"step": 4520
},
{
"epoch": 0.4402210801900514,
"grad_norm": 0.5139878988265991,
"learning_rate": 0.00017065192798732993,
"loss": 0.919,
"step": 4540
},
{
"epoch": 0.4421603801027829,
"grad_norm": 0.3413216173648834,
"learning_rate": 0.00017052264132648114,
"loss": 1.0896,
"step": 4560
},
{
"epoch": 0.4440996800155144,
"grad_norm": 0.6158967614173889,
"learning_rate": 0.00017039335466563239,
"loss": 1.0038,
"step": 4580
},
{
"epoch": 0.4460389799282459,
"grad_norm": 0.39351287484169006,
"learning_rate": 0.0001702640680047836,
"loss": 1.0121,
"step": 4600
},
{
"epoch": 0.44797827984097743,
"grad_norm": 0.33461031317710876,
"learning_rate": 0.00017013478134393485,
"loss": 0.9365,
"step": 4620
},
{
"epoch": 0.4499175797537089,
"grad_norm": 0.4201250970363617,
"learning_rate": 0.0001700054946830861,
"loss": 1.0742,
"step": 4640
},
{
"epoch": 0.4518568796664404,
"grad_norm": 0.33260366320610046,
"learning_rate": 0.0001698762080222373,
"loss": 1.0086,
"step": 4660
},
{
"epoch": 0.4537961795791719,
"grad_norm": 0.422360360622406,
"learning_rate": 0.00016974692136138855,
"loss": 1.0209,
"step": 4680
},
{
"epoch": 0.4557354794919034,
"grad_norm": 0.4392210841178894,
"learning_rate": 0.00016961763470053976,
"loss": 0.9916,
"step": 4700
},
{
"epoch": 0.4576747794046349,
"grad_norm": 0.4346536099910736,
"learning_rate": 0.000169488348039691,
"loss": 1.0598,
"step": 4720
},
{
"epoch": 0.4596140793173664,
"grad_norm": 0.4439583718776703,
"learning_rate": 0.00016935906137884225,
"loss": 1.0661,
"step": 4740
},
{
"epoch": 0.4615533792300979,
"grad_norm": 0.40431517362594604,
"learning_rate": 0.0001692297747179935,
"loss": 0.984,
"step": 4760
},
{
"epoch": 0.46349267914282943,
"grad_norm": 0.4152517020702362,
"learning_rate": 0.0001691004880571447,
"loss": 1.0098,
"step": 4780
},
{
"epoch": 0.46543197905556094,
"grad_norm": 0.3530963957309723,
"learning_rate": 0.00016897120139629595,
"loss": 1.0124,
"step": 4800
},
{
"epoch": 0.46737127896829245,
"grad_norm": 0.28457310795783997,
"learning_rate": 0.00016884191473544717,
"loss": 0.9547,
"step": 4820
},
{
"epoch": 0.46931057888102395,
"grad_norm": 0.402810275554657,
"learning_rate": 0.0001687126280745984,
"loss": 1.0848,
"step": 4840
},
{
"epoch": 0.47124987879375546,
"grad_norm": 0.3450517952442169,
"learning_rate": 0.00016858334141374966,
"loss": 1.0986,
"step": 4860
},
{
"epoch": 0.47318917870648697,
"grad_norm": 0.34125566482543945,
"learning_rate": 0.00016845405475290087,
"loss": 1.0324,
"step": 4880
},
{
"epoch": 0.4751284786192185,
"grad_norm": 0.3199256658554077,
"learning_rate": 0.00016832476809205212,
"loss": 1.0329,
"step": 4900
},
{
"epoch": 0.47706777853195,
"grad_norm": 0.3269367814064026,
"learning_rate": 0.00016819548143120333,
"loss": 0.986,
"step": 4920
},
{
"epoch": 0.4790070784446815,
"grad_norm": 0.41509127616882324,
"learning_rate": 0.00016806619477035458,
"loss": 0.9819,
"step": 4940
},
{
"epoch": 0.480946378357413,
"grad_norm": 0.378519743680954,
"learning_rate": 0.00016793690810950582,
"loss": 1.0048,
"step": 4960
},
{
"epoch": 0.4828856782701445,
"grad_norm": 0.3766985833644867,
"learning_rate": 0.00016780762144865706,
"loss": 0.9755,
"step": 4980
},
{
"epoch": 0.48482497818287595,
"grad_norm": 0.42813560366630554,
"learning_rate": 0.00016767833478780828,
"loss": 0.9251,
"step": 5000
}
],
"logging_steps": 20,
"max_steps": 30939,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.16313571466199e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}