|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.48482497818287595, |
|
"eval_steps": 500, |
|
"global_step": 5000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0019392999127315039, |
|
"grad_norm": 0.5741730332374573, |
|
"learning_rate": 0.00019987071333915123, |
|
"loss": 1.7671, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0038785998254630078, |
|
"grad_norm": 0.3640196621417999, |
|
"learning_rate": 0.00019974142667830247, |
|
"loss": 1.2552, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.005817899738194512, |
|
"grad_norm": 0.33685317635536194, |
|
"learning_rate": 0.0001996121400174537, |
|
"loss": 1.1058, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0077571996509260156, |
|
"grad_norm": 0.5886279940605164, |
|
"learning_rate": 0.00019948285335660496, |
|
"loss": 1.1087, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.009696499563657519, |
|
"grad_norm": 0.5692952871322632, |
|
"learning_rate": 0.00019935356669575617, |
|
"loss": 1.1196, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.011635799476389024, |
|
"grad_norm": 0.49794259667396545, |
|
"learning_rate": 0.00019922428003490741, |
|
"loss": 1.1866, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.013575099389120528, |
|
"grad_norm": 0.4546428322792053, |
|
"learning_rate": 0.00019909499337405863, |
|
"loss": 1.1637, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.015514399301852031, |
|
"grad_norm": 0.4478498697280884, |
|
"learning_rate": 0.00019896570671320987, |
|
"loss": 1.1465, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.017453699214583535, |
|
"grad_norm": 0.5994399785995483, |
|
"learning_rate": 0.00019883642005236112, |
|
"loss": 1.1346, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.019392999127315038, |
|
"grad_norm": 0.44898244738578796, |
|
"learning_rate": 0.00019870713339151233, |
|
"loss": 1.1577, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02133229904004654, |
|
"grad_norm": 0.4165457785129547, |
|
"learning_rate": 0.00019857784673066358, |
|
"loss": 1.1123, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.02327159895277805, |
|
"grad_norm": 0.4125240743160248, |
|
"learning_rate": 0.0001984485600698148, |
|
"loss": 1.1449, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.025210898865509552, |
|
"grad_norm": 0.49324268102645874, |
|
"learning_rate": 0.00019831927340896604, |
|
"loss": 1.0512, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.027150198778241055, |
|
"grad_norm": 0.5381457209587097, |
|
"learning_rate": 0.00019818998674811725, |
|
"loss": 1.1418, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.02908949869097256, |
|
"grad_norm": 0.47153565287590027, |
|
"learning_rate": 0.00019806070008726852, |
|
"loss": 1.0823, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.031028798603704062, |
|
"grad_norm": 0.5824480056762695, |
|
"learning_rate": 0.00019793141342641974, |
|
"loss": 1.178, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.03296809851643557, |
|
"grad_norm": 0.4500732719898224, |
|
"learning_rate": 0.00019780212676557098, |
|
"loss": 1.1021, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.03490739842916707, |
|
"grad_norm": 0.5821068286895752, |
|
"learning_rate": 0.0001976728401047222, |
|
"loss": 1.0628, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.036846698341898576, |
|
"grad_norm": 0.39794814586639404, |
|
"learning_rate": 0.00019754355344387344, |
|
"loss": 1.0391, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.038785998254630076, |
|
"grad_norm": 0.38640904426574707, |
|
"learning_rate": 0.00019741426678302469, |
|
"loss": 1.0417, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04072529816736158, |
|
"grad_norm": 0.3635335862636566, |
|
"learning_rate": 0.0001972849801221759, |
|
"loss": 0.9385, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.04266459808009308, |
|
"grad_norm": 0.40371426939964294, |
|
"learning_rate": 0.00019715569346132714, |
|
"loss": 1.0481, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.04460389799282459, |
|
"grad_norm": 0.37828192114830017, |
|
"learning_rate": 0.00019702640680047836, |
|
"loss": 1.0186, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.0465431979055561, |
|
"grad_norm": 0.4067387282848358, |
|
"learning_rate": 0.0001968971201396296, |
|
"loss": 1.0012, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.0484824978182876, |
|
"grad_norm": 0.42465779185295105, |
|
"learning_rate": 0.00019676783347878082, |
|
"loss": 1.11, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.050421797731019104, |
|
"grad_norm": 0.4794886112213135, |
|
"learning_rate": 0.0001966385468179321, |
|
"loss": 1.0221, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.052361097643750604, |
|
"grad_norm": 0.40525391697883606, |
|
"learning_rate": 0.0001965092601570833, |
|
"loss": 0.993, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.05430039755648211, |
|
"grad_norm": 0.47314971685409546, |
|
"learning_rate": 0.00019637997349623455, |
|
"loss": 1.0939, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.05623969746921361, |
|
"grad_norm": 0.30406779050827026, |
|
"learning_rate": 0.00019625068683538577, |
|
"loss": 1.087, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.05817899738194512, |
|
"grad_norm": 0.4694748818874359, |
|
"learning_rate": 0.00019612140017453698, |
|
"loss": 1.1412, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.060118297294676624, |
|
"grad_norm": 0.4146924316883087, |
|
"learning_rate": 0.00019599211351368823, |
|
"loss": 1.0378, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.062057597207408124, |
|
"grad_norm": 0.40017783641815186, |
|
"learning_rate": 0.00019586282685283947, |
|
"loss": 1.0924, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.06399689712013963, |
|
"grad_norm": 0.6136152744293213, |
|
"learning_rate": 0.0001957335401919907, |
|
"loss": 1.0273, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.06593619703287114, |
|
"grad_norm": 0.46415477991104126, |
|
"learning_rate": 0.00019560425353114193, |
|
"loss": 1.0324, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.06787549694560263, |
|
"grad_norm": 0.579818606376648, |
|
"learning_rate": 0.00019547496687029317, |
|
"loss": 1.0475, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.06981479685833414, |
|
"grad_norm": 0.35234829783439636, |
|
"learning_rate": 0.0001953456802094444, |
|
"loss": 0.9991, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.07175409677106565, |
|
"grad_norm": 0.4519262909889221, |
|
"learning_rate": 0.00019521639354859563, |
|
"loss": 1.0038, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.07369339668379715, |
|
"grad_norm": 0.47795727849006653, |
|
"learning_rate": 0.00019508710688774687, |
|
"loss": 1.0077, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.07563269659652866, |
|
"grad_norm": 1.1003836393356323, |
|
"learning_rate": 0.0001949578202268981, |
|
"loss": 0.9657, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.07757199650926015, |
|
"grad_norm": 0.38754114508628845, |
|
"learning_rate": 0.00019482853356604933, |
|
"loss": 1.1136, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07951129642199166, |
|
"grad_norm": 0.42165011167526245, |
|
"learning_rate": 0.00019469924690520055, |
|
"loss": 1.0678, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.08145059633472317, |
|
"grad_norm": 0.28059273958206177, |
|
"learning_rate": 0.0001945699602443518, |
|
"loss": 1.1409, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.08338989624745467, |
|
"grad_norm": 0.4145539700984955, |
|
"learning_rate": 0.00019444067358350304, |
|
"loss": 1.1071, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.08532919616018617, |
|
"grad_norm": 0.46548235416412354, |
|
"learning_rate": 0.00019431138692265428, |
|
"loss": 0.9627, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.08726849607291767, |
|
"grad_norm": 0.41764864325523376, |
|
"learning_rate": 0.0001941821002618055, |
|
"loss": 0.9689, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.08920779598564918, |
|
"grad_norm": 0.5186419486999512, |
|
"learning_rate": 0.00019405281360095674, |
|
"loss": 1.079, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.09114709589838069, |
|
"grad_norm": 0.4437251091003418, |
|
"learning_rate": 0.00019392352694010796, |
|
"loss": 1.0988, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.0930863958111122, |
|
"grad_norm": 0.37555068731307983, |
|
"learning_rate": 0.0001937942402792592, |
|
"loss": 1.0355, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.09502569572384369, |
|
"grad_norm": 0.305160790681839, |
|
"learning_rate": 0.00019366495361841044, |
|
"loss": 0.9559, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.0969649956365752, |
|
"grad_norm": 0.36364415287971497, |
|
"learning_rate": 0.00019353566695756166, |
|
"loss": 1.0374, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0989042955493067, |
|
"grad_norm": 0.3676876127719879, |
|
"learning_rate": 0.0001934063802967129, |
|
"loss": 1.1034, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.10084359546203821, |
|
"grad_norm": 0.41380801796913147, |
|
"learning_rate": 0.00019327709363586412, |
|
"loss": 0.9731, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.10278289537476971, |
|
"grad_norm": 0.3573336899280548, |
|
"learning_rate": 0.00019314780697501536, |
|
"loss": 1.0479, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.10472219528750121, |
|
"grad_norm": 0.45588839054107666, |
|
"learning_rate": 0.00019301852031416658, |
|
"loss": 1.0819, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.10666149520023271, |
|
"grad_norm": 0.37001991271972656, |
|
"learning_rate": 0.00019288923365331785, |
|
"loss": 1.0197, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.10860079511296422, |
|
"grad_norm": 0.41651812195777893, |
|
"learning_rate": 0.00019275994699246906, |
|
"loss": 1.0142, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.11054009502569573, |
|
"grad_norm": 0.4468708038330078, |
|
"learning_rate": 0.0001926306603316203, |
|
"loss": 1.1007, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.11247939493842722, |
|
"grad_norm": 0.3199480473995209, |
|
"learning_rate": 0.00019250137367077152, |
|
"loss": 1.004, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.11441869485115873, |
|
"grad_norm": 0.4078716039657593, |
|
"learning_rate": 0.00019237208700992274, |
|
"loss": 1.0726, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.11635799476389024, |
|
"grad_norm": 0.42165908217430115, |
|
"learning_rate": 0.000192242800349074, |
|
"loss": 1.0257, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.11829729467662174, |
|
"grad_norm": 0.3128000795841217, |
|
"learning_rate": 0.00019211351368822523, |
|
"loss": 1.0694, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.12023659458935325, |
|
"grad_norm": 0.3406975567340851, |
|
"learning_rate": 0.00019198422702737647, |
|
"loss": 1.0577, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.12217589450208474, |
|
"grad_norm": 0.3885703384876251, |
|
"learning_rate": 0.00019185494036652768, |
|
"loss": 1.0074, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.12411519441481625, |
|
"grad_norm": 0.37669333815574646, |
|
"learning_rate": 0.00019172565370567893, |
|
"loss": 1.012, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.12605449432754776, |
|
"grad_norm": 0.4600653350353241, |
|
"learning_rate": 0.00019159636704483014, |
|
"loss": 1.0039, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.12799379424027926, |
|
"grad_norm": 0.3202829957008362, |
|
"learning_rate": 0.0001914670803839814, |
|
"loss": 1.0403, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.12993309415301077, |
|
"grad_norm": 0.441177636384964, |
|
"learning_rate": 0.00019133779372313263, |
|
"loss": 1.0413, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.13187239406574228, |
|
"grad_norm": 0.39573097229003906, |
|
"learning_rate": 0.00019120850706228385, |
|
"loss": 1.0821, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.13381169397847378, |
|
"grad_norm": 0.45021307468414307, |
|
"learning_rate": 0.0001910792204014351, |
|
"loss": 1.0128, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.13575099389120526, |
|
"grad_norm": 0.44374045729637146, |
|
"learning_rate": 0.0001909499337405863, |
|
"loss": 1.0713, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.13769029380393677, |
|
"grad_norm": 0.33873313665390015, |
|
"learning_rate": 0.00019082064707973755, |
|
"loss": 1.0951, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.13962959371666828, |
|
"grad_norm": 0.36703407764434814, |
|
"learning_rate": 0.0001906913604188888, |
|
"loss": 1.0662, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.14156889362939978, |
|
"grad_norm": 0.31029993295669556, |
|
"learning_rate": 0.00019056207375804004, |
|
"loss": 1.0673, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.1435081935421313, |
|
"grad_norm": 0.35601869225502014, |
|
"learning_rate": 0.00019043278709719125, |
|
"loss": 1.0696, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.1454474934548628, |
|
"grad_norm": 0.39259615540504456, |
|
"learning_rate": 0.0001903035004363425, |
|
"loss": 1.0173, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.1473867933675943, |
|
"grad_norm": 0.39904189109802246, |
|
"learning_rate": 0.0001901742137754937, |
|
"loss": 1.1188, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.1493260932803258, |
|
"grad_norm": 0.395321786403656, |
|
"learning_rate": 0.00019004492711464495, |
|
"loss": 1.0039, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.15126539319305732, |
|
"grad_norm": 0.37806248664855957, |
|
"learning_rate": 0.0001899156404537962, |
|
"loss": 1.1001, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.1532046931057888, |
|
"grad_norm": 0.3514845669269562, |
|
"learning_rate": 0.00018978635379294741, |
|
"loss": 1.0592, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.1551439930185203, |
|
"grad_norm": 0.3304252624511719, |
|
"learning_rate": 0.00018965706713209866, |
|
"loss": 1.0237, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.1570832929312518, |
|
"grad_norm": 0.31678342819213867, |
|
"learning_rate": 0.00018952778047124987, |
|
"loss": 0.962, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.15902259284398332, |
|
"grad_norm": 0.34594252705574036, |
|
"learning_rate": 0.00018939849381040112, |
|
"loss": 0.9677, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.16096189275671482, |
|
"grad_norm": 0.35923656821250916, |
|
"learning_rate": 0.00018926920714955236, |
|
"loss": 1.0206, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.16290119266944633, |
|
"grad_norm": 0.4773354232311249, |
|
"learning_rate": 0.0001891399204887036, |
|
"loss": 1.0682, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.16484049258217784, |
|
"grad_norm": 0.3007306456565857, |
|
"learning_rate": 0.00018901063382785482, |
|
"loss": 0.9395, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.16677979249490935, |
|
"grad_norm": 0.40934816002845764, |
|
"learning_rate": 0.00018888134716700606, |
|
"loss": 1.08, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.16871909240764085, |
|
"grad_norm": 0.35403525829315186, |
|
"learning_rate": 0.00018875206050615728, |
|
"loss": 0.9955, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.17065839232037233, |
|
"grad_norm": 0.38091763854026794, |
|
"learning_rate": 0.00018862277384530852, |
|
"loss": 1.0324, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.17259769223310384, |
|
"grad_norm": 0.37331679463386536, |
|
"learning_rate": 0.00018849348718445977, |
|
"loss": 1.0621, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.17453699214583535, |
|
"grad_norm": 0.23007287085056305, |
|
"learning_rate": 0.00018836420052361098, |
|
"loss": 1.1031, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.17647629205856685, |
|
"grad_norm": 0.32545435428619385, |
|
"learning_rate": 0.00018823491386276223, |
|
"loss": 0.9931, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.17841559197129836, |
|
"grad_norm": 0.44806909561157227, |
|
"learning_rate": 0.00018810562720191344, |
|
"loss": 1.0262, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.18035489188402987, |
|
"grad_norm": 0.37432861328125, |
|
"learning_rate": 0.00018797634054106468, |
|
"loss": 0.9484, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.18229419179676137, |
|
"grad_norm": 0.32971423864364624, |
|
"learning_rate": 0.00018784705388021593, |
|
"loss": 1.0482, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.18423349170949288, |
|
"grad_norm": 0.2877088189125061, |
|
"learning_rate": 0.00018771776721936717, |
|
"loss": 1.1487, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.1861727916222244, |
|
"grad_norm": 0.45374777913093567, |
|
"learning_rate": 0.0001875884805585184, |
|
"loss": 1.0074, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.1881120915349559, |
|
"grad_norm": 0.38346460461616516, |
|
"learning_rate": 0.0001874591938976696, |
|
"loss": 1.0462, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.19005139144768737, |
|
"grad_norm": 0.3672585189342499, |
|
"learning_rate": 0.00018732990723682085, |
|
"loss": 1.0646, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.19199069136041888, |
|
"grad_norm": 0.33648261427879333, |
|
"learning_rate": 0.00018720062057597206, |
|
"loss": 0.9792, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.1939299912731504, |
|
"grad_norm": 0.37683922052383423, |
|
"learning_rate": 0.00018707133391512333, |
|
"loss": 1.015, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1958692911858819, |
|
"grad_norm": 0.4112173616886139, |
|
"learning_rate": 0.00018694204725427455, |
|
"loss": 0.9402, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.1978085910986134, |
|
"grad_norm": 0.33208009600639343, |
|
"learning_rate": 0.0001868127605934258, |
|
"loss": 1.0754, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.1997478910113449, |
|
"grad_norm": 0.3744449317455292, |
|
"learning_rate": 0.000186683473932577, |
|
"loss": 0.9324, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.20168719092407641, |
|
"grad_norm": 0.30895674228668213, |
|
"learning_rate": 0.00018655418727172825, |
|
"loss": 1.0611, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.20362649083680792, |
|
"grad_norm": 0.3781767189502716, |
|
"learning_rate": 0.00018642490061087947, |
|
"loss": 1.0332, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.20556579074953943, |
|
"grad_norm": 0.40028223395347595, |
|
"learning_rate": 0.0001862956139500307, |
|
"loss": 0.9508, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.2075050906622709, |
|
"grad_norm": 0.36377599835395813, |
|
"learning_rate": 0.00018616632728918195, |
|
"loss": 0.9085, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.20944439057500241, |
|
"grad_norm": 0.28180891275405884, |
|
"learning_rate": 0.00018603704062833317, |
|
"loss": 1.0315, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.21138369048773392, |
|
"grad_norm": 0.41635292768478394, |
|
"learning_rate": 0.00018590775396748441, |
|
"loss": 1.0278, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.21332299040046543, |
|
"grad_norm": 0.45736029744148254, |
|
"learning_rate": 0.00018577846730663563, |
|
"loss": 0.9909, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.21526229031319694, |
|
"grad_norm": 0.4075273871421814, |
|
"learning_rate": 0.00018564918064578687, |
|
"loss": 0.9759, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.21720159022592844, |
|
"grad_norm": 0.3832456171512604, |
|
"learning_rate": 0.00018551989398493812, |
|
"loss": 1.0131, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.21914089013865995, |
|
"grad_norm": 0.39031124114990234, |
|
"learning_rate": 0.00018539060732408936, |
|
"loss": 1.0805, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.22108019005139146, |
|
"grad_norm": 0.27879253029823303, |
|
"learning_rate": 0.00018526132066324058, |
|
"loss": 1.0528, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.22301948996412296, |
|
"grad_norm": 0.35129788517951965, |
|
"learning_rate": 0.00018513203400239182, |
|
"loss": 1.0757, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.22495878987685444, |
|
"grad_norm": 0.3349596858024597, |
|
"learning_rate": 0.00018500274734154304, |
|
"loss": 1.0534, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.22689808978958595, |
|
"grad_norm": 0.38909098505973816, |
|
"learning_rate": 0.00018487346068069428, |
|
"loss": 1.0352, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.22883738970231746, |
|
"grad_norm": 0.36970818042755127, |
|
"learning_rate": 0.00018474417401984552, |
|
"loss": 1.0365, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.23077668961504896, |
|
"grad_norm": 0.3963492810726166, |
|
"learning_rate": 0.00018461488735899674, |
|
"loss": 0.9925, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.23271598952778047, |
|
"grad_norm": 0.36081552505493164, |
|
"learning_rate": 0.00018448560069814798, |
|
"loss": 1.1273, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.23465528944051198, |
|
"grad_norm": 0.34602901339530945, |
|
"learning_rate": 0.0001843563140372992, |
|
"loss": 0.9404, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.23659458935324348, |
|
"grad_norm": 0.4692002832889557, |
|
"learning_rate": 0.00018422702737645044, |
|
"loss": 1.0589, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.238533889265975, |
|
"grad_norm": 0.2668192386627197, |
|
"learning_rate": 0.00018409774071560168, |
|
"loss": 1.0315, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.2404731891787065, |
|
"grad_norm": 0.32327035069465637, |
|
"learning_rate": 0.00018396845405475293, |
|
"loss": 1.0387, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.24241248909143798, |
|
"grad_norm": 0.3037966787815094, |
|
"learning_rate": 0.00018383916739390414, |
|
"loss": 1.0396, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.24435178900416948, |
|
"grad_norm": 0.4088995158672333, |
|
"learning_rate": 0.00018370988073305536, |
|
"loss": 0.9526, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.246291088916901, |
|
"grad_norm": 0.3864790201187134, |
|
"learning_rate": 0.0001835805940722066, |
|
"loss": 1.1621, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.2482303888296325, |
|
"grad_norm": 0.3342384099960327, |
|
"learning_rate": 0.00018345130741135782, |
|
"loss": 1.1299, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.250169688742364, |
|
"grad_norm": 0.3113463222980499, |
|
"learning_rate": 0.0001833220207505091, |
|
"loss": 1.0323, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.2521089886550955, |
|
"grad_norm": 0.33959150314331055, |
|
"learning_rate": 0.0001831927340896603, |
|
"loss": 1.0998, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.254048288567827, |
|
"grad_norm": 0.35392144322395325, |
|
"learning_rate": 0.00018306344742881155, |
|
"loss": 1.1187, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.2559875884805585, |
|
"grad_norm": 0.36454859375953674, |
|
"learning_rate": 0.00018293416076796277, |
|
"loss": 1.0935, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.25792688839329003, |
|
"grad_norm": 0.5319137573242188, |
|
"learning_rate": 0.000182804874107114, |
|
"loss": 0.9902, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.25986618830602154, |
|
"grad_norm": 0.3222362697124481, |
|
"learning_rate": 0.00018267558744626525, |
|
"loss": 1.0396, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.26180548821875305, |
|
"grad_norm": 0.3684043288230896, |
|
"learning_rate": 0.00018254630078541647, |
|
"loss": 1.004, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.26374478813148455, |
|
"grad_norm": 0.33372971415519714, |
|
"learning_rate": 0.0001824170141245677, |
|
"loss": 1.0479, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.26568408804421606, |
|
"grad_norm": 0.28436896204948425, |
|
"learning_rate": 0.00018228772746371893, |
|
"loss": 0.9811, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.26762338795694757, |
|
"grad_norm": 0.42128750681877136, |
|
"learning_rate": 0.00018215844080287017, |
|
"loss": 1.0022, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.269562687869679, |
|
"grad_norm": 0.38527703285217285, |
|
"learning_rate": 0.0001820291541420214, |
|
"loss": 1.0407, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.2715019877824105, |
|
"grad_norm": 0.39973485469818115, |
|
"learning_rate": 0.00018189986748117266, |
|
"loss": 1.0739, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.27344128769514203, |
|
"grad_norm": 0.39641934633255005, |
|
"learning_rate": 0.00018177058082032387, |
|
"loss": 0.9724, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.27538058760787354, |
|
"grad_norm": 0.3733800947666168, |
|
"learning_rate": 0.00018164129415947512, |
|
"loss": 0.9696, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.27731988752060505, |
|
"grad_norm": 0.42063865065574646, |
|
"learning_rate": 0.00018151200749862633, |
|
"loss": 1.0355, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.27925918743333655, |
|
"grad_norm": 0.3460356593132019, |
|
"learning_rate": 0.00018138272083777758, |
|
"loss": 0.9773, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.28119848734606806, |
|
"grad_norm": 0.45516788959503174, |
|
"learning_rate": 0.0001812534341769288, |
|
"loss": 1.0263, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.28313778725879957, |
|
"grad_norm": 0.3858850300312042, |
|
"learning_rate": 0.00018112414751608004, |
|
"loss": 1.0375, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.2850770871715311, |
|
"grad_norm": 0.419709712266922, |
|
"learning_rate": 0.00018099486085523128, |
|
"loss": 0.9879, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.2870163870842626, |
|
"grad_norm": 0.37489813566207886, |
|
"learning_rate": 0.0001808655741943825, |
|
"loss": 1.0098, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.2889556869969941, |
|
"grad_norm": 0.36090582609176636, |
|
"learning_rate": 0.00018073628753353374, |
|
"loss": 1.0185, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.2908949869097256, |
|
"grad_norm": 0.42885035276412964, |
|
"learning_rate": 0.00018060700087268495, |
|
"loss": 0.9338, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2928342868224571, |
|
"grad_norm": 0.3570854961872101, |
|
"learning_rate": 0.00018047771421183622, |
|
"loss": 1.0928, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.2947735867351886, |
|
"grad_norm": 0.3873838484287262, |
|
"learning_rate": 0.00018034842755098744, |
|
"loss": 1.0241, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.2967128866479201, |
|
"grad_norm": 0.5299991965293884, |
|
"learning_rate": 0.00018021914089013868, |
|
"loss": 0.994, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.2986521865606516, |
|
"grad_norm": 0.40146002173423767, |
|
"learning_rate": 0.0001800898542292899, |
|
"loss": 1.035, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.30059148647338313, |
|
"grad_norm": 0.4087996780872345, |
|
"learning_rate": 0.00017996056756844112, |
|
"loss": 1.0682, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.30253078638611464, |
|
"grad_norm": 0.39104148745536804, |
|
"learning_rate": 0.00017983128090759236, |
|
"loss": 1.0486, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.30447008629884614, |
|
"grad_norm": 0.36926761269569397, |
|
"learning_rate": 0.0001797019942467436, |
|
"loss": 1.0358, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.3064093862115776, |
|
"grad_norm": 0.32747143507003784, |
|
"learning_rate": 0.00017957270758589485, |
|
"loss": 0.9951, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.3083486861243091, |
|
"grad_norm": 0.42642009258270264, |
|
"learning_rate": 0.00017944342092504606, |
|
"loss": 1.0647, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.3102879860370406, |
|
"grad_norm": 0.4502064883708954, |
|
"learning_rate": 0.0001793141342641973, |
|
"loss": 0.9939, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.3122272859497721, |
|
"grad_norm": 0.5583937764167786, |
|
"learning_rate": 0.00017918484760334852, |
|
"loss": 0.9618, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.3141665858625036, |
|
"grad_norm": 0.30554625391960144, |
|
"learning_rate": 0.00017905556094249977, |
|
"loss": 1.0496, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.31610588577523513, |
|
"grad_norm": 0.4174688458442688, |
|
"learning_rate": 0.000178926274281651, |
|
"loss": 1.0396, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.31804518568796664, |
|
"grad_norm": 0.410165011882782, |
|
"learning_rate": 0.00017879698762080222, |
|
"loss": 1.0758, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.31998448560069814, |
|
"grad_norm": 0.34972333908081055, |
|
"learning_rate": 0.00017866770095995347, |
|
"loss": 1.034, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.32192378551342965, |
|
"grad_norm": 0.4535263478755951, |
|
"learning_rate": 0.00017853841429910468, |
|
"loss": 0.977, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.32386308542616116, |
|
"grad_norm": 0.42111313343048096, |
|
"learning_rate": 0.00017840912763825593, |
|
"loss": 1.0414, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.32580238533889266, |
|
"grad_norm": 0.38072848320007324, |
|
"learning_rate": 0.00017827984097740717, |
|
"loss": 0.9297, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.32774168525162417, |
|
"grad_norm": 0.4085475504398346, |
|
"learning_rate": 0.00017815055431655841, |
|
"loss": 0.997, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.3296809851643557, |
|
"grad_norm": 0.37679746747016907, |
|
"learning_rate": 0.00017802126765570963, |
|
"loss": 1.0623, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.3316202850770872, |
|
"grad_norm": 0.398362398147583, |
|
"learning_rate": 0.00017789198099486087, |
|
"loss": 1.0116, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.3335595849898187, |
|
"grad_norm": 0.4207305610179901, |
|
"learning_rate": 0.0001777626943340121, |
|
"loss": 0.9085, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.3354988849025502, |
|
"grad_norm": 0.48672834038734436, |
|
"learning_rate": 0.00017763340767316333, |
|
"loss": 0.9377, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.3374381848152817, |
|
"grad_norm": 0.500481128692627, |
|
"learning_rate": 0.00017750412101231458, |
|
"loss": 0.9824, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.3393774847280132, |
|
"grad_norm": 0.3926856517791748, |
|
"learning_rate": 0.0001773748343514658, |
|
"loss": 0.9774, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.34131678464074466, |
|
"grad_norm": 0.5099716782569885, |
|
"learning_rate": 0.00017724554769061704, |
|
"loss": 1.0182, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.34325608455347617, |
|
"grad_norm": 0.3890150785446167, |
|
"learning_rate": 0.00017711626102976825, |
|
"loss": 1.0333, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.3451953844662077, |
|
"grad_norm": 0.39873331785202026, |
|
"learning_rate": 0.0001769869743689195, |
|
"loss": 0.9978, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.3471346843789392, |
|
"grad_norm": 0.41631972789764404, |
|
"learning_rate": 0.0001768576877080707, |
|
"loss": 1.0392, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.3490739842916707, |
|
"grad_norm": 0.37472009658813477, |
|
"learning_rate": 0.00017672840104722198, |
|
"loss": 1.0268, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.3510132842044022, |
|
"grad_norm": 0.41661858558654785, |
|
"learning_rate": 0.0001765991143863732, |
|
"loss": 1.0529, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.3529525841171337, |
|
"grad_norm": 0.4210350811481476, |
|
"learning_rate": 0.00017646982772552444, |
|
"loss": 1.0517, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.3548918840298652, |
|
"grad_norm": 0.39142927527427673, |
|
"learning_rate": 0.00017634054106467566, |
|
"loss": 1.0273, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.3568311839425967, |
|
"grad_norm": 0.34525975584983826, |
|
"learning_rate": 0.00017621125440382687, |
|
"loss": 1.0621, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.3587704838553282, |
|
"grad_norm": 0.3758637607097626, |
|
"learning_rate": 0.00017608196774297812, |
|
"loss": 1.1157, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.36070978376805973, |
|
"grad_norm": 0.33912938833236694, |
|
"learning_rate": 0.00017595268108212936, |
|
"loss": 1.0442, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.36264908368079124, |
|
"grad_norm": 0.41856470704078674, |
|
"learning_rate": 0.0001758233944212806, |
|
"loss": 1.0365, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.36458838359352275, |
|
"grad_norm": 0.361141562461853, |
|
"learning_rate": 0.00017569410776043182, |
|
"loss": 0.9991, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.36652768350625425, |
|
"grad_norm": 0.3321206867694855, |
|
"learning_rate": 0.00017556482109958306, |
|
"loss": 0.9699, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.36846698341898576, |
|
"grad_norm": 0.2905769646167755, |
|
"learning_rate": 0.00017543553443873428, |
|
"loss": 1.0107, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.37040628333171727, |
|
"grad_norm": 0.3579418659210205, |
|
"learning_rate": 0.00017530624777788552, |
|
"loss": 1.0296, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.3723455832444488, |
|
"grad_norm": 0.3342229425907135, |
|
"learning_rate": 0.00017517696111703677, |
|
"loss": 0.9138, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.3742848831571803, |
|
"grad_norm": 0.30966290831565857, |
|
"learning_rate": 0.00017504767445618798, |
|
"loss": 0.9813, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.3762241830699118, |
|
"grad_norm": 0.29645347595214844, |
|
"learning_rate": 0.00017491838779533922, |
|
"loss": 0.9978, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.37816348298264324, |
|
"grad_norm": 0.4376464784145355, |
|
"learning_rate": 0.00017478910113449044, |
|
"loss": 0.9493, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.38010278289537475, |
|
"grad_norm": 0.3137510120868683, |
|
"learning_rate": 0.00017465981447364168, |
|
"loss": 1.1089, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.38204208280810625, |
|
"grad_norm": 0.3381621241569519, |
|
"learning_rate": 0.00017453052781279293, |
|
"loss": 0.9413, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.38398138272083776, |
|
"grad_norm": 0.3513394296169281, |
|
"learning_rate": 0.00017440124115194417, |
|
"loss": 1.0343, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.38592068263356927, |
|
"grad_norm": 0.4285072684288025, |
|
"learning_rate": 0.0001742719544910954, |
|
"loss": 1.0056, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.3878599825463008, |
|
"grad_norm": 0.33688676357269287, |
|
"learning_rate": 0.00017414266783024663, |
|
"loss": 0.942, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3897992824590323, |
|
"grad_norm": 0.33148035407066345, |
|
"learning_rate": 0.00017401338116939785, |
|
"loss": 1.0435, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.3917385823717638, |
|
"grad_norm": 0.35128480195999146, |
|
"learning_rate": 0.0001738840945085491, |
|
"loss": 0.9324, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.3936778822844953, |
|
"grad_norm": 0.4236750304698944, |
|
"learning_rate": 0.00017375480784770033, |
|
"loss": 1.0786, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.3956171821972268, |
|
"grad_norm": 0.37926408648490906, |
|
"learning_rate": 0.00017362552118685155, |
|
"loss": 1.0738, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.3975564821099583, |
|
"grad_norm": 0.38108745217323303, |
|
"learning_rate": 0.0001734962345260028, |
|
"loss": 1.0038, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.3994957820226898, |
|
"grad_norm": 0.4261017143726349, |
|
"learning_rate": 0.000173366947865154, |
|
"loss": 1.0599, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.4014350819354213, |
|
"grad_norm": 0.5772719383239746, |
|
"learning_rate": 0.00017323766120430525, |
|
"loss": 0.9949, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.40337438184815283, |
|
"grad_norm": 0.8369653224945068, |
|
"learning_rate": 0.0001731083745434565, |
|
"loss": 1.0253, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.40531368176088434, |
|
"grad_norm": 0.47140738368034363, |
|
"learning_rate": 0.00017297908788260774, |
|
"loss": 1.0016, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.40725298167361584, |
|
"grad_norm": 0.33165350556373596, |
|
"learning_rate": 0.00017284980122175895, |
|
"loss": 1.0053, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.40919228158634735, |
|
"grad_norm": 0.35803553462028503, |
|
"learning_rate": 0.0001727205145609102, |
|
"loss": 1.0511, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.41113158149907886, |
|
"grad_norm": 0.2802737057209015, |
|
"learning_rate": 0.0001725912279000614, |
|
"loss": 1.0259, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.4130708814118103, |
|
"grad_norm": 0.4227242171764374, |
|
"learning_rate": 0.00017246194123921266, |
|
"loss": 1.015, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.4150101813245418, |
|
"grad_norm": 0.3336915075778961, |
|
"learning_rate": 0.0001723326545783639, |
|
"loss": 1.0925, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.4169494812372733, |
|
"grad_norm": 0.4097139537334442, |
|
"learning_rate": 0.00017220336791751512, |
|
"loss": 0.982, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.41888878115000483, |
|
"grad_norm": 0.33094799518585205, |
|
"learning_rate": 0.00017207408125666636, |
|
"loss": 0.9775, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.42082808106273634, |
|
"grad_norm": 0.4628673195838928, |
|
"learning_rate": 0.00017194479459581758, |
|
"loss": 1.0447, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.42276738097546784, |
|
"grad_norm": 0.34319597482681274, |
|
"learning_rate": 0.00017181550793496882, |
|
"loss": 1.0082, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.42470668088819935, |
|
"grad_norm": 0.4299662709236145, |
|
"learning_rate": 0.00017168622127412004, |
|
"loss": 1.0364, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.42664598080093086, |
|
"grad_norm": 0.26310792565345764, |
|
"learning_rate": 0.00017155693461327128, |
|
"loss": 0.9894, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.42858528071366236, |
|
"grad_norm": 0.3722013235092163, |
|
"learning_rate": 0.00017142764795242252, |
|
"loss": 1.0648, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.43052458062639387, |
|
"grad_norm": 0.31930679082870483, |
|
"learning_rate": 0.00017129836129157374, |
|
"loss": 0.9883, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.4324638805391254, |
|
"grad_norm": 0.40671297907829285, |
|
"learning_rate": 0.00017116907463072498, |
|
"loss": 1.019, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.4344031804518569, |
|
"grad_norm": 0.2964651882648468, |
|
"learning_rate": 0.0001710397879698762, |
|
"loss": 1.0149, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.4363424803645884, |
|
"grad_norm": 0.37347179651260376, |
|
"learning_rate": 0.00017091050130902747, |
|
"loss": 1.0291, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.4382817802773199, |
|
"grad_norm": 0.4047054946422577, |
|
"learning_rate": 0.00017078121464817868, |
|
"loss": 1.033, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.4402210801900514, |
|
"grad_norm": 0.5139878988265991, |
|
"learning_rate": 0.00017065192798732993, |
|
"loss": 0.919, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.4421603801027829, |
|
"grad_norm": 0.3413216173648834, |
|
"learning_rate": 0.00017052264132648114, |
|
"loss": 1.0896, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.4440996800155144, |
|
"grad_norm": 0.6158967614173889, |
|
"learning_rate": 0.00017039335466563239, |
|
"loss": 1.0038, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.4460389799282459, |
|
"grad_norm": 0.39351287484169006, |
|
"learning_rate": 0.0001702640680047836, |
|
"loss": 1.0121, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.44797827984097743, |
|
"grad_norm": 0.33461031317710876, |
|
"learning_rate": 0.00017013478134393485, |
|
"loss": 0.9365, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.4499175797537089, |
|
"grad_norm": 0.4201250970363617, |
|
"learning_rate": 0.0001700054946830861, |
|
"loss": 1.0742, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.4518568796664404, |
|
"grad_norm": 0.33260366320610046, |
|
"learning_rate": 0.0001698762080222373, |
|
"loss": 1.0086, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.4537961795791719, |
|
"grad_norm": 0.422360360622406, |
|
"learning_rate": 0.00016974692136138855, |
|
"loss": 1.0209, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.4557354794919034, |
|
"grad_norm": 0.4392210841178894, |
|
"learning_rate": 0.00016961763470053976, |
|
"loss": 0.9916, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.4576747794046349, |
|
"grad_norm": 0.4346536099910736, |
|
"learning_rate": 0.000169488348039691, |
|
"loss": 1.0598, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.4596140793173664, |
|
"grad_norm": 0.4439583718776703, |
|
"learning_rate": 0.00016935906137884225, |
|
"loss": 1.0661, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.4615533792300979, |
|
"grad_norm": 0.40431517362594604, |
|
"learning_rate": 0.0001692297747179935, |
|
"loss": 0.984, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.46349267914282943, |
|
"grad_norm": 0.4152517020702362, |
|
"learning_rate": 0.0001691004880571447, |
|
"loss": 1.0098, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.46543197905556094, |
|
"grad_norm": 0.3530963957309723, |
|
"learning_rate": 0.00016897120139629595, |
|
"loss": 1.0124, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.46737127896829245, |
|
"grad_norm": 0.28457310795783997, |
|
"learning_rate": 0.00016884191473544717, |
|
"loss": 0.9547, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.46931057888102395, |
|
"grad_norm": 0.402810275554657, |
|
"learning_rate": 0.0001687126280745984, |
|
"loss": 1.0848, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.47124987879375546, |
|
"grad_norm": 0.3450517952442169, |
|
"learning_rate": 0.00016858334141374966, |
|
"loss": 1.0986, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.47318917870648697, |
|
"grad_norm": 0.34125566482543945, |
|
"learning_rate": 0.00016845405475290087, |
|
"loss": 1.0324, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.4751284786192185, |
|
"grad_norm": 0.3199256658554077, |
|
"learning_rate": 0.00016832476809205212, |
|
"loss": 1.0329, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.47706777853195, |
|
"grad_norm": 0.3269367814064026, |
|
"learning_rate": 0.00016819548143120333, |
|
"loss": 0.986, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.4790070784446815, |
|
"grad_norm": 0.41509127616882324, |
|
"learning_rate": 0.00016806619477035458, |
|
"loss": 0.9819, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.480946378357413, |
|
"grad_norm": 0.378519743680954, |
|
"learning_rate": 0.00016793690810950582, |
|
"loss": 1.0048, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.4828856782701445, |
|
"grad_norm": 0.3766985833644867, |
|
"learning_rate": 0.00016780762144865706, |
|
"loss": 0.9755, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.48482497818287595, |
|
"grad_norm": 0.42813560366630554, |
|
"learning_rate": 0.00016767833478780828, |
|
"loss": 0.9251, |
|
"step": 5000 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 30939, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.16313571466199e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|