|
{ |
|
"best_metric": 0.08193562924861908, |
|
"best_model_checkpoint": "realFake-img/checkpoint-2500", |
|
"epoch": 10.0, |
|
"eval_steps": 100, |
|
"global_step": 3960, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.025252525252525252, |
|
"grad_norm": 8.32949447631836, |
|
"learning_rate": 0.0001994949494949495, |
|
"loss": 0.1124, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.050505050505050504, |
|
"grad_norm": 4.660865306854248, |
|
"learning_rate": 0.000198989898989899, |
|
"loss": 0.2631, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07575757575757576, |
|
"grad_norm": 4.1171956062316895, |
|
"learning_rate": 0.0001984848484848485, |
|
"loss": 0.1366, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10101010101010101, |
|
"grad_norm": 4.586099147796631, |
|
"learning_rate": 0.000197979797979798, |
|
"loss": 0.1395, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12626262626262627, |
|
"grad_norm": 3.6707675457000732, |
|
"learning_rate": 0.0001974747474747475, |
|
"loss": 0.178, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15151515151515152, |
|
"grad_norm": 0.39073047041893005, |
|
"learning_rate": 0.00019696969696969698, |
|
"loss": 0.2038, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.17676767676767677, |
|
"grad_norm": 3.4298012256622314, |
|
"learning_rate": 0.0001964646464646465, |
|
"loss": 0.0964, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.20202020202020202, |
|
"grad_norm": 4.532003402709961, |
|
"learning_rate": 0.00019595959595959596, |
|
"loss": 0.171, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.22727272727272727, |
|
"grad_norm": 2.3665497303009033, |
|
"learning_rate": 0.00019545454545454548, |
|
"loss": 0.1166, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.25252525252525254, |
|
"grad_norm": 1.0514458417892456, |
|
"learning_rate": 0.00019494949494949494, |
|
"loss": 0.2578, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25252525252525254, |
|
"eval_accuracy": 0.9418084153983886, |
|
"eval_loss": 0.1593756079673767, |
|
"eval_runtime": 72.9833, |
|
"eval_samples_per_second": 15.305, |
|
"eval_steps_per_second": 1.918, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2777777777777778, |
|
"grad_norm": 2.9928767681121826, |
|
"learning_rate": 0.00019444444444444446, |
|
"loss": 0.1794, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.30303030303030304, |
|
"grad_norm": 0.6943581104278564, |
|
"learning_rate": 0.00019393939393939395, |
|
"loss": 0.1713, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3282828282828283, |
|
"grad_norm": 5.296023845672607, |
|
"learning_rate": 0.00019343434343434344, |
|
"loss": 0.1822, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.35353535353535354, |
|
"grad_norm": 4.849494934082031, |
|
"learning_rate": 0.00019292929292929293, |
|
"loss": 0.1667, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3787878787878788, |
|
"grad_norm": 2.1953601837158203, |
|
"learning_rate": 0.00019242424242424245, |
|
"loss": 0.1353, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.40404040404040403, |
|
"grad_norm": 3.5325512886047363, |
|
"learning_rate": 0.00019191919191919191, |
|
"loss": 0.2191, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4292929292929293, |
|
"grad_norm": 1.513462781906128, |
|
"learning_rate": 0.00019141414141414143, |
|
"loss": 0.0864, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 1.1227214336395264, |
|
"learning_rate": 0.00019090909090909092, |
|
"loss": 0.0972, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4797979797979798, |
|
"grad_norm": 4.3201212882995605, |
|
"learning_rate": 0.0001904040404040404, |
|
"loss": 0.1356, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5050505050505051, |
|
"grad_norm": 0.13399846851825714, |
|
"learning_rate": 0.0001898989898989899, |
|
"loss": 0.0944, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5050505050505051, |
|
"eval_accuracy": 0.937332139659803, |
|
"eval_loss": 0.22425174713134766, |
|
"eval_runtime": 72.9458, |
|
"eval_samples_per_second": 15.313, |
|
"eval_steps_per_second": 1.919, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5303030303030303, |
|
"grad_norm": 0.07937999069690704, |
|
"learning_rate": 0.00018939393939393942, |
|
"loss": 0.0798, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 6.126536846160889, |
|
"learning_rate": 0.00018888888888888888, |
|
"loss": 0.2437, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5808080808080808, |
|
"grad_norm": 8.01685619354248, |
|
"learning_rate": 0.0001883838383838384, |
|
"loss": 0.2746, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"grad_norm": 3.1425938606262207, |
|
"learning_rate": 0.0001878787878787879, |
|
"loss": 0.1937, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6313131313131313, |
|
"grad_norm": 1.1262303590774536, |
|
"learning_rate": 0.00018737373737373738, |
|
"loss": 0.2495, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6565656565656566, |
|
"grad_norm": 3.994985342025757, |
|
"learning_rate": 0.00018686868686868687, |
|
"loss": 0.0914, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6818181818181818, |
|
"grad_norm": 3.6686558723449707, |
|
"learning_rate": 0.00018636363636363636, |
|
"loss": 0.1241, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7070707070707071, |
|
"grad_norm": 2.8421552181243896, |
|
"learning_rate": 0.00018585858585858586, |
|
"loss": 0.162, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7323232323232324, |
|
"grad_norm": 0.06576777994632721, |
|
"learning_rate": 0.00018535353535353537, |
|
"loss": 0.0863, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7575757575757576, |
|
"grad_norm": 3.127112865447998, |
|
"learning_rate": 0.00018484848484848484, |
|
"loss": 0.1747, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7575757575757576, |
|
"eval_accuracy": 0.9292748433303492, |
|
"eval_loss": 0.24716989696025848, |
|
"eval_runtime": 73.2274, |
|
"eval_samples_per_second": 15.254, |
|
"eval_steps_per_second": 1.912, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7828282828282829, |
|
"grad_norm": 1.235567569732666, |
|
"learning_rate": 0.00018434343434343435, |
|
"loss": 0.0742, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8080808080808081, |
|
"grad_norm": 5.305884838104248, |
|
"learning_rate": 0.00018383838383838384, |
|
"loss": 0.1013, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 3.124811887741089, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 0.2439, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8585858585858586, |
|
"grad_norm": 5.361472129821777, |
|
"learning_rate": 0.00018282828282828283, |
|
"loss": 0.0468, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8838383838383839, |
|
"grad_norm": 3.3062198162078857, |
|
"learning_rate": 0.00018232323232323234, |
|
"loss": 0.0855, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 1.9714092016220093, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 0.1645, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9343434343434344, |
|
"grad_norm": 1.7579039335250854, |
|
"learning_rate": 0.00018131313131313132, |
|
"loss": 0.193, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9595959595959596, |
|
"grad_norm": 3.588534355163574, |
|
"learning_rate": 0.00018080808080808082, |
|
"loss": 0.1305, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9848484848484849, |
|
"grad_norm": 6.151834487915039, |
|
"learning_rate": 0.0001803030303030303, |
|
"loss": 0.1004, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.0101010101010102, |
|
"grad_norm": 3.521318197250366, |
|
"learning_rate": 0.0001797979797979798, |
|
"loss": 0.1328, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0101010101010102, |
|
"eval_accuracy": 0.9337511190689346, |
|
"eval_loss": 0.17739379405975342, |
|
"eval_runtime": 72.9497, |
|
"eval_samples_per_second": 15.312, |
|
"eval_steps_per_second": 1.919, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0353535353535352, |
|
"grad_norm": 0.5116239786148071, |
|
"learning_rate": 0.00017929292929292931, |
|
"loss": 0.0932, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.0606060606060606, |
|
"grad_norm": 0.37958571314811707, |
|
"learning_rate": 0.0001787878787878788, |
|
"loss": 0.0538, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.0858585858585859, |
|
"grad_norm": 3.976700782775879, |
|
"learning_rate": 0.0001782828282828283, |
|
"loss": 0.2245, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 2.8285045623779297, |
|
"learning_rate": 0.00017777777777777779, |
|
"loss": 0.1332, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.1363636363636362, |
|
"grad_norm": 3.683419704437256, |
|
"learning_rate": 0.00017727272727272728, |
|
"loss": 0.1162, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.1616161616161615, |
|
"grad_norm": 4.30293607711792, |
|
"learning_rate": 0.0001767676767676768, |
|
"loss": 0.0678, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.1868686868686869, |
|
"grad_norm": 0.15934455394744873, |
|
"learning_rate": 0.00017626262626262626, |
|
"loss": 0.1587, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.2121212121212122, |
|
"grad_norm": 1.5525578260421753, |
|
"learning_rate": 0.00017575757575757578, |
|
"loss": 0.0637, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.2373737373737375, |
|
"grad_norm": 1.534348964691162, |
|
"learning_rate": 0.00017525252525252527, |
|
"loss": 0.1103, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.2626262626262625, |
|
"grad_norm": 1.6843178272247314, |
|
"learning_rate": 0.00017474747474747476, |
|
"loss": 0.1918, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2626262626262625, |
|
"eval_accuracy": 0.9570277529095792, |
|
"eval_loss": 0.12820282578468323, |
|
"eval_runtime": 73.1443, |
|
"eval_samples_per_second": 15.271, |
|
"eval_steps_per_second": 1.914, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2878787878787878, |
|
"grad_norm": 0.6296999454498291, |
|
"learning_rate": 0.00017424242424242425, |
|
"loss": 0.0461, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.3131313131313131, |
|
"grad_norm": 4.980341911315918, |
|
"learning_rate": 0.00017373737373737377, |
|
"loss": 0.1479, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.3383838383838385, |
|
"grad_norm": 0.36140933632850647, |
|
"learning_rate": 0.00017323232323232323, |
|
"loss": 0.0726, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 0.2907123267650604, |
|
"learning_rate": 0.00017272727272727275, |
|
"loss": 0.1109, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.3888888888888888, |
|
"grad_norm": 1.1450049877166748, |
|
"learning_rate": 0.00017222222222222224, |
|
"loss": 0.0888, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.4141414141414141, |
|
"grad_norm": 3.324134588241577, |
|
"learning_rate": 0.00017171717171717173, |
|
"loss": 0.1074, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.4393939393939394, |
|
"grad_norm": 0.9428613185882568, |
|
"learning_rate": 0.00017121212121212122, |
|
"loss": 0.0856, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.4646464646464645, |
|
"grad_norm": 0.1330060064792633, |
|
"learning_rate": 0.0001707070707070707, |
|
"loss": 0.061, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.4898989898989898, |
|
"grad_norm": 4.435102939605713, |
|
"learning_rate": 0.0001702020202020202, |
|
"loss": 0.1137, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.5151515151515151, |
|
"grad_norm": 2.5744283199310303, |
|
"learning_rate": 0.00016969696969696972, |
|
"loss": 0.169, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.5151515151515151, |
|
"eval_accuracy": 0.9346463742166518, |
|
"eval_loss": 0.2247086614370346, |
|
"eval_runtime": 73.2754, |
|
"eval_samples_per_second": 15.244, |
|
"eval_steps_per_second": 1.911, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.5404040404040404, |
|
"grad_norm": 3.7209930419921875, |
|
"learning_rate": 0.00016919191919191918, |
|
"loss": 0.1929, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.5656565656565657, |
|
"grad_norm": 4.9047322273254395, |
|
"learning_rate": 0.0001686868686868687, |
|
"loss": 0.144, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.5909090909090908, |
|
"grad_norm": 8.181381225585938, |
|
"learning_rate": 0.0001681818181818182, |
|
"loss": 0.1008, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.6161616161616161, |
|
"grad_norm": 0.5650784969329834, |
|
"learning_rate": 0.00016767676767676768, |
|
"loss": 0.1385, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.6414141414141414, |
|
"grad_norm": 0.4483976364135742, |
|
"learning_rate": 0.00016717171717171717, |
|
"loss": 0.1112, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 2.8870067596435547, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.0868, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.691919191919192, |
|
"grad_norm": 5.016068458557129, |
|
"learning_rate": 0.00016616161616161615, |
|
"loss": 0.0948, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.7171717171717171, |
|
"grad_norm": 4.62065315246582, |
|
"learning_rate": 0.00016565656565656567, |
|
"loss": 0.2336, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.7424242424242424, |
|
"grad_norm": 0.04882610961794853, |
|
"learning_rate": 0.00016515151515151516, |
|
"loss": 0.1006, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.7676767676767677, |
|
"grad_norm": 1.2523910999298096, |
|
"learning_rate": 0.00016464646464646465, |
|
"loss": 0.2595, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.7676767676767677, |
|
"eval_accuracy": 0.9444941808415398, |
|
"eval_loss": 0.1785079687833786, |
|
"eval_runtime": 73.2828, |
|
"eval_samples_per_second": 15.242, |
|
"eval_steps_per_second": 1.91, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.7929292929292928, |
|
"grad_norm": 0.28372153639793396, |
|
"learning_rate": 0.00016414141414141414, |
|
"loss": 0.0657, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 0.061366915702819824, |
|
"learning_rate": 0.00016363636363636366, |
|
"loss": 0.2048, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.8434343434343434, |
|
"grad_norm": 2.9858274459838867, |
|
"learning_rate": 0.00016313131313131312, |
|
"loss": 0.0489, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.8686868686868687, |
|
"grad_norm": 4.050809383392334, |
|
"learning_rate": 0.00016262626262626264, |
|
"loss": 0.1095, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.893939393939394, |
|
"grad_norm": 3.725325584411621, |
|
"learning_rate": 0.00016212121212121213, |
|
"loss": 0.2613, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.9191919191919191, |
|
"grad_norm": 2.09786319732666, |
|
"learning_rate": 0.00016161616161616162, |
|
"loss": 0.0492, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.9444444444444444, |
|
"grad_norm": 1.9398726224899292, |
|
"learning_rate": 0.0001611111111111111, |
|
"loss": 0.0831, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.9696969696969697, |
|
"grad_norm": 0.6055514812469482, |
|
"learning_rate": 0.0001606060606060606, |
|
"loss": 0.1733, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.9949494949494948, |
|
"grad_norm": 0.22102850675582886, |
|
"learning_rate": 0.00016010101010101012, |
|
"loss": 0.1106, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.0202020202020203, |
|
"grad_norm": 3.681710720062256, |
|
"learning_rate": 0.0001595959595959596, |
|
"loss": 0.0911, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.0202020202020203, |
|
"eval_accuracy": 0.9534467323187108, |
|
"eval_loss": 0.1352938562631607, |
|
"eval_runtime": 73.2218, |
|
"eval_samples_per_second": 15.255, |
|
"eval_steps_per_second": 1.912, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.0454545454545454, |
|
"grad_norm": 0.574734091758728, |
|
"learning_rate": 0.0001590909090909091, |
|
"loss": 0.044, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.0707070707070705, |
|
"grad_norm": 0.253918319940567, |
|
"learning_rate": 0.0001585858585858586, |
|
"loss": 0.0476, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.095959595959596, |
|
"grad_norm": 0.1252337247133255, |
|
"learning_rate": 0.00015808080808080808, |
|
"loss": 0.1279, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.121212121212121, |
|
"grad_norm": 0.26320022344589233, |
|
"learning_rate": 0.00015757575757575757, |
|
"loss": 0.2042, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.1464646464646466, |
|
"grad_norm": 0.7983365058898926, |
|
"learning_rate": 0.0001570707070707071, |
|
"loss": 0.1208, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.1717171717171717, |
|
"grad_norm": 0.36479347944259644, |
|
"learning_rate": 0.00015656565656565658, |
|
"loss": 0.0881, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.196969696969697, |
|
"grad_norm": 0.11645219475030899, |
|
"learning_rate": 0.00015606060606060607, |
|
"loss": 0.0955, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 1.1980379819869995, |
|
"learning_rate": 0.00015555555555555556, |
|
"loss": 0.077, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.2474747474747474, |
|
"grad_norm": 0.06797017902135849, |
|
"learning_rate": 0.00015505050505050508, |
|
"loss": 0.0377, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.2727272727272725, |
|
"grad_norm": 0.48521897196769714, |
|
"learning_rate": 0.00015454545454545454, |
|
"loss": 0.0548, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.2727272727272725, |
|
"eval_accuracy": 0.9471799462846912, |
|
"eval_loss": 0.19982792437076569, |
|
"eval_runtime": 72.9425, |
|
"eval_samples_per_second": 15.313, |
|
"eval_steps_per_second": 1.919, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.297979797979798, |
|
"grad_norm": 0.017012102529406548, |
|
"learning_rate": 0.00015404040404040406, |
|
"loss": 0.1089, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.323232323232323, |
|
"grad_norm": 0.2808210849761963, |
|
"learning_rate": 0.00015353535353535353, |
|
"loss": 0.0789, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.3484848484848486, |
|
"grad_norm": 4.9768781661987305, |
|
"learning_rate": 0.00015303030303030304, |
|
"loss": 0.1004, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.3737373737373737, |
|
"grad_norm": 1.5323927402496338, |
|
"learning_rate": 0.00015252525252525253, |
|
"loss": 0.0357, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.398989898989899, |
|
"grad_norm": 4.321779251098633, |
|
"learning_rate": 0.00015202020202020202, |
|
"loss": 0.0348, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.4242424242424243, |
|
"grad_norm": 6.227025032043457, |
|
"learning_rate": 0.00015151515151515152, |
|
"loss": 0.1679, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.4494949494949494, |
|
"grad_norm": 1.045432209968567, |
|
"learning_rate": 0.00015101010101010103, |
|
"loss": 0.1222, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.474747474747475, |
|
"grad_norm": 3.0685787200927734, |
|
"learning_rate": 0.0001505050505050505, |
|
"loss": 0.1434, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.04191284626722336, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.086, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.525252525252525, |
|
"grad_norm": 3.1016695499420166, |
|
"learning_rate": 0.0001494949494949495, |
|
"loss": 0.1399, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.525252525252525, |
|
"eval_accuracy": 0.9444941808415398, |
|
"eval_loss": 0.19705650210380554, |
|
"eval_runtime": 73.3829, |
|
"eval_samples_per_second": 15.222, |
|
"eval_steps_per_second": 1.908, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.5505050505050506, |
|
"grad_norm": 4.877354145050049, |
|
"learning_rate": 0.000148989898989899, |
|
"loss": 0.1418, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.5757575757575757, |
|
"grad_norm": 4.7359700202941895, |
|
"learning_rate": 0.00014848484848484849, |
|
"loss": 0.1084, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.601010101010101, |
|
"grad_norm": 0.7143091559410095, |
|
"learning_rate": 0.000147979797979798, |
|
"loss": 0.1074, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.6262626262626263, |
|
"grad_norm": 0.4162321388721466, |
|
"learning_rate": 0.00014747474747474747, |
|
"loss": 0.1317, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.6515151515151514, |
|
"grad_norm": 5.558507442474365, |
|
"learning_rate": 0.00014696969696969698, |
|
"loss": 0.0829, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.676767676767677, |
|
"grad_norm": 0.08041220903396606, |
|
"learning_rate": 0.00014646464646464648, |
|
"loss": 0.0905, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.702020202020202, |
|
"grad_norm": 3.554946184158325, |
|
"learning_rate": 0.00014595959595959597, |
|
"loss": 0.14, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 0.9108226895332336, |
|
"learning_rate": 0.00014545454545454546, |
|
"loss": 0.0355, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.7525252525252526, |
|
"grad_norm": 1.091728925704956, |
|
"learning_rate": 0.00014494949494949495, |
|
"loss": 0.059, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"grad_norm": 0.07620527595281601, |
|
"learning_rate": 0.00014444444444444444, |
|
"loss": 0.2001, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"eval_accuracy": 0.937332139659803, |
|
"eval_loss": 0.24790146946907043, |
|
"eval_runtime": 73.1059, |
|
"eval_samples_per_second": 15.279, |
|
"eval_steps_per_second": 1.915, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.8030303030303028, |
|
"grad_norm": 0.10709954053163528, |
|
"learning_rate": 0.00014393939393939396, |
|
"loss": 0.0487, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.8282828282828283, |
|
"grad_norm": 4.047976493835449, |
|
"learning_rate": 0.00014343434343434342, |
|
"loss": 0.0774, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.8535353535353534, |
|
"grad_norm": 2.409966468811035, |
|
"learning_rate": 0.00014292929292929294, |
|
"loss": 0.0744, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.878787878787879, |
|
"grad_norm": 0.3456668257713318, |
|
"learning_rate": 0.00014242424242424243, |
|
"loss": 0.0125, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.904040404040404, |
|
"grad_norm": 0.046853143721818924, |
|
"learning_rate": 0.00014191919191919192, |
|
"loss": 0.0756, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.929292929292929, |
|
"grad_norm": 3.4357807636260986, |
|
"learning_rate": 0.0001414141414141414, |
|
"loss": 0.1375, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.9545454545454546, |
|
"grad_norm": 1.010414719581604, |
|
"learning_rate": 0.00014090909090909093, |
|
"loss": 0.0704, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.9797979797979797, |
|
"grad_norm": 0.008091296069324017, |
|
"learning_rate": 0.00014040404040404042, |
|
"loss": 0.0791, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.005050505050505, |
|
"grad_norm": 1.9511629343032837, |
|
"learning_rate": 0.0001398989898989899, |
|
"loss": 0.0754, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.0303030303030303, |
|
"grad_norm": 10.075323104858398, |
|
"learning_rate": 0.0001393939393939394, |
|
"loss": 0.0976, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.0303030303030303, |
|
"eval_accuracy": 0.9498657117278424, |
|
"eval_loss": 0.16011768579483032, |
|
"eval_runtime": 73.2182, |
|
"eval_samples_per_second": 15.256, |
|
"eval_steps_per_second": 1.912, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.0555555555555554, |
|
"grad_norm": 0.027206294238567352, |
|
"learning_rate": 0.0001388888888888889, |
|
"loss": 0.0906, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 3.080808080808081, |
|
"grad_norm": 1.425262689590454, |
|
"learning_rate": 0.0001383838383838384, |
|
"loss": 0.0349, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.106060606060606, |
|
"grad_norm": 7.3463616371154785, |
|
"learning_rate": 0.0001378787878787879, |
|
"loss": 0.0804, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.1313131313131315, |
|
"grad_norm": 1.0737591981887817, |
|
"learning_rate": 0.0001373737373737374, |
|
"loss": 0.068, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.1565656565656566, |
|
"grad_norm": 7.525305271148682, |
|
"learning_rate": 0.00013686868686868688, |
|
"loss": 0.1145, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.1818181818181817, |
|
"grad_norm": 0.4561030864715576, |
|
"learning_rate": 0.00013636363636363637, |
|
"loss": 0.0977, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.207070707070707, |
|
"grad_norm": 0.11276185512542725, |
|
"learning_rate": 0.00013585858585858586, |
|
"loss": 0.0743, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 3.2323232323232323, |
|
"grad_norm": 1.0171997547149658, |
|
"learning_rate": 0.00013535353535353538, |
|
"loss": 0.0775, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.257575757575758, |
|
"grad_norm": 3.1414084434509277, |
|
"learning_rate": 0.00013484848484848484, |
|
"loss": 0.0309, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 3.282828282828283, |
|
"grad_norm": 0.037932224571704865, |
|
"learning_rate": 0.00013434343434343436, |
|
"loss": 0.1291, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.282828282828283, |
|
"eval_accuracy": 0.9588182632050134, |
|
"eval_loss": 0.160703644156456, |
|
"eval_runtime": 73.0017, |
|
"eval_samples_per_second": 15.301, |
|
"eval_steps_per_second": 1.918, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.308080808080808, |
|
"grad_norm": 2.9155356884002686, |
|
"learning_rate": 0.00013383838383838385, |
|
"loss": 0.0215, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 5.102810382843018, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.0716, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 3.3585858585858586, |
|
"grad_norm": 0.020925424993038177, |
|
"learning_rate": 0.00013282828282828283, |
|
"loss": 0.0372, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 3.3838383838383836, |
|
"grad_norm": 0.10292687267065048, |
|
"learning_rate": 0.00013232323232323235, |
|
"loss": 0.0211, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 3.409090909090909, |
|
"grad_norm": 2.7968993186950684, |
|
"learning_rate": 0.0001318181818181818, |
|
"loss": 0.0708, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 3.4343434343434343, |
|
"grad_norm": 3.1068055629730225, |
|
"learning_rate": 0.00013131313131313133, |
|
"loss": 0.1007, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 3.45959595959596, |
|
"grad_norm": 0.032499730587005615, |
|
"learning_rate": 0.00013080808080808082, |
|
"loss": 0.0713, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 3.484848484848485, |
|
"grad_norm": 0.20779326558113098, |
|
"learning_rate": 0.0001303030303030303, |
|
"loss": 0.048, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.51010101010101, |
|
"grad_norm": 5.266826152801514, |
|
"learning_rate": 0.0001297979797979798, |
|
"loss": 0.193, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 3.5353535353535355, |
|
"grad_norm": 0.42106470465660095, |
|
"learning_rate": 0.00012929292929292932, |
|
"loss": 0.0721, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.5353535353535355, |
|
"eval_accuracy": 0.9588182632050134, |
|
"eval_loss": 0.18219807744026184, |
|
"eval_runtime": 73.033, |
|
"eval_samples_per_second": 15.294, |
|
"eval_steps_per_second": 1.917, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.5606060606060606, |
|
"grad_norm": 1.7371455430984497, |
|
"learning_rate": 0.00012878787878787878, |
|
"loss": 0.0927, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 3.5858585858585856, |
|
"grad_norm": 0.636141836643219, |
|
"learning_rate": 0.0001282828282828283, |
|
"loss": 0.0295, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 3.611111111111111, |
|
"grad_norm": 0.10211779177188873, |
|
"learning_rate": 0.00012777777777777776, |
|
"loss": 0.0287, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 0.803653359413147, |
|
"learning_rate": 0.00012727272727272728, |
|
"loss": 0.0621, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.6616161616161618, |
|
"grad_norm": 0.11753907799720764, |
|
"learning_rate": 0.00012676767676767677, |
|
"loss": 0.0465, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.686868686868687, |
|
"grad_norm": 0.05394851416349411, |
|
"learning_rate": 0.00012626262626262626, |
|
"loss": 0.0474, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 3.712121212121212, |
|
"grad_norm": 3.631462574005127, |
|
"learning_rate": 0.00012575757575757575, |
|
"loss": 0.093, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 3.7373737373737375, |
|
"grad_norm": 0.1336178481578827, |
|
"learning_rate": 0.00012525252525252527, |
|
"loss": 0.0736, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 3.7626262626262625, |
|
"grad_norm": 0.0858420580625534, |
|
"learning_rate": 0.00012474747474747473, |
|
"loss": 0.1211, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 3.787878787878788, |
|
"grad_norm": 1.1731150150299072, |
|
"learning_rate": 0.00012424242424242425, |
|
"loss": 0.0592, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.787878787878788, |
|
"eval_accuracy": 0.9623992837958818, |
|
"eval_loss": 0.12546713650226593, |
|
"eval_runtime": 73.0966, |
|
"eval_samples_per_second": 15.281, |
|
"eval_steps_per_second": 1.915, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.813131313131313, |
|
"grad_norm": 1.533412218093872, |
|
"learning_rate": 0.00012373737373737374, |
|
"loss": 0.0663, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 3.8383838383838382, |
|
"grad_norm": 7.734765529632568, |
|
"learning_rate": 0.00012323232323232323, |
|
"loss": 0.075, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 3.8636363636363638, |
|
"grad_norm": 0.4143606126308441, |
|
"learning_rate": 0.00012272727272727272, |
|
"loss": 0.0158, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 3.888888888888889, |
|
"grad_norm": 4.032654762268066, |
|
"learning_rate": 0.00012222222222222224, |
|
"loss": 0.0898, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 3.9141414141414144, |
|
"grad_norm": 0.2919144928455353, |
|
"learning_rate": 0.00012171717171717172, |
|
"loss": 0.0904, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 3.9393939393939394, |
|
"grad_norm": 6.036355018615723, |
|
"learning_rate": 0.00012121212121212122, |
|
"loss": 0.0725, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 3.9646464646464645, |
|
"grad_norm": 0.34402996301651, |
|
"learning_rate": 0.0001207070707070707, |
|
"loss": 0.0643, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 3.98989898989899, |
|
"grad_norm": 0.307706356048584, |
|
"learning_rate": 0.0001202020202020202, |
|
"loss": 0.1061, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 4.015151515151516, |
|
"grad_norm": 0.04210241511464119, |
|
"learning_rate": 0.00011969696969696971, |
|
"loss": 0.1015, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 4.040404040404041, |
|
"grad_norm": 4.686149597167969, |
|
"learning_rate": 0.00011919191919191919, |
|
"loss": 0.0964, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.040404040404041, |
|
"eval_accuracy": 0.954341987466428, |
|
"eval_loss": 0.16204935312271118, |
|
"eval_runtime": 72.8935, |
|
"eval_samples_per_second": 15.324, |
|
"eval_steps_per_second": 1.921, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.065656565656566, |
|
"grad_norm": 0.9774217009544373, |
|
"learning_rate": 0.00011868686868686869, |
|
"loss": 0.0342, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 4.090909090909091, |
|
"grad_norm": 2.1450870037078857, |
|
"learning_rate": 0.0001181818181818182, |
|
"loss": 0.0852, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 4.116161616161616, |
|
"grad_norm": 4.826761722564697, |
|
"learning_rate": 0.00011767676767676767, |
|
"loss": 0.0612, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 4.141414141414141, |
|
"grad_norm": 0.7088700532913208, |
|
"learning_rate": 0.00011717171717171717, |
|
"loss": 0.0369, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 4.166666666666667, |
|
"grad_norm": 0.07485224306583405, |
|
"learning_rate": 0.00011666666666666668, |
|
"loss": 0.0075, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 4.191919191919192, |
|
"grad_norm": 7.588441371917725, |
|
"learning_rate": 0.00011616161616161616, |
|
"loss": 0.0492, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 4.217171717171717, |
|
"grad_norm": 0.06588041037321091, |
|
"learning_rate": 0.00011565656565656566, |
|
"loss": 0.0619, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 4.242424242424242, |
|
"grad_norm": 0.3317614495754242, |
|
"learning_rate": 0.00011515151515151516, |
|
"loss": 0.0504, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 4.267676767676767, |
|
"grad_norm": 4.261381149291992, |
|
"learning_rate": 0.00011464646464646464, |
|
"loss": 0.0534, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 4.292929292929293, |
|
"grad_norm": 1.7030925750732422, |
|
"learning_rate": 0.00011414141414141415, |
|
"loss": 0.0738, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.292929292929293, |
|
"eval_accuracy": 0.9650850492390332, |
|
"eval_loss": 0.12794509530067444, |
|
"eval_runtime": 73.4006, |
|
"eval_samples_per_second": 15.218, |
|
"eval_steps_per_second": 1.907, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.318181818181818, |
|
"grad_norm": 3.9137349128723145, |
|
"learning_rate": 0.00011363636363636365, |
|
"loss": 0.0269, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 4.343434343434343, |
|
"grad_norm": 0.012919370085000992, |
|
"learning_rate": 0.00011313131313131313, |
|
"loss": 0.0314, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 4.3686868686868685, |
|
"grad_norm": 0.07363598793745041, |
|
"learning_rate": 0.00011262626262626263, |
|
"loss": 0.0233, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 4.393939393939394, |
|
"grad_norm": 0.137301966547966, |
|
"learning_rate": 0.00011212121212121212, |
|
"loss": 0.0863, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 4.41919191919192, |
|
"grad_norm": 6.548308849334717, |
|
"learning_rate": 0.00011161616161616161, |
|
"loss": 0.0463, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 2.40230655670166, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 0.0668, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 4.46969696969697, |
|
"grad_norm": 0.018276751041412354, |
|
"learning_rate": 0.00011060606060606061, |
|
"loss": 0.0193, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 4.494949494949495, |
|
"grad_norm": 4.558255195617676, |
|
"learning_rate": 0.00011010101010101011, |
|
"loss": 0.1149, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 4.52020202020202, |
|
"grad_norm": 0.04581284150481224, |
|
"learning_rate": 0.0001095959595959596, |
|
"loss": 0.0227, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 4.545454545454545, |
|
"grad_norm": 1.2669509649276733, |
|
"learning_rate": 0.00010909090909090909, |
|
"loss": 0.0504, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.545454545454545, |
|
"eval_accuracy": 0.9588182632050134, |
|
"eval_loss": 0.16235476732254028, |
|
"eval_runtime": 73.0538, |
|
"eval_samples_per_second": 15.29, |
|
"eval_steps_per_second": 1.916, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.570707070707071, |
|
"grad_norm": 0.07127434760332108, |
|
"learning_rate": 0.0001085858585858586, |
|
"loss": 0.0492, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 4.595959595959596, |
|
"grad_norm": 1.7907336950302124, |
|
"learning_rate": 0.00010808080808080809, |
|
"loss": 0.0358, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 4.621212121212121, |
|
"grad_norm": 4.024843692779541, |
|
"learning_rate": 0.00010757575757575758, |
|
"loss": 0.0856, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 4.646464646464646, |
|
"grad_norm": 0.020713260397315025, |
|
"learning_rate": 0.00010707070707070708, |
|
"loss": 0.0101, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 4.671717171717171, |
|
"grad_norm": 0.06845160573720932, |
|
"learning_rate": 0.00010656565656565659, |
|
"loss": 0.0153, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 4.696969696969697, |
|
"grad_norm": 1.0333762168884277, |
|
"learning_rate": 0.00010606060606060606, |
|
"loss": 0.1535, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 4.722222222222222, |
|
"grad_norm": 0.019528638571500778, |
|
"learning_rate": 0.00010555555555555557, |
|
"loss": 0.089, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 4.747474747474747, |
|
"grad_norm": 0.12054427713155746, |
|
"learning_rate": 0.00010505050505050507, |
|
"loss": 0.0154, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 4.7727272727272725, |
|
"grad_norm": 0.053187351673841476, |
|
"learning_rate": 0.00010454545454545455, |
|
"loss": 0.1073, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 4.797979797979798, |
|
"grad_norm": 0.03637217357754707, |
|
"learning_rate": 0.00010404040404040405, |
|
"loss": 0.0972, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.797979797979798, |
|
"eval_accuracy": 0.9623992837958818, |
|
"eval_loss": 0.15791860222816467, |
|
"eval_runtime": 73.2114, |
|
"eval_samples_per_second": 15.257, |
|
"eval_steps_per_second": 1.912, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.8232323232323235, |
|
"grad_norm": 6.812131404876709, |
|
"learning_rate": 0.00010353535353535353, |
|
"loss": 0.1274, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 4.848484848484849, |
|
"grad_norm": 2.3793511390686035, |
|
"learning_rate": 0.00010303030303030303, |
|
"loss": 0.1051, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 4.873737373737374, |
|
"grad_norm": 1.2393810749053955, |
|
"learning_rate": 0.00010252525252525254, |
|
"loss": 0.0167, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 4.898989898989899, |
|
"grad_norm": 1.5232930183410645, |
|
"learning_rate": 0.00010202020202020202, |
|
"loss": 0.0065, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 4.924242424242424, |
|
"grad_norm": 0.00905653741210699, |
|
"learning_rate": 0.00010151515151515152, |
|
"loss": 0.0419, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 4.94949494949495, |
|
"grad_norm": 0.8604415655136108, |
|
"learning_rate": 0.00010101010101010102, |
|
"loss": 0.0769, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 4.974747474747475, |
|
"grad_norm": 4.089222431182861, |
|
"learning_rate": 0.0001005050505050505, |
|
"loss": 0.0366, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 2.2072501182556152, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0746, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 5.025252525252525, |
|
"grad_norm": 0.010899940505623817, |
|
"learning_rate": 9.94949494949495e-05, |
|
"loss": 0.0597, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 5.05050505050505, |
|
"grad_norm": 1.6260383129119873, |
|
"learning_rate": 9.8989898989899e-05, |
|
"loss": 0.0456, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.05050505050505, |
|
"eval_accuracy": 0.9489704565801254, |
|
"eval_loss": 0.19649948179721832, |
|
"eval_runtime": 73.1131, |
|
"eval_samples_per_second": 15.278, |
|
"eval_steps_per_second": 1.915, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.075757575757576, |
|
"grad_norm": 0.009620290249586105, |
|
"learning_rate": 9.848484848484849e-05, |
|
"loss": 0.018, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 5.101010101010101, |
|
"grad_norm": 4.627386093139648, |
|
"learning_rate": 9.797979797979798e-05, |
|
"loss": 0.0906, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 5.126262626262626, |
|
"grad_norm": 0.5775233507156372, |
|
"learning_rate": 9.747474747474747e-05, |
|
"loss": 0.0179, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 5.151515151515151, |
|
"grad_norm": 0.3100966513156891, |
|
"learning_rate": 9.696969696969698e-05, |
|
"loss": 0.0225, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 5.1767676767676765, |
|
"grad_norm": 0.012251541949808598, |
|
"learning_rate": 9.646464646464647e-05, |
|
"loss": 0.0062, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 5.202020202020202, |
|
"grad_norm": 3.9397971630096436, |
|
"learning_rate": 9.595959595959596e-05, |
|
"loss": 0.0497, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 5.2272727272727275, |
|
"grad_norm": 0.002988005056977272, |
|
"learning_rate": 9.545454545454546e-05, |
|
"loss": 0.0242, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 5.252525252525253, |
|
"grad_norm": 0.15744374692440033, |
|
"learning_rate": 9.494949494949495e-05, |
|
"loss": 0.0165, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 5.277777777777778, |
|
"grad_norm": 2.624490976333618, |
|
"learning_rate": 9.444444444444444e-05, |
|
"loss": 0.0595, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 5.303030303030303, |
|
"grad_norm": 1.7126376628875732, |
|
"learning_rate": 9.393939393939395e-05, |
|
"loss": 0.0334, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.303030303030303, |
|
"eval_accuracy": 0.9570277529095792, |
|
"eval_loss": 0.165226012468338, |
|
"eval_runtime": 73.2601, |
|
"eval_samples_per_second": 15.247, |
|
"eval_steps_per_second": 1.911, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.328282828282829, |
|
"grad_norm": 0.003406533505767584, |
|
"learning_rate": 9.343434343434344e-05, |
|
"loss": 0.0201, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 5.353535353535354, |
|
"grad_norm": 0.18647323548793793, |
|
"learning_rate": 9.292929292929293e-05, |
|
"loss": 0.0471, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 5.378787878787879, |
|
"grad_norm": 4.275173664093018, |
|
"learning_rate": 9.242424242424242e-05, |
|
"loss": 0.0565, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 5.404040404040404, |
|
"grad_norm": 3.319251537322998, |
|
"learning_rate": 9.191919191919192e-05, |
|
"loss": 0.0687, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 5.429292929292929, |
|
"grad_norm": 0.067157082259655, |
|
"learning_rate": 9.141414141414141e-05, |
|
"loss": 0.0507, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 5.454545454545454, |
|
"grad_norm": 0.18047641217708588, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 0.0555, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 5.47979797979798, |
|
"grad_norm": 0.0075127603486180305, |
|
"learning_rate": 9.040404040404041e-05, |
|
"loss": 0.0488, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 5.505050505050505, |
|
"grad_norm": 0.01690557599067688, |
|
"learning_rate": 8.98989898989899e-05, |
|
"loss": 0.0626, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 5.53030303030303, |
|
"grad_norm": 0.005741783883422613, |
|
"learning_rate": 8.93939393939394e-05, |
|
"loss": 0.0014, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 5.555555555555555, |
|
"grad_norm": 0.05627870187163353, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 0.0242, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 5.555555555555555, |
|
"eval_accuracy": 0.9749328558639212, |
|
"eval_loss": 0.11822798103094101, |
|
"eval_runtime": 73.1232, |
|
"eval_samples_per_second": 15.276, |
|
"eval_steps_per_second": 1.915, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 5.58080808080808, |
|
"grad_norm": 0.012817220762372017, |
|
"learning_rate": 8.83838383838384e-05, |
|
"loss": 0.0277, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 5.606060606060606, |
|
"grad_norm": 0.00884329341351986, |
|
"learning_rate": 8.787878787878789e-05, |
|
"loss": 0.0067, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 5.6313131313131315, |
|
"grad_norm": 0.034603264182806015, |
|
"learning_rate": 8.737373737373738e-05, |
|
"loss": 0.0702, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 5.656565656565657, |
|
"grad_norm": 0.0622437559068203, |
|
"learning_rate": 8.686868686868688e-05, |
|
"loss": 0.0171, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 5.681818181818182, |
|
"grad_norm": 0.04042644053697586, |
|
"learning_rate": 8.636363636363637e-05, |
|
"loss": 0.0592, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 5.707070707070707, |
|
"grad_norm": 0.04215148836374283, |
|
"learning_rate": 8.585858585858586e-05, |
|
"loss": 0.0761, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 5.732323232323233, |
|
"grad_norm": 0.22815492749214172, |
|
"learning_rate": 8.535353535353535e-05, |
|
"loss": 0.0133, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 5.757575757575758, |
|
"grad_norm": 0.3139846622943878, |
|
"learning_rate": 8.484848484848486e-05, |
|
"loss": 0.0013, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 5.782828282828283, |
|
"grad_norm": 0.008748591877520084, |
|
"learning_rate": 8.434343434343435e-05, |
|
"loss": 0.036, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 5.808080808080808, |
|
"grad_norm": 0.10703355818986893, |
|
"learning_rate": 8.383838383838384e-05, |
|
"loss": 0.0715, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.808080808080808, |
|
"eval_accuracy": 0.9650850492390332, |
|
"eval_loss": 0.12497912347316742, |
|
"eval_runtime": 72.9451, |
|
"eval_samples_per_second": 15.313, |
|
"eval_steps_per_second": 1.919, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.833333333333333, |
|
"grad_norm": 0.02993335947394371, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 0.017, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 5.858585858585858, |
|
"grad_norm": 0.004180525429546833, |
|
"learning_rate": 8.282828282828283e-05, |
|
"loss": 0.0388, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 5.883838383838384, |
|
"grad_norm": 0.0341310054063797, |
|
"learning_rate": 8.232323232323233e-05, |
|
"loss": 0.0193, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 5.909090909090909, |
|
"grad_norm": 0.02368093468248844, |
|
"learning_rate": 8.181818181818183e-05, |
|
"loss": 0.0314, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 5.934343434343434, |
|
"grad_norm": 0.01623358018696308, |
|
"learning_rate": 8.131313131313132e-05, |
|
"loss": 0.0578, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 5.959595959595959, |
|
"grad_norm": 0.006059895269572735, |
|
"learning_rate": 8.080808080808081e-05, |
|
"loss": 0.0066, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 5.984848484848484, |
|
"grad_norm": 0.024945911020040512, |
|
"learning_rate": 8.03030303030303e-05, |
|
"loss": 0.0032, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 6.01010101010101, |
|
"grad_norm": 0.010317071340978146, |
|
"learning_rate": 7.97979797979798e-05, |
|
"loss": 0.0047, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 6.0353535353535355, |
|
"grad_norm": 0.4775066673755646, |
|
"learning_rate": 7.92929292929293e-05, |
|
"loss": 0.0193, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 6.0606060606060606, |
|
"grad_norm": 6.233785629272461, |
|
"learning_rate": 7.878787878787879e-05, |
|
"loss": 0.0407, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 6.0606060606060606, |
|
"eval_accuracy": 0.9695613249776186, |
|
"eval_loss": 0.11715386807918549, |
|
"eval_runtime": 73.3488, |
|
"eval_samples_per_second": 15.229, |
|
"eval_steps_per_second": 1.909, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 6.085858585858586, |
|
"grad_norm": 0.04230092465877533, |
|
"learning_rate": 7.828282828282829e-05, |
|
"loss": 0.0028, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 6.111111111111111, |
|
"grad_norm": 0.0015748771838843822, |
|
"learning_rate": 7.777777777777778e-05, |
|
"loss": 0.0421, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 6.136363636363637, |
|
"grad_norm": 0.00564368162304163, |
|
"learning_rate": 7.727272727272727e-05, |
|
"loss": 0.0631, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 6.161616161616162, |
|
"grad_norm": 0.4366774559020996, |
|
"learning_rate": 7.676767676767676e-05, |
|
"loss": 0.0429, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 6.186868686868687, |
|
"grad_norm": 0.6611001491546631, |
|
"learning_rate": 7.626262626262627e-05, |
|
"loss": 0.0901, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 6.212121212121212, |
|
"grad_norm": 5.706575870513916, |
|
"learning_rate": 7.575757575757576e-05, |
|
"loss": 0.0857, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 6.237373737373737, |
|
"grad_norm": 0.007969530299305916, |
|
"learning_rate": 7.525252525252525e-05, |
|
"loss": 0.0227, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 6.262626262626263, |
|
"grad_norm": 0.28915736079216003, |
|
"learning_rate": 7.474747474747475e-05, |
|
"loss": 0.0113, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 6.287878787878788, |
|
"grad_norm": 0.2088274508714676, |
|
"learning_rate": 7.424242424242424e-05, |
|
"loss": 0.0026, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 6.313131313131313, |
|
"grad_norm": 0.004980772268027067, |
|
"learning_rate": 7.373737373737373e-05, |
|
"loss": 0.0003, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.313131313131313, |
|
"eval_accuracy": 0.9785138764547896, |
|
"eval_loss": 0.08193562924861908, |
|
"eval_runtime": 73.1145, |
|
"eval_samples_per_second": 15.277, |
|
"eval_steps_per_second": 1.915, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.338383838383838, |
|
"grad_norm": 0.001987410243600607, |
|
"learning_rate": 7.323232323232324e-05, |
|
"loss": 0.0383, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 6.363636363636363, |
|
"grad_norm": 1.1499226093292236, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 0.0171, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 6.388888888888889, |
|
"grad_norm": 0.03895330801606178, |
|
"learning_rate": 7.222222222222222e-05, |
|
"loss": 0.0127, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 6.414141414141414, |
|
"grad_norm": 0.3166453540325165, |
|
"learning_rate": 7.171717171717171e-05, |
|
"loss": 0.0278, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 6.4393939393939394, |
|
"grad_norm": 0.005140668712556362, |
|
"learning_rate": 7.121212121212121e-05, |
|
"loss": 0.0795, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 6.4646464646464645, |
|
"grad_norm": 14.462100982666016, |
|
"learning_rate": 7.07070707070707e-05, |
|
"loss": 0.085, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 6.48989898989899, |
|
"grad_norm": 0.24089215695858002, |
|
"learning_rate": 7.020202020202021e-05, |
|
"loss": 0.0026, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 6.515151515151516, |
|
"grad_norm": 0.22834239900112152, |
|
"learning_rate": 6.96969696969697e-05, |
|
"loss": 0.005, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 6.540404040404041, |
|
"grad_norm": 8.35010814666748, |
|
"learning_rate": 6.91919191919192e-05, |
|
"loss": 0.0728, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 6.565656565656566, |
|
"grad_norm": 4.920100212097168, |
|
"learning_rate": 6.86868686868687e-05, |
|
"loss": 0.0072, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 6.565656565656566, |
|
"eval_accuracy": 0.9713518352730528, |
|
"eval_loss": 0.14060670137405396, |
|
"eval_runtime": 73.0266, |
|
"eval_samples_per_second": 15.296, |
|
"eval_steps_per_second": 1.917, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 6.590909090909091, |
|
"grad_norm": 0.23918700218200684, |
|
"learning_rate": 6.818181818181818e-05, |
|
"loss": 0.0821, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 6.616161616161616, |
|
"grad_norm": 0.06384919583797455, |
|
"learning_rate": 6.767676767676769e-05, |
|
"loss": 0.0761, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 6.641414141414142, |
|
"grad_norm": 0.4447100758552551, |
|
"learning_rate": 6.717171717171718e-05, |
|
"loss": 0.0139, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 0.0030958615243434906, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.0341, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 6.691919191919192, |
|
"grad_norm": 0.05117692053318024, |
|
"learning_rate": 6.616161616161617e-05, |
|
"loss": 0.0152, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 6.717171717171717, |
|
"grad_norm": 0.003273693146184087, |
|
"learning_rate": 6.565656565656566e-05, |
|
"loss": 0.0314, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 6.742424242424242, |
|
"grad_norm": 0.005075991619378328, |
|
"learning_rate": 6.515151515151516e-05, |
|
"loss": 0.0164, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 6.767676767676767, |
|
"grad_norm": 0.23585616052150726, |
|
"learning_rate": 6.464646464646466e-05, |
|
"loss": 0.0139, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 6.792929292929293, |
|
"grad_norm": 6.123977184295654, |
|
"learning_rate": 6.414141414141415e-05, |
|
"loss": 0.0113, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 6.818181818181818, |
|
"grad_norm": 2.395871162414551, |
|
"learning_rate": 6.363636363636364e-05, |
|
"loss": 0.0183, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 6.818181818181818, |
|
"eval_accuracy": 0.9749328558639212, |
|
"eval_loss": 0.11515188962221146, |
|
"eval_runtime": 73.0277, |
|
"eval_samples_per_second": 15.296, |
|
"eval_steps_per_second": 1.917, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 6.843434343434343, |
|
"grad_norm": 0.005218807607889175, |
|
"learning_rate": 6.313131313131313e-05, |
|
"loss": 0.003, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 6.8686868686868685, |
|
"grad_norm": 0.0012497535208240151, |
|
"learning_rate": 6.262626262626264e-05, |
|
"loss": 0.0116, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 6.893939393939394, |
|
"grad_norm": 0.0025018516462296247, |
|
"learning_rate": 6.212121212121213e-05, |
|
"loss": 0.005, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 6.91919191919192, |
|
"grad_norm": 0.005596707109361887, |
|
"learning_rate": 6.161616161616162e-05, |
|
"loss": 0.037, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 6.944444444444445, |
|
"grad_norm": 0.0010910239070653915, |
|
"learning_rate": 6.111111111111112e-05, |
|
"loss": 0.0338, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 6.96969696969697, |
|
"grad_norm": 0.6075408458709717, |
|
"learning_rate": 6.060606060606061e-05, |
|
"loss": 0.0268, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 6.994949494949495, |
|
"grad_norm": 0.25022584199905396, |
|
"learning_rate": 6.01010101010101e-05, |
|
"loss": 0.0125, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 7.02020202020202, |
|
"grad_norm": 0.12169167399406433, |
|
"learning_rate": 5.959595959595959e-05, |
|
"loss": 0.0082, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 7.045454545454546, |
|
"grad_norm": 3.5715599060058594, |
|
"learning_rate": 5.90909090909091e-05, |
|
"loss": 0.0144, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 7.070707070707071, |
|
"grad_norm": 0.09293267875909805, |
|
"learning_rate": 5.858585858585859e-05, |
|
"loss": 0.0021, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 7.070707070707071, |
|
"eval_accuracy": 0.973142345568487, |
|
"eval_loss": 0.13676650822162628, |
|
"eval_runtime": 72.9405, |
|
"eval_samples_per_second": 15.314, |
|
"eval_steps_per_second": 1.919, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 7.095959595959596, |
|
"grad_norm": 0.009541651234030724, |
|
"learning_rate": 5.808080808080808e-05, |
|
"loss": 0.0058, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 7.121212121212121, |
|
"grad_norm": 0.0016315419925376773, |
|
"learning_rate": 5.757575757575758e-05, |
|
"loss": 0.0064, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 7.146464646464646, |
|
"grad_norm": 10.356843948364258, |
|
"learning_rate": 5.707070707070707e-05, |
|
"loss": 0.0595, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 7.171717171717171, |
|
"grad_norm": 0.0018419253174215555, |
|
"learning_rate": 5.6565656565656563e-05, |
|
"loss": 0.016, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 7.196969696969697, |
|
"grad_norm": 0.010135513730347157, |
|
"learning_rate": 5.606060606060606e-05, |
|
"loss": 0.052, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 7.222222222222222, |
|
"grad_norm": 6.740849494934082, |
|
"learning_rate": 5.555555555555556e-05, |
|
"loss": 0.0374, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 7.247474747474747, |
|
"grad_norm": 0.4412079155445099, |
|
"learning_rate": 5.5050505050505056e-05, |
|
"loss": 0.0117, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 7.2727272727272725, |
|
"grad_norm": 0.001609967672266066, |
|
"learning_rate": 5.4545454545454546e-05, |
|
"loss": 0.0824, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 7.297979797979798, |
|
"grad_norm": 0.005415134131908417, |
|
"learning_rate": 5.4040404040404044e-05, |
|
"loss": 0.0177, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 7.3232323232323235, |
|
"grad_norm": 0.02915014885365963, |
|
"learning_rate": 5.353535353535354e-05, |
|
"loss": 0.046, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 7.3232323232323235, |
|
"eval_accuracy": 0.9794091316025068, |
|
"eval_loss": 0.09002197533845901, |
|
"eval_runtime": 73.1136, |
|
"eval_samples_per_second": 15.278, |
|
"eval_steps_per_second": 1.915, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 7.348484848484849, |
|
"grad_norm": 0.020192056894302368, |
|
"learning_rate": 5.303030303030303e-05, |
|
"loss": 0.0004, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 7.373737373737374, |
|
"grad_norm": 0.7057023644447327, |
|
"learning_rate": 5.2525252525252536e-05, |
|
"loss": 0.0699, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 7.398989898989899, |
|
"grad_norm": 0.0018105951603502035, |
|
"learning_rate": 5.2020202020202026e-05, |
|
"loss": 0.0379, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 7.424242424242424, |
|
"grad_norm": 0.002236352302134037, |
|
"learning_rate": 5.151515151515152e-05, |
|
"loss": 0.0576, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 7.44949494949495, |
|
"grad_norm": 0.46005484461784363, |
|
"learning_rate": 5.101010101010101e-05, |
|
"loss": 0.0007, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 7.474747474747475, |
|
"grad_norm": 0.17090271413326263, |
|
"learning_rate": 5.050505050505051e-05, |
|
"loss": 0.0066, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.002259742235764861, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0043, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 7.525252525252525, |
|
"grad_norm": 0.0029255333356559277, |
|
"learning_rate": 4.94949494949495e-05, |
|
"loss": 0.0239, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 7.55050505050505, |
|
"grad_norm": 2.9925894737243652, |
|
"learning_rate": 4.898989898989899e-05, |
|
"loss": 0.0063, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 7.575757575757576, |
|
"grad_norm": 0.052914004772901535, |
|
"learning_rate": 4.848484848484849e-05, |
|
"loss": 0.033, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 7.575757575757576, |
|
"eval_accuracy": 0.9785138764547896, |
|
"eval_loss": 0.10143210738897324, |
|
"eval_runtime": 73.4907, |
|
"eval_samples_per_second": 15.199, |
|
"eval_steps_per_second": 1.905, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 7.601010101010101, |
|
"grad_norm": 0.04058058559894562, |
|
"learning_rate": 4.797979797979798e-05, |
|
"loss": 0.0245, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 7.626262626262626, |
|
"grad_norm": 0.03967829421162605, |
|
"learning_rate": 4.7474747474747476e-05, |
|
"loss": 0.0006, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 7.651515151515151, |
|
"grad_norm": 0.621035635471344, |
|
"learning_rate": 4.696969696969697e-05, |
|
"loss": 0.0175, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 7.6767676767676765, |
|
"grad_norm": 0.36977216601371765, |
|
"learning_rate": 4.6464646464646464e-05, |
|
"loss": 0.0388, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 7.702020202020202, |
|
"grad_norm": 3.2532241344451904, |
|
"learning_rate": 4.595959595959596e-05, |
|
"loss": 0.0905, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 7.7272727272727275, |
|
"grad_norm": 0.004156060051172972, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 0.0002, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 7.752525252525253, |
|
"grad_norm": 0.6550003290176392, |
|
"learning_rate": 4.494949494949495e-05, |
|
"loss": 0.0066, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 7.777777777777778, |
|
"grad_norm": 0.0028251020703464746, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.0083, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 7.803030303030303, |
|
"grad_norm": 0.008767428807914257, |
|
"learning_rate": 4.3939393939393944e-05, |
|
"loss": 0.0006, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 7.828282828282829, |
|
"grad_norm": 0.04811250418424606, |
|
"learning_rate": 4.343434343434344e-05, |
|
"loss": 0.0354, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 7.828282828282829, |
|
"eval_accuracy": 0.9767233661593554, |
|
"eval_loss": 0.09683331102132797, |
|
"eval_runtime": 73.2348, |
|
"eval_samples_per_second": 15.252, |
|
"eval_steps_per_second": 1.912, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 7.853535353535354, |
|
"grad_norm": 0.00525275431573391, |
|
"learning_rate": 4.292929292929293e-05, |
|
"loss": 0.0088, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 7.878787878787879, |
|
"grad_norm": 0.015972474589943886, |
|
"learning_rate": 4.242424242424243e-05, |
|
"loss": 0.0011, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 7.904040404040404, |
|
"grad_norm": 0.006997071672230959, |
|
"learning_rate": 4.191919191919192e-05, |
|
"loss": 0.0017, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 7.929292929292929, |
|
"grad_norm": 0.023101719096302986, |
|
"learning_rate": 4.141414141414142e-05, |
|
"loss": 0.0567, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 7.954545454545455, |
|
"grad_norm": 0.003169642062857747, |
|
"learning_rate": 4.0909090909090915e-05, |
|
"loss": 0.1026, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 7.97979797979798, |
|
"grad_norm": 0.003613903187215328, |
|
"learning_rate": 4.0404040404040405e-05, |
|
"loss": 0.005, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 8.005050505050505, |
|
"grad_norm": 1.0490131378173828, |
|
"learning_rate": 3.98989898989899e-05, |
|
"loss": 0.0023, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 8.030303030303031, |
|
"grad_norm": 0.003916851244866848, |
|
"learning_rate": 3.939393939393939e-05, |
|
"loss": 0.0023, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 8.055555555555555, |
|
"grad_norm": 0.016336582601070404, |
|
"learning_rate": 3.888888888888889e-05, |
|
"loss": 0.0079, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 8.080808080808081, |
|
"grad_norm": 0.8970369696617126, |
|
"learning_rate": 3.838383838383838e-05, |
|
"loss": 0.0026, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 8.080808080808081, |
|
"eval_accuracy": 0.973142345568487, |
|
"eval_loss": 0.1217464730143547, |
|
"eval_runtime": 73.5035, |
|
"eval_samples_per_second": 15.197, |
|
"eval_steps_per_second": 1.905, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 8.106060606060606, |
|
"grad_norm": 0.03298179805278778, |
|
"learning_rate": 3.787878787878788e-05, |
|
"loss": 0.0051, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 8.131313131313131, |
|
"grad_norm": 0.5918856263160706, |
|
"learning_rate": 3.7373737373737376e-05, |
|
"loss": 0.032, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 8.156565656565657, |
|
"grad_norm": 0.0031904878560453653, |
|
"learning_rate": 3.686868686868687e-05, |
|
"loss": 0.029, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 8.181818181818182, |
|
"grad_norm": 0.043024152517318726, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.0003, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 8.207070707070708, |
|
"grad_norm": 0.011919928714632988, |
|
"learning_rate": 3.5858585858585855e-05, |
|
"loss": 0.0028, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 8.232323232323232, |
|
"grad_norm": 0.007164669223129749, |
|
"learning_rate": 3.535353535353535e-05, |
|
"loss": 0.0146, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 8.257575757575758, |
|
"grad_norm": 0.03415270894765854, |
|
"learning_rate": 3.484848484848485e-05, |
|
"loss": 0.0041, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 8.282828282828282, |
|
"grad_norm": 0.03534342721104622, |
|
"learning_rate": 3.434343434343435e-05, |
|
"loss": 0.0035, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 8.308080808080808, |
|
"grad_norm": 0.3735661804676056, |
|
"learning_rate": 3.3838383838383844e-05, |
|
"loss": 0.0745, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 8.333333333333334, |
|
"grad_norm": 0.0013512909645214677, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.0002, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 8.333333333333334, |
|
"eval_accuracy": 0.9794091316025068, |
|
"eval_loss": 0.08283615112304688, |
|
"eval_runtime": 73.1651, |
|
"eval_samples_per_second": 15.267, |
|
"eval_steps_per_second": 1.913, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 8.358585858585858, |
|
"grad_norm": 0.023621654137969017, |
|
"learning_rate": 3.282828282828283e-05, |
|
"loss": 0.0174, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 8.383838383838384, |
|
"grad_norm": 0.006960035767406225, |
|
"learning_rate": 3.232323232323233e-05, |
|
"loss": 0.0004, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 8.409090909090908, |
|
"grad_norm": 0.0008190835942514241, |
|
"learning_rate": 3.181818181818182e-05, |
|
"loss": 0.0374, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 8.434343434343434, |
|
"grad_norm": 0.016193361952900887, |
|
"learning_rate": 3.131313131313132e-05, |
|
"loss": 0.0007, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 8.45959595959596, |
|
"grad_norm": 0.2075665146112442, |
|
"learning_rate": 3.080808080808081e-05, |
|
"loss": 0.0422, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 8.484848484848484, |
|
"grad_norm": 0.009178784675896168, |
|
"learning_rate": 3.0303030303030306e-05, |
|
"loss": 0.0332, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 8.51010101010101, |
|
"grad_norm": 8.036938667297363, |
|
"learning_rate": 2.9797979797979796e-05, |
|
"loss": 0.0436, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 8.535353535353535, |
|
"grad_norm": 0.0013093262678012252, |
|
"learning_rate": 2.9292929292929294e-05, |
|
"loss": 0.0109, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 8.56060606060606, |
|
"grad_norm": 0.0033100605942308903, |
|
"learning_rate": 2.878787878787879e-05, |
|
"loss": 0.0011, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 8.585858585858587, |
|
"grad_norm": 0.0015343882841989398, |
|
"learning_rate": 2.8282828282828282e-05, |
|
"loss": 0.0006, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 8.585858585858587, |
|
"eval_accuracy": 0.9794091316025068, |
|
"eval_loss": 0.09259337186813354, |
|
"eval_runtime": 72.8639, |
|
"eval_samples_per_second": 15.33, |
|
"eval_steps_per_second": 1.921, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 8.61111111111111, |
|
"grad_norm": 0.030406756326556206, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 0.0026, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 8.636363636363637, |
|
"grad_norm": 0.0022419544402509928, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 0.0007, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 8.66161616161616, |
|
"grad_norm": 0.0011131414212286472, |
|
"learning_rate": 2.676767676767677e-05, |
|
"loss": 0.0006, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 8.686868686868687, |
|
"grad_norm": 0.005616435315459967, |
|
"learning_rate": 2.6262626262626268e-05, |
|
"loss": 0.0003, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 8.712121212121213, |
|
"grad_norm": 0.1008942499756813, |
|
"learning_rate": 2.575757575757576e-05, |
|
"loss": 0.0097, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 8.737373737373737, |
|
"grad_norm": 0.002821123693138361, |
|
"learning_rate": 2.5252525252525256e-05, |
|
"loss": 0.0669, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 8.762626262626263, |
|
"grad_norm": 0.013286658562719822, |
|
"learning_rate": 2.474747474747475e-05, |
|
"loss": 0.0265, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 8.787878787878787, |
|
"grad_norm": 0.003963208291679621, |
|
"learning_rate": 2.4242424242424244e-05, |
|
"loss": 0.0178, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 8.813131313131313, |
|
"grad_norm": 0.002018690574914217, |
|
"learning_rate": 2.3737373737373738e-05, |
|
"loss": 0.0082, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 8.83838383838384, |
|
"grad_norm": 0.1014542207121849, |
|
"learning_rate": 2.3232323232323232e-05, |
|
"loss": 0.0006, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 8.83838383838384, |
|
"eval_accuracy": 0.9794091316025068, |
|
"eval_loss": 0.10012003779411316, |
|
"eval_runtime": 73.1859, |
|
"eval_samples_per_second": 15.263, |
|
"eval_steps_per_second": 1.913, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 8.863636363636363, |
|
"grad_norm": 0.002746024401858449, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 0.0063, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 8.88888888888889, |
|
"grad_norm": 0.0018340348033234477, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 0.0024, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 8.914141414141413, |
|
"grad_norm": 0.004108617547899485, |
|
"learning_rate": 2.171717171717172e-05, |
|
"loss": 0.0083, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 8.93939393939394, |
|
"grad_norm": 0.00315410690382123, |
|
"learning_rate": 2.1212121212121215e-05, |
|
"loss": 0.0462, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 8.964646464646465, |
|
"grad_norm": 0.024781817570328712, |
|
"learning_rate": 2.070707070707071e-05, |
|
"loss": 0.0029, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 8.98989898989899, |
|
"grad_norm": 0.005382045172154903, |
|
"learning_rate": 2.0202020202020203e-05, |
|
"loss": 0.0047, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 9.015151515151516, |
|
"grad_norm": 1.6344341039657593, |
|
"learning_rate": 1.9696969696969697e-05, |
|
"loss": 0.0038, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 9.04040404040404, |
|
"grad_norm": 0.010318132117390633, |
|
"learning_rate": 1.919191919191919e-05, |
|
"loss": 0.0096, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 9.065656565656566, |
|
"grad_norm": 0.0016402292530983686, |
|
"learning_rate": 1.8686868686868688e-05, |
|
"loss": 0.0321, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 9.090909090909092, |
|
"grad_norm": 0.004027374088764191, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 0.0006, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 9.090909090909092, |
|
"eval_accuracy": 0.9847806624888094, |
|
"eval_loss": 0.08629997074604034, |
|
"eval_runtime": 73.127, |
|
"eval_samples_per_second": 15.275, |
|
"eval_steps_per_second": 1.914, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 9.116161616161616, |
|
"grad_norm": 0.0007902685320004821, |
|
"learning_rate": 1.7676767676767676e-05, |
|
"loss": 0.0059, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 9.141414141414142, |
|
"grad_norm": 0.0024135063868016005, |
|
"learning_rate": 1.7171717171717173e-05, |
|
"loss": 0.0269, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 9.166666666666666, |
|
"grad_norm": 0.026507705450057983, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.0003, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 9.191919191919192, |
|
"grad_norm": 0.10678762197494507, |
|
"learning_rate": 1.6161616161616165e-05, |
|
"loss": 0.0059, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 9.217171717171718, |
|
"grad_norm": 0.08362487703561783, |
|
"learning_rate": 1.565656565656566e-05, |
|
"loss": 0.0545, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 9.242424242424242, |
|
"grad_norm": 0.002414940157905221, |
|
"learning_rate": 1.5151515151515153e-05, |
|
"loss": 0.0221, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 9.267676767676768, |
|
"grad_norm": 0.0013868235982954502, |
|
"learning_rate": 1.4646464646464647e-05, |
|
"loss": 0.0005, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 9.292929292929292, |
|
"grad_norm": 0.0013921884819865227, |
|
"learning_rate": 1.4141414141414141e-05, |
|
"loss": 0.041, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 9.318181818181818, |
|
"grad_norm": 0.08867702633142471, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 0.026, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 9.343434343434343, |
|
"grad_norm": 0.0012104762718081474, |
|
"learning_rate": 1.3131313131313134e-05, |
|
"loss": 0.0633, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 9.343434343434343, |
|
"eval_accuracy": 0.9803043867502238, |
|
"eval_loss": 0.09109070897102356, |
|
"eval_runtime": 71.4974, |
|
"eval_samples_per_second": 15.623, |
|
"eval_steps_per_second": 1.958, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 9.368686868686869, |
|
"grad_norm": 0.007544935215264559, |
|
"learning_rate": 1.2626262626262628e-05, |
|
"loss": 0.002, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 9.393939393939394, |
|
"grad_norm": 0.01898648589849472, |
|
"learning_rate": 1.2121212121212122e-05, |
|
"loss": 0.0005, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 9.419191919191919, |
|
"grad_norm": 0.00644712382927537, |
|
"learning_rate": 1.1616161616161616e-05, |
|
"loss": 0.0059, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 9.444444444444445, |
|
"grad_norm": 0.00872492603957653, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 0.0011, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 9.469696969696969, |
|
"grad_norm": 1.6075825691223145, |
|
"learning_rate": 1.0606060606060607e-05, |
|
"loss": 0.0099, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 9.494949494949495, |
|
"grad_norm": 6.320465087890625, |
|
"learning_rate": 1.0101010101010101e-05, |
|
"loss": 0.0163, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 9.52020202020202, |
|
"grad_norm": 0.0037208800204098225, |
|
"learning_rate": 9.595959595959595e-06, |
|
"loss": 0.0002, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 9.545454545454545, |
|
"grad_norm": 3.3599369525909424, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 0.0053, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 9.570707070707071, |
|
"grad_norm": 0.5879691243171692, |
|
"learning_rate": 8.585858585858587e-06, |
|
"loss": 0.0019, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 9.595959595959595, |
|
"grad_norm": 0.26342862844467163, |
|
"learning_rate": 8.080808080808082e-06, |
|
"loss": 0.0009, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 9.595959595959595, |
|
"eval_accuracy": 0.982094897045658, |
|
"eval_loss": 0.09413682669401169, |
|
"eval_runtime": 73.1451, |
|
"eval_samples_per_second": 15.271, |
|
"eval_steps_per_second": 1.914, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 9.621212121212121, |
|
"grad_norm": 0.042649831622838974, |
|
"learning_rate": 7.5757575757575764e-06, |
|
"loss": 0.0226, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 9.646464646464647, |
|
"grad_norm": 0.0022528120316565037, |
|
"learning_rate": 7.0707070707070704e-06, |
|
"loss": 0.0136, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 9.671717171717171, |
|
"grad_norm": 0.12108311802148819, |
|
"learning_rate": 6.565656565656567e-06, |
|
"loss": 0.0408, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 9.696969696969697, |
|
"grad_norm": 0.7086867690086365, |
|
"learning_rate": 6.060606060606061e-06, |
|
"loss": 0.0035, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 9.722222222222221, |
|
"grad_norm": 0.049748744815588, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 0.0012, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 9.747474747474747, |
|
"grad_norm": 0.004345474299043417, |
|
"learning_rate": 5.050505050505051e-06, |
|
"loss": 0.0002, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 9.772727272727273, |
|
"grad_norm": 0.005164165981113911, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 0.0049, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 9.797979797979798, |
|
"grad_norm": 0.003518365090712905, |
|
"learning_rate": 4.040404040404041e-06, |
|
"loss": 0.002, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 9.823232323232324, |
|
"grad_norm": 0.0017797194886952639, |
|
"learning_rate": 3.5353535353535352e-06, |
|
"loss": 0.0005, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 9.848484848484848, |
|
"grad_norm": 4.788568496704102, |
|
"learning_rate": 3.0303030303030305e-06, |
|
"loss": 0.0247, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 9.848484848484848, |
|
"eval_accuracy": 0.9785138764547896, |
|
"eval_loss": 0.09876807779073715, |
|
"eval_runtime": 73.1729, |
|
"eval_samples_per_second": 15.265, |
|
"eval_steps_per_second": 1.913, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 9.873737373737374, |
|
"grad_norm": 0.0013341947924345732, |
|
"learning_rate": 2.5252525252525253e-06, |
|
"loss": 0.0082, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 9.8989898989899, |
|
"grad_norm": 0.004278136417269707, |
|
"learning_rate": 2.0202020202020206e-06, |
|
"loss": 0.0019, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 9.924242424242424, |
|
"grad_norm": 0.002301498083397746, |
|
"learning_rate": 1.5151515151515152e-06, |
|
"loss": 0.0245, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 9.94949494949495, |
|
"grad_norm": 0.000858976156450808, |
|
"learning_rate": 1.0101010101010103e-06, |
|
"loss": 0.0013, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 9.974747474747474, |
|
"grad_norm": 0.007369679398834705, |
|
"learning_rate": 5.050505050505052e-07, |
|
"loss": 0.0774, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.008844327181577682, |
|
"learning_rate": 0.0, |
|
"loss": 0.0004, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 3960, |
|
"total_flos": 4.904158054749069e+18, |
|
"train_loss": 0.06213315485569771, |
|
"train_runtime": 7084.9204, |
|
"train_samples_per_second": 8.93, |
|
"train_steps_per_second": 0.559 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3960, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.904158054749069e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|