{ "best_metric": 0.08193562924861908, "best_model_checkpoint": "realFake-img/checkpoint-2500", "epoch": 10.0, "eval_steps": 100, "global_step": 3960, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025252525252525252, "grad_norm": 8.32949447631836, "learning_rate": 0.0001994949494949495, "loss": 0.1124, "step": 10 }, { "epoch": 0.050505050505050504, "grad_norm": 4.660865306854248, "learning_rate": 0.000198989898989899, "loss": 0.2631, "step": 20 }, { "epoch": 0.07575757575757576, "grad_norm": 4.1171956062316895, "learning_rate": 0.0001984848484848485, "loss": 0.1366, "step": 30 }, { "epoch": 0.10101010101010101, "grad_norm": 4.586099147796631, "learning_rate": 0.000197979797979798, "loss": 0.1395, "step": 40 }, { "epoch": 0.12626262626262627, "grad_norm": 3.6707675457000732, "learning_rate": 0.0001974747474747475, "loss": 0.178, "step": 50 }, { "epoch": 0.15151515151515152, "grad_norm": 0.39073047041893005, "learning_rate": 0.00019696969696969698, "loss": 0.2038, "step": 60 }, { "epoch": 0.17676767676767677, "grad_norm": 3.4298012256622314, "learning_rate": 0.0001964646464646465, "loss": 0.0964, "step": 70 }, { "epoch": 0.20202020202020202, "grad_norm": 4.532003402709961, "learning_rate": 0.00019595959595959596, "loss": 0.171, "step": 80 }, { "epoch": 0.22727272727272727, "grad_norm": 2.3665497303009033, "learning_rate": 0.00019545454545454548, "loss": 0.1166, "step": 90 }, { "epoch": 0.25252525252525254, "grad_norm": 1.0514458417892456, "learning_rate": 0.00019494949494949494, "loss": 0.2578, "step": 100 }, { "epoch": 0.25252525252525254, "eval_accuracy": 0.9418084153983886, "eval_loss": 0.1593756079673767, "eval_runtime": 72.9833, "eval_samples_per_second": 15.305, "eval_steps_per_second": 1.918, "step": 100 }, { "epoch": 0.2777777777777778, "grad_norm": 2.9928767681121826, "learning_rate": 0.00019444444444444446, "loss": 0.1794, "step": 110 }, { "epoch": 0.30303030303030304, "grad_norm": 0.6943581104278564, "learning_rate": 0.00019393939393939395, "loss": 0.1713, "step": 120 }, { "epoch": 0.3282828282828283, "grad_norm": 5.296023845672607, "learning_rate": 0.00019343434343434344, "loss": 0.1822, "step": 130 }, { "epoch": 0.35353535353535354, "grad_norm": 4.849494934082031, "learning_rate": 0.00019292929292929293, "loss": 0.1667, "step": 140 }, { "epoch": 0.3787878787878788, "grad_norm": 2.1953601837158203, "learning_rate": 0.00019242424242424245, "loss": 0.1353, "step": 150 }, { "epoch": 0.40404040404040403, "grad_norm": 3.5325512886047363, "learning_rate": 0.00019191919191919191, "loss": 0.2191, "step": 160 }, { "epoch": 0.4292929292929293, "grad_norm": 1.513462781906128, "learning_rate": 0.00019141414141414143, "loss": 0.0864, "step": 170 }, { "epoch": 0.45454545454545453, "grad_norm": 1.1227214336395264, "learning_rate": 0.00019090909090909092, "loss": 0.0972, "step": 180 }, { "epoch": 0.4797979797979798, "grad_norm": 4.3201212882995605, "learning_rate": 0.0001904040404040404, "loss": 0.1356, "step": 190 }, { "epoch": 0.5050505050505051, "grad_norm": 0.13399846851825714, "learning_rate": 0.0001898989898989899, "loss": 0.0944, "step": 200 }, { "epoch": 0.5050505050505051, "eval_accuracy": 0.937332139659803, "eval_loss": 0.22425174713134766, "eval_runtime": 72.9458, "eval_samples_per_second": 15.313, "eval_steps_per_second": 1.919, "step": 200 }, { "epoch": 0.5303030303030303, "grad_norm": 0.07937999069690704, "learning_rate": 0.00018939393939393942, "loss": 0.0798, "step": 210 }, { "epoch": 0.5555555555555556, "grad_norm": 6.126536846160889, "learning_rate": 0.00018888888888888888, "loss": 0.2437, "step": 220 }, { "epoch": 0.5808080808080808, "grad_norm": 8.01685619354248, "learning_rate": 0.0001883838383838384, "loss": 0.2746, "step": 230 }, { "epoch": 0.6060606060606061, "grad_norm": 3.1425938606262207, "learning_rate": 0.0001878787878787879, "loss": 0.1937, "step": 240 }, { "epoch": 0.6313131313131313, "grad_norm": 1.1262303590774536, "learning_rate": 0.00018737373737373738, "loss": 0.2495, "step": 250 }, { "epoch": 0.6565656565656566, "grad_norm": 3.994985342025757, "learning_rate": 0.00018686868686868687, "loss": 0.0914, "step": 260 }, { "epoch": 0.6818181818181818, "grad_norm": 3.6686558723449707, "learning_rate": 0.00018636363636363636, "loss": 0.1241, "step": 270 }, { "epoch": 0.7070707070707071, "grad_norm": 2.8421552181243896, "learning_rate": 0.00018585858585858586, "loss": 0.162, "step": 280 }, { "epoch": 0.7323232323232324, "grad_norm": 0.06576777994632721, "learning_rate": 0.00018535353535353537, "loss": 0.0863, "step": 290 }, { "epoch": 0.7575757575757576, "grad_norm": 3.127112865447998, "learning_rate": 0.00018484848484848484, "loss": 0.1747, "step": 300 }, { "epoch": 0.7575757575757576, "eval_accuracy": 0.9292748433303492, "eval_loss": 0.24716989696025848, "eval_runtime": 73.2274, "eval_samples_per_second": 15.254, "eval_steps_per_second": 1.912, "step": 300 }, { "epoch": 0.7828282828282829, "grad_norm": 1.235567569732666, "learning_rate": 0.00018434343434343435, "loss": 0.0742, "step": 310 }, { "epoch": 0.8080808080808081, "grad_norm": 5.305884838104248, "learning_rate": 0.00018383838383838384, "loss": 0.1013, "step": 320 }, { "epoch": 0.8333333333333334, "grad_norm": 3.124811887741089, "learning_rate": 0.00018333333333333334, "loss": 0.2439, "step": 330 }, { "epoch": 0.8585858585858586, "grad_norm": 5.361472129821777, "learning_rate": 0.00018282828282828283, "loss": 0.0468, "step": 340 }, { "epoch": 0.8838383838383839, "grad_norm": 3.3062198162078857, "learning_rate": 0.00018232323232323234, "loss": 0.0855, "step": 350 }, { "epoch": 0.9090909090909091, "grad_norm": 1.9714092016220093, "learning_rate": 0.00018181818181818183, "loss": 0.1645, "step": 360 }, { "epoch": 0.9343434343434344, "grad_norm": 1.7579039335250854, "learning_rate": 0.00018131313131313132, "loss": 0.193, "step": 370 }, { "epoch": 0.9595959595959596, "grad_norm": 3.588534355163574, "learning_rate": 0.00018080808080808082, "loss": 0.1305, "step": 380 }, { "epoch": 0.9848484848484849, "grad_norm": 6.151834487915039, "learning_rate": 0.0001803030303030303, "loss": 0.1004, "step": 390 }, { "epoch": 1.0101010101010102, "grad_norm": 3.521318197250366, "learning_rate": 0.0001797979797979798, "loss": 0.1328, "step": 400 }, { "epoch": 1.0101010101010102, "eval_accuracy": 0.9337511190689346, "eval_loss": 0.17739379405975342, "eval_runtime": 72.9497, "eval_samples_per_second": 15.312, "eval_steps_per_second": 1.919, "step": 400 }, { "epoch": 1.0353535353535352, "grad_norm": 0.5116239786148071, "learning_rate": 0.00017929292929292931, "loss": 0.0932, "step": 410 }, { "epoch": 1.0606060606060606, "grad_norm": 0.37958571314811707, "learning_rate": 0.0001787878787878788, "loss": 0.0538, "step": 420 }, { "epoch": 1.0858585858585859, "grad_norm": 3.976700782775879, "learning_rate": 0.0001782828282828283, "loss": 0.2245, "step": 430 }, { "epoch": 1.1111111111111112, "grad_norm": 2.8285045623779297, "learning_rate": 0.00017777777777777779, "loss": 0.1332, "step": 440 }, { "epoch": 1.1363636363636362, "grad_norm": 3.683419704437256, "learning_rate": 0.00017727272727272728, "loss": 0.1162, "step": 450 }, { "epoch": 1.1616161616161615, "grad_norm": 4.30293607711792, "learning_rate": 0.0001767676767676768, "loss": 0.0678, "step": 460 }, { "epoch": 1.1868686868686869, "grad_norm": 0.15934455394744873, "learning_rate": 0.00017626262626262626, "loss": 0.1587, "step": 470 }, { "epoch": 1.2121212121212122, "grad_norm": 1.5525578260421753, "learning_rate": 0.00017575757575757578, "loss": 0.0637, "step": 480 }, { "epoch": 1.2373737373737375, "grad_norm": 1.534348964691162, "learning_rate": 0.00017525252525252527, "loss": 0.1103, "step": 490 }, { "epoch": 1.2626262626262625, "grad_norm": 1.6843178272247314, "learning_rate": 0.00017474747474747476, "loss": 0.1918, "step": 500 }, { "epoch": 1.2626262626262625, "eval_accuracy": 0.9570277529095792, "eval_loss": 0.12820282578468323, "eval_runtime": 73.1443, "eval_samples_per_second": 15.271, "eval_steps_per_second": 1.914, "step": 500 }, { "epoch": 1.2878787878787878, "grad_norm": 0.6296999454498291, "learning_rate": 0.00017424242424242425, "loss": 0.0461, "step": 510 }, { "epoch": 1.3131313131313131, "grad_norm": 4.980341911315918, "learning_rate": 0.00017373737373737377, "loss": 0.1479, "step": 520 }, { "epoch": 1.3383838383838385, "grad_norm": 0.36140933632850647, "learning_rate": 0.00017323232323232323, "loss": 0.0726, "step": 530 }, { "epoch": 1.3636363636363638, "grad_norm": 0.2907123267650604, "learning_rate": 0.00017272727272727275, "loss": 0.1109, "step": 540 }, { "epoch": 1.3888888888888888, "grad_norm": 1.1450049877166748, "learning_rate": 0.00017222222222222224, "loss": 0.0888, "step": 550 }, { "epoch": 1.4141414141414141, "grad_norm": 3.324134588241577, "learning_rate": 0.00017171717171717173, "loss": 0.1074, "step": 560 }, { "epoch": 1.4393939393939394, "grad_norm": 0.9428613185882568, "learning_rate": 0.00017121212121212122, "loss": 0.0856, "step": 570 }, { "epoch": 1.4646464646464645, "grad_norm": 0.1330060064792633, "learning_rate": 0.0001707070707070707, "loss": 0.061, "step": 580 }, { "epoch": 1.4898989898989898, "grad_norm": 4.435102939605713, "learning_rate": 0.0001702020202020202, "loss": 0.1137, "step": 590 }, { "epoch": 1.5151515151515151, "grad_norm": 2.5744283199310303, "learning_rate": 0.00016969696969696972, "loss": 0.169, "step": 600 }, { "epoch": 1.5151515151515151, "eval_accuracy": 0.9346463742166518, "eval_loss": 0.2247086614370346, "eval_runtime": 73.2754, "eval_samples_per_second": 15.244, "eval_steps_per_second": 1.911, "step": 600 }, { "epoch": 1.5404040404040404, "grad_norm": 3.7209930419921875, "learning_rate": 0.00016919191919191918, "loss": 0.1929, "step": 610 }, { "epoch": 1.5656565656565657, "grad_norm": 4.9047322273254395, "learning_rate": 0.0001686868686868687, "loss": 0.144, "step": 620 }, { "epoch": 1.5909090909090908, "grad_norm": 8.181381225585938, "learning_rate": 0.0001681818181818182, "loss": 0.1008, "step": 630 }, { "epoch": 1.6161616161616161, "grad_norm": 0.5650784969329834, "learning_rate": 0.00016767676767676768, "loss": 0.1385, "step": 640 }, { "epoch": 1.6414141414141414, "grad_norm": 0.4483976364135742, "learning_rate": 0.00016717171717171717, "loss": 0.1112, "step": 650 }, { "epoch": 1.6666666666666665, "grad_norm": 2.8870067596435547, "learning_rate": 0.0001666666666666667, "loss": 0.0868, "step": 660 }, { "epoch": 1.691919191919192, "grad_norm": 5.016068458557129, "learning_rate": 0.00016616161616161615, "loss": 0.0948, "step": 670 }, { "epoch": 1.7171717171717171, "grad_norm": 4.62065315246582, "learning_rate": 0.00016565656565656567, "loss": 0.2336, "step": 680 }, { "epoch": 1.7424242424242424, "grad_norm": 0.04882610961794853, "learning_rate": 0.00016515151515151516, "loss": 0.1006, "step": 690 }, { "epoch": 1.7676767676767677, "grad_norm": 1.2523910999298096, "learning_rate": 0.00016464646464646465, "loss": 0.2595, "step": 700 }, { "epoch": 1.7676767676767677, "eval_accuracy": 0.9444941808415398, "eval_loss": 0.1785079687833786, "eval_runtime": 73.2828, "eval_samples_per_second": 15.242, "eval_steps_per_second": 1.91, "step": 700 }, { "epoch": 1.7929292929292928, "grad_norm": 0.28372153639793396, "learning_rate": 0.00016414141414141414, "loss": 0.0657, "step": 710 }, { "epoch": 1.8181818181818183, "grad_norm": 0.061366915702819824, "learning_rate": 0.00016363636363636366, "loss": 0.2048, "step": 720 }, { "epoch": 1.8434343434343434, "grad_norm": 2.9858274459838867, "learning_rate": 0.00016313131313131312, "loss": 0.0489, "step": 730 }, { "epoch": 1.8686868686868687, "grad_norm": 4.050809383392334, "learning_rate": 0.00016262626262626264, "loss": 0.1095, "step": 740 }, { "epoch": 1.893939393939394, "grad_norm": 3.725325584411621, "learning_rate": 0.00016212121212121213, "loss": 0.2613, "step": 750 }, { "epoch": 1.9191919191919191, "grad_norm": 2.09786319732666, "learning_rate": 0.00016161616161616162, "loss": 0.0492, "step": 760 }, { "epoch": 1.9444444444444444, "grad_norm": 1.9398726224899292, "learning_rate": 0.0001611111111111111, "loss": 0.0831, "step": 770 }, { "epoch": 1.9696969696969697, "grad_norm": 0.6055514812469482, "learning_rate": 0.0001606060606060606, "loss": 0.1733, "step": 780 }, { "epoch": 1.9949494949494948, "grad_norm": 0.22102850675582886, "learning_rate": 0.00016010101010101012, "loss": 0.1106, "step": 790 }, { "epoch": 2.0202020202020203, "grad_norm": 3.681710720062256, "learning_rate": 0.0001595959595959596, "loss": 0.0911, "step": 800 }, { "epoch": 2.0202020202020203, "eval_accuracy": 0.9534467323187108, "eval_loss": 0.1352938562631607, "eval_runtime": 73.2218, "eval_samples_per_second": 15.255, "eval_steps_per_second": 1.912, "step": 800 }, { "epoch": 2.0454545454545454, "grad_norm": 0.574734091758728, "learning_rate": 0.0001590909090909091, "loss": 0.044, "step": 810 }, { "epoch": 2.0707070707070705, "grad_norm": 0.253918319940567, "learning_rate": 0.0001585858585858586, "loss": 0.0476, "step": 820 }, { "epoch": 2.095959595959596, "grad_norm": 0.1252337247133255, "learning_rate": 0.00015808080808080808, "loss": 0.1279, "step": 830 }, { "epoch": 2.121212121212121, "grad_norm": 0.26320022344589233, "learning_rate": 0.00015757575757575757, "loss": 0.2042, "step": 840 }, { "epoch": 2.1464646464646466, "grad_norm": 0.7983365058898926, "learning_rate": 0.0001570707070707071, "loss": 0.1208, "step": 850 }, { "epoch": 2.1717171717171717, "grad_norm": 0.36479347944259644, "learning_rate": 0.00015656565656565658, "loss": 0.0881, "step": 860 }, { "epoch": 2.196969696969697, "grad_norm": 0.11645219475030899, "learning_rate": 0.00015606060606060607, "loss": 0.0955, "step": 870 }, { "epoch": 2.2222222222222223, "grad_norm": 1.1980379819869995, "learning_rate": 0.00015555555555555556, "loss": 0.077, "step": 880 }, { "epoch": 2.2474747474747474, "grad_norm": 0.06797017902135849, "learning_rate": 0.00015505050505050508, "loss": 0.0377, "step": 890 }, { "epoch": 2.2727272727272725, "grad_norm": 0.48521897196769714, "learning_rate": 0.00015454545454545454, "loss": 0.0548, "step": 900 }, { "epoch": 2.2727272727272725, "eval_accuracy": 0.9471799462846912, "eval_loss": 0.19982792437076569, "eval_runtime": 72.9425, "eval_samples_per_second": 15.313, "eval_steps_per_second": 1.919, "step": 900 }, { "epoch": 2.297979797979798, "grad_norm": 0.017012102529406548, "learning_rate": 0.00015404040404040406, "loss": 0.1089, "step": 910 }, { "epoch": 2.323232323232323, "grad_norm": 0.2808210849761963, "learning_rate": 0.00015353535353535353, "loss": 0.0789, "step": 920 }, { "epoch": 2.3484848484848486, "grad_norm": 4.9768781661987305, "learning_rate": 0.00015303030303030304, "loss": 0.1004, "step": 930 }, { "epoch": 2.3737373737373737, "grad_norm": 1.5323927402496338, "learning_rate": 0.00015252525252525253, "loss": 0.0357, "step": 940 }, { "epoch": 2.398989898989899, "grad_norm": 4.321779251098633, "learning_rate": 0.00015202020202020202, "loss": 0.0348, "step": 950 }, { "epoch": 2.4242424242424243, "grad_norm": 6.227025032043457, "learning_rate": 0.00015151515151515152, "loss": 0.1679, "step": 960 }, { "epoch": 2.4494949494949494, "grad_norm": 1.045432209968567, "learning_rate": 0.00015101010101010103, "loss": 0.1222, "step": 970 }, { "epoch": 2.474747474747475, "grad_norm": 3.0685787200927734, "learning_rate": 0.0001505050505050505, "loss": 0.1434, "step": 980 }, { "epoch": 2.5, "grad_norm": 0.04191284626722336, "learning_rate": 0.00015000000000000001, "loss": 0.086, "step": 990 }, { "epoch": 2.525252525252525, "grad_norm": 3.1016695499420166, "learning_rate": 0.0001494949494949495, "loss": 0.1399, "step": 1000 }, { "epoch": 2.525252525252525, "eval_accuracy": 0.9444941808415398, "eval_loss": 0.19705650210380554, "eval_runtime": 73.3829, "eval_samples_per_second": 15.222, "eval_steps_per_second": 1.908, "step": 1000 }, { "epoch": 2.5505050505050506, "grad_norm": 4.877354145050049, "learning_rate": 0.000148989898989899, "loss": 0.1418, "step": 1010 }, { "epoch": 2.5757575757575757, "grad_norm": 4.7359700202941895, "learning_rate": 0.00014848484848484849, "loss": 0.1084, "step": 1020 }, { "epoch": 2.601010101010101, "grad_norm": 0.7143091559410095, "learning_rate": 0.000147979797979798, "loss": 0.1074, "step": 1030 }, { "epoch": 2.6262626262626263, "grad_norm": 0.4162321388721466, "learning_rate": 0.00014747474747474747, "loss": 0.1317, "step": 1040 }, { "epoch": 2.6515151515151514, "grad_norm": 5.558507442474365, "learning_rate": 0.00014696969696969698, "loss": 0.0829, "step": 1050 }, { "epoch": 2.676767676767677, "grad_norm": 0.08041220903396606, "learning_rate": 0.00014646464646464648, "loss": 0.0905, "step": 1060 }, { "epoch": 2.702020202020202, "grad_norm": 3.554946184158325, "learning_rate": 0.00014595959595959597, "loss": 0.14, "step": 1070 }, { "epoch": 2.7272727272727275, "grad_norm": 0.9108226895332336, "learning_rate": 0.00014545454545454546, "loss": 0.0355, "step": 1080 }, { "epoch": 2.7525252525252526, "grad_norm": 1.091728925704956, "learning_rate": 0.00014494949494949495, "loss": 0.059, "step": 1090 }, { "epoch": 2.7777777777777777, "grad_norm": 0.07620527595281601, "learning_rate": 0.00014444444444444444, "loss": 0.2001, "step": 1100 }, { "epoch": 2.7777777777777777, "eval_accuracy": 0.937332139659803, "eval_loss": 0.24790146946907043, "eval_runtime": 73.1059, "eval_samples_per_second": 15.279, "eval_steps_per_second": 1.915, "step": 1100 }, { "epoch": 2.8030303030303028, "grad_norm": 0.10709954053163528, "learning_rate": 0.00014393939393939396, "loss": 0.0487, "step": 1110 }, { "epoch": 2.8282828282828283, "grad_norm": 4.047976493835449, "learning_rate": 0.00014343434343434342, "loss": 0.0774, "step": 1120 }, { "epoch": 2.8535353535353534, "grad_norm": 2.409966468811035, "learning_rate": 0.00014292929292929294, "loss": 0.0744, "step": 1130 }, { "epoch": 2.878787878787879, "grad_norm": 0.3456668257713318, "learning_rate": 0.00014242424242424243, "loss": 0.0125, "step": 1140 }, { "epoch": 2.904040404040404, "grad_norm": 0.046853143721818924, "learning_rate": 0.00014191919191919192, "loss": 0.0756, "step": 1150 }, { "epoch": 2.929292929292929, "grad_norm": 3.4357807636260986, "learning_rate": 0.0001414141414141414, "loss": 0.1375, "step": 1160 }, { "epoch": 2.9545454545454546, "grad_norm": 1.010414719581604, "learning_rate": 0.00014090909090909093, "loss": 0.0704, "step": 1170 }, { "epoch": 2.9797979797979797, "grad_norm": 0.008091296069324017, "learning_rate": 0.00014040404040404042, "loss": 0.0791, "step": 1180 }, { "epoch": 3.005050505050505, "grad_norm": 1.9511629343032837, "learning_rate": 0.0001398989898989899, "loss": 0.0754, "step": 1190 }, { "epoch": 3.0303030303030303, "grad_norm": 10.075323104858398, "learning_rate": 0.0001393939393939394, "loss": 0.0976, "step": 1200 }, { "epoch": 3.0303030303030303, "eval_accuracy": 0.9498657117278424, "eval_loss": 0.16011768579483032, "eval_runtime": 73.2182, "eval_samples_per_second": 15.256, "eval_steps_per_second": 1.912, "step": 1200 }, { "epoch": 3.0555555555555554, "grad_norm": 0.027206294238567352, "learning_rate": 0.0001388888888888889, "loss": 0.0906, "step": 1210 }, { "epoch": 3.080808080808081, "grad_norm": 1.425262689590454, "learning_rate": 0.0001383838383838384, "loss": 0.0349, "step": 1220 }, { "epoch": 3.106060606060606, "grad_norm": 7.3463616371154785, "learning_rate": 0.0001378787878787879, "loss": 0.0804, "step": 1230 }, { "epoch": 3.1313131313131315, "grad_norm": 1.0737591981887817, "learning_rate": 0.0001373737373737374, "loss": 0.068, "step": 1240 }, { "epoch": 3.1565656565656566, "grad_norm": 7.525305271148682, "learning_rate": 0.00013686868686868688, "loss": 0.1145, "step": 1250 }, { "epoch": 3.1818181818181817, "grad_norm": 0.4561030864715576, "learning_rate": 0.00013636363636363637, "loss": 0.0977, "step": 1260 }, { "epoch": 3.207070707070707, "grad_norm": 0.11276185512542725, "learning_rate": 0.00013585858585858586, "loss": 0.0743, "step": 1270 }, { "epoch": 3.2323232323232323, "grad_norm": 1.0171997547149658, "learning_rate": 0.00013535353535353538, "loss": 0.0775, "step": 1280 }, { "epoch": 3.257575757575758, "grad_norm": 3.1414084434509277, "learning_rate": 0.00013484848484848484, "loss": 0.0309, "step": 1290 }, { "epoch": 3.282828282828283, "grad_norm": 0.037932224571704865, "learning_rate": 0.00013434343434343436, "loss": 0.1291, "step": 1300 }, { "epoch": 3.282828282828283, "eval_accuracy": 0.9588182632050134, "eval_loss": 0.160703644156456, "eval_runtime": 73.0017, "eval_samples_per_second": 15.301, "eval_steps_per_second": 1.918, "step": 1300 }, { "epoch": 3.308080808080808, "grad_norm": 2.9155356884002686, "learning_rate": 0.00013383838383838385, "loss": 0.0215, "step": 1310 }, { "epoch": 3.3333333333333335, "grad_norm": 5.102810382843018, "learning_rate": 0.00013333333333333334, "loss": 0.0716, "step": 1320 }, { "epoch": 3.3585858585858586, "grad_norm": 0.020925424993038177, "learning_rate": 0.00013282828282828283, "loss": 0.0372, "step": 1330 }, { "epoch": 3.3838383838383836, "grad_norm": 0.10292687267065048, "learning_rate": 0.00013232323232323235, "loss": 0.0211, "step": 1340 }, { "epoch": 3.409090909090909, "grad_norm": 2.7968993186950684, "learning_rate": 0.0001318181818181818, "loss": 0.0708, "step": 1350 }, { "epoch": 3.4343434343434343, "grad_norm": 3.1068055629730225, "learning_rate": 0.00013131313131313133, "loss": 0.1007, "step": 1360 }, { "epoch": 3.45959595959596, "grad_norm": 0.032499730587005615, "learning_rate": 0.00013080808080808082, "loss": 0.0713, "step": 1370 }, { "epoch": 3.484848484848485, "grad_norm": 0.20779326558113098, "learning_rate": 0.0001303030303030303, "loss": 0.048, "step": 1380 }, { "epoch": 3.51010101010101, "grad_norm": 5.266826152801514, "learning_rate": 0.0001297979797979798, "loss": 0.193, "step": 1390 }, { "epoch": 3.5353535353535355, "grad_norm": 0.42106470465660095, "learning_rate": 0.00012929292929292932, "loss": 0.0721, "step": 1400 }, { "epoch": 3.5353535353535355, "eval_accuracy": 0.9588182632050134, "eval_loss": 0.18219807744026184, "eval_runtime": 73.033, "eval_samples_per_second": 15.294, "eval_steps_per_second": 1.917, "step": 1400 }, { "epoch": 3.5606060606060606, "grad_norm": 1.7371455430984497, "learning_rate": 0.00012878787878787878, "loss": 0.0927, "step": 1410 }, { "epoch": 3.5858585858585856, "grad_norm": 0.636141836643219, "learning_rate": 0.0001282828282828283, "loss": 0.0295, "step": 1420 }, { "epoch": 3.611111111111111, "grad_norm": 0.10211779177188873, "learning_rate": 0.00012777777777777776, "loss": 0.0287, "step": 1430 }, { "epoch": 3.6363636363636362, "grad_norm": 0.803653359413147, "learning_rate": 0.00012727272727272728, "loss": 0.0621, "step": 1440 }, { "epoch": 3.6616161616161618, "grad_norm": 0.11753907799720764, "learning_rate": 0.00012676767676767677, "loss": 0.0465, "step": 1450 }, { "epoch": 3.686868686868687, "grad_norm": 0.05394851416349411, "learning_rate": 0.00012626262626262626, "loss": 0.0474, "step": 1460 }, { "epoch": 3.712121212121212, "grad_norm": 3.631462574005127, "learning_rate": 0.00012575757575757575, "loss": 0.093, "step": 1470 }, { "epoch": 3.7373737373737375, "grad_norm": 0.1336178481578827, "learning_rate": 0.00012525252525252527, "loss": 0.0736, "step": 1480 }, { "epoch": 3.7626262626262625, "grad_norm": 0.0858420580625534, "learning_rate": 0.00012474747474747473, "loss": 0.1211, "step": 1490 }, { "epoch": 3.787878787878788, "grad_norm": 1.1731150150299072, "learning_rate": 0.00012424242424242425, "loss": 0.0592, "step": 1500 }, { "epoch": 3.787878787878788, "eval_accuracy": 0.9623992837958818, "eval_loss": 0.12546713650226593, "eval_runtime": 73.0966, "eval_samples_per_second": 15.281, "eval_steps_per_second": 1.915, "step": 1500 }, { "epoch": 3.813131313131313, "grad_norm": 1.533412218093872, "learning_rate": 0.00012373737373737374, "loss": 0.0663, "step": 1510 }, { "epoch": 3.8383838383838382, "grad_norm": 7.734765529632568, "learning_rate": 0.00012323232323232323, "loss": 0.075, "step": 1520 }, { "epoch": 3.8636363636363638, "grad_norm": 0.4143606126308441, "learning_rate": 0.00012272727272727272, "loss": 0.0158, "step": 1530 }, { "epoch": 3.888888888888889, "grad_norm": 4.032654762268066, "learning_rate": 0.00012222222222222224, "loss": 0.0898, "step": 1540 }, { "epoch": 3.9141414141414144, "grad_norm": 0.2919144928455353, "learning_rate": 0.00012171717171717172, "loss": 0.0904, "step": 1550 }, { "epoch": 3.9393939393939394, "grad_norm": 6.036355018615723, "learning_rate": 0.00012121212121212122, "loss": 0.0725, "step": 1560 }, { "epoch": 3.9646464646464645, "grad_norm": 0.34402996301651, "learning_rate": 0.0001207070707070707, "loss": 0.0643, "step": 1570 }, { "epoch": 3.98989898989899, "grad_norm": 0.307706356048584, "learning_rate": 0.0001202020202020202, "loss": 0.1061, "step": 1580 }, { "epoch": 4.015151515151516, "grad_norm": 0.04210241511464119, "learning_rate": 0.00011969696969696971, "loss": 0.1015, "step": 1590 }, { "epoch": 4.040404040404041, "grad_norm": 4.686149597167969, "learning_rate": 0.00011919191919191919, "loss": 0.0964, "step": 1600 }, { "epoch": 4.040404040404041, "eval_accuracy": 0.954341987466428, "eval_loss": 0.16204935312271118, "eval_runtime": 72.8935, "eval_samples_per_second": 15.324, "eval_steps_per_second": 1.921, "step": 1600 }, { "epoch": 4.065656565656566, "grad_norm": 0.9774217009544373, "learning_rate": 0.00011868686868686869, "loss": 0.0342, "step": 1610 }, { "epoch": 4.090909090909091, "grad_norm": 2.1450870037078857, "learning_rate": 0.0001181818181818182, "loss": 0.0852, "step": 1620 }, { "epoch": 4.116161616161616, "grad_norm": 4.826761722564697, "learning_rate": 0.00011767676767676767, "loss": 0.0612, "step": 1630 }, { "epoch": 4.141414141414141, "grad_norm": 0.7088700532913208, "learning_rate": 0.00011717171717171717, "loss": 0.0369, "step": 1640 }, { "epoch": 4.166666666666667, "grad_norm": 0.07485224306583405, "learning_rate": 0.00011666666666666668, "loss": 0.0075, "step": 1650 }, { "epoch": 4.191919191919192, "grad_norm": 7.588441371917725, "learning_rate": 0.00011616161616161616, "loss": 0.0492, "step": 1660 }, { "epoch": 4.217171717171717, "grad_norm": 0.06588041037321091, "learning_rate": 0.00011565656565656566, "loss": 0.0619, "step": 1670 }, { "epoch": 4.242424242424242, "grad_norm": 0.3317614495754242, "learning_rate": 0.00011515151515151516, "loss": 0.0504, "step": 1680 }, { "epoch": 4.267676767676767, "grad_norm": 4.261381149291992, "learning_rate": 0.00011464646464646464, "loss": 0.0534, "step": 1690 }, { "epoch": 4.292929292929293, "grad_norm": 1.7030925750732422, "learning_rate": 0.00011414141414141415, "loss": 0.0738, "step": 1700 }, { "epoch": 4.292929292929293, "eval_accuracy": 0.9650850492390332, "eval_loss": 0.12794509530067444, "eval_runtime": 73.4006, "eval_samples_per_second": 15.218, "eval_steps_per_second": 1.907, "step": 1700 }, { "epoch": 4.318181818181818, "grad_norm": 3.9137349128723145, "learning_rate": 0.00011363636363636365, "loss": 0.0269, "step": 1710 }, { "epoch": 4.343434343434343, "grad_norm": 0.012919370085000992, "learning_rate": 0.00011313131313131313, "loss": 0.0314, "step": 1720 }, { "epoch": 4.3686868686868685, "grad_norm": 0.07363598793745041, "learning_rate": 0.00011262626262626263, "loss": 0.0233, "step": 1730 }, { "epoch": 4.393939393939394, "grad_norm": 0.137301966547966, "learning_rate": 0.00011212121212121212, "loss": 0.0863, "step": 1740 }, { "epoch": 4.41919191919192, "grad_norm": 6.548308849334717, "learning_rate": 0.00011161616161616161, "loss": 0.0463, "step": 1750 }, { "epoch": 4.444444444444445, "grad_norm": 2.40230655670166, "learning_rate": 0.00011111111111111112, "loss": 0.0668, "step": 1760 }, { "epoch": 4.46969696969697, "grad_norm": 0.018276751041412354, "learning_rate": 0.00011060606060606061, "loss": 0.0193, "step": 1770 }, { "epoch": 4.494949494949495, "grad_norm": 4.558255195617676, "learning_rate": 0.00011010101010101011, "loss": 0.1149, "step": 1780 }, { "epoch": 4.52020202020202, "grad_norm": 0.04581284150481224, "learning_rate": 0.0001095959595959596, "loss": 0.0227, "step": 1790 }, { "epoch": 4.545454545454545, "grad_norm": 1.2669509649276733, "learning_rate": 0.00010909090909090909, "loss": 0.0504, "step": 1800 }, { "epoch": 4.545454545454545, "eval_accuracy": 0.9588182632050134, "eval_loss": 0.16235476732254028, "eval_runtime": 73.0538, "eval_samples_per_second": 15.29, "eval_steps_per_second": 1.916, "step": 1800 }, { "epoch": 4.570707070707071, "grad_norm": 0.07127434760332108, "learning_rate": 0.0001085858585858586, "loss": 0.0492, "step": 1810 }, { "epoch": 4.595959595959596, "grad_norm": 1.7907336950302124, "learning_rate": 0.00010808080808080809, "loss": 0.0358, "step": 1820 }, { "epoch": 4.621212121212121, "grad_norm": 4.024843692779541, "learning_rate": 0.00010757575757575758, "loss": 0.0856, "step": 1830 }, { "epoch": 4.646464646464646, "grad_norm": 0.020713260397315025, "learning_rate": 0.00010707070707070708, "loss": 0.0101, "step": 1840 }, { "epoch": 4.671717171717171, "grad_norm": 0.06845160573720932, "learning_rate": 0.00010656565656565659, "loss": 0.0153, "step": 1850 }, { "epoch": 4.696969696969697, "grad_norm": 1.0333762168884277, "learning_rate": 0.00010606060606060606, "loss": 0.1535, "step": 1860 }, { "epoch": 4.722222222222222, "grad_norm": 0.019528638571500778, "learning_rate": 0.00010555555555555557, "loss": 0.089, "step": 1870 }, { "epoch": 4.747474747474747, "grad_norm": 0.12054427713155746, "learning_rate": 0.00010505050505050507, "loss": 0.0154, "step": 1880 }, { "epoch": 4.7727272727272725, "grad_norm": 0.053187351673841476, "learning_rate": 0.00010454545454545455, "loss": 0.1073, "step": 1890 }, { "epoch": 4.797979797979798, "grad_norm": 0.03637217357754707, "learning_rate": 0.00010404040404040405, "loss": 0.0972, "step": 1900 }, { "epoch": 4.797979797979798, "eval_accuracy": 0.9623992837958818, "eval_loss": 0.15791860222816467, "eval_runtime": 73.2114, "eval_samples_per_second": 15.257, "eval_steps_per_second": 1.912, "step": 1900 }, { "epoch": 4.8232323232323235, "grad_norm": 6.812131404876709, "learning_rate": 0.00010353535353535353, "loss": 0.1274, "step": 1910 }, { "epoch": 4.848484848484849, "grad_norm": 2.3793511390686035, "learning_rate": 0.00010303030303030303, "loss": 0.1051, "step": 1920 }, { "epoch": 4.873737373737374, "grad_norm": 1.2393810749053955, "learning_rate": 0.00010252525252525254, "loss": 0.0167, "step": 1930 }, { "epoch": 4.898989898989899, "grad_norm": 1.5232930183410645, "learning_rate": 0.00010202020202020202, "loss": 0.0065, "step": 1940 }, { "epoch": 4.924242424242424, "grad_norm": 0.00905653741210699, "learning_rate": 0.00010151515151515152, "loss": 0.0419, "step": 1950 }, { "epoch": 4.94949494949495, "grad_norm": 0.8604415655136108, "learning_rate": 0.00010101010101010102, "loss": 0.0769, "step": 1960 }, { "epoch": 4.974747474747475, "grad_norm": 4.089222431182861, "learning_rate": 0.0001005050505050505, "loss": 0.0366, "step": 1970 }, { "epoch": 5.0, "grad_norm": 2.2072501182556152, "learning_rate": 0.0001, "loss": 0.0746, "step": 1980 }, { "epoch": 5.025252525252525, "grad_norm": 0.010899940505623817, "learning_rate": 9.94949494949495e-05, "loss": 0.0597, "step": 1990 }, { "epoch": 5.05050505050505, "grad_norm": 1.6260383129119873, "learning_rate": 9.8989898989899e-05, "loss": 0.0456, "step": 2000 }, { "epoch": 5.05050505050505, "eval_accuracy": 0.9489704565801254, "eval_loss": 0.19649948179721832, "eval_runtime": 73.1131, "eval_samples_per_second": 15.278, "eval_steps_per_second": 1.915, "step": 2000 }, { "epoch": 5.075757575757576, "grad_norm": 0.009620290249586105, "learning_rate": 9.848484848484849e-05, "loss": 0.018, "step": 2010 }, { "epoch": 5.101010101010101, "grad_norm": 4.627386093139648, "learning_rate": 9.797979797979798e-05, "loss": 0.0906, "step": 2020 }, { "epoch": 5.126262626262626, "grad_norm": 0.5775233507156372, "learning_rate": 9.747474747474747e-05, "loss": 0.0179, "step": 2030 }, { "epoch": 5.151515151515151, "grad_norm": 0.3100966513156891, "learning_rate": 9.696969696969698e-05, "loss": 0.0225, "step": 2040 }, { "epoch": 5.1767676767676765, "grad_norm": 0.012251541949808598, "learning_rate": 9.646464646464647e-05, "loss": 0.0062, "step": 2050 }, { "epoch": 5.202020202020202, "grad_norm": 3.9397971630096436, "learning_rate": 9.595959595959596e-05, "loss": 0.0497, "step": 2060 }, { "epoch": 5.2272727272727275, "grad_norm": 0.002988005056977272, "learning_rate": 9.545454545454546e-05, "loss": 0.0242, "step": 2070 }, { "epoch": 5.252525252525253, "grad_norm": 0.15744374692440033, "learning_rate": 9.494949494949495e-05, "loss": 0.0165, "step": 2080 }, { "epoch": 5.277777777777778, "grad_norm": 2.624490976333618, "learning_rate": 9.444444444444444e-05, "loss": 0.0595, "step": 2090 }, { "epoch": 5.303030303030303, "grad_norm": 1.7126376628875732, "learning_rate": 9.393939393939395e-05, "loss": 0.0334, "step": 2100 }, { "epoch": 5.303030303030303, "eval_accuracy": 0.9570277529095792, "eval_loss": 0.165226012468338, "eval_runtime": 73.2601, "eval_samples_per_second": 15.247, "eval_steps_per_second": 1.911, "step": 2100 }, { "epoch": 5.328282828282829, "grad_norm": 0.003406533505767584, "learning_rate": 9.343434343434344e-05, "loss": 0.0201, "step": 2110 }, { "epoch": 5.353535353535354, "grad_norm": 0.18647323548793793, "learning_rate": 9.292929292929293e-05, "loss": 0.0471, "step": 2120 }, { "epoch": 5.378787878787879, "grad_norm": 4.275173664093018, "learning_rate": 9.242424242424242e-05, "loss": 0.0565, "step": 2130 }, { "epoch": 5.404040404040404, "grad_norm": 3.319251537322998, "learning_rate": 9.191919191919192e-05, "loss": 0.0687, "step": 2140 }, { "epoch": 5.429292929292929, "grad_norm": 0.067157082259655, "learning_rate": 9.141414141414141e-05, "loss": 0.0507, "step": 2150 }, { "epoch": 5.454545454545454, "grad_norm": 0.18047641217708588, "learning_rate": 9.090909090909092e-05, "loss": 0.0555, "step": 2160 }, { "epoch": 5.47979797979798, "grad_norm": 0.0075127603486180305, "learning_rate": 9.040404040404041e-05, "loss": 0.0488, "step": 2170 }, { "epoch": 5.505050505050505, "grad_norm": 0.01690557599067688, "learning_rate": 8.98989898989899e-05, "loss": 0.0626, "step": 2180 }, { "epoch": 5.53030303030303, "grad_norm": 0.005741783883422613, "learning_rate": 8.93939393939394e-05, "loss": 0.0014, "step": 2190 }, { "epoch": 5.555555555555555, "grad_norm": 0.05627870187163353, "learning_rate": 8.888888888888889e-05, "loss": 0.0242, "step": 2200 }, { "epoch": 5.555555555555555, "eval_accuracy": 0.9749328558639212, "eval_loss": 0.11822798103094101, "eval_runtime": 73.1232, "eval_samples_per_second": 15.276, "eval_steps_per_second": 1.915, "step": 2200 }, { "epoch": 5.58080808080808, "grad_norm": 0.012817220762372017, "learning_rate": 8.83838383838384e-05, "loss": 0.0277, "step": 2210 }, { "epoch": 5.606060606060606, "grad_norm": 0.00884329341351986, "learning_rate": 8.787878787878789e-05, "loss": 0.0067, "step": 2220 }, { "epoch": 5.6313131313131315, "grad_norm": 0.034603264182806015, "learning_rate": 8.737373737373738e-05, "loss": 0.0702, "step": 2230 }, { "epoch": 5.656565656565657, "grad_norm": 0.0622437559068203, "learning_rate": 8.686868686868688e-05, "loss": 0.0171, "step": 2240 }, { "epoch": 5.681818181818182, "grad_norm": 0.04042644053697586, "learning_rate": 8.636363636363637e-05, "loss": 0.0592, "step": 2250 }, { "epoch": 5.707070707070707, "grad_norm": 0.04215148836374283, "learning_rate": 8.585858585858586e-05, "loss": 0.0761, "step": 2260 }, { "epoch": 5.732323232323233, "grad_norm": 0.22815492749214172, "learning_rate": 8.535353535353535e-05, "loss": 0.0133, "step": 2270 }, { "epoch": 5.757575757575758, "grad_norm": 0.3139846622943878, "learning_rate": 8.484848484848486e-05, "loss": 0.0013, "step": 2280 }, { "epoch": 5.782828282828283, "grad_norm": 0.008748591877520084, "learning_rate": 8.434343434343435e-05, "loss": 0.036, "step": 2290 }, { "epoch": 5.808080808080808, "grad_norm": 0.10703355818986893, "learning_rate": 8.383838383838384e-05, "loss": 0.0715, "step": 2300 }, { "epoch": 5.808080808080808, "eval_accuracy": 0.9650850492390332, "eval_loss": 0.12497912347316742, "eval_runtime": 72.9451, "eval_samples_per_second": 15.313, "eval_steps_per_second": 1.919, "step": 2300 }, { "epoch": 5.833333333333333, "grad_norm": 0.02993335947394371, "learning_rate": 8.333333333333334e-05, "loss": 0.017, "step": 2310 }, { "epoch": 5.858585858585858, "grad_norm": 0.004180525429546833, "learning_rate": 8.282828282828283e-05, "loss": 0.0388, "step": 2320 }, { "epoch": 5.883838383838384, "grad_norm": 0.0341310054063797, "learning_rate": 8.232323232323233e-05, "loss": 0.0193, "step": 2330 }, { "epoch": 5.909090909090909, "grad_norm": 0.02368093468248844, "learning_rate": 8.181818181818183e-05, "loss": 0.0314, "step": 2340 }, { "epoch": 5.934343434343434, "grad_norm": 0.01623358018696308, "learning_rate": 8.131313131313132e-05, "loss": 0.0578, "step": 2350 }, { "epoch": 5.959595959595959, "grad_norm": 0.006059895269572735, "learning_rate": 8.080808080808081e-05, "loss": 0.0066, "step": 2360 }, { "epoch": 5.984848484848484, "grad_norm": 0.024945911020040512, "learning_rate": 8.03030303030303e-05, "loss": 0.0032, "step": 2370 }, { "epoch": 6.01010101010101, "grad_norm": 0.010317071340978146, "learning_rate": 7.97979797979798e-05, "loss": 0.0047, "step": 2380 }, { "epoch": 6.0353535353535355, "grad_norm": 0.4775066673755646, "learning_rate": 7.92929292929293e-05, "loss": 0.0193, "step": 2390 }, { "epoch": 6.0606060606060606, "grad_norm": 6.233785629272461, "learning_rate": 7.878787878787879e-05, "loss": 0.0407, "step": 2400 }, { "epoch": 6.0606060606060606, "eval_accuracy": 0.9695613249776186, "eval_loss": 0.11715386807918549, "eval_runtime": 73.3488, "eval_samples_per_second": 15.229, "eval_steps_per_second": 1.909, "step": 2400 }, { "epoch": 6.085858585858586, "grad_norm": 0.04230092465877533, "learning_rate": 7.828282828282829e-05, "loss": 0.0028, "step": 2410 }, { "epoch": 6.111111111111111, "grad_norm": 0.0015748771838843822, "learning_rate": 7.777777777777778e-05, "loss": 0.0421, "step": 2420 }, { "epoch": 6.136363636363637, "grad_norm": 0.00564368162304163, "learning_rate": 7.727272727272727e-05, "loss": 0.0631, "step": 2430 }, { "epoch": 6.161616161616162, "grad_norm": 0.4366774559020996, "learning_rate": 7.676767676767676e-05, "loss": 0.0429, "step": 2440 }, { "epoch": 6.186868686868687, "grad_norm": 0.6611001491546631, "learning_rate": 7.626262626262627e-05, "loss": 0.0901, "step": 2450 }, { "epoch": 6.212121212121212, "grad_norm": 5.706575870513916, "learning_rate": 7.575757575757576e-05, "loss": 0.0857, "step": 2460 }, { "epoch": 6.237373737373737, "grad_norm": 0.007969530299305916, "learning_rate": 7.525252525252525e-05, "loss": 0.0227, "step": 2470 }, { "epoch": 6.262626262626263, "grad_norm": 0.28915736079216003, "learning_rate": 7.474747474747475e-05, "loss": 0.0113, "step": 2480 }, { "epoch": 6.287878787878788, "grad_norm": 0.2088274508714676, "learning_rate": 7.424242424242424e-05, "loss": 0.0026, "step": 2490 }, { "epoch": 6.313131313131313, "grad_norm": 0.004980772268027067, "learning_rate": 7.373737373737373e-05, "loss": 0.0003, "step": 2500 }, { "epoch": 6.313131313131313, "eval_accuracy": 0.9785138764547896, "eval_loss": 0.08193562924861908, "eval_runtime": 73.1145, "eval_samples_per_second": 15.277, "eval_steps_per_second": 1.915, "step": 2500 }, { "epoch": 6.338383838383838, "grad_norm": 0.001987410243600607, "learning_rate": 7.323232323232324e-05, "loss": 0.0383, "step": 2510 }, { "epoch": 6.363636363636363, "grad_norm": 1.1499226093292236, "learning_rate": 7.272727272727273e-05, "loss": 0.0171, "step": 2520 }, { "epoch": 6.388888888888889, "grad_norm": 0.03895330801606178, "learning_rate": 7.222222222222222e-05, "loss": 0.0127, "step": 2530 }, { "epoch": 6.414141414141414, "grad_norm": 0.3166453540325165, "learning_rate": 7.171717171717171e-05, "loss": 0.0278, "step": 2540 }, { "epoch": 6.4393939393939394, "grad_norm": 0.005140668712556362, "learning_rate": 7.121212121212121e-05, "loss": 0.0795, "step": 2550 }, { "epoch": 6.4646464646464645, "grad_norm": 14.462100982666016, "learning_rate": 7.07070707070707e-05, "loss": 0.085, "step": 2560 }, { "epoch": 6.48989898989899, "grad_norm": 0.24089215695858002, "learning_rate": 7.020202020202021e-05, "loss": 0.0026, "step": 2570 }, { "epoch": 6.515151515151516, "grad_norm": 0.22834239900112152, "learning_rate": 6.96969696969697e-05, "loss": 0.005, "step": 2580 }, { "epoch": 6.540404040404041, "grad_norm": 8.35010814666748, "learning_rate": 6.91919191919192e-05, "loss": 0.0728, "step": 2590 }, { "epoch": 6.565656565656566, "grad_norm": 4.920100212097168, "learning_rate": 6.86868686868687e-05, "loss": 0.0072, "step": 2600 }, { "epoch": 6.565656565656566, "eval_accuracy": 0.9713518352730528, "eval_loss": 0.14060670137405396, "eval_runtime": 73.0266, "eval_samples_per_second": 15.296, "eval_steps_per_second": 1.917, "step": 2600 }, { "epoch": 6.590909090909091, "grad_norm": 0.23918700218200684, "learning_rate": 6.818181818181818e-05, "loss": 0.0821, "step": 2610 }, { "epoch": 6.616161616161616, "grad_norm": 0.06384919583797455, "learning_rate": 6.767676767676769e-05, "loss": 0.0761, "step": 2620 }, { "epoch": 6.641414141414142, "grad_norm": 0.4447100758552551, "learning_rate": 6.717171717171718e-05, "loss": 0.0139, "step": 2630 }, { "epoch": 6.666666666666667, "grad_norm": 0.0030958615243434906, "learning_rate": 6.666666666666667e-05, "loss": 0.0341, "step": 2640 }, { "epoch": 6.691919191919192, "grad_norm": 0.05117692053318024, "learning_rate": 6.616161616161617e-05, "loss": 0.0152, "step": 2650 }, { "epoch": 6.717171717171717, "grad_norm": 0.003273693146184087, "learning_rate": 6.565656565656566e-05, "loss": 0.0314, "step": 2660 }, { "epoch": 6.742424242424242, "grad_norm": 0.005075991619378328, "learning_rate": 6.515151515151516e-05, "loss": 0.0164, "step": 2670 }, { "epoch": 6.767676767676767, "grad_norm": 0.23585616052150726, "learning_rate": 6.464646464646466e-05, "loss": 0.0139, "step": 2680 }, { "epoch": 6.792929292929293, "grad_norm": 6.123977184295654, "learning_rate": 6.414141414141415e-05, "loss": 0.0113, "step": 2690 }, { "epoch": 6.818181818181818, "grad_norm": 2.395871162414551, "learning_rate": 6.363636363636364e-05, "loss": 0.0183, "step": 2700 }, { "epoch": 6.818181818181818, "eval_accuracy": 0.9749328558639212, "eval_loss": 0.11515188962221146, "eval_runtime": 73.0277, "eval_samples_per_second": 15.296, "eval_steps_per_second": 1.917, "step": 2700 }, { "epoch": 6.843434343434343, "grad_norm": 0.005218807607889175, "learning_rate": 6.313131313131313e-05, "loss": 0.003, "step": 2710 }, { "epoch": 6.8686868686868685, "grad_norm": 0.0012497535208240151, "learning_rate": 6.262626262626264e-05, "loss": 0.0116, "step": 2720 }, { "epoch": 6.893939393939394, "grad_norm": 0.0025018516462296247, "learning_rate": 6.212121212121213e-05, "loss": 0.005, "step": 2730 }, { "epoch": 6.91919191919192, "grad_norm": 0.005596707109361887, "learning_rate": 6.161616161616162e-05, "loss": 0.037, "step": 2740 }, { "epoch": 6.944444444444445, "grad_norm": 0.0010910239070653915, "learning_rate": 6.111111111111112e-05, "loss": 0.0338, "step": 2750 }, { "epoch": 6.96969696969697, "grad_norm": 0.6075408458709717, "learning_rate": 6.060606060606061e-05, "loss": 0.0268, "step": 2760 }, { "epoch": 6.994949494949495, "grad_norm": 0.25022584199905396, "learning_rate": 6.01010101010101e-05, "loss": 0.0125, "step": 2770 }, { "epoch": 7.02020202020202, "grad_norm": 0.12169167399406433, "learning_rate": 5.959595959595959e-05, "loss": 0.0082, "step": 2780 }, { "epoch": 7.045454545454546, "grad_norm": 3.5715599060058594, "learning_rate": 5.90909090909091e-05, "loss": 0.0144, "step": 2790 }, { "epoch": 7.070707070707071, "grad_norm": 0.09293267875909805, "learning_rate": 5.858585858585859e-05, "loss": 0.0021, "step": 2800 }, { "epoch": 7.070707070707071, "eval_accuracy": 0.973142345568487, "eval_loss": 0.13676650822162628, "eval_runtime": 72.9405, "eval_samples_per_second": 15.314, "eval_steps_per_second": 1.919, "step": 2800 }, { "epoch": 7.095959595959596, "grad_norm": 0.009541651234030724, "learning_rate": 5.808080808080808e-05, "loss": 0.0058, "step": 2810 }, { "epoch": 7.121212121212121, "grad_norm": 0.0016315419925376773, "learning_rate": 5.757575757575758e-05, "loss": 0.0064, "step": 2820 }, { "epoch": 7.146464646464646, "grad_norm": 10.356843948364258, "learning_rate": 5.707070707070707e-05, "loss": 0.0595, "step": 2830 }, { "epoch": 7.171717171717171, "grad_norm": 0.0018419253174215555, "learning_rate": 5.6565656565656563e-05, "loss": 0.016, "step": 2840 }, { "epoch": 7.196969696969697, "grad_norm": 0.010135513730347157, "learning_rate": 5.606060606060606e-05, "loss": 0.052, "step": 2850 }, { "epoch": 7.222222222222222, "grad_norm": 6.740849494934082, "learning_rate": 5.555555555555556e-05, "loss": 0.0374, "step": 2860 }, { "epoch": 7.247474747474747, "grad_norm": 0.4412079155445099, "learning_rate": 5.5050505050505056e-05, "loss": 0.0117, "step": 2870 }, { "epoch": 7.2727272727272725, "grad_norm": 0.001609967672266066, "learning_rate": 5.4545454545454546e-05, "loss": 0.0824, "step": 2880 }, { "epoch": 7.297979797979798, "grad_norm": 0.005415134131908417, "learning_rate": 5.4040404040404044e-05, "loss": 0.0177, "step": 2890 }, { "epoch": 7.3232323232323235, "grad_norm": 0.02915014885365963, "learning_rate": 5.353535353535354e-05, "loss": 0.046, "step": 2900 }, { "epoch": 7.3232323232323235, "eval_accuracy": 0.9794091316025068, "eval_loss": 0.09002197533845901, "eval_runtime": 73.1136, "eval_samples_per_second": 15.278, "eval_steps_per_second": 1.915, "step": 2900 }, { "epoch": 7.348484848484849, "grad_norm": 0.020192056894302368, "learning_rate": 5.303030303030303e-05, "loss": 0.0004, "step": 2910 }, { "epoch": 7.373737373737374, "grad_norm": 0.7057023644447327, "learning_rate": 5.2525252525252536e-05, "loss": 0.0699, "step": 2920 }, { "epoch": 7.398989898989899, "grad_norm": 0.0018105951603502035, "learning_rate": 5.2020202020202026e-05, "loss": 0.0379, "step": 2930 }, { "epoch": 7.424242424242424, "grad_norm": 0.002236352302134037, "learning_rate": 5.151515151515152e-05, "loss": 0.0576, "step": 2940 }, { "epoch": 7.44949494949495, "grad_norm": 0.46005484461784363, "learning_rate": 5.101010101010101e-05, "loss": 0.0007, "step": 2950 }, { "epoch": 7.474747474747475, "grad_norm": 0.17090271413326263, "learning_rate": 5.050505050505051e-05, "loss": 0.0066, "step": 2960 }, { "epoch": 7.5, "grad_norm": 0.002259742235764861, "learning_rate": 5e-05, "loss": 0.0043, "step": 2970 }, { "epoch": 7.525252525252525, "grad_norm": 0.0029255333356559277, "learning_rate": 4.94949494949495e-05, "loss": 0.0239, "step": 2980 }, { "epoch": 7.55050505050505, "grad_norm": 2.9925894737243652, "learning_rate": 4.898989898989899e-05, "loss": 0.0063, "step": 2990 }, { "epoch": 7.575757575757576, "grad_norm": 0.052914004772901535, "learning_rate": 4.848484848484849e-05, "loss": 0.033, "step": 3000 }, { "epoch": 7.575757575757576, "eval_accuracy": 0.9785138764547896, "eval_loss": 0.10143210738897324, "eval_runtime": 73.4907, "eval_samples_per_second": 15.199, "eval_steps_per_second": 1.905, "step": 3000 }, { "epoch": 7.601010101010101, "grad_norm": 0.04058058559894562, "learning_rate": 4.797979797979798e-05, "loss": 0.0245, "step": 3010 }, { "epoch": 7.626262626262626, "grad_norm": 0.03967829421162605, "learning_rate": 4.7474747474747476e-05, "loss": 0.0006, "step": 3020 }, { "epoch": 7.651515151515151, "grad_norm": 0.621035635471344, "learning_rate": 4.696969696969697e-05, "loss": 0.0175, "step": 3030 }, { "epoch": 7.6767676767676765, "grad_norm": 0.36977216601371765, "learning_rate": 4.6464646464646464e-05, "loss": 0.0388, "step": 3040 }, { "epoch": 7.702020202020202, "grad_norm": 3.2532241344451904, "learning_rate": 4.595959595959596e-05, "loss": 0.0905, "step": 3050 }, { "epoch": 7.7272727272727275, "grad_norm": 0.004156060051172972, "learning_rate": 4.545454545454546e-05, "loss": 0.0002, "step": 3060 }, { "epoch": 7.752525252525253, "grad_norm": 0.6550003290176392, "learning_rate": 4.494949494949495e-05, "loss": 0.0066, "step": 3070 }, { "epoch": 7.777777777777778, "grad_norm": 0.0028251020703464746, "learning_rate": 4.4444444444444447e-05, "loss": 0.0083, "step": 3080 }, { "epoch": 7.803030303030303, "grad_norm": 0.008767428807914257, "learning_rate": 4.3939393939393944e-05, "loss": 0.0006, "step": 3090 }, { "epoch": 7.828282828282829, "grad_norm": 0.04811250418424606, "learning_rate": 4.343434343434344e-05, "loss": 0.0354, "step": 3100 }, { "epoch": 7.828282828282829, "eval_accuracy": 0.9767233661593554, "eval_loss": 0.09683331102132797, "eval_runtime": 73.2348, "eval_samples_per_second": 15.252, "eval_steps_per_second": 1.912, "step": 3100 }, { "epoch": 7.853535353535354, "grad_norm": 0.00525275431573391, "learning_rate": 4.292929292929293e-05, "loss": 0.0088, "step": 3110 }, { "epoch": 7.878787878787879, "grad_norm": 0.015972474589943886, "learning_rate": 4.242424242424243e-05, "loss": 0.0011, "step": 3120 }, { "epoch": 7.904040404040404, "grad_norm": 0.006997071672230959, "learning_rate": 4.191919191919192e-05, "loss": 0.0017, "step": 3130 }, { "epoch": 7.929292929292929, "grad_norm": 0.023101719096302986, "learning_rate": 4.141414141414142e-05, "loss": 0.0567, "step": 3140 }, { "epoch": 7.954545454545455, "grad_norm": 0.003169642062857747, "learning_rate": 4.0909090909090915e-05, "loss": 0.1026, "step": 3150 }, { "epoch": 7.97979797979798, "grad_norm": 0.003613903187215328, "learning_rate": 4.0404040404040405e-05, "loss": 0.005, "step": 3160 }, { "epoch": 8.005050505050505, "grad_norm": 1.0490131378173828, "learning_rate": 3.98989898989899e-05, "loss": 0.0023, "step": 3170 }, { "epoch": 8.030303030303031, "grad_norm": 0.003916851244866848, "learning_rate": 3.939393939393939e-05, "loss": 0.0023, "step": 3180 }, { "epoch": 8.055555555555555, "grad_norm": 0.016336582601070404, "learning_rate": 3.888888888888889e-05, "loss": 0.0079, "step": 3190 }, { "epoch": 8.080808080808081, "grad_norm": 0.8970369696617126, "learning_rate": 3.838383838383838e-05, "loss": 0.0026, "step": 3200 }, { "epoch": 8.080808080808081, "eval_accuracy": 0.973142345568487, "eval_loss": 0.1217464730143547, "eval_runtime": 73.5035, "eval_samples_per_second": 15.197, "eval_steps_per_second": 1.905, "step": 3200 }, { "epoch": 8.106060606060606, "grad_norm": 0.03298179805278778, "learning_rate": 3.787878787878788e-05, "loss": 0.0051, "step": 3210 }, { "epoch": 8.131313131313131, "grad_norm": 0.5918856263160706, "learning_rate": 3.7373737373737376e-05, "loss": 0.032, "step": 3220 }, { "epoch": 8.156565656565657, "grad_norm": 0.0031904878560453653, "learning_rate": 3.686868686868687e-05, "loss": 0.029, "step": 3230 }, { "epoch": 8.181818181818182, "grad_norm": 0.043024152517318726, "learning_rate": 3.6363636363636364e-05, "loss": 0.0003, "step": 3240 }, { "epoch": 8.207070707070708, "grad_norm": 0.011919928714632988, "learning_rate": 3.5858585858585855e-05, "loss": 0.0028, "step": 3250 }, { "epoch": 8.232323232323232, "grad_norm": 0.007164669223129749, "learning_rate": 3.535353535353535e-05, "loss": 0.0146, "step": 3260 }, { "epoch": 8.257575757575758, "grad_norm": 0.03415270894765854, "learning_rate": 3.484848484848485e-05, "loss": 0.0041, "step": 3270 }, { "epoch": 8.282828282828282, "grad_norm": 0.03534342721104622, "learning_rate": 3.434343434343435e-05, "loss": 0.0035, "step": 3280 }, { "epoch": 8.308080808080808, "grad_norm": 0.3735661804676056, "learning_rate": 3.3838383838383844e-05, "loss": 0.0745, "step": 3290 }, { "epoch": 8.333333333333334, "grad_norm": 0.0013512909645214677, "learning_rate": 3.3333333333333335e-05, "loss": 0.0002, "step": 3300 }, { "epoch": 8.333333333333334, "eval_accuracy": 0.9794091316025068, "eval_loss": 0.08283615112304688, "eval_runtime": 73.1651, "eval_samples_per_second": 15.267, "eval_steps_per_second": 1.913, "step": 3300 }, { "epoch": 8.358585858585858, "grad_norm": 0.023621654137969017, "learning_rate": 3.282828282828283e-05, "loss": 0.0174, "step": 3310 }, { "epoch": 8.383838383838384, "grad_norm": 0.006960035767406225, "learning_rate": 3.232323232323233e-05, "loss": 0.0004, "step": 3320 }, { "epoch": 8.409090909090908, "grad_norm": 0.0008190835942514241, "learning_rate": 3.181818181818182e-05, "loss": 0.0374, "step": 3330 }, { "epoch": 8.434343434343434, "grad_norm": 0.016193361952900887, "learning_rate": 3.131313131313132e-05, "loss": 0.0007, "step": 3340 }, { "epoch": 8.45959595959596, "grad_norm": 0.2075665146112442, "learning_rate": 3.080808080808081e-05, "loss": 0.0422, "step": 3350 }, { "epoch": 8.484848484848484, "grad_norm": 0.009178784675896168, "learning_rate": 3.0303030303030306e-05, "loss": 0.0332, "step": 3360 }, { "epoch": 8.51010101010101, "grad_norm": 8.036938667297363, "learning_rate": 2.9797979797979796e-05, "loss": 0.0436, "step": 3370 }, { "epoch": 8.535353535353535, "grad_norm": 0.0013093262678012252, "learning_rate": 2.9292929292929294e-05, "loss": 0.0109, "step": 3380 }, { "epoch": 8.56060606060606, "grad_norm": 0.0033100605942308903, "learning_rate": 2.878787878787879e-05, "loss": 0.0011, "step": 3390 }, { "epoch": 8.585858585858587, "grad_norm": 0.0015343882841989398, "learning_rate": 2.8282828282828282e-05, "loss": 0.0006, "step": 3400 }, { "epoch": 8.585858585858587, "eval_accuracy": 0.9794091316025068, "eval_loss": 0.09259337186813354, "eval_runtime": 72.8639, "eval_samples_per_second": 15.33, "eval_steps_per_second": 1.921, "step": 3400 }, { "epoch": 8.61111111111111, "grad_norm": 0.030406756326556206, "learning_rate": 2.777777777777778e-05, "loss": 0.0026, "step": 3410 }, { "epoch": 8.636363636363637, "grad_norm": 0.0022419544402509928, "learning_rate": 2.7272727272727273e-05, "loss": 0.0007, "step": 3420 }, { "epoch": 8.66161616161616, "grad_norm": 0.0011131414212286472, "learning_rate": 2.676767676767677e-05, "loss": 0.0006, "step": 3430 }, { "epoch": 8.686868686868687, "grad_norm": 0.005616435315459967, "learning_rate": 2.6262626262626268e-05, "loss": 0.0003, "step": 3440 }, { "epoch": 8.712121212121213, "grad_norm": 0.1008942499756813, "learning_rate": 2.575757575757576e-05, "loss": 0.0097, "step": 3450 }, { "epoch": 8.737373737373737, "grad_norm": 0.002821123693138361, "learning_rate": 2.5252525252525256e-05, "loss": 0.0669, "step": 3460 }, { "epoch": 8.762626262626263, "grad_norm": 0.013286658562719822, "learning_rate": 2.474747474747475e-05, "loss": 0.0265, "step": 3470 }, { "epoch": 8.787878787878787, "grad_norm": 0.003963208291679621, "learning_rate": 2.4242424242424244e-05, "loss": 0.0178, "step": 3480 }, { "epoch": 8.813131313131313, "grad_norm": 0.002018690574914217, "learning_rate": 2.3737373737373738e-05, "loss": 0.0082, "step": 3490 }, { "epoch": 8.83838383838384, "grad_norm": 0.1014542207121849, "learning_rate": 2.3232323232323232e-05, "loss": 0.0006, "step": 3500 }, { "epoch": 8.83838383838384, "eval_accuracy": 0.9794091316025068, "eval_loss": 0.10012003779411316, "eval_runtime": 73.1859, "eval_samples_per_second": 15.263, "eval_steps_per_second": 1.913, "step": 3500 }, { "epoch": 8.863636363636363, "grad_norm": 0.002746024401858449, "learning_rate": 2.272727272727273e-05, "loss": 0.0063, "step": 3510 }, { "epoch": 8.88888888888889, "grad_norm": 0.0018340348033234477, "learning_rate": 2.2222222222222223e-05, "loss": 0.0024, "step": 3520 }, { "epoch": 8.914141414141413, "grad_norm": 0.004108617547899485, "learning_rate": 2.171717171717172e-05, "loss": 0.0083, "step": 3530 }, { "epoch": 8.93939393939394, "grad_norm": 0.00315410690382123, "learning_rate": 2.1212121212121215e-05, "loss": 0.0462, "step": 3540 }, { "epoch": 8.964646464646465, "grad_norm": 0.024781817570328712, "learning_rate": 2.070707070707071e-05, "loss": 0.0029, "step": 3550 }, { "epoch": 8.98989898989899, "grad_norm": 0.005382045172154903, "learning_rate": 2.0202020202020203e-05, "loss": 0.0047, "step": 3560 }, { "epoch": 9.015151515151516, "grad_norm": 1.6344341039657593, "learning_rate": 1.9696969696969697e-05, "loss": 0.0038, "step": 3570 }, { "epoch": 9.04040404040404, "grad_norm": 0.010318132117390633, "learning_rate": 1.919191919191919e-05, "loss": 0.0096, "step": 3580 }, { "epoch": 9.065656565656566, "grad_norm": 0.0016402292530983686, "learning_rate": 1.8686868686868688e-05, "loss": 0.0321, "step": 3590 }, { "epoch": 9.090909090909092, "grad_norm": 0.004027374088764191, "learning_rate": 1.8181818181818182e-05, "loss": 0.0006, "step": 3600 }, { "epoch": 9.090909090909092, "eval_accuracy": 0.9847806624888094, "eval_loss": 0.08629997074604034, "eval_runtime": 73.127, "eval_samples_per_second": 15.275, "eval_steps_per_second": 1.914, "step": 3600 }, { "epoch": 9.116161616161616, "grad_norm": 0.0007902685320004821, "learning_rate": 1.7676767676767676e-05, "loss": 0.0059, "step": 3610 }, { "epoch": 9.141414141414142, "grad_norm": 0.0024135063868016005, "learning_rate": 1.7171717171717173e-05, "loss": 0.0269, "step": 3620 }, { "epoch": 9.166666666666666, "grad_norm": 0.026507705450057983, "learning_rate": 1.6666666666666667e-05, "loss": 0.0003, "step": 3630 }, { "epoch": 9.191919191919192, "grad_norm": 0.10678762197494507, "learning_rate": 1.6161616161616165e-05, "loss": 0.0059, "step": 3640 }, { "epoch": 9.217171717171718, "grad_norm": 0.08362487703561783, "learning_rate": 1.565656565656566e-05, "loss": 0.0545, "step": 3650 }, { "epoch": 9.242424242424242, "grad_norm": 0.002414940157905221, "learning_rate": 1.5151515151515153e-05, "loss": 0.0221, "step": 3660 }, { "epoch": 9.267676767676768, "grad_norm": 0.0013868235982954502, "learning_rate": 1.4646464646464647e-05, "loss": 0.0005, "step": 3670 }, { "epoch": 9.292929292929292, "grad_norm": 0.0013921884819865227, "learning_rate": 1.4141414141414141e-05, "loss": 0.041, "step": 3680 }, { "epoch": 9.318181818181818, "grad_norm": 0.08867702633142471, "learning_rate": 1.3636363636363637e-05, "loss": 0.026, "step": 3690 }, { "epoch": 9.343434343434343, "grad_norm": 0.0012104762718081474, "learning_rate": 1.3131313131313134e-05, "loss": 0.0633, "step": 3700 }, { "epoch": 9.343434343434343, "eval_accuracy": 0.9803043867502238, "eval_loss": 0.09109070897102356, "eval_runtime": 71.4974, "eval_samples_per_second": 15.623, "eval_steps_per_second": 1.958, "step": 3700 }, { "epoch": 9.368686868686869, "grad_norm": 0.007544935215264559, "learning_rate": 1.2626262626262628e-05, "loss": 0.002, "step": 3710 }, { "epoch": 9.393939393939394, "grad_norm": 0.01898648589849472, "learning_rate": 1.2121212121212122e-05, "loss": 0.0005, "step": 3720 }, { "epoch": 9.419191919191919, "grad_norm": 0.00644712382927537, "learning_rate": 1.1616161616161616e-05, "loss": 0.0059, "step": 3730 }, { "epoch": 9.444444444444445, "grad_norm": 0.00872492603957653, "learning_rate": 1.1111111111111112e-05, "loss": 0.0011, "step": 3740 }, { "epoch": 9.469696969696969, "grad_norm": 1.6075825691223145, "learning_rate": 1.0606060606060607e-05, "loss": 0.0099, "step": 3750 }, { "epoch": 9.494949494949495, "grad_norm": 6.320465087890625, "learning_rate": 1.0101010101010101e-05, "loss": 0.0163, "step": 3760 }, { "epoch": 9.52020202020202, "grad_norm": 0.0037208800204098225, "learning_rate": 9.595959595959595e-06, "loss": 0.0002, "step": 3770 }, { "epoch": 9.545454545454545, "grad_norm": 3.3599369525909424, "learning_rate": 9.090909090909091e-06, "loss": 0.0053, "step": 3780 }, { "epoch": 9.570707070707071, "grad_norm": 0.5879691243171692, "learning_rate": 8.585858585858587e-06, "loss": 0.0019, "step": 3790 }, { "epoch": 9.595959595959595, "grad_norm": 0.26342862844467163, "learning_rate": 8.080808080808082e-06, "loss": 0.0009, "step": 3800 }, { "epoch": 9.595959595959595, "eval_accuracy": 0.982094897045658, "eval_loss": 0.09413682669401169, "eval_runtime": 73.1451, "eval_samples_per_second": 15.271, "eval_steps_per_second": 1.914, "step": 3800 }, { "epoch": 9.621212121212121, "grad_norm": 0.042649831622838974, "learning_rate": 7.5757575757575764e-06, "loss": 0.0226, "step": 3810 }, { "epoch": 9.646464646464647, "grad_norm": 0.0022528120316565037, "learning_rate": 7.0707070707070704e-06, "loss": 0.0136, "step": 3820 }, { "epoch": 9.671717171717171, "grad_norm": 0.12108311802148819, "learning_rate": 6.565656565656567e-06, "loss": 0.0408, "step": 3830 }, { "epoch": 9.696969696969697, "grad_norm": 0.7086867690086365, "learning_rate": 6.060606060606061e-06, "loss": 0.0035, "step": 3840 }, { "epoch": 9.722222222222221, "grad_norm": 0.049748744815588, "learning_rate": 5.555555555555556e-06, "loss": 0.0012, "step": 3850 }, { "epoch": 9.747474747474747, "grad_norm": 0.004345474299043417, "learning_rate": 5.050505050505051e-06, "loss": 0.0002, "step": 3860 }, { "epoch": 9.772727272727273, "grad_norm": 0.005164165981113911, "learning_rate": 4.5454545454545455e-06, "loss": 0.0049, "step": 3870 }, { "epoch": 9.797979797979798, "grad_norm": 0.003518365090712905, "learning_rate": 4.040404040404041e-06, "loss": 0.002, "step": 3880 }, { "epoch": 9.823232323232324, "grad_norm": 0.0017797194886952639, "learning_rate": 3.5353535353535352e-06, "loss": 0.0005, "step": 3890 }, { "epoch": 9.848484848484848, "grad_norm": 4.788568496704102, "learning_rate": 3.0303030303030305e-06, "loss": 0.0247, "step": 3900 }, { "epoch": 9.848484848484848, "eval_accuracy": 0.9785138764547896, "eval_loss": 0.09876807779073715, "eval_runtime": 73.1729, "eval_samples_per_second": 15.265, "eval_steps_per_second": 1.913, "step": 3900 }, { "epoch": 9.873737373737374, "grad_norm": 0.0013341947924345732, "learning_rate": 2.5252525252525253e-06, "loss": 0.0082, "step": 3910 }, { "epoch": 9.8989898989899, "grad_norm": 0.004278136417269707, "learning_rate": 2.0202020202020206e-06, "loss": 0.0019, "step": 3920 }, { "epoch": 9.924242424242424, "grad_norm": 0.002301498083397746, "learning_rate": 1.5151515151515152e-06, "loss": 0.0245, "step": 3930 }, { "epoch": 9.94949494949495, "grad_norm": 0.000858976156450808, "learning_rate": 1.0101010101010103e-06, "loss": 0.0013, "step": 3940 }, { "epoch": 9.974747474747474, "grad_norm": 0.007369679398834705, "learning_rate": 5.050505050505052e-07, "loss": 0.0774, "step": 3950 }, { "epoch": 10.0, "grad_norm": 0.008844327181577682, "learning_rate": 0.0, "loss": 0.0004, "step": 3960 }, { "epoch": 10.0, "step": 3960, "total_flos": 4.904158054749069e+18, "train_loss": 0.06213315485569771, "train_runtime": 7084.9204, "train_samples_per_second": 8.93, "train_steps_per_second": 0.559 } ], "logging_steps": 10, "max_steps": 3960, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.904158054749069e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }