diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,1254 +1,3150 @@ { - "best_metric": 0.33288896083831787, - "best_model_checkpoint": "realFake-img/checkpoint-700", - "epoch": 4.0, + "best_metric": 0.08193562924861908, + "best_model_checkpoint": "realFake-img/checkpoint-2500", + "epoch": 10.0, "eval_steps": 100, - "global_step": 1560, + "global_step": 3960, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.02564102564102564, - "grad_norm": 2.928205728530884, - "learning_rate": 0.00019871794871794874, - "loss": 1.1209, + "epoch": 0.025252525252525252, + "grad_norm": 8.32949447631836, + "learning_rate": 0.0001994949494949495, + "loss": 0.1124, "step": 10 }, { - "epoch": 0.05128205128205128, - "grad_norm": 3.1282591819763184, - "learning_rate": 0.00019743589743589744, - "loss": 0.6435, + "epoch": 0.050505050505050504, + "grad_norm": 4.660865306854248, + "learning_rate": 0.000198989898989899, + "loss": 0.2631, "step": 20 }, { - "epoch": 0.07692307692307693, - "grad_norm": 2.2042582035064697, - "learning_rate": 0.00019615384615384615, - "loss": 0.6513, + "epoch": 0.07575757575757576, + "grad_norm": 4.1171956062316895, + "learning_rate": 0.0001984848484848485, + "loss": 0.1366, "step": 30 }, { - "epoch": 0.10256410256410256, - "grad_norm": 4.91236686706543, - "learning_rate": 0.00019487179487179487, - "loss": 0.7536, + "epoch": 0.10101010101010101, + "grad_norm": 4.586099147796631, + "learning_rate": 0.000197979797979798, + "loss": 0.1395, "step": 40 }, { - "epoch": 0.1282051282051282, - "grad_norm": 2.019882917404175, - "learning_rate": 0.0001935897435897436, - "loss": 0.6197, + "epoch": 0.12626262626262627, + "grad_norm": 3.6707675457000732, + "learning_rate": 0.0001974747474747475, + "loss": 0.178, "step": 50 }, { - "epoch": 0.15384615384615385, - "grad_norm": 2.4161789417266846, - "learning_rate": 0.00019230769230769233, - "loss": 0.5531, + "epoch": 0.15151515151515152, + "grad_norm": 0.39073047041893005, + "learning_rate": 0.00019696969696969698, + "loss": 0.2038, "step": 60 }, { - "epoch": 0.1794871794871795, - "grad_norm": 2.189767360687256, - "learning_rate": 0.00019102564102564104, - "loss": 0.4985, + "epoch": 0.17676767676767677, + "grad_norm": 3.4298012256622314, + "learning_rate": 0.0001964646464646465, + "loss": 0.0964, "step": 70 }, { - "epoch": 0.20512820512820512, - "grad_norm": 3.2690813541412354, - "learning_rate": 0.00018974358974358974, - "loss": 0.5499, + "epoch": 0.20202020202020202, + "grad_norm": 4.532003402709961, + "learning_rate": 0.00019595959595959596, + "loss": 0.171, "step": 80 }, { - "epoch": 0.23076923076923078, - "grad_norm": 1.6541521549224854, - "learning_rate": 0.00018846153846153847, - "loss": 0.5365, + "epoch": 0.22727272727272727, + "grad_norm": 2.3665497303009033, + "learning_rate": 0.00019545454545454548, + "loss": 0.1166, "step": 90 }, { - "epoch": 0.2564102564102564, - "grad_norm": 1.9506237506866455, - "learning_rate": 0.0001871794871794872, - "loss": 0.4892, + "epoch": 0.25252525252525254, + "grad_norm": 1.0514458417892456, + "learning_rate": 0.00019494949494949494, + "loss": 0.2578, "step": 100 }, { - "epoch": 0.2564102564102564, - "eval_accuracy": 0.7227272727272728, - "eval_loss": 0.5756350159645081, - "eval_runtime": 24.0859, - "eval_samples_per_second": 45.67, - "eval_steps_per_second": 5.73, + "epoch": 0.25252525252525254, + "eval_accuracy": 0.9418084153983886, + "eval_loss": 0.1593756079673767, + "eval_runtime": 72.9833, + "eval_samples_per_second": 15.305, + "eval_steps_per_second": 1.918, "step": 100 }, { - "epoch": 0.28205128205128205, - "grad_norm": 3.7501442432403564, - "learning_rate": 0.0001858974358974359, - "loss": 0.4306, + "epoch": 0.2777777777777778, + "grad_norm": 2.9928767681121826, + "learning_rate": 0.00019444444444444446, + "loss": 0.1794, "step": 110 }, { - "epoch": 0.3076923076923077, - "grad_norm": 2.692314386367798, - "learning_rate": 0.00018461538461538463, - "loss": 0.5122, + "epoch": 0.30303030303030304, + "grad_norm": 0.6943581104278564, + "learning_rate": 0.00019393939393939395, + "loss": 0.1713, "step": 120 }, { - "epoch": 0.3333333333333333, - "grad_norm": 2.5989458560943604, - "learning_rate": 0.00018333333333333334, - "loss": 0.4974, + "epoch": 0.3282828282828283, + "grad_norm": 5.296023845672607, + "learning_rate": 0.00019343434343434344, + "loss": 0.1822, "step": 130 }, { - "epoch": 0.358974358974359, - "grad_norm": 1.8959237337112427, - "learning_rate": 0.00018205128205128207, - "loss": 0.4464, + "epoch": 0.35353535353535354, + "grad_norm": 4.849494934082031, + "learning_rate": 0.00019292929292929293, + "loss": 0.1667, "step": 140 }, { - "epoch": 0.38461538461538464, - "grad_norm": 1.9950543642044067, - "learning_rate": 0.00018076923076923077, - "loss": 0.4366, + "epoch": 0.3787878787878788, + "grad_norm": 2.1953601837158203, + "learning_rate": 0.00019242424242424245, + "loss": 0.1353, "step": 150 }, { - "epoch": 0.41025641025641024, - "grad_norm": 1.4334654808044434, - "learning_rate": 0.0001794871794871795, - "loss": 0.5248, + "epoch": 0.40404040404040403, + "grad_norm": 3.5325512886047363, + "learning_rate": 0.00019191919191919191, + "loss": 0.2191, "step": 160 }, { - "epoch": 0.4358974358974359, - "grad_norm": 1.7422298192977905, - "learning_rate": 0.00017820512820512823, - "loss": 0.4314, + "epoch": 0.4292929292929293, + "grad_norm": 1.513462781906128, + "learning_rate": 0.00019141414141414143, + "loss": 0.0864, "step": 170 }, { - "epoch": 0.46153846153846156, - "grad_norm": 1.0945038795471191, - "learning_rate": 0.00017692307692307693, - "loss": 0.3464, + "epoch": 0.45454545454545453, + "grad_norm": 1.1227214336395264, + "learning_rate": 0.00019090909090909092, + "loss": 0.0972, "step": 180 }, { - "epoch": 0.48717948717948717, - "grad_norm": 2.3801605701446533, - "learning_rate": 0.00017564102564102566, - "loss": 0.4059, + "epoch": 0.4797979797979798, + "grad_norm": 4.3201212882995605, + "learning_rate": 0.0001904040404040404, + "loss": 0.1356, "step": 190 }, { - "epoch": 0.5128205128205128, - "grad_norm": 1.6624411344528198, - "learning_rate": 0.00017435897435897436, - "loss": 0.683, + "epoch": 0.5050505050505051, + "grad_norm": 0.13399846851825714, + "learning_rate": 0.0001898989898989899, + "loss": 0.0944, "step": 200 }, { - "epoch": 0.5128205128205128, - "eval_accuracy": 0.6372727272727273, - "eval_loss": 0.6742109656333923, - "eval_runtime": 24.1183, - "eval_samples_per_second": 45.609, - "eval_steps_per_second": 5.722, + "epoch": 0.5050505050505051, + "eval_accuracy": 0.937332139659803, + "eval_loss": 0.22425174713134766, + "eval_runtime": 72.9458, + "eval_samples_per_second": 15.313, + "eval_steps_per_second": 1.919, "step": 200 }, { - "epoch": 0.5384615384615384, - "grad_norm": 3.1722240447998047, - "learning_rate": 0.0001730769230769231, - "loss": 0.4548, + "epoch": 0.5303030303030303, + "grad_norm": 0.07937999069690704, + "learning_rate": 0.00018939393939393942, + "loss": 0.0798, "step": 210 }, { - "epoch": 0.5641025641025641, - "grad_norm": 2.0260214805603027, - "learning_rate": 0.0001717948717948718, - "loss": 0.4612, + "epoch": 0.5555555555555556, + "grad_norm": 6.126536846160889, + "learning_rate": 0.00018888888888888888, + "loss": 0.2437, "step": 220 }, { - "epoch": 0.5897435897435898, - "grad_norm": 2.1661155223846436, - "learning_rate": 0.00017051282051282053, - "loss": 0.3824, + "epoch": 0.5808080808080808, + "grad_norm": 8.01685619354248, + "learning_rate": 0.0001883838383838384, + "loss": 0.2746, "step": 230 }, { - "epoch": 0.6153846153846154, - "grad_norm": 2.2094335556030273, - "learning_rate": 0.00016923076923076923, - "loss": 0.3815, + "epoch": 0.6060606060606061, + "grad_norm": 3.1425938606262207, + "learning_rate": 0.0001878787878787879, + "loss": 0.1937, "step": 240 }, { - "epoch": 0.6410256410256411, - "grad_norm": 2.571754217147827, - "learning_rate": 0.00016794871794871796, - "loss": 0.3743, + "epoch": 0.6313131313131313, + "grad_norm": 1.1262303590774536, + "learning_rate": 0.00018737373737373738, + "loss": 0.2495, "step": 250 }, { - "epoch": 0.6666666666666666, - "grad_norm": 1.545766830444336, - "learning_rate": 0.0001666666666666667, - "loss": 0.3708, + "epoch": 0.6565656565656566, + "grad_norm": 3.994985342025757, + "learning_rate": 0.00018686868686868687, + "loss": 0.0914, "step": 260 }, { - "epoch": 0.6923076923076923, - "grad_norm": 2.742072343826294, - "learning_rate": 0.0001653846153846154, - "loss": 0.3729, + "epoch": 0.6818181818181818, + "grad_norm": 3.6686558723449707, + "learning_rate": 0.00018636363636363636, + "loss": 0.1241, "step": 270 }, { - "epoch": 0.717948717948718, - "grad_norm": 2.677527904510498, - "learning_rate": 0.0001641025641025641, - "loss": 0.3555, + "epoch": 0.7070707070707071, + "grad_norm": 2.8421552181243896, + "learning_rate": 0.00018585858585858586, + "loss": 0.162, "step": 280 }, { - "epoch": 0.7435897435897436, - "grad_norm": 1.6417704820632935, - "learning_rate": 0.00016282051282051282, - "loss": 0.4212, + "epoch": 0.7323232323232324, + "grad_norm": 0.06576777994632721, + "learning_rate": 0.00018535353535353537, + "loss": 0.0863, "step": 290 }, { - "epoch": 0.7692307692307693, - "grad_norm": 2.6961071491241455, - "learning_rate": 0.00016153846153846155, - "loss": 0.3737, + "epoch": 0.7575757575757576, + "grad_norm": 3.127112865447998, + "learning_rate": 0.00018484848484848484, + "loss": 0.1747, "step": 300 }, { - "epoch": 0.7692307692307693, - "eval_accuracy": 0.7554545454545455, - "eval_loss": 0.5462190508842468, - "eval_runtime": 24.1467, - "eval_samples_per_second": 45.555, - "eval_steps_per_second": 5.715, + "epoch": 0.7575757575757576, + "eval_accuracy": 0.9292748433303492, + "eval_loss": 0.24716989696025848, + "eval_runtime": 73.2274, + "eval_samples_per_second": 15.254, + "eval_steps_per_second": 1.912, "step": 300 }, { - "epoch": 0.7948717948717948, - "grad_norm": 3.5049915313720703, - "learning_rate": 0.00016025641025641028, - "loss": 0.3995, + "epoch": 0.7828282828282829, + "grad_norm": 1.235567569732666, + "learning_rate": 0.00018434343434343435, + "loss": 0.0742, "step": 310 }, { - "epoch": 0.8205128205128205, - "grad_norm": 2.599503517150879, - "learning_rate": 0.00015897435897435896, - "loss": 0.3805, + "epoch": 0.8080808080808081, + "grad_norm": 5.305884838104248, + "learning_rate": 0.00018383838383838384, + "loss": 0.1013, "step": 320 }, { - "epoch": 0.8461538461538461, - "grad_norm": 2.3924107551574707, - "learning_rate": 0.0001576923076923077, - "loss": 0.3586, + "epoch": 0.8333333333333334, + "grad_norm": 3.124811887741089, + "learning_rate": 0.00018333333333333334, + "loss": 0.2439, "step": 330 }, { - "epoch": 0.8717948717948718, - "grad_norm": 3.0169456005096436, - "learning_rate": 0.00015641025641025642, - "loss": 0.3964, + "epoch": 0.8585858585858586, + "grad_norm": 5.361472129821777, + "learning_rate": 0.00018282828282828283, + "loss": 0.0468, "step": 340 }, { - "epoch": 0.8974358974358975, - "grad_norm": 3.907792091369629, - "learning_rate": 0.00015512820512820515, - "loss": 0.3915, + "epoch": 0.8838383838383839, + "grad_norm": 3.3062198162078857, + "learning_rate": 0.00018232323232323234, + "loss": 0.0855, "step": 350 }, { - "epoch": 0.9230769230769231, - "grad_norm": 1.2597954273223877, - "learning_rate": 0.00015384615384615385, - "loss": 0.3471, + "epoch": 0.9090909090909091, + "grad_norm": 1.9714092016220093, + "learning_rate": 0.00018181818181818183, + "loss": 0.1645, "step": 360 }, { - "epoch": 0.9487179487179487, - "grad_norm": 4.323169231414795, - "learning_rate": 0.00015256410256410255, - "loss": 0.2476, + "epoch": 0.9343434343434344, + "grad_norm": 1.7579039335250854, + "learning_rate": 0.00018131313131313132, + "loss": 0.193, "step": 370 }, { - "epoch": 0.9743589743589743, - "grad_norm": 3.7260568141937256, - "learning_rate": 0.00015128205128205128, - "loss": 0.41, + "epoch": 0.9595959595959596, + "grad_norm": 3.588534355163574, + "learning_rate": 0.00018080808080808082, + "loss": 0.1305, "step": 380 }, { - "epoch": 1.0, - "grad_norm": 0.22700244188308716, - "learning_rate": 0.00015000000000000001, - "loss": 0.3773, + "epoch": 0.9848484848484849, + "grad_norm": 6.151834487915039, + "learning_rate": 0.0001803030303030303, + "loss": 0.1004, "step": 390 }, { - "epoch": 1.0256410256410255, - "grad_norm": 1.8512117862701416, - "learning_rate": 0.00014871794871794872, - "loss": 0.3554, + "epoch": 1.0101010101010102, + "grad_norm": 3.521318197250366, + "learning_rate": 0.0001797979797979798, + "loss": 0.1328, "step": 400 }, { - "epoch": 1.0256410256410255, - "eval_accuracy": 0.8009090909090909, - "eval_loss": 0.4354061186313629, - "eval_runtime": 24.0034, - "eval_samples_per_second": 45.827, - "eval_steps_per_second": 5.749, + "epoch": 1.0101010101010102, + "eval_accuracy": 0.9337511190689346, + "eval_loss": 0.17739379405975342, + "eval_runtime": 72.9497, + "eval_samples_per_second": 15.312, + "eval_steps_per_second": 1.919, "step": 400 }, { - "epoch": 1.0512820512820513, - "grad_norm": 1.9039109945297241, - "learning_rate": 0.00014743589743589745, - "loss": 0.3033, + "epoch": 1.0353535353535352, + "grad_norm": 0.5116239786148071, + "learning_rate": 0.00017929292929292931, + "loss": 0.0932, "step": 410 }, { - "epoch": 1.0769230769230769, - "grad_norm": 2.7765700817108154, - "learning_rate": 0.00014615384615384615, - "loss": 0.2594, + "epoch": 1.0606060606060606, + "grad_norm": 0.37958571314811707, + "learning_rate": 0.0001787878787878788, + "loss": 0.0538, "step": 420 }, { - "epoch": 1.1025641025641026, - "grad_norm": 2.247612953186035, - "learning_rate": 0.00014487179487179488, - "loss": 0.3434, + "epoch": 1.0858585858585859, + "grad_norm": 3.976700782775879, + "learning_rate": 0.0001782828282828283, + "loss": 0.2245, "step": 430 }, { - "epoch": 1.1282051282051282, - "grad_norm": 1.161192536354065, - "learning_rate": 0.0001435897435897436, - "loss": 0.2653, + "epoch": 1.1111111111111112, + "grad_norm": 2.8285045623779297, + "learning_rate": 0.00017777777777777779, + "loss": 0.1332, "step": 440 }, { - "epoch": 1.1538461538461537, - "grad_norm": 2.9420008659362793, - "learning_rate": 0.0001423076923076923, - "loss": 0.3184, + "epoch": 1.1363636363636362, + "grad_norm": 3.683419704437256, + "learning_rate": 0.00017727272727272728, + "loss": 0.1162, "step": 450 }, { - "epoch": 1.1794871794871795, - "grad_norm": 2.359160900115967, - "learning_rate": 0.00014102564102564104, - "loss": 0.4049, + "epoch": 1.1616161616161615, + "grad_norm": 4.30293607711792, + "learning_rate": 0.0001767676767676768, + "loss": 0.0678, "step": 460 }, { - "epoch": 1.205128205128205, - "grad_norm": 1.5929157733917236, - "learning_rate": 0.00013974358974358974, - "loss": 0.371, + "epoch": 1.1868686868686869, + "grad_norm": 0.15934455394744873, + "learning_rate": 0.00017626262626262626, + "loss": 0.1587, "step": 470 }, { - "epoch": 1.2307692307692308, - "grad_norm": 3.8561315536499023, - "learning_rate": 0.00013846153846153847, - "loss": 0.2864, + "epoch": 1.2121212121212122, + "grad_norm": 1.5525578260421753, + "learning_rate": 0.00017575757575757578, + "loss": 0.0637, "step": 480 }, { - "epoch": 1.2564102564102564, - "grad_norm": 2.111147403717041, - "learning_rate": 0.00013717948717948718, - "loss": 0.3173, + "epoch": 1.2373737373737375, + "grad_norm": 1.534348964691162, + "learning_rate": 0.00017525252525252527, + "loss": 0.1103, "step": 490 }, { - "epoch": 1.282051282051282, - "grad_norm": 1.6825300455093384, - "learning_rate": 0.0001358974358974359, - "loss": 0.2368, + "epoch": 1.2626262626262625, + "grad_norm": 1.6843178272247314, + "learning_rate": 0.00017474747474747476, + "loss": 0.1918, "step": 500 }, { - "epoch": 1.282051282051282, - "eval_accuracy": 0.8309090909090909, - "eval_loss": 0.4046396017074585, - "eval_runtime": 24.1395, - "eval_samples_per_second": 45.568, - "eval_steps_per_second": 5.717, + "epoch": 1.2626262626262625, + "eval_accuracy": 0.9570277529095792, + "eval_loss": 0.12820282578468323, + "eval_runtime": 73.1443, + "eval_samples_per_second": 15.271, + "eval_steps_per_second": 1.914, "step": 500 }, { - "epoch": 1.3076923076923077, - "grad_norm": 1.7883163690567017, - "learning_rate": 0.00013461538461538464, - "loss": 0.2616, + "epoch": 1.2878787878787878, + "grad_norm": 0.6296999454498291, + "learning_rate": 0.00017424242424242425, + "loss": 0.0461, "step": 510 }, { - "epoch": 1.3333333333333333, - "grad_norm": 3.3474502563476562, - "learning_rate": 0.00013333333333333334, - "loss": 0.292, + "epoch": 1.3131313131313131, + "grad_norm": 4.980341911315918, + "learning_rate": 0.00017373737373737377, + "loss": 0.1479, "step": 520 }, { - "epoch": 1.358974358974359, - "grad_norm": 1.9872941970825195, - "learning_rate": 0.00013205128205128204, - "loss": 0.3284, + "epoch": 1.3383838383838385, + "grad_norm": 0.36140933632850647, + "learning_rate": 0.00017323232323232323, + "loss": 0.0726, "step": 530 }, { - "epoch": 1.3846153846153846, - "grad_norm": 1.508928894996643, - "learning_rate": 0.00013076923076923077, - "loss": 0.3132, + "epoch": 1.3636363636363638, + "grad_norm": 0.2907123267650604, + "learning_rate": 0.00017272727272727275, + "loss": 0.1109, "step": 540 }, { - "epoch": 1.4102564102564101, - "grad_norm": 2.3678171634674072, - "learning_rate": 0.0001294871794871795, - "loss": 0.2698, + "epoch": 1.3888888888888888, + "grad_norm": 1.1450049877166748, + "learning_rate": 0.00017222222222222224, + "loss": 0.0888, "step": 550 }, { - "epoch": 1.435897435897436, - "grad_norm": 3.961099147796631, - "learning_rate": 0.00012820512820512823, - "loss": 0.2522, + "epoch": 1.4141414141414141, + "grad_norm": 3.324134588241577, + "learning_rate": 0.00017171717171717173, + "loss": 0.1074, "step": 560 }, { - "epoch": 1.4615384615384617, - "grad_norm": 1.7161500453948975, - "learning_rate": 0.00012692307692307693, - "loss": 0.3514, + "epoch": 1.4393939393939394, + "grad_norm": 0.9428613185882568, + "learning_rate": 0.00017121212121212122, + "loss": 0.0856, "step": 570 }, { - "epoch": 1.4871794871794872, - "grad_norm": 2.0210063457489014, - "learning_rate": 0.00012564102564102564, - "loss": 0.2064, + "epoch": 1.4646464646464645, + "grad_norm": 0.1330060064792633, + "learning_rate": 0.0001707070707070707, + "loss": 0.061, "step": 580 }, { - "epoch": 1.5128205128205128, - "grad_norm": 1.9867080450057983, - "learning_rate": 0.00012435897435897437, - "loss": 0.3796, + "epoch": 1.4898989898989898, + "grad_norm": 4.435102939605713, + "learning_rate": 0.0001702020202020202, + "loss": 0.1137, "step": 590 }, { - "epoch": 1.5384615384615383, - "grad_norm": 5.6288981437683105, - "learning_rate": 0.0001230769230769231, - "loss": 0.3696, + "epoch": 1.5151515151515151, + "grad_norm": 2.5744283199310303, + "learning_rate": 0.00016969696969696972, + "loss": 0.169, "step": 600 }, { - "epoch": 1.5384615384615383, - "eval_accuracy": 0.7809090909090909, - "eval_loss": 0.5547047257423401, - "eval_runtime": 22.8194, - "eval_samples_per_second": 48.205, - "eval_steps_per_second": 6.047, + "epoch": 1.5151515151515151, + "eval_accuracy": 0.9346463742166518, + "eval_loss": 0.2247086614370346, + "eval_runtime": 73.2754, + "eval_samples_per_second": 15.244, + "eval_steps_per_second": 1.911, "step": 600 }, { - "epoch": 1.564102564102564, - "grad_norm": 2.3821280002593994, - "learning_rate": 0.00012179487179487179, - "loss": 0.169, + "epoch": 1.5404040404040404, + "grad_norm": 3.7209930419921875, + "learning_rate": 0.00016919191919191918, + "loss": 0.1929, "step": 610 }, { - "epoch": 1.5897435897435899, - "grad_norm": 2.7712907791137695, - "learning_rate": 0.00012051282051282052, - "loss": 0.294, + "epoch": 1.5656565656565657, + "grad_norm": 4.9047322273254395, + "learning_rate": 0.0001686868686868687, + "loss": 0.144, "step": 620 }, { - "epoch": 1.6153846153846154, - "grad_norm": 2.803150177001953, - "learning_rate": 0.00011923076923076923, - "loss": 0.3544, + "epoch": 1.5909090909090908, + "grad_norm": 8.181381225585938, + "learning_rate": 0.0001681818181818182, + "loss": 0.1008, "step": 630 }, { - "epoch": 1.641025641025641, - "grad_norm": 2.660898447036743, - "learning_rate": 0.00011794871794871796, - "loss": 0.2811, + "epoch": 1.6161616161616161, + "grad_norm": 0.5650784969329834, + "learning_rate": 0.00016767676767676768, + "loss": 0.1385, "step": 640 }, { - "epoch": 1.6666666666666665, - "grad_norm": 1.4722263813018799, - "learning_rate": 0.00011666666666666668, - "loss": 0.26, + "epoch": 1.6414141414141414, + "grad_norm": 0.4483976364135742, + "learning_rate": 0.00016717171717171717, + "loss": 0.1112, "step": 650 }, { - "epoch": 1.6923076923076923, - "grad_norm": 2.1373977661132812, - "learning_rate": 0.00011538461538461538, - "loss": 0.2282, + "epoch": 1.6666666666666665, + "grad_norm": 2.8870067596435547, + "learning_rate": 0.0001666666666666667, + "loss": 0.0868, "step": 660 }, { - "epoch": 1.717948717948718, - "grad_norm": 4.172289848327637, - "learning_rate": 0.0001141025641025641, - "loss": 0.2673, + "epoch": 1.691919191919192, + "grad_norm": 5.016068458557129, + "learning_rate": 0.00016616161616161615, + "loss": 0.0948, "step": 670 }, { - "epoch": 1.7435897435897436, - "grad_norm": 1.547676682472229, - "learning_rate": 0.00011282051282051283, - "loss": 0.234, + "epoch": 1.7171717171717171, + "grad_norm": 4.62065315246582, + "learning_rate": 0.00016565656565656567, + "loss": 0.2336, "step": 680 }, { - "epoch": 1.7692307692307692, - "grad_norm": 1.9279000759124756, - "learning_rate": 0.00011153846153846154, - "loss": 0.4459, + "epoch": 1.7424242424242424, + "grad_norm": 0.04882610961794853, + "learning_rate": 0.00016515151515151516, + "loss": 0.1006, "step": 690 }, { - "epoch": 1.7948717948717947, - "grad_norm": 2.7669806480407715, - "learning_rate": 0.00011025641025641027, - "loss": 0.2824, + "epoch": 1.7676767676767677, + "grad_norm": 1.2523910999298096, + "learning_rate": 0.00016464646464646465, + "loss": 0.2595, "step": 700 }, { - "epoch": 1.7948717948717947, - "eval_accuracy": 0.8518181818181818, - "eval_loss": 0.33288896083831787, - "eval_runtime": 23.8646, - "eval_samples_per_second": 46.093, - "eval_steps_per_second": 5.783, + "epoch": 1.7676767676767677, + "eval_accuracy": 0.9444941808415398, + "eval_loss": 0.1785079687833786, + "eval_runtime": 73.2828, + "eval_samples_per_second": 15.242, + "eval_steps_per_second": 1.91, "step": 700 }, { - "epoch": 1.8205128205128205, - "grad_norm": 1.626448154449463, - "learning_rate": 0.00010897435897435896, - "loss": 0.2789, + "epoch": 1.7929292929292928, + "grad_norm": 0.28372153639793396, + "learning_rate": 0.00016414141414141414, + "loss": 0.0657, "step": 710 }, { - "epoch": 1.8461538461538463, - "grad_norm": 2.5008246898651123, - "learning_rate": 0.0001076923076923077, - "loss": 0.2939, + "epoch": 1.8181818181818183, + "grad_norm": 0.061366915702819824, + "learning_rate": 0.00016363636363636366, + "loss": 0.2048, "step": 720 }, { - "epoch": 1.8717948717948718, - "grad_norm": 1.4484879970550537, - "learning_rate": 0.00010641025641025641, - "loss": 0.3107, + "epoch": 1.8434343434343434, + "grad_norm": 2.9858274459838867, + "learning_rate": 0.00016313131313131312, + "loss": 0.0489, "step": 730 }, { - "epoch": 1.8974358974358974, - "grad_norm": 2.9797451496124268, - "learning_rate": 0.00010512820512820514, - "loss": 0.1989, + "epoch": 1.8686868686868687, + "grad_norm": 4.050809383392334, + "learning_rate": 0.00016262626262626264, + "loss": 0.1095, "step": 740 }, { - "epoch": 1.9230769230769231, - "grad_norm": 2.551682710647583, - "learning_rate": 0.00010384615384615386, - "loss": 0.3133, + "epoch": 1.893939393939394, + "grad_norm": 3.725325584411621, + "learning_rate": 0.00016212121212121213, + "loss": 0.2613, "step": 750 }, { - "epoch": 1.9487179487179487, - "grad_norm": 3.318741798400879, - "learning_rate": 0.00010256410256410256, - "loss": 0.2384, + "epoch": 1.9191919191919191, + "grad_norm": 2.09786319732666, + "learning_rate": 0.00016161616161616162, + "loss": 0.0492, "step": 760 }, { - "epoch": 1.9743589743589745, - "grad_norm": 0.9309015274047852, - "learning_rate": 0.00010128205128205129, - "loss": 0.2314, + "epoch": 1.9444444444444444, + "grad_norm": 1.9398726224899292, + "learning_rate": 0.0001611111111111111, + "loss": 0.0831, "step": 770 }, { - "epoch": 2.0, - "grad_norm": 3.10331392288208, - "learning_rate": 0.0001, - "loss": 0.2341, + "epoch": 1.9696969696969697, + "grad_norm": 0.6055514812469482, + "learning_rate": 0.0001606060606060606, + "loss": 0.1733, "step": 780 }, { - "epoch": 2.0256410256410255, - "grad_norm": 2.6792232990264893, - "learning_rate": 9.871794871794872e-05, - "loss": 0.1841, + "epoch": 1.9949494949494948, + "grad_norm": 0.22102850675582886, + "learning_rate": 0.00016010101010101012, + "loss": 0.1106, "step": 790 }, { - "epoch": 2.051282051282051, - "grad_norm": 2.074448585510254, - "learning_rate": 9.743589743589744e-05, - "loss": 0.2366, + "epoch": 2.0202020202020203, + "grad_norm": 3.681710720062256, + "learning_rate": 0.0001595959595959596, + "loss": 0.0911, "step": 800 }, { - "epoch": 2.051282051282051, - "eval_accuracy": 0.8254545454545454, - "eval_loss": 0.45823609828948975, - "eval_runtime": 24.0195, - "eval_samples_per_second": 45.796, - "eval_steps_per_second": 5.745, + "epoch": 2.0202020202020203, + "eval_accuracy": 0.9534467323187108, + "eval_loss": 0.1352938562631607, + "eval_runtime": 73.2218, + "eval_samples_per_second": 15.255, + "eval_steps_per_second": 1.912, "step": 800 }, { - "epoch": 2.076923076923077, - "grad_norm": 0.9467771649360657, - "learning_rate": 9.615384615384617e-05, - "loss": 0.1895, + "epoch": 2.0454545454545454, + "grad_norm": 0.574734091758728, + "learning_rate": 0.0001590909090909091, + "loss": 0.044, "step": 810 }, { - "epoch": 2.1025641025641026, - "grad_norm": 3.1332082748413086, - "learning_rate": 9.487179487179487e-05, - "loss": 0.2665, + "epoch": 2.0707070707070705, + "grad_norm": 0.253918319940567, + "learning_rate": 0.0001585858585858586, + "loss": 0.0476, "step": 820 }, { - "epoch": 2.128205128205128, - "grad_norm": 3.9276282787323, - "learning_rate": 9.35897435897436e-05, - "loss": 0.2388, + "epoch": 2.095959595959596, + "grad_norm": 0.1252337247133255, + "learning_rate": 0.00015808080808080808, + "loss": 0.1279, "step": 830 }, { - "epoch": 2.1538461538461537, - "grad_norm": 2.7033755779266357, - "learning_rate": 9.230769230769232e-05, - "loss": 0.1917, + "epoch": 2.121212121212121, + "grad_norm": 0.26320022344589233, + "learning_rate": 0.00015757575757575757, + "loss": 0.2042, "step": 840 }, { - "epoch": 2.1794871794871793, - "grad_norm": 1.5250920057296753, - "learning_rate": 9.102564102564103e-05, - "loss": 0.3245, + "epoch": 2.1464646464646466, + "grad_norm": 0.7983365058898926, + "learning_rate": 0.0001570707070707071, + "loss": 0.1208, "step": 850 }, { - "epoch": 2.2051282051282053, - "grad_norm": 1.5101457834243774, - "learning_rate": 8.974358974358975e-05, - "loss": 0.1377, + "epoch": 2.1717171717171717, + "grad_norm": 0.36479347944259644, + "learning_rate": 0.00015656565656565658, + "loss": 0.0881, "step": 860 }, { - "epoch": 2.230769230769231, - "grad_norm": 0.743198573589325, - "learning_rate": 8.846153846153847e-05, - "loss": 0.2243, + "epoch": 2.196969696969697, + "grad_norm": 0.11645219475030899, + "learning_rate": 0.00015606060606060607, + "loss": 0.0955, "step": 870 }, { - "epoch": 2.2564102564102564, - "grad_norm": 5.429717540740967, - "learning_rate": 8.717948717948718e-05, - "loss": 0.2683, + "epoch": 2.2222222222222223, + "grad_norm": 1.1980379819869995, + "learning_rate": 0.00015555555555555556, + "loss": 0.077, "step": 880 }, { - "epoch": 2.282051282051282, - "grad_norm": 2.3276283740997314, - "learning_rate": 8.58974358974359e-05, - "loss": 0.2083, + "epoch": 2.2474747474747474, + "grad_norm": 0.06797017902135849, + "learning_rate": 0.00015505050505050508, + "loss": 0.0377, "step": 890 }, { - "epoch": 2.3076923076923075, - "grad_norm": 1.3464454412460327, - "learning_rate": 8.461538461538461e-05, - "loss": 0.2212, + "epoch": 2.2727272727272725, + "grad_norm": 0.48521897196769714, + "learning_rate": 0.00015454545454545454, + "loss": 0.0548, "step": 900 }, { - "epoch": 2.3076923076923075, - "eval_accuracy": 0.8254545454545454, - "eval_loss": 0.4885379374027252, - "eval_runtime": 24.2062, - "eval_samples_per_second": 45.443, - "eval_steps_per_second": 5.701, + "epoch": 2.2727272727272725, + "eval_accuracy": 0.9471799462846912, + "eval_loss": 0.19982792437076569, + "eval_runtime": 72.9425, + "eval_samples_per_second": 15.313, + "eval_steps_per_second": 1.919, "step": 900 }, { - "epoch": 2.3333333333333335, - "grad_norm": 2.0947823524475098, - "learning_rate": 8.333333333333334e-05, - "loss": 0.1469, + "epoch": 2.297979797979798, + "grad_norm": 0.017012102529406548, + "learning_rate": 0.00015404040404040406, + "loss": 0.1089, "step": 910 }, { - "epoch": 2.358974358974359, - "grad_norm": 2.0833053588867188, - "learning_rate": 8.205128205128205e-05, - "loss": 0.1637, + "epoch": 2.323232323232323, + "grad_norm": 0.2808210849761963, + "learning_rate": 0.00015353535353535353, + "loss": 0.0789, "step": 920 }, { - "epoch": 2.3846153846153846, - "grad_norm": 3.5598833560943604, - "learning_rate": 8.076923076923078e-05, - "loss": 0.2127, + "epoch": 2.3484848484848486, + "grad_norm": 4.9768781661987305, + "learning_rate": 0.00015303030303030304, + "loss": 0.1004, "step": 930 }, { - "epoch": 2.41025641025641, - "grad_norm": 5.30457878112793, - "learning_rate": 7.948717948717948e-05, - "loss": 0.1561, + "epoch": 2.3737373737373737, + "grad_norm": 1.5323927402496338, + "learning_rate": 0.00015252525252525253, + "loss": 0.0357, "step": 940 }, { - "epoch": 2.435897435897436, - "grad_norm": 2.163148880004883, - "learning_rate": 7.820512820512821e-05, - "loss": 0.1539, + "epoch": 2.398989898989899, + "grad_norm": 4.321779251098633, + "learning_rate": 0.00015202020202020202, + "loss": 0.0348, "step": 950 }, { - "epoch": 2.4615384615384617, - "grad_norm": 4.815582752227783, - "learning_rate": 7.692307692307693e-05, - "loss": 0.1802, + "epoch": 2.4242424242424243, + "grad_norm": 6.227025032043457, + "learning_rate": 0.00015151515151515152, + "loss": 0.1679, "step": 960 }, { - "epoch": 2.4871794871794872, - "grad_norm": 1.3422257900238037, - "learning_rate": 7.564102564102564e-05, - "loss": 0.2059, + "epoch": 2.4494949494949494, + "grad_norm": 1.045432209968567, + "learning_rate": 0.00015101010101010103, + "loss": 0.1222, "step": 970 }, { - "epoch": 2.5128205128205128, - "grad_norm": 2.441047430038452, - "learning_rate": 7.435897435897436e-05, - "loss": 0.2389, + "epoch": 2.474747474747475, + "grad_norm": 3.0685787200927734, + "learning_rate": 0.0001505050505050505, + "loss": 0.1434, "step": 980 }, { - "epoch": 2.5384615384615383, - "grad_norm": 3.386566162109375, - "learning_rate": 7.307692307692307e-05, - "loss": 0.1882, + "epoch": 2.5, + "grad_norm": 0.04191284626722336, + "learning_rate": 0.00015000000000000001, + "loss": 0.086, "step": 990 }, { - "epoch": 2.564102564102564, - "grad_norm": 3.123497247695923, - "learning_rate": 7.17948717948718e-05, - "loss": 0.2031, + "epoch": 2.525252525252525, + "grad_norm": 3.1016695499420166, + "learning_rate": 0.0001494949494949495, + "loss": 0.1399, "step": 1000 }, { - "epoch": 2.564102564102564, - "eval_accuracy": 0.8563636363636363, - "eval_loss": 0.42818111181259155, - "eval_runtime": 24.1992, - "eval_samples_per_second": 45.456, - "eval_steps_per_second": 5.703, + "epoch": 2.525252525252525, + "eval_accuracy": 0.9444941808415398, + "eval_loss": 0.19705650210380554, + "eval_runtime": 73.3829, + "eval_samples_per_second": 15.222, + "eval_steps_per_second": 1.908, "step": 1000 }, { - "epoch": 2.58974358974359, - "grad_norm": 2.518524646759033, - "learning_rate": 7.051282051282052e-05, - "loss": 0.2551, + "epoch": 2.5505050505050506, + "grad_norm": 4.877354145050049, + "learning_rate": 0.000148989898989899, + "loss": 0.1418, "step": 1010 }, { - "epoch": 2.6153846153846154, - "grad_norm": 2.376194715499878, - "learning_rate": 6.923076923076924e-05, - "loss": 0.1853, + "epoch": 2.5757575757575757, + "grad_norm": 4.7359700202941895, + "learning_rate": 0.00014848484848484849, + "loss": 0.1084, "step": 1020 }, { - "epoch": 2.641025641025641, - "grad_norm": 0.8307498097419739, - "learning_rate": 6.794871794871795e-05, - "loss": 0.1643, + "epoch": 2.601010101010101, + "grad_norm": 0.7143091559410095, + "learning_rate": 0.000147979797979798, + "loss": 0.1074, "step": 1030 }, { - "epoch": 2.6666666666666665, - "grad_norm": 2.050661087036133, - "learning_rate": 6.666666666666667e-05, - "loss": 0.105, + "epoch": 2.6262626262626263, + "grad_norm": 0.4162321388721466, + "learning_rate": 0.00014747474747474747, + "loss": 0.1317, "step": 1040 }, { - "epoch": 2.6923076923076925, - "grad_norm": 2.984266996383667, - "learning_rate": 6.538461538461539e-05, - "loss": 0.1774, + "epoch": 2.6515151515151514, + "grad_norm": 5.558507442474365, + "learning_rate": 0.00014696969696969698, + "loss": 0.0829, "step": 1050 }, { - "epoch": 2.717948717948718, - "grad_norm": 3.933162212371826, - "learning_rate": 6.410256410256412e-05, - "loss": 0.1079, + "epoch": 2.676767676767677, + "grad_norm": 0.08041220903396606, + "learning_rate": 0.00014646464646464648, + "loss": 0.0905, "step": 1060 }, { - "epoch": 2.7435897435897436, - "grad_norm": 4.650693893432617, - "learning_rate": 6.282051282051282e-05, - "loss": 0.1542, + "epoch": 2.702020202020202, + "grad_norm": 3.554946184158325, + "learning_rate": 0.00014595959595959597, + "loss": 0.14, "step": 1070 }, { - "epoch": 2.769230769230769, - "grad_norm": 2.796116828918457, - "learning_rate": 6.153846153846155e-05, - "loss": 0.4463, + "epoch": 2.7272727272727275, + "grad_norm": 0.9108226895332336, + "learning_rate": 0.00014545454545454546, + "loss": 0.0355, "step": 1080 }, { - "epoch": 2.7948717948717947, - "grad_norm": 3.1699883937835693, - "learning_rate": 6.025641025641026e-05, - "loss": 0.1348, + "epoch": 2.7525252525252526, + "grad_norm": 1.091728925704956, + "learning_rate": 0.00014494949494949495, + "loss": 0.059, "step": 1090 }, { - "epoch": 2.8205128205128203, - "grad_norm": 2.0635199546813965, - "learning_rate": 5.897435897435898e-05, - "loss": 0.1717, + "epoch": 2.7777777777777777, + "grad_norm": 0.07620527595281601, + "learning_rate": 0.00014444444444444444, + "loss": 0.2001, "step": 1100 }, { - "epoch": 2.8205128205128203, - "eval_accuracy": 0.85, - "eval_loss": 0.4373130798339844, - "eval_runtime": 23.6976, - "eval_samples_per_second": 46.418, - "eval_steps_per_second": 5.823, + "epoch": 2.7777777777777777, + "eval_accuracy": 0.937332139659803, + "eval_loss": 0.24790146946907043, + "eval_runtime": 73.1059, + "eval_samples_per_second": 15.279, + "eval_steps_per_second": 1.915, "step": 1100 }, { - "epoch": 2.8461538461538463, - "grad_norm": 3.7501204013824463, - "learning_rate": 5.769230769230769e-05, - "loss": 0.1505, + "epoch": 2.8030303030303028, + "grad_norm": 0.10709954053163528, + "learning_rate": 0.00014393939393939396, + "loss": 0.0487, "step": 1110 }, { - "epoch": 2.871794871794872, - "grad_norm": 0.866908609867096, - "learning_rate": 5.6410256410256414e-05, - "loss": 0.131, + "epoch": 2.8282828282828283, + "grad_norm": 4.047976493835449, + "learning_rate": 0.00014343434343434342, + "loss": 0.0774, "step": 1120 }, { - "epoch": 2.8974358974358974, - "grad_norm": 2.7631490230560303, - "learning_rate": 5.512820512820514e-05, - "loss": 0.1067, + "epoch": 2.8535353535353534, + "grad_norm": 2.409966468811035, + "learning_rate": 0.00014292929292929294, + "loss": 0.0744, "step": 1130 }, { - "epoch": 2.9230769230769234, - "grad_norm": 0.8835192918777466, - "learning_rate": 5.384615384615385e-05, - "loss": 0.2449, + "epoch": 2.878787878787879, + "grad_norm": 0.3456668257713318, + "learning_rate": 0.00014242424242424243, + "loss": 0.0125, "step": 1140 }, { - "epoch": 2.948717948717949, - "grad_norm": 0.17269015312194824, - "learning_rate": 5.256410256410257e-05, - "loss": 0.1235, + "epoch": 2.904040404040404, + "grad_norm": 0.046853143721818924, + "learning_rate": 0.00014191919191919192, + "loss": 0.0756, "step": 1150 }, { - "epoch": 2.9743589743589745, - "grad_norm": 2.5380775928497314, - "learning_rate": 5.128205128205128e-05, - "loss": 0.1214, + "epoch": 2.929292929292929, + "grad_norm": 3.4357807636260986, + "learning_rate": 0.0001414141414141414, + "loss": 0.1375, "step": 1160 }, { - "epoch": 3.0, - "grad_norm": 9.240225791931152, - "learning_rate": 5e-05, - "loss": 0.2372, + "epoch": 2.9545454545454546, + "grad_norm": 1.010414719581604, + "learning_rate": 0.00014090909090909093, + "loss": 0.0704, "step": 1170 }, { - "epoch": 3.0256410256410255, - "grad_norm": 0.454428106546402, - "learning_rate": 4.871794871794872e-05, - "loss": 0.1121, + "epoch": 2.9797979797979797, + "grad_norm": 0.008091296069324017, + "learning_rate": 0.00014040404040404042, + "loss": 0.0791, "step": 1180 }, { - "epoch": 3.051282051282051, - "grad_norm": 3.3110735416412354, - "learning_rate": 4.7435897435897435e-05, - "loss": 0.1121, + "epoch": 3.005050505050505, + "grad_norm": 1.9511629343032837, + "learning_rate": 0.0001398989898989899, + "loss": 0.0754, "step": 1190 }, { - "epoch": 3.076923076923077, - "grad_norm": 0.4833953380584717, - "learning_rate": 4.615384615384616e-05, - "loss": 0.1303, + "epoch": 3.0303030303030303, + "grad_norm": 10.075323104858398, + "learning_rate": 0.0001393939393939394, + "loss": 0.0976, "step": 1200 }, { - "epoch": 3.076923076923077, - "eval_accuracy": 0.8718181818181818, - "eval_loss": 0.36585894227027893, - "eval_runtime": 24.2959, - "eval_samples_per_second": 45.275, - "eval_steps_per_second": 5.68, + "epoch": 3.0303030303030303, + "eval_accuracy": 0.9498657117278424, + "eval_loss": 0.16011768579483032, + "eval_runtime": 73.2182, + "eval_samples_per_second": 15.256, + "eval_steps_per_second": 1.912, "step": 1200 }, { - "epoch": 3.1025641025641026, - "grad_norm": 0.08100098371505737, - "learning_rate": 4.4871794871794874e-05, - "loss": 0.0911, + "epoch": 3.0555555555555554, + "grad_norm": 0.027206294238567352, + "learning_rate": 0.0001388888888888889, + "loss": 0.0906, "step": 1210 }, { - "epoch": 3.128205128205128, - "grad_norm": 0.30585813522338867, - "learning_rate": 4.358974358974359e-05, - "loss": 0.0834, + "epoch": 3.080808080808081, + "grad_norm": 1.425262689590454, + "learning_rate": 0.0001383838383838384, + "loss": 0.0349, "step": 1220 }, { - "epoch": 3.1538461538461537, - "grad_norm": 4.129181385040283, - "learning_rate": 4.230769230769231e-05, - "loss": 0.1144, + "epoch": 3.106060606060606, + "grad_norm": 7.3463616371154785, + "learning_rate": 0.0001378787878787879, + "loss": 0.0804, "step": 1230 }, { - "epoch": 3.1794871794871793, - "grad_norm": 0.367727667093277, - "learning_rate": 4.1025641025641023e-05, - "loss": 0.0808, + "epoch": 3.1313131313131315, + "grad_norm": 1.0737591981887817, + "learning_rate": 0.0001373737373737374, + "loss": 0.068, "step": 1240 }, { - "epoch": 3.2051282051282053, - "grad_norm": 0.10303868353366852, - "learning_rate": 3.974358974358974e-05, - "loss": 0.1758, + "epoch": 3.1565656565656566, + "grad_norm": 7.525305271148682, + "learning_rate": 0.00013686868686868688, + "loss": 0.1145, "step": 1250 }, { - "epoch": 3.230769230769231, - "grad_norm": 2.300645589828491, - "learning_rate": 3.846153846153846e-05, - "loss": 0.227, + "epoch": 3.1818181818181817, + "grad_norm": 0.4561030864715576, + "learning_rate": 0.00013636363636363637, + "loss": 0.0977, "step": 1260 }, { - "epoch": 3.2564102564102564, - "grad_norm": 1.345780372619629, - "learning_rate": 3.717948717948718e-05, - "loss": 0.1345, + "epoch": 3.207070707070707, + "grad_norm": 0.11276185512542725, + "learning_rate": 0.00013585858585858586, + "loss": 0.0743, "step": 1270 }, { - "epoch": 3.282051282051282, - "grad_norm": 2.5391829013824463, - "learning_rate": 3.58974358974359e-05, - "loss": 0.0496, + "epoch": 3.2323232323232323, + "grad_norm": 1.0171997547149658, + "learning_rate": 0.00013535353535353538, + "loss": 0.0775, "step": 1280 }, { - "epoch": 3.3076923076923075, - "grad_norm": 0.31912463903427124, - "learning_rate": 3.461538461538462e-05, - "loss": 0.1165, + "epoch": 3.257575757575758, + "grad_norm": 3.1414084434509277, + "learning_rate": 0.00013484848484848484, + "loss": 0.0309, "step": 1290 }, { - "epoch": 3.3333333333333335, - "grad_norm": 0.5431106686592102, - "learning_rate": 3.3333333333333335e-05, - "loss": 0.0889, + "epoch": 3.282828282828283, + "grad_norm": 0.037932224571704865, + "learning_rate": 0.00013434343434343436, + "loss": 0.1291, "step": 1300 }, { - "epoch": 3.3333333333333335, - "eval_accuracy": 0.8736363636363637, - "eval_loss": 0.3662668764591217, - "eval_runtime": 23.4444, - "eval_samples_per_second": 46.92, - "eval_steps_per_second": 5.886, + "epoch": 3.282828282828283, + "eval_accuracy": 0.9588182632050134, + "eval_loss": 0.160703644156456, + "eval_runtime": 73.0017, + "eval_samples_per_second": 15.301, + "eval_steps_per_second": 1.918, "step": 1300 }, { - "epoch": 3.358974358974359, - "grad_norm": 2.443268299102783, - "learning_rate": 3.205128205128206e-05, - "loss": 0.1256, + "epoch": 3.308080808080808, + "grad_norm": 2.9155356884002686, + "learning_rate": 0.00013383838383838385, + "loss": 0.0215, "step": 1310 }, { - "epoch": 3.3846153846153846, - "grad_norm": 2.0804026126861572, - "learning_rate": 3.0769230769230774e-05, - "loss": 0.0973, + "epoch": 3.3333333333333335, + "grad_norm": 5.102810382843018, + "learning_rate": 0.00013333333333333334, + "loss": 0.0716, "step": 1320 }, { - "epoch": 3.41025641025641, - "grad_norm": 10.397607803344727, - "learning_rate": 2.948717948717949e-05, - "loss": 0.1183, + "epoch": 3.3585858585858586, + "grad_norm": 0.020925424993038177, + "learning_rate": 0.00013282828282828283, + "loss": 0.0372, "step": 1330 }, { - "epoch": 3.435897435897436, - "grad_norm": 3.746250867843628, - "learning_rate": 2.8205128205128207e-05, - "loss": 0.046, + "epoch": 3.3838383838383836, + "grad_norm": 0.10292687267065048, + "learning_rate": 0.00013232323232323235, + "loss": 0.0211, "step": 1340 }, { - "epoch": 3.4615384615384617, - "grad_norm": 0.7118757367134094, - "learning_rate": 2.6923076923076923e-05, - "loss": 0.1611, + "epoch": 3.409090909090909, + "grad_norm": 2.7968993186950684, + "learning_rate": 0.0001318181818181818, + "loss": 0.0708, "step": 1350 }, { - "epoch": 3.4871794871794872, - "grad_norm": 0.34771645069122314, - "learning_rate": 2.564102564102564e-05, - "loss": 0.1974, + "epoch": 3.4343434343434343, + "grad_norm": 3.1068055629730225, + "learning_rate": 0.00013131313131313133, + "loss": 0.1007, "step": 1360 }, { - "epoch": 3.5128205128205128, - "grad_norm": 6.590170860290527, - "learning_rate": 2.435897435897436e-05, - "loss": 0.1392, + "epoch": 3.45959595959596, + "grad_norm": 0.032499730587005615, + "learning_rate": 0.00013080808080808082, + "loss": 0.0713, "step": 1370 }, { - "epoch": 3.5384615384615383, - "grad_norm": 3.6979663372039795, - "learning_rate": 2.307692307692308e-05, - "loss": 0.1153, + "epoch": 3.484848484848485, + "grad_norm": 0.20779326558113098, + "learning_rate": 0.0001303030303030303, + "loss": 0.048, "step": 1380 }, { - "epoch": 3.564102564102564, - "grad_norm": 0.12197946012020111, - "learning_rate": 2.1794871794871795e-05, - "loss": 0.1027, + "epoch": 3.51010101010101, + "grad_norm": 5.266826152801514, + "learning_rate": 0.0001297979797979798, + "loss": 0.193, "step": 1390 }, { - "epoch": 3.58974358974359, - "grad_norm": 2.5246639251708984, - "learning_rate": 2.0512820512820512e-05, - "loss": 0.1157, + "epoch": 3.5353535353535355, + "grad_norm": 0.42106470465660095, + "learning_rate": 0.00012929292929292932, + "loss": 0.0721, "step": 1400 }, { - "epoch": 3.58974358974359, - "eval_accuracy": 0.8436363636363636, - "eval_loss": 0.4587700366973877, - "eval_runtime": 22.8201, - "eval_samples_per_second": 48.203, - "eval_steps_per_second": 6.047, + "epoch": 3.5353535353535355, + "eval_accuracy": 0.9588182632050134, + "eval_loss": 0.18219807744026184, + "eval_runtime": 73.033, + "eval_samples_per_second": 15.294, + "eval_steps_per_second": 1.917, "step": 1400 }, { - "epoch": 3.6153846153846154, - "grad_norm": 0.37446674704551697, - "learning_rate": 1.923076923076923e-05, - "loss": 0.0839, + "epoch": 3.5606060606060606, + "grad_norm": 1.7371455430984497, + "learning_rate": 0.00012878787878787878, + "loss": 0.0927, "step": 1410 }, { - "epoch": 3.641025641025641, - "grad_norm": 0.7361642718315125, - "learning_rate": 1.794871794871795e-05, - "loss": 0.0541, + "epoch": 3.5858585858585856, + "grad_norm": 0.636141836643219, + "learning_rate": 0.0001282828282828283, + "loss": 0.0295, "step": 1420 }, { - "epoch": 3.6666666666666665, - "grad_norm": 0.11162062734365463, - "learning_rate": 1.6666666666666667e-05, - "loss": 0.1791, + "epoch": 3.611111111111111, + "grad_norm": 0.10211779177188873, + "learning_rate": 0.00012777777777777776, + "loss": 0.0287, "step": 1430 }, { - "epoch": 3.6923076923076925, - "grad_norm": 3.2151377201080322, - "learning_rate": 1.5384615384615387e-05, - "loss": 0.0597, + "epoch": 3.6363636363636362, + "grad_norm": 0.803653359413147, + "learning_rate": 0.00012727272727272728, + "loss": 0.0621, "step": 1440 }, { - "epoch": 3.717948717948718, - "grad_norm": 0.853471040725708, - "learning_rate": 1.4102564102564104e-05, - "loss": 0.1246, + "epoch": 3.6616161616161618, + "grad_norm": 0.11753907799720764, + "learning_rate": 0.00012676767676767677, + "loss": 0.0465, "step": 1450 }, { - "epoch": 3.7435897435897436, - "grad_norm": 0.2989501953125, - "learning_rate": 1.282051282051282e-05, - "loss": 0.0747, + "epoch": 3.686868686868687, + "grad_norm": 0.05394851416349411, + "learning_rate": 0.00012626262626262626, + "loss": 0.0474, "step": 1460 }, { - "epoch": 3.769230769230769, - "grad_norm": 0.4194205403327942, - "learning_rate": 1.153846153846154e-05, - "loss": 0.078, + "epoch": 3.712121212121212, + "grad_norm": 3.631462574005127, + "learning_rate": 0.00012575757575757575, + "loss": 0.093, "step": 1470 }, { - "epoch": 3.7948717948717947, - "grad_norm": 0.2623525857925415, - "learning_rate": 1.0256410256410256e-05, - "loss": 0.064, + "epoch": 3.7373737373737375, + "grad_norm": 0.1336178481578827, + "learning_rate": 0.00012525252525252527, + "loss": 0.0736, "step": 1480 }, { - "epoch": 3.8205128205128203, - "grad_norm": 1.1962109804153442, - "learning_rate": 8.974358974358976e-06, - "loss": 0.0955, + "epoch": 3.7626262626262625, + "grad_norm": 0.0858420580625534, + "learning_rate": 0.00012474747474747473, + "loss": 0.1211, "step": 1490 }, { - "epoch": 3.8461538461538463, - "grad_norm": 2.009432792663574, - "learning_rate": 7.692307692307694e-06, - "loss": 0.1215, + "epoch": 3.787878787878788, + "grad_norm": 1.1731150150299072, + "learning_rate": 0.00012424242424242425, + "loss": 0.0592, "step": 1500 }, { - "epoch": 3.8461538461538463, - "eval_accuracy": 0.8654545454545455, - "eval_loss": 0.43503817915916443, - "eval_runtime": 23.6622, - "eval_samples_per_second": 46.488, - "eval_steps_per_second": 5.832, + "epoch": 3.787878787878788, + "eval_accuracy": 0.9623992837958818, + "eval_loss": 0.12546713650226593, + "eval_runtime": 73.0966, + "eval_samples_per_second": 15.281, + "eval_steps_per_second": 1.915, "step": 1500 }, { - "epoch": 3.871794871794872, - "grad_norm": 3.284787178039551, - "learning_rate": 6.41025641025641e-06, - "loss": 0.0614, + "epoch": 3.813131313131313, + "grad_norm": 1.533412218093872, + "learning_rate": 0.00012373737373737374, + "loss": 0.0663, "step": 1510 }, { - "epoch": 3.8974358974358974, - "grad_norm": 0.1390266716480255, - "learning_rate": 5.128205128205128e-06, - "loss": 0.0795, + "epoch": 3.8383838383838382, + "grad_norm": 7.734765529632568, + "learning_rate": 0.00012323232323232323, + "loss": 0.075, "step": 1520 }, { - "epoch": 3.9230769230769234, - "grad_norm": 3.4633984565734863, - "learning_rate": 3.846153846153847e-06, - "loss": 0.1268, + "epoch": 3.8636363636363638, + "grad_norm": 0.4143606126308441, + "learning_rate": 0.00012272727272727272, + "loss": 0.0158, "step": 1530 }, { - "epoch": 3.948717948717949, - "grad_norm": 3.78682804107666, - "learning_rate": 2.564102564102564e-06, - "loss": 0.1049, + "epoch": 3.888888888888889, + "grad_norm": 4.032654762268066, + "learning_rate": 0.00012222222222222224, + "loss": 0.0898, "step": 1540 }, { - "epoch": 3.9743589743589745, - "grad_norm": 3.6551170349121094, - "learning_rate": 1.282051282051282e-06, - "loss": 0.0924, + "epoch": 3.9141414141414144, + "grad_norm": 0.2919144928455353, + "learning_rate": 0.00012171717171717172, + "loss": 0.0904, "step": 1550 }, { - "epoch": 4.0, - "grad_norm": 0.1017698347568512, - "learning_rate": 0.0, - "loss": 0.1392, + "epoch": 3.9393939393939394, + "grad_norm": 6.036355018615723, + "learning_rate": 0.00012121212121212122, + "loss": 0.0725, "step": 1560 }, { - "epoch": 4.0, - "step": 1560, - "total_flos": 1.9301704773202575e+18, - "train_loss": 0.266804637053074, - "train_runtime": 1157.865, - "train_samples_per_second": 21.512, - "train_steps_per_second": 1.347 + "epoch": 3.9646464646464645, + "grad_norm": 0.34402996301651, + "learning_rate": 0.0001207070707070707, + "loss": 0.0643, + "step": 1570 + }, + { + "epoch": 3.98989898989899, + "grad_norm": 0.307706356048584, + "learning_rate": 0.0001202020202020202, + "loss": 0.1061, + "step": 1580 + }, + { + "epoch": 4.015151515151516, + "grad_norm": 0.04210241511464119, + "learning_rate": 0.00011969696969696971, + "loss": 0.1015, + "step": 1590 + }, + { + "epoch": 4.040404040404041, + "grad_norm": 4.686149597167969, + "learning_rate": 0.00011919191919191919, + "loss": 0.0964, + "step": 1600 + }, + { + "epoch": 4.040404040404041, + "eval_accuracy": 0.954341987466428, + "eval_loss": 0.16204935312271118, + "eval_runtime": 72.8935, + "eval_samples_per_second": 15.324, + "eval_steps_per_second": 1.921, + "step": 1600 + }, + { + "epoch": 4.065656565656566, + "grad_norm": 0.9774217009544373, + "learning_rate": 0.00011868686868686869, + "loss": 0.0342, + "step": 1610 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 2.1450870037078857, + "learning_rate": 0.0001181818181818182, + "loss": 0.0852, + "step": 1620 + }, + { + "epoch": 4.116161616161616, + "grad_norm": 4.826761722564697, + "learning_rate": 0.00011767676767676767, + "loss": 0.0612, + "step": 1630 + }, + { + "epoch": 4.141414141414141, + "grad_norm": 0.7088700532913208, + "learning_rate": 0.00011717171717171717, + "loss": 0.0369, + "step": 1640 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.07485224306583405, + "learning_rate": 0.00011666666666666668, + "loss": 0.0075, + "step": 1650 + }, + { + "epoch": 4.191919191919192, + "grad_norm": 7.588441371917725, + "learning_rate": 0.00011616161616161616, + "loss": 0.0492, + "step": 1660 + }, + { + "epoch": 4.217171717171717, + "grad_norm": 0.06588041037321091, + "learning_rate": 0.00011565656565656566, + "loss": 0.0619, + "step": 1670 + }, + { + "epoch": 4.242424242424242, + "grad_norm": 0.3317614495754242, + "learning_rate": 0.00011515151515151516, + "loss": 0.0504, + "step": 1680 + }, + { + "epoch": 4.267676767676767, + "grad_norm": 4.261381149291992, + "learning_rate": 0.00011464646464646464, + "loss": 0.0534, + "step": 1690 + }, + { + "epoch": 4.292929292929293, + "grad_norm": 1.7030925750732422, + "learning_rate": 0.00011414141414141415, + "loss": 0.0738, + "step": 1700 + }, + { + "epoch": 4.292929292929293, + "eval_accuracy": 0.9650850492390332, + "eval_loss": 0.12794509530067444, + "eval_runtime": 73.4006, + "eval_samples_per_second": 15.218, + "eval_steps_per_second": 1.907, + "step": 1700 + }, + { + "epoch": 4.318181818181818, + "grad_norm": 3.9137349128723145, + "learning_rate": 0.00011363636363636365, + "loss": 0.0269, + "step": 1710 + }, + { + "epoch": 4.343434343434343, + "grad_norm": 0.012919370085000992, + "learning_rate": 0.00011313131313131313, + "loss": 0.0314, + "step": 1720 + }, + { + "epoch": 4.3686868686868685, + "grad_norm": 0.07363598793745041, + "learning_rate": 0.00011262626262626263, + "loss": 0.0233, + "step": 1730 + }, + { + "epoch": 4.393939393939394, + "grad_norm": 0.137301966547966, + "learning_rate": 0.00011212121212121212, + "loss": 0.0863, + "step": 1740 + }, + { + "epoch": 4.41919191919192, + "grad_norm": 6.548308849334717, + "learning_rate": 0.00011161616161616161, + "loss": 0.0463, + "step": 1750 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 2.40230655670166, + "learning_rate": 0.00011111111111111112, + "loss": 0.0668, + "step": 1760 + }, + { + "epoch": 4.46969696969697, + "grad_norm": 0.018276751041412354, + "learning_rate": 0.00011060606060606061, + "loss": 0.0193, + "step": 1770 + }, + { + "epoch": 4.494949494949495, + "grad_norm": 4.558255195617676, + "learning_rate": 0.00011010101010101011, + "loss": 0.1149, + "step": 1780 + }, + { + "epoch": 4.52020202020202, + "grad_norm": 0.04581284150481224, + "learning_rate": 0.0001095959595959596, + "loss": 0.0227, + "step": 1790 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 1.2669509649276733, + "learning_rate": 0.00010909090909090909, + "loss": 0.0504, + "step": 1800 + }, + { + "epoch": 4.545454545454545, + "eval_accuracy": 0.9588182632050134, + "eval_loss": 0.16235476732254028, + "eval_runtime": 73.0538, + "eval_samples_per_second": 15.29, + "eval_steps_per_second": 1.916, + "step": 1800 + }, + { + "epoch": 4.570707070707071, + "grad_norm": 0.07127434760332108, + "learning_rate": 0.0001085858585858586, + "loss": 0.0492, + "step": 1810 + }, + { + "epoch": 4.595959595959596, + "grad_norm": 1.7907336950302124, + "learning_rate": 0.00010808080808080809, + "loss": 0.0358, + "step": 1820 + }, + { + "epoch": 4.621212121212121, + "grad_norm": 4.024843692779541, + "learning_rate": 0.00010757575757575758, + "loss": 0.0856, + "step": 1830 + }, + { + "epoch": 4.646464646464646, + "grad_norm": 0.020713260397315025, + "learning_rate": 0.00010707070707070708, + "loss": 0.0101, + "step": 1840 + }, + { + "epoch": 4.671717171717171, + "grad_norm": 0.06845160573720932, + "learning_rate": 0.00010656565656565659, + "loss": 0.0153, + "step": 1850 + }, + { + "epoch": 4.696969696969697, + "grad_norm": 1.0333762168884277, + "learning_rate": 0.00010606060606060606, + "loss": 0.1535, + "step": 1860 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.019528638571500778, + "learning_rate": 0.00010555555555555557, + "loss": 0.089, + "step": 1870 + }, + { + "epoch": 4.747474747474747, + "grad_norm": 0.12054427713155746, + "learning_rate": 0.00010505050505050507, + "loss": 0.0154, + "step": 1880 + }, + { + "epoch": 4.7727272727272725, + "grad_norm": 0.053187351673841476, + "learning_rate": 0.00010454545454545455, + "loss": 0.1073, + "step": 1890 + }, + { + "epoch": 4.797979797979798, + "grad_norm": 0.03637217357754707, + "learning_rate": 0.00010404040404040405, + "loss": 0.0972, + "step": 1900 + }, + { + "epoch": 4.797979797979798, + "eval_accuracy": 0.9623992837958818, + "eval_loss": 0.15791860222816467, + "eval_runtime": 73.2114, + "eval_samples_per_second": 15.257, + "eval_steps_per_second": 1.912, + "step": 1900 + }, + { + "epoch": 4.8232323232323235, + "grad_norm": 6.812131404876709, + "learning_rate": 0.00010353535353535353, + "loss": 0.1274, + "step": 1910 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 2.3793511390686035, + "learning_rate": 0.00010303030303030303, + "loss": 0.1051, + "step": 1920 + }, + { + "epoch": 4.873737373737374, + "grad_norm": 1.2393810749053955, + "learning_rate": 0.00010252525252525254, + "loss": 0.0167, + "step": 1930 + }, + { + "epoch": 4.898989898989899, + "grad_norm": 1.5232930183410645, + "learning_rate": 0.00010202020202020202, + "loss": 0.0065, + "step": 1940 + }, + { + "epoch": 4.924242424242424, + "grad_norm": 0.00905653741210699, + "learning_rate": 0.00010151515151515152, + "loss": 0.0419, + "step": 1950 + }, + { + "epoch": 4.94949494949495, + "grad_norm": 0.8604415655136108, + "learning_rate": 0.00010101010101010102, + "loss": 0.0769, + "step": 1960 + }, + { + "epoch": 4.974747474747475, + "grad_norm": 4.089222431182861, + "learning_rate": 0.0001005050505050505, + "loss": 0.0366, + "step": 1970 + }, + { + "epoch": 5.0, + "grad_norm": 2.2072501182556152, + "learning_rate": 0.0001, + "loss": 0.0746, + "step": 1980 + }, + { + "epoch": 5.025252525252525, + "grad_norm": 0.010899940505623817, + "learning_rate": 9.94949494949495e-05, + "loss": 0.0597, + "step": 1990 + }, + { + "epoch": 5.05050505050505, + "grad_norm": 1.6260383129119873, + "learning_rate": 9.8989898989899e-05, + "loss": 0.0456, + "step": 2000 + }, + { + "epoch": 5.05050505050505, + "eval_accuracy": 0.9489704565801254, + "eval_loss": 0.19649948179721832, + "eval_runtime": 73.1131, + "eval_samples_per_second": 15.278, + "eval_steps_per_second": 1.915, + "step": 2000 + }, + { + "epoch": 5.075757575757576, + "grad_norm": 0.009620290249586105, + "learning_rate": 9.848484848484849e-05, + "loss": 0.018, + "step": 2010 + }, + { + "epoch": 5.101010101010101, + "grad_norm": 4.627386093139648, + "learning_rate": 9.797979797979798e-05, + "loss": 0.0906, + "step": 2020 + }, + { + "epoch": 5.126262626262626, + "grad_norm": 0.5775233507156372, + "learning_rate": 9.747474747474747e-05, + "loss": 0.0179, + "step": 2030 + }, + { + "epoch": 5.151515151515151, + "grad_norm": 0.3100966513156891, + "learning_rate": 9.696969696969698e-05, + "loss": 0.0225, + "step": 2040 + }, + { + "epoch": 5.1767676767676765, + "grad_norm": 0.012251541949808598, + "learning_rate": 9.646464646464647e-05, + "loss": 0.0062, + "step": 2050 + }, + { + "epoch": 5.202020202020202, + "grad_norm": 3.9397971630096436, + "learning_rate": 9.595959595959596e-05, + "loss": 0.0497, + "step": 2060 + }, + { + "epoch": 5.2272727272727275, + "grad_norm": 0.002988005056977272, + "learning_rate": 9.545454545454546e-05, + "loss": 0.0242, + "step": 2070 + }, + { + "epoch": 5.252525252525253, + "grad_norm": 0.15744374692440033, + "learning_rate": 9.494949494949495e-05, + "loss": 0.0165, + "step": 2080 + }, + { + "epoch": 5.277777777777778, + "grad_norm": 2.624490976333618, + "learning_rate": 9.444444444444444e-05, + "loss": 0.0595, + "step": 2090 + }, + { + "epoch": 5.303030303030303, + "grad_norm": 1.7126376628875732, + "learning_rate": 9.393939393939395e-05, + "loss": 0.0334, + "step": 2100 + }, + { + "epoch": 5.303030303030303, + "eval_accuracy": 0.9570277529095792, + "eval_loss": 0.165226012468338, + "eval_runtime": 73.2601, + "eval_samples_per_second": 15.247, + "eval_steps_per_second": 1.911, + "step": 2100 + }, + { + "epoch": 5.328282828282829, + "grad_norm": 0.003406533505767584, + "learning_rate": 9.343434343434344e-05, + "loss": 0.0201, + "step": 2110 + }, + { + "epoch": 5.353535353535354, + "grad_norm": 0.18647323548793793, + "learning_rate": 9.292929292929293e-05, + "loss": 0.0471, + "step": 2120 + }, + { + "epoch": 5.378787878787879, + "grad_norm": 4.275173664093018, + "learning_rate": 9.242424242424242e-05, + "loss": 0.0565, + "step": 2130 + }, + { + "epoch": 5.404040404040404, + "grad_norm": 3.319251537322998, + "learning_rate": 9.191919191919192e-05, + "loss": 0.0687, + "step": 2140 + }, + { + "epoch": 5.429292929292929, + "grad_norm": 0.067157082259655, + "learning_rate": 9.141414141414141e-05, + "loss": 0.0507, + "step": 2150 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.18047641217708588, + "learning_rate": 9.090909090909092e-05, + "loss": 0.0555, + "step": 2160 + }, + { + "epoch": 5.47979797979798, + "grad_norm": 0.0075127603486180305, + "learning_rate": 9.040404040404041e-05, + "loss": 0.0488, + "step": 2170 + }, + { + "epoch": 5.505050505050505, + "grad_norm": 0.01690557599067688, + "learning_rate": 8.98989898989899e-05, + "loss": 0.0626, + "step": 2180 + }, + { + "epoch": 5.53030303030303, + "grad_norm": 0.005741783883422613, + "learning_rate": 8.93939393939394e-05, + "loss": 0.0014, + "step": 2190 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.05627870187163353, + "learning_rate": 8.888888888888889e-05, + "loss": 0.0242, + "step": 2200 + }, + { + "epoch": 5.555555555555555, + "eval_accuracy": 0.9749328558639212, + "eval_loss": 0.11822798103094101, + "eval_runtime": 73.1232, + "eval_samples_per_second": 15.276, + "eval_steps_per_second": 1.915, + "step": 2200 + }, + { + "epoch": 5.58080808080808, + "grad_norm": 0.012817220762372017, + "learning_rate": 8.83838383838384e-05, + "loss": 0.0277, + "step": 2210 + }, + { + "epoch": 5.606060606060606, + "grad_norm": 0.00884329341351986, + "learning_rate": 8.787878787878789e-05, + "loss": 0.0067, + "step": 2220 + }, + { + "epoch": 5.6313131313131315, + "grad_norm": 0.034603264182806015, + "learning_rate": 8.737373737373738e-05, + "loss": 0.0702, + "step": 2230 + }, + { + "epoch": 5.656565656565657, + "grad_norm": 0.0622437559068203, + "learning_rate": 8.686868686868688e-05, + "loss": 0.0171, + "step": 2240 + }, + { + "epoch": 5.681818181818182, + "grad_norm": 0.04042644053697586, + "learning_rate": 8.636363636363637e-05, + "loss": 0.0592, + "step": 2250 + }, + { + "epoch": 5.707070707070707, + "grad_norm": 0.04215148836374283, + "learning_rate": 8.585858585858586e-05, + "loss": 0.0761, + "step": 2260 + }, + { + "epoch": 5.732323232323233, + "grad_norm": 0.22815492749214172, + "learning_rate": 8.535353535353535e-05, + "loss": 0.0133, + "step": 2270 + }, + { + "epoch": 5.757575757575758, + "grad_norm": 0.3139846622943878, + "learning_rate": 8.484848484848486e-05, + "loss": 0.0013, + "step": 2280 + }, + { + "epoch": 5.782828282828283, + "grad_norm": 0.008748591877520084, + "learning_rate": 8.434343434343435e-05, + "loss": 0.036, + "step": 2290 + }, + { + "epoch": 5.808080808080808, + "grad_norm": 0.10703355818986893, + "learning_rate": 8.383838383838384e-05, + "loss": 0.0715, + "step": 2300 + }, + { + "epoch": 5.808080808080808, + "eval_accuracy": 0.9650850492390332, + "eval_loss": 0.12497912347316742, + "eval_runtime": 72.9451, + "eval_samples_per_second": 15.313, + "eval_steps_per_second": 1.919, + "step": 2300 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.02993335947394371, + "learning_rate": 8.333333333333334e-05, + "loss": 0.017, + "step": 2310 + }, + { + "epoch": 5.858585858585858, + "grad_norm": 0.004180525429546833, + "learning_rate": 8.282828282828283e-05, + "loss": 0.0388, + "step": 2320 + }, + { + "epoch": 5.883838383838384, + "grad_norm": 0.0341310054063797, + "learning_rate": 8.232323232323233e-05, + "loss": 0.0193, + "step": 2330 + }, + { + "epoch": 5.909090909090909, + "grad_norm": 0.02368093468248844, + "learning_rate": 8.181818181818183e-05, + "loss": 0.0314, + "step": 2340 + }, + { + "epoch": 5.934343434343434, + "grad_norm": 0.01623358018696308, + "learning_rate": 8.131313131313132e-05, + "loss": 0.0578, + "step": 2350 + }, + { + "epoch": 5.959595959595959, + "grad_norm": 0.006059895269572735, + "learning_rate": 8.080808080808081e-05, + "loss": 0.0066, + "step": 2360 + }, + { + "epoch": 5.984848484848484, + "grad_norm": 0.024945911020040512, + "learning_rate": 8.03030303030303e-05, + "loss": 0.0032, + "step": 2370 + }, + { + "epoch": 6.01010101010101, + "grad_norm": 0.010317071340978146, + "learning_rate": 7.97979797979798e-05, + "loss": 0.0047, + "step": 2380 + }, + { + "epoch": 6.0353535353535355, + "grad_norm": 0.4775066673755646, + "learning_rate": 7.92929292929293e-05, + "loss": 0.0193, + "step": 2390 + }, + { + "epoch": 6.0606060606060606, + "grad_norm": 6.233785629272461, + "learning_rate": 7.878787878787879e-05, + "loss": 0.0407, + "step": 2400 + }, + { + "epoch": 6.0606060606060606, + "eval_accuracy": 0.9695613249776186, + "eval_loss": 0.11715386807918549, + "eval_runtime": 73.3488, + "eval_samples_per_second": 15.229, + "eval_steps_per_second": 1.909, + "step": 2400 + }, + { + "epoch": 6.085858585858586, + "grad_norm": 0.04230092465877533, + "learning_rate": 7.828282828282829e-05, + "loss": 0.0028, + "step": 2410 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.0015748771838843822, + "learning_rate": 7.777777777777778e-05, + "loss": 0.0421, + "step": 2420 + }, + { + "epoch": 6.136363636363637, + "grad_norm": 0.00564368162304163, + "learning_rate": 7.727272727272727e-05, + "loss": 0.0631, + "step": 2430 + }, + { + "epoch": 6.161616161616162, + "grad_norm": 0.4366774559020996, + "learning_rate": 7.676767676767676e-05, + "loss": 0.0429, + "step": 2440 + }, + { + "epoch": 6.186868686868687, + "grad_norm": 0.6611001491546631, + "learning_rate": 7.626262626262627e-05, + "loss": 0.0901, + "step": 2450 + }, + { + "epoch": 6.212121212121212, + "grad_norm": 5.706575870513916, + "learning_rate": 7.575757575757576e-05, + "loss": 0.0857, + "step": 2460 + }, + { + "epoch": 6.237373737373737, + "grad_norm": 0.007969530299305916, + "learning_rate": 7.525252525252525e-05, + "loss": 0.0227, + "step": 2470 + }, + { + "epoch": 6.262626262626263, + "grad_norm": 0.28915736079216003, + "learning_rate": 7.474747474747475e-05, + "loss": 0.0113, + "step": 2480 + }, + { + "epoch": 6.287878787878788, + "grad_norm": 0.2088274508714676, + "learning_rate": 7.424242424242424e-05, + "loss": 0.0026, + "step": 2490 + }, + { + "epoch": 6.313131313131313, + "grad_norm": 0.004980772268027067, + "learning_rate": 7.373737373737373e-05, + "loss": 0.0003, + "step": 2500 + }, + { + "epoch": 6.313131313131313, + "eval_accuracy": 0.9785138764547896, + "eval_loss": 0.08193562924861908, + "eval_runtime": 73.1145, + "eval_samples_per_second": 15.277, + "eval_steps_per_second": 1.915, + "step": 2500 + }, + { + "epoch": 6.338383838383838, + "grad_norm": 0.001987410243600607, + "learning_rate": 7.323232323232324e-05, + "loss": 0.0383, + "step": 2510 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 1.1499226093292236, + "learning_rate": 7.272727272727273e-05, + "loss": 0.0171, + "step": 2520 + }, + { + "epoch": 6.388888888888889, + "grad_norm": 0.03895330801606178, + "learning_rate": 7.222222222222222e-05, + "loss": 0.0127, + "step": 2530 + }, + { + "epoch": 6.414141414141414, + "grad_norm": 0.3166453540325165, + "learning_rate": 7.171717171717171e-05, + "loss": 0.0278, + "step": 2540 + }, + { + "epoch": 6.4393939393939394, + "grad_norm": 0.005140668712556362, + "learning_rate": 7.121212121212121e-05, + "loss": 0.0795, + "step": 2550 + }, + { + "epoch": 6.4646464646464645, + "grad_norm": 14.462100982666016, + "learning_rate": 7.07070707070707e-05, + "loss": 0.085, + "step": 2560 + }, + { + "epoch": 6.48989898989899, + "grad_norm": 0.24089215695858002, + "learning_rate": 7.020202020202021e-05, + "loss": 0.0026, + "step": 2570 + }, + { + "epoch": 6.515151515151516, + "grad_norm": 0.22834239900112152, + "learning_rate": 6.96969696969697e-05, + "loss": 0.005, + "step": 2580 + }, + { + "epoch": 6.540404040404041, + "grad_norm": 8.35010814666748, + "learning_rate": 6.91919191919192e-05, + "loss": 0.0728, + "step": 2590 + }, + { + "epoch": 6.565656565656566, + "grad_norm": 4.920100212097168, + "learning_rate": 6.86868686868687e-05, + "loss": 0.0072, + "step": 2600 + }, + { + "epoch": 6.565656565656566, + "eval_accuracy": 0.9713518352730528, + "eval_loss": 0.14060670137405396, + "eval_runtime": 73.0266, + "eval_samples_per_second": 15.296, + "eval_steps_per_second": 1.917, + "step": 2600 + }, + { + "epoch": 6.590909090909091, + "grad_norm": 0.23918700218200684, + "learning_rate": 6.818181818181818e-05, + "loss": 0.0821, + "step": 2610 + }, + { + "epoch": 6.616161616161616, + "grad_norm": 0.06384919583797455, + "learning_rate": 6.767676767676769e-05, + "loss": 0.0761, + "step": 2620 + }, + { + "epoch": 6.641414141414142, + "grad_norm": 0.4447100758552551, + "learning_rate": 6.717171717171718e-05, + "loss": 0.0139, + "step": 2630 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.0030958615243434906, + "learning_rate": 6.666666666666667e-05, + "loss": 0.0341, + "step": 2640 + }, + { + "epoch": 6.691919191919192, + "grad_norm": 0.05117692053318024, + "learning_rate": 6.616161616161617e-05, + "loss": 0.0152, + "step": 2650 + }, + { + "epoch": 6.717171717171717, + "grad_norm": 0.003273693146184087, + "learning_rate": 6.565656565656566e-05, + "loss": 0.0314, + "step": 2660 + }, + { + "epoch": 6.742424242424242, + "grad_norm": 0.005075991619378328, + "learning_rate": 6.515151515151516e-05, + "loss": 0.0164, + "step": 2670 + }, + { + "epoch": 6.767676767676767, + "grad_norm": 0.23585616052150726, + "learning_rate": 6.464646464646466e-05, + "loss": 0.0139, + "step": 2680 + }, + { + "epoch": 6.792929292929293, + "grad_norm": 6.123977184295654, + "learning_rate": 6.414141414141415e-05, + "loss": 0.0113, + "step": 2690 + }, + { + "epoch": 6.818181818181818, + "grad_norm": 2.395871162414551, + "learning_rate": 6.363636363636364e-05, + "loss": 0.0183, + "step": 2700 + }, + { + "epoch": 6.818181818181818, + "eval_accuracy": 0.9749328558639212, + "eval_loss": 0.11515188962221146, + "eval_runtime": 73.0277, + "eval_samples_per_second": 15.296, + "eval_steps_per_second": 1.917, + "step": 2700 + }, + { + "epoch": 6.843434343434343, + "grad_norm": 0.005218807607889175, + "learning_rate": 6.313131313131313e-05, + "loss": 0.003, + "step": 2710 + }, + { + "epoch": 6.8686868686868685, + "grad_norm": 0.0012497535208240151, + "learning_rate": 6.262626262626264e-05, + "loss": 0.0116, + "step": 2720 + }, + { + "epoch": 6.893939393939394, + "grad_norm": 0.0025018516462296247, + "learning_rate": 6.212121212121213e-05, + "loss": 0.005, + "step": 2730 + }, + { + "epoch": 6.91919191919192, + "grad_norm": 0.005596707109361887, + "learning_rate": 6.161616161616162e-05, + "loss": 0.037, + "step": 2740 + }, + { + "epoch": 6.944444444444445, + "grad_norm": 0.0010910239070653915, + "learning_rate": 6.111111111111112e-05, + "loss": 0.0338, + "step": 2750 + }, + { + "epoch": 6.96969696969697, + "grad_norm": 0.6075408458709717, + "learning_rate": 6.060606060606061e-05, + "loss": 0.0268, + "step": 2760 + }, + { + "epoch": 6.994949494949495, + "grad_norm": 0.25022584199905396, + "learning_rate": 6.01010101010101e-05, + "loss": 0.0125, + "step": 2770 + }, + { + "epoch": 7.02020202020202, + "grad_norm": 0.12169167399406433, + "learning_rate": 5.959595959595959e-05, + "loss": 0.0082, + "step": 2780 + }, + { + "epoch": 7.045454545454546, + "grad_norm": 3.5715599060058594, + "learning_rate": 5.90909090909091e-05, + "loss": 0.0144, + "step": 2790 + }, + { + "epoch": 7.070707070707071, + "grad_norm": 0.09293267875909805, + "learning_rate": 5.858585858585859e-05, + "loss": 0.0021, + "step": 2800 + }, + { + "epoch": 7.070707070707071, + "eval_accuracy": 0.973142345568487, + "eval_loss": 0.13676650822162628, + "eval_runtime": 72.9405, + "eval_samples_per_second": 15.314, + "eval_steps_per_second": 1.919, + "step": 2800 + }, + { + "epoch": 7.095959595959596, + "grad_norm": 0.009541651234030724, + "learning_rate": 5.808080808080808e-05, + "loss": 0.0058, + "step": 2810 + }, + { + "epoch": 7.121212121212121, + "grad_norm": 0.0016315419925376773, + "learning_rate": 5.757575757575758e-05, + "loss": 0.0064, + "step": 2820 + }, + { + "epoch": 7.146464646464646, + "grad_norm": 10.356843948364258, + "learning_rate": 5.707070707070707e-05, + "loss": 0.0595, + "step": 2830 + }, + { + "epoch": 7.171717171717171, + "grad_norm": 0.0018419253174215555, + "learning_rate": 5.6565656565656563e-05, + "loss": 0.016, + "step": 2840 + }, + { + "epoch": 7.196969696969697, + "grad_norm": 0.010135513730347157, + "learning_rate": 5.606060606060606e-05, + "loss": 0.052, + "step": 2850 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 6.740849494934082, + "learning_rate": 5.555555555555556e-05, + "loss": 0.0374, + "step": 2860 + }, + { + "epoch": 7.247474747474747, + "grad_norm": 0.4412079155445099, + "learning_rate": 5.5050505050505056e-05, + "loss": 0.0117, + "step": 2870 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 0.001609967672266066, + "learning_rate": 5.4545454545454546e-05, + "loss": 0.0824, + "step": 2880 + }, + { + "epoch": 7.297979797979798, + "grad_norm": 0.005415134131908417, + "learning_rate": 5.4040404040404044e-05, + "loss": 0.0177, + "step": 2890 + }, + { + "epoch": 7.3232323232323235, + "grad_norm": 0.02915014885365963, + "learning_rate": 5.353535353535354e-05, + "loss": 0.046, + "step": 2900 + }, + { + "epoch": 7.3232323232323235, + "eval_accuracy": 0.9794091316025068, + "eval_loss": 0.09002197533845901, + "eval_runtime": 73.1136, + "eval_samples_per_second": 15.278, + "eval_steps_per_second": 1.915, + "step": 2900 + }, + { + "epoch": 7.348484848484849, + "grad_norm": 0.020192056894302368, + "learning_rate": 5.303030303030303e-05, + "loss": 0.0004, + "step": 2910 + }, + { + "epoch": 7.373737373737374, + "grad_norm": 0.7057023644447327, + "learning_rate": 5.2525252525252536e-05, + "loss": 0.0699, + "step": 2920 + }, + { + "epoch": 7.398989898989899, + "grad_norm": 0.0018105951603502035, + "learning_rate": 5.2020202020202026e-05, + "loss": 0.0379, + "step": 2930 + }, + { + "epoch": 7.424242424242424, + "grad_norm": 0.002236352302134037, + "learning_rate": 5.151515151515152e-05, + "loss": 0.0576, + "step": 2940 + }, + { + "epoch": 7.44949494949495, + "grad_norm": 0.46005484461784363, + "learning_rate": 5.101010101010101e-05, + "loss": 0.0007, + "step": 2950 + }, + { + "epoch": 7.474747474747475, + "grad_norm": 0.17090271413326263, + "learning_rate": 5.050505050505051e-05, + "loss": 0.0066, + "step": 2960 + }, + { + "epoch": 7.5, + "grad_norm": 0.002259742235764861, + "learning_rate": 5e-05, + "loss": 0.0043, + "step": 2970 + }, + { + "epoch": 7.525252525252525, + "grad_norm": 0.0029255333356559277, + "learning_rate": 4.94949494949495e-05, + "loss": 0.0239, + "step": 2980 + }, + { + "epoch": 7.55050505050505, + "grad_norm": 2.9925894737243652, + "learning_rate": 4.898989898989899e-05, + "loss": 0.0063, + "step": 2990 + }, + { + "epoch": 7.575757575757576, + "grad_norm": 0.052914004772901535, + "learning_rate": 4.848484848484849e-05, + "loss": 0.033, + "step": 3000 + }, + { + "epoch": 7.575757575757576, + "eval_accuracy": 0.9785138764547896, + "eval_loss": 0.10143210738897324, + "eval_runtime": 73.4907, + "eval_samples_per_second": 15.199, + "eval_steps_per_second": 1.905, + "step": 3000 + }, + { + "epoch": 7.601010101010101, + "grad_norm": 0.04058058559894562, + "learning_rate": 4.797979797979798e-05, + "loss": 0.0245, + "step": 3010 + }, + { + "epoch": 7.626262626262626, + "grad_norm": 0.03967829421162605, + "learning_rate": 4.7474747474747476e-05, + "loss": 0.0006, + "step": 3020 + }, + { + "epoch": 7.651515151515151, + "grad_norm": 0.621035635471344, + "learning_rate": 4.696969696969697e-05, + "loss": 0.0175, + "step": 3030 + }, + { + "epoch": 7.6767676767676765, + "grad_norm": 0.36977216601371765, + "learning_rate": 4.6464646464646464e-05, + "loss": 0.0388, + "step": 3040 + }, + { + "epoch": 7.702020202020202, + "grad_norm": 3.2532241344451904, + "learning_rate": 4.595959595959596e-05, + "loss": 0.0905, + "step": 3050 + }, + { + "epoch": 7.7272727272727275, + "grad_norm": 0.004156060051172972, + "learning_rate": 4.545454545454546e-05, + "loss": 0.0002, + "step": 3060 + }, + { + "epoch": 7.752525252525253, + "grad_norm": 0.6550003290176392, + "learning_rate": 4.494949494949495e-05, + "loss": 0.0066, + "step": 3070 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.0028251020703464746, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.0083, + "step": 3080 + }, + { + "epoch": 7.803030303030303, + "grad_norm": 0.008767428807914257, + "learning_rate": 4.3939393939393944e-05, + "loss": 0.0006, + "step": 3090 + }, + { + "epoch": 7.828282828282829, + "grad_norm": 0.04811250418424606, + "learning_rate": 4.343434343434344e-05, + "loss": 0.0354, + "step": 3100 + }, + { + "epoch": 7.828282828282829, + "eval_accuracy": 0.9767233661593554, + "eval_loss": 0.09683331102132797, + "eval_runtime": 73.2348, + "eval_samples_per_second": 15.252, + "eval_steps_per_second": 1.912, + "step": 3100 + }, + { + "epoch": 7.853535353535354, + "grad_norm": 0.00525275431573391, + "learning_rate": 4.292929292929293e-05, + "loss": 0.0088, + "step": 3110 + }, + { + "epoch": 7.878787878787879, + "grad_norm": 0.015972474589943886, + "learning_rate": 4.242424242424243e-05, + "loss": 0.0011, + "step": 3120 + }, + { + "epoch": 7.904040404040404, + "grad_norm": 0.006997071672230959, + "learning_rate": 4.191919191919192e-05, + "loss": 0.0017, + "step": 3130 + }, + { + "epoch": 7.929292929292929, + "grad_norm": 0.023101719096302986, + "learning_rate": 4.141414141414142e-05, + "loss": 0.0567, + "step": 3140 + }, + { + "epoch": 7.954545454545455, + "grad_norm": 0.003169642062857747, + "learning_rate": 4.0909090909090915e-05, + "loss": 0.1026, + "step": 3150 + }, + { + "epoch": 7.97979797979798, + "grad_norm": 0.003613903187215328, + "learning_rate": 4.0404040404040405e-05, + "loss": 0.005, + "step": 3160 + }, + { + "epoch": 8.005050505050505, + "grad_norm": 1.0490131378173828, + "learning_rate": 3.98989898989899e-05, + "loss": 0.0023, + "step": 3170 + }, + { + "epoch": 8.030303030303031, + "grad_norm": 0.003916851244866848, + "learning_rate": 3.939393939393939e-05, + "loss": 0.0023, + "step": 3180 + }, + { + "epoch": 8.055555555555555, + "grad_norm": 0.016336582601070404, + "learning_rate": 3.888888888888889e-05, + "loss": 0.0079, + "step": 3190 + }, + { + "epoch": 8.080808080808081, + "grad_norm": 0.8970369696617126, + "learning_rate": 3.838383838383838e-05, + "loss": 0.0026, + "step": 3200 + }, + { + "epoch": 8.080808080808081, + "eval_accuracy": 0.973142345568487, + "eval_loss": 0.1217464730143547, + "eval_runtime": 73.5035, + "eval_samples_per_second": 15.197, + "eval_steps_per_second": 1.905, + "step": 3200 + }, + { + "epoch": 8.106060606060606, + "grad_norm": 0.03298179805278778, + "learning_rate": 3.787878787878788e-05, + "loss": 0.0051, + "step": 3210 + }, + { + "epoch": 8.131313131313131, + "grad_norm": 0.5918856263160706, + "learning_rate": 3.7373737373737376e-05, + "loss": 0.032, + "step": 3220 + }, + { + "epoch": 8.156565656565657, + "grad_norm": 0.0031904878560453653, + "learning_rate": 3.686868686868687e-05, + "loss": 0.029, + "step": 3230 + }, + { + "epoch": 8.181818181818182, + "grad_norm": 0.043024152517318726, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.0003, + "step": 3240 + }, + { + "epoch": 8.207070707070708, + "grad_norm": 0.011919928714632988, + "learning_rate": 3.5858585858585855e-05, + "loss": 0.0028, + "step": 3250 + }, + { + "epoch": 8.232323232323232, + "grad_norm": 0.007164669223129749, + "learning_rate": 3.535353535353535e-05, + "loss": 0.0146, + "step": 3260 + }, + { + "epoch": 8.257575757575758, + "grad_norm": 0.03415270894765854, + "learning_rate": 3.484848484848485e-05, + "loss": 0.0041, + "step": 3270 + }, + { + "epoch": 8.282828282828282, + "grad_norm": 0.03534342721104622, + "learning_rate": 3.434343434343435e-05, + "loss": 0.0035, + "step": 3280 + }, + { + "epoch": 8.308080808080808, + "grad_norm": 0.3735661804676056, + "learning_rate": 3.3838383838383844e-05, + "loss": 0.0745, + "step": 3290 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 0.0013512909645214677, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0002, + "step": 3300 + }, + { + "epoch": 8.333333333333334, + "eval_accuracy": 0.9794091316025068, + "eval_loss": 0.08283615112304688, + "eval_runtime": 73.1651, + "eval_samples_per_second": 15.267, + "eval_steps_per_second": 1.913, + "step": 3300 + }, + { + "epoch": 8.358585858585858, + "grad_norm": 0.023621654137969017, + "learning_rate": 3.282828282828283e-05, + "loss": 0.0174, + "step": 3310 + }, + { + "epoch": 8.383838383838384, + "grad_norm": 0.006960035767406225, + "learning_rate": 3.232323232323233e-05, + "loss": 0.0004, + "step": 3320 + }, + { + "epoch": 8.409090909090908, + "grad_norm": 0.0008190835942514241, + "learning_rate": 3.181818181818182e-05, + "loss": 0.0374, + "step": 3330 + }, + { + "epoch": 8.434343434343434, + "grad_norm": 0.016193361952900887, + "learning_rate": 3.131313131313132e-05, + "loss": 0.0007, + "step": 3340 + }, + { + "epoch": 8.45959595959596, + "grad_norm": 0.2075665146112442, + "learning_rate": 3.080808080808081e-05, + "loss": 0.0422, + "step": 3350 + }, + { + "epoch": 8.484848484848484, + "grad_norm": 0.009178784675896168, + "learning_rate": 3.0303030303030306e-05, + "loss": 0.0332, + "step": 3360 + }, + { + "epoch": 8.51010101010101, + "grad_norm": 8.036938667297363, + "learning_rate": 2.9797979797979796e-05, + "loss": 0.0436, + "step": 3370 + }, + { + "epoch": 8.535353535353535, + "grad_norm": 0.0013093262678012252, + "learning_rate": 2.9292929292929294e-05, + "loss": 0.0109, + "step": 3380 + }, + { + "epoch": 8.56060606060606, + "grad_norm": 0.0033100605942308903, + "learning_rate": 2.878787878787879e-05, + "loss": 0.0011, + "step": 3390 + }, + { + "epoch": 8.585858585858587, + "grad_norm": 0.0015343882841989398, + "learning_rate": 2.8282828282828282e-05, + "loss": 0.0006, + "step": 3400 + }, + { + "epoch": 8.585858585858587, + "eval_accuracy": 0.9794091316025068, + "eval_loss": 0.09259337186813354, + "eval_runtime": 72.8639, + "eval_samples_per_second": 15.33, + "eval_steps_per_second": 1.921, + "step": 3400 + }, + { + "epoch": 8.61111111111111, + "grad_norm": 0.030406756326556206, + "learning_rate": 2.777777777777778e-05, + "loss": 0.0026, + "step": 3410 + }, + { + "epoch": 8.636363636363637, + "grad_norm": 0.0022419544402509928, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.0007, + "step": 3420 + }, + { + "epoch": 8.66161616161616, + "grad_norm": 0.0011131414212286472, + "learning_rate": 2.676767676767677e-05, + "loss": 0.0006, + "step": 3430 + }, + { + "epoch": 8.686868686868687, + "grad_norm": 0.005616435315459967, + "learning_rate": 2.6262626262626268e-05, + "loss": 0.0003, + "step": 3440 + }, + { + "epoch": 8.712121212121213, + "grad_norm": 0.1008942499756813, + "learning_rate": 2.575757575757576e-05, + "loss": 0.0097, + "step": 3450 + }, + { + "epoch": 8.737373737373737, + "grad_norm": 0.002821123693138361, + "learning_rate": 2.5252525252525256e-05, + "loss": 0.0669, + "step": 3460 + }, + { + "epoch": 8.762626262626263, + "grad_norm": 0.013286658562719822, + "learning_rate": 2.474747474747475e-05, + "loss": 0.0265, + "step": 3470 + }, + { + "epoch": 8.787878787878787, + "grad_norm": 0.003963208291679621, + "learning_rate": 2.4242424242424244e-05, + "loss": 0.0178, + "step": 3480 + }, + { + "epoch": 8.813131313131313, + "grad_norm": 0.002018690574914217, + "learning_rate": 2.3737373737373738e-05, + "loss": 0.0082, + "step": 3490 + }, + { + "epoch": 8.83838383838384, + "grad_norm": 0.1014542207121849, + "learning_rate": 2.3232323232323232e-05, + "loss": 0.0006, + "step": 3500 + }, + { + "epoch": 8.83838383838384, + "eval_accuracy": 0.9794091316025068, + "eval_loss": 0.10012003779411316, + "eval_runtime": 73.1859, + "eval_samples_per_second": 15.263, + "eval_steps_per_second": 1.913, + "step": 3500 + }, + { + "epoch": 8.863636363636363, + "grad_norm": 0.002746024401858449, + "learning_rate": 2.272727272727273e-05, + "loss": 0.0063, + "step": 3510 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 0.0018340348033234477, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.0024, + "step": 3520 + }, + { + "epoch": 8.914141414141413, + "grad_norm": 0.004108617547899485, + "learning_rate": 2.171717171717172e-05, + "loss": 0.0083, + "step": 3530 + }, + { + "epoch": 8.93939393939394, + "grad_norm": 0.00315410690382123, + "learning_rate": 2.1212121212121215e-05, + "loss": 0.0462, + "step": 3540 + }, + { + "epoch": 8.964646464646465, + "grad_norm": 0.024781817570328712, + "learning_rate": 2.070707070707071e-05, + "loss": 0.0029, + "step": 3550 + }, + { + "epoch": 8.98989898989899, + "grad_norm": 0.005382045172154903, + "learning_rate": 2.0202020202020203e-05, + "loss": 0.0047, + "step": 3560 + }, + { + "epoch": 9.015151515151516, + "grad_norm": 1.6344341039657593, + "learning_rate": 1.9696969696969697e-05, + "loss": 0.0038, + "step": 3570 + }, + { + "epoch": 9.04040404040404, + "grad_norm": 0.010318132117390633, + "learning_rate": 1.919191919191919e-05, + "loss": 0.0096, + "step": 3580 + }, + { + "epoch": 9.065656565656566, + "grad_norm": 0.0016402292530983686, + "learning_rate": 1.8686868686868688e-05, + "loss": 0.0321, + "step": 3590 + }, + { + "epoch": 9.090909090909092, + "grad_norm": 0.004027374088764191, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.0006, + "step": 3600 + }, + { + "epoch": 9.090909090909092, + "eval_accuracy": 0.9847806624888094, + "eval_loss": 0.08629997074604034, + "eval_runtime": 73.127, + "eval_samples_per_second": 15.275, + "eval_steps_per_second": 1.914, + "step": 3600 + }, + { + "epoch": 9.116161616161616, + "grad_norm": 0.0007902685320004821, + "learning_rate": 1.7676767676767676e-05, + "loss": 0.0059, + "step": 3610 + }, + { + "epoch": 9.141414141414142, + "grad_norm": 0.0024135063868016005, + "learning_rate": 1.7171717171717173e-05, + "loss": 0.0269, + "step": 3620 + }, + { + "epoch": 9.166666666666666, + "grad_norm": 0.026507705450057983, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0003, + "step": 3630 + }, + { + "epoch": 9.191919191919192, + "grad_norm": 0.10678762197494507, + "learning_rate": 1.6161616161616165e-05, + "loss": 0.0059, + "step": 3640 + }, + { + "epoch": 9.217171717171718, + "grad_norm": 0.08362487703561783, + "learning_rate": 1.565656565656566e-05, + "loss": 0.0545, + "step": 3650 + }, + { + "epoch": 9.242424242424242, + "grad_norm": 0.002414940157905221, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.0221, + "step": 3660 + }, + { + "epoch": 9.267676767676768, + "grad_norm": 0.0013868235982954502, + "learning_rate": 1.4646464646464647e-05, + "loss": 0.0005, + "step": 3670 + }, + { + "epoch": 9.292929292929292, + "grad_norm": 0.0013921884819865227, + "learning_rate": 1.4141414141414141e-05, + "loss": 0.041, + "step": 3680 + }, + { + "epoch": 9.318181818181818, + "grad_norm": 0.08867702633142471, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.026, + "step": 3690 + }, + { + "epoch": 9.343434343434343, + "grad_norm": 0.0012104762718081474, + "learning_rate": 1.3131313131313134e-05, + "loss": 0.0633, + "step": 3700 + }, + { + "epoch": 9.343434343434343, + "eval_accuracy": 0.9803043867502238, + "eval_loss": 0.09109070897102356, + "eval_runtime": 71.4974, + "eval_samples_per_second": 15.623, + "eval_steps_per_second": 1.958, + "step": 3700 + }, + { + "epoch": 9.368686868686869, + "grad_norm": 0.007544935215264559, + "learning_rate": 1.2626262626262628e-05, + "loss": 0.002, + "step": 3710 + }, + { + "epoch": 9.393939393939394, + "grad_norm": 0.01898648589849472, + "learning_rate": 1.2121212121212122e-05, + "loss": 0.0005, + "step": 3720 + }, + { + "epoch": 9.419191919191919, + "grad_norm": 0.00644712382927537, + "learning_rate": 1.1616161616161616e-05, + "loss": 0.0059, + "step": 3730 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 0.00872492603957653, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.0011, + "step": 3740 + }, + { + "epoch": 9.469696969696969, + "grad_norm": 1.6075825691223145, + "learning_rate": 1.0606060606060607e-05, + "loss": 0.0099, + "step": 3750 + }, + { + "epoch": 9.494949494949495, + "grad_norm": 6.320465087890625, + "learning_rate": 1.0101010101010101e-05, + "loss": 0.0163, + "step": 3760 + }, + { + "epoch": 9.52020202020202, + "grad_norm": 0.0037208800204098225, + "learning_rate": 9.595959595959595e-06, + "loss": 0.0002, + "step": 3770 + }, + { + "epoch": 9.545454545454545, + "grad_norm": 3.3599369525909424, + "learning_rate": 9.090909090909091e-06, + "loss": 0.0053, + "step": 3780 + }, + { + "epoch": 9.570707070707071, + "grad_norm": 0.5879691243171692, + "learning_rate": 8.585858585858587e-06, + "loss": 0.0019, + "step": 3790 + }, + { + "epoch": 9.595959595959595, + "grad_norm": 0.26342862844467163, + "learning_rate": 8.080808080808082e-06, + "loss": 0.0009, + "step": 3800 + }, + { + "epoch": 9.595959595959595, + "eval_accuracy": 0.982094897045658, + "eval_loss": 0.09413682669401169, + "eval_runtime": 73.1451, + "eval_samples_per_second": 15.271, + "eval_steps_per_second": 1.914, + "step": 3800 + }, + { + "epoch": 9.621212121212121, + "grad_norm": 0.042649831622838974, + "learning_rate": 7.5757575757575764e-06, + "loss": 0.0226, + "step": 3810 + }, + { + "epoch": 9.646464646464647, + "grad_norm": 0.0022528120316565037, + "learning_rate": 7.0707070707070704e-06, + "loss": 0.0136, + "step": 3820 + }, + { + "epoch": 9.671717171717171, + "grad_norm": 0.12108311802148819, + "learning_rate": 6.565656565656567e-06, + "loss": 0.0408, + "step": 3830 + }, + { + "epoch": 9.696969696969697, + "grad_norm": 0.7086867690086365, + "learning_rate": 6.060606060606061e-06, + "loss": 0.0035, + "step": 3840 + }, + { + "epoch": 9.722222222222221, + "grad_norm": 0.049748744815588, + "learning_rate": 5.555555555555556e-06, + "loss": 0.0012, + "step": 3850 + }, + { + "epoch": 9.747474747474747, + "grad_norm": 0.004345474299043417, + "learning_rate": 5.050505050505051e-06, + "loss": 0.0002, + "step": 3860 + }, + { + "epoch": 9.772727272727273, + "grad_norm": 0.005164165981113911, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.0049, + "step": 3870 + }, + { + "epoch": 9.797979797979798, + "grad_norm": 0.003518365090712905, + "learning_rate": 4.040404040404041e-06, + "loss": 0.002, + "step": 3880 + }, + { + "epoch": 9.823232323232324, + "grad_norm": 0.0017797194886952639, + "learning_rate": 3.5353535353535352e-06, + "loss": 0.0005, + "step": 3890 + }, + { + "epoch": 9.848484848484848, + "grad_norm": 4.788568496704102, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.0247, + "step": 3900 + }, + { + "epoch": 9.848484848484848, + "eval_accuracy": 0.9785138764547896, + "eval_loss": 0.09876807779073715, + "eval_runtime": 73.1729, + "eval_samples_per_second": 15.265, + "eval_steps_per_second": 1.913, + "step": 3900 + }, + { + "epoch": 9.873737373737374, + "grad_norm": 0.0013341947924345732, + "learning_rate": 2.5252525252525253e-06, + "loss": 0.0082, + "step": 3910 + }, + { + "epoch": 9.8989898989899, + "grad_norm": 0.004278136417269707, + "learning_rate": 2.0202020202020206e-06, + "loss": 0.0019, + "step": 3920 + }, + { + "epoch": 9.924242424242424, + "grad_norm": 0.002301498083397746, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.0245, + "step": 3930 + }, + { + "epoch": 9.94949494949495, + "grad_norm": 0.000858976156450808, + "learning_rate": 1.0101010101010103e-06, + "loss": 0.0013, + "step": 3940 + }, + { + "epoch": 9.974747474747474, + "grad_norm": 0.007369679398834705, + "learning_rate": 5.050505050505052e-07, + "loss": 0.0774, + "step": 3950 + }, + { + "epoch": 10.0, + "grad_norm": 0.008844327181577682, + "learning_rate": 0.0, + "loss": 0.0004, + "step": 3960 + }, + { + "epoch": 10.0, + "step": 3960, + "total_flos": 4.904158054749069e+18, + "train_loss": 0.06213315485569771, + "train_runtime": 7084.9204, + "train_samples_per_second": 8.93, + "train_steps_per_second": 0.559 } ], "logging_steps": 10, - "max_steps": 1560, + "max_steps": 3960, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { @@ -1262,7 +3158,7 @@ "attributes": {} } }, - "total_flos": 1.9301704773202575e+18, + "total_flos": 4.904158054749069e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null