{ "best_metric": 0.9731993299832495, "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-papsmear/checkpoint-1680", "epoch": 50.0, "eval_steps": 500, "global_step": 2100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.23809523809523808, "grad_norm": 1.9611310958862305, "learning_rate": 2.3809523809523808e-06, "loss": 1.4059, "step": 10 }, { "epoch": 0.47619047619047616, "grad_norm": 1.8307807445526123, "learning_rate": 4.7619047619047615e-06, "loss": 1.3406, "step": 20 }, { "epoch": 0.7142857142857143, "grad_norm": 1.8257180452346802, "learning_rate": 7.142857142857143e-06, "loss": 1.2533, "step": 30 }, { "epoch": 0.9523809523809523, "grad_norm": 1.2919795513153076, "learning_rate": 9.523809523809523e-06, "loss": 1.1553, "step": 40 }, { "epoch": 1.0, "eval_accuracy": 0.5477386934673367, "eval_loss": 1.0950442552566528, "eval_runtime": 104.5, "eval_samples_per_second": 5.713, "eval_steps_per_second": 0.182, "step": 42 }, { "epoch": 1.1904761904761905, "grad_norm": 0.9992073178291321, "learning_rate": 1.1904761904761905e-05, "loss": 1.0508, "step": 50 }, { "epoch": 1.4285714285714286, "grad_norm": 0.8946818113327026, "learning_rate": 1.4285714285714285e-05, "loss": 0.952, "step": 60 }, { "epoch": 1.6666666666666665, "grad_norm": 1.0831001996994019, "learning_rate": 1.6666666666666667e-05, "loss": 0.8539, "step": 70 }, { "epoch": 1.9047619047619047, "grad_norm": 1.271881103515625, "learning_rate": 1.9047619047619046e-05, "loss": 0.7791, "step": 80 }, { "epoch": 2.0, "eval_accuracy": 0.8525963149078727, "eval_loss": 0.6485655307769775, "eval_runtime": 107.1905, "eval_samples_per_second": 5.57, "eval_steps_per_second": 0.177, "step": 84 }, { "epoch": 2.142857142857143, "grad_norm": 1.7645951509475708, "learning_rate": 2.1428571428571428e-05, "loss": 0.6112, "step": 90 }, { "epoch": 2.380952380952381, "grad_norm": 2.271763563156128, "learning_rate": 2.380952380952381e-05, "loss": 0.5029, "step": 100 }, { "epoch": 2.619047619047619, "grad_norm": 2.248612403869629, "learning_rate": 2.6190476190476192e-05, "loss": 0.4809, "step": 110 }, { "epoch": 2.857142857142857, "grad_norm": 2.1449034214019775, "learning_rate": 2.857142857142857e-05, "loss": 0.433, "step": 120 }, { "epoch": 3.0, "eval_accuracy": 0.9128978224455612, "eval_loss": 0.3716076910495758, "eval_runtime": 107.5442, "eval_samples_per_second": 5.551, "eval_steps_per_second": 0.177, "step": 126 }, { "epoch": 3.0952380952380953, "grad_norm": 2.3766279220581055, "learning_rate": 3.095238095238095e-05, "loss": 0.3777, "step": 130 }, { "epoch": 3.3333333333333335, "grad_norm": 1.3414802551269531, "learning_rate": 3.3333333333333335e-05, "loss": 0.3424, "step": 140 }, { "epoch": 3.571428571428571, "grad_norm": 7.162674903869629, "learning_rate": 3.571428571428572e-05, "loss": 0.3719, "step": 150 }, { "epoch": 3.8095238095238093, "grad_norm": 2.170807123184204, "learning_rate": 3.809523809523809e-05, "loss": 0.3495, "step": 160 }, { "epoch": 4.0, "eval_accuracy": 0.9346733668341709, "eval_loss": 0.28690266609191895, "eval_runtime": 106.4197, "eval_samples_per_second": 5.61, "eval_steps_per_second": 0.179, "step": 168 }, { "epoch": 4.0476190476190474, "grad_norm": 1.2484022378921509, "learning_rate": 4.047619047619048e-05, "loss": 0.2892, "step": 170 }, { "epoch": 4.285714285714286, "grad_norm": 7.085641860961914, "learning_rate": 4.2857142857142856e-05, "loss": 0.3386, "step": 180 }, { "epoch": 4.523809523809524, "grad_norm": 1.4148013591766357, "learning_rate": 4.523809523809524e-05, "loss": 0.3225, "step": 190 }, { "epoch": 4.761904761904762, "grad_norm": 2.7628612518310547, "learning_rate": 4.761904761904762e-05, "loss": 0.2979, "step": 200 }, { "epoch": 5.0, "grad_norm": 1.5106964111328125, "learning_rate": 5e-05, "loss": 0.2556, "step": 210 }, { "epoch": 5.0, "eval_accuracy": 0.9279731993299832, "eval_loss": 0.27220866084098816, "eval_runtime": 106.2256, "eval_samples_per_second": 5.62, "eval_steps_per_second": 0.179, "step": 210 }, { "epoch": 5.238095238095238, "grad_norm": 4.107568740844727, "learning_rate": 4.973544973544973e-05, "loss": 0.2635, "step": 220 }, { "epoch": 5.476190476190476, "grad_norm": 2.1257033348083496, "learning_rate": 4.9470899470899475e-05, "loss": 0.2581, "step": 230 }, { "epoch": 5.714285714285714, "grad_norm": 2.4218671321868896, "learning_rate": 4.9206349206349204e-05, "loss": 0.2303, "step": 240 }, { "epoch": 5.9523809523809526, "grad_norm": 2.0036144256591797, "learning_rate": 4.894179894179895e-05, "loss": 0.2791, "step": 250 }, { "epoch": 6.0, "eval_accuracy": 0.932998324958124, "eval_loss": 0.2610774338245392, "eval_runtime": 106.375, "eval_samples_per_second": 5.612, "eval_steps_per_second": 0.179, "step": 252 }, { "epoch": 6.190476190476191, "grad_norm": 2.8978521823883057, "learning_rate": 4.8677248677248676e-05, "loss": 0.224, "step": 260 }, { "epoch": 6.428571428571429, "grad_norm": 1.3699370622634888, "learning_rate": 4.841269841269841e-05, "loss": 0.2208, "step": 270 }, { "epoch": 6.666666666666667, "grad_norm": 1.7077748775482178, "learning_rate": 4.814814814814815e-05, "loss": 0.2294, "step": 280 }, { "epoch": 6.904761904761905, "grad_norm": 2.22580885887146, "learning_rate": 4.7883597883597884e-05, "loss": 0.2343, "step": 290 }, { "epoch": 7.0, "eval_accuracy": 0.9380234505862647, "eval_loss": 0.2376711070537567, "eval_runtime": 104.2609, "eval_samples_per_second": 5.726, "eval_steps_per_second": 0.182, "step": 294 }, { "epoch": 7.142857142857143, "grad_norm": 1.8247556686401367, "learning_rate": 4.761904761904762e-05, "loss": 0.2295, "step": 300 }, { "epoch": 7.380952380952381, "grad_norm": 2.389512538909912, "learning_rate": 4.7354497354497356e-05, "loss": 0.2059, "step": 310 }, { "epoch": 7.619047619047619, "grad_norm": 1.6845765113830566, "learning_rate": 4.708994708994709e-05, "loss": 0.1938, "step": 320 }, { "epoch": 7.857142857142857, "grad_norm": 1.345927119255066, "learning_rate": 4.682539682539683e-05, "loss": 0.186, "step": 330 }, { "epoch": 8.0, "eval_accuracy": 0.9396984924623115, "eval_loss": 0.2157616764307022, "eval_runtime": 104.0067, "eval_samples_per_second": 5.74, "eval_steps_per_second": 0.183, "step": 336 }, { "epoch": 8.095238095238095, "grad_norm": 1.470657467842102, "learning_rate": 4.656084656084656e-05, "loss": 0.2077, "step": 340 }, { "epoch": 8.333333333333334, "grad_norm": 1.4709361791610718, "learning_rate": 4.62962962962963e-05, "loss": 0.1809, "step": 350 }, { "epoch": 8.571428571428571, "grad_norm": 1.731719732284546, "learning_rate": 4.603174603174603e-05, "loss": 0.2163, "step": 360 }, { "epoch": 8.80952380952381, "grad_norm": 1.8033312559127808, "learning_rate": 4.576719576719577e-05, "loss": 0.1984, "step": 370 }, { "epoch": 9.0, "eval_accuracy": 0.9346733668341709, "eval_loss": 0.22224725782871246, "eval_runtime": 104.8934, "eval_samples_per_second": 5.691, "eval_steps_per_second": 0.181, "step": 378 }, { "epoch": 9.047619047619047, "grad_norm": 1.9741649627685547, "learning_rate": 4.55026455026455e-05, "loss": 0.2093, "step": 380 }, { "epoch": 9.285714285714286, "grad_norm": 1.6360487937927246, "learning_rate": 4.523809523809524e-05, "loss": 0.1586, "step": 390 }, { "epoch": 9.523809523809524, "grad_norm": 2.772472858428955, "learning_rate": 4.4973544973544974e-05, "loss": 0.1423, "step": 400 }, { "epoch": 9.761904761904763, "grad_norm": 1.9369553327560425, "learning_rate": 4.470899470899471e-05, "loss": 0.1458, "step": 410 }, { "epoch": 10.0, "grad_norm": 2.552593231201172, "learning_rate": 4.4444444444444447e-05, "loss": 0.1751, "step": 420 }, { "epoch": 10.0, "eval_accuracy": 0.9514237855946399, "eval_loss": 0.19929908215999603, "eval_runtime": 106.1844, "eval_samples_per_second": 5.622, "eval_steps_per_second": 0.179, "step": 420 }, { "epoch": 10.238095238095237, "grad_norm": 2.9462811946868896, "learning_rate": 4.417989417989418e-05, "loss": 0.1305, "step": 430 }, { "epoch": 10.476190476190476, "grad_norm": 2.5522372722625732, "learning_rate": 4.391534391534391e-05, "loss": 0.1555, "step": 440 }, { "epoch": 10.714285714285714, "grad_norm": 1.2257236242294312, "learning_rate": 4.3650793650793655e-05, "loss": 0.1575, "step": 450 }, { "epoch": 10.952380952380953, "grad_norm": 0.9242783188819885, "learning_rate": 4.3386243386243384e-05, "loss": 0.1529, "step": 460 }, { "epoch": 11.0, "eval_accuracy": 0.9430485762144054, "eval_loss": 0.2100822627544403, "eval_runtime": 102.6058, "eval_samples_per_second": 5.818, "eval_steps_per_second": 0.185, "step": 462 }, { "epoch": 11.19047619047619, "grad_norm": 1.2224242687225342, "learning_rate": 4.312169312169313e-05, "loss": 0.1171, "step": 470 }, { "epoch": 11.428571428571429, "grad_norm": 1.332038164138794, "learning_rate": 4.2857142857142856e-05, "loss": 0.1399, "step": 480 }, { "epoch": 11.666666666666666, "grad_norm": 2.5912814140319824, "learning_rate": 4.259259259259259e-05, "loss": 0.129, "step": 490 }, { "epoch": 11.904761904761905, "grad_norm": 3.0691795349121094, "learning_rate": 4.232804232804233e-05, "loss": 0.1616, "step": 500 }, { "epoch": 12.0, "eval_accuracy": 0.9296482412060302, "eval_loss": 0.2542562484741211, "eval_runtime": 107.198, "eval_samples_per_second": 5.569, "eval_steps_per_second": 0.177, "step": 504 }, { "epoch": 12.142857142857142, "grad_norm": 1.015551209449768, "learning_rate": 4.2063492063492065e-05, "loss": 0.1374, "step": 510 }, { "epoch": 12.380952380952381, "grad_norm": 1.5744130611419678, "learning_rate": 4.17989417989418e-05, "loss": 0.1219, "step": 520 }, { "epoch": 12.619047619047619, "grad_norm": 2.7706940174102783, "learning_rate": 4.153439153439154e-05, "loss": 0.1159, "step": 530 }, { "epoch": 12.857142857142858, "grad_norm": 0.9542893767356873, "learning_rate": 4.126984126984127e-05, "loss": 0.1404, "step": 540 }, { "epoch": 13.0, "eval_accuracy": 0.9396984924623115, "eval_loss": 0.2028820812702179, "eval_runtime": 106.2276, "eval_samples_per_second": 5.62, "eval_steps_per_second": 0.179, "step": 546 }, { "epoch": 13.095238095238095, "grad_norm": 1.9553595781326294, "learning_rate": 4.100529100529101e-05, "loss": 0.1128, "step": 550 }, { "epoch": 13.333333333333334, "grad_norm": 0.4872543513774872, "learning_rate": 4.074074074074074e-05, "loss": 0.1133, "step": 560 }, { "epoch": 13.571428571428571, "grad_norm": 1.1375516653060913, "learning_rate": 4.047619047619048e-05, "loss": 0.1195, "step": 570 }, { "epoch": 13.80952380952381, "grad_norm": 2.1001851558685303, "learning_rate": 4.021164021164021e-05, "loss": 0.1078, "step": 580 }, { "epoch": 14.0, "eval_accuracy": 0.9413735343383585, "eval_loss": 0.20870448648929596, "eval_runtime": 103.1036, "eval_samples_per_second": 5.79, "eval_steps_per_second": 0.184, "step": 588 }, { "epoch": 14.047619047619047, "grad_norm": 1.273701786994934, "learning_rate": 3.9947089947089946e-05, "loss": 0.1078, "step": 590 }, { "epoch": 14.285714285714286, "grad_norm": 2.2141387462615967, "learning_rate": 3.968253968253968e-05, "loss": 0.1005, "step": 600 }, { "epoch": 14.523809523809524, "grad_norm": 2.147643566131592, "learning_rate": 3.941798941798942e-05, "loss": 0.1195, "step": 610 }, { "epoch": 14.761904761904763, "grad_norm": 1.8408890962600708, "learning_rate": 3.9153439153439155e-05, "loss": 0.0981, "step": 620 }, { "epoch": 15.0, "grad_norm": 2.7218682765960693, "learning_rate": 3.888888888888889e-05, "loss": 0.1109, "step": 630 }, { "epoch": 15.0, "eval_accuracy": 0.9614740368509213, "eval_loss": 0.1381397545337677, "eval_runtime": 102.743, "eval_samples_per_second": 5.811, "eval_steps_per_second": 0.185, "step": 630 }, { "epoch": 15.238095238095237, "grad_norm": 2.087874412536621, "learning_rate": 3.862433862433863e-05, "loss": 0.0932, "step": 640 }, { "epoch": 15.476190476190476, "grad_norm": 1.6582282781600952, "learning_rate": 3.835978835978836e-05, "loss": 0.12, "step": 650 }, { "epoch": 15.714285714285714, "grad_norm": 1.8671879768371582, "learning_rate": 3.809523809523809e-05, "loss": 0.0981, "step": 660 }, { "epoch": 15.952380952380953, "grad_norm": 1.7882401943206787, "learning_rate": 3.7830687830687835e-05, "loss": 0.1072, "step": 670 }, { "epoch": 16.0, "eval_accuracy": 0.9413735343383585, "eval_loss": 0.18952256441116333, "eval_runtime": 104.7109, "eval_samples_per_second": 5.701, "eval_steps_per_second": 0.181, "step": 672 }, { "epoch": 16.19047619047619, "grad_norm": 3.6218953132629395, "learning_rate": 3.7566137566137564e-05, "loss": 0.0818, "step": 680 }, { "epoch": 16.428571428571427, "grad_norm": 1.1185909509658813, "learning_rate": 3.730158730158731e-05, "loss": 0.0951, "step": 690 }, { "epoch": 16.666666666666668, "grad_norm": 0.9545117616653442, "learning_rate": 3.7037037037037037e-05, "loss": 0.1064, "step": 700 }, { "epoch": 16.904761904761905, "grad_norm": 0.7912140488624573, "learning_rate": 3.677248677248677e-05, "loss": 0.0949, "step": 710 }, { "epoch": 17.0, "eval_accuracy": 0.9396984924623115, "eval_loss": 0.19812369346618652, "eval_runtime": 103.2395, "eval_samples_per_second": 5.783, "eval_steps_per_second": 0.184, "step": 714 }, { "epoch": 17.142857142857142, "grad_norm": 2.5102455615997314, "learning_rate": 3.650793650793651e-05, "loss": 0.0942, "step": 720 }, { "epoch": 17.38095238095238, "grad_norm": 0.907739520072937, "learning_rate": 3.6243386243386245e-05, "loss": 0.081, "step": 730 }, { "epoch": 17.61904761904762, "grad_norm": 0.9580628275871277, "learning_rate": 3.597883597883598e-05, "loss": 0.0939, "step": 740 }, { "epoch": 17.857142857142858, "grad_norm": 0.4892265498638153, "learning_rate": 3.571428571428572e-05, "loss": 0.0908, "step": 750 }, { "epoch": 18.0, "eval_accuracy": 0.9581239530988275, "eval_loss": 0.16083765029907227, "eval_runtime": 103.0208, "eval_samples_per_second": 5.795, "eval_steps_per_second": 0.184, "step": 756 }, { "epoch": 18.095238095238095, "grad_norm": 0.6354324221611023, "learning_rate": 3.5449735449735446e-05, "loss": 0.0674, "step": 760 }, { "epoch": 18.333333333333332, "grad_norm": 1.7310459613800049, "learning_rate": 3.518518518518519e-05, "loss": 0.0963, "step": 770 }, { "epoch": 18.571428571428573, "grad_norm": 1.0339725017547607, "learning_rate": 3.492063492063492e-05, "loss": 0.0846, "step": 780 }, { "epoch": 18.80952380952381, "grad_norm": 2.478813648223877, "learning_rate": 3.465608465608466e-05, "loss": 0.0809, "step": 790 }, { "epoch": 19.0, "eval_accuracy": 0.9581239530988275, "eval_loss": 0.1764398217201233, "eval_runtime": 104.4696, "eval_samples_per_second": 5.715, "eval_steps_per_second": 0.182, "step": 798 }, { "epoch": 19.047619047619047, "grad_norm": 0.9190245270729065, "learning_rate": 3.439153439153439e-05, "loss": 0.0831, "step": 800 }, { "epoch": 19.285714285714285, "grad_norm": 1.1679134368896484, "learning_rate": 3.412698412698413e-05, "loss": 0.0737, "step": 810 }, { "epoch": 19.523809523809526, "grad_norm": 1.52895987033844, "learning_rate": 3.386243386243386e-05, "loss": 0.0695, "step": 820 }, { "epoch": 19.761904761904763, "grad_norm": 1.6142102479934692, "learning_rate": 3.35978835978836e-05, "loss": 0.0696, "step": 830 }, { "epoch": 20.0, "grad_norm": 2.2493298053741455, "learning_rate": 3.3333333333333335e-05, "loss": 0.0708, "step": 840 }, { "epoch": 20.0, "eval_accuracy": 0.9530988274706867, "eval_loss": 0.15123647451400757, "eval_runtime": 102.2975, "eval_samples_per_second": 5.836, "eval_steps_per_second": 0.186, "step": 840 }, { "epoch": 20.238095238095237, "grad_norm": 4.499787330627441, "learning_rate": 3.306878306878307e-05, "loss": 0.0936, "step": 850 }, { "epoch": 20.476190476190474, "grad_norm": 5.423216819763184, "learning_rate": 3.280423280423281e-05, "loss": 0.0712, "step": 860 }, { "epoch": 20.714285714285715, "grad_norm": 1.0831531286239624, "learning_rate": 3.253968253968254e-05, "loss": 0.0817, "step": 870 }, { "epoch": 20.952380952380953, "grad_norm": 1.6317639350891113, "learning_rate": 3.227513227513227e-05, "loss": 0.0757, "step": 880 }, { "epoch": 21.0, "eval_accuracy": 0.948073701842546, "eval_loss": 0.20271137356758118, "eval_runtime": 102.4721, "eval_samples_per_second": 5.826, "eval_steps_per_second": 0.185, "step": 882 }, { "epoch": 21.19047619047619, "grad_norm": 2.1182172298431396, "learning_rate": 3.2010582010582015e-05, "loss": 0.0882, "step": 890 }, { "epoch": 21.428571428571427, "grad_norm": 0.5835541486740112, "learning_rate": 3.1746031746031745e-05, "loss": 0.0607, "step": 900 }, { "epoch": 21.666666666666668, "grad_norm": 1.441300392150879, "learning_rate": 3.148148148148148e-05, "loss": 0.0859, "step": 910 }, { "epoch": 21.904761904761905, "grad_norm": 0.6337174773216248, "learning_rate": 3.121693121693122e-05, "loss": 0.0919, "step": 920 }, { "epoch": 22.0, "eval_accuracy": 0.9614740368509213, "eval_loss": 0.14867298305034637, "eval_runtime": 103.3409, "eval_samples_per_second": 5.777, "eval_steps_per_second": 0.184, "step": 924 }, { "epoch": 22.142857142857142, "grad_norm": 1.7783087491989136, "learning_rate": 3.095238095238095e-05, "loss": 0.0591, "step": 930 }, { "epoch": 22.38095238095238, "grad_norm": 0.4258907735347748, "learning_rate": 3.068783068783069e-05, "loss": 0.0722, "step": 940 }, { "epoch": 22.61904761904762, "grad_norm": 0.975234866142273, "learning_rate": 3.0423280423280425e-05, "loss": 0.0582, "step": 950 }, { "epoch": 22.857142857142858, "grad_norm": 0.9665831327438354, "learning_rate": 3.0158730158730158e-05, "loss": 0.07, "step": 960 }, { "epoch": 23.0, "eval_accuracy": 0.9614740368509213, "eval_loss": 0.16668196022510529, "eval_runtime": 103.2831, "eval_samples_per_second": 5.78, "eval_steps_per_second": 0.184, "step": 966 }, { "epoch": 23.095238095238095, "grad_norm": 1.7929288148880005, "learning_rate": 2.9894179894179897e-05, "loss": 0.0582, "step": 970 }, { "epoch": 23.333333333333332, "grad_norm": 0.43466824293136597, "learning_rate": 2.962962962962963e-05, "loss": 0.061, "step": 980 }, { "epoch": 23.571428571428573, "grad_norm": 2.2438175678253174, "learning_rate": 2.9365079365079366e-05, "loss": 0.0644, "step": 990 }, { "epoch": 23.80952380952381, "grad_norm": 0.8747345805168152, "learning_rate": 2.91005291005291e-05, "loss": 0.0629, "step": 1000 }, { "epoch": 24.0, "eval_accuracy": 0.9530988274706867, "eval_loss": 0.19044645130634308, "eval_runtime": 103.4446, "eval_samples_per_second": 5.771, "eval_steps_per_second": 0.184, "step": 1008 }, { "epoch": 24.047619047619047, "grad_norm": 1.7004871368408203, "learning_rate": 2.8835978835978838e-05, "loss": 0.0496, "step": 1010 }, { "epoch": 24.285714285714285, "grad_norm": 1.3886641263961792, "learning_rate": 2.857142857142857e-05, "loss": 0.0468, "step": 1020 }, { "epoch": 24.523809523809526, "grad_norm": 2.0595200061798096, "learning_rate": 2.830687830687831e-05, "loss": 0.0657, "step": 1030 }, { "epoch": 24.761904761904763, "grad_norm": 0.9633322954177856, "learning_rate": 2.8042328042328043e-05, "loss": 0.0626, "step": 1040 }, { "epoch": 25.0, "grad_norm": 0.629084050655365, "learning_rate": 2.777777777777778e-05, "loss": 0.0584, "step": 1050 }, { "epoch": 25.0, "eval_accuracy": 0.9631490787269682, "eval_loss": 0.15212486684322357, "eval_runtime": 104.2052, "eval_samples_per_second": 5.729, "eval_steps_per_second": 0.182, "step": 1050 }, { "epoch": 25.238095238095237, "grad_norm": 1.9510831832885742, "learning_rate": 2.7513227513227512e-05, "loss": 0.0514, "step": 1060 }, { "epoch": 25.476190476190474, "grad_norm": 0.6461337208747864, "learning_rate": 2.724867724867725e-05, "loss": 0.0619, "step": 1070 }, { "epoch": 25.714285714285715, "grad_norm": 0.8791431784629822, "learning_rate": 2.6984126984126984e-05, "loss": 0.0626, "step": 1080 }, { "epoch": 25.952380952380953, "grad_norm": 1.83372163772583, "learning_rate": 2.6719576719576723e-05, "loss": 0.0666, "step": 1090 }, { "epoch": 26.0, "eval_accuracy": 0.966499162479062, "eval_loss": 0.1326070874929428, "eval_runtime": 103.6677, "eval_samples_per_second": 5.759, "eval_steps_per_second": 0.183, "step": 1092 }, { "epoch": 26.19047619047619, "grad_norm": 3.1547927856445312, "learning_rate": 2.6455026455026456e-05, "loss": 0.0607, "step": 1100 }, { "epoch": 26.428571428571427, "grad_norm": 0.8336120247840881, "learning_rate": 2.6190476190476192e-05, "loss": 0.0458, "step": 1110 }, { "epoch": 26.666666666666668, "grad_norm": 0.5386803150177002, "learning_rate": 2.5925925925925925e-05, "loss": 0.0638, "step": 1120 }, { "epoch": 26.904761904761905, "grad_norm": 0.8411057591438293, "learning_rate": 2.5661375661375664e-05, "loss": 0.062, "step": 1130 }, { "epoch": 27.0, "eval_accuracy": 0.9564489112227805, "eval_loss": 0.17715045809745789, "eval_runtime": 102.5942, "eval_samples_per_second": 5.819, "eval_steps_per_second": 0.185, "step": 1134 }, { "epoch": 27.142857142857142, "grad_norm": 1.1352622509002686, "learning_rate": 2.5396825396825397e-05, "loss": 0.0396, "step": 1140 }, { "epoch": 27.38095238095238, "grad_norm": 1.9047770500183105, "learning_rate": 2.5132275132275137e-05, "loss": 0.0383, "step": 1150 }, { "epoch": 27.61904761904762, "grad_norm": 2.154599666595459, "learning_rate": 2.4867724867724866e-05, "loss": 0.0728, "step": 1160 }, { "epoch": 27.857142857142858, "grad_norm": 1.5056850910186768, "learning_rate": 2.4603174603174602e-05, "loss": 0.0568, "step": 1170 }, { "epoch": 28.0, "eval_accuracy": 0.9564489112227805, "eval_loss": 0.14654366672039032, "eval_runtime": 103.1379, "eval_samples_per_second": 5.788, "eval_steps_per_second": 0.184, "step": 1176 }, { "epoch": 28.095238095238095, "grad_norm": 1.634865641593933, "learning_rate": 2.4338624338624338e-05, "loss": 0.0663, "step": 1180 }, { "epoch": 28.333333333333332, "grad_norm": 1.265386939048767, "learning_rate": 2.4074074074074074e-05, "loss": 0.0487, "step": 1190 }, { "epoch": 28.571428571428573, "grad_norm": 0.6159355044364929, "learning_rate": 2.380952380952381e-05, "loss": 0.0596, "step": 1200 }, { "epoch": 28.80952380952381, "grad_norm": 1.0339206457138062, "learning_rate": 2.3544973544973546e-05, "loss": 0.0453, "step": 1210 }, { "epoch": 29.0, "eval_accuracy": 0.9681742043551089, "eval_loss": 0.13472113013267517, "eval_runtime": 103.1249, "eval_samples_per_second": 5.789, "eval_steps_per_second": 0.184, "step": 1218 }, { "epoch": 29.047619047619047, "grad_norm": 0.8177947998046875, "learning_rate": 2.328042328042328e-05, "loss": 0.055, "step": 1220 }, { "epoch": 29.285714285714285, "grad_norm": 1.7629382610321045, "learning_rate": 2.3015873015873015e-05, "loss": 0.0476, "step": 1230 }, { "epoch": 29.523809523809526, "grad_norm": 1.8531335592269897, "learning_rate": 2.275132275132275e-05, "loss": 0.0431, "step": 1240 }, { "epoch": 29.761904761904763, "grad_norm": 0.961283802986145, "learning_rate": 2.2486772486772487e-05, "loss": 0.0579, "step": 1250 }, { "epoch": 30.0, "grad_norm": 0.18748821318149567, "learning_rate": 2.2222222222222223e-05, "loss": 0.0469, "step": 1260 }, { "epoch": 30.0, "eval_accuracy": 0.9631490787269682, "eval_loss": 0.16871798038482666, "eval_runtime": 105.6517, "eval_samples_per_second": 5.651, "eval_steps_per_second": 0.18, "step": 1260 }, { "epoch": 30.238095238095237, "grad_norm": 0.8756251931190491, "learning_rate": 2.1957671957671956e-05, "loss": 0.0536, "step": 1270 }, { "epoch": 30.476190476190474, "grad_norm": 0.7314756512641907, "learning_rate": 2.1693121693121692e-05, "loss": 0.0394, "step": 1280 }, { "epoch": 30.714285714285715, "grad_norm": 1.9777828454971313, "learning_rate": 2.1428571428571428e-05, "loss": 0.0346, "step": 1290 }, { "epoch": 30.952380952380953, "grad_norm": 1.4753316640853882, "learning_rate": 2.1164021164021164e-05, "loss": 0.0541, "step": 1300 }, { "epoch": 31.0, "eval_accuracy": 0.9715242881072027, "eval_loss": 0.13902144134044647, "eval_runtime": 104.4849, "eval_samples_per_second": 5.714, "eval_steps_per_second": 0.182, "step": 1302 }, { "epoch": 31.19047619047619, "grad_norm": 1.706081748008728, "learning_rate": 2.08994708994709e-05, "loss": 0.0613, "step": 1310 }, { "epoch": 31.428571428571427, "grad_norm": 0.12419818341732025, "learning_rate": 2.0634920634920636e-05, "loss": 0.075, "step": 1320 }, { "epoch": 31.666666666666668, "grad_norm": 0.7071540951728821, "learning_rate": 2.037037037037037e-05, "loss": 0.0468, "step": 1330 }, { "epoch": 31.904761904761905, "grad_norm": 0.10454891622066498, "learning_rate": 2.0105820105820105e-05, "loss": 0.0602, "step": 1340 }, { "epoch": 32.0, "eval_accuracy": 0.9614740368509213, "eval_loss": 0.16181902587413788, "eval_runtime": 102.8803, "eval_samples_per_second": 5.803, "eval_steps_per_second": 0.185, "step": 1344 }, { "epoch": 32.142857142857146, "grad_norm": 0.773094654083252, "learning_rate": 1.984126984126984e-05, "loss": 0.0555, "step": 1350 }, { "epoch": 32.38095238095238, "grad_norm": 1.865349292755127, "learning_rate": 1.9576719576719577e-05, "loss": 0.0518, "step": 1360 }, { "epoch": 32.61904761904762, "grad_norm": 0.4416976571083069, "learning_rate": 1.9312169312169313e-05, "loss": 0.049, "step": 1370 }, { "epoch": 32.857142857142854, "grad_norm": 2.0832931995391846, "learning_rate": 1.9047619047619046e-05, "loss": 0.0497, "step": 1380 }, { "epoch": 33.0, "eval_accuracy": 0.9614740368509213, "eval_loss": 0.1414780616760254, "eval_runtime": 103.4955, "eval_samples_per_second": 5.768, "eval_steps_per_second": 0.184, "step": 1386 }, { "epoch": 33.095238095238095, "grad_norm": 0.9570621848106384, "learning_rate": 1.8783068783068782e-05, "loss": 0.0532, "step": 1390 }, { "epoch": 33.333333333333336, "grad_norm": 1.0575621128082275, "learning_rate": 1.8518518518518518e-05, "loss": 0.0555, "step": 1400 }, { "epoch": 33.57142857142857, "grad_norm": 0.6880443096160889, "learning_rate": 1.8253968253968254e-05, "loss": 0.0454, "step": 1410 }, { "epoch": 33.80952380952381, "grad_norm": 1.0119032859802246, "learning_rate": 1.798941798941799e-05, "loss": 0.0493, "step": 1420 }, { "epoch": 34.0, "eval_accuracy": 0.9631490787269682, "eval_loss": 0.1520875245332718, "eval_runtime": 104.0599, "eval_samples_per_second": 5.737, "eval_steps_per_second": 0.183, "step": 1428 }, { "epoch": 34.04761904761905, "grad_norm": 1.220492959022522, "learning_rate": 1.7724867724867723e-05, "loss": 0.0361, "step": 1430 }, { "epoch": 34.285714285714285, "grad_norm": 0.5547053217887878, "learning_rate": 1.746031746031746e-05, "loss": 0.0412, "step": 1440 }, { "epoch": 34.523809523809526, "grad_norm": 0.7015855312347412, "learning_rate": 1.7195767195767195e-05, "loss": 0.0425, "step": 1450 }, { "epoch": 34.76190476190476, "grad_norm": 1.388316035270691, "learning_rate": 1.693121693121693e-05, "loss": 0.0342, "step": 1460 }, { "epoch": 35.0, "grad_norm": 1.6578996181488037, "learning_rate": 1.6666666666666667e-05, "loss": 0.0606, "step": 1470 }, { "epoch": 35.0, "eval_accuracy": 0.9698492462311558, "eval_loss": 0.14287406206130981, "eval_runtime": 103.938, "eval_samples_per_second": 5.744, "eval_steps_per_second": 0.183, "step": 1470 }, { "epoch": 35.23809523809524, "grad_norm": 1.2321009635925293, "learning_rate": 1.6402116402116404e-05, "loss": 0.036, "step": 1480 }, { "epoch": 35.476190476190474, "grad_norm": 1.9735788106918335, "learning_rate": 1.6137566137566136e-05, "loss": 0.0485, "step": 1490 }, { "epoch": 35.714285714285715, "grad_norm": 0.7221766114234924, "learning_rate": 1.5873015873015872e-05, "loss": 0.0518, "step": 1500 }, { "epoch": 35.95238095238095, "grad_norm": 1.484800100326538, "learning_rate": 1.560846560846561e-05, "loss": 0.0332, "step": 1510 }, { "epoch": 36.0, "eval_accuracy": 0.964824120603015, "eval_loss": 0.16714587807655334, "eval_runtime": 103.1161, "eval_samples_per_second": 5.79, "eval_steps_per_second": 0.184, "step": 1512 }, { "epoch": 36.19047619047619, "grad_norm": 2.1778924465179443, "learning_rate": 1.5343915343915344e-05, "loss": 0.058, "step": 1520 }, { "epoch": 36.42857142857143, "grad_norm": 0.6789590120315552, "learning_rate": 1.5079365079365079e-05, "loss": 0.0341, "step": 1530 }, { "epoch": 36.666666666666664, "grad_norm": 1.6879972219467163, "learning_rate": 1.4814814814814815e-05, "loss": 0.0395, "step": 1540 }, { "epoch": 36.904761904761905, "grad_norm": 0.9731617569923401, "learning_rate": 1.455026455026455e-05, "loss": 0.0432, "step": 1550 }, { "epoch": 37.0, "eval_accuracy": 0.966499162479062, "eval_loss": 0.14411257207393646, "eval_runtime": 103.3621, "eval_samples_per_second": 5.776, "eval_steps_per_second": 0.184, "step": 1554 }, { "epoch": 37.142857142857146, "grad_norm": 1.3034260272979736, "learning_rate": 1.4285714285714285e-05, "loss": 0.0376, "step": 1560 }, { "epoch": 37.38095238095238, "grad_norm": 0.8573827743530273, "learning_rate": 1.4021164021164022e-05, "loss": 0.0475, "step": 1570 }, { "epoch": 37.61904761904762, "grad_norm": 0.7534766793251038, "learning_rate": 1.3756613756613756e-05, "loss": 0.0519, "step": 1580 }, { "epoch": 37.857142857142854, "grad_norm": 0.7222949266433716, "learning_rate": 1.3492063492063492e-05, "loss": 0.0354, "step": 1590 }, { "epoch": 38.0, "eval_accuracy": 0.9681742043551089, "eval_loss": 0.15929608047008514, "eval_runtime": 107.453, "eval_samples_per_second": 5.556, "eval_steps_per_second": 0.177, "step": 1596 }, { "epoch": 38.095238095238095, "grad_norm": 1.8834507465362549, "learning_rate": 1.3227513227513228e-05, "loss": 0.0338, "step": 1600 }, { "epoch": 38.333333333333336, "grad_norm": 0.6104734539985657, "learning_rate": 1.2962962962962962e-05, "loss": 0.0462, "step": 1610 }, { "epoch": 38.57142857142857, "grad_norm": 0.7713958024978638, "learning_rate": 1.2698412698412699e-05, "loss": 0.0279, "step": 1620 }, { "epoch": 38.80952380952381, "grad_norm": 0.2185550034046173, "learning_rate": 1.2433862433862433e-05, "loss": 0.0432, "step": 1630 }, { "epoch": 39.0, "eval_accuracy": 0.966499162479062, "eval_loss": 0.13952085375785828, "eval_runtime": 107.8325, "eval_samples_per_second": 5.536, "eval_steps_per_second": 0.176, "step": 1638 }, { "epoch": 39.04761904761905, "grad_norm": 1.2039453983306885, "learning_rate": 1.2169312169312169e-05, "loss": 0.0451, "step": 1640 }, { "epoch": 39.285714285714285, "grad_norm": 2.02799391746521, "learning_rate": 1.1904761904761905e-05, "loss": 0.0249, "step": 1650 }, { "epoch": 39.523809523809526, "grad_norm": 0.5916322469711304, "learning_rate": 1.164021164021164e-05, "loss": 0.0408, "step": 1660 }, { "epoch": 39.76190476190476, "grad_norm": 2.4064204692840576, "learning_rate": 1.1375661375661376e-05, "loss": 0.0439, "step": 1670 }, { "epoch": 40.0, "grad_norm": 0.05854567885398865, "learning_rate": 1.1111111111111112e-05, "loss": 0.0363, "step": 1680 }, { "epoch": 40.0, "eval_accuracy": 0.9731993299832495, "eval_loss": 0.10922118276357651, "eval_runtime": 107.5538, "eval_samples_per_second": 5.551, "eval_steps_per_second": 0.177, "step": 1680 }, { "epoch": 40.23809523809524, "grad_norm": 1.7765388488769531, "learning_rate": 1.0846560846560846e-05, "loss": 0.0337, "step": 1690 }, { "epoch": 40.476190476190474, "grad_norm": 1.7078924179077148, "learning_rate": 1.0582010582010582e-05, "loss": 0.0272, "step": 1700 }, { "epoch": 40.714285714285715, "grad_norm": 1.3066785335540771, "learning_rate": 1.0317460317460318e-05, "loss": 0.0408, "step": 1710 }, { "epoch": 40.95238095238095, "grad_norm": 0.261738121509552, "learning_rate": 1.0052910052910053e-05, "loss": 0.0288, "step": 1720 }, { "epoch": 41.0, "eval_accuracy": 0.966499162479062, "eval_loss": 0.15500447154045105, "eval_runtime": 107.8053, "eval_samples_per_second": 5.538, "eval_steps_per_second": 0.176, "step": 1722 }, { "epoch": 41.19047619047619, "grad_norm": 0.7850339412689209, "learning_rate": 9.788359788359789e-06, "loss": 0.031, "step": 1730 }, { "epoch": 41.42857142857143, "grad_norm": 1.2644939422607422, "learning_rate": 9.523809523809523e-06, "loss": 0.0435, "step": 1740 }, { "epoch": 41.666666666666664, "grad_norm": 1.832721471786499, "learning_rate": 9.259259259259259e-06, "loss": 0.0275, "step": 1750 }, { "epoch": 41.904761904761905, "grad_norm": 0.4885420501232147, "learning_rate": 8.994708994708995e-06, "loss": 0.0305, "step": 1760 }, { "epoch": 42.0, "eval_accuracy": 0.9681742043551089, "eval_loss": 0.14619530737400055, "eval_runtime": 107.0057, "eval_samples_per_second": 5.579, "eval_steps_per_second": 0.178, "step": 1764 }, { "epoch": 42.142857142857146, "grad_norm": 1.3336294889450073, "learning_rate": 8.73015873015873e-06, "loss": 0.0442, "step": 1770 }, { "epoch": 42.38095238095238, "grad_norm": 1.4869195222854614, "learning_rate": 8.465608465608466e-06, "loss": 0.0367, "step": 1780 }, { "epoch": 42.61904761904762, "grad_norm": 0.8099101781845093, "learning_rate": 8.201058201058202e-06, "loss": 0.0457, "step": 1790 }, { "epoch": 42.857142857142854, "grad_norm": 0.36065608263015747, "learning_rate": 7.936507936507936e-06, "loss": 0.0326, "step": 1800 }, { "epoch": 43.0, "eval_accuracy": 0.9681742043551089, "eval_loss": 0.1343374401330948, "eval_runtime": 107.1742, "eval_samples_per_second": 5.57, "eval_steps_per_second": 0.177, "step": 1806 }, { "epoch": 43.095238095238095, "grad_norm": 0.7654176354408264, "learning_rate": 7.671957671957672e-06, "loss": 0.0182, "step": 1810 }, { "epoch": 43.333333333333336, "grad_norm": 0.9363707900047302, "learning_rate": 7.4074074074074075e-06, "loss": 0.0364, "step": 1820 }, { "epoch": 43.57142857142857, "grad_norm": 1.6939359903335571, "learning_rate": 7.142857142857143e-06, "loss": 0.0274, "step": 1830 }, { "epoch": 43.80952380952381, "grad_norm": 0.285452663898468, "learning_rate": 6.878306878306878e-06, "loss": 0.027, "step": 1840 }, { "epoch": 44.0, "eval_accuracy": 0.9731993299832495, "eval_loss": 0.11093433946371078, "eval_runtime": 107.916, "eval_samples_per_second": 5.532, "eval_steps_per_second": 0.176, "step": 1848 }, { "epoch": 44.04761904761905, "grad_norm": 1.4939181804656982, "learning_rate": 6.613756613756614e-06, "loss": 0.035, "step": 1850 }, { "epoch": 44.285714285714285, "grad_norm": 0.328795850276947, "learning_rate": 6.349206349206349e-06, "loss": 0.0328, "step": 1860 }, { "epoch": 44.523809523809526, "grad_norm": 0.7609361410140991, "learning_rate": 6.0846560846560845e-06, "loss": 0.0383, "step": 1870 }, { "epoch": 44.76190476190476, "grad_norm": 0.4592248201370239, "learning_rate": 5.82010582010582e-06, "loss": 0.0248, "step": 1880 }, { "epoch": 45.0, "grad_norm": 0.208053857088089, "learning_rate": 5.555555555555556e-06, "loss": 0.0233, "step": 1890 }, { "epoch": 45.0, "eval_accuracy": 0.9731993299832495, "eval_loss": 0.1315459907054901, "eval_runtime": 105.7748, "eval_samples_per_second": 5.644, "eval_steps_per_second": 0.18, "step": 1890 }, { "epoch": 45.23809523809524, "grad_norm": 0.4980098605155945, "learning_rate": 5.291005291005291e-06, "loss": 0.0209, "step": 1900 }, { "epoch": 45.476190476190474, "grad_norm": 1.4569426774978638, "learning_rate": 5.026455026455026e-06, "loss": 0.0249, "step": 1910 }, { "epoch": 45.714285714285715, "grad_norm": 0.21140669286251068, "learning_rate": 4.7619047619047615e-06, "loss": 0.0274, "step": 1920 }, { "epoch": 45.95238095238095, "grad_norm": 0.9561221599578857, "learning_rate": 4.497354497354498e-06, "loss": 0.042, "step": 1930 }, { "epoch": 46.0, "eval_accuracy": 0.9731993299832495, "eval_loss": 0.1261177957057953, "eval_runtime": 106.0727, "eval_samples_per_second": 5.628, "eval_steps_per_second": 0.179, "step": 1932 }, { "epoch": 46.19047619047619, "grad_norm": 1.0604710578918457, "learning_rate": 4.232804232804233e-06, "loss": 0.022, "step": 1940 }, { "epoch": 46.42857142857143, "grad_norm": 2.230454683303833, "learning_rate": 3.968253968253968e-06, "loss": 0.0351, "step": 1950 }, { "epoch": 46.666666666666664, "grad_norm": 0.6946862936019897, "learning_rate": 3.7037037037037037e-06, "loss": 0.0301, "step": 1960 }, { "epoch": 46.904761904761905, "grad_norm": 0.760875403881073, "learning_rate": 3.439153439153439e-06, "loss": 0.0251, "step": 1970 }, { "epoch": 47.0, "eval_accuracy": 0.9731993299832495, "eval_loss": 0.13198845088481903, "eval_runtime": 107.949, "eval_samples_per_second": 5.53, "eval_steps_per_second": 0.176, "step": 1974 }, { "epoch": 47.142857142857146, "grad_norm": 0.19793380796909332, "learning_rate": 3.1746031746031746e-06, "loss": 0.0194, "step": 1980 }, { "epoch": 47.38095238095238, "grad_norm": 1.171229362487793, "learning_rate": 2.91005291005291e-06, "loss": 0.0207, "step": 1990 }, { "epoch": 47.61904761904762, "grad_norm": 1.4025137424468994, "learning_rate": 2.6455026455026455e-06, "loss": 0.0264, "step": 2000 }, { "epoch": 47.857142857142854, "grad_norm": 0.5469716191291809, "learning_rate": 2.3809523809523808e-06, "loss": 0.041, "step": 2010 }, { "epoch": 48.0, "eval_accuracy": 0.9731993299832495, "eval_loss": 0.12820355594158173, "eval_runtime": 107.9105, "eval_samples_per_second": 5.532, "eval_steps_per_second": 0.176, "step": 2016 }, { "epoch": 48.095238095238095, "grad_norm": 0.5962417125701904, "learning_rate": 2.1164021164021164e-06, "loss": 0.0257, "step": 2020 }, { "epoch": 48.333333333333336, "grad_norm": 1.9686793088912964, "learning_rate": 1.8518518518518519e-06, "loss": 0.0297, "step": 2030 }, { "epoch": 48.57142857142857, "grad_norm": 1.7944121360778809, "learning_rate": 1.5873015873015873e-06, "loss": 0.0272, "step": 2040 }, { "epoch": 48.80952380952381, "grad_norm": 0.4869058430194855, "learning_rate": 1.3227513227513228e-06, "loss": 0.0445, "step": 2050 }, { "epoch": 49.0, "eval_accuracy": 0.9731993299832495, "eval_loss": 0.12962684035301208, "eval_runtime": 107.6607, "eval_samples_per_second": 5.545, "eval_steps_per_second": 0.176, "step": 2058 }, { "epoch": 49.04761904761905, "grad_norm": 0.599247932434082, "learning_rate": 1.0582010582010582e-06, "loss": 0.0289, "step": 2060 }, { "epoch": 49.285714285714285, "grad_norm": 1.3819619417190552, "learning_rate": 7.936507936507937e-07, "loss": 0.0236, "step": 2070 }, { "epoch": 49.523809523809526, "grad_norm": 0.9784670472145081, "learning_rate": 5.291005291005291e-07, "loss": 0.0306, "step": 2080 }, { "epoch": 49.76190476190476, "grad_norm": 0.8563596606254578, "learning_rate": 2.6455026455026455e-07, "loss": 0.0212, "step": 2090 }, { "epoch": 50.0, "grad_norm": 1.10076904296875, "learning_rate": 0.0, "loss": 0.0308, "step": 2100 }, { "epoch": 50.0, "eval_accuracy": 0.9731993299832495, "eval_loss": 0.13253676891326904, "eval_runtime": 110.4625, "eval_samples_per_second": 5.405, "eval_steps_per_second": 0.172, "step": 2100 }, { "epoch": 50.0, "step": 2100, "total_flos": 2.0803097508518707e+19, "train_loss": 0.1354110169055916, "train_runtime": 52600.7464, "train_samples_per_second": 5.104, "train_steps_per_second": 0.04 } ], "logging_steps": 10, "max_steps": 2100, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0803097508518707e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }