{ "best_metric": 0.9271936241481539, "best_model_checkpoint": "vit-msn-small-wbc-classifier-100/checkpoint-208", "epoch": 100.0, "eval_steps": 500, "global_step": 20800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04807692307692308, "grad_norm": 4.29103946685791, "learning_rate": 2.403846153846154e-07, "loss": 0.2741, "step": 10 }, { "epoch": 0.09615384615384616, "grad_norm": 5.064257621765137, "learning_rate": 4.807692307692308e-07, "loss": 0.2818, "step": 20 }, { "epoch": 0.14423076923076922, "grad_norm": 4.503714561462402, "learning_rate": 7.211538461538462e-07, "loss": 0.2512, "step": 30 }, { "epoch": 0.19230769230769232, "grad_norm": 3.301870822906494, "learning_rate": 9.615384615384617e-07, "loss": 0.2723, "step": 40 }, { "epoch": 0.2403846153846154, "grad_norm": 3.582566976547241, "learning_rate": 1.201923076923077e-06, "loss": 0.2524, "step": 50 }, { "epoch": 0.28846153846153844, "grad_norm": 4.525320529937744, "learning_rate": 1.4423076923076924e-06, "loss": 0.2561, "step": 60 }, { "epoch": 0.33653846153846156, "grad_norm": 3.1389365196228027, "learning_rate": 1.6826923076923077e-06, "loss": 0.2627, "step": 70 }, { "epoch": 0.38461538461538464, "grad_norm": 3.0843207836151123, "learning_rate": 1.9230769230769234e-06, "loss": 0.2298, "step": 80 }, { "epoch": 0.4326923076923077, "grad_norm": 3.9689910411834717, "learning_rate": 2.1634615384615387e-06, "loss": 0.2328, "step": 90 }, { "epoch": 0.4807692307692308, "grad_norm": 3.815237283706665, "learning_rate": 2.403846153846154e-06, "loss": 0.2377, "step": 100 }, { "epoch": 0.5288461538461539, "grad_norm": 5.2160210609436035, "learning_rate": 2.644230769230769e-06, "loss": 0.2421, "step": 110 }, { "epoch": 0.5769230769230769, "grad_norm": 3.797703981399536, "learning_rate": 2.884615384615385e-06, "loss": 0.2467, "step": 120 }, { "epoch": 0.625, "grad_norm": 5.251151084899902, "learning_rate": 3.125e-06, "loss": 0.246, "step": 130 }, { "epoch": 0.6730769230769231, "grad_norm": 4.506635665893555, "learning_rate": 3.3653846153846154e-06, "loss": 0.2523, "step": 140 }, { "epoch": 0.7211538461538461, "grad_norm": 3.7222163677215576, "learning_rate": 3.6057692307692307e-06, "loss": 0.2302, "step": 150 }, { "epoch": 0.7692307692307693, "grad_norm": 3.2590866088867188, "learning_rate": 3.846153846153847e-06, "loss": 0.2382, "step": 160 }, { "epoch": 0.8173076923076923, "grad_norm": 3.414531946182251, "learning_rate": 4.086538461538462e-06, "loss": 0.2356, "step": 170 }, { "epoch": 0.8653846153846154, "grad_norm": 4.646815776824951, "learning_rate": 4.326923076923077e-06, "loss": 0.2813, "step": 180 }, { "epoch": 0.9134615384615384, "grad_norm": 3.8170089721679688, "learning_rate": 4.567307692307692e-06, "loss": 0.2352, "step": 190 }, { "epoch": 0.9615384615384616, "grad_norm": 2.8930060863494873, "learning_rate": 4.807692307692308e-06, "loss": 0.2356, "step": 200 }, { "epoch": 1.0, "eval_accuracy": 0.9271936241481539, "eval_loss": 0.2004517912864685, "eval_runtime": 80.9771, "eval_samples_per_second": 320.745, "eval_steps_per_second": 5.014, "step": 208 }, { "epoch": 1.0096153846153846, "grad_norm": 3.100111961364746, "learning_rate": 5.0480769230769235e-06, "loss": 0.223, "step": 210 }, { "epoch": 1.0576923076923077, "grad_norm": 3.7471377849578857, "learning_rate": 5.288461538461538e-06, "loss": 0.2456, "step": 220 }, { "epoch": 1.1057692307692308, "grad_norm": 4.501901626586914, "learning_rate": 5.528846153846154e-06, "loss": 0.2215, "step": 230 }, { "epoch": 1.1538461538461537, "grad_norm": 4.0346574783325195, "learning_rate": 5.76923076923077e-06, "loss": 0.2457, "step": 240 }, { "epoch": 1.2019230769230769, "grad_norm": 5.3190813064575195, "learning_rate": 6.0096153846153855e-06, "loss": 0.2611, "step": 250 }, { "epoch": 1.25, "grad_norm": 3.502983331680298, "learning_rate": 6.25e-06, "loss": 0.2468, "step": 260 }, { "epoch": 1.2980769230769231, "grad_norm": 4.400745868682861, "learning_rate": 6.490384615384616e-06, "loss": 0.2461, "step": 270 }, { "epoch": 1.3461538461538463, "grad_norm": 4.3637518882751465, "learning_rate": 6.730769230769231e-06, "loss": 0.2321, "step": 280 }, { "epoch": 1.3942307692307692, "grad_norm": 4.646700382232666, "learning_rate": 6.9711538461538465e-06, "loss": 0.2389, "step": 290 }, { "epoch": 1.4423076923076923, "grad_norm": 3.3019397258758545, "learning_rate": 7.211538461538461e-06, "loss": 0.2158, "step": 300 }, { "epoch": 1.4903846153846154, "grad_norm": 3.199413299560547, "learning_rate": 7.451923076923077e-06, "loss": 0.2263, "step": 310 }, { "epoch": 1.5384615384615383, "grad_norm": 3.9084699153900146, "learning_rate": 7.692307692307694e-06, "loss": 0.2489, "step": 320 }, { "epoch": 1.5865384615384617, "grad_norm": 5.1932268142700195, "learning_rate": 7.932692307692308e-06, "loss": 0.2314, "step": 330 }, { "epoch": 1.6346153846153846, "grad_norm": 4.395415306091309, "learning_rate": 8.173076923076923e-06, "loss": 0.2377, "step": 340 }, { "epoch": 1.6826923076923077, "grad_norm": 3.9761104583740234, "learning_rate": 8.41346153846154e-06, "loss": 0.228, "step": 350 }, { "epoch": 1.7307692307692308, "grad_norm": 5.794794082641602, "learning_rate": 8.653846153846155e-06, "loss": 0.2495, "step": 360 }, { "epoch": 1.7788461538461537, "grad_norm": 5.0175323486328125, "learning_rate": 8.89423076923077e-06, "loss": 0.2383, "step": 370 }, { "epoch": 1.8269230769230769, "grad_norm": 3.607489824295044, "learning_rate": 9.134615384615384e-06, "loss": 0.2288, "step": 380 }, { "epoch": 1.875, "grad_norm": 4.5138092041015625, "learning_rate": 9.375000000000001e-06, "loss": 0.2244, "step": 390 }, { "epoch": 1.9230769230769231, "grad_norm": 5.24376106262207, "learning_rate": 9.615384615384616e-06, "loss": 0.2296, "step": 400 }, { "epoch": 1.9711538461538463, "grad_norm": 4.655590057373047, "learning_rate": 9.85576923076923e-06, "loss": 0.2305, "step": 410 }, { "epoch": 2.0, "eval_accuracy": 0.9194933199861395, "eval_loss": 0.22587276995182037, "eval_runtime": 78.0461, "eval_samples_per_second": 332.791, "eval_steps_per_second": 5.202, "step": 416 }, { "epoch": 2.019230769230769, "grad_norm": 3.783843994140625, "learning_rate": 1.0096153846153847e-05, "loss": 0.2176, "step": 420 }, { "epoch": 2.0673076923076925, "grad_norm": 4.57997989654541, "learning_rate": 1.0336538461538462e-05, "loss": 0.2393, "step": 430 }, { "epoch": 2.1153846153846154, "grad_norm": 5.619403839111328, "learning_rate": 1.0576923076923077e-05, "loss": 0.2278, "step": 440 }, { "epoch": 2.1634615384615383, "grad_norm": 5.449454307556152, "learning_rate": 1.0817307692307693e-05, "loss": 0.2232, "step": 450 }, { "epoch": 2.2115384615384617, "grad_norm": 4.424691677093506, "learning_rate": 1.1057692307692308e-05, "loss": 0.2527, "step": 460 }, { "epoch": 2.2596153846153846, "grad_norm": 4.623299598693848, "learning_rate": 1.1298076923076923e-05, "loss": 0.2236, "step": 470 }, { "epoch": 2.3076923076923075, "grad_norm": 4.3317437171936035, "learning_rate": 1.153846153846154e-05, "loss": 0.2258, "step": 480 }, { "epoch": 2.355769230769231, "grad_norm": 3.8849406242370605, "learning_rate": 1.1778846153846154e-05, "loss": 0.2308, "step": 490 }, { "epoch": 2.4038461538461537, "grad_norm": 3.7695813179016113, "learning_rate": 1.2019230769230771e-05, "loss": 0.2457, "step": 500 }, { "epoch": 2.451923076923077, "grad_norm": 5.957648754119873, "learning_rate": 1.2259615384615384e-05, "loss": 0.2099, "step": 510 }, { "epoch": 2.5, "grad_norm": 4.979465961456299, "learning_rate": 1.25e-05, "loss": 0.2355, "step": 520 }, { "epoch": 2.5480769230769234, "grad_norm": 5.07026481628418, "learning_rate": 1.2740384615384615e-05, "loss": 0.2615, "step": 530 }, { "epoch": 2.5961538461538463, "grad_norm": 4.378418922424316, "learning_rate": 1.2980769230769232e-05, "loss": 0.2364, "step": 540 }, { "epoch": 2.644230769230769, "grad_norm": 4.601984024047852, "learning_rate": 1.3221153846153847e-05, "loss": 0.2272, "step": 550 }, { "epoch": 2.6923076923076925, "grad_norm": 4.097207069396973, "learning_rate": 1.3461538461538462e-05, "loss": 0.2378, "step": 560 }, { "epoch": 2.7403846153846154, "grad_norm": 4.52841854095459, "learning_rate": 1.3701923076923078e-05, "loss": 0.2329, "step": 570 }, { "epoch": 2.7884615384615383, "grad_norm": 4.187917232513428, "learning_rate": 1.3942307692307693e-05, "loss": 0.2357, "step": 580 }, { "epoch": 2.8365384615384617, "grad_norm": 5.146624565124512, "learning_rate": 1.4182692307692308e-05, "loss": 0.233, "step": 590 }, { "epoch": 2.8846153846153846, "grad_norm": 3.8445827960968018, "learning_rate": 1.4423076923076923e-05, "loss": 0.2167, "step": 600 }, { "epoch": 2.9326923076923075, "grad_norm": 3.9132585525512695, "learning_rate": 1.466346153846154e-05, "loss": 0.2372, "step": 610 }, { "epoch": 2.980769230769231, "grad_norm": 4.0533905029296875, "learning_rate": 1.4903846153846154e-05, "loss": 0.246, "step": 620 }, { "epoch": 3.0, "eval_accuracy": 0.9209948792977323, "eval_loss": 0.20970715582370758, "eval_runtime": 78.245, "eval_samples_per_second": 331.944, "eval_steps_per_second": 5.189, "step": 624 }, { "epoch": 3.0288461538461537, "grad_norm": 3.8862717151641846, "learning_rate": 1.5144230769230769e-05, "loss": 0.2201, "step": 630 }, { "epoch": 3.076923076923077, "grad_norm": 5.575253963470459, "learning_rate": 1.5384615384615387e-05, "loss": 0.2526, "step": 640 }, { "epoch": 3.125, "grad_norm": 4.2997660636901855, "learning_rate": 1.5625e-05, "loss": 0.2367, "step": 650 }, { "epoch": 3.173076923076923, "grad_norm": 5.853203773498535, "learning_rate": 1.5865384615384617e-05, "loss": 0.2172, "step": 660 }, { "epoch": 3.2211538461538463, "grad_norm": 4.454317092895508, "learning_rate": 1.6105769230769233e-05, "loss": 0.2428, "step": 670 }, { "epoch": 3.269230769230769, "grad_norm": 4.648312568664551, "learning_rate": 1.6346153846153847e-05, "loss": 0.2249, "step": 680 }, { "epoch": 3.3173076923076925, "grad_norm": 5.323423385620117, "learning_rate": 1.6586538461538463e-05, "loss": 0.2238, "step": 690 }, { "epoch": 3.3653846153846154, "grad_norm": 4.521032810211182, "learning_rate": 1.682692307692308e-05, "loss": 0.2405, "step": 700 }, { "epoch": 3.4134615384615383, "grad_norm": 4.855774402618408, "learning_rate": 1.7067307692307693e-05, "loss": 0.2217, "step": 710 }, { "epoch": 3.4615384615384617, "grad_norm": 4.4206366539001465, "learning_rate": 1.730769230769231e-05, "loss": 0.2218, "step": 720 }, { "epoch": 3.5096153846153846, "grad_norm": 4.768404483795166, "learning_rate": 1.7548076923076922e-05, "loss": 0.2246, "step": 730 }, { "epoch": 3.5576923076923075, "grad_norm": 4.879373550415039, "learning_rate": 1.778846153846154e-05, "loss": 0.2603, "step": 740 }, { "epoch": 3.605769230769231, "grad_norm": 3.8240325450897217, "learning_rate": 1.8028846153846156e-05, "loss": 0.2329, "step": 750 }, { "epoch": 3.6538461538461537, "grad_norm": 5.28994083404541, "learning_rate": 1.826923076923077e-05, "loss": 0.2326, "step": 760 }, { "epoch": 3.7019230769230766, "grad_norm": 5.803524971008301, "learning_rate": 1.8509615384615385e-05, "loss": 0.2197, "step": 770 }, { "epoch": 3.75, "grad_norm": 5.854836940765381, "learning_rate": 1.8750000000000002e-05, "loss": 0.2476, "step": 780 }, { "epoch": 3.7980769230769234, "grad_norm": 6.034967422485352, "learning_rate": 1.8990384615384615e-05, "loss": 0.2293, "step": 790 }, { "epoch": 3.8461538461538463, "grad_norm": 6.684122562408447, "learning_rate": 1.923076923076923e-05, "loss": 0.2558, "step": 800 }, { "epoch": 3.894230769230769, "grad_norm": 5.798404216766357, "learning_rate": 1.9471153846153848e-05, "loss": 0.2484, "step": 810 }, { "epoch": 3.9423076923076925, "grad_norm": 4.843292236328125, "learning_rate": 1.971153846153846e-05, "loss": 0.2625, "step": 820 }, { "epoch": 3.9903846153846154, "grad_norm": 4.687588691711426, "learning_rate": 1.9951923076923078e-05, "loss": 0.2585, "step": 830 }, { "epoch": 4.0, "eval_accuracy": 0.9180302621953567, "eval_loss": 0.21835412085056305, "eval_runtime": 77.92, "eval_samples_per_second": 333.329, "eval_steps_per_second": 5.21, "step": 832 }, { "epoch": 4.038461538461538, "grad_norm": 4.906230926513672, "learning_rate": 2.0192307692307694e-05, "loss": 0.238, "step": 840 }, { "epoch": 4.086538461538462, "grad_norm": 3.579962730407715, "learning_rate": 2.0432692307692307e-05, "loss": 0.2595, "step": 850 }, { "epoch": 4.134615384615385, "grad_norm": 5.881065845489502, "learning_rate": 2.0673076923076924e-05, "loss": 0.2397, "step": 860 }, { "epoch": 4.1826923076923075, "grad_norm": 5.481284141540527, "learning_rate": 2.091346153846154e-05, "loss": 0.2613, "step": 870 }, { "epoch": 4.230769230769231, "grad_norm": 4.215633869171143, "learning_rate": 2.1153846153846154e-05, "loss": 0.2551, "step": 880 }, { "epoch": 4.278846153846154, "grad_norm": 6.041876792907715, "learning_rate": 2.139423076923077e-05, "loss": 0.2441, "step": 890 }, { "epoch": 4.326923076923077, "grad_norm": 5.073339939117432, "learning_rate": 2.1634615384615387e-05, "loss": 0.2743, "step": 900 }, { "epoch": 4.375, "grad_norm": 4.3463454246521, "learning_rate": 2.1875e-05, "loss": 0.2442, "step": 910 }, { "epoch": 4.423076923076923, "grad_norm": 4.773877143859863, "learning_rate": 2.2115384615384616e-05, "loss": 0.258, "step": 920 }, { "epoch": 4.471153846153846, "grad_norm": 4.989078998565674, "learning_rate": 2.2355769230769233e-05, "loss": 0.2678, "step": 930 }, { "epoch": 4.519230769230769, "grad_norm": 5.039970874786377, "learning_rate": 2.2596153846153846e-05, "loss": 0.2546, "step": 940 }, { "epoch": 4.5673076923076925, "grad_norm": 4.789408206939697, "learning_rate": 2.2836538461538463e-05, "loss": 0.2621, "step": 950 }, { "epoch": 4.615384615384615, "grad_norm": 5.630142688751221, "learning_rate": 2.307692307692308e-05, "loss": 0.2494, "step": 960 }, { "epoch": 4.663461538461538, "grad_norm": 5.471822261810303, "learning_rate": 2.3317307692307692e-05, "loss": 0.2675, "step": 970 }, { "epoch": 4.711538461538462, "grad_norm": 4.036755561828613, "learning_rate": 2.355769230769231e-05, "loss": 0.2504, "step": 980 }, { "epoch": 4.759615384615385, "grad_norm": 5.181748867034912, "learning_rate": 2.3798076923076922e-05, "loss": 0.275, "step": 990 }, { "epoch": 4.8076923076923075, "grad_norm": 4.462552547454834, "learning_rate": 2.4038461538461542e-05, "loss": 0.2869, "step": 1000 }, { "epoch": 4.855769230769231, "grad_norm": 4.369950294494629, "learning_rate": 2.4278846153846155e-05, "loss": 0.2722, "step": 1010 }, { "epoch": 4.903846153846154, "grad_norm": 3.9299652576446533, "learning_rate": 2.4519230769230768e-05, "loss": 0.2634, "step": 1020 }, { "epoch": 4.951923076923077, "grad_norm": 3.8897883892059326, "learning_rate": 2.4759615384615388e-05, "loss": 0.2811, "step": 1030 }, { "epoch": 5.0, "grad_norm": 4.734266757965088, "learning_rate": 2.5e-05, "loss": 0.2593, "step": 1040 }, { "epoch": 5.0, "eval_accuracy": 0.917144727216725, "eval_loss": 0.23313818871974945, "eval_runtime": 77.6854, "eval_samples_per_second": 334.336, "eval_steps_per_second": 5.226, "step": 1040 }, { "epoch": 5.048076923076923, "grad_norm": 5.219655513763428, "learning_rate": 2.5240384615384614e-05, "loss": 0.2557, "step": 1050 }, { "epoch": 5.096153846153846, "grad_norm": 5.333519458770752, "learning_rate": 2.548076923076923e-05, "loss": 0.276, "step": 1060 }, { "epoch": 5.144230769230769, "grad_norm": 4.282153606414795, "learning_rate": 2.5721153846153844e-05, "loss": 0.2676, "step": 1070 }, { "epoch": 5.1923076923076925, "grad_norm": 3.8132925033569336, "learning_rate": 2.5961538461538464e-05, "loss": 0.2823, "step": 1080 }, { "epoch": 5.240384615384615, "grad_norm": 4.480808258056641, "learning_rate": 2.620192307692308e-05, "loss": 0.2788, "step": 1090 }, { "epoch": 5.288461538461538, "grad_norm": 7.331939220428467, "learning_rate": 2.6442307692307694e-05, "loss": 0.2739, "step": 1100 }, { "epoch": 5.336538461538462, "grad_norm": 4.584115028381348, "learning_rate": 2.668269230769231e-05, "loss": 0.2951, "step": 1110 }, { "epoch": 5.384615384615385, "grad_norm": 5.8024797439575195, "learning_rate": 2.6923076923076923e-05, "loss": 0.2498, "step": 1120 }, { "epoch": 5.4326923076923075, "grad_norm": 4.045364856719971, "learning_rate": 2.7163461538461536e-05, "loss": 0.2668, "step": 1130 }, { "epoch": 5.480769230769231, "grad_norm": 5.548737525939941, "learning_rate": 2.7403846153846156e-05, "loss": 0.275, "step": 1140 }, { "epoch": 5.528846153846154, "grad_norm": 4.42161226272583, "learning_rate": 2.7644230769230773e-05, "loss": 0.287, "step": 1150 }, { "epoch": 5.576923076923077, "grad_norm": 7.245948314666748, "learning_rate": 2.7884615384615386e-05, "loss": 0.2784, "step": 1160 }, { "epoch": 5.625, "grad_norm": 4.1363420486450195, "learning_rate": 2.8125000000000003e-05, "loss": 0.2602, "step": 1170 }, { "epoch": 5.673076923076923, "grad_norm": 3.9322397708892822, "learning_rate": 2.8365384615384616e-05, "loss": 0.2495, "step": 1180 }, { "epoch": 5.721153846153846, "grad_norm": 5.294655799865723, "learning_rate": 2.860576923076923e-05, "loss": 0.2811, "step": 1190 }, { "epoch": 5.769230769230769, "grad_norm": 3.9423635005950928, "learning_rate": 2.8846153846153845e-05, "loss": 0.2907, "step": 1200 }, { "epoch": 5.8173076923076925, "grad_norm": 5.503436088562012, "learning_rate": 2.9086538461538465e-05, "loss": 0.2704, "step": 1210 }, { "epoch": 5.865384615384615, "grad_norm": 4.416607856750488, "learning_rate": 2.932692307692308e-05, "loss": 0.2682, "step": 1220 }, { "epoch": 5.913461538461538, "grad_norm": 5.369599342346191, "learning_rate": 2.9567307692307695e-05, "loss": 0.2889, "step": 1230 }, { "epoch": 5.961538461538462, "grad_norm": 5.921248912811279, "learning_rate": 2.9807692307692308e-05, "loss": 0.2483, "step": 1240 }, { "epoch": 6.0, "eval_accuracy": 0.91980133215262, "eval_loss": 0.21702727675437927, "eval_runtime": 77.9093, "eval_samples_per_second": 333.375, "eval_steps_per_second": 5.211, "step": 1248 }, { "epoch": 6.009615384615385, "grad_norm": 3.976677656173706, "learning_rate": 3.0048076923076925e-05, "loss": 0.2711, "step": 1250 }, { "epoch": 6.0576923076923075, "grad_norm": 6.093148231506348, "learning_rate": 3.0288461538461538e-05, "loss": 0.2752, "step": 1260 }, { "epoch": 6.105769230769231, "grad_norm": 4.532403469085693, "learning_rate": 3.052884615384616e-05, "loss": 0.2811, "step": 1270 }, { "epoch": 6.153846153846154, "grad_norm": 5.155460357666016, "learning_rate": 3.0769230769230774e-05, "loss": 0.271, "step": 1280 }, { "epoch": 6.201923076923077, "grad_norm": 3.9699394702911377, "learning_rate": 3.1009615384615384e-05, "loss": 0.2507, "step": 1290 }, { "epoch": 6.25, "grad_norm": 4.8900146484375, "learning_rate": 3.125e-05, "loss": 0.2499, "step": 1300 }, { "epoch": 6.298076923076923, "grad_norm": 4.191086292266846, "learning_rate": 3.149038461538462e-05, "loss": 0.2568, "step": 1310 }, { "epoch": 6.346153846153846, "grad_norm": 6.065373420715332, "learning_rate": 3.1730769230769234e-05, "loss": 0.2564, "step": 1320 }, { "epoch": 6.394230769230769, "grad_norm": 3.3966197967529297, "learning_rate": 3.1971153846153843e-05, "loss": 0.2843, "step": 1330 }, { "epoch": 6.4423076923076925, "grad_norm": 10.084853172302246, "learning_rate": 3.221153846153847e-05, "loss": 0.2459, "step": 1340 }, { "epoch": 6.490384615384615, "grad_norm": 6.90236759185791, "learning_rate": 3.2451923076923077e-05, "loss": 0.2954, "step": 1350 }, { "epoch": 6.538461538461538, "grad_norm": 5.337307929992676, "learning_rate": 3.269230769230769e-05, "loss": 0.2863, "step": 1360 }, { "epoch": 6.586538461538462, "grad_norm": 4.353206634521484, "learning_rate": 3.293269230769231e-05, "loss": 0.2938, "step": 1370 }, { "epoch": 6.634615384615385, "grad_norm": 4.649317741394043, "learning_rate": 3.3173076923076926e-05, "loss": 0.2777, "step": 1380 }, { "epoch": 6.6826923076923075, "grad_norm": 5.86411714553833, "learning_rate": 3.3413461538461536e-05, "loss": 0.2733, "step": 1390 }, { "epoch": 6.730769230769231, "grad_norm": 3.992302894592285, "learning_rate": 3.365384615384616e-05, "loss": 0.2772, "step": 1400 }, { "epoch": 6.778846153846154, "grad_norm": 4.525715351104736, "learning_rate": 3.3894230769230776e-05, "loss": 0.2595, "step": 1410 }, { "epoch": 6.826923076923077, "grad_norm": 4.447147369384766, "learning_rate": 3.4134615384615386e-05, "loss": 0.2878, "step": 1420 }, { "epoch": 6.875, "grad_norm": 5.148702144622803, "learning_rate": 3.4375e-05, "loss": 0.2813, "step": 1430 }, { "epoch": 6.923076923076923, "grad_norm": 4.983438491821289, "learning_rate": 3.461538461538462e-05, "loss": 0.2708, "step": 1440 }, { "epoch": 6.971153846153846, "grad_norm": 7.712164402008057, "learning_rate": 3.485576923076923e-05, "loss": 0.268, "step": 1450 }, { "epoch": 7.0, "eval_accuracy": 0.9181457667577869, "eval_loss": 0.222809299826622, "eval_runtime": 78.0158, "eval_samples_per_second": 332.92, "eval_steps_per_second": 5.204, "step": 1456 }, { "epoch": 7.019230769230769, "grad_norm": 6.001709461212158, "learning_rate": 3.5096153846153845e-05, "loss": 0.273, "step": 1460 }, { "epoch": 7.0673076923076925, "grad_norm": 4.538992404937744, "learning_rate": 3.533653846153847e-05, "loss": 0.2702, "step": 1470 }, { "epoch": 7.115384615384615, "grad_norm": 6.299995422363281, "learning_rate": 3.557692307692308e-05, "loss": 0.2834, "step": 1480 }, { "epoch": 7.163461538461538, "grad_norm": 8.894652366638184, "learning_rate": 3.5817307692307695e-05, "loss": 0.2825, "step": 1490 }, { "epoch": 7.211538461538462, "grad_norm": 6.099963665008545, "learning_rate": 3.605769230769231e-05, "loss": 0.2572, "step": 1500 }, { "epoch": 7.259615384615385, "grad_norm": 4.3440022468566895, "learning_rate": 3.629807692307692e-05, "loss": 0.274, "step": 1510 }, { "epoch": 7.3076923076923075, "grad_norm": 4.327615737915039, "learning_rate": 3.653846153846154e-05, "loss": 0.2835, "step": 1520 }, { "epoch": 7.355769230769231, "grad_norm": 5.570151329040527, "learning_rate": 3.677884615384616e-05, "loss": 0.2767, "step": 1530 }, { "epoch": 7.403846153846154, "grad_norm": 5.796697616577148, "learning_rate": 3.701923076923077e-05, "loss": 0.2763, "step": 1540 }, { "epoch": 7.451923076923077, "grad_norm": 4.1582794189453125, "learning_rate": 3.725961538461539e-05, "loss": 0.2816, "step": 1550 }, { "epoch": 7.5, "grad_norm": 4.4326348304748535, "learning_rate": 3.7500000000000003e-05, "loss": 0.2552, "step": 1560 }, { "epoch": 7.548076923076923, "grad_norm": 4.937860012054443, "learning_rate": 3.774038461538461e-05, "loss": 0.2473, "step": 1570 }, { "epoch": 7.596153846153846, "grad_norm": 4.28021764755249, "learning_rate": 3.798076923076923e-05, "loss": 0.2654, "step": 1580 }, { "epoch": 7.644230769230769, "grad_norm": 4.267920970916748, "learning_rate": 3.8221153846153846e-05, "loss": 0.2463, "step": 1590 }, { "epoch": 7.6923076923076925, "grad_norm": 4.122708797454834, "learning_rate": 3.846153846153846e-05, "loss": 0.2821, "step": 1600 }, { "epoch": 7.740384615384615, "grad_norm": 4.7097859382629395, "learning_rate": 3.870192307692308e-05, "loss": 0.2758, "step": 1610 }, { "epoch": 7.788461538461538, "grad_norm": 4.672336101531982, "learning_rate": 3.8942307692307696e-05, "loss": 0.2894, "step": 1620 }, { "epoch": 7.836538461538462, "grad_norm": 5.280921459197998, "learning_rate": 3.918269230769231e-05, "loss": 0.2699, "step": 1630 }, { "epoch": 7.884615384615385, "grad_norm": 3.452415943145752, "learning_rate": 3.942307692307692e-05, "loss": 0.2715, "step": 1640 }, { "epoch": 7.9326923076923075, "grad_norm": 4.116134166717529, "learning_rate": 3.966346153846154e-05, "loss": 0.2665, "step": 1650 }, { "epoch": 7.980769230769231, "grad_norm": 5.053679466247559, "learning_rate": 3.9903846153846155e-05, "loss": 0.3112, "step": 1660 }, { "epoch": 8.0, "eval_accuracy": 0.917144727216725, "eval_loss": 0.23608891665935516, "eval_runtime": 77.8231, "eval_samples_per_second": 333.744, "eval_steps_per_second": 5.217, "step": 1664 }, { "epoch": 8.028846153846153, "grad_norm": 5.0257134437561035, "learning_rate": 4.014423076923077e-05, "loss": 0.2763, "step": 1670 }, { "epoch": 8.076923076923077, "grad_norm": 4.245357513427734, "learning_rate": 4.038461538461539e-05, "loss": 0.2639, "step": 1680 }, { "epoch": 8.125, "grad_norm": 3.736905813217163, "learning_rate": 4.0625000000000005e-05, "loss": 0.2784, "step": 1690 }, { "epoch": 8.173076923076923, "grad_norm": 4.1451921463012695, "learning_rate": 4.0865384615384615e-05, "loss": 0.2803, "step": 1700 }, { "epoch": 8.221153846153847, "grad_norm": 4.860665798187256, "learning_rate": 4.110576923076923e-05, "loss": 0.2848, "step": 1710 }, { "epoch": 8.26923076923077, "grad_norm": 3.6125144958496094, "learning_rate": 4.134615384615385e-05, "loss": 0.2485, "step": 1720 }, { "epoch": 8.317307692307692, "grad_norm": 4.480985164642334, "learning_rate": 4.1586538461538464e-05, "loss": 0.2821, "step": 1730 }, { "epoch": 8.365384615384615, "grad_norm": 5.564511299133301, "learning_rate": 4.182692307692308e-05, "loss": 0.2761, "step": 1740 }, { "epoch": 8.413461538461538, "grad_norm": 4.201336860656738, "learning_rate": 4.20673076923077e-05, "loss": 0.2804, "step": 1750 }, { "epoch": 8.461538461538462, "grad_norm": 5.1193718910217285, "learning_rate": 4.230769230769231e-05, "loss": 0.2722, "step": 1760 }, { "epoch": 8.509615384615385, "grad_norm": 3.6050448417663574, "learning_rate": 4.2548076923076924e-05, "loss": 0.2524, "step": 1770 }, { "epoch": 8.557692307692308, "grad_norm": 4.053289413452148, "learning_rate": 4.278846153846154e-05, "loss": 0.2784, "step": 1780 }, { "epoch": 8.60576923076923, "grad_norm": 4.6787519454956055, "learning_rate": 4.302884615384616e-05, "loss": 0.2695, "step": 1790 }, { "epoch": 8.653846153846153, "grad_norm": 6.316561222076416, "learning_rate": 4.326923076923077e-05, "loss": 0.2711, "step": 1800 }, { "epoch": 8.701923076923077, "grad_norm": 4.557034492492676, "learning_rate": 4.350961538461539e-05, "loss": 0.2707, "step": 1810 }, { "epoch": 8.75, "grad_norm": 4.557834625244141, "learning_rate": 4.375e-05, "loss": 0.2733, "step": 1820 }, { "epoch": 8.798076923076923, "grad_norm": 5.813235759735107, "learning_rate": 4.3990384615384616e-05, "loss": 0.2762, "step": 1830 }, { "epoch": 8.846153846153847, "grad_norm": 3.4934988021850586, "learning_rate": 4.423076923076923e-05, "loss": 0.2941, "step": 1840 }, { "epoch": 8.89423076923077, "grad_norm": 3.623195171356201, "learning_rate": 4.447115384615384e-05, "loss": 0.2964, "step": 1850 }, { "epoch": 8.942307692307692, "grad_norm": 4.589170932769775, "learning_rate": 4.4711538461538466e-05, "loss": 0.2767, "step": 1860 }, { "epoch": 8.990384615384615, "grad_norm": 5.017085075378418, "learning_rate": 4.495192307692308e-05, "loss": 0.2679, "step": 1870 }, { "epoch": 9.0, "eval_accuracy": 0.9184537789242675, "eval_loss": 0.22731688618659973, "eval_runtime": 77.7007, "eval_samples_per_second": 334.27, "eval_steps_per_second": 5.225, "step": 1872 }, { "epoch": 9.038461538461538, "grad_norm": 6.059936046600342, "learning_rate": 4.519230769230769e-05, "loss": 0.2526, "step": 1880 }, { "epoch": 9.086538461538462, "grad_norm": 6.131364345550537, "learning_rate": 4.543269230769231e-05, "loss": 0.2653, "step": 1890 }, { "epoch": 9.134615384615385, "grad_norm": 3.699256420135498, "learning_rate": 4.5673076923076925e-05, "loss": 0.2675, "step": 1900 }, { "epoch": 9.182692307692308, "grad_norm": 4.112985610961914, "learning_rate": 4.591346153846154e-05, "loss": 0.2767, "step": 1910 }, { "epoch": 9.23076923076923, "grad_norm": 4.60912561416626, "learning_rate": 4.615384615384616e-05, "loss": 0.2731, "step": 1920 }, { "epoch": 9.278846153846153, "grad_norm": 5.9688801765441895, "learning_rate": 4.6394230769230775e-05, "loss": 0.2742, "step": 1930 }, { "epoch": 9.326923076923077, "grad_norm": 3.451012372970581, "learning_rate": 4.6634615384615384e-05, "loss": 0.2666, "step": 1940 }, { "epoch": 9.375, "grad_norm": 4.740671157836914, "learning_rate": 4.6875e-05, "loss": 0.2741, "step": 1950 }, { "epoch": 9.423076923076923, "grad_norm": 4.5006208419799805, "learning_rate": 4.711538461538462e-05, "loss": 0.2528, "step": 1960 }, { "epoch": 9.471153846153847, "grad_norm": 6.515170097351074, "learning_rate": 4.7355769230769234e-05, "loss": 0.2735, "step": 1970 }, { "epoch": 9.51923076923077, "grad_norm": 5.413421630859375, "learning_rate": 4.7596153846153844e-05, "loss": 0.2603, "step": 1980 }, { "epoch": 9.567307692307692, "grad_norm": 5.114051818847656, "learning_rate": 4.783653846153847e-05, "loss": 0.2739, "step": 1990 }, { "epoch": 9.615384615384615, "grad_norm": 3.541574239730835, "learning_rate": 4.8076923076923084e-05, "loss": 0.2652, "step": 2000 }, { "epoch": 9.663461538461538, "grad_norm": 4.178200721740723, "learning_rate": 4.8317307692307693e-05, "loss": 0.2882, "step": 2010 }, { "epoch": 9.711538461538462, "grad_norm": 4.062794208526611, "learning_rate": 4.855769230769231e-05, "loss": 0.269, "step": 2020 }, { "epoch": 9.759615384615385, "grad_norm": 4.985105514526367, "learning_rate": 4.8798076923076926e-05, "loss": 0.2578, "step": 2030 }, { "epoch": 9.807692307692308, "grad_norm": 6.200753688812256, "learning_rate": 4.9038461538461536e-05, "loss": 0.275, "step": 2040 }, { "epoch": 9.85576923076923, "grad_norm": 4.100587844848633, "learning_rate": 4.927884615384616e-05, "loss": 0.2793, "step": 2050 }, { "epoch": 9.903846153846153, "grad_norm": 3.268134832382202, "learning_rate": 4.9519230769230776e-05, "loss": 0.2781, "step": 2060 }, { "epoch": 9.951923076923077, "grad_norm": 3.9231317043304443, "learning_rate": 4.9759615384615386e-05, "loss": 0.2739, "step": 2070 }, { "epoch": 10.0, "grad_norm": 5.452599048614502, "learning_rate": 5e-05, "loss": 0.3099, "step": 2080 }, { "epoch": 10.0, "eval_accuracy": 0.9144111192392099, "eval_loss": 0.23027944564819336, "eval_runtime": 77.4593, "eval_samples_per_second": 335.311, "eval_steps_per_second": 5.241, "step": 2080 }, { "epoch": 10.048076923076923, "grad_norm": 3.9833130836486816, "learning_rate": 4.99732905982906e-05, "loss": 0.2634, "step": 2090 }, { "epoch": 10.096153846153847, "grad_norm": 6.280396461486816, "learning_rate": 4.99465811965812e-05, "loss": 0.2497, "step": 2100 }, { "epoch": 10.14423076923077, "grad_norm": 7.405674457550049, "learning_rate": 4.9919871794871795e-05, "loss": 0.2956, "step": 2110 }, { "epoch": 10.192307692307692, "grad_norm": 3.8241496086120605, "learning_rate": 4.98931623931624e-05, "loss": 0.2766, "step": 2120 }, { "epoch": 10.240384615384615, "grad_norm": 5.66024112701416, "learning_rate": 4.9866452991452996e-05, "loss": 0.2935, "step": 2130 }, { "epoch": 10.288461538461538, "grad_norm": 2.9617269039154053, "learning_rate": 4.9839743589743594e-05, "loss": 0.2608, "step": 2140 }, { "epoch": 10.336538461538462, "grad_norm": 7.182979583740234, "learning_rate": 4.981303418803419e-05, "loss": 0.2696, "step": 2150 }, { "epoch": 10.384615384615385, "grad_norm": 3.454568862915039, "learning_rate": 4.978632478632479e-05, "loss": 0.2266, "step": 2160 }, { "epoch": 10.432692307692308, "grad_norm": 3.4203550815582275, "learning_rate": 4.9759615384615386e-05, "loss": 0.2746, "step": 2170 }, { "epoch": 10.48076923076923, "grad_norm": 5.04231595993042, "learning_rate": 4.973290598290598e-05, "loss": 0.2823, "step": 2180 }, { "epoch": 10.528846153846153, "grad_norm": 3.6935813426971436, "learning_rate": 4.970619658119659e-05, "loss": 0.2558, "step": 2190 }, { "epoch": 10.576923076923077, "grad_norm": 3.9472596645355225, "learning_rate": 4.9679487179487185e-05, "loss": 0.2565, "step": 2200 }, { "epoch": 10.625, "grad_norm": 3.6713578701019287, "learning_rate": 4.965277777777778e-05, "loss": 0.2584, "step": 2210 }, { "epoch": 10.673076923076923, "grad_norm": 4.998437404632568, "learning_rate": 4.962606837606838e-05, "loss": 0.282, "step": 2220 }, { "epoch": 10.721153846153847, "grad_norm": 5.487468242645264, "learning_rate": 4.959935897435898e-05, "loss": 0.2455, "step": 2230 }, { "epoch": 10.76923076923077, "grad_norm": 2.9530045986175537, "learning_rate": 4.9572649572649575e-05, "loss": 0.2871, "step": 2240 }, { "epoch": 10.817307692307692, "grad_norm": 4.572380065917969, "learning_rate": 4.954594017094017e-05, "loss": 0.2596, "step": 2250 }, { "epoch": 10.865384615384615, "grad_norm": 4.948703289031982, "learning_rate": 4.9519230769230776e-05, "loss": 0.2879, "step": 2260 }, { "epoch": 10.913461538461538, "grad_norm": 3.7530524730682373, "learning_rate": 4.9492521367521374e-05, "loss": 0.2729, "step": 2270 }, { "epoch": 10.961538461538462, "grad_norm": 4.797047138214111, "learning_rate": 4.946581196581197e-05, "loss": 0.2749, "step": 2280 }, { "epoch": 11.0, "eval_accuracy": 0.9124860431987063, "eval_loss": 0.26575374603271484, "eval_runtime": 77.7131, "eval_samples_per_second": 334.216, "eval_steps_per_second": 5.224, "step": 2288 }, { "epoch": 11.009615384615385, "grad_norm": 4.663016319274902, "learning_rate": 4.943910256410257e-05, "loss": 0.286, "step": 2290 }, { "epoch": 11.057692307692308, "grad_norm": 3.6393942832946777, "learning_rate": 4.9412393162393166e-05, "loss": 0.2784, "step": 2300 }, { "epoch": 11.10576923076923, "grad_norm": 4.042668342590332, "learning_rate": 4.938568376068376e-05, "loss": 0.2774, "step": 2310 }, { "epoch": 11.153846153846153, "grad_norm": 3.476438283920288, "learning_rate": 4.935897435897436e-05, "loss": 0.2609, "step": 2320 }, { "epoch": 11.201923076923077, "grad_norm": 5.5616350173950195, "learning_rate": 4.9332264957264965e-05, "loss": 0.2385, "step": 2330 }, { "epoch": 11.25, "grad_norm": 3.2799580097198486, "learning_rate": 4.930555555555556e-05, "loss": 0.2615, "step": 2340 }, { "epoch": 11.298076923076923, "grad_norm": 4.4760637283325195, "learning_rate": 4.927884615384616e-05, "loss": 0.2594, "step": 2350 }, { "epoch": 11.346153846153847, "grad_norm": 5.351235866546631, "learning_rate": 4.925213675213676e-05, "loss": 0.2636, "step": 2360 }, { "epoch": 11.39423076923077, "grad_norm": 4.085772514343262, "learning_rate": 4.9225427350427354e-05, "loss": 0.2443, "step": 2370 }, { "epoch": 11.442307692307692, "grad_norm": 4.496190547943115, "learning_rate": 4.9198717948717945e-05, "loss": 0.2576, "step": 2380 }, { "epoch": 11.490384615384615, "grad_norm": 3.8483011722564697, "learning_rate": 4.917200854700855e-05, "loss": 0.2529, "step": 2390 }, { "epoch": 11.538461538461538, "grad_norm": 4.397938251495361, "learning_rate": 4.9145299145299147e-05, "loss": 0.2607, "step": 2400 }, { "epoch": 11.586538461538462, "grad_norm": 4.076712608337402, "learning_rate": 4.9118589743589744e-05, "loss": 0.2742, "step": 2410 }, { "epoch": 11.634615384615385, "grad_norm": 4.988025188446045, "learning_rate": 4.909188034188034e-05, "loss": 0.2796, "step": 2420 }, { "epoch": 11.682692307692308, "grad_norm": 3.701575517654419, "learning_rate": 4.906517094017094e-05, "loss": 0.2482, "step": 2430 }, { "epoch": 11.73076923076923, "grad_norm": 5.806597709655762, "learning_rate": 4.9038461538461536e-05, "loss": 0.29, "step": 2440 }, { "epoch": 11.778846153846153, "grad_norm": 5.08134651184082, "learning_rate": 4.901175213675214e-05, "loss": 0.2708, "step": 2450 }, { "epoch": 11.826923076923077, "grad_norm": 3.941290855407715, "learning_rate": 4.898504273504274e-05, "loss": 0.2615, "step": 2460 }, { "epoch": 11.875, "grad_norm": 4.545041084289551, "learning_rate": 4.8958333333333335e-05, "loss": 0.2963, "step": 2470 }, { "epoch": 11.923076923076923, "grad_norm": 3.398264169692993, "learning_rate": 4.893162393162393e-05, "loss": 0.2838, "step": 2480 }, { "epoch": 11.971153846153847, "grad_norm": 3.760629177093506, "learning_rate": 4.890491452991453e-05, "loss": 0.2475, "step": 2490 }, { "epoch": 12.0, "eval_accuracy": 0.9179147576329265, "eval_loss": 0.22473447024822235, "eval_runtime": 77.8629, "eval_samples_per_second": 333.574, "eval_steps_per_second": 5.214, "step": 2496 }, { "epoch": 12.01923076923077, "grad_norm": 4.164438724517822, "learning_rate": 4.887820512820513e-05, "loss": 0.2762, "step": 2500 }, { "epoch": 12.067307692307692, "grad_norm": 4.374632835388184, "learning_rate": 4.8851495726495725e-05, "loss": 0.2361, "step": 2510 }, { "epoch": 12.115384615384615, "grad_norm": 3.692383050918579, "learning_rate": 4.882478632478633e-05, "loss": 0.2299, "step": 2520 }, { "epoch": 12.163461538461538, "grad_norm": 4.734486103057861, "learning_rate": 4.8798076923076926e-05, "loss": 0.2461, "step": 2530 }, { "epoch": 12.211538461538462, "grad_norm": 3.861116409301758, "learning_rate": 4.8771367521367524e-05, "loss": 0.2511, "step": 2540 }, { "epoch": 12.259615384615385, "grad_norm": 6.658755302429199, "learning_rate": 4.874465811965812e-05, "loss": 0.2584, "step": 2550 }, { "epoch": 12.307692307692308, "grad_norm": 4.118496894836426, "learning_rate": 4.871794871794872e-05, "loss": 0.263, "step": 2560 }, { "epoch": 12.35576923076923, "grad_norm": 5.517817497253418, "learning_rate": 4.8691239316239316e-05, "loss": 0.2428, "step": 2570 }, { "epoch": 12.403846153846153, "grad_norm": 4.727729797363281, "learning_rate": 4.8664529914529914e-05, "loss": 0.2486, "step": 2580 }, { "epoch": 12.451923076923077, "grad_norm": 5.5840959548950195, "learning_rate": 4.863782051282052e-05, "loss": 0.2641, "step": 2590 }, { "epoch": 12.5, "grad_norm": 4.7720465660095215, "learning_rate": 4.8611111111111115e-05, "loss": 0.2683, "step": 2600 }, { "epoch": 12.548076923076923, "grad_norm": 3.709080219268799, "learning_rate": 4.858440170940171e-05, "loss": 0.2566, "step": 2610 }, { "epoch": 12.596153846153847, "grad_norm": 3.6514525413513184, "learning_rate": 4.855769230769231e-05, "loss": 0.274, "step": 2620 }, { "epoch": 12.64423076923077, "grad_norm": 5.246626853942871, "learning_rate": 4.853098290598291e-05, "loss": 0.2574, "step": 2630 }, { "epoch": 12.692307692307692, "grad_norm": 4.3117756843566895, "learning_rate": 4.8504273504273505e-05, "loss": 0.2588, "step": 2640 }, { "epoch": 12.740384615384615, "grad_norm": 5.134099006652832, "learning_rate": 4.84775641025641e-05, "loss": 0.2426, "step": 2650 }, { "epoch": 12.788461538461538, "grad_norm": 5.046911716461182, "learning_rate": 4.8450854700854706e-05, "loss": 0.2706, "step": 2660 }, { "epoch": 12.836538461538462, "grad_norm": 3.087756872177124, "learning_rate": 4.8424145299145304e-05, "loss": 0.2727, "step": 2670 }, { "epoch": 12.884615384615385, "grad_norm": 3.111970901489258, "learning_rate": 4.83974358974359e-05, "loss": 0.2532, "step": 2680 }, { "epoch": 12.932692307692308, "grad_norm": 7.671596527099609, "learning_rate": 4.83707264957265e-05, "loss": 0.2446, "step": 2690 }, { "epoch": 12.98076923076923, "grad_norm": 3.8762941360473633, "learning_rate": 4.8344017094017096e-05, "loss": 0.2338, "step": 2700 }, { "epoch": 13.0, "eval_accuracy": 0.9139491009894891, "eval_loss": 0.23333381116390228, "eval_runtime": 77.7175, "eval_samples_per_second": 334.198, "eval_steps_per_second": 5.224, "step": 2704 }, { "epoch": 13.028846153846153, "grad_norm": 4.7313551902771, "learning_rate": 4.8317307692307693e-05, "loss": 0.2715, "step": 2710 }, { "epoch": 13.076923076923077, "grad_norm": 4.133815288543701, "learning_rate": 4.829059829059829e-05, "loss": 0.229, "step": 2720 }, { "epoch": 13.125, "grad_norm": 3.6891396045684814, "learning_rate": 4.8263888888888895e-05, "loss": 0.2488, "step": 2730 }, { "epoch": 13.173076923076923, "grad_norm": 3.3708419799804688, "learning_rate": 4.823717948717949e-05, "loss": 0.2539, "step": 2740 }, { "epoch": 13.221153846153847, "grad_norm": 4.9210591316223145, "learning_rate": 4.821047008547009e-05, "loss": 0.2453, "step": 2750 }, { "epoch": 13.26923076923077, "grad_norm": 3.7781081199645996, "learning_rate": 4.818376068376069e-05, "loss": 0.2423, "step": 2760 }, { "epoch": 13.317307692307692, "grad_norm": 5.449267387390137, "learning_rate": 4.8157051282051285e-05, "loss": 0.232, "step": 2770 }, { "epoch": 13.365384615384615, "grad_norm": 6.0684404373168945, "learning_rate": 4.813034188034188e-05, "loss": 0.2487, "step": 2780 }, { "epoch": 13.413461538461538, "grad_norm": 3.816370964050293, "learning_rate": 4.810363247863248e-05, "loss": 0.2454, "step": 2790 }, { "epoch": 13.461538461538462, "grad_norm": 4.11817741394043, "learning_rate": 4.8076923076923084e-05, "loss": 0.252, "step": 2800 }, { "epoch": 13.509615384615385, "grad_norm": 3.9054250717163086, "learning_rate": 4.805021367521368e-05, "loss": 0.2441, "step": 2810 }, { "epoch": 13.557692307692308, "grad_norm": 3.1405208110809326, "learning_rate": 4.802350427350428e-05, "loss": 0.2407, "step": 2820 }, { "epoch": 13.60576923076923, "grad_norm": 3.9626333713531494, "learning_rate": 4.7996794871794876e-05, "loss": 0.2379, "step": 2830 }, { "epoch": 13.653846153846153, "grad_norm": 3.8241114616394043, "learning_rate": 4.797008547008547e-05, "loss": 0.2561, "step": 2840 }, { "epoch": 13.701923076923077, "grad_norm": 4.236550331115723, "learning_rate": 4.794337606837607e-05, "loss": 0.2535, "step": 2850 }, { "epoch": 13.75, "grad_norm": 3.947105646133423, "learning_rate": 4.791666666666667e-05, "loss": 0.2691, "step": 2860 }, { "epoch": 13.798076923076923, "grad_norm": 4.561082363128662, "learning_rate": 4.788995726495727e-05, "loss": 0.2576, "step": 2870 }, { "epoch": 13.846153846153847, "grad_norm": 3.4533469676971436, "learning_rate": 4.786324786324787e-05, "loss": 0.2753, "step": 2880 }, { "epoch": 13.89423076923077, "grad_norm": 4.0948381423950195, "learning_rate": 4.783653846153847e-05, "loss": 0.2226, "step": 2890 }, { "epoch": 13.942307692307692, "grad_norm": 4.362265110015869, "learning_rate": 4.7809829059829065e-05, "loss": 0.2463, "step": 2900 }, { "epoch": 13.990384615384615, "grad_norm": 3.8559274673461914, "learning_rate": 4.778311965811966e-05, "loss": 0.2731, "step": 2910 }, { "epoch": 14.0, "eval_accuracy": 0.9152581526970315, "eval_loss": 0.229460209608078, "eval_runtime": 78.1479, "eval_samples_per_second": 332.357, "eval_steps_per_second": 5.195, "step": 2912 }, { "epoch": 14.038461538461538, "grad_norm": 4.947530746459961, "learning_rate": 4.775641025641026e-05, "loss": 0.2379, "step": 2920 }, { "epoch": 14.086538461538462, "grad_norm": 4.03806209564209, "learning_rate": 4.772970085470086e-05, "loss": 0.2225, "step": 2930 }, { "epoch": 14.134615384615385, "grad_norm": 5.728341579437256, "learning_rate": 4.770299145299146e-05, "loss": 0.243, "step": 2940 }, { "epoch": 14.182692307692308, "grad_norm": 4.597044944763184, "learning_rate": 4.767628205128206e-05, "loss": 0.2484, "step": 2950 }, { "epoch": 14.23076923076923, "grad_norm": 3.9122045040130615, "learning_rate": 4.764957264957265e-05, "loss": 0.2469, "step": 2960 }, { "epoch": 14.278846153846153, "grad_norm": 5.7149658203125, "learning_rate": 4.7622863247863246e-05, "loss": 0.2535, "step": 2970 }, { "epoch": 14.326923076923077, "grad_norm": 4.391417980194092, "learning_rate": 4.7596153846153844e-05, "loss": 0.2713, "step": 2980 }, { "epoch": 14.375, "grad_norm": 3.225754976272583, "learning_rate": 4.756944444444444e-05, "loss": 0.2456, "step": 2990 }, { "epoch": 14.423076923076923, "grad_norm": 2.866344451904297, "learning_rate": 4.7542735042735045e-05, "loss": 0.2536, "step": 3000 }, { "epoch": 14.471153846153847, "grad_norm": 4.8120951652526855, "learning_rate": 4.751602564102564e-05, "loss": 0.2288, "step": 3010 }, { "epoch": 14.51923076923077, "grad_norm": 3.8738813400268555, "learning_rate": 4.748931623931624e-05, "loss": 0.2603, "step": 3020 }, { "epoch": 14.567307692307692, "grad_norm": 4.724725723266602, "learning_rate": 4.746260683760684e-05, "loss": 0.2317, "step": 3030 }, { "epoch": 14.615384615384615, "grad_norm": 3.7020509243011475, "learning_rate": 4.7435897435897435e-05, "loss": 0.262, "step": 3040 }, { "epoch": 14.663461538461538, "grad_norm": 4.149657249450684, "learning_rate": 4.740918803418803e-05, "loss": 0.2468, "step": 3050 }, { "epoch": 14.711538461538462, "grad_norm": 4.489691257476807, "learning_rate": 4.738247863247863e-05, "loss": 0.235, "step": 3060 }, { "epoch": 14.759615384615385, "grad_norm": 3.8826403617858887, "learning_rate": 4.7355769230769234e-05, "loss": 0.223, "step": 3070 }, { "epoch": 14.807692307692308, "grad_norm": 4.002881050109863, "learning_rate": 4.732905982905983e-05, "loss": 0.2456, "step": 3080 }, { "epoch": 14.85576923076923, "grad_norm": 4.360832214355469, "learning_rate": 4.730235042735043e-05, "loss": 0.2534, "step": 3090 }, { "epoch": 14.903846153846153, "grad_norm": 3.4078216552734375, "learning_rate": 4.7275641025641026e-05, "loss": 0.2484, "step": 3100 }, { "epoch": 14.951923076923077, "grad_norm": 4.183500289916992, "learning_rate": 4.7248931623931624e-05, "loss": 0.2384, "step": 3110 }, { "epoch": 15.0, "grad_norm": 4.534628391265869, "learning_rate": 4.722222222222222e-05, "loss": 0.229, "step": 3120 }, { "epoch": 15.0, "eval_accuracy": 0.9138335964270589, "eval_loss": 0.2505186200141907, "eval_runtime": 78.0242, "eval_samples_per_second": 332.884, "eval_steps_per_second": 5.204, "step": 3120 }, { "epoch": 15.048076923076923, "grad_norm": 4.2897725105285645, "learning_rate": 4.719551282051282e-05, "loss": 0.211, "step": 3130 }, { "epoch": 15.096153846153847, "grad_norm": 4.403299808502197, "learning_rate": 4.716880341880342e-05, "loss": 0.2307, "step": 3140 }, { "epoch": 15.14423076923077, "grad_norm": 3.4769153594970703, "learning_rate": 4.714209401709402e-05, "loss": 0.257, "step": 3150 }, { "epoch": 15.192307692307692, "grad_norm": 4.1038498878479, "learning_rate": 4.711538461538462e-05, "loss": 0.2373, "step": 3160 }, { "epoch": 15.240384615384615, "grad_norm": 4.549970626831055, "learning_rate": 4.7088675213675215e-05, "loss": 0.2125, "step": 3170 }, { "epoch": 15.288461538461538, "grad_norm": 4.485842704772949, "learning_rate": 4.706196581196581e-05, "loss": 0.2399, "step": 3180 }, { "epoch": 15.336538461538462, "grad_norm": 3.8056819438934326, "learning_rate": 4.703525641025641e-05, "loss": 0.2365, "step": 3190 }, { "epoch": 15.384615384615385, "grad_norm": 4.008427143096924, "learning_rate": 4.700854700854701e-05, "loss": 0.235, "step": 3200 }, { "epoch": 15.432692307692308, "grad_norm": 3.669658899307251, "learning_rate": 4.698183760683761e-05, "loss": 0.2126, "step": 3210 }, { "epoch": 15.48076923076923, "grad_norm": 4.9357380867004395, "learning_rate": 4.695512820512821e-05, "loss": 0.246, "step": 3220 }, { "epoch": 15.528846153846153, "grad_norm": 4.826357841491699, "learning_rate": 4.6928418803418806e-05, "loss": 0.2093, "step": 3230 }, { "epoch": 15.576923076923077, "grad_norm": 5.386723518371582, "learning_rate": 4.6901709401709404e-05, "loss": 0.248, "step": 3240 }, { "epoch": 15.625, "grad_norm": 3.112846851348877, "learning_rate": 4.6875e-05, "loss": 0.2546, "step": 3250 }, { "epoch": 15.673076923076923, "grad_norm": 4.133208751678467, "learning_rate": 4.68482905982906e-05, "loss": 0.21, "step": 3260 }, { "epoch": 15.721153846153847, "grad_norm": 3.2168807983398438, "learning_rate": 4.6821581196581196e-05, "loss": 0.2326, "step": 3270 }, { "epoch": 15.76923076923077, "grad_norm": 4.412736415863037, "learning_rate": 4.67948717948718e-05, "loss": 0.2379, "step": 3280 }, { "epoch": 15.817307692307692, "grad_norm": 5.15736198425293, "learning_rate": 4.67681623931624e-05, "loss": 0.2524, "step": 3290 }, { "epoch": 15.865384615384615, "grad_norm": 4.065254211425781, "learning_rate": 4.6741452991452995e-05, "loss": 0.2431, "step": 3300 }, { "epoch": 15.913461538461538, "grad_norm": 4.249202728271484, "learning_rate": 4.671474358974359e-05, "loss": 0.2396, "step": 3310 }, { "epoch": 15.961538461538462, "grad_norm": 3.4853355884552, "learning_rate": 4.668803418803419e-05, "loss": 0.2462, "step": 3320 }, { "epoch": 16.0, "eval_accuracy": 0.9137180918646286, "eval_loss": 0.253385454416275, "eval_runtime": 77.9239, "eval_samples_per_second": 333.313, "eval_steps_per_second": 5.21, "step": 3328 }, { "epoch": 16.009615384615383, "grad_norm": 4.444523811340332, "learning_rate": 4.666132478632479e-05, "loss": 0.2211, "step": 3330 }, { "epoch": 16.057692307692307, "grad_norm": 5.856003284454346, "learning_rate": 4.6634615384615384e-05, "loss": 0.2328, "step": 3340 }, { "epoch": 16.10576923076923, "grad_norm": 6.36641788482666, "learning_rate": 4.660790598290599e-05, "loss": 0.2151, "step": 3350 }, { "epoch": 16.153846153846153, "grad_norm": 4.984757423400879, "learning_rate": 4.6581196581196586e-05, "loss": 0.222, "step": 3360 }, { "epoch": 16.201923076923077, "grad_norm": 6.398218631744385, "learning_rate": 4.6554487179487183e-05, "loss": 0.233, "step": 3370 }, { "epoch": 16.25, "grad_norm": 6.950900554656982, "learning_rate": 4.652777777777778e-05, "loss": 0.2547, "step": 3380 }, { "epoch": 16.298076923076923, "grad_norm": 5.884543418884277, "learning_rate": 4.650106837606838e-05, "loss": 0.2308, "step": 3390 }, { "epoch": 16.346153846153847, "grad_norm": 3.6590914726257324, "learning_rate": 4.6474358974358976e-05, "loss": 0.2443, "step": 3400 }, { "epoch": 16.39423076923077, "grad_norm": 4.361863136291504, "learning_rate": 4.644764957264957e-05, "loss": 0.2118, "step": 3410 }, { "epoch": 16.442307692307693, "grad_norm": 4.221235752105713, "learning_rate": 4.642094017094018e-05, "loss": 0.227, "step": 3420 }, { "epoch": 16.490384615384617, "grad_norm": 3.6799087524414062, "learning_rate": 4.6394230769230775e-05, "loss": 0.2259, "step": 3430 }, { "epoch": 16.53846153846154, "grad_norm": 4.38169527053833, "learning_rate": 4.636752136752137e-05, "loss": 0.2291, "step": 3440 }, { "epoch": 16.58653846153846, "grad_norm": 3.5283689498901367, "learning_rate": 4.634081196581197e-05, "loss": 0.2409, "step": 3450 }, { "epoch": 16.634615384615383, "grad_norm": 3.5082035064697266, "learning_rate": 4.631410256410257e-05, "loss": 0.2128, "step": 3460 }, { "epoch": 16.682692307692307, "grad_norm": 3.1252126693725586, "learning_rate": 4.6287393162393164e-05, "loss": 0.2238, "step": 3470 }, { "epoch": 16.73076923076923, "grad_norm": 5.692535400390625, "learning_rate": 4.626068376068376e-05, "loss": 0.2509, "step": 3480 }, { "epoch": 16.778846153846153, "grad_norm": 3.0481839179992676, "learning_rate": 4.6233974358974366e-05, "loss": 0.2217, "step": 3490 }, { "epoch": 16.826923076923077, "grad_norm": 5.5205793380737305, "learning_rate": 4.620726495726496e-05, "loss": 0.2174, "step": 3500 }, { "epoch": 16.875, "grad_norm": 4.8773674964904785, "learning_rate": 4.618055555555556e-05, "loss": 0.2637, "step": 3510 }, { "epoch": 16.923076923076923, "grad_norm": 3.354867696762085, "learning_rate": 4.615384615384616e-05, "loss": 0.2389, "step": 3520 }, { "epoch": 16.971153846153847, "grad_norm": 3.0015499591827393, "learning_rate": 4.6127136752136756e-05, "loss": 0.2274, "step": 3530 }, { "epoch": 17.0, "eval_accuracy": 0.9079043622223077, "eval_loss": 0.2652486562728882, "eval_runtime": 78.0613, "eval_samples_per_second": 332.726, "eval_steps_per_second": 5.201, "step": 3536 }, { "epoch": 17.01923076923077, "grad_norm": 5.557812690734863, "learning_rate": 4.610042735042735e-05, "loss": 0.2232, "step": 3540 }, { "epoch": 17.067307692307693, "grad_norm": 4.5486040115356445, "learning_rate": 4.607371794871795e-05, "loss": 0.2362, "step": 3550 }, { "epoch": 17.115384615384617, "grad_norm": 3.1142873764038086, "learning_rate": 4.604700854700855e-05, "loss": 0.1913, "step": 3560 }, { "epoch": 17.16346153846154, "grad_norm": 4.3920440673828125, "learning_rate": 4.6020299145299145e-05, "loss": 0.2368, "step": 3570 }, { "epoch": 17.21153846153846, "grad_norm": 2.7898828983306885, "learning_rate": 4.599358974358974e-05, "loss": 0.2107, "step": 3580 }, { "epoch": 17.259615384615383, "grad_norm": 5.218888759613037, "learning_rate": 4.596688034188034e-05, "loss": 0.2158, "step": 3590 }, { "epoch": 17.307692307692307, "grad_norm": 4.6842851638793945, "learning_rate": 4.594017094017094e-05, "loss": 0.2165, "step": 3600 }, { "epoch": 17.35576923076923, "grad_norm": 4.1012163162231445, "learning_rate": 4.591346153846154e-05, "loss": 0.2096, "step": 3610 }, { "epoch": 17.403846153846153, "grad_norm": 3.842759609222412, "learning_rate": 4.588675213675214e-05, "loss": 0.2196, "step": 3620 }, { "epoch": 17.451923076923077, "grad_norm": 3.027179002761841, "learning_rate": 4.5860042735042736e-05, "loss": 0.2192, "step": 3630 }, { "epoch": 17.5, "grad_norm": 3.5128471851348877, "learning_rate": 4.5833333333333334e-05, "loss": 0.2323, "step": 3640 }, { "epoch": 17.548076923076923, "grad_norm": 5.511923313140869, "learning_rate": 4.580662393162393e-05, "loss": 0.1991, "step": 3650 }, { "epoch": 17.596153846153847, "grad_norm": 3.5712032318115234, "learning_rate": 4.577991452991453e-05, "loss": 0.2502, "step": 3660 }, { "epoch": 17.64423076923077, "grad_norm": 2.9027912616729736, "learning_rate": 4.5753205128205126e-05, "loss": 0.2047, "step": 3670 }, { "epoch": 17.692307692307693, "grad_norm": 4.112655162811279, "learning_rate": 4.572649572649573e-05, "loss": 0.2277, "step": 3680 }, { "epoch": 17.740384615384617, "grad_norm": 4.49221658706665, "learning_rate": 4.569978632478633e-05, "loss": 0.2075, "step": 3690 }, { "epoch": 17.78846153846154, "grad_norm": 5.769828796386719, "learning_rate": 4.5673076923076925e-05, "loss": 0.2248, "step": 3700 }, { "epoch": 17.83653846153846, "grad_norm": 3.5926618576049805, "learning_rate": 4.564636752136752e-05, "loss": 0.2101, "step": 3710 }, { "epoch": 17.884615384615383, "grad_norm": 4.068170547485352, "learning_rate": 4.561965811965812e-05, "loss": 0.214, "step": 3720 }, { "epoch": 17.932692307692307, "grad_norm": 4.889640808105469, "learning_rate": 4.559294871794872e-05, "loss": 0.2348, "step": 3730 }, { "epoch": 17.98076923076923, "grad_norm": 4.091845989227295, "learning_rate": 4.5566239316239315e-05, "loss": 0.2339, "step": 3740 }, { "epoch": 18.0, "eval_accuracy": 0.9152966542178416, "eval_loss": 0.254966139793396, "eval_runtime": 77.6106, "eval_samples_per_second": 334.658, "eval_steps_per_second": 5.231, "step": 3744 }, { "epoch": 18.028846153846153, "grad_norm": 3.866283655166626, "learning_rate": 4.553952991452992e-05, "loss": 0.2212, "step": 3750 }, { "epoch": 18.076923076923077, "grad_norm": 4.562051296234131, "learning_rate": 4.5512820512820516e-05, "loss": 0.2162, "step": 3760 }, { "epoch": 18.125, "grad_norm": 4.441306114196777, "learning_rate": 4.5486111111111114e-05, "loss": 0.2075, "step": 3770 }, { "epoch": 18.173076923076923, "grad_norm": 4.256568908691406, "learning_rate": 4.545940170940171e-05, "loss": 0.2056, "step": 3780 }, { "epoch": 18.221153846153847, "grad_norm": 3.9941353797912598, "learning_rate": 4.543269230769231e-05, "loss": 0.2315, "step": 3790 }, { "epoch": 18.26923076923077, "grad_norm": 4.601536750793457, "learning_rate": 4.5405982905982906e-05, "loss": 0.2028, "step": 3800 }, { "epoch": 18.317307692307693, "grad_norm": 5.038381099700928, "learning_rate": 4.53792735042735e-05, "loss": 0.2141, "step": 3810 }, { "epoch": 18.365384615384617, "grad_norm": 3.406494379043579, "learning_rate": 4.535256410256411e-05, "loss": 0.2139, "step": 3820 }, { "epoch": 18.41346153846154, "grad_norm": 4.697597980499268, "learning_rate": 4.5325854700854705e-05, "loss": 0.2131, "step": 3830 }, { "epoch": 18.46153846153846, "grad_norm": 5.912860870361328, "learning_rate": 4.52991452991453e-05, "loss": 0.191, "step": 3840 }, { "epoch": 18.509615384615383, "grad_norm": 2.9721107482910156, "learning_rate": 4.52724358974359e-05, "loss": 0.2262, "step": 3850 }, { "epoch": 18.557692307692307, "grad_norm": 4.594091892242432, "learning_rate": 4.52457264957265e-05, "loss": 0.2109, "step": 3860 }, { "epoch": 18.60576923076923, "grad_norm": 4.6607747077941895, "learning_rate": 4.5219017094017095e-05, "loss": 0.1984, "step": 3870 }, { "epoch": 18.653846153846153, "grad_norm": 3.563316822052002, "learning_rate": 4.519230769230769e-05, "loss": 0.2191, "step": 3880 }, { "epoch": 18.701923076923077, "grad_norm": 3.444108247756958, "learning_rate": 4.5165598290598296e-05, "loss": 0.2173, "step": 3890 }, { "epoch": 18.75, "grad_norm": 6.761950492858887, "learning_rate": 4.5138888888888894e-05, "loss": 0.2131, "step": 3900 }, { "epoch": 18.798076923076923, "grad_norm": 4.137702941894531, "learning_rate": 4.511217948717949e-05, "loss": 0.214, "step": 3910 }, { "epoch": 18.846153846153847, "grad_norm": 4.277977466583252, "learning_rate": 4.508547008547009e-05, "loss": 0.2069, "step": 3920 }, { "epoch": 18.89423076923077, "grad_norm": 3.8296375274658203, "learning_rate": 4.5058760683760686e-05, "loss": 0.2232, "step": 3930 }, { "epoch": 18.942307692307693, "grad_norm": 4.196337699890137, "learning_rate": 4.503205128205128e-05, "loss": 0.2463, "step": 3940 }, { "epoch": 18.990384615384617, "grad_norm": 4.461086750030518, "learning_rate": 4.500534188034188e-05, "loss": 0.2053, "step": 3950 }, { "epoch": 19.0, "eval_accuracy": 0.9105609671582028, "eval_loss": 0.28188812732696533, "eval_runtime": 78.0005, "eval_samples_per_second": 332.985, "eval_steps_per_second": 5.205, "step": 3952 }, { "epoch": 19.03846153846154, "grad_norm": 3.8619091510772705, "learning_rate": 4.4978632478632485e-05, "loss": 0.1986, "step": 3960 }, { "epoch": 19.08653846153846, "grad_norm": 2.832623243331909, "learning_rate": 4.495192307692308e-05, "loss": 0.1984, "step": 3970 }, { "epoch": 19.134615384615383, "grad_norm": 3.364708185195923, "learning_rate": 4.492521367521368e-05, "loss": 0.1979, "step": 3980 }, { "epoch": 19.182692307692307, "grad_norm": 3.1990065574645996, "learning_rate": 4.489850427350428e-05, "loss": 0.2035, "step": 3990 }, { "epoch": 19.23076923076923, "grad_norm": 3.45723557472229, "learning_rate": 4.4871794871794874e-05, "loss": 0.2183, "step": 4000 }, { "epoch": 19.278846153846153, "grad_norm": 5.302587509155273, "learning_rate": 4.484508547008547e-05, "loss": 0.2042, "step": 4010 }, { "epoch": 19.326923076923077, "grad_norm": 4.559330940246582, "learning_rate": 4.481837606837607e-05, "loss": 0.2086, "step": 4020 }, { "epoch": 19.375, "grad_norm": 4.479496002197266, "learning_rate": 4.4791666666666673e-05, "loss": 0.1998, "step": 4030 }, { "epoch": 19.423076923076923, "grad_norm": 5.866844654083252, "learning_rate": 4.476495726495727e-05, "loss": 0.2278, "step": 4040 }, { "epoch": 19.471153846153847, "grad_norm": 5.313492774963379, "learning_rate": 4.473824786324787e-05, "loss": 0.2096, "step": 4050 }, { "epoch": 19.51923076923077, "grad_norm": 3.7483298778533936, "learning_rate": 4.4711538461538466e-05, "loss": 0.2084, "step": 4060 }, { "epoch": 19.567307692307693, "grad_norm": 4.931823253631592, "learning_rate": 4.468482905982906e-05, "loss": 0.2236, "step": 4070 }, { "epoch": 19.615384615384617, "grad_norm": 4.328209400177002, "learning_rate": 4.465811965811966e-05, "loss": 0.2085, "step": 4080 }, { "epoch": 19.66346153846154, "grad_norm": 4.9296088218688965, "learning_rate": 4.463141025641026e-05, "loss": 0.2155, "step": 4090 }, { "epoch": 19.71153846153846, "grad_norm": 4.916458606719971, "learning_rate": 4.460470085470086e-05, "loss": 0.2197, "step": 4100 }, { "epoch": 19.759615384615383, "grad_norm": 3.0691449642181396, "learning_rate": 4.457799145299146e-05, "loss": 0.1981, "step": 4110 }, { "epoch": 19.807692307692307, "grad_norm": 4.687877655029297, "learning_rate": 4.455128205128206e-05, "loss": 0.2209, "step": 4120 }, { "epoch": 19.85576923076923, "grad_norm": 3.2619102001190186, "learning_rate": 4.452457264957265e-05, "loss": 0.2052, "step": 4130 }, { "epoch": 19.903846153846153, "grad_norm": 4.5530571937561035, "learning_rate": 4.4497863247863245e-05, "loss": 0.2187, "step": 4140 }, { "epoch": 19.951923076923077, "grad_norm": 2.5921518802642822, "learning_rate": 4.447115384615384e-05, "loss": 0.2113, "step": 4150 }, { "epoch": 20.0, "grad_norm": 5.733920574188232, "learning_rate": 4.4444444444444447e-05, "loss": 0.2063, "step": 4160 }, { "epoch": 20.0, "eval_accuracy": 0.9129095599276171, "eval_loss": 0.27466538548469543, "eval_runtime": 77.8279, "eval_samples_per_second": 333.723, "eval_steps_per_second": 5.217, "step": 4160 }, { "epoch": 20.048076923076923, "grad_norm": 2.932892084121704, "learning_rate": 4.4417735042735044e-05, "loss": 0.1988, "step": 4170 }, { "epoch": 20.096153846153847, "grad_norm": 3.929918050765991, "learning_rate": 4.439102564102564e-05, "loss": 0.1952, "step": 4180 }, { "epoch": 20.14423076923077, "grad_norm": 4.714585304260254, "learning_rate": 4.436431623931624e-05, "loss": 0.2128, "step": 4190 }, { "epoch": 20.192307692307693, "grad_norm": 2.976383924484253, "learning_rate": 4.4337606837606836e-05, "loss": 0.1944, "step": 4200 }, { "epoch": 20.240384615384617, "grad_norm": 6.976171493530273, "learning_rate": 4.4310897435897434e-05, "loss": 0.222, "step": 4210 }, { "epoch": 20.28846153846154, "grad_norm": 2.9546520709991455, "learning_rate": 4.428418803418803e-05, "loss": 0.2035, "step": 4220 }, { "epoch": 20.33653846153846, "grad_norm": 4.392848491668701, "learning_rate": 4.4257478632478635e-05, "loss": 0.206, "step": 4230 }, { "epoch": 20.384615384615383, "grad_norm": 5.038520336151123, "learning_rate": 4.423076923076923e-05, "loss": 0.1912, "step": 4240 }, { "epoch": 20.432692307692307, "grad_norm": 5.235860824584961, "learning_rate": 4.420405982905983e-05, "loss": 0.2032, "step": 4250 }, { "epoch": 20.48076923076923, "grad_norm": 3.610867738723755, "learning_rate": 4.417735042735043e-05, "loss": 0.1915, "step": 4260 }, { "epoch": 20.528846153846153, "grad_norm": 3.7599639892578125, "learning_rate": 4.4150641025641025e-05, "loss": 0.1744, "step": 4270 }, { "epoch": 20.576923076923077, "grad_norm": 4.142217636108398, "learning_rate": 4.412393162393162e-05, "loss": 0.207, "step": 4280 }, { "epoch": 20.625, "grad_norm": 4.824557304382324, "learning_rate": 4.4097222222222226e-05, "loss": 0.2075, "step": 4290 }, { "epoch": 20.673076923076923, "grad_norm": 4.178677558898926, "learning_rate": 4.4070512820512824e-05, "loss": 0.1847, "step": 4300 }, { "epoch": 20.721153846153847, "grad_norm": 3.8840489387512207, "learning_rate": 4.404380341880342e-05, "loss": 0.1738, "step": 4310 }, { "epoch": 20.76923076923077, "grad_norm": 5.399970531463623, "learning_rate": 4.401709401709402e-05, "loss": 0.1945, "step": 4320 }, { "epoch": 20.817307692307693, "grad_norm": 4.5591888427734375, "learning_rate": 4.3990384615384616e-05, "loss": 0.2123, "step": 4330 }, { "epoch": 20.865384615384617, "grad_norm": 3.1210103034973145, "learning_rate": 4.3963675213675214e-05, "loss": 0.2143, "step": 4340 }, { "epoch": 20.91346153846154, "grad_norm": 3.3902838230133057, "learning_rate": 4.393696581196581e-05, "loss": 0.2036, "step": 4350 }, { "epoch": 20.96153846153846, "grad_norm": 3.522028923034668, "learning_rate": 4.3910256410256415e-05, "loss": 0.1964, "step": 4360 }, { "epoch": 21.0, "eval_accuracy": 0.9117930158241251, "eval_loss": 0.29752397537231445, "eval_runtime": 78.0771, "eval_samples_per_second": 332.658, "eval_steps_per_second": 5.2, "step": 4368 }, { "epoch": 21.009615384615383, "grad_norm": 3.7490057945251465, "learning_rate": 4.388354700854701e-05, "loss": 0.2106, "step": 4370 }, { "epoch": 21.057692307692307, "grad_norm": 4.133479595184326, "learning_rate": 4.385683760683761e-05, "loss": 0.1782, "step": 4380 }, { "epoch": 21.10576923076923, "grad_norm": 5.655460357666016, "learning_rate": 4.383012820512821e-05, "loss": 0.2228, "step": 4390 }, { "epoch": 21.153846153846153, "grad_norm": 4.605985164642334, "learning_rate": 4.3803418803418805e-05, "loss": 0.1976, "step": 4400 }, { "epoch": 21.201923076923077, "grad_norm": 3.823129653930664, "learning_rate": 4.37767094017094e-05, "loss": 0.1904, "step": 4410 }, { "epoch": 21.25, "grad_norm": 3.248720407485962, "learning_rate": 4.375e-05, "loss": 0.1836, "step": 4420 }, { "epoch": 21.298076923076923, "grad_norm": 5.669900894165039, "learning_rate": 4.3723290598290604e-05, "loss": 0.2036, "step": 4430 }, { "epoch": 21.346153846153847, "grad_norm": 4.537118434906006, "learning_rate": 4.36965811965812e-05, "loss": 0.1925, "step": 4440 }, { "epoch": 21.39423076923077, "grad_norm": 4.245039463043213, "learning_rate": 4.36698717948718e-05, "loss": 0.1901, "step": 4450 }, { "epoch": 21.442307692307693, "grad_norm": 3.576913356781006, "learning_rate": 4.3643162393162396e-05, "loss": 0.1943, "step": 4460 }, { "epoch": 21.490384615384617, "grad_norm": 4.054525852203369, "learning_rate": 4.361645299145299e-05, "loss": 0.1971, "step": 4470 }, { "epoch": 21.53846153846154, "grad_norm": 4.403632640838623, "learning_rate": 4.358974358974359e-05, "loss": 0.2085, "step": 4480 }, { "epoch": 21.58653846153846, "grad_norm": 5.972105026245117, "learning_rate": 4.356303418803419e-05, "loss": 0.1996, "step": 4490 }, { "epoch": 21.634615384615383, "grad_norm": 3.7211532592773438, "learning_rate": 4.353632478632479e-05, "loss": 0.2195, "step": 4500 }, { "epoch": 21.682692307692307, "grad_norm": 5.070723533630371, "learning_rate": 4.350961538461539e-05, "loss": 0.2103, "step": 4510 }, { "epoch": 21.73076923076923, "grad_norm": 4.668215751647949, "learning_rate": 4.348290598290599e-05, "loss": 0.1975, "step": 4520 }, { "epoch": 21.778846153846153, "grad_norm": 2.9622445106506348, "learning_rate": 4.3456196581196585e-05, "loss": 0.1919, "step": 4530 }, { "epoch": 21.826923076923077, "grad_norm": 4.453085422515869, "learning_rate": 4.342948717948718e-05, "loss": 0.1902, "step": 4540 }, { "epoch": 21.875, "grad_norm": 4.67719030380249, "learning_rate": 4.340277777777778e-05, "loss": 0.2025, "step": 4550 }, { "epoch": 21.923076923076923, "grad_norm": 2.941673994064331, "learning_rate": 4.337606837606838e-05, "loss": 0.1894, "step": 4560 }, { "epoch": 21.971153846153847, "grad_norm": 4.3872575759887695, "learning_rate": 4.334935897435898e-05, "loss": 0.1953, "step": 4570 }, { "epoch": 22.0, "eval_accuracy": 0.9145266238016402, "eval_loss": 0.2799212336540222, "eval_runtime": 78.6039, "eval_samples_per_second": 330.429, "eval_steps_per_second": 5.165, "step": 4576 }, { "epoch": 22.01923076923077, "grad_norm": 3.441617727279663, "learning_rate": 4.332264957264958e-05, "loss": 0.1984, "step": 4580 }, { "epoch": 22.067307692307693, "grad_norm": 3.881721258163452, "learning_rate": 4.3295940170940176e-05, "loss": 0.1644, "step": 4590 }, { "epoch": 22.115384615384617, "grad_norm": 3.488006830215454, "learning_rate": 4.326923076923077e-05, "loss": 0.1793, "step": 4600 }, { "epoch": 22.16346153846154, "grad_norm": 3.5052764415740967, "learning_rate": 4.324252136752137e-05, "loss": 0.1798, "step": 4610 }, { "epoch": 22.21153846153846, "grad_norm": 3.9601776599884033, "learning_rate": 4.321581196581197e-05, "loss": 0.1624, "step": 4620 }, { "epoch": 22.259615384615383, "grad_norm": 3.1455678939819336, "learning_rate": 4.3189102564102565e-05, "loss": 0.1835, "step": 4630 }, { "epoch": 22.307692307692307, "grad_norm": 6.543357849121094, "learning_rate": 4.316239316239317e-05, "loss": 0.2123, "step": 4640 }, { "epoch": 22.35576923076923, "grad_norm": 3.4983468055725098, "learning_rate": 4.313568376068377e-05, "loss": 0.1813, "step": 4650 }, { "epoch": 22.403846153846153, "grad_norm": 3.4483296871185303, "learning_rate": 4.3108974358974364e-05, "loss": 0.1744, "step": 4660 }, { "epoch": 22.451923076923077, "grad_norm": 3.959857225418091, "learning_rate": 4.308226495726496e-05, "loss": 0.1919, "step": 4670 }, { "epoch": 22.5, "grad_norm": 4.487167835235596, "learning_rate": 4.305555555555556e-05, "loss": 0.2025, "step": 4680 }, { "epoch": 22.548076923076923, "grad_norm": 3.9795658588409424, "learning_rate": 4.302884615384616e-05, "loss": 0.1858, "step": 4690 }, { "epoch": 22.596153846153847, "grad_norm": 4.143675327301025, "learning_rate": 4.3002136752136754e-05, "loss": 0.1948, "step": 4700 }, { "epoch": 22.64423076923077, "grad_norm": 4.833667755126953, "learning_rate": 4.297542735042736e-05, "loss": 0.1909, "step": 4710 }, { "epoch": 22.692307692307693, "grad_norm": 2.7231855392456055, "learning_rate": 4.294871794871795e-05, "loss": 0.188, "step": 4720 }, { "epoch": 22.740384615384617, "grad_norm": 4.040490627288818, "learning_rate": 4.2922008547008546e-05, "loss": 0.1891, "step": 4730 }, { "epoch": 22.78846153846154, "grad_norm": 4.15437126159668, "learning_rate": 4.2895299145299144e-05, "loss": 0.197, "step": 4740 }, { "epoch": 22.83653846153846, "grad_norm": 4.379530906677246, "learning_rate": 4.286858974358974e-05, "loss": 0.1861, "step": 4750 }, { "epoch": 22.884615384615383, "grad_norm": 3.890592575073242, "learning_rate": 4.284188034188034e-05, "loss": 0.1931, "step": 4760 }, { "epoch": 22.932692307692307, "grad_norm": 6.1403021812438965, "learning_rate": 4.281517094017094e-05, "loss": 0.1894, "step": 4770 }, { "epoch": 22.98076923076923, "grad_norm": 3.668121337890625, "learning_rate": 4.278846153846154e-05, "loss": 0.1938, "step": 4780 }, { "epoch": 23.0, "eval_accuracy": 0.9099834443460517, "eval_loss": 0.3196880519390106, "eval_runtime": 78.4982, "eval_samples_per_second": 330.874, "eval_steps_per_second": 5.172, "step": 4784 }, { "epoch": 23.028846153846153, "grad_norm": 4.384853363037109, "learning_rate": 4.276175213675214e-05, "loss": 0.1832, "step": 4790 }, { "epoch": 23.076923076923077, "grad_norm": 5.741386413574219, "learning_rate": 4.2735042735042735e-05, "loss": 0.1711, "step": 4800 }, { "epoch": 23.125, "grad_norm": 3.7387707233428955, "learning_rate": 4.270833333333333e-05, "loss": 0.1976, "step": 4810 }, { "epoch": 23.173076923076923, "grad_norm": 4.587501049041748, "learning_rate": 4.268162393162393e-05, "loss": 0.1623, "step": 4820 }, { "epoch": 23.221153846153847, "grad_norm": 5.079151630401611, "learning_rate": 4.265491452991453e-05, "loss": 0.1644, "step": 4830 }, { "epoch": 23.26923076923077, "grad_norm": 4.399616718292236, "learning_rate": 4.262820512820513e-05, "loss": 0.1828, "step": 4840 }, { "epoch": 23.317307692307693, "grad_norm": 3.907343626022339, "learning_rate": 4.260149572649573e-05, "loss": 0.1781, "step": 4850 }, { "epoch": 23.365384615384617, "grad_norm": 3.6834473609924316, "learning_rate": 4.2574786324786326e-05, "loss": 0.1838, "step": 4860 }, { "epoch": 23.41346153846154, "grad_norm": 3.7385332584381104, "learning_rate": 4.2548076923076924e-05, "loss": 0.1794, "step": 4870 }, { "epoch": 23.46153846153846, "grad_norm": 3.6774544715881348, "learning_rate": 4.252136752136752e-05, "loss": 0.1888, "step": 4880 }, { "epoch": 23.509615384615383, "grad_norm": 4.1113481521606445, "learning_rate": 4.249465811965812e-05, "loss": 0.1833, "step": 4890 }, { "epoch": 23.557692307692307, "grad_norm": 4.279334545135498, "learning_rate": 4.2467948717948716e-05, "loss": 0.1896, "step": 4900 }, { "epoch": 23.60576923076923, "grad_norm": 3.306779623031616, "learning_rate": 4.244123931623932e-05, "loss": 0.1922, "step": 4910 }, { "epoch": 23.653846153846153, "grad_norm": 5.118663787841797, "learning_rate": 4.241452991452992e-05, "loss": 0.196, "step": 4920 }, { "epoch": 23.701923076923077, "grad_norm": 4.5482707023620605, "learning_rate": 4.2387820512820515e-05, "loss": 0.2072, "step": 4930 }, { "epoch": 23.75, "grad_norm": 4.629793167114258, "learning_rate": 4.236111111111111e-05, "loss": 0.1821, "step": 4940 }, { "epoch": 23.798076923076923, "grad_norm": 3.465374708175659, "learning_rate": 4.233440170940171e-05, "loss": 0.2012, "step": 4950 }, { "epoch": 23.846153846153847, "grad_norm": 3.466402530670166, "learning_rate": 4.230769230769231e-05, "loss": 0.1716, "step": 4960 }, { "epoch": 23.89423076923077, "grad_norm": 3.39054536819458, "learning_rate": 4.2280982905982905e-05, "loss": 0.1744, "step": 4970 }, { "epoch": 23.942307692307693, "grad_norm": 3.57611083984375, "learning_rate": 4.225427350427351e-05, "loss": 0.1966, "step": 4980 }, { "epoch": 23.990384615384617, "grad_norm": 4.6689133644104, "learning_rate": 4.2227564102564106e-05, "loss": 0.1851, "step": 4990 }, { "epoch": 24.0, "eval_accuracy": 0.9138335964270589, "eval_loss": 0.3143465518951416, "eval_runtime": 78.173, "eval_samples_per_second": 332.25, "eval_steps_per_second": 5.194, "step": 4992 }, { "epoch": 24.03846153846154, "grad_norm": 3.8470330238342285, "learning_rate": 4.2200854700854704e-05, "loss": 0.1758, "step": 5000 }, { "epoch": 24.08653846153846, "grad_norm": 3.0494766235351562, "learning_rate": 4.21741452991453e-05, "loss": 0.1724, "step": 5010 }, { "epoch": 24.134615384615383, "grad_norm": 3.84008526802063, "learning_rate": 4.21474358974359e-05, "loss": 0.1881, "step": 5020 }, { "epoch": 24.182692307692307, "grad_norm": 3.5797128677368164, "learning_rate": 4.2120726495726496e-05, "loss": 0.1742, "step": 5030 }, { "epoch": 24.23076923076923, "grad_norm": 3.875005006790161, "learning_rate": 4.209401709401709e-05, "loss": 0.1743, "step": 5040 }, { "epoch": 24.278846153846153, "grad_norm": 4.656473636627197, "learning_rate": 4.20673076923077e-05, "loss": 0.1767, "step": 5050 }, { "epoch": 24.326923076923077, "grad_norm": 5.2383623123168945, "learning_rate": 4.2040598290598295e-05, "loss": 0.1754, "step": 5060 }, { "epoch": 24.375, "grad_norm": 4.615761756896973, "learning_rate": 4.201388888888889e-05, "loss": 0.1788, "step": 5070 }, { "epoch": 24.423076923076923, "grad_norm": 3.3107142448425293, "learning_rate": 4.198717948717949e-05, "loss": 0.1703, "step": 5080 }, { "epoch": 24.471153846153847, "grad_norm": 5.435229778289795, "learning_rate": 4.196047008547009e-05, "loss": 0.1709, "step": 5090 }, { "epoch": 24.51923076923077, "grad_norm": 4.586573123931885, "learning_rate": 4.1933760683760684e-05, "loss": 0.1678, "step": 5100 }, { "epoch": 24.567307692307693, "grad_norm": 4.591434001922607, "learning_rate": 4.190705128205128e-05, "loss": 0.1924, "step": 5110 }, { "epoch": 24.615384615384617, "grad_norm": 4.1498565673828125, "learning_rate": 4.1880341880341886e-05, "loss": 0.201, "step": 5120 }, { "epoch": 24.66346153846154, "grad_norm": 5.31571626663208, "learning_rate": 4.1853632478632483e-05, "loss": 0.1778, "step": 5130 }, { "epoch": 24.71153846153846, "grad_norm": 5.839378356933594, "learning_rate": 4.182692307692308e-05, "loss": 0.1807, "step": 5140 }, { "epoch": 24.759615384615383, "grad_norm": 3.960416316986084, "learning_rate": 4.180021367521368e-05, "loss": 0.1649, "step": 5150 }, { "epoch": 24.807692307692307, "grad_norm": 3.815809726715088, "learning_rate": 4.1773504273504276e-05, "loss": 0.1793, "step": 5160 }, { "epoch": 24.85576923076923, "grad_norm": 4.320824146270752, "learning_rate": 4.174679487179487e-05, "loss": 0.175, "step": 5170 }, { "epoch": 24.903846153846153, "grad_norm": 3.01131534576416, "learning_rate": 4.172008547008547e-05, "loss": 0.1611, "step": 5180 }, { "epoch": 24.951923076923077, "grad_norm": 2.6518893241882324, "learning_rate": 4.1693376068376075e-05, "loss": 0.173, "step": 5190 }, { "epoch": 25.0, "grad_norm": 6.5286478996276855, "learning_rate": 4.166666666666667e-05, "loss": 0.1931, "step": 5200 }, { "epoch": 25.0, "eval_accuracy": 0.9125245447195164, "eval_loss": 0.33313611149787903, "eval_runtime": 78.0267, "eval_samples_per_second": 332.873, "eval_steps_per_second": 5.203, "step": 5200 }, { "epoch": 25.048076923076923, "grad_norm": 4.209245681762695, "learning_rate": 4.163995726495727e-05, "loss": 0.157, "step": 5210 }, { "epoch": 25.096153846153847, "grad_norm": 3.714681625366211, "learning_rate": 4.161324786324787e-05, "loss": 0.1928, "step": 5220 }, { "epoch": 25.14423076923077, "grad_norm": 3.0338079929351807, "learning_rate": 4.1586538461538464e-05, "loss": 0.1677, "step": 5230 }, { "epoch": 25.192307692307693, "grad_norm": 3.987922430038452, "learning_rate": 4.155982905982906e-05, "loss": 0.1898, "step": 5240 }, { "epoch": 25.240384615384617, "grad_norm": 4.097054481506348, "learning_rate": 4.153311965811966e-05, "loss": 0.182, "step": 5250 }, { "epoch": 25.28846153846154, "grad_norm": 4.124067783355713, "learning_rate": 4.150641025641026e-05, "loss": 0.1696, "step": 5260 }, { "epoch": 25.33653846153846, "grad_norm": 3.2985739707946777, "learning_rate": 4.147970085470086e-05, "loss": 0.172, "step": 5270 }, { "epoch": 25.384615384615383, "grad_norm": 3.3725616931915283, "learning_rate": 4.145299145299146e-05, "loss": 0.1738, "step": 5280 }, { "epoch": 25.432692307692307, "grad_norm": 5.46973180770874, "learning_rate": 4.1426282051282056e-05, "loss": 0.1755, "step": 5290 }, { "epoch": 25.48076923076923, "grad_norm": 3.2343928813934326, "learning_rate": 4.1399572649572646e-05, "loss": 0.1588, "step": 5300 }, { "epoch": 25.528846153846153, "grad_norm": 3.7011661529541016, "learning_rate": 4.137286324786325e-05, "loss": 0.1688, "step": 5310 }, { "epoch": 25.576923076923077, "grad_norm": 4.862238883972168, "learning_rate": 4.134615384615385e-05, "loss": 0.1784, "step": 5320 }, { "epoch": 25.625, "grad_norm": 6.049991607666016, "learning_rate": 4.1319444444444445e-05, "loss": 0.1703, "step": 5330 }, { "epoch": 25.673076923076923, "grad_norm": 3.842723846435547, "learning_rate": 4.129273504273504e-05, "loss": 0.1784, "step": 5340 }, { "epoch": 25.721153846153847, "grad_norm": 4.640078067779541, "learning_rate": 4.126602564102564e-05, "loss": 0.1668, "step": 5350 }, { "epoch": 25.76923076923077, "grad_norm": 4.079240798950195, "learning_rate": 4.123931623931624e-05, "loss": 0.1868, "step": 5360 }, { "epoch": 25.817307692307693, "grad_norm": 4.697221279144287, "learning_rate": 4.1212606837606835e-05, "loss": 0.1761, "step": 5370 }, { "epoch": 25.865384615384617, "grad_norm": 4.092137336730957, "learning_rate": 4.118589743589744e-05, "loss": 0.1514, "step": 5380 }, { "epoch": 25.91346153846154, "grad_norm": 4.174044132232666, "learning_rate": 4.1159188034188036e-05, "loss": 0.1815, "step": 5390 }, { "epoch": 25.96153846153846, "grad_norm": 3.681436538696289, "learning_rate": 4.1132478632478634e-05, "loss": 0.1877, "step": 5400 }, { "epoch": 26.0, "eval_accuracy": 0.9110229854079236, "eval_loss": 0.3044121563434601, "eval_runtime": 78.1018, "eval_samples_per_second": 332.553, "eval_steps_per_second": 5.198, "step": 5408 }, { "epoch": 26.009615384615383, "grad_norm": 2.8443665504455566, "learning_rate": 4.110576923076923e-05, "loss": 0.1905, "step": 5410 }, { "epoch": 26.057692307692307, "grad_norm": 6.0207743644714355, "learning_rate": 4.107905982905983e-05, "loss": 0.1614, "step": 5420 }, { "epoch": 26.10576923076923, "grad_norm": 6.522647857666016, "learning_rate": 4.1052350427350426e-05, "loss": 0.1741, "step": 5430 }, { "epoch": 26.153846153846153, "grad_norm": 3.280303955078125, "learning_rate": 4.1025641025641023e-05, "loss": 0.1687, "step": 5440 }, { "epoch": 26.201923076923077, "grad_norm": 3.9062161445617676, "learning_rate": 4.099893162393163e-05, "loss": 0.1646, "step": 5450 }, { "epoch": 26.25, "grad_norm": 4.2894978523254395, "learning_rate": 4.0972222222222225e-05, "loss": 0.1681, "step": 5460 }, { "epoch": 26.298076923076923, "grad_norm": 5.572530746459961, "learning_rate": 4.094551282051282e-05, "loss": 0.155, "step": 5470 }, { "epoch": 26.346153846153847, "grad_norm": 6.004790782928467, "learning_rate": 4.091880341880342e-05, "loss": 0.1652, "step": 5480 }, { "epoch": 26.39423076923077, "grad_norm": 3.8833580017089844, "learning_rate": 4.089209401709402e-05, "loss": 0.1635, "step": 5490 }, { "epoch": 26.442307692307693, "grad_norm": 3.724947452545166, "learning_rate": 4.0865384615384615e-05, "loss": 0.1593, "step": 5500 }, { "epoch": 26.490384615384617, "grad_norm": 5.7651238441467285, "learning_rate": 4.083867521367521e-05, "loss": 0.1743, "step": 5510 }, { "epoch": 26.53846153846154, "grad_norm": 2.607546329498291, "learning_rate": 4.0811965811965816e-05, "loss": 0.1478, "step": 5520 }, { "epoch": 26.58653846153846, "grad_norm": 3.8152599334716797, "learning_rate": 4.0785256410256414e-05, "loss": 0.1628, "step": 5530 }, { "epoch": 26.634615384615383, "grad_norm": 3.988053560256958, "learning_rate": 4.075854700854701e-05, "loss": 0.1683, "step": 5540 }, { "epoch": 26.682692307692307, "grad_norm": 5.031129837036133, "learning_rate": 4.073183760683761e-05, "loss": 0.1687, "step": 5550 }, { "epoch": 26.73076923076923, "grad_norm": 2.5394625663757324, "learning_rate": 4.0705128205128206e-05, "loss": 0.1677, "step": 5560 }, { "epoch": 26.778846153846153, "grad_norm": 4.519762992858887, "learning_rate": 4.06784188034188e-05, "loss": 0.1627, "step": 5570 }, { "epoch": 26.826923076923077, "grad_norm": 4.4241509437561035, "learning_rate": 4.06517094017094e-05, "loss": 0.1577, "step": 5580 }, { "epoch": 26.875, "grad_norm": 5.211919784545898, "learning_rate": 4.0625000000000005e-05, "loss": 0.1849, "step": 5590 }, { "epoch": 26.923076923076923, "grad_norm": 3.0204975605010986, "learning_rate": 4.05982905982906e-05, "loss": 0.179, "step": 5600 }, { "epoch": 26.971153846153847, "grad_norm": 3.9461841583251953, "learning_rate": 4.05715811965812e-05, "loss": 0.177, "step": 5610 }, { "epoch": 27.0, "eval_accuracy": 0.9109459823663034, "eval_loss": 0.3271152079105377, "eval_runtime": 78.0745, "eval_samples_per_second": 332.67, "eval_steps_per_second": 5.2, "step": 5616 }, { "epoch": 27.01923076923077, "grad_norm": 3.6752266883850098, "learning_rate": 4.05448717948718e-05, "loss": 0.1862, "step": 5620 }, { "epoch": 27.067307692307693, "grad_norm": 2.437899589538574, "learning_rate": 4.0518162393162395e-05, "loss": 0.1613, "step": 5630 }, { "epoch": 27.115384615384617, "grad_norm": 4.542654037475586, "learning_rate": 4.049145299145299e-05, "loss": 0.1818, "step": 5640 }, { "epoch": 27.16346153846154, "grad_norm": 3.3452320098876953, "learning_rate": 4.046474358974359e-05, "loss": 0.1485, "step": 5650 }, { "epoch": 27.21153846153846, "grad_norm": 6.216183662414551, "learning_rate": 4.0438034188034194e-05, "loss": 0.1613, "step": 5660 }, { "epoch": 27.259615384615383, "grad_norm": 4.504946708679199, "learning_rate": 4.041132478632479e-05, "loss": 0.1658, "step": 5670 }, { "epoch": 27.307692307692307, "grad_norm": 3.30070161819458, "learning_rate": 4.038461538461539e-05, "loss": 0.1742, "step": 5680 }, { "epoch": 27.35576923076923, "grad_norm": 3.771266222000122, "learning_rate": 4.0357905982905986e-05, "loss": 0.1583, "step": 5690 }, { "epoch": 27.403846153846153, "grad_norm": 6.5939435958862305, "learning_rate": 4.033119658119658e-05, "loss": 0.1715, "step": 5700 }, { "epoch": 27.451923076923077, "grad_norm": 4.273276329040527, "learning_rate": 4.030448717948718e-05, "loss": 0.1473, "step": 5710 }, { "epoch": 27.5, "grad_norm": 4.141189098358154, "learning_rate": 4.027777777777778e-05, "loss": 0.1739, "step": 5720 }, { "epoch": 27.548076923076923, "grad_norm": 3.7584409713745117, "learning_rate": 4.025106837606838e-05, "loss": 0.1689, "step": 5730 }, { "epoch": 27.596153846153847, "grad_norm": 4.809976577758789, "learning_rate": 4.022435897435898e-05, "loss": 0.1654, "step": 5740 }, { "epoch": 27.64423076923077, "grad_norm": 3.4946725368499756, "learning_rate": 4.019764957264958e-05, "loss": 0.1571, "step": 5750 }, { "epoch": 27.692307692307693, "grad_norm": 3.878688097000122, "learning_rate": 4.0170940170940174e-05, "loss": 0.1561, "step": 5760 }, { "epoch": 27.740384615384617, "grad_norm": 3.749715805053711, "learning_rate": 4.014423076923077e-05, "loss": 0.1636, "step": 5770 }, { "epoch": 27.78846153846154, "grad_norm": 4.7965006828308105, "learning_rate": 4.011752136752137e-05, "loss": 0.1791, "step": 5780 }, { "epoch": 27.83653846153846, "grad_norm": 5.355994701385498, "learning_rate": 4.009081196581197e-05, "loss": 0.1699, "step": 5790 }, { "epoch": 27.884615384615383, "grad_norm": 3.9702951908111572, "learning_rate": 4.006410256410257e-05, "loss": 0.1577, "step": 5800 }, { "epoch": 27.932692307692307, "grad_norm": 5.720573902130127, "learning_rate": 4.003739316239317e-05, "loss": 0.1791, "step": 5810 }, { "epoch": 27.98076923076923, "grad_norm": 3.1767055988311768, "learning_rate": 4.0010683760683766e-05, "loss": 0.1529, "step": 5820 }, { "epoch": 28.0, "eval_accuracy": 0.9094444230547106, "eval_loss": 0.3381701707839966, "eval_runtime": 77.7571, "eval_samples_per_second": 334.027, "eval_steps_per_second": 5.221, "step": 5824 }, { "epoch": 28.028846153846153, "grad_norm": 4.9542999267578125, "learning_rate": 3.998397435897436e-05, "loss": 0.154, "step": 5830 }, { "epoch": 28.076923076923077, "grad_norm": 3.718600273132324, "learning_rate": 3.995726495726496e-05, "loss": 0.1561, "step": 5840 }, { "epoch": 28.125, "grad_norm": 4.380571365356445, "learning_rate": 3.993055555555556e-05, "loss": 0.1642, "step": 5850 }, { "epoch": 28.173076923076923, "grad_norm": 4.008854866027832, "learning_rate": 3.9903846153846155e-05, "loss": 0.1564, "step": 5860 }, { "epoch": 28.221153846153847, "grad_norm": 4.495736598968506, "learning_rate": 3.987713675213676e-05, "loss": 0.1858, "step": 5870 }, { "epoch": 28.26923076923077, "grad_norm": 4.154090881347656, "learning_rate": 3.985042735042736e-05, "loss": 0.1445, "step": 5880 }, { "epoch": 28.317307692307693, "grad_norm": 3.517040729522705, "learning_rate": 3.982371794871795e-05, "loss": 0.1551, "step": 5890 }, { "epoch": 28.365384615384617, "grad_norm": 4.263125896453857, "learning_rate": 3.9797008547008545e-05, "loss": 0.157, "step": 5900 }, { "epoch": 28.41346153846154, "grad_norm": 4.738256454467773, "learning_rate": 3.977029914529914e-05, "loss": 0.1554, "step": 5910 }, { "epoch": 28.46153846153846, "grad_norm": 5.1687188148498535, "learning_rate": 3.974358974358974e-05, "loss": 0.1737, "step": 5920 }, { "epoch": 28.509615384615383, "grad_norm": 3.083665132522583, "learning_rate": 3.9716880341880344e-05, "loss": 0.1528, "step": 5930 }, { "epoch": 28.557692307692307, "grad_norm": 4.325019359588623, "learning_rate": 3.969017094017094e-05, "loss": 0.1384, "step": 5940 }, { "epoch": 28.60576923076923, "grad_norm": 4.344009876251221, "learning_rate": 3.966346153846154e-05, "loss": 0.1726, "step": 5950 }, { "epoch": 28.653846153846153, "grad_norm": 5.334991455078125, "learning_rate": 3.9636752136752136e-05, "loss": 0.1694, "step": 5960 }, { "epoch": 28.701923076923077, "grad_norm": 4.016361713409424, "learning_rate": 3.9610042735042734e-05, "loss": 0.1673, "step": 5970 }, { "epoch": 28.75, "grad_norm": 3.5568084716796875, "learning_rate": 3.958333333333333e-05, "loss": 0.1518, "step": 5980 }, { "epoch": 28.798076923076923, "grad_norm": 5.881350040435791, "learning_rate": 3.955662393162393e-05, "loss": 0.1604, "step": 5990 }, { "epoch": 28.846153846153847, "grad_norm": 6.0495076179504395, "learning_rate": 3.952991452991453e-05, "loss": 0.1826, "step": 6000 }, { "epoch": 28.89423076923077, "grad_norm": 4.724186897277832, "learning_rate": 3.950320512820513e-05, "loss": 0.1447, "step": 6010 }, { "epoch": 28.942307692307693, "grad_norm": 4.489931106567383, "learning_rate": 3.947649572649573e-05, "loss": 0.1512, "step": 6020 }, { "epoch": 28.990384615384617, "grad_norm": 2.5890371799468994, "learning_rate": 3.9449786324786325e-05, "loss": 0.1684, "step": 6030 }, { "epoch": 29.0, "eval_accuracy": 0.9127940553651869, "eval_loss": 0.34147897362709045, "eval_runtime": 78.0338, "eval_samples_per_second": 332.843, "eval_steps_per_second": 5.203, "step": 6032 }, { "epoch": 29.03846153846154, "grad_norm": 3.635453939437866, "learning_rate": 3.942307692307692e-05, "loss": 0.1492, "step": 6040 }, { "epoch": 29.08653846153846, "grad_norm": 3.4948105812072754, "learning_rate": 3.939636752136752e-05, "loss": 0.153, "step": 6050 }, { "epoch": 29.134615384615383, "grad_norm": 4.1236891746521, "learning_rate": 3.9369658119658124e-05, "loss": 0.1565, "step": 6060 }, { "epoch": 29.182692307692307, "grad_norm": 2.5531985759735107, "learning_rate": 3.934294871794872e-05, "loss": 0.1655, "step": 6070 }, { "epoch": 29.23076923076923, "grad_norm": 5.651024341583252, "learning_rate": 3.931623931623932e-05, "loss": 0.1453, "step": 6080 }, { "epoch": 29.278846153846153, "grad_norm": 3.000316858291626, "learning_rate": 3.9289529914529916e-05, "loss": 0.1589, "step": 6090 }, { "epoch": 29.326923076923077, "grad_norm": 4.09881591796875, "learning_rate": 3.9262820512820513e-05, "loss": 0.155, "step": 6100 }, { "epoch": 29.375, "grad_norm": 3.785364866256714, "learning_rate": 3.923611111111111e-05, "loss": 0.1478, "step": 6110 }, { "epoch": 29.423076923076923, "grad_norm": 4.990180015563965, "learning_rate": 3.920940170940171e-05, "loss": 0.1462, "step": 6120 }, { "epoch": 29.471153846153847, "grad_norm": 5.7128119468688965, "learning_rate": 3.918269230769231e-05, "loss": 0.1661, "step": 6130 }, { "epoch": 29.51923076923077, "grad_norm": 3.551820993423462, "learning_rate": 3.915598290598291e-05, "loss": 0.1639, "step": 6140 }, { "epoch": 29.567307692307693, "grad_norm": 4.078210830688477, "learning_rate": 3.912927350427351e-05, "loss": 0.16, "step": 6150 }, { "epoch": 29.615384615384617, "grad_norm": 3.7664098739624023, "learning_rate": 3.9102564102564105e-05, "loss": 0.1558, "step": 6160 }, { "epoch": 29.66346153846154, "grad_norm": 4.6141157150268555, "learning_rate": 3.90758547008547e-05, "loss": 0.1679, "step": 6170 }, { "epoch": 29.71153846153846, "grad_norm": 2.926002025604248, "learning_rate": 3.90491452991453e-05, "loss": 0.1439, "step": 6180 }, { "epoch": 29.759615384615383, "grad_norm": 4.996341228485107, "learning_rate": 3.90224358974359e-05, "loss": 0.1557, "step": 6190 }, { "epoch": 29.807692307692307, "grad_norm": 3.630732536315918, "learning_rate": 3.89957264957265e-05, "loss": 0.1592, "step": 6200 }, { "epoch": 29.85576923076923, "grad_norm": 5.569990634918213, "learning_rate": 3.89690170940171e-05, "loss": 0.1595, "step": 6210 }, { "epoch": 29.903846153846153, "grad_norm": 4.978424072265625, "learning_rate": 3.8942307692307696e-05, "loss": 0.1546, "step": 6220 }, { "epoch": 29.951923076923077, "grad_norm": 3.663975954055786, "learning_rate": 3.891559829059829e-05, "loss": 0.1674, "step": 6230 }, { "epoch": 30.0, "grad_norm": 5.387861728668213, "learning_rate": 3.888888888888889e-05, "loss": 0.176, "step": 6240 }, { "epoch": 30.0, "eval_accuracy": 0.9094829245755207, "eval_loss": 0.34633028507232666, "eval_runtime": 77.952, "eval_samples_per_second": 333.192, "eval_steps_per_second": 5.208, "step": 6240 }, { "epoch": 30.048076923076923, "grad_norm": 4.731963634490967, "learning_rate": 3.886217948717949e-05, "loss": 0.1491, "step": 6250 }, { "epoch": 30.096153846153847, "grad_norm": 5.631259441375732, "learning_rate": 3.8835470085470086e-05, "loss": 0.1584, "step": 6260 }, { "epoch": 30.14423076923077, "grad_norm": 2.7207729816436768, "learning_rate": 3.880876068376069e-05, "loss": 0.1594, "step": 6270 }, { "epoch": 30.192307692307693, "grad_norm": 2.7028956413269043, "learning_rate": 3.878205128205129e-05, "loss": 0.1436, "step": 6280 }, { "epoch": 30.240384615384617, "grad_norm": 2.496997117996216, "learning_rate": 3.8755341880341885e-05, "loss": 0.1541, "step": 6290 }, { "epoch": 30.28846153846154, "grad_norm": 3.8166728019714355, "learning_rate": 3.872863247863248e-05, "loss": 0.1654, "step": 6300 }, { "epoch": 30.33653846153846, "grad_norm": 4.879892349243164, "learning_rate": 3.870192307692308e-05, "loss": 0.146, "step": 6310 }, { "epoch": 30.384615384615383, "grad_norm": 5.255593299865723, "learning_rate": 3.867521367521368e-05, "loss": 0.1452, "step": 6320 }, { "epoch": 30.432692307692307, "grad_norm": 4.3609724044799805, "learning_rate": 3.8648504273504274e-05, "loss": 0.1533, "step": 6330 }, { "epoch": 30.48076923076923, "grad_norm": 4.558903217315674, "learning_rate": 3.862179487179488e-05, "loss": 0.1467, "step": 6340 }, { "epoch": 30.528846153846153, "grad_norm": 3.0544934272766113, "learning_rate": 3.8595085470085476e-05, "loss": 0.1661, "step": 6350 }, { "epoch": 30.576923076923077, "grad_norm": 2.5807228088378906, "learning_rate": 3.856837606837607e-05, "loss": 0.1594, "step": 6360 }, { "epoch": 30.625, "grad_norm": 4.0550947189331055, "learning_rate": 3.854166666666667e-05, "loss": 0.1507, "step": 6370 }, { "epoch": 30.673076923076923, "grad_norm": 7.072223663330078, "learning_rate": 3.851495726495727e-05, "loss": 0.1654, "step": 6380 }, { "epoch": 30.721153846153847, "grad_norm": 3.393714427947998, "learning_rate": 3.8488247863247865e-05, "loss": 0.1706, "step": 6390 }, { "epoch": 30.76923076923077, "grad_norm": 3.19111967086792, "learning_rate": 3.846153846153846e-05, "loss": 0.1824, "step": 6400 }, { "epoch": 30.817307692307693, "grad_norm": 4.449220180511475, "learning_rate": 3.843482905982907e-05, "loss": 0.1524, "step": 6410 }, { "epoch": 30.865384615384617, "grad_norm": 4.07639217376709, "learning_rate": 3.8408119658119664e-05, "loss": 0.1499, "step": 6420 }, { "epoch": 30.91346153846154, "grad_norm": 5.803819179534912, "learning_rate": 3.838141025641026e-05, "loss": 0.1706, "step": 6430 }, { "epoch": 30.96153846153846, "grad_norm": 5.6098432540893555, "learning_rate": 3.835470085470086e-05, "loss": 0.1496, "step": 6440 }, { "epoch": 31.0, "eval_accuracy": 0.9136025873021985, "eval_loss": 0.39517924189567566, "eval_runtime": 77.7832, "eval_samples_per_second": 333.915, "eval_steps_per_second": 5.22, "step": 6448 }, { "epoch": 31.009615384615383, "grad_norm": 4.868350028991699, "learning_rate": 3.832799145299146e-05, "loss": 0.1603, "step": 6450 }, { "epoch": 31.057692307692307, "grad_norm": 2.3056046962738037, "learning_rate": 3.8301282051282054e-05, "loss": 0.1512, "step": 6460 }, { "epoch": 31.10576923076923, "grad_norm": 6.724637508392334, "learning_rate": 3.827457264957265e-05, "loss": 0.12, "step": 6470 }, { "epoch": 31.153846153846153, "grad_norm": 5.528247356414795, "learning_rate": 3.824786324786325e-05, "loss": 0.1673, "step": 6480 }, { "epoch": 31.201923076923077, "grad_norm": 2.7605347633361816, "learning_rate": 3.8221153846153846e-05, "loss": 0.1609, "step": 6490 }, { "epoch": 31.25, "grad_norm": 5.8477396965026855, "learning_rate": 3.8194444444444444e-05, "loss": 0.1488, "step": 6500 }, { "epoch": 31.298076923076923, "grad_norm": 3.589146137237549, "learning_rate": 3.816773504273504e-05, "loss": 0.1713, "step": 6510 }, { "epoch": 31.346153846153847, "grad_norm": 2.3862528800964355, "learning_rate": 3.814102564102564e-05, "loss": 0.1565, "step": 6520 }, { "epoch": 31.39423076923077, "grad_norm": 4.958017349243164, "learning_rate": 3.8114316239316236e-05, "loss": 0.1795, "step": 6530 }, { "epoch": 31.442307692307693, "grad_norm": 3.660580635070801, "learning_rate": 3.808760683760684e-05, "loss": 0.1378, "step": 6540 }, { "epoch": 31.490384615384617, "grad_norm": 2.8025529384613037, "learning_rate": 3.806089743589744e-05, "loss": 0.1635, "step": 6550 }, { "epoch": 31.53846153846154, "grad_norm": 4.64219331741333, "learning_rate": 3.8034188034188035e-05, "loss": 0.1592, "step": 6560 }, { "epoch": 31.58653846153846, "grad_norm": 3.3402113914489746, "learning_rate": 3.800747863247863e-05, "loss": 0.164, "step": 6570 }, { "epoch": 31.634615384615383, "grad_norm": 2.019383192062378, "learning_rate": 3.798076923076923e-05, "loss": 0.151, "step": 6580 }, { "epoch": 31.682692307692307, "grad_norm": 5.954462051391602, "learning_rate": 3.795405982905983e-05, "loss": 0.1627, "step": 6590 }, { "epoch": 31.73076923076923, "grad_norm": 6.63533353805542, "learning_rate": 3.7927350427350425e-05, "loss": 0.1608, "step": 6600 }, { "epoch": 31.778846153846153, "grad_norm": 3.5446202754974365, "learning_rate": 3.790064102564103e-05, "loss": 0.1472, "step": 6610 }, { "epoch": 31.826923076923077, "grad_norm": 3.7349255084991455, "learning_rate": 3.7873931623931626e-05, "loss": 0.1787, "step": 6620 }, { "epoch": 31.875, "grad_norm": 3.451387882232666, "learning_rate": 3.7847222222222224e-05, "loss": 0.1422, "step": 6630 }, { "epoch": 31.923076923076923, "grad_norm": 4.484333515167236, "learning_rate": 3.782051282051282e-05, "loss": 0.14, "step": 6640 }, { "epoch": 31.971153846153847, "grad_norm": 4.382156848907471, "learning_rate": 3.779380341880342e-05, "loss": 0.1509, "step": 6650 }, { "epoch": 32.0, "eval_accuracy": 0.9121010279906057, "eval_loss": 0.368988960981369, "eval_runtime": 78.1565, "eval_samples_per_second": 332.32, "eval_steps_per_second": 5.195, "step": 6656 }, { "epoch": 32.01923076923077, "grad_norm": 3.465475082397461, "learning_rate": 3.7767094017094016e-05, "loss": 0.1502, "step": 6660 }, { "epoch": 32.06730769230769, "grad_norm": 2.533485174179077, "learning_rate": 3.774038461538461e-05, "loss": 0.1481, "step": 6670 }, { "epoch": 32.11538461538461, "grad_norm": 5.192600727081299, "learning_rate": 3.771367521367522e-05, "loss": 0.1489, "step": 6680 }, { "epoch": 32.16346153846154, "grad_norm": 6.614710807800293, "learning_rate": 3.7686965811965815e-05, "loss": 0.1485, "step": 6690 }, { "epoch": 32.21153846153846, "grad_norm": 2.734715700149536, "learning_rate": 3.766025641025641e-05, "loss": 0.1524, "step": 6700 }, { "epoch": 32.25961538461539, "grad_norm": 5.554875373840332, "learning_rate": 3.763354700854701e-05, "loss": 0.1447, "step": 6710 }, { "epoch": 32.30769230769231, "grad_norm": 3.767120361328125, "learning_rate": 3.760683760683761e-05, "loss": 0.152, "step": 6720 }, { "epoch": 32.35576923076923, "grad_norm": 4.088368892669678, "learning_rate": 3.7580128205128204e-05, "loss": 0.1365, "step": 6730 }, { "epoch": 32.40384615384615, "grad_norm": 5.602968215942383, "learning_rate": 3.75534188034188e-05, "loss": 0.1382, "step": 6740 }, { "epoch": 32.45192307692308, "grad_norm": 4.0960187911987305, "learning_rate": 3.7526709401709406e-05, "loss": 0.1504, "step": 6750 }, { "epoch": 32.5, "grad_norm": 5.152284145355225, "learning_rate": 3.7500000000000003e-05, "loss": 0.1422, "step": 6760 }, { "epoch": 32.54807692307692, "grad_norm": 4.057671546936035, "learning_rate": 3.74732905982906e-05, "loss": 0.1559, "step": 6770 }, { "epoch": 32.59615384615385, "grad_norm": 4.661623954772949, "learning_rate": 3.74465811965812e-05, "loss": 0.1584, "step": 6780 }, { "epoch": 32.64423076923077, "grad_norm": 5.32603120803833, "learning_rate": 3.7419871794871796e-05, "loss": 0.1481, "step": 6790 }, { "epoch": 32.69230769230769, "grad_norm": 3.7380731105804443, "learning_rate": 3.739316239316239e-05, "loss": 0.1423, "step": 6800 }, { "epoch": 32.74038461538461, "grad_norm": 3.9668354988098145, "learning_rate": 3.736645299145299e-05, "loss": 0.1609, "step": 6810 }, { "epoch": 32.78846153846154, "grad_norm": 5.3843793869018555, "learning_rate": 3.7339743589743595e-05, "loss": 0.1635, "step": 6820 }, { "epoch": 32.83653846153846, "grad_norm": 3.7663753032684326, "learning_rate": 3.731303418803419e-05, "loss": 0.1379, "step": 6830 }, { "epoch": 32.88461538461539, "grad_norm": 3.247709035873413, "learning_rate": 3.728632478632479e-05, "loss": 0.1456, "step": 6840 }, { "epoch": 32.93269230769231, "grad_norm": 2.4754912853240967, "learning_rate": 3.725961538461539e-05, "loss": 0.1448, "step": 6850 }, { "epoch": 32.98076923076923, "grad_norm": 3.1328771114349365, "learning_rate": 3.7232905982905984e-05, "loss": 0.1463, "step": 6860 }, { "epoch": 33.0, "eval_accuracy": 0.9094444230547106, "eval_loss": 0.39991047978401184, "eval_runtime": 78.2155, "eval_samples_per_second": 332.07, "eval_steps_per_second": 5.191, "step": 6864 }, { "epoch": 33.02884615384615, "grad_norm": 5.404738903045654, "learning_rate": 3.720619658119658e-05, "loss": 0.1517, "step": 6870 }, { "epoch": 33.07692307692308, "grad_norm": 5.986526012420654, "learning_rate": 3.717948717948718e-05, "loss": 0.1587, "step": 6880 }, { "epoch": 33.125, "grad_norm": 4.535394668579102, "learning_rate": 3.715277777777778e-05, "loss": 0.1403, "step": 6890 }, { "epoch": 33.17307692307692, "grad_norm": 4.8741865158081055, "learning_rate": 3.712606837606838e-05, "loss": 0.1431, "step": 6900 }, { "epoch": 33.22115384615385, "grad_norm": 6.3070068359375, "learning_rate": 3.709935897435898e-05, "loss": 0.1241, "step": 6910 }, { "epoch": 33.26923076923077, "grad_norm": 5.597393035888672, "learning_rate": 3.7072649572649576e-05, "loss": 0.1453, "step": 6920 }, { "epoch": 33.31730769230769, "grad_norm": 4.556349754333496, "learning_rate": 3.704594017094017e-05, "loss": 0.1491, "step": 6930 }, { "epoch": 33.36538461538461, "grad_norm": 3.0740392208099365, "learning_rate": 3.701923076923077e-05, "loss": 0.1401, "step": 6940 }, { "epoch": 33.41346153846154, "grad_norm": 3.6338937282562256, "learning_rate": 3.699252136752137e-05, "loss": 0.165, "step": 6950 }, { "epoch": 33.46153846153846, "grad_norm": 4.686102867126465, "learning_rate": 3.696581196581197e-05, "loss": 0.1502, "step": 6960 }, { "epoch": 33.50961538461539, "grad_norm": 2.5798964500427246, "learning_rate": 3.693910256410257e-05, "loss": 0.1421, "step": 6970 }, { "epoch": 33.55769230769231, "grad_norm": 4.405801296234131, "learning_rate": 3.691239316239317e-05, "loss": 0.1499, "step": 6980 }, { "epoch": 33.60576923076923, "grad_norm": 3.9691855907440186, "learning_rate": 3.6885683760683764e-05, "loss": 0.1359, "step": 6990 }, { "epoch": 33.65384615384615, "grad_norm": 4.121423721313477, "learning_rate": 3.685897435897436e-05, "loss": 0.1267, "step": 7000 }, { "epoch": 33.70192307692308, "grad_norm": 2.616647243499756, "learning_rate": 3.683226495726496e-05, "loss": 0.1356, "step": 7010 }, { "epoch": 33.75, "grad_norm": 4.886743545532227, "learning_rate": 3.6805555555555556e-05, "loss": 0.16, "step": 7020 }, { "epoch": 33.79807692307692, "grad_norm": 5.181188106536865, "learning_rate": 3.677884615384616e-05, "loss": 0.1444, "step": 7030 }, { "epoch": 33.84615384615385, "grad_norm": 4.703493118286133, "learning_rate": 3.675213675213676e-05, "loss": 0.1519, "step": 7040 }, { "epoch": 33.89423076923077, "grad_norm": 3.6483147144317627, "learning_rate": 3.6725427350427355e-05, "loss": 0.1177, "step": 7050 }, { "epoch": 33.94230769230769, "grad_norm": 3.589634656906128, "learning_rate": 3.6698717948717946e-05, "loss": 0.1614, "step": 7060 }, { "epoch": 33.99038461538461, "grad_norm": 2.448310375213623, "learning_rate": 3.6672008547008544e-05, "loss": 0.1354, "step": 7070 }, { "epoch": 34.0, "eval_accuracy": 0.9134870827397682, "eval_loss": 0.399640291929245, "eval_runtime": 77.9004, "eval_samples_per_second": 333.413, "eval_steps_per_second": 5.212, "step": 7072 }, { "epoch": 34.03846153846154, "grad_norm": 3.9780120849609375, "learning_rate": 3.664529914529915e-05, "loss": 0.1472, "step": 7080 }, { "epoch": 34.08653846153846, "grad_norm": 3.813169002532959, "learning_rate": 3.6618589743589745e-05, "loss": 0.1355, "step": 7090 }, { "epoch": 34.13461538461539, "grad_norm": 2.574537754058838, "learning_rate": 3.659188034188034e-05, "loss": 0.1482, "step": 7100 }, { "epoch": 34.18269230769231, "grad_norm": 5.6071882247924805, "learning_rate": 3.656517094017094e-05, "loss": 0.1421, "step": 7110 }, { "epoch": 34.23076923076923, "grad_norm": 3.218581199645996, "learning_rate": 3.653846153846154e-05, "loss": 0.1573, "step": 7120 }, { "epoch": 34.27884615384615, "grad_norm": 4.6184797286987305, "learning_rate": 3.6511752136752135e-05, "loss": 0.1546, "step": 7130 }, { "epoch": 34.32692307692308, "grad_norm": 2.503598690032959, "learning_rate": 3.648504273504273e-05, "loss": 0.1338, "step": 7140 }, { "epoch": 34.375, "grad_norm": 5.598848342895508, "learning_rate": 3.6458333333333336e-05, "loss": 0.1472, "step": 7150 }, { "epoch": 34.42307692307692, "grad_norm": 4.900747776031494, "learning_rate": 3.6431623931623934e-05, "loss": 0.1531, "step": 7160 }, { "epoch": 34.47115384615385, "grad_norm": 2.710580587387085, "learning_rate": 3.640491452991453e-05, "loss": 0.1552, "step": 7170 }, { "epoch": 34.51923076923077, "grad_norm": 3.2416439056396484, "learning_rate": 3.637820512820513e-05, "loss": 0.143, "step": 7180 }, { "epoch": 34.56730769230769, "grad_norm": 3.833693504333496, "learning_rate": 3.6351495726495726e-05, "loss": 0.1345, "step": 7190 }, { "epoch": 34.61538461538461, "grad_norm": 5.972621917724609, "learning_rate": 3.6324786324786323e-05, "loss": 0.1432, "step": 7200 }, { "epoch": 34.66346153846154, "grad_norm": 4.934006214141846, "learning_rate": 3.629807692307692e-05, "loss": 0.1486, "step": 7210 }, { "epoch": 34.71153846153846, "grad_norm": 3.2558631896972656, "learning_rate": 3.6271367521367525e-05, "loss": 0.1353, "step": 7220 }, { "epoch": 34.75961538461539, "grad_norm": 7.657632350921631, "learning_rate": 3.624465811965812e-05, "loss": 0.14, "step": 7230 }, { "epoch": 34.80769230769231, "grad_norm": 4.702003479003906, "learning_rate": 3.621794871794872e-05, "loss": 0.1603, "step": 7240 }, { "epoch": 34.85576923076923, "grad_norm": 4.301212310791016, "learning_rate": 3.619123931623932e-05, "loss": 0.1682, "step": 7250 }, { "epoch": 34.90384615384615, "grad_norm": 2.99410080909729, "learning_rate": 3.6164529914529915e-05, "loss": 0.1567, "step": 7260 }, { "epoch": 34.95192307692308, "grad_norm": 2.489537000656128, "learning_rate": 3.613782051282051e-05, "loss": 0.1513, "step": 7270 }, { "epoch": 35.0, "grad_norm": 2.8082985877990723, "learning_rate": 3.611111111111111e-05, "loss": 0.1546, "step": 7280 }, { "epoch": 35.0, "eval_accuracy": 0.9115620066992646, "eval_loss": 0.3809815049171448, "eval_runtime": 78.1622, "eval_samples_per_second": 332.296, "eval_steps_per_second": 5.194, "step": 7280 }, { "epoch": 35.04807692307692, "grad_norm": 2.901674747467041, "learning_rate": 3.6084401709401714e-05, "loss": 0.1385, "step": 7290 }, { "epoch": 35.09615384615385, "grad_norm": 3.64674711227417, "learning_rate": 3.605769230769231e-05, "loss": 0.141, "step": 7300 }, { "epoch": 35.14423076923077, "grad_norm": 2.643458843231201, "learning_rate": 3.603098290598291e-05, "loss": 0.1561, "step": 7310 }, { "epoch": 35.19230769230769, "grad_norm": 3.6889781951904297, "learning_rate": 3.6004273504273506e-05, "loss": 0.1357, "step": 7320 }, { "epoch": 35.24038461538461, "grad_norm": 4.648171424865723, "learning_rate": 3.59775641025641e-05, "loss": 0.1427, "step": 7330 }, { "epoch": 35.28846153846154, "grad_norm": 3.280611991882324, "learning_rate": 3.59508547008547e-05, "loss": 0.1257, "step": 7340 }, { "epoch": 35.33653846153846, "grad_norm": 4.853590488433838, "learning_rate": 3.59241452991453e-05, "loss": 0.1518, "step": 7350 }, { "epoch": 35.38461538461539, "grad_norm": 3.892134428024292, "learning_rate": 3.58974358974359e-05, "loss": 0.1528, "step": 7360 }, { "epoch": 35.43269230769231, "grad_norm": 4.401625633239746, "learning_rate": 3.58707264957265e-05, "loss": 0.1578, "step": 7370 }, { "epoch": 35.48076923076923, "grad_norm": 2.9867522716522217, "learning_rate": 3.58440170940171e-05, "loss": 0.1424, "step": 7380 }, { "epoch": 35.52884615384615, "grad_norm": 4.039861679077148, "learning_rate": 3.5817307692307695e-05, "loss": 0.1426, "step": 7390 }, { "epoch": 35.57692307692308, "grad_norm": 2.149143695831299, "learning_rate": 3.579059829059829e-05, "loss": 0.1346, "step": 7400 }, { "epoch": 35.625, "grad_norm": 4.661281585693359, "learning_rate": 3.576388888888889e-05, "loss": 0.1396, "step": 7410 }, { "epoch": 35.67307692307692, "grad_norm": 6.029346942901611, "learning_rate": 3.573717948717949e-05, "loss": 0.1352, "step": 7420 }, { "epoch": 35.72115384615385, "grad_norm": 2.6054129600524902, "learning_rate": 3.571047008547009e-05, "loss": 0.1292, "step": 7430 }, { "epoch": 35.76923076923077, "grad_norm": 3.909296989440918, "learning_rate": 3.568376068376069e-05, "loss": 0.157, "step": 7440 }, { "epoch": 35.81730769230769, "grad_norm": 5.399840354919434, "learning_rate": 3.5657051282051286e-05, "loss": 0.1462, "step": 7450 }, { "epoch": 35.86538461538461, "grad_norm": 5.190165996551514, "learning_rate": 3.563034188034188e-05, "loss": 0.1375, "step": 7460 }, { "epoch": 35.91346153846154, "grad_norm": 3.650630235671997, "learning_rate": 3.560363247863248e-05, "loss": 0.1426, "step": 7470 }, { "epoch": 35.96153846153846, "grad_norm": 3.56064510345459, "learning_rate": 3.557692307692308e-05, "loss": 0.1513, "step": 7480 }, { "epoch": 36.0, "eval_accuracy": 0.9121395295114157, "eval_loss": 0.3991892635822296, "eval_runtime": 77.8032, "eval_samples_per_second": 333.83, "eval_steps_per_second": 5.218, "step": 7488 }, { "epoch": 36.00961538461539, "grad_norm": 2.8871936798095703, "learning_rate": 3.5550213675213675e-05, "loss": 0.1511, "step": 7490 }, { "epoch": 36.05769230769231, "grad_norm": 3.344449520111084, "learning_rate": 3.552350427350428e-05, "loss": 0.131, "step": 7500 }, { "epoch": 36.10576923076923, "grad_norm": 4.236584186553955, "learning_rate": 3.549679487179488e-05, "loss": 0.1398, "step": 7510 }, { "epoch": 36.15384615384615, "grad_norm": 4.394327163696289, "learning_rate": 3.5470085470085474e-05, "loss": 0.1362, "step": 7520 }, { "epoch": 36.20192307692308, "grad_norm": 4.46572208404541, "learning_rate": 3.544337606837607e-05, "loss": 0.1377, "step": 7530 }, { "epoch": 36.25, "grad_norm": 3.2499351501464844, "learning_rate": 3.541666666666667e-05, "loss": 0.1352, "step": 7540 }, { "epoch": 36.29807692307692, "grad_norm": 4.803370952606201, "learning_rate": 3.538995726495727e-05, "loss": 0.1425, "step": 7550 }, { "epoch": 36.34615384615385, "grad_norm": 3.9128172397613525, "learning_rate": 3.5363247863247864e-05, "loss": 0.1322, "step": 7560 }, { "epoch": 36.39423076923077, "grad_norm": 6.595008850097656, "learning_rate": 3.533653846153847e-05, "loss": 0.1461, "step": 7570 }, { "epoch": 36.44230769230769, "grad_norm": 5.029943943023682, "learning_rate": 3.5309829059829066e-05, "loss": 0.1427, "step": 7580 }, { "epoch": 36.49038461538461, "grad_norm": 4.977112770080566, "learning_rate": 3.528311965811966e-05, "loss": 0.1298, "step": 7590 }, { "epoch": 36.53846153846154, "grad_norm": 2.9619500637054443, "learning_rate": 3.525641025641026e-05, "loss": 0.1328, "step": 7600 }, { "epoch": 36.58653846153846, "grad_norm": 3.4531147480010986, "learning_rate": 3.522970085470086e-05, "loss": 0.1363, "step": 7610 }, { "epoch": 36.63461538461539, "grad_norm": 4.343055725097656, "learning_rate": 3.5202991452991455e-05, "loss": 0.1439, "step": 7620 }, { "epoch": 36.68269230769231, "grad_norm": 4.6562886238098145, "learning_rate": 3.517628205128205e-05, "loss": 0.1359, "step": 7630 }, { "epoch": 36.73076923076923, "grad_norm": 2.607271909713745, "learning_rate": 3.514957264957265e-05, "loss": 0.1435, "step": 7640 }, { "epoch": 36.77884615384615, "grad_norm": 4.257352352142334, "learning_rate": 3.512286324786325e-05, "loss": 0.1503, "step": 7650 }, { "epoch": 36.82692307692308, "grad_norm": 4.039111614227295, "learning_rate": 3.5096153846153845e-05, "loss": 0.1324, "step": 7660 }, { "epoch": 36.875, "grad_norm": 4.199617385864258, "learning_rate": 3.506944444444444e-05, "loss": 0.1586, "step": 7670 }, { "epoch": 36.92307692307692, "grad_norm": 5.03641939163208, "learning_rate": 3.504273504273504e-05, "loss": 0.1305, "step": 7680 }, { "epoch": 36.97115384615385, "grad_norm": 4.0007758140563965, "learning_rate": 3.501602564102564e-05, "loss": 0.115, "step": 7690 }, { "epoch": 37.0, "eval_accuracy": 0.9131790705732876, "eval_loss": 0.4295423924922943, "eval_runtime": 77.9725, "eval_samples_per_second": 333.105, "eval_steps_per_second": 5.207, "step": 7696 }, { "epoch": 37.01923076923077, "grad_norm": 4.269541263580322, "learning_rate": 3.498931623931624e-05, "loss": 0.1512, "step": 7700 }, { "epoch": 37.06730769230769, "grad_norm": 4.56640100479126, "learning_rate": 3.496260683760684e-05, "loss": 0.1472, "step": 7710 }, { "epoch": 37.11538461538461, "grad_norm": 4.770233631134033, "learning_rate": 3.4935897435897436e-05, "loss": 0.1404, "step": 7720 }, { "epoch": 37.16346153846154, "grad_norm": 3.509243965148926, "learning_rate": 3.4909188034188034e-05, "loss": 0.128, "step": 7730 }, { "epoch": 37.21153846153846, "grad_norm": 4.376786708831787, "learning_rate": 3.488247863247863e-05, "loss": 0.1451, "step": 7740 }, { "epoch": 37.25961538461539, "grad_norm": 2.0512568950653076, "learning_rate": 3.485576923076923e-05, "loss": 0.1404, "step": 7750 }, { "epoch": 37.30769230769231, "grad_norm": 5.076381206512451, "learning_rate": 3.4829059829059826e-05, "loss": 0.1482, "step": 7760 }, { "epoch": 37.35576923076923, "grad_norm": 3.7548201084136963, "learning_rate": 3.480235042735043e-05, "loss": 0.1469, "step": 7770 }, { "epoch": 37.40384615384615, "grad_norm": 3.815889835357666, "learning_rate": 3.477564102564103e-05, "loss": 0.12, "step": 7780 }, { "epoch": 37.45192307692308, "grad_norm": 4.114205837249756, "learning_rate": 3.4748931623931625e-05, "loss": 0.1297, "step": 7790 }, { "epoch": 37.5, "grad_norm": 3.5584936141967773, "learning_rate": 3.472222222222222e-05, "loss": 0.1257, "step": 7800 }, { "epoch": 37.54807692307692, "grad_norm": 5.417263507843018, "learning_rate": 3.469551282051282e-05, "loss": 0.1428, "step": 7810 }, { "epoch": 37.59615384615385, "grad_norm": 3.7350008487701416, "learning_rate": 3.466880341880342e-05, "loss": 0.1422, "step": 7820 }, { "epoch": 37.64423076923077, "grad_norm": 3.697824239730835, "learning_rate": 3.4642094017094014e-05, "loss": 0.1379, "step": 7830 }, { "epoch": 37.69230769230769, "grad_norm": 3.1287758350372314, "learning_rate": 3.461538461538462e-05, "loss": 0.1431, "step": 7840 }, { "epoch": 37.74038461538461, "grad_norm": 4.281350135803223, "learning_rate": 3.4588675213675216e-05, "loss": 0.1329, "step": 7850 }, { "epoch": 37.78846153846154, "grad_norm": 2.75110125541687, "learning_rate": 3.4561965811965813e-05, "loss": 0.1507, "step": 7860 }, { "epoch": 37.83653846153846, "grad_norm": 5.800961494445801, "learning_rate": 3.453525641025641e-05, "loss": 0.142, "step": 7870 }, { "epoch": 37.88461538461539, "grad_norm": 2.861645221710205, "learning_rate": 3.450854700854701e-05, "loss": 0.1293, "step": 7880 }, { "epoch": 37.93269230769231, "grad_norm": 3.8630967140197754, "learning_rate": 3.4481837606837606e-05, "loss": 0.1556, "step": 7890 }, { "epoch": 37.98076923076923, "grad_norm": 4.848718643188477, "learning_rate": 3.445512820512821e-05, "loss": 0.1479, "step": 7900 }, { "epoch": 38.0, "eval_accuracy": 0.912293535594656, "eval_loss": 0.43626850843429565, "eval_runtime": 77.945, "eval_samples_per_second": 333.222, "eval_steps_per_second": 5.209, "step": 7904 }, { "epoch": 38.02884615384615, "grad_norm": 3.6879303455352783, "learning_rate": 3.442841880341881e-05, "loss": 0.1321, "step": 7910 }, { "epoch": 38.07692307692308, "grad_norm": 3.2512354850769043, "learning_rate": 3.4401709401709405e-05, "loss": 0.1313, "step": 7920 }, { "epoch": 38.125, "grad_norm": 4.81367826461792, "learning_rate": 3.4375e-05, "loss": 0.1329, "step": 7930 }, { "epoch": 38.17307692307692, "grad_norm": 2.9697203636169434, "learning_rate": 3.43482905982906e-05, "loss": 0.1359, "step": 7940 }, { "epoch": 38.22115384615385, "grad_norm": 3.288330078125, "learning_rate": 3.43215811965812e-05, "loss": 0.1376, "step": 7950 }, { "epoch": 38.26923076923077, "grad_norm": 3.7957329750061035, "learning_rate": 3.4294871794871794e-05, "loss": 0.1486, "step": 7960 }, { "epoch": 38.31730769230769, "grad_norm": 1.8747841119766235, "learning_rate": 3.42681623931624e-05, "loss": 0.1186, "step": 7970 }, { "epoch": 38.36538461538461, "grad_norm": 2.897852897644043, "learning_rate": 3.4241452991452996e-05, "loss": 0.1399, "step": 7980 }, { "epoch": 38.41346153846154, "grad_norm": 4.293024063110352, "learning_rate": 3.421474358974359e-05, "loss": 0.1423, "step": 7990 }, { "epoch": 38.46153846153846, "grad_norm": 4.698550701141357, "learning_rate": 3.418803418803419e-05, "loss": 0.1307, "step": 8000 }, { "epoch": 38.50961538461539, "grad_norm": 2.5049352645874023, "learning_rate": 3.416132478632479e-05, "loss": 0.1328, "step": 8010 }, { "epoch": 38.55769230769231, "grad_norm": 4.674388885498047, "learning_rate": 3.4134615384615386e-05, "loss": 0.144, "step": 8020 }, { "epoch": 38.60576923076923, "grad_norm": 4.012131690979004, "learning_rate": 3.410790598290598e-05, "loss": 0.144, "step": 8030 }, { "epoch": 38.65384615384615, "grad_norm": 2.6297082901000977, "learning_rate": 3.408119658119659e-05, "loss": 0.1232, "step": 8040 }, { "epoch": 38.70192307692308, "grad_norm": 3.545236587524414, "learning_rate": 3.4054487179487185e-05, "loss": 0.135, "step": 8050 }, { "epoch": 38.75, "grad_norm": 5.545792102813721, "learning_rate": 3.402777777777778e-05, "loss": 0.1392, "step": 8060 }, { "epoch": 38.79807692307692, "grad_norm": 2.0516915321350098, "learning_rate": 3.400106837606838e-05, "loss": 0.1314, "step": 8070 }, { "epoch": 38.84615384615385, "grad_norm": 6.177470684051514, "learning_rate": 3.397435897435898e-05, "loss": 0.1482, "step": 8080 }, { "epoch": 38.89423076923077, "grad_norm": 2.696314811706543, "learning_rate": 3.3947649572649574e-05, "loss": 0.1443, "step": 8090 }, { "epoch": 38.94230769230769, "grad_norm": 4.212292194366455, "learning_rate": 3.392094017094017e-05, "loss": 0.1429, "step": 8100 }, { "epoch": 38.99038461538461, "grad_norm": 3.451840400695801, "learning_rate": 3.3894230769230776e-05, "loss": 0.1455, "step": 8110 }, { "epoch": 39.0, "eval_accuracy": 0.9139876025102992, "eval_loss": 0.42195364832878113, "eval_runtime": 78.1596, "eval_samples_per_second": 332.307, "eval_steps_per_second": 5.195, "step": 8112 }, { "epoch": 39.03846153846154, "grad_norm": 4.72067403793335, "learning_rate": 3.386752136752137e-05, "loss": 0.146, "step": 8120 }, { "epoch": 39.08653846153846, "grad_norm": 3.229255199432373, "learning_rate": 3.384081196581197e-05, "loss": 0.1357, "step": 8130 }, { "epoch": 39.13461538461539, "grad_norm": 3.9783942699432373, "learning_rate": 3.381410256410257e-05, "loss": 0.1237, "step": 8140 }, { "epoch": 39.18269230769231, "grad_norm": 2.6386287212371826, "learning_rate": 3.3787393162393165e-05, "loss": 0.1333, "step": 8150 }, { "epoch": 39.23076923076923, "grad_norm": 4.458517074584961, "learning_rate": 3.376068376068376e-05, "loss": 0.1312, "step": 8160 }, { "epoch": 39.27884615384615, "grad_norm": 5.552135944366455, "learning_rate": 3.373397435897436e-05, "loss": 0.1368, "step": 8170 }, { "epoch": 39.32692307692308, "grad_norm": 5.39725399017334, "learning_rate": 3.3707264957264964e-05, "loss": 0.1457, "step": 8180 }, { "epoch": 39.375, "grad_norm": 2.196831464767456, "learning_rate": 3.368055555555556e-05, "loss": 0.1272, "step": 8190 }, { "epoch": 39.42307692307692, "grad_norm": 2.695016384124756, "learning_rate": 3.365384615384616e-05, "loss": 0.1399, "step": 8200 }, { "epoch": 39.47115384615385, "grad_norm": 4.425684452056885, "learning_rate": 3.362713675213676e-05, "loss": 0.1317, "step": 8210 }, { "epoch": 39.51923076923077, "grad_norm": 4.451691150665283, "learning_rate": 3.3600427350427354e-05, "loss": 0.1294, "step": 8220 }, { "epoch": 39.56730769230769, "grad_norm": 3.4934184551239014, "learning_rate": 3.3573717948717945e-05, "loss": 0.1349, "step": 8230 }, { "epoch": 39.61538461538461, "grad_norm": 3.9950873851776123, "learning_rate": 3.354700854700855e-05, "loss": 0.1289, "step": 8240 }, { "epoch": 39.66346153846154, "grad_norm": 3.999178171157837, "learning_rate": 3.3520299145299146e-05, "loss": 0.1364, "step": 8250 }, { "epoch": 39.71153846153846, "grad_norm": 2.6505894660949707, "learning_rate": 3.3493589743589744e-05, "loss": 0.1195, "step": 8260 }, { "epoch": 39.75961538461539, "grad_norm": 3.8250606060028076, "learning_rate": 3.346688034188034e-05, "loss": 0.1339, "step": 8270 }, { "epoch": 39.80769230769231, "grad_norm": 6.19631814956665, "learning_rate": 3.344017094017094e-05, "loss": 0.1508, "step": 8280 }, { "epoch": 39.85576923076923, "grad_norm": 3.5337677001953125, "learning_rate": 3.3413461538461536e-05, "loss": 0.1402, "step": 8290 }, { "epoch": 39.90384615384615, "grad_norm": 5.602427005767822, "learning_rate": 3.338675213675213e-05, "loss": 0.134, "step": 8300 }, { "epoch": 39.95192307692308, "grad_norm": 5.377026081085205, "learning_rate": 3.336004273504274e-05, "loss": 0.128, "step": 8310 }, { "epoch": 40.0, "grad_norm": 2.995851755142212, "learning_rate": 3.3333333333333335e-05, "loss": 0.1353, "step": 8320 }, { "epoch": 40.0, "eval_accuracy": 0.9127170523235668, "eval_loss": 0.4111533761024475, "eval_runtime": 78.0912, "eval_samples_per_second": 332.598, "eval_steps_per_second": 5.199, "step": 8320 }, { "epoch": 40.04807692307692, "grad_norm": 2.4657936096191406, "learning_rate": 3.330662393162393e-05, "loss": 0.1416, "step": 8330 }, { "epoch": 40.09615384615385, "grad_norm": 4.973947048187256, "learning_rate": 3.327991452991453e-05, "loss": 0.1332, "step": 8340 }, { "epoch": 40.14423076923077, "grad_norm": 3.786114454269409, "learning_rate": 3.325320512820513e-05, "loss": 0.1316, "step": 8350 }, { "epoch": 40.19230769230769, "grad_norm": 3.3897037506103516, "learning_rate": 3.3226495726495725e-05, "loss": 0.1411, "step": 8360 }, { "epoch": 40.24038461538461, "grad_norm": 2.5333926677703857, "learning_rate": 3.319978632478632e-05, "loss": 0.1452, "step": 8370 }, { "epoch": 40.28846153846154, "grad_norm": 3.665581703186035, "learning_rate": 3.3173076923076926e-05, "loss": 0.1432, "step": 8380 }, { "epoch": 40.33653846153846, "grad_norm": 2.8528714179992676, "learning_rate": 3.3146367521367524e-05, "loss": 0.1298, "step": 8390 }, { "epoch": 40.38461538461539, "grad_norm": 4.943005561828613, "learning_rate": 3.311965811965812e-05, "loss": 0.1284, "step": 8400 }, { "epoch": 40.43269230769231, "grad_norm": 2.428046464920044, "learning_rate": 3.309294871794872e-05, "loss": 0.1398, "step": 8410 }, { "epoch": 40.48076923076923, "grad_norm": 6.478982448577881, "learning_rate": 3.3066239316239316e-05, "loss": 0.1241, "step": 8420 }, { "epoch": 40.52884615384615, "grad_norm": 3.9154534339904785, "learning_rate": 3.303952991452991e-05, "loss": 0.1353, "step": 8430 }, { "epoch": 40.57692307692308, "grad_norm": 5.304614543914795, "learning_rate": 3.301282051282051e-05, "loss": 0.1277, "step": 8440 }, { "epoch": 40.625, "grad_norm": 5.162892818450928, "learning_rate": 3.2986111111111115e-05, "loss": 0.1569, "step": 8450 }, { "epoch": 40.67307692307692, "grad_norm": 4.79444694519043, "learning_rate": 3.295940170940171e-05, "loss": 0.1175, "step": 8460 }, { "epoch": 40.72115384615385, "grad_norm": 5.053725242614746, "learning_rate": 3.293269230769231e-05, "loss": 0.1147, "step": 8470 }, { "epoch": 40.76923076923077, "grad_norm": 3.1853818893432617, "learning_rate": 3.290598290598291e-05, "loss": 0.1464, "step": 8480 }, { "epoch": 40.81730769230769, "grad_norm": 3.9082818031311035, "learning_rate": 3.2879273504273504e-05, "loss": 0.1291, "step": 8490 }, { "epoch": 40.86538461538461, "grad_norm": 3.9759304523468018, "learning_rate": 3.28525641025641e-05, "loss": 0.1225, "step": 8500 }, { "epoch": 40.91346153846154, "grad_norm": 4.922185897827148, "learning_rate": 3.28258547008547e-05, "loss": 0.1195, "step": 8510 }, { "epoch": 40.96153846153846, "grad_norm": 5.6854248046875, "learning_rate": 3.2799145299145303e-05, "loss": 0.141, "step": 8520 }, { "epoch": 41.0, "eval_accuracy": 0.9138720979478689, "eval_loss": 0.43222731351852417, "eval_runtime": 78.046, "eval_samples_per_second": 332.791, "eval_steps_per_second": 5.202, "step": 8528 }, { "epoch": 41.00961538461539, "grad_norm": 5.037850379943848, "learning_rate": 3.27724358974359e-05, "loss": 0.1395, "step": 8530 }, { "epoch": 41.05769230769231, "grad_norm": 3.7313344478607178, "learning_rate": 3.27457264957265e-05, "loss": 0.1363, "step": 8540 }, { "epoch": 41.10576923076923, "grad_norm": 3.8801286220550537, "learning_rate": 3.2719017094017096e-05, "loss": 0.1318, "step": 8550 }, { "epoch": 41.15384615384615, "grad_norm": 3.698017120361328, "learning_rate": 3.269230769230769e-05, "loss": 0.1546, "step": 8560 }, { "epoch": 41.20192307692308, "grad_norm": 3.978790760040283, "learning_rate": 3.266559829059829e-05, "loss": 0.1525, "step": 8570 }, { "epoch": 41.25, "grad_norm": 4.410404682159424, "learning_rate": 3.263888888888889e-05, "loss": 0.1217, "step": 8580 }, { "epoch": 41.29807692307692, "grad_norm": 3.0331263542175293, "learning_rate": 3.261217948717949e-05, "loss": 0.1236, "step": 8590 }, { "epoch": 41.34615384615385, "grad_norm": 4.436903953552246, "learning_rate": 3.258547008547009e-05, "loss": 0.1168, "step": 8600 }, { "epoch": 41.39423076923077, "grad_norm": 4.08458948135376, "learning_rate": 3.255876068376069e-05, "loss": 0.1261, "step": 8610 }, { "epoch": 41.44230769230769, "grad_norm": 2.2983810901641846, "learning_rate": 3.2532051282051284e-05, "loss": 0.1116, "step": 8620 }, { "epoch": 41.49038461538461, "grad_norm": 3.856792449951172, "learning_rate": 3.250534188034188e-05, "loss": 0.115, "step": 8630 }, { "epoch": 41.53846153846154, "grad_norm": 4.201444149017334, "learning_rate": 3.247863247863248e-05, "loss": 0.15, "step": 8640 }, { "epoch": 41.58653846153846, "grad_norm": 3.081361770629883, "learning_rate": 3.2451923076923077e-05, "loss": 0.1276, "step": 8650 }, { "epoch": 41.63461538461539, "grad_norm": 2.978205680847168, "learning_rate": 3.242521367521368e-05, "loss": 0.1278, "step": 8660 }, { "epoch": 41.68269230769231, "grad_norm": 4.003065586090088, "learning_rate": 3.239850427350428e-05, "loss": 0.1201, "step": 8670 }, { "epoch": 41.73076923076923, "grad_norm": 4.2802910804748535, "learning_rate": 3.2371794871794876e-05, "loss": 0.1233, "step": 8680 }, { "epoch": 41.77884615384615, "grad_norm": 4.1065168380737305, "learning_rate": 3.234508547008547e-05, "loss": 0.1545, "step": 8690 }, { "epoch": 41.82692307692308, "grad_norm": 3.2800841331481934, "learning_rate": 3.231837606837607e-05, "loss": 0.1368, "step": 8700 }, { "epoch": 41.875, "grad_norm": 4.697321891784668, "learning_rate": 3.229166666666667e-05, "loss": 0.1427, "step": 8710 }, { "epoch": 41.92307692307692, "grad_norm": 2.4469494819641113, "learning_rate": 3.2264957264957265e-05, "loss": 0.1478, "step": 8720 }, { "epoch": 41.97115384615385, "grad_norm": 2.823274850845337, "learning_rate": 3.223824786324787e-05, "loss": 0.1272, "step": 8730 }, { "epoch": 42.0, "eval_accuracy": 0.9118700188657451, "eval_loss": 0.4176381528377533, "eval_runtime": 77.9013, "eval_samples_per_second": 333.409, "eval_steps_per_second": 5.212, "step": 8736 }, { "epoch": 42.01923076923077, "grad_norm": 3.925825357437134, "learning_rate": 3.221153846153847e-05, "loss": 0.1354, "step": 8740 }, { "epoch": 42.06730769230769, "grad_norm": 6.150909900665283, "learning_rate": 3.2184829059829064e-05, "loss": 0.1396, "step": 8750 }, { "epoch": 42.11538461538461, "grad_norm": 3.7641279697418213, "learning_rate": 3.215811965811966e-05, "loss": 0.1329, "step": 8760 }, { "epoch": 42.16346153846154, "grad_norm": 3.2021288871765137, "learning_rate": 3.213141025641026e-05, "loss": 0.1291, "step": 8770 }, { "epoch": 42.21153846153846, "grad_norm": 3.4581141471862793, "learning_rate": 3.2104700854700856e-05, "loss": 0.1402, "step": 8780 }, { "epoch": 42.25961538461539, "grad_norm": 2.841310501098633, "learning_rate": 3.2077991452991454e-05, "loss": 0.1381, "step": 8790 }, { "epoch": 42.30769230769231, "grad_norm": 3.2659449577331543, "learning_rate": 3.205128205128206e-05, "loss": 0.1371, "step": 8800 }, { "epoch": 42.35576923076923, "grad_norm": 5.641805648803711, "learning_rate": 3.202457264957265e-05, "loss": 0.1161, "step": 8810 }, { "epoch": 42.40384615384615, "grad_norm": 4.007379055023193, "learning_rate": 3.1997863247863246e-05, "loss": 0.1245, "step": 8820 }, { "epoch": 42.45192307692308, "grad_norm": 4.4251017570495605, "learning_rate": 3.1971153846153843e-05, "loss": 0.1179, "step": 8830 }, { "epoch": 42.5, "grad_norm": 3.700692892074585, "learning_rate": 3.194444444444444e-05, "loss": 0.1375, "step": 8840 }, { "epoch": 42.54807692307692, "grad_norm": 5.554290771484375, "learning_rate": 3.1917735042735045e-05, "loss": 0.1265, "step": 8850 }, { "epoch": 42.59615384615385, "grad_norm": 4.622366428375244, "learning_rate": 3.189102564102564e-05, "loss": 0.1426, "step": 8860 }, { "epoch": 42.64423076923077, "grad_norm": 2.6794703006744385, "learning_rate": 3.186431623931624e-05, "loss": 0.1172, "step": 8870 }, { "epoch": 42.69230769230769, "grad_norm": 4.14111328125, "learning_rate": 3.183760683760684e-05, "loss": 0.1424, "step": 8880 }, { "epoch": 42.74038461538461, "grad_norm": 2.9329793453216553, "learning_rate": 3.1810897435897435e-05, "loss": 0.1305, "step": 8890 }, { "epoch": 42.78846153846154, "grad_norm": 3.5885093212127686, "learning_rate": 3.178418803418803e-05, "loss": 0.1306, "step": 8900 }, { "epoch": 42.83653846153846, "grad_norm": 3.1461338996887207, "learning_rate": 3.175747863247863e-05, "loss": 0.1398, "step": 8910 }, { "epoch": 42.88461538461539, "grad_norm": 4.502335071563721, "learning_rate": 3.1730769230769234e-05, "loss": 0.1369, "step": 8920 }, { "epoch": 42.93269230769231, "grad_norm": 2.9337639808654785, "learning_rate": 3.170405982905983e-05, "loss": 0.1437, "step": 8930 }, { "epoch": 42.98076923076923, "grad_norm": 3.799452304840088, "learning_rate": 3.167735042735043e-05, "loss": 0.1402, "step": 8940 }, { "epoch": 43.0, "eval_accuracy": 0.9107919762830632, "eval_loss": 0.40408849716186523, "eval_runtime": 78.4829, "eval_samples_per_second": 330.938, "eval_steps_per_second": 5.173, "step": 8944 }, { "epoch": 43.02884615384615, "grad_norm": 3.338930368423462, "learning_rate": 3.1650641025641026e-05, "loss": 0.1362, "step": 8950 }, { "epoch": 43.07692307692308, "grad_norm": 2.8538975715637207, "learning_rate": 3.162393162393162e-05, "loss": 0.1388, "step": 8960 }, { "epoch": 43.125, "grad_norm": 3.008975028991699, "learning_rate": 3.159722222222222e-05, "loss": 0.1024, "step": 8970 }, { "epoch": 43.17307692307692, "grad_norm": 4.151202201843262, "learning_rate": 3.157051282051282e-05, "loss": 0.1153, "step": 8980 }, { "epoch": 43.22115384615385, "grad_norm": 4.838099002838135, "learning_rate": 3.154380341880342e-05, "loss": 0.1478, "step": 8990 }, { "epoch": 43.26923076923077, "grad_norm": 2.7738840579986572, "learning_rate": 3.151709401709402e-05, "loss": 0.1395, "step": 9000 }, { "epoch": 43.31730769230769, "grad_norm": 1.8630739450454712, "learning_rate": 3.149038461538462e-05, "loss": 0.1249, "step": 9010 }, { "epoch": 43.36538461538461, "grad_norm": 5.910093784332275, "learning_rate": 3.1463675213675215e-05, "loss": 0.1344, "step": 9020 }, { "epoch": 43.41346153846154, "grad_norm": 4.388566493988037, "learning_rate": 3.143696581196581e-05, "loss": 0.1468, "step": 9030 }, { "epoch": 43.46153846153846, "grad_norm": 4.310922145843506, "learning_rate": 3.141025641025641e-05, "loss": 0.1254, "step": 9040 }, { "epoch": 43.50961538461539, "grad_norm": 1.7014272212982178, "learning_rate": 3.138354700854701e-05, "loss": 0.1335, "step": 9050 }, { "epoch": 43.55769230769231, "grad_norm": 2.8892176151275635, "learning_rate": 3.135683760683761e-05, "loss": 0.1301, "step": 9060 }, { "epoch": 43.60576923076923, "grad_norm": 4.522170066833496, "learning_rate": 3.133012820512821e-05, "loss": 0.1305, "step": 9070 }, { "epoch": 43.65384615384615, "grad_norm": 1.8718706369400024, "learning_rate": 3.1303418803418806e-05, "loss": 0.1364, "step": 9080 }, { "epoch": 43.70192307692308, "grad_norm": 5.292089939117432, "learning_rate": 3.12767094017094e-05, "loss": 0.1379, "step": 9090 }, { "epoch": 43.75, "grad_norm": 5.24566650390625, "learning_rate": 3.125e-05, "loss": 0.1352, "step": 9100 }, { "epoch": 43.79807692307692, "grad_norm": 3.3527610301971436, "learning_rate": 3.12232905982906e-05, "loss": 0.1297, "step": 9110 }, { "epoch": 43.84615384615385, "grad_norm": 4.431436538696289, "learning_rate": 3.1196581196581195e-05, "loss": 0.1256, "step": 9120 }, { "epoch": 43.89423076923077, "grad_norm": 3.346484422683716, "learning_rate": 3.11698717948718e-05, "loss": 0.1231, "step": 9130 }, { "epoch": 43.94230769230769, "grad_norm": 3.181917905807495, "learning_rate": 3.11431623931624e-05, "loss": 0.1259, "step": 9140 }, { "epoch": 43.99038461538461, "grad_norm": 4.5647807121276855, "learning_rate": 3.1116452991452994e-05, "loss": 0.1236, "step": 9150 }, { "epoch": 44.0, "eval_accuracy": 0.9095214260963308, "eval_loss": 0.44777753949165344, "eval_runtime": 78.2726, "eval_samples_per_second": 331.828, "eval_steps_per_second": 5.187, "step": 9152 }, { "epoch": 44.03846153846154, "grad_norm": 4.13914680480957, "learning_rate": 3.108974358974359e-05, "loss": 0.1321, "step": 9160 }, { "epoch": 44.08653846153846, "grad_norm": 3.3026795387268066, "learning_rate": 3.106303418803419e-05, "loss": 0.1231, "step": 9170 }, { "epoch": 44.13461538461539, "grad_norm": 3.7818453311920166, "learning_rate": 3.103632478632479e-05, "loss": 0.1342, "step": 9180 }, { "epoch": 44.18269230769231, "grad_norm": 3.9949917793273926, "learning_rate": 3.1009615384615384e-05, "loss": 0.1352, "step": 9190 }, { "epoch": 44.23076923076923, "grad_norm": 3.604119300842285, "learning_rate": 3.098290598290599e-05, "loss": 0.1267, "step": 9200 }, { "epoch": 44.27884615384615, "grad_norm": 3.5399770736694336, "learning_rate": 3.0956196581196586e-05, "loss": 0.114, "step": 9210 }, { "epoch": 44.32692307692308, "grad_norm": 3.458493232727051, "learning_rate": 3.092948717948718e-05, "loss": 0.1249, "step": 9220 }, { "epoch": 44.375, "grad_norm": 3.1036999225616455, "learning_rate": 3.090277777777778e-05, "loss": 0.1426, "step": 9230 }, { "epoch": 44.42307692307692, "grad_norm": 3.3277719020843506, "learning_rate": 3.087606837606838e-05, "loss": 0.1312, "step": 9240 }, { "epoch": 44.47115384615385, "grad_norm": 4.682319641113281, "learning_rate": 3.0849358974358975e-05, "loss": 0.1387, "step": 9250 }, { "epoch": 44.51923076923077, "grad_norm": 6.270090579986572, "learning_rate": 3.082264957264957e-05, "loss": 0.1299, "step": 9260 }, { "epoch": 44.56730769230769, "grad_norm": 4.141357898712158, "learning_rate": 3.079594017094018e-05, "loss": 0.1141, "step": 9270 }, { "epoch": 44.61538461538461, "grad_norm": 2.555760145187378, "learning_rate": 3.0769230769230774e-05, "loss": 0.1197, "step": 9280 }, { "epoch": 44.66346153846154, "grad_norm": 3.634464979171753, "learning_rate": 3.074252136752137e-05, "loss": 0.1305, "step": 9290 }, { "epoch": 44.71153846153846, "grad_norm": 2.4016380310058594, "learning_rate": 3.071581196581197e-05, "loss": 0.1303, "step": 9300 }, { "epoch": 44.75961538461539, "grad_norm": 3.4394328594207764, "learning_rate": 3.0689102564102567e-05, "loss": 0.1359, "step": 9310 }, { "epoch": 44.80769230769231, "grad_norm": 5.297127723693848, "learning_rate": 3.0662393162393164e-05, "loss": 0.1492, "step": 9320 }, { "epoch": 44.85576923076923, "grad_norm": 5.604681015014648, "learning_rate": 3.063568376068376e-05, "loss": 0.1326, "step": 9330 }, { "epoch": 44.90384615384615, "grad_norm": 4.618442535400391, "learning_rate": 3.0608974358974366e-05, "loss": 0.1393, "step": 9340 }, { "epoch": 44.95192307692308, "grad_norm": 3.5694384574890137, "learning_rate": 3.058226495726496e-05, "loss": 0.1328, "step": 9350 }, { "epoch": 45.0, "grad_norm": 5.041379451751709, "learning_rate": 3.055555555555556e-05, "loss": 0.1349, "step": 9360 }, { "epoch": 45.0, "eval_accuracy": 0.911215493011974, "eval_loss": 0.42110779881477356, "eval_runtime": 77.9177, "eval_samples_per_second": 333.339, "eval_steps_per_second": 5.211, "step": 9360 }, { "epoch": 45.04807692307692, "grad_norm": 4.6125969886779785, "learning_rate": 3.052884615384616e-05, "loss": 0.1218, "step": 9370 }, { "epoch": 45.09615384615385, "grad_norm": 1.742217779159546, "learning_rate": 3.0502136752136755e-05, "loss": 0.1369, "step": 9380 }, { "epoch": 45.14423076923077, "grad_norm": 4.071923732757568, "learning_rate": 3.0475427350427356e-05, "loss": 0.1395, "step": 9390 }, { "epoch": 45.19230769230769, "grad_norm": 5.158945083618164, "learning_rate": 3.0448717948717947e-05, "loss": 0.1223, "step": 9400 }, { "epoch": 45.24038461538461, "grad_norm": 3.4633195400238037, "learning_rate": 3.0422008547008547e-05, "loss": 0.1285, "step": 9410 }, { "epoch": 45.28846153846154, "grad_norm": 2.4155452251434326, "learning_rate": 3.0395299145299145e-05, "loss": 0.1295, "step": 9420 }, { "epoch": 45.33653846153846, "grad_norm": 5.185429096221924, "learning_rate": 3.0368589743589742e-05, "loss": 0.1279, "step": 9430 }, { "epoch": 45.38461538461539, "grad_norm": 2.166855812072754, "learning_rate": 3.034188034188034e-05, "loss": 0.1386, "step": 9440 }, { "epoch": 45.43269230769231, "grad_norm": 4.341554164886475, "learning_rate": 3.031517094017094e-05, "loss": 0.1376, "step": 9450 }, { "epoch": 45.48076923076923, "grad_norm": 4.972920894622803, "learning_rate": 3.0288461538461538e-05, "loss": 0.1165, "step": 9460 }, { "epoch": 45.52884615384615, "grad_norm": 5.782724380493164, "learning_rate": 3.0261752136752135e-05, "loss": 0.1241, "step": 9470 }, { "epoch": 45.57692307692308, "grad_norm": 4.149045467376709, "learning_rate": 3.0235042735042736e-05, "loss": 0.109, "step": 9480 }, { "epoch": 45.625, "grad_norm": 4.505321025848389, "learning_rate": 3.0208333333333334e-05, "loss": 0.1195, "step": 9490 }, { "epoch": 45.67307692307692, "grad_norm": 2.791372060775757, "learning_rate": 3.018162393162393e-05, "loss": 0.1166, "step": 9500 }, { "epoch": 45.72115384615385, "grad_norm": 3.4741852283477783, "learning_rate": 3.015491452991453e-05, "loss": 0.1179, "step": 9510 }, { "epoch": 45.76923076923077, "grad_norm": 4.075725078582764, "learning_rate": 3.012820512820513e-05, "loss": 0.1237, "step": 9520 }, { "epoch": 45.81730769230769, "grad_norm": 2.4122588634490967, "learning_rate": 3.0101495726495727e-05, "loss": 0.1476, "step": 9530 }, { "epoch": 45.86538461538461, "grad_norm": 3.3185482025146484, "learning_rate": 3.0074786324786324e-05, "loss": 0.1243, "step": 9540 }, { "epoch": 45.91346153846154, "grad_norm": 2.7626290321350098, "learning_rate": 3.0048076923076925e-05, "loss": 0.1421, "step": 9550 }, { "epoch": 45.96153846153846, "grad_norm": 3.4393885135650635, "learning_rate": 3.0021367521367522e-05, "loss": 0.1472, "step": 9560 }, { "epoch": 46.0, "eval_accuracy": 0.9112924960535941, "eval_loss": 0.4510321319103241, "eval_runtime": 78.7275, "eval_samples_per_second": 329.91, "eval_steps_per_second": 5.157, "step": 9568 }, { "epoch": 46.00961538461539, "grad_norm": 3.982539653778076, "learning_rate": 2.999465811965812e-05, "loss": 0.1377, "step": 9570 }, { "epoch": 46.05769230769231, "grad_norm": 4.187382698059082, "learning_rate": 2.9967948717948717e-05, "loss": 0.1188, "step": 9580 }, { "epoch": 46.10576923076923, "grad_norm": 3.222717761993408, "learning_rate": 2.9941239316239318e-05, "loss": 0.132, "step": 9590 }, { "epoch": 46.15384615384615, "grad_norm": 3.3402624130249023, "learning_rate": 2.9914529914529915e-05, "loss": 0.1286, "step": 9600 }, { "epoch": 46.20192307692308, "grad_norm": 4.574628829956055, "learning_rate": 2.9887820512820513e-05, "loss": 0.1255, "step": 9610 }, { "epoch": 46.25, "grad_norm": 2.402407169342041, "learning_rate": 2.9861111111111113e-05, "loss": 0.1397, "step": 9620 }, { "epoch": 46.29807692307692, "grad_norm": 2.638659715652466, "learning_rate": 2.983440170940171e-05, "loss": 0.1246, "step": 9630 }, { "epoch": 46.34615384615385, "grad_norm": 2.4204704761505127, "learning_rate": 2.9807692307692308e-05, "loss": 0.1181, "step": 9640 }, { "epoch": 46.39423076923077, "grad_norm": 2.313361406326294, "learning_rate": 2.9780982905982906e-05, "loss": 0.129, "step": 9650 }, { "epoch": 46.44230769230769, "grad_norm": 4.1202168464660645, "learning_rate": 2.9754273504273506e-05, "loss": 0.1233, "step": 9660 }, { "epoch": 46.49038461538461, "grad_norm": 3.2036938667297363, "learning_rate": 2.9727564102564104e-05, "loss": 0.1293, "step": 9670 }, { "epoch": 46.53846153846154, "grad_norm": 4.180737018585205, "learning_rate": 2.97008547008547e-05, "loss": 0.1357, "step": 9680 }, { "epoch": 46.58653846153846, "grad_norm": 3.5023419857025146, "learning_rate": 2.9674145299145302e-05, "loss": 0.127, "step": 9690 }, { "epoch": 46.63461538461539, "grad_norm": 3.5417490005493164, "learning_rate": 2.96474358974359e-05, "loss": 0.1361, "step": 9700 }, { "epoch": 46.68269230769231, "grad_norm": 2.3944456577301025, "learning_rate": 2.9620726495726497e-05, "loss": 0.1355, "step": 9710 }, { "epoch": 46.73076923076923, "grad_norm": 6.486210823059082, "learning_rate": 2.9594017094017094e-05, "loss": 0.1232, "step": 9720 }, { "epoch": 46.77884615384615, "grad_norm": 5.7294135093688965, "learning_rate": 2.9567307692307695e-05, "loss": 0.1157, "step": 9730 }, { "epoch": 46.82692307692308, "grad_norm": 2.536653518676758, "learning_rate": 2.9540598290598292e-05, "loss": 0.1187, "step": 9740 }, { "epoch": 46.875, "grad_norm": 3.1829535961151123, "learning_rate": 2.951388888888889e-05, "loss": 0.1328, "step": 9750 }, { "epoch": 46.92307692307692, "grad_norm": 2.0465404987335205, "learning_rate": 2.948717948717949e-05, "loss": 0.1208, "step": 9760 }, { "epoch": 46.97115384615385, "grad_norm": 2.994865655899048, "learning_rate": 2.9460470085470088e-05, "loss": 0.1115, "step": 9770 }, { "epoch": 47.0, "eval_accuracy": 0.9118700188657451, "eval_loss": 0.4372657537460327, "eval_runtime": 78.2084, "eval_samples_per_second": 332.1, "eval_steps_per_second": 5.191, "step": 9776 }, { "epoch": 47.01923076923077, "grad_norm": 4.709226608276367, "learning_rate": 2.9433760683760685e-05, "loss": 0.1333, "step": 9780 }, { "epoch": 47.06730769230769, "grad_norm": 2.698336124420166, "learning_rate": 2.9407051282051283e-05, "loss": 0.1168, "step": 9790 }, { "epoch": 47.11538461538461, "grad_norm": 4.093177795410156, "learning_rate": 2.9380341880341884e-05, "loss": 0.1192, "step": 9800 }, { "epoch": 47.16346153846154, "grad_norm": 4.972163200378418, "learning_rate": 2.935363247863248e-05, "loss": 0.1389, "step": 9810 }, { "epoch": 47.21153846153846, "grad_norm": 5.758094310760498, "learning_rate": 2.932692307692308e-05, "loss": 0.1162, "step": 9820 }, { "epoch": 47.25961538461539, "grad_norm": 6.006216049194336, "learning_rate": 2.930021367521368e-05, "loss": 0.1389, "step": 9830 }, { "epoch": 47.30769230769231, "grad_norm": 3.374110698699951, "learning_rate": 2.9273504273504277e-05, "loss": 0.1219, "step": 9840 }, { "epoch": 47.35576923076923, "grad_norm": 2.2649238109588623, "learning_rate": 2.9246794871794874e-05, "loss": 0.1267, "step": 9850 }, { "epoch": 47.40384615384615, "grad_norm": 3.6668264865875244, "learning_rate": 2.922008547008547e-05, "loss": 0.1268, "step": 9860 }, { "epoch": 47.45192307692308, "grad_norm": 2.9146456718444824, "learning_rate": 2.9193376068376072e-05, "loss": 0.1227, "step": 9870 }, { "epoch": 47.5, "grad_norm": 6.244070529937744, "learning_rate": 2.916666666666667e-05, "loss": 0.1235, "step": 9880 }, { "epoch": 47.54807692307692, "grad_norm": 2.7303998470306396, "learning_rate": 2.9139957264957267e-05, "loss": 0.1188, "step": 9890 }, { "epoch": 47.59615384615385, "grad_norm": 5.614633083343506, "learning_rate": 2.9113247863247868e-05, "loss": 0.1226, "step": 9900 }, { "epoch": 47.64423076923077, "grad_norm": 2.484719753265381, "learning_rate": 2.9086538461538465e-05, "loss": 0.1215, "step": 9910 }, { "epoch": 47.69230769230769, "grad_norm": 3.7377326488494873, "learning_rate": 2.9059829059829063e-05, "loss": 0.1408, "step": 9920 }, { "epoch": 47.74038461538461, "grad_norm": 2.8469793796539307, "learning_rate": 2.903311965811966e-05, "loss": 0.1316, "step": 9930 }, { "epoch": 47.78846153846154, "grad_norm": 5.1570305824279785, "learning_rate": 2.900641025641026e-05, "loss": 0.1201, "step": 9940 }, { "epoch": 47.83653846153846, "grad_norm": 5.565586090087891, "learning_rate": 2.897970085470086e-05, "loss": 0.1152, "step": 9950 }, { "epoch": 47.88461538461539, "grad_norm": 2.430243492126465, "learning_rate": 2.8952991452991456e-05, "loss": 0.1165, "step": 9960 }, { "epoch": 47.93269230769231, "grad_norm": 3.4977474212646484, "learning_rate": 2.8926282051282057e-05, "loss": 0.1165, "step": 9970 }, { "epoch": 47.98076923076923, "grad_norm": 3.4275944232940674, "learning_rate": 2.8899572649572647e-05, "loss": 0.1122, "step": 9980 }, { "epoch": 48.0, "eval_accuracy": 0.9129480614484272, "eval_loss": 0.4689112603664398, "eval_runtime": 78.6762, "eval_samples_per_second": 330.125, "eval_steps_per_second": 5.16, "step": 9984 }, { "epoch": 48.02884615384615, "grad_norm": 1.730431318283081, "learning_rate": 2.8872863247863248e-05, "loss": 0.122, "step": 9990 }, { "epoch": 48.07692307692308, "grad_norm": 3.093275785446167, "learning_rate": 2.8846153846153845e-05, "loss": 0.122, "step": 10000 }, { "epoch": 48.125, "grad_norm": 3.8737919330596924, "learning_rate": 2.8819444444444443e-05, "loss": 0.1332, "step": 10010 }, { "epoch": 48.17307692307692, "grad_norm": 5.1174235343933105, "learning_rate": 2.879273504273504e-05, "loss": 0.1346, "step": 10020 }, { "epoch": 48.22115384615385, "grad_norm": 4.439518928527832, "learning_rate": 2.876602564102564e-05, "loss": 0.1203, "step": 10030 }, { "epoch": 48.26923076923077, "grad_norm": 4.093201160430908, "learning_rate": 2.873931623931624e-05, "loss": 0.136, "step": 10040 }, { "epoch": 48.31730769230769, "grad_norm": 5.383187770843506, "learning_rate": 2.8712606837606836e-05, "loss": 0.1347, "step": 10050 }, { "epoch": 48.36538461538461, "grad_norm": 2.708061695098877, "learning_rate": 2.8685897435897437e-05, "loss": 0.1263, "step": 10060 }, { "epoch": 48.41346153846154, "grad_norm": 3.4715631008148193, "learning_rate": 2.8659188034188034e-05, "loss": 0.115, "step": 10070 }, { "epoch": 48.46153846153846, "grad_norm": 3.8159687519073486, "learning_rate": 2.863247863247863e-05, "loss": 0.1267, "step": 10080 }, { "epoch": 48.50961538461539, "grad_norm": 4.40144681930542, "learning_rate": 2.860576923076923e-05, "loss": 0.1108, "step": 10090 }, { "epoch": 48.55769230769231, "grad_norm": 3.521183967590332, "learning_rate": 2.857905982905983e-05, "loss": 0.1222, "step": 10100 }, { "epoch": 48.60576923076923, "grad_norm": 2.7271006107330322, "learning_rate": 2.8552350427350427e-05, "loss": 0.1272, "step": 10110 }, { "epoch": 48.65384615384615, "grad_norm": 3.788349151611328, "learning_rate": 2.8525641025641025e-05, "loss": 0.1396, "step": 10120 }, { "epoch": 48.70192307692308, "grad_norm": 3.203994035720825, "learning_rate": 2.8498931623931625e-05, "loss": 0.1223, "step": 10130 }, { "epoch": 48.75, "grad_norm": 3.4969069957733154, "learning_rate": 2.8472222222222223e-05, "loss": 0.1174, "step": 10140 }, { "epoch": 48.79807692307692, "grad_norm": 5.669582843780518, "learning_rate": 2.844551282051282e-05, "loss": 0.1136, "step": 10150 }, { "epoch": 48.84615384615385, "grad_norm": 1.5459041595458984, "learning_rate": 2.8418803418803418e-05, "loss": 0.1215, "step": 10160 }, { "epoch": 48.89423076923077, "grad_norm": 3.2844650745391846, "learning_rate": 2.839209401709402e-05, "loss": 0.1171, "step": 10170 }, { "epoch": 48.94230769230769, "grad_norm": 2.4200456142425537, "learning_rate": 2.8365384615384616e-05, "loss": 0.1151, "step": 10180 }, { "epoch": 48.99038461538461, "grad_norm": 3.556185007095337, "learning_rate": 2.8338675213675213e-05, "loss": 0.1297, "step": 10190 }, { "epoch": 49.0, "eval_accuracy": 0.9139876025102992, "eval_loss": 0.4569230377674103, "eval_runtime": 78.4751, "eval_samples_per_second": 330.971, "eval_steps_per_second": 5.174, "step": 10192 }, { "epoch": 49.03846153846154, "grad_norm": 2.682948112487793, "learning_rate": 2.8311965811965814e-05, "loss": 0.1173, "step": 10200 }, { "epoch": 49.08653846153846, "grad_norm": 2.8545734882354736, "learning_rate": 2.828525641025641e-05, "loss": 0.1373, "step": 10210 }, { "epoch": 49.13461538461539, "grad_norm": 3.95514178276062, "learning_rate": 2.825854700854701e-05, "loss": 0.1151, "step": 10220 }, { "epoch": 49.18269230769231, "grad_norm": 4.677338123321533, "learning_rate": 2.8231837606837606e-05, "loss": 0.1314, "step": 10230 }, { "epoch": 49.23076923076923, "grad_norm": 4.182057857513428, "learning_rate": 2.8205128205128207e-05, "loss": 0.1104, "step": 10240 }, { "epoch": 49.27884615384615, "grad_norm": 4.903316497802734, "learning_rate": 2.8178418803418804e-05, "loss": 0.1228, "step": 10250 }, { "epoch": 49.32692307692308, "grad_norm": 3.504810333251953, "learning_rate": 2.8151709401709402e-05, "loss": 0.1291, "step": 10260 }, { "epoch": 49.375, "grad_norm": 2.457141876220703, "learning_rate": 2.8125000000000003e-05, "loss": 0.1232, "step": 10270 }, { "epoch": 49.42307692307692, "grad_norm": 4.332405090332031, "learning_rate": 2.80982905982906e-05, "loss": 0.1249, "step": 10280 }, { "epoch": 49.47115384615385, "grad_norm": 3.574674367904663, "learning_rate": 2.8071581196581197e-05, "loss": 0.112, "step": 10290 }, { "epoch": 49.51923076923077, "grad_norm": 2.8961660861968994, "learning_rate": 2.8044871794871795e-05, "loss": 0.1188, "step": 10300 }, { "epoch": 49.56730769230769, "grad_norm": 2.2778284549713135, "learning_rate": 2.8018162393162396e-05, "loss": 0.1182, "step": 10310 }, { "epoch": 49.61538461538461, "grad_norm": 3.4326441287994385, "learning_rate": 2.7991452991452993e-05, "loss": 0.1402, "step": 10320 }, { "epoch": 49.66346153846154, "grad_norm": 6.3039631843566895, "learning_rate": 2.796474358974359e-05, "loss": 0.1302, "step": 10330 }, { "epoch": 49.71153846153846, "grad_norm": 5.184956073760986, "learning_rate": 2.793803418803419e-05, "loss": 0.1123, "step": 10340 }, { "epoch": 49.75961538461539, "grad_norm": 2.7902655601501465, "learning_rate": 2.791132478632479e-05, "loss": 0.1153, "step": 10350 }, { "epoch": 49.80769230769231, "grad_norm": 6.268124580383301, "learning_rate": 2.7884615384615386e-05, "loss": 0.1088, "step": 10360 }, { "epoch": 49.85576923076923, "grad_norm": 3.863260507583618, "learning_rate": 2.7857905982905987e-05, "loss": 0.1427, "step": 10370 }, { "epoch": 49.90384615384615, "grad_norm": 3.735539197921753, "learning_rate": 2.7831196581196584e-05, "loss": 0.1391, "step": 10380 }, { "epoch": 49.95192307692308, "grad_norm": 2.9845633506774902, "learning_rate": 2.7804487179487182e-05, "loss": 0.1256, "step": 10390 }, { "epoch": 50.0, "grad_norm": 2.5295674800872803, "learning_rate": 2.777777777777778e-05, "loss": 0.1337, "step": 10400 }, { "epoch": 50.0, "eval_accuracy": 0.9111384899703538, "eval_loss": 0.4621720016002655, "eval_runtime": 77.7373, "eval_samples_per_second": 334.112, "eval_steps_per_second": 5.223, "step": 10400 }, { "epoch": 50.04807692307692, "grad_norm": 7.119401931762695, "learning_rate": 2.775106837606838e-05, "loss": 0.1224, "step": 10410 }, { "epoch": 50.09615384615385, "grad_norm": 2.7359299659729004, "learning_rate": 2.7724358974358977e-05, "loss": 0.1123, "step": 10420 }, { "epoch": 50.14423076923077, "grad_norm": 6.7106404304504395, "learning_rate": 2.7697649572649575e-05, "loss": 0.1256, "step": 10430 }, { "epoch": 50.19230769230769, "grad_norm": 4.271613597869873, "learning_rate": 2.7670940170940176e-05, "loss": 0.1195, "step": 10440 }, { "epoch": 50.24038461538461, "grad_norm": 2.779911756515503, "learning_rate": 2.7644230769230773e-05, "loss": 0.131, "step": 10450 }, { "epoch": 50.28846153846154, "grad_norm": 3.9109108448028564, "learning_rate": 2.761752136752137e-05, "loss": 0.1238, "step": 10460 }, { "epoch": 50.33653846153846, "grad_norm": 2.5632646083831787, "learning_rate": 2.7590811965811968e-05, "loss": 0.1126, "step": 10470 }, { "epoch": 50.38461538461539, "grad_norm": 3.7923965454101562, "learning_rate": 2.756410256410257e-05, "loss": 0.1119, "step": 10480 }, { "epoch": 50.43269230769231, "grad_norm": 2.9260525703430176, "learning_rate": 2.7537393162393166e-05, "loss": 0.1181, "step": 10490 }, { "epoch": 50.48076923076923, "grad_norm": 3.150475025177002, "learning_rate": 2.7510683760683763e-05, "loss": 0.114, "step": 10500 }, { "epoch": 50.52884615384615, "grad_norm": 2.863992691040039, "learning_rate": 2.7483974358974364e-05, "loss": 0.1118, "step": 10510 }, { "epoch": 50.57692307692308, "grad_norm": 2.5041580200195312, "learning_rate": 2.745726495726496e-05, "loss": 0.1263, "step": 10520 }, { "epoch": 50.625, "grad_norm": 4.238677501678467, "learning_rate": 2.743055555555556e-05, "loss": 0.1335, "step": 10530 }, { "epoch": 50.67307692307692, "grad_norm": 2.283719301223755, "learning_rate": 2.7403846153846156e-05, "loss": 0.1233, "step": 10540 }, { "epoch": 50.72115384615385, "grad_norm": 3.276979684829712, "learning_rate": 2.7377136752136757e-05, "loss": 0.1174, "step": 10550 }, { "epoch": 50.76923076923077, "grad_norm": 3.5386555194854736, "learning_rate": 2.7350427350427355e-05, "loss": 0.1083, "step": 10560 }, { "epoch": 50.81730769230769, "grad_norm": 4.48482608795166, "learning_rate": 2.732371794871795e-05, "loss": 0.1252, "step": 10570 }, { "epoch": 50.86538461538461, "grad_norm": 2.7078731060028076, "learning_rate": 2.7297008547008546e-05, "loss": 0.1322, "step": 10580 }, { "epoch": 50.91346153846154, "grad_norm": 6.585299968719482, "learning_rate": 2.7270299145299143e-05, "loss": 0.1208, "step": 10590 }, { "epoch": 50.96153846153846, "grad_norm": 3.052446126937866, "learning_rate": 2.724358974358974e-05, "loss": 0.1194, "step": 10600 }, { "epoch": 51.0, "eval_accuracy": 0.9150656450929812, "eval_loss": 0.4579237401485443, "eval_runtime": 78.2574, "eval_samples_per_second": 331.892, "eval_steps_per_second": 5.188, "step": 10608 }, { "epoch": 51.00961538461539, "grad_norm": 4.198080062866211, "learning_rate": 2.721688034188034e-05, "loss": 0.1307, "step": 10610 }, { "epoch": 51.05769230769231, "grad_norm": 2.255786418914795, "learning_rate": 2.719017094017094e-05, "loss": 0.1137, "step": 10620 }, { "epoch": 51.10576923076923, "grad_norm": 2.3282811641693115, "learning_rate": 2.7163461538461536e-05, "loss": 0.1127, "step": 10630 }, { "epoch": 51.15384615384615, "grad_norm": 3.794638156890869, "learning_rate": 2.7136752136752137e-05, "loss": 0.1229, "step": 10640 }, { "epoch": 51.20192307692308, "grad_norm": 3.2138009071350098, "learning_rate": 2.7110042735042735e-05, "loss": 0.1304, "step": 10650 }, { "epoch": 51.25, "grad_norm": 7.202627658843994, "learning_rate": 2.7083333333333332e-05, "loss": 0.1131, "step": 10660 }, { "epoch": 51.29807692307692, "grad_norm": 3.7487599849700928, "learning_rate": 2.705662393162393e-05, "loss": 0.1083, "step": 10670 }, { "epoch": 51.34615384615385, "grad_norm": 3.412104845046997, "learning_rate": 2.702991452991453e-05, "loss": 0.103, "step": 10680 }, { "epoch": 51.39423076923077, "grad_norm": 4.1538310050964355, "learning_rate": 2.7003205128205128e-05, "loss": 0.1212, "step": 10690 }, { "epoch": 51.44230769230769, "grad_norm": 2.7077202796936035, "learning_rate": 2.6976495726495725e-05, "loss": 0.1188, "step": 10700 }, { "epoch": 51.49038461538461, "grad_norm": 2.3533706665039062, "learning_rate": 2.6949786324786326e-05, "loss": 0.1303, "step": 10710 }, { "epoch": 51.53846153846154, "grad_norm": 3.2530131340026855, "learning_rate": 2.6923076923076923e-05, "loss": 0.1203, "step": 10720 }, { "epoch": 51.58653846153846, "grad_norm": 6.1418914794921875, "learning_rate": 2.689636752136752e-05, "loss": 0.1279, "step": 10730 }, { "epoch": 51.63461538461539, "grad_norm": 4.1204915046691895, "learning_rate": 2.686965811965812e-05, "loss": 0.1194, "step": 10740 }, { "epoch": 51.68269230769231, "grad_norm": 4.252894878387451, "learning_rate": 2.684294871794872e-05, "loss": 0.1311, "step": 10750 }, { "epoch": 51.73076923076923, "grad_norm": 5.428421497344971, "learning_rate": 2.6816239316239316e-05, "loss": 0.1316, "step": 10760 }, { "epoch": 51.77884615384615, "grad_norm": 2.0434482097625732, "learning_rate": 2.6789529914529914e-05, "loss": 0.1272, "step": 10770 }, { "epoch": 51.82692307692308, "grad_norm": 2.702918767929077, "learning_rate": 2.6762820512820515e-05, "loss": 0.1119, "step": 10780 }, { "epoch": 51.875, "grad_norm": 3.788759469985962, "learning_rate": 2.6736111111111112e-05, "loss": 0.1138, "step": 10790 }, { "epoch": 51.92307692307692, "grad_norm": 2.494617462158203, "learning_rate": 2.670940170940171e-05, "loss": 0.1214, "step": 10800 }, { "epoch": 51.97115384615385, "grad_norm": 3.8489279747009277, "learning_rate": 2.668269230769231e-05, "loss": 0.1322, "step": 10810 }, { "epoch": 52.0, "eval_accuracy": 0.9104454625957725, "eval_loss": 0.4728435277938843, "eval_runtime": 78.0167, "eval_samples_per_second": 332.916, "eval_steps_per_second": 5.204, "step": 10816 }, { "epoch": 52.01923076923077, "grad_norm": 3.619662284851074, "learning_rate": 2.6655982905982908e-05, "loss": 0.1217, "step": 10820 }, { "epoch": 52.06730769230769, "grad_norm": 2.0659255981445312, "learning_rate": 2.6629273504273505e-05, "loss": 0.1175, "step": 10830 }, { "epoch": 52.11538461538461, "grad_norm": 1.943116307258606, "learning_rate": 2.6602564102564102e-05, "loss": 0.1197, "step": 10840 }, { "epoch": 52.16346153846154, "grad_norm": 2.4552886486053467, "learning_rate": 2.6575854700854703e-05, "loss": 0.1105, "step": 10850 }, { "epoch": 52.21153846153846, "grad_norm": 2.291505813598633, "learning_rate": 2.65491452991453e-05, "loss": 0.1093, "step": 10860 }, { "epoch": 52.25961538461539, "grad_norm": 2.3074071407318115, "learning_rate": 2.6522435897435898e-05, "loss": 0.1183, "step": 10870 }, { "epoch": 52.30769230769231, "grad_norm": 4.398214817047119, "learning_rate": 2.64957264957265e-05, "loss": 0.1072, "step": 10880 }, { "epoch": 52.35576923076923, "grad_norm": 6.301834583282471, "learning_rate": 2.6469017094017096e-05, "loss": 0.1409, "step": 10890 }, { "epoch": 52.40384615384615, "grad_norm": 1.8858188390731812, "learning_rate": 2.6442307692307694e-05, "loss": 0.1102, "step": 10900 }, { "epoch": 52.45192307692308, "grad_norm": 4.247086048126221, "learning_rate": 2.641559829059829e-05, "loss": 0.1149, "step": 10910 }, { "epoch": 52.5, "grad_norm": 3.1861870288848877, "learning_rate": 2.6388888888888892e-05, "loss": 0.1293, "step": 10920 }, { "epoch": 52.54807692307692, "grad_norm": 2.435285806655884, "learning_rate": 2.636217948717949e-05, "loss": 0.1413, "step": 10930 }, { "epoch": 52.59615384615385, "grad_norm": 1.990319013595581, "learning_rate": 2.6335470085470087e-05, "loss": 0.1136, "step": 10940 }, { "epoch": 52.64423076923077, "grad_norm": 4.6072187423706055, "learning_rate": 2.6308760683760687e-05, "loss": 0.1319, "step": 10950 }, { "epoch": 52.69230769230769, "grad_norm": 2.8308162689208984, "learning_rate": 2.6282051282051285e-05, "loss": 0.1306, "step": 10960 }, { "epoch": 52.74038461538461, "grad_norm": 4.259917259216309, "learning_rate": 2.6255341880341882e-05, "loss": 0.1216, "step": 10970 }, { "epoch": 52.78846153846154, "grad_norm": 2.774456262588501, "learning_rate": 2.622863247863248e-05, "loss": 0.1164, "step": 10980 }, { "epoch": 52.83653846153846, "grad_norm": 3.1761629581451416, "learning_rate": 2.620192307692308e-05, "loss": 0.1158, "step": 10990 }, { "epoch": 52.88461538461539, "grad_norm": 3.4038262367248535, "learning_rate": 2.6175213675213678e-05, "loss": 0.1265, "step": 11000 }, { "epoch": 52.93269230769231, "grad_norm": 3.5514979362487793, "learning_rate": 2.6148504273504275e-05, "loss": 0.1129, "step": 11010 }, { "epoch": 52.98076923076923, "grad_norm": 1.9937667846679688, "learning_rate": 2.6121794871794876e-05, "loss": 0.1179, "step": 11020 }, { "epoch": 53.0, "eval_accuracy": 0.9125245447195164, "eval_loss": 0.4728582203388214, "eval_runtime": 77.6673, "eval_samples_per_second": 334.414, "eval_steps_per_second": 5.227, "step": 11024 }, { "epoch": 53.02884615384615, "grad_norm": 5.636879920959473, "learning_rate": 2.6095085470085474e-05, "loss": 0.1096, "step": 11030 }, { "epoch": 53.07692307692308, "grad_norm": 2.0618398189544678, "learning_rate": 2.606837606837607e-05, "loss": 0.1306, "step": 11040 }, { "epoch": 53.125, "grad_norm": 2.799105644226074, "learning_rate": 2.604166666666667e-05, "loss": 0.1142, "step": 11050 }, { "epoch": 53.17307692307692, "grad_norm": 3.4428446292877197, "learning_rate": 2.601495726495727e-05, "loss": 0.1138, "step": 11060 }, { "epoch": 53.22115384615385, "grad_norm": 2.2549190521240234, "learning_rate": 2.5988247863247867e-05, "loss": 0.112, "step": 11070 }, { "epoch": 53.26923076923077, "grad_norm": 4.905076026916504, "learning_rate": 2.5961538461538464e-05, "loss": 0.1238, "step": 11080 }, { "epoch": 53.31730769230769, "grad_norm": 2.2689309120178223, "learning_rate": 2.5934829059829065e-05, "loss": 0.1275, "step": 11090 }, { "epoch": 53.36538461538461, "grad_norm": 2.9593002796173096, "learning_rate": 2.5908119658119662e-05, "loss": 0.128, "step": 11100 }, { "epoch": 53.41346153846154, "grad_norm": 2.7662665843963623, "learning_rate": 2.588141025641026e-05, "loss": 0.1102, "step": 11110 }, { "epoch": 53.46153846153846, "grad_norm": 4.773651123046875, "learning_rate": 2.5854700854700857e-05, "loss": 0.1217, "step": 11120 }, { "epoch": 53.50961538461539, "grad_norm": 2.8189163208007812, "learning_rate": 2.5827991452991458e-05, "loss": 0.1328, "step": 11130 }, { "epoch": 53.55769230769231, "grad_norm": 2.8633482456207275, "learning_rate": 2.5801282051282055e-05, "loss": 0.1133, "step": 11140 }, { "epoch": 53.60576923076923, "grad_norm": 4.044659614562988, "learning_rate": 2.577457264957265e-05, "loss": 0.1117, "step": 11150 }, { "epoch": 53.65384615384615, "grad_norm": 2.189962148666382, "learning_rate": 2.5747863247863247e-05, "loss": 0.1126, "step": 11160 }, { "epoch": 53.70192307692308, "grad_norm": 1.8323568105697632, "learning_rate": 2.5721153846153844e-05, "loss": 0.1217, "step": 11170 }, { "epoch": 53.75, "grad_norm": 4.215389251708984, "learning_rate": 2.5694444444444445e-05, "loss": 0.1075, "step": 11180 }, { "epoch": 53.79807692307692, "grad_norm": 3.476200580596924, "learning_rate": 2.5667735042735042e-05, "loss": 0.1264, "step": 11190 }, { "epoch": 53.84615384615385, "grad_norm": 1.4204655885696411, "learning_rate": 2.564102564102564e-05, "loss": 0.1287, "step": 11200 }, { "epoch": 53.89423076923077, "grad_norm": 2.7961106300354004, "learning_rate": 2.5614316239316237e-05, "loss": 0.114, "step": 11210 }, { "epoch": 53.94230769230769, "grad_norm": 3.570132255554199, "learning_rate": 2.5587606837606838e-05, "loss": 0.1199, "step": 11220 }, { "epoch": 53.99038461538461, "grad_norm": 4.37250280380249, "learning_rate": 2.5560897435897435e-05, "loss": 0.1216, "step": 11230 }, { "epoch": 54.0, "eval_accuracy": 0.9114465021368344, "eval_loss": 0.5198814272880554, "eval_runtime": 78.1387, "eval_samples_per_second": 332.396, "eval_steps_per_second": 5.196, "step": 11232 }, { "epoch": 54.03846153846154, "grad_norm": 3.773003578186035, "learning_rate": 2.5534188034188033e-05, "loss": 0.1161, "step": 11240 }, { "epoch": 54.08653846153846, "grad_norm": 2.551826238632202, "learning_rate": 2.5507478632478633e-05, "loss": 0.1239, "step": 11250 }, { "epoch": 54.13461538461539, "grad_norm": 3.1615707874298096, "learning_rate": 2.548076923076923e-05, "loss": 0.121, "step": 11260 }, { "epoch": 54.18269230769231, "grad_norm": 1.5748475790023804, "learning_rate": 2.5454059829059828e-05, "loss": 0.1215, "step": 11270 }, { "epoch": 54.23076923076923, "grad_norm": 3.8726210594177246, "learning_rate": 2.5427350427350426e-05, "loss": 0.1096, "step": 11280 }, { "epoch": 54.27884615384615, "grad_norm": 2.460731029510498, "learning_rate": 2.5400641025641026e-05, "loss": 0.1216, "step": 11290 }, { "epoch": 54.32692307692308, "grad_norm": 3.817136526107788, "learning_rate": 2.5373931623931624e-05, "loss": 0.123, "step": 11300 }, { "epoch": 54.375, "grad_norm": 8.07175350189209, "learning_rate": 2.534722222222222e-05, "loss": 0.1218, "step": 11310 }, { "epoch": 54.42307692307692, "grad_norm": 1.9205825328826904, "learning_rate": 2.5320512820512822e-05, "loss": 0.1145, "step": 11320 }, { "epoch": 54.47115384615385, "grad_norm": 3.115976572036743, "learning_rate": 2.529380341880342e-05, "loss": 0.122, "step": 11330 }, { "epoch": 54.51923076923077, "grad_norm": 3.0378775596618652, "learning_rate": 2.5267094017094017e-05, "loss": 0.1249, "step": 11340 }, { "epoch": 54.56730769230769, "grad_norm": 1.6623467206954956, "learning_rate": 2.5240384615384614e-05, "loss": 0.1096, "step": 11350 }, { "epoch": 54.61538461538461, "grad_norm": 3.9832887649536133, "learning_rate": 2.5213675213675215e-05, "loss": 0.12, "step": 11360 }, { "epoch": 54.66346153846154, "grad_norm": 5.797616481781006, "learning_rate": 2.5186965811965813e-05, "loss": 0.1203, "step": 11370 }, { "epoch": 54.71153846153846, "grad_norm": 4.0042033195495605, "learning_rate": 2.516025641025641e-05, "loss": 0.1181, "step": 11380 }, { "epoch": 54.75961538461539, "grad_norm": 3.771461009979248, "learning_rate": 2.513354700854701e-05, "loss": 0.1356, "step": 11390 }, { "epoch": 54.80769230769231, "grad_norm": 5.511805057525635, "learning_rate": 2.5106837606837608e-05, "loss": 0.1218, "step": 11400 }, { "epoch": 54.85576923076923, "grad_norm": 3.279745101928711, "learning_rate": 2.5080128205128206e-05, "loss": 0.1259, "step": 11410 }, { "epoch": 54.90384615384615, "grad_norm": 2.1719985008239746, "learning_rate": 2.5053418803418803e-05, "loss": 0.1086, "step": 11420 }, { "epoch": 54.95192307692308, "grad_norm": 1.649346947669983, "learning_rate": 2.5026709401709404e-05, "loss": 0.1236, "step": 11430 }, { "epoch": 55.0, "grad_norm": 2.893418788909912, "learning_rate": 2.5e-05, "loss": 0.1234, "step": 11440 }, { "epoch": 55.0, "eval_accuracy": 0.9135255842605783, "eval_loss": 0.4769396185874939, "eval_runtime": 78.3394, "eval_samples_per_second": 331.544, "eval_steps_per_second": 5.183, "step": 11440 }, { "epoch": 55.04807692307692, "grad_norm": 3.678145408630371, "learning_rate": 2.49732905982906e-05, "loss": 0.1219, "step": 11450 }, { "epoch": 55.09615384615385, "grad_norm": 4.193170070648193, "learning_rate": 2.49465811965812e-05, "loss": 0.1085, "step": 11460 }, { "epoch": 55.14423076923077, "grad_norm": 5.9016032218933105, "learning_rate": 2.4919871794871797e-05, "loss": 0.1287, "step": 11470 }, { "epoch": 55.19230769230769, "grad_norm": 1.8191204071044922, "learning_rate": 2.4893162393162394e-05, "loss": 0.0988, "step": 11480 }, { "epoch": 55.24038461538461, "grad_norm": 1.923538088798523, "learning_rate": 2.486645299145299e-05, "loss": 0.118, "step": 11490 }, { "epoch": 55.28846153846154, "grad_norm": 4.134803295135498, "learning_rate": 2.4839743589743592e-05, "loss": 0.1136, "step": 11500 }, { "epoch": 55.33653846153846, "grad_norm": 3.899116039276123, "learning_rate": 2.481303418803419e-05, "loss": 0.1041, "step": 11510 }, { "epoch": 55.38461538461539, "grad_norm": 4.519454002380371, "learning_rate": 2.4786324786324787e-05, "loss": 0.1212, "step": 11520 }, { "epoch": 55.43269230769231, "grad_norm": 5.235537052154541, "learning_rate": 2.4759615384615388e-05, "loss": 0.1132, "step": 11530 }, { "epoch": 55.48076923076923, "grad_norm": 4.584019660949707, "learning_rate": 2.4732905982905985e-05, "loss": 0.1168, "step": 11540 }, { "epoch": 55.52884615384615, "grad_norm": 1.6670295000076294, "learning_rate": 2.4706196581196583e-05, "loss": 0.1061, "step": 11550 }, { "epoch": 55.57692307692308, "grad_norm": 3.833710193634033, "learning_rate": 2.467948717948718e-05, "loss": 0.1142, "step": 11560 }, { "epoch": 55.625, "grad_norm": 2.30841064453125, "learning_rate": 2.465277777777778e-05, "loss": 0.1157, "step": 11570 }, { "epoch": 55.67307692307692, "grad_norm": 3.125105381011963, "learning_rate": 2.462606837606838e-05, "loss": 0.1177, "step": 11580 }, { "epoch": 55.72115384615385, "grad_norm": 2.2397775650024414, "learning_rate": 2.4599358974358973e-05, "loss": 0.1233, "step": 11590 }, { "epoch": 55.76923076923077, "grad_norm": 3.620213031768799, "learning_rate": 2.4572649572649573e-05, "loss": 0.1243, "step": 11600 }, { "epoch": 55.81730769230769, "grad_norm": 2.252659559249878, "learning_rate": 2.454594017094017e-05, "loss": 0.1017, "step": 11610 }, { "epoch": 55.86538461538461, "grad_norm": 3.298651695251465, "learning_rate": 2.4519230769230768e-05, "loss": 0.1101, "step": 11620 }, { "epoch": 55.91346153846154, "grad_norm": 2.6556055545806885, "learning_rate": 2.449252136752137e-05, "loss": 0.1001, "step": 11630 }, { "epoch": 55.96153846153846, "grad_norm": 3.5320870876312256, "learning_rate": 2.4465811965811966e-05, "loss": 0.1125, "step": 11640 }, { "epoch": 56.0, "eval_accuracy": 0.9117930158241251, "eval_loss": 0.4871075451374054, "eval_runtime": 77.9572, "eval_samples_per_second": 333.17, "eval_steps_per_second": 5.208, "step": 11648 }, { "epoch": 56.00961538461539, "grad_norm": 2.530327796936035, "learning_rate": 2.4439102564102564e-05, "loss": 0.1229, "step": 11650 }, { "epoch": 56.05769230769231, "grad_norm": 2.3239967823028564, "learning_rate": 2.4412393162393165e-05, "loss": 0.109, "step": 11660 }, { "epoch": 56.10576923076923, "grad_norm": 3.364677906036377, "learning_rate": 2.4385683760683762e-05, "loss": 0.1198, "step": 11670 }, { "epoch": 56.15384615384615, "grad_norm": 6.129501819610596, "learning_rate": 2.435897435897436e-05, "loss": 0.1253, "step": 11680 }, { "epoch": 56.20192307692308, "grad_norm": 2.564126968383789, "learning_rate": 2.4332264957264957e-05, "loss": 0.114, "step": 11690 }, { "epoch": 56.25, "grad_norm": 1.9636075496673584, "learning_rate": 2.4305555555555558e-05, "loss": 0.1081, "step": 11700 }, { "epoch": 56.29807692307692, "grad_norm": 3.7714650630950928, "learning_rate": 2.4278846153846155e-05, "loss": 0.1143, "step": 11710 }, { "epoch": 56.34615384615385, "grad_norm": 2.8995754718780518, "learning_rate": 2.4252136752136752e-05, "loss": 0.1115, "step": 11720 }, { "epoch": 56.39423076923077, "grad_norm": 3.238665819168091, "learning_rate": 2.4225427350427353e-05, "loss": 0.1256, "step": 11730 }, { "epoch": 56.44230769230769, "grad_norm": 2.2107467651367188, "learning_rate": 2.419871794871795e-05, "loss": 0.1031, "step": 11740 }, { "epoch": 56.49038461538461, "grad_norm": 1.9958751201629639, "learning_rate": 2.4172008547008548e-05, "loss": 0.1135, "step": 11750 }, { "epoch": 56.53846153846154, "grad_norm": 4.221354007720947, "learning_rate": 2.4145299145299145e-05, "loss": 0.1282, "step": 11760 }, { "epoch": 56.58653846153846, "grad_norm": 4.520522594451904, "learning_rate": 2.4118589743589746e-05, "loss": 0.118, "step": 11770 }, { "epoch": 56.63461538461539, "grad_norm": 2.9906861782073975, "learning_rate": 2.4091880341880344e-05, "loss": 0.116, "step": 11780 }, { "epoch": 56.68269230769231, "grad_norm": 2.5491654872894287, "learning_rate": 2.406517094017094e-05, "loss": 0.1144, "step": 11790 }, { "epoch": 56.73076923076923, "grad_norm": 2.6354892253875732, "learning_rate": 2.4038461538461542e-05, "loss": 0.1137, "step": 11800 }, { "epoch": 56.77884615384615, "grad_norm": 2.6601855754852295, "learning_rate": 2.401175213675214e-05, "loss": 0.112, "step": 11810 }, { "epoch": 56.82692307692308, "grad_norm": 3.304023504257202, "learning_rate": 2.3985042735042737e-05, "loss": 0.1021, "step": 11820 }, { "epoch": 56.875, "grad_norm": 3.7852001190185547, "learning_rate": 2.3958333333333334e-05, "loss": 0.1146, "step": 11830 }, { "epoch": 56.92307692307692, "grad_norm": 2.5482029914855957, "learning_rate": 2.3931623931623935e-05, "loss": 0.1169, "step": 11840 }, { "epoch": 56.97115384615385, "grad_norm": 2.2264962196350098, "learning_rate": 2.3904914529914532e-05, "loss": 0.1234, "step": 11850 }, { "epoch": 57.0, "eval_accuracy": 0.9146036268432604, "eval_loss": 0.46674150228500366, "eval_runtime": 78.607, "eval_samples_per_second": 330.416, "eval_steps_per_second": 5.165, "step": 11856 }, { "epoch": 57.01923076923077, "grad_norm": 3.011582374572754, "learning_rate": 2.387820512820513e-05, "loss": 0.1129, "step": 11860 }, { "epoch": 57.06730769230769, "grad_norm": 2.318850517272949, "learning_rate": 2.385149572649573e-05, "loss": 0.1036, "step": 11870 }, { "epoch": 57.11538461538461, "grad_norm": 2.0134687423706055, "learning_rate": 2.3824786324786324e-05, "loss": 0.1156, "step": 11880 }, { "epoch": 57.16346153846154, "grad_norm": 4.2365617752075195, "learning_rate": 2.3798076923076922e-05, "loss": 0.1112, "step": 11890 }, { "epoch": 57.21153846153846, "grad_norm": 3.604442834854126, "learning_rate": 2.3771367521367523e-05, "loss": 0.1127, "step": 11900 }, { "epoch": 57.25961538461539, "grad_norm": 1.9331001043319702, "learning_rate": 2.374465811965812e-05, "loss": 0.1221, "step": 11910 }, { "epoch": 57.30769230769231, "grad_norm": 3.9943687915802, "learning_rate": 2.3717948717948718e-05, "loss": 0.1158, "step": 11920 }, { "epoch": 57.35576923076923, "grad_norm": 2.179633617401123, "learning_rate": 2.3691239316239315e-05, "loss": 0.1131, "step": 11930 }, { "epoch": 57.40384615384615, "grad_norm": 4.202084541320801, "learning_rate": 2.3664529914529916e-05, "loss": 0.1106, "step": 11940 }, { "epoch": 57.45192307692308, "grad_norm": 4.669320106506348, "learning_rate": 2.3637820512820513e-05, "loss": 0.1045, "step": 11950 }, { "epoch": 57.5, "grad_norm": 4.976787567138672, "learning_rate": 2.361111111111111e-05, "loss": 0.1262, "step": 11960 }, { "epoch": 57.54807692307692, "grad_norm": 3.087357759475708, "learning_rate": 2.358440170940171e-05, "loss": 0.1241, "step": 11970 }, { "epoch": 57.59615384615385, "grad_norm": 5.800329208374023, "learning_rate": 2.355769230769231e-05, "loss": 0.1137, "step": 11980 }, { "epoch": 57.64423076923077, "grad_norm": 4.631556510925293, "learning_rate": 2.3530982905982906e-05, "loss": 0.1168, "step": 11990 }, { "epoch": 57.69230769230769, "grad_norm": 3.7280545234680176, "learning_rate": 2.3504273504273504e-05, "loss": 0.1196, "step": 12000 }, { "epoch": 57.74038461538461, "grad_norm": 5.011423110961914, "learning_rate": 2.3477564102564104e-05, "loss": 0.1123, "step": 12010 }, { "epoch": 57.78846153846154, "grad_norm": 2.5139973163604736, "learning_rate": 2.3450854700854702e-05, "loss": 0.1122, "step": 12020 }, { "epoch": 57.83653846153846, "grad_norm": 2.114069938659668, "learning_rate": 2.34241452991453e-05, "loss": 0.1077, "step": 12030 }, { "epoch": 57.88461538461539, "grad_norm": 4.743785381317139, "learning_rate": 2.33974358974359e-05, "loss": 0.1176, "step": 12040 }, { "epoch": 57.93269230769231, "grad_norm": 2.133976459503174, "learning_rate": 2.3370726495726497e-05, "loss": 0.1116, "step": 12050 }, { "epoch": 57.98076923076923, "grad_norm": 2.3025712966918945, "learning_rate": 2.3344017094017095e-05, "loss": 0.1103, "step": 12060 }, { "epoch": 58.0, "eval_accuracy": 0.9119085203865552, "eval_loss": 0.47413960099220276, "eval_runtime": 77.8864, "eval_samples_per_second": 333.473, "eval_steps_per_second": 5.213, "step": 12064 }, { "epoch": 58.02884615384615, "grad_norm": 2.7315313816070557, "learning_rate": 2.3317307692307692e-05, "loss": 0.1341, "step": 12070 }, { "epoch": 58.07692307692308, "grad_norm": 1.830894112586975, "learning_rate": 2.3290598290598293e-05, "loss": 0.1098, "step": 12080 }, { "epoch": 58.125, "grad_norm": 2.0341928005218506, "learning_rate": 2.326388888888889e-05, "loss": 0.1015, "step": 12090 }, { "epoch": 58.17307692307692, "grad_norm": 2.184168577194214, "learning_rate": 2.3237179487179488e-05, "loss": 0.1011, "step": 12100 }, { "epoch": 58.22115384615385, "grad_norm": 3.4694275856018066, "learning_rate": 2.321047008547009e-05, "loss": 0.1031, "step": 12110 }, { "epoch": 58.26923076923077, "grad_norm": 1.9668906927108765, "learning_rate": 2.3183760683760686e-05, "loss": 0.1145, "step": 12120 }, { "epoch": 58.31730769230769, "grad_norm": 2.402838945388794, "learning_rate": 2.3157051282051283e-05, "loss": 0.1177, "step": 12130 }, { "epoch": 58.36538461538461, "grad_norm": 1.9061172008514404, "learning_rate": 2.313034188034188e-05, "loss": 0.1141, "step": 12140 }, { "epoch": 58.41346153846154, "grad_norm": 4.977060794830322, "learning_rate": 2.310363247863248e-05, "loss": 0.1132, "step": 12150 }, { "epoch": 58.46153846153846, "grad_norm": 2.9488227367401123, "learning_rate": 2.307692307692308e-05, "loss": 0.1366, "step": 12160 }, { "epoch": 58.50961538461539, "grad_norm": 2.8471012115478516, "learning_rate": 2.3050213675213676e-05, "loss": 0.1054, "step": 12170 }, { "epoch": 58.55769230769231, "grad_norm": 3.1144158840179443, "learning_rate": 2.3023504273504274e-05, "loss": 0.1051, "step": 12180 }, { "epoch": 58.60576923076923, "grad_norm": 2.76145601272583, "learning_rate": 2.299679487179487e-05, "loss": 0.1272, "step": 12190 }, { "epoch": 58.65384615384615, "grad_norm": 3.100525140762329, "learning_rate": 2.297008547008547e-05, "loss": 0.1042, "step": 12200 }, { "epoch": 58.70192307692308, "grad_norm": 6.584566116333008, "learning_rate": 2.294337606837607e-05, "loss": 0.1255, "step": 12210 }, { "epoch": 58.75, "grad_norm": 4.911331653594971, "learning_rate": 2.2916666666666667e-05, "loss": 0.1146, "step": 12220 }, { "epoch": 58.79807692307692, "grad_norm": 3.9977779388427734, "learning_rate": 2.2889957264957264e-05, "loss": 0.1107, "step": 12230 }, { "epoch": 58.84615384615385, "grad_norm": 5.3445611000061035, "learning_rate": 2.2863247863247865e-05, "loss": 0.1183, "step": 12240 }, { "epoch": 58.89423076923077, "grad_norm": 6.93602991104126, "learning_rate": 2.2836538461538463e-05, "loss": 0.1122, "step": 12250 }, { "epoch": 58.94230769230769, "grad_norm": 2.2795255184173584, "learning_rate": 2.280982905982906e-05, "loss": 0.1188, "step": 12260 }, { "epoch": 58.99038461538461, "grad_norm": 2.4674072265625, "learning_rate": 2.2783119658119657e-05, "loss": 0.1103, "step": 12270 }, { "epoch": 59.0, "eval_accuracy": 0.9129095599276171, "eval_loss": 0.4863601326942444, "eval_runtime": 78.1345, "eval_samples_per_second": 332.414, "eval_steps_per_second": 5.196, "step": 12272 }, { "epoch": 59.03846153846154, "grad_norm": 2.1130690574645996, "learning_rate": 2.2756410256410258e-05, "loss": 0.1107, "step": 12280 }, { "epoch": 59.08653846153846, "grad_norm": 4.055871963500977, "learning_rate": 2.2729700854700856e-05, "loss": 0.1089, "step": 12290 }, { "epoch": 59.13461538461539, "grad_norm": 3.0454015731811523, "learning_rate": 2.2702991452991453e-05, "loss": 0.1017, "step": 12300 }, { "epoch": 59.18269230769231, "grad_norm": 5.77498197555542, "learning_rate": 2.2676282051282054e-05, "loss": 0.1035, "step": 12310 }, { "epoch": 59.23076923076923, "grad_norm": 1.401763916015625, "learning_rate": 2.264957264957265e-05, "loss": 0.1195, "step": 12320 }, { "epoch": 59.27884615384615, "grad_norm": 4.848992824554443, "learning_rate": 2.262286324786325e-05, "loss": 0.1116, "step": 12330 }, { "epoch": 59.32692307692308, "grad_norm": 5.570082664489746, "learning_rate": 2.2596153846153846e-05, "loss": 0.1175, "step": 12340 }, { "epoch": 59.375, "grad_norm": 4.811284065246582, "learning_rate": 2.2569444444444447e-05, "loss": 0.1121, "step": 12350 }, { "epoch": 59.42307692307692, "grad_norm": 5.271439075469971, "learning_rate": 2.2542735042735044e-05, "loss": 0.1036, "step": 12360 }, { "epoch": 59.47115384615385, "grad_norm": 3.9689602851867676, "learning_rate": 2.251602564102564e-05, "loss": 0.1039, "step": 12370 }, { "epoch": 59.51923076923077, "grad_norm": 2.5298197269439697, "learning_rate": 2.2489316239316242e-05, "loss": 0.1128, "step": 12380 }, { "epoch": 59.56730769230769, "grad_norm": 2.402423143386841, "learning_rate": 2.246260683760684e-05, "loss": 0.1145, "step": 12390 }, { "epoch": 59.61538461538461, "grad_norm": 2.8057563304901123, "learning_rate": 2.2435897435897437e-05, "loss": 0.0996, "step": 12400 }, { "epoch": 59.66346153846154, "grad_norm": 5.833059787750244, "learning_rate": 2.2409188034188035e-05, "loss": 0.1072, "step": 12410 }, { "epoch": 59.71153846153846, "grad_norm": 5.409130573272705, "learning_rate": 2.2382478632478635e-05, "loss": 0.1207, "step": 12420 }, { "epoch": 59.75961538461539, "grad_norm": 2.514173746109009, "learning_rate": 2.2355769230769233e-05, "loss": 0.1231, "step": 12430 }, { "epoch": 59.80769230769231, "grad_norm": 2.9180729389190674, "learning_rate": 2.232905982905983e-05, "loss": 0.1146, "step": 12440 }, { "epoch": 59.85576923076923, "grad_norm": 2.714390516281128, "learning_rate": 2.230235042735043e-05, "loss": 0.1109, "step": 12450 }, { "epoch": 59.90384615384615, "grad_norm": 2.8669919967651367, "learning_rate": 2.227564102564103e-05, "loss": 0.1076, "step": 12460 }, { "epoch": 59.95192307692308, "grad_norm": 3.3324756622314453, "learning_rate": 2.2248931623931622e-05, "loss": 0.1293, "step": 12470 }, { "epoch": 60.0, "grad_norm": 1.857649564743042, "learning_rate": 2.2222222222222223e-05, "loss": 0.1222, "step": 12480 }, { "epoch": 60.0, "eval_accuracy": 0.9142956146767798, "eval_loss": 0.4549580216407776, "eval_runtime": 78.7092, "eval_samples_per_second": 329.987, "eval_steps_per_second": 5.158, "step": 12480 }, { "epoch": 60.04807692307692, "grad_norm": 3.5316238403320312, "learning_rate": 2.219551282051282e-05, "loss": 0.0991, "step": 12490 }, { "epoch": 60.09615384615385, "grad_norm": 2.5067341327667236, "learning_rate": 2.2168803418803418e-05, "loss": 0.1187, "step": 12500 }, { "epoch": 60.14423076923077, "grad_norm": 3.00165057182312, "learning_rate": 2.2142094017094016e-05, "loss": 0.128, "step": 12510 }, { "epoch": 60.19230769230769, "grad_norm": 2.8315227031707764, "learning_rate": 2.2115384615384616e-05, "loss": 0.0969, "step": 12520 }, { "epoch": 60.24038461538461, "grad_norm": 3.2189154624938965, "learning_rate": 2.2088675213675214e-05, "loss": 0.1123, "step": 12530 }, { "epoch": 60.28846153846154, "grad_norm": 2.2785096168518066, "learning_rate": 2.206196581196581e-05, "loss": 0.0998, "step": 12540 }, { "epoch": 60.33653846153846, "grad_norm": 3.7143566608428955, "learning_rate": 2.2035256410256412e-05, "loss": 0.1313, "step": 12550 }, { "epoch": 60.38461538461539, "grad_norm": 2.789030075073242, "learning_rate": 2.200854700854701e-05, "loss": 0.1109, "step": 12560 }, { "epoch": 60.43269230769231, "grad_norm": 3.2706820964813232, "learning_rate": 2.1981837606837607e-05, "loss": 0.1038, "step": 12570 }, { "epoch": 60.48076923076923, "grad_norm": 1.9594868421554565, "learning_rate": 2.1955128205128208e-05, "loss": 0.1141, "step": 12580 }, { "epoch": 60.52884615384615, "grad_norm": 3.641110897064209, "learning_rate": 2.1928418803418805e-05, "loss": 0.1136, "step": 12590 }, { "epoch": 60.57692307692308, "grad_norm": 3.7676408290863037, "learning_rate": 2.1901709401709402e-05, "loss": 0.1083, "step": 12600 }, { "epoch": 60.625, "grad_norm": 2.854519844055176, "learning_rate": 2.1875e-05, "loss": 0.1151, "step": 12610 }, { "epoch": 60.67307692307692, "grad_norm": 4.564648151397705, "learning_rate": 2.18482905982906e-05, "loss": 0.116, "step": 12620 }, { "epoch": 60.72115384615385, "grad_norm": 3.8612818717956543, "learning_rate": 2.1821581196581198e-05, "loss": 0.131, "step": 12630 }, { "epoch": 60.76923076923077, "grad_norm": 3.252014398574829, "learning_rate": 2.1794871794871795e-05, "loss": 0.1122, "step": 12640 }, { "epoch": 60.81730769230769, "grad_norm": 2.703852653503418, "learning_rate": 2.1768162393162396e-05, "loss": 0.1115, "step": 12650 }, { "epoch": 60.86538461538461, "grad_norm": 2.4698398113250732, "learning_rate": 2.1741452991452994e-05, "loss": 0.1111, "step": 12660 }, { "epoch": 60.91346153846154, "grad_norm": 2.6428062915802, "learning_rate": 2.171474358974359e-05, "loss": 0.116, "step": 12670 }, { "epoch": 60.96153846153846, "grad_norm": 3.9071784019470215, "learning_rate": 2.168803418803419e-05, "loss": 0.127, "step": 12680 }, { "epoch": 61.0, "eval_accuracy": 0.9135255842605783, "eval_loss": 0.4919072091579437, "eval_runtime": 78.4751, "eval_samples_per_second": 330.971, "eval_steps_per_second": 5.174, "step": 12688 }, { "epoch": 61.00961538461539, "grad_norm": 2.8796143531799316, "learning_rate": 2.166132478632479e-05, "loss": 0.1308, "step": 12690 }, { "epoch": 61.05769230769231, "grad_norm": 1.3426052331924438, "learning_rate": 2.1634615384615387e-05, "loss": 0.1032, "step": 12700 }, { "epoch": 61.10576923076923, "grad_norm": 3.5874171257019043, "learning_rate": 2.1607905982905984e-05, "loss": 0.1163, "step": 12710 }, { "epoch": 61.15384615384615, "grad_norm": 4.612774848937988, "learning_rate": 2.1581196581196585e-05, "loss": 0.114, "step": 12720 }, { "epoch": 61.20192307692308, "grad_norm": 6.494158744812012, "learning_rate": 2.1554487179487182e-05, "loss": 0.1224, "step": 12730 }, { "epoch": 61.25, "grad_norm": 1.6744678020477295, "learning_rate": 2.152777777777778e-05, "loss": 0.1152, "step": 12740 }, { "epoch": 61.29807692307692, "grad_norm": 2.897639751434326, "learning_rate": 2.1501068376068377e-05, "loss": 0.1107, "step": 12750 }, { "epoch": 61.34615384615385, "grad_norm": 2.1457056999206543, "learning_rate": 2.1474358974358974e-05, "loss": 0.1139, "step": 12760 }, { "epoch": 61.39423076923077, "grad_norm": 3.32236647605896, "learning_rate": 2.1447649572649572e-05, "loss": 0.1028, "step": 12770 }, { "epoch": 61.44230769230769, "grad_norm": 2.3627710342407227, "learning_rate": 2.142094017094017e-05, "loss": 0.1033, "step": 12780 }, { "epoch": 61.49038461538461, "grad_norm": 2.2550461292266846, "learning_rate": 2.139423076923077e-05, "loss": 0.1049, "step": 12790 }, { "epoch": 61.53846153846154, "grad_norm": 5.034332275390625, "learning_rate": 2.1367521367521368e-05, "loss": 0.1157, "step": 12800 }, { "epoch": 61.58653846153846, "grad_norm": 4.20222806930542, "learning_rate": 2.1340811965811965e-05, "loss": 0.1133, "step": 12810 }, { "epoch": 61.63461538461539, "grad_norm": 4.950078010559082, "learning_rate": 2.1314102564102566e-05, "loss": 0.1157, "step": 12820 }, { "epoch": 61.68269230769231, "grad_norm": 3.283453941345215, "learning_rate": 2.1287393162393163e-05, "loss": 0.1145, "step": 12830 }, { "epoch": 61.73076923076923, "grad_norm": 2.7735848426818848, "learning_rate": 2.126068376068376e-05, "loss": 0.121, "step": 12840 }, { "epoch": 61.77884615384615, "grad_norm": 3.2145426273345947, "learning_rate": 2.1233974358974358e-05, "loss": 0.124, "step": 12850 }, { "epoch": 61.82692307692308, "grad_norm": 2.8548941612243652, "learning_rate": 2.120726495726496e-05, "loss": 0.1041, "step": 12860 }, { "epoch": 61.875, "grad_norm": 2.286043405532837, "learning_rate": 2.1180555555555556e-05, "loss": 0.1083, "step": 12870 }, { "epoch": 61.92307692307692, "grad_norm": 4.1478753089904785, "learning_rate": 2.1153846153846154e-05, "loss": 0.1121, "step": 12880 }, { "epoch": 61.97115384615385, "grad_norm": 5.213771820068359, "learning_rate": 2.1127136752136754e-05, "loss": 0.1117, "step": 12890 }, { "epoch": 62.0, "eval_accuracy": 0.913910599468679, "eval_loss": 0.494579941034317, "eval_runtime": 78.2282, "eval_samples_per_second": 332.016, "eval_steps_per_second": 5.19, "step": 12896 }, { "epoch": 62.01923076923077, "grad_norm": 5.287962913513184, "learning_rate": 2.1100427350427352e-05, "loss": 0.1034, "step": 12900 }, { "epoch": 62.06730769230769, "grad_norm": 4.007287979125977, "learning_rate": 2.107371794871795e-05, "loss": 0.1201, "step": 12910 }, { "epoch": 62.11538461538461, "grad_norm": 1.7900118827819824, "learning_rate": 2.1047008547008547e-05, "loss": 0.1155, "step": 12920 }, { "epoch": 62.16346153846154, "grad_norm": 2.857836961746216, "learning_rate": 2.1020299145299147e-05, "loss": 0.0968, "step": 12930 }, { "epoch": 62.21153846153846, "grad_norm": 3.517437219619751, "learning_rate": 2.0993589743589745e-05, "loss": 0.1169, "step": 12940 }, { "epoch": 62.25961538461539, "grad_norm": 3.222231388092041, "learning_rate": 2.0966880341880342e-05, "loss": 0.1101, "step": 12950 }, { "epoch": 62.30769230769231, "grad_norm": 2.788534641265869, "learning_rate": 2.0940170940170943e-05, "loss": 0.1108, "step": 12960 }, { "epoch": 62.35576923076923, "grad_norm": 2.305405855178833, "learning_rate": 2.091346153846154e-05, "loss": 0.1219, "step": 12970 }, { "epoch": 62.40384615384615, "grad_norm": 4.3148512840271, "learning_rate": 2.0886752136752138e-05, "loss": 0.1091, "step": 12980 }, { "epoch": 62.45192307692308, "grad_norm": 1.8329429626464844, "learning_rate": 2.0860042735042735e-05, "loss": 0.1245, "step": 12990 }, { "epoch": 62.5, "grad_norm": 2.836042642593384, "learning_rate": 2.0833333333333336e-05, "loss": 0.1128, "step": 13000 }, { "epoch": 62.54807692307692, "grad_norm": 4.68430757522583, "learning_rate": 2.0806623931623933e-05, "loss": 0.1068, "step": 13010 }, { "epoch": 62.59615384615385, "grad_norm": 3.8583757877349854, "learning_rate": 2.077991452991453e-05, "loss": 0.1085, "step": 13020 }, { "epoch": 62.64423076923077, "grad_norm": 2.2620856761932373, "learning_rate": 2.075320512820513e-05, "loss": 0.104, "step": 13030 }, { "epoch": 62.69230769230769, "grad_norm": 4.502205848693848, "learning_rate": 2.072649572649573e-05, "loss": 0.1274, "step": 13040 }, { "epoch": 62.74038461538461, "grad_norm": 2.8937313556671143, "learning_rate": 2.0699786324786323e-05, "loss": 0.1083, "step": 13050 }, { "epoch": 62.78846153846154, "grad_norm": 3.295976400375366, "learning_rate": 2.0673076923076924e-05, "loss": 0.1002, "step": 13060 }, { "epoch": 62.83653846153846, "grad_norm": 1.3650420904159546, "learning_rate": 2.064636752136752e-05, "loss": 0.1035, "step": 13070 }, { "epoch": 62.88461538461539, "grad_norm": 2.415153980255127, "learning_rate": 2.061965811965812e-05, "loss": 0.1124, "step": 13080 }, { "epoch": 62.93269230769231, "grad_norm": 3.3649742603302, "learning_rate": 2.059294871794872e-05, "loss": 0.1098, "step": 13090 }, { "epoch": 62.98076923076923, "grad_norm": 3.4000754356384277, "learning_rate": 2.0566239316239317e-05, "loss": 0.1078, "step": 13100 }, { "epoch": 63.0, "eval_accuracy": 0.9132560736149078, "eval_loss": 0.503955602645874, "eval_runtime": 78.3762, "eval_samples_per_second": 331.389, "eval_steps_per_second": 5.18, "step": 13104 }, { "epoch": 63.02884615384615, "grad_norm": 3.5699262619018555, "learning_rate": 2.0539529914529914e-05, "loss": 0.1207, "step": 13110 }, { "epoch": 63.07692307692308, "grad_norm": 6.163900852203369, "learning_rate": 2.0512820512820512e-05, "loss": 0.1028, "step": 13120 }, { "epoch": 63.125, "grad_norm": 1.8877509832382202, "learning_rate": 2.0486111111111113e-05, "loss": 0.1084, "step": 13130 }, { "epoch": 63.17307692307692, "grad_norm": 2.632613182067871, "learning_rate": 2.045940170940171e-05, "loss": 0.1183, "step": 13140 }, { "epoch": 63.22115384615385, "grad_norm": 3.9920690059661865, "learning_rate": 2.0432692307692307e-05, "loss": 0.1148, "step": 13150 }, { "epoch": 63.26923076923077, "grad_norm": 1.3978849649429321, "learning_rate": 2.0405982905982908e-05, "loss": 0.1023, "step": 13160 }, { "epoch": 63.31730769230769, "grad_norm": 2.530336380004883, "learning_rate": 2.0379273504273506e-05, "loss": 0.1057, "step": 13170 }, { "epoch": 63.36538461538461, "grad_norm": 3.645420789718628, "learning_rate": 2.0352564102564103e-05, "loss": 0.1021, "step": 13180 }, { "epoch": 63.41346153846154, "grad_norm": 2.8399977684020996, "learning_rate": 2.03258547008547e-05, "loss": 0.1056, "step": 13190 }, { "epoch": 63.46153846153846, "grad_norm": 3.5413079261779785, "learning_rate": 2.02991452991453e-05, "loss": 0.1241, "step": 13200 }, { "epoch": 63.50961538461539, "grad_norm": 2.9555580615997314, "learning_rate": 2.02724358974359e-05, "loss": 0.1083, "step": 13210 }, { "epoch": 63.55769230769231, "grad_norm": 2.396477222442627, "learning_rate": 2.0245726495726496e-05, "loss": 0.1186, "step": 13220 }, { "epoch": 63.60576923076923, "grad_norm": 3.9445061683654785, "learning_rate": 2.0219017094017097e-05, "loss": 0.1083, "step": 13230 }, { "epoch": 63.65384615384615, "grad_norm": 2.6939499378204346, "learning_rate": 2.0192307692307694e-05, "loss": 0.112, "step": 13240 }, { "epoch": 63.70192307692308, "grad_norm": 3.1032865047454834, "learning_rate": 2.016559829059829e-05, "loss": 0.1143, "step": 13250 }, { "epoch": 63.75, "grad_norm": 5.213830947875977, "learning_rate": 2.013888888888889e-05, "loss": 0.1175, "step": 13260 }, { "epoch": 63.79807692307692, "grad_norm": 2.574918508529663, "learning_rate": 2.011217948717949e-05, "loss": 0.1192, "step": 13270 }, { "epoch": 63.84615384615385, "grad_norm": 2.9340407848358154, "learning_rate": 2.0085470085470087e-05, "loss": 0.1104, "step": 13280 }, { "epoch": 63.89423076923077, "grad_norm": 4.804204940795898, "learning_rate": 2.0058760683760685e-05, "loss": 0.1116, "step": 13290 }, { "epoch": 63.94230769230769, "grad_norm": 2.519008159637451, "learning_rate": 2.0032051282051285e-05, "loss": 0.1112, "step": 13300 }, { "epoch": 63.99038461538461, "grad_norm": 2.8682045936584473, "learning_rate": 2.0005341880341883e-05, "loss": 0.1127, "step": 13310 }, { "epoch": 64.0, "eval_accuracy": 0.9126400492819466, "eval_loss": 0.48044249415397644, "eval_runtime": 78.1828, "eval_samples_per_second": 332.209, "eval_steps_per_second": 5.193, "step": 13312 }, { "epoch": 64.03846153846153, "grad_norm": 2.5934066772460938, "learning_rate": 1.997863247863248e-05, "loss": 0.1057, "step": 13320 }, { "epoch": 64.08653846153847, "grad_norm": 2.1237521171569824, "learning_rate": 1.9951923076923078e-05, "loss": 0.1095, "step": 13330 }, { "epoch": 64.13461538461539, "grad_norm": 3.5337085723876953, "learning_rate": 1.992521367521368e-05, "loss": 0.1184, "step": 13340 }, { "epoch": 64.1826923076923, "grad_norm": 2.4335849285125732, "learning_rate": 1.9898504273504272e-05, "loss": 0.1246, "step": 13350 }, { "epoch": 64.23076923076923, "grad_norm": 1.799056887626648, "learning_rate": 1.987179487179487e-05, "loss": 0.1145, "step": 13360 }, { "epoch": 64.27884615384616, "grad_norm": 2.960726261138916, "learning_rate": 1.984508547008547e-05, "loss": 0.1081, "step": 13370 }, { "epoch": 64.32692307692308, "grad_norm": 3.348620653152466, "learning_rate": 1.9818376068376068e-05, "loss": 0.1288, "step": 13380 }, { "epoch": 64.375, "grad_norm": 3.2422678470611572, "learning_rate": 1.9791666666666665e-05, "loss": 0.1052, "step": 13390 }, { "epoch": 64.42307692307692, "grad_norm": 2.8324074745178223, "learning_rate": 1.9764957264957266e-05, "loss": 0.1106, "step": 13400 }, { "epoch": 64.47115384615384, "grad_norm": 2.215822219848633, "learning_rate": 1.9738247863247864e-05, "loss": 0.1234, "step": 13410 }, { "epoch": 64.51923076923077, "grad_norm": 6.540410995483398, "learning_rate": 1.971153846153846e-05, "loss": 0.1095, "step": 13420 }, { "epoch": 64.5673076923077, "grad_norm": 4.109079360961914, "learning_rate": 1.9684829059829062e-05, "loss": 0.1095, "step": 13430 }, { "epoch": 64.61538461538461, "grad_norm": 2.471306085586548, "learning_rate": 1.965811965811966e-05, "loss": 0.1172, "step": 13440 }, { "epoch": 64.66346153846153, "grad_norm": 2.965770721435547, "learning_rate": 1.9631410256410257e-05, "loss": 0.1085, "step": 13450 }, { "epoch": 64.71153846153847, "grad_norm": 3.5223493576049805, "learning_rate": 1.9604700854700854e-05, "loss": 0.1186, "step": 13460 }, { "epoch": 64.75961538461539, "grad_norm": 2.785916328430176, "learning_rate": 1.9577991452991455e-05, "loss": 0.123, "step": 13470 }, { "epoch": 64.8076923076923, "grad_norm": 3.197739362716675, "learning_rate": 1.9551282051282052e-05, "loss": 0.1118, "step": 13480 }, { "epoch": 64.85576923076923, "grad_norm": 1.7819161415100098, "learning_rate": 1.952457264957265e-05, "loss": 0.1176, "step": 13490 }, { "epoch": 64.90384615384616, "grad_norm": 3.2425882816314697, "learning_rate": 1.949786324786325e-05, "loss": 0.127, "step": 13500 }, { "epoch": 64.95192307692308, "grad_norm": 4.678527355194092, "learning_rate": 1.9471153846153848e-05, "loss": 0.0922, "step": 13510 }, { "epoch": 65.0, "grad_norm": 2.9307703971862793, "learning_rate": 1.9444444444444445e-05, "loss": 0.1122, "step": 13520 }, { "epoch": 65.0, "eval_accuracy": 0.9135640857813884, "eval_loss": 0.49971988797187805, "eval_runtime": 77.8406, "eval_samples_per_second": 333.669, "eval_steps_per_second": 5.216, "step": 13520 }, { "epoch": 65.04807692307692, "grad_norm": 4.088137149810791, "learning_rate": 1.9417735042735043e-05, "loss": 0.0936, "step": 13530 }, { "epoch": 65.09615384615384, "grad_norm": 5.379458427429199, "learning_rate": 1.9391025641025644e-05, "loss": 0.1157, "step": 13540 }, { "epoch": 65.14423076923077, "grad_norm": 4.591498851776123, "learning_rate": 1.936431623931624e-05, "loss": 0.0978, "step": 13550 }, { "epoch": 65.1923076923077, "grad_norm": 3.62667179107666, "learning_rate": 1.933760683760684e-05, "loss": 0.0956, "step": 13560 }, { "epoch": 65.24038461538461, "grad_norm": 2.040648937225342, "learning_rate": 1.931089743589744e-05, "loss": 0.1106, "step": 13570 }, { "epoch": 65.28846153846153, "grad_norm": 5.202915668487549, "learning_rate": 1.9284188034188037e-05, "loss": 0.1009, "step": 13580 }, { "epoch": 65.33653846153847, "grad_norm": 4.7136006355285645, "learning_rate": 1.9257478632478634e-05, "loss": 0.1211, "step": 13590 }, { "epoch": 65.38461538461539, "grad_norm": 3.6419131755828857, "learning_rate": 1.923076923076923e-05, "loss": 0.1213, "step": 13600 }, { "epoch": 65.4326923076923, "grad_norm": 5.758768081665039, "learning_rate": 1.9204059829059832e-05, "loss": 0.1216, "step": 13610 }, { "epoch": 65.48076923076923, "grad_norm": 3.7634239196777344, "learning_rate": 1.917735042735043e-05, "loss": 0.1173, "step": 13620 }, { "epoch": 65.52884615384616, "grad_norm": 3.9899024963378906, "learning_rate": 1.9150641025641027e-05, "loss": 0.1113, "step": 13630 }, { "epoch": 65.57692307692308, "grad_norm": 3.3848178386688232, "learning_rate": 1.9123931623931624e-05, "loss": 0.1092, "step": 13640 }, { "epoch": 65.625, "grad_norm": 3.981973648071289, "learning_rate": 1.9097222222222222e-05, "loss": 0.1093, "step": 13650 }, { "epoch": 65.67307692307692, "grad_norm": 2.259121894836426, "learning_rate": 1.907051282051282e-05, "loss": 0.109, "step": 13660 }, { "epoch": 65.72115384615384, "grad_norm": 2.834968090057373, "learning_rate": 1.904380341880342e-05, "loss": 0.1062, "step": 13670 }, { "epoch": 65.76923076923077, "grad_norm": 4.149142742156982, "learning_rate": 1.9017094017094017e-05, "loss": 0.1214, "step": 13680 }, { "epoch": 65.8173076923077, "grad_norm": 3.0291099548339844, "learning_rate": 1.8990384615384615e-05, "loss": 0.1074, "step": 13690 }, { "epoch": 65.86538461538461, "grad_norm": 3.4079811573028564, "learning_rate": 1.8963675213675212e-05, "loss": 0.1061, "step": 13700 }, { "epoch": 65.91346153846153, "grad_norm": 3.3245084285736084, "learning_rate": 1.8936965811965813e-05, "loss": 0.11, "step": 13710 }, { "epoch": 65.96153846153847, "grad_norm": 2.563086748123169, "learning_rate": 1.891025641025641e-05, "loss": 0.1089, "step": 13720 }, { "epoch": 66.0, "eval_accuracy": 0.9139491009894891, "eval_loss": 0.513412356376648, "eval_runtime": 78.2865, "eval_samples_per_second": 331.769, "eval_steps_per_second": 5.186, "step": 13728 }, { "epoch": 66.00961538461539, "grad_norm": 3.413740634918213, "learning_rate": 1.8883547008547008e-05, "loss": 0.1149, "step": 13730 }, { "epoch": 66.0576923076923, "grad_norm": 2.286529064178467, "learning_rate": 1.885683760683761e-05, "loss": 0.1071, "step": 13740 }, { "epoch": 66.10576923076923, "grad_norm": 3.566603422164917, "learning_rate": 1.8830128205128206e-05, "loss": 0.1076, "step": 13750 }, { "epoch": 66.15384615384616, "grad_norm": 3.0578103065490723, "learning_rate": 1.8803418803418804e-05, "loss": 0.0908, "step": 13760 }, { "epoch": 66.20192307692308, "grad_norm": 5.4556121826171875, "learning_rate": 1.87767094017094e-05, "loss": 0.1115, "step": 13770 }, { "epoch": 66.25, "grad_norm": 2.449287176132202, "learning_rate": 1.8750000000000002e-05, "loss": 0.1076, "step": 13780 }, { "epoch": 66.29807692307692, "grad_norm": 2.7167327404022217, "learning_rate": 1.87232905982906e-05, "loss": 0.1052, "step": 13790 }, { "epoch": 66.34615384615384, "grad_norm": 2.4165351390838623, "learning_rate": 1.8696581196581197e-05, "loss": 0.1133, "step": 13800 }, { "epoch": 66.39423076923077, "grad_norm": 3.601806879043579, "learning_rate": 1.8669871794871797e-05, "loss": 0.1171, "step": 13810 }, { "epoch": 66.4423076923077, "grad_norm": 1.8974459171295166, "learning_rate": 1.8643162393162395e-05, "loss": 0.113, "step": 13820 }, { "epoch": 66.49038461538461, "grad_norm": 1.367411494255066, "learning_rate": 1.8616452991452992e-05, "loss": 0.1114, "step": 13830 }, { "epoch": 66.53846153846153, "grad_norm": 2.276789665222168, "learning_rate": 1.858974358974359e-05, "loss": 0.0992, "step": 13840 }, { "epoch": 66.58653846153847, "grad_norm": 2.931976795196533, "learning_rate": 1.856303418803419e-05, "loss": 0.1083, "step": 13850 }, { "epoch": 66.63461538461539, "grad_norm": 6.692258358001709, "learning_rate": 1.8536324786324788e-05, "loss": 0.1163, "step": 13860 }, { "epoch": 66.6826923076923, "grad_norm": 2.0232577323913574, "learning_rate": 1.8509615384615385e-05, "loss": 0.1041, "step": 13870 }, { "epoch": 66.73076923076923, "grad_norm": 3.789863109588623, "learning_rate": 1.8482905982905986e-05, "loss": 0.1019, "step": 13880 }, { "epoch": 66.77884615384616, "grad_norm": 3.699845314025879, "learning_rate": 1.8456196581196583e-05, "loss": 0.1157, "step": 13890 }, { "epoch": 66.82692307692308, "grad_norm": 2.7714157104492188, "learning_rate": 1.842948717948718e-05, "loss": 0.1196, "step": 13900 }, { "epoch": 66.875, "grad_norm": 1.6233631372451782, "learning_rate": 1.8402777777777778e-05, "loss": 0.1073, "step": 13910 }, { "epoch": 66.92307692307692, "grad_norm": 4.6705708503723145, "learning_rate": 1.837606837606838e-05, "loss": 0.109, "step": 13920 }, { "epoch": 66.97115384615384, "grad_norm": 3.1575567722320557, "learning_rate": 1.8349358974358973e-05, "loss": 0.1179, "step": 13930 }, { "epoch": 67.0, "eval_accuracy": 0.915489161821892, "eval_loss": 0.5245709419250488, "eval_runtime": 77.8171, "eval_samples_per_second": 333.77, "eval_steps_per_second": 5.217, "step": 13936 }, { "epoch": 67.01923076923077, "grad_norm": 1.7736655473709106, "learning_rate": 1.8322649572649574e-05, "loss": 0.0887, "step": 13940 }, { "epoch": 67.0673076923077, "grad_norm": 2.5629584789276123, "learning_rate": 1.829594017094017e-05, "loss": 0.1245, "step": 13950 }, { "epoch": 67.11538461538461, "grad_norm": 3.231628179550171, "learning_rate": 1.826923076923077e-05, "loss": 0.112, "step": 13960 }, { "epoch": 67.16346153846153, "grad_norm": 4.511459827423096, "learning_rate": 1.8242521367521366e-05, "loss": 0.0899, "step": 13970 }, { "epoch": 67.21153846153847, "grad_norm": 1.8632941246032715, "learning_rate": 1.8215811965811967e-05, "loss": 0.0918, "step": 13980 }, { "epoch": 67.25961538461539, "grad_norm": 4.662261486053467, "learning_rate": 1.8189102564102564e-05, "loss": 0.1069, "step": 13990 }, { "epoch": 67.3076923076923, "grad_norm": 2.418731689453125, "learning_rate": 1.8162393162393162e-05, "loss": 0.1158, "step": 14000 }, { "epoch": 67.35576923076923, "grad_norm": 2.7127861976623535, "learning_rate": 1.8135683760683762e-05, "loss": 0.1053, "step": 14010 }, { "epoch": 67.40384615384616, "grad_norm": 2.3538851737976074, "learning_rate": 1.810897435897436e-05, "loss": 0.1186, "step": 14020 }, { "epoch": 67.45192307692308, "grad_norm": 3.0625810623168945, "learning_rate": 1.8082264957264957e-05, "loss": 0.1007, "step": 14030 }, { "epoch": 67.5, "grad_norm": 5.781796932220459, "learning_rate": 1.8055555555555555e-05, "loss": 0.1082, "step": 14040 }, { "epoch": 67.54807692307692, "grad_norm": 3.5307602882385254, "learning_rate": 1.8028846153846156e-05, "loss": 0.0968, "step": 14050 }, { "epoch": 67.59615384615384, "grad_norm": 3.1812825202941895, "learning_rate": 1.8002136752136753e-05, "loss": 0.1124, "step": 14060 }, { "epoch": 67.64423076923077, "grad_norm": 1.8479608297348022, "learning_rate": 1.797542735042735e-05, "loss": 0.1118, "step": 14070 }, { "epoch": 67.6923076923077, "grad_norm": 1.4379651546478271, "learning_rate": 1.794871794871795e-05, "loss": 0.1003, "step": 14080 }, { "epoch": 67.74038461538461, "grad_norm": 1.2789790630340576, "learning_rate": 1.792200854700855e-05, "loss": 0.1016, "step": 14090 }, { "epoch": 67.78846153846153, "grad_norm": 5.423507213592529, "learning_rate": 1.7895299145299146e-05, "loss": 0.099, "step": 14100 }, { "epoch": 67.83653846153847, "grad_norm": 1.740647554397583, "learning_rate": 1.7868589743589743e-05, "loss": 0.1118, "step": 14110 }, { "epoch": 67.88461538461539, "grad_norm": 1.6359113454818726, "learning_rate": 1.7841880341880344e-05, "loss": 0.113, "step": 14120 }, { "epoch": 67.9326923076923, "grad_norm": 2.601740837097168, "learning_rate": 1.781517094017094e-05, "loss": 0.1171, "step": 14130 }, { "epoch": 67.98076923076923, "grad_norm": 2.506756544113159, "learning_rate": 1.778846153846154e-05, "loss": 0.0934, "step": 14140 }, { "epoch": 68.0, "eval_accuracy": 0.9126015477611366, "eval_loss": 0.515838086605072, "eval_runtime": 78.1431, "eval_samples_per_second": 332.377, "eval_steps_per_second": 5.196, "step": 14144 }, { "epoch": 68.02884615384616, "grad_norm": 2.9861562252044678, "learning_rate": 1.776175213675214e-05, "loss": 0.1187, "step": 14150 }, { "epoch": 68.07692307692308, "grad_norm": 4.135077953338623, "learning_rate": 1.7735042735042737e-05, "loss": 0.0934, "step": 14160 }, { "epoch": 68.125, "grad_norm": 3.672657012939453, "learning_rate": 1.7708333333333335e-05, "loss": 0.102, "step": 14170 }, { "epoch": 68.17307692307692, "grad_norm": 2.94230580329895, "learning_rate": 1.7681623931623932e-05, "loss": 0.1047, "step": 14180 }, { "epoch": 68.22115384615384, "grad_norm": 2.2634410858154297, "learning_rate": 1.7654914529914533e-05, "loss": 0.1132, "step": 14190 }, { "epoch": 68.26923076923077, "grad_norm": 5.441643238067627, "learning_rate": 1.762820512820513e-05, "loss": 0.098, "step": 14200 }, { "epoch": 68.3173076923077, "grad_norm": 3.5039448738098145, "learning_rate": 1.7601495726495728e-05, "loss": 0.1086, "step": 14210 }, { "epoch": 68.36538461538461, "grad_norm": 2.240744113922119, "learning_rate": 1.7574786324786325e-05, "loss": 0.1076, "step": 14220 }, { "epoch": 68.41346153846153, "grad_norm": 2.006989002227783, "learning_rate": 1.7548076923076922e-05, "loss": 0.1058, "step": 14230 }, { "epoch": 68.46153846153847, "grad_norm": 1.8120282888412476, "learning_rate": 1.752136752136752e-05, "loss": 0.1059, "step": 14240 }, { "epoch": 68.50961538461539, "grad_norm": 5.546671390533447, "learning_rate": 1.749465811965812e-05, "loss": 0.1086, "step": 14250 }, { "epoch": 68.5576923076923, "grad_norm": 2.289293050765991, "learning_rate": 1.7467948717948718e-05, "loss": 0.11, "step": 14260 }, { "epoch": 68.60576923076923, "grad_norm": 3.4425573348999023, "learning_rate": 1.7441239316239315e-05, "loss": 0.112, "step": 14270 }, { "epoch": 68.65384615384616, "grad_norm": 2.0034613609313965, "learning_rate": 1.7414529914529913e-05, "loss": 0.0963, "step": 14280 }, { "epoch": 68.70192307692308, "grad_norm": 2.532435655593872, "learning_rate": 1.7387820512820514e-05, "loss": 0.1046, "step": 14290 }, { "epoch": 68.75, "grad_norm": 5.371732711791992, "learning_rate": 1.736111111111111e-05, "loss": 0.1298, "step": 14300 }, { "epoch": 68.79807692307692, "grad_norm": 1.5286190509796143, "learning_rate": 1.733440170940171e-05, "loss": 0.1053, "step": 14310 }, { "epoch": 68.84615384615384, "grad_norm": 1.9245264530181885, "learning_rate": 1.730769230769231e-05, "loss": 0.1142, "step": 14320 }, { "epoch": 68.89423076923077, "grad_norm": 2.233588933944702, "learning_rate": 1.7280982905982907e-05, "loss": 0.1166, "step": 14330 }, { "epoch": 68.9423076923077, "grad_norm": 3.3462846279144287, "learning_rate": 1.7254273504273504e-05, "loss": 0.0905, "step": 14340 }, { "epoch": 68.99038461538461, "grad_norm": 5.635544300079346, "learning_rate": 1.7227564102564105e-05, "loss": 0.1011, "step": 14350 }, { "epoch": 69.0, "eval_accuracy": 0.9139876025102992, "eval_loss": 0.5361305475234985, "eval_runtime": 77.8948, "eval_samples_per_second": 333.437, "eval_steps_per_second": 5.212, "step": 14352 }, { "epoch": 69.03846153846153, "grad_norm": 5.274500370025635, "learning_rate": 1.7200854700854702e-05, "loss": 0.1179, "step": 14360 }, { "epoch": 69.08653846153847, "grad_norm": 5.332326889038086, "learning_rate": 1.71741452991453e-05, "loss": 0.1057, "step": 14370 }, { "epoch": 69.13461538461539, "grad_norm": 2.1405301094055176, "learning_rate": 1.7147435897435897e-05, "loss": 0.1019, "step": 14380 }, { "epoch": 69.1826923076923, "grad_norm": 4.011329174041748, "learning_rate": 1.7120726495726498e-05, "loss": 0.0988, "step": 14390 }, { "epoch": 69.23076923076923, "grad_norm": 3.6533944606781006, "learning_rate": 1.7094017094017095e-05, "loss": 0.1055, "step": 14400 }, { "epoch": 69.27884615384616, "grad_norm": 1.3695071935653687, "learning_rate": 1.7067307692307693e-05, "loss": 0.1118, "step": 14410 }, { "epoch": 69.32692307692308, "grad_norm": 3.0202457904815674, "learning_rate": 1.7040598290598294e-05, "loss": 0.1205, "step": 14420 }, { "epoch": 69.375, "grad_norm": 1.414981722831726, "learning_rate": 1.701388888888889e-05, "loss": 0.0993, "step": 14430 }, { "epoch": 69.42307692307692, "grad_norm": 4.544799327850342, "learning_rate": 1.698717948717949e-05, "loss": 0.1069, "step": 14440 }, { "epoch": 69.47115384615384, "grad_norm": 2.550673007965088, "learning_rate": 1.6960470085470086e-05, "loss": 0.101, "step": 14450 }, { "epoch": 69.51923076923077, "grad_norm": 3.903299570083618, "learning_rate": 1.6933760683760687e-05, "loss": 0.1093, "step": 14460 }, { "epoch": 69.5673076923077, "grad_norm": 2.8616390228271484, "learning_rate": 1.6907051282051284e-05, "loss": 0.1152, "step": 14470 }, { "epoch": 69.61538461538461, "grad_norm": 4.211837291717529, "learning_rate": 1.688034188034188e-05, "loss": 0.1108, "step": 14480 }, { "epoch": 69.66346153846153, "grad_norm": 1.5082464218139648, "learning_rate": 1.6853632478632482e-05, "loss": 0.1046, "step": 14490 }, { "epoch": 69.71153846153847, "grad_norm": 2.0481510162353516, "learning_rate": 1.682692307692308e-05, "loss": 0.1094, "step": 14500 }, { "epoch": 69.75961538461539, "grad_norm": 5.138906955718994, "learning_rate": 1.6800213675213677e-05, "loss": 0.1041, "step": 14510 }, { "epoch": 69.8076923076923, "grad_norm": 3.433218240737915, "learning_rate": 1.6773504273504274e-05, "loss": 0.0962, "step": 14520 }, { "epoch": 69.85576923076923, "grad_norm": 2.955880880355835, "learning_rate": 1.6746794871794872e-05, "loss": 0.1092, "step": 14530 }, { "epoch": 69.90384615384616, "grad_norm": 2.595198154449463, "learning_rate": 1.672008547008547e-05, "loss": 0.1191, "step": 14540 }, { "epoch": 69.95192307692308, "grad_norm": 7.040813446044922, "learning_rate": 1.6693376068376067e-05, "loss": 0.1196, "step": 14550 }, { "epoch": 70.0, "grad_norm": 3.943873882293701, "learning_rate": 1.6666666666666667e-05, "loss": 0.1063, "step": 14560 }, { "epoch": 70.0, "eval_accuracy": 0.9134870827397682, "eval_loss": 0.5325689315795898, "eval_runtime": 78.5608, "eval_samples_per_second": 330.61, "eval_steps_per_second": 5.168, "step": 14560 }, { "epoch": 70.04807692307692, "grad_norm": 2.833099603652954, "learning_rate": 1.6639957264957265e-05, "loss": 0.1057, "step": 14570 }, { "epoch": 70.09615384615384, "grad_norm": 1.8072550296783447, "learning_rate": 1.6613247863247862e-05, "loss": 0.095, "step": 14580 }, { "epoch": 70.14423076923077, "grad_norm": 1.8567041158676147, "learning_rate": 1.6586538461538463e-05, "loss": 0.0905, "step": 14590 }, { "epoch": 70.1923076923077, "grad_norm": 2.701629400253296, "learning_rate": 1.655982905982906e-05, "loss": 0.1085, "step": 14600 }, { "epoch": 70.24038461538461, "grad_norm": 4.328616619110107, "learning_rate": 1.6533119658119658e-05, "loss": 0.109, "step": 14610 }, { "epoch": 70.28846153846153, "grad_norm": 2.331547498703003, "learning_rate": 1.6506410256410255e-05, "loss": 0.0981, "step": 14620 }, { "epoch": 70.33653846153847, "grad_norm": 5.52113676071167, "learning_rate": 1.6479700854700856e-05, "loss": 0.1074, "step": 14630 }, { "epoch": 70.38461538461539, "grad_norm": 2.3755226135253906, "learning_rate": 1.6452991452991454e-05, "loss": 0.1073, "step": 14640 }, { "epoch": 70.4326923076923, "grad_norm": 1.862847924232483, "learning_rate": 1.642628205128205e-05, "loss": 0.1142, "step": 14650 }, { "epoch": 70.48076923076923, "grad_norm": 3.9783945083618164, "learning_rate": 1.6399572649572652e-05, "loss": 0.1198, "step": 14660 }, { "epoch": 70.52884615384616, "grad_norm": 2.81416916847229, "learning_rate": 1.637286324786325e-05, "loss": 0.1052, "step": 14670 }, { "epoch": 70.57692307692308, "grad_norm": 2.609402656555176, "learning_rate": 1.6346153846153847e-05, "loss": 0.1029, "step": 14680 }, { "epoch": 70.625, "grad_norm": 0.9833598136901855, "learning_rate": 1.6319444444444444e-05, "loss": 0.0973, "step": 14690 }, { "epoch": 70.67307692307692, "grad_norm": 2.913058042526245, "learning_rate": 1.6292735042735045e-05, "loss": 0.107, "step": 14700 }, { "epoch": 70.72115384615384, "grad_norm": 2.6925768852233887, "learning_rate": 1.6266025641025642e-05, "loss": 0.1084, "step": 14710 }, { "epoch": 70.76923076923077, "grad_norm": 2.5607311725616455, "learning_rate": 1.623931623931624e-05, "loss": 0.1132, "step": 14720 }, { "epoch": 70.8173076923077, "grad_norm": 2.265489339828491, "learning_rate": 1.621260683760684e-05, "loss": 0.0942, "step": 14730 }, { "epoch": 70.86538461538461, "grad_norm": 2.9140942096710205, "learning_rate": 1.6185897435897438e-05, "loss": 0.1106, "step": 14740 }, { "epoch": 70.91346153846153, "grad_norm": 2.9793787002563477, "learning_rate": 1.6159188034188035e-05, "loss": 0.0939, "step": 14750 }, { "epoch": 70.96153846153847, "grad_norm": 2.7988831996917725, "learning_rate": 1.6132478632478633e-05, "loss": 0.1021, "step": 14760 }, { "epoch": 71.0, "eval_accuracy": 0.9142956146767798, "eval_loss": 0.5150610208511353, "eval_runtime": 78.2448, "eval_samples_per_second": 331.945, "eval_steps_per_second": 5.189, "step": 14768 }, { "epoch": 71.00961538461539, "grad_norm": 1.671943187713623, "learning_rate": 1.6105769230769233e-05, "loss": 0.1089, "step": 14770 }, { "epoch": 71.0576923076923, "grad_norm": 4.578638076782227, "learning_rate": 1.607905982905983e-05, "loss": 0.116, "step": 14780 }, { "epoch": 71.10576923076923, "grad_norm": 2.376232862472534, "learning_rate": 1.6052350427350428e-05, "loss": 0.087, "step": 14790 }, { "epoch": 71.15384615384616, "grad_norm": 3.426814556121826, "learning_rate": 1.602564102564103e-05, "loss": 0.0913, "step": 14800 }, { "epoch": 71.20192307692308, "grad_norm": 3.4905331134796143, "learning_rate": 1.5998931623931623e-05, "loss": 0.1123, "step": 14810 }, { "epoch": 71.25, "grad_norm": 2.537357807159424, "learning_rate": 1.597222222222222e-05, "loss": 0.0977, "step": 14820 }, { "epoch": 71.29807692307692, "grad_norm": 2.570263385772705, "learning_rate": 1.594551282051282e-05, "loss": 0.1031, "step": 14830 }, { "epoch": 71.34615384615384, "grad_norm": 4.026021957397461, "learning_rate": 1.591880341880342e-05, "loss": 0.1121, "step": 14840 }, { "epoch": 71.39423076923077, "grad_norm": 3.7087254524230957, "learning_rate": 1.5892094017094016e-05, "loss": 0.1018, "step": 14850 }, { "epoch": 71.4423076923077, "grad_norm": 3.3756182193756104, "learning_rate": 1.5865384615384617e-05, "loss": 0.105, "step": 14860 }, { "epoch": 71.49038461538461, "grad_norm": 4.8673529624938965, "learning_rate": 1.5838675213675214e-05, "loss": 0.0916, "step": 14870 }, { "epoch": 71.53846153846153, "grad_norm": 4.355525493621826, "learning_rate": 1.581196581196581e-05, "loss": 0.0988, "step": 14880 }, { "epoch": 71.58653846153847, "grad_norm": 2.1988093852996826, "learning_rate": 1.578525641025641e-05, "loss": 0.1091, "step": 14890 }, { "epoch": 71.63461538461539, "grad_norm": 6.29646110534668, "learning_rate": 1.575854700854701e-05, "loss": 0.1219, "step": 14900 }, { "epoch": 71.6826923076923, "grad_norm": 4.134830474853516, "learning_rate": 1.5731837606837607e-05, "loss": 0.1, "step": 14910 }, { "epoch": 71.73076923076923, "grad_norm": 2.4253101348876953, "learning_rate": 1.5705128205128205e-05, "loss": 0.1119, "step": 14920 }, { "epoch": 71.77884615384616, "grad_norm": 4.388179779052734, "learning_rate": 1.5678418803418806e-05, "loss": 0.0966, "step": 14930 }, { "epoch": 71.82692307692308, "grad_norm": 5.484368801116943, "learning_rate": 1.5651709401709403e-05, "loss": 0.1054, "step": 14940 }, { "epoch": 71.875, "grad_norm": 2.2221689224243164, "learning_rate": 1.5625e-05, "loss": 0.1009, "step": 14950 }, { "epoch": 71.92307692307692, "grad_norm": 2.478428363800049, "learning_rate": 1.5598290598290598e-05, "loss": 0.0972, "step": 14960 }, { "epoch": 71.97115384615384, "grad_norm": 4.488917350769043, "learning_rate": 1.55715811965812e-05, "loss": 0.1007, "step": 14970 }, { "epoch": 72.0, "eval_accuracy": 0.9143341161975898, "eval_loss": 0.5390403270721436, "eval_runtime": 77.9802, "eval_samples_per_second": 333.072, "eval_steps_per_second": 5.206, "step": 14976 }, { "epoch": 72.01923076923077, "grad_norm": 2.235558271408081, "learning_rate": 1.5544871794871796e-05, "loss": 0.1004, "step": 14980 }, { "epoch": 72.0673076923077, "grad_norm": 2.4506728649139404, "learning_rate": 1.5518162393162393e-05, "loss": 0.1029, "step": 14990 }, { "epoch": 72.11538461538461, "grad_norm": 4.077917098999023, "learning_rate": 1.5491452991452994e-05, "loss": 0.1031, "step": 15000 }, { "epoch": 72.16346153846153, "grad_norm": 2.384612560272217, "learning_rate": 1.546474358974359e-05, "loss": 0.1, "step": 15010 }, { "epoch": 72.21153846153847, "grad_norm": 3.019204616546631, "learning_rate": 1.543803418803419e-05, "loss": 0.0897, "step": 15020 }, { "epoch": 72.25961538461539, "grad_norm": 2.0474941730499268, "learning_rate": 1.5411324786324786e-05, "loss": 0.1068, "step": 15030 }, { "epoch": 72.3076923076923, "grad_norm": 3.6571717262268066, "learning_rate": 1.5384615384615387e-05, "loss": 0.1043, "step": 15040 }, { "epoch": 72.35576923076923, "grad_norm": 3.4864730834960938, "learning_rate": 1.5357905982905985e-05, "loss": 0.1077, "step": 15050 }, { "epoch": 72.40384615384616, "grad_norm": 3.221208333969116, "learning_rate": 1.5331196581196582e-05, "loss": 0.1265, "step": 15060 }, { "epoch": 72.45192307692308, "grad_norm": 2.776719331741333, "learning_rate": 1.5304487179487183e-05, "loss": 0.1121, "step": 15070 }, { "epoch": 72.5, "grad_norm": 5.7793426513671875, "learning_rate": 1.527777777777778e-05, "loss": 0.1065, "step": 15080 }, { "epoch": 72.54807692307692, "grad_norm": 2.313784122467041, "learning_rate": 1.5251068376068378e-05, "loss": 0.0885, "step": 15090 }, { "epoch": 72.59615384615384, "grad_norm": 2.0415291786193848, "learning_rate": 1.5224358974358973e-05, "loss": 0.1129, "step": 15100 }, { "epoch": 72.64423076923077, "grad_norm": 3.4211819171905518, "learning_rate": 1.5197649572649572e-05, "loss": 0.1016, "step": 15110 }, { "epoch": 72.6923076923077, "grad_norm": 1.5076862573623657, "learning_rate": 1.517094017094017e-05, "loss": 0.0883, "step": 15120 }, { "epoch": 72.74038461538461, "grad_norm": 3.787508964538574, "learning_rate": 1.5144230769230769e-05, "loss": 0.1217, "step": 15130 }, { "epoch": 72.78846153846153, "grad_norm": 3.222301483154297, "learning_rate": 1.5117521367521368e-05, "loss": 0.0954, "step": 15140 }, { "epoch": 72.83653846153847, "grad_norm": 6.939395904541016, "learning_rate": 1.5090811965811965e-05, "loss": 0.1007, "step": 15150 }, { "epoch": 72.88461538461539, "grad_norm": 2.2790746688842773, "learning_rate": 1.5064102564102565e-05, "loss": 0.0989, "step": 15160 }, { "epoch": 72.9326923076923, "grad_norm": 2.2317144870758057, "learning_rate": 1.5037393162393162e-05, "loss": 0.1084, "step": 15170 }, { "epoch": 72.98076923076923, "grad_norm": 2.2722866535186768, "learning_rate": 1.5010683760683761e-05, "loss": 0.0946, "step": 15180 }, { "epoch": 73.0, "eval_accuracy": 0.9113694990952143, "eval_loss": 0.5255917906761169, "eval_runtime": 78.4704, "eval_samples_per_second": 330.991, "eval_steps_per_second": 5.174, "step": 15184 }, { "epoch": 73.02884615384616, "grad_norm": 3.466823101043701, "learning_rate": 1.4983974358974358e-05, "loss": 0.1167, "step": 15190 }, { "epoch": 73.07692307692308, "grad_norm": 3.292056083679199, "learning_rate": 1.4957264957264958e-05, "loss": 0.096, "step": 15200 }, { "epoch": 73.125, "grad_norm": 3.2137527465820312, "learning_rate": 1.4930555555555557e-05, "loss": 0.1071, "step": 15210 }, { "epoch": 73.17307692307692, "grad_norm": 1.7435493469238281, "learning_rate": 1.4903846153846154e-05, "loss": 0.1203, "step": 15220 }, { "epoch": 73.22115384615384, "grad_norm": 3.5387256145477295, "learning_rate": 1.4877136752136753e-05, "loss": 0.0928, "step": 15230 }, { "epoch": 73.26923076923077, "grad_norm": 2.7466559410095215, "learning_rate": 1.485042735042735e-05, "loss": 0.0958, "step": 15240 }, { "epoch": 73.3173076923077, "grad_norm": 2.3199105262756348, "learning_rate": 1.482371794871795e-05, "loss": 0.1042, "step": 15250 }, { "epoch": 73.36538461538461, "grad_norm": 3.56225323677063, "learning_rate": 1.4797008547008547e-05, "loss": 0.1033, "step": 15260 }, { "epoch": 73.41346153846153, "grad_norm": 2.1168835163116455, "learning_rate": 1.4770299145299146e-05, "loss": 0.105, "step": 15270 }, { "epoch": 73.46153846153847, "grad_norm": 2.026855707168579, "learning_rate": 1.4743589743589745e-05, "loss": 0.0942, "step": 15280 }, { "epoch": 73.50961538461539, "grad_norm": 3.9390501976013184, "learning_rate": 1.4716880341880343e-05, "loss": 0.1024, "step": 15290 }, { "epoch": 73.5576923076923, "grad_norm": 2.745870351791382, "learning_rate": 1.4690170940170942e-05, "loss": 0.1188, "step": 15300 }, { "epoch": 73.60576923076923, "grad_norm": 2.124110221862793, "learning_rate": 1.466346153846154e-05, "loss": 0.1045, "step": 15310 }, { "epoch": 73.65384615384616, "grad_norm": 1.746979832649231, "learning_rate": 1.4636752136752138e-05, "loss": 0.1182, "step": 15320 }, { "epoch": 73.70192307692308, "grad_norm": 2.209077835083008, "learning_rate": 1.4610042735042736e-05, "loss": 0.098, "step": 15330 }, { "epoch": 73.75, "grad_norm": 2.9679689407348633, "learning_rate": 1.4583333333333335e-05, "loss": 0.1061, "step": 15340 }, { "epoch": 73.79807692307692, "grad_norm": 3.369373083114624, "learning_rate": 1.4556623931623934e-05, "loss": 0.0962, "step": 15350 }, { "epoch": 73.84615384615384, "grad_norm": 3.0389370918273926, "learning_rate": 1.4529914529914531e-05, "loss": 0.0961, "step": 15360 }, { "epoch": 73.89423076923077, "grad_norm": 1.9590582847595215, "learning_rate": 1.450320512820513e-05, "loss": 0.0964, "step": 15370 }, { "epoch": 73.9423076923077, "grad_norm": 1.9072679281234741, "learning_rate": 1.4476495726495728e-05, "loss": 0.0928, "step": 15380 }, { "epoch": 73.99038461538461, "grad_norm": 2.2671141624450684, "learning_rate": 1.4449786324786324e-05, "loss": 0.097, "step": 15390 }, { "epoch": 74.0, "eval_accuracy": 0.9135255842605783, "eval_loss": 0.5247063040733337, "eval_runtime": 77.7064, "eval_samples_per_second": 334.245, "eval_steps_per_second": 5.225, "step": 15392 }, { "epoch": 74.03846153846153, "grad_norm": 3.8708548545837402, "learning_rate": 1.4423076923076923e-05, "loss": 0.109, "step": 15400 }, { "epoch": 74.08653846153847, "grad_norm": 2.789531707763672, "learning_rate": 1.439636752136752e-05, "loss": 0.1016, "step": 15410 }, { "epoch": 74.13461538461539, "grad_norm": 3.0866551399230957, "learning_rate": 1.436965811965812e-05, "loss": 0.1022, "step": 15420 }, { "epoch": 74.1826923076923, "grad_norm": 1.9794228076934814, "learning_rate": 1.4342948717948718e-05, "loss": 0.0969, "step": 15430 }, { "epoch": 74.23076923076923, "grad_norm": 2.115936279296875, "learning_rate": 1.4316239316239316e-05, "loss": 0.107, "step": 15440 }, { "epoch": 74.27884615384616, "grad_norm": 2.1809399127960205, "learning_rate": 1.4289529914529915e-05, "loss": 0.097, "step": 15450 }, { "epoch": 74.32692307692308, "grad_norm": 3.2765209674835205, "learning_rate": 1.4262820512820512e-05, "loss": 0.1117, "step": 15460 }, { "epoch": 74.375, "grad_norm": 2.5656588077545166, "learning_rate": 1.4236111111111111e-05, "loss": 0.1024, "step": 15470 }, { "epoch": 74.42307692307692, "grad_norm": 4.139174461364746, "learning_rate": 1.4209401709401709e-05, "loss": 0.1026, "step": 15480 }, { "epoch": 74.47115384615384, "grad_norm": 3.3554646968841553, "learning_rate": 1.4182692307692308e-05, "loss": 0.1111, "step": 15490 }, { "epoch": 74.51923076923077, "grad_norm": 2.0241644382476807, "learning_rate": 1.4155982905982907e-05, "loss": 0.0916, "step": 15500 }, { "epoch": 74.5673076923077, "grad_norm": 3.404829978942871, "learning_rate": 1.4129273504273504e-05, "loss": 0.1026, "step": 15510 }, { "epoch": 74.61538461538461, "grad_norm": 2.527398109436035, "learning_rate": 1.4102564102564104e-05, "loss": 0.0941, "step": 15520 }, { "epoch": 74.66346153846153, "grad_norm": 4.009344100952148, "learning_rate": 1.4075854700854701e-05, "loss": 0.1059, "step": 15530 }, { "epoch": 74.71153846153847, "grad_norm": 3.0246875286102295, "learning_rate": 1.40491452991453e-05, "loss": 0.0962, "step": 15540 }, { "epoch": 74.75961538461539, "grad_norm": 5.306474685668945, "learning_rate": 1.4022435897435897e-05, "loss": 0.126, "step": 15550 }, { "epoch": 74.8076923076923, "grad_norm": 1.8366721868515015, "learning_rate": 1.3995726495726497e-05, "loss": 0.0894, "step": 15560 }, { "epoch": 74.85576923076923, "grad_norm": 1.816127061843872, "learning_rate": 1.3969017094017096e-05, "loss": 0.106, "step": 15570 }, { "epoch": 74.90384615384616, "grad_norm": 4.604760646820068, "learning_rate": 1.3942307692307693e-05, "loss": 0.1062, "step": 15580 }, { "epoch": 74.95192307692308, "grad_norm": 4.432866096496582, "learning_rate": 1.3915598290598292e-05, "loss": 0.1271, "step": 15590 }, { "epoch": 75.0, "grad_norm": 1.393760323524475, "learning_rate": 1.388888888888889e-05, "loss": 0.0967, "step": 15600 }, { "epoch": 75.0, "eval_accuracy": 0.9143726177183998, "eval_loss": 0.5153675675392151, "eval_runtime": 77.6812, "eval_samples_per_second": 334.354, "eval_steps_per_second": 5.226, "step": 15600 }, { "epoch": 75.04807692307692, "grad_norm": 1.6909375190734863, "learning_rate": 1.3862179487179489e-05, "loss": 0.095, "step": 15610 }, { "epoch": 75.09615384615384, "grad_norm": 1.2919073104858398, "learning_rate": 1.3835470085470088e-05, "loss": 0.1108, "step": 15620 }, { "epoch": 75.14423076923077, "grad_norm": 3.768542766571045, "learning_rate": 1.3808760683760685e-05, "loss": 0.0986, "step": 15630 }, { "epoch": 75.1923076923077, "grad_norm": 3.6087300777435303, "learning_rate": 1.3782051282051284e-05, "loss": 0.0972, "step": 15640 }, { "epoch": 75.24038461538461, "grad_norm": 1.879106879234314, "learning_rate": 1.3755341880341882e-05, "loss": 0.1059, "step": 15650 }, { "epoch": 75.28846153846153, "grad_norm": 2.5039432048797607, "learning_rate": 1.372863247863248e-05, "loss": 0.1003, "step": 15660 }, { "epoch": 75.33653846153847, "grad_norm": 1.420624017715454, "learning_rate": 1.3701923076923078e-05, "loss": 0.093, "step": 15670 }, { "epoch": 75.38461538461539, "grad_norm": 3.1253209114074707, "learning_rate": 1.3675213675213677e-05, "loss": 0.1088, "step": 15680 }, { "epoch": 75.4326923076923, "grad_norm": 4.088733673095703, "learning_rate": 1.3648504273504273e-05, "loss": 0.1015, "step": 15690 }, { "epoch": 75.48076923076923, "grad_norm": 3.230811834335327, "learning_rate": 1.362179487179487e-05, "loss": 0.1012, "step": 15700 }, { "epoch": 75.52884615384616, "grad_norm": 2.6948182582855225, "learning_rate": 1.359508547008547e-05, "loss": 0.0993, "step": 15710 }, { "epoch": 75.57692307692308, "grad_norm": 1.9517728090286255, "learning_rate": 1.3568376068376069e-05, "loss": 0.1127, "step": 15720 }, { "epoch": 75.625, "grad_norm": 5.374283313751221, "learning_rate": 1.3541666666666666e-05, "loss": 0.1128, "step": 15730 }, { "epoch": 75.67307692307692, "grad_norm": 3.352522373199463, "learning_rate": 1.3514957264957265e-05, "loss": 0.1084, "step": 15740 }, { "epoch": 75.72115384615384, "grad_norm": 2.188495397567749, "learning_rate": 1.3488247863247863e-05, "loss": 0.1004, "step": 15750 }, { "epoch": 75.76923076923077, "grad_norm": 3.542970895767212, "learning_rate": 1.3461538461538462e-05, "loss": 0.0964, "step": 15760 }, { "epoch": 75.8173076923077, "grad_norm": 1.4103538990020752, "learning_rate": 1.343482905982906e-05, "loss": 0.0922, "step": 15770 }, { "epoch": 75.86538461538461, "grad_norm": 2.184051752090454, "learning_rate": 1.3408119658119658e-05, "loss": 0.0979, "step": 15780 }, { "epoch": 75.91346153846153, "grad_norm": 1.7559014558792114, "learning_rate": 1.3381410256410257e-05, "loss": 0.1042, "step": 15790 }, { "epoch": 75.96153846153847, "grad_norm": 2.4822113513946533, "learning_rate": 1.3354700854700855e-05, "loss": 0.0985, "step": 15800 }, { "epoch": 76.0, "eval_accuracy": 0.9153736572594617, "eval_loss": 0.5411583185195923, "eval_runtime": 78.7627, "eval_samples_per_second": 329.763, "eval_steps_per_second": 5.155, "step": 15808 }, { "epoch": 76.00961538461539, "grad_norm": 2.8133301734924316, "learning_rate": 1.3327991452991454e-05, "loss": 0.1108, "step": 15810 }, { "epoch": 76.0576923076923, "grad_norm": 3.0187880992889404, "learning_rate": 1.3301282051282051e-05, "loss": 0.0908, "step": 15820 }, { "epoch": 76.10576923076923, "grad_norm": 1.6150387525558472, "learning_rate": 1.327457264957265e-05, "loss": 0.114, "step": 15830 }, { "epoch": 76.15384615384616, "grad_norm": 2.2432942390441895, "learning_rate": 1.324786324786325e-05, "loss": 0.0874, "step": 15840 }, { "epoch": 76.20192307692308, "grad_norm": 1.2803951501846313, "learning_rate": 1.3221153846153847e-05, "loss": 0.0902, "step": 15850 }, { "epoch": 76.25, "grad_norm": 3.288942813873291, "learning_rate": 1.3194444444444446e-05, "loss": 0.0909, "step": 15860 }, { "epoch": 76.29807692307692, "grad_norm": 3.1004574298858643, "learning_rate": 1.3167735042735043e-05, "loss": 0.0904, "step": 15870 }, { "epoch": 76.34615384615384, "grad_norm": 3.7634103298187256, "learning_rate": 1.3141025641025642e-05, "loss": 0.1131, "step": 15880 }, { "epoch": 76.39423076923077, "grad_norm": 1.3710881471633911, "learning_rate": 1.311431623931624e-05, "loss": 0.0955, "step": 15890 }, { "epoch": 76.4423076923077, "grad_norm": 3.6761765480041504, "learning_rate": 1.3087606837606839e-05, "loss": 0.1212, "step": 15900 }, { "epoch": 76.49038461538461, "grad_norm": 1.5698484182357788, "learning_rate": 1.3060897435897438e-05, "loss": 0.0973, "step": 15910 }, { "epoch": 76.53846153846153, "grad_norm": 2.516489267349243, "learning_rate": 1.3034188034188035e-05, "loss": 0.1011, "step": 15920 }, { "epoch": 76.58653846153847, "grad_norm": 3.90030574798584, "learning_rate": 1.3007478632478635e-05, "loss": 0.1166, "step": 15930 }, { "epoch": 76.63461538461539, "grad_norm": 1.8028769493103027, "learning_rate": 1.2980769230769232e-05, "loss": 0.108, "step": 15940 }, { "epoch": 76.6826923076923, "grad_norm": 3.0352816581726074, "learning_rate": 1.2954059829059831e-05, "loss": 0.0926, "step": 15950 }, { "epoch": 76.73076923076923, "grad_norm": 2.146054267883301, "learning_rate": 1.2927350427350428e-05, "loss": 0.1033, "step": 15960 }, { "epoch": 76.77884615384616, "grad_norm": 2.1270077228546143, "learning_rate": 1.2900641025641028e-05, "loss": 0.1017, "step": 15970 }, { "epoch": 76.82692307692308, "grad_norm": 4.207494735717773, "learning_rate": 1.2873931623931623e-05, "loss": 0.099, "step": 15980 }, { "epoch": 76.875, "grad_norm": 3.2533559799194336, "learning_rate": 1.2847222222222222e-05, "loss": 0.1066, "step": 15990 }, { "epoch": 76.92307692307692, "grad_norm": 3.1646978855133057, "learning_rate": 1.282051282051282e-05, "loss": 0.0956, "step": 16000 }, { "epoch": 76.97115384615384, "grad_norm": 2.268117666244507, "learning_rate": 1.2793803418803419e-05, "loss": 0.0856, "step": 16010 }, { "epoch": 77.0, "eval_accuracy": 0.9148346359681208, "eval_loss": 0.5335399508476257, "eval_runtime": 77.9946, "eval_samples_per_second": 333.01, "eval_steps_per_second": 5.205, "step": 16016 }, { "epoch": 77.01923076923077, "grad_norm": 1.7859405279159546, "learning_rate": 1.2767094017094016e-05, "loss": 0.0989, "step": 16020 }, { "epoch": 77.0673076923077, "grad_norm": 3.278480291366577, "learning_rate": 1.2740384615384615e-05, "loss": 0.0877, "step": 16030 }, { "epoch": 77.11538461538461, "grad_norm": 2.065220832824707, "learning_rate": 1.2713675213675213e-05, "loss": 0.0995, "step": 16040 }, { "epoch": 77.16346153846153, "grad_norm": 2.589522361755371, "learning_rate": 1.2686965811965812e-05, "loss": 0.0912, "step": 16050 }, { "epoch": 77.21153846153847, "grad_norm": 2.7340521812438965, "learning_rate": 1.2660256410256411e-05, "loss": 0.0979, "step": 16060 }, { "epoch": 77.25961538461539, "grad_norm": 1.905396580696106, "learning_rate": 1.2633547008547008e-05, "loss": 0.1085, "step": 16070 }, { "epoch": 77.3076923076923, "grad_norm": 6.6660475730896, "learning_rate": 1.2606837606837608e-05, "loss": 0.1147, "step": 16080 }, { "epoch": 77.35576923076923, "grad_norm": 2.0939126014709473, "learning_rate": 1.2580128205128205e-05, "loss": 0.1001, "step": 16090 }, { "epoch": 77.40384615384616, "grad_norm": 1.9415788650512695, "learning_rate": 1.2553418803418804e-05, "loss": 0.1, "step": 16100 }, { "epoch": 77.45192307692308, "grad_norm": 2.789294958114624, "learning_rate": 1.2526709401709401e-05, "loss": 0.1002, "step": 16110 }, { "epoch": 77.5, "grad_norm": 1.9326748847961426, "learning_rate": 1.25e-05, "loss": 0.0968, "step": 16120 }, { "epoch": 77.54807692307692, "grad_norm": 1.6828869581222534, "learning_rate": 1.24732905982906e-05, "loss": 0.1004, "step": 16130 }, { "epoch": 77.59615384615384, "grad_norm": 3.6223015785217285, "learning_rate": 1.2446581196581197e-05, "loss": 0.113, "step": 16140 }, { "epoch": 77.64423076923077, "grad_norm": 2.483081340789795, "learning_rate": 1.2419871794871796e-05, "loss": 0.0916, "step": 16150 }, { "epoch": 77.6923076923077, "grad_norm": 1.4216243028640747, "learning_rate": 1.2393162393162394e-05, "loss": 0.0816, "step": 16160 }, { "epoch": 77.74038461538461, "grad_norm": 2.9108896255493164, "learning_rate": 1.2366452991452993e-05, "loss": 0.1041, "step": 16170 }, { "epoch": 77.78846153846153, "grad_norm": 4.6839799880981445, "learning_rate": 1.233974358974359e-05, "loss": 0.0979, "step": 16180 }, { "epoch": 77.83653846153847, "grad_norm": 2.515260934829712, "learning_rate": 1.231303418803419e-05, "loss": 0.1074, "step": 16190 }, { "epoch": 77.88461538461539, "grad_norm": 2.2231338024139404, "learning_rate": 1.2286324786324787e-05, "loss": 0.0844, "step": 16200 }, { "epoch": 77.9326923076923, "grad_norm": 1.8690119981765747, "learning_rate": 1.2259615384615384e-05, "loss": 0.0914, "step": 16210 }, { "epoch": 77.98076923076923, "grad_norm": 2.128370523452759, "learning_rate": 1.2232905982905983e-05, "loss": 0.103, "step": 16220 }, { "epoch": 78.0, "eval_accuracy": 0.9162206907172833, "eval_loss": 0.5209861993789673, "eval_runtime": 77.9395, "eval_samples_per_second": 333.246, "eval_steps_per_second": 5.209, "step": 16224 }, { "epoch": 78.02884615384616, "grad_norm": 2.3467838764190674, "learning_rate": 1.2206196581196582e-05, "loss": 0.0837, "step": 16230 }, { "epoch": 78.07692307692308, "grad_norm": 1.4151639938354492, "learning_rate": 1.217948717948718e-05, "loss": 0.09, "step": 16240 }, { "epoch": 78.125, "grad_norm": 2.7052934169769287, "learning_rate": 1.2152777777777779e-05, "loss": 0.1053, "step": 16250 }, { "epoch": 78.17307692307692, "grad_norm": 3.6011502742767334, "learning_rate": 1.2126068376068376e-05, "loss": 0.1021, "step": 16260 }, { "epoch": 78.22115384615384, "grad_norm": 2.308262825012207, "learning_rate": 1.2099358974358975e-05, "loss": 0.0938, "step": 16270 }, { "epoch": 78.26923076923077, "grad_norm": 1.5467151403427124, "learning_rate": 1.2072649572649573e-05, "loss": 0.1015, "step": 16280 }, { "epoch": 78.3173076923077, "grad_norm": 3.977851390838623, "learning_rate": 1.2045940170940172e-05, "loss": 0.1055, "step": 16290 }, { "epoch": 78.36538461538461, "grad_norm": 2.0429205894470215, "learning_rate": 1.2019230769230771e-05, "loss": 0.0923, "step": 16300 }, { "epoch": 78.41346153846153, "grad_norm": 2.193911075592041, "learning_rate": 1.1992521367521368e-05, "loss": 0.0887, "step": 16310 }, { "epoch": 78.46153846153847, "grad_norm": 1.7971532344818115, "learning_rate": 1.1965811965811967e-05, "loss": 0.0974, "step": 16320 }, { "epoch": 78.50961538461539, "grad_norm": 3.3205957412719727, "learning_rate": 1.1939102564102565e-05, "loss": 0.0998, "step": 16330 }, { "epoch": 78.5576923076923, "grad_norm": 2.761896848678589, "learning_rate": 1.1912393162393162e-05, "loss": 0.0961, "step": 16340 }, { "epoch": 78.60576923076923, "grad_norm": 1.9480913877487183, "learning_rate": 1.1885683760683761e-05, "loss": 0.0996, "step": 16350 }, { "epoch": 78.65384615384616, "grad_norm": 1.5991393327713013, "learning_rate": 1.1858974358974359e-05, "loss": 0.0938, "step": 16360 }, { "epoch": 78.70192307692308, "grad_norm": 2.2447869777679443, "learning_rate": 1.1832264957264958e-05, "loss": 0.1043, "step": 16370 }, { "epoch": 78.75, "grad_norm": 2.8785407543182373, "learning_rate": 1.1805555555555555e-05, "loss": 0.1054, "step": 16380 }, { "epoch": 78.79807692307692, "grad_norm": 1.589373230934143, "learning_rate": 1.1778846153846154e-05, "loss": 0.0885, "step": 16390 }, { "epoch": 78.84615384615384, "grad_norm": 2.5924251079559326, "learning_rate": 1.1752136752136752e-05, "loss": 0.1, "step": 16400 }, { "epoch": 78.89423076923077, "grad_norm": 1.2802966833114624, "learning_rate": 1.1725427350427351e-05, "loss": 0.0851, "step": 16410 }, { "epoch": 78.9423076923077, "grad_norm": 3.003061532974243, "learning_rate": 1.169871794871795e-05, "loss": 0.1066, "step": 16420 }, { "epoch": 78.99038461538461, "grad_norm": 2.6442697048187256, "learning_rate": 1.1672008547008547e-05, "loss": 0.1033, "step": 16430 }, { "epoch": 79.0, "eval_accuracy": 0.9156046663843221, "eval_loss": 0.5164893865585327, "eval_runtime": 78.6869, "eval_samples_per_second": 330.08, "eval_steps_per_second": 5.16, "step": 16432 }, { "epoch": 79.03846153846153, "grad_norm": 1.8951301574707031, "learning_rate": 1.1645299145299147e-05, "loss": 0.0998, "step": 16440 }, { "epoch": 79.08653846153847, "grad_norm": 2.636141061782837, "learning_rate": 1.1618589743589744e-05, "loss": 0.0974, "step": 16450 }, { "epoch": 79.13461538461539, "grad_norm": 2.707096576690674, "learning_rate": 1.1591880341880343e-05, "loss": 0.1011, "step": 16460 }, { "epoch": 79.1826923076923, "grad_norm": 2.843956232070923, "learning_rate": 1.156517094017094e-05, "loss": 0.1033, "step": 16470 }, { "epoch": 79.23076923076923, "grad_norm": 1.6076048612594604, "learning_rate": 1.153846153846154e-05, "loss": 0.0807, "step": 16480 }, { "epoch": 79.27884615384616, "grad_norm": 2.835524797439575, "learning_rate": 1.1511752136752137e-05, "loss": 0.0956, "step": 16490 }, { "epoch": 79.32692307692308, "grad_norm": 2.5794942378997803, "learning_rate": 1.1485042735042734e-05, "loss": 0.1072, "step": 16500 }, { "epoch": 79.375, "grad_norm": 3.4328360557556152, "learning_rate": 1.1458333333333333e-05, "loss": 0.1043, "step": 16510 }, { "epoch": 79.42307692307692, "grad_norm": 2.5818586349487305, "learning_rate": 1.1431623931623933e-05, "loss": 0.091, "step": 16520 }, { "epoch": 79.47115384615384, "grad_norm": 3.7210066318511963, "learning_rate": 1.140491452991453e-05, "loss": 0.1117, "step": 16530 }, { "epoch": 79.51923076923077, "grad_norm": 1.6356157064437866, "learning_rate": 1.1378205128205129e-05, "loss": 0.098, "step": 16540 }, { "epoch": 79.5673076923077, "grad_norm": 1.4184412956237793, "learning_rate": 1.1351495726495726e-05, "loss": 0.0899, "step": 16550 }, { "epoch": 79.61538461538461, "grad_norm": 2.22453236579895, "learning_rate": 1.1324786324786326e-05, "loss": 0.1029, "step": 16560 }, { "epoch": 79.66346153846153, "grad_norm": 3.3252480030059814, "learning_rate": 1.1298076923076923e-05, "loss": 0.0885, "step": 16570 }, { "epoch": 79.71153846153847, "grad_norm": 5.26723575592041, "learning_rate": 1.1271367521367522e-05, "loss": 0.0925, "step": 16580 }, { "epoch": 79.75961538461539, "grad_norm": 1.7291336059570312, "learning_rate": 1.1244658119658121e-05, "loss": 0.1053, "step": 16590 }, { "epoch": 79.8076923076923, "grad_norm": 2.542537212371826, "learning_rate": 1.1217948717948719e-05, "loss": 0.0864, "step": 16600 }, { "epoch": 79.85576923076923, "grad_norm": 1.5308027267456055, "learning_rate": 1.1191239316239318e-05, "loss": 0.0906, "step": 16610 }, { "epoch": 79.90384615384616, "grad_norm": 1.4290151596069336, "learning_rate": 1.1164529914529915e-05, "loss": 0.0931, "step": 16620 }, { "epoch": 79.95192307692308, "grad_norm": 3.7415621280670166, "learning_rate": 1.1137820512820514e-05, "loss": 0.1053, "step": 16630 }, { "epoch": 80.0, "grad_norm": 3.6093027591705322, "learning_rate": 1.1111111111111112e-05, "loss": 0.109, "step": 16640 }, { "epoch": 80.0, "eval_accuracy": 0.914988642051361, "eval_loss": 0.5302606821060181, "eval_runtime": 78.3458, "eval_samples_per_second": 331.517, "eval_steps_per_second": 5.182, "step": 16640 }, { "epoch": 80.04807692307692, "grad_norm": 2.0652153491973877, "learning_rate": 1.1084401709401709e-05, "loss": 0.0952, "step": 16650 }, { "epoch": 80.09615384615384, "grad_norm": 4.086421966552734, "learning_rate": 1.1057692307692308e-05, "loss": 0.1067, "step": 16660 }, { "epoch": 80.14423076923077, "grad_norm": 2.5876922607421875, "learning_rate": 1.1030982905982906e-05, "loss": 0.0882, "step": 16670 }, { "epoch": 80.1923076923077, "grad_norm": 2.317852258682251, "learning_rate": 1.1004273504273505e-05, "loss": 0.0949, "step": 16680 }, { "epoch": 80.24038461538461, "grad_norm": 2.003063440322876, "learning_rate": 1.0977564102564104e-05, "loss": 0.1089, "step": 16690 }, { "epoch": 80.28846153846153, "grad_norm": 4.4261322021484375, "learning_rate": 1.0950854700854701e-05, "loss": 0.0976, "step": 16700 }, { "epoch": 80.33653846153847, "grad_norm": 2.4389376640319824, "learning_rate": 1.09241452991453e-05, "loss": 0.0982, "step": 16710 }, { "epoch": 80.38461538461539, "grad_norm": 2.006608724594116, "learning_rate": 1.0897435897435898e-05, "loss": 0.1013, "step": 16720 }, { "epoch": 80.4326923076923, "grad_norm": 2.919907569885254, "learning_rate": 1.0870726495726497e-05, "loss": 0.098, "step": 16730 }, { "epoch": 80.48076923076923, "grad_norm": 4.661648273468018, "learning_rate": 1.0844017094017094e-05, "loss": 0.1054, "step": 16740 }, { "epoch": 80.52884615384616, "grad_norm": 2.1424789428710938, "learning_rate": 1.0817307692307693e-05, "loss": 0.0998, "step": 16750 }, { "epoch": 80.57692307692308, "grad_norm": 5.303124904632568, "learning_rate": 1.0790598290598292e-05, "loss": 0.0939, "step": 16760 }, { "epoch": 80.625, "grad_norm": 3.8277587890625, "learning_rate": 1.076388888888889e-05, "loss": 0.0844, "step": 16770 }, { "epoch": 80.67307692307692, "grad_norm": 2.162317991256714, "learning_rate": 1.0737179487179487e-05, "loss": 0.1085, "step": 16780 }, { "epoch": 80.72115384615384, "grad_norm": 4.783056259155273, "learning_rate": 1.0710470085470085e-05, "loss": 0.0976, "step": 16790 }, { "epoch": 80.76923076923077, "grad_norm": 3.1820759773254395, "learning_rate": 1.0683760683760684e-05, "loss": 0.0901, "step": 16800 }, { "epoch": 80.8173076923077, "grad_norm": 2.7339284420013428, "learning_rate": 1.0657051282051283e-05, "loss": 0.1014, "step": 16810 }, { "epoch": 80.86538461538461, "grad_norm": 3.8054990768432617, "learning_rate": 1.063034188034188e-05, "loss": 0.0913, "step": 16820 }, { "epoch": 80.91346153846153, "grad_norm": 6.0745062828063965, "learning_rate": 1.060363247863248e-05, "loss": 0.1061, "step": 16830 }, { "epoch": 80.96153846153847, "grad_norm": 2.10842227935791, "learning_rate": 1.0576923076923077e-05, "loss": 0.0999, "step": 16840 }, { "epoch": 81.0, "eval_accuracy": 0.9157971739883726, "eval_loss": 0.5298890471458435, "eval_runtime": 78.1923, "eval_samples_per_second": 332.168, "eval_steps_per_second": 5.192, "step": 16848 }, { "epoch": 81.00961538461539, "grad_norm": 2.0044116973876953, "learning_rate": 1.0550213675213676e-05, "loss": 0.1008, "step": 16850 }, { "epoch": 81.0576923076923, "grad_norm": 2.25429630279541, "learning_rate": 1.0523504273504273e-05, "loss": 0.1021, "step": 16860 }, { "epoch": 81.10576923076923, "grad_norm": 1.916979193687439, "learning_rate": 1.0496794871794872e-05, "loss": 0.0961, "step": 16870 }, { "epoch": 81.15384615384616, "grad_norm": 2.196889638900757, "learning_rate": 1.0470085470085471e-05, "loss": 0.0914, "step": 16880 }, { "epoch": 81.20192307692308, "grad_norm": 1.7861170768737793, "learning_rate": 1.0443376068376069e-05, "loss": 0.1014, "step": 16890 }, { "epoch": 81.25, "grad_norm": 1.894887924194336, "learning_rate": 1.0416666666666668e-05, "loss": 0.0907, "step": 16900 }, { "epoch": 81.29807692307692, "grad_norm": 3.014737367630005, "learning_rate": 1.0389957264957265e-05, "loss": 0.0954, "step": 16910 }, { "epoch": 81.34615384615384, "grad_norm": 1.5412670373916626, "learning_rate": 1.0363247863247865e-05, "loss": 0.081, "step": 16920 }, { "epoch": 81.39423076923077, "grad_norm": 5.066231727600098, "learning_rate": 1.0336538461538462e-05, "loss": 0.102, "step": 16930 }, { "epoch": 81.4423076923077, "grad_norm": 3.146491765975952, "learning_rate": 1.030982905982906e-05, "loss": 0.0894, "step": 16940 }, { "epoch": 81.49038461538461, "grad_norm": 2.3850066661834717, "learning_rate": 1.0283119658119658e-05, "loss": 0.0955, "step": 16950 }, { "epoch": 81.53846153846153, "grad_norm": 3.728621482849121, "learning_rate": 1.0256410256410256e-05, "loss": 0.1155, "step": 16960 }, { "epoch": 81.58653846153847, "grad_norm": 2.534695863723755, "learning_rate": 1.0229700854700855e-05, "loss": 0.0969, "step": 16970 }, { "epoch": 81.63461538461539, "grad_norm": 1.1658564805984497, "learning_rate": 1.0202991452991454e-05, "loss": 0.0817, "step": 16980 }, { "epoch": 81.6826923076923, "grad_norm": 4.179039478302002, "learning_rate": 1.0176282051282051e-05, "loss": 0.0995, "step": 16990 }, { "epoch": 81.73076923076923, "grad_norm": 6.340668201446533, "learning_rate": 1.014957264957265e-05, "loss": 0.0897, "step": 17000 }, { "epoch": 81.77884615384616, "grad_norm": 5.517063140869141, "learning_rate": 1.0122863247863248e-05, "loss": 0.0943, "step": 17010 }, { "epoch": 81.82692307692308, "grad_norm": 2.8249242305755615, "learning_rate": 1.0096153846153847e-05, "loss": 0.0997, "step": 17020 }, { "epoch": 81.875, "grad_norm": 1.863574504852295, "learning_rate": 1.0069444444444445e-05, "loss": 0.0748, "step": 17030 }, { "epoch": 81.92307692307692, "grad_norm": 3.802948474884033, "learning_rate": 1.0042735042735044e-05, "loss": 0.1032, "step": 17040 }, { "epoch": 81.97115384615384, "grad_norm": 1.6542152166366577, "learning_rate": 1.0016025641025643e-05, "loss": 0.0966, "step": 17050 }, { "epoch": 82.0, "eval_accuracy": 0.9166827089670042, "eval_loss": 0.5323548913002014, "eval_runtime": 77.9537, "eval_samples_per_second": 333.185, "eval_steps_per_second": 5.208, "step": 17056 }, { "epoch": 82.01923076923077, "grad_norm": 3.3811721801757812, "learning_rate": 9.98931623931624e-06, "loss": 0.0997, "step": 17060 }, { "epoch": 82.0673076923077, "grad_norm": 2.5621304512023926, "learning_rate": 9.96260683760684e-06, "loss": 0.0912, "step": 17070 }, { "epoch": 82.11538461538461, "grad_norm": 2.871241331100464, "learning_rate": 9.935897435897435e-06, "loss": 0.0851, "step": 17080 }, { "epoch": 82.16346153846153, "grad_norm": 6.005497455596924, "learning_rate": 9.909188034188034e-06, "loss": 0.0898, "step": 17090 }, { "epoch": 82.21153846153847, "grad_norm": 3.3357009887695312, "learning_rate": 9.882478632478633e-06, "loss": 0.1084, "step": 17100 }, { "epoch": 82.25961538461539, "grad_norm": 2.5626237392425537, "learning_rate": 9.85576923076923e-06, "loss": 0.1037, "step": 17110 }, { "epoch": 82.3076923076923, "grad_norm": 2.3182120323181152, "learning_rate": 9.82905982905983e-06, "loss": 0.0993, "step": 17120 }, { "epoch": 82.35576923076923, "grad_norm": 3.3285627365112305, "learning_rate": 9.802350427350427e-06, "loss": 0.0939, "step": 17130 }, { "epoch": 82.40384615384616, "grad_norm": 2.1187620162963867, "learning_rate": 9.775641025641026e-06, "loss": 0.0897, "step": 17140 }, { "epoch": 82.45192307692308, "grad_norm": 3.0599284172058105, "learning_rate": 9.748931623931625e-06, "loss": 0.1008, "step": 17150 }, { "epoch": 82.5, "grad_norm": 2.745887041091919, "learning_rate": 9.722222222222223e-06, "loss": 0.0897, "step": 17160 }, { "epoch": 82.54807692307692, "grad_norm": 2.137702226638794, "learning_rate": 9.695512820512822e-06, "loss": 0.109, "step": 17170 }, { "epoch": 82.59615384615384, "grad_norm": 2.8012797832489014, "learning_rate": 9.66880341880342e-06, "loss": 0.0975, "step": 17180 }, { "epoch": 82.64423076923077, "grad_norm": 2.84123158454895, "learning_rate": 9.642094017094018e-06, "loss": 0.0927, "step": 17190 }, { "epoch": 82.6923076923077, "grad_norm": 2.570007562637329, "learning_rate": 9.615384615384616e-06, "loss": 0.1053, "step": 17200 }, { "epoch": 82.74038461538461, "grad_norm": 1.9705079793930054, "learning_rate": 9.588675213675215e-06, "loss": 0.0953, "step": 17210 }, { "epoch": 82.78846153846153, "grad_norm": 2.1044111251831055, "learning_rate": 9.561965811965812e-06, "loss": 0.0985, "step": 17220 }, { "epoch": 82.83653846153847, "grad_norm": 2.124061107635498, "learning_rate": 9.53525641025641e-06, "loss": 0.1037, "step": 17230 }, { "epoch": 82.88461538461539, "grad_norm": 3.0550591945648193, "learning_rate": 9.508547008547009e-06, "loss": 0.0995, "step": 17240 }, { "epoch": 82.9326923076923, "grad_norm": 3.842979907989502, "learning_rate": 9.481837606837606e-06, "loss": 0.0956, "step": 17250 }, { "epoch": 82.98076923076923, "grad_norm": 2.420640230178833, "learning_rate": 9.455128205128205e-06, "loss": 0.0952, "step": 17260 }, { "epoch": 83.0, "eval_accuracy": 0.9167597120086244, "eval_loss": 0.522853672504425, "eval_runtime": 78.1192, "eval_samples_per_second": 332.479, "eval_steps_per_second": 5.197, "step": 17264 }, { "epoch": 83.02884615384616, "grad_norm": 1.639867901802063, "learning_rate": 9.428418803418804e-06, "loss": 0.1022, "step": 17270 }, { "epoch": 83.07692307692308, "grad_norm": 3.2902331352233887, "learning_rate": 9.401709401709402e-06, "loss": 0.0869, "step": 17280 }, { "epoch": 83.125, "grad_norm": 2.5934789180755615, "learning_rate": 9.375000000000001e-06, "loss": 0.1081, "step": 17290 }, { "epoch": 83.17307692307692, "grad_norm": 3.5262787342071533, "learning_rate": 9.348290598290598e-06, "loss": 0.0838, "step": 17300 }, { "epoch": 83.22115384615384, "grad_norm": 1.8926507234573364, "learning_rate": 9.321581196581197e-06, "loss": 0.0876, "step": 17310 }, { "epoch": 83.26923076923077, "grad_norm": 1.6931732892990112, "learning_rate": 9.294871794871795e-06, "loss": 0.0887, "step": 17320 }, { "epoch": 83.3173076923077, "grad_norm": 2.1631009578704834, "learning_rate": 9.268162393162394e-06, "loss": 0.0948, "step": 17330 }, { "epoch": 83.36538461538461, "grad_norm": 2.9750123023986816, "learning_rate": 9.241452991452993e-06, "loss": 0.1037, "step": 17340 }, { "epoch": 83.41346153846153, "grad_norm": 1.5475324392318726, "learning_rate": 9.21474358974359e-06, "loss": 0.098, "step": 17350 }, { "epoch": 83.46153846153847, "grad_norm": 1.8872077465057373, "learning_rate": 9.18803418803419e-06, "loss": 0.099, "step": 17360 }, { "epoch": 83.50961538461539, "grad_norm": 2.4859344959259033, "learning_rate": 9.161324786324787e-06, "loss": 0.0904, "step": 17370 }, { "epoch": 83.5576923076923, "grad_norm": 4.024520397186279, "learning_rate": 9.134615384615384e-06, "loss": 0.0834, "step": 17380 }, { "epoch": 83.60576923076923, "grad_norm": 2.011033773422241, "learning_rate": 9.107905982905983e-06, "loss": 0.0948, "step": 17390 }, { "epoch": 83.65384615384616, "grad_norm": 3.3780665397644043, "learning_rate": 9.081196581196581e-06, "loss": 0.0923, "step": 17400 }, { "epoch": 83.70192307692308, "grad_norm": 2.6567065715789795, "learning_rate": 9.05448717948718e-06, "loss": 0.1131, "step": 17410 }, { "epoch": 83.75, "grad_norm": 3.0161404609680176, "learning_rate": 9.027777777777777e-06, "loss": 0.1005, "step": 17420 }, { "epoch": 83.79807692307692, "grad_norm": 1.7947297096252441, "learning_rate": 9.001068376068376e-06, "loss": 0.0979, "step": 17430 }, { "epoch": 83.84615384615384, "grad_norm": 3.2232980728149414, "learning_rate": 8.974358974358976e-06, "loss": 0.1045, "step": 17440 }, { "epoch": 83.89423076923077, "grad_norm": 1.726688027381897, "learning_rate": 8.947649572649573e-06, "loss": 0.0912, "step": 17450 }, { "epoch": 83.9423076923077, "grad_norm": 4.295827388763428, "learning_rate": 8.920940170940172e-06, "loss": 0.1001, "step": 17460 }, { "epoch": 83.99038461538461, "grad_norm": 1.8016958236694336, "learning_rate": 8.89423076923077e-06, "loss": 0.1071, "step": 17470 }, { "epoch": 84.0, "eval_accuracy": 0.9175682439456359, "eval_loss": 0.5302887558937073, "eval_runtime": 78.357, "eval_samples_per_second": 331.47, "eval_steps_per_second": 5.181, "step": 17472 }, { "epoch": 84.03846153846153, "grad_norm": 1.68791925907135, "learning_rate": 8.867521367521369e-06, "loss": 0.1018, "step": 17480 }, { "epoch": 84.08653846153847, "grad_norm": 1.902655005455017, "learning_rate": 8.840811965811966e-06, "loss": 0.0907, "step": 17490 }, { "epoch": 84.13461538461539, "grad_norm": 4.15269660949707, "learning_rate": 8.814102564102565e-06, "loss": 0.0907, "step": 17500 }, { "epoch": 84.1826923076923, "grad_norm": 1.617684006690979, "learning_rate": 8.787393162393163e-06, "loss": 0.1013, "step": 17510 }, { "epoch": 84.23076923076923, "grad_norm": 2.001791000366211, "learning_rate": 8.76068376068376e-06, "loss": 0.1112, "step": 17520 }, { "epoch": 84.27884615384616, "grad_norm": 2.293957471847534, "learning_rate": 8.733974358974359e-06, "loss": 0.1044, "step": 17530 }, { "epoch": 84.32692307692308, "grad_norm": 1.6065454483032227, "learning_rate": 8.707264957264956e-06, "loss": 0.0943, "step": 17540 }, { "epoch": 84.375, "grad_norm": 2.508031129837036, "learning_rate": 8.680555555555556e-06, "loss": 0.0965, "step": 17550 }, { "epoch": 84.42307692307692, "grad_norm": 2.6377179622650146, "learning_rate": 8.653846153846155e-06, "loss": 0.0866, "step": 17560 }, { "epoch": 84.47115384615384, "grad_norm": 3.3555262088775635, "learning_rate": 8.627136752136752e-06, "loss": 0.088, "step": 17570 }, { "epoch": 84.51923076923077, "grad_norm": 2.2307512760162354, "learning_rate": 8.600427350427351e-06, "loss": 0.0839, "step": 17580 }, { "epoch": 84.5673076923077, "grad_norm": 2.555112361907959, "learning_rate": 8.573717948717949e-06, "loss": 0.1039, "step": 17590 }, { "epoch": 84.61538461538461, "grad_norm": 2.182796001434326, "learning_rate": 8.547008547008548e-06, "loss": 0.0928, "step": 17600 }, { "epoch": 84.66346153846153, "grad_norm": 1.9356637001037598, "learning_rate": 8.520299145299147e-06, "loss": 0.0957, "step": 17610 }, { "epoch": 84.71153846153847, "grad_norm": 1.8750778436660767, "learning_rate": 8.493589743589744e-06, "loss": 0.0983, "step": 17620 }, { "epoch": 84.75961538461539, "grad_norm": 2.635309934616089, "learning_rate": 8.466880341880343e-06, "loss": 0.0951, "step": 17630 }, { "epoch": 84.8076923076923, "grad_norm": 3.1567320823669434, "learning_rate": 8.44017094017094e-06, "loss": 0.0977, "step": 17640 }, { "epoch": 84.85576923076923, "grad_norm": 4.162692546844482, "learning_rate": 8.41346153846154e-06, "loss": 0.0969, "step": 17650 }, { "epoch": 84.90384615384616, "grad_norm": 2.7375292778015137, "learning_rate": 8.386752136752137e-06, "loss": 0.1024, "step": 17660 }, { "epoch": 84.95192307692308, "grad_norm": 2.0062754154205322, "learning_rate": 8.360042735042735e-06, "loss": 0.0954, "step": 17670 }, { "epoch": 85.0, "grad_norm": 2.515605926513672, "learning_rate": 8.333333333333334e-06, "loss": 0.0899, "step": 17680 }, { "epoch": 85.0, "eval_accuracy": 0.916028183113233, "eval_loss": 0.5227550864219666, "eval_runtime": 78.0093, "eval_samples_per_second": 332.948, "eval_steps_per_second": 5.205, "step": 17680 }, { "epoch": 85.04807692307692, "grad_norm": 3.851296901702881, "learning_rate": 8.306623931623931e-06, "loss": 0.0821, "step": 17690 }, { "epoch": 85.09615384615384, "grad_norm": 2.491825580596924, "learning_rate": 8.27991452991453e-06, "loss": 0.0975, "step": 17700 }, { "epoch": 85.14423076923077, "grad_norm": 3.3636233806610107, "learning_rate": 8.253205128205128e-06, "loss": 0.0912, "step": 17710 }, { "epoch": 85.1923076923077, "grad_norm": 2.5982868671417236, "learning_rate": 8.226495726495727e-06, "loss": 0.091, "step": 17720 }, { "epoch": 85.24038461538461, "grad_norm": 3.063547372817993, "learning_rate": 8.199786324786326e-06, "loss": 0.0863, "step": 17730 }, { "epoch": 85.28846153846153, "grad_norm": 2.6590001583099365, "learning_rate": 8.173076923076923e-06, "loss": 0.0866, "step": 17740 }, { "epoch": 85.33653846153847, "grad_norm": 2.0314643383026123, "learning_rate": 8.146367521367522e-06, "loss": 0.0963, "step": 17750 }, { "epoch": 85.38461538461539, "grad_norm": 4.529903888702393, "learning_rate": 8.11965811965812e-06, "loss": 0.1036, "step": 17760 }, { "epoch": 85.4326923076923, "grad_norm": 1.8191704750061035, "learning_rate": 8.092948717948719e-06, "loss": 0.0967, "step": 17770 }, { "epoch": 85.48076923076923, "grad_norm": 2.286109209060669, "learning_rate": 8.066239316239316e-06, "loss": 0.1114, "step": 17780 }, { "epoch": 85.52884615384616, "grad_norm": 2.5147948265075684, "learning_rate": 8.039529914529915e-06, "loss": 0.098, "step": 17790 }, { "epoch": 85.57692307692308, "grad_norm": 1.6920441389083862, "learning_rate": 8.012820512820515e-06, "loss": 0.1039, "step": 17800 }, { "epoch": 85.625, "grad_norm": 2.620615005493164, "learning_rate": 7.98611111111111e-06, "loss": 0.0934, "step": 17810 }, { "epoch": 85.67307692307692, "grad_norm": 2.0192463397979736, "learning_rate": 7.95940170940171e-06, "loss": 0.092, "step": 17820 }, { "epoch": 85.72115384615384, "grad_norm": 2.0878403186798096, "learning_rate": 7.932692307692308e-06, "loss": 0.0912, "step": 17830 }, { "epoch": 85.76923076923077, "grad_norm": 2.22430682182312, "learning_rate": 7.905982905982906e-06, "loss": 0.0894, "step": 17840 }, { "epoch": 85.8173076923077, "grad_norm": 2.7106664180755615, "learning_rate": 7.879273504273505e-06, "loss": 0.0997, "step": 17850 }, { "epoch": 85.86538461538461, "grad_norm": 3.859205722808838, "learning_rate": 7.852564102564102e-06, "loss": 0.0957, "step": 17860 }, { "epoch": 85.91346153846153, "grad_norm": 2.859315872192383, "learning_rate": 7.825854700854701e-06, "loss": 0.0845, "step": 17870 }, { "epoch": 85.96153846153847, "grad_norm": 3.8132479190826416, "learning_rate": 7.799145299145299e-06, "loss": 0.0868, "step": 17880 }, { "epoch": 86.0, "eval_accuracy": 0.9149116390097409, "eval_loss": 0.5296818017959595, "eval_runtime": 78.1425, "eval_samples_per_second": 332.38, "eval_steps_per_second": 5.196, "step": 17888 }, { "epoch": 86.00961538461539, "grad_norm": 3.6663906574249268, "learning_rate": 7.772435897435898e-06, "loss": 0.0861, "step": 17890 }, { "epoch": 86.0576923076923, "grad_norm": 3.9391050338745117, "learning_rate": 7.745726495726497e-06, "loss": 0.0953, "step": 17900 }, { "epoch": 86.10576923076923, "grad_norm": 4.926783561706543, "learning_rate": 7.719017094017094e-06, "loss": 0.0964, "step": 17910 }, { "epoch": 86.15384615384616, "grad_norm": 3.128918170928955, "learning_rate": 7.692307692307694e-06, "loss": 0.0887, "step": 17920 }, { "epoch": 86.20192307692308, "grad_norm": 1.594325065612793, "learning_rate": 7.665598290598291e-06, "loss": 0.0862, "step": 17930 }, { "epoch": 86.25, "grad_norm": 2.060176134109497, "learning_rate": 7.63888888888889e-06, "loss": 0.099, "step": 17940 }, { "epoch": 86.29807692307692, "grad_norm": 1.7453858852386475, "learning_rate": 7.612179487179487e-06, "loss": 0.087, "step": 17950 }, { "epoch": 86.34615384615384, "grad_norm": 1.8897358179092407, "learning_rate": 7.585470085470085e-06, "loss": 0.0841, "step": 17960 }, { "epoch": 86.39423076923077, "grad_norm": 1.9192811250686646, "learning_rate": 7.558760683760684e-06, "loss": 0.084, "step": 17970 }, { "epoch": 86.4423076923077, "grad_norm": 1.7597984075546265, "learning_rate": 7.532051282051282e-06, "loss": 0.0947, "step": 17980 }, { "epoch": 86.49038461538461, "grad_norm": 2.8047990798950195, "learning_rate": 7.5053418803418805e-06, "loss": 0.0934, "step": 17990 }, { "epoch": 86.53846153846153, "grad_norm": 2.9179420471191406, "learning_rate": 7.478632478632479e-06, "loss": 0.0873, "step": 18000 }, { "epoch": 86.58653846153847, "grad_norm": 2.4710028171539307, "learning_rate": 7.451923076923077e-06, "loss": 0.0904, "step": 18010 }, { "epoch": 86.63461538461539, "grad_norm": 3.3666841983795166, "learning_rate": 7.425213675213675e-06, "loss": 0.09, "step": 18020 }, { "epoch": 86.6826923076923, "grad_norm": 1.6975278854370117, "learning_rate": 7.3985042735042736e-06, "loss": 0.1073, "step": 18030 }, { "epoch": 86.73076923076923, "grad_norm": 2.508455514907837, "learning_rate": 7.371794871794873e-06, "loss": 0.0983, "step": 18040 }, { "epoch": 86.77884615384616, "grad_norm": 3.4124417304992676, "learning_rate": 7.345085470085471e-06, "loss": 0.0979, "step": 18050 }, { "epoch": 86.82692307692308, "grad_norm": 2.492166757583618, "learning_rate": 7.318376068376069e-06, "loss": 0.097, "step": 18060 }, { "epoch": 86.875, "grad_norm": 2.2536985874176025, "learning_rate": 7.2916666666666674e-06, "loss": 0.0874, "step": 18070 }, { "epoch": 86.92307692307692, "grad_norm": 2.7703311443328857, "learning_rate": 7.264957264957266e-06, "loss": 0.0925, "step": 18080 }, { "epoch": 86.97115384615384, "grad_norm": 1.933685541152954, "learning_rate": 7.238247863247864e-06, "loss": 0.1011, "step": 18090 }, { "epoch": 87.0, "eval_accuracy": 0.915566164863512, "eval_loss": 0.5369883179664612, "eval_runtime": 78.425, "eval_samples_per_second": 331.183, "eval_steps_per_second": 5.177, "step": 18096 }, { "epoch": 87.01923076923077, "grad_norm": 2.4887280464172363, "learning_rate": 7.211538461538461e-06, "loss": 0.0858, "step": 18100 }, { "epoch": 87.0673076923077, "grad_norm": 7.681400299072266, "learning_rate": 7.18482905982906e-06, "loss": 0.098, "step": 18110 }, { "epoch": 87.11538461538461, "grad_norm": 2.7640480995178223, "learning_rate": 7.158119658119658e-06, "loss": 0.1008, "step": 18120 }, { "epoch": 87.16346153846153, "grad_norm": 2.80342435836792, "learning_rate": 7.131410256410256e-06, "loss": 0.084, "step": 18130 }, { "epoch": 87.21153846153847, "grad_norm": 3.2576663494110107, "learning_rate": 7.104700854700854e-06, "loss": 0.0992, "step": 18140 }, { "epoch": 87.25961538461539, "grad_norm": 2.1969540119171143, "learning_rate": 7.0779914529914535e-06, "loss": 0.0875, "step": 18150 }, { "epoch": 87.3076923076923, "grad_norm": 2.3451759815216064, "learning_rate": 7.051282051282052e-06, "loss": 0.0902, "step": 18160 }, { "epoch": 87.35576923076923, "grad_norm": 3.1568682193756104, "learning_rate": 7.02457264957265e-06, "loss": 0.1057, "step": 18170 }, { "epoch": 87.40384615384616, "grad_norm": 2.000685930252075, "learning_rate": 6.997863247863248e-06, "loss": 0.0918, "step": 18180 }, { "epoch": 87.45192307692308, "grad_norm": 3.801213502883911, "learning_rate": 6.9711538461538465e-06, "loss": 0.1086, "step": 18190 }, { "epoch": 87.5, "grad_norm": 2.0308778285980225, "learning_rate": 6.944444444444445e-06, "loss": 0.0929, "step": 18200 }, { "epoch": 87.54807692307692, "grad_norm": 1.9415568113327026, "learning_rate": 6.917735042735044e-06, "loss": 0.1004, "step": 18210 }, { "epoch": 87.59615384615384, "grad_norm": 1.6685975790023804, "learning_rate": 6.891025641025642e-06, "loss": 0.083, "step": 18220 }, { "epoch": 87.64423076923077, "grad_norm": 2.1610097885131836, "learning_rate": 6.86431623931624e-06, "loss": 0.0852, "step": 18230 }, { "epoch": 87.6923076923077, "grad_norm": 3.024106025695801, "learning_rate": 6.837606837606839e-06, "loss": 0.1024, "step": 18240 }, { "epoch": 87.74038461538461, "grad_norm": 2.43920636177063, "learning_rate": 6.810897435897435e-06, "loss": 0.0908, "step": 18250 }, { "epoch": 87.78846153846153, "grad_norm": 3.1118924617767334, "learning_rate": 6.784188034188034e-06, "loss": 0.0912, "step": 18260 }, { "epoch": 87.83653846153847, "grad_norm": 2.262655735015869, "learning_rate": 6.7574786324786326e-06, "loss": 0.0964, "step": 18270 }, { "epoch": 87.88461538461539, "grad_norm": 4.118659019470215, "learning_rate": 6.730769230769231e-06, "loss": 0.0981, "step": 18280 }, { "epoch": 87.9326923076923, "grad_norm": 1.0015894174575806, "learning_rate": 6.704059829059829e-06, "loss": 0.0854, "step": 18290 }, { "epoch": 87.98076923076923, "grad_norm": 4.732231616973877, "learning_rate": 6.677350427350427e-06, "loss": 0.0867, "step": 18300 }, { "epoch": 88.0, "eval_accuracy": 0.9158356755091827, "eval_loss": 0.5429995656013489, "eval_runtime": 78.3331, "eval_samples_per_second": 331.571, "eval_steps_per_second": 5.183, "step": 18304 }, { "epoch": 88.02884615384616, "grad_norm": 3.124476194381714, "learning_rate": 6.650641025641026e-06, "loss": 0.081, "step": 18310 }, { "epoch": 88.07692307692308, "grad_norm": 4.160712718963623, "learning_rate": 6.623931623931625e-06, "loss": 0.1051, "step": 18320 }, { "epoch": 88.125, "grad_norm": 4.911158084869385, "learning_rate": 6.597222222222223e-06, "loss": 0.096, "step": 18330 }, { "epoch": 88.17307692307692, "grad_norm": 2.8197944164276123, "learning_rate": 6.570512820512821e-06, "loss": 0.0905, "step": 18340 }, { "epoch": 88.22115384615384, "grad_norm": 2.1368861198425293, "learning_rate": 6.5438034188034195e-06, "loss": 0.1028, "step": 18350 }, { "epoch": 88.26923076923077, "grad_norm": 2.0642058849334717, "learning_rate": 6.517094017094018e-06, "loss": 0.0973, "step": 18360 }, { "epoch": 88.3173076923077, "grad_norm": 3.724170446395874, "learning_rate": 6.490384615384616e-06, "loss": 0.0963, "step": 18370 }, { "epoch": 88.36538461538461, "grad_norm": 4.327381610870361, "learning_rate": 6.463675213675214e-06, "loss": 0.0988, "step": 18380 }, { "epoch": 88.41346153846153, "grad_norm": 4.974121570587158, "learning_rate": 6.436965811965812e-06, "loss": 0.101, "step": 18390 }, { "epoch": 88.46153846153847, "grad_norm": 2.9087493419647217, "learning_rate": 6.41025641025641e-06, "loss": 0.086, "step": 18400 }, { "epoch": 88.50961538461539, "grad_norm": 1.8609813451766968, "learning_rate": 6.383547008547008e-06, "loss": 0.073, "step": 18410 }, { "epoch": 88.5576923076923, "grad_norm": 3.458106279373169, "learning_rate": 6.3568376068376064e-06, "loss": 0.0832, "step": 18420 }, { "epoch": 88.60576923076923, "grad_norm": 1.7712996006011963, "learning_rate": 6.3301282051282055e-06, "loss": 0.0946, "step": 18430 }, { "epoch": 88.65384615384616, "grad_norm": 2.066318988800049, "learning_rate": 6.303418803418804e-06, "loss": 0.0848, "step": 18440 }, { "epoch": 88.70192307692308, "grad_norm": 1.8005577325820923, "learning_rate": 6.276709401709402e-06, "loss": 0.0834, "step": 18450 }, { "epoch": 88.75, "grad_norm": 2.7007827758789062, "learning_rate": 6.25e-06, "loss": 0.0878, "step": 18460 }, { "epoch": 88.79807692307692, "grad_norm": 2.48016357421875, "learning_rate": 6.2232905982905986e-06, "loss": 0.1028, "step": 18470 }, { "epoch": 88.84615384615384, "grad_norm": 2.3920891284942627, "learning_rate": 6.196581196581197e-06, "loss": 0.0946, "step": 18480 }, { "epoch": 88.89423076923077, "grad_norm": 4.705691337585449, "learning_rate": 6.169871794871795e-06, "loss": 0.0925, "step": 18490 }, { "epoch": 88.9423076923077, "grad_norm": 1.9182746410369873, "learning_rate": 6.143162393162393e-06, "loss": 0.0926, "step": 18500 }, { "epoch": 88.99038461538461, "grad_norm": 2.8190078735351562, "learning_rate": 6.116452991452992e-06, "loss": 0.0936, "step": 18510 }, { "epoch": 89.0, "eval_accuracy": 0.9164516998421438, "eval_loss": 0.5345762968063354, "eval_runtime": 78.3428, "eval_samples_per_second": 331.53, "eval_steps_per_second": 5.182, "step": 18512 }, { "epoch": 89.03846153846153, "grad_norm": 2.553131103515625, "learning_rate": 6.08974358974359e-06, "loss": 0.089, "step": 18520 }, { "epoch": 89.08653846153847, "grad_norm": 2.973276376724243, "learning_rate": 6.063034188034188e-06, "loss": 0.1067, "step": 18530 }, { "epoch": 89.13461538461539, "grad_norm": 3.4265568256378174, "learning_rate": 6.036324786324786e-06, "loss": 0.1009, "step": 18540 }, { "epoch": 89.1826923076923, "grad_norm": 3.077608108520508, "learning_rate": 6.0096153846153855e-06, "loss": 0.0972, "step": 18550 }, { "epoch": 89.23076923076923, "grad_norm": 2.0409412384033203, "learning_rate": 5.982905982905984e-06, "loss": 0.0845, "step": 18560 }, { "epoch": 89.27884615384616, "grad_norm": 3.3964855670928955, "learning_rate": 5.956196581196581e-06, "loss": 0.097, "step": 18570 }, { "epoch": 89.32692307692308, "grad_norm": 3.634277582168579, "learning_rate": 5.929487179487179e-06, "loss": 0.084, "step": 18580 }, { "epoch": 89.375, "grad_norm": 2.2638773918151855, "learning_rate": 5.902777777777778e-06, "loss": 0.0955, "step": 18590 }, { "epoch": 89.42307692307692, "grad_norm": 4.06154203414917, "learning_rate": 5.876068376068376e-06, "loss": 0.0862, "step": 18600 }, { "epoch": 89.47115384615384, "grad_norm": 3.2456071376800537, "learning_rate": 5.849358974358975e-06, "loss": 0.0841, "step": 18610 }, { "epoch": 89.51923076923077, "grad_norm": 4.182865619659424, "learning_rate": 5.822649572649573e-06, "loss": 0.0995, "step": 18620 }, { "epoch": 89.5673076923077, "grad_norm": 1.2705217599868774, "learning_rate": 5.7959401709401715e-06, "loss": 0.0815, "step": 18630 }, { "epoch": 89.61538461538461, "grad_norm": 1.7068849802017212, "learning_rate": 5.76923076923077e-06, "loss": 0.0853, "step": 18640 }, { "epoch": 89.66346153846153, "grad_norm": 3.615556240081787, "learning_rate": 5.742521367521367e-06, "loss": 0.0871, "step": 18650 }, { "epoch": 89.71153846153847, "grad_norm": 4.5766987800598145, "learning_rate": 5.715811965811966e-06, "loss": 0.0858, "step": 18660 }, { "epoch": 89.75961538461539, "grad_norm": 4.48842191696167, "learning_rate": 5.6891025641025645e-06, "loss": 0.0909, "step": 18670 }, { "epoch": 89.8076923076923, "grad_norm": 2.6715054512023926, "learning_rate": 5.662393162393163e-06, "loss": 0.0913, "step": 18680 }, { "epoch": 89.85576923076923, "grad_norm": 2.0227317810058594, "learning_rate": 5.635683760683761e-06, "loss": 0.0794, "step": 18690 }, { "epoch": 89.90384615384616, "grad_norm": 2.8515403270721436, "learning_rate": 5.608974358974359e-06, "loss": 0.1026, "step": 18700 }, { "epoch": 89.95192307692308, "grad_norm": 3.24908709526062, "learning_rate": 5.5822649572649576e-06, "loss": 0.0904, "step": 18710 }, { "epoch": 90.0, "grad_norm": 4.083438396453857, "learning_rate": 5.555555555555556e-06, "loss": 0.0929, "step": 18720 }, { "epoch": 90.0, "eval_accuracy": 0.9162976937589035, "eval_loss": 0.5386949777603149, "eval_runtime": 78.548, "eval_samples_per_second": 330.664, "eval_steps_per_second": 5.169, "step": 18720 }, { "epoch": 90.04807692307692, "grad_norm": 3.049201488494873, "learning_rate": 5.528846153846154e-06, "loss": 0.0897, "step": 18730 }, { "epoch": 90.09615384615384, "grad_norm": 2.435035228729248, "learning_rate": 5.502136752136752e-06, "loss": 0.1083, "step": 18740 }, { "epoch": 90.14423076923077, "grad_norm": 3.1708567142486572, "learning_rate": 5.475427350427351e-06, "loss": 0.101, "step": 18750 }, { "epoch": 90.1923076923077, "grad_norm": 1.3026307821273804, "learning_rate": 5.448717948717949e-06, "loss": 0.0831, "step": 18760 }, { "epoch": 90.24038461538461, "grad_norm": 3.1718575954437256, "learning_rate": 5.422008547008547e-06, "loss": 0.0899, "step": 18770 }, { "epoch": 90.28846153846153, "grad_norm": 5.550716400146484, "learning_rate": 5.395299145299146e-06, "loss": 0.0797, "step": 18780 }, { "epoch": 90.33653846153847, "grad_norm": 2.490976095199585, "learning_rate": 5.368589743589744e-06, "loss": 0.0952, "step": 18790 }, { "epoch": 90.38461538461539, "grad_norm": 1.4641972780227661, "learning_rate": 5.341880341880342e-06, "loss": 0.0829, "step": 18800 }, { "epoch": 90.4326923076923, "grad_norm": 5.316316604614258, "learning_rate": 5.31517094017094e-06, "loss": 0.0865, "step": 18810 }, { "epoch": 90.48076923076923, "grad_norm": 3.719576120376587, "learning_rate": 5.288461538461538e-06, "loss": 0.0923, "step": 18820 }, { "epoch": 90.52884615384616, "grad_norm": 4.394183158874512, "learning_rate": 5.261752136752137e-06, "loss": 0.0965, "step": 18830 }, { "epoch": 90.57692307692308, "grad_norm": 2.03344464302063, "learning_rate": 5.235042735042736e-06, "loss": 0.1004, "step": 18840 }, { "epoch": 90.625, "grad_norm": 1.7884031534194946, "learning_rate": 5.208333333333334e-06, "loss": 0.0938, "step": 18850 }, { "epoch": 90.67307692307692, "grad_norm": 4.004364013671875, "learning_rate": 5.181623931623932e-06, "loss": 0.088, "step": 18860 }, { "epoch": 90.72115384615384, "grad_norm": 3.062541961669922, "learning_rate": 5.15491452991453e-06, "loss": 0.0882, "step": 18870 }, { "epoch": 90.76923076923077, "grad_norm": 1.5338741540908813, "learning_rate": 5.128205128205128e-06, "loss": 0.0794, "step": 18880 }, { "epoch": 90.8173076923077, "grad_norm": 2.7588295936584473, "learning_rate": 5.101495726495727e-06, "loss": 0.0713, "step": 18890 }, { "epoch": 90.86538461538461, "grad_norm": 1.9977960586547852, "learning_rate": 5.074786324786325e-06, "loss": 0.0859, "step": 18900 }, { "epoch": 90.91346153846153, "grad_norm": 2.276606321334839, "learning_rate": 5.0480769230769235e-06, "loss": 0.1008, "step": 18910 }, { "epoch": 90.96153846153847, "grad_norm": 2.346315860748291, "learning_rate": 5.021367521367522e-06, "loss": 0.0792, "step": 18920 }, { "epoch": 91.0, "eval_accuracy": 0.914988642051361, "eval_loss": 0.5459111332893372, "eval_runtime": 78.5062, "eval_samples_per_second": 330.84, "eval_steps_per_second": 5.172, "step": 18928 }, { "epoch": 91.00961538461539, "grad_norm": 1.9895801544189453, "learning_rate": 4.99465811965812e-06, "loss": 0.0829, "step": 18930 }, { "epoch": 91.0576923076923, "grad_norm": 3.7853267192840576, "learning_rate": 4.9679487179487175e-06, "loss": 0.0859, "step": 18940 }, { "epoch": 91.10576923076923, "grad_norm": 2.0683205127716064, "learning_rate": 4.9412393162393166e-06, "loss": 0.0907, "step": 18950 }, { "epoch": 91.15384615384616, "grad_norm": 3.527277708053589, "learning_rate": 4.914529914529915e-06, "loss": 0.087, "step": 18960 }, { "epoch": 91.20192307692308, "grad_norm": 2.3382017612457275, "learning_rate": 4.887820512820513e-06, "loss": 0.0839, "step": 18970 }, { "epoch": 91.25, "grad_norm": 3.2342114448547363, "learning_rate": 4.861111111111111e-06, "loss": 0.0936, "step": 18980 }, { "epoch": 91.29807692307692, "grad_norm": 1.5640313625335693, "learning_rate": 4.83440170940171e-06, "loss": 0.1004, "step": 18990 }, { "epoch": 91.34615384615384, "grad_norm": 2.375319004058838, "learning_rate": 4.807692307692308e-06, "loss": 0.086, "step": 19000 }, { "epoch": 91.39423076923077, "grad_norm": 2.3515665531158447, "learning_rate": 4.780982905982906e-06, "loss": 0.089, "step": 19010 }, { "epoch": 91.4423076923077, "grad_norm": 2.523813247680664, "learning_rate": 4.754273504273504e-06, "loss": 0.1026, "step": 19020 }, { "epoch": 91.49038461538461, "grad_norm": 2.4831182956695557, "learning_rate": 4.727564102564103e-06, "loss": 0.0907, "step": 19030 }, { "epoch": 91.53846153846153, "grad_norm": 2.740260124206543, "learning_rate": 4.700854700854701e-06, "loss": 0.0952, "step": 19040 }, { "epoch": 91.58653846153847, "grad_norm": 1.7920145988464355, "learning_rate": 4.674145299145299e-06, "loss": 0.0829, "step": 19050 }, { "epoch": 91.63461538461539, "grad_norm": 2.8840527534484863, "learning_rate": 4.647435897435897e-06, "loss": 0.0835, "step": 19060 }, { "epoch": 91.6826923076923, "grad_norm": 4.222652912139893, "learning_rate": 4.6207264957264965e-06, "loss": 0.0894, "step": 19070 }, { "epoch": 91.73076923076923, "grad_norm": 1.9264655113220215, "learning_rate": 4.594017094017095e-06, "loss": 0.0999, "step": 19080 }, { "epoch": 91.77884615384616, "grad_norm": 2.3394622802734375, "learning_rate": 4.567307692307692e-06, "loss": 0.0936, "step": 19090 }, { "epoch": 91.82692307692308, "grad_norm": 3.1210083961486816, "learning_rate": 4.5405982905982904e-06, "loss": 0.0787, "step": 19100 }, { "epoch": 91.875, "grad_norm": 4.227526664733887, "learning_rate": 4.513888888888889e-06, "loss": 0.0877, "step": 19110 }, { "epoch": 91.92307692307692, "grad_norm": 2.7307140827178955, "learning_rate": 4.487179487179488e-06, "loss": 0.0962, "step": 19120 }, { "epoch": 91.97115384615384, "grad_norm": 2.822927951812744, "learning_rate": 4.460470085470086e-06, "loss": 0.0918, "step": 19130 }, { "epoch": 92.0, "eval_accuracy": 0.9165287028837639, "eval_loss": 0.5257331728935242, "eval_runtime": 78.3804, "eval_samples_per_second": 331.371, "eval_steps_per_second": 5.18, "step": 19136 }, { "epoch": 92.01923076923077, "grad_norm": 4.071859836578369, "learning_rate": 4.433760683760684e-06, "loss": 0.0926, "step": 19140 }, { "epoch": 92.0673076923077, "grad_norm": 1.9095423221588135, "learning_rate": 4.4070512820512826e-06, "loss": 0.0857, "step": 19150 }, { "epoch": 92.11538461538461, "grad_norm": 2.5223920345306396, "learning_rate": 4.38034188034188e-06, "loss": 0.0908, "step": 19160 }, { "epoch": 92.16346153846153, "grad_norm": 2.0615479946136475, "learning_rate": 4.353632478632478e-06, "loss": 0.1006, "step": 19170 }, { "epoch": 92.21153846153847, "grad_norm": 3.498220205307007, "learning_rate": 4.326923076923077e-06, "loss": 0.0935, "step": 19180 }, { "epoch": 92.25961538461539, "grad_norm": 2.546128749847412, "learning_rate": 4.300213675213676e-06, "loss": 0.0831, "step": 19190 }, { "epoch": 92.3076923076923, "grad_norm": 1.2541508674621582, "learning_rate": 4.273504273504274e-06, "loss": 0.0664, "step": 19200 }, { "epoch": 92.35576923076923, "grad_norm": 2.479755163192749, "learning_rate": 4.246794871794872e-06, "loss": 0.0848, "step": 19210 }, { "epoch": 92.40384615384616, "grad_norm": 3.9854955673217773, "learning_rate": 4.22008547008547e-06, "loss": 0.0985, "step": 19220 }, { "epoch": 92.45192307692308, "grad_norm": 2.7706658840179443, "learning_rate": 4.193376068376069e-06, "loss": 0.0953, "step": 19230 }, { "epoch": 92.5, "grad_norm": 3.777275562286377, "learning_rate": 4.166666666666667e-06, "loss": 0.1039, "step": 19240 }, { "epoch": 92.54807692307692, "grad_norm": 2.1898856163024902, "learning_rate": 4.139957264957265e-06, "loss": 0.0929, "step": 19250 }, { "epoch": 92.59615384615384, "grad_norm": 3.146277666091919, "learning_rate": 4.113247863247863e-06, "loss": 0.0951, "step": 19260 }, { "epoch": 92.64423076923077, "grad_norm": 2.1728804111480713, "learning_rate": 4.086538461538462e-06, "loss": 0.0831, "step": 19270 }, { "epoch": 92.6923076923077, "grad_norm": 2.087242364883423, "learning_rate": 4.05982905982906e-06, "loss": 0.0944, "step": 19280 }, { "epoch": 92.74038461538461, "grad_norm": 5.14932918548584, "learning_rate": 4.033119658119658e-06, "loss": 0.0884, "step": 19290 }, { "epoch": 92.78846153846153, "grad_norm": 1.9424299001693726, "learning_rate": 4.006410256410257e-06, "loss": 0.0846, "step": 19300 }, { "epoch": 92.83653846153847, "grad_norm": 2.304175615310669, "learning_rate": 3.979700854700855e-06, "loss": 0.0859, "step": 19310 }, { "epoch": 92.88461538461539, "grad_norm": 2.019481897354126, "learning_rate": 3.952991452991453e-06, "loss": 0.1055, "step": 19320 }, { "epoch": 92.9326923076923, "grad_norm": 1.3866316080093384, "learning_rate": 3.926282051282051e-06, "loss": 0.1007, "step": 19330 }, { "epoch": 92.98076923076923, "grad_norm": 4.055050849914551, "learning_rate": 3.8995726495726494e-06, "loss": 0.0853, "step": 19340 }, { "epoch": 93.0, "eval_accuracy": 0.9155276633427021, "eval_loss": 0.5426116585731506, "eval_runtime": 78.3585, "eval_samples_per_second": 331.464, "eval_steps_per_second": 5.181, "step": 19344 }, { "epoch": 93.02884615384616, "grad_norm": 1.4447909593582153, "learning_rate": 3.8728632478632485e-06, "loss": 0.0811, "step": 19350 }, { "epoch": 93.07692307692308, "grad_norm": 3.8229141235351562, "learning_rate": 3.846153846153847e-06, "loss": 0.1009, "step": 19360 }, { "epoch": 93.125, "grad_norm": 4.551164150238037, "learning_rate": 3.819444444444445e-06, "loss": 0.0947, "step": 19370 }, { "epoch": 93.17307692307692, "grad_norm": 2.263272523880005, "learning_rate": 3.7927350427350425e-06, "loss": 0.0928, "step": 19380 }, { "epoch": 93.22115384615384, "grad_norm": 2.3758928775787354, "learning_rate": 3.766025641025641e-06, "loss": 0.0806, "step": 19390 }, { "epoch": 93.26923076923077, "grad_norm": 2.4724762439727783, "learning_rate": 3.7393162393162394e-06, "loss": 0.0969, "step": 19400 }, { "epoch": 93.3173076923077, "grad_norm": 4.640396595001221, "learning_rate": 3.7126068376068377e-06, "loss": 0.0863, "step": 19410 }, { "epoch": 93.36538461538461, "grad_norm": 2.8010809421539307, "learning_rate": 3.6858974358974363e-06, "loss": 0.0978, "step": 19420 }, { "epoch": 93.41346153846153, "grad_norm": 1.5172042846679688, "learning_rate": 3.6591880341880346e-06, "loss": 0.0799, "step": 19430 }, { "epoch": 93.46153846153847, "grad_norm": 2.2691357135772705, "learning_rate": 3.632478632478633e-06, "loss": 0.0995, "step": 19440 }, { "epoch": 93.50961538461539, "grad_norm": 3.48667049407959, "learning_rate": 3.6057692307692307e-06, "loss": 0.0776, "step": 19450 }, { "epoch": 93.5576923076923, "grad_norm": 1.969976544380188, "learning_rate": 3.579059829059829e-06, "loss": 0.0852, "step": 19460 }, { "epoch": 93.60576923076923, "grad_norm": 1.7461744546890259, "learning_rate": 3.552350427350427e-06, "loss": 0.0975, "step": 19470 }, { "epoch": 93.65384615384616, "grad_norm": 2.5230228900909424, "learning_rate": 3.525641025641026e-06, "loss": 0.09, "step": 19480 }, { "epoch": 93.70192307692308, "grad_norm": 4.101469993591309, "learning_rate": 3.498931623931624e-06, "loss": 0.0886, "step": 19490 }, { "epoch": 93.75, "grad_norm": 2.251159191131592, "learning_rate": 3.4722222222222224e-06, "loss": 0.0844, "step": 19500 }, { "epoch": 93.79807692307692, "grad_norm": 3.2499520778656006, "learning_rate": 3.445512820512821e-06, "loss": 0.0857, "step": 19510 }, { "epoch": 93.84615384615384, "grad_norm": 4.2564311027526855, "learning_rate": 3.4188034188034193e-06, "loss": 0.0902, "step": 19520 }, { "epoch": 93.89423076923077, "grad_norm": 5.964477062225342, "learning_rate": 3.392094017094017e-06, "loss": 0.1099, "step": 19530 }, { "epoch": 93.9423076923077, "grad_norm": 4.313998699188232, "learning_rate": 3.3653846153846154e-06, "loss": 0.0852, "step": 19540 }, { "epoch": 93.99038461538461, "grad_norm": 1.9203779697418213, "learning_rate": 3.3386752136752137e-06, "loss": 0.0908, "step": 19550 }, { "epoch": 94.0, "eval_accuracy": 0.9153351557386517, "eval_loss": 0.5429276823997498, "eval_runtime": 78.186, "eval_samples_per_second": 332.195, "eval_steps_per_second": 5.193, "step": 19552 }, { "epoch": 94.03846153846153, "grad_norm": 2.201056957244873, "learning_rate": 3.3119658119658124e-06, "loss": 0.096, "step": 19560 }, { "epoch": 94.08653846153847, "grad_norm": 4.478857517242432, "learning_rate": 3.2852564102564106e-06, "loss": 0.0944, "step": 19570 }, { "epoch": 94.13461538461539, "grad_norm": 3.315441370010376, "learning_rate": 3.258547008547009e-06, "loss": 0.0879, "step": 19580 }, { "epoch": 94.1826923076923, "grad_norm": 2.7168235778808594, "learning_rate": 3.231837606837607e-06, "loss": 0.0992, "step": 19590 }, { "epoch": 94.23076923076923, "grad_norm": 3.699129819869995, "learning_rate": 3.205128205128205e-06, "loss": 0.0895, "step": 19600 }, { "epoch": 94.27884615384616, "grad_norm": 2.136414051055908, "learning_rate": 3.1784188034188032e-06, "loss": 0.0995, "step": 19610 }, { "epoch": 94.32692307692308, "grad_norm": 2.145853042602539, "learning_rate": 3.151709401709402e-06, "loss": 0.0869, "step": 19620 }, { "epoch": 94.375, "grad_norm": 2.0337014198303223, "learning_rate": 3.125e-06, "loss": 0.0801, "step": 19630 }, { "epoch": 94.42307692307692, "grad_norm": 4.12987756729126, "learning_rate": 3.0982905982905984e-06, "loss": 0.0946, "step": 19640 }, { "epoch": 94.47115384615384, "grad_norm": 2.308933734893799, "learning_rate": 3.0715811965811967e-06, "loss": 0.0664, "step": 19650 }, { "epoch": 94.51923076923077, "grad_norm": 1.9404751062393188, "learning_rate": 3.044871794871795e-06, "loss": 0.0884, "step": 19660 }, { "epoch": 94.5673076923077, "grad_norm": 3.447838544845581, "learning_rate": 3.018162393162393e-06, "loss": 0.0976, "step": 19670 }, { "epoch": 94.61538461538461, "grad_norm": 2.772629976272583, "learning_rate": 2.991452991452992e-06, "loss": 0.0871, "step": 19680 }, { "epoch": 94.66346153846153, "grad_norm": 2.4654018878936768, "learning_rate": 2.9647435897435897e-06, "loss": 0.0864, "step": 19690 }, { "epoch": 94.71153846153847, "grad_norm": 2.6697187423706055, "learning_rate": 2.938034188034188e-06, "loss": 0.0845, "step": 19700 }, { "epoch": 94.75961538461539, "grad_norm": 3.116687059402466, "learning_rate": 2.9113247863247866e-06, "loss": 0.0904, "step": 19710 }, { "epoch": 94.8076923076923, "grad_norm": 1.0808930397033691, "learning_rate": 2.884615384615385e-06, "loss": 0.0837, "step": 19720 }, { "epoch": 94.85576923076923, "grad_norm": 1.651819109916687, "learning_rate": 2.857905982905983e-06, "loss": 0.0828, "step": 19730 }, { "epoch": 94.90384615384616, "grad_norm": 3.3371169567108154, "learning_rate": 2.8311965811965814e-06, "loss": 0.0909, "step": 19740 }, { "epoch": 94.95192307692308, "grad_norm": 2.6093366146087646, "learning_rate": 2.8044871794871797e-06, "loss": 0.0841, "step": 19750 }, { "epoch": 95.0, "grad_norm": 2.926112413406372, "learning_rate": 2.777777777777778e-06, "loss": 0.0981, "step": 19760 }, { "epoch": 95.0, "eval_accuracy": 0.9155276633427021, "eval_loss": 0.5393919348716736, "eval_runtime": 78.2701, "eval_samples_per_second": 331.838, "eval_steps_per_second": 5.187, "step": 19760 }, { "epoch": 95.04807692307692, "grad_norm": 2.6271018981933594, "learning_rate": 2.751068376068376e-06, "loss": 0.0869, "step": 19770 }, { "epoch": 95.09615384615384, "grad_norm": 2.1668832302093506, "learning_rate": 2.7243589743589744e-06, "loss": 0.0847, "step": 19780 }, { "epoch": 95.14423076923077, "grad_norm": 2.689929962158203, "learning_rate": 2.697649572649573e-06, "loss": 0.0857, "step": 19790 }, { "epoch": 95.1923076923077, "grad_norm": 3.521143913269043, "learning_rate": 2.670940170940171e-06, "loss": 0.0816, "step": 19800 }, { "epoch": 95.24038461538461, "grad_norm": 3.5733397006988525, "learning_rate": 2.644230769230769e-06, "loss": 0.0819, "step": 19810 }, { "epoch": 95.28846153846153, "grad_norm": 4.201375961303711, "learning_rate": 2.617521367521368e-06, "loss": 0.11, "step": 19820 }, { "epoch": 95.33653846153847, "grad_norm": 2.521632432937622, "learning_rate": 2.590811965811966e-06, "loss": 0.0872, "step": 19830 }, { "epoch": 95.38461538461539, "grad_norm": 3.253234624862671, "learning_rate": 2.564102564102564e-06, "loss": 0.0868, "step": 19840 }, { "epoch": 95.4326923076923, "grad_norm": 3.185703992843628, "learning_rate": 2.5373931623931626e-06, "loss": 0.0889, "step": 19850 }, { "epoch": 95.48076923076923, "grad_norm": 2.4609901905059814, "learning_rate": 2.510683760683761e-06, "loss": 0.0802, "step": 19860 }, { "epoch": 95.52884615384616, "grad_norm": 2.8063552379608154, "learning_rate": 2.4839743589743587e-06, "loss": 0.0822, "step": 19870 }, { "epoch": 95.57692307692308, "grad_norm": 3.1660261154174805, "learning_rate": 2.4572649572649574e-06, "loss": 0.0826, "step": 19880 }, { "epoch": 95.625, "grad_norm": 2.7315001487731934, "learning_rate": 2.4305555555555557e-06, "loss": 0.0775, "step": 19890 }, { "epoch": 95.67307692307692, "grad_norm": 2.078993320465088, "learning_rate": 2.403846153846154e-06, "loss": 0.0798, "step": 19900 }, { "epoch": 95.72115384615384, "grad_norm": 1.900250792503357, "learning_rate": 2.377136752136752e-06, "loss": 0.0805, "step": 19910 }, { "epoch": 95.76923076923077, "grad_norm": 2.2171967029571533, "learning_rate": 2.3504273504273504e-06, "loss": 0.0784, "step": 19920 }, { "epoch": 95.8173076923077, "grad_norm": 1.2115119695663452, "learning_rate": 2.3237179487179487e-06, "loss": 0.0966, "step": 19930 }, { "epoch": 95.86538461538461, "grad_norm": 2.8577866554260254, "learning_rate": 2.2970085470085474e-06, "loss": 0.0927, "step": 19940 }, { "epoch": 95.91346153846153, "grad_norm": 2.587303400039673, "learning_rate": 2.2702991452991452e-06, "loss": 0.0844, "step": 19950 }, { "epoch": 95.96153846153847, "grad_norm": 3.6483945846557617, "learning_rate": 2.243589743589744e-06, "loss": 0.0825, "step": 19960 }, { "epoch": 96.0, "eval_accuracy": 0.9167982135294344, "eval_loss": 0.5345015525817871, "eval_runtime": 78.5002, "eval_samples_per_second": 330.865, "eval_steps_per_second": 5.172, "step": 19968 }, { "epoch": 96.00961538461539, "grad_norm": 1.8269048929214478, "learning_rate": 2.216880341880342e-06, "loss": 0.0842, "step": 19970 }, { "epoch": 96.0576923076923, "grad_norm": 2.337278366088867, "learning_rate": 2.19017094017094e-06, "loss": 0.0731, "step": 19980 }, { "epoch": 96.10576923076923, "grad_norm": 2.1186327934265137, "learning_rate": 2.1634615384615387e-06, "loss": 0.0839, "step": 19990 }, { "epoch": 96.15384615384616, "grad_norm": 3.0470283031463623, "learning_rate": 2.136752136752137e-06, "loss": 0.0905, "step": 20000 }, { "epoch": 96.20192307692308, "grad_norm": 4.353131294250488, "learning_rate": 2.110042735042735e-06, "loss": 0.0882, "step": 20010 }, { "epoch": 96.25, "grad_norm": 3.388697624206543, "learning_rate": 2.0833333333333334e-06, "loss": 0.0833, "step": 20020 }, { "epoch": 96.29807692307692, "grad_norm": 2.093270778656006, "learning_rate": 2.0566239316239317e-06, "loss": 0.08, "step": 20030 }, { "epoch": 96.34615384615384, "grad_norm": 2.1641745567321777, "learning_rate": 2.02991452991453e-06, "loss": 0.0814, "step": 20040 }, { "epoch": 96.39423076923077, "grad_norm": 2.838172674179077, "learning_rate": 2.0032051282051286e-06, "loss": 0.1002, "step": 20050 }, { "epoch": 96.4423076923077, "grad_norm": 1.9606313705444336, "learning_rate": 1.9764957264957265e-06, "loss": 0.0925, "step": 20060 }, { "epoch": 96.49038461538461, "grad_norm": 2.4042484760284424, "learning_rate": 1.9497863247863247e-06, "loss": 0.0877, "step": 20070 }, { "epoch": 96.53846153846153, "grad_norm": 1.5793424844741821, "learning_rate": 1.9230769230769234e-06, "loss": 0.0761, "step": 20080 }, { "epoch": 96.58653846153847, "grad_norm": 1.382026195526123, "learning_rate": 1.8963675213675212e-06, "loss": 0.0915, "step": 20090 }, { "epoch": 96.63461538461539, "grad_norm": 2.3535654544830322, "learning_rate": 1.8696581196581197e-06, "loss": 0.0785, "step": 20100 }, { "epoch": 96.6826923076923, "grad_norm": 3.286661386489868, "learning_rate": 1.8429487179487182e-06, "loss": 0.0813, "step": 20110 }, { "epoch": 96.73076923076923, "grad_norm": 1.9795715808868408, "learning_rate": 1.8162393162393164e-06, "loss": 0.0811, "step": 20120 }, { "epoch": 96.77884615384616, "grad_norm": 2.4154164791107178, "learning_rate": 1.7895299145299145e-06, "loss": 0.0822, "step": 20130 }, { "epoch": 96.82692307692308, "grad_norm": 3.207357168197632, "learning_rate": 1.762820512820513e-06, "loss": 0.0928, "step": 20140 }, { "epoch": 96.875, "grad_norm": 2.6497135162353516, "learning_rate": 1.7361111111111112e-06, "loss": 0.0848, "step": 20150 }, { "epoch": 96.92307692307692, "grad_norm": 1.0960537195205688, "learning_rate": 1.7094017094017097e-06, "loss": 0.097, "step": 20160 }, { "epoch": 96.97115384615384, "grad_norm": 3.0557754039764404, "learning_rate": 1.6826923076923077e-06, "loss": 0.0849, "step": 20170 }, { "epoch": 97.0, "eval_accuracy": 0.9163746968005236, "eval_loss": 0.5387876033782959, "eval_runtime": 78.3559, "eval_samples_per_second": 331.475, "eval_steps_per_second": 5.181, "step": 20176 }, { "epoch": 97.01923076923077, "grad_norm": 3.6695168018341064, "learning_rate": 1.6559829059829062e-06, "loss": 0.0829, "step": 20180 }, { "epoch": 97.0673076923077, "grad_norm": 2.089900016784668, "learning_rate": 1.6292735042735044e-06, "loss": 0.0819, "step": 20190 }, { "epoch": 97.11538461538461, "grad_norm": 1.5185492038726807, "learning_rate": 1.6025641025641025e-06, "loss": 0.102, "step": 20200 }, { "epoch": 97.16346153846153, "grad_norm": 2.696809768676758, "learning_rate": 1.575854700854701e-06, "loss": 0.0922, "step": 20210 }, { "epoch": 97.21153846153847, "grad_norm": 1.414451241493225, "learning_rate": 1.5491452991452992e-06, "loss": 0.0921, "step": 20220 }, { "epoch": 97.25961538461539, "grad_norm": 1.9001017808914185, "learning_rate": 1.5224358974358975e-06, "loss": 0.0738, "step": 20230 }, { "epoch": 97.3076923076923, "grad_norm": 2.1416666507720947, "learning_rate": 1.495726495726496e-06, "loss": 0.0925, "step": 20240 }, { "epoch": 97.35576923076923, "grad_norm": 1.9170575141906738, "learning_rate": 1.469017094017094e-06, "loss": 0.0818, "step": 20250 }, { "epoch": 97.40384615384616, "grad_norm": 2.437810182571411, "learning_rate": 1.4423076923076924e-06, "loss": 0.0826, "step": 20260 }, { "epoch": 97.45192307692308, "grad_norm": 2.0733442306518555, "learning_rate": 1.4155982905982907e-06, "loss": 0.088, "step": 20270 }, { "epoch": 97.5, "grad_norm": 2.3623077869415283, "learning_rate": 1.388888888888889e-06, "loss": 0.0998, "step": 20280 }, { "epoch": 97.54807692307692, "grad_norm": 4.359771251678467, "learning_rate": 1.3621794871794872e-06, "loss": 0.0798, "step": 20290 }, { "epoch": 97.59615384615384, "grad_norm": 1.8762307167053223, "learning_rate": 1.3354700854700855e-06, "loss": 0.0891, "step": 20300 }, { "epoch": 97.64423076923077, "grad_norm": 1.985803246498108, "learning_rate": 1.308760683760684e-06, "loss": 0.0889, "step": 20310 }, { "epoch": 97.6923076923077, "grad_norm": 3.4929778575897217, "learning_rate": 1.282051282051282e-06, "loss": 0.0859, "step": 20320 }, { "epoch": 97.74038461538461, "grad_norm": 4.576716899871826, "learning_rate": 1.2553418803418805e-06, "loss": 0.0813, "step": 20330 }, { "epoch": 97.78846153846153, "grad_norm": 1.8167568445205688, "learning_rate": 1.2286324786324787e-06, "loss": 0.0838, "step": 20340 }, { "epoch": 97.83653846153847, "grad_norm": 2.9137980937957764, "learning_rate": 1.201923076923077e-06, "loss": 0.0904, "step": 20350 }, { "epoch": 97.88461538461539, "grad_norm": 2.4444997310638428, "learning_rate": 1.1752136752136752e-06, "loss": 0.0893, "step": 20360 }, { "epoch": 97.9326923076923, "grad_norm": 2.92171573638916, "learning_rate": 1.1485042735042737e-06, "loss": 0.0849, "step": 20370 }, { "epoch": 97.98076923076923, "grad_norm": 1.6745736598968506, "learning_rate": 1.121794871794872e-06, "loss": 0.0992, "step": 20380 }, { "epoch": 98.0, "eval_accuracy": 0.9167982135294344, "eval_loss": 0.5357361435890198, "eval_runtime": 78.4193, "eval_samples_per_second": 331.207, "eval_steps_per_second": 5.177, "step": 20384 }, { "epoch": 98.02884615384616, "grad_norm": 2.1571438312530518, "learning_rate": 1.09508547008547e-06, "loss": 0.0817, "step": 20390 }, { "epoch": 98.07692307692308, "grad_norm": 3.478376865386963, "learning_rate": 1.0683760683760685e-06, "loss": 0.0708, "step": 20400 }, { "epoch": 98.125, "grad_norm": 4.045080184936523, "learning_rate": 1.0416666666666667e-06, "loss": 0.0963, "step": 20410 }, { "epoch": 98.17307692307692, "grad_norm": 2.4191033840179443, "learning_rate": 1.014957264957265e-06, "loss": 0.1021, "step": 20420 }, { "epoch": 98.22115384615384, "grad_norm": 1.8308476209640503, "learning_rate": 9.882478632478632e-07, "loss": 0.079, "step": 20430 }, { "epoch": 98.26923076923077, "grad_norm": 3.303584575653076, "learning_rate": 9.615384615384617e-07, "loss": 0.0818, "step": 20440 }, { "epoch": 98.3173076923077, "grad_norm": 3.3403000831604004, "learning_rate": 9.348290598290598e-07, "loss": 0.0896, "step": 20450 }, { "epoch": 98.36538461538461, "grad_norm": 2.505937099456787, "learning_rate": 9.081196581196582e-07, "loss": 0.09, "step": 20460 }, { "epoch": 98.41346153846153, "grad_norm": 2.0049941539764404, "learning_rate": 8.814102564102565e-07, "loss": 0.0773, "step": 20470 }, { "epoch": 98.46153846153847, "grad_norm": 1.388603925704956, "learning_rate": 8.547008547008548e-07, "loss": 0.0899, "step": 20480 }, { "epoch": 98.50961538461539, "grad_norm": 3.0991697311401367, "learning_rate": 8.279914529914531e-07, "loss": 0.1077, "step": 20490 }, { "epoch": 98.5576923076923, "grad_norm": 1.1116154193878174, "learning_rate": 8.012820512820512e-07, "loss": 0.0689, "step": 20500 }, { "epoch": 98.60576923076923, "grad_norm": 2.1255810260772705, "learning_rate": 7.745726495726496e-07, "loss": 0.0821, "step": 20510 }, { "epoch": 98.65384615384616, "grad_norm": 2.537010431289673, "learning_rate": 7.47863247863248e-07, "loss": 0.0918, "step": 20520 }, { "epoch": 98.70192307692308, "grad_norm": 3.2583508491516113, "learning_rate": 7.211538461538462e-07, "loss": 0.0923, "step": 20530 }, { "epoch": 98.75, "grad_norm": 3.071305751800537, "learning_rate": 6.944444444444445e-07, "loss": 0.0849, "step": 20540 }, { "epoch": 98.79807692307692, "grad_norm": 3.134579658508301, "learning_rate": 6.677350427350427e-07, "loss": 0.0831, "step": 20550 }, { "epoch": 98.84615384615384, "grad_norm": 1.6228570938110352, "learning_rate": 6.41025641025641e-07, "loss": 0.08, "step": 20560 }, { "epoch": 98.89423076923077, "grad_norm": 2.2970974445343018, "learning_rate": 6.143162393162394e-07, "loss": 0.0839, "step": 20570 }, { "epoch": 98.9423076923077, "grad_norm": 1.9471375942230225, "learning_rate": 5.876068376068376e-07, "loss": 0.0955, "step": 20580 }, { "epoch": 98.99038461538461, "grad_norm": 3.664748191833496, "learning_rate": 5.60897435897436e-07, "loss": 0.0909, "step": 20590 }, { "epoch": 99.0, "eval_accuracy": 0.9166827089670042, "eval_loss": 0.5375308990478516, "eval_runtime": 78.4451, "eval_samples_per_second": 331.098, "eval_steps_per_second": 5.176, "step": 20592 }, { "epoch": 99.03846153846153, "grad_norm": 2.618673324584961, "learning_rate": 5.341880341880342e-07, "loss": 0.0964, "step": 20600 }, { "epoch": 99.08653846153847, "grad_norm": 3.6571216583251953, "learning_rate": 5.074786324786325e-07, "loss": 0.0913, "step": 20610 }, { "epoch": 99.13461538461539, "grad_norm": 4.818175315856934, "learning_rate": 4.807692307692308e-07, "loss": 0.0886, "step": 20620 }, { "epoch": 99.1826923076923, "grad_norm": 1.8623826503753662, "learning_rate": 4.540598290598291e-07, "loss": 0.0871, "step": 20630 }, { "epoch": 99.23076923076923, "grad_norm": 1.3203959465026855, "learning_rate": 4.273504273504274e-07, "loss": 0.0875, "step": 20640 }, { "epoch": 99.27884615384616, "grad_norm": 1.7559832334518433, "learning_rate": 4.006410256410256e-07, "loss": 0.0916, "step": 20650 }, { "epoch": 99.32692307692308, "grad_norm": 1.8870049715042114, "learning_rate": 3.73931623931624e-07, "loss": 0.0795, "step": 20660 }, { "epoch": 99.375, "grad_norm": 3.046023368835449, "learning_rate": 3.4722222222222224e-07, "loss": 0.0888, "step": 20670 }, { "epoch": 99.42307692307692, "grad_norm": 2.357982635498047, "learning_rate": 3.205128205128205e-07, "loss": 0.0844, "step": 20680 }, { "epoch": 99.47115384615384, "grad_norm": 2.698892593383789, "learning_rate": 2.938034188034188e-07, "loss": 0.0779, "step": 20690 }, { "epoch": 99.51923076923077, "grad_norm": 2.3253889083862305, "learning_rate": 2.670940170940171e-07, "loss": 0.0914, "step": 20700 }, { "epoch": 99.5673076923077, "grad_norm": 3.250081777572632, "learning_rate": 2.403846153846154e-07, "loss": 0.0828, "step": 20710 }, { "epoch": 99.61538461538461, "grad_norm": 2.032201051712036, "learning_rate": 2.136752136752137e-07, "loss": 0.0813, "step": 20720 }, { "epoch": 99.66346153846153, "grad_norm": 2.525221347808838, "learning_rate": 1.86965811965812e-07, "loss": 0.0853, "step": 20730 }, { "epoch": 99.71153846153847, "grad_norm": 3.1810877323150635, "learning_rate": 1.6025641025641025e-07, "loss": 0.083, "step": 20740 }, { "epoch": 99.75961538461539, "grad_norm": 1.958433747291565, "learning_rate": 1.3354700854700856e-07, "loss": 0.0865, "step": 20750 }, { "epoch": 99.8076923076923, "grad_norm": 2.332122564315796, "learning_rate": 1.0683760683760685e-07, "loss": 0.0979, "step": 20760 }, { "epoch": 99.85576923076923, "grad_norm": 3.1678478717803955, "learning_rate": 8.012820512820512e-08, "loss": 0.0905, "step": 20770 }, { "epoch": 99.90384615384616, "grad_norm": 3.3074278831481934, "learning_rate": 5.341880341880343e-08, "loss": 0.088, "step": 20780 }, { "epoch": 99.95192307692308, "grad_norm": 2.6984152793884277, "learning_rate": 2.6709401709401713e-08, "loss": 0.0837, "step": 20790 }, { "epoch": 100.0, "grad_norm": 2.783062219619751, "learning_rate": 0.0, "loss": 0.0861, "step": 20800 }, { "epoch": 100.0, "eval_accuracy": 0.916605705925384, "eval_loss": 0.5371515154838562, "eval_runtime": 78.4073, "eval_samples_per_second": 331.257, "eval_steps_per_second": 5.178, "step": 20800 }, { "epoch": 100.0, "step": 20800, "total_flos": 1.041625401589334e+20, "train_loss": 0.14594437316060066, "train_runtime": 46903.0603, "train_samples_per_second": 113.485, "train_steps_per_second": 0.443 } ], "logging_steps": 10, "max_steps": 20800, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.041625401589334e+20, "train_batch_size": 64, "trial_name": null, "trial_params": null }