diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9645 @@ +{ + "best_metric": 0.8666666666666667, + "best_model_checkpoint": "CTMAE-P2-V5-3g-S5/checkpoint-3654", + "epoch": 49.02, + "eval_steps": 500, + "global_step": 13050, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007662835249042146, + "grad_norm": 6.515963554382324, + "learning_rate": 7.662835249042146e-08, + "loss": 0.7348, + "step": 10 + }, + { + "epoch": 0.0015325670498084292, + "grad_norm": 6.009317874908447, + "learning_rate": 1.5325670498084292e-07, + "loss": 0.8031, + "step": 20 + }, + { + "epoch": 0.0022988505747126436, + "grad_norm": 4.49639892578125, + "learning_rate": 2.2988505747126437e-07, + "loss": 0.71, + "step": 30 + }, + { + "epoch": 0.0030651340996168583, + "grad_norm": 5.10139799118042, + "learning_rate": 3.0651340996168583e-07, + "loss": 0.7344, + "step": 40 + }, + { + "epoch": 0.0038314176245210726, + "grad_norm": 6.141551971435547, + "learning_rate": 3.831417624521073e-07, + "loss": 0.7151, + "step": 50 + }, + { + "epoch": 0.004597701149425287, + "grad_norm": 4.640506267547607, + "learning_rate": 4.5977011494252875e-07, + "loss": 0.672, + "step": 60 + }, + { + "epoch": 0.0053639846743295016, + "grad_norm": 6.34081506729126, + "learning_rate": 5.363984674329502e-07, + "loss": 0.779, + "step": 70 + }, + { + "epoch": 0.006130268199233717, + "grad_norm": 5.613821983337402, + "learning_rate": 6.130268199233717e-07, + "loss": 0.7196, + "step": 80 + }, + { + "epoch": 0.006896551724137931, + "grad_norm": 6.071259021759033, + "learning_rate": 6.896551724137931e-07, + "loss": 0.7607, + "step": 90 + }, + { + "epoch": 0.007662835249042145, + "grad_norm": 5.040432453155518, + "learning_rate": 7.662835249042146e-07, + "loss": 0.6946, + "step": 100 + }, + { + "epoch": 0.00842911877394636, + "grad_norm": 5.427196502685547, + "learning_rate": 8.429118773946361e-07, + "loss": 0.7196, + "step": 110 + }, + { + "epoch": 0.009195402298850575, + "grad_norm": 5.190512180328369, + "learning_rate": 9.195402298850575e-07, + "loss": 0.7041, + "step": 120 + }, + { + "epoch": 0.00996168582375479, + "grad_norm": 4.816593647003174, + "learning_rate": 9.96168582375479e-07, + "loss": 0.6509, + "step": 130 + }, + { + "epoch": 0.010727969348659003, + "grad_norm": 4.155104637145996, + "learning_rate": 1.0727969348659004e-06, + "loss": 0.7013, + "step": 140 + }, + { + "epoch": 0.011494252873563218, + "grad_norm": 6.712978363037109, + "learning_rate": 1.1494252873563219e-06, + "loss": 0.6218, + "step": 150 + }, + { + "epoch": 0.012260536398467433, + "grad_norm": 7.192935466766357, + "learning_rate": 1.2260536398467433e-06, + "loss": 0.6437, + "step": 160 + }, + { + "epoch": 0.013026819923371647, + "grad_norm": 3.722414493560791, + "learning_rate": 1.3026819923371648e-06, + "loss": 0.6214, + "step": 170 + }, + { + "epoch": 0.013793103448275862, + "grad_norm": 5.249204635620117, + "learning_rate": 1.3793103448275862e-06, + "loss": 0.6217, + "step": 180 + }, + { + "epoch": 0.014559386973180077, + "grad_norm": 3.773120880126953, + "learning_rate": 1.455938697318008e-06, + "loss": 0.714, + "step": 190 + }, + { + "epoch": 0.01532567049808429, + "grad_norm": 4.748465538024902, + "learning_rate": 1.5325670498084292e-06, + "loss": 0.6316, + "step": 200 + }, + { + "epoch": 0.016091954022988506, + "grad_norm": 4.1444501876831055, + "learning_rate": 1.6091954022988506e-06, + "loss": 0.7874, + "step": 210 + }, + { + "epoch": 0.01685823754789272, + "grad_norm": 5.518341064453125, + "learning_rate": 1.6858237547892723e-06, + "loss": 0.8583, + "step": 220 + }, + { + "epoch": 0.017624521072796936, + "grad_norm": 5.157224655151367, + "learning_rate": 1.7624521072796935e-06, + "loss": 0.3741, + "step": 230 + }, + { + "epoch": 0.01839080459770115, + "grad_norm": 3.7469072341918945, + "learning_rate": 1.839080459770115e-06, + "loss": 1.2018, + "step": 240 + }, + { + "epoch": 0.019157088122605363, + "grad_norm": 2.4285781383514404, + "learning_rate": 1.9157088122605367e-06, + "loss": 1.0356, + "step": 250 + }, + { + "epoch": 0.01992337164750958, + "grad_norm": 3.1459391117095947, + "learning_rate": 1.992337164750958e-06, + "loss": 1.8752, + "step": 260 + }, + { + "epoch": 0.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 2.000931978225708, + "eval_runtime": 17.5306, + "eval_samples_per_second": 2.567, + "eval_steps_per_second": 2.567, + "step": 261 + }, + { + "epoch": 1.0006896551724138, + "grad_norm": 1.068664312362671, + "learning_rate": 2.0689655172413796e-06, + "loss": 0.8689, + "step": 270 + }, + { + "epoch": 1.001455938697318, + "grad_norm": 55.32078552246094, + "learning_rate": 2.145593869731801e-06, + "loss": 2.2527, + "step": 280 + }, + { + "epoch": 1.0022222222222221, + "grad_norm": 54.1043701171875, + "learning_rate": 2.222222222222222e-06, + "loss": 1.395, + "step": 290 + }, + { + "epoch": 1.0029885057471264, + "grad_norm": 0.6383324265480042, + "learning_rate": 2.2988505747126437e-06, + "loss": 1.3467, + "step": 300 + }, + { + "epoch": 1.0037547892720307, + "grad_norm": 1.1302201747894287, + "learning_rate": 2.3754789272030654e-06, + "loss": 3.2186, + "step": 310 + }, + { + "epoch": 1.004521072796935, + "grad_norm": 1.0692795515060425, + "learning_rate": 2.4521072796934867e-06, + "loss": 1.545, + "step": 320 + }, + { + "epoch": 1.0052873563218392, + "grad_norm": 1.788543462753296, + "learning_rate": 2.5287356321839083e-06, + "loss": 1.4408, + "step": 330 + }, + { + "epoch": 1.0060536398467432, + "grad_norm": 0.7168189287185669, + "learning_rate": 2.6053639846743296e-06, + "loss": 1.6561, + "step": 340 + }, + { + "epoch": 1.0068199233716475, + "grad_norm": 1.9753859043121338, + "learning_rate": 2.6819923371647512e-06, + "loss": 1.9563, + "step": 350 + }, + { + "epoch": 1.0075862068965518, + "grad_norm": 39.59136199951172, + "learning_rate": 2.7586206896551725e-06, + "loss": 1.5893, + "step": 360 + }, + { + "epoch": 1.008352490421456, + "grad_norm": 40.53175354003906, + "learning_rate": 2.835249042145594e-06, + "loss": 2.2763, + "step": 370 + }, + { + "epoch": 1.00911877394636, + "grad_norm": 0.5290699005126953, + "learning_rate": 2.911877394636016e-06, + "loss": 0.4305, + "step": 380 + }, + { + "epoch": 1.0098850574712643, + "grad_norm": 34.556400299072266, + "learning_rate": 2.988505747126437e-06, + "loss": 2.1726, + "step": 390 + }, + { + "epoch": 1.0106513409961686, + "grad_norm": 0.360198050737381, + "learning_rate": 3.0651340996168583e-06, + "loss": 0.4444, + "step": 400 + }, + { + "epoch": 1.0114176245210729, + "grad_norm": 37.28133773803711, + "learning_rate": 3.14176245210728e-06, + "loss": 1.5316, + "step": 410 + }, + { + "epoch": 1.012183908045977, + "grad_norm": 0.7995922565460205, + "learning_rate": 3.2183908045977012e-06, + "loss": 1.832, + "step": 420 + }, + { + "epoch": 1.0129501915708812, + "grad_norm": 0.4065437614917755, + "learning_rate": 3.295019157088123e-06, + "loss": 1.2957, + "step": 430 + }, + { + "epoch": 1.0137164750957854, + "grad_norm": 0.4343510866165161, + "learning_rate": 3.3716475095785446e-06, + "loss": 1.3974, + "step": 440 + }, + { + "epoch": 1.0144827586206897, + "grad_norm": 0.17931339144706726, + "learning_rate": 3.448275862068966e-06, + "loss": 0.009, + "step": 450 + }, + { + "epoch": 1.015249042145594, + "grad_norm": 38.98386001586914, + "learning_rate": 3.524904214559387e-06, + "loss": 2.1494, + "step": 460 + }, + { + "epoch": 1.016015325670498, + "grad_norm": 0.6379644870758057, + "learning_rate": 3.6015325670498087e-06, + "loss": 1.4487, + "step": 470 + }, + { + "epoch": 1.0167816091954023, + "grad_norm": 35.66837692260742, + "learning_rate": 3.67816091954023e-06, + "loss": 2.0168, + "step": 480 + }, + { + "epoch": 1.0175478927203065, + "grad_norm": 1.7801227569580078, + "learning_rate": 3.7547892720306517e-06, + "loss": 1.7985, + "step": 490 + }, + { + "epoch": 1.0183141762452108, + "grad_norm": 0.5001053810119629, + "learning_rate": 3.831417624521073e-06, + "loss": 0.3404, + "step": 500 + }, + { + "epoch": 1.0190804597701149, + "grad_norm": 0.29768675565719604, + "learning_rate": 3.908045977011495e-06, + "loss": 0.901, + "step": 510 + }, + { + "epoch": 1.0198467432950191, + "grad_norm": 0.17741112411022186, + "learning_rate": 3.984674329501916e-06, + "loss": 0.5365, + "step": 520 + }, + { + "epoch": 1.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 2.8415279388427734, + "eval_runtime": 16.3662, + "eval_samples_per_second": 2.75, + "eval_steps_per_second": 2.75, + "step": 522 + }, + { + "epoch": 2.0006130268199236, + "grad_norm": 69.17105102539062, + "learning_rate": 4.0613026819923375e-06, + "loss": 2.5607, + "step": 530 + }, + { + "epoch": 2.0013793103448276, + "grad_norm": 24.83037567138672, + "learning_rate": 4.137931034482759e-06, + "loss": 2.1662, + "step": 540 + }, + { + "epoch": 2.0021455938697317, + "grad_norm": 31.966571807861328, + "learning_rate": 4.214559386973181e-06, + "loss": 0.6009, + "step": 550 + }, + { + "epoch": 2.002911877394636, + "grad_norm": 42.25388717651367, + "learning_rate": 4.291187739463602e-06, + "loss": 1.5873, + "step": 560 + }, + { + "epoch": 2.00367816091954, + "grad_norm": 0.9124504327774048, + "learning_rate": 4.367816091954023e-06, + "loss": 1.5907, + "step": 570 + }, + { + "epoch": 2.0044444444444443, + "grad_norm": 0.7654790878295898, + "learning_rate": 4.444444444444444e-06, + "loss": 1.3789, + "step": 580 + }, + { + "epoch": 2.0052107279693487, + "grad_norm": 0.4567537307739258, + "learning_rate": 4.521072796934866e-06, + "loss": 1.3995, + "step": 590 + }, + { + "epoch": 2.005977011494253, + "grad_norm": 0.9925104975700378, + "learning_rate": 4.5977011494252875e-06, + "loss": 1.995, + "step": 600 + }, + { + "epoch": 2.0067432950191573, + "grad_norm": 0.2815808653831482, + "learning_rate": 4.674329501915709e-06, + "loss": 0.3418, + "step": 610 + }, + { + "epoch": 2.0075095785440613, + "grad_norm": 0.21866410970687866, + "learning_rate": 4.750957854406131e-06, + "loss": 1.5961, + "step": 620 + }, + { + "epoch": 2.0082758620689654, + "grad_norm": 0.11550973355770111, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.0053, + "step": 630 + }, + { + "epoch": 2.00904214559387, + "grad_norm": 31.14862823486328, + "learning_rate": 4.904214559386973e-06, + "loss": 2.2064, + "step": 640 + }, + { + "epoch": 2.009808429118774, + "grad_norm": 36.643001556396484, + "learning_rate": 4.980842911877395e-06, + "loss": 1.3828, + "step": 650 + }, + { + "epoch": 2.0105747126436784, + "grad_norm": 0.2615107297897339, + "learning_rate": 5.057471264367817e-06, + "loss": 0.4727, + "step": 660 + }, + { + "epoch": 2.0113409961685824, + "grad_norm": 28.37118148803711, + "learning_rate": 5.134099616858238e-06, + "loss": 0.9614, + "step": 670 + }, + { + "epoch": 2.0121072796934865, + "grad_norm": 30.042430877685547, + "learning_rate": 5.210727969348659e-06, + "loss": 2.7478, + "step": 680 + }, + { + "epoch": 2.012873563218391, + "grad_norm": 0.43138423562049866, + "learning_rate": 5.287356321839081e-06, + "loss": 0.3899, + "step": 690 + }, + { + "epoch": 2.013639846743295, + "grad_norm": 26.292491912841797, + "learning_rate": 5.3639846743295025e-06, + "loss": 1.8188, + "step": 700 + }, + { + "epoch": 2.014406130268199, + "grad_norm": 0.4923742115497589, + "learning_rate": 5.440613026819924e-06, + "loss": 1.3005, + "step": 710 + }, + { + "epoch": 2.0151724137931035, + "grad_norm": 0.6332253217697144, + "learning_rate": 5.517241379310345e-06, + "loss": 2.3829, + "step": 720 + }, + { + "epoch": 2.0159386973180076, + "grad_norm": 22.945175170898438, + "learning_rate": 5.593869731800766e-06, + "loss": 1.7472, + "step": 730 + }, + { + "epoch": 2.016704980842912, + "grad_norm": 1.0190438032150269, + "learning_rate": 5.670498084291188e-06, + "loss": 0.9494, + "step": 740 + }, + { + "epoch": 2.017471264367816, + "grad_norm": 23.87836265563965, + "learning_rate": 5.747126436781609e-06, + "loss": 1.3298, + "step": 750 + }, + { + "epoch": 2.01823754789272, + "grad_norm": 0.3306765854358673, + "learning_rate": 5.823754789272032e-06, + "loss": 0.7995, + "step": 760 + }, + { + "epoch": 2.0190038314176246, + "grad_norm": 25.512117385864258, + "learning_rate": 5.9003831417624525e-06, + "loss": 1.3456, + "step": 770 + }, + { + "epoch": 2.0197701149425287, + "grad_norm": 0.47690722346305847, + "learning_rate": 5.977011494252874e-06, + "loss": 1.8181, + "step": 780 + }, + { + "epoch": 2.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 2.1330909729003906, + "eval_runtime": 15.2884, + "eval_samples_per_second": 2.943, + "eval_steps_per_second": 2.943, + "step": 783 + }, + { + "epoch": 3.000536398467433, + "grad_norm": 22.61998176574707, + "learning_rate": 6.053639846743296e-06, + "loss": 2.2133, + "step": 790 + }, + { + "epoch": 3.001302681992337, + "grad_norm": 0.9248330593109131, + "learning_rate": 6.130268199233717e-06, + "loss": 0.6473, + "step": 800 + }, + { + "epoch": 3.0020689655172412, + "grad_norm": 0.3790869116783142, + "learning_rate": 6.206896551724138e-06, + "loss": 1.1646, + "step": 810 + }, + { + "epoch": 3.0028352490421457, + "grad_norm": 0.31349891424179077, + "learning_rate": 6.28352490421456e-06, + "loss": 1.3614, + "step": 820 + }, + { + "epoch": 3.0036015325670498, + "grad_norm": 0.4672112464904785, + "learning_rate": 6.360153256704982e-06, + "loss": 1.2955, + "step": 830 + }, + { + "epoch": 3.004367816091954, + "grad_norm": 24.16468048095703, + "learning_rate": 6.4367816091954025e-06, + "loss": 0.8488, + "step": 840 + }, + { + "epoch": 3.0051340996168583, + "grad_norm": 0.5347825288772583, + "learning_rate": 6.513409961685824e-06, + "loss": 1.7289, + "step": 850 + }, + { + "epoch": 3.0059003831417623, + "grad_norm": 0.4466633200645447, + "learning_rate": 6.590038314176246e-06, + "loss": 0.7971, + "step": 860 + }, + { + "epoch": 3.006666666666667, + "grad_norm": 0.29940515756607056, + "learning_rate": 6.666666666666667e-06, + "loss": 1.2878, + "step": 870 + }, + { + "epoch": 3.007432950191571, + "grad_norm": 0.1792927086353302, + "learning_rate": 6.743295019157089e-06, + "loss": 0.0103, + "step": 880 + }, + { + "epoch": 3.008199233716475, + "grad_norm": 24.740402221679688, + "learning_rate": 6.81992337164751e-06, + "loss": 1.9771, + "step": 890 + }, + { + "epoch": 3.0089655172413794, + "grad_norm": 0.5116446018218994, + "learning_rate": 6.896551724137932e-06, + "loss": 1.3359, + "step": 900 + }, + { + "epoch": 3.0097318007662834, + "grad_norm": 0.3623947203159332, + "learning_rate": 6.973180076628353e-06, + "loss": 1.6457, + "step": 910 + }, + { + "epoch": 3.010498084291188, + "grad_norm": 19.105806350708008, + "learning_rate": 7.049808429118774e-06, + "loss": 2.4007, + "step": 920 + }, + { + "epoch": 3.011264367816092, + "grad_norm": 0.9638222455978394, + "learning_rate": 7.126436781609196e-06, + "loss": 0.3213, + "step": 930 + }, + { + "epoch": 3.012030651340996, + "grad_norm": 22.83616065979004, + "learning_rate": 7.2030651340996175e-06, + "loss": 1.9054, + "step": 940 + }, + { + "epoch": 3.0127969348659005, + "grad_norm": 0.5195755958557129, + "learning_rate": 7.279693486590039e-06, + "loss": 1.1278, + "step": 950 + }, + { + "epoch": 3.0135632183908045, + "grad_norm": 21.764245986938477, + "learning_rate": 7.35632183908046e-06, + "loss": 1.1217, + "step": 960 + }, + { + "epoch": 3.014329501915709, + "grad_norm": 20.36184310913086, + "learning_rate": 7.4329501915708825e-06, + "loss": 2.2718, + "step": 970 + }, + { + "epoch": 3.015095785440613, + "grad_norm": 0.6821834444999695, + "learning_rate": 7.509578544061303e-06, + "loss": 0.7317, + "step": 980 + }, + { + "epoch": 3.015862068965517, + "grad_norm": 0.6219151616096497, + "learning_rate": 7.586206896551724e-06, + "loss": 1.427, + "step": 990 + }, + { + "epoch": 3.0166283524904216, + "grad_norm": 0.6644460558891296, + "learning_rate": 7.662835249042147e-06, + "loss": 1.0886, + "step": 1000 + }, + { + "epoch": 3.0173946360153256, + "grad_norm": 22.19208335876465, + "learning_rate": 7.739463601532567e-06, + "loss": 2.3383, + "step": 1010 + }, + { + "epoch": 3.0181609195402297, + "grad_norm": 1.41365647315979, + "learning_rate": 7.81609195402299e-06, + "loss": 1.5415, + "step": 1020 + }, + { + "epoch": 3.018927203065134, + "grad_norm": 1.0619806051254272, + "learning_rate": 7.89272030651341e-06, + "loss": 0.8251, + "step": 1030 + }, + { + "epoch": 3.0196934865900382, + "grad_norm": 0.8489217758178711, + "learning_rate": 7.969348659003832e-06, + "loss": 0.6674, + "step": 1040 + }, + { + "epoch": 3.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 2.5702672004699707, + "eval_runtime": 15.9903, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 2.814, + "step": 1044 + }, + { + "epoch": 4.000459770114943, + "grad_norm": 0.16821230947971344, + "learning_rate": 8.045977011494253e-06, + "loss": 0.9716, + "step": 1050 + }, + { + "epoch": 4.001226053639847, + "grad_norm": 29.10614013671875, + "learning_rate": 8.122605363984675e-06, + "loss": 0.5545, + "step": 1060 + }, + { + "epoch": 4.001992337164751, + "grad_norm": 28.9191951751709, + "learning_rate": 8.199233716475097e-06, + "loss": 1.0854, + "step": 1070 + }, + { + "epoch": 4.002758620689655, + "grad_norm": 0.3250425457954407, + "learning_rate": 8.275862068965518e-06, + "loss": 2.2773, + "step": 1080 + }, + { + "epoch": 4.00352490421456, + "grad_norm": 0.6510913968086243, + "learning_rate": 8.35249042145594e-06, + "loss": 0.7474, + "step": 1090 + }, + { + "epoch": 4.004291187739463, + "grad_norm": 23.383602142333984, + "learning_rate": 8.429118773946362e-06, + "loss": 1.6119, + "step": 1100 + }, + { + "epoch": 4.005057471264368, + "grad_norm": 18.749574661254883, + "learning_rate": 8.505747126436782e-06, + "loss": 1.812, + "step": 1110 + }, + { + "epoch": 4.005823754789272, + "grad_norm": 0.4811108112335205, + "learning_rate": 8.582375478927203e-06, + "loss": 1.2609, + "step": 1120 + }, + { + "epoch": 4.006590038314176, + "grad_norm": 2.357823133468628, + "learning_rate": 8.659003831417625e-06, + "loss": 1.449, + "step": 1130 + }, + { + "epoch": 4.00735632183908, + "grad_norm": 22.41640853881836, + "learning_rate": 8.735632183908047e-06, + "loss": 1.4203, + "step": 1140 + }, + { + "epoch": 4.008122605363985, + "grad_norm": 0.318185031414032, + "learning_rate": 8.812260536398468e-06, + "loss": 0.717, + "step": 1150 + }, + { + "epoch": 4.0088888888888885, + "grad_norm": 0.09603653103113174, + "learning_rate": 8.888888888888888e-06, + "loss": 0.9936, + "step": 1160 + }, + { + "epoch": 4.009655172413793, + "grad_norm": 0.7078070640563965, + "learning_rate": 8.965517241379312e-06, + "loss": 2.4035, + "step": 1170 + }, + { + "epoch": 4.0104214559386975, + "grad_norm": 1.1963130235671997, + "learning_rate": 9.042145593869732e-06, + "loss": 1.2541, + "step": 1180 + }, + { + "epoch": 4.011187739463602, + "grad_norm": 0.7769260406494141, + "learning_rate": 9.118773946360155e-06, + "loss": 1.5987, + "step": 1190 + }, + { + "epoch": 4.011954022988506, + "grad_norm": 1.0503462553024292, + "learning_rate": 9.195402298850575e-06, + "loss": 0.9641, + "step": 1200 + }, + { + "epoch": 4.01272030651341, + "grad_norm": 0.12320452928543091, + "learning_rate": 9.272030651340997e-06, + "loss": 0.397, + "step": 1210 + }, + { + "epoch": 4.0134865900383145, + "grad_norm": 0.14361780881881714, + "learning_rate": 9.348659003831418e-06, + "loss": 0.9065, + "step": 1220 + }, + { + "epoch": 4.014252873563218, + "grad_norm": 0.17569315433502197, + "learning_rate": 9.42528735632184e-06, + "loss": 2.1919, + "step": 1230 + }, + { + "epoch": 4.015019157088123, + "grad_norm": 0.5088914632797241, + "learning_rate": 9.501915708812262e-06, + "loss": 1.5856, + "step": 1240 + }, + { + "epoch": 4.015785440613027, + "grad_norm": 100.4439468383789, + "learning_rate": 9.578544061302683e-06, + "loss": 1.0188, + "step": 1250 + }, + { + "epoch": 4.016551724137931, + "grad_norm": 0.5644910335540771, + "learning_rate": 9.655172413793105e-06, + "loss": 1.995, + "step": 1260 + }, + { + "epoch": 4.017318007662835, + "grad_norm": 0.18410761654376984, + "learning_rate": 9.731800766283525e-06, + "loss": 0.4526, + "step": 1270 + }, + { + "epoch": 4.01808429118774, + "grad_norm": 0.1986103355884552, + "learning_rate": 9.808429118773947e-06, + "loss": 1.589, + "step": 1280 + }, + { + "epoch": 4.018850574712643, + "grad_norm": 0.7268558740615845, + "learning_rate": 9.885057471264368e-06, + "loss": 1.6857, + "step": 1290 + }, + { + "epoch": 4.019616858237548, + "grad_norm": 23.243221282958984, + "learning_rate": 9.96168582375479e-06, + "loss": 1.7586, + "step": 1300 + }, + { + "epoch": 4.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 1.1412569284439087, + "eval_runtime": 16.4314, + "eval_samples_per_second": 2.739, + "eval_steps_per_second": 2.739, + "step": 1305 + }, + { + "epoch": 5.000383141762452, + "grad_norm": 1.1051661968231201, + "learning_rate": 9.995742869306088e-06, + "loss": 0.8011, + "step": 1310 + }, + { + "epoch": 5.001149425287356, + "grad_norm": 0.23424309492111206, + "learning_rate": 9.987228607918263e-06, + "loss": 1.2352, + "step": 1320 + }, + { + "epoch": 5.001915708812261, + "grad_norm": 0.19838300347328186, + "learning_rate": 9.97871434653044e-06, + "loss": 0.493, + "step": 1330 + }, + { + "epoch": 5.002681992337164, + "grad_norm": 0.2764905095100403, + "learning_rate": 9.970200085142615e-06, + "loss": 1.4016, + "step": 1340 + }, + { + "epoch": 5.003448275862069, + "grad_norm": 19.335851669311523, + "learning_rate": 9.96168582375479e-06, + "loss": 2.0891, + "step": 1350 + }, + { + "epoch": 5.004214559386973, + "grad_norm": 17.96892547607422, + "learning_rate": 9.953171562366965e-06, + "loss": 1.3015, + "step": 1360 + }, + { + "epoch": 5.004980842911878, + "grad_norm": 0.40493690967559814, + "learning_rate": 9.944657300979142e-06, + "loss": 0.0353, + "step": 1370 + }, + { + "epoch": 5.005747126436781, + "grad_norm": 0.46046707034111023, + "learning_rate": 9.936143039591317e-06, + "loss": 1.6454, + "step": 1380 + }, + { + "epoch": 5.006513409961686, + "grad_norm": 19.26921844482422, + "learning_rate": 9.927628778203492e-06, + "loss": 2.6308, + "step": 1390 + }, + { + "epoch": 5.00727969348659, + "grad_norm": 0.6409273147583008, + "learning_rate": 9.919114516815667e-06, + "loss": 0.3655, + "step": 1400 + }, + { + "epoch": 5.008045977011494, + "grad_norm": 20.259307861328125, + "learning_rate": 9.910600255427842e-06, + "loss": 1.8601, + "step": 1410 + }, + { + "epoch": 5.0088122605363985, + "grad_norm": 0.2982420325279236, + "learning_rate": 9.902085994040018e-06, + "loss": 0.4097, + "step": 1420 + }, + { + "epoch": 5.009578544061303, + "grad_norm": 21.23941993713379, + "learning_rate": 9.893571732652193e-06, + "loss": 1.3039, + "step": 1430 + }, + { + "epoch": 5.010344827586207, + "grad_norm": 0.37259936332702637, + "learning_rate": 9.885057471264368e-06, + "loss": 1.6146, + "step": 1440 + }, + { + "epoch": 5.011111111111111, + "grad_norm": 0.6786749958992004, + "learning_rate": 9.876543209876543e-06, + "loss": 1.3758, + "step": 1450 + }, + { + "epoch": 5.011877394636016, + "grad_norm": 0.5649729371070862, + "learning_rate": 9.86802894848872e-06, + "loss": 0.7113, + "step": 1460 + }, + { + "epoch": 5.012643678160919, + "grad_norm": 19.882408142089844, + "learning_rate": 9.859514687100895e-06, + "loss": 1.191, + "step": 1470 + }, + { + "epoch": 5.013409961685824, + "grad_norm": 0.5207841396331787, + "learning_rate": 9.85100042571307e-06, + "loss": 1.6009, + "step": 1480 + }, + { + "epoch": 5.014176245210728, + "grad_norm": 0.5753087997436523, + "learning_rate": 9.842486164325245e-06, + "loss": 1.1187, + "step": 1490 + }, + { + "epoch": 5.014942528735633, + "grad_norm": 0.377913236618042, + "learning_rate": 9.833971902937422e-06, + "loss": 1.1263, + "step": 1500 + }, + { + "epoch": 5.015708812260536, + "grad_norm": 16.884944915771484, + "learning_rate": 9.825457641549597e-06, + "loss": 1.4622, + "step": 1510 + }, + { + "epoch": 5.016475095785441, + "grad_norm": 0.6560386419296265, + "learning_rate": 9.816943380161772e-06, + "loss": 0.9987, + "step": 1520 + }, + { + "epoch": 5.017241379310345, + "grad_norm": 18.208162307739258, + "learning_rate": 9.808429118773947e-06, + "loss": 2.1104, + "step": 1530 + }, + { + "epoch": 5.018007662835249, + "grad_norm": 18.947525024414062, + "learning_rate": 9.799914857386122e-06, + "loss": 1.8238, + "step": 1540 + }, + { + "epoch": 5.018773946360153, + "grad_norm": 1.023240566253662, + "learning_rate": 9.791400595998298e-06, + "loss": 0.8421, + "step": 1550 + }, + { + "epoch": 5.019540229885058, + "grad_norm": 25.64605140686035, + "learning_rate": 9.782886334610473e-06, + "loss": 1.0731, + "step": 1560 + }, + { + "epoch": 5.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 1.8861749172210693, + "eval_runtime": 16.269, + "eval_samples_per_second": 2.766, + "eval_steps_per_second": 2.766, + "step": 1566 + }, + { + "epoch": 6.000306513409962, + "grad_norm": 18.987686157226562, + "learning_rate": 9.774372073222648e-06, + "loss": 1.5048, + "step": 1570 + }, + { + "epoch": 6.001072796934866, + "grad_norm": 19.56184196472168, + "learning_rate": 9.765857811834825e-06, + "loss": 1.072, + "step": 1580 + }, + { + "epoch": 6.00183908045977, + "grad_norm": 0.44697844982147217, + "learning_rate": 9.757343550447e-06, + "loss": 0.7249, + "step": 1590 + }, + { + "epoch": 6.002605363984674, + "grad_norm": 0.06227334588766098, + "learning_rate": 9.748829289059175e-06, + "loss": 1.42, + "step": 1600 + }, + { + "epoch": 6.003371647509579, + "grad_norm": 0.5351743102073669, + "learning_rate": 9.74031502767135e-06, + "loss": 1.7496, + "step": 1610 + }, + { + "epoch": 6.0041379310344825, + "grad_norm": 0.6440963745117188, + "learning_rate": 9.731800766283525e-06, + "loss": 1.095, + "step": 1620 + }, + { + "epoch": 6.004904214559387, + "grad_norm": 28.36171531677246, + "learning_rate": 9.723286504895702e-06, + "loss": 1.7879, + "step": 1630 + }, + { + "epoch": 6.005670498084291, + "grad_norm": 0.16215454041957855, + "learning_rate": 9.714772243507877e-06, + "loss": 0.0226, + "step": 1640 + }, + { + "epoch": 6.006436781609195, + "grad_norm": 34.247615814208984, + "learning_rate": 9.706257982120052e-06, + "loss": 1.473, + "step": 1650 + }, + { + "epoch": 6.0072030651340995, + "grad_norm": 28.649755477905273, + "learning_rate": 9.697743720732228e-06, + "loss": 0.919, + "step": 1660 + }, + { + "epoch": 6.007969348659004, + "grad_norm": 18.813629150390625, + "learning_rate": 9.689229459344403e-06, + "loss": 2.428, + "step": 1670 + }, + { + "epoch": 6.008735632183908, + "grad_norm": 1.3150634765625, + "learning_rate": 9.680715197956578e-06, + "loss": 0.8913, + "step": 1680 + }, + { + "epoch": 6.009501915708812, + "grad_norm": 0.3356521427631378, + "learning_rate": 9.672200936568753e-06, + "loss": 0.9907, + "step": 1690 + }, + { + "epoch": 6.010268199233717, + "grad_norm": 17.486032485961914, + "learning_rate": 9.663686675180928e-06, + "loss": 1.6711, + "step": 1700 + }, + { + "epoch": 6.011034482758621, + "grad_norm": 1.6453771591186523, + "learning_rate": 9.655172413793105e-06, + "loss": 1.6673, + "step": 1710 + }, + { + "epoch": 6.011800766283525, + "grad_norm": 35.23017883300781, + "learning_rate": 9.64665815240528e-06, + "loss": 1.1546, + "step": 1720 + }, + { + "epoch": 6.012567049808429, + "grad_norm": 1.0530450344085693, + "learning_rate": 9.638143891017455e-06, + "loss": 0.0673, + "step": 1730 + }, + { + "epoch": 6.013333333333334, + "grad_norm": 44.10451126098633, + "learning_rate": 9.62962962962963e-06, + "loss": 1.704, + "step": 1740 + }, + { + "epoch": 6.014099616858237, + "grad_norm": 24.64010238647461, + "learning_rate": 9.621115368241805e-06, + "loss": 1.9377, + "step": 1750 + }, + { + "epoch": 6.014865900383142, + "grad_norm": 0.5813682675361633, + "learning_rate": 9.612601106853982e-06, + "loss": 0.7754, + "step": 1760 + }, + { + "epoch": 6.015632183908046, + "grad_norm": 0.3799228072166443, + "learning_rate": 9.604086845466157e-06, + "loss": 0.3575, + "step": 1770 + }, + { + "epoch": 6.01639846743295, + "grad_norm": 58.88032913208008, + "learning_rate": 9.595572584078332e-06, + "loss": 1.7625, + "step": 1780 + }, + { + "epoch": 6.017164750957854, + "grad_norm": 53.162986755371094, + "learning_rate": 9.587058322690508e-06, + "loss": 2.6159, + "step": 1790 + }, + { + "epoch": 6.017931034482759, + "grad_norm": 17.41657829284668, + "learning_rate": 9.578544061302683e-06, + "loss": 1.7754, + "step": 1800 + }, + { + "epoch": 6.018697318007663, + "grad_norm": 1.2673225402832031, + "learning_rate": 9.570029799914858e-06, + "loss": 0.739, + "step": 1810 + }, + { + "epoch": 6.019463601532567, + "grad_norm": 30.341552734375, + "learning_rate": 9.561515538527033e-06, + "loss": 1.3539, + "step": 1820 + }, + { + "epoch": 6.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 1.82771635055542, + "eval_runtime": 16.5146, + "eval_samples_per_second": 2.725, + "eval_steps_per_second": 2.725, + "step": 1827 + }, + { + "epoch": 7.000229885057471, + "grad_norm": 40.3336067199707, + "learning_rate": 9.553001277139208e-06, + "loss": 0.7449, + "step": 1830 + }, + { + "epoch": 7.000996168582375, + "grad_norm": 47.77488327026367, + "learning_rate": 9.544487015751385e-06, + "loss": 1.6668, + "step": 1840 + }, + { + "epoch": 7.00176245210728, + "grad_norm": 0.17062117159366608, + "learning_rate": 9.53597275436356e-06, + "loss": 0.6863, + "step": 1850 + }, + { + "epoch": 7.0025287356321835, + "grad_norm": 52.891422271728516, + "learning_rate": 9.527458492975735e-06, + "loss": 2.1523, + "step": 1860 + }, + { + "epoch": 7.003295019157088, + "grad_norm": 1.2075984477996826, + "learning_rate": 9.518944231587912e-06, + "loss": 0.7233, + "step": 1870 + }, + { + "epoch": 7.0040613026819925, + "grad_norm": 51.95742416381836, + "learning_rate": 9.510429970200085e-06, + "loss": 1.3763, + "step": 1880 + }, + { + "epoch": 7.004827586206897, + "grad_norm": 49.27976608276367, + "learning_rate": 9.501915708812262e-06, + "loss": 1.3568, + "step": 1890 + }, + { + "epoch": 7.0055938697318005, + "grad_norm": 14.621587753295898, + "learning_rate": 9.493401447424437e-06, + "loss": 1.8932, + "step": 1900 + }, + { + "epoch": 7.006360153256705, + "grad_norm": 2.2778844833374023, + "learning_rate": 9.484887186036612e-06, + "loss": 0.7018, + "step": 1910 + }, + { + "epoch": 7.0071264367816095, + "grad_norm": 4.356560230255127, + "learning_rate": 9.476372924648788e-06, + "loss": 1.4481, + "step": 1920 + }, + { + "epoch": 7.007892720306513, + "grad_norm": 0.10934069752693176, + "learning_rate": 9.467858663260963e-06, + "loss": 0.7275, + "step": 1930 + }, + { + "epoch": 7.008659003831418, + "grad_norm": 0.4002261459827423, + "learning_rate": 9.459344401873138e-06, + "loss": 1.648, + "step": 1940 + }, + { + "epoch": 7.009425287356322, + "grad_norm": 57.19015121459961, + "learning_rate": 9.450830140485315e-06, + "loss": 1.4453, + "step": 1950 + }, + { + "epoch": 7.010191570881226, + "grad_norm": 45.90049362182617, + "learning_rate": 9.442315879097488e-06, + "loss": 1.2843, + "step": 1960 + }, + { + "epoch": 7.01095785440613, + "grad_norm": 61.82890701293945, + "learning_rate": 9.433801617709665e-06, + "loss": 1.7412, + "step": 1970 + }, + { + "epoch": 7.011724137931035, + "grad_norm": 0.06092312932014465, + "learning_rate": 9.42528735632184e-06, + "loss": 0.2979, + "step": 1980 + }, + { + "epoch": 7.012490421455938, + "grad_norm": 46.58448028564453, + "learning_rate": 9.416773094934015e-06, + "loss": 1.9555, + "step": 1990 + }, + { + "epoch": 7.013256704980843, + "grad_norm": 0.8877707719802856, + "learning_rate": 9.408258833546192e-06, + "loss": 0.6078, + "step": 2000 + }, + { + "epoch": 7.014022988505747, + "grad_norm": 1.0596059560775757, + "learning_rate": 9.399744572158365e-06, + "loss": 0.6166, + "step": 2010 + }, + { + "epoch": 7.014789272030652, + "grad_norm": 93.8338623046875, + "learning_rate": 9.391230310770542e-06, + "loss": 1.2646, + "step": 2020 + }, + { + "epoch": 7.015555555555555, + "grad_norm": 0.6872724294662476, + "learning_rate": 9.382716049382717e-06, + "loss": 0.5737, + "step": 2030 + }, + { + "epoch": 7.01632183908046, + "grad_norm": 13.280865669250488, + "learning_rate": 9.374201787994892e-06, + "loss": 0.1844, + "step": 2040 + }, + { + "epoch": 7.017088122605364, + "grad_norm": 0.06641419976949692, + "learning_rate": 9.365687526607068e-06, + "loss": 1.1597, + "step": 2050 + }, + { + "epoch": 7.017854406130268, + "grad_norm": 0.3752352297306061, + "learning_rate": 9.357173265219243e-06, + "loss": 3.2409, + "step": 2060 + }, + { + "epoch": 7.018620689655172, + "grad_norm": 8.496101379394531, + "learning_rate": 9.348659003831418e-06, + "loss": 1.3105, + "step": 2070 + }, + { + "epoch": 7.019386973180077, + "grad_norm": 57.28456497192383, + "learning_rate": 9.340144742443595e-06, + "loss": 0.8906, + "step": 2080 + }, + { + "epoch": 7.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 1.7558737993240356, + "eval_runtime": 15.2334, + "eval_samples_per_second": 2.954, + "eval_steps_per_second": 2.954, + "step": 2088 + }, + { + "epoch": 8.00015325670498, + "grad_norm": 2.2176547050476074, + "learning_rate": 9.331630481055768e-06, + "loss": 0.786, + "step": 2090 + }, + { + "epoch": 8.000919540229885, + "grad_norm": 146.79502868652344, + "learning_rate": 9.323116219667945e-06, + "loss": 0.6585, + "step": 2100 + }, + { + "epoch": 8.001685823754789, + "grad_norm": 0.0524924211204052, + "learning_rate": 9.31460195828012e-06, + "loss": 1.4611, + "step": 2110 + }, + { + "epoch": 8.002452107279694, + "grad_norm": 0.035575300455093384, + "learning_rate": 9.306087696892295e-06, + "loss": 1.2997, + "step": 2120 + }, + { + "epoch": 8.003218390804598, + "grad_norm": 0.5172576904296875, + "learning_rate": 9.297573435504472e-06, + "loss": 2.0572, + "step": 2130 + }, + { + "epoch": 8.003984674329502, + "grad_norm": 6.22727108001709, + "learning_rate": 9.289059174116647e-06, + "loss": 0.0919, + "step": 2140 + }, + { + "epoch": 8.004750957854407, + "grad_norm": 0.8947464227676392, + "learning_rate": 9.280544912728822e-06, + "loss": 0.3653, + "step": 2150 + }, + { + "epoch": 8.00551724137931, + "grad_norm": 79.02603912353516, + "learning_rate": 9.272030651340997e-06, + "loss": 1.5212, + "step": 2160 + }, + { + "epoch": 8.006283524904214, + "grad_norm": 28.48909568786621, + "learning_rate": 9.263516389953172e-06, + "loss": 3.6352, + "step": 2170 + }, + { + "epoch": 8.00704980842912, + "grad_norm": 23.416492462158203, + "learning_rate": 9.255002128565348e-06, + "loss": 1.8813, + "step": 2180 + }, + { + "epoch": 8.007816091954023, + "grad_norm": 10.679008483886719, + "learning_rate": 9.246487867177523e-06, + "loss": 0.434, + "step": 2190 + }, + { + "epoch": 8.008582375478927, + "grad_norm": 105.9638900756836, + "learning_rate": 9.237973605789698e-06, + "loss": 0.5633, + "step": 2200 + }, + { + "epoch": 8.009348659003832, + "grad_norm": 0.22639591991901398, + "learning_rate": 9.229459344401875e-06, + "loss": 2.3839, + "step": 2210 + }, + { + "epoch": 8.010114942528736, + "grad_norm": 0.23017604649066925, + "learning_rate": 9.220945083014048e-06, + "loss": 0.7008, + "step": 2220 + }, + { + "epoch": 8.01088122605364, + "grad_norm": 127.92184448242188, + "learning_rate": 9.212430821626225e-06, + "loss": 0.949, + "step": 2230 + }, + { + "epoch": 8.011647509578545, + "grad_norm": 61.606781005859375, + "learning_rate": 9.2039165602384e-06, + "loss": 1.359, + "step": 2240 + }, + { + "epoch": 8.012413793103448, + "grad_norm": 22.344465255737305, + "learning_rate": 9.195402298850575e-06, + "loss": 0.2749, + "step": 2250 + }, + { + "epoch": 8.013180076628352, + "grad_norm": 268.2001647949219, + "learning_rate": 9.186888037462752e-06, + "loss": 1.2585, + "step": 2260 + }, + { + "epoch": 8.013946360153257, + "grad_norm": 60.801551818847656, + "learning_rate": 9.178373776074927e-06, + "loss": 1.553, + "step": 2270 + }, + { + "epoch": 8.01471264367816, + "grad_norm": 2.967214822769165, + "learning_rate": 9.169859514687102e-06, + "loss": 0.6537, + "step": 2280 + }, + { + "epoch": 8.015478927203064, + "grad_norm": 83.84700012207031, + "learning_rate": 9.161345253299277e-06, + "loss": 1.1928, + "step": 2290 + }, + { + "epoch": 8.01624521072797, + "grad_norm": 0.36373046040534973, + "learning_rate": 9.152830991911452e-06, + "loss": 2.2204, + "step": 2300 + }, + { + "epoch": 8.017011494252873, + "grad_norm": 0.08310884982347488, + "learning_rate": 9.144316730523628e-06, + "loss": 0.9423, + "step": 2310 + }, + { + "epoch": 8.017777777777777, + "grad_norm": 126.53848266601562, + "learning_rate": 9.135802469135803e-06, + "loss": 1.4315, + "step": 2320 + }, + { + "epoch": 8.018544061302682, + "grad_norm": 40.52521514892578, + "learning_rate": 9.127288207747978e-06, + "loss": 0.4683, + "step": 2330 + }, + { + "epoch": 8.019310344827586, + "grad_norm": 56.70500564575195, + "learning_rate": 9.118773946360155e-06, + "loss": 1.5706, + "step": 2340 + }, + { + "epoch": 8.02, + "eval_accuracy": 0.4666666666666667, + "eval_loss": 2.0013673305511475, + "eval_runtime": 15.4307, + "eval_samples_per_second": 2.916, + "eval_steps_per_second": 2.916, + "step": 2349 + }, + { + "epoch": 9.00007662835249, + "grad_norm": 0.04681820422410965, + "learning_rate": 9.110259684972328e-06, + "loss": 0.6411, + "step": 2350 + }, + { + "epoch": 9.000842911877395, + "grad_norm": 79.75486755371094, + "learning_rate": 9.101745423584505e-06, + "loss": 0.5805, + "step": 2360 + }, + { + "epoch": 9.001609195402299, + "grad_norm": 59.682044982910156, + "learning_rate": 9.09323116219668e-06, + "loss": 1.9657, + "step": 2370 + }, + { + "epoch": 9.002375478927203, + "grad_norm": 25.446149826049805, + "learning_rate": 9.084716900808855e-06, + "loss": 0.1048, + "step": 2380 + }, + { + "epoch": 9.003141762452108, + "grad_norm": 0.19314312934875488, + "learning_rate": 9.076202639421032e-06, + "loss": 1.7946, + "step": 2390 + }, + { + "epoch": 9.003908045977012, + "grad_norm": 46.27597427368164, + "learning_rate": 9.067688378033207e-06, + "loss": 2.5579, + "step": 2400 + }, + { + "epoch": 9.004674329501915, + "grad_norm": 51.210208892822266, + "learning_rate": 9.059174116645382e-06, + "loss": 0.7125, + "step": 2410 + }, + { + "epoch": 9.00544061302682, + "grad_norm": 10.87521743774414, + "learning_rate": 9.050659855257558e-06, + "loss": 0.6746, + "step": 2420 + }, + { + "epoch": 9.006206896551724, + "grad_norm": 13.82815933227539, + "learning_rate": 9.042145593869732e-06, + "loss": 1.0982, + "step": 2430 + }, + { + "epoch": 9.006973180076628, + "grad_norm": 34.410831451416016, + "learning_rate": 9.033631332481908e-06, + "loss": 1.1018, + "step": 2440 + }, + { + "epoch": 9.007739463601533, + "grad_norm": 0.4074358642101288, + "learning_rate": 9.025117071094083e-06, + "loss": 1.4791, + "step": 2450 + }, + { + "epoch": 9.008505747126437, + "grad_norm": 34.35459518432617, + "learning_rate": 9.016602809706258e-06, + "loss": 2.391, + "step": 2460 + }, + { + "epoch": 9.00927203065134, + "grad_norm": 9.86334228515625, + "learning_rate": 9.008088548318435e-06, + "loss": 0.5826, + "step": 2470 + }, + { + "epoch": 9.010038314176246, + "grad_norm": 0.3567091226577759, + "learning_rate": 8.999574286930608e-06, + "loss": 0.2535, + "step": 2480 + }, + { + "epoch": 9.01080459770115, + "grad_norm": 0.1000981479883194, + "learning_rate": 8.991060025542785e-06, + "loss": 0.7387, + "step": 2490 + }, + { + "epoch": 9.011570881226053, + "grad_norm": 0.6302473545074463, + "learning_rate": 8.98254576415496e-06, + "loss": 0.8229, + "step": 2500 + }, + { + "epoch": 9.012337164750958, + "grad_norm": 0.02157972939312458, + "learning_rate": 8.974031502767135e-06, + "loss": 0.9654, + "step": 2510 + }, + { + "epoch": 9.013103448275862, + "grad_norm": 0.21129199862480164, + "learning_rate": 8.965517241379312e-06, + "loss": 2.5648, + "step": 2520 + }, + { + "epoch": 9.013869731800765, + "grad_norm": 0.05354182422161102, + "learning_rate": 8.957002979991487e-06, + "loss": 0.1968, + "step": 2530 + }, + { + "epoch": 9.01463601532567, + "grad_norm": 7.898284435272217, + "learning_rate": 8.948488718603662e-06, + "loss": 1.5846, + "step": 2540 + }, + { + "epoch": 9.015402298850574, + "grad_norm": 0.8477972745895386, + "learning_rate": 8.939974457215838e-06, + "loss": 0.6042, + "step": 2550 + }, + { + "epoch": 9.01616858237548, + "grad_norm": 2.289872646331787, + "learning_rate": 8.931460195828012e-06, + "loss": 0.2621, + "step": 2560 + }, + { + "epoch": 9.016934865900383, + "grad_norm": 127.58092498779297, + "learning_rate": 8.922945934440188e-06, + "loss": 0.1349, + "step": 2570 + }, + { + "epoch": 9.017701149425287, + "grad_norm": 0.35964861512184143, + "learning_rate": 8.914431673052363e-06, + "loss": 0.2656, + "step": 2580 + }, + { + "epoch": 9.018467432950192, + "grad_norm": 97.85234069824219, + "learning_rate": 8.905917411664538e-06, + "loss": 1.4753, + "step": 2590 + }, + { + "epoch": 9.019233716475096, + "grad_norm": 0.021428819745779037, + "learning_rate": 8.897403150276715e-06, + "loss": 1.069, + "step": 2600 + }, + { + "epoch": 9.02, + "grad_norm": 0.36919301748275757, + "learning_rate": 8.888888888888888e-06, + "loss": 0.3113, + "step": 2610 + }, + { + "epoch": 9.02, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 1.3057535886764526, + "eval_runtime": 16.4734, + "eval_samples_per_second": 2.732, + "eval_steps_per_second": 2.732, + "step": 2610 + }, + { + "epoch": 10.000766283524904, + "grad_norm": 0.06585513800382614, + "learning_rate": 8.880374627501065e-06, + "loss": 1.5534, + "step": 2620 + }, + { + "epoch": 10.001532567049809, + "grad_norm": 1.7465332746505737, + "learning_rate": 8.87186036611324e-06, + "loss": 1.0735, + "step": 2630 + }, + { + "epoch": 10.002298850574713, + "grad_norm": 0.041456516832113266, + "learning_rate": 8.863346104725415e-06, + "loss": 0.2433, + "step": 2640 + }, + { + "epoch": 10.003065134099616, + "grad_norm": 118.21949005126953, + "learning_rate": 8.854831843337592e-06, + "loss": 0.9212, + "step": 2650 + }, + { + "epoch": 10.003831417624522, + "grad_norm": 6.035428524017334, + "learning_rate": 8.846317581949767e-06, + "loss": 0.9281, + "step": 2660 + }, + { + "epoch": 10.004597701149425, + "grad_norm": 0.02484976500272751, + "learning_rate": 8.837803320561942e-06, + "loss": 1.6067, + "step": 2670 + }, + { + "epoch": 10.005363984674329, + "grad_norm": 4.0249433517456055, + "learning_rate": 8.829289059174118e-06, + "loss": 1.5351, + "step": 2680 + }, + { + "epoch": 10.006130268199234, + "grad_norm": 0.6469050049781799, + "learning_rate": 8.820774797786292e-06, + "loss": 0.0113, + "step": 2690 + }, + { + "epoch": 10.006896551724138, + "grad_norm": 0.02827748842537403, + "learning_rate": 8.812260536398468e-06, + "loss": 0.9929, + "step": 2700 + }, + { + "epoch": 10.007662835249041, + "grad_norm": 84.25238037109375, + "learning_rate": 8.803746275010643e-06, + "loss": 2.1007, + "step": 2710 + }, + { + "epoch": 10.008429118773947, + "grad_norm": 1244.77197265625, + "learning_rate": 8.795232013622818e-06, + "loss": 1.3255, + "step": 2720 + }, + { + "epoch": 10.00919540229885, + "grad_norm": 0.021987877786159515, + "learning_rate": 8.786717752234995e-06, + "loss": 1.5886, + "step": 2730 + }, + { + "epoch": 10.009961685823756, + "grad_norm": 0.67450350522995, + "learning_rate": 8.77820349084717e-06, + "loss": 1.1008, + "step": 2740 + }, + { + "epoch": 10.01072796934866, + "grad_norm": 9.497635841369629, + "learning_rate": 8.769689229459345e-06, + "loss": 0.5836, + "step": 2750 + }, + { + "epoch": 10.011494252873563, + "grad_norm": 0.020129255950450897, + "learning_rate": 8.76117496807152e-06, + "loss": 1.2045, + "step": 2760 + }, + { + "epoch": 10.012260536398468, + "grad_norm": 0.017797917127609253, + "learning_rate": 8.752660706683695e-06, + "loss": 0.8505, + "step": 2770 + }, + { + "epoch": 10.013026819923372, + "grad_norm": 0.5655618906021118, + "learning_rate": 8.744146445295872e-06, + "loss": 0.9844, + "step": 2780 + }, + { + "epoch": 10.013793103448275, + "grad_norm": 0.10610664635896683, + "learning_rate": 8.735632183908047e-06, + "loss": 0.4595, + "step": 2790 + }, + { + "epoch": 10.01455938697318, + "grad_norm": 66.6644515991211, + "learning_rate": 8.727117922520222e-06, + "loss": 1.4337, + "step": 2800 + }, + { + "epoch": 10.015325670498084, + "grad_norm": 196.35189819335938, + "learning_rate": 8.718603661132398e-06, + "loss": 1.218, + "step": 2810 + }, + { + "epoch": 10.016091954022988, + "grad_norm": 328.2312316894531, + "learning_rate": 8.710089399744572e-06, + "loss": 0.795, + "step": 2820 + }, + { + "epoch": 10.016858237547893, + "grad_norm": 2.4351890087127686, + "learning_rate": 8.701575138356748e-06, + "loss": 0.5883, + "step": 2830 + }, + { + "epoch": 10.017624521072797, + "grad_norm": 0.07693709433078766, + "learning_rate": 8.693060876968923e-06, + "loss": 0.3392, + "step": 2840 + }, + { + "epoch": 10.0183908045977, + "grad_norm": 0.016635416075587273, + "learning_rate": 8.684546615581098e-06, + "loss": 0.7708, + "step": 2850 + }, + { + "epoch": 10.019157088122606, + "grad_norm": 1.0986331701278687, + "learning_rate": 8.676032354193275e-06, + "loss": 0.6342, + "step": 2860 + }, + { + "epoch": 10.01992337164751, + "grad_norm": 105.34931182861328, + "learning_rate": 8.66751809280545e-06, + "loss": 1.6658, + "step": 2870 + }, + { + "epoch": 10.02, + "eval_accuracy": 0.6444444444444445, + "eval_loss": 1.5835341215133667, + "eval_runtime": 16.0892, + "eval_samples_per_second": 2.797, + "eval_steps_per_second": 2.797, + "step": 2871 + }, + { + "epoch": 11.000689655172414, + "grad_norm": 0.042755126953125, + "learning_rate": 8.659003831417625e-06, + "loss": 1.2281, + "step": 2880 + }, + { + "epoch": 11.001455938697317, + "grad_norm": 0.13419094681739807, + "learning_rate": 8.650489570029802e-06, + "loss": 0.643, + "step": 2890 + }, + { + "epoch": 11.002222222222223, + "grad_norm": 0.5722396969795227, + "learning_rate": 8.641975308641975e-06, + "loss": 1.1059, + "step": 2900 + }, + { + "epoch": 11.002988505747126, + "grad_norm": 512.3408203125, + "learning_rate": 8.633461047254152e-06, + "loss": 0.5732, + "step": 2910 + }, + { + "epoch": 11.00375478927203, + "grad_norm": 0.011800810694694519, + "learning_rate": 8.624946785866327e-06, + "loss": 1.0223, + "step": 2920 + }, + { + "epoch": 11.004521072796935, + "grad_norm": 8.030099868774414, + "learning_rate": 8.616432524478502e-06, + "loss": 0.3257, + "step": 2930 + }, + { + "epoch": 11.005287356321839, + "grad_norm": 45.093021392822266, + "learning_rate": 8.607918263090678e-06, + "loss": 1.7945, + "step": 2940 + }, + { + "epoch": 11.006053639846744, + "grad_norm": 0.22022829949855804, + "learning_rate": 8.599404001702853e-06, + "loss": 1.0616, + "step": 2950 + }, + { + "epoch": 11.006819923371648, + "grad_norm": 0.08585281670093536, + "learning_rate": 8.590889740315028e-06, + "loss": 1.644, + "step": 2960 + }, + { + "epoch": 11.007586206896551, + "grad_norm": 2.1208853721618652, + "learning_rate": 8.582375478927203e-06, + "loss": 0.6133, + "step": 2970 + }, + { + "epoch": 11.008352490421457, + "grad_norm": 91.65081024169922, + "learning_rate": 8.573861217539378e-06, + "loss": 1.0825, + "step": 2980 + }, + { + "epoch": 11.00911877394636, + "grad_norm": 3.9088544845581055, + "learning_rate": 8.565346956151555e-06, + "loss": 1.7217, + "step": 2990 + }, + { + "epoch": 11.009885057471264, + "grad_norm": 6.6397929191589355, + "learning_rate": 8.55683269476373e-06, + "loss": 1.8179, + "step": 3000 + }, + { + "epoch": 11.01065134099617, + "grad_norm": 11.316490173339844, + "learning_rate": 8.548318433375905e-06, + "loss": 0.0325, + "step": 3010 + }, + { + "epoch": 11.011417624521073, + "grad_norm": 0.28897494077682495, + "learning_rate": 8.539804171988082e-06, + "loss": 0.6603, + "step": 3020 + }, + { + "epoch": 11.012183908045976, + "grad_norm": 68.44873809814453, + "learning_rate": 8.531289910600255e-06, + "loss": 0.7484, + "step": 3030 + }, + { + "epoch": 11.012950191570882, + "grad_norm": 0.008221070282161236, + "learning_rate": 8.522775649212432e-06, + "loss": 0.0064, + "step": 3040 + }, + { + "epoch": 11.013716475095785, + "grad_norm": 189.57403564453125, + "learning_rate": 8.514261387824607e-06, + "loss": 1.7867, + "step": 3050 + }, + { + "epoch": 11.014482758620689, + "grad_norm": 0.04196823760867119, + "learning_rate": 8.505747126436782e-06, + "loss": 1.6422, + "step": 3060 + }, + { + "epoch": 11.015249042145594, + "grad_norm": 0.06809419393539429, + "learning_rate": 8.497232865048958e-06, + "loss": 0.0093, + "step": 3070 + }, + { + "epoch": 11.016015325670498, + "grad_norm": 0.6770594120025635, + "learning_rate": 8.488718603661133e-06, + "loss": 1.0171, + "step": 3080 + }, + { + "epoch": 11.016781609195402, + "grad_norm": 0.04048323631286621, + "learning_rate": 8.480204342273308e-06, + "loss": 2.187, + "step": 3090 + }, + { + "epoch": 11.017547892720307, + "grad_norm": 0.030397607013583183, + "learning_rate": 8.471690080885483e-06, + "loss": 0.5514, + "step": 3100 + }, + { + "epoch": 11.01831417624521, + "grad_norm": 32.990638732910156, + "learning_rate": 8.463175819497658e-06, + "loss": 1.2996, + "step": 3110 + }, + { + "epoch": 11.019080459770114, + "grad_norm": 0.03340402990579605, + "learning_rate": 8.454661558109835e-06, + "loss": 0.4193, + "step": 3120 + }, + { + "epoch": 11.01984674329502, + "grad_norm": 50.187522888183594, + "learning_rate": 8.44614729672201e-06, + "loss": 1.3587, + "step": 3130 + }, + { + "epoch": 11.02, + "eval_accuracy": 0.7555555555555555, + "eval_loss": 0.8309432864189148, + "eval_runtime": 16.5175, + "eval_samples_per_second": 2.724, + "eval_steps_per_second": 2.724, + "step": 3132 + }, + { + "epoch": 12.000613026819924, + "grad_norm": 0.5363060235977173, + "learning_rate": 8.437633035334185e-06, + "loss": 0.0229, + "step": 3140 + }, + { + "epoch": 12.001379310344827, + "grad_norm": 0.007902957499027252, + "learning_rate": 8.429118773946362e-06, + "loss": 1.1452, + "step": 3150 + }, + { + "epoch": 12.002145593869733, + "grad_norm": 232.3303680419922, + "learning_rate": 8.420604512558537e-06, + "loss": 1.4698, + "step": 3160 + }, + { + "epoch": 12.002911877394636, + "grad_norm": 2.575563669204712, + "learning_rate": 8.412090251170712e-06, + "loss": 1.0589, + "step": 3170 + }, + { + "epoch": 12.00367816091954, + "grad_norm": 28.641246795654297, + "learning_rate": 8.403575989782887e-06, + "loss": 1.9366, + "step": 3180 + }, + { + "epoch": 12.004444444444445, + "grad_norm": 31.25893211364746, + "learning_rate": 8.395061728395062e-06, + "loss": 0.5945, + "step": 3190 + }, + { + "epoch": 12.005210727969349, + "grad_norm": 1.1079418659210205, + "learning_rate": 8.386547467007238e-06, + "loss": 0.012, + "step": 3200 + }, + { + "epoch": 12.005977011494252, + "grad_norm": 0.012604065239429474, + "learning_rate": 8.378033205619413e-06, + "loss": 0.9649, + "step": 3210 + }, + { + "epoch": 12.006743295019158, + "grad_norm": 87.2633056640625, + "learning_rate": 8.369518944231588e-06, + "loss": 1.5178, + "step": 3220 + }, + { + "epoch": 12.007509578544061, + "grad_norm": 0.013699457980692387, + "learning_rate": 8.361004682843763e-06, + "loss": 1.3879, + "step": 3230 + }, + { + "epoch": 12.008275862068965, + "grad_norm": 52.45587158203125, + "learning_rate": 8.35249042145594e-06, + "loss": 1.9076, + "step": 3240 + }, + { + "epoch": 12.00904214559387, + "grad_norm": 41.09255599975586, + "learning_rate": 8.343976160068115e-06, + "loss": 1.284, + "step": 3250 + }, + { + "epoch": 12.009808429118774, + "grad_norm": 0.022570697590708733, + "learning_rate": 8.33546189868029e-06, + "loss": 0.1351, + "step": 3260 + }, + { + "epoch": 12.010574712643677, + "grad_norm": 0.04445415362715721, + "learning_rate": 8.326947637292465e-06, + "loss": 2.312, + "step": 3270 + }, + { + "epoch": 12.011340996168583, + "grad_norm": 0.39725983142852783, + "learning_rate": 8.318433375904642e-06, + "loss": 1.4933, + "step": 3280 + }, + { + "epoch": 12.012107279693486, + "grad_norm": 1.8836842775344849, + "learning_rate": 8.309919114516817e-06, + "loss": 0.9747, + "step": 3290 + }, + { + "epoch": 12.01287356321839, + "grad_norm": 147.71759033203125, + "learning_rate": 8.301404853128992e-06, + "loss": 1.5269, + "step": 3300 + }, + { + "epoch": 12.013639846743295, + "grad_norm": 1.9329392910003662, + "learning_rate": 8.292890591741167e-06, + "loss": 0.9688, + "step": 3310 + }, + { + "epoch": 12.014406130268199, + "grad_norm": 38.93643569946289, + "learning_rate": 8.284376330353342e-06, + "loss": 1.4943, + "step": 3320 + }, + { + "epoch": 12.015172413793103, + "grad_norm": 0.17174561321735382, + "learning_rate": 8.275862068965518e-06, + "loss": 1.0749, + "step": 3330 + }, + { + "epoch": 12.015938697318008, + "grad_norm": 31.687175750732422, + "learning_rate": 8.267347807577693e-06, + "loss": 0.9041, + "step": 3340 + }, + { + "epoch": 12.016704980842912, + "grad_norm": 0.12959319353103638, + "learning_rate": 8.258833546189868e-06, + "loss": 1.0097, + "step": 3350 + }, + { + "epoch": 12.017471264367815, + "grad_norm": 0.02093736082315445, + "learning_rate": 8.250319284802043e-06, + "loss": 1.1702, + "step": 3360 + }, + { + "epoch": 12.01823754789272, + "grad_norm": 25.50687026977539, + "learning_rate": 8.24180502341422e-06, + "loss": 0.5414, + "step": 3370 + }, + { + "epoch": 12.019003831417624, + "grad_norm": 0.06102278456091881, + "learning_rate": 8.233290762026395e-06, + "loss": 1.0346, + "step": 3380 + }, + { + "epoch": 12.01977011494253, + "grad_norm": 0.15459617972373962, + "learning_rate": 8.22477650063857e-06, + "loss": 0.31, + "step": 3390 + }, + { + "epoch": 12.02, + "eval_accuracy": 0.7777777777777778, + "eval_loss": 0.8154128193855286, + "eval_runtime": 14.3191, + "eval_samples_per_second": 3.143, + "eval_steps_per_second": 3.143, + "step": 3393 + }, + { + "epoch": 13.000536398467434, + "grad_norm": 19.95181655883789, + "learning_rate": 8.216262239250745e-06, + "loss": 1.3112, + "step": 3400 + }, + { + "epoch": 13.001302681992337, + "grad_norm": 0.020247263833880424, + "learning_rate": 8.207747977862922e-06, + "loss": 0.7465, + "step": 3410 + }, + { + "epoch": 13.00206896551724, + "grad_norm": 18.45448112487793, + "learning_rate": 8.199233716475097e-06, + "loss": 1.4082, + "step": 3420 + }, + { + "epoch": 13.002835249042146, + "grad_norm": 39.223690032958984, + "learning_rate": 8.190719455087272e-06, + "loss": 0.1986, + "step": 3430 + }, + { + "epoch": 13.00360153256705, + "grad_norm": 0.008466905914247036, + "learning_rate": 8.182205193699447e-06, + "loss": 0.7523, + "step": 3440 + }, + { + "epoch": 13.004367816091953, + "grad_norm": 0.12857244908809662, + "learning_rate": 8.173690932311623e-06, + "loss": 0.6167, + "step": 3450 + }, + { + "epoch": 13.005134099616859, + "grad_norm": 1.4279407262802124, + "learning_rate": 8.165176670923798e-06, + "loss": 0.5034, + "step": 3460 + }, + { + "epoch": 13.005900383141762, + "grad_norm": 0.1705969125032425, + "learning_rate": 8.156662409535973e-06, + "loss": 0.5446, + "step": 3470 + }, + { + "epoch": 13.006666666666666, + "grad_norm": 0.724726140499115, + "learning_rate": 8.148148148148148e-06, + "loss": 0.6268, + "step": 3480 + }, + { + "epoch": 13.007432950191571, + "grad_norm": 0.21082398295402527, + "learning_rate": 8.139633886760325e-06, + "loss": 0.3352, + "step": 3490 + }, + { + "epoch": 13.008199233716475, + "grad_norm": 234.13392639160156, + "learning_rate": 8.1311196253725e-06, + "loss": 2.6406, + "step": 3500 + }, + { + "epoch": 13.008965517241379, + "grad_norm": 4.463503837585449, + "learning_rate": 8.122605363984675e-06, + "loss": 1.2419, + "step": 3510 + }, + { + "epoch": 13.009731800766284, + "grad_norm": 6.503235816955566, + "learning_rate": 8.11409110259685e-06, + "loss": 0.349, + "step": 3520 + }, + { + "epoch": 13.010498084291187, + "grad_norm": 50.43798828125, + "learning_rate": 8.105576841209027e-06, + "loss": 1.2279, + "step": 3530 + }, + { + "epoch": 13.011264367816091, + "grad_norm": 5.821425914764404, + "learning_rate": 8.097062579821202e-06, + "loss": 1.5754, + "step": 3540 + }, + { + "epoch": 13.012030651340996, + "grad_norm": 0.020040128380060196, + "learning_rate": 8.088548318433377e-06, + "loss": 0.9977, + "step": 3550 + }, + { + "epoch": 13.0127969348659, + "grad_norm": 213.29562377929688, + "learning_rate": 8.080034057045552e-06, + "loss": 1.5818, + "step": 3560 + }, + { + "epoch": 13.013563218390805, + "grad_norm": 18.644872665405273, + "learning_rate": 8.071519795657727e-06, + "loss": 0.8922, + "step": 3570 + }, + { + "epoch": 13.014329501915709, + "grad_norm": 0.29788148403167725, + "learning_rate": 8.063005534269903e-06, + "loss": 0.4074, + "step": 3580 + }, + { + "epoch": 13.015095785440613, + "grad_norm": 0.3964133858680725, + "learning_rate": 8.054491272882078e-06, + "loss": 0.3987, + "step": 3590 + }, + { + "epoch": 13.015862068965518, + "grad_norm": 0.017643410712480545, + "learning_rate": 8.045977011494253e-06, + "loss": 0.1342, + "step": 3600 + }, + { + "epoch": 13.016628352490422, + "grad_norm": 0.29650014638900757, + "learning_rate": 8.037462750106428e-06, + "loss": 1.569, + "step": 3610 + }, + { + "epoch": 13.017394636015325, + "grad_norm": 3.4459073543548584, + "learning_rate": 8.028948488718605e-06, + "loss": 1.8939, + "step": 3620 + }, + { + "epoch": 13.01816091954023, + "grad_norm": 957.4302978515625, + "learning_rate": 8.02043422733078e-06, + "loss": 1.4765, + "step": 3630 + }, + { + "epoch": 13.018927203065134, + "grad_norm": 0.47024571895599365, + "learning_rate": 8.011919965942955e-06, + "loss": 0.9642, + "step": 3640 + }, + { + "epoch": 13.019693486590038, + "grad_norm": 1.1732593774795532, + "learning_rate": 8.00340570455513e-06, + "loss": 1.2834, + "step": 3650 + }, + { + "epoch": 13.02, + "eval_accuracy": 0.8666666666666667, + "eval_loss": 0.5090304613113403, + "eval_runtime": 15.0562, + "eval_samples_per_second": 2.989, + "eval_steps_per_second": 2.989, + "step": 3654 + }, + { + "epoch": 14.000459770114942, + "grad_norm": 594.275634765625, + "learning_rate": 7.994891443167307e-06, + "loss": 0.8765, + "step": 3660 + }, + { + "epoch": 14.001226053639847, + "grad_norm": 0.0192561112344265, + "learning_rate": 7.986377181779482e-06, + "loss": 1.1006, + "step": 3670 + }, + { + "epoch": 14.00199233716475, + "grad_norm": 0.007181501016020775, + "learning_rate": 7.977862920391657e-06, + "loss": 0.4418, + "step": 3680 + }, + { + "epoch": 14.002758620689654, + "grad_norm": 0.016903869807720184, + "learning_rate": 7.969348659003832e-06, + "loss": 0.4686, + "step": 3690 + }, + { + "epoch": 14.00352490421456, + "grad_norm": 0.25686225295066833, + "learning_rate": 7.960834397616007e-06, + "loss": 0.9965, + "step": 3700 + }, + { + "epoch": 14.004291187739463, + "grad_norm": 0.7215161919593811, + "learning_rate": 7.952320136228183e-06, + "loss": 0.006, + "step": 3710 + }, + { + "epoch": 14.005057471264367, + "grad_norm": 19.98381996154785, + "learning_rate": 7.943805874840358e-06, + "loss": 1.5662, + "step": 3720 + }, + { + "epoch": 14.005823754789272, + "grad_norm": 0.016891516745090485, + "learning_rate": 7.935291613452533e-06, + "loss": 1.9538, + "step": 3730 + }, + { + "epoch": 14.006590038314176, + "grad_norm": 1.6335327625274658, + "learning_rate": 7.92677735206471e-06, + "loss": 0.6845, + "step": 3740 + }, + { + "epoch": 14.007356321839081, + "grad_norm": 74.78628540039062, + "learning_rate": 7.918263090676885e-06, + "loss": 0.7214, + "step": 3750 + }, + { + "epoch": 14.008122605363985, + "grad_norm": 0.01958787813782692, + "learning_rate": 7.90974882928906e-06, + "loss": 0.5193, + "step": 3760 + }, + { + "epoch": 14.008888888888889, + "grad_norm": 948.969970703125, + "learning_rate": 7.901234567901235e-06, + "loss": 1.4058, + "step": 3770 + }, + { + "epoch": 14.009655172413794, + "grad_norm": 0.07399129867553711, + "learning_rate": 7.89272030651341e-06, + "loss": 1.5285, + "step": 3780 + }, + { + "epoch": 14.010421455938697, + "grad_norm": 0.11624588072299957, + "learning_rate": 7.884206045125587e-06, + "loss": 1.6973, + "step": 3790 + }, + { + "epoch": 14.011187739463601, + "grad_norm": 1.2204347848892212, + "learning_rate": 7.875691783737762e-06, + "loss": 1.1063, + "step": 3800 + }, + { + "epoch": 14.011954022988506, + "grad_norm": 0.1173257902264595, + "learning_rate": 7.867177522349937e-06, + "loss": 0.9653, + "step": 3810 + }, + { + "epoch": 14.01272030651341, + "grad_norm": 0.05164877697825432, + "learning_rate": 7.858663260962112e-06, + "loss": 1.7888, + "step": 3820 + }, + { + "epoch": 14.013486590038314, + "grad_norm": 151.31732177734375, + "learning_rate": 7.850148999574287e-06, + "loss": 0.9606, + "step": 3830 + }, + { + "epoch": 14.014252873563219, + "grad_norm": 0.1222265437245369, + "learning_rate": 7.841634738186463e-06, + "loss": 0.2589, + "step": 3840 + }, + { + "epoch": 14.015019157088123, + "grad_norm": 0.30466440320014954, + "learning_rate": 7.833120476798638e-06, + "loss": 0.5544, + "step": 3850 + }, + { + "epoch": 14.015785440613026, + "grad_norm": 0.015721678733825684, + "learning_rate": 7.824606215410813e-06, + "loss": 0.6975, + "step": 3860 + }, + { + "epoch": 14.016551724137932, + "grad_norm": 96.57806396484375, + "learning_rate": 7.81609195402299e-06, + "loss": 0.6042, + "step": 3870 + }, + { + "epoch": 14.017318007662835, + "grad_norm": 0.08435537666082382, + "learning_rate": 7.807577692635165e-06, + "loss": 0.1693, + "step": 3880 + }, + { + "epoch": 14.018084291187739, + "grad_norm": 0.6997672915458679, + "learning_rate": 7.79906343124734e-06, + "loss": 1.8493, + "step": 3890 + }, + { + "epoch": 14.018850574712644, + "grad_norm": 0.7452917098999023, + "learning_rate": 7.790549169859515e-06, + "loss": 1.0341, + "step": 3900 + }, + { + "epoch": 14.019616858237548, + "grad_norm": 31.302000045776367, + "learning_rate": 7.78203490847169e-06, + "loss": 0.7111, + "step": 3910 + }, + { + "epoch": 14.02, + "eval_accuracy": 0.6888888888888889, + "eval_loss": 1.2499884366989136, + "eval_runtime": 14.7662, + "eval_samples_per_second": 3.048, + "eval_steps_per_second": 3.048, + "step": 3915 + }, + { + "epoch": 15.000383141762452, + "grad_norm": 0.09833050519227982, + "learning_rate": 7.773520647083867e-06, + "loss": 2.1868, + "step": 3920 + }, + { + "epoch": 15.001149425287357, + "grad_norm": 0.0771474689245224, + "learning_rate": 7.765006385696042e-06, + "loss": 0.8732, + "step": 3930 + }, + { + "epoch": 15.00191570881226, + "grad_norm": 0.12298612296581268, + "learning_rate": 7.756492124308217e-06, + "loss": 1.2974, + "step": 3940 + }, + { + "epoch": 15.002681992337164, + "grad_norm": 386.5329284667969, + "learning_rate": 7.747977862920393e-06, + "loss": 0.1457, + "step": 3950 + }, + { + "epoch": 15.00344827586207, + "grad_norm": 0.1593717783689499, + "learning_rate": 7.739463601532567e-06, + "loss": 1.3177, + "step": 3960 + }, + { + "epoch": 15.004214559386973, + "grad_norm": 5.293508052825928, + "learning_rate": 7.730949340144743e-06, + "loss": 0.5551, + "step": 3970 + }, + { + "epoch": 15.004980842911877, + "grad_norm": 408.6733093261719, + "learning_rate": 7.722435078756918e-06, + "loss": 1.4917, + "step": 3980 + }, + { + "epoch": 15.005747126436782, + "grad_norm": 137.54627990722656, + "learning_rate": 7.713920817369093e-06, + "loss": 0.4599, + "step": 3990 + }, + { + "epoch": 15.006513409961686, + "grad_norm": 335.1684875488281, + "learning_rate": 7.70540655598127e-06, + "loss": 1.0811, + "step": 4000 + }, + { + "epoch": 15.00727969348659, + "grad_norm": 19.96000099182129, + "learning_rate": 7.696892294593445e-06, + "loss": 1.2689, + "step": 4010 + }, + { + "epoch": 15.008045977011495, + "grad_norm": 5.857684135437012, + "learning_rate": 7.68837803320562e-06, + "loss": 1.0879, + "step": 4020 + }, + { + "epoch": 15.008812260536398, + "grad_norm": 0.012607713229954243, + "learning_rate": 7.679863771817797e-06, + "loss": 0.3736, + "step": 4030 + }, + { + "epoch": 15.009578544061302, + "grad_norm": 2.0022695064544678, + "learning_rate": 7.67134951042997e-06, + "loss": 1.3587, + "step": 4040 + }, + { + "epoch": 15.010344827586207, + "grad_norm": 0.08656860142946243, + "learning_rate": 7.662835249042147e-06, + "loss": 0.3155, + "step": 4050 + }, + { + "epoch": 15.011111111111111, + "grad_norm": 0.09074855595827103, + "learning_rate": 7.654320987654322e-06, + "loss": 0.0547, + "step": 4060 + }, + { + "epoch": 15.011877394636015, + "grad_norm": 4.4385223388671875, + "learning_rate": 7.645806726266497e-06, + "loss": 3.1175, + "step": 4070 + }, + { + "epoch": 15.01264367816092, + "grad_norm": 0.578523576259613, + "learning_rate": 7.637292464878673e-06, + "loss": 1.1603, + "step": 4080 + }, + { + "epoch": 15.013409961685824, + "grad_norm": 31.44670295715332, + "learning_rate": 7.6287782034908475e-06, + "loss": 1.1973, + "step": 4090 + }, + { + "epoch": 15.014176245210727, + "grad_norm": 0.009388735517859459, + "learning_rate": 7.620263942103023e-06, + "loss": 1.1828, + "step": 4100 + }, + { + "epoch": 15.014942528735633, + "grad_norm": 0.02472112700343132, + "learning_rate": 7.611749680715198e-06, + "loss": 0.357, + "step": 4110 + }, + { + "epoch": 15.015708812260536, + "grad_norm": 54.052547454833984, + "learning_rate": 7.603235419327374e-06, + "loss": 2.107, + "step": 4120 + }, + { + "epoch": 15.01647509578544, + "grad_norm": 0.5136244297027588, + "learning_rate": 7.59472115793955e-06, + "loss": 0.1768, + "step": 4130 + }, + { + "epoch": 15.017241379310345, + "grad_norm": 0.0888374000787735, + "learning_rate": 7.586206896551724e-06, + "loss": 0.6062, + "step": 4140 + }, + { + "epoch": 15.018007662835249, + "grad_norm": 0.32292622327804565, + "learning_rate": 7.5776926351639e-06, + "loss": 0.7942, + "step": 4150 + }, + { + "epoch": 15.018773946360152, + "grad_norm": 54.13147735595703, + "learning_rate": 7.569178373776076e-06, + "loss": 1.2646, + "step": 4160 + }, + { + "epoch": 15.019540229885058, + "grad_norm": 72.40896606445312, + "learning_rate": 7.560664112388251e-06, + "loss": 2.3551, + "step": 4170 + }, + { + "epoch": 15.02, + "eval_accuracy": 0.8222222222222222, + "eval_loss": 0.6881113648414612, + "eval_runtime": 15.5764, + "eval_samples_per_second": 2.889, + "eval_steps_per_second": 2.889, + "step": 4176 + }, + { + "epoch": 16.00030651340996, + "grad_norm": 0.17236869037151337, + "learning_rate": 7.552149851000427e-06, + "loss": 0.9726, + "step": 4180 + }, + { + "epoch": 16.001072796934867, + "grad_norm": 0.37512490153312683, + "learning_rate": 7.543635589612601e-06, + "loss": 0.0073, + "step": 4190 + }, + { + "epoch": 16.00183908045977, + "grad_norm": 341.52520751953125, + "learning_rate": 7.535121328224777e-06, + "loss": 1.8988, + "step": 4200 + }, + { + "epoch": 16.002605363984674, + "grad_norm": 43.17485427856445, + "learning_rate": 7.5266070668369525e-06, + "loss": 1.7032, + "step": 4210 + }, + { + "epoch": 16.003371647509578, + "grad_norm": 0.2590516209602356, + "learning_rate": 7.5180928054491275e-06, + "loss": 1.2217, + "step": 4220 + }, + { + "epoch": 16.00413793103448, + "grad_norm": 0.7092514038085938, + "learning_rate": 7.509578544061303e-06, + "loss": 0.9297, + "step": 4230 + }, + { + "epoch": 16.00490421455939, + "grad_norm": 0.04314722120761871, + "learning_rate": 7.501064282673479e-06, + "loss": 0.4931, + "step": 4240 + }, + { + "epoch": 16.005670498084292, + "grad_norm": 0.051585156470537186, + "learning_rate": 7.492550021285654e-06, + "loss": 1.5563, + "step": 4250 + }, + { + "epoch": 16.006436781609196, + "grad_norm": 1.28795325756073, + "learning_rate": 7.48403575989783e-06, + "loss": 1.452, + "step": 4260 + }, + { + "epoch": 16.0072030651341, + "grad_norm": 403.7496032714844, + "learning_rate": 7.475521498510004e-06, + "loss": 1.7767, + "step": 4270 + }, + { + "epoch": 16.007969348659003, + "grad_norm": 0.0961364358663559, + "learning_rate": 7.46700723712218e-06, + "loss": 0.4419, + "step": 4280 + }, + { + "epoch": 16.008735632183907, + "grad_norm": 0.14619210362434387, + "learning_rate": 7.458492975734356e-06, + "loss": 0.9539, + "step": 4290 + }, + { + "epoch": 16.009501915708814, + "grad_norm": 47.26947021484375, + "learning_rate": 7.449978714346531e-06, + "loss": 1.6533, + "step": 4300 + }, + { + "epoch": 16.010268199233717, + "grad_norm": 280.16326904296875, + "learning_rate": 7.441464452958707e-06, + "loss": 0.8886, + "step": 4310 + }, + { + "epoch": 16.01103448275862, + "grad_norm": 51.095741271972656, + "learning_rate": 7.4329501915708825e-06, + "loss": 0.7588, + "step": 4320 + }, + { + "epoch": 16.011800766283525, + "grad_norm": 2.6517045497894287, + "learning_rate": 7.4244359301830575e-06, + "loss": 0.1651, + "step": 4330 + }, + { + "epoch": 16.01256704980843, + "grad_norm": 0.10823695361614227, + "learning_rate": 7.4159216687952325e-06, + "loss": 0.8843, + "step": 4340 + }, + { + "epoch": 16.013333333333332, + "grad_norm": 0.05993505194783211, + "learning_rate": 7.4074074074074075e-06, + "loss": 1.3364, + "step": 4350 + }, + { + "epoch": 16.01409961685824, + "grad_norm": 0.5145869255065918, + "learning_rate": 7.398893146019583e-06, + "loss": 0.9984, + "step": 4360 + }, + { + "epoch": 16.014865900383143, + "grad_norm": 3.6809394359588623, + "learning_rate": 7.390378884631759e-06, + "loss": 1.3562, + "step": 4370 + }, + { + "epoch": 16.015632183908046, + "grad_norm": 516.924560546875, + "learning_rate": 7.381864623243934e-06, + "loss": 0.4271, + "step": 4380 + }, + { + "epoch": 16.01639846743295, + "grad_norm": 1.5104776620864868, + "learning_rate": 7.37335036185611e-06, + "loss": 0.8018, + "step": 4390 + }, + { + "epoch": 16.017164750957853, + "grad_norm": 0.01973959617316723, + "learning_rate": 7.364836100468284e-06, + "loss": 0.9224, + "step": 4400 + }, + { + "epoch": 16.017931034482757, + "grad_norm": 2.8555283546447754, + "learning_rate": 7.35632183908046e-06, + "loss": 0.0167, + "step": 4410 + }, + { + "epoch": 16.018697318007664, + "grad_norm": 0.00982819963246584, + "learning_rate": 7.347807577692636e-06, + "loss": 0.7395, + "step": 4420 + }, + { + "epoch": 16.019463601532568, + "grad_norm": 0.015840256586670876, + "learning_rate": 7.339293316304811e-06, + "loss": 0.2734, + "step": 4430 + }, + { + "epoch": 16.02, + "eval_accuracy": 0.8444444444444444, + "eval_loss": 0.4506371021270752, + "eval_runtime": 13.6245, + "eval_samples_per_second": 3.303, + "eval_steps_per_second": 3.303, + "step": 4437 + }, + { + "epoch": 17.000229885057472, + "grad_norm": 1.947767972946167, + "learning_rate": 7.330779054916987e-06, + "loss": 0.0567, + "step": 4440 + }, + { + "epoch": 17.000996168582375, + "grad_norm": 383.13385009765625, + "learning_rate": 7.3222647935291625e-06, + "loss": 1.4525, + "step": 4450 + }, + { + "epoch": 17.00176245210728, + "grad_norm": 0.005475871730595827, + "learning_rate": 7.3137505321413375e-06, + "loss": 0.2238, + "step": 4460 + }, + { + "epoch": 17.002528735632183, + "grad_norm": 199.94427490234375, + "learning_rate": 7.305236270753513e-06, + "loss": 0.395, + "step": 4470 + }, + { + "epoch": 17.00329501915709, + "grad_norm": 0.12597985565662384, + "learning_rate": 7.2967220093656875e-06, + "loss": 1.2407, + "step": 4480 + }, + { + "epoch": 17.004061302681993, + "grad_norm": 22.926143646240234, + "learning_rate": 7.288207747977863e-06, + "loss": 1.3336, + "step": 4490 + }, + { + "epoch": 17.004827586206897, + "grad_norm": 2.7508955001831055, + "learning_rate": 7.279693486590039e-06, + "loss": 1.735, + "step": 4500 + }, + { + "epoch": 17.0055938697318, + "grad_norm": 0.7520158886909485, + "learning_rate": 7.271179225202214e-06, + "loss": 0.5005, + "step": 4510 + }, + { + "epoch": 17.006360153256704, + "grad_norm": 172.81605529785156, + "learning_rate": 7.26266496381439e-06, + "loss": 1.0526, + "step": 4520 + }, + { + "epoch": 17.007126436781608, + "grad_norm": 2.283644199371338, + "learning_rate": 7.254150702426566e-06, + "loss": 1.2169, + "step": 4530 + }, + { + "epoch": 17.007892720306515, + "grad_norm": 0.5311365127563477, + "learning_rate": 7.24563644103874e-06, + "loss": 0.5474, + "step": 4540 + }, + { + "epoch": 17.00865900383142, + "grad_norm": 0.0861196294426918, + "learning_rate": 7.237122179650916e-06, + "loss": 1.158, + "step": 4550 + }, + { + "epoch": 17.009425287356322, + "grad_norm": 0.42244112491607666, + "learning_rate": 7.228607918263091e-06, + "loss": 1.3332, + "step": 4560 + }, + { + "epoch": 17.010191570881226, + "grad_norm": 0.16180934011936188, + "learning_rate": 7.220093656875267e-06, + "loss": 0.0296, + "step": 4570 + }, + { + "epoch": 17.01095785440613, + "grad_norm": 529.973388671875, + "learning_rate": 7.2115793954874425e-06, + "loss": 0.9939, + "step": 4580 + }, + { + "epoch": 17.011724137931033, + "grad_norm": 0.008947962895035744, + "learning_rate": 7.2030651340996175e-06, + "loss": 0.3492, + "step": 4590 + }, + { + "epoch": 17.01249042145594, + "grad_norm": 0.0053781550377607346, + "learning_rate": 7.194550872711793e-06, + "loss": 0.8263, + "step": 4600 + }, + { + "epoch": 17.013256704980844, + "grad_norm": 106.14468383789062, + "learning_rate": 7.1860366113239675e-06, + "loss": 1.6513, + "step": 4610 + }, + { + "epoch": 17.014022988505747, + "grad_norm": 0.2938723564147949, + "learning_rate": 7.177522349936143e-06, + "loss": 0.447, + "step": 4620 + }, + { + "epoch": 17.01478927203065, + "grad_norm": 2.80863881111145, + "learning_rate": 7.169008088548319e-06, + "loss": 1.0175, + "step": 4630 + }, + { + "epoch": 17.015555555555554, + "grad_norm": 57.34475326538086, + "learning_rate": 7.160493827160494e-06, + "loss": 0.5093, + "step": 4640 + }, + { + "epoch": 17.016321839080458, + "grad_norm": 0.034363433718681335, + "learning_rate": 7.15197956577267e-06, + "loss": 0.584, + "step": 4650 + }, + { + "epoch": 17.017088122605365, + "grad_norm": 0.12468564510345459, + "learning_rate": 7.143465304384846e-06, + "loss": 0.3767, + "step": 4660 + }, + { + "epoch": 17.01785440613027, + "grad_norm": 0.005793506279587746, + "learning_rate": 7.13495104299702e-06, + "loss": 0.7981, + "step": 4670 + }, + { + "epoch": 17.018620689655172, + "grad_norm": 0.0034271469339728355, + "learning_rate": 7.126436781609196e-06, + "loss": 0.6918, + "step": 4680 + }, + { + "epoch": 17.019386973180076, + "grad_norm": 0.31982433795928955, + "learning_rate": 7.117922520221371e-06, + "loss": 0.9675, + "step": 4690 + }, + { + "epoch": 17.02, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 1.7514704465866089, + "eval_runtime": 13.3064, + "eval_samples_per_second": 3.382, + "eval_steps_per_second": 3.382, + "step": 4698 + }, + { + "epoch": 18.00015325670498, + "grad_norm": 0.16283735632896423, + "learning_rate": 7.109408258833547e-06, + "loss": 1.3484, + "step": 4700 + }, + { + "epoch": 18.000919540229884, + "grad_norm": 16.916929244995117, + "learning_rate": 7.1008939974457225e-06, + "loss": 2.394, + "step": 4710 + }, + { + "epoch": 18.00168582375479, + "grad_norm": 16.609804153442383, + "learning_rate": 7.0923797360578975e-06, + "loss": 0.3868, + "step": 4720 + }, + { + "epoch": 18.002452107279694, + "grad_norm": 16.240642547607422, + "learning_rate": 7.083865474670073e-06, + "loss": 0.7722, + "step": 4730 + }, + { + "epoch": 18.003218390804598, + "grad_norm": 1.8989708423614502, + "learning_rate": 7.075351213282249e-06, + "loss": 0.2564, + "step": 4740 + }, + { + "epoch": 18.0039846743295, + "grad_norm": 0.0033782178070396185, + "learning_rate": 7.066836951894423e-06, + "loss": 0.6987, + "step": 4750 + }, + { + "epoch": 18.004750957854405, + "grad_norm": 18.385026931762695, + "learning_rate": 7.058322690506599e-06, + "loss": 2.9738, + "step": 4760 + }, + { + "epoch": 18.00551724137931, + "grad_norm": 0.004265069495886564, + "learning_rate": 7.049808429118774e-06, + "loss": 1.5315, + "step": 4770 + }, + { + "epoch": 18.006283524904216, + "grad_norm": 0.07541969418525696, + "learning_rate": 7.04129416773095e-06, + "loss": 1.4052, + "step": 4780 + }, + { + "epoch": 18.00704980842912, + "grad_norm": 0.004919661208987236, + "learning_rate": 7.032779906343126e-06, + "loss": 0.7981, + "step": 4790 + }, + { + "epoch": 18.007816091954023, + "grad_norm": 0.38469570875167847, + "learning_rate": 7.0242656449553e-06, + "loss": 0.0221, + "step": 4800 + }, + { + "epoch": 18.008582375478927, + "grad_norm": 0.6571864485740662, + "learning_rate": 7.015751383567476e-06, + "loss": 0.341, + "step": 4810 + }, + { + "epoch": 18.00934865900383, + "grad_norm": 18.751163482666016, + "learning_rate": 7.007237122179652e-06, + "loss": 2.1029, + "step": 4820 + }, + { + "epoch": 18.010114942528734, + "grad_norm": 67.68429565429688, + "learning_rate": 6.998722860791827e-06, + "loss": 0.6903, + "step": 4830 + }, + { + "epoch": 18.01088122605364, + "grad_norm": 0.42845791578292847, + "learning_rate": 6.9902085994040025e-06, + "loss": 0.8219, + "step": 4840 + }, + { + "epoch": 18.011647509578545, + "grad_norm": 41.950721740722656, + "learning_rate": 6.9816943380161775e-06, + "loss": 0.8313, + "step": 4850 + }, + { + "epoch": 18.01241379310345, + "grad_norm": 16.151935577392578, + "learning_rate": 6.973180076628353e-06, + "loss": 1.5668, + "step": 4860 + }, + { + "epoch": 18.013180076628352, + "grad_norm": 0.09513141959905624, + "learning_rate": 6.964665815240529e-06, + "loss": 0.0103, + "step": 4870 + }, + { + "epoch": 18.013946360153255, + "grad_norm": 54.980491638183594, + "learning_rate": 6.956151553852703e-06, + "loss": 0.6221, + "step": 4880 + }, + { + "epoch": 18.014712643678163, + "grad_norm": 0.5550600290298462, + "learning_rate": 6.947637292464879e-06, + "loss": 1.1071, + "step": 4890 + }, + { + "epoch": 18.015478927203066, + "grad_norm": 0.00374088017269969, + "learning_rate": 6.939123031077054e-06, + "loss": 0.6851, + "step": 4900 + }, + { + "epoch": 18.01624521072797, + "grad_norm": 0.05271366983652115, + "learning_rate": 6.93060876968923e-06, + "loss": 0.0099, + "step": 4910 + }, + { + "epoch": 18.017011494252873, + "grad_norm": 0.3116820752620697, + "learning_rate": 6.922094508301406e-06, + "loss": 0.4008, + "step": 4920 + }, + { + "epoch": 18.017777777777777, + "grad_norm": 6.082730770111084, + "learning_rate": 6.913580246913581e-06, + "loss": 1.5001, + "step": 4930 + }, + { + "epoch": 18.01854406130268, + "grad_norm": 0.003171339863911271, + "learning_rate": 6.905065985525757e-06, + "loss": 1.2366, + "step": 4940 + }, + { + "epoch": 18.019310344827588, + "grad_norm": 0.35565054416656494, + "learning_rate": 6.896551724137932e-06, + "loss": 0.449, + "step": 4950 + }, + { + "epoch": 18.02, + "eval_accuracy": 0.7777777777777778, + "eval_loss": 0.7239938378334045, + "eval_runtime": 13.2593, + "eval_samples_per_second": 3.394, + "eval_steps_per_second": 3.394, + "step": 4959 + }, + { + "epoch": 19.000076628352492, + "grad_norm": 173.56851196289062, + "learning_rate": 6.888037462750107e-06, + "loss": 0.5259, + "step": 4960 + }, + { + "epoch": 19.000842911877395, + "grad_norm": 25.518754959106445, + "learning_rate": 6.8795232013622825e-06, + "loss": 0.8176, + "step": 4970 + }, + { + "epoch": 19.0016091954023, + "grad_norm": 0.004466726444661617, + "learning_rate": 6.8710089399744575e-06, + "loss": 0.3623, + "step": 4980 + }, + { + "epoch": 19.002375478927203, + "grad_norm": 1.6072368621826172, + "learning_rate": 6.862494678586633e-06, + "loss": 0.7608, + "step": 4990 + }, + { + "epoch": 19.003141762452106, + "grad_norm": 262.5456848144531, + "learning_rate": 6.853980417198809e-06, + "loss": 1.0434, + "step": 5000 + }, + { + "epoch": 19.00390804597701, + "grad_norm": 0.3474750220775604, + "learning_rate": 6.845466155810983e-06, + "loss": 2.76, + "step": 5010 + }, + { + "epoch": 19.004674329501917, + "grad_norm": 0.3191026449203491, + "learning_rate": 6.836951894423159e-06, + "loss": 0.921, + "step": 5020 + }, + { + "epoch": 19.00544061302682, + "grad_norm": 0.008138914592564106, + "learning_rate": 6.828437633035335e-06, + "loss": 1.0089, + "step": 5030 + }, + { + "epoch": 19.006206896551724, + "grad_norm": 226.68228149414062, + "learning_rate": 6.81992337164751e-06, + "loss": 0.466, + "step": 5040 + }, + { + "epoch": 19.006973180076628, + "grad_norm": 141.62277221679688, + "learning_rate": 6.811409110259686e-06, + "loss": 0.4943, + "step": 5050 + }, + { + "epoch": 19.00773946360153, + "grad_norm": 0.542086660861969, + "learning_rate": 6.802894848871861e-06, + "loss": 0.1911, + "step": 5060 + }, + { + "epoch": 19.00850574712644, + "grad_norm": 0.004684785380959511, + "learning_rate": 6.794380587484037e-06, + "loss": 1.7487, + "step": 5070 + }, + { + "epoch": 19.009272030651342, + "grad_norm": 0.17809058725833893, + "learning_rate": 6.7858663260962125e-06, + "loss": 0.4984, + "step": 5080 + }, + { + "epoch": 19.010038314176246, + "grad_norm": 3.824796438217163, + "learning_rate": 6.777352064708387e-06, + "loss": 0.4295, + "step": 5090 + }, + { + "epoch": 19.01080459770115, + "grad_norm": 18.227113723754883, + "learning_rate": 6.7688378033205625e-06, + "loss": 1.0123, + "step": 5100 + }, + { + "epoch": 19.011570881226053, + "grad_norm": 0.016013886779546738, + "learning_rate": 6.760323541932738e-06, + "loss": 0.5572, + "step": 5110 + }, + { + "epoch": 19.012337164750956, + "grad_norm": 0.010713410563766956, + "learning_rate": 6.751809280544913e-06, + "loss": 0.0082, + "step": 5120 + }, + { + "epoch": 19.013103448275864, + "grad_norm": 0.2963060140609741, + "learning_rate": 6.743295019157089e-06, + "loss": 0.823, + "step": 5130 + }, + { + "epoch": 19.013869731800767, + "grad_norm": 435.67083740234375, + "learning_rate": 6.734780757769263e-06, + "loss": 0.8881, + "step": 5140 + }, + { + "epoch": 19.01463601532567, + "grad_norm": 0.31719154119491577, + "learning_rate": 6.726266496381439e-06, + "loss": 0.405, + "step": 5150 + }, + { + "epoch": 19.015402298850574, + "grad_norm": 9.507041931152344, + "learning_rate": 6.717752234993615e-06, + "loss": 0.9877, + "step": 5160 + }, + { + "epoch": 19.016168582375478, + "grad_norm": 0.00443949643522501, + "learning_rate": 6.70923797360579e-06, + "loss": 0.0049, + "step": 5170 + }, + { + "epoch": 19.01693486590038, + "grad_norm": 17.073001861572266, + "learning_rate": 6.700723712217966e-06, + "loss": 1.0038, + "step": 5180 + }, + { + "epoch": 19.01770114942529, + "grad_norm": 0.0035155299119651318, + "learning_rate": 6.692209450830141e-06, + "loss": 0.3384, + "step": 5190 + }, + { + "epoch": 19.018467432950192, + "grad_norm": 0.05622873082756996, + "learning_rate": 6.683695189442317e-06, + "loss": 0.4446, + "step": 5200 + }, + { + "epoch": 19.019233716475096, + "grad_norm": 118.16485595703125, + "learning_rate": 6.6751809280544925e-06, + "loss": 1.8194, + "step": 5210 + }, + { + "epoch": 19.02, + "grad_norm": 0.19169829785823822, + "learning_rate": 6.666666666666667e-06, + "loss": 0.5843, + "step": 5220 + }, + { + "epoch": 19.02, + "eval_accuracy": 0.7777777777777778, + "eval_loss": 0.9560596942901611, + "eval_runtime": 14.2022, + "eval_samples_per_second": 3.169, + "eval_steps_per_second": 3.169, + "step": 5220 + }, + { + "epoch": 20.000766283524904, + "grad_norm": 138.09426879882812, + "learning_rate": 6.6581524052788425e-06, + "loss": 0.6963, + "step": 5230 + }, + { + "epoch": 20.001532567049807, + "grad_norm": 0.012549187056720257, + "learning_rate": 6.649638143891018e-06, + "loss": 0.6626, + "step": 5240 + }, + { + "epoch": 20.002298850574714, + "grad_norm": 0.004213971551507711, + "learning_rate": 6.641123882503193e-06, + "loss": 0.7115, + "step": 5250 + }, + { + "epoch": 20.003065134099618, + "grad_norm": 0.5979669690132141, + "learning_rate": 6.632609621115369e-06, + "loss": 0.9221, + "step": 5260 + }, + { + "epoch": 20.00383141762452, + "grad_norm": 0.29207003116607666, + "learning_rate": 6.624095359727543e-06, + "loss": 0.5816, + "step": 5270 + }, + { + "epoch": 20.004597701149425, + "grad_norm": 28.108585357666016, + "learning_rate": 6.615581098339719e-06, + "loss": 0.8503, + "step": 5280 + }, + { + "epoch": 20.00536398467433, + "grad_norm": 0.2536892890930176, + "learning_rate": 6.607066836951895e-06, + "loss": 0.4296, + "step": 5290 + }, + { + "epoch": 20.006130268199232, + "grad_norm": 0.2590063512325287, + "learning_rate": 6.59855257556407e-06, + "loss": 0.265, + "step": 5300 + }, + { + "epoch": 20.00689655172414, + "grad_norm": 0.3933335244655609, + "learning_rate": 6.590038314176246e-06, + "loss": 0.8066, + "step": 5310 + }, + { + "epoch": 20.007662835249043, + "grad_norm": 0.00344213773496449, + "learning_rate": 6.581524052788422e-06, + "loss": 0.3928, + "step": 5320 + }, + { + "epoch": 20.008429118773947, + "grad_norm": 0.003421328729018569, + "learning_rate": 6.573009791400597e-06, + "loss": 0.5046, + "step": 5330 + }, + { + "epoch": 20.00919540229885, + "grad_norm": 119.35005187988281, + "learning_rate": 6.5644955300127725e-06, + "loss": 0.6248, + "step": 5340 + }, + { + "epoch": 20.009961685823754, + "grad_norm": 1.8184597492218018, + "learning_rate": 6.555981268624947e-06, + "loss": 0.5608, + "step": 5350 + }, + { + "epoch": 20.010727969348657, + "grad_norm": 0.0024780328385531902, + "learning_rate": 6.5474670072371225e-06, + "loss": 0.6801, + "step": 5360 + }, + { + "epoch": 20.011494252873565, + "grad_norm": 0.004241599701344967, + "learning_rate": 6.538952745849298e-06, + "loss": 0.8765, + "step": 5370 + }, + { + "epoch": 20.01226053639847, + "grad_norm": 162.39012145996094, + "learning_rate": 6.530438484461473e-06, + "loss": 1.77, + "step": 5380 + }, + { + "epoch": 20.013026819923372, + "grad_norm": 0.0034117966424673796, + "learning_rate": 6.521924223073649e-06, + "loss": 1.6807, + "step": 5390 + }, + { + "epoch": 20.013793103448275, + "grad_norm": 0.011221050284802914, + "learning_rate": 6.513409961685824e-06, + "loss": 1.3964, + "step": 5400 + }, + { + "epoch": 20.01455938697318, + "grad_norm": 0.10529179871082306, + "learning_rate": 6.504895700297999e-06, + "loss": 1.1688, + "step": 5410 + }, + { + "epoch": 20.015325670498083, + "grad_norm": 0.2718166410923004, + "learning_rate": 6.496381438910175e-06, + "loss": 0.4018, + "step": 5420 + }, + { + "epoch": 20.01609195402299, + "grad_norm": 0.010549083352088928, + "learning_rate": 6.48786717752235e-06, + "loss": 0.6929, + "step": 5430 + }, + { + "epoch": 20.016858237547893, + "grad_norm": 0.9952169060707092, + "learning_rate": 6.479352916134526e-06, + "loss": 0.1293, + "step": 5440 + }, + { + "epoch": 20.017624521072797, + "grad_norm": 137.77610778808594, + "learning_rate": 6.470838654746702e-06, + "loss": 0.8465, + "step": 5450 + }, + { + "epoch": 20.0183908045977, + "grad_norm": 0.004174103494733572, + "learning_rate": 6.462324393358877e-06, + "loss": 0.9037, + "step": 5460 + }, + { + "epoch": 20.019157088122604, + "grad_norm": 0.019375838339328766, + "learning_rate": 6.4538101319710525e-06, + "loss": 0.5878, + "step": 5470 + }, + { + "epoch": 20.01992337164751, + "grad_norm": 0.01853746734559536, + "learning_rate": 6.445295870583227e-06, + "loss": 1.0949, + "step": 5480 + }, + { + "epoch": 20.02, + "eval_accuracy": 0.6888888888888889, + "eval_loss": 1.2866085767745972, + "eval_runtime": 13.2844, + "eval_samples_per_second": 3.387, + "eval_steps_per_second": 3.387, + "step": 5481 + }, + { + "epoch": 21.000689655172415, + "grad_norm": 0.010936838574707508, + "learning_rate": 6.4367816091954025e-06, + "loss": 1.8286, + "step": 5490 + }, + { + "epoch": 21.00145593869732, + "grad_norm": 0.0537358820438385, + "learning_rate": 6.428267347807578e-06, + "loss": 1.1297, + "step": 5500 + }, + { + "epoch": 21.002222222222223, + "grad_norm": 0.018108276650309563, + "learning_rate": 6.419753086419753e-06, + "loss": 0.2811, + "step": 5510 + }, + { + "epoch": 21.002988505747126, + "grad_norm": 0.00682029128074646, + "learning_rate": 6.411238825031929e-06, + "loss": 0.8071, + "step": 5520 + }, + { + "epoch": 21.00375478927203, + "grad_norm": 0.05539993569254875, + "learning_rate": 6.402724563644105e-06, + "loss": 0.9488, + "step": 5530 + }, + { + "epoch": 21.004521072796933, + "grad_norm": 0.2652181386947632, + "learning_rate": 6.39421030225628e-06, + "loss": 0.8627, + "step": 5540 + }, + { + "epoch": 21.00528735632184, + "grad_norm": 581.5076904296875, + "learning_rate": 6.385696040868455e-06, + "loss": 0.5053, + "step": 5550 + }, + { + "epoch": 21.006053639846744, + "grad_norm": 0.4918936789035797, + "learning_rate": 6.37718177948063e-06, + "loss": 0.0138, + "step": 5560 + }, + { + "epoch": 21.006819923371648, + "grad_norm": 0.37181881070137024, + "learning_rate": 6.368667518092806e-06, + "loss": 0.0198, + "step": 5570 + }, + { + "epoch": 21.00758620689655, + "grad_norm": 0.009263670071959496, + "learning_rate": 6.360153256704982e-06, + "loss": 1.2818, + "step": 5580 + }, + { + "epoch": 21.008352490421455, + "grad_norm": 0.0035500682424753904, + "learning_rate": 6.351638995317157e-06, + "loss": 0.6626, + "step": 5590 + }, + { + "epoch": 21.00911877394636, + "grad_norm": 391.8262023925781, + "learning_rate": 6.3431247339293325e-06, + "loss": 0.776, + "step": 5600 + }, + { + "epoch": 21.009885057471266, + "grad_norm": 227.4144287109375, + "learning_rate": 6.334610472541508e-06, + "loss": 1.1783, + "step": 5610 + }, + { + "epoch": 21.01065134099617, + "grad_norm": 0.007147862110286951, + "learning_rate": 6.3260962111536825e-06, + "loss": 0.0036, + "step": 5620 + }, + { + "epoch": 21.011417624521073, + "grad_norm": 0.0029464103281497955, + "learning_rate": 6.317581949765858e-06, + "loss": 1.3983, + "step": 5630 + }, + { + "epoch": 21.012183908045976, + "grad_norm": 0.2004404515028, + "learning_rate": 6.309067688378033e-06, + "loss": 1.3843, + "step": 5640 + }, + { + "epoch": 21.01295019157088, + "grad_norm": 0.06141708418726921, + "learning_rate": 6.300553426990209e-06, + "loss": 0.0749, + "step": 5650 + }, + { + "epoch": 21.013716475095784, + "grad_norm": 0.10852473974227905, + "learning_rate": 6.292039165602385e-06, + "loss": 0.256, + "step": 5660 + }, + { + "epoch": 21.01448275862069, + "grad_norm": 0.00618616072461009, + "learning_rate": 6.28352490421456e-06, + "loss": 0.4822, + "step": 5670 + }, + { + "epoch": 21.015249042145594, + "grad_norm": 348.1380615234375, + "learning_rate": 6.275010642826736e-06, + "loss": 1.542, + "step": 5680 + }, + { + "epoch": 21.016015325670498, + "grad_norm": 59.00828552246094, + "learning_rate": 6.26649638143891e-06, + "loss": 0.3801, + "step": 5690 + }, + { + "epoch": 21.0167816091954, + "grad_norm": 32.972965240478516, + "learning_rate": 6.257982120051086e-06, + "loss": 1.4585, + "step": 5700 + }, + { + "epoch": 21.017547892720305, + "grad_norm": 0.1276509165763855, + "learning_rate": 6.249467858663262e-06, + "loss": 0.5012, + "step": 5710 + }, + { + "epoch": 21.018314176245212, + "grad_norm": 0.27684059739112854, + "learning_rate": 6.240953597275437e-06, + "loss": 1.6095, + "step": 5720 + }, + { + "epoch": 21.019080459770116, + "grad_norm": 66.66431427001953, + "learning_rate": 6.2324393358876125e-06, + "loss": 1.5397, + "step": 5730 + }, + { + "epoch": 21.01984674329502, + "grad_norm": 2.397763967514038, + "learning_rate": 6.223925074499788e-06, + "loss": 1.2073, + "step": 5740 + }, + { + "epoch": 21.02, + "eval_accuracy": 0.7555555555555555, + "eval_loss": 1.033568024635315, + "eval_runtime": 14.5617, + "eval_samples_per_second": 3.09, + "eval_steps_per_second": 3.09, + "step": 5742 + }, + { + "epoch": 22.000613026819924, + "grad_norm": 0.05683393031358719, + "learning_rate": 6.2154108131119625e-06, + "loss": 0.4272, + "step": 5750 + }, + { + "epoch": 22.001379310344827, + "grad_norm": 0.4416787326335907, + "learning_rate": 6.206896551724138e-06, + "loss": 0.75, + "step": 5760 + }, + { + "epoch": 22.00214559386973, + "grad_norm": 0.009222766384482384, + "learning_rate": 6.198382290336313e-06, + "loss": 0.0079, + "step": 5770 + }, + { + "epoch": 22.002911877394634, + "grad_norm": 25.15428352355957, + "learning_rate": 6.189868028948489e-06, + "loss": 0.2564, + "step": 5780 + }, + { + "epoch": 22.00367816091954, + "grad_norm": 0.09567564725875854, + "learning_rate": 6.181353767560665e-06, + "loss": 0.9887, + "step": 5790 + }, + { + "epoch": 22.004444444444445, + "grad_norm": 0.02086619660258293, + "learning_rate": 6.17283950617284e-06, + "loss": 0.5132, + "step": 5800 + }, + { + "epoch": 22.00521072796935, + "grad_norm": 0.1771589070558548, + "learning_rate": 6.164325244785016e-06, + "loss": 0.623, + "step": 5810 + }, + { + "epoch": 22.005977011494252, + "grad_norm": 0.08775811642408371, + "learning_rate": 6.155810983397192e-06, + "loss": 0.0024, + "step": 5820 + }, + { + "epoch": 22.006743295019156, + "grad_norm": 0.9292959570884705, + "learning_rate": 6.147296722009366e-06, + "loss": 1.7215, + "step": 5830 + }, + { + "epoch": 22.00750957854406, + "grad_norm": 0.1804896742105484, + "learning_rate": 6.138782460621542e-06, + "loss": 1.1822, + "step": 5840 + }, + { + "epoch": 22.008275862068967, + "grad_norm": 0.0041860733181238174, + "learning_rate": 6.130268199233717e-06, + "loss": 1.1366, + "step": 5850 + }, + { + "epoch": 22.00904214559387, + "grad_norm": 0.029360786080360413, + "learning_rate": 6.1217539378458925e-06, + "loss": 0.8683, + "step": 5860 + }, + { + "epoch": 22.009808429118774, + "grad_norm": 0.3557896614074707, + "learning_rate": 6.113239676458068e-06, + "loss": 0.8134, + "step": 5870 + }, + { + "epoch": 22.010574712643677, + "grad_norm": 0.0051303138025105, + "learning_rate": 6.1047254150702425e-06, + "loss": 0.4578, + "step": 5880 + }, + { + "epoch": 22.01134099616858, + "grad_norm": 0.017394129186868668, + "learning_rate": 6.096211153682418e-06, + "loss": 0.9201, + "step": 5890 + }, + { + "epoch": 22.01210727969349, + "grad_norm": 0.4005459249019623, + "learning_rate": 6.087696892294594e-06, + "loss": 0.6393, + "step": 5900 + }, + { + "epoch": 22.012873563218392, + "grad_norm": 0.012193914502859116, + "learning_rate": 6.079182630906769e-06, + "loss": 0.8868, + "step": 5910 + }, + { + "epoch": 22.013639846743295, + "grad_norm": 0.42486461997032166, + "learning_rate": 6.070668369518945e-06, + "loss": 0.0113, + "step": 5920 + }, + { + "epoch": 22.0144061302682, + "grad_norm": 0.20244956016540527, + "learning_rate": 6.06215410813112e-06, + "loss": 0.0046, + "step": 5930 + }, + { + "epoch": 22.015172413793103, + "grad_norm": 0.005920723546296358, + "learning_rate": 6.053639846743296e-06, + "loss": 0.2765, + "step": 5940 + }, + { + "epoch": 22.015938697318006, + "grad_norm": 0.14310921728610992, + "learning_rate": 6.045125585355472e-06, + "loss": 1.2898, + "step": 5950 + }, + { + "epoch": 22.016704980842913, + "grad_norm": 0.08543712645769119, + "learning_rate": 6.036611323967646e-06, + "loss": 0.4782, + "step": 5960 + }, + { + "epoch": 22.017471264367817, + "grad_norm": 0.10066840052604675, + "learning_rate": 6.028097062579822e-06, + "loss": 0.2598, + "step": 5970 + }, + { + "epoch": 22.01823754789272, + "grad_norm": 0.3465437591075897, + "learning_rate": 6.019582801191997e-06, + "loss": 0.6391, + "step": 5980 + }, + { + "epoch": 22.019003831417624, + "grad_norm": 0.9225018620491028, + "learning_rate": 6.0110685398041725e-06, + "loss": 1.2283, + "step": 5990 + }, + { + "epoch": 22.019770114942528, + "grad_norm": 219.1271514892578, + "learning_rate": 6.002554278416348e-06, + "loss": 1.3534, + "step": 6000 + }, + { + "epoch": 22.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 1.8028539419174194, + "eval_runtime": 15.1916, + "eval_samples_per_second": 2.962, + "eval_steps_per_second": 2.962, + "step": 6003 + }, + { + "epoch": 23.000536398467432, + "grad_norm": 289.6174011230469, + "learning_rate": 5.9940400170285225e-06, + "loss": 1.1603, + "step": 6010 + }, + { + "epoch": 23.001302681992335, + "grad_norm": 0.2584246098995209, + "learning_rate": 5.985525755640698e-06, + "loss": 2.4426, + "step": 6020 + }, + { + "epoch": 23.002068965517243, + "grad_norm": 217.7529754638672, + "learning_rate": 5.977011494252874e-06, + "loss": 0.1134, + "step": 6030 + }, + { + "epoch": 23.002835249042146, + "grad_norm": 0.3803919851779938, + "learning_rate": 5.968497232865049e-06, + "loss": 0.2302, + "step": 6040 + }, + { + "epoch": 23.00360153256705, + "grad_norm": 0.02779291570186615, + "learning_rate": 5.959982971477225e-06, + "loss": 0.5589, + "step": 6050 + }, + { + "epoch": 23.004367816091953, + "grad_norm": 0.044463906437158585, + "learning_rate": 5.9514687100894e-06, + "loss": 0.4939, + "step": 6060 + }, + { + "epoch": 23.005134099616857, + "grad_norm": 0.021367818117141724, + "learning_rate": 5.942954448701576e-06, + "loss": 1.4178, + "step": 6070 + }, + { + "epoch": 23.005900383141764, + "grad_norm": 0.23603151738643646, + "learning_rate": 5.934440187313752e-06, + "loss": 0.6045, + "step": 6080 + }, + { + "epoch": 23.006666666666668, + "grad_norm": 0.0922141820192337, + "learning_rate": 5.925925925925926e-06, + "loss": 0.7727, + "step": 6090 + }, + { + "epoch": 23.00743295019157, + "grad_norm": 6.932680606842041, + "learning_rate": 5.917411664538102e-06, + "loss": 0.0075, + "step": 6100 + }, + { + "epoch": 23.008199233716475, + "grad_norm": 0.014019292779266834, + "learning_rate": 5.9088974031502775e-06, + "loss": 0.8208, + "step": 6110 + }, + { + "epoch": 23.00896551724138, + "grad_norm": 275.9669494628906, + "learning_rate": 5.9003831417624525e-06, + "loss": 0.9227, + "step": 6120 + }, + { + "epoch": 23.009731800766282, + "grad_norm": 0.7556512951850891, + "learning_rate": 5.891868880374628e-06, + "loss": 0.4839, + "step": 6130 + }, + { + "epoch": 23.01049808429119, + "grad_norm": 0.13979677855968475, + "learning_rate": 5.883354618986803e-06, + "loss": 0.0037, + "step": 6140 + }, + { + "epoch": 23.011264367816093, + "grad_norm": 71.52664184570312, + "learning_rate": 5.874840357598979e-06, + "loss": 0.708, + "step": 6150 + }, + { + "epoch": 23.012030651340996, + "grad_norm": 0.011060028336942196, + "learning_rate": 5.866326096211154e-06, + "loss": 0.5177, + "step": 6160 + }, + { + "epoch": 23.0127969348659, + "grad_norm": 53.9853401184082, + "learning_rate": 5.857811834823329e-06, + "loss": 1.2794, + "step": 6170 + }, + { + "epoch": 23.013563218390804, + "grad_norm": 0.004570251330733299, + "learning_rate": 5.849297573435505e-06, + "loss": 0.5029, + "step": 6180 + }, + { + "epoch": 23.014329501915707, + "grad_norm": 0.02976115606725216, + "learning_rate": 5.84078331204768e-06, + "loss": 1.0682, + "step": 6190 + }, + { + "epoch": 23.015095785440614, + "grad_norm": 0.003304552286863327, + "learning_rate": 5.832269050659856e-06, + "loss": 0.0027, + "step": 6200 + }, + { + "epoch": 23.015862068965518, + "grad_norm": 0.004245653282850981, + "learning_rate": 5.823754789272032e-06, + "loss": 0.5402, + "step": 6210 + }, + { + "epoch": 23.01662835249042, + "grad_norm": 0.0026863988023251295, + "learning_rate": 5.815240527884206e-06, + "loss": 0.1568, + "step": 6220 + }, + { + "epoch": 23.017394636015325, + "grad_norm": 0.047919392585754395, + "learning_rate": 5.806726266496382e-06, + "loss": 0.0018, + "step": 6230 + }, + { + "epoch": 23.01816091954023, + "grad_norm": 0.13500118255615234, + "learning_rate": 5.7982120051085575e-06, + "loss": 1.5147, + "step": 6240 + }, + { + "epoch": 23.018927203065132, + "grad_norm": 664.8700561523438, + "learning_rate": 5.7896977437207325e-06, + "loss": 0.5652, + "step": 6250 + }, + { + "epoch": 23.01969348659004, + "grad_norm": 0.010892790742218494, + "learning_rate": 5.781183482332908e-06, + "loss": 0.0423, + "step": 6260 + }, + { + "epoch": 23.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 1.457066297531128, + "eval_runtime": 16.4944, + "eval_samples_per_second": 2.728, + "eval_steps_per_second": 2.728, + "step": 6264 + }, + { + "epoch": 24.000459770114944, + "grad_norm": 54.31410598754883, + "learning_rate": 5.772669220945083e-06, + "loss": 0.7064, + "step": 6270 + }, + { + "epoch": 24.001226053639847, + "grad_norm": 755.3297119140625, + "learning_rate": 5.764154959557259e-06, + "loss": 1.8701, + "step": 6280 + }, + { + "epoch": 24.00199233716475, + "grad_norm": 0.004258429165929556, + "learning_rate": 5.755640698169435e-06, + "loss": 1.2701, + "step": 6290 + }, + { + "epoch": 24.002758620689654, + "grad_norm": 0.0329936183989048, + "learning_rate": 5.747126436781609e-06, + "loss": 2.0358, + "step": 6300 + }, + { + "epoch": 24.003524904214558, + "grad_norm": 0.7360894680023193, + "learning_rate": 5.738612175393785e-06, + "loss": 0.4042, + "step": 6310 + }, + { + "epoch": 24.004291187739465, + "grad_norm": 0.011302406899631023, + "learning_rate": 5.730097914005961e-06, + "loss": 1.0555, + "step": 6320 + }, + { + "epoch": 24.00505747126437, + "grad_norm": 5.05350399017334, + "learning_rate": 5.721583652618136e-06, + "loss": 0.7022, + "step": 6330 + }, + { + "epoch": 24.005823754789272, + "grad_norm": 557.7550048828125, + "learning_rate": 5.713069391230312e-06, + "loss": 0.6111, + "step": 6340 + }, + { + "epoch": 24.006590038314176, + "grad_norm": 2.5530898571014404, + "learning_rate": 5.704555129842486e-06, + "loss": 0.4165, + "step": 6350 + }, + { + "epoch": 24.00735632183908, + "grad_norm": 0.43523475527763367, + "learning_rate": 5.696040868454662e-06, + "loss": 0.7824, + "step": 6360 + }, + { + "epoch": 24.008122605363983, + "grad_norm": 49.91386032104492, + "learning_rate": 5.6875266070668375e-06, + "loss": 2.0053, + "step": 6370 + }, + { + "epoch": 24.00888888888889, + "grad_norm": 0.011766809970140457, + "learning_rate": 5.6790123456790125e-06, + "loss": 0.4156, + "step": 6380 + }, + { + "epoch": 24.009655172413794, + "grad_norm": 0.0064444891177117825, + "learning_rate": 5.670498084291188e-06, + "loss": 1.0425, + "step": 6390 + }, + { + "epoch": 24.010421455938697, + "grad_norm": 365.0169372558594, + "learning_rate": 5.661983822903364e-06, + "loss": 1.0558, + "step": 6400 + }, + { + "epoch": 24.0111877394636, + "grad_norm": 1.042160153388977, + "learning_rate": 5.653469561515539e-06, + "loss": 0.3502, + "step": 6410 + }, + { + "epoch": 24.011954022988505, + "grad_norm": 135.4849395751953, + "learning_rate": 5.644955300127715e-06, + "loss": 1.4494, + "step": 6420 + }, + { + "epoch": 24.01272030651341, + "grad_norm": 0.0020956031512469053, + "learning_rate": 5.636441038739889e-06, + "loss": 0.9078, + "step": 6430 + }, + { + "epoch": 24.013486590038315, + "grad_norm": 0.004369780886918306, + "learning_rate": 5.627926777352065e-06, + "loss": 0.0083, + "step": 6440 + }, + { + "epoch": 24.01425287356322, + "grad_norm": 0.1496662199497223, + "learning_rate": 5.619412515964241e-06, + "loss": 0.4498, + "step": 6450 + }, + { + "epoch": 24.015019157088123, + "grad_norm": 0.19664430618286133, + "learning_rate": 5.610898254576416e-06, + "loss": 0.4715, + "step": 6460 + }, + { + "epoch": 24.015785440613026, + "grad_norm": 2.650960683822632, + "learning_rate": 5.602383993188592e-06, + "loss": 0.5836, + "step": 6470 + }, + { + "epoch": 24.01655172413793, + "grad_norm": 0.002323655877262354, + "learning_rate": 5.593869731800766e-06, + "loss": 0.346, + "step": 6480 + }, + { + "epoch": 24.017318007662837, + "grad_norm": 0.03131331130862236, + "learning_rate": 5.585355470412942e-06, + "loss": 1.4927, + "step": 6490 + }, + { + "epoch": 24.01808429118774, + "grad_norm": 0.0025550054851919413, + "learning_rate": 5.5768412090251175e-06, + "loss": 1.1915, + "step": 6500 + }, + { + "epoch": 24.018850574712644, + "grad_norm": 811.5127563476562, + "learning_rate": 5.5683269476372925e-06, + "loss": 0.3877, + "step": 6510 + }, + { + "epoch": 24.019616858237548, + "grad_norm": 0.04266892001032829, + "learning_rate": 5.559812686249468e-06, + "loss": 1.0068, + "step": 6520 + }, + { + "epoch": 24.02, + "eval_accuracy": 0.6444444444444445, + "eval_loss": 1.7790253162384033, + "eval_runtime": 14.3666, + "eval_samples_per_second": 3.132, + "eval_steps_per_second": 3.132, + "step": 6525 + }, + { + "epoch": 25.000383141762452, + "grad_norm": 0.18075942993164062, + "learning_rate": 5.551298424861644e-06, + "loss": 0.0016, + "step": 6530 + }, + { + "epoch": 25.001149425287355, + "grad_norm": 0.08188223838806152, + "learning_rate": 5.542784163473819e-06, + "loss": 1.1096, + "step": 6540 + }, + { + "epoch": 25.00191570881226, + "grad_norm": 0.08906185626983643, + "learning_rate": 5.534269902085995e-06, + "loss": 1.2369, + "step": 6550 + }, + { + "epoch": 25.002681992337166, + "grad_norm": 90.94135284423828, + "learning_rate": 5.525755640698169e-06, + "loss": 1.1431, + "step": 6560 + }, + { + "epoch": 25.00344827586207, + "grad_norm": 0.11608108878135681, + "learning_rate": 5.517241379310345e-06, + "loss": 0.3466, + "step": 6570 + }, + { + "epoch": 25.004214559386973, + "grad_norm": 1376.1837158203125, + "learning_rate": 5.508727117922521e-06, + "loss": 0.1968, + "step": 6580 + }, + { + "epoch": 25.004980842911877, + "grad_norm": 0.018664706498384476, + "learning_rate": 5.500212856534696e-06, + "loss": 0.441, + "step": 6590 + }, + { + "epoch": 25.00574712643678, + "grad_norm": 0.007804445456713438, + "learning_rate": 5.491698595146872e-06, + "loss": 0.5534, + "step": 6600 + }, + { + "epoch": 25.006513409961684, + "grad_norm": 0.017437539994716644, + "learning_rate": 5.4831843337590475e-06, + "loss": 0.0191, + "step": 6610 + }, + { + "epoch": 25.00727969348659, + "grad_norm": 5.701467990875244, + "learning_rate": 5.474670072371222e-06, + "loss": 1.3893, + "step": 6620 + }, + { + "epoch": 25.008045977011495, + "grad_norm": 0.06029016524553299, + "learning_rate": 5.4661558109833975e-06, + "loss": 0.0441, + "step": 6630 + }, + { + "epoch": 25.0088122605364, + "grad_norm": 0.007205882575362921, + "learning_rate": 5.4576415495955725e-06, + "loss": 0.3946, + "step": 6640 + }, + { + "epoch": 25.009578544061302, + "grad_norm": 0.004521137103438377, + "learning_rate": 5.449127288207748e-06, + "loss": 0.5575, + "step": 6650 + }, + { + "epoch": 25.010344827586206, + "grad_norm": 63.423763275146484, + "learning_rate": 5.440613026819924e-06, + "loss": 0.7411, + "step": 6660 + }, + { + "epoch": 25.011111111111113, + "grad_norm": 0.05708396062254906, + "learning_rate": 5.432098765432099e-06, + "loss": 0.6057, + "step": 6670 + }, + { + "epoch": 25.011877394636016, + "grad_norm": 0.01645585335791111, + "learning_rate": 5.423584504044275e-06, + "loss": 1.1414, + "step": 6680 + }, + { + "epoch": 25.01264367816092, + "grad_norm": 0.03839023783802986, + "learning_rate": 5.415070242656451e-06, + "loss": 1.191, + "step": 6690 + }, + { + "epoch": 25.013409961685824, + "grad_norm": 0.14333435893058777, + "learning_rate": 5.406555981268625e-06, + "loss": 0.444, + "step": 6700 + }, + { + "epoch": 25.014176245210727, + "grad_norm": 0.006897382903844118, + "learning_rate": 5.398041719880801e-06, + "loss": 0.5621, + "step": 6710 + }, + { + "epoch": 25.01494252873563, + "grad_norm": 0.0030311744194477797, + "learning_rate": 5.389527458492976e-06, + "loss": 0.6271, + "step": 6720 + }, + { + "epoch": 25.015708812260538, + "grad_norm": 0.03591395169496536, + "learning_rate": 5.381013197105152e-06, + "loss": 0.4289, + "step": 6730 + }, + { + "epoch": 25.01647509578544, + "grad_norm": 130.26043701171875, + "learning_rate": 5.3724989357173275e-06, + "loss": 1.6805, + "step": 6740 + }, + { + "epoch": 25.017241379310345, + "grad_norm": 0.023178286850452423, + "learning_rate": 5.3639846743295025e-06, + "loss": 1.0406, + "step": 6750 + }, + { + "epoch": 25.01800766283525, + "grad_norm": 0.0045823934487998486, + "learning_rate": 5.3554704129416775e-06, + "loss": 0.3643, + "step": 6760 + }, + { + "epoch": 25.018773946360152, + "grad_norm": 1255.3531494140625, + "learning_rate": 5.3469561515538525e-06, + "loss": 0.3438, + "step": 6770 + }, + { + "epoch": 25.019540229885056, + "grad_norm": 0.820978581905365, + "learning_rate": 5.338441890166028e-06, + "loss": 1.5772, + "step": 6780 + }, + { + "epoch": 25.02, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 1.7893223762512207, + "eval_runtime": 15.2457, + "eval_samples_per_second": 2.952, + "eval_steps_per_second": 2.952, + "step": 6786 + }, + { + "epoch": 26.00030651340996, + "grad_norm": 0.3054686188697815, + "learning_rate": 5.329927628778204e-06, + "loss": 0.7461, + "step": 6790 + }, + { + "epoch": 26.001072796934867, + "grad_norm": 0.04109475016593933, + "learning_rate": 5.321413367390379e-06, + "loss": 0.0492, + "step": 6800 + }, + { + "epoch": 26.00183908045977, + "grad_norm": 46.89008712768555, + "learning_rate": 5.312899106002555e-06, + "loss": 0.6362, + "step": 6810 + }, + { + "epoch": 26.002605363984674, + "grad_norm": 0.004781241994351149, + "learning_rate": 5.304384844614731e-06, + "loss": 1.1682, + "step": 6820 + }, + { + "epoch": 26.003371647509578, + "grad_norm": 0.21307355165481567, + "learning_rate": 5.295870583226905e-06, + "loss": 0.5552, + "step": 6830 + }, + { + "epoch": 26.00413793103448, + "grad_norm": 0.026560096070170403, + "learning_rate": 5.287356321839081e-06, + "loss": 0.7666, + "step": 6840 + }, + { + "epoch": 26.00490421455939, + "grad_norm": 0.003154867561534047, + "learning_rate": 5.278842060451256e-06, + "loss": 0.4521, + "step": 6850 + }, + { + "epoch": 26.005670498084292, + "grad_norm": 0.25655969977378845, + "learning_rate": 5.270327799063432e-06, + "loss": 0.4738, + "step": 6860 + }, + { + "epoch": 26.006436781609196, + "grad_norm": 0.053296688944101334, + "learning_rate": 5.2618135376756075e-06, + "loss": 0.0045, + "step": 6870 + }, + { + "epoch": 26.0072030651341, + "grad_norm": 0.027281196787953377, + "learning_rate": 5.2532992762877825e-06, + "loss": 0.6215, + "step": 6880 + }, + { + "epoch": 26.007969348659003, + "grad_norm": 0.02793821506202221, + "learning_rate": 5.244785014899958e-06, + "loss": 0.0015, + "step": 6890 + }, + { + "epoch": 26.008735632183907, + "grad_norm": 0.005113948602229357, + "learning_rate": 5.236270753512134e-06, + "loss": 0.0007, + "step": 6900 + }, + { + "epoch": 26.009501915708814, + "grad_norm": 0.1896917223930359, + "learning_rate": 5.227756492124308e-06, + "loss": 0.7445, + "step": 6910 + }, + { + "epoch": 26.010268199233717, + "grad_norm": 0.02426121011376381, + "learning_rate": 5.219242230736484e-06, + "loss": 0.9747, + "step": 6920 + }, + { + "epoch": 26.01103448275862, + "grad_norm": 0.15907245874404907, + "learning_rate": 5.210727969348659e-06, + "loss": 0.1155, + "step": 6930 + }, + { + "epoch": 26.011800766283525, + "grad_norm": 0.09672944247722626, + "learning_rate": 5.202213707960835e-06, + "loss": 0.0018, + "step": 6940 + }, + { + "epoch": 26.01256704980843, + "grad_norm": 0.00523536279797554, + "learning_rate": 5.193699446573011e-06, + "loss": 0.1146, + "step": 6950 + }, + { + "epoch": 26.013333333333332, + "grad_norm": 21.234874725341797, + "learning_rate": 5.185185185185185e-06, + "loss": 1.1605, + "step": 6960 + }, + { + "epoch": 26.01409961685824, + "grad_norm": 0.0017688468797132373, + "learning_rate": 5.176670923797361e-06, + "loss": 0.0008, + "step": 6970 + }, + { + "epoch": 26.014865900383143, + "grad_norm": 31.0344181060791, + "learning_rate": 5.168156662409536e-06, + "loss": 3.1422, + "step": 6980 + }, + { + "epoch": 26.015632183908046, + "grad_norm": 19.070343017578125, + "learning_rate": 5.159642401021712e-06, + "loss": 1.0949, + "step": 6990 + }, + { + "epoch": 26.01639846743295, + "grad_norm": 0.05039878934621811, + "learning_rate": 5.1511281396338875e-06, + "loss": 1.7007, + "step": 7000 + }, + { + "epoch": 26.017164750957853, + "grad_norm": 0.08749368041753769, + "learning_rate": 5.1426138782460625e-06, + "loss": 0.8494, + "step": 7010 + }, + { + "epoch": 26.017931034482757, + "grad_norm": 0.12880153954029083, + "learning_rate": 5.134099616858238e-06, + "loss": 1.417, + "step": 7020 + }, + { + "epoch": 26.018697318007664, + "grad_norm": 7.412830829620361, + "learning_rate": 5.125585355470414e-06, + "loss": 0.0061, + "step": 7030 + }, + { + "epoch": 26.019463601532568, + "grad_norm": 0.015643224120140076, + "learning_rate": 5.117071094082588e-06, + "loss": 0.8409, + "step": 7040 + }, + { + "epoch": 26.02, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 1.6453967094421387, + "eval_runtime": 14.268, + "eval_samples_per_second": 3.154, + "eval_steps_per_second": 3.154, + "step": 7047 + }, + { + "epoch": 27.000229885057472, + "grad_norm": 19.965852737426758, + "learning_rate": 5.108556832694764e-06, + "loss": 0.658, + "step": 7050 + }, + { + "epoch": 27.000996168582375, + "grad_norm": 2.890854597091675, + "learning_rate": 5.100042571306939e-06, + "loss": 0.5235, + "step": 7060 + }, + { + "epoch": 27.00176245210728, + "grad_norm": 0.012044958770275116, + "learning_rate": 5.091528309919115e-06, + "loss": 0.006, + "step": 7070 + }, + { + "epoch": 27.002528735632183, + "grad_norm": 4.6264967918396, + "learning_rate": 5.083014048531291e-06, + "loss": 0.0541, + "step": 7080 + }, + { + "epoch": 27.00329501915709, + "grad_norm": 0.008120655082166195, + "learning_rate": 5.074499787143465e-06, + "loss": 1.1141, + "step": 7090 + }, + { + "epoch": 27.004061302681993, + "grad_norm": 0.02046016789972782, + "learning_rate": 5.065985525755641e-06, + "loss": 0.0064, + "step": 7100 + }, + { + "epoch": 27.004827586206897, + "grad_norm": 0.04679565131664276, + "learning_rate": 5.057471264367817e-06, + "loss": 0.4713, + "step": 7110 + }, + { + "epoch": 27.0055938697318, + "grad_norm": 0.003998655825853348, + "learning_rate": 5.048957002979992e-06, + "loss": 0.8596, + "step": 7120 + }, + { + "epoch": 27.006360153256704, + "grad_norm": 0.0444590263068676, + "learning_rate": 5.0404427415921675e-06, + "loss": 1.0469, + "step": 7130 + }, + { + "epoch": 27.007126436781608, + "grad_norm": 0.021894382312893867, + "learning_rate": 5.0319284802043425e-06, + "loss": 0.0378, + "step": 7140 + }, + { + "epoch": 27.007892720306515, + "grad_norm": 57.658607482910156, + "learning_rate": 5.023414218816518e-06, + "loss": 2.1019, + "step": 7150 + }, + { + "epoch": 27.00865900383142, + "grad_norm": 0.034019336104393005, + "learning_rate": 5.014899957428694e-06, + "loss": 0.4746, + "step": 7160 + }, + { + "epoch": 27.009425287356322, + "grad_norm": 0.005097901914268732, + "learning_rate": 5.006385696040868e-06, + "loss": 0.0269, + "step": 7170 + }, + { + "epoch": 27.010191570881226, + "grad_norm": 0.5953840017318726, + "learning_rate": 4.997871434653044e-06, + "loss": 1.6232, + "step": 7180 + }, + { + "epoch": 27.01095785440613, + "grad_norm": 0.003489244729280472, + "learning_rate": 4.98935717326522e-06, + "loss": 0.7787, + "step": 7190 + }, + { + "epoch": 27.011724137931033, + "grad_norm": 809.7433471679688, + "learning_rate": 4.980842911877395e-06, + "loss": 1.401, + "step": 7200 + }, + { + "epoch": 27.01249042145594, + "grad_norm": 0.12849143147468567, + "learning_rate": 4.972328650489571e-06, + "loss": 0.4929, + "step": 7210 + }, + { + "epoch": 27.013256704980844, + "grad_norm": 0.3912018835544586, + "learning_rate": 4.963814389101746e-06, + "loss": 0.0422, + "step": 7220 + }, + { + "epoch": 27.014022988505747, + "grad_norm": 0.024638323113322258, + "learning_rate": 4.955300127713921e-06, + "loss": 1.1046, + "step": 7230 + }, + { + "epoch": 27.01478927203065, + "grad_norm": 2.2969040870666504, + "learning_rate": 4.946785866326097e-06, + "loss": 1.0911, + "step": 7240 + }, + { + "epoch": 27.015555555555554, + "grad_norm": 0.24452875554561615, + "learning_rate": 4.938271604938272e-06, + "loss": 0.5244, + "step": 7250 + }, + { + "epoch": 27.016321839080458, + "grad_norm": 0.004459705203771591, + "learning_rate": 4.9297573435504475e-06, + "loss": 0.4651, + "step": 7260 + }, + { + "epoch": 27.017088122605365, + "grad_norm": 0.05164725333452225, + "learning_rate": 4.9212430821626225e-06, + "loss": 0.0037, + "step": 7270 + }, + { + "epoch": 27.01785440613027, + "grad_norm": 0.25497305393218994, + "learning_rate": 4.912728820774798e-06, + "loss": 0.5484, + "step": 7280 + }, + { + "epoch": 27.018620689655172, + "grad_norm": 116.34848022460938, + "learning_rate": 4.904214559386973e-06, + "loss": 0.0093, + "step": 7290 + }, + { + "epoch": 27.019386973180076, + "grad_norm": 0.22834795713424683, + "learning_rate": 4.895700297999149e-06, + "loss": 0.6828, + "step": 7300 + }, + { + "epoch": 27.02, + "eval_accuracy": 0.6888888888888889, + "eval_loss": 1.852113962173462, + "eval_runtime": 15.2767, + "eval_samples_per_second": 2.946, + "eval_steps_per_second": 2.946, + "step": 7308 + }, + { + "epoch": 28.00015325670498, + "grad_norm": 0.01499945018440485, + "learning_rate": 4.887186036611324e-06, + "loss": 1.3009, + "step": 7310 + }, + { + "epoch": 28.000919540229884, + "grad_norm": 20.283266067504883, + "learning_rate": 4.8786717752235e-06, + "loss": 1.0598, + "step": 7320 + }, + { + "epoch": 28.00168582375479, + "grad_norm": 680.4053344726562, + "learning_rate": 4.870157513835675e-06, + "loss": 0.9413, + "step": 7330 + }, + { + "epoch": 28.002452107279694, + "grad_norm": 0.0037998862098902464, + "learning_rate": 4.861643252447851e-06, + "loss": 1.6265, + "step": 7340 + }, + { + "epoch": 28.003218390804598, + "grad_norm": 0.028214074671268463, + "learning_rate": 4.853128991060026e-06, + "loss": 0.2507, + "step": 7350 + }, + { + "epoch": 28.0039846743295, + "grad_norm": 0.27702319622039795, + "learning_rate": 4.844614729672202e-06, + "loss": 0.7148, + "step": 7360 + }, + { + "epoch": 28.004750957854405, + "grad_norm": 88.49553680419922, + "learning_rate": 4.836100468284377e-06, + "loss": 1.1467, + "step": 7370 + }, + { + "epoch": 28.00551724137931, + "grad_norm": 195.29783630371094, + "learning_rate": 4.8275862068965525e-06, + "loss": 1.0426, + "step": 7380 + }, + { + "epoch": 28.006283524904216, + "grad_norm": 0.013184229843318462, + "learning_rate": 4.8190719455087275e-06, + "loss": 0.6131, + "step": 7390 + }, + { + "epoch": 28.00704980842912, + "grad_norm": 824.2332763671875, + "learning_rate": 4.8105576841209025e-06, + "loss": 0.5226, + "step": 7400 + }, + { + "epoch": 28.007816091954023, + "grad_norm": 0.014120840467512608, + "learning_rate": 4.802043422733078e-06, + "loss": 0.6207, + "step": 7410 + }, + { + "epoch": 28.008582375478927, + "grad_norm": 0.0181321669369936, + "learning_rate": 4.793529161345254e-06, + "loss": 0.5579, + "step": 7420 + }, + { + "epoch": 28.00934865900383, + "grad_norm": 0.033630553632974625, + "learning_rate": 4.785014899957429e-06, + "loss": 0.2249, + "step": 7430 + }, + { + "epoch": 28.010114942528734, + "grad_norm": 0.004489063750952482, + "learning_rate": 4.776500638569604e-06, + "loss": 0.4761, + "step": 7440 + }, + { + "epoch": 28.01088122605364, + "grad_norm": 0.01676010712981224, + "learning_rate": 4.76798637718178e-06, + "loss": 0.0105, + "step": 7450 + }, + { + "epoch": 28.011647509578545, + "grad_norm": 0.0805138424038887, + "learning_rate": 4.759472115793956e-06, + "loss": 0.786, + "step": 7460 + }, + { + "epoch": 28.01241379310345, + "grad_norm": 0.4894491136074066, + "learning_rate": 4.750957854406131e-06, + "loss": 0.4835, + "step": 7470 + }, + { + "epoch": 28.013180076628352, + "grad_norm": 0.00567967863753438, + "learning_rate": 4.742443593018306e-06, + "loss": 0.92, + "step": 7480 + }, + { + "epoch": 28.013946360153255, + "grad_norm": 0.13989655673503876, + "learning_rate": 4.733929331630482e-06, + "loss": 0.0026, + "step": 7490 + }, + { + "epoch": 28.014712643678163, + "grad_norm": 0.006515056826174259, + "learning_rate": 4.7254150702426575e-06, + "loss": 0.0017, + "step": 7500 + }, + { + "epoch": 28.015478927203066, + "grad_norm": 0.013979991897940636, + "learning_rate": 4.7169008088548325e-06, + "loss": 0.8258, + "step": 7510 + }, + { + "epoch": 28.01624521072797, + "grad_norm": 0.00985270831733942, + "learning_rate": 4.7083865474670075e-06, + "loss": 1.0633, + "step": 7520 + }, + { + "epoch": 28.017011494252873, + "grad_norm": 0.5813823342323303, + "learning_rate": 4.6998722860791825e-06, + "loss": 0.882, + "step": 7530 + }, + { + "epoch": 28.017777777777777, + "grad_norm": 0.03663960471749306, + "learning_rate": 4.691358024691358e-06, + "loss": 0.9276, + "step": 7540 + }, + { + "epoch": 28.01854406130268, + "grad_norm": 0.22019168734550476, + "learning_rate": 4.682843763303534e-06, + "loss": 0.5073, + "step": 7550 + }, + { + "epoch": 28.019310344827588, + "grad_norm": 0.007242546882480383, + "learning_rate": 4.674329501915709e-06, + "loss": 0.5191, + "step": 7560 + }, + { + "epoch": 28.02, + "eval_accuracy": 0.7555555555555555, + "eval_loss": 1.2733657360076904, + "eval_runtime": 14.1698, + "eval_samples_per_second": 3.176, + "eval_steps_per_second": 3.176, + "step": 7569 + }, + { + "epoch": 29.000076628352492, + "grad_norm": 0.017437102273106575, + "learning_rate": 4.665815240527884e-06, + "loss": 0.2045, + "step": 7570 + }, + { + "epoch": 29.000842911877395, + "grad_norm": 0.7838713526725769, + "learning_rate": 4.65730097914006e-06, + "loss": 0.0042, + "step": 7580 + }, + { + "epoch": 29.0016091954023, + "grad_norm": 0.003179553197696805, + "learning_rate": 4.648786717752236e-06, + "loss": 0.0065, + "step": 7590 + }, + { + "epoch": 29.002375478927203, + "grad_norm": 0.0024605966173112392, + "learning_rate": 4.640272456364411e-06, + "loss": 0.6129, + "step": 7600 + }, + { + "epoch": 29.003141762452106, + "grad_norm": 0.1865365356206894, + "learning_rate": 4.631758194976586e-06, + "loss": 0.6739, + "step": 7610 + }, + { + "epoch": 29.00390804597701, + "grad_norm": 0.08629602938890457, + "learning_rate": 4.623243933588762e-06, + "loss": 0.0105, + "step": 7620 + }, + { + "epoch": 29.004674329501917, + "grad_norm": 0.002063839230686426, + "learning_rate": 4.6147296722009375e-06, + "loss": 0.5017, + "step": 7630 + }, + { + "epoch": 29.00544061302682, + "grad_norm": 0.19979241490364075, + "learning_rate": 4.6062154108131125e-06, + "loss": 1.6036, + "step": 7640 + }, + { + "epoch": 29.006206896551724, + "grad_norm": 0.009347641840577126, + "learning_rate": 4.5977011494252875e-06, + "loss": 0.2936, + "step": 7650 + }, + { + "epoch": 29.006973180076628, + "grad_norm": 0.43149271607398987, + "learning_rate": 4.589186888037463e-06, + "loss": 0.4749, + "step": 7660 + }, + { + "epoch": 29.00773946360153, + "grad_norm": 0.0016137725906446576, + "learning_rate": 4.580672626649638e-06, + "loss": 0.002, + "step": 7670 + }, + { + "epoch": 29.00850574712644, + "grad_norm": 0.0033685702364891768, + "learning_rate": 4.572158365261814e-06, + "loss": 0.4395, + "step": 7680 + }, + { + "epoch": 29.009272030651342, + "grad_norm": 0.01022439356893301, + "learning_rate": 4.563644103873989e-06, + "loss": 0.0013, + "step": 7690 + }, + { + "epoch": 29.010038314176246, + "grad_norm": 0.022741427645087242, + "learning_rate": 4.555129842486164e-06, + "loss": 0.0026, + "step": 7700 + }, + { + "epoch": 29.01080459770115, + "grad_norm": 0.0013971420703455806, + "learning_rate": 4.54661558109834e-06, + "loss": 1.2997, + "step": 7710 + }, + { + "epoch": 29.011570881226053, + "grad_norm": 0.0018495945259928703, + "learning_rate": 4.538101319710516e-06, + "loss": 0.0621, + "step": 7720 + }, + { + "epoch": 29.012337164750956, + "grad_norm": 8.727816581726074, + "learning_rate": 4.529587058322691e-06, + "loss": 0.6664, + "step": 7730 + }, + { + "epoch": 29.013103448275864, + "grad_norm": 0.003989522345364094, + "learning_rate": 4.521072796934866e-06, + "loss": 0.046, + "step": 7740 + }, + { + "epoch": 29.013869731800767, + "grad_norm": 0.1331661492586136, + "learning_rate": 4.512558535547042e-06, + "loss": 0.9251, + "step": 7750 + }, + { + "epoch": 29.01463601532567, + "grad_norm": 0.17615145444869995, + "learning_rate": 4.5040442741592175e-06, + "loss": 0.4809, + "step": 7760 + }, + { + "epoch": 29.015402298850574, + "grad_norm": 0.0029606192838400602, + "learning_rate": 4.4955300127713925e-06, + "loss": 0.3072, + "step": 7770 + }, + { + "epoch": 29.016168582375478, + "grad_norm": 1.035951852798462, + "learning_rate": 4.4870157513835675e-06, + "loss": 0.0023, + "step": 7780 + }, + { + "epoch": 29.01693486590038, + "grad_norm": 88.19917297363281, + "learning_rate": 4.478501489995743e-06, + "loss": 1.0988, + "step": 7790 + }, + { + "epoch": 29.01770114942529, + "grad_norm": 0.0017601007129997015, + "learning_rate": 4.469987228607919e-06, + "loss": 0.4938, + "step": 7800 + }, + { + "epoch": 29.018467432950192, + "grad_norm": 0.001365689910016954, + "learning_rate": 4.461472967220094e-06, + "loss": 1.212, + "step": 7810 + }, + { + "epoch": 29.019233716475096, + "grad_norm": 4.030818939208984, + "learning_rate": 4.452958705832269e-06, + "loss": 1.0075, + "step": 7820 + }, + { + "epoch": 29.02, + "grad_norm": 0.0018792204791679978, + "learning_rate": 4.444444444444444e-06, + "loss": 0.4537, + "step": 7830 + }, + { + "epoch": 29.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 1.8098787069320679, + "eval_runtime": 16.2348, + "eval_samples_per_second": 2.772, + "eval_steps_per_second": 2.772, + "step": 7830 + }, + { + "epoch": 30.000766283524904, + "grad_norm": 0.0031461918260902166, + "learning_rate": 4.43593018305662e-06, + "loss": 0.7998, + "step": 7840 + }, + { + "epoch": 30.001532567049807, + "grad_norm": 0.21870797872543335, + "learning_rate": 4.427415921668796e-06, + "loss": 0.006, + "step": 7850 + }, + { + "epoch": 30.002298850574714, + "grad_norm": 0.04204126074910164, + "learning_rate": 4.418901660280971e-06, + "loss": 0.0008, + "step": 7860 + }, + { + "epoch": 30.003065134099618, + "grad_norm": 0.053357142955064774, + "learning_rate": 4.410387398893146e-06, + "loss": 0.6211, + "step": 7870 + }, + { + "epoch": 30.00383141762452, + "grad_norm": 0.0014875102788209915, + "learning_rate": 4.401873137505322e-06, + "loss": 0.588, + "step": 7880 + }, + { + "epoch": 30.004597701149425, + "grad_norm": 0.14551614224910736, + "learning_rate": 4.3933588761174975e-06, + "loss": 0.434, + "step": 7890 + }, + { + "epoch": 30.00536398467433, + "grad_norm": 0.003355887020006776, + "learning_rate": 4.3848446147296725e-06, + "loss": 0.4216, + "step": 7900 + }, + { + "epoch": 30.006130268199232, + "grad_norm": 235.74745178222656, + "learning_rate": 4.3763303533418475e-06, + "loss": 1.7108, + "step": 7910 + }, + { + "epoch": 30.00689655172414, + "grad_norm": 0.4113011658191681, + "learning_rate": 4.367816091954023e-06, + "loss": 0.9043, + "step": 7920 + }, + { + "epoch": 30.007662835249043, + "grad_norm": 359.8921813964844, + "learning_rate": 4.359301830566199e-06, + "loss": 0.5145, + "step": 7930 + }, + { + "epoch": 30.008429118773947, + "grad_norm": 5.985605239868164, + "learning_rate": 4.350787569178374e-06, + "loss": 0.0965, + "step": 7940 + }, + { + "epoch": 30.00919540229885, + "grad_norm": 0.07877785712480545, + "learning_rate": 4.342273307790549e-06, + "loss": 0.577, + "step": 7950 + }, + { + "epoch": 30.009961685823754, + "grad_norm": 0.01540356408804655, + "learning_rate": 4.333759046402725e-06, + "loss": 0.2528, + "step": 7960 + }, + { + "epoch": 30.010727969348657, + "grad_norm": 0.06074034050107002, + "learning_rate": 4.325244785014901e-06, + "loss": 0.0008, + "step": 7970 + }, + { + "epoch": 30.011494252873565, + "grad_norm": 0.17096516489982605, + "learning_rate": 4.316730523627076e-06, + "loss": 0.0174, + "step": 7980 + }, + { + "epoch": 30.01226053639847, + "grad_norm": 0.005554437171667814, + "learning_rate": 4.308216262239251e-06, + "loss": 0.5229, + "step": 7990 + }, + { + "epoch": 30.013026819923372, + "grad_norm": 0.231408953666687, + "learning_rate": 4.299702000851427e-06, + "loss": 0.938, + "step": 8000 + }, + { + "epoch": 30.013793103448275, + "grad_norm": 0.001429896685294807, + "learning_rate": 4.291187739463602e-06, + "loss": 1.4188, + "step": 8010 + }, + { + "epoch": 30.01455938697318, + "grad_norm": 0.17686352133750916, + "learning_rate": 4.2826734780757775e-06, + "loss": 0.5479, + "step": 8020 + }, + { + "epoch": 30.015325670498083, + "grad_norm": 0.00278076040558517, + "learning_rate": 4.2741592166879525e-06, + "loss": 0.0121, + "step": 8030 + }, + { + "epoch": 30.01609195402299, + "grad_norm": 306.07781982421875, + "learning_rate": 4.2656449553001275e-06, + "loss": 0.5206, + "step": 8040 + }, + { + "epoch": 30.016858237547893, + "grad_norm": 0.002114189090207219, + "learning_rate": 4.257130693912303e-06, + "loss": 0.6451, + "step": 8050 + }, + { + "epoch": 30.017624521072797, + "grad_norm": 0.0014462157851085067, + "learning_rate": 4.248616432524479e-06, + "loss": 0.591, + "step": 8060 + }, + { + "epoch": 30.0183908045977, + "grad_norm": 0.09331923723220825, + "learning_rate": 4.240102171136654e-06, + "loss": 1.0715, + "step": 8070 + }, + { + "epoch": 30.019157088122604, + "grad_norm": 153.68565368652344, + "learning_rate": 4.231587909748829e-06, + "loss": 2.1039, + "step": 8080 + }, + { + "epoch": 30.01992337164751, + "grad_norm": 0.002513670129701495, + "learning_rate": 4.223073648361005e-06, + "loss": 0.003, + "step": 8090 + }, + { + "epoch": 30.02, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 1.5859893560409546, + "eval_runtime": 15.4753, + "eval_samples_per_second": 2.908, + "eval_steps_per_second": 2.908, + "step": 8091 + }, + { + "epoch": 31.000689655172415, + "grad_norm": 0.5041216611862183, + "learning_rate": 4.214559386973181e-06, + "loss": 0.4334, + "step": 8100 + }, + { + "epoch": 31.00145593869732, + "grad_norm": 0.0025321398861706257, + "learning_rate": 4.206045125585356e-06, + "loss": 0.0029, + "step": 8110 + }, + { + "epoch": 31.002222222222223, + "grad_norm": 0.005120292771607637, + "learning_rate": 4.197530864197531e-06, + "loss": 0.4062, + "step": 8120 + }, + { + "epoch": 31.002988505747126, + "grad_norm": 72.58399963378906, + "learning_rate": 4.189016602809707e-06, + "loss": 0.0169, + "step": 8130 + }, + { + "epoch": 31.00375478927203, + "grad_norm": 0.001410025986842811, + "learning_rate": 4.180502341421882e-06, + "loss": 0.4409, + "step": 8140 + }, + { + "epoch": 31.004521072796933, + "grad_norm": 0.07055753469467163, + "learning_rate": 4.1719880800340575e-06, + "loss": 0.0572, + "step": 8150 + }, + { + "epoch": 31.00528735632184, + "grad_norm": 0.011598301120102406, + "learning_rate": 4.1634738186462325e-06, + "loss": 0.4805, + "step": 8160 + }, + { + "epoch": 31.006053639846744, + "grad_norm": 0.003230190370231867, + "learning_rate": 4.154959557258408e-06, + "loss": 0.0171, + "step": 8170 + }, + { + "epoch": 31.006819923371648, + "grad_norm": 482.70013427734375, + "learning_rate": 4.146445295870583e-06, + "loss": 0.8875, + "step": 8180 + }, + { + "epoch": 31.00758620689655, + "grad_norm": 0.0022959550842642784, + "learning_rate": 4.137931034482759e-06, + "loss": 1.376, + "step": 8190 + }, + { + "epoch": 31.008352490421455, + "grad_norm": 141.5695343017578, + "learning_rate": 4.129416773094934e-06, + "loss": 1.0537, + "step": 8200 + }, + { + "epoch": 31.00911877394636, + "grad_norm": 0.06153375282883644, + "learning_rate": 4.12090251170711e-06, + "loss": 0.5831, + "step": 8210 + }, + { + "epoch": 31.009885057471266, + "grad_norm": 0.09612569957971573, + "learning_rate": 4.112388250319285e-06, + "loss": 1.1069, + "step": 8220 + }, + { + "epoch": 31.01065134099617, + "grad_norm": 431.49713134765625, + "learning_rate": 4.103873988931461e-06, + "loss": 0.8904, + "step": 8230 + }, + { + "epoch": 31.011417624521073, + "grad_norm": 0.008050643838942051, + "learning_rate": 4.095359727543636e-06, + "loss": 0.0011, + "step": 8240 + }, + { + "epoch": 31.012183908045976, + "grad_norm": 0.009870373643934727, + "learning_rate": 4.086845466155812e-06, + "loss": 0.2455, + "step": 8250 + }, + { + "epoch": 31.01295019157088, + "grad_norm": 0.027511747553944588, + "learning_rate": 4.078331204767987e-06, + "loss": 0.0028, + "step": 8260 + }, + { + "epoch": 31.013716475095784, + "grad_norm": 0.011268673464655876, + "learning_rate": 4.0698169433801625e-06, + "loss": 0.5757, + "step": 8270 + }, + { + "epoch": 31.01448275862069, + "grad_norm": 0.17219111323356628, + "learning_rate": 4.0613026819923375e-06, + "loss": 1.1817, + "step": 8280 + }, + { + "epoch": 31.015249042145594, + "grad_norm": 0.0020057205110788345, + "learning_rate": 4.052788420604513e-06, + "loss": 0.0356, + "step": 8290 + }, + { + "epoch": 31.016015325670498, + "grad_norm": 0.03642117604613304, + "learning_rate": 4.044274159216688e-06, + "loss": 0.011, + "step": 8300 + }, + { + "epoch": 31.0167816091954, + "grad_norm": 0.18827003240585327, + "learning_rate": 4.035759897828863e-06, + "loss": 1.0117, + "step": 8310 + }, + { + "epoch": 31.017547892720305, + "grad_norm": 0.004165468737483025, + "learning_rate": 4.027245636441039e-06, + "loss": 0.6789, + "step": 8320 + }, + { + "epoch": 31.018314176245212, + "grad_norm": 0.22859685122966766, + "learning_rate": 4.018731375053214e-06, + "loss": 0.6094, + "step": 8330 + }, + { + "epoch": 31.019080459770116, + "grad_norm": 0.32649898529052734, + "learning_rate": 4.01021711366539e-06, + "loss": 0.7456, + "step": 8340 + }, + { + "epoch": 31.01984674329502, + "grad_norm": 0.15983377397060394, + "learning_rate": 4.001702852277565e-06, + "loss": 0.0004, + "step": 8350 + }, + { + "epoch": 31.02, + "eval_accuracy": 0.6444444444444445, + "eval_loss": 2.2568414211273193, + "eval_runtime": 16.4461, + "eval_samples_per_second": 2.736, + "eval_steps_per_second": 2.736, + "step": 8352 + }, + { + "epoch": 32.00061302681992, + "grad_norm": 0.07776673138141632, + "learning_rate": 3.993188590889741e-06, + "loss": 0.0006, + "step": 8360 + }, + { + "epoch": 32.00137931034483, + "grad_norm": 0.06379430741071701, + "learning_rate": 3.984674329501916e-06, + "loss": 0.0012, + "step": 8370 + }, + { + "epoch": 32.002145593869734, + "grad_norm": 0.016460496932268143, + "learning_rate": 3.976160068114092e-06, + "loss": 1.0225, + "step": 8380 + }, + { + "epoch": 32.00291187739464, + "grad_norm": 0.055286090821027756, + "learning_rate": 3.967645806726267e-06, + "loss": 1.295, + "step": 8390 + }, + { + "epoch": 32.00367816091954, + "grad_norm": 0.01606907695531845, + "learning_rate": 3.9591315453384425e-06, + "loss": 0.0004, + "step": 8400 + }, + { + "epoch": 32.004444444444445, + "grad_norm": 0.3932900130748749, + "learning_rate": 3.9506172839506175e-06, + "loss": 0.0016, + "step": 8410 + }, + { + "epoch": 32.00521072796935, + "grad_norm": 0.6772463321685791, + "learning_rate": 3.942103022562793e-06, + "loss": 0.5876, + "step": 8420 + }, + { + "epoch": 32.00597701149425, + "grad_norm": 194.08460998535156, + "learning_rate": 3.933588761174968e-06, + "loss": 0.5877, + "step": 8430 + }, + { + "epoch": 32.006743295019156, + "grad_norm": 0.01706990785896778, + "learning_rate": 3.925074499787143e-06, + "loss": 0.5995, + "step": 8440 + }, + { + "epoch": 32.00750957854406, + "grad_norm": 0.0012355584185570478, + "learning_rate": 3.916560238399319e-06, + "loss": 0.8429, + "step": 8450 + }, + { + "epoch": 32.00827586206896, + "grad_norm": 0.0049840230494737625, + "learning_rate": 3.908045977011495e-06, + "loss": 0.449, + "step": 8460 + }, + { + "epoch": 32.00904214559387, + "grad_norm": 0.004344654735177755, + "learning_rate": 3.89953171562367e-06, + "loss": 0.0006, + "step": 8470 + }, + { + "epoch": 32.00980842911878, + "grad_norm": 2.9717156887054443, + "learning_rate": 3.891017454235845e-06, + "loss": 0.3473, + "step": 8480 + }, + { + "epoch": 32.01057471264368, + "grad_norm": 0.026496445760130882, + "learning_rate": 3.882503192848021e-06, + "loss": 0.0017, + "step": 8490 + }, + { + "epoch": 32.011340996168585, + "grad_norm": 0.02829962782561779, + "learning_rate": 3.873988931460197e-06, + "loss": 0.0012, + "step": 8500 + }, + { + "epoch": 32.01210727969349, + "grad_norm": 0.10174506902694702, + "learning_rate": 3.865474670072372e-06, + "loss": 0.5407, + "step": 8510 + }, + { + "epoch": 32.01287356321839, + "grad_norm": 0.0033128669019788504, + "learning_rate": 3.856960408684547e-06, + "loss": 0.0004, + "step": 8520 + }, + { + "epoch": 32.013639846743295, + "grad_norm": 0.003256371710449457, + "learning_rate": 3.8484461472967225e-06, + "loss": 0.0009, + "step": 8530 + }, + { + "epoch": 32.0144061302682, + "grad_norm": 0.14455612003803253, + "learning_rate": 3.839931885908898e-06, + "loss": 0.6465, + "step": 8540 + }, + { + "epoch": 32.0151724137931, + "grad_norm": 0.05306978151202202, + "learning_rate": 3.831417624521073e-06, + "loss": 0.0009, + "step": 8550 + }, + { + "epoch": 32.015938697318006, + "grad_norm": 0.031177254393696785, + "learning_rate": 3.822903363133248e-06, + "loss": 0.7795, + "step": 8560 + }, + { + "epoch": 32.01670498084291, + "grad_norm": 0.01676277443766594, + "learning_rate": 3.8143891017454237e-06, + "loss": 0.0016, + "step": 8570 + }, + { + "epoch": 32.01747126436781, + "grad_norm": 0.011179128661751747, + "learning_rate": 3.805874840357599e-06, + "loss": 0.0007, + "step": 8580 + }, + { + "epoch": 32.01823754789272, + "grad_norm": 0.03544120863080025, + "learning_rate": 3.797360578969775e-06, + "loss": 0.7343, + "step": 8590 + }, + { + "epoch": 32.01900383141763, + "grad_norm": 0.0019313012016937137, + "learning_rate": 3.78884631758195e-06, + "loss": 1.6009, + "step": 8600 + }, + { + "epoch": 32.01977011494253, + "grad_norm": 378.3231506347656, + "learning_rate": 3.7803320561941254e-06, + "loss": 0.1452, + "step": 8610 + }, + { + "epoch": 32.02, + "eval_accuracy": 0.6444444444444445, + "eval_loss": 2.4112446308135986, + "eval_runtime": 13.281, + "eval_samples_per_second": 3.388, + "eval_steps_per_second": 3.388, + "step": 8613 + }, + { + "epoch": 33.00053639846743, + "grad_norm": 0.15281549096107483, + "learning_rate": 3.7718177948063004e-06, + "loss": 0.0016, + "step": 8620 + }, + { + "epoch": 33.001302681992335, + "grad_norm": 0.053307775408029556, + "learning_rate": 3.7633035334184762e-06, + "loss": 0.2357, + "step": 8630 + }, + { + "epoch": 33.00206896551724, + "grad_norm": 0.09021246433258057, + "learning_rate": 3.7547892720306517e-06, + "loss": 0.0004, + "step": 8640 + }, + { + "epoch": 33.00283524904214, + "grad_norm": 0.0028194631449878216, + "learning_rate": 3.746275010642827e-06, + "loss": 0.5114, + "step": 8650 + }, + { + "epoch": 33.00360153256705, + "grad_norm": 0.0028222943656146526, + "learning_rate": 3.737760749255002e-06, + "loss": 0.001, + "step": 8660 + }, + { + "epoch": 33.00436781609196, + "grad_norm": 0.0010228869505226612, + "learning_rate": 3.729246487867178e-06, + "loss": 0.0011, + "step": 8670 + }, + { + "epoch": 33.00513409961686, + "grad_norm": 0.062401916831731796, + "learning_rate": 3.7207322264793533e-06, + "loss": 0.5505, + "step": 8680 + }, + { + "epoch": 33.005900383141764, + "grad_norm": 362.3966979980469, + "learning_rate": 3.7122179650915287e-06, + "loss": 0.4197, + "step": 8690 + }, + { + "epoch": 33.00666666666667, + "grad_norm": 0.028478719294071198, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.4369, + "step": 8700 + }, + { + "epoch": 33.00743295019157, + "grad_norm": 0.0008925276342779398, + "learning_rate": 3.6951894423158796e-06, + "loss": 0.0018, + "step": 8710 + }, + { + "epoch": 33.008199233716475, + "grad_norm": 69.93021392822266, + "learning_rate": 3.686675180928055e-06, + "loss": 0.5998, + "step": 8720 + }, + { + "epoch": 33.00896551724138, + "grad_norm": 0.008064580149948597, + "learning_rate": 3.67816091954023e-06, + "loss": 0.0008, + "step": 8730 + }, + { + "epoch": 33.00973180076628, + "grad_norm": 0.0009439765708521008, + "learning_rate": 3.6696466581524054e-06, + "loss": 0.001, + "step": 8740 + }, + { + "epoch": 33.010498084291186, + "grad_norm": 0.0008714428986422718, + "learning_rate": 3.6611323967645812e-06, + "loss": 0.2434, + "step": 8750 + }, + { + "epoch": 33.01126436781609, + "grad_norm": 0.0016324096359312534, + "learning_rate": 3.6526181353767567e-06, + "loss": 0.0026, + "step": 8760 + }, + { + "epoch": 33.01203065134099, + "grad_norm": 0.2354644238948822, + "learning_rate": 3.6441038739889317e-06, + "loss": 0.947, + "step": 8770 + }, + { + "epoch": 33.012796934865904, + "grad_norm": 0.017056241631507874, + "learning_rate": 3.635589612601107e-06, + "loss": 0.5597, + "step": 8780 + }, + { + "epoch": 33.01356321839081, + "grad_norm": 0.007571222726255655, + "learning_rate": 3.627075351213283e-06, + "loss": 0.5969, + "step": 8790 + }, + { + "epoch": 33.01432950191571, + "grad_norm": 0.06942231208086014, + "learning_rate": 3.618561089825458e-06, + "loss": 0.2976, + "step": 8800 + }, + { + "epoch": 33.015095785440614, + "grad_norm": 0.0020972101483494043, + "learning_rate": 3.6100468284376333e-06, + "loss": 0.7729, + "step": 8810 + }, + { + "epoch": 33.01586206896552, + "grad_norm": 0.0008585660834796727, + "learning_rate": 3.6015325670498087e-06, + "loss": 1.0393, + "step": 8820 + }, + { + "epoch": 33.01662835249042, + "grad_norm": 0.03751160949468613, + "learning_rate": 3.5930183056619837e-06, + "loss": 0.0003, + "step": 8830 + }, + { + "epoch": 33.017394636015325, + "grad_norm": 0.002867656061425805, + "learning_rate": 3.5845040442741596e-06, + "loss": 0.5826, + "step": 8840 + }, + { + "epoch": 33.01816091954023, + "grad_norm": 0.0013876117300242186, + "learning_rate": 3.575989782886335e-06, + "loss": 0.6495, + "step": 8850 + }, + { + "epoch": 33.01892720306513, + "grad_norm": 983.3112182617188, + "learning_rate": 3.56747552149851e-06, + "loss": 1.1592, + "step": 8860 + }, + { + "epoch": 33.019693486590036, + "grad_norm": 0.011666815727949142, + "learning_rate": 3.5589612601106854e-06, + "loss": 0.3815, + "step": 8870 + }, + { + "epoch": 33.02, + "eval_accuracy": 0.7555555555555555, + "eval_loss": 1.3678969144821167, + "eval_runtime": 13.8216, + "eval_samples_per_second": 3.256, + "eval_steps_per_second": 3.256, + "step": 8874 + }, + { + "epoch": 34.000459770114944, + "grad_norm": 0.2762603461742401, + "learning_rate": 3.5504469987228612e-06, + "loss": 3.2451, + "step": 8880 + }, + { + "epoch": 34.00122605363985, + "grad_norm": 0.004093986004590988, + "learning_rate": 3.5419327373350367e-06, + "loss": 0.2555, + "step": 8890 + }, + { + "epoch": 34.00199233716475, + "grad_norm": 0.003161295084282756, + "learning_rate": 3.5334184759472117e-06, + "loss": 0.0051, + "step": 8900 + }, + { + "epoch": 34.002758620689654, + "grad_norm": 0.046276550740003586, + "learning_rate": 3.524904214559387e-06, + "loss": 0.3972, + "step": 8910 + }, + { + "epoch": 34.00352490421456, + "grad_norm": 0.0007882455829530954, + "learning_rate": 3.516389953171563e-06, + "loss": 0.0008, + "step": 8920 + }, + { + "epoch": 34.00429118773946, + "grad_norm": 0.00730425538495183, + "learning_rate": 3.507875691783738e-06, + "loss": 0.7386, + "step": 8930 + }, + { + "epoch": 34.005057471264365, + "grad_norm": 0.18467597663402557, + "learning_rate": 3.4993614303959133e-06, + "loss": 1.5343, + "step": 8940 + }, + { + "epoch": 34.00582375478927, + "grad_norm": 0.19425460696220398, + "learning_rate": 3.4908471690080887e-06, + "loss": 0.001, + "step": 8950 + }, + { + "epoch": 34.00659003831418, + "grad_norm": 0.5217742323875427, + "learning_rate": 3.4823329076202646e-06, + "loss": 0.0153, + "step": 8960 + }, + { + "epoch": 34.00735632183908, + "grad_norm": 0.013114755041897297, + "learning_rate": 3.4738186462324396e-06, + "loss": 2.0398, + "step": 8970 + }, + { + "epoch": 34.00812260536399, + "grad_norm": 0.0025911638513207436, + "learning_rate": 3.465304384844615e-06, + "loss": 0.0116, + "step": 8980 + }, + { + "epoch": 34.00888888888889, + "grad_norm": 0.002555274870246649, + "learning_rate": 3.4567901234567904e-06, + "loss": 0.7117, + "step": 8990 + }, + { + "epoch": 34.009655172413794, + "grad_norm": 0.0019624708220362663, + "learning_rate": 3.448275862068966e-06, + "loss": 0.0025, + "step": 9000 + }, + { + "epoch": 34.0104214559387, + "grad_norm": 0.006993942428380251, + "learning_rate": 3.4397616006811412e-06, + "loss": 0.752, + "step": 9010 + }, + { + "epoch": 34.0111877394636, + "grad_norm": 0.12625227868556976, + "learning_rate": 3.4312473392933167e-06, + "loss": 0.2687, + "step": 9020 + }, + { + "epoch": 34.011954022988505, + "grad_norm": 0.0011391348671168089, + "learning_rate": 3.4227330779054917e-06, + "loss": 0.5354, + "step": 9030 + }, + { + "epoch": 34.01272030651341, + "grad_norm": 0.3375532031059265, + "learning_rate": 3.4142188165176675e-06, + "loss": 0.001, + "step": 9040 + }, + { + "epoch": 34.01348659003831, + "grad_norm": 0.0027614478021860123, + "learning_rate": 3.405704555129843e-06, + "loss": 1.7076, + "step": 9050 + }, + { + "epoch": 34.014252873563215, + "grad_norm": 7.6758856773376465, + "learning_rate": 3.3971902937420183e-06, + "loss": 0.008, + "step": 9060 + }, + { + "epoch": 34.01501915708812, + "grad_norm": 0.001212018309161067, + "learning_rate": 3.3886760323541933e-06, + "loss": 0.0027, + "step": 9070 + }, + { + "epoch": 34.01578544061303, + "grad_norm": 0.0031360953580588102, + "learning_rate": 3.380161770966369e-06, + "loss": 0.0019, + "step": 9080 + }, + { + "epoch": 34.01655172413793, + "grad_norm": 1.0964499711990356, + "learning_rate": 3.3716475095785446e-06, + "loss": 0.0009, + "step": 9090 + }, + { + "epoch": 34.01731800766284, + "grad_norm": 135.07989501953125, + "learning_rate": 3.3631332481907196e-06, + "loss": 2.0915, + "step": 9100 + }, + { + "epoch": 34.01808429118774, + "grad_norm": 0.005994447972625494, + "learning_rate": 3.354618986802895e-06, + "loss": 0.5322, + "step": 9110 + }, + { + "epoch": 34.018850574712644, + "grad_norm": 0.001170378876850009, + "learning_rate": 3.3461047254150704e-06, + "loss": 0.0008, + "step": 9120 + }, + { + "epoch": 34.01961685823755, + "grad_norm": 0.0009761180845089257, + "learning_rate": 3.3375904640272463e-06, + "loss": 0.0013, + "step": 9130 + }, + { + "epoch": 34.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 1.8306384086608887, + "eval_runtime": 14.4419, + "eval_samples_per_second": 3.116, + "eval_steps_per_second": 3.116, + "step": 9135 + }, + { + "epoch": 35.000383141762455, + "grad_norm": 0.008362943306565285, + "learning_rate": 3.3290762026394212e-06, + "loss": 0.0008, + "step": 9140 + }, + { + "epoch": 35.00114942528736, + "grad_norm": 0.09023687988519669, + "learning_rate": 3.3205619412515967e-06, + "loss": 0.0005, + "step": 9150 + }, + { + "epoch": 35.00191570881226, + "grad_norm": 0.012076799757778645, + "learning_rate": 3.3120476798637717e-06, + "loss": 0.0592, + "step": 9160 + }, + { + "epoch": 35.002681992337166, + "grad_norm": 0.0008539219270460308, + "learning_rate": 3.3035334184759475e-06, + "loss": 0.0013, + "step": 9170 + }, + { + "epoch": 35.00344827586207, + "grad_norm": 6.743452072143555, + "learning_rate": 3.295019157088123e-06, + "loss": 1.3463, + "step": 9180 + }, + { + "epoch": 35.00421455938697, + "grad_norm": 0.0008903272682800889, + "learning_rate": 3.2865048957002983e-06, + "loss": 0.0009, + "step": 9190 + }, + { + "epoch": 35.00498084291188, + "grad_norm": 0.0033099281135946512, + "learning_rate": 3.2779906343124733e-06, + "loss": 0.0095, + "step": 9200 + }, + { + "epoch": 35.00574712643678, + "grad_norm": 0.0029596216045320034, + "learning_rate": 3.269476372924649e-06, + "loss": 0.5017, + "step": 9210 + }, + { + "epoch": 35.006513409961684, + "grad_norm": 0.1439450979232788, + "learning_rate": 3.2609621115368246e-06, + "loss": 0.7239, + "step": 9220 + }, + { + "epoch": 35.00727969348659, + "grad_norm": 0.05335940048098564, + "learning_rate": 3.2524478501489996e-06, + "loss": 0.0105, + "step": 9230 + }, + { + "epoch": 35.00804597701149, + "grad_norm": 0.08966932445764542, + "learning_rate": 3.243933588761175e-06, + "loss": 0.6246, + "step": 9240 + }, + { + "epoch": 35.008812260536395, + "grad_norm": 0.13768485188484192, + "learning_rate": 3.235419327373351e-06, + "loss": 0.0014, + "step": 9250 + }, + { + "epoch": 35.009578544061306, + "grad_norm": 0.003539847442880273, + "learning_rate": 3.2269050659855262e-06, + "loss": 0.4629, + "step": 9260 + }, + { + "epoch": 35.01034482758621, + "grad_norm": 0.17655432224273682, + "learning_rate": 3.2183908045977012e-06, + "loss": 1.1958, + "step": 9270 + }, + { + "epoch": 35.01111111111111, + "grad_norm": 0.3544051945209503, + "learning_rate": 3.2098765432098767e-06, + "loss": 0.0111, + "step": 9280 + }, + { + "epoch": 35.01187739463602, + "grad_norm": 0.2912541627883911, + "learning_rate": 3.2013622818220525e-06, + "loss": 1.2761, + "step": 9290 + }, + { + "epoch": 35.01264367816092, + "grad_norm": 0.005516159813851118, + "learning_rate": 3.1928480204342275e-06, + "loss": 0.0014, + "step": 9300 + }, + { + "epoch": 35.013409961685824, + "grad_norm": 25.71188735961914, + "learning_rate": 3.184333759046403e-06, + "loss": 0.5261, + "step": 9310 + }, + { + "epoch": 35.01417624521073, + "grad_norm": 0.0013126012636348605, + "learning_rate": 3.1758194976585783e-06, + "loss": 0.077, + "step": 9320 + }, + { + "epoch": 35.01494252873563, + "grad_norm": 49.9453125, + "learning_rate": 3.167305236270754e-06, + "loss": 0.4977, + "step": 9330 + }, + { + "epoch": 35.015708812260534, + "grad_norm": 0.0009205365204252303, + "learning_rate": 3.158790974882929e-06, + "loss": 0.5102, + "step": 9340 + }, + { + "epoch": 35.01647509578544, + "grad_norm": 0.06953918188810349, + "learning_rate": 3.1502767134951046e-06, + "loss": 0.0009, + "step": 9350 + }, + { + "epoch": 35.01724137931034, + "grad_norm": 0.04685080796480179, + "learning_rate": 3.14176245210728e-06, + "loss": 0.6047, + "step": 9360 + }, + { + "epoch": 35.01800766283525, + "grad_norm": 54.803558349609375, + "learning_rate": 3.133248190719455e-06, + "loss": 1.22, + "step": 9370 + }, + { + "epoch": 35.018773946360156, + "grad_norm": 0.0024438260588794947, + "learning_rate": 3.124733929331631e-06, + "loss": 0.0024, + "step": 9380 + }, + { + "epoch": 35.01954022988506, + "grad_norm": 0.004958088044077158, + "learning_rate": 3.1162196679438062e-06, + "loss": 0.7655, + "step": 9390 + }, + { + "epoch": 35.02, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 1.4607685804367065, + "eval_runtime": 14.1654, + "eval_samples_per_second": 3.177, + "eval_steps_per_second": 3.177, + "step": 9396 + }, + { + "epoch": 36.00030651340996, + "grad_norm": 0.061240918934345245, + "learning_rate": 3.1077054065559812e-06, + "loss": 0.0131, + "step": 9400 + }, + { + "epoch": 36.001072796934864, + "grad_norm": 0.004116494674235582, + "learning_rate": 3.0991911451681567e-06, + "loss": 0.0026, + "step": 9410 + }, + { + "epoch": 36.00183908045977, + "grad_norm": 0.0022276234813034534, + "learning_rate": 3.0906768837803325e-06, + "loss": 0.6602, + "step": 9420 + }, + { + "epoch": 36.00260536398467, + "grad_norm": 0.007051907479763031, + "learning_rate": 3.082162622392508e-06, + "loss": 0.6238, + "step": 9430 + }, + { + "epoch": 36.00337164750958, + "grad_norm": 0.010487622581422329, + "learning_rate": 3.073648361004683e-06, + "loss": 0.0005, + "step": 9440 + }, + { + "epoch": 36.004137931034485, + "grad_norm": 0.020656408742070198, + "learning_rate": 3.0651340996168583e-06, + "loss": 0.0003, + "step": 9450 + }, + { + "epoch": 36.00490421455939, + "grad_norm": 0.0014071677578613162, + "learning_rate": 3.056619838229034e-06, + "loss": 0.0008, + "step": 9460 + }, + { + "epoch": 36.00567049808429, + "grad_norm": 0.04767834395170212, + "learning_rate": 3.048105576841209e-06, + "loss": 0.6376, + "step": 9470 + }, + { + "epoch": 36.006436781609196, + "grad_norm": 0.001168592949397862, + "learning_rate": 3.0395913154533846e-06, + "loss": 1.2768, + "step": 9480 + }, + { + "epoch": 36.0072030651341, + "grad_norm": 0.12005781382322311, + "learning_rate": 3.03107705406556e-06, + "loss": 0.7032, + "step": 9490 + }, + { + "epoch": 36.007969348659, + "grad_norm": 0.049370937049388885, + "learning_rate": 3.022562792677736e-06, + "loss": 0.3523, + "step": 9500 + }, + { + "epoch": 36.00873563218391, + "grad_norm": 25.666242599487305, + "learning_rate": 3.014048531289911e-06, + "loss": 0.5111, + "step": 9510 + }, + { + "epoch": 36.00950191570881, + "grad_norm": 0.0010242098942399025, + "learning_rate": 3.0055342699020862e-06, + "loss": 0.9859, + "step": 9520 + }, + { + "epoch": 36.010268199233714, + "grad_norm": 0.020110201090574265, + "learning_rate": 2.9970200085142612e-06, + "loss": 0.0011, + "step": 9530 + }, + { + "epoch": 36.01103448275862, + "grad_norm": 0.12330694496631622, + "learning_rate": 2.988505747126437e-06, + "loss": 0.0268, + "step": 9540 + }, + { + "epoch": 36.01180076628353, + "grad_norm": 0.0015601320192217827, + "learning_rate": 2.9799914857386125e-06, + "loss": 0.0023, + "step": 9550 + }, + { + "epoch": 36.01256704980843, + "grad_norm": 0.0010449588298797607, + "learning_rate": 2.971477224350788e-06, + "loss": 0.0034, + "step": 9560 + }, + { + "epoch": 36.013333333333335, + "grad_norm": 60.46844482421875, + "learning_rate": 2.962962962962963e-06, + "loss": 1.1134, + "step": 9570 + }, + { + "epoch": 36.01409961685824, + "grad_norm": 0.13516788184642792, + "learning_rate": 2.9544487015751387e-06, + "loss": 1.1393, + "step": 9580 + }, + { + "epoch": 36.01486590038314, + "grad_norm": 0.19734834134578705, + "learning_rate": 2.945934440187314e-06, + "loss": 1.1429, + "step": 9590 + }, + { + "epoch": 36.015632183908046, + "grad_norm": 0.04803425073623657, + "learning_rate": 2.9374201787994896e-06, + "loss": 0.8387, + "step": 9600 + }, + { + "epoch": 36.01639846743295, + "grad_norm": 0.004391372203826904, + "learning_rate": 2.9289059174116646e-06, + "loss": 1.5079, + "step": 9610 + }, + { + "epoch": 36.01716475095785, + "grad_norm": 0.04461931064724922, + "learning_rate": 2.92039165602384e-06, + "loss": 0.0154, + "step": 9620 + }, + { + "epoch": 36.01793103448276, + "grad_norm": 0.16307049989700317, + "learning_rate": 2.911877394636016e-06, + "loss": 0.0026, + "step": 9630 + }, + { + "epoch": 36.01869731800766, + "grad_norm": 0.17401742935180664, + "learning_rate": 2.903363133248191e-06, + "loss": 0.6489, + "step": 9640 + }, + { + "epoch": 36.019463601532564, + "grad_norm": 0.0012012688675895333, + "learning_rate": 2.8948488718603662e-06, + "loss": 0.003, + "step": 9650 + }, + { + "epoch": 36.02, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 2.202925205230713, + "eval_runtime": 13.8555, + "eval_samples_per_second": 3.248, + "eval_steps_per_second": 3.248, + "step": 9657 + }, + { + "epoch": 37.00022988505747, + "grad_norm": 0.23111657798290253, + "learning_rate": 2.8863346104725417e-06, + "loss": 0.0025, + "step": 9660 + }, + { + "epoch": 37.000996168582375, + "grad_norm": 0.0007817544392310083, + "learning_rate": 2.8778203490847175e-06, + "loss": 0.4859, + "step": 9670 + }, + { + "epoch": 37.00176245210728, + "grad_norm": 0.0477173775434494, + "learning_rate": 2.8693060876968925e-06, + "loss": 0.0008, + "step": 9680 + }, + { + "epoch": 37.00252873563218, + "grad_norm": 0.000743559910915792, + "learning_rate": 2.860791826309068e-06, + "loss": 0.0004, + "step": 9690 + }, + { + "epoch": 37.003295019157086, + "grad_norm": 0.000858695711940527, + "learning_rate": 2.852277564921243e-06, + "loss": 0.6213, + "step": 9700 + }, + { + "epoch": 37.00406130268199, + "grad_norm": 2.1517531871795654, + "learning_rate": 2.8437633035334187e-06, + "loss": 0.9624, + "step": 9710 + }, + { + "epoch": 37.00482758620689, + "grad_norm": 0.012443887069821358, + "learning_rate": 2.835249042145594e-06, + "loss": 0.0015, + "step": 9720 + }, + { + "epoch": 37.005593869731804, + "grad_norm": 0.006380707025527954, + "learning_rate": 2.8267347807577696e-06, + "loss": 0.4598, + "step": 9730 + }, + { + "epoch": 37.00636015325671, + "grad_norm": 0.0008849736768752337, + "learning_rate": 2.8182205193699446e-06, + "loss": 0.0001, + "step": 9740 + }, + { + "epoch": 37.00712643678161, + "grad_norm": 0.0005933063803240657, + "learning_rate": 2.8097062579821204e-06, + "loss": 0.0007, + "step": 9750 + }, + { + "epoch": 37.007892720306515, + "grad_norm": 0.0010139941005036235, + "learning_rate": 2.801191996594296e-06, + "loss": 1.2024, + "step": 9760 + }, + { + "epoch": 37.00865900383142, + "grad_norm": 0.0027134341653436422, + "learning_rate": 2.792677735206471e-06, + "loss": 1.7367, + "step": 9770 + }, + { + "epoch": 37.00942528735632, + "grad_norm": 0.001102240988984704, + "learning_rate": 2.7841634738186462e-06, + "loss": 0.7939, + "step": 9780 + }, + { + "epoch": 37.010191570881226, + "grad_norm": 0.013465874828398228, + "learning_rate": 2.775649212430822e-06, + "loss": 0.1877, + "step": 9790 + }, + { + "epoch": 37.01095785440613, + "grad_norm": 0.004637483507394791, + "learning_rate": 2.7671349510429975e-06, + "loss": 0.0006, + "step": 9800 + }, + { + "epoch": 37.01172413793103, + "grad_norm": 0.0016687975730746984, + "learning_rate": 2.7586206896551725e-06, + "loss": 0.0005, + "step": 9810 + }, + { + "epoch": 37.01249042145594, + "grad_norm": 0.10679076611995697, + "learning_rate": 2.750106428267348e-06, + "loss": 0.0824, + "step": 9820 + }, + { + "epoch": 37.01325670498084, + "grad_norm": 0.008856571279466152, + "learning_rate": 2.7415921668795238e-06, + "loss": 0.0004, + "step": 9830 + }, + { + "epoch": 37.014022988505744, + "grad_norm": 0.0008473598281852901, + "learning_rate": 2.7330779054916987e-06, + "loss": 0.0011, + "step": 9840 + }, + { + "epoch": 37.014789272030654, + "grad_norm": 0.11508283019065857, + "learning_rate": 2.724563644103874e-06, + "loss": 1.1716, + "step": 9850 + }, + { + "epoch": 37.01555555555556, + "grad_norm": 0.03232954069972038, + "learning_rate": 2.7160493827160496e-06, + "loss": 0.0005, + "step": 9860 + }, + { + "epoch": 37.01632183908046, + "grad_norm": 0.2793327569961548, + "learning_rate": 2.7075351213282254e-06, + "loss": 0.0017, + "step": 9870 + }, + { + "epoch": 37.017088122605365, + "grad_norm": 0.0020207474008202553, + "learning_rate": 2.6990208599404004e-06, + "loss": 0.0006, + "step": 9880 + }, + { + "epoch": 37.01785440613027, + "grad_norm": 0.0022292647045105696, + "learning_rate": 2.690506598552576e-06, + "loss": 0.559, + "step": 9890 + }, + { + "epoch": 37.01862068965517, + "grad_norm": 0.007433728780597448, + "learning_rate": 2.6819923371647512e-06, + "loss": 0.0557, + "step": 9900 + }, + { + "epoch": 37.019386973180076, + "grad_norm": 0.0006471008528023958, + "learning_rate": 2.6734780757769262e-06, + "loss": 0.0246, + "step": 9910 + }, + { + "epoch": 37.02, + "eval_accuracy": 0.6222222222222222, + "eval_loss": 2.758582592010498, + "eval_runtime": 15.3967, + "eval_samples_per_second": 2.923, + "eval_steps_per_second": 2.923, + "step": 9918 + }, + { + "epoch": 38.000153256704984, + "grad_norm": 1.0717862844467163, + "learning_rate": 2.664963814389102e-06, + "loss": 1.7363, + "step": 9920 + }, + { + "epoch": 38.00091954022989, + "grad_norm": 0.17203135788440704, + "learning_rate": 2.6564495530012775e-06, + "loss": 0.0015, + "step": 9930 + }, + { + "epoch": 38.00168582375479, + "grad_norm": 0.11133809387683868, + "learning_rate": 2.6479352916134525e-06, + "loss": 0.0013, + "step": 9940 + }, + { + "epoch": 38.002452107279694, + "grad_norm": 0.5325710773468018, + "learning_rate": 2.639421030225628e-06, + "loss": 1.3669, + "step": 9950 + }, + { + "epoch": 38.0032183908046, + "grad_norm": 0.7260966897010803, + "learning_rate": 2.6309067688378037e-06, + "loss": 0.0061, + "step": 9960 + }, + { + "epoch": 38.0039846743295, + "grad_norm": 22.296585083007812, + "learning_rate": 2.622392507449979e-06, + "loss": 0.0036, + "step": 9970 + }, + { + "epoch": 38.004750957854405, + "grad_norm": 0.11299914866685867, + "learning_rate": 2.613878246062154e-06, + "loss": 0.0022, + "step": 9980 + }, + { + "epoch": 38.00551724137931, + "grad_norm": 0.049726832658052444, + "learning_rate": 2.6053639846743296e-06, + "loss": 1.2186, + "step": 9990 + }, + { + "epoch": 38.00628352490421, + "grad_norm": 0.0019447283120825887, + "learning_rate": 2.5968497232865054e-06, + "loss": 0.508, + "step": 10000 + }, + { + "epoch": 38.007049808429116, + "grad_norm": 0.0006838354747742414, + "learning_rate": 2.5883354618986804e-06, + "loss": 1.5742, + "step": 10010 + }, + { + "epoch": 38.00781609195402, + "grad_norm": 1236.9111328125, + "learning_rate": 2.579821200510856e-06, + "loss": 1.7882, + "step": 10020 + }, + { + "epoch": 38.00858237547893, + "grad_norm": 0.12140507996082306, + "learning_rate": 2.5713069391230312e-06, + "loss": 0.0014, + "step": 10030 + }, + { + "epoch": 38.009348659003834, + "grad_norm": 0.00671731773763895, + "learning_rate": 2.562792677735207e-06, + "loss": 0.001, + "step": 10040 + }, + { + "epoch": 38.01011494252874, + "grad_norm": 0.0007894644513726234, + "learning_rate": 2.554278416347382e-06, + "loss": 0.6358, + "step": 10050 + }, + { + "epoch": 38.01088122605364, + "grad_norm": 0.18793921172618866, + "learning_rate": 2.5457641549595575e-06, + "loss": 0.2218, + "step": 10060 + }, + { + "epoch": 38.011647509578545, + "grad_norm": 0.00469109695404768, + "learning_rate": 2.5372498935717325e-06, + "loss": 0.0006, + "step": 10070 + }, + { + "epoch": 38.01241379310345, + "grad_norm": 0.037202171981334686, + "learning_rate": 2.5287356321839083e-06, + "loss": 0.0056, + "step": 10080 + }, + { + "epoch": 38.01318007662835, + "grad_norm": 0.06974373012781143, + "learning_rate": 2.5202213707960837e-06, + "loss": 0.0065, + "step": 10090 + }, + { + "epoch": 38.013946360153255, + "grad_norm": 0.0019093331648036838, + "learning_rate": 2.511707109408259e-06, + "loss": 0.5021, + "step": 10100 + }, + { + "epoch": 38.01471264367816, + "grad_norm": 0.0015057099517434835, + "learning_rate": 2.503192848020434e-06, + "loss": 0.0011, + "step": 10110 + }, + { + "epoch": 38.01547892720306, + "grad_norm": 0.0014377710176631808, + "learning_rate": 2.49467858663261e-06, + "loss": 1.1023, + "step": 10120 + }, + { + "epoch": 38.016245210727966, + "grad_norm": 0.12007944285869598, + "learning_rate": 2.4861643252447854e-06, + "loss": 0.0129, + "step": 10130 + }, + { + "epoch": 38.01701149425288, + "grad_norm": 0.10494039952754974, + "learning_rate": 2.4776500638569604e-06, + "loss": 0.3025, + "step": 10140 + }, + { + "epoch": 38.01777777777778, + "grad_norm": 183.83294677734375, + "learning_rate": 2.469135802469136e-06, + "loss": 0.5541, + "step": 10150 + }, + { + "epoch": 38.018544061302684, + "grad_norm": 0.016142094507813454, + "learning_rate": 2.4606215410813112e-06, + "loss": 0.0807, + "step": 10160 + }, + { + "epoch": 38.01931034482759, + "grad_norm": 0.007817224599421024, + "learning_rate": 2.4521072796934867e-06, + "loss": 0.0007, + "step": 10170 + }, + { + "epoch": 38.02, + "eval_accuracy": 0.6444444444444445, + "eval_loss": 2.6803932189941406, + "eval_runtime": 16.4696, + "eval_samples_per_second": 2.732, + "eval_steps_per_second": 2.732, + "step": 10179 + }, + { + "epoch": 39.00007662835249, + "grad_norm": 0.011514992453157902, + "learning_rate": 2.443593018305662e-06, + "loss": 0.711, + "step": 10180 + }, + { + "epoch": 39.00084291187739, + "grad_norm": 0.038958631455898285, + "learning_rate": 2.4350787569178375e-06, + "loss": 1.1963, + "step": 10190 + }, + { + "epoch": 39.001609195402295, + "grad_norm": 0.062485817819833755, + "learning_rate": 2.426564495530013e-06, + "loss": 0.0074, + "step": 10200 + }, + { + "epoch": 39.002375478927206, + "grad_norm": 25.11356544494629, + "learning_rate": 2.4180502341421883e-06, + "loss": 0.0049, + "step": 10210 + }, + { + "epoch": 39.00314176245211, + "grad_norm": 0.00820657704025507, + "learning_rate": 2.4095359727543637e-06, + "loss": 0.5552, + "step": 10220 + }, + { + "epoch": 39.00390804597701, + "grad_norm": 558.9811401367188, + "learning_rate": 2.401021711366539e-06, + "loss": 0.5175, + "step": 10230 + }, + { + "epoch": 39.00467432950192, + "grad_norm": 0.001662991475313902, + "learning_rate": 2.3925074499787146e-06, + "loss": 0.8054, + "step": 10240 + }, + { + "epoch": 39.00544061302682, + "grad_norm": 32.78825378417969, + "learning_rate": 2.38399318859089e-06, + "loss": 0.6772, + "step": 10250 + }, + { + "epoch": 39.006206896551724, + "grad_norm": 0.01901569776237011, + "learning_rate": 2.3754789272030654e-06, + "loss": 0.3458, + "step": 10260 + }, + { + "epoch": 39.00697318007663, + "grad_norm": 0.8498271703720093, + "learning_rate": 2.366964665815241e-06, + "loss": 0.473, + "step": 10270 + }, + { + "epoch": 39.00773946360153, + "grad_norm": 434.0633544921875, + "learning_rate": 2.3584504044274162e-06, + "loss": 1.7739, + "step": 10280 + }, + { + "epoch": 39.008505747126435, + "grad_norm": 0.05720654875040054, + "learning_rate": 2.3499361430395912e-06, + "loss": 0.0005, + "step": 10290 + }, + { + "epoch": 39.00927203065134, + "grad_norm": 0.29777073860168457, + "learning_rate": 2.341421881651767e-06, + "loss": 0.001, + "step": 10300 + }, + { + "epoch": 39.01003831417624, + "grad_norm": 0.0018811736954376101, + "learning_rate": 2.332907620263942e-06, + "loss": 0.6039, + "step": 10310 + }, + { + "epoch": 39.01080459770115, + "grad_norm": 0.0006688185385428369, + "learning_rate": 2.324393358876118e-06, + "loss": 0.6483, + "step": 10320 + }, + { + "epoch": 39.011570881226056, + "grad_norm": 0.0013783015310764313, + "learning_rate": 2.315879097488293e-06, + "loss": 0.0058, + "step": 10330 + }, + { + "epoch": 39.01233716475096, + "grad_norm": 0.0050174882635474205, + "learning_rate": 2.3073648361004688e-06, + "loss": 0.0005, + "step": 10340 + }, + { + "epoch": 39.013103448275864, + "grad_norm": 0.02628183178603649, + "learning_rate": 2.2988505747126437e-06, + "loss": 0.4835, + "step": 10350 + }, + { + "epoch": 39.01386973180077, + "grad_norm": 0.18247047066688538, + "learning_rate": 2.290336313324819e-06, + "loss": 0.3041, + "step": 10360 + }, + { + "epoch": 39.01463601532567, + "grad_norm": 0.042854152619838715, + "learning_rate": 2.2818220519369946e-06, + "loss": 0.4872, + "step": 10370 + }, + { + "epoch": 39.015402298850574, + "grad_norm": 0.0007316448027268052, + "learning_rate": 2.27330779054917e-06, + "loss": 0.5496, + "step": 10380 + }, + { + "epoch": 39.01616858237548, + "grad_norm": 0.06810470670461655, + "learning_rate": 2.2647935291613454e-06, + "loss": 0.0009, + "step": 10390 + }, + { + "epoch": 39.01693486590038, + "grad_norm": 1.610714077949524, + "learning_rate": 2.256279267773521e-06, + "loss": 0.3855, + "step": 10400 + }, + { + "epoch": 39.017701149425285, + "grad_norm": 0.0006430544308386743, + "learning_rate": 2.2477650063856962e-06, + "loss": 0.2692, + "step": 10410 + }, + { + "epoch": 39.01846743295019, + "grad_norm": 0.009712583385407925, + "learning_rate": 2.2392507449978717e-06, + "loss": 0.6717, + "step": 10420 + }, + { + "epoch": 39.01923371647509, + "grad_norm": 0.0006527869845740497, + "learning_rate": 2.230736483610047e-06, + "loss": 0.0016, + "step": 10430 + }, + { + "epoch": 39.02, + "grad_norm": 0.001249029883183539, + "learning_rate": 2.222222222222222e-06, + "loss": 0.5967, + "step": 10440 + }, + { + "epoch": 39.02, + "eval_accuracy": 0.6444444444444445, + "eval_loss": 2.596872568130493, + "eval_runtime": 14.1307, + "eval_samples_per_second": 3.185, + "eval_steps_per_second": 3.185, + "step": 10440 + }, + { + "epoch": 40.000766283524904, + "grad_norm": 0.0007803108892403543, + "learning_rate": 2.213707960834398e-06, + "loss": 0.8372, + "step": 10450 + }, + { + "epoch": 40.00153256704981, + "grad_norm": 0.0010762476595118642, + "learning_rate": 2.205193699446573e-06, + "loss": 0.0006, + "step": 10460 + }, + { + "epoch": 40.00229885057471, + "grad_norm": 0.004534687381237745, + "learning_rate": 2.1966794380587487e-06, + "loss": 0.6515, + "step": 10470 + }, + { + "epoch": 40.003065134099614, + "grad_norm": 0.0006635275203734636, + "learning_rate": 2.1881651766709237e-06, + "loss": 0.5376, + "step": 10480 + }, + { + "epoch": 40.00383141762452, + "grad_norm": 0.007556573953479528, + "learning_rate": 2.1796509152830996e-06, + "loss": 0.0631, + "step": 10490 + }, + { + "epoch": 40.00459770114943, + "grad_norm": 0.0008903385605663061, + "learning_rate": 2.1711366538952746e-06, + "loss": 0.6217, + "step": 10500 + }, + { + "epoch": 40.00536398467433, + "grad_norm": 0.13304153084754944, + "learning_rate": 2.1626223925074504e-06, + "loss": 0.0064, + "step": 10510 + }, + { + "epoch": 40.006130268199236, + "grad_norm": 0.1292513608932495, + "learning_rate": 2.1541081311196254e-06, + "loss": 0.2007, + "step": 10520 + }, + { + "epoch": 40.00689655172414, + "grad_norm": 0.001017851522192359, + "learning_rate": 2.145593869731801e-06, + "loss": 0.9946, + "step": 10530 + }, + { + "epoch": 40.00766283524904, + "grad_norm": 0.028370540589094162, + "learning_rate": 2.1370796083439762e-06, + "loss": 0.0004, + "step": 10540 + }, + { + "epoch": 40.00842911877395, + "grad_norm": 0.006354500073939562, + "learning_rate": 2.1285653469561517e-06, + "loss": 0.0455, + "step": 10550 + }, + { + "epoch": 40.00919540229885, + "grad_norm": 0.0005885363207198679, + "learning_rate": 2.120051085568327e-06, + "loss": 0.7072, + "step": 10560 + }, + { + "epoch": 40.009961685823754, + "grad_norm": 1.9478639364242554, + "learning_rate": 2.1115368241805025e-06, + "loss": 1.1553, + "step": 10570 + }, + { + "epoch": 40.01072796934866, + "grad_norm": 0.0016932955477386713, + "learning_rate": 2.103022562792678e-06, + "loss": 0.0008, + "step": 10580 + }, + { + "epoch": 40.01149425287356, + "grad_norm": 2.8671131134033203, + "learning_rate": 2.0945083014048533e-06, + "loss": 0.6408, + "step": 10590 + }, + { + "epoch": 40.012260536398465, + "grad_norm": 0.0005292880814522505, + "learning_rate": 2.0859940400170287e-06, + "loss": 0.0005, + "step": 10600 + }, + { + "epoch": 40.01302681992337, + "grad_norm": 0.17220841348171234, + "learning_rate": 2.077479778629204e-06, + "loss": 0.0007, + "step": 10610 + }, + { + "epoch": 40.01379310344828, + "grad_norm": 0.0016101287910714746, + "learning_rate": 2.0689655172413796e-06, + "loss": 0.0255, + "step": 10620 + }, + { + "epoch": 40.01455938697318, + "grad_norm": 0.09431593865156174, + "learning_rate": 2.060451255853555e-06, + "loss": 0.0056, + "step": 10630 + }, + { + "epoch": 40.015325670498086, + "grad_norm": 0.0005647214711643755, + "learning_rate": 2.0519369944657304e-06, + "loss": 0.0008, + "step": 10640 + }, + { + "epoch": 40.01609195402299, + "grad_norm": 0.0007362478063441813, + "learning_rate": 2.043422733077906e-06, + "loss": 0.4844, + "step": 10650 + }, + { + "epoch": 40.01685823754789, + "grad_norm": 0.0007467013783752918, + "learning_rate": 2.0349084716900813e-06, + "loss": 0.3836, + "step": 10660 + }, + { + "epoch": 40.0176245210728, + "grad_norm": 0.0005726607050746679, + "learning_rate": 2.0263942103022567e-06, + "loss": 0.0002, + "step": 10670 + }, + { + "epoch": 40.0183908045977, + "grad_norm": 0.030910883098840714, + "learning_rate": 2.0178799489144317e-06, + "loss": 0.0352, + "step": 10680 + }, + { + "epoch": 40.019157088122604, + "grad_norm": 0.001673361286520958, + "learning_rate": 2.009365687526607e-06, + "loss": 0.2036, + "step": 10690 + }, + { + "epoch": 40.01992337164751, + "grad_norm": 0.0005115558160468936, + "learning_rate": 2.0008514261387825e-06, + "loss": 0.0006, + "step": 10700 + }, + { + "epoch": 40.02, + "eval_accuracy": 0.6444444444444445, + "eval_loss": 2.638057231903076, + "eval_runtime": 14.1734, + "eval_samples_per_second": 3.175, + "eval_steps_per_second": 3.175, + "step": 10701 + }, + { + "epoch": 41.000689655172415, + "grad_norm": 0.013981521129608154, + "learning_rate": 1.992337164750958e-06, + "loss": 0.0002, + "step": 10710 + }, + { + "epoch": 41.00145593869732, + "grad_norm": 0.0017696694703772664, + "learning_rate": 1.9838229033631333e-06, + "loss": 0.0005, + "step": 10720 + }, + { + "epoch": 41.00222222222222, + "grad_norm": 0.0005197998834773898, + "learning_rate": 1.9753086419753087e-06, + "loss": 0.6036, + "step": 10730 + }, + { + "epoch": 41.002988505747126, + "grad_norm": 0.0005476609221659601, + "learning_rate": 1.966794380587484e-06, + "loss": 0.5618, + "step": 10740 + }, + { + "epoch": 41.00375478927203, + "grad_norm": 0.0012840996496379375, + "learning_rate": 1.9582801191996596e-06, + "loss": 0.0005, + "step": 10750 + }, + { + "epoch": 41.00452107279693, + "grad_norm": 0.0009479990694671869, + "learning_rate": 1.949765857811835e-06, + "loss": 0.0036, + "step": 10760 + }, + { + "epoch": 41.00528735632184, + "grad_norm": 0.04873271659016609, + "learning_rate": 1.9412515964240104e-06, + "loss": 0.1735, + "step": 10770 + }, + { + "epoch": 41.00605363984674, + "grad_norm": 0.14039330184459686, + "learning_rate": 1.932737335036186e-06, + "loss": 0.0056, + "step": 10780 + }, + { + "epoch": 41.006819923371644, + "grad_norm": 0.005437355488538742, + "learning_rate": 1.9242230736483612e-06, + "loss": 0.0154, + "step": 10790 + }, + { + "epoch": 41.007586206896555, + "grad_norm": 0.0047526853159070015, + "learning_rate": 1.9157088122605367e-06, + "loss": 0.3819, + "step": 10800 + }, + { + "epoch": 41.00835249042146, + "grad_norm": 0.007671783212572336, + "learning_rate": 1.9071945508727119e-06, + "loss": 0.0091, + "step": 10810 + }, + { + "epoch": 41.00911877394636, + "grad_norm": 1302.8955078125, + "learning_rate": 1.8986802894848875e-06, + "loss": 0.714, + "step": 10820 + }, + { + "epoch": 41.009885057471266, + "grad_norm": 0.0007232970674522221, + "learning_rate": 1.8901660280970627e-06, + "loss": 0.0, + "step": 10830 + }, + { + "epoch": 41.01065134099617, + "grad_norm": 0.0005007084691897035, + "learning_rate": 1.8816517667092381e-06, + "loss": 0.0005, + "step": 10840 + }, + { + "epoch": 41.01141762452107, + "grad_norm": 0.008196084760129452, + "learning_rate": 1.8731375053214135e-06, + "loss": 0.2376, + "step": 10850 + }, + { + "epoch": 41.01218390804598, + "grad_norm": 0.07016335427761078, + "learning_rate": 1.864623243933589e-06, + "loss": 1.0228, + "step": 10860 + }, + { + "epoch": 41.01295019157088, + "grad_norm": 0.10909241437911987, + "learning_rate": 1.8561089825457644e-06, + "loss": 0.297, + "step": 10870 + }, + { + "epoch": 41.013716475095784, + "grad_norm": 0.057047039270401, + "learning_rate": 1.8475947211579398e-06, + "loss": 0.001, + "step": 10880 + }, + { + "epoch": 41.01448275862069, + "grad_norm": 0.0005653087282553315, + "learning_rate": 1.839080459770115e-06, + "loss": 0.6173, + "step": 10890 + }, + { + "epoch": 41.01524904214559, + "grad_norm": 0.033106788992881775, + "learning_rate": 1.8305661983822906e-06, + "loss": 0.0074, + "step": 10900 + }, + { + "epoch": 41.0160153256705, + "grad_norm": 0.0005475867656059563, + "learning_rate": 1.8220519369944658e-06, + "loss": 0.0063, + "step": 10910 + }, + { + "epoch": 41.016781609195405, + "grad_norm": 0.053291477262973785, + "learning_rate": 1.8135376756066415e-06, + "loss": 0.0004, + "step": 10920 + }, + { + "epoch": 41.01754789272031, + "grad_norm": 69.3751449584961, + "learning_rate": 1.8050234142188167e-06, + "loss": 0.9711, + "step": 10930 + }, + { + "epoch": 41.01831417624521, + "grad_norm": 0.09939111769199371, + "learning_rate": 1.7965091528309919e-06, + "loss": 0.0019, + "step": 10940 + }, + { + "epoch": 41.019080459770116, + "grad_norm": 0.14862337708473206, + "learning_rate": 1.7879948914431675e-06, + "loss": 0.0394, + "step": 10950 + }, + { + "epoch": 41.01984674329502, + "grad_norm": 0.0012591223930940032, + "learning_rate": 1.7794806300553427e-06, + "loss": 0.0004, + "step": 10960 + }, + { + "epoch": 41.02, + "eval_accuracy": 0.6222222222222222, + "eval_loss": 2.9590911865234375, + "eval_runtime": 15.8526, + "eval_samples_per_second": 2.839, + "eval_steps_per_second": 2.839, + "step": 10962 + }, + { + "epoch": 42.00061302681992, + "grad_norm": 0.001878397073596716, + "learning_rate": 1.7709663686675183e-06, + "loss": 0.9958, + "step": 10970 + }, + { + "epoch": 42.00137931034483, + "grad_norm": 0.009673736989498138, + "learning_rate": 1.7624521072796935e-06, + "loss": 0.1777, + "step": 10980 + }, + { + "epoch": 42.002145593869734, + "grad_norm": 0.001056146458722651, + "learning_rate": 1.753937845891869e-06, + "loss": 0.0004, + "step": 10990 + }, + { + "epoch": 42.00291187739464, + "grad_norm": 0.0020266796927899122, + "learning_rate": 1.7454235845040444e-06, + "loss": 0.0007, + "step": 11000 + }, + { + "epoch": 42.00367816091954, + "grad_norm": 0.0008849030709825456, + "learning_rate": 1.7369093231162198e-06, + "loss": 0.0004, + "step": 11010 + }, + { + "epoch": 42.004444444444445, + "grad_norm": 0.0004981319652870297, + "learning_rate": 1.7283950617283952e-06, + "loss": 0.0052, + "step": 11020 + }, + { + "epoch": 42.00521072796935, + "grad_norm": 0.001915338565595448, + "learning_rate": 1.7198808003405706e-06, + "loss": 0.6275, + "step": 11030 + }, + { + "epoch": 42.00597701149425, + "grad_norm": 0.0005534188239835203, + "learning_rate": 1.7113665389527458e-06, + "loss": 0.0006, + "step": 11040 + }, + { + "epoch": 42.006743295019156, + "grad_norm": 397.24444580078125, + "learning_rate": 1.7028522775649215e-06, + "loss": 0.0332, + "step": 11050 + }, + { + "epoch": 42.00750957854406, + "grad_norm": 0.002358200028538704, + "learning_rate": 1.6943380161770967e-06, + "loss": 0.0001, + "step": 11060 + }, + { + "epoch": 42.00827586206896, + "grad_norm": 0.09730993211269379, + "learning_rate": 1.6858237547892723e-06, + "loss": 0.0003, + "step": 11070 + }, + { + "epoch": 42.00904214559387, + "grad_norm": 0.002618571277707815, + "learning_rate": 1.6773094934014475e-06, + "loss": 0.7586, + "step": 11080 + }, + { + "epoch": 42.00980842911878, + "grad_norm": 0.010873343795537949, + "learning_rate": 1.6687952320136231e-06, + "loss": 0.0015, + "step": 11090 + }, + { + "epoch": 42.01057471264368, + "grad_norm": 0.0019932216964662075, + "learning_rate": 1.6602809706257983e-06, + "loss": 0.1034, + "step": 11100 + }, + { + "epoch": 42.011340996168585, + "grad_norm": 0.0022048912942409515, + "learning_rate": 1.6517667092379737e-06, + "loss": 0.1463, + "step": 11110 + }, + { + "epoch": 42.01210727969349, + "grad_norm": 6.6127095222473145, + "learning_rate": 1.6432524478501492e-06, + "loss": 0.0007, + "step": 11120 + }, + { + "epoch": 42.01287356321839, + "grad_norm": 1.7690030336380005, + "learning_rate": 1.6347381864623246e-06, + "loss": 0.6631, + "step": 11130 + }, + { + "epoch": 42.013639846743295, + "grad_norm": 0.0005693206912837923, + "learning_rate": 1.6262239250744998e-06, + "loss": 0.0003, + "step": 11140 + }, + { + "epoch": 42.0144061302682, + "grad_norm": 0.0005519501864910126, + "learning_rate": 1.6177096636866754e-06, + "loss": 0.5764, + "step": 11150 + }, + { + "epoch": 42.0151724137931, + "grad_norm": 0.0006714254850521684, + "learning_rate": 1.6091954022988506e-06, + "loss": 0.6308, + "step": 11160 + }, + { + "epoch": 42.015938697318006, + "grad_norm": 0.0019181851530447602, + "learning_rate": 1.6006811409110262e-06, + "loss": 0.0038, + "step": 11170 + }, + { + "epoch": 42.01670498084291, + "grad_norm": 0.0021259617060422897, + "learning_rate": 1.5921668795232015e-06, + "loss": 0.0024, + "step": 11180 + }, + { + "epoch": 42.01747126436781, + "grad_norm": 0.0013196076033636928, + "learning_rate": 1.583652618135377e-06, + "loss": 0.5949, + "step": 11190 + }, + { + "epoch": 42.01823754789272, + "grad_norm": 0.0004218143585603684, + "learning_rate": 1.5751383567475523e-06, + "loss": 0.0001, + "step": 11200 + }, + { + "epoch": 42.01900383141763, + "grad_norm": 0.00047586203436367214, + "learning_rate": 1.5666240953597275e-06, + "loss": 0.5043, + "step": 11210 + }, + { + "epoch": 42.01977011494253, + "grad_norm": 0.07100889831781387, + "learning_rate": 1.5581098339719031e-06, + "loss": 0.0004, + "step": 11220 + }, + { + "epoch": 42.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 2.12404727935791, + "eval_runtime": 16.4348, + "eval_samples_per_second": 2.738, + "eval_steps_per_second": 2.738, + "step": 11223 + }, + { + "epoch": 43.00053639846743, + "grad_norm": 0.0005287248059175909, + "learning_rate": 1.5495955725840783e-06, + "loss": 0.6208, + "step": 11230 + }, + { + "epoch": 43.001302681992335, + "grad_norm": 0.015506441704928875, + "learning_rate": 1.541081311196254e-06, + "loss": 0.0003, + "step": 11240 + }, + { + "epoch": 43.00206896551724, + "grad_norm": 272.2698974609375, + "learning_rate": 1.5325670498084292e-06, + "loss": 1.0295, + "step": 11250 + }, + { + "epoch": 43.00283524904214, + "grad_norm": 0.05146227031946182, + "learning_rate": 1.5240527884206046e-06, + "loss": 0.6084, + "step": 11260 + }, + { + "epoch": 43.00360153256705, + "grad_norm": 103.00202178955078, + "learning_rate": 1.51553852703278e-06, + "loss": 1.2964, + "step": 11270 + }, + { + "epoch": 43.00436781609196, + "grad_norm": 0.0007857238524593413, + "learning_rate": 1.5070242656449554e-06, + "loss": 0.0002, + "step": 11280 + }, + { + "epoch": 43.00513409961686, + "grad_norm": 0.0006366973393596709, + "learning_rate": 1.4985100042571306e-06, + "loss": 0.0002, + "step": 11290 + }, + { + "epoch": 43.005900383141764, + "grad_norm": 0.006167873274534941, + "learning_rate": 1.4899957428693062e-06, + "loss": 0.0002, + "step": 11300 + }, + { + "epoch": 43.00666666666667, + "grad_norm": 0.009739157743752003, + "learning_rate": 1.4814814814814815e-06, + "loss": 0.7216, + "step": 11310 + }, + { + "epoch": 43.00743295019157, + "grad_norm": 1272.1881103515625, + "learning_rate": 1.472967220093657e-06, + "loss": 0.066, + "step": 11320 + }, + { + "epoch": 43.008199233716475, + "grad_norm": 0.0007570978486910462, + "learning_rate": 1.4644529587058323e-06, + "loss": 0.5015, + "step": 11330 + }, + { + "epoch": 43.00896551724138, + "grad_norm": 0.0004982962273061275, + "learning_rate": 1.455938697318008e-06, + "loss": 0.5787, + "step": 11340 + }, + { + "epoch": 43.00973180076628, + "grad_norm": 0.02334807999432087, + "learning_rate": 1.4474244359301831e-06, + "loss": 0.0005, + "step": 11350 + }, + { + "epoch": 43.010498084291186, + "grad_norm": 0.000609906914178282, + "learning_rate": 1.4389101745423588e-06, + "loss": 0.0006, + "step": 11360 + }, + { + "epoch": 43.01126436781609, + "grad_norm": 0.04427050054073334, + "learning_rate": 1.430395913154534e-06, + "loss": 0.6396, + "step": 11370 + }, + { + "epoch": 43.01203065134099, + "grad_norm": 0.11915020644664764, + "learning_rate": 1.4218816517667094e-06, + "loss": 1.1079, + "step": 11380 + }, + { + "epoch": 43.012796934865904, + "grad_norm": 176.30267333984375, + "learning_rate": 1.4133673903788848e-06, + "loss": 0.4117, + "step": 11390 + }, + { + "epoch": 43.01356321839081, + "grad_norm": 0.0005373156745918095, + "learning_rate": 1.4048531289910602e-06, + "loss": 0.0022, + "step": 11400 + }, + { + "epoch": 43.01432950191571, + "grad_norm": 0.014790279790759087, + "learning_rate": 1.3963388676032354e-06, + "loss": 0.0007, + "step": 11410 + }, + { + "epoch": 43.015095785440614, + "grad_norm": 0.0004909559502266347, + "learning_rate": 1.387824606215411e-06, + "loss": 1.1803, + "step": 11420 + }, + { + "epoch": 43.01586206896552, + "grad_norm": 0.0029413867741823196, + "learning_rate": 1.3793103448275862e-06, + "loss": 0.0001, + "step": 11430 + }, + { + "epoch": 43.01662835249042, + "grad_norm": 0.0030794162303209305, + "learning_rate": 1.3707960834397619e-06, + "loss": 0.8496, + "step": 11440 + }, + { + "epoch": 43.017394636015325, + "grad_norm": 0.10320518910884857, + "learning_rate": 1.362281822051937e-06, + "loss": 1.1705, + "step": 11450 + }, + { + "epoch": 43.01816091954023, + "grad_norm": 0.002483441261574626, + "learning_rate": 1.3537675606641127e-06, + "loss": 0.6525, + "step": 11460 + }, + { + "epoch": 43.01892720306513, + "grad_norm": 0.000820959045086056, + "learning_rate": 1.345253299276288e-06, + "loss": 0.0315, + "step": 11470 + }, + { + "epoch": 43.019693486590036, + "grad_norm": 0.0005174216348677874, + "learning_rate": 1.3367390378884631e-06, + "loss": 1.109, + "step": 11480 + }, + { + "epoch": 43.02, + "eval_accuracy": 0.6, + "eval_loss": 2.763355016708374, + "eval_runtime": 16.1842, + "eval_samples_per_second": 2.78, + "eval_steps_per_second": 2.78, + "step": 11484 + }, + { + "epoch": 44.000459770114944, + "grad_norm": 0.0006952774128876626, + "learning_rate": 1.3282247765006387e-06, + "loss": 0.0002, + "step": 11490 + }, + { + "epoch": 44.00122605363985, + "grad_norm": 0.02722359634935856, + "learning_rate": 1.319710515112814e-06, + "loss": 1.6376, + "step": 11500 + }, + { + "epoch": 44.00199233716475, + "grad_norm": 0.006522133480757475, + "learning_rate": 1.3111962537249896e-06, + "loss": 0.0, + "step": 11510 + }, + { + "epoch": 44.002758620689654, + "grad_norm": 0.03956187888979912, + "learning_rate": 1.3026819923371648e-06, + "loss": 0.0214, + "step": 11520 + }, + { + "epoch": 44.00352490421456, + "grad_norm": 0.11104047298431396, + "learning_rate": 1.2941677309493402e-06, + "loss": 0.7209, + "step": 11530 + }, + { + "epoch": 44.00429118773946, + "grad_norm": 0.0005099502741359174, + "learning_rate": 1.2856534695615156e-06, + "loss": 0.3281, + "step": 11540 + }, + { + "epoch": 44.005057471264365, + "grad_norm": 0.0005218337755650282, + "learning_rate": 1.277139208173691e-06, + "loss": 0.0006, + "step": 11550 + }, + { + "epoch": 44.00582375478927, + "grad_norm": 0.0004927856498397887, + "learning_rate": 1.2686249467858662e-06, + "loss": 0.0002, + "step": 11560 + }, + { + "epoch": 44.00659003831418, + "grad_norm": 12.864944458007812, + "learning_rate": 1.2601106853980419e-06, + "loss": 0.0027, + "step": 11570 + }, + { + "epoch": 44.00735632183908, + "grad_norm": 0.004535001236945391, + "learning_rate": 1.251596424010217e-06, + "loss": 0.0004, + "step": 11580 + }, + { + "epoch": 44.00812260536399, + "grad_norm": 0.04480987787246704, + "learning_rate": 1.2430821626223927e-06, + "loss": 0.0016, + "step": 11590 + }, + { + "epoch": 44.00888888888889, + "grad_norm": 10.11349868774414, + "learning_rate": 1.234567901234568e-06, + "loss": 0.0017, + "step": 11600 + }, + { + "epoch": 44.009655172413794, + "grad_norm": 0.000782816088758409, + "learning_rate": 1.2260536398467433e-06, + "loss": 0.3825, + "step": 11610 + }, + { + "epoch": 44.0104214559387, + "grad_norm": 0.0004511185979936272, + "learning_rate": 1.2175393784589187e-06, + "loss": 1.0973, + "step": 11620 + }, + { + "epoch": 44.0111877394636, + "grad_norm": 0.0004485390963964164, + "learning_rate": 1.2090251170710942e-06, + "loss": 0.0009, + "step": 11630 + }, + { + "epoch": 44.011954022988505, + "grad_norm": 0.0005191498785279691, + "learning_rate": 1.2005108556832696e-06, + "loss": 0.0009, + "step": 11640 + }, + { + "epoch": 44.01272030651341, + "grad_norm": 0.0012321961112320423, + "learning_rate": 1.191996594295445e-06, + "loss": 0.0004, + "step": 11650 + }, + { + "epoch": 44.01348659003831, + "grad_norm": 0.006523193791508675, + "learning_rate": 1.1834823329076204e-06, + "loss": 0.541, + "step": 11660 + }, + { + "epoch": 44.014252873563215, + "grad_norm": 0.001767021487466991, + "learning_rate": 1.1749680715197956e-06, + "loss": 0.0001, + "step": 11670 + }, + { + "epoch": 44.01501915708812, + "grad_norm": 0.0005826118867844343, + "learning_rate": 1.166453810131971e-06, + "loss": 0.0003, + "step": 11680 + }, + { + "epoch": 44.01578544061303, + "grad_norm": 0.0010437779128551483, + "learning_rate": 1.1579395487441465e-06, + "loss": 0.0001, + "step": 11690 + }, + { + "epoch": 44.01655172413793, + "grad_norm": 0.0007777928840368986, + "learning_rate": 1.1494252873563219e-06, + "loss": 0.3824, + "step": 11700 + }, + { + "epoch": 44.01731800766284, + "grad_norm": 0.0009876244002953172, + "learning_rate": 1.1409110259684973e-06, + "loss": 0.0002, + "step": 11710 + }, + { + "epoch": 44.01808429118774, + "grad_norm": 0.007304656784981489, + "learning_rate": 1.1323967645806727e-06, + "loss": 0.55, + "step": 11720 + }, + { + "epoch": 44.018850574712644, + "grad_norm": 0.12421630322933197, + "learning_rate": 1.1238825031928481e-06, + "loss": 0.0326, + "step": 11730 + }, + { + "epoch": 44.01961685823755, + "grad_norm": 2.9402077198028564, + "learning_rate": 1.1153682418050235e-06, + "loss": 1.074, + "step": 11740 + }, + { + "epoch": 44.02, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 2.1594574451446533, + "eval_runtime": 16.432, + "eval_samples_per_second": 2.739, + "eval_steps_per_second": 2.739, + "step": 11745 + }, + { + "epoch": 45.000383141762455, + "grad_norm": 0.0022075134329497814, + "learning_rate": 1.106853980417199e-06, + "loss": 0.0002, + "step": 11750 + }, + { + "epoch": 45.00114942528736, + "grad_norm": 0.0017397621413692832, + "learning_rate": 1.0983397190293744e-06, + "loss": 0.5031, + "step": 11760 + }, + { + "epoch": 45.00191570881226, + "grad_norm": 0.0005939102848060429, + "learning_rate": 1.0898254576415498e-06, + "loss": 0.0004, + "step": 11770 + }, + { + "epoch": 45.002681992337166, + "grad_norm": 0.00116265332326293, + "learning_rate": 1.0813111962537252e-06, + "loss": 0.0004, + "step": 11780 + }, + { + "epoch": 45.00344827586207, + "grad_norm": 0.0006234461907297373, + "learning_rate": 1.0727969348659004e-06, + "loss": 0.7825, + "step": 11790 + }, + { + "epoch": 45.00421455938697, + "grad_norm": 806.7313842773438, + "learning_rate": 1.0642826734780758e-06, + "loss": 0.3269, + "step": 11800 + }, + { + "epoch": 45.00498084291188, + "grad_norm": 0.00044658681144937873, + "learning_rate": 1.0557684120902512e-06, + "loss": 0.4785, + "step": 11810 + }, + { + "epoch": 45.00574712643678, + "grad_norm": 0.0012156810844317079, + "learning_rate": 1.0472541507024267e-06, + "loss": 0.5853, + "step": 11820 + }, + { + "epoch": 45.006513409961684, + "grad_norm": 0.00045517139369621873, + "learning_rate": 1.038739889314602e-06, + "loss": 0.0044, + "step": 11830 + }, + { + "epoch": 45.00727969348659, + "grad_norm": 0.0006456039263866842, + "learning_rate": 1.0302256279267775e-06, + "loss": 0.0009, + "step": 11840 + }, + { + "epoch": 45.00804597701149, + "grad_norm": 0.0011488504242151976, + "learning_rate": 1.021711366538953e-06, + "loss": 0.7269, + "step": 11850 + }, + { + "epoch": 45.008812260536395, + "grad_norm": 0.0012480664299800992, + "learning_rate": 1.0131971051511283e-06, + "loss": 0.7995, + "step": 11860 + }, + { + "epoch": 45.009578544061306, + "grad_norm": 0.09593397378921509, + "learning_rate": 1.0046828437633035e-06, + "loss": 0.6449, + "step": 11870 + }, + { + "epoch": 45.01034482758621, + "grad_norm": 0.44798505306243896, + "learning_rate": 9.96168582375479e-07, + "loss": 0.0003, + "step": 11880 + }, + { + "epoch": 45.01111111111111, + "grad_norm": 0.004326379392296076, + "learning_rate": 9.876543209876544e-07, + "loss": 0.4509, + "step": 11890 + }, + { + "epoch": 45.01187739463602, + "grad_norm": 0.010555016808211803, + "learning_rate": 9.791400595998298e-07, + "loss": 0.0005, + "step": 11900 + }, + { + "epoch": 45.01264367816092, + "grad_norm": 0.0004949315916746855, + "learning_rate": 9.706257982120052e-07, + "loss": 0.0003, + "step": 11910 + }, + { + "epoch": 45.013409961685824, + "grad_norm": 0.001112055848352611, + "learning_rate": 9.621115368241806e-07, + "loss": 0.0002, + "step": 11920 + }, + { + "epoch": 45.01417624521073, + "grad_norm": 0.1513567864894867, + "learning_rate": 9.535972754363559e-07, + "loss": 0.001, + "step": 11930 + }, + { + "epoch": 45.01494252873563, + "grad_norm": 0.003296300768852234, + "learning_rate": 9.450830140485314e-07, + "loss": 0.0001, + "step": 11940 + }, + { + "epoch": 45.015708812260534, + "grad_norm": 0.07278383523225784, + "learning_rate": 9.365687526607068e-07, + "loss": 0.9561, + "step": 11950 + }, + { + "epoch": 45.01647509578544, + "grad_norm": 0.0007136106723919511, + "learning_rate": 9.280544912728822e-07, + "loss": 0.0005, + "step": 11960 + }, + { + "epoch": 45.01724137931034, + "grad_norm": 0.015829024836421013, + "learning_rate": 9.195402298850575e-07, + "loss": 0.7246, + "step": 11970 + }, + { + "epoch": 45.01800766283525, + "grad_norm": 334.7724914550781, + "learning_rate": 9.110259684972329e-07, + "loss": 0.4681, + "step": 11980 + }, + { + "epoch": 45.018773946360156, + "grad_norm": 0.00398367689922452, + "learning_rate": 9.025117071094083e-07, + "loss": 0.7101, + "step": 11990 + }, + { + "epoch": 45.01954022988506, + "grad_norm": 0.04738711565732956, + "learning_rate": 8.939974457215837e-07, + "loss": 0.0016, + "step": 12000 + }, + { + "epoch": 45.02, + "eval_accuracy": 0.7555555555555555, + "eval_loss": 1.853808879852295, + "eval_runtime": 16.4639, + "eval_samples_per_second": 2.733, + "eval_steps_per_second": 2.733, + "step": 12006 + }, + { + "epoch": 46.00030651340996, + "grad_norm": 0.0007470714044757187, + "learning_rate": 8.854831843337592e-07, + "loss": 0.0004, + "step": 12010 + }, + { + "epoch": 46.001072796934864, + "grad_norm": 0.008812682703137398, + "learning_rate": 8.769689229459345e-07, + "loss": 0.0366, + "step": 12020 + }, + { + "epoch": 46.00183908045977, + "grad_norm": 0.08954495936632156, + "learning_rate": 8.684546615581099e-07, + "loss": 0.0008, + "step": 12030 + }, + { + "epoch": 46.00260536398467, + "grad_norm": 0.016129348427057266, + "learning_rate": 8.599404001702853e-07, + "loss": 0.0002, + "step": 12040 + }, + { + "epoch": 46.00337164750958, + "grad_norm": 0.015925360843539238, + "learning_rate": 8.514261387824607e-07, + "loss": 0.0001, + "step": 12050 + }, + { + "epoch": 46.004137931034485, + "grad_norm": 0.004947807639837265, + "learning_rate": 8.429118773946361e-07, + "loss": 0.0003, + "step": 12060 + }, + { + "epoch": 46.00490421455939, + "grad_norm": 0.038208961486816406, + "learning_rate": 8.343976160068116e-07, + "loss": 0.5025, + "step": 12070 + }, + { + "epoch": 46.00567049808429, + "grad_norm": 0.05429494380950928, + "learning_rate": 8.258833546189869e-07, + "loss": 0.7446, + "step": 12080 + }, + { + "epoch": 46.006436781609196, + "grad_norm": 0.005066359415650368, + "learning_rate": 8.173690932311623e-07, + "loss": 0.0002, + "step": 12090 + }, + { + "epoch": 46.0072030651341, + "grad_norm": 0.0029733364935964346, + "learning_rate": 8.088548318433377e-07, + "loss": 0.5126, + "step": 12100 + }, + { + "epoch": 46.007969348659, + "grad_norm": 0.30708321928977966, + "learning_rate": 8.003405704555131e-07, + "loss": 0.0009, + "step": 12110 + }, + { + "epoch": 46.00873563218391, + "grad_norm": 0.006712548900395632, + "learning_rate": 7.918263090676885e-07, + "loss": 1.0584, + "step": 12120 + }, + { + "epoch": 46.00950191570881, + "grad_norm": 0.0009609467815607786, + "learning_rate": 7.833120476798637e-07, + "loss": 0.0413, + "step": 12130 + }, + { + "epoch": 46.010268199233714, + "grad_norm": 0.008377721533179283, + "learning_rate": 7.747977862920392e-07, + "loss": 0.0002, + "step": 12140 + }, + { + "epoch": 46.01103448275862, + "grad_norm": 0.0004620867548510432, + "learning_rate": 7.662835249042146e-07, + "loss": 0.0002, + "step": 12150 + }, + { + "epoch": 46.01180076628353, + "grad_norm": 0.002079889178276062, + "learning_rate": 7.5776926351639e-07, + "loss": 0.0005, + "step": 12160 + }, + { + "epoch": 46.01256704980843, + "grad_norm": 0.0010952705051749945, + "learning_rate": 7.492550021285653e-07, + "loss": 1.1892, + "step": 12170 + }, + { + "epoch": 46.013333333333335, + "grad_norm": 0.000436866917880252, + "learning_rate": 7.407407407407407e-07, + "loss": 0.8613, + "step": 12180 + }, + { + "epoch": 46.01409961685824, + "grad_norm": 0.023280123248696327, + "learning_rate": 7.322264793529161e-07, + "loss": 0.001, + "step": 12190 + }, + { + "epoch": 46.01486590038314, + "grad_norm": 0.0006040579755790532, + "learning_rate": 7.237122179650916e-07, + "loss": 0.0006, + "step": 12200 + }, + { + "epoch": 46.015632183908046, + "grad_norm": 1269.834228515625, + "learning_rate": 7.15197956577267e-07, + "loss": 0.3661, + "step": 12210 + }, + { + "epoch": 46.01639846743295, + "grad_norm": 0.028524765744805336, + "learning_rate": 7.066836951894424e-07, + "loss": 0.0003, + "step": 12220 + }, + { + "epoch": 46.01716475095785, + "grad_norm": 0.006299199536442757, + "learning_rate": 6.981694338016177e-07, + "loss": 0.6344, + "step": 12230 + }, + { + "epoch": 46.01793103448276, + "grad_norm": 0.0004130868474021554, + "learning_rate": 6.896551724137931e-07, + "loss": 0.606, + "step": 12240 + }, + { + "epoch": 46.01869731800766, + "grad_norm": 0.006240964867174625, + "learning_rate": 6.811409110259685e-07, + "loss": 0.0001, + "step": 12250 + }, + { + "epoch": 46.019463601532564, + "grad_norm": 0.004726331681013107, + "learning_rate": 6.72626649638144e-07, + "loss": 0.0002, + "step": 12260 + }, + { + "epoch": 46.02, + "eval_accuracy": 0.7555555555555555, + "eval_loss": 1.9418870210647583, + "eval_runtime": 15.2295, + "eval_samples_per_second": 2.955, + "eval_steps_per_second": 2.955, + "step": 12267 + }, + { + "epoch": 47.00022988505747, + "grad_norm": 0.0046911342069506645, + "learning_rate": 6.641123882503194e-07, + "loss": 0.0022, + "step": 12270 + }, + { + "epoch": 47.000996168582375, + "grad_norm": 36.75137710571289, + "learning_rate": 6.555981268624948e-07, + "loss": 1.1365, + "step": 12280 + }, + { + "epoch": 47.00176245210728, + "grad_norm": 0.03164047747850418, + "learning_rate": 6.470838654746701e-07, + "loss": 0.0012, + "step": 12290 + }, + { + "epoch": 47.00252873563218, + "grad_norm": 114.06194305419922, + "learning_rate": 6.385696040868455e-07, + "loss": 0.5134, + "step": 12300 + }, + { + "epoch": 47.003295019157086, + "grad_norm": 0.004502067808061838, + "learning_rate": 6.300553426990209e-07, + "loss": 0.0009, + "step": 12310 + }, + { + "epoch": 47.00406130268199, + "grad_norm": 0.004849694203585386, + "learning_rate": 6.215410813111964e-07, + "loss": 0.6976, + "step": 12320 + }, + { + "epoch": 47.00482758620689, + "grad_norm": 0.0008688484085723758, + "learning_rate": 6.130268199233717e-07, + "loss": 0.4239, + "step": 12330 + }, + { + "epoch": 47.005593869731804, + "grad_norm": 0.0006198923219926655, + "learning_rate": 6.045125585355471e-07, + "loss": 0.5128, + "step": 12340 + }, + { + "epoch": 47.00636015325671, + "grad_norm": 0.0004461881471797824, + "learning_rate": 5.959982971477225e-07, + "loss": 0.0003, + "step": 12350 + }, + { + "epoch": 47.00712643678161, + "grad_norm": 0.005352925043553114, + "learning_rate": 5.874840357598978e-07, + "loss": 0.0004, + "step": 12360 + }, + { + "epoch": 47.007892720306515, + "grad_norm": 0.00048328927368856966, + "learning_rate": 5.789697743720732e-07, + "loss": 0.0001, + "step": 12370 + }, + { + "epoch": 47.00865900383142, + "grad_norm": 0.0007524097454734147, + "learning_rate": 5.704555129842486e-07, + "loss": 0.0016, + "step": 12380 + }, + { + "epoch": 47.00942528735632, + "grad_norm": 17.501802444458008, + "learning_rate": 5.619412515964241e-07, + "loss": 0.0019, + "step": 12390 + }, + { + "epoch": 47.010191570881226, + "grad_norm": 0.0005570445209741592, + "learning_rate": 5.534269902085995e-07, + "loss": 0.0004, + "step": 12400 + }, + { + "epoch": 47.01095785440613, + "grad_norm": 0.002129359170794487, + "learning_rate": 5.449127288207749e-07, + "loss": 0.0003, + "step": 12410 + }, + { + "epoch": 47.01172413793103, + "grad_norm": 0.0004879856714978814, + "learning_rate": 5.363984674329502e-07, + "loss": 0.0002, + "step": 12420 + }, + { + "epoch": 47.01249042145594, + "grad_norm": 0.00046909274533391, + "learning_rate": 5.278842060451256e-07, + "loss": 0.3502, + "step": 12430 + }, + { + "epoch": 47.01325670498084, + "grad_norm": 0.03282985836267471, + "learning_rate": 5.19369944657301e-07, + "loss": 0.4477, + "step": 12440 + }, + { + "epoch": 47.014022988505744, + "grad_norm": 0.000536035280674696, + "learning_rate": 5.108556832694765e-07, + "loss": 0.0009, + "step": 12450 + }, + { + "epoch": 47.014789272030654, + "grad_norm": 0.0028050511609762907, + "learning_rate": 5.023414218816518e-07, + "loss": 0.0005, + "step": 12460 + }, + { + "epoch": 47.01555555555556, + "grad_norm": 0.12039543688297272, + "learning_rate": 4.938271604938272e-07, + "loss": 0.0112, + "step": 12470 + }, + { + "epoch": 47.01632183908046, + "grad_norm": 0.11689059436321259, + "learning_rate": 4.853128991060026e-07, + "loss": 0.0003, + "step": 12480 + }, + { + "epoch": 47.017088122605365, + "grad_norm": 0.004974637646228075, + "learning_rate": 4.7679863771817797e-07, + "loss": 0.0002, + "step": 12490 + }, + { + "epoch": 47.01785440613027, + "grad_norm": 0.00041797617450356483, + "learning_rate": 4.682843763303534e-07, + "loss": 0.0017, + "step": 12500 + }, + { + "epoch": 47.01862068965517, + "grad_norm": 0.018261730670928955, + "learning_rate": 4.5977011494252875e-07, + "loss": 0.0022, + "step": 12510 + }, + { + "epoch": 47.019386973180076, + "grad_norm": 0.0004435388545971364, + "learning_rate": 4.5125585355470417e-07, + "loss": 0.0005, + "step": 12520 + }, + { + "epoch": 47.02, + "eval_accuracy": 0.7777777777777778, + "eval_loss": 1.8378126621246338, + "eval_runtime": 13.2167, + "eval_samples_per_second": 3.405, + "eval_steps_per_second": 3.405, + "step": 12528 + }, + { + "epoch": 48.000153256704984, + "grad_norm": 0.0004780022718477994, + "learning_rate": 4.427415921668796e-07, + "loss": 0.0002, + "step": 12530 + }, + { + "epoch": 48.00091954022989, + "grad_norm": 0.0005424380651675165, + "learning_rate": 4.3422733077905495e-07, + "loss": 0.2688, + "step": 12540 + }, + { + "epoch": 48.00168582375479, + "grad_norm": 0.0015980154275894165, + "learning_rate": 4.2571306939123036e-07, + "loss": 0.0003, + "step": 12550 + }, + { + "epoch": 48.002452107279694, + "grad_norm": 0.0005452589248307049, + "learning_rate": 4.171988080034058e-07, + "loss": 0.0006, + "step": 12560 + }, + { + "epoch": 48.0032183908046, + "grad_norm": 0.0007816003053449094, + "learning_rate": 4.0868454661558115e-07, + "loss": 0.0001, + "step": 12570 + }, + { + "epoch": 48.0039846743295, + "grad_norm": 0.0005313084693625569, + "learning_rate": 4.0017028522775656e-07, + "loss": 0.7947, + "step": 12580 + }, + { + "epoch": 48.004750957854405, + "grad_norm": 0.04077745974063873, + "learning_rate": 3.916560238399319e-07, + "loss": 0.6254, + "step": 12590 + }, + { + "epoch": 48.00551724137931, + "grad_norm": 0.0005556725664064288, + "learning_rate": 3.831417624521073e-07, + "loss": 0.0001, + "step": 12600 + }, + { + "epoch": 48.00628352490421, + "grad_norm": 0.003675886895507574, + "learning_rate": 3.7462750106428265e-07, + "loss": 1.3723, + "step": 12610 + }, + { + "epoch": 48.007049808429116, + "grad_norm": 16.252607345581055, + "learning_rate": 3.6611323967645807e-07, + "loss": 0.712, + "step": 12620 + }, + { + "epoch": 48.00781609195402, + "grad_norm": 0.0017170652281492949, + "learning_rate": 3.575989782886335e-07, + "loss": 0.0001, + "step": 12630 + }, + { + "epoch": 48.00858237547893, + "grad_norm": 0.0009851758368313313, + "learning_rate": 3.4908471690080885e-07, + "loss": 0.0005, + "step": 12640 + }, + { + "epoch": 48.009348659003834, + "grad_norm": 170.0392608642578, + "learning_rate": 3.4057045551298427e-07, + "loss": 0.6978, + "step": 12650 + }, + { + "epoch": 48.01011494252874, + "grad_norm": 0.000492373073939234, + "learning_rate": 3.320561941251597e-07, + "loss": 0.0001, + "step": 12660 + }, + { + "epoch": 48.01088122605364, + "grad_norm": 0.0003946028009522706, + "learning_rate": 3.2354193273733505e-07, + "loss": 0.0001, + "step": 12670 + }, + { + "epoch": 48.011647509578545, + "grad_norm": 0.008205920457839966, + "learning_rate": 3.1502767134951047e-07, + "loss": 0.0002, + "step": 12680 + }, + { + "epoch": 48.01241379310345, + "grad_norm": 0.003728834679350257, + "learning_rate": 3.0651340996168583e-07, + "loss": 0.0002, + "step": 12690 + }, + { + "epoch": 48.01318007662835, + "grad_norm": 0.0004123710095882416, + "learning_rate": 2.9799914857386125e-07, + "loss": 0.433, + "step": 12700 + }, + { + "epoch": 48.013946360153255, + "grad_norm": 0.011552422307431698, + "learning_rate": 2.894848871860366e-07, + "loss": 0.0003, + "step": 12710 + }, + { + "epoch": 48.01471264367816, + "grad_norm": 0.4225425124168396, + "learning_rate": 2.8097062579821203e-07, + "loss": 0.0005, + "step": 12720 + }, + { + "epoch": 48.01547892720306, + "grad_norm": 0.23171181976795197, + "learning_rate": 2.7245636441038745e-07, + "loss": 1.3635, + "step": 12730 + }, + { + "epoch": 48.016245210727966, + "grad_norm": 0.0025505023077130318, + "learning_rate": 2.639421030225628e-07, + "loss": 0.0002, + "step": 12740 + }, + { + "epoch": 48.01701149425288, + "grad_norm": 0.0010226961458101869, + "learning_rate": 2.5542784163473823e-07, + "loss": 0.4759, + "step": 12750 + }, + { + "epoch": 48.01777777777778, + "grad_norm": 0.0028585917316377163, + "learning_rate": 2.469135802469136e-07, + "loss": 0.0005, + "step": 12760 + }, + { + "epoch": 48.018544061302684, + "grad_norm": 0.0008389436989091337, + "learning_rate": 2.3839931885908898e-07, + "loss": 0.0002, + "step": 12770 + }, + { + "epoch": 48.01931034482759, + "grad_norm": 0.015469871461391449, + "learning_rate": 2.2988505747126437e-07, + "loss": 0.0001, + "step": 12780 + }, + { + "epoch": 48.02, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 2.0624947547912598, + "eval_runtime": 13.2804, + "eval_samples_per_second": 3.388, + "eval_steps_per_second": 3.388, + "step": 12789 + }, + { + "epoch": 49.00007662835249, + "grad_norm": 0.0008100093109533191, + "learning_rate": 2.213707960834398e-07, + "loss": 0.0009, + "step": 12790 + }, + { + "epoch": 49.00084291187739, + "grad_norm": 0.0005122015136294067, + "learning_rate": 2.1285653469561518e-07, + "loss": 0.0006, + "step": 12800 + }, + { + "epoch": 49.001609195402295, + "grad_norm": 0.003411679295822978, + "learning_rate": 2.0434227330779057e-07, + "loss": 0.6112, + "step": 12810 + }, + { + "epoch": 49.002375478927206, + "grad_norm": 0.00041588529711589217, + "learning_rate": 1.9582801191996594e-07, + "loss": 0.6942, + "step": 12820 + }, + { + "epoch": 49.00314176245211, + "grad_norm": 0.0009530864772386849, + "learning_rate": 1.8731375053214133e-07, + "loss": 0.8652, + "step": 12830 + }, + { + "epoch": 49.00390804597701, + "grad_norm": 1160.8369140625, + "learning_rate": 1.7879948914431674e-07, + "loss": 0.9585, + "step": 12840 + }, + { + "epoch": 49.00467432950192, + "grad_norm": 0.24398158490657806, + "learning_rate": 1.7028522775649214e-07, + "loss": 0.0002, + "step": 12850 + }, + { + "epoch": 49.00544061302682, + "grad_norm": 0.001491599716246128, + "learning_rate": 1.6177096636866753e-07, + "loss": 0.0011, + "step": 12860 + }, + { + "epoch": 49.006206896551724, + "grad_norm": 0.013720017857849598, + "learning_rate": 1.5325670498084292e-07, + "loss": 0.0002, + "step": 12870 + }, + { + "epoch": 49.00697318007663, + "grad_norm": 0.006028338335454464, + "learning_rate": 1.447424435930183e-07, + "loss": 0.0006, + "step": 12880 + }, + { + "epoch": 49.00773946360153, + "grad_norm": 0.000989487743936479, + "learning_rate": 1.3622818220519372e-07, + "loss": 0.0002, + "step": 12890 + }, + { + "epoch": 49.008505747126435, + "grad_norm": 0.0008877341751940548, + "learning_rate": 1.2771392081736911e-07, + "loss": 0.7048, + "step": 12900 + }, + { + "epoch": 49.00927203065134, + "grad_norm": 0.0015821445267647505, + "learning_rate": 1.1919965942954449e-07, + "loss": 0.0001, + "step": 12910 + }, + { + "epoch": 49.01003831417624, + "grad_norm": 0.0005364256794564426, + "learning_rate": 1.106853980417199e-07, + "loss": 0.0005, + "step": 12920 + }, + { + "epoch": 49.01080459770115, + "grad_norm": 0.0021387161687016487, + "learning_rate": 1.0217113665389529e-07, + "loss": 0.6235, + "step": 12930 + }, + { + "epoch": 49.011570881226056, + "grad_norm": 0.0005515425582416356, + "learning_rate": 9.365687526607066e-08, + "loss": 0.0001, + "step": 12940 + }, + { + "epoch": 49.01233716475096, + "grad_norm": 0.46449440717697144, + "learning_rate": 8.514261387824607e-08, + "loss": 0.0002, + "step": 12950 + }, + { + "epoch": 49.013103448275864, + "grad_norm": 0.002538820030167699, + "learning_rate": 7.662835249042146e-08, + "loss": 0.406, + "step": 12960 + }, + { + "epoch": 49.01386973180077, + "grad_norm": 0.00038823066279292107, + "learning_rate": 6.811409110259686e-08, + "loss": 0.0527, + "step": 12970 + }, + { + "epoch": 49.01463601532567, + "grad_norm": 0.32803836464881897, + "learning_rate": 5.9599829714772246e-08, + "loss": 0.0039, + "step": 12980 + }, + { + "epoch": 49.015402298850574, + "grad_norm": 0.0092452522367239, + "learning_rate": 5.108556832694764e-08, + "loss": 0.0001, + "step": 12990 + }, + { + "epoch": 49.01616858237548, + "grad_norm": 0.039832472801208496, + "learning_rate": 4.2571306939123034e-08, + "loss": 0.0313, + "step": 13000 + }, + { + "epoch": 49.01693486590038, + "grad_norm": 0.4108397960662842, + "learning_rate": 3.405704555129843e-08, + "loss": 0.0005, + "step": 13010 + }, + { + "epoch": 49.017701149425285, + "grad_norm": 0.00046747943270020187, + "learning_rate": 2.554278416347382e-08, + "loss": 0.0004, + "step": 13020 + }, + { + "epoch": 49.01846743295019, + "grad_norm": 0.0007605894934386015, + "learning_rate": 1.7028522775649215e-08, + "loss": 0.0832, + "step": 13030 + }, + { + "epoch": 49.01923371647509, + "grad_norm": 0.0008342632208950818, + "learning_rate": 8.514261387824608e-09, + "loss": 0.0001, + "step": 13040 + }, + { + "epoch": 49.02, + "grad_norm": 0.03882928565144539, + "learning_rate": 0.0, + "loss": 0.59, + "step": 13050 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 2.0340445041656494, + "eval_runtime": 15.8259, + "eval_samples_per_second": 2.843, + "eval_steps_per_second": 2.843, + "step": 13050 + }, + { + "epoch": 49.02, + "step": 13050, + "total_flos": 5.730289341462282e+19, + "train_loss": 0.7221503470706512, + "train_runtime": 9500.8761, + "train_samples_per_second": 1.374, + "train_steps_per_second": 1.374 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.8666666666666667, + "eval_loss": 0.5090304017066956, + "eval_runtime": 13.1433, + "eval_samples_per_second": 3.424, + "eval_steps_per_second": 3.424, + "step": 13050 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.8666666666666667, + "eval_loss": 0.5090304017066956, + "eval_runtime": 13.1401, + "eval_samples_per_second": 3.425, + "eval_steps_per_second": 3.425, + "step": 13050 + } + ], + "logging_steps": 10, + "max_steps": 13050, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.730289341462282e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}