🍻 cheers

Browse files

Files changed (6) hide show

README.md +3 -2
all_results.json +13 -0
eval_results.json +8 -0
runs/May11_19-53-00_9479ebf6a298/events.out.tfevents.1715466479.9479ebf6a298.34.1 +3 -0
train_results.json +8 -0
trainer_state.json +3727 -0

README.md CHANGED Viewed

@@ -2,6 +2,7 @@
 license: apache-2.0
 base_model: google/vit-base-patch16-384
 tags:
 - generated_from_trainer
 datasets:
 - imagefolder
@@ -14,7 +15,7 @@ model-index:
       name: Image Classification
       type: image-classification
     dataset:
-      name: imagefolder
       type: imagefolder
       config: default
       split: train
@@ -30,7 +31,7 @@ should probably proofread and complete it, then remove this comment. -->
 # google-vit-base-patch16-384-in21k-batch_16_epoch_4_classes_24_final_withAug_12th_May
-This model is a fine-tuned version of [google/vit-base-patch16-384](https://huggingface.co/google/vit-base-patch16-384) on the imagefolder dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.0282
 - Accuracy: 0.9905

 license: apache-2.0
 base_model: google/vit-base-patch16-384
 tags:
+- image-classification
 - generated_from_trainer
 datasets:
 - imagefolder
       name: Image Classification
       type: image-classification
     dataset:
+      name: bengali_food_images
       type: imagefolder
       config: default
       split: train
 # google-vit-base-patch16-384-in21k-batch_16_epoch_4_classes_24_final_withAug_12th_May
+This model is a fine-tuned version of [google/vit-base-patch16-384](https://huggingface.co/google/vit-base-patch16-384) on the bengali_food_images dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.0282
 - Accuracy: 0.9905

all_results.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+    "epoch": 4.0,
+    "eval_accuracy": 0.9904891304347826,
+    "eval_loss": 0.02818419598042965,
+    "eval_runtime": 37.6338,
+    "eval_samples_per_second": 19.557,
+    "eval_steps_per_second": 2.445,
+    "total_flos": 1.715138644229908e+19,
+    "train_loss": 0.04991757846501284,
+    "train_runtime": 7931.4156,
+    "train_samples_per_second": 9.462,
+    "train_steps_per_second": 0.592
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 4.0,
+    "eval_accuracy": 0.9904891304347826,
+    "eval_loss": 0.02818419598042965,
+    "eval_runtime": 37.6338,
+    "eval_samples_per_second": 19.557,
+    "eval_steps_per_second": 2.445
+}

runs/May11_19-53-00_9479ebf6a298/events.out.tfevents.1715466479.9479ebf6a298.34.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79fd1ecbec4d8be65136aac06c403dc6ba03fc90a57ca9f92195dda9fd49c426
+size 411

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 4.0,
+    "total_flos": 1.715138644229908e+19,
+    "train_loss": 0.04991757846501284,
+    "train_runtime": 7931.4156,
+    "train_samples_per_second": 9.462,
+    "train_steps_per_second": 0.592
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,3727 @@

+{
+  "best_metric": 0.02818419598042965,
+  "best_model_checkpoint": "/kaggle/working/Model/google-vit-base-patch16-384-in21k-batch_16_epoch_4_classes_24_final_withAug_12th_May/checkpoint-4600",
+  "epoch": 4.0,
+  "eval_steps": 100,
+  "global_step": 4692,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": 7.906991004943848,
+      "learning_rate": 0.00019957374254049446,
+      "loss": 2.6333,
+      "step": 10
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 3.6195640563964844,
+      "learning_rate": 0.00019914748508098894,
+      "loss": 1.188,
+      "step": 20
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 6.639026641845703,
+      "learning_rate": 0.00019872122762148339,
+      "loss": 0.574,
+      "step": 30
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 3.0548319816589355,
+      "learning_rate": 0.00019829497016197784,
+      "loss": 0.5497,
+      "step": 40
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 4.425312519073486,
+      "learning_rate": 0.00019786871270247228,
+      "loss": 0.3116,
+      "step": 50
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 6.9624152183532715,
+      "learning_rate": 0.00019744245524296676,
+      "loss": 0.189,
+      "step": 60
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 3.1430420875549316,
+      "learning_rate": 0.0001970161977834612,
+      "loss": 0.2623,
+      "step": 70
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 3.863529682159424,
+      "learning_rate": 0.0001965899403239557,
+      "loss": 0.2368,
+      "step": 80
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 3.1321828365325928,
+      "learning_rate": 0.00019616368286445014,
+      "loss": 0.2392,
+      "step": 90
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 3.542583703994751,
+      "learning_rate": 0.0001957374254049446,
+      "loss": 0.269,
+      "step": 100
+    },
+    {
+      "epoch": 0.09,
+      "eval_accuracy": 0.8722826086956522,
+      "eval_loss": 0.40010249614715576,
+      "eval_runtime": 38.5928,
+      "eval_samples_per_second": 19.071,
+      "eval_steps_per_second": 2.384,
+      "step": 100
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 1.091883897781372,
+      "learning_rate": 0.00019531116794543904,
+      "loss": 0.2591,
+      "step": 110
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.6820523738861084,
+      "learning_rate": 0.00019488491048593351,
+      "loss": 0.276,
+      "step": 120
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 3.8274409770965576,
+      "learning_rate": 0.00019445865302642796,
+      "loss": 0.1666,
+      "step": 130
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 3.5986907482147217,
+      "learning_rate": 0.00019403239556692244,
+      "loss": 0.314,
+      "step": 140
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.7001522779464722,
+      "learning_rate": 0.0001936061381074169,
+      "loss": 0.2925,
+      "step": 150
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 2.005779981613159,
+      "learning_rate": 0.00019317988064791134,
+      "loss": 0.2421,
+      "step": 160
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 5.918694496154785,
+      "learning_rate": 0.0001927536231884058,
+      "loss": 0.2631,
+      "step": 170
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 8.725202560424805,
+      "learning_rate": 0.00019232736572890027,
+      "loss": 0.2363,
+      "step": 180
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.6118396520614624,
+      "learning_rate": 0.00019190110826939472,
+      "loss": 0.127,
+      "step": 190
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 6.456448554992676,
+      "learning_rate": 0.0001914748508098892,
+      "loss": 0.1829,
+      "step": 200
+    },
+    {
+      "epoch": 0.17,
+      "eval_accuracy": 0.9728260869565217,
+      "eval_loss": 0.0928073525428772,
+      "eval_runtime": 35.1241,
+      "eval_samples_per_second": 20.954,
+      "eval_steps_per_second": 2.619,
+      "step": 200
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.9948309659957886,
+      "learning_rate": 0.00019104859335038364,
+      "loss": 0.0722,
+      "step": 210
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 4.800300121307373,
+      "learning_rate": 0.0001906223358908781,
+      "loss": 0.1305,
+      "step": 220
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 5.83320951461792,
+      "learning_rate": 0.00019019607843137254,
+      "loss": 0.1848,
+      "step": 230
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 3.1153628826141357,
+      "learning_rate": 0.00018976982097186702,
+      "loss": 0.0525,
+      "step": 240
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 4.322732448577881,
+      "learning_rate": 0.00018934356351236147,
+      "loss": 0.1815,
+      "step": 250
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.0198371410369873,
+      "learning_rate": 0.00018891730605285594,
+      "loss": 0.0529,
+      "step": 260
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 12.497954368591309,
+      "learning_rate": 0.0001884910485933504,
+      "loss": 0.2785,
+      "step": 270
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 8.491466522216797,
+      "learning_rate": 0.00018806479113384484,
+      "loss": 0.1566,
+      "step": 280
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5412869453430176,
+      "learning_rate": 0.0001876385336743393,
+      "loss": 0.0762,
+      "step": 290
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 4.8113322257995605,
+      "learning_rate": 0.00018721227621483377,
+      "loss": 0.1737,
+      "step": 300
+    },
+    {
+      "epoch": 0.26,
+      "eval_accuracy": 0.9456521739130435,
+      "eval_loss": 0.16146068274974823,
+      "eval_runtime": 35.2099,
+      "eval_samples_per_second": 20.903,
+      "eval_steps_per_second": 2.613,
+      "step": 300
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 3.4217417240142822,
+      "learning_rate": 0.00018678601875532822,
+      "loss": 0.1649,
+      "step": 310
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 5.795102596282959,
+      "learning_rate": 0.0001863597612958227,
+      "loss": 0.1752,
+      "step": 320
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.3124635219573975,
+      "learning_rate": 0.00018593350383631715,
+      "loss": 0.1436,
+      "step": 330
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 2.955580949783325,
+      "learning_rate": 0.0001855072463768116,
+      "loss": 0.102,
+      "step": 340
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.491114616394043,
+      "learning_rate": 0.00018508098891730605,
+      "loss": 0.0966,
+      "step": 350
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 5.680546283721924,
+      "learning_rate": 0.00018465473145780052,
+      "loss": 0.1507,
+      "step": 360
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.0036741495132446,
+      "learning_rate": 0.00018422847399829497,
+      "loss": 0.106,
+      "step": 370
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 9.161335945129395,
+      "learning_rate": 0.00018380221653878945,
+      "loss": 0.1634,
+      "step": 380
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.424228310585022,
+      "learning_rate": 0.0001833759590792839,
+      "loss": 0.2556,
+      "step": 390
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.27238574624061584,
+      "learning_rate": 0.00018294970161977835,
+      "loss": 0.2096,
+      "step": 400
+    },
+    {
+      "epoch": 0.34,
+      "eval_accuracy": 0.9021739130434783,
+      "eval_loss": 0.41383373737335205,
+      "eval_runtime": 35.2087,
+      "eval_samples_per_second": 20.904,
+      "eval_steps_per_second": 2.613,
+      "step": 400
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 3.9731991291046143,
+      "learning_rate": 0.0001825234441602728,
+      "loss": 0.159,
+      "step": 410
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.442360281944275,
+      "learning_rate": 0.00018209718670076727,
+      "loss": 0.0725,
+      "step": 420
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.8118956089019775,
+      "learning_rate": 0.00018167092924126172,
+      "loss": 0.1007,
+      "step": 430
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.04008404538035393,
+      "learning_rate": 0.0001812446717817562,
+      "loss": 0.1467,
+      "step": 440
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.182392954826355,
+      "learning_rate": 0.00018081841432225065,
+      "loss": 0.1886,
+      "step": 450
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.017091387882828712,
+      "learning_rate": 0.0001803921568627451,
+      "loss": 0.1205,
+      "step": 460
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 3.4202051162719727,
+      "learning_rate": 0.00017996589940323955,
+      "loss": 0.0578,
+      "step": 470
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.2947380542755127,
+      "learning_rate": 0.00017953964194373403,
+      "loss": 0.1209,
+      "step": 480
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.019774317741394043,
+      "learning_rate": 0.00017911338448422848,
+      "loss": 0.1035,
+      "step": 490
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.02473970502614975,
+      "learning_rate": 0.00017868712702472295,
+      "loss": 0.1855,
+      "step": 500
+    },
+    {
+      "epoch": 0.43,
+      "eval_accuracy": 0.9510869565217391,
+      "eval_loss": 0.18139685690402985,
+      "eval_runtime": 35.3047,
+      "eval_samples_per_second": 20.847,
+      "eval_steps_per_second": 2.606,
+      "step": 500
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 5.77871036529541,
+      "learning_rate": 0.0001782608695652174,
+      "loss": 0.1413,
+      "step": 510
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 2.2217156887054443,
+      "learning_rate": 0.00017783461210571185,
+      "loss": 0.2021,
+      "step": 520
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.3217977285385132,
+      "learning_rate": 0.0001774083546462063,
+      "loss": 0.0737,
+      "step": 530
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 12.503950119018555,
+      "learning_rate": 0.00017698209718670078,
+      "loss": 0.1672,
+      "step": 540
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 9.1078519821167,
+      "learning_rate": 0.00017655583972719523,
+      "loss": 0.2553,
+      "step": 550
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4439474940299988,
+      "learning_rate": 0.0001761295822676897,
+      "loss": 0.1189,
+      "step": 560
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.04978885501623154,
+      "learning_rate": 0.00017570332480818415,
+      "loss": 0.0918,
+      "step": 570
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 4.984426498413086,
+      "learning_rate": 0.0001752770673486786,
+      "loss": 0.1974,
+      "step": 580
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.05325145274400711,
+      "learning_rate": 0.00017485080988917305,
+      "loss": 0.1798,
+      "step": 590
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 6.239779949188232,
+      "learning_rate": 0.00017442455242966753,
+      "loss": 0.0901,
+      "step": 600
+    },
+    {
+      "epoch": 0.51,
+      "eval_accuracy": 0.9578804347826086,
+      "eval_loss": 0.14351356029510498,
+      "eval_runtime": 35.7344,
+      "eval_samples_per_second": 20.596,
+      "eval_steps_per_second": 2.575,
+      "step": 600
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.050971053540706635,
+      "learning_rate": 0.00017399829497016198,
+      "loss": 0.2124,
+      "step": 610
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 3.303232192993164,
+      "learning_rate": 0.00017357203751065646,
+      "loss": 0.0349,
+      "step": 620
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.1450140476226807,
+      "learning_rate": 0.0001731457800511509,
+      "loss": 0.1135,
+      "step": 630
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 5.032433986663818,
+      "learning_rate": 0.00017271952259164536,
+      "loss": 0.1014,
+      "step": 640
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.12293048948049545,
+      "learning_rate": 0.0001722932651321398,
+      "loss": 0.0874,
+      "step": 650
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 3.784557342529297,
+      "learning_rate": 0.00017190963341858485,
+      "loss": 0.3172,
+      "step": 660
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.070284403860569,
+      "learning_rate": 0.0001714833759590793,
+      "loss": 0.0471,
+      "step": 670
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.204810380935669,
+      "learning_rate": 0.00017105711849957375,
+      "loss": 0.1632,
+      "step": 680
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 2.527963638305664,
+      "learning_rate": 0.0001706308610400682,
+      "loss": 0.0425,
+      "step": 690
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 4.840771198272705,
+      "learning_rate": 0.00017020460358056267,
+      "loss": 0.1406,
+      "step": 700
+    },
+    {
+      "epoch": 0.6,
+      "eval_accuracy": 0.9619565217391305,
+      "eval_loss": 0.1468360275030136,
+      "eval_runtime": 35.3013,
+      "eval_samples_per_second": 20.849,
+      "eval_steps_per_second": 2.606,
+      "step": 700
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 5.921713829040527,
+      "learning_rate": 0.00016977834612105712,
+      "loss": 0.146,
+      "step": 710
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.3161861896514893,
+      "learning_rate": 0.0001693520886615516,
+      "loss": 0.0994,
+      "step": 720
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.975315272808075,
+      "learning_rate": 0.00016892583120204605,
+      "loss": 0.0713,
+      "step": 730
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 2.5411438941955566,
+      "learning_rate": 0.0001684995737425405,
+      "loss": 0.1335,
+      "step": 740
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 3.4745705127716064,
+      "learning_rate": 0.00016807331628303495,
+      "loss": 0.1297,
+      "step": 750
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 7.845597267150879,
+      "learning_rate": 0.00016764705882352942,
+      "loss": 0.1963,
+      "step": 760
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.8821056485176086,
+      "learning_rate": 0.00016722080136402387,
+      "loss": 0.034,
+      "step": 770
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 9.454300880432129,
+      "learning_rate": 0.00016679454390451835,
+      "loss": 0.0799,
+      "step": 780
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.8250120282173157,
+      "learning_rate": 0.0001663682864450128,
+      "loss": 0.0692,
+      "step": 790
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 6.947498321533203,
+      "learning_rate": 0.00016594202898550725,
+      "loss": 0.136,
+      "step": 800
+    },
+    {
+      "epoch": 0.68,
+      "eval_accuracy": 0.9565217391304348,
+      "eval_loss": 0.15317288041114807,
+      "eval_runtime": 35.4081,
+      "eval_samples_per_second": 20.786,
+      "eval_steps_per_second": 2.598,
+      "step": 800
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 5.202281951904297,
+      "learning_rate": 0.0001655157715260017,
+      "loss": 0.1312,
+      "step": 810
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.957801342010498,
+      "learning_rate": 0.00016508951406649618,
+      "loss": 0.0791,
+      "step": 820
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.21069864928722382,
+      "learning_rate": 0.00016466325660699063,
+      "loss": 0.0677,
+      "step": 830
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 5.757635593414307,
+      "learning_rate": 0.0001642369991474851,
+      "loss": 0.0362,
+      "step": 840
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5154548287391663,
+      "learning_rate": 0.00016381074168797955,
+      "loss": 0.0376,
+      "step": 850
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.27015453577041626,
+      "learning_rate": 0.000163384484228474,
+      "loss": 0.0713,
+      "step": 860
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 3.8011927604675293,
+      "learning_rate": 0.00016295822676896845,
+      "loss": 0.0737,
+      "step": 870
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.022304391488432884,
+      "learning_rate": 0.00016253196930946293,
+      "loss": 0.1347,
+      "step": 880
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 2.486687421798706,
+      "learning_rate": 0.00016210571184995738,
+      "loss": 0.0316,
+      "step": 890
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.04768240451812744,
+      "learning_rate": 0.00016167945439045185,
+      "loss": 0.0666,
+      "step": 900
+    },
+    {
+      "epoch": 0.77,
+      "eval_accuracy": 0.967391304347826,
+      "eval_loss": 0.11767712980508804,
+      "eval_runtime": 35.2067,
+      "eval_samples_per_second": 20.905,
+      "eval_steps_per_second": 2.613,
+      "step": 900
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.006724517792463303,
+      "learning_rate": 0.0001612531969309463,
+      "loss": 0.0386,
+      "step": 910
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 10.303292274475098,
+      "learning_rate": 0.00016082693947144075,
+      "loss": 0.0667,
+      "step": 920
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.029503727331757545,
+      "learning_rate": 0.0001604006820119352,
+      "loss": 0.1628,
+      "step": 930
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.09929509460926056,
+      "learning_rate": 0.00015997442455242968,
+      "loss": 0.0552,
+      "step": 940
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7176109552383423,
+      "learning_rate": 0.00015954816709292413,
+      "loss": 0.1002,
+      "step": 950
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.25865957140922546,
+      "learning_rate": 0.0001591219096334186,
+      "loss": 0.0576,
+      "step": 960
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.06099778786301613,
+      "learning_rate": 0.00015869565217391306,
+      "loss": 0.0031,
+      "step": 970
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.08257748931646347,
+      "learning_rate": 0.0001582693947144075,
+      "loss": 0.047,
+      "step": 980
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 2.76910138130188,
+      "learning_rate": 0.00015784313725490196,
+      "loss": 0.1426,
+      "step": 990
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 5.632607936859131,
+      "learning_rate": 0.00015741687979539643,
+      "loss": 0.1145,
+      "step": 1000
+    },
+    {
+      "epoch": 0.85,
+      "eval_accuracy": 0.9497282608695652,
+      "eval_loss": 0.1794203668832779,
+      "eval_runtime": 35.4699,
+      "eval_samples_per_second": 20.75,
+      "eval_steps_per_second": 2.594,
+      "step": 1000
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 3.2017662525177,
+      "learning_rate": 0.00015699062233589088,
+      "loss": 0.1359,
+      "step": 1010
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.1852585077285767,
+      "learning_rate": 0.00015656436487638536,
+      "loss": 0.0522,
+      "step": 1020
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 2.903183698654175,
+      "learning_rate": 0.0001561381074168798,
+      "loss": 0.0893,
+      "step": 1030
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.20862096548080444,
+      "learning_rate": 0.00015571184995737426,
+      "loss": 0.0487,
+      "step": 1040
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.12325622886419296,
+      "learning_rate": 0.0001552855924978687,
+      "loss": 0.0734,
+      "step": 1050
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.027238862589001656,
+      "learning_rate": 0.00015485933503836318,
+      "loss": 0.1244,
+      "step": 1060
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.05954383313655853,
+      "learning_rate": 0.00015443307757885763,
+      "loss": 0.0357,
+      "step": 1070
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 3.9473464488983154,
+      "learning_rate": 0.0001540068201193521,
+      "loss": 0.1167,
+      "step": 1080
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.054384492337703705,
+      "learning_rate": 0.00015358056265984656,
+      "loss": 0.0712,
+      "step": 1090
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.4992036819458008,
+      "learning_rate": 0.000153154305200341,
+      "loss": 0.0865,
+      "step": 1100
+    },
+    {
+      "epoch": 0.94,
+      "eval_accuracy": 0.96875,
+      "eval_loss": 0.11128566414117813,
+      "eval_runtime": 35.307,
+      "eval_samples_per_second": 20.846,
+      "eval_steps_per_second": 2.606,
+      "step": 1100
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.1278241872787476,
+      "learning_rate": 0.00015272804774083546,
+      "loss": 0.0666,
+      "step": 1110
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.008853895589709282,
+      "learning_rate": 0.00015230179028132994,
+      "loss": 0.0446,
+      "step": 1120
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.18212011456489563,
+      "learning_rate": 0.0001518755328218244,
+      "loss": 0.0386,
+      "step": 1130
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.1280270516872406,
+      "learning_rate": 0.00015144927536231886,
+      "loss": 0.0545,
+      "step": 1140
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.9832583665847778,
+      "learning_rate": 0.0001510230179028133,
+      "loss": 0.016,
+      "step": 1150
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.5265651345252991,
+      "learning_rate": 0.00015059676044330776,
+      "loss": 0.0177,
+      "step": 1160
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 7.077823638916016,
+      "learning_rate": 0.0001501705029838022,
+      "loss": 0.0924,
+      "step": 1170
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 7.029810905456543,
+      "learning_rate": 0.0001497442455242967,
+      "loss": 0.0864,
+      "step": 1180
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.7946345806121826,
+      "learning_rate": 0.00014931798806479114,
+      "loss": 0.0113,
+      "step": 1190
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.7585835456848145,
+      "learning_rate": 0.00014889173060528562,
+      "loss": 0.0612,
+      "step": 1200
+    },
+    {
+      "epoch": 1.02,
+      "eval_accuracy": 0.96875,
+      "eval_loss": 0.1270022839307785,
+      "eval_runtime": 35.4385,
+      "eval_samples_per_second": 20.768,
+      "eval_steps_per_second": 2.596,
+      "step": 1200
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 4.02322244644165,
+      "learning_rate": 0.00014846547314578007,
+      "loss": 0.0694,
+      "step": 1210
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.1861138790845871,
+      "learning_rate": 0.00014803921568627451,
+      "loss": 0.0589,
+      "step": 1220
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.013929170556366444,
+      "learning_rate": 0.00014761295822676896,
+      "loss": 0.032,
+      "step": 1230
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.7604354619979858,
+      "learning_rate": 0.00014718670076726344,
+      "loss": 0.003,
+      "step": 1240
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.11580263823270798,
+      "learning_rate": 0.0001467604433077579,
+      "loss": 0.0921,
+      "step": 1250
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.5570498108863831,
+      "learning_rate": 0.00014633418584825237,
+      "loss": 0.0694,
+      "step": 1260
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.014709889888763428,
+      "learning_rate": 0.00014590792838874682,
+      "loss": 0.1376,
+      "step": 1270
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 5.309992790222168,
+      "learning_rate": 0.00014548167092924127,
+      "loss": 0.0667,
+      "step": 1280
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.8405873775482178,
+      "learning_rate": 0.00014505541346973572,
+      "loss": 0.0101,
+      "step": 1290
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.09988798946142197,
+      "learning_rate": 0.0001446291560102302,
+      "loss": 0.0038,
+      "step": 1300
+    },
+    {
+      "epoch": 1.11,
+      "eval_accuracy": 0.9782608695652174,
+      "eval_loss": 0.07244057208299637,
+      "eval_runtime": 35.5518,
+      "eval_samples_per_second": 20.702,
+      "eval_steps_per_second": 2.588,
+      "step": 1300
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 6.244915008544922,
+      "learning_rate": 0.00014420289855072464,
+      "loss": 0.0365,
+      "step": 1310
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.004242677241563797,
+      "learning_rate": 0.00014377664109121912,
+      "loss": 0.0497,
+      "step": 1320
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 6.292332649230957,
+      "learning_rate": 0.00014335038363171357,
+      "loss": 0.0263,
+      "step": 1330
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.0060243159532547,
+      "learning_rate": 0.00014292412617220802,
+      "loss": 0.0013,
+      "step": 1340
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.15038518607616425,
+      "learning_rate": 0.00014249786871270247,
+      "loss": 0.0339,
+      "step": 1350
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.007978633977472782,
+      "learning_rate": 0.00014207161125319695,
+      "loss": 0.0007,
+      "step": 1360
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.002196653513237834,
+      "learning_rate": 0.0001416453537936914,
+      "loss": 0.0082,
+      "step": 1370
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.002195573877543211,
+      "learning_rate": 0.00014121909633418587,
+      "loss": 0.0471,
+      "step": 1380
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.007308514788746834,
+      "learning_rate": 0.00014079283887468032,
+      "loss": 0.0012,
+      "step": 1390
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.06914997845888138,
+      "learning_rate": 0.00014036658141517477,
+      "loss": 0.0006,
+      "step": 1400
+    },
+    {
+      "epoch": 1.19,
+      "eval_accuracy": 0.9850543478260869,
+      "eval_loss": 0.07148104906082153,
+      "eval_runtime": 35.5936,
+      "eval_samples_per_second": 20.678,
+      "eval_steps_per_second": 2.585,
+      "step": 1400
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.0030923793092370033,
+      "learning_rate": 0.00013994032395566922,
+      "loss": 0.0311,
+      "step": 1410
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.003851505694910884,
+      "learning_rate": 0.0001395140664961637,
+      "loss": 0.0209,
+      "step": 1420
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 4.583674430847168,
+      "learning_rate": 0.00013908780903665815,
+      "loss": 0.0419,
+      "step": 1430
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.0060944948345422745,
+      "learning_rate": 0.00013866155157715262,
+      "loss": 0.0104,
+      "step": 1440
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.3952249586582184,
+      "learning_rate": 0.00013823529411764707,
+      "loss": 0.0081,
+      "step": 1450
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.054922040551900864,
+      "learning_rate": 0.00013780903665814152,
+      "loss": 0.004,
+      "step": 1460
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.5767747759819031,
+      "learning_rate": 0.00013738277919863597,
+      "loss": 0.0032,
+      "step": 1470
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.004362504463642836,
+      "learning_rate": 0.00013695652173913045,
+      "loss": 0.0503,
+      "step": 1480
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.007160673383623362,
+      "learning_rate": 0.0001365302642796249,
+      "loss": 0.0065,
+      "step": 1490
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.13416601717472076,
+      "learning_rate": 0.00013610400682011938,
+      "loss": 0.0007,
+      "step": 1500
+    },
+    {
+      "epoch": 1.28,
+      "eval_accuracy": 0.9796195652173914,
+      "eval_loss": 0.06163864955306053,
+      "eval_runtime": 35.8396,
+      "eval_samples_per_second": 20.536,
+      "eval_steps_per_second": 2.567,
+      "step": 1500
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 6.056341171264648,
+      "learning_rate": 0.00013567774936061383,
+      "loss": 0.0324,
+      "step": 1510
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.002421529032289982,
+      "learning_rate": 0.00013525149190110828,
+      "loss": 0.0089,
+      "step": 1520
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.023262590169906616,
+      "learning_rate": 0.00013482523444160273,
+      "loss": 0.0124,
+      "step": 1530
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.015889374539256096,
+      "learning_rate": 0.0001343989769820972,
+      "loss": 0.0055,
+      "step": 1540
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.7687301635742188,
+      "learning_rate": 0.00013397271952259165,
+      "loss": 0.0065,
+      "step": 1550
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.013485108502209187,
+      "learning_rate": 0.00013354646206308613,
+      "loss": 0.0204,
+      "step": 1560
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.0032094900961965322,
+      "learning_rate": 0.00013312020460358058,
+      "loss": 0.0568,
+      "step": 1570
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 3.2661609649658203,
+      "learning_rate": 0.00013269394714407503,
+      "loss": 0.0174,
+      "step": 1580
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.01120584737509489,
+      "learning_rate": 0.00013226768968456948,
+      "loss": 0.0243,
+      "step": 1590
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.002038970123976469,
+      "learning_rate": 0.00013184143222506395,
+      "loss": 0.0579,
+      "step": 1600
+    },
+    {
+      "epoch": 1.36,
+      "eval_accuracy": 0.9714673913043478,
+      "eval_loss": 0.1259184330701828,
+      "eval_runtime": 35.7203,
+      "eval_samples_per_second": 20.605,
+      "eval_steps_per_second": 2.576,
+      "step": 1600
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.5598017573356628,
+      "learning_rate": 0.0001314151747655584,
+      "loss": 0.0026,
+      "step": 1610
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.02395728975534439,
+      "learning_rate": 0.00013098891730605288,
+      "loss": 0.0031,
+      "step": 1620
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.12281103432178497,
+      "learning_rate": 0.00013056265984654733,
+      "loss": 0.0012,
+      "step": 1630
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.0021667671389877796,
+      "learning_rate": 0.00013013640238704178,
+      "loss": 0.0217,
+      "step": 1640
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.01249703485518694,
+      "learning_rate": 0.00012971014492753623,
+      "loss": 0.0148,
+      "step": 1650
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.001891128602437675,
+      "learning_rate": 0.0001292838874680307,
+      "loss": 0.0247,
+      "step": 1660
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.03076539747416973,
+      "learning_rate": 0.00012885763000852516,
+      "loss": 0.056,
+      "step": 1670
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.006363812834024429,
+      "learning_rate": 0.00012843137254901963,
+      "loss": 0.0583,
+      "step": 1680
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.024970410391688347,
+      "learning_rate": 0.00012800511508951408,
+      "loss": 0.0022,
+      "step": 1690
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.11185500770807266,
+      "learning_rate": 0.00012757885763000853,
+      "loss": 0.0009,
+      "step": 1700
+    },
+    {
+      "epoch": 1.45,
+      "eval_accuracy": 0.9755434782608695,
+      "eval_loss": 0.10284683108329773,
+      "eval_runtime": 35.3836,
+      "eval_samples_per_second": 20.801,
+      "eval_steps_per_second": 2.6,
+      "step": 1700
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.001679188571870327,
+      "learning_rate": 0.00012715260017050298,
+      "loss": 0.0055,
+      "step": 1710
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.006215417757630348,
+      "learning_rate": 0.00012672634271099746,
+      "loss": 0.0338,
+      "step": 1720
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 5.586231231689453,
+      "learning_rate": 0.0001263000852514919,
+      "loss": 0.0299,
+      "step": 1730
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.4323924779891968,
+      "learning_rate": 0.00012587382779198638,
+      "loss": 0.0814,
+      "step": 1740
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.01729062758386135,
+      "learning_rate": 0.00012544757033248083,
+      "loss": 0.0034,
+      "step": 1750
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.008044210262596607,
+      "learning_rate": 0.00012502131287297528,
+      "loss": 0.0172,
+      "step": 1760
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.2299610376358032,
+      "learning_rate": 0.00012459505541346973,
+      "loss": 0.0154,
+      "step": 1770
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.0019144455436617136,
+      "learning_rate": 0.0001241687979539642,
+      "loss": 0.0212,
+      "step": 1780
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 9.939857482910156,
+      "learning_rate": 0.00012374254049445866,
+      "loss": 0.2078,
+      "step": 1790
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 0.006784502416849136,
+      "learning_rate": 0.00012331628303495314,
+      "loss": 0.0295,
+      "step": 1800
+    },
+    {
+      "epoch": 1.53,
+      "eval_accuracy": 0.9823369565217391,
+      "eval_loss": 0.0636792704463005,
+      "eval_runtime": 35.9452,
+      "eval_samples_per_second": 20.476,
+      "eval_steps_per_second": 2.559,
+      "step": 1800
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.021205879747867584,
+      "learning_rate": 0.0001228900255754476,
+      "loss": 0.0022,
+      "step": 1810
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 0.004019047133624554,
+      "learning_rate": 0.00012246376811594204,
+      "loss": 0.0167,
+      "step": 1820
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.009964291006326675,
+      "learning_rate": 0.0001220375106564365,
+      "loss": 0.0501,
+      "step": 1830
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.008543290197849274,
+      "learning_rate": 0.00012161125319693096,
+      "loss": 0.0005,
+      "step": 1840
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.001487872563302517,
+      "learning_rate": 0.00012118499573742541,
+      "loss": 0.0033,
+      "step": 1850
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 0.005636855494230986,
+      "learning_rate": 0.00012075873827791986,
+      "loss": 0.0028,
+      "step": 1860
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 0.0020758220925927162,
+      "learning_rate": 0.00012033248081841433,
+      "loss": 0.0004,
+      "step": 1870
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.015278986655175686,
+      "learning_rate": 0.00011990622335890877,
+      "loss": 0.0013,
+      "step": 1880
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.003281915793195367,
+      "learning_rate": 0.00011947996589940325,
+      "loss": 0.0214,
+      "step": 1890
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.010990927927196026,
+      "learning_rate": 0.00011905370843989769,
+      "loss": 0.0484,
+      "step": 1900
+    },
+    {
+      "epoch": 1.62,
+      "eval_accuracy": 0.9782608695652174,
+      "eval_loss": 0.08926977962255478,
+      "eval_runtime": 35.2888,
+      "eval_samples_per_second": 20.856,
+      "eval_steps_per_second": 2.607,
+      "step": 1900
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.0031577609479427338,
+      "learning_rate": 0.00011862745098039216,
+      "loss": 0.0003,
+      "step": 1910
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 0.0015481531154364347,
+      "learning_rate": 0.00011820119352088661,
+      "loss": 0.0002,
+      "step": 1920
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.012322927825152874,
+      "learning_rate": 0.00011777493606138108,
+      "loss": 0.0016,
+      "step": 1930
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.027145786210894585,
+      "learning_rate": 0.00011734867860187553,
+      "loss": 0.0328,
+      "step": 1940
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 7.930657386779785,
+      "learning_rate": 0.00011692242114237,
+      "loss": 0.0697,
+      "step": 1950
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.024966897442936897,
+      "learning_rate": 0.00011649616368286444,
+      "loss": 0.0366,
+      "step": 1960
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.006519176997244358,
+      "learning_rate": 0.00011606990622335892,
+      "loss": 0.0346,
+      "step": 1970
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.0036061250139027834,
+      "learning_rate": 0.00011564364876385337,
+      "loss": 0.0003,
+      "step": 1980
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.32742467522621155,
+      "learning_rate": 0.00011521739130434783,
+      "loss": 0.0007,
+      "step": 1990
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 0.006401886697858572,
+      "learning_rate": 0.00011479113384484228,
+      "loss": 0.0371,
+      "step": 2000
+    },
+    {
+      "epoch": 1.71,
+      "eval_accuracy": 0.9836956521739131,
+      "eval_loss": 0.06366284191608429,
+      "eval_runtime": 35.2728,
+      "eval_samples_per_second": 20.866,
+      "eval_steps_per_second": 2.608,
+      "step": 2000
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 0.005329111125320196,
+      "learning_rate": 0.00011436487638533676,
+      "loss": 0.0023,
+      "step": 2010
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.15500983595848083,
+      "learning_rate": 0.00011393861892583119,
+      "loss": 0.021,
+      "step": 2020
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 0.04823674261569977,
+      "learning_rate": 0.00011351236146632567,
+      "loss": 0.0012,
+      "step": 2030
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.0023863280657678843,
+      "learning_rate": 0.00011308610400682012,
+      "loss": 0.0135,
+      "step": 2040
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.008456870913505554,
+      "learning_rate": 0.00011265984654731458,
+      "loss": 0.0061,
+      "step": 2050
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.001964428462088108,
+      "learning_rate": 0.00011223358908780903,
+      "loss": 0.0121,
+      "step": 2060
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.002651037648320198,
+      "learning_rate": 0.00011180733162830351,
+      "loss": 0.0441,
+      "step": 2070
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.0449827909469604,
+      "learning_rate": 0.00011138107416879794,
+      "loss": 0.0035,
+      "step": 2080
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.005107446573674679,
+      "learning_rate": 0.00011095481670929242,
+      "loss": 0.0465,
+      "step": 2090
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.879646897315979,
+      "learning_rate": 0.00011052855924978687,
+      "loss": 0.0359,
+      "step": 2100
+    },
+    {
+      "epoch": 1.79,
+      "eval_accuracy": 0.9877717391304348,
+      "eval_loss": 0.038947634398937225,
+      "eval_runtime": 35.3019,
+      "eval_samples_per_second": 20.849,
+      "eval_steps_per_second": 2.606,
+      "step": 2100
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.03196950629353523,
+      "learning_rate": 0.00011010230179028133,
+      "loss": 0.0575,
+      "step": 2110
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.018430249765515327,
+      "learning_rate": 0.00010967604433077578,
+      "loss": 0.0004,
+      "step": 2120
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 0.0045145428739488125,
+      "learning_rate": 0.00010924978687127026,
+      "loss": 0.0003,
+      "step": 2130
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 0.019347038120031357,
+      "learning_rate": 0.0001088235294117647,
+      "loss": 0.0049,
+      "step": 2140
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.0023944175336509943,
+      "learning_rate": 0.00010839727195225917,
+      "loss": 0.0281,
+      "step": 2150
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.018436182290315628,
+      "learning_rate": 0.00010797101449275362,
+      "loss": 0.0007,
+      "step": 2160
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 0.02079106867313385,
+      "learning_rate": 0.00010754475703324809,
+      "loss": 0.0781,
+      "step": 2170
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 0.0022807365749031305,
+      "learning_rate": 0.00010711849957374254,
+      "loss": 0.0144,
+      "step": 2180
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.0020325591322034597,
+      "learning_rate": 0.00010669224211423701,
+      "loss": 0.0022,
+      "step": 2190
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.0036507395561784506,
+      "learning_rate": 0.00010626598465473145,
+      "loss": 0.0006,
+      "step": 2200
+    },
+    {
+      "epoch": 1.88,
+      "eval_accuracy": 0.9823369565217391,
+      "eval_loss": 0.07500635832548141,
+      "eval_runtime": 35.3987,
+      "eval_samples_per_second": 20.792,
+      "eval_steps_per_second": 2.599,
+      "step": 2200
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.0012559212045744061,
+      "learning_rate": 0.00010583972719522593,
+      "loss": 0.0005,
+      "step": 2210
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 0.0018393185455352068,
+      "learning_rate": 0.00010541346973572037,
+      "loss": 0.0077,
+      "step": 2220
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.006080774124711752,
+      "learning_rate": 0.00010498721227621484,
+      "loss": 0.0003,
+      "step": 2230
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.004452521912753582,
+      "learning_rate": 0.00010456095481670929,
+      "loss": 0.006,
+      "step": 2240
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.008074764162302017,
+      "learning_rate": 0.00010413469735720376,
+      "loss": 0.0011,
+      "step": 2250
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.005144124384969473,
+      "learning_rate": 0.0001037084398976982,
+      "loss": 0.0012,
+      "step": 2260
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.0021739054936915636,
+      "learning_rate": 0.00010328218243819268,
+      "loss": 0.016,
+      "step": 2270
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.005759551655501127,
+      "learning_rate": 0.00010285592497868713,
+      "loss": 0.0031,
+      "step": 2280
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.0022709094919264317,
+      "learning_rate": 0.00010242966751918159,
+      "loss": 0.0469,
+      "step": 2290
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.0021238611079752445,
+      "learning_rate": 0.00010200341005967604,
+      "loss": 0.0189,
+      "step": 2300
+    },
+    {
+      "epoch": 1.96,
+      "eval_accuracy": 0.9850543478260869,
+      "eval_loss": 0.04511374980211258,
+      "eval_runtime": 35.4524,
+      "eval_samples_per_second": 20.76,
+      "eval_steps_per_second": 2.595,
+      "step": 2300
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.035481926053762436,
+      "learning_rate": 0.00010157715260017052,
+      "loss": 0.0028,
+      "step": 2310
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.0017244711052626371,
+      "learning_rate": 0.00010115089514066495,
+      "loss": 0.0003,
+      "step": 2320
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.005300660151988268,
+      "learning_rate": 0.00010072463768115943,
+      "loss": 0.0011,
+      "step": 2330
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.07476931810379028,
+      "learning_rate": 0.00010029838022165388,
+      "loss": 0.0141,
+      "step": 2340
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.0013850871473550797,
+      "learning_rate": 9.987212276214834e-05,
+      "loss": 0.0316,
+      "step": 2350
+    },
+    {
+      "epoch": 2.01,
+      "grad_norm": 0.01931070163846016,
+      "learning_rate": 9.94458653026428e-05,
+      "loss": 0.0434,
+      "step": 2360
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 0.1971992552280426,
+      "learning_rate": 9.901960784313727e-05,
+      "loss": 0.0005,
+      "step": 2370
+    },
+    {
+      "epoch": 2.03,
+      "grad_norm": 0.2410838007926941,
+      "learning_rate": 9.859335038363172e-05,
+      "loss": 0.0511,
+      "step": 2380
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 0.003502428298816085,
+      "learning_rate": 9.816709292412618e-05,
+      "loss": 0.0162,
+      "step": 2390
+    },
+    {
+      "epoch": 2.05,
+      "grad_norm": 0.0009001785656437278,
+      "learning_rate": 9.774083546462064e-05,
+      "loss": 0.0442,
+      "step": 2400
+    },
+    {
+      "epoch": 2.05,
+      "eval_accuracy": 0.9796195652173914,
+      "eval_loss": 0.07724236696958542,
+      "eval_runtime": 35.5168,
+      "eval_samples_per_second": 20.723,
+      "eval_steps_per_second": 2.59,
+      "step": 2400
+    },
+    {
+      "epoch": 2.05,
+      "grad_norm": 0.0027264945674687624,
+      "learning_rate": 9.73145780051151e-05,
+      "loss": 0.0453,
+      "step": 2410
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 0.005730016622692347,
+      "learning_rate": 9.688832054560956e-05,
+      "loss": 0.0038,
+      "step": 2420
+    },
+    {
+      "epoch": 2.07,
+      "grad_norm": 0.0037351311184465885,
+      "learning_rate": 9.646206308610402e-05,
+      "loss": 0.0209,
+      "step": 2430
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.0017955221701413393,
+      "learning_rate": 9.603580562659847e-05,
+      "loss": 0.0569,
+      "step": 2440
+    },
+    {
+      "epoch": 2.09,
+      "grad_norm": 0.12098924070596695,
+      "learning_rate": 9.560954816709293e-05,
+      "loss": 0.0007,
+      "step": 2450
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 0.0016344174509868026,
+      "learning_rate": 9.51832907075874e-05,
+      "loss": 0.0004,
+      "step": 2460
+    },
+    {
+      "epoch": 2.11,
+      "grad_norm": 0.0016071271384134889,
+      "learning_rate": 9.475703324808185e-05,
+      "loss": 0.0069,
+      "step": 2470
+    },
+    {
+      "epoch": 2.11,
+      "grad_norm": 0.012789487838745117,
+      "learning_rate": 9.433077578857631e-05,
+      "loss": 0.0003,
+      "step": 2480
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 0.002755184657871723,
+      "learning_rate": 9.390451832907077e-05,
+      "loss": 0.0027,
+      "step": 2490
+    },
+    {
+      "epoch": 2.13,
+      "grad_norm": 0.001110222190618515,
+      "learning_rate": 9.347826086956522e-05,
+      "loss": 0.0006,
+      "step": 2500
+    },
+    {
+      "epoch": 2.13,
+      "eval_accuracy": 0.9619565217391305,
+      "eval_loss": 0.19875623285770416,
+      "eval_runtime": 35.2652,
+      "eval_samples_per_second": 20.87,
+      "eval_steps_per_second": 2.609,
+      "step": 2500
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 0.0037218218203634024,
+      "learning_rate": 9.305200341005969e-05,
+      "loss": 0.023,
+      "step": 2510
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 0.0025667762383818626,
+      "learning_rate": 9.262574595055415e-05,
+      "loss": 0.0159,
+      "step": 2520
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 8.974613189697266,
+      "learning_rate": 9.21994884910486e-05,
+      "loss": 0.066,
+      "step": 2530
+    },
+    {
+      "epoch": 2.17,
+      "grad_norm": 0.004758073017001152,
+      "learning_rate": 9.177323103154306e-05,
+      "loss": 0.0002,
+      "step": 2540
+    },
+    {
+      "epoch": 2.17,
+      "grad_norm": 0.002444899408146739,
+      "learning_rate": 9.134697357203753e-05,
+      "loss": 0.0266,
+      "step": 2550
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 0.0825042575597763,
+      "learning_rate": 9.092071611253197e-05,
+      "loss": 0.0008,
+      "step": 2560
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 0.008311850018799305,
+      "learning_rate": 9.049445865302644e-05,
+      "loss": 0.0034,
+      "step": 2570
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.0031178677454590797,
+      "learning_rate": 9.00682011935209e-05,
+      "loss": 0.0002,
+      "step": 2580
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 0.013823499903082848,
+      "learning_rate": 8.964194373401535e-05,
+      "loss": 0.0034,
+      "step": 2590
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 0.005909046158194542,
+      "learning_rate": 8.921568627450981e-05,
+      "loss": 0.006,
+      "step": 2600
+    },
+    {
+      "epoch": 2.22,
+      "eval_accuracy": 0.9864130434782609,
+      "eval_loss": 0.06588361412286758,
+      "eval_runtime": 35.1548,
+      "eval_samples_per_second": 20.936,
+      "eval_steps_per_second": 2.617,
+      "step": 2600
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 0.1148420125246048,
+      "learning_rate": 8.878942881500428e-05,
+      "loss": 0.0033,
+      "step": 2610
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 0.1661941260099411,
+      "learning_rate": 8.836317135549873e-05,
+      "loss": 0.0004,
+      "step": 2620
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.4073266386985779,
+      "learning_rate": 8.793691389599319e-05,
+      "loss": 0.0012,
+      "step": 2630
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 0.0015085344202816486,
+      "learning_rate": 8.751065643648765e-05,
+      "loss": 0.0072,
+      "step": 2640
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 0.0480346716940403,
+      "learning_rate": 8.70843989769821e-05,
+      "loss": 0.0327,
+      "step": 2650
+    },
+    {
+      "epoch": 2.27,
+      "grad_norm": 0.0015394511865451932,
+      "learning_rate": 8.665814151747657e-05,
+      "loss": 0.0049,
+      "step": 2660
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 0.0018810660112649202,
+      "learning_rate": 8.623188405797103e-05,
+      "loss": 0.0216,
+      "step": 2670
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 0.003445707494392991,
+      "learning_rate": 8.580562659846548e-05,
+      "loss": 0.0002,
+      "step": 2680
+    },
+    {
+      "epoch": 2.29,
+      "grad_norm": 0.0024161450564861298,
+      "learning_rate": 8.537936913895993e-05,
+      "loss": 0.0003,
+      "step": 2690
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 0.001016319845803082,
+      "learning_rate": 8.495311167945439e-05,
+      "loss": 0.0093,
+      "step": 2700
+    },
+    {
+      "epoch": 2.3,
+      "eval_accuracy": 0.9809782608695652,
+      "eval_loss": 0.07539312541484833,
+      "eval_runtime": 35.2074,
+      "eval_samples_per_second": 20.905,
+      "eval_steps_per_second": 2.613,
+      "step": 2700
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 0.0037928090896457434,
+      "learning_rate": 8.452685421994884e-05,
+      "loss": 0.0077,
+      "step": 2710
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.00291675073094666,
+      "learning_rate": 8.41005967604433e-05,
+      "loss": 0.0006,
+      "step": 2720
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 0.0031472251284867525,
+      "learning_rate": 8.367433930093777e-05,
+      "loss": 0.0058,
+      "step": 2730
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 0.0069138044491410255,
+      "learning_rate": 8.324808184143222e-05,
+      "loss": 0.0029,
+      "step": 2740
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 0.0014087699819356203,
+      "learning_rate": 8.282182438192668e-05,
+      "loss": 0.0001,
+      "step": 2750
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 0.012645886279642582,
+      "learning_rate": 8.239556692242114e-05,
+      "loss": 0.0001,
+      "step": 2760
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 0.6107314825057983,
+      "learning_rate": 8.19693094629156e-05,
+      "loss": 0.0083,
+      "step": 2770
+    },
+    {
+      "epoch": 2.37,
+      "grad_norm": 0.0020549651235342026,
+      "learning_rate": 8.154305200341006e-05,
+      "loss": 0.0002,
+      "step": 2780
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 0.004861908033490181,
+      "learning_rate": 8.111679454390452e-05,
+      "loss": 0.0005,
+      "step": 2790
+    },
+    {
+      "epoch": 2.39,
+      "grad_norm": 0.0013458251487463713,
+      "learning_rate": 8.069053708439897e-05,
+      "loss": 0.0008,
+      "step": 2800
+    },
+    {
+      "epoch": 2.39,
+      "eval_accuracy": 0.9782608695652174,
+      "eval_loss": 0.07996727526187897,
+      "eval_runtime": 35.2832,
+      "eval_samples_per_second": 20.86,
+      "eval_steps_per_second": 2.607,
+      "step": 2800
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.29993894696235657,
+      "learning_rate": 8.026427962489343e-05,
+      "loss": 0.0022,
+      "step": 2810
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.0008898138767108321,
+      "learning_rate": 7.98380221653879e-05,
+      "loss": 0.0007,
+      "step": 2820
+    },
+    {
+      "epoch": 2.41,
+      "grad_norm": 0.006979215890169144,
+      "learning_rate": 7.941176470588235e-05,
+      "loss": 0.0281,
+      "step": 2830
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 0.002796901622787118,
+      "learning_rate": 7.898550724637681e-05,
+      "loss": 0.001,
+      "step": 2840
+    },
+    {
+      "epoch": 2.43,
+      "grad_norm": 0.003984102047979832,
+      "learning_rate": 7.855924978687127e-05,
+      "loss": 0.0007,
+      "step": 2850
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 0.003083733143284917,
+      "learning_rate": 7.813299232736572e-05,
+      "loss": 0.0671,
+      "step": 2860
+    },
+    {
+      "epoch": 2.45,
+      "grad_norm": 0.012969084084033966,
+      "learning_rate": 7.770673486786019e-05,
+      "loss": 0.0006,
+      "step": 2870
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 0.006222143769264221,
+      "learning_rate": 7.728047740835465e-05,
+      "loss": 0.0006,
+      "step": 2880
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 0.009051816537976265,
+      "learning_rate": 7.68542199488491e-05,
+      "loss": 0.0002,
+      "step": 2890
+    },
+    {
+      "epoch": 2.47,
+      "grad_norm": 0.004602865315973759,
+      "learning_rate": 7.642796248934356e-05,
+      "loss": 0.0003,
+      "step": 2900
+    },
+    {
+      "epoch": 2.47,
+      "eval_accuracy": 0.9864130434782609,
+      "eval_loss": 0.061688102781772614,
+      "eval_runtime": 35.7898,
+      "eval_samples_per_second": 20.565,
+      "eval_steps_per_second": 2.571,
+      "step": 2900
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 0.0007365304627455771,
+      "learning_rate": 7.600170502983802e-05,
+      "loss": 0.0002,
+      "step": 2910
+    },
+    {
+      "epoch": 2.49,
+      "grad_norm": 0.0011029945453628898,
+      "learning_rate": 7.557544757033247e-05,
+      "loss": 0.0001,
+      "step": 2920
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.001565741142258048,
+      "learning_rate": 7.514919011082694e-05,
+      "loss": 0.0002,
+      "step": 2930
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 0.0006651618168689311,
+      "learning_rate": 7.47229326513214e-05,
+      "loss": 0.0001,
+      "step": 2940
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 0.000838673091493547,
+      "learning_rate": 7.429667519181585e-05,
+      "loss": 0.0002,
+      "step": 2950
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 0.000473020103527233,
+      "learning_rate": 7.387041773231031e-05,
+      "loss": 0.0001,
+      "step": 2960
+    },
+    {
+      "epoch": 2.53,
+      "grad_norm": 0.004068770445883274,
+      "learning_rate": 7.344416027280478e-05,
+      "loss": 0.0032,
+      "step": 2970
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 0.0008245965582318604,
+      "learning_rate": 7.301790281329923e-05,
+      "loss": 0.0342,
+      "step": 2980
+    },
+    {
+      "epoch": 2.55,
+      "grad_norm": 0.004487219266593456,
+      "learning_rate": 7.259164535379369e-05,
+      "loss": 0.0001,
+      "step": 2990
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.002347569912672043,
+      "learning_rate": 7.216538789428815e-05,
+      "loss": 0.0094,
+      "step": 3000
+    },
+    {
+      "epoch": 2.56,
+      "eval_accuracy": 0.9836956521739131,
+      "eval_loss": 0.07361293584108353,
+      "eval_runtime": 36.0399,
+      "eval_samples_per_second": 20.422,
+      "eval_steps_per_second": 2.553,
+      "step": 3000
+    },
+    {
+      "epoch": 2.57,
+      "grad_norm": 0.06282170116901398,
+      "learning_rate": 7.17391304347826e-05,
+      "loss": 0.0002,
+      "step": 3010
+    },
+    {
+      "epoch": 2.57,
+      "grad_norm": 0.0016825624043121934,
+      "learning_rate": 7.131287297527707e-05,
+      "loss": 0.0012,
+      "step": 3020
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 0.0015001163119450212,
+      "learning_rate": 7.088661551577153e-05,
+      "loss": 0.0001,
+      "step": 3030
+    },
+    {
+      "epoch": 2.59,
+      "grad_norm": 0.0014401893131434917,
+      "learning_rate": 7.046035805626598e-05,
+      "loss": 0.0003,
+      "step": 3040
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.0009955812711268663,
+      "learning_rate": 7.003410059676044e-05,
+      "loss": 0.0002,
+      "step": 3050
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 0.02410055138170719,
+      "learning_rate": 6.96078431372549e-05,
+      "loss": 0.0008,
+      "step": 3060
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 0.0009745031129568815,
+      "learning_rate": 6.918158567774935e-05,
+      "loss": 0.0001,
+      "step": 3070
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 0.002661016071215272,
+      "learning_rate": 6.875532821824382e-05,
+      "loss": 0.0003,
+      "step": 3080
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 0.003044945653527975,
+      "learning_rate": 6.832907075873828e-05,
+      "loss": 0.0692,
+      "step": 3090
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.002494238084182143,
+      "learning_rate": 6.790281329923273e-05,
+      "loss": 0.0001,
+      "step": 3100
+    },
+    {
+      "epoch": 2.64,
+      "eval_accuracy": 0.9823369565217391,
+      "eval_loss": 0.053765375167131424,
+      "eval_runtime": 35.5909,
+      "eval_samples_per_second": 20.679,
+      "eval_steps_per_second": 2.585,
+      "step": 3100
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 0.0018610358238220215,
+      "learning_rate": 6.74765558397272e-05,
+      "loss": 0.0004,
+      "step": 3110
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 0.0060824137181043625,
+      "learning_rate": 6.705029838022166e-05,
+      "loss": 0.0001,
+      "step": 3120
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 0.0031146130058914423,
+      "learning_rate": 6.66240409207161e-05,
+      "loss": 0.0001,
+      "step": 3130
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 0.0006994738942012191,
+      "learning_rate": 6.619778346121057e-05,
+      "loss": 0.0029,
+      "step": 3140
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 0.0014639056753367186,
+      "learning_rate": 6.577152600170503e-05,
+      "loss": 0.0001,
+      "step": 3150
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 0.012882738374173641,
+      "learning_rate": 6.534526854219948e-05,
+      "loss": 0.0001,
+      "step": 3160
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.0028791027143597603,
+      "learning_rate": 6.491901108269395e-05,
+      "loss": 0.0001,
+      "step": 3170
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 0.000892713142093271,
+      "learning_rate": 6.449275362318841e-05,
+      "loss": 0.0297,
+      "step": 3180
+    },
+    {
+      "epoch": 2.72,
+      "grad_norm": 0.001191814080812037,
+      "learning_rate": 6.406649616368286e-05,
+      "loss": 0.0002,
+      "step": 3190
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 0.0021157979499548674,
+      "learning_rate": 6.364023870417732e-05,
+      "loss": 0.001,
+      "step": 3200
+    },
+    {
+      "epoch": 2.73,
+      "eval_accuracy": 0.9877717391304348,
+      "eval_loss": 0.06063992902636528,
+      "eval_runtime": 35.6699,
+      "eval_samples_per_second": 20.634,
+      "eval_steps_per_second": 2.579,
+      "step": 3200
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 0.0007219274411909282,
+      "learning_rate": 6.321398124467179e-05,
+      "loss": 0.0008,
+      "step": 3210
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 0.0005430682795122266,
+      "learning_rate": 6.278772378516623e-05,
+      "loss": 0.0001,
+      "step": 3220
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 0.0037848646752536297,
+      "learning_rate": 6.23614663256607e-05,
+      "loss": 0.0002,
+      "step": 3230
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.0025371257215738297,
+      "learning_rate": 6.193520886615516e-05,
+      "loss": 0.0383,
+      "step": 3240
+    },
+    {
+      "epoch": 2.77,
+      "grad_norm": 0.0009226408437825739,
+      "learning_rate": 6.150895140664961e-05,
+      "loss": 0.0037,
+      "step": 3250
+    },
+    {
+      "epoch": 2.78,
+      "grad_norm": 0.00079467793693766,
+      "learning_rate": 6.108269394714407e-05,
+      "loss": 0.0001,
+      "step": 3260
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 0.007261179853230715,
+      "learning_rate": 6.065643648763854e-05,
+      "loss": 0.0047,
+      "step": 3270
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.0018952098907902837,
+      "learning_rate": 6.0230179028132994e-05,
+      "loss": 0.0002,
+      "step": 3280
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.0014908647863194346,
+      "learning_rate": 5.980392156862745e-05,
+      "loss": 0.0002,
+      "step": 3290
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 0.0011345170205458999,
+      "learning_rate": 5.9377664109121913e-05,
+      "loss": 0.0001,
+      "step": 3300
+    },
+    {
+      "epoch": 2.81,
+      "eval_accuracy": 0.9864130434782609,
+      "eval_loss": 0.043328747153282166,
+      "eval_runtime": 35.0865,
+      "eval_samples_per_second": 20.977,
+      "eval_steps_per_second": 2.622,
+      "step": 3300
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 0.0026633902452886105,
+      "learning_rate": 5.895140664961637e-05,
+      "loss": 0.0001,
+      "step": 3310
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 0.0006624997477047145,
+      "learning_rate": 5.8525149190110826e-05,
+      "loss": 0.0003,
+      "step": 3320
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 0.0008892026962712407,
+      "learning_rate": 5.809889173060529e-05,
+      "loss": 0.0001,
+      "step": 3330
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 0.00067310401936993,
+      "learning_rate": 5.7672634271099746e-05,
+      "loss": 0.0001,
+      "step": 3340
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 0.0005306313978508115,
+      "learning_rate": 5.72463768115942e-05,
+      "loss": 0.0001,
+      "step": 3350
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 0.004409481305629015,
+      "learning_rate": 5.6820119352088666e-05,
+      "loss": 0.0003,
+      "step": 3360
+    },
+    {
+      "epoch": 2.87,
+      "grad_norm": 0.005924859084188938,
+      "learning_rate": 5.639386189258312e-05,
+      "loss": 0.0007,
+      "step": 3370
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 0.00349413906224072,
+      "learning_rate": 5.596760443307758e-05,
+      "loss": 0.0003,
+      "step": 3380
+    },
+    {
+      "epoch": 2.89,
+      "grad_norm": 0.008677713572978973,
+      "learning_rate": 5.554134697357204e-05,
+      "loss": 0.0001,
+      "step": 3390
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 0.0008297800086438656,
+      "learning_rate": 5.51150895140665e-05,
+      "loss": 0.0001,
+      "step": 3400
+    },
+    {
+      "epoch": 2.9,
+      "eval_accuracy": 0.9823369565217391,
+      "eval_loss": 0.0582539401948452,
+      "eval_runtime": 35.7734,
+      "eval_samples_per_second": 20.574,
+      "eval_steps_per_second": 2.572,
+      "step": 3400
+    },
+    {
+      "epoch": 2.91,
+      "grad_norm": 0.0011900962563231587,
+      "learning_rate": 5.4688832054560955e-05,
+      "loss": 0.0001,
+      "step": 3410
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 0.0012697293423116207,
+      "learning_rate": 5.426257459505542e-05,
+      "loss": 0.0001,
+      "step": 3420
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 0.0008308067917823792,
+      "learning_rate": 5.3836317135549874e-05,
+      "loss": 0.0004,
+      "step": 3430
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 0.00043416779953986406,
+      "learning_rate": 5.341005967604433e-05,
+      "loss": 0.0001,
+      "step": 3440
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 0.0007626230362802744,
+      "learning_rate": 5.2983802216538794e-05,
+      "loss": 0.0002,
+      "step": 3450
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 0.00042898516403511167,
+      "learning_rate": 5.255754475703325e-05,
+      "loss": 0.0002,
+      "step": 3460
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.006079140119254589,
+      "learning_rate": 5.213128729752771e-05,
+      "loss": 0.0001,
+      "step": 3470
+    },
+    {
+      "epoch": 2.97,
+      "grad_norm": 0.0007620278629474342,
+      "learning_rate": 5.170502983802217e-05,
+      "loss": 0.0001,
+      "step": 3480
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 0.007883735932409763,
+      "learning_rate": 5.1278772378516626e-05,
+      "loss": 0.0001,
+      "step": 3490
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 0.002050349023193121,
+      "learning_rate": 5.085251491901108e-05,
+      "loss": 0.0001,
+      "step": 3500
+    },
+    {
+      "epoch": 2.98,
+      "eval_accuracy": 0.9904891304347826,
+      "eval_loss": 0.03878939896821976,
+      "eval_runtime": 35.9666,
+      "eval_samples_per_second": 20.463,
+      "eval_steps_per_second": 2.558,
+      "step": 3500
+    },
+    {
+      "epoch": 2.99,
+      "grad_norm": 0.006061529275029898,
+      "learning_rate": 5.0426257459505546e-05,
+      "loss": 0.0001,
+      "step": 3510
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.0008097901591099799,
+      "learning_rate": 5e-05,
+      "loss": 0.0001,
+      "step": 3520
+    },
+    {
+      "epoch": 3.01,
+      "grad_norm": 0.0005056671216152608,
+      "learning_rate": 4.957374254049446e-05,
+      "loss": 0.0001,
+      "step": 3530
+    },
+    {
+      "epoch": 3.02,
+      "grad_norm": 0.006060776766389608,
+      "learning_rate": 4.914748508098892e-05,
+      "loss": 0.0001,
+      "step": 3540
+    },
+    {
+      "epoch": 3.03,
+      "grad_norm": 0.0008045104332268238,
+      "learning_rate": 4.872122762148338e-05,
+      "loss": 0.0001,
+      "step": 3550
+    },
+    {
+      "epoch": 3.03,
+      "grad_norm": 0.0011429821606725454,
+      "learning_rate": 4.8294970161977835e-05,
+      "loss": 0.0001,
+      "step": 3560
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 0.0007523238891735673,
+      "learning_rate": 4.78687127024723e-05,
+      "loss": 0.0729,
+      "step": 3570
+    },
+    {
+      "epoch": 3.05,
+      "grad_norm": 0.0028867837972939014,
+      "learning_rate": 4.7442455242966755e-05,
+      "loss": 0.0154,
+      "step": 3580
+    },
+    {
+      "epoch": 3.06,
+      "grad_norm": 0.0009987674420699477,
+      "learning_rate": 4.701619778346121e-05,
+      "loss": 0.0001,
+      "step": 3590
+    },
+    {
+      "epoch": 3.07,
+      "grad_norm": 0.0007740564760752022,
+      "learning_rate": 4.6589940323955674e-05,
+      "loss": 0.0001,
+      "step": 3600
+    },
+    {
+      "epoch": 3.07,
+      "eval_accuracy": 0.9891304347826086,
+      "eval_loss": 0.04076423496007919,
+      "eval_runtime": 35.421,
+      "eval_samples_per_second": 20.779,
+      "eval_steps_per_second": 2.597,
+      "step": 3600
+    },
+    {
+      "epoch": 3.08,
+      "grad_norm": 0.0016550541622564197,
+      "learning_rate": 4.616368286445013e-05,
+      "loss": 0.0001,
+      "step": 3610
+    },
+    {
+      "epoch": 3.09,
+      "grad_norm": 0.0007590887253172696,
+      "learning_rate": 4.573742540494459e-05,
+      "loss": 0.0001,
+      "step": 3620
+    },
+    {
+      "epoch": 3.09,
+      "grad_norm": 0.00034116365713998675,
+      "learning_rate": 4.531116794543905e-05,
+      "loss": 0.0001,
+      "step": 3630
+    },
+    {
+      "epoch": 3.1,
+      "grad_norm": 0.0012234909227117896,
+      "learning_rate": 4.488491048593351e-05,
+      "loss": 0.0001,
+      "step": 3640
+    },
+    {
+      "epoch": 3.11,
+      "grad_norm": 0.00045663033961318433,
+      "learning_rate": 4.445865302642796e-05,
+      "loss": 0.0001,
+      "step": 3650
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 0.0010367368813604116,
+      "learning_rate": 4.4032395566922426e-05,
+      "loss": 0.0001,
+      "step": 3660
+    },
+    {
+      "epoch": 3.13,
+      "grad_norm": 0.0013374440604820848,
+      "learning_rate": 4.360613810741688e-05,
+      "loss": 0.0001,
+      "step": 3670
+    },
+    {
+      "epoch": 3.14,
+      "grad_norm": 0.0006071662064641714,
+      "learning_rate": 4.317988064791134e-05,
+      "loss": 0.0001,
+      "step": 3680
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 0.000419705145759508,
+      "learning_rate": 4.27536231884058e-05,
+      "loss": 0.0001,
+      "step": 3690
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 0.000592475407756865,
+      "learning_rate": 4.232736572890026e-05,
+      "loss": 0.0001,
+      "step": 3700
+    },
+    {
+      "epoch": 3.15,
+      "eval_accuracy": 0.9891304347826086,
+      "eval_loss": 0.03751834109425545,
+      "eval_runtime": 35.5175,
+      "eval_samples_per_second": 20.722,
+      "eval_steps_per_second": 2.59,
+      "step": 3700
+    },
+    {
+      "epoch": 3.16,
+      "grad_norm": 0.00161035917699337,
+      "learning_rate": 4.1901108269394715e-05,
+      "loss": 0.0001,
+      "step": 3710
+    },
+    {
+      "epoch": 3.17,
+      "grad_norm": 0.0007734562386758626,
+      "learning_rate": 4.147485080988918e-05,
+      "loss": 0.0001,
+      "step": 3720
+    },
+    {
+      "epoch": 3.18,
+      "grad_norm": 0.00071456388104707,
+      "learning_rate": 4.1048593350383635e-05,
+      "loss": 0.0001,
+      "step": 3730
+    },
+    {
+      "epoch": 3.19,
+      "grad_norm": 0.004727986175566912,
+      "learning_rate": 4.062233589087809e-05,
+      "loss": 0.0001,
+      "step": 3740
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 0.0005515944212675095,
+      "learning_rate": 4.0196078431372555e-05,
+      "loss": 0.0001,
+      "step": 3750
+    },
+    {
+      "epoch": 3.21,
+      "grad_norm": 0.0006984106148593128,
+      "learning_rate": 3.976982097186701e-05,
+      "loss": 0.0003,
+      "step": 3760
+    },
+    {
+      "epoch": 3.21,
+      "grad_norm": 0.0034689533058553934,
+      "learning_rate": 3.934356351236147e-05,
+      "loss": 0.0001,
+      "step": 3770
+    },
+    {
+      "epoch": 3.22,
+      "grad_norm": 0.00037349184276536107,
+      "learning_rate": 3.891730605285593e-05,
+      "loss": 0.0001,
+      "step": 3780
+    },
+    {
+      "epoch": 3.23,
+      "grad_norm": 0.00039833749178797007,
+      "learning_rate": 3.849104859335039e-05,
+      "loss": 0.0001,
+      "step": 3790
+    },
+    {
+      "epoch": 3.24,
+      "grad_norm": 0.0005233868723735213,
+      "learning_rate": 3.8064791133844843e-05,
+      "loss": 0.0001,
+      "step": 3800
+    },
+    {
+      "epoch": 3.24,
+      "eval_accuracy": 0.9877717391304348,
+      "eval_loss": 0.03672608733177185,
+      "eval_runtime": 35.3409,
+      "eval_samples_per_second": 20.826,
+      "eval_steps_per_second": 2.603,
+      "step": 3800
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 0.00233672559261322,
+      "learning_rate": 3.763853367433931e-05,
+      "loss": 0.0001,
+      "step": 3810
+    },
+    {
+      "epoch": 3.26,
+      "grad_norm": 0.0006057602004148066,
+      "learning_rate": 3.721227621483376e-05,
+      "loss": 0.0001,
+      "step": 3820
+    },
+    {
+      "epoch": 3.27,
+      "grad_norm": 0.0012565916404128075,
+      "learning_rate": 3.678601875532822e-05,
+      "loss": 0.0001,
+      "step": 3830
+    },
+    {
+      "epoch": 3.27,
+      "grad_norm": 0.0007620604010298848,
+      "learning_rate": 3.635976129582268e-05,
+      "loss": 0.0001,
+      "step": 3840
+    },
+    {
+      "epoch": 3.28,
+      "grad_norm": 0.001355510437861085,
+      "learning_rate": 3.593350383631714e-05,
+      "loss": 0.0001,
+      "step": 3850
+    },
+    {
+      "epoch": 3.29,
+      "grad_norm": 0.0006817120010964572,
+      "learning_rate": 3.5507246376811596e-05,
+      "loss": 0.0003,
+      "step": 3860
+    },
+    {
+      "epoch": 3.3,
+      "grad_norm": 0.0036707494873553514,
+      "learning_rate": 3.508098891730606e-05,
+      "loss": 0.0001,
+      "step": 3870
+    },
+    {
+      "epoch": 3.31,
+      "grad_norm": 0.0004486854304559529,
+      "learning_rate": 3.4654731457800515e-05,
+      "loss": 0.0001,
+      "step": 3880
+    },
+    {
+      "epoch": 3.32,
+      "grad_norm": 0.0047525763511657715,
+      "learning_rate": 3.422847399829497e-05,
+      "loss": 0.0001,
+      "step": 3890
+    },
+    {
+      "epoch": 3.32,
+      "grad_norm": 0.00044202787103131413,
+      "learning_rate": 3.3802216538789435e-05,
+      "loss": 0.0001,
+      "step": 3900
+    },
+    {
+      "epoch": 3.32,
+      "eval_accuracy": 0.9877717391304348,
+      "eval_loss": 0.03553692623972893,
+      "eval_runtime": 35.3339,
+      "eval_samples_per_second": 20.83,
+      "eval_steps_per_second": 2.604,
+      "step": 3900
+    },
+    {
+      "epoch": 3.33,
+      "grad_norm": 0.0005708124954253435,
+      "learning_rate": 3.337595907928389e-05,
+      "loss": 0.0001,
+      "step": 3910
+    },
+    {
+      "epoch": 3.34,
+      "grad_norm": 0.0008867123397067189,
+      "learning_rate": 3.294970161977835e-05,
+      "loss": 0.0009,
+      "step": 3920
+    },
+    {
+      "epoch": 3.35,
+      "grad_norm": 0.0006551162805408239,
+      "learning_rate": 3.252344416027281e-05,
+      "loss": 0.0001,
+      "step": 3930
+    },
+    {
+      "epoch": 3.36,
+      "grad_norm": 0.0011884266277775168,
+      "learning_rate": 3.209718670076726e-05,
+      "loss": 0.0622,
+      "step": 3940
+    },
+    {
+      "epoch": 3.37,
+      "grad_norm": 0.0007145259296521544,
+      "learning_rate": 3.1670929241261724e-05,
+      "loss": 0.0002,
+      "step": 3950
+    },
+    {
+      "epoch": 3.38,
+      "grad_norm": 0.0006628567352890968,
+      "learning_rate": 3.124467178175618e-05,
+      "loss": 0.0001,
+      "step": 3960
+    },
+    {
+      "epoch": 3.38,
+      "grad_norm": 0.0005057456437498331,
+      "learning_rate": 3.081841432225064e-05,
+      "loss": 0.0001,
+      "step": 3970
+    },
+    {
+      "epoch": 3.39,
+      "grad_norm": 0.0027303395327180624,
+      "learning_rate": 3.0392156862745097e-05,
+      "loss": 0.0001,
+      "step": 3980
+    },
+    {
+      "epoch": 3.4,
+      "grad_norm": 0.002173346932977438,
+      "learning_rate": 2.9965899403239556e-05,
+      "loss": 0.0001,
+      "step": 3990
+    },
+    {
+      "epoch": 3.41,
+      "grad_norm": 0.001203179475851357,
+      "learning_rate": 2.9539641943734013e-05,
+      "loss": 0.0001,
+      "step": 4000
+    },
+    {
+      "epoch": 3.41,
+      "eval_accuracy": 0.9877717391304348,
+      "eval_loss": 0.03951473906636238,
+      "eval_runtime": 35.1959,
+      "eval_samples_per_second": 20.912,
+      "eval_steps_per_second": 2.614,
+      "step": 4000
+    },
+    {
+      "epoch": 3.42,
+      "grad_norm": 0.0011582368751987815,
+      "learning_rate": 2.9113384484228473e-05,
+      "loss": 0.0001,
+      "step": 4010
+    },
+    {
+      "epoch": 3.43,
+      "grad_norm": 0.0027972415555268526,
+      "learning_rate": 2.8687127024722932e-05,
+      "loss": 0.0001,
+      "step": 4020
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 0.00038555546780116856,
+      "learning_rate": 2.826086956521739e-05,
+      "loss": 0.0001,
+      "step": 4030
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 0.0007446287199854851,
+      "learning_rate": 2.783461210571185e-05,
+      "loss": 0.0001,
+      "step": 4040
+    },
+    {
+      "epoch": 3.45,
+      "grad_norm": 0.0017564999870955944,
+      "learning_rate": 2.740835464620631e-05,
+      "loss": 0.0001,
+      "step": 4050
+    },
+    {
+      "epoch": 3.46,
+      "grad_norm": 0.00037764673470519483,
+      "learning_rate": 2.6982097186700765e-05,
+      "loss": 0.0001,
+      "step": 4060
+    },
+    {
+      "epoch": 3.47,
+      "grad_norm": 0.00230345013551414,
+      "learning_rate": 2.6555839727195225e-05,
+      "loss": 0.0001,
+      "step": 4070
+    },
+    {
+      "epoch": 3.48,
+      "grad_norm": 0.0005564872990362346,
+      "learning_rate": 2.6129582267689685e-05,
+      "loss": 0.0001,
+      "step": 4080
+    },
+    {
+      "epoch": 3.49,
+      "grad_norm": 0.00039161540917120874,
+      "learning_rate": 2.5703324808184144e-05,
+      "loss": 0.0001,
+      "step": 4090
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 0.0016666314331814647,
+      "learning_rate": 2.52770673486786e-05,
+      "loss": 0.0001,
+      "step": 4100
+    },
+    {
+      "epoch": 3.5,
+      "eval_accuracy": 0.9877717391304348,
+      "eval_loss": 0.03819862753152847,
+      "eval_runtime": 35.298,
+      "eval_samples_per_second": 20.851,
+      "eval_steps_per_second": 2.606,
+      "step": 4100
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 0.00222257012501359,
+      "learning_rate": 2.4850809889173064e-05,
+      "loss": 0.0001,
+      "step": 4110
+    },
+    {
+      "epoch": 3.51,
+      "grad_norm": 0.05478089302778244,
+      "learning_rate": 2.442455242966752e-05,
+      "loss": 0.0001,
+      "step": 4120
+    },
+    {
+      "epoch": 3.52,
+      "grad_norm": 0.0005476613878272474,
+      "learning_rate": 2.399829497016198e-05,
+      "loss": 0.0001,
+      "step": 4130
+    },
+    {
+      "epoch": 3.53,
+      "grad_norm": 0.003102705115452409,
+      "learning_rate": 2.357203751065644e-05,
+      "loss": 0.0001,
+      "step": 4140
+    },
+    {
+      "epoch": 3.54,
+      "grad_norm": 0.0006446978659369051,
+      "learning_rate": 2.3145780051150897e-05,
+      "loss": 0.0001,
+      "step": 4150
+    },
+    {
+      "epoch": 3.55,
+      "grad_norm": 0.0020584387239068747,
+      "learning_rate": 2.2719522591645353e-05,
+      "loss": 0.0009,
+      "step": 4160
+    },
+    {
+      "epoch": 3.55,
+      "grad_norm": 0.0019774443935602903,
+      "learning_rate": 2.2293265132139813e-05,
+      "loss": 0.0001,
+      "step": 4170
+    },
+    {
+      "epoch": 3.56,
+      "grad_norm": 0.00046734846546314657,
+      "learning_rate": 2.1867007672634273e-05,
+      "loss": 0.0001,
+      "step": 4180
+    },
+    {
+      "epoch": 3.57,
+      "grad_norm": 0.005072999745607376,
+      "learning_rate": 2.144075021312873e-05,
+      "loss": 0.0002,
+      "step": 4190
+    },
+    {
+      "epoch": 3.58,
+      "grad_norm": 0.0020108213648200035,
+      "learning_rate": 2.101449275362319e-05,
+      "loss": 0.0001,
+      "step": 4200
+    },
+    {
+      "epoch": 3.58,
+      "eval_accuracy": 0.9891304347826086,
+      "eval_loss": 0.03985697403550148,
+      "eval_runtime": 35.255,
+      "eval_samples_per_second": 20.876,
+      "eval_steps_per_second": 2.61,
+      "step": 4200
+    },
+    {
+      "epoch": 3.59,
+      "grad_norm": 0.0004175342037342489,
+      "learning_rate": 2.058823529411765e-05,
+      "loss": 0.0001,
+      "step": 4210
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 0.0006376274977810681,
+      "learning_rate": 2.0161977834612105e-05,
+      "loss": 0.0001,
+      "step": 4220
+    },
+    {
+      "epoch": 3.61,
+      "grad_norm": 0.0007830281392671168,
+      "learning_rate": 1.9735720375106565e-05,
+      "loss": 0.0001,
+      "step": 4230
+    },
+    {
+      "epoch": 3.61,
+      "grad_norm": 0.0009144895593635738,
+      "learning_rate": 1.9309462915601025e-05,
+      "loss": 0.0001,
+      "step": 4240
+    },
+    {
+      "epoch": 3.62,
+      "grad_norm": 0.0005221754545345902,
+      "learning_rate": 1.888320545609548e-05,
+      "loss": 0.0001,
+      "step": 4250
+    },
+    {
+      "epoch": 3.63,
+      "grad_norm": 0.0003978173772338778,
+      "learning_rate": 1.845694799658994e-05,
+      "loss": 0.0001,
+      "step": 4260
+    },
+    {
+      "epoch": 3.64,
+      "grad_norm": 0.0011031723115593195,
+      "learning_rate": 1.80306905370844e-05,
+      "loss": 0.0001,
+      "step": 4270
+    },
+    {
+      "epoch": 3.65,
+      "grad_norm": 0.0007275677053257823,
+      "learning_rate": 1.7604433077578857e-05,
+      "loss": 0.0001,
+      "step": 4280
+    },
+    {
+      "epoch": 3.66,
+      "grad_norm": 0.0004966453998349607,
+      "learning_rate": 1.7178175618073317e-05,
+      "loss": 0.0001,
+      "step": 4290
+    },
+    {
+      "epoch": 3.67,
+      "grad_norm": 0.0007101698429323733,
+      "learning_rate": 1.6751918158567777e-05,
+      "loss": 0.0001,
+      "step": 4300
+    },
+    {
+      "epoch": 3.67,
+      "eval_accuracy": 0.9891304347826086,
+      "eval_loss": 0.03956419602036476,
+      "eval_runtime": 35.2803,
+      "eval_samples_per_second": 20.861,
+      "eval_steps_per_second": 2.608,
+      "step": 4300
+    },
+    {
+      "epoch": 3.67,
+      "grad_norm": 0.000517147418577224,
+      "learning_rate": 1.6325660699062233e-05,
+      "loss": 0.0001,
+      "step": 4310
+    },
+    {
+      "epoch": 3.68,
+      "grad_norm": 0.0006418406846933067,
+      "learning_rate": 1.5899403239556693e-05,
+      "loss": 0.0001,
+      "step": 4320
+    },
+    {
+      "epoch": 3.69,
+      "grad_norm": 0.00048330274876207113,
+      "learning_rate": 1.5473145780051153e-05,
+      "loss": 0.0001,
+      "step": 4330
+    },
+    {
+      "epoch": 3.7,
+      "grad_norm": 0.0020956608932465315,
+      "learning_rate": 1.504688832054561e-05,
+      "loss": 0.0001,
+      "step": 4340
+    },
+    {
+      "epoch": 3.71,
+      "grad_norm": 0.0009279249934479594,
+      "learning_rate": 1.462063086104007e-05,
+      "loss": 0.0001,
+      "step": 4350
+    },
+    {
+      "epoch": 3.72,
+      "grad_norm": 0.0005609797080978751,
+      "learning_rate": 1.4194373401534527e-05,
+      "loss": 0.0004,
+      "step": 4360
+    },
+    {
+      "epoch": 3.73,
+      "grad_norm": 0.00083589629502967,
+      "learning_rate": 1.3768115942028985e-05,
+      "loss": 0.0001,
+      "step": 4370
+    },
+    {
+      "epoch": 3.73,
+      "grad_norm": 0.0004674463998526335,
+      "learning_rate": 1.3341858482523445e-05,
+      "loss": 0.0001,
+      "step": 4380
+    },
+    {
+      "epoch": 3.74,
+      "grad_norm": 0.0023368378169834614,
+      "learning_rate": 1.2915601023017903e-05,
+      "loss": 0.0001,
+      "step": 4390
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.0029742561746388674,
+      "learning_rate": 1.2489343563512362e-05,
+      "loss": 0.0072,
+      "step": 4400
+    },
+    {
+      "epoch": 3.75,
+      "eval_accuracy": 0.9904891304347826,
+      "eval_loss": 0.03550506755709648,
+      "eval_runtime": 35.3855,
+      "eval_samples_per_second": 20.799,
+      "eval_steps_per_second": 2.6,
+      "step": 4400
+    },
+    {
+      "epoch": 3.76,
+      "grad_norm": 0.0006715525523759425,
+      "learning_rate": 1.2063086104006821e-05,
+      "loss": 0.0001,
+      "step": 4410
+    },
+    {
+      "epoch": 3.77,
+      "grad_norm": 0.0005204956978559494,
+      "learning_rate": 1.163682864450128e-05,
+      "loss": 0.0001,
+      "step": 4420
+    },
+    {
+      "epoch": 3.78,
+      "grad_norm": 0.004828631412237883,
+      "learning_rate": 1.121057118499574e-05,
+      "loss": 0.0001,
+      "step": 4430
+    },
+    {
+      "epoch": 3.79,
+      "grad_norm": 0.0006186112877912819,
+      "learning_rate": 1.0784313725490197e-05,
+      "loss": 0.0001,
+      "step": 4440
+    },
+    {
+      "epoch": 3.79,
+      "grad_norm": 0.00034449456143192947,
+      "learning_rate": 1.0358056265984656e-05,
+      "loss": 0.0001,
+      "step": 4450
+    },
+    {
+      "epoch": 3.8,
+      "grad_norm": 0.0013000143226236105,
+      "learning_rate": 9.931798806479115e-06,
+      "loss": 0.0001,
+      "step": 4460
+    },
+    {
+      "epoch": 3.81,
+      "grad_norm": 0.0019598386716097593,
+      "learning_rate": 9.505541346973572e-06,
+      "loss": 0.0001,
+      "step": 4470
+    },
+    {
+      "epoch": 3.82,
+      "grad_norm": 0.0042680357582867146,
+      "learning_rate": 9.07928388746803e-06,
+      "loss": 0.0011,
+      "step": 4480
+    },
+    {
+      "epoch": 3.83,
+      "grad_norm": 0.0007351756794378161,
+      "learning_rate": 8.65302642796249e-06,
+      "loss": 0.0001,
+      "step": 4490
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 0.0010118505451828241,
+      "learning_rate": 8.226768968456948e-06,
+      "loss": 0.0001,
+      "step": 4500
+    },
+    {
+      "epoch": 3.84,
+      "eval_accuracy": 0.9918478260869565,
+      "eval_loss": 0.028444211930036545,
+      "eval_runtime": 35.4062,
+      "eval_samples_per_second": 20.787,
+      "eval_steps_per_second": 2.598,
+      "step": 4500
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 0.0014920223038643599,
+      "learning_rate": 7.800511508951406e-06,
+      "loss": 0.0001,
+      "step": 4510
+    },
+    {
+      "epoch": 3.85,
+      "grad_norm": 0.0008408814901486039,
+      "learning_rate": 7.374254049445865e-06,
+      "loss": 0.0001,
+      "step": 4520
+    },
+    {
+      "epoch": 3.86,
+      "grad_norm": 0.00037611491279676557,
+      "learning_rate": 6.947996589940324e-06,
+      "loss": 0.0001,
+      "step": 4530
+    },
+    {
+      "epoch": 3.87,
+      "grad_norm": 0.0005198806757107377,
+      "learning_rate": 6.521739130434783e-06,
+      "loss": 0.0001,
+      "step": 4540
+    },
+    {
+      "epoch": 3.88,
+      "grad_norm": 0.0009057046263478696,
+      "learning_rate": 6.095481670929242e-06,
+      "loss": 0.0001,
+      "step": 4550
+    },
+    {
+      "epoch": 3.89,
+      "grad_norm": 0.0011800191132351756,
+      "learning_rate": 5.6692242114237e-06,
+      "loss": 0.0001,
+      "step": 4560
+    },
+    {
+      "epoch": 3.9,
+      "grad_norm": 0.0005070503102615476,
+      "learning_rate": 5.242966751918159e-06,
+      "loss": 0.0001,
+      "step": 4570
+    },
+    {
+      "epoch": 3.9,
+      "grad_norm": 0.000821206602267921,
+      "learning_rate": 4.816709292412618e-06,
+      "loss": 0.0001,
+      "step": 4580
+    },
+    {
+      "epoch": 3.91,
+      "grad_norm": 0.022186720743775368,
+      "learning_rate": 4.390451832907076e-06,
+      "loss": 0.0001,
+      "step": 4590
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 0.0013393107801675797,
+      "learning_rate": 3.964194373401535e-06,
+      "loss": 0.0001,
+      "step": 4600
+    },
+    {
+      "epoch": 3.92,
+      "eval_accuracy": 0.9904891304347826,
+      "eval_loss": 0.02818419598042965,
+      "eval_runtime": 35.997,
+      "eval_samples_per_second": 20.446,
+      "eval_steps_per_second": 2.556,
+      "step": 4600
+    },
+    {
+      "epoch": 3.93,
+      "grad_norm": 0.0007979010115377605,
+      "learning_rate": 3.5379369138959936e-06,
+      "loss": 0.0001,
+      "step": 4610
+    },
+    {
+      "epoch": 3.94,
+      "grad_norm": 0.0007560172816738486,
+      "learning_rate": 3.111679454390452e-06,
+      "loss": 0.0001,
+      "step": 4620
+    },
+    {
+      "epoch": 3.95,
+      "grad_norm": 0.000473241088911891,
+      "learning_rate": 2.6854219948849107e-06,
+      "loss": 0.0001,
+      "step": 4630
+    },
+    {
+      "epoch": 3.96,
+      "grad_norm": 0.20867791771888733,
+      "learning_rate": 2.2591645353793692e-06,
+      "loss": 0.0002,
+      "step": 4640
+    },
+    {
+      "epoch": 3.96,
+      "grad_norm": 0.0012438774574548006,
+      "learning_rate": 1.8329070758738278e-06,
+      "loss": 0.0001,
+      "step": 4650
+    },
+    {
+      "epoch": 3.97,
+      "grad_norm": 0.0006186561658978462,
+      "learning_rate": 1.4066496163682865e-06,
+      "loss": 0.0001,
+      "step": 4660
+    },
+    {
+      "epoch": 3.98,
+      "grad_norm": 0.0010864631040021777,
+      "learning_rate": 9.80392156862745e-07,
+      "loss": 0.0001,
+      "step": 4670
+    },
+    {
+      "epoch": 3.99,
+      "grad_norm": 0.0014369528507813811,
+      "learning_rate": 5.541346973572037e-07,
+      "loss": 0.0001,
+      "step": 4680
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.00035434114397503436,
+      "learning_rate": 1.2787723785166242e-07,
+      "loss": 0.0001,
+      "step": 4690
+    },
+    {
+      "epoch": 4.0,
+      "step": 4692,
+      "total_flos": 1.715138644229908e+19,
+      "train_loss": 0.04991757846501284,
+      "train_runtime": 7931.4156,
+      "train_samples_per_second": 9.462,
+      "train_steps_per_second": 0.592
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4692,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 100,
+  "total_flos": 1.715138644229908e+19,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}