vit-finetune-kidney-stone-Michel_Daudon_-w256_1k_v1-_SEC-pretrain

Browse files

Files changed (9) hide show

README.md +113 -0
all_results.json +16 -0
config.json +40 -0
model.safetensors +3 -0
preprocessor_config.json +23 -0
test_results.json +11 -0
train_results.json +8 -0
trainer_state.json +3456 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,113 @@

+---
+library_name: transformers
+license: apache-2.0
+base_model: google/vit-base-patch16-224-in21k
+tags:
+- generated_from_trainer
+datasets:
+- imagefolder
+metrics:
+- accuracy
+- precision
+- recall
+- f1
+model-index:
+- name: vit-finetune-kidney-stone-Michel_Daudon_-w256_1k_v1-_SEC-pretrain
+  results:
+  - task:
+      name: Image Classification
+      type: image-classification
+    dataset:
+      name: imagefolder
+      type: imagefolder
+      config: default
+      split: test
+      args: default
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.9108333333333334
+    - name: Precision
+      type: precision
+      value: 0.9190361753451352
+    - name: Recall
+      type: recall
+      value: 0.9108333333333334
+    - name: F1
+      type: f1
+      value: 0.9102828889161464
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# vit-finetune-kidney-stone-Michel_Daudon_-w256_1k_v1-_SEC-pretrain
+This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on a custom image dataset loaded with the `imagefolder` loader.
+It achieves the following results on the evaluation set (best checkpoint, step 800):
+- Loss: 0.3455
+- Accuracy: 0.9108
+- Precision: 0.9190
+- Recall: 0.9108
+- F1: 0.9103
+## Model description
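+A ViT-base image classifier (`ViTForImageClassification`, 16x16 patches, 224x224 input) with six output classes: `SEC-Subtype_IVa`, `SEC-Subtype_IVa2`, `SEC-Subtype_IVc`, `SEC-Subtype_IVd`, `SEC-Subtype_Ia`, and `SEC-Subtype_Va`. More information needed.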
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 32
+- eval_batch_size: 8
+- seed: 42
+- optimizer: adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
+- lr_scheduler_type: linear
+- num_epochs: 15
+- mixed_precision_training: Native AMP
+### Training results
+| Training Loss | Epoch   | Step | Validation Loss | Accuracy | Precision | Recall | F1     |
+|:-------------:|:-------:|:----:|:---------------:|:--------:|:---------:|:------:|:------:|
+| 0.1494        | 0.6667  | 100  | 0.6088          | 0.8442   | 0.8766    | 0.8442 | 0.8390 |
+| 0.0665        | 1.3333  | 200  | 0.5533          | 0.8492   | 0.8810    | 0.8492 | 0.8542 |
+| 0.0215        | 2.0     | 300  | 0.3721          | 0.9017   | 0.9082    | 0.9017 | 0.8985 |
+| 0.0101        | 2.6667  | 400  | 0.5347          | 0.8942   | 0.9061    | 0.8942 | 0.8920 |
+| 0.043         | 3.3333  | 500  | 0.7850          | 0.8425   | 0.8592    | 0.8425 | 0.8427 |
+| 0.0641        | 4.0     | 600  | 0.7735          | 0.8583   | 0.8770    | 0.8583 | 0.8574 |
+| 0.0036        | 4.6667  | 700  | 0.7351          | 0.8367   | 0.8623    | 0.8367 | 0.8250 |
+| 0.0039        | 5.3333  | 800  | 0.3455          | 0.9108   | 0.9190    | 0.9108 | 0.9103 |
+| 0.0021        | 6.0     | 900  | 0.5940          | 0.8758   | 0.8985    | 0.8758 | 0.8730 |
+| 0.054         | 6.6667  | 1000 | 0.7463          | 0.8733   | 0.9068    | 0.8733 | 0.8714 |
+| 0.0015        | 7.3333  | 1100 | 0.8915          | 0.8392   | 0.8722    | 0.8392 | 0.8243 |
+| 0.0013        | 8.0     | 1200 | 0.5725          | 0.8917   | 0.8943    | 0.8917 | 0.8909 |
+| 0.0011        | 8.6667  | 1300 | 0.5772          | 0.8933   | 0.8960    | 0.8933 | 0.8926 |
+| 0.001         | 9.3333  | 1400 | 0.5820          | 0.8933   | 0.8956    | 0.8933 | 0.8926 |
+| 0.0009        | 10.0    | 1500 | 0.5859          | 0.8933   | 0.8954    | 0.8933 | 0.8925 |
+| 0.0008        | 10.6667 | 1600 | 0.5901          | 0.8933   | 0.8955    | 0.8933 | 0.8926 |
+| 0.0008        | 11.3333 | 1700 | 0.5938          | 0.8933   | 0.8955    | 0.8933 | 0.8926 |
+| 0.0007        | 12.0    | 1800 | 0.5971          | 0.8933   | 0.8953    | 0.8933 | 0.8925 |
+| 0.0007        | 12.6667 | 1900 | 0.5998          | 0.8933   | 0.8952    | 0.8933 | 0.8926 |
+| 0.0007        | 13.3333 | 2000 | 0.6016          | 0.8933   | 0.8952    | 0.8933 | 0.8926 |
+| 0.0006        | 14.0    | 2100 | 0.6032          | 0.8933   | 0.8952    | 0.8933 | 0.8926 |
+| 0.0006        | 14.6667 | 2200 | 0.6039          | 0.8933   | 0.8952    | 0.8933 | 0.8926 |
+### Framework versions
+- Transformers 4.48.2
+- Pytorch 2.6.0+cu126
+- Datasets 3.2.0
+- Tokenizers 0.21.0

all_results.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "epoch": 15.0,
+    "eval_accuracy": 0.9108333333333334,
+    "eval_f1": 0.9102828889161464,
+    "eval_loss": 0.3454643189907074,
+    "eval_precision": 0.9190361753451352,
+    "eval_recall": 0.9108333333333334,
+    "eval_runtime": 9.2262,
+    "eval_samples_per_second": 130.064,
+    "eval_steps_per_second": 16.258,
+    "total_flos": 5.57962327867392e+18,
+    "train_loss": 0.03856972599029541,
+    "train_runtime": 877.6839,
+    "train_samples_per_second": 82.034,
+    "train_steps_per_second": 2.564
+}

config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "_name_or_path": "google/vit-base-patch16-224-in21k",
+  "architectures": [
+    "ViTForImageClassification"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "encoder_stride": 16,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "SEC-Subtype_IVa",
+    "1": "SEC-Subtype_IVa2",
+    "2": "SEC-Subtype_IVc",
+    "3": "SEC-Subtype_IVd",
+    "4": "SEC-Subtype_Ia",
+    "5": "SEC-Subtype_Va"
+  },
+  "image_size": 224,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "SEC-Subtype_IVa": "0",
+    "SEC-Subtype_IVa2": "1",
+    "SEC-Subtype_IVc": "2",
+    "SEC-Subtype_IVd": "3",
+    "SEC-Subtype_Ia": "4",
+    "SEC-Subtype_Va": "5"
+  },
+  "layer_norm_eps": 1e-12,
+  "model_type": "vit",
+  "num_attention_heads": 12,
+  "num_channels": 3,
+  "num_hidden_layers": 12,
+  "patch_size": 16,
+  "problem_type": "single_label_classification",
+  "qkv_bias": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.2"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:93bf64d74d9a9d8cb4cdeeab8e2200e73547ae04503781ef6f826aa61c743686
+size 343236280

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "ViTFeatureExtractor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 224,
+    "width": 224
+  }
+}

test_results.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+    "epoch": 15.0,
+    "eval_accuracy": 0.9108333333333334,
+    "eval_f1": 0.9102828889161464,
+    "eval_loss": 0.3454643189907074,
+    "eval_precision": 0.9190361753451352,
+    "eval_recall": 0.9108333333333334,
+    "eval_runtime": 9.2262,
+    "eval_samples_per_second": 130.064,
+    "eval_steps_per_second": 16.258
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 15.0,
+    "total_flos": 5.57962327867392e+18,
+    "train_loss": 0.03856972599029541,
+    "train_runtime": 877.6839,
+    "train_samples_per_second": 82.034,
+    "train_steps_per_second": 2.564
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,3456 @@

+{
+  "best_metric": 0.3454643189907074,
+  "best_model_checkpoint": "vit-finetune-kidney-stone-Michel_Daudon_-w256_1k_v1-_SEC-pretrain\\checkpoint-800",
+  "epoch": 15.0,
+  "eval_steps": 100,
+  "global_step": 2250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.03333333333333333,
+      "grad_norm": 1.789089322090149,
+      "learning_rate": 0.00019955555555555558,
+      "loss": 1.6418,
+      "step": 5
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 1.9554781913757324,
+      "learning_rate": 0.00019911111111111111,
+      "loss": 1.3356,
+      "step": 10
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.6346088647842407,
+      "learning_rate": 0.00019866666666666668,
+      "loss": 0.9694,
+      "step": 15
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 3.838526725769043,
+      "learning_rate": 0.00019822222222222225,
+      "loss": 0.7895,
+      "step": 20
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 2.2996692657470703,
+      "learning_rate": 0.00019777777777777778,
+      "loss": 0.5608,
+      "step": 25
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.4577720165252686,
+      "learning_rate": 0.00019733333333333335,
+      "loss": 0.4723,
+      "step": 30
+    },
+    {
+      "epoch": 0.23333333333333334,
+      "grad_norm": 1.0262706279754639,
+      "learning_rate": 0.0001968888888888889,
+      "loss": 0.442,
+      "step": 35
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 3.0762388706207275,
+      "learning_rate": 0.00019644444444444445,
+      "loss": 0.3641,
+      "step": 40
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.80593204498291,
+      "learning_rate": 0.000196,
+      "loss": 0.3185,
+      "step": 45
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 1.8633087873458862,
+      "learning_rate": 0.00019555555555555556,
+      "loss": 0.2134,
+      "step": 50
+    },
+    {
+      "epoch": 0.36666666666666664,
+      "grad_norm": 2.5019023418426514,
+      "learning_rate": 0.0001951111111111111,
+      "loss": 0.303,
+      "step": 55
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 4.1190361976623535,
+      "learning_rate": 0.0001946666666666667,
+      "loss": 0.3188,
+      "step": 60
+    },
+    {
+      "epoch": 0.43333333333333335,
+      "grad_norm": 4.875674247741699,
+      "learning_rate": 0.00019422222222222223,
+      "loss": 0.1905,
+      "step": 65
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.8875412940979004,
+      "learning_rate": 0.0001937777777777778,
+      "loss": 0.2546,
+      "step": 70
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.5920054316520691,
+      "learning_rate": 0.00019333333333333333,
+      "loss": 0.2255,
+      "step": 75
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 1.1335688829421997,
+      "learning_rate": 0.0001928888888888889,
+      "loss": 0.1288,
+      "step": 80
+    },
+    {
+      "epoch": 0.5666666666666667,
+      "grad_norm": 1.0775712728500366,
+      "learning_rate": 0.00019244444444444444,
+      "loss": 0.1847,
+      "step": 85
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.5961558818817139,
+      "learning_rate": 0.000192,
+      "loss": 0.1618,
+      "step": 90
+    },
+    {
+      "epoch": 0.6333333333333333,
+      "grad_norm": 2.420574188232422,
+      "learning_rate": 0.00019155555555555554,
+      "loss": 0.2102,
+      "step": 95
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.7953271865844727,
+      "learning_rate": 0.00019111111111111114,
+      "loss": 0.1494,
+      "step": 100
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "eval_accuracy": 0.8441666666666666,
+      "eval_f1": 0.8389603304554332,
+      "eval_loss": 0.6088427901268005,
+      "eval_precision": 0.8766189856356855,
+      "eval_recall": 0.8441666666666666,
+      "eval_runtime": 9.7541,
+      "eval_samples_per_second": 123.025,
+      "eval_steps_per_second": 15.378,
+      "step": 100
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.2056903839111328,
+      "learning_rate": 0.00019066666666666668,
+      "loss": 0.2564,
+      "step": 105
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.280213326215744,
+      "learning_rate": 0.00019022222222222224,
+      "loss": 0.1638,
+      "step": 110
+    },
+    {
+      "epoch": 0.7666666666666667,
+      "grad_norm": 0.9152695536613464,
+      "learning_rate": 0.00018977777777777778,
+      "loss": 0.1242,
+      "step": 115
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.8287985324859619,
+      "learning_rate": 0.00018933333333333335,
+      "loss": 0.1684,
+      "step": 120
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 6.265803337097168,
+      "learning_rate": 0.00018888888888888888,
+      "loss": 0.1403,
+      "step": 125
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 1.710580825805664,
+      "learning_rate": 0.00018844444444444445,
+      "loss": 0.1048,
+      "step": 130
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.5326979756355286,
+      "learning_rate": 0.000188,
+      "loss": 0.0594,
+      "step": 135
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 4.1653289794921875,
+      "learning_rate": 0.00018755555555555558,
+      "loss": 0.1216,
+      "step": 140
+    },
+    {
+      "epoch": 0.9666666666666667,
+      "grad_norm": 0.7304482460021973,
+      "learning_rate": 0.00018711111111111112,
+      "loss": 0.1131,
+      "step": 145
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 5.068601131439209,
+      "learning_rate": 0.0001866666666666667,
+      "loss": 0.1653,
+      "step": 150
+    },
+    {
+      "epoch": 1.0333333333333334,
+      "grad_norm": 1.9872854948043823,
+      "learning_rate": 0.00018622222222222223,
+      "loss": 0.1083,
+      "step": 155
+    },
+    {
+      "epoch": 1.0666666666666667,
+      "grad_norm": 1.3612173795700073,
+      "learning_rate": 0.0001857777777777778,
+      "loss": 0.1147,
+      "step": 160
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 4.431027412414551,
+      "learning_rate": 0.00018533333333333333,
+      "loss": 0.1274,
+      "step": 165
+    },
+    {
+      "epoch": 1.1333333333333333,
+      "grad_norm": 4.902954578399658,
+      "learning_rate": 0.0001848888888888889,
+      "loss": 0.0944,
+      "step": 170
+    },
+    {
+      "epoch": 1.1666666666666667,
+      "grad_norm": 0.3168695271015167,
+      "learning_rate": 0.00018444444444444446,
+      "loss": 0.1347,
+      "step": 175
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.4788297712802887,
+      "learning_rate": 0.00018400000000000003,
+      "loss": 0.108,
+      "step": 180
+    },
+    {
+      "epoch": 1.2333333333333334,
+      "grad_norm": 0.159646138548851,
+      "learning_rate": 0.00018355555555555557,
+      "loss": 0.0388,
+      "step": 185
+    },
+    {
+      "epoch": 1.2666666666666666,
+      "grad_norm": 0.08620858937501907,
+      "learning_rate": 0.00018311111111111113,
+      "loss": 0.0652,
+      "step": 190
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 3.772901773452759,
+      "learning_rate": 0.00018266666666666667,
+      "loss": 0.0483,
+      "step": 195
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.07443404942750931,
+      "learning_rate": 0.00018222222222222224,
+      "loss": 0.0665,
+      "step": 200
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "eval_accuracy": 0.8491666666666666,
+      "eval_f1": 0.8541751498812096,
+      "eval_loss": 0.5532689690589905,
+      "eval_precision": 0.881005645309773,
+      "eval_recall": 0.8491666666666666,
+      "eval_runtime": 9.2193,
+      "eval_samples_per_second": 130.162,
+      "eval_steps_per_second": 16.27,
+      "step": 200
+    },
+    {
+      "epoch": 1.3666666666666667,
+      "grad_norm": 6.843237400054932,
+      "learning_rate": 0.00018177777777777778,
+      "loss": 0.1125,
+      "step": 205
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 4.178153991699219,
+      "learning_rate": 0.00018133333333333334,
+      "loss": 0.0818,
+      "step": 210
+    },
+    {
+      "epoch": 1.4333333333333333,
+      "grad_norm": 0.07139376550912857,
+      "learning_rate": 0.0001808888888888889,
+      "loss": 0.0564,
+      "step": 215
+    },
+    {
+      "epoch": 1.4666666666666668,
+      "grad_norm": 2.2558469772338867,
+      "learning_rate": 0.00018044444444444447,
+      "loss": 0.0917,
+      "step": 220
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 5.316349506378174,
+      "learning_rate": 0.00018,
+      "loss": 0.0803,
+      "step": 225
+    },
+    {
+      "epoch": 1.5333333333333332,
+      "grad_norm": 0.21899549663066864,
+      "learning_rate": 0.00017955555555555558,
+      "loss": 0.113,
+      "step": 230
+    },
+    {
+      "epoch": 1.5666666666666667,
+      "grad_norm": 6.164524078369141,
+      "learning_rate": 0.00017911111111111112,
+      "loss": 0.0666,
+      "step": 235
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.5532757639884949,
+      "learning_rate": 0.00017866666666666668,
+      "loss": 0.0778,
+      "step": 240
+    },
+    {
+      "epoch": 1.6333333333333333,
+      "grad_norm": 0.05819055810570717,
+      "learning_rate": 0.00017822222222222222,
+      "loss": 0.0719,
+      "step": 245
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 0.2468406856060028,
+      "learning_rate": 0.00017777777777777779,
+      "loss": 0.0438,
+      "step": 250
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 3.8207075595855713,
+      "learning_rate": 0.00017733333333333335,
+      "loss": 0.1873,
+      "step": 255
+    },
+    {
+      "epoch": 1.7333333333333334,
+      "grad_norm": 0.05965983495116234,
+      "learning_rate": 0.0001768888888888889,
+      "loss": 0.055,
+      "step": 260
+    },
+    {
+      "epoch": 1.7666666666666666,
+      "grad_norm": 4.2167205810546875,
+      "learning_rate": 0.00017644444444444446,
+      "loss": 0.042,
+      "step": 265
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.10185975581407547,
+      "learning_rate": 0.00017600000000000002,
+      "loss": 0.0424,
+      "step": 270
+    },
+    {
+      "epoch": 1.8333333333333335,
+      "grad_norm": 1.9210762977600098,
+      "learning_rate": 0.00017555555555555556,
+      "loss": 0.0562,
+      "step": 275
+    },
+    {
+      "epoch": 1.8666666666666667,
+      "grad_norm": 0.6008936166763306,
+      "learning_rate": 0.00017511111111111113,
+      "loss": 0.0524,
+      "step": 280
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.841305136680603,
+      "learning_rate": 0.00017466666666666667,
+      "loss": 0.146,
+      "step": 285
+    },
+    {
+      "epoch": 1.9333333333333333,
+      "grad_norm": 0.04818285256624222,
+      "learning_rate": 0.00017422222222222223,
+      "loss": 0.015,
+      "step": 290
+    },
+    {
+      "epoch": 1.9666666666666668,
+      "grad_norm": 0.050033003091812134,
+      "learning_rate": 0.0001737777777777778,
+      "loss": 0.1287,
+      "step": 295
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.05448291450738907,
+      "learning_rate": 0.00017333333333333334,
+      "loss": 0.0215,
+      "step": 300
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9016666666666666,
+      "eval_f1": 0.8984841257788492,
+      "eval_loss": 0.37212345004081726,
+      "eval_precision": 0.9081757709131506,
+      "eval_recall": 0.9016666666666666,
+      "eval_runtime": 8.9962,
+      "eval_samples_per_second": 133.389,
+      "eval_steps_per_second": 16.674,
+      "step": 300
+    },
+    {
+      "epoch": 2.033333333333333,
+      "grad_norm": 0.05407797917723656,
+      "learning_rate": 0.0001728888888888889,
+      "loss": 0.0321,
+      "step": 305
+    },
+    {
+      "epoch": 2.066666666666667,
+      "grad_norm": 4.20230770111084,
+      "learning_rate": 0.00017244444444444444,
+      "loss": 0.0491,
+      "step": 310
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 1.49053955078125,
+      "learning_rate": 0.000172,
+      "loss": 0.0539,
+      "step": 315
+    },
+    {
+      "epoch": 2.1333333333333333,
+      "grad_norm": 4.136805057525635,
+      "learning_rate": 0.00017155555555555555,
+      "loss": 0.0815,
+      "step": 320
+    },
+    {
+      "epoch": 2.1666666666666665,
+      "grad_norm": 4.121062278747559,
+      "learning_rate": 0.0001711111111111111,
+      "loss": 0.0551,
+      "step": 325
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 2.9513375759124756,
+      "learning_rate": 0.00017066666666666668,
+      "loss": 0.0429,
+      "step": 330
+    },
+    {
+      "epoch": 2.2333333333333334,
+      "grad_norm": 0.24824482202529907,
+      "learning_rate": 0.00017022222222222224,
+      "loss": 0.0402,
+      "step": 335
+    },
+    {
+      "epoch": 2.2666666666666666,
+      "grad_norm": 2.6825144290924072,
+      "learning_rate": 0.00016977777777777778,
+      "loss": 0.1037,
+      "step": 340
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 8.458391189575195,
+      "learning_rate": 0.00016933333333333335,
+      "loss": 0.0547,
+      "step": 345
+    },
+    {
+      "epoch": 2.3333333333333335,
+      "grad_norm": 4.8150739669799805,
+      "learning_rate": 0.00016888888888888889,
+      "loss": 0.1288,
+      "step": 350
+    },
+    {
+      "epoch": 2.3666666666666667,
+      "grad_norm": 2.286815643310547,
+      "learning_rate": 0.00016844444444444445,
+      "loss": 0.0206,
+      "step": 355
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 6.384063720703125,
+      "learning_rate": 0.000168,
+      "loss": 0.0305,
+      "step": 360
+    },
+    {
+      "epoch": 2.4333333333333336,
+      "grad_norm": 0.08697441965341568,
+      "learning_rate": 0.00016755555555555556,
+      "loss": 0.0306,
+      "step": 365
+    },
+    {
+      "epoch": 2.466666666666667,
+      "grad_norm": 5.742947578430176,
+      "learning_rate": 0.00016711111111111112,
+      "loss": 0.0494,
+      "step": 370
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.03892851248383522,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 0.0096,
+      "step": 375
+    },
+    {
+      "epoch": 2.533333333333333,
+      "grad_norm": 0.0344746969640255,
+      "learning_rate": 0.00016622222222222223,
+      "loss": 0.0283,
+      "step": 380
+    },
+    {
+      "epoch": 2.5666666666666664,
+      "grad_norm": 0.02712893858551979,
+      "learning_rate": 0.0001657777777777778,
+      "loss": 0.022,
+      "step": 385
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.03232201561331749,
+      "learning_rate": 0.00016533333333333333,
+      "loss": 0.1133,
+      "step": 390
+    },
+    {
+      "epoch": 2.6333333333333333,
+      "grad_norm": 0.034541644155979156,
+      "learning_rate": 0.0001648888888888889,
+      "loss": 0.009,
+      "step": 395
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": 0.04563550278544426,
+      "learning_rate": 0.00016444444444444444,
+      "loss": 0.0101,
+      "step": 400
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "eval_accuracy": 0.8941666666666667,
+      "eval_f1": 0.8920220351736138,
+      "eval_loss": 0.5347270965576172,
+      "eval_precision": 0.9061420167128886,
+      "eval_recall": 0.8941666666666667,
+      "eval_runtime": 9.1242,
+      "eval_samples_per_second": 131.518,
+      "eval_steps_per_second": 16.44,
+      "step": 400
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.03443131595849991,
+      "learning_rate": 0.000164,
+      "loss": 0.0091,
+      "step": 405
+    },
+    {
+      "epoch": 2.7333333333333334,
+      "grad_norm": 0.025529412552714348,
+      "learning_rate": 0.00016355555555555557,
+      "loss": 0.0074,
+      "step": 410
+    },
+    {
+      "epoch": 2.7666666666666666,
+      "grad_norm": 0.06645037978887558,
+      "learning_rate": 0.00016311111111111113,
+      "loss": 0.0073,
+      "step": 415
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.026776228100061417,
+      "learning_rate": 0.00016266666666666667,
+      "loss": 0.0079,
+      "step": 420
+    },
+    {
+      "epoch": 2.8333333333333335,
+      "grad_norm": 0.20765815675258636,
+      "learning_rate": 0.00016222222222222224,
+      "loss": 0.0071,
+      "step": 425
+    },
+    {
+      "epoch": 2.8666666666666667,
+      "grad_norm": 0.023739568889141083,
+      "learning_rate": 0.00016177777777777778,
+      "loss": 0.0067,
+      "step": 430
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 0.023541877046227455,
+      "learning_rate": 0.00016133333333333334,
+      "loss": 0.0061,
+      "step": 435
+    },
+    {
+      "epoch": 2.9333333333333336,
+      "grad_norm": 0.02066374383866787,
+      "learning_rate": 0.00016088888888888888,
+      "loss": 0.006,
+      "step": 440
+    },
+    {
+      "epoch": 2.966666666666667,
+      "grad_norm": 0.022866908460855484,
+      "learning_rate": 0.00016044444444444445,
+      "loss": 0.006,
+      "step": 445
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.03311862796545029,
+      "learning_rate": 0.00016,
+      "loss": 0.0058,
+      "step": 450
+    },
+    {
+      "epoch": 3.033333333333333,
+      "grad_norm": 2.7163822650909424,
+      "learning_rate": 0.00015955555555555558,
+      "loss": 0.0187,
+      "step": 455
+    },
+    {
+      "epoch": 3.066666666666667,
+      "grad_norm": 0.020368332043290138,
+      "learning_rate": 0.00015911111111111112,
+      "loss": 0.0053,
+      "step": 460
+    },
+    {
+      "epoch": 3.1,
+      "grad_norm": 0.02092335745692253,
+      "learning_rate": 0.00015866666666666668,
+      "loss": 0.0446,
+      "step": 465
+    },
+    {
+      "epoch": 3.1333333333333333,
+      "grad_norm": 0.01936771348118782,
+      "learning_rate": 0.00015822222222222222,
+      "loss": 0.019,
+      "step": 470
+    },
+    {
+      "epoch": 3.1666666666666665,
+      "grad_norm": 0.05188484862446785,
+      "learning_rate": 0.0001577777777777778,
+      "loss": 0.0064,
+      "step": 475
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 0.7598115801811218,
+      "learning_rate": 0.00015733333333333333,
+      "loss": 0.0065,
+      "step": 480
+    },
+    {
+      "epoch": 3.2333333333333334,
+      "grad_norm": 0.017925532534718513,
+      "learning_rate": 0.00015688888888888892,
+      "loss": 0.0052,
+      "step": 485
+    },
+    {
+      "epoch": 3.2666666666666666,
+      "grad_norm": 1.7092453241348267,
+      "learning_rate": 0.00015644444444444446,
+      "loss": 0.039,
+      "step": 490
+    },
+    {
+      "epoch": 3.3,
+      "grad_norm": 0.03364104405045509,
+      "learning_rate": 0.00015600000000000002,
+      "loss": 0.0306,
+      "step": 495
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 3.7608277797698975,
+      "learning_rate": 0.00015555555555555556,
+      "loss": 0.043,
+      "step": 500
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "eval_accuracy": 0.8425,
+      "eval_f1": 0.8426894645957341,
+      "eval_loss": 0.784968912601471,
+      "eval_precision": 0.8592122423357443,
+      "eval_recall": 0.8425,
+      "eval_runtime": 9.2177,
+      "eval_samples_per_second": 130.185,
+      "eval_steps_per_second": 16.273,
+      "step": 500
+    },
+    {
+      "epoch": 3.3666666666666667,
+      "grad_norm": 0.1858338564634323,
+      "learning_rate": 0.00015511111111111113,
+      "loss": 0.0066,
+      "step": 505
+    },
+    {
+      "epoch": 3.4,
+      "grad_norm": 0.022260984405875206,
+      "learning_rate": 0.00015466666666666667,
+      "loss": 0.0104,
+      "step": 510
+    },
+    {
+      "epoch": 3.4333333333333336,
+      "grad_norm": 3.2305305004119873,
+      "learning_rate": 0.00015422222222222223,
+      "loss": 0.0734,
+      "step": 515
+    },
+    {
+      "epoch": 3.466666666666667,
+      "grad_norm": 0.02268332615494728,
+      "learning_rate": 0.00015377777777777777,
+      "loss": 0.0191,
+      "step": 520
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 0.6931867599487305,
+      "learning_rate": 0.00015333333333333334,
+      "loss": 0.0415,
+      "step": 525
+    },
+    {
+      "epoch": 3.533333333333333,
+      "grad_norm": 0.05407170578837395,
+      "learning_rate": 0.0001528888888888889,
+      "loss": 0.0445,
+      "step": 530
+    },
+    {
+      "epoch": 3.5666666666666664,
+      "grad_norm": 0.09081502258777618,
+      "learning_rate": 0.00015244444444444447,
+      "loss": 0.0555,
+      "step": 535
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 0.026441602036356926,
+      "learning_rate": 0.000152,
+      "loss": 0.0058,
+      "step": 540
+    },
+    {
+      "epoch": 3.6333333333333333,
+      "grad_norm": 0.25190070271492004,
+      "learning_rate": 0.00015155555555555557,
+      "loss": 0.005,
+      "step": 545
+    },
+    {
+      "epoch": 3.6666666666666665,
+      "grad_norm": 0.016116006299853325,
+      "learning_rate": 0.0001511111111111111,
+      "loss": 0.0049,
+      "step": 550
+    },
+    {
+      "epoch": 3.7,
+      "grad_norm": 4.043995380401611,
+      "learning_rate": 0.00015066666666666668,
+      "loss": 0.0677,
+      "step": 555
+    },
+    {
+      "epoch": 3.7333333333333334,
+      "grad_norm": 0.02261391095817089,
+      "learning_rate": 0.00015022222222222222,
+      "loss": 0.0043,
+      "step": 560
+    },
+    {
+      "epoch": 3.7666666666666666,
+      "grad_norm": 0.015623694285750389,
+      "learning_rate": 0.00014977777777777778,
+      "loss": 0.0052,
+      "step": 565
+    },
+    {
+      "epoch": 3.8,
+      "grad_norm": 0.0159298162907362,
+      "learning_rate": 0.00014933333333333335,
+      "loss": 0.0302,
+      "step": 570
+    },
+    {
+      "epoch": 3.8333333333333335,
+      "grad_norm": 0.020601999014616013,
+      "learning_rate": 0.0001488888888888889,
+      "loss": 0.0041,
+      "step": 575
+    },
+    {
+      "epoch": 3.8666666666666667,
+      "grad_norm": 0.013788328506052494,
+      "learning_rate": 0.00014844444444444445,
+      "loss": 0.0048,
+      "step": 580
+    },
+    {
+      "epoch": 3.9,
+      "grad_norm": 1.414560317993164,
+      "learning_rate": 0.000148,
+      "loss": 0.0077,
+      "step": 585
+    },
+    {
+      "epoch": 3.9333333333333336,
+      "grad_norm": 4.793529033660889,
+      "learning_rate": 0.00014755555555555556,
+      "loss": 0.008,
+      "step": 590
+    },
+    {
+      "epoch": 3.966666666666667,
+      "grad_norm": 0.014001097530126572,
+      "learning_rate": 0.00014711111111111112,
+      "loss": 0.0415,
+      "step": 595
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.029955489560961723,
+      "learning_rate": 0.00014666666666666666,
+      "loss": 0.0641,
+      "step": 600
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.8583333333333333,
+      "eval_f1": 0.8573833554569419,
+      "eval_loss": 0.7735322117805481,
+      "eval_precision": 0.8770446177892626,
+      "eval_recall": 0.8583333333333333,
+      "eval_runtime": 9.0657,
+      "eval_samples_per_second": 132.367,
+      "eval_steps_per_second": 16.546,
+      "step": 600
+    },
+    {
+      "epoch": 4.033333333333333,
+      "grad_norm": 1.7859865427017212,
+      "learning_rate": 0.00014622222222222223,
+      "loss": 0.0383,
+      "step": 605
+    },
+    {
+      "epoch": 4.066666666666666,
+      "grad_norm": 2.2146739959716797,
+      "learning_rate": 0.0001457777777777778,
+      "loss": 0.049,
+      "step": 610
+    },
+    {
+      "epoch": 4.1,
+      "grad_norm": 4.788426399230957,
+      "learning_rate": 0.00014533333333333333,
+      "loss": 0.0172,
+      "step": 615
+    },
+    {
+      "epoch": 4.133333333333334,
+      "grad_norm": 4.24533224105835,
+      "learning_rate": 0.0001448888888888889,
+      "loss": 0.0581,
+      "step": 620
+    },
+    {
+      "epoch": 4.166666666666667,
+      "grad_norm": 3.2386908531188965,
+      "learning_rate": 0.00014444444444444444,
+      "loss": 0.2119,
+      "step": 625
+    },
+    {
+      "epoch": 4.2,
+      "grad_norm": 3.0637991428375244,
+      "learning_rate": 0.000144,
+      "loss": 0.0952,
+      "step": 630
+    },
+    {
+      "epoch": 4.233333333333333,
+      "grad_norm": 1.8726221323013306,
+      "learning_rate": 0.00014355555555555554,
+      "loss": 0.0625,
+      "step": 635
+    },
+    {
+      "epoch": 4.266666666666667,
+      "grad_norm": 2.6457202434539795,
+      "learning_rate": 0.0001431111111111111,
+      "loss": 0.0337,
+      "step": 640
+    },
+    {
+      "epoch": 4.3,
+      "grad_norm": 0.16701628267765045,
+      "learning_rate": 0.00014266666666666667,
+      "loss": 0.0452,
+      "step": 645
+    },
+    {
+      "epoch": 4.333333333333333,
+      "grad_norm": 0.021049339324235916,
+      "learning_rate": 0.00014222222222222224,
+      "loss": 0.0309,
+      "step": 650
+    },
+    {
+      "epoch": 4.366666666666666,
+      "grad_norm": 0.013704544864594936,
+      "learning_rate": 0.00014177777777777778,
+      "loss": 0.0362,
+      "step": 655
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 1.8577803373336792,
+      "learning_rate": 0.00014133333333333334,
+      "loss": 0.0962,
+      "step": 660
+    },
+    {
+      "epoch": 4.433333333333334,
+      "grad_norm": 6.272852897644043,
+      "learning_rate": 0.00014088888888888888,
+      "loss": 0.0107,
+      "step": 665
+    },
+    {
+      "epoch": 4.466666666666667,
+      "grad_norm": 0.03123130276799202,
+      "learning_rate": 0.00014044444444444445,
+      "loss": 0.0632,
+      "step": 670
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 0.26144641637802124,
+      "learning_rate": 0.00014,
+      "loss": 0.0608,
+      "step": 675
+    },
+    {
+      "epoch": 4.533333333333333,
+      "grad_norm": 0.032063163816928864,
+      "learning_rate": 0.00013955555555555558,
+      "loss": 0.0356,
+      "step": 680
+    },
+    {
+      "epoch": 4.566666666666666,
+      "grad_norm": 1.1699529886245728,
+      "learning_rate": 0.00013911111111111112,
+      "loss": 0.0284,
+      "step": 685
+    },
+    {
+      "epoch": 4.6,
+      "grad_norm": 1.968325138092041,
+      "learning_rate": 0.00013866666666666669,
+      "loss": 0.0615,
+      "step": 690
+    },
+    {
+      "epoch": 4.633333333333333,
+      "grad_norm": 0.014379513449966908,
+      "learning_rate": 0.00013822222222222222,
+      "loss": 0.0193,
+      "step": 695
+    },
+    {
+      "epoch": 4.666666666666667,
+      "grad_norm": 0.022963469848036766,
+      "learning_rate": 0.0001377777777777778,
+      "loss": 0.0036,
+      "step": 700
+    },
+    {
+      "epoch": 4.666666666666667,
+      "eval_accuracy": 0.8366666666666667,
+      "eval_f1": 0.8250255822533592,
+      "eval_loss": 0.7351471185684204,
+      "eval_precision": 0.8623013064610192,
+      "eval_recall": 0.8366666666666667,
+      "eval_runtime": 9.0085,
+      "eval_samples_per_second": 133.207,
+      "eval_steps_per_second": 16.651,
+      "step": 700
+    },
+    {
+      "epoch": 4.7,
+      "grad_norm": 0.014110775664448738,
+      "learning_rate": 0.00013733333333333333,
+      "loss": 0.0147,
+      "step": 705
+    },
+    {
+      "epoch": 4.733333333333333,
+      "grad_norm": 0.014284992590546608,
+      "learning_rate": 0.0001368888888888889,
+      "loss": 0.0035,
+      "step": 710
+    },
+    {
+      "epoch": 4.766666666666667,
+      "grad_norm": 0.012826389633119106,
+      "learning_rate": 0.00013644444444444443,
+      "loss": 0.0044,
+      "step": 715
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 0.011032935231924057,
+      "learning_rate": 0.00013600000000000003,
+      "loss": 0.0046,
+      "step": 720
+    },
+    {
+      "epoch": 4.833333333333333,
+      "grad_norm": 0.014207839034497738,
+      "learning_rate": 0.00013555555555555556,
+      "loss": 0.0339,
+      "step": 725
+    },
+    {
+      "epoch": 4.866666666666667,
+      "grad_norm": 0.010903984308242798,
+      "learning_rate": 0.00013511111111111113,
+      "loss": 0.0052,
+      "step": 730
+    },
+    {
+      "epoch": 4.9,
+      "grad_norm": 1.6590766906738281,
+      "learning_rate": 0.00013466666666666667,
+      "loss": 0.0792,
+      "step": 735
+    },
+    {
+      "epoch": 4.933333333333334,
+      "grad_norm": 0.010764382779598236,
+      "learning_rate": 0.00013422222222222224,
+      "loss": 0.0912,
+      "step": 740
+    },
+    {
+      "epoch": 4.966666666666667,
+      "grad_norm": 6.360743045806885,
+      "learning_rate": 0.00013377777777777777,
+      "loss": 0.0142,
+      "step": 745
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.08818025141954422,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.0125,
+      "step": 750
+    },
+    {
+      "epoch": 5.033333333333333,
+      "grad_norm": 0.039597898721694946,
+      "learning_rate": 0.00013288888888888888,
+      "loss": 0.047,
+      "step": 755
+    },
+    {
+      "epoch": 5.066666666666666,
+      "grad_norm": 3.1502726078033447,
+      "learning_rate": 0.00013244444444444447,
+      "loss": 0.0817,
+      "step": 760
+    },
+    {
+      "epoch": 5.1,
+      "grad_norm": 0.04044979810714722,
+      "learning_rate": 0.000132,
+      "loss": 0.004,
+      "step": 765
+    },
+    {
+      "epoch": 5.133333333333334,
+      "grad_norm": 0.018100356683135033,
+      "learning_rate": 0.00013155555555555558,
+      "loss": 0.0059,
+      "step": 770
+    },
+    {
+      "epoch": 5.166666666666667,
+      "grad_norm": 0.1408000886440277,
+      "learning_rate": 0.00013111111111111111,
+      "loss": 0.0058,
+      "step": 775
+    },
+    {
+      "epoch": 5.2,
+      "grad_norm": 0.014914629980921745,
+      "learning_rate": 0.00013066666666666668,
+      "loss": 0.0397,
+      "step": 780
+    },
+    {
+      "epoch": 5.233333333333333,
+      "grad_norm": 0.010519575327634811,
+      "learning_rate": 0.00013022222222222222,
+      "loss": 0.0035,
+      "step": 785
+    },
+    {
+      "epoch": 5.266666666666667,
+      "grad_norm": 0.1079607829451561,
+      "learning_rate": 0.00012977777777777779,
+      "loss": 0.003,
+      "step": 790
+    },
+    {
+      "epoch": 5.3,
+      "grad_norm": 0.012321013025939465,
+      "learning_rate": 0.00012933333333333332,
+      "loss": 0.0043,
+      "step": 795
+    },
+    {
+      "epoch": 5.333333333333333,
+      "grad_norm": 0.009173799306154251,
+      "learning_rate": 0.00012888888888888892,
+      "loss": 0.0039,
+      "step": 800
+    },
+    {
+      "epoch": 5.333333333333333,
+      "eval_accuracy": 0.9108333333333334,
+      "eval_f1": 0.9102828889161464,
+      "eval_loss": 0.3454643189907074,
+      "eval_precision": 0.9190361753451352,
+      "eval_recall": 0.9108333333333334,
+      "eval_runtime": 9.0972,
+      "eval_samples_per_second": 131.909,
+      "eval_steps_per_second": 16.489,
+      "step": 800
+    },
+    {
+      "epoch": 5.366666666666666,
+      "grad_norm": 0.00932050310075283,
+      "learning_rate": 0.00012844444444444446,
+      "loss": 0.0031,
+      "step": 805
+    },
+    {
+      "epoch": 5.4,
+      "grad_norm": 0.009178749285638332,
+      "learning_rate": 0.00012800000000000002,
+      "loss": 0.0025,
+      "step": 810
+    },
+    {
+      "epoch": 5.433333333333334,
+      "grad_norm": 0.008462225086987019,
+      "learning_rate": 0.00012755555555555556,
+      "loss": 0.0024,
+      "step": 815
+    },
+    {
+      "epoch": 5.466666666666667,
+      "grad_norm": 0.009262710809707642,
+      "learning_rate": 0.00012711111111111113,
+      "loss": 0.0392,
+      "step": 820
+    },
+    {
+      "epoch": 5.5,
+      "grad_norm": 0.008442485705018044,
+      "learning_rate": 0.00012666666666666666,
+      "loss": 0.0023,
+      "step": 825
+    },
+    {
+      "epoch": 5.533333333333333,
+      "grad_norm": 0.016601471230387688,
+      "learning_rate": 0.00012622222222222223,
+      "loss": 0.0146,
+      "step": 830
+    },
+    {
+      "epoch": 5.566666666666666,
+      "grad_norm": 0.008341116830706596,
+      "learning_rate": 0.0001257777777777778,
+      "loss": 0.0023,
+      "step": 835
+    },
+    {
+      "epoch": 5.6,
+      "grad_norm": 0.008404243737459183,
+      "learning_rate": 0.00012533333333333334,
+      "loss": 0.0023,
+      "step": 840
+    },
+    {
+      "epoch": 5.633333333333333,
+      "grad_norm": 0.05369413644075394,
+      "learning_rate": 0.0001248888888888889,
+      "loss": 0.0026,
+      "step": 845
+    },
+    {
+      "epoch": 5.666666666666667,
+      "grad_norm": 1.255624532699585,
+      "learning_rate": 0.00012444444444444444,
+      "loss": 0.0301,
+      "step": 850
+    },
+    {
+      "epoch": 5.7,
+      "grad_norm": 0.07422761619091034,
+      "learning_rate": 0.000124,
+      "loss": 0.0024,
+      "step": 855
+    },
+    {
+      "epoch": 5.733333333333333,
+      "grad_norm": 0.4810630679130554,
+      "learning_rate": 0.00012355555555555557,
+      "loss": 0.0043,
+      "step": 860
+    },
+    {
+      "epoch": 5.766666666666667,
+      "grad_norm": 0.009810343384742737,
+      "learning_rate": 0.0001231111111111111,
+      "loss": 0.0159,
+      "step": 865
+    },
+    {
+      "epoch": 5.8,
+      "grad_norm": 0.007798292208462954,
+      "learning_rate": 0.00012266666666666668,
+      "loss": 0.011,
+      "step": 870
+    },
+    {
+      "epoch": 5.833333333333333,
+      "grad_norm": 2.5725393295288086,
+      "learning_rate": 0.00012222222222222224,
+      "loss": 0.0323,
+      "step": 875
+    },
+    {
+      "epoch": 5.866666666666667,
+      "grad_norm": 0.1737234741449356,
+      "learning_rate": 0.0001217777777777778,
+      "loss": 0.0025,
+      "step": 880
+    },
+    {
+      "epoch": 5.9,
+      "grad_norm": 0.10713543742895126,
+      "learning_rate": 0.00012133333333333335,
+      "loss": 0.0137,
+      "step": 885
+    },
+    {
+      "epoch": 5.933333333333334,
+      "grad_norm": 0.0170454028993845,
+      "learning_rate": 0.0001208888888888889,
+      "loss": 0.0024,
+      "step": 890
+    },
+    {
+      "epoch": 5.966666666666667,
+      "grad_norm": 0.008214999921619892,
+      "learning_rate": 0.00012044444444444445,
+      "loss": 0.0227,
+      "step": 895
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.008387326262891293,
+      "learning_rate": 0.00012,
+      "loss": 0.0021,
+      "step": 900
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.8758333333333334,
+      "eval_f1": 0.8729622165068617,
+      "eval_loss": 0.5939724445343018,
+      "eval_precision": 0.898484934932439,
+      "eval_recall": 0.8758333333333334,
+      "eval_runtime": 9.3035,
+      "eval_samples_per_second": 128.984,
+      "eval_steps_per_second": 16.123,
+      "step": 900
+    },
+    {
+      "epoch": 6.033333333333333,
+      "grad_norm": 0.00791590753942728,
+      "learning_rate": 0.00011955555555555556,
+      "loss": 0.087,
+      "step": 905
+    },
+    {
+      "epoch": 6.066666666666666,
+      "grad_norm": 0.06848917156457901,
+      "learning_rate": 0.00011911111111111111,
+      "loss": 0.0276,
+      "step": 910
+    },
+    {
+      "epoch": 6.1,
+      "grad_norm": 0.029530571773648262,
+      "learning_rate": 0.00011866666666666669,
+      "loss": 0.0032,
+      "step": 915
+    },
+    {
+      "epoch": 6.133333333333334,
+      "grad_norm": 0.07200734317302704,
+      "learning_rate": 0.00011822222222222224,
+      "loss": 0.0066,
+      "step": 920
+    },
+    {
+      "epoch": 6.166666666666667,
+      "grad_norm": 0.013201626017689705,
+      "learning_rate": 0.00011777777777777779,
+      "loss": 0.0263,
+      "step": 925
+    },
+    {
+      "epoch": 6.2,
+      "grad_norm": 0.055678680539131165,
+      "learning_rate": 0.00011733333333333334,
+      "loss": 0.0027,
+      "step": 930
+    },
+    {
+      "epoch": 6.233333333333333,
+      "grad_norm": 7.015408515930176,
+      "learning_rate": 0.0001168888888888889,
+      "loss": 0.0369,
+      "step": 935
+    },
+    {
+      "epoch": 6.266666666666667,
+      "grad_norm": 0.009604562073946,
+      "learning_rate": 0.00011644444444444445,
+      "loss": 0.0023,
+      "step": 940
+    },
+    {
+      "epoch": 6.3,
+      "grad_norm": 0.008789158426225185,
+      "learning_rate": 0.000116,
+      "loss": 0.0027,
+      "step": 945
+    },
+    {
+      "epoch": 6.333333333333333,
+      "grad_norm": 0.009867105633020401,
+      "learning_rate": 0.00011555555555555555,
+      "loss": 0.025,
+      "step": 950
+    },
+    {
+      "epoch": 6.366666666666666,
+      "grad_norm": 0.09046179801225662,
+      "learning_rate": 0.00011511111111111112,
+      "loss": 0.021,
+      "step": 955
+    },
+    {
+      "epoch": 6.4,
+      "grad_norm": 0.2990459203720093,
+      "learning_rate": 0.00011466666666666667,
+      "loss": 0.0437,
+      "step": 960
+    },
+    {
+      "epoch": 6.433333333333334,
+      "grad_norm": 2.726165533065796,
+      "learning_rate": 0.00011422222222222224,
+      "loss": 0.0033,
+      "step": 965
+    },
+    {
+      "epoch": 6.466666666666667,
+      "grad_norm": 0.23076964914798737,
+      "learning_rate": 0.00011377777777777779,
+      "loss": 0.0343,
+      "step": 970
+    },
+    {
+      "epoch": 6.5,
+      "grad_norm": 0.04384481534361839,
+      "learning_rate": 0.00011333333333333334,
+      "loss": 0.0152,
+      "step": 975
+    },
+    {
+      "epoch": 6.533333333333333,
+      "grad_norm": 0.007463160436600447,
+      "learning_rate": 0.0001128888888888889,
+      "loss": 0.0019,
+      "step": 980
+    },
+    {
+      "epoch": 6.566666666666666,
+      "grad_norm": 0.015924058854579926,
+      "learning_rate": 0.00011244444444444445,
+      "loss": 0.0019,
+      "step": 985
+    },
+    {
+      "epoch": 6.6,
+      "grad_norm": 0.007377041503787041,
+      "learning_rate": 0.00011200000000000001,
+      "loss": 0.0019,
+      "step": 990
+    },
+    {
+      "epoch": 6.633333333333333,
+      "grad_norm": 0.04355741664767265,
+      "learning_rate": 0.00011155555555555556,
+      "loss": 0.0379,
+      "step": 995
+    },
+    {
+      "epoch": 6.666666666666667,
+      "grad_norm": 3.4687533378601074,
+      "learning_rate": 0.00011111111111111112,
+      "loss": 0.054,
+      "step": 1000
+    },
+    {
+      "epoch": 6.666666666666667,
+      "eval_accuracy": 0.8733333333333333,
+      "eval_f1": 0.8714263425298715,
+      "eval_loss": 0.7463460564613342,
+      "eval_precision": 0.9067771826890199,
+      "eval_recall": 0.8733333333333333,
+      "eval_runtime": 9.3125,
+      "eval_samples_per_second": 128.859,
+      "eval_steps_per_second": 16.107,
+      "step": 1000
+    },
+    {
+      "epoch": 6.7,
+      "grad_norm": 0.00824672356247902,
+      "learning_rate": 0.00011066666666666667,
+      "loss": 0.002,
+      "step": 1005
+    },
+    {
+      "epoch": 6.733333333333333,
+      "grad_norm": 1.1352989673614502,
+      "learning_rate": 0.00011022222222222222,
+      "loss": 0.046,
+      "step": 1010
+    },
+    {
+      "epoch": 6.766666666666667,
+      "grad_norm": 0.010397437028586864,
+      "learning_rate": 0.00010977777777777777,
+      "loss": 0.0027,
+      "step": 1015
+    },
+    {
+      "epoch": 6.8,
+      "grad_norm": 0.12541429698467255,
+      "learning_rate": 0.00010933333333333333,
+      "loss": 0.003,
+      "step": 1020
+    },
+    {
+      "epoch": 6.833333333333333,
+      "grad_norm": 0.009704334661364555,
+      "learning_rate": 0.00010888888888888889,
+      "loss": 0.0174,
+      "step": 1025
+    },
+    {
+      "epoch": 6.866666666666667,
+      "grad_norm": 0.011578983627259731,
+      "learning_rate": 0.00010844444444444446,
+      "loss": 0.0161,
+      "step": 1030
+    },
+    {
+      "epoch": 6.9,
+      "grad_norm": 0.009918139316141605,
+      "learning_rate": 0.00010800000000000001,
+      "loss": 0.0022,
+      "step": 1035
+    },
+    {
+      "epoch": 6.933333333333334,
+      "grad_norm": 0.007548391819000244,
+      "learning_rate": 0.00010755555555555556,
+      "loss": 0.002,
+      "step": 1040
+    },
+    {
+      "epoch": 6.966666666666667,
+      "grad_norm": 0.007331849075853825,
+      "learning_rate": 0.00010711111111111111,
+      "loss": 0.002,
+      "step": 1045
+    },
+    {
+      "epoch": 7.0,
+      "grad_norm": 0.008315078914165497,
+      "learning_rate": 0.00010666666666666667,
+      "loss": 0.002,
+      "step": 1050
+    },
+    {
+      "epoch": 7.033333333333333,
+      "grad_norm": 0.007235812954604626,
+      "learning_rate": 0.00010622222222222222,
+      "loss": 0.0023,
+      "step": 1055
+    },
+    {
+      "epoch": 7.066666666666666,
+      "grad_norm": 0.0062900567427277565,
+      "learning_rate": 0.00010577777777777777,
+      "loss": 0.0017,
+      "step": 1060
+    },
+    {
+      "epoch": 7.1,
+      "grad_norm": 0.009442648850381374,
+      "learning_rate": 0.00010533333333333332,
+      "loss": 0.0018,
+      "step": 1065
+    },
+    {
+      "epoch": 7.133333333333334,
+      "grad_norm": 0.006661895662546158,
+      "learning_rate": 0.0001048888888888889,
+      "loss": 0.0018,
+      "step": 1070
+    },
+    {
+      "epoch": 7.166666666666667,
+      "grad_norm": 0.009286611340939999,
+      "learning_rate": 0.00010444444444444445,
+      "loss": 0.0018,
+      "step": 1075
+    },
+    {
+      "epoch": 7.2,
+      "grad_norm": 0.006994432769715786,
+      "learning_rate": 0.00010400000000000001,
+      "loss": 0.0016,
+      "step": 1080
+    },
+    {
+      "epoch": 7.233333333333333,
+      "grad_norm": 0.006040586624294519,
+      "learning_rate": 0.00010355555555555556,
+      "loss": 0.0016,
+      "step": 1085
+    },
+    {
+      "epoch": 7.266666666666667,
+      "grad_norm": 0.005971700418740511,
+      "learning_rate": 0.00010311111111111111,
+      "loss": 0.0027,
+      "step": 1090
+    },
+    {
+      "epoch": 7.3,
+      "grad_norm": 0.0062834471464157104,
+      "learning_rate": 0.00010266666666666666,
+      "loss": 0.0023,
+      "step": 1095
+    },
+    {
+      "epoch": 7.333333333333333,
+      "grad_norm": 0.005578146781772375,
+      "learning_rate": 0.00010222222222222222,
+      "loss": 0.0015,
+      "step": 1100
+    },
+    {
+      "epoch": 7.333333333333333,
+      "eval_accuracy": 0.8391666666666666,
+      "eval_f1": 0.8243070430866863,
+      "eval_loss": 0.891526997089386,
+      "eval_precision": 0.8721606965107342,
+      "eval_recall": 0.8391666666666666,
+      "eval_runtime": 9.3025,
+      "eval_samples_per_second": 128.998,
+      "eval_steps_per_second": 16.125,
+      "step": 1100
+    },
+    {
+      "epoch": 7.366666666666666,
+      "grad_norm": 0.005597973708063364,
+      "learning_rate": 0.00010177777777777777,
+      "loss": 0.0223,
+      "step": 1105
+    },
+    {
+      "epoch": 7.4,
+      "grad_norm": 0.006115862168371677,
+      "learning_rate": 0.00010133333333333335,
+      "loss": 0.0015,
+      "step": 1110
+    },
+    {
+      "epoch": 7.433333333333334,
+      "grad_norm": 0.013728507794439793,
+      "learning_rate": 0.0001008888888888889,
+      "loss": 0.002,
+      "step": 1115
+    },
+    {
+      "epoch": 7.466666666666667,
+      "grad_norm": 0.005637649912387133,
+      "learning_rate": 0.00010044444444444445,
+      "loss": 0.0016,
+      "step": 1120
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 0.08930695056915283,
+      "learning_rate": 0.0001,
+      "loss": 0.0018,
+      "step": 1125
+    },
+    {
+      "epoch": 7.533333333333333,
+      "grad_norm": 0.011366425082087517,
+      "learning_rate": 9.955555555555556e-05,
+      "loss": 0.0016,
+      "step": 1130
+    },
+    {
+      "epoch": 7.566666666666666,
+      "grad_norm": 0.005648004822432995,
+      "learning_rate": 9.911111111111112e-05,
+      "loss": 0.0017,
+      "step": 1135
+    },
+    {
+      "epoch": 7.6,
+      "grad_norm": 0.05995195358991623,
+      "learning_rate": 9.866666666666668e-05,
+      "loss": 0.0015,
+      "step": 1140
+    },
+    {
+      "epoch": 7.633333333333333,
+      "grad_norm": 0.00540464511141181,
+      "learning_rate": 9.822222222222223e-05,
+      "loss": 0.0015,
+      "step": 1145
+    },
+    {
+      "epoch": 7.666666666666667,
+      "grad_norm": 0.00566456513479352,
+      "learning_rate": 9.777777777777778e-05,
+      "loss": 0.0014,
+      "step": 1150
+    },
+    {
+      "epoch": 7.7,
+      "grad_norm": 0.004974729381501675,
+      "learning_rate": 9.733333333333335e-05,
+      "loss": 0.0014,
+      "step": 1155
+    },
+    {
+      "epoch": 7.733333333333333,
+      "grad_norm": 0.0054329875856637955,
+      "learning_rate": 9.68888888888889e-05,
+      "loss": 0.0014,
+      "step": 1160
+    },
+    {
+      "epoch": 7.766666666666667,
+      "grad_norm": 0.0056770662777125835,
+      "learning_rate": 9.644444444444445e-05,
+      "loss": 0.0014,
+      "step": 1165
+    },
+    {
+      "epoch": 7.8,
+      "grad_norm": 0.005198624450713396,
+      "learning_rate": 9.6e-05,
+      "loss": 0.0013,
+      "step": 1170
+    },
+    {
+      "epoch": 7.833333333333333,
+      "grad_norm": 0.004914712626487017,
+      "learning_rate": 9.555555555555557e-05,
+      "loss": 0.0013,
+      "step": 1175
+    },
+    {
+      "epoch": 7.866666666666667,
+      "grad_norm": 0.0047191414050757885,
+      "learning_rate": 9.511111111111112e-05,
+      "loss": 0.0015,
+      "step": 1180
+    },
+    {
+      "epoch": 7.9,
+      "grad_norm": 0.005518757738173008,
+      "learning_rate": 9.466666666666667e-05,
+      "loss": 0.0013,
+      "step": 1185
+    },
+    {
+      "epoch": 7.933333333333334,
+      "grad_norm": 0.0048125083558261395,
+      "learning_rate": 9.422222222222223e-05,
+      "loss": 0.0013,
+      "step": 1190
+    },
+    {
+      "epoch": 7.966666666666667,
+      "grad_norm": 0.004910661838948727,
+      "learning_rate": 9.377777777777779e-05,
+      "loss": 0.0013,
+      "step": 1195
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 0.0058229537680745125,
+      "learning_rate": 9.333333333333334e-05,
+      "loss": 0.0013,
+      "step": 1200
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.8916666666666667,
+      "eval_f1": 0.8909265231423857,
+      "eval_loss": 0.5724537372589111,
+      "eval_precision": 0.8942914839150495,
+      "eval_recall": 0.8916666666666667,
+      "eval_runtime": 9.3088,
+      "eval_samples_per_second": 128.91,
+      "eval_steps_per_second": 16.114,
+      "step": 1200
+    },
+    {
+      "epoch": 8.033333333333333,
+      "grad_norm": 0.004967012442648411,
+      "learning_rate": 9.28888888888889e-05,
+      "loss": 0.0012,
+      "step": 1205
+    },
+    {
+      "epoch": 8.066666666666666,
+      "grad_norm": 0.004711432848125696,
+      "learning_rate": 9.244444444444445e-05,
+      "loss": 0.0012,
+      "step": 1210
+    },
+    {
+      "epoch": 8.1,
+      "grad_norm": 0.004830135498195887,
+      "learning_rate": 9.200000000000001e-05,
+      "loss": 0.0012,
+      "step": 1215
+    },
+    {
+      "epoch": 8.133333333333333,
+      "grad_norm": 0.004584715235978365,
+      "learning_rate": 9.155555555555557e-05,
+      "loss": 0.0012,
+      "step": 1220
+    },
+    {
+      "epoch": 8.166666666666666,
+      "grad_norm": 0.004607088398188353,
+      "learning_rate": 9.111111111111112e-05,
+      "loss": 0.0012,
+      "step": 1225
+    },
+    {
+      "epoch": 8.2,
+      "grad_norm": 0.004765022080391645,
+      "learning_rate": 9.066666666666667e-05,
+      "loss": 0.0012,
+      "step": 1230
+    },
+    {
+      "epoch": 8.233333333333333,
+      "grad_norm": 0.00450365012511611,
+      "learning_rate": 9.022222222222224e-05,
+      "loss": 0.0012,
+      "step": 1235
+    },
+    {
+      "epoch": 8.266666666666667,
+      "grad_norm": 0.00442493474110961,
+      "learning_rate": 8.977777777777779e-05,
+      "loss": 0.0012,
+      "step": 1240
+    },
+    {
+      "epoch": 8.3,
+      "grad_norm": 0.004229240119457245,
+      "learning_rate": 8.933333333333334e-05,
+      "loss": 0.0012,
+      "step": 1245
+    },
+    {
+      "epoch": 8.333333333333334,
+      "grad_norm": 0.004398667719215155,
+      "learning_rate": 8.888888888888889e-05,
+      "loss": 0.0012,
+      "step": 1250
+    },
+    {
+      "epoch": 8.366666666666667,
+      "grad_norm": 0.0043451967649161816,
+      "learning_rate": 8.844444444444445e-05,
+      "loss": 0.0012,
+      "step": 1255
+    },
+    {
+      "epoch": 8.4,
+      "grad_norm": 0.004788388032466173,
+      "learning_rate": 8.800000000000001e-05,
+      "loss": 0.0012,
+      "step": 1260
+    },
+    {
+      "epoch": 8.433333333333334,
+      "grad_norm": 0.004544616676867008,
+      "learning_rate": 8.755555555555556e-05,
+      "loss": 0.0011,
+      "step": 1265
+    },
+    {
+      "epoch": 8.466666666666667,
+      "grad_norm": 0.0046425084583461285,
+      "learning_rate": 8.711111111111112e-05,
+      "loss": 0.0011,
+      "step": 1270
+    },
+    {
+      "epoch": 8.5,
+      "grad_norm": 0.004542881157249212,
+      "learning_rate": 8.666666666666667e-05,
+      "loss": 0.0011,
+      "step": 1275
+    },
+    {
+      "epoch": 8.533333333333333,
+      "grad_norm": 0.0045421249233186245,
+      "learning_rate": 8.622222222222222e-05,
+      "loss": 0.0011,
+      "step": 1280
+    },
+    {
+      "epoch": 8.566666666666666,
+      "grad_norm": 0.00405106833204627,
+      "learning_rate": 8.577777777777777e-05,
+      "loss": 0.0011,
+      "step": 1285
+    },
+    {
+      "epoch": 8.6,
+      "grad_norm": 0.004394896794110537,
+      "learning_rate": 8.533333333333334e-05,
+      "loss": 0.0011,
+      "step": 1290
+    },
+    {
+      "epoch": 8.633333333333333,
+      "grad_norm": 0.004041062202304602,
+      "learning_rate": 8.488888888888889e-05,
+      "loss": 0.0011,
+      "step": 1295
+    },
+    {
+      "epoch": 8.666666666666666,
+      "grad_norm": 0.004039550665766001,
+      "learning_rate": 8.444444444444444e-05,
+      "loss": 0.0011,
+      "step": 1300
+    },
+    {
+      "epoch": 8.666666666666666,
+      "eval_accuracy": 0.8933333333333333,
+      "eval_f1": 0.8925802394981491,
+      "eval_loss": 0.5772183537483215,
+      "eval_precision": 0.8960185823716569,
+      "eval_recall": 0.8933333333333333,
+      "eval_runtime": 9.1025,
+      "eval_samples_per_second": 131.831,
+      "eval_steps_per_second": 16.479,
+      "step": 1300
+    },
+    {
+      "epoch": 8.7,
+      "grad_norm": 0.004443011712282896,
+      "learning_rate": 8.4e-05,
+      "loss": 0.0011,
+      "step": 1305
+    },
+    {
+      "epoch": 8.733333333333333,
+      "grad_norm": 0.004374076146632433,
+      "learning_rate": 8.355555555555556e-05,
+      "loss": 0.0011,
+      "step": 1310
+    },
+    {
+      "epoch": 8.766666666666667,
+      "grad_norm": 0.0038743040058761835,
+      "learning_rate": 8.311111111111111e-05,
+      "loss": 0.001,
+      "step": 1315
+    },
+    {
+      "epoch": 8.8,
+      "grad_norm": 0.004188260063529015,
+      "learning_rate": 8.266666666666667e-05,
+      "loss": 0.0011,
+      "step": 1320
+    },
+    {
+      "epoch": 8.833333333333334,
+      "grad_norm": 0.0040704007260501385,
+      "learning_rate": 8.222222222222222e-05,
+      "loss": 0.001,
+      "step": 1325
+    },
+    {
+      "epoch": 8.866666666666667,
+      "grad_norm": 0.004075295757502317,
+      "learning_rate": 8.177777777777778e-05,
+      "loss": 0.001,
+      "step": 1330
+    },
+    {
+      "epoch": 8.9,
+      "grad_norm": 0.00430710194632411,
+      "learning_rate": 8.133333333333334e-05,
+      "loss": 0.001,
+      "step": 1335
+    },
+    {
+      "epoch": 8.933333333333334,
+      "grad_norm": 0.004175866488367319,
+      "learning_rate": 8.088888888888889e-05,
+      "loss": 0.001,
+      "step": 1340
+    },
+    {
+      "epoch": 8.966666666666667,
+      "grad_norm": 0.0053639840334653854,
+      "learning_rate": 8.044444444444444e-05,
+      "loss": 0.001,
+      "step": 1345
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 0.004056727048009634,
+      "learning_rate": 8e-05,
+      "loss": 0.001,
+      "step": 1350
+    },
+    {
+      "epoch": 9.033333333333333,
+      "grad_norm": 0.004028972238302231,
+      "learning_rate": 7.955555555555556e-05,
+      "loss": 0.001,
+      "step": 1355
+    },
+    {
+      "epoch": 9.066666666666666,
+      "grad_norm": 0.0037729782052338123,
+      "learning_rate": 7.911111111111111e-05,
+      "loss": 0.001,
+      "step": 1360
+    },
+    {
+      "epoch": 9.1,
+      "grad_norm": 0.004193460568785667,
+      "learning_rate": 7.866666666666666e-05,
+      "loss": 0.001,
+      "step": 1365
+    },
+    {
+      "epoch": 9.133333333333333,
+      "grad_norm": 0.0038553299382328987,
+      "learning_rate": 7.822222222222223e-05,
+      "loss": 0.001,
+      "step": 1370
+    },
+    {
+      "epoch": 9.166666666666666,
+      "grad_norm": 0.0036280308850109577,
+      "learning_rate": 7.777777777777778e-05,
+      "loss": 0.001,
+      "step": 1375
+    },
+    {
+      "epoch": 9.2,
+      "grad_norm": 0.003699022112414241,
+      "learning_rate": 7.733333333333333e-05,
+      "loss": 0.001,
+      "step": 1380
+    },
+    {
+      "epoch": 9.233333333333333,
+      "grad_norm": 0.00368758337572217,
+      "learning_rate": 7.688888888888889e-05,
+      "loss": 0.001,
+      "step": 1385
+    },
+    {
+      "epoch": 9.266666666666667,
+      "grad_norm": 0.0037148615811020136,
+      "learning_rate": 7.644444444444445e-05,
+      "loss": 0.001,
+      "step": 1390
+    },
+    {
+      "epoch": 9.3,
+      "grad_norm": 0.0037871082313358784,
+      "learning_rate": 7.6e-05,
+      "loss": 0.001,
+      "step": 1395
+    },
+    {
+      "epoch": 9.333333333333334,
+      "grad_norm": 0.004081921651959419,
+      "learning_rate": 7.555555555555556e-05,
+      "loss": 0.001,
+      "step": 1400
+    },
+    {
+      "epoch": 9.333333333333334,
+      "eval_accuracy": 0.8933333333333333,
+      "eval_f1": 0.8925776922397967,
+      "eval_loss": 0.582018256187439,
+      "eval_precision": 0.8956392450490401,
+      "eval_recall": 0.8933333333333333,
+      "eval_runtime": 9.1239,
+      "eval_samples_per_second": 131.522,
+      "eval_steps_per_second": 16.44,
+      "step": 1400
+    },
+    {
+      "epoch": 9.366666666666667,
+      "grad_norm": 0.003595563583076,
+      "learning_rate": 7.511111111111111e-05,
+      "loss": 0.0009,
+      "step": 1405
+    },
+    {
+      "epoch": 9.4,
+      "grad_norm": 0.0034805000759661198,
+      "learning_rate": 7.466666666666667e-05,
+      "loss": 0.001,
+      "step": 1410
+    },
+    {
+      "epoch": 9.433333333333334,
+      "grad_norm": 0.00359420501627028,
+      "learning_rate": 7.422222222222223e-05,
+      "loss": 0.0009,
+      "step": 1415
+    },
+    {
+      "epoch": 9.466666666666667,
+      "grad_norm": 0.003963571507483721,
+      "learning_rate": 7.377777777777778e-05,
+      "loss": 0.0009,
+      "step": 1420
+    },
+    {
+      "epoch": 9.5,
+      "grad_norm": 0.003502447856590152,
+      "learning_rate": 7.333333333333333e-05,
+      "loss": 0.0009,
+      "step": 1425
+    },
+    {
+      "epoch": 9.533333333333333,
+      "grad_norm": 0.0036816669162362814,
+      "learning_rate": 7.28888888888889e-05,
+      "loss": 0.0009,
+      "step": 1430
+    },
+    {
+      "epoch": 9.566666666666666,
+      "grad_norm": 0.0036629538517445326,
+      "learning_rate": 7.244444444444445e-05,
+      "loss": 0.0009,
+      "step": 1435
+    },
+    {
+      "epoch": 9.6,
+      "grad_norm": 0.003758195787668228,
+      "learning_rate": 7.2e-05,
+      "loss": 0.001,
+      "step": 1440
+    },
+    {
+      "epoch": 9.633333333333333,
+      "grad_norm": 0.0035963943228125572,
+      "learning_rate": 7.155555555555555e-05,
+      "loss": 0.0009,
+      "step": 1445
+    },
+    {
+      "epoch": 9.666666666666666,
+      "grad_norm": 0.0038454525638371706,
+      "learning_rate": 7.111111111111112e-05,
+      "loss": 0.0009,
+      "step": 1450
+    },
+    {
+      "epoch": 9.7,
+      "grad_norm": 0.003494839882478118,
+      "learning_rate": 7.066666666666667e-05,
+      "loss": 0.0009,
+      "step": 1455
+    },
+    {
+      "epoch": 9.733333333333333,
+      "grad_norm": 0.0035291763488203287,
+      "learning_rate": 7.022222222222222e-05,
+      "loss": 0.0009,
+      "step": 1460
+    },
+    {
+      "epoch": 9.766666666666667,
+      "grad_norm": 0.0035067368298768997,
+      "learning_rate": 6.977777777777779e-05,
+      "loss": 0.0009,
+      "step": 1465
+    },
+    {
+      "epoch": 9.8,
+      "grad_norm": 0.0036443774588406086,
+      "learning_rate": 6.933333333333334e-05,
+      "loss": 0.0009,
+      "step": 1470
+    },
+    {
+      "epoch": 9.833333333333334,
+      "grad_norm": 0.003411532612517476,
+      "learning_rate": 6.88888888888889e-05,
+      "loss": 0.0009,
+      "step": 1475
+    },
+    {
+      "epoch": 9.866666666666667,
+      "grad_norm": 0.0036557214334607124,
+      "learning_rate": 6.844444444444445e-05,
+      "loss": 0.0009,
+      "step": 1480
+    },
+    {
+      "epoch": 9.9,
+      "grad_norm": 0.003884528297930956,
+      "learning_rate": 6.800000000000001e-05,
+      "loss": 0.0009,
+      "step": 1485
+    },
+    {
+      "epoch": 9.933333333333334,
+      "grad_norm": 0.003588372375816107,
+      "learning_rate": 6.755555555555557e-05,
+      "loss": 0.0009,
+      "step": 1490
+    },
+    {
+      "epoch": 9.966666666666667,
+      "grad_norm": 0.0034349607303738594,
+      "learning_rate": 6.711111111111112e-05,
+      "loss": 0.0009,
+      "step": 1495
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.0033968989737331867,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.0009,
+      "step": 1500
+    },
+    {
+      "epoch": 10.0,
+      "eval_accuracy": 0.8933333333333333,
+      "eval_f1": 0.8924770543900274,
+      "eval_loss": 0.5859149098396301,
+      "eval_precision": 0.8954137223601668,
+      "eval_recall": 0.8933333333333333,
+      "eval_runtime": 9.2287,
+      "eval_samples_per_second": 130.029,
+      "eval_steps_per_second": 16.254,
+      "step": 1500
+    },
+    {
+      "epoch": 10.033333333333333,
+      "grad_norm": 0.0033387893345206976,
+      "learning_rate": 6.622222222222224e-05,
+      "loss": 0.0009,
+      "step": 1505
+    },
+    {
+      "epoch": 10.066666666666666,
+      "grad_norm": 0.003473068820312619,
+      "learning_rate": 6.577777777777779e-05,
+      "loss": 0.0009,
+      "step": 1510
+    },
+    {
+      "epoch": 10.1,
+      "grad_norm": 0.0033902309369295835,
+      "learning_rate": 6.533333333333334e-05,
+      "loss": 0.0009,
+      "step": 1515
+    },
+    {
+      "epoch": 10.133333333333333,
+      "grad_norm": 0.0031655190978199244,
+      "learning_rate": 6.488888888888889e-05,
+      "loss": 0.0009,
+      "step": 1520
+    },
+    {
+      "epoch": 10.166666666666666,
+      "grad_norm": 0.003475926583632827,
+      "learning_rate": 6.444444444444446e-05,
+      "loss": 0.0008,
+      "step": 1525
+    },
+    {
+      "epoch": 10.2,
+      "grad_norm": 0.003318113274872303,
+      "learning_rate": 6.400000000000001e-05,
+      "loss": 0.0008,
+      "step": 1530
+    },
+    {
+      "epoch": 10.233333333333333,
+      "grad_norm": 0.0037339297123253345,
+      "learning_rate": 6.355555555555556e-05,
+      "loss": 0.0008,
+      "step": 1535
+    },
+    {
+      "epoch": 10.266666666666667,
+      "grad_norm": 0.003497259458526969,
+      "learning_rate": 6.311111111111112e-05,
+      "loss": 0.0009,
+      "step": 1540
+    },
+    {
+      "epoch": 10.3,
+      "grad_norm": 0.0031404101755470037,
+      "learning_rate": 6.266666666666667e-05,
+      "loss": 0.0008,
+      "step": 1545
+    },
+    {
+      "epoch": 10.333333333333334,
+      "grad_norm": 0.003206310560926795,
+      "learning_rate": 6.222222222222222e-05,
+      "loss": 0.0008,
+      "step": 1550
+    },
+    {
+      "epoch": 10.366666666666667,
+      "grad_norm": 0.003239595564082265,
+      "learning_rate": 6.177777777777779e-05,
+      "loss": 0.0008,
+      "step": 1555
+    },
+    {
+      "epoch": 10.4,
+      "grad_norm": 0.0032086025457829237,
+      "learning_rate": 6.133333333333334e-05,
+      "loss": 0.0008,
+      "step": 1560
+    },
+    {
+      "epoch": 10.433333333333334,
+      "grad_norm": 0.0031386413611471653,
+      "learning_rate": 6.08888888888889e-05,
+      "loss": 0.0008,
+      "step": 1565
+    },
+    {
+      "epoch": 10.466666666666667,
+      "grad_norm": 0.0030272870790213346,
+      "learning_rate": 6.044444444444445e-05,
+      "loss": 0.0008,
+      "step": 1570
+    },
+    {
+      "epoch": 10.5,
+      "grad_norm": 0.0032087203580886126,
+      "learning_rate": 6e-05,
+      "loss": 0.0008,
+      "step": 1575
+    },
+    {
+      "epoch": 10.533333333333333,
+      "grad_norm": 0.003076587338000536,
+      "learning_rate": 5.9555555555555554e-05,
+      "loss": 0.0008,
+      "step": 1580
+    },
+    {
+      "epoch": 10.566666666666666,
+      "grad_norm": 0.0031864922493696213,
+      "learning_rate": 5.911111111111112e-05,
+      "loss": 0.0008,
+      "step": 1585
+    },
+    {
+      "epoch": 10.6,
+      "grad_norm": 0.003065708791837096,
+      "learning_rate": 5.866666666666667e-05,
+      "loss": 0.0008,
+      "step": 1590
+    },
+    {
+      "epoch": 10.633333333333333,
+      "grad_norm": 0.003103989874944091,
+      "learning_rate": 5.8222222222222224e-05,
+      "loss": 0.0008,
+      "step": 1595
+    },
+    {
+      "epoch": 10.666666666666666,
+      "grad_norm": 0.0032167991157621145,
+      "learning_rate": 5.7777777777777776e-05,
+      "loss": 0.0008,
+      "step": 1600
+    },
+    {
+      "epoch": 10.666666666666666,
+      "eval_accuracy": 0.8933333333333333,
+      "eval_f1": 0.892564314412375,
+      "eval_loss": 0.5900735259056091,
+      "eval_precision": 0.895467423351318,
+      "eval_recall": 0.8933333333333333,
+      "eval_runtime": 9.1411,
+      "eval_samples_per_second": 131.276,
+      "eval_steps_per_second": 16.409,
+      "step": 1600
+    },
+    {
+      "epoch": 10.7,
+      "grad_norm": 0.0030533717945218086,
+      "learning_rate": 5.7333333333333336e-05,
+      "loss": 0.0008,
+      "step": 1605
+    },
+    {
+      "epoch": 10.733333333333333,
+      "grad_norm": 0.0032962567638605833,
+      "learning_rate": 5.6888888888888895e-05,
+      "loss": 0.0008,
+      "step": 1610
+    },
+    {
+      "epoch": 10.766666666666667,
+      "grad_norm": 0.0032729010563343763,
+      "learning_rate": 5.644444444444445e-05,
+      "loss": 0.0008,
+      "step": 1615
+    },
+    {
+      "epoch": 10.8,
+      "grad_norm": 0.003075138432905078,
+      "learning_rate": 5.6000000000000006e-05,
+      "loss": 0.0008,
+      "step": 1620
+    },
+    {
+      "epoch": 10.833333333333334,
+      "grad_norm": 0.0031263744458556175,
+      "learning_rate": 5.555555555555556e-05,
+      "loss": 0.0008,
+      "step": 1625
+    },
+    {
+      "epoch": 10.866666666666667,
+      "grad_norm": 0.003029222832992673,
+      "learning_rate": 5.511111111111111e-05,
+      "loss": 0.0008,
+      "step": 1630
+    },
+    {
+      "epoch": 10.9,
+      "grad_norm": 0.003155304118990898,
+      "learning_rate": 5.466666666666666e-05,
+      "loss": 0.0008,
+      "step": 1635
+    },
+    {
+      "epoch": 10.933333333333334,
+      "grad_norm": 0.0031627577263861895,
+      "learning_rate": 5.422222222222223e-05,
+      "loss": 0.0008,
+      "step": 1640
+    },
+    {
+      "epoch": 10.966666666666667,
+      "grad_norm": 0.002991090528666973,
+      "learning_rate": 5.377777777777778e-05,
+      "loss": 0.0008,
+      "step": 1645
+    },
+    {
+      "epoch": 11.0,
+      "grad_norm": 0.0030046808533370495,
+      "learning_rate": 5.333333333333333e-05,
+      "loss": 0.0008,
+      "step": 1650
+    },
+    {
+      "epoch": 11.033333333333333,
+      "grad_norm": 0.0029926581773906946,
+      "learning_rate": 5.2888888888888885e-05,
+      "loss": 0.0008,
+      "step": 1655
+    },
+    {
+      "epoch": 11.066666666666666,
+      "grad_norm": 0.0029577272944152355,
+      "learning_rate": 5.244444444444445e-05,
+      "loss": 0.0008,
+      "step": 1660
+    },
+    {
+      "epoch": 11.1,
+      "grad_norm": 0.00291816215030849,
+      "learning_rate": 5.2000000000000004e-05,
+      "loss": 0.0008,
+      "step": 1665
+    },
+    {
+      "epoch": 11.133333333333333,
+      "grad_norm": 0.0028882354963570833,
+      "learning_rate": 5.1555555555555556e-05,
+      "loss": 0.0008,
+      "step": 1670
+    },
+    {
+      "epoch": 11.166666666666666,
+      "grad_norm": 0.0029444245155900717,
+      "learning_rate": 5.111111111111111e-05,
+      "loss": 0.0008,
+      "step": 1675
+    },
+    {
+      "epoch": 11.2,
+      "grad_norm": 0.0028277155943214893,
+      "learning_rate": 5.0666666666666674e-05,
+      "loss": 0.0008,
+      "step": 1680
+    },
+    {
+      "epoch": 11.233333333333333,
+      "grad_norm": 0.0028599537909030914,
+      "learning_rate": 5.0222222222222226e-05,
+      "loss": 0.0008,
+      "step": 1685
+    },
+    {
+      "epoch": 11.266666666666667,
+      "grad_norm": 0.002907586982473731,
+      "learning_rate": 4.977777777777778e-05,
+      "loss": 0.0008,
+      "step": 1690
+    },
+    {
+      "epoch": 11.3,
+      "grad_norm": 0.0030153663828969,
+      "learning_rate": 4.933333333333334e-05,
+      "loss": 0.0008,
+      "step": 1695
+    },
+    {
+      "epoch": 11.333333333333334,
+      "grad_norm": 0.0028990330174565315,
+      "learning_rate": 4.888888888888889e-05,
+      "loss": 0.0008,
+      "step": 1700
+    },
+    {
+      "epoch": 11.333333333333334,
+      "eval_accuracy": 0.8933333333333333,
+      "eval_f1": 0.8925901113216209,
+      "eval_loss": 0.5938182473182678,
+      "eval_precision": 0.8954790398682951,
+      "eval_recall": 0.8933333333333333,
+      "eval_runtime": 9.2035,
+      "eval_samples_per_second": 130.385,
+      "eval_steps_per_second": 16.298,
+      "step": 1700
+    },
+    {
+      "epoch": 11.366666666666667,
+      "grad_norm": 0.0032022136729210615,
+      "learning_rate": 4.844444444444445e-05,
+      "loss": 0.0007,
+      "step": 1705
+    },
+    {
+      "epoch": 11.4,
+      "grad_norm": 0.0028595593757927418,
+      "learning_rate": 4.8e-05,
+      "loss": 0.0007,
+      "step": 1710
+    },
+    {
+      "epoch": 11.433333333333334,
+      "grad_norm": 0.0029640330467373133,
+      "learning_rate": 4.755555555555556e-05,
+      "loss": 0.0008,
+      "step": 1715
+    },
+    {
+      "epoch": 11.466666666666667,
+      "grad_norm": 0.0029016851913183928,
+      "learning_rate": 4.711111111111111e-05,
+      "loss": 0.0007,
+      "step": 1720
+    },
+    {
+      "epoch": 11.5,
+      "grad_norm": 0.0029438238125294447,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.0007,
+      "step": 1725
+    },
+    {
+      "epoch": 11.533333333333333,
+      "grad_norm": 0.003153452416881919,
+      "learning_rate": 4.6222222222222224e-05,
+      "loss": 0.0007,
+      "step": 1730
+    },
+    {
+      "epoch": 11.566666666666666,
+      "grad_norm": 0.0031303968280553818,
+      "learning_rate": 4.577777777777778e-05,
+      "loss": 0.0007,
+      "step": 1735
+    },
+    {
+      "epoch": 11.6,
+      "grad_norm": 0.0029108517337590456,
+      "learning_rate": 4.5333333333333335e-05,
+      "loss": 0.0007,
+      "step": 1740
+    },
+    {
+      "epoch": 11.633333333333333,
+      "grad_norm": 0.0029351389966905117,
+      "learning_rate": 4.4888888888888894e-05,
+      "loss": 0.0007,
+      "step": 1745
+    },
+    {
+      "epoch": 11.666666666666666,
+      "grad_norm": 0.003020907286554575,
+      "learning_rate": 4.4444444444444447e-05,
+      "loss": 0.0007,
+      "step": 1750
+    },
+    {
+      "epoch": 11.7,
+      "grad_norm": 0.003223966807126999,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 0.0007,
+      "step": 1755
+    },
+    {
+      "epoch": 11.733333333333333,
+      "grad_norm": 0.002827573334798217,
+      "learning_rate": 4.355555555555556e-05,
+      "loss": 0.0007,
+      "step": 1760
+    },
+    {
+      "epoch": 11.766666666666667,
+      "grad_norm": 0.0028787998016923666,
+      "learning_rate": 4.311111111111111e-05,
+      "loss": 0.0007,
+      "step": 1765
+    },
+    {
+      "epoch": 11.8,
+      "grad_norm": 0.0027750665321946144,
+      "learning_rate": 4.266666666666667e-05,
+      "loss": 0.0007,
+      "step": 1770
+    },
+    {
+      "epoch": 11.833333333333334,
+      "grad_norm": 0.002890476491302252,
+      "learning_rate": 4.222222222222222e-05,
+      "loss": 0.0007,
+      "step": 1775
+    },
+    {
+      "epoch": 11.866666666666667,
+      "grad_norm": 0.002820511581376195,
+      "learning_rate": 4.177777777777778e-05,
+      "loss": 0.0007,
+      "step": 1780
+    },
+    {
+      "epoch": 11.9,
+      "grad_norm": 0.0028251498006284237,
+      "learning_rate": 4.133333333333333e-05,
+      "loss": 0.0007,
+      "step": 1785
+    },
+    {
+      "epoch": 11.933333333333334,
+      "grad_norm": 0.002791108563542366,
+      "learning_rate": 4.088888888888889e-05,
+      "loss": 0.0007,
+      "step": 1790
+    },
+    {
+      "epoch": 11.966666666666667,
+      "grad_norm": 0.002661674050614238,
+      "learning_rate": 4.0444444444444444e-05,
+      "loss": 0.0007,
+      "step": 1795
+    },
+    {
+      "epoch": 12.0,
+      "grad_norm": 0.002654584590345621,
+      "learning_rate": 4e-05,
+      "loss": 0.0007,
+      "step": 1800
+    },
+    {
+      "epoch": 12.0,
+      "eval_accuracy": 0.8933333333333333,
+      "eval_f1": 0.8924909258808299,
+      "eval_loss": 0.5971092581748962,
+      "eval_precision": 0.8952869179752938,
+      "eval_recall": 0.8933333333333333,
+      "eval_runtime": 9.2033,
+      "eval_samples_per_second": 130.388,
+      "eval_steps_per_second": 16.298,
+      "step": 1800
+    },
+    {
+      "epoch": 12.033333333333333,
+      "grad_norm": 0.002706103026866913,
+      "learning_rate": 3.9555555555555556e-05,
+      "loss": 0.0007,
+      "step": 1805
+    },
+    {
+      "epoch": 12.066666666666666,
+      "grad_norm": 0.0027892158832401037,
+      "learning_rate": 3.9111111111111115e-05,
+      "loss": 0.0007,
+      "step": 1810
+    },
+    {
+      "epoch": 12.1,
+      "grad_norm": 0.002687312662601471,
+      "learning_rate": 3.866666666666667e-05,
+      "loss": 0.0007,
+      "step": 1815
+    },
+    {
+      "epoch": 12.133333333333333,
+      "grad_norm": 0.002817670814692974,
+      "learning_rate": 3.8222222222222226e-05,
+      "loss": 0.0007,
+      "step": 1820
+    },
+    {
+      "epoch": 12.166666666666666,
+      "grad_norm": 0.002796135377138853,
+      "learning_rate": 3.777777777777778e-05,
+      "loss": 0.0007,
+      "step": 1825
+    },
+    {
+      "epoch": 12.2,
+      "grad_norm": 0.0026829817797988653,
+      "learning_rate": 3.733333333333334e-05,
+      "loss": 0.0007,
+      "step": 1830
+    },
+    {
+      "epoch": 12.233333333333333,
+      "grad_norm": 0.0027780223172158003,
+      "learning_rate": 3.688888888888889e-05,
+      "loss": 0.0007,
+      "step": 1835
+    },
+    {
+      "epoch": 12.266666666666667,
+      "grad_norm": 0.0026549564208835363,
+      "learning_rate": 3.644444444444445e-05,
+      "loss": 0.0007,
+      "step": 1840
+    },
+    {
+      "epoch": 12.3,
+      "grad_norm": 0.0025722929276525974,
+      "learning_rate": 3.6e-05,
+      "loss": 0.0007,
+      "step": 1845
+    },
+    {
+      "epoch": 12.333333333333334,
+      "grad_norm": 0.0031209082808345556,
+      "learning_rate": 3.555555555555556e-05,
+      "loss": 0.0007,
+      "step": 1850
+    },
+    {
+      "epoch": 12.366666666666667,
+      "grad_norm": 0.0028525665402412415,
+      "learning_rate": 3.511111111111111e-05,
+      "loss": 0.0007,
+      "step": 1855
+    },
+    {
+      "epoch": 12.4,
+      "grad_norm": 0.0027296545449644327,
+      "learning_rate": 3.466666666666667e-05,
+      "loss": 0.0007,
+      "step": 1860
+    },
+    {
+      "epoch": 12.433333333333334,
+      "grad_norm": 0.00296882726252079,
+      "learning_rate": 3.4222222222222224e-05,
+      "loss": 0.0007,
+      "step": 1865
+    },
+    {
+      "epoch": 12.466666666666667,
+      "grad_norm": 0.002597276819869876,
+      "learning_rate": 3.377777777777778e-05,
+      "loss": 0.0007,
+      "step": 1870
+    },
+    {
+      "epoch": 12.5,
+      "grad_norm": 0.0027183096390217543,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 0.0007,
+      "step": 1875
+    },
+    {
+      "epoch": 12.533333333333333,
+      "grad_norm": 0.0025975320022553205,
+      "learning_rate": 3.2888888888888894e-05,
+      "loss": 0.0007,
+      "step": 1880
+    },
+    {
+      "epoch": 12.566666666666666,
+      "grad_norm": 0.0028336485847830772,
+      "learning_rate": 3.2444444444444446e-05,
+      "loss": 0.0007,
+      "step": 1885
+    },
+    {
+      "epoch": 12.6,
+      "grad_norm": 0.002649620408192277,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 0.0007,
+      "step": 1890
+    },
+    {
+      "epoch": 12.633333333333333,
+      "grad_norm": 0.0029035883489996195,
+      "learning_rate": 3.155555555555556e-05,
+      "loss": 0.0007,
+      "step": 1895
+    },
+    {
+      "epoch": 12.666666666666666,
+      "grad_norm": 0.002905472880229354,
+      "learning_rate": 3.111111111111111e-05,
+      "loss": 0.0007,
+      "step": 1900
+    },
+    {
+      "epoch": 12.666666666666666,
+      "eval_accuracy": 0.8933333333333333,
+      "eval_f1": 0.8925572625959594,
+      "eval_loss": 0.5997689962387085,
+      "eval_precision": 0.8952187213685717,
+      "eval_recall": 0.8933333333333333,
+      "eval_runtime": 9.124,
+      "eval_samples_per_second": 131.522,
+      "eval_steps_per_second": 16.44,
+      "step": 1900
+    },
+    {
+      "epoch": 12.7,
+      "grad_norm": 0.0026109640020877123,
+      "learning_rate": 3.066666666666667e-05,
+      "loss": 0.0007,
+      "step": 1905
+    },
+    {
+      "epoch": 12.733333333333333,
+      "grad_norm": 0.002541190944612026,
+      "learning_rate": 3.0222222222222225e-05,
+      "loss": 0.0007,
+      "step": 1910
+    },
+    {
+      "epoch": 12.766666666666667,
+      "grad_norm": 0.0027586170472204685,
+      "learning_rate": 2.9777777777777777e-05,
+      "loss": 0.0007,
+      "step": 1915
+    },
+    {
+      "epoch": 12.8,
+      "grad_norm": 0.0025249046739190817,
+      "learning_rate": 2.9333333333333336e-05,
+      "loss": 0.0007,
+      "step": 1920
+    },
+    {
+      "epoch": 12.833333333333334,
+      "grad_norm": 0.002689517568796873,
+      "learning_rate": 2.8888888888888888e-05,
+      "loss": 0.0007,
+      "step": 1925
+    },
+    {
+      "epoch": 12.866666666666667,
+      "grad_norm": 0.0028575279284268618,
+      "learning_rate": 2.8444444444444447e-05,
+      "loss": 0.0007,
+      "step": 1930
+    },
+    {
+      "epoch": 12.9,
+      "grad_norm": 0.002582886489108205,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 0.0007,
+      "step": 1935
+    },
+    {
+      "epoch": 12.933333333333334,
+      "grad_norm": 0.0026416087057441473,
+      "learning_rate": 2.7555555555555555e-05,
+      "loss": 0.0007,
+      "step": 1940
+    },
+    {
+      "epoch": 12.966666666666667,
+      "grad_norm": 0.002628608839586377,
+      "learning_rate": 2.7111111111111114e-05,
+      "loss": 0.0007,
+      "step": 1945
+    },
+    {
+      "epoch": 13.0,
+      "grad_norm": 0.0025201744865626097,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 0.0007,
+      "step": 1950
+    },
+    {
+      "epoch": 13.033333333333333,
+      "grad_norm": 0.0026108119636774063,
+      "learning_rate": 2.6222222222222226e-05,
+      "loss": 0.0007,
+      "step": 1955
+    },
+    {
+      "epoch": 13.066666666666666,
+      "grad_norm": 0.002755657071247697,
+      "learning_rate": 2.5777777777777778e-05,
+      "loss": 0.0007,
+      "step": 1960
+    },
+    {
+      "epoch": 13.1,
+      "grad_norm": 0.002564393449574709,
+      "learning_rate": 2.5333333333333337e-05,
+      "loss": 0.0007,
+      "step": 1965
+    },
+    {
+      "epoch": 13.133333333333333,
+      "grad_norm": 0.0025547167751938105,
+      "learning_rate": 2.488888888888889e-05,
+      "loss": 0.0007,
+      "step": 1970
+    },
+    {
+      "epoch": 13.166666666666666,
+      "grad_norm": 0.002583136083558202,
+      "learning_rate": 2.4444444444444445e-05,
+      "loss": 0.0007,
+      "step": 1975
+    },
+    {
+      "epoch": 13.2,
+      "grad_norm": 0.002491503022611141,
+      "learning_rate": 2.4e-05,
+      "loss": 0.0007,
+      "step": 1980
+    },
+    {
+      "epoch": 13.233333333333333,
+      "grad_norm": 0.0025622770190238953,
+      "learning_rate": 2.3555555555555556e-05,
+      "loss": 0.0007,
+      "step": 1985
+    },
+    {
+      "epoch": 13.266666666666667,
+      "grad_norm": 0.0027995556592941284,
+      "learning_rate": 2.3111111111111112e-05,
+      "loss": 0.0007,
+      "step": 1990
+    },
+    {
+      "epoch": 13.3,
+      "grad_norm": 0.0027331418823450804,
+      "learning_rate": 2.2666666666666668e-05,
+      "loss": 0.0006,
+      "step": 1995
+    },
+    {
+      "epoch": 13.333333333333334,
+      "grad_norm": 0.0026821917854249477,
+      "learning_rate": 2.2222222222222223e-05,
+      "loss": 0.0007,
+      "step": 2000
+    },
+    {
+      "epoch": 13.333333333333334,
+      "eval_accuracy": 0.8933333333333333,
+      "eval_f1": 0.8926087304810708,
+      "eval_loss": 0.6016172766685486,
+      "eval_precision": 0.8952164937109535,
+      "eval_recall": 0.8933333333333333,
+      "eval_runtime": 9.2849,
+      "eval_samples_per_second": 129.243,
+      "eval_steps_per_second": 16.155,
+      "step": 2000
+    },
+    {
+      "epoch": 13.366666666666667,
+      "grad_norm": 0.002859922591596842,
+      "learning_rate": 2.177777777777778e-05,
+      "loss": 0.0007,
+      "step": 2005
+    },
+    {
+      "epoch": 13.4,
+      "grad_norm": 0.0024555367417633533,
+      "learning_rate": 2.1333333333333335e-05,
+      "loss": 0.0006,
+      "step": 2010
+    },
+    {
+      "epoch": 13.433333333333334,
+      "grad_norm": 0.0026238062418997288,
+      "learning_rate": 2.088888888888889e-05,
+      "loss": 0.0007,
+      "step": 2015
+    },
+    {
+      "epoch": 13.466666666666667,
+      "grad_norm": 0.0028192377649247646,
+      "learning_rate": 2.0444444444444446e-05,
+      "loss": 0.0007,
+      "step": 2020
+    },
+    {
+      "epoch": 13.5,
+      "grad_norm": 0.0025528157129883766,
+      "learning_rate": 2e-05,
+      "loss": 0.0007,
+      "step": 2025
+    },
+    {
+      "epoch": 13.533333333333333,
+      "grad_norm": 0.0025877885054796934,
+      "learning_rate": 1.9555555555555557e-05,
+      "loss": 0.0006,
+      "step": 2030
+    },
+    {
+      "epoch": 13.566666666666666,
+      "grad_norm": 0.0025963452644646168,
+      "learning_rate": 1.9111111111111113e-05,
+      "loss": 0.0006,
+      "step": 2035
+    },
+    {
+      "epoch": 13.6,
+      "grad_norm": 0.002605219604447484,
+      "learning_rate": 1.866666666666667e-05,
+      "loss": 0.0006,
+      "step": 2040
+    },
+    {
+      "epoch": 13.633333333333333,
+      "grad_norm": 0.0024622888304293156,
+      "learning_rate": 1.8222222222222224e-05,
+      "loss": 0.0007,
+      "step": 2045
+    },
+    {
+      "epoch": 13.666666666666666,
+      "grad_norm": 0.00264735403470695,
+      "learning_rate": 1.777777777777778e-05,
+      "loss": 0.0007,
+      "step": 2050
+    },
+    {
+      "epoch": 13.7,
+      "grad_norm": 0.0025377385318279266,
+      "learning_rate": 1.7333333333333336e-05,
+      "loss": 0.0006,
+      "step": 2055
+    },
+    {
+      "epoch": 13.733333333333333,
+      "grad_norm": 0.0024450563360005617,
+      "learning_rate": 1.688888888888889e-05,
+      "loss": 0.0006,
+      "step": 2060
+    },
+    {
+      "epoch": 13.766666666666667,
+      "grad_norm": 0.0024939640425145626,
+      "learning_rate": 1.6444444444444447e-05,
+      "loss": 0.0006,
+      "step": 2065
+    },
+    {
+      "epoch": 13.8,
+      "grad_norm": 0.0026530995965003967,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 0.0006,
+      "step": 2070
+    },
+    {
+      "epoch": 13.833333333333334,
+      "grad_norm": 0.002551464829593897,
+      "learning_rate": 1.5555555555555555e-05,
+      "loss": 0.0006,
+      "step": 2075
+    },
+    {
+      "epoch": 13.866666666666667,
+      "grad_norm": 0.0026167011819779873,
+      "learning_rate": 1.5111111111111112e-05,
+      "loss": 0.0006,
+      "step": 2080
+    },
+    {
+      "epoch": 13.9,
+      "grad_norm": 0.0023945062421262264,
+      "learning_rate": 1.4666666666666668e-05,
+      "loss": 0.0006,
+      "step": 2085
+    },
+    {
+      "epoch": 13.933333333333334,
+      "grad_norm": 0.0023595585953444242,
+      "learning_rate": 1.4222222222222224e-05,
+      "loss": 0.0006,
+      "step": 2090
+    },
+    {
+      "epoch": 13.966666666666667,
+      "grad_norm": 0.002572057070210576,
+      "learning_rate": 1.3777777777777778e-05,
+      "loss": 0.0006,
+      "step": 2095
+    },
+    {
+      "epoch": 14.0,
+      "grad_norm": 0.0027738932985812426,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 0.0006,
+      "step": 2100
+    },
+    {
+      "epoch": 14.0,
+      "eval_accuracy": 0.8933333333333333,
+      "eval_f1": 0.8926087304810708,
+      "eval_loss": 0.6031957268714905,
+      "eval_precision": 0.8952164937109535,
+      "eval_recall": 0.8933333333333333,
+      "eval_runtime": 8.9675,
+      "eval_samples_per_second": 133.816,
+      "eval_steps_per_second": 16.727,
+      "step": 2100
+    },
+    {
+      "epoch": 14.033333333333333,
+      "grad_norm": 0.002405740087851882,
+      "learning_rate": 1.2888888888888889e-05,
+      "loss": 0.0006,
+      "step": 2105
+    },
+    {
+      "epoch": 14.066666666666666,
+      "grad_norm": 0.002379121957346797,
+      "learning_rate": 1.2444444444444445e-05,
+      "loss": 0.0006,
+      "step": 2110
+    },
+    {
+      "epoch": 14.1,
+      "grad_norm": 0.002425256185233593,
+      "learning_rate": 1.2e-05,
+      "loss": 0.0006,
+      "step": 2115
+    },
+    {
+      "epoch": 14.133333333333333,
+      "grad_norm": 0.002582225715741515,
+      "learning_rate": 1.1555555555555556e-05,
+      "loss": 0.0006,
+      "step": 2120
+    },
+    {
+      "epoch": 14.166666666666666,
+      "grad_norm": 0.002389343688264489,
+      "learning_rate": 1.1111111111111112e-05,
+      "loss": 0.0006,
+      "step": 2125
+    },
+    {
+      "epoch": 14.2,
+      "grad_norm": 0.0024247432593256235,
+      "learning_rate": 1.0666666666666667e-05,
+      "loss": 0.0006,
+      "step": 2130
+    },
+    {
+      "epoch": 14.233333333333333,
+      "grad_norm": 0.002556301886215806,
+      "learning_rate": 1.0222222222222223e-05,
+      "loss": 0.0006,
+      "step": 2135
+    },
+    {
+      "epoch": 14.266666666666667,
+      "grad_norm": 0.002499427879229188,
+      "learning_rate": 9.777777777777779e-06,
+      "loss": 0.0006,
+      "step": 2140
+    },
+    {
+      "epoch": 14.3,
+      "grad_norm": 0.002539202570915222,
+      "learning_rate": 9.333333333333334e-06,
+      "loss": 0.0006,
+      "step": 2145
+    },
+    {
+      "epoch": 14.333333333333334,
+      "grad_norm": 0.0024849127512425184,
+      "learning_rate": 8.88888888888889e-06,
+      "loss": 0.0006,
+      "step": 2150
+    },
+    {
+      "epoch": 14.366666666666667,
+      "grad_norm": 0.002673888811841607,
+      "learning_rate": 8.444444444444446e-06,
+      "loss": 0.0006,
+      "step": 2155
+    },
+    {
+      "epoch": 14.4,
+      "grad_norm": 0.0023994643706828356,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.0006,
+      "step": 2160
+    },
+    {
+      "epoch": 14.433333333333334,
+      "grad_norm": 0.002650046721100807,
+      "learning_rate": 7.555555555555556e-06,
+      "loss": 0.0006,
+      "step": 2165
+    },
+    {
+      "epoch": 14.466666666666667,
+      "grad_norm": 0.0024455806706100702,
+      "learning_rate": 7.111111111111112e-06,
+      "loss": 0.0006,
+      "step": 2170
+    },
+    {
+      "epoch": 14.5,
+      "grad_norm": 0.0025402368046343327,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 0.0006,
+      "step": 2175
+    },
+    {
+      "epoch": 14.533333333333333,
+      "grad_norm": 0.0025018779560923576,
+      "learning_rate": 6.222222222222222e-06,
+      "loss": 0.0006,
+      "step": 2180
+    },
+    {
+      "epoch": 14.566666666666666,
+      "grad_norm": 0.0026018363423645496,
+      "learning_rate": 5.777777777777778e-06,
+      "loss": 0.0006,
+      "step": 2185
+    },
+    {
+      "epoch": 14.6,
+      "grad_norm": 0.002430640161037445,
+      "learning_rate": 5.333333333333334e-06,
+      "loss": 0.0006,
+      "step": 2190
+    },
+    {
+      "epoch": 14.633333333333333,
+      "grad_norm": 0.002809967612847686,
+      "learning_rate": 4.888888888888889e-06,
+      "loss": 0.0006,
+      "step": 2195
+    },
+    {
+      "epoch": 14.666666666666666,
+      "grad_norm": 0.002380241174250841,
+      "learning_rate": 4.444444444444445e-06,
+      "loss": 0.0006,
+      "step": 2200
+    },
+    {
+      "epoch": 14.666666666666666,
+      "eval_accuracy": 0.8933333333333333,
+      "eval_f1": 0.8926087304810708,
+      "eval_loss": 0.603855311870575,
+      "eval_precision": 0.8952164937109535,
+      "eval_recall": 0.8933333333333333,
+      "eval_runtime": 9.1221,
+      "eval_samples_per_second": 131.549,
+      "eval_steps_per_second": 16.444,
+      "step": 2200
+    },
+    {
+      "epoch": 14.7,
+      "grad_norm": 0.002586866496130824,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.0006,
+      "step": 2205
+    },
+    {
+      "epoch": 14.733333333333333,
+      "grad_norm": 0.002559363842010498,
+      "learning_rate": 3.555555555555556e-06,
+      "loss": 0.0006,
+      "step": 2210
+    },
+    {
+      "epoch": 14.766666666666667,
+      "grad_norm": 0.002355343895033002,
+      "learning_rate": 3.111111111111111e-06,
+      "loss": 0.0006,
+      "step": 2215
+    },
+    {
+      "epoch": 14.8,
+      "grad_norm": 0.002358679659664631,
+      "learning_rate": 2.666666666666667e-06,
+      "loss": 0.0006,
+      "step": 2220
+    },
+    {
+      "epoch": 14.833333333333334,
+      "grad_norm": 0.0025940965861082077,
+      "learning_rate": 2.2222222222222225e-06,
+      "loss": 0.0006,
+      "step": 2225
+    },
+    {
+      "epoch": 14.866666666666667,
+      "grad_norm": 0.0026809147093445063,
+      "learning_rate": 1.777777777777778e-06,
+      "loss": 0.0006,
+      "step": 2230
+    },
+    {
+      "epoch": 14.9,
+      "grad_norm": 0.0024571644607931376,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 0.0006,
+      "step": 2235
+    },
+    {
+      "epoch": 14.933333333333334,
+      "grad_norm": 0.0025657913647592068,
+      "learning_rate": 8.88888888888889e-07,
+      "loss": 0.0006,
+      "step": 2240
+    },
+    {
+      "epoch": 14.966666666666667,
+      "grad_norm": 0.002465125173330307,
+      "learning_rate": 4.444444444444445e-07,
+      "loss": 0.0006,
+      "step": 2245
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 0.002350958064198494,
+      "learning_rate": 0.0,
+      "loss": 0.0006,
+      "step": 2250
+    },
+    {
+      "epoch": 15.0,
+      "step": 2250,
+      "total_flos": 5.57962327867392e+18,
+      "train_loss": 0.03856972599029541,
+      "train_runtime": 877.6839,
+      "train_samples_per_second": 82.034,
+      "train_steps_per_second": 2.564
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 15,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.57962327867392e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4c6d8940e22bf0ca1d1e707e068d3e8f5eaf8a2a622ba6d92b40a24d1a2fb3e0
+size 5432